xref: /openbmc/qemu/block/qcow2-refcount.c (revision 0017c64e1ce298796caee2d38bde9d7fc59a1510)
1 /*
2  * Block driver for the QCOW version 2 format
3  *
4  * Copyright (c) 2004-2006 Fabrice Bellard
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to deal
8  * in the Software without restriction, including without limitation the rights
9  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10  * copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in
14  * all copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22  * THE SOFTWARE.
23  */
24 
25 #include "qemu/osdep.h"
26 #include "block/block-io.h"
27 #include "qapi/error.h"
28 #include "qcow2.h"
29 #include "qemu/range.h"
30 #include "qemu/bswap.h"
31 #include "qemu/cutils.h"
32 #include "qemu/memalign.h"
33 #include "trace.h"
34 
35 static int64_t alloc_clusters_noref(BlockDriverState *bs, uint64_t size,
36                                     uint64_t max);
37 
38 G_GNUC_WARN_UNUSED_RESULT
39 static int update_refcount(BlockDriverState *bs,
40                            int64_t offset, int64_t length, uint64_t addend,
41                            bool decrease, enum qcow2_discard_type type);
42 
43 static uint64_t get_refcount_ro0(const void *refcount_array, uint64_t index);
44 static uint64_t get_refcount_ro1(const void *refcount_array, uint64_t index);
45 static uint64_t get_refcount_ro2(const void *refcount_array, uint64_t index);
46 static uint64_t get_refcount_ro3(const void *refcount_array, uint64_t index);
47 static uint64_t get_refcount_ro4(const void *refcount_array, uint64_t index);
48 static uint64_t get_refcount_ro5(const void *refcount_array, uint64_t index);
49 static uint64_t get_refcount_ro6(const void *refcount_array, uint64_t index);
50 
51 static void set_refcount_ro0(void *refcount_array, uint64_t index,
52                              uint64_t value);
53 static void set_refcount_ro1(void *refcount_array, uint64_t index,
54                              uint64_t value);
55 static void set_refcount_ro2(void *refcount_array, uint64_t index,
56                              uint64_t value);
57 static void set_refcount_ro3(void *refcount_array, uint64_t index,
58                              uint64_t value);
59 static void set_refcount_ro4(void *refcount_array, uint64_t index,
60                              uint64_t value);
61 static void set_refcount_ro5(void *refcount_array, uint64_t index,
62                              uint64_t value);
63 static void set_refcount_ro6(void *refcount_array, uint64_t index,
64                              uint64_t value);
65 
66 
67 static Qcow2GetRefcountFunc *const get_refcount_funcs[] = {
68     &get_refcount_ro0,
69     &get_refcount_ro1,
70     &get_refcount_ro2,
71     &get_refcount_ro3,
72     &get_refcount_ro4,
73     &get_refcount_ro5,
74     &get_refcount_ro6
75 };
76 
77 static Qcow2SetRefcountFunc *const set_refcount_funcs[] = {
78     &set_refcount_ro0,
79     &set_refcount_ro1,
80     &set_refcount_ro2,
81     &set_refcount_ro3,
82     &set_refcount_ro4,
83     &set_refcount_ro5,
84     &set_refcount_ro6
85 };
86 
87 
88 /*********************************************************/
89 /* refcount handling */
90 
91 static void update_max_refcount_table_index(BDRVQcow2State *s)
92 {
93     unsigned i = s->refcount_table_size - 1;
94     while (i > 0 && (s->refcount_table[i] & REFT_OFFSET_MASK) == 0) {
95         i--;
96     }
97     /* Set s->max_refcount_table_index to the index of the last used entry */
98     s->max_refcount_table_index = i;
99 }
100 
101 int coroutine_fn qcow2_refcount_init(BlockDriverState *bs)
102 {
103     BDRVQcow2State *s = bs->opaque;
104     unsigned int refcount_table_size2, i;
105     int ret;
106 
107     assert(s->refcount_order >= 0 && s->refcount_order <= 6);
108 
109     s->get_refcount = get_refcount_funcs[s->refcount_order];
110     s->set_refcount = set_refcount_funcs[s->refcount_order];
111 
112     assert(s->refcount_table_size <= INT_MAX / REFTABLE_ENTRY_SIZE);
113     refcount_table_size2 = s->refcount_table_size * REFTABLE_ENTRY_SIZE;
114     s->refcount_table = g_try_malloc(refcount_table_size2);
115 
116     if (s->refcount_table_size > 0) {
117         if (s->refcount_table == NULL) {
118             ret = -ENOMEM;
119             goto fail;
120         }
121         BLKDBG_CO_EVENT(bs->file, BLKDBG_REFTABLE_LOAD);
122         ret = bdrv_co_pread(bs->file, s->refcount_table_offset,
123                             refcount_table_size2, s->refcount_table, 0);
124         if (ret < 0) {
125             goto fail;
126         }
127         for(i = 0; i < s->refcount_table_size; i++)
128             be64_to_cpus(&s->refcount_table[i]);
129         update_max_refcount_table_index(s);
130     }
131     return 0;
132  fail:
133     return ret;
134 }
135 
136 void qcow2_refcount_close(BlockDriverState *bs)
137 {
138     BDRVQcow2State *s = bs->opaque;
139     g_free(s->refcount_table);
140 }
141 
142 
143 static uint64_t get_refcount_ro0(const void *refcount_array, uint64_t index)
144 {
145     return (((const uint8_t *)refcount_array)[index / 8] >> (index % 8)) & 0x1;
146 }
147 
148 static void set_refcount_ro0(void *refcount_array, uint64_t index,
149                              uint64_t value)
150 {
151     assert(!(value >> 1));
152     ((uint8_t *)refcount_array)[index / 8] &= ~(0x1 << (index % 8));
153     ((uint8_t *)refcount_array)[index / 8] |= value << (index % 8);
154 }
155 
156 static uint64_t get_refcount_ro1(const void *refcount_array, uint64_t index)
157 {
158     return (((const uint8_t *)refcount_array)[index / 4] >> (2 * (index % 4)))
159            & 0x3;
160 }
161 
162 static void set_refcount_ro1(void *refcount_array, uint64_t index,
163                              uint64_t value)
164 {
165     assert(!(value >> 2));
166     ((uint8_t *)refcount_array)[index / 4] &= ~(0x3 << (2 * (index % 4)));
167     ((uint8_t *)refcount_array)[index / 4] |= value << (2 * (index % 4));
168 }
169 
170 static uint64_t get_refcount_ro2(const void *refcount_array, uint64_t index)
171 {
172     return (((const uint8_t *)refcount_array)[index / 2] >> (4 * (index % 2)))
173            & 0xf;
174 }
175 
176 static void set_refcount_ro2(void *refcount_array, uint64_t index,
177                              uint64_t value)
178 {
179     assert(!(value >> 4));
180     ((uint8_t *)refcount_array)[index / 2] &= ~(0xf << (4 * (index % 2)));
181     ((uint8_t *)refcount_array)[index / 2] |= value << (4 * (index % 2));
182 }
183 
184 static uint64_t get_refcount_ro3(const void *refcount_array, uint64_t index)
185 {
186     return ((const uint8_t *)refcount_array)[index];
187 }
188 
189 static void set_refcount_ro3(void *refcount_array, uint64_t index,
190                              uint64_t value)
191 {
192     assert(!(value >> 8));
193     ((uint8_t *)refcount_array)[index] = value;
194 }
195 
196 static uint64_t get_refcount_ro4(const void *refcount_array, uint64_t index)
197 {
198     return be16_to_cpu(((const uint16_t *)refcount_array)[index]);
199 }
200 
201 static void set_refcount_ro4(void *refcount_array, uint64_t index,
202                              uint64_t value)
203 {
204     assert(!(value >> 16));
205     ((uint16_t *)refcount_array)[index] = cpu_to_be16(value);
206 }
207 
208 static uint64_t get_refcount_ro5(const void *refcount_array, uint64_t index)
209 {
210     return be32_to_cpu(((const uint32_t *)refcount_array)[index]);
211 }
212 
213 static void set_refcount_ro5(void *refcount_array, uint64_t index,
214                              uint64_t value)
215 {
216     assert(!(value >> 32));
217     ((uint32_t *)refcount_array)[index] = cpu_to_be32(value);
218 }
219 
220 static uint64_t get_refcount_ro6(const void *refcount_array, uint64_t index)
221 {
222     return be64_to_cpu(((const uint64_t *)refcount_array)[index]);
223 }
224 
225 static void set_refcount_ro6(void *refcount_array, uint64_t index,
226                              uint64_t value)
227 {
228     ((uint64_t *)refcount_array)[index] = cpu_to_be64(value);
229 }
230 
231 
232 static int load_refcount_block(BlockDriverState *bs,
233                                int64_t refcount_block_offset,
234                                void **refcount_block)
235 {
236     BDRVQcow2State *s = bs->opaque;
237 
238     BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_LOAD);
239     return qcow2_cache_get(bs, s->refcount_block_cache, refcount_block_offset,
240                            refcount_block);
241 }
242 
243 /*
244  * Retrieves the refcount of the cluster given by its index and stores it in
245  * *refcount. Returns 0 on success and -errno on failure.
246  */
247 int qcow2_get_refcount(BlockDriverState *bs, int64_t cluster_index,
248                        uint64_t *refcount)
249 {
250     BDRVQcow2State *s = bs->opaque;
251     uint64_t refcount_table_index, block_index;
252     int64_t refcount_block_offset;
253     int ret;
254     void *refcount_block;
255 
256     refcount_table_index = cluster_index >> s->refcount_block_bits;
257     if (refcount_table_index >= s->refcount_table_size) {
258         *refcount = 0;
259         return 0;
260     }
261     refcount_block_offset =
262         s->refcount_table[refcount_table_index] & REFT_OFFSET_MASK;
263     if (!refcount_block_offset) {
264         *refcount = 0;
265         return 0;
266     }
267 
268     if (offset_into_cluster(s, refcount_block_offset)) {
269         qcow2_signal_corruption(bs, true, -1, -1, "Refblock offset %#" PRIx64
270                                 " unaligned (reftable index: %#" PRIx64 ")",
271                                 refcount_block_offset, refcount_table_index);
272         return -EIO;
273     }
274 
275     ret = qcow2_cache_get(bs, s->refcount_block_cache, refcount_block_offset,
276                           &refcount_block);
277     if (ret < 0) {
278         return ret;
279     }
280 
281     block_index = cluster_index & (s->refcount_block_size - 1);
282     *refcount = s->get_refcount(refcount_block, block_index);
283 
284     qcow2_cache_put(s->refcount_block_cache, &refcount_block);
285 
286     return 0;
287 }
288 
289 /* Checks if two offsets are described by the same refcount block */
290 static int in_same_refcount_block(BDRVQcow2State *s, uint64_t offset_a,
291     uint64_t offset_b)
292 {
293     uint64_t block_a = offset_a >> (s->cluster_bits + s->refcount_block_bits);
294     uint64_t block_b = offset_b >> (s->cluster_bits + s->refcount_block_bits);
295 
296     return (block_a == block_b);
297 }
298 
299 /*
300  * Loads a refcount block. If it doesn't exist yet, it is allocated first
301  * (including growing the refcount table if needed).
302  *
303  * Returns 0 on success or -errno in error case
304  */
305 static int alloc_refcount_block(BlockDriverState *bs,
306                                 int64_t cluster_index, void **refcount_block)
307 {
308     BDRVQcow2State *s = bs->opaque;
309     unsigned int refcount_table_index;
310     int64_t ret;
311 
312     BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC);
313 
314     /* Find the refcount block for the given cluster */
315     refcount_table_index = cluster_index >> s->refcount_block_bits;
316 
317     if (refcount_table_index < s->refcount_table_size) {
318 
319         uint64_t refcount_block_offset =
320             s->refcount_table[refcount_table_index] & REFT_OFFSET_MASK;
321 
322         /* If it's already there, we're done */
323         if (refcount_block_offset) {
324             if (offset_into_cluster(s, refcount_block_offset)) {
325                 qcow2_signal_corruption(bs, true, -1, -1, "Refblock offset %#"
326                                         PRIx64 " unaligned (reftable index: "
327                                         "%#x)", refcount_block_offset,
328                                         refcount_table_index);
329                 return -EIO;
330             }
331 
332              return load_refcount_block(bs, refcount_block_offset,
333                                         refcount_block);
334         }
335     }
336 
337     /*
338      * If we came here, we need to allocate something. Something is at least
339      * a cluster for the new refcount block. It may also include a new refcount
340      * table if the old refcount table is too small.
341      *
342      * Note that allocating clusters here needs some special care:
343      *
344      * - We can't use the normal qcow2_alloc_clusters(), it would try to
345      *   increase the refcount and very likely we would end up with an endless
346      *   recursion. Instead we must place the refcount blocks in a way that
347      *   they can describe them themselves.
348      *
349      * - We need to consider that at this point we are inside update_refcounts
350      *   and potentially doing an initial refcount increase. This means that
351      *   some clusters have already been allocated by the caller, but their
352      *   refcount isn't accurate yet. If we allocate clusters for metadata, we
353      *   need to return -EAGAIN to signal the caller that it needs to restart
354      *   the search for free clusters.
355      *
356      * - alloc_clusters_noref and qcow2_free_clusters may load a different
357      *   refcount block into the cache
358      */
359 
360     *refcount_block = NULL;
361 
362     /* We write to the refcount table, so we might depend on L2 tables */
363     ret = qcow2_cache_flush(bs, s->l2_table_cache);
364     if (ret < 0) {
365         return ret;
366     }
367 
368     /* Allocate the refcount block itself and mark it as used */
369     int64_t new_block = alloc_clusters_noref(bs, s->cluster_size, INT64_MAX);
370     if (new_block < 0) {
371         return new_block;
372     }
373 
374     /* The offset must fit in the offset field of the refcount table entry */
375     assert((new_block & REFT_OFFSET_MASK) == new_block);
376 
377     /* If we're allocating the block at offset 0 then something is wrong */
378     if (new_block == 0) {
379         qcow2_signal_corruption(bs, true, -1, -1, "Preventing invalid "
380                                 "allocation of refcount block at offset 0");
381         return -EIO;
382     }
383 
384 #ifdef DEBUG_ALLOC2
385     fprintf(stderr, "qcow2: Allocate refcount block %d for %" PRIx64
386         " at %" PRIx64 "\n",
387         refcount_table_index, cluster_index << s->cluster_bits, new_block);
388 #endif
389 
390     if (in_same_refcount_block(s, new_block, cluster_index << s->cluster_bits)) {
391         /* Zero the new refcount block before updating it */
392         ret = qcow2_cache_get_empty(bs, s->refcount_block_cache, new_block,
393                                     refcount_block);
394         if (ret < 0) {
395             goto fail;
396         }
397 
398         memset(*refcount_block, 0, s->cluster_size);
399 
400         /* The block describes itself, need to update the cache */
401         int block_index = (new_block >> s->cluster_bits) &
402             (s->refcount_block_size - 1);
403         s->set_refcount(*refcount_block, block_index, 1);
404     } else {
405         /* Described somewhere else. This can recurse at most twice before we
406          * arrive at a block that describes itself. */
407         ret = update_refcount(bs, new_block, s->cluster_size, 1, false,
408                               QCOW2_DISCARD_NEVER);
409         if (ret < 0) {
410             goto fail;
411         }
412 
413         ret = qcow2_cache_flush(bs, s->refcount_block_cache);
414         if (ret < 0) {
415             goto fail;
416         }
417 
418         /* Initialize the new refcount block only after updating its refcount,
419          * update_refcount uses the refcount cache itself */
420         ret = qcow2_cache_get_empty(bs, s->refcount_block_cache, new_block,
421                                     refcount_block);
422         if (ret < 0) {
423             goto fail;
424         }
425 
426         memset(*refcount_block, 0, s->cluster_size);
427     }
428 
429     /* Now the new refcount block needs to be written to disk */
430     BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE);
431     qcow2_cache_entry_mark_dirty(s->refcount_block_cache, *refcount_block);
432     ret = qcow2_cache_flush(bs, s->refcount_block_cache);
433     if (ret < 0) {
434         goto fail;
435     }
436 
437     /* If the refcount table is big enough, just hook the block up there */
438     if (refcount_table_index < s->refcount_table_size) {
439         uint64_t data64 = cpu_to_be64(new_block);
440         BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_HOOKUP);
441         ret = bdrv_pwrite_sync(bs->file, s->refcount_table_offset +
442                                refcount_table_index * REFTABLE_ENTRY_SIZE,
443             sizeof(data64), &data64, 0);
444         if (ret < 0) {
445             goto fail;
446         }
447 
448         s->refcount_table[refcount_table_index] = new_block;
449         /* If there's a hole in s->refcount_table then it can happen
450          * that refcount_table_index < s->max_refcount_table_index */
451         s->max_refcount_table_index =
452             MAX(s->max_refcount_table_index, refcount_table_index);
453 
454         /* The new refcount block may be where the caller intended to put its
455          * data, so let it restart the search. */
456         return -EAGAIN;
457     }
458 
459     qcow2_cache_put(s->refcount_block_cache, refcount_block);
460 
461     /*
462      * If we come here, we need to grow the refcount table. Again, a new
463      * refcount table needs some space and we can't simply allocate to avoid
464      * endless recursion.
465      *
466      * Therefore let's grab new refcount blocks at the end of the image, which
467      * will describe themselves and the new refcount table. This way we can
468      * reference them only in the new table and do the switch to the new
469      * refcount table at once without producing an inconsistent state in
470      * between.
471      */
472     BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_GROW);
473 
474     /* Calculate the number of refcount blocks needed so far; this will be the
475      * basis for calculating the index of the first cluster used for the
476      * self-describing refcount structures which we are about to create.
477      *
478      * Because we reached this point, there cannot be any refcount entries for
479      * cluster_index or higher indices yet. However, because new_block has been
480      * allocated to describe that cluster (and it will assume this role later
481      * on), we cannot use that index; also, new_block may actually have a higher
482      * cluster index than cluster_index, so it needs to be taken into account
483      * here (and 1 needs to be added to its value because that cluster is used).
484      */
485     uint64_t blocks_used = DIV_ROUND_UP(MAX(cluster_index + 1,
486                                             (new_block >> s->cluster_bits) + 1),
487                                         s->refcount_block_size);
488 
489     /* Create the new refcount table and blocks */
490     uint64_t meta_offset = (blocks_used * s->refcount_block_size) *
491         s->cluster_size;
492 
493     ret = qcow2_refcount_area(bs, meta_offset, 0, false,
494                               refcount_table_index, new_block);
495     if (ret < 0) {
496         return ret;
497     }
498 
499     ret = load_refcount_block(bs, new_block, refcount_block);
500     if (ret < 0) {
501         return ret;
502     }
503 
504     /* If we were trying to do the initial refcount update for some cluster
505      * allocation, we might have used the same clusters to store newly
506      * allocated metadata. Make the caller search some new space. */
507     return -EAGAIN;
508 
509 fail:
510     if (*refcount_block != NULL) {
511         qcow2_cache_put(s->refcount_block_cache, refcount_block);
512     }
513     return ret;
514 }
515 
516 /*
517  * Starting at @start_offset, this function creates new self-covering refcount
518  * structures: A new refcount table and refcount blocks which cover all of
519  * themselves, and a number of @additional_clusters beyond their end.
520  * @start_offset must be at the end of the image file, that is, there must be
521  * only empty space beyond it.
522  * If @exact_size is false, the refcount table will have 50 % more entries than
523  * necessary so it will not need to grow again soon.
524  * If @new_refblock_offset is not zero, it contains the offset of a refcount
525  * block that should be entered into the new refcount table at index
526  * @new_refblock_index.
527  *
528  * Returns: The offset after the new refcount structures (i.e. where the
529  *          @additional_clusters may be placed) on success, -errno on error.
530  */
531 int64_t qcow2_refcount_area(BlockDriverState *bs, uint64_t start_offset,
532                             uint64_t additional_clusters, bool exact_size,
533                             int new_refblock_index,
534                             uint64_t new_refblock_offset)
535 {
536     BDRVQcow2State *s = bs->opaque;
537     uint64_t total_refblock_count_u64, additional_refblock_count;
538     int total_refblock_count, table_size, area_reftable_index, table_clusters;
539     int i;
540     uint64_t table_offset, block_offset, end_offset;
541     int ret;
542     uint64_t *new_table;
543 
544     assert(!(start_offset % s->cluster_size));
545 
546     qcow2_refcount_metadata_size(start_offset / s->cluster_size +
547                                  additional_clusters,
548                                  s->cluster_size, s->refcount_order,
549                                  !exact_size, &total_refblock_count_u64);
550     if (total_refblock_count_u64 > QCOW_MAX_REFTABLE_SIZE) {
551         return -EFBIG;
552     }
553     total_refblock_count = total_refblock_count_u64;
554 
555     /* Index in the refcount table of the first refcount block to cover the area
556      * of refcount structures we are about to create; we know that
557      * @total_refblock_count can cover @start_offset, so this will definitely
558      * fit into an int. */
559     area_reftable_index = (start_offset / s->cluster_size) /
560                           s->refcount_block_size;
561 
562     if (exact_size) {
563         table_size = total_refblock_count;
564     } else {
565         table_size = total_refblock_count +
566                      DIV_ROUND_UP(total_refblock_count, 2);
567     }
568     /* The qcow2 file can only store the reftable size in number of clusters */
569     table_size = ROUND_UP(table_size, s->cluster_size / REFTABLE_ENTRY_SIZE);
570     table_clusters = (table_size * REFTABLE_ENTRY_SIZE) / s->cluster_size;
571 
572     if (table_size > QCOW_MAX_REFTABLE_SIZE) {
573         return -EFBIG;
574     }
575 
576     new_table = g_try_new0(uint64_t, table_size);
577 
578     assert(table_size > 0);
579     if (new_table == NULL) {
580         ret = -ENOMEM;
581         goto fail;
582     }
583 
584     /* Fill the new refcount table */
585     if (table_size > s->max_refcount_table_index) {
586         /* We're actually growing the reftable */
587         memcpy(new_table, s->refcount_table,
588                (s->max_refcount_table_index + 1) * REFTABLE_ENTRY_SIZE);
589     } else {
590         /* Improbable case: We're shrinking the reftable. However, the caller
591          * has assured us that there is only empty space beyond @start_offset,
592          * so we can simply drop all of the refblocks that won't fit into the
593          * new reftable. */
594         memcpy(new_table, s->refcount_table, table_size * REFTABLE_ENTRY_SIZE);
595     }
596 
597     if (new_refblock_offset) {
598         assert(new_refblock_index < total_refblock_count);
599         new_table[new_refblock_index] = new_refblock_offset;
600     }
601 
602     /* Count how many new refblocks we have to create */
603     additional_refblock_count = 0;
604     for (i = area_reftable_index; i < total_refblock_count; i++) {
605         if (!new_table[i]) {
606             additional_refblock_count++;
607         }
608     }
609 
610     table_offset = start_offset + additional_refblock_count * s->cluster_size;
611     end_offset = table_offset + table_clusters * s->cluster_size;
612 
613     /* Fill the refcount blocks, and create new ones, if necessary */
614     block_offset = start_offset;
615     for (i = area_reftable_index; i < total_refblock_count; i++) {
616         void *refblock_data;
617         uint64_t first_offset_covered;
618 
619         /* Reuse an existing refblock if possible, create a new one otherwise */
620         if (new_table[i]) {
621             ret = qcow2_cache_get(bs, s->refcount_block_cache, new_table[i],
622                                   &refblock_data);
623             if (ret < 0) {
624                 goto fail;
625             }
626         } else {
627             ret = qcow2_cache_get_empty(bs, s->refcount_block_cache,
628                                         block_offset, &refblock_data);
629             if (ret < 0) {
630                 goto fail;
631             }
632             memset(refblock_data, 0, s->cluster_size);
633             qcow2_cache_entry_mark_dirty(s->refcount_block_cache,
634                                          refblock_data);
635 
636             new_table[i] = block_offset;
637             block_offset += s->cluster_size;
638         }
639 
640         /* First host offset covered by this refblock */
641         first_offset_covered = (uint64_t)i * s->refcount_block_size *
642                                s->cluster_size;
643         if (first_offset_covered < end_offset) {
644             int j, end_index;
645 
646             /* Set the refcount of all of the new refcount structures to 1 */
647 
648             if (first_offset_covered < start_offset) {
649                 assert(i == area_reftable_index);
650                 j = (start_offset - first_offset_covered) / s->cluster_size;
651                 assert(j < s->refcount_block_size);
652             } else {
653                 j = 0;
654             }
655 
656             end_index = MIN((end_offset - first_offset_covered) /
657                             s->cluster_size,
658                             s->refcount_block_size);
659 
660             for (; j < end_index; j++) {
661                 /* The caller guaranteed us this space would be empty */
662                 assert(s->get_refcount(refblock_data, j) == 0);
663                 s->set_refcount(refblock_data, j, 1);
664             }
665 
666             qcow2_cache_entry_mark_dirty(s->refcount_block_cache,
667                                          refblock_data);
668         }
669 
670         qcow2_cache_put(s->refcount_block_cache, &refblock_data);
671     }
672 
673     assert(block_offset == table_offset);
674 
675     /* Write refcount blocks to disk */
676     BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE_BLOCKS);
677     ret = qcow2_cache_flush(bs, s->refcount_block_cache);
678     if (ret < 0) {
679         goto fail;
680     }
681 
682     /* Write refcount table to disk */
683     for (i = 0; i < total_refblock_count; i++) {
684         cpu_to_be64s(&new_table[i]);
685     }
686 
687     BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE_TABLE);
688     ret = bdrv_pwrite_sync(bs->file, table_offset,
689                            table_size * REFTABLE_ENTRY_SIZE, new_table, 0);
690     if (ret < 0) {
691         goto fail;
692     }
693 
694     for (i = 0; i < total_refblock_count; i++) {
695         be64_to_cpus(&new_table[i]);
696     }
697 
698     /* Hook up the new refcount table in the qcow2 header */
699     struct QEMU_PACKED {
700         uint64_t d64;
701         uint32_t d32;
702     } data;
703     data.d64 = cpu_to_be64(table_offset);
704     data.d32 = cpu_to_be32(table_clusters);
705     BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_SWITCH_TABLE);
706     ret = bdrv_pwrite_sync(bs->file,
707                            offsetof(QCowHeader, refcount_table_offset),
708                            sizeof(data), &data, 0);
709     if (ret < 0) {
710         goto fail;
711     }
712 
713     /* And switch it in memory */
714     uint64_t old_table_offset = s->refcount_table_offset;
715     uint64_t old_table_size = s->refcount_table_size;
716 
717     g_free(s->refcount_table);
718     s->refcount_table = new_table;
719     s->refcount_table_size = table_size;
720     s->refcount_table_offset = table_offset;
721     update_max_refcount_table_index(s);
722 
723     /* Free old table. */
724     qcow2_free_clusters(bs, old_table_offset,
725                         old_table_size * REFTABLE_ENTRY_SIZE,
726                         QCOW2_DISCARD_OTHER);
727 
728     return end_offset;
729 
730 fail:
731     g_free(new_table);
732     return ret;
733 }
734 
735 void qcow2_process_discards(BlockDriverState *bs, int ret)
736 {
737     BDRVQcow2State *s = bs->opaque;
738     Qcow2DiscardRegion *d, *next;
739 
740     QTAILQ_FOREACH_SAFE(d, &s->discards, next, next) {
741         QTAILQ_REMOVE(&s->discards, d, next);
742 
743         /* Discard is optional, ignore the return value */
744         if (ret >= 0) {
745             int r2 = bdrv_pdiscard(bs->file, d->offset, d->bytes);
746             if (r2 < 0) {
747                 trace_qcow2_process_discards_failed_region(d->offset, d->bytes,
748                                                            r2);
749             }
750         }
751 
752         g_free(d);
753     }
754 }
755 
756 static void update_refcount_discard(BlockDriverState *bs,
757                                     uint64_t offset, uint64_t length)
758 {
759     BDRVQcow2State *s = bs->opaque;
760     Qcow2DiscardRegion *d, *p, *next;
761 
762     QTAILQ_FOREACH(d, &s->discards, next) {
763         uint64_t new_start = MIN(offset, d->offset);
764         uint64_t new_end = MAX(offset + length, d->offset + d->bytes);
765 
766         if (new_end - new_start <= length + d->bytes) {
767             /* There can't be any overlap, areas ending up here have no
768              * references any more and therefore shouldn't get freed another
769              * time. */
770             assert(d->bytes + length == new_end - new_start);
771             d->offset = new_start;
772             d->bytes = new_end - new_start;
773             goto found;
774         }
775     }
776 
777     d = g_malloc(sizeof(*d));
778     *d = (Qcow2DiscardRegion) {
779         .bs     = bs,
780         .offset = offset,
781         .bytes  = length,
782     };
783     QTAILQ_INSERT_TAIL(&s->discards, d, next);
784 
785 found:
786     /* Merge discard requests if they are adjacent now */
787     QTAILQ_FOREACH_SAFE(p, &s->discards, next, next) {
788         if (p == d
789             || p->offset > d->offset + d->bytes
790             || d->offset > p->offset + p->bytes)
791         {
792             continue;
793         }
794 
795         /* Still no overlap possible */
796         assert(p->offset == d->offset + d->bytes
797             || d->offset == p->offset + p->bytes);
798 
799         QTAILQ_REMOVE(&s->discards, p, next);
800         d->offset = MIN(d->offset, p->offset);
801         d->bytes += p->bytes;
802         g_free(p);
803     }
804 }
805 
806 /* XXX: cache several refcount block clusters ? */
807 /* @addend is the absolute value of the addend; if @decrease is set, @addend
808  * will be subtracted from the current refcount, otherwise it will be added */
809 static int update_refcount(BlockDriverState *bs,
810                            int64_t offset,
811                            int64_t length,
812                            uint64_t addend,
813                            bool decrease,
814                            enum qcow2_discard_type type)
815 {
816     BDRVQcow2State *s = bs->opaque;
817     int64_t start, last, cluster_offset;
818     void *refcount_block = NULL;
819     int64_t old_table_index = -1;
820     int ret;
821 
822 #ifdef DEBUG_ALLOC2
823     fprintf(stderr, "update_refcount: offset=%" PRId64 " size=%" PRId64
824             " addend=%s%" PRIu64 "\n", offset, length, decrease ? "-" : "",
825             addend);
826 #endif
827     if (length < 0) {
828         return -EINVAL;
829     } else if (length == 0) {
830         return 0;
831     }
832 
833     if (decrease) {
834         qcow2_cache_set_dependency(bs, s->refcount_block_cache,
835             s->l2_table_cache);
836     }
837 
838     start = start_of_cluster(s, offset);
839     last = start_of_cluster(s, offset + length - 1);
840     for(cluster_offset = start; cluster_offset <= last;
841         cluster_offset += s->cluster_size)
842     {
843         int block_index;
844         uint64_t refcount;
845         int64_t cluster_index = cluster_offset >> s->cluster_bits;
846         int64_t table_index = cluster_index >> s->refcount_block_bits;
847 
848         /* Load the refcount block and allocate it if needed */
849         if (table_index != old_table_index) {
850             if (refcount_block) {
851                 qcow2_cache_put(s->refcount_block_cache, &refcount_block);
852             }
853             ret = alloc_refcount_block(bs, cluster_index, &refcount_block);
854             /* If the caller needs to restart the search for free clusters,
855              * try the same ones first to see if they're still free. */
856             if (ret == -EAGAIN) {
857                 if (s->free_cluster_index > (start >> s->cluster_bits)) {
858                     s->free_cluster_index = (start >> s->cluster_bits);
859                 }
860             }
861             if (ret < 0) {
862                 goto fail;
863             }
864         }
865         old_table_index = table_index;
866 
867         qcow2_cache_entry_mark_dirty(s->refcount_block_cache, refcount_block);
868 
869         /* we can update the count and save it */
870         block_index = cluster_index & (s->refcount_block_size - 1);
871 
872         refcount = s->get_refcount(refcount_block, block_index);
873         if (decrease ? (refcount - addend > refcount)
874                      : (refcount + addend < refcount ||
875                         refcount + addend > s->refcount_max))
876         {
877             ret = -EINVAL;
878             goto fail;
879         }
880         if (decrease) {
881             refcount -= addend;
882         } else {
883             refcount += addend;
884         }
885         if (refcount == 0 && cluster_index < s->free_cluster_index) {
886             s->free_cluster_index = cluster_index;
887         }
888         s->set_refcount(refcount_block, block_index, refcount);
889 
890         if (refcount == 0) {
891             void *table;
892 
893             table = qcow2_cache_is_table_offset(s->refcount_block_cache,
894                                                 offset);
895             if (table != NULL) {
896                 qcow2_cache_put(s->refcount_block_cache, &refcount_block);
897                 old_table_index = -1;
898                 qcow2_cache_discard(s->refcount_block_cache, table);
899             }
900 
901             table = qcow2_cache_is_table_offset(s->l2_table_cache, offset);
902             if (table != NULL) {
903                 qcow2_cache_discard(s->l2_table_cache, table);
904             }
905 
906             if (s->discard_passthrough[type]) {
907                 update_refcount_discard(bs, cluster_offset, s->cluster_size);
908             }
909         }
910     }
911 
912     ret = 0;
913 fail:
914     if (!s->cache_discards) {
915         qcow2_process_discards(bs, ret);
916     }
917 
918     /* Write last changed block to disk */
919     if (refcount_block) {
920         qcow2_cache_put(s->refcount_block_cache, &refcount_block);
921     }
922 
923     /*
924      * Try do undo any updates if an error is returned (This may succeed in
925      * some cases like ENOSPC for allocating a new refcount block)
926      */
927     if (ret < 0) {
928         int dummy;
929         dummy = update_refcount(bs, offset, cluster_offset - offset, addend,
930                                 !decrease, QCOW2_DISCARD_NEVER);
931         (void)dummy;
932     }
933 
934     return ret;
935 }
936 
937 /*
938  * Increases or decreases the refcount of a given cluster.
939  *
940  * @addend is the absolute value of the addend; if @decrease is set, @addend
941  * will be subtracted from the current refcount, otherwise it will be added.
942  *
943  * On success 0 is returned; on failure -errno is returned.
944  */
945 int qcow2_update_cluster_refcount(BlockDriverState *bs,
946                                   int64_t cluster_index,
947                                   uint64_t addend, bool decrease,
948                                   enum qcow2_discard_type type)
949 {
950     BDRVQcow2State *s = bs->opaque;
951     int ret;
952 
953     ret = update_refcount(bs, cluster_index << s->cluster_bits, 1, addend,
954                           decrease, type);
955     if (ret < 0) {
956         return ret;
957     }
958 
959     return 0;
960 }
961 
962 
963 
964 /*********************************************************/
965 /* cluster allocation functions */
966 
967 
968 
969 /* return < 0 if error */
970 static int64_t alloc_clusters_noref(BlockDriverState *bs, uint64_t size,
971                                     uint64_t max)
972 {
973     BDRVQcow2State *s = bs->opaque;
974     uint64_t i, nb_clusters, refcount;
975     int ret;
976 
977     /* We can't allocate clusters if they may still be queued for discard. */
978     if (s->cache_discards) {
979         qcow2_process_discards(bs, 0);
980     }
981 
982     nb_clusters = size_to_clusters(s, size);
983 retry:
984     for(i = 0; i < nb_clusters; i++) {
985         uint64_t next_cluster_index = s->free_cluster_index++;
986         ret = qcow2_get_refcount(bs, next_cluster_index, &refcount);
987 
988         if (ret < 0) {
989             return ret;
990         } else if (refcount != 0) {
991             goto retry;
992         }
993     }
994 
995     /* Make sure that all offsets in the "allocated" range are representable
996      * in the requested max */
997     if (s->free_cluster_index > 0 &&
998         s->free_cluster_index - 1 > (max >> s->cluster_bits))
999     {
1000         return -EFBIG;
1001     }
1002 
1003 #ifdef DEBUG_ALLOC2
1004     fprintf(stderr, "alloc_clusters: size=%" PRId64 " -> %" PRId64 "\n",
1005             size,
1006             (s->free_cluster_index - nb_clusters) << s->cluster_bits);
1007 #endif
1008     return (s->free_cluster_index - nb_clusters) << s->cluster_bits;
1009 }
1010 
1011 int64_t qcow2_alloc_clusters(BlockDriverState *bs, uint64_t size)
1012 {
1013     int64_t offset;
1014     int ret;
1015 
1016     BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC);
1017     do {
1018         offset = alloc_clusters_noref(bs, size, QCOW_MAX_CLUSTER_OFFSET);
1019         if (offset < 0) {
1020             return offset;
1021         }
1022 
1023         ret = update_refcount(bs, offset, size, 1, false, QCOW2_DISCARD_NEVER);
1024     } while (ret == -EAGAIN);
1025 
1026     if (ret < 0) {
1027         return ret;
1028     }
1029 
1030     return offset;
1031 }
1032 
1033 int64_t coroutine_fn qcow2_alloc_clusters_at(BlockDriverState *bs, uint64_t offset,
1034                                              int64_t nb_clusters)
1035 {
1036     BDRVQcow2State *s = bs->opaque;
1037     uint64_t cluster_index, refcount;
1038     uint64_t i;
1039     int ret;
1040 
1041     assert(nb_clusters >= 0);
1042     if (nb_clusters == 0) {
1043         return 0;
1044     }
1045 
1046     do {
1047         /* Check how many clusters there are free */
1048         cluster_index = offset >> s->cluster_bits;
1049         for(i = 0; i < nb_clusters; i++) {
1050             ret = qcow2_get_refcount(bs, cluster_index++, &refcount);
1051             if (ret < 0) {
1052                 return ret;
1053             } else if (refcount != 0) {
1054                 break;
1055             }
1056         }
1057 
1058         /* And then allocate them */
1059         ret = update_refcount(bs, offset, i << s->cluster_bits, 1, false,
1060                               QCOW2_DISCARD_NEVER);
1061     } while (ret == -EAGAIN);
1062 
1063     if (ret < 0) {
1064         return ret;
1065     }
1066 
1067     return i;
1068 }
1069 
1070 /* only used to allocate compressed sectors. We try to allocate
1071    contiguous sectors. size must be <= cluster_size */
1072 int64_t coroutine_fn GRAPH_RDLOCK qcow2_alloc_bytes(BlockDriverState *bs, int size)
1073 {
1074     BDRVQcow2State *s = bs->opaque;
1075     int64_t offset;
1076     size_t free_in_cluster;
1077     int ret;
1078 
1079     BLKDBG_CO_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC_BYTES);
1080     assert(size > 0 && size <= s->cluster_size);
1081     assert(!s->free_byte_offset || offset_into_cluster(s, s->free_byte_offset));
1082 
1083     offset = s->free_byte_offset;
1084 
1085     if (offset) {
1086         uint64_t refcount;
1087         ret = qcow2_get_refcount(bs, offset >> s->cluster_bits, &refcount);
1088         if (ret < 0) {
1089             return ret;
1090         }
1091 
1092         if (refcount == s->refcount_max) {
1093             offset = 0;
1094         }
1095     }
1096 
1097     free_in_cluster = s->cluster_size - offset_into_cluster(s, offset);
1098     do {
1099         if (!offset || free_in_cluster < size) {
1100             int64_t new_cluster;
1101 
1102             new_cluster = alloc_clusters_noref(bs, s->cluster_size,
1103                                                MIN(s->cluster_offset_mask,
1104                                                    QCOW_MAX_CLUSTER_OFFSET));
1105             if (new_cluster < 0) {
1106                 return new_cluster;
1107             }
1108 
1109             if (new_cluster == 0) {
1110                 qcow2_signal_corruption(bs, true, -1, -1, "Preventing invalid "
1111                                         "allocation of compressed cluster "
1112                                         "at offset 0");
1113                 return -EIO;
1114             }
1115 
1116             if (!offset || ROUND_UP(offset, s->cluster_size) != new_cluster) {
1117                 offset = new_cluster;
1118                 free_in_cluster = s->cluster_size;
1119             } else {
1120                 free_in_cluster += s->cluster_size;
1121             }
1122         }
1123 
1124         assert(offset);
1125         ret = update_refcount(bs, offset, size, 1, false, QCOW2_DISCARD_NEVER);
1126         if (ret < 0) {
1127             offset = 0;
1128         }
1129     } while (ret == -EAGAIN);
1130     if (ret < 0) {
1131         return ret;
1132     }
1133 
1134     /* The cluster refcount was incremented; refcount blocks must be flushed
1135      * before the caller's L2 table updates. */
1136     qcow2_cache_set_dependency(bs, s->l2_table_cache, s->refcount_block_cache);
1137 
1138     s->free_byte_offset = offset + size;
1139     if (!offset_into_cluster(s, s->free_byte_offset)) {
1140         s->free_byte_offset = 0;
1141     }
1142 
1143     return offset;
1144 }
1145 
1146 void qcow2_free_clusters(BlockDriverState *bs,
1147                           int64_t offset, int64_t size,
1148                           enum qcow2_discard_type type)
1149 {
1150     int ret;
1151 
1152     BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_FREE);
1153     ret = update_refcount(bs, offset, size, 1, true, type);
1154     if (ret < 0) {
1155         fprintf(stderr, "qcow2_free_clusters failed: %s\n", strerror(-ret));
1156         /* TODO Remember the clusters to free them later and avoid leaking */
1157     }
1158 }
1159 
1160 /*
1161  * Free a cluster using its L2 entry (handles clusters of all types, e.g.
1162  * normal cluster, compressed cluster, etc.)
1163  */
1164 void qcow2_free_any_cluster(BlockDriverState *bs, uint64_t l2_entry,
1165                             enum qcow2_discard_type type)
1166 {
1167     BDRVQcow2State *s = bs->opaque;
1168     QCow2ClusterType ctype = qcow2_get_cluster_type(bs, l2_entry);
1169 
1170     if (has_data_file(bs)) {
1171         if (s->discard_passthrough[type] &&
1172             (ctype == QCOW2_CLUSTER_NORMAL ||
1173              ctype == QCOW2_CLUSTER_ZERO_ALLOC))
1174         {
1175             bdrv_pdiscard(s->data_file, l2_entry & L2E_OFFSET_MASK,
1176                           s->cluster_size);
1177         }
1178         return;
1179     }
1180 
1181     switch (ctype) {
1182     case QCOW2_CLUSTER_COMPRESSED:
1183         {
1184             uint64_t coffset;
1185             int csize;
1186 
1187             qcow2_parse_compressed_l2_entry(bs, l2_entry, &coffset, &csize);
1188             qcow2_free_clusters(bs, coffset, csize, type);
1189         }
1190         break;
1191     case QCOW2_CLUSTER_NORMAL:
1192     case QCOW2_CLUSTER_ZERO_ALLOC:
1193         if (offset_into_cluster(s, l2_entry & L2E_OFFSET_MASK)) {
1194             qcow2_signal_corruption(bs, false, -1, -1,
1195                                     "Cannot free unaligned cluster %#llx",
1196                                     l2_entry & L2E_OFFSET_MASK);
1197         } else {
1198             qcow2_free_clusters(bs, l2_entry & L2E_OFFSET_MASK,
1199                                 s->cluster_size, type);
1200         }
1201         break;
1202     case QCOW2_CLUSTER_ZERO_PLAIN:
1203     case QCOW2_CLUSTER_UNALLOCATED:
1204         break;
1205     default:
1206         abort();
1207     }
1208 }
1209 
1210 int qcow2_write_caches(BlockDriverState *bs)
1211 {
1212     BDRVQcow2State *s = bs->opaque;
1213     int ret;
1214 
1215     ret = qcow2_cache_write(bs, s->l2_table_cache);
1216     if (ret < 0) {
1217         return ret;
1218     }
1219 
1220     if (qcow2_need_accurate_refcounts(s)) {
1221         ret = qcow2_cache_write(bs, s->refcount_block_cache);
1222         if (ret < 0) {
1223             return ret;
1224         }
1225     }
1226 
1227     return 0;
1228 }
1229 
1230 int qcow2_flush_caches(BlockDriverState *bs)
1231 {
1232     int ret = qcow2_write_caches(bs);
1233     if (ret < 0) {
1234         return ret;
1235     }
1236 
1237     return bdrv_flush(bs->file->bs);
1238 }
1239 
1240 /*********************************************************/
1241 /* snapshots and image creation */
1242 
1243 
1244 
1245 /* update the refcounts of snapshots and the copied flag */
1246 int qcow2_update_snapshot_refcount(BlockDriverState *bs,
1247     int64_t l1_table_offset, int l1_size, int addend)
1248 {
1249     BDRVQcow2State *s = bs->opaque;
1250     uint64_t *l1_table, *l2_slice, l2_offset, entry, l1_size2, refcount;
1251     bool l1_allocated = false;
1252     int64_t old_entry, old_l2_offset;
1253     unsigned slice, slice_size2, n_slices;
1254     int i, j, l1_modified = 0;
1255     int ret;
1256 
1257     assert(addend >= -1 && addend <= 1);
1258 
1259     l2_slice = NULL;
1260     l1_table = NULL;
1261     l1_size2 = l1_size * L1E_SIZE;
1262     slice_size2 = s->l2_slice_size * l2_entry_size(s);
1263     n_slices = s->cluster_size / slice_size2;
1264 
1265     s->cache_discards = true;
1266 
1267     /* WARNING: qcow2_snapshot_goto relies on this function not using the
1268      * l1_table_offset when it is the current s->l1_table_offset! Be careful
1269      * when changing this! */
1270     if (l1_table_offset != s->l1_table_offset) {
1271         l1_table = g_try_malloc0(l1_size2);
1272         if (l1_size2 && l1_table == NULL) {
1273             ret = -ENOMEM;
1274             goto fail;
1275         }
1276         l1_allocated = true;
1277 
1278         ret = bdrv_pread(bs->file, l1_table_offset, l1_size2, l1_table, 0);
1279         if (ret < 0) {
1280             goto fail;
1281         }
1282 
1283         for (i = 0; i < l1_size; i++) {
1284             be64_to_cpus(&l1_table[i]);
1285         }
1286     } else {
1287         assert(l1_size == s->l1_size);
1288         l1_table = s->l1_table;
1289         l1_allocated = false;
1290     }
1291 
1292     for (i = 0; i < l1_size; i++) {
1293         l2_offset = l1_table[i];
1294         if (l2_offset) {
1295             old_l2_offset = l2_offset;
1296             l2_offset &= L1E_OFFSET_MASK;
1297 
1298             if (offset_into_cluster(s, l2_offset)) {
1299                 qcow2_signal_corruption(bs, true, -1, -1, "L2 table offset %#"
1300                                         PRIx64 " unaligned (L1 index: %#x)",
1301                                         l2_offset, i);
1302                 ret = -EIO;
1303                 goto fail;
1304             }
1305 
1306             for (slice = 0; slice < n_slices; slice++) {
1307                 ret = qcow2_cache_get(bs, s->l2_table_cache,
1308                                       l2_offset + slice * slice_size2,
1309                                       (void **) &l2_slice);
1310                 if (ret < 0) {
1311                     goto fail;
1312                 }
1313 
1314                 for (j = 0; j < s->l2_slice_size; j++) {
1315                     uint64_t cluster_index;
1316                     uint64_t offset;
1317 
1318                     entry = get_l2_entry(s, l2_slice, j);
1319                     old_entry = entry;
1320                     entry &= ~QCOW_OFLAG_COPIED;
1321                     offset = entry & L2E_OFFSET_MASK;
1322 
1323                     switch (qcow2_get_cluster_type(bs, entry)) {
1324                     case QCOW2_CLUSTER_COMPRESSED:
1325                         if (addend != 0) {
1326                             uint64_t coffset;
1327                             int csize;
1328 
1329                             qcow2_parse_compressed_l2_entry(bs, entry,
1330                                                             &coffset, &csize);
1331                             ret = update_refcount(
1332                                 bs, coffset, csize,
1333                                 abs(addend), addend < 0,
1334                                 QCOW2_DISCARD_SNAPSHOT);
1335                             if (ret < 0) {
1336                                 goto fail;
1337                             }
1338                         }
1339                         /* compressed clusters are never modified */
1340                         refcount = 2;
1341                         break;
1342 
1343                     case QCOW2_CLUSTER_NORMAL:
1344                     case QCOW2_CLUSTER_ZERO_ALLOC:
1345                         if (offset_into_cluster(s, offset)) {
1346                             /* Here l2_index means table (not slice) index */
1347                             int l2_index = slice * s->l2_slice_size + j;
1348                             qcow2_signal_corruption(
1349                                 bs, true, -1, -1, "Cluster "
1350                                 "allocation offset %#" PRIx64
1351                                 " unaligned (L2 offset: %#"
1352                                 PRIx64 ", L2 index: %#x)",
1353                                 offset, l2_offset, l2_index);
1354                             ret = -EIO;
1355                             goto fail;
1356                         }
1357 
1358                         cluster_index = offset >> s->cluster_bits;
1359                         assert(cluster_index);
1360                         if (addend != 0) {
1361                             ret = qcow2_update_cluster_refcount(
1362                                 bs, cluster_index, abs(addend), addend < 0,
1363                                 QCOW2_DISCARD_SNAPSHOT);
1364                             if (ret < 0) {
1365                                 goto fail;
1366                             }
1367                         }
1368 
1369                         ret = qcow2_get_refcount(bs, cluster_index, &refcount);
1370                         if (ret < 0) {
1371                             goto fail;
1372                         }
1373                         break;
1374 
1375                     case QCOW2_CLUSTER_ZERO_PLAIN:
1376                     case QCOW2_CLUSTER_UNALLOCATED:
1377                         refcount = 0;
1378                         break;
1379 
1380                     default:
1381                         abort();
1382                     }
1383 
1384                     if (refcount == 1) {
1385                         entry |= QCOW_OFLAG_COPIED;
1386                     }
1387                     if (entry != old_entry) {
1388                         if (addend > 0) {
1389                             qcow2_cache_set_dependency(bs, s->l2_table_cache,
1390                                                        s->refcount_block_cache);
1391                         }
1392                         set_l2_entry(s, l2_slice, j, entry);
1393                         qcow2_cache_entry_mark_dirty(s->l2_table_cache,
1394                                                      l2_slice);
1395                     }
1396                 }
1397 
1398                 qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
1399             }
1400 
1401             if (addend != 0) {
1402                 ret = qcow2_update_cluster_refcount(bs, l2_offset >>
1403                                                         s->cluster_bits,
1404                                                     abs(addend), addend < 0,
1405                                                     QCOW2_DISCARD_SNAPSHOT);
1406                 if (ret < 0) {
1407                     goto fail;
1408                 }
1409             }
1410             ret = qcow2_get_refcount(bs, l2_offset >> s->cluster_bits,
1411                                      &refcount);
1412             if (ret < 0) {
1413                 goto fail;
1414             } else if (refcount == 1) {
1415                 l2_offset |= QCOW_OFLAG_COPIED;
1416             }
1417             if (l2_offset != old_l2_offset) {
1418                 l1_table[i] = l2_offset;
1419                 l1_modified = 1;
1420             }
1421         }
1422     }
1423 
1424     ret = bdrv_flush(bs);
1425 fail:
1426     if (l2_slice) {
1427         qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
1428     }
1429 
1430     s->cache_discards = false;
1431     qcow2_process_discards(bs, ret);
1432 
1433     /* Update L1 only if it isn't deleted anyway (addend = -1) */
1434     if (ret == 0 && addend >= 0 && l1_modified) {
1435         for (i = 0; i < l1_size; i++) {
1436             cpu_to_be64s(&l1_table[i]);
1437         }
1438 
1439         ret = bdrv_pwrite_sync(bs->file, l1_table_offset, l1_size2, l1_table,
1440                                0);
1441 
1442         for (i = 0; i < l1_size; i++) {
1443             be64_to_cpus(&l1_table[i]);
1444         }
1445     }
1446     if (l1_allocated)
1447         g_free(l1_table);
1448     return ret;
1449 }
1450 
1451 
1452 
1453 
1454 /*********************************************************/
1455 /* refcount checking functions */
1456 
1457 
1458 static uint64_t refcount_array_byte_size(BDRVQcow2State *s, uint64_t entries)
1459 {
1460     /* This assertion holds because there is no way we can address more than
1461      * 2^(64 - 9) clusters at once (with cluster size 512 = 2^9, and because
1462      * offsets have to be representable in bytes); due to every cluster
1463      * corresponding to one refcount entry, we are well below that limit */
1464     assert(entries < (UINT64_C(1) << (64 - 9)));
1465 
1466     /* Thanks to the assertion this will not overflow, because
1467      * s->refcount_order < 7.
1468      * (note: x << s->refcount_order == x * s->refcount_bits) */
1469     return DIV_ROUND_UP(entries << s->refcount_order, 8);
1470 }
1471 
1472 /**
1473  * Reallocates *array so that it can hold new_size entries. *size must contain
1474  * the current number of entries in *array. If the reallocation fails, *array
1475  * and *size will not be modified and -errno will be returned. If the
1476  * reallocation is successful, *array will be set to the new buffer, *size
1477  * will be set to new_size and 0 will be returned. The size of the reallocated
1478  * refcount array buffer will be aligned to a cluster boundary, and the newly
1479  * allocated area will be zeroed.
1480  */
1481 static int realloc_refcount_array(BDRVQcow2State *s, void **array,
1482                                   int64_t *size, int64_t new_size)
1483 {
1484     int64_t old_byte_size, new_byte_size;
1485     void *new_ptr;
1486 
1487     /* Round to clusters so the array can be directly written to disk */
1488     old_byte_size = size_to_clusters(s, refcount_array_byte_size(s, *size))
1489                     * s->cluster_size;
1490     new_byte_size = size_to_clusters(s, refcount_array_byte_size(s, new_size))
1491                     * s->cluster_size;
1492 
1493     if (new_byte_size == old_byte_size) {
1494         *size = new_size;
1495         return 0;
1496     }
1497 
1498     assert(new_byte_size > 0);
1499 
1500     if (new_byte_size > SIZE_MAX) {
1501         return -ENOMEM;
1502     }
1503 
1504     new_ptr = g_try_realloc(*array, new_byte_size);
1505     if (!new_ptr) {
1506         return -ENOMEM;
1507     }
1508 
1509     if (new_byte_size > old_byte_size) {
1510         memset((char *)new_ptr + old_byte_size, 0,
1511                new_byte_size - old_byte_size);
1512     }
1513 
1514     *array = new_ptr;
1515     *size  = new_size;
1516 
1517     return 0;
1518 }
1519 
1520 /*
1521  * Increases the refcount for a range of clusters in a given refcount table.
1522  * This is used to construct a temporary refcount table out of L1 and L2 tables
1523  * which can be compared to the refcount table saved in the image.
1524  *
1525  * Modifies the number of errors in res.
1526  */
1527 int coroutine_fn GRAPH_RDLOCK
1528 qcow2_inc_refcounts_imrt(BlockDriverState *bs, BdrvCheckResult *res,
1529                          void **refcount_table,
1530                          int64_t *refcount_table_size,
1531                          int64_t offset, int64_t size)
1532 {
1533     BDRVQcow2State *s = bs->opaque;
1534     uint64_t start, last, cluster_offset, k, refcount;
1535     int64_t file_len;
1536     int ret;
1537 
1538     if (size <= 0) {
1539         return 0;
1540     }
1541 
1542     file_len = bdrv_co_getlength(bs->file->bs);
1543     if (file_len < 0) {
1544         return file_len;
1545     }
1546 
1547     /*
1548      * Last cluster of qcow2 image may be semi-allocated, so it may be OK to
1549      * reference some space after file end but it should be less than one
1550      * cluster.
1551      */
1552     if (offset + size - file_len >= s->cluster_size) {
1553         fprintf(stderr, "ERROR: counting reference for region exceeding the "
1554                 "end of the file by one cluster or more: offset 0x%" PRIx64
1555                 " size 0x%" PRIx64 "\n", offset, size);
1556         res->corruptions++;
1557         return 0;
1558     }
1559 
1560     start = start_of_cluster(s, offset);
1561     last = start_of_cluster(s, offset + size - 1);
1562     for(cluster_offset = start; cluster_offset <= last;
1563         cluster_offset += s->cluster_size) {
1564         k = cluster_offset >> s->cluster_bits;
1565         if (k >= *refcount_table_size) {
1566             ret = realloc_refcount_array(s, refcount_table,
1567                                          refcount_table_size, k + 1);
1568             if (ret < 0) {
1569                 res->check_errors++;
1570                 return ret;
1571             }
1572         }
1573 
1574         refcount = s->get_refcount(*refcount_table, k);
1575         if (refcount == s->refcount_max) {
1576             fprintf(stderr, "ERROR: overflow cluster offset=0x%" PRIx64
1577                     "\n", cluster_offset);
1578             fprintf(stderr, "Use qemu-img amend to increase the refcount entry "
1579                     "width or qemu-img convert to create a clean copy if the "
1580                     "image cannot be opened for writing\n");
1581             res->corruptions++;
1582             continue;
1583         }
1584         s->set_refcount(*refcount_table, k, refcount + 1);
1585     }
1586 
1587     return 0;
1588 }
1589 
1590 /* Flags for check_refcounts_l1() and check_refcounts_l2() */
1591 enum {
1592     CHECK_FRAG_INFO = 0x2,      /* update BlockFragInfo counters */
1593 };
1594 
1595 /*
1596  * Fix L2 entry by making it QCOW2_CLUSTER_ZERO_PLAIN (or making all its present
1597  * subclusters QCOW2_SUBCLUSTER_ZERO_PLAIN).
1598  *
1599  * This function decrements res->corruptions on success, so the caller is
1600  * responsible to increment res->corruptions prior to the call.
1601  *
1602  * On failure in-memory @l2_table may be modified.
1603  */
1604 static int coroutine_fn GRAPH_RDLOCK
1605 fix_l2_entry_by_zero(BlockDriverState *bs, BdrvCheckResult *res,
1606                      uint64_t l2_offset, uint64_t *l2_table,
1607                      int l2_index, bool active,
1608                      bool *metadata_overlap)
1609 {
1610     BDRVQcow2State *s = bs->opaque;
1611     int ret;
1612     int idx = l2_index * (l2_entry_size(s) / sizeof(uint64_t));
1613     uint64_t l2e_offset = l2_offset + (uint64_t)l2_index * l2_entry_size(s);
1614     int ign = active ? QCOW2_OL_ACTIVE_L2 : QCOW2_OL_INACTIVE_L2;
1615 
1616     if (has_subclusters(s)) {
1617         uint64_t l2_bitmap = get_l2_bitmap(s, l2_table, l2_index);
1618 
1619         /* Allocated subclusters become zero */
1620         l2_bitmap |= l2_bitmap << 32;
1621         l2_bitmap &= QCOW_L2_BITMAP_ALL_ZEROES;
1622 
1623         set_l2_bitmap(s, l2_table, l2_index, l2_bitmap);
1624         set_l2_entry(s, l2_table, l2_index, 0);
1625     } else {
1626         set_l2_entry(s, l2_table, l2_index, QCOW_OFLAG_ZERO);
1627     }
1628 
1629     ret = qcow2_pre_write_overlap_check(bs, ign, l2e_offset, l2_entry_size(s),
1630                                         false);
1631     if (metadata_overlap) {
1632         *metadata_overlap = ret < 0;
1633     }
1634     if (ret < 0) {
1635         fprintf(stderr, "ERROR: Overlap check failed\n");
1636         goto fail;
1637     }
1638 
1639     ret = bdrv_co_pwrite_sync(bs->file, l2e_offset, l2_entry_size(s),
1640                               &l2_table[idx], 0);
1641     if (ret < 0) {
1642         fprintf(stderr, "ERROR: Failed to overwrite L2 "
1643                 "table entry: %s\n", strerror(-ret));
1644         goto fail;
1645     }
1646 
1647     res->corruptions--;
1648     res->corruptions_fixed++;
1649     return 0;
1650 
1651 fail:
1652     res->check_errors++;
1653     return ret;
1654 }
1655 
1656 /*
1657  * Increases the refcount in the given refcount table for the all clusters
1658  * referenced in the L2 table. While doing so, performs some checks on L2
1659  * entries.
1660  *
1661  * Returns the number of errors found by the checks or -errno if an internal
1662  * error occurred.
1663  */
1664 static int coroutine_fn GRAPH_RDLOCK
1665 check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res,
1666                    void **refcount_table,
1667                    int64_t *refcount_table_size, int64_t l2_offset,
1668                    int flags, BdrvCheckMode fix, bool active)
1669 {
1670     BDRVQcow2State *s = bs->opaque;
1671     uint64_t l2_entry, l2_bitmap;
1672     uint64_t next_contiguous_offset = 0;
1673     int i, ret;
1674     size_t l2_size_bytes = s->l2_size * l2_entry_size(s);
1675     g_autofree uint64_t *l2_table = g_malloc(l2_size_bytes);
1676     bool metadata_overlap;
1677 
1678     /* Read L2 table from disk */
1679     ret = bdrv_co_pread(bs->file, l2_offset, l2_size_bytes, l2_table, 0);
1680     if (ret < 0) {
1681         fprintf(stderr, "ERROR: I/O error in check_refcounts_l2\n");
1682         res->check_errors++;
1683         return ret;
1684     }
1685 
1686     /* Do the actual checks */
1687     for (i = 0; i < s->l2_size; i++) {
1688         uint64_t coffset;
1689         int csize;
1690         QCow2ClusterType type;
1691 
1692         l2_entry = get_l2_entry(s, l2_table, i);
1693         l2_bitmap = get_l2_bitmap(s, l2_table, i);
1694         type = qcow2_get_cluster_type(bs, l2_entry);
1695 
1696         if (type != QCOW2_CLUSTER_COMPRESSED) {
1697             /* Check reserved bits of Standard Cluster Descriptor */
1698             if (l2_entry & L2E_STD_RESERVED_MASK) {
1699                 fprintf(stderr, "ERROR found l2 entry with reserved bits set: "
1700                         "%" PRIx64 "\n", l2_entry);
1701                 res->corruptions++;
1702             }
1703         }
1704 
1705         switch (type) {
1706         case QCOW2_CLUSTER_COMPRESSED:
1707             /* Compressed clusters don't have QCOW_OFLAG_COPIED */
1708             if (l2_entry & QCOW_OFLAG_COPIED) {
1709                 fprintf(stderr, "ERROR: coffset=0x%" PRIx64 ": "
1710                     "copied flag must never be set for compressed "
1711                     "clusters\n", l2_entry & s->cluster_offset_mask);
1712                 l2_entry &= ~QCOW_OFLAG_COPIED;
1713                 res->corruptions++;
1714             }
1715 
1716             if (has_data_file(bs)) {
1717                 fprintf(stderr, "ERROR compressed cluster %d with data file, "
1718                         "entry=0x%" PRIx64 "\n", i, l2_entry);
1719                 res->corruptions++;
1720                 break;
1721             }
1722 
1723             if (l2_bitmap) {
1724                 fprintf(stderr, "ERROR compressed cluster %d with non-zero "
1725                         "subcluster allocation bitmap, entry=0x%" PRIx64 "\n",
1726                         i, l2_entry);
1727                 res->corruptions++;
1728                 break;
1729             }
1730 
1731             /* Mark cluster as used */
1732             qcow2_parse_compressed_l2_entry(bs, l2_entry, &coffset, &csize);
1733             ret = qcow2_inc_refcounts_imrt(
1734                 bs, res, refcount_table, refcount_table_size, coffset, csize);
1735             if (ret < 0) {
1736                 return ret;
1737             }
1738 
1739             if (flags & CHECK_FRAG_INFO) {
1740                 res->bfi.allocated_clusters++;
1741                 res->bfi.compressed_clusters++;
1742 
1743                 /*
1744                  * Compressed clusters are fragmented by nature.  Since they
1745                  * take up sub-sector space but we only have sector granularity
1746                  * I/O we need to re-read the same sectors even for adjacent
1747                  * compressed clusters.
1748                  */
1749                 res->bfi.fragmented_clusters++;
1750             }
1751             break;
1752 
1753         case QCOW2_CLUSTER_ZERO_ALLOC:
1754         case QCOW2_CLUSTER_NORMAL:
1755         {
1756             uint64_t offset = l2_entry & L2E_OFFSET_MASK;
1757 
1758             if ((l2_bitmap >> 32) & l2_bitmap) {
1759                 res->corruptions++;
1760                 fprintf(stderr, "ERROR offset=%" PRIx64 ": Allocated "
1761                         "cluster has corrupted subcluster allocation bitmap\n",
1762                         offset);
1763             }
1764 
1765             /* Correct offsets are cluster aligned */
1766             if (offset_into_cluster(s, offset)) {
1767                 bool contains_data;
1768                 res->corruptions++;
1769 
1770                 if (has_subclusters(s)) {
1771                     contains_data = (l2_bitmap & QCOW_L2_BITMAP_ALL_ALLOC);
1772                 } else {
1773                     contains_data = !(l2_entry & QCOW_OFLAG_ZERO);
1774                 }
1775 
1776                 if (!contains_data) {
1777                     fprintf(stderr, "%s offset=%" PRIx64 ": Preallocated "
1778                             "cluster is not properly aligned; L2 entry "
1779                             "corrupted.\n",
1780                             fix & BDRV_FIX_ERRORS ? "Repairing" : "ERROR",
1781                             offset);
1782                     if (fix & BDRV_FIX_ERRORS) {
1783                         ret = fix_l2_entry_by_zero(bs, res, l2_offset,
1784                                                    l2_table, i, active,
1785                                                    &metadata_overlap);
1786                         if (metadata_overlap) {
1787                             /*
1788                              * Something is seriously wrong, so abort checking
1789                              * this L2 table.
1790                              */
1791                             return ret;
1792                         }
1793 
1794                         if (ret == 0) {
1795                             /*
1796                              * Skip marking the cluster as used
1797                              * (it is unused now).
1798                              */
1799                             continue;
1800                         }
1801 
1802                         /*
1803                          * Failed to fix.
1804                          * Do not abort, continue checking the rest of this
1805                          * L2 table's entries.
1806                          */
1807                     }
1808                 } else {
1809                     fprintf(stderr, "ERROR offset=%" PRIx64 ": Data cluster is "
1810                         "not properly aligned; L2 entry corrupted.\n", offset);
1811                 }
1812             }
1813 
1814             if (flags & CHECK_FRAG_INFO) {
1815                 res->bfi.allocated_clusters++;
1816                 if (next_contiguous_offset &&
1817                     offset != next_contiguous_offset) {
1818                     res->bfi.fragmented_clusters++;
1819                 }
1820                 next_contiguous_offset = offset + s->cluster_size;
1821             }
1822 
1823             /* Mark cluster as used */
1824             if (!has_data_file(bs)) {
1825                 ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table,
1826                                                refcount_table_size,
1827                                                offset, s->cluster_size);
1828                 if (ret < 0) {
1829                     return ret;
1830                 }
1831             }
1832             break;
1833         }
1834 
1835         case QCOW2_CLUSTER_ZERO_PLAIN:
1836             /* Impossible when image has subclusters */
1837             assert(!l2_bitmap);
1838             break;
1839 
1840         case QCOW2_CLUSTER_UNALLOCATED:
1841             if (l2_bitmap & QCOW_L2_BITMAP_ALL_ALLOC) {
1842                 res->corruptions++;
1843                 fprintf(stderr, "ERROR: Unallocated "
1844                         "cluster has non-zero subcluster allocation map\n");
1845             }
1846             break;
1847 
1848         default:
1849             abort();
1850         }
1851     }
1852 
1853     return 0;
1854 }
1855 
1856 /*
1857  * Increases the refcount for the L1 table, its L2 tables and all referenced
1858  * clusters in the given refcount table. While doing so, performs some checks
1859  * on L1 and L2 entries.
1860  *
1861  * Returns the number of errors found by the checks or -errno if an internal
1862  * error occurred.
1863  */
1864 static int coroutine_fn GRAPH_RDLOCK
1865 check_refcounts_l1(BlockDriverState *bs, BdrvCheckResult *res,
1866                    void **refcount_table, int64_t *refcount_table_size,
1867                    int64_t l1_table_offset, int l1_size,
1868                    int flags, BdrvCheckMode fix, bool active)
1869 {
1870     BDRVQcow2State *s = bs->opaque;
1871     size_t l1_size_bytes = l1_size * L1E_SIZE;
1872     g_autofree uint64_t *l1_table = NULL;
1873     uint64_t l2_offset;
1874     int i, ret;
1875 
1876     if (!l1_size) {
1877         return 0;
1878     }
1879 
1880     /* Mark L1 table as used */
1881     ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, refcount_table_size,
1882                                    l1_table_offset, l1_size_bytes);
1883     if (ret < 0) {
1884         return ret;
1885     }
1886 
1887     l1_table = g_try_malloc(l1_size_bytes);
1888     if (l1_table == NULL) {
1889         res->check_errors++;
1890         return -ENOMEM;
1891     }
1892 
1893     /* Read L1 table entries from disk */
1894     ret = bdrv_co_pread(bs->file, l1_table_offset, l1_size_bytes, l1_table, 0);
1895     if (ret < 0) {
1896         fprintf(stderr, "ERROR: I/O error in check_refcounts_l1\n");
1897         res->check_errors++;
1898         return ret;
1899     }
1900 
1901     for (i = 0; i < l1_size; i++) {
1902         be64_to_cpus(&l1_table[i]);
1903     }
1904 
1905     /* Do the actual checks */
1906     for (i = 0; i < l1_size; i++) {
1907         if (!l1_table[i]) {
1908             continue;
1909         }
1910 
1911         if (l1_table[i] & L1E_RESERVED_MASK) {
1912             fprintf(stderr, "ERROR found L1 entry with reserved bits set: "
1913                     "%" PRIx64 "\n", l1_table[i]);
1914             res->corruptions++;
1915         }
1916 
1917         l2_offset = l1_table[i] & L1E_OFFSET_MASK;
1918 
1919         /* Mark L2 table as used */
1920         ret = qcow2_inc_refcounts_imrt(bs, res,
1921                                        refcount_table, refcount_table_size,
1922                                        l2_offset, s->cluster_size);
1923         if (ret < 0) {
1924             return ret;
1925         }
1926 
1927         /* L2 tables are cluster aligned */
1928         if (offset_into_cluster(s, l2_offset)) {
1929             fprintf(stderr, "ERROR l2_offset=%" PRIx64 ": Table is not "
1930                 "cluster aligned; L1 entry corrupted\n", l2_offset);
1931             res->corruptions++;
1932         }
1933 
1934         /* Process and check L2 entries */
1935         ret = check_refcounts_l2(bs, res, refcount_table,
1936                                  refcount_table_size, l2_offset, flags,
1937                                  fix, active);
1938         if (ret < 0) {
1939             return ret;
1940         }
1941     }
1942 
1943     return 0;
1944 }
1945 
1946 /*
1947  * Checks the OFLAG_COPIED flag for all L1 and L2 entries.
1948  *
1949  * This function does not print an error message nor does it increment
1950  * check_errors if qcow2_get_refcount fails (this is because such an error will
1951  * have been already detected and sufficiently signaled by the calling function
1952  * (qcow2_check_refcounts) by the time this function is called).
1953  */
1954 static int coroutine_fn GRAPH_RDLOCK
1955 check_oflag_copied(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix)
1956 {
1957     BDRVQcow2State *s = bs->opaque;
1958     uint64_t *l2_table = qemu_blockalign(bs, s->cluster_size);
1959     int ret;
1960     uint64_t refcount;
1961     int i, j;
1962     bool repair;
1963 
1964     if (fix & BDRV_FIX_ERRORS) {
1965         /* Always repair */
1966         repair = true;
1967     } else if (fix & BDRV_FIX_LEAKS) {
1968         /* Repair only if that seems safe: This function is always
1969          * called after the refcounts have been fixed, so the refcount
1970          * is accurate if that repair was successful */
1971         repair = !res->check_errors && !res->corruptions && !res->leaks;
1972     } else {
1973         repair = false;
1974     }
1975 
1976     for (i = 0; i < s->l1_size; i++) {
1977         uint64_t l1_entry = s->l1_table[i];
1978         uint64_t l2_offset = l1_entry & L1E_OFFSET_MASK;
1979         int l2_dirty = 0;
1980 
1981         if (!l2_offset) {
1982             continue;
1983         }
1984 
1985         ret = qcow2_get_refcount(bs, l2_offset >> s->cluster_bits,
1986                                  &refcount);
1987         if (ret < 0) {
1988             /* don't print message nor increment check_errors */
1989             continue;
1990         }
1991         if ((refcount == 1) != ((l1_entry & QCOW_OFLAG_COPIED) != 0)) {
1992             res->corruptions++;
1993             fprintf(stderr, "%s OFLAG_COPIED L2 cluster: l1_index=%d "
1994                     "l1_entry=%" PRIx64 " refcount=%" PRIu64 "\n",
1995                     repair ? "Repairing" : "ERROR", i, l1_entry, refcount);
1996             if (repair) {
1997                 s->l1_table[i] = refcount == 1
1998                                ? l1_entry |  QCOW_OFLAG_COPIED
1999                                : l1_entry & ~QCOW_OFLAG_COPIED;
2000                 ret = qcow2_write_l1_entry(bs, i);
2001                 if (ret < 0) {
2002                     res->check_errors++;
2003                     goto fail;
2004                 }
2005                 res->corruptions--;
2006                 res->corruptions_fixed++;
2007             }
2008         }
2009 
2010         ret = bdrv_co_pread(bs->file, l2_offset, s->l2_size * l2_entry_size(s),
2011                             l2_table, 0);
2012         if (ret < 0) {
2013             fprintf(stderr, "ERROR: Could not read L2 table: %s\n",
2014                     strerror(-ret));
2015             res->check_errors++;
2016             goto fail;
2017         }
2018 
2019         for (j = 0; j < s->l2_size; j++) {
2020             uint64_t l2_entry = get_l2_entry(s, l2_table, j);
2021             uint64_t data_offset = l2_entry & L2E_OFFSET_MASK;
2022             QCow2ClusterType cluster_type = qcow2_get_cluster_type(bs, l2_entry);
2023 
2024             if (cluster_type == QCOW2_CLUSTER_NORMAL ||
2025                 cluster_type == QCOW2_CLUSTER_ZERO_ALLOC) {
2026                 if (has_data_file(bs)) {
2027                     refcount = 1;
2028                 } else {
2029                     ret = qcow2_get_refcount(bs,
2030                                              data_offset >> s->cluster_bits,
2031                                              &refcount);
2032                     if (ret < 0) {
2033                         /* don't print message nor increment check_errors */
2034                         continue;
2035                     }
2036                 }
2037                 if ((refcount == 1) != ((l2_entry & QCOW_OFLAG_COPIED) != 0)) {
2038                     res->corruptions++;
2039                     fprintf(stderr, "%s OFLAG_COPIED data cluster: "
2040                             "l2_entry=%" PRIx64 " refcount=%" PRIu64 "\n",
2041                             repair ? "Repairing" : "ERROR", l2_entry, refcount);
2042                     if (repair) {
2043                         set_l2_entry(s, l2_table, j,
2044                                      refcount == 1 ?
2045                                      l2_entry |  QCOW_OFLAG_COPIED :
2046                                      l2_entry & ~QCOW_OFLAG_COPIED);
2047                         l2_dirty++;
2048                     }
2049                 }
2050             }
2051         }
2052 
2053         if (l2_dirty > 0) {
2054             ret = qcow2_pre_write_overlap_check(bs, QCOW2_OL_ACTIVE_L2,
2055                                                 l2_offset, s->cluster_size,
2056                                                 false);
2057             if (ret < 0) {
2058                 fprintf(stderr, "ERROR: Could not write L2 table; metadata "
2059                         "overlap check failed: %s\n", strerror(-ret));
2060                 res->check_errors++;
2061                 goto fail;
2062             }
2063 
2064             ret = bdrv_co_pwrite(bs->file, l2_offset, s->cluster_size, l2_table, 0);
2065             if (ret < 0) {
2066                 fprintf(stderr, "ERROR: Could not write L2 table: %s\n",
2067                         strerror(-ret));
2068                 res->check_errors++;
2069                 goto fail;
2070             }
2071             res->corruptions -= l2_dirty;
2072             res->corruptions_fixed += l2_dirty;
2073         }
2074     }
2075 
2076     ret = 0;
2077 
2078 fail:
2079     qemu_vfree(l2_table);
2080     return ret;
2081 }
2082 
2083 /*
2084  * Checks consistency of refblocks and accounts for each refblock in
2085  * *refcount_table.
2086  */
2087 static int coroutine_fn GRAPH_RDLOCK
2088 check_refblocks(BlockDriverState *bs, BdrvCheckResult *res,
2089                 BdrvCheckMode fix, bool *rebuild,
2090                 void **refcount_table, int64_t *nb_clusters)
2091 {
2092     BDRVQcow2State *s = bs->opaque;
2093     int64_t i, size;
2094     int ret;
2095 
2096     for(i = 0; i < s->refcount_table_size; i++) {
2097         uint64_t offset, cluster;
2098         offset = s->refcount_table[i] & REFT_OFFSET_MASK;
2099         cluster = offset >> s->cluster_bits;
2100 
2101         if (s->refcount_table[i] & REFT_RESERVED_MASK) {
2102             fprintf(stderr, "ERROR refcount table entry %" PRId64 " has "
2103                     "reserved bits set\n", i);
2104             res->corruptions++;
2105             *rebuild = true;
2106             continue;
2107         }
2108 
2109         /* Refcount blocks are cluster aligned */
2110         if (offset_into_cluster(s, offset)) {
2111             fprintf(stderr, "ERROR refcount block %" PRId64 " is not "
2112                 "cluster aligned; refcount table entry corrupted\n", i);
2113             res->corruptions++;
2114             *rebuild = true;
2115             continue;
2116         }
2117 
2118         if (cluster >= *nb_clusters) {
2119             res->corruptions++;
2120             fprintf(stderr, "%s refcount block %" PRId64 " is outside image\n",
2121                     fix & BDRV_FIX_ERRORS ? "Repairing" : "ERROR", i);
2122 
2123             if (fix & BDRV_FIX_ERRORS) {
2124                 int64_t new_nb_clusters;
2125                 Error *local_err = NULL;
2126 
2127                 if (offset > INT64_MAX - s->cluster_size) {
2128                     ret = -EINVAL;
2129                     goto resize_fail;
2130                 }
2131 
2132                 ret = bdrv_co_truncate(bs->file, offset + s->cluster_size, false,
2133                                        PREALLOC_MODE_OFF, 0, &local_err);
2134                 if (ret < 0) {
2135                     error_report_err(local_err);
2136                     goto resize_fail;
2137                 }
2138                 size = bdrv_co_getlength(bs->file->bs);
2139                 if (size < 0) {
2140                     ret = size;
2141                     goto resize_fail;
2142                 }
2143 
2144                 new_nb_clusters = size_to_clusters(s, size);
2145                 assert(new_nb_clusters >= *nb_clusters);
2146 
2147                 ret = realloc_refcount_array(s, refcount_table,
2148                                              nb_clusters, new_nb_clusters);
2149                 if (ret < 0) {
2150                     res->check_errors++;
2151                     return ret;
2152                 }
2153 
2154                 if (cluster >= *nb_clusters) {
2155                     ret = -EINVAL;
2156                     goto resize_fail;
2157                 }
2158 
2159                 res->corruptions--;
2160                 res->corruptions_fixed++;
2161                 ret = qcow2_inc_refcounts_imrt(bs, res,
2162                                                refcount_table, nb_clusters,
2163                                                offset, s->cluster_size);
2164                 if (ret < 0) {
2165                     return ret;
2166                 }
2167                 /* No need to check whether the refcount is now greater than 1:
2168                  * This area was just allocated and zeroed, so it can only be
2169                  * exactly 1 after qcow2_inc_refcounts_imrt() */
2170                 continue;
2171 
2172 resize_fail:
2173                 *rebuild = true;
2174                 fprintf(stderr, "ERROR could not resize image: %s\n",
2175                         strerror(-ret));
2176             }
2177             continue;
2178         }
2179 
2180         if (offset != 0) {
2181             ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, nb_clusters,
2182                                            offset, s->cluster_size);
2183             if (ret < 0) {
2184                 return ret;
2185             }
2186             if (s->get_refcount(*refcount_table, cluster) != 1) {
2187                 fprintf(stderr, "ERROR refcount block %" PRId64
2188                         " refcount=%" PRIu64 "\n", i,
2189                         s->get_refcount(*refcount_table, cluster));
2190                 res->corruptions++;
2191                 *rebuild = true;
2192             }
2193         }
2194     }
2195 
2196     return 0;
2197 }
2198 
2199 /*
2200  * Calculates an in-memory refcount table.
2201  */
2202 static int coroutine_fn GRAPH_RDLOCK
2203 calculate_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
2204                     BdrvCheckMode fix, bool *rebuild,
2205                     void **refcount_table, int64_t *nb_clusters)
2206 {
2207     BDRVQcow2State *s = bs->opaque;
2208     int64_t i;
2209     QCowSnapshot *sn;
2210     int ret;
2211 
2212     if (!*refcount_table) {
2213         int64_t old_size = 0;
2214         ret = realloc_refcount_array(s, refcount_table,
2215                                      &old_size, *nb_clusters);
2216         if (ret < 0) {
2217             res->check_errors++;
2218             return ret;
2219         }
2220     }
2221 
2222     /* header */
2223     ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, nb_clusters,
2224                                    0, s->cluster_size);
2225     if (ret < 0) {
2226         return ret;
2227     }
2228 
2229     /* current L1 table */
2230     ret = check_refcounts_l1(bs, res, refcount_table, nb_clusters,
2231                              s->l1_table_offset, s->l1_size, CHECK_FRAG_INFO,
2232                              fix, true);
2233     if (ret < 0) {
2234         return ret;
2235     }
2236 
2237     /* snapshots */
2238     if (has_data_file(bs) && s->nb_snapshots) {
2239         fprintf(stderr, "ERROR %d snapshots in image with data file\n",
2240                 s->nb_snapshots);
2241         res->corruptions++;
2242     }
2243 
2244     for (i = 0; i < s->nb_snapshots; i++) {
2245         sn = s->snapshots + i;
2246         if (offset_into_cluster(s, sn->l1_table_offset)) {
2247             fprintf(stderr, "ERROR snapshot %s (%s) l1_offset=%#" PRIx64 ": "
2248                     "L1 table is not cluster aligned; snapshot table entry "
2249                     "corrupted\n", sn->id_str, sn->name, sn->l1_table_offset);
2250             res->corruptions++;
2251             continue;
2252         }
2253         if (sn->l1_size > QCOW_MAX_L1_SIZE / L1E_SIZE) {
2254             fprintf(stderr, "ERROR snapshot %s (%s) l1_size=%#" PRIx32 ": "
2255                     "L1 table is too large; snapshot table entry corrupted\n",
2256                     sn->id_str, sn->name, sn->l1_size);
2257             res->corruptions++;
2258             continue;
2259         }
2260         ret = check_refcounts_l1(bs, res, refcount_table, nb_clusters,
2261                                  sn->l1_table_offset, sn->l1_size, 0, fix,
2262                                  false);
2263         if (ret < 0) {
2264             return ret;
2265         }
2266     }
2267     ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, nb_clusters,
2268                                    s->snapshots_offset, s->snapshots_size);
2269     if (ret < 0) {
2270         return ret;
2271     }
2272 
2273     /* refcount data */
2274     ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, nb_clusters,
2275                                    s->refcount_table_offset,
2276                                    s->refcount_table_size *
2277                                    REFTABLE_ENTRY_SIZE);
2278     if (ret < 0) {
2279         return ret;
2280     }
2281 
2282     /* encryption */
2283     if (s->crypto_header.length) {
2284         ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, nb_clusters,
2285                                        s->crypto_header.offset,
2286                                        s->crypto_header.length);
2287         if (ret < 0) {
2288             return ret;
2289         }
2290     }
2291 
2292     /* bitmaps */
2293     ret = qcow2_check_bitmaps_refcounts(bs, res, refcount_table, nb_clusters);
2294     if (ret < 0) {
2295         return ret;
2296     }
2297 
2298     return check_refblocks(bs, res, fix, rebuild, refcount_table, nb_clusters);
2299 }
2300 
2301 /*
2302  * Compares the actual reference count for each cluster in the image against the
2303  * refcount as reported by the refcount structures on-disk.
2304  */
2305 static void coroutine_fn
2306 compare_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
2307                   BdrvCheckMode fix, bool *rebuild,
2308                   int64_t *highest_cluster,
2309                   void *refcount_table, int64_t nb_clusters)
2310 {
2311     BDRVQcow2State *s = bs->opaque;
2312     int64_t i;
2313     uint64_t refcount1, refcount2;
2314     int ret;
2315 
2316     for (i = 0, *highest_cluster = 0; i < nb_clusters; i++) {
2317         ret = qcow2_get_refcount(bs, i, &refcount1);
2318         if (ret < 0) {
2319             fprintf(stderr, "Can't get refcount for cluster %" PRId64 ": %s\n",
2320                     i, strerror(-ret));
2321             res->check_errors++;
2322             continue;
2323         }
2324 
2325         refcount2 = s->get_refcount(refcount_table, i);
2326 
2327         if (refcount1 > 0 || refcount2 > 0) {
2328             *highest_cluster = i;
2329         }
2330 
2331         if (refcount1 != refcount2) {
2332             /* Check if we're allowed to fix the mismatch */
2333             int *num_fixed = NULL;
2334             if (refcount1 == 0) {
2335                 *rebuild = true;
2336             } else if (refcount1 > refcount2 && (fix & BDRV_FIX_LEAKS)) {
2337                 num_fixed = &res->leaks_fixed;
2338             } else if (refcount1 < refcount2 && (fix & BDRV_FIX_ERRORS)) {
2339                 num_fixed = &res->corruptions_fixed;
2340             }
2341 
2342             fprintf(stderr, "%s cluster %" PRId64 " refcount=%" PRIu64
2343                     " reference=%" PRIu64 "\n",
2344                    num_fixed != NULL     ? "Repairing" :
2345                    refcount1 < refcount2 ? "ERROR" :
2346                                            "Leaked",
2347                    i, refcount1, refcount2);
2348 
2349             if (num_fixed) {
2350                 ret = update_refcount(bs, i << s->cluster_bits, 1,
2351                                       refcount_diff(refcount1, refcount2),
2352                                       refcount1 > refcount2,
2353                                       QCOW2_DISCARD_ALWAYS);
2354                 if (ret >= 0) {
2355                     (*num_fixed)++;
2356                     continue;
2357                 }
2358             }
2359 
2360             /* And if we couldn't, print an error */
2361             if (refcount1 < refcount2) {
2362                 res->corruptions++;
2363             } else {
2364                 res->leaks++;
2365             }
2366         }
2367     }
2368 }
2369 
2370 /*
2371  * Allocates clusters using an in-memory refcount table (IMRT) in contrast to
2372  * the on-disk refcount structures.
2373  *
2374  * On input, *first_free_cluster tells where to start looking, and need not
2375  * actually be a free cluster; the returned offset will not be before that
2376  * cluster.  On output, *first_free_cluster points to the first gap found, even
2377  * if that gap was too small to be used as the returned offset.
2378  *
2379  * Note that *first_free_cluster is a cluster index whereas the return value is
2380  * an offset.
2381  */
2382 static int64_t alloc_clusters_imrt(BlockDriverState *bs,
2383                                    int cluster_count,
2384                                    void **refcount_table,
2385                                    int64_t *imrt_nb_clusters,
2386                                    int64_t *first_free_cluster)
2387 {
2388     BDRVQcow2State *s = bs->opaque;
2389     int64_t cluster = *first_free_cluster, i;
2390     bool first_gap = true;
2391     int contiguous_free_clusters;
2392     int ret;
2393 
2394     /* Starting at *first_free_cluster, find a range of at least cluster_count
2395      * continuously free clusters */
2396     for (contiguous_free_clusters = 0;
2397          cluster < *imrt_nb_clusters &&
2398          contiguous_free_clusters < cluster_count;
2399          cluster++)
2400     {
2401         if (!s->get_refcount(*refcount_table, cluster)) {
2402             contiguous_free_clusters++;
2403             if (first_gap) {
2404                 /* If this is the first free cluster found, update
2405                  * *first_free_cluster accordingly */
2406                 *first_free_cluster = cluster;
2407                 first_gap = false;
2408             }
2409         } else if (contiguous_free_clusters) {
2410             contiguous_free_clusters = 0;
2411         }
2412     }
2413 
2414     /* If contiguous_free_clusters is greater than zero, it contains the number
2415      * of continuously free clusters until the current cluster; the first free
2416      * cluster in the current "gap" is therefore
2417      * cluster - contiguous_free_clusters */
2418 
2419     /* If no such range could be found, grow the in-memory refcount table
2420      * accordingly to append free clusters at the end of the image */
2421     if (contiguous_free_clusters < cluster_count) {
2422         /* contiguous_free_clusters clusters are already empty at the image end;
2423          * we need cluster_count clusters; therefore, we have to allocate
2424          * cluster_count - contiguous_free_clusters new clusters at the end of
2425          * the image (which is the current value of cluster; note that cluster
2426          * may exceed old_imrt_nb_clusters if *first_free_cluster pointed beyond
2427          * the image end) */
2428         ret = realloc_refcount_array(s, refcount_table, imrt_nb_clusters,
2429                                      cluster + cluster_count
2430                                      - contiguous_free_clusters);
2431         if (ret < 0) {
2432             return ret;
2433         }
2434     }
2435 
2436     /* Go back to the first free cluster */
2437     cluster -= contiguous_free_clusters;
2438     for (i = 0; i < cluster_count; i++) {
2439         s->set_refcount(*refcount_table, cluster + i, 1);
2440     }
2441 
2442     return cluster << s->cluster_bits;
2443 }
2444 
2445 /*
2446  * Helper function for rebuild_refcount_structure().
2447  *
2448  * Scan the range of clusters [first_cluster, end_cluster) for allocated
2449  * clusters and write all corresponding refblocks to disk.  The refblock
2450  * and allocation data is taken from the in-memory refcount table
2451  * *refcount_table[] (of size *nb_clusters), which is basically one big
2452  * (unlimited size) refblock for the whole image.
2453  *
2454  * For these refblocks, clusters are allocated using said in-memory
2455  * refcount table.  Care is taken that these allocations are reflected
2456  * in the refblocks written to disk.
2457  *
2458  * The refblocks' offsets are written into a reftable, which is
2459  * *on_disk_reftable_ptr[] (of size *on_disk_reftable_entries_ptr).  If
2460  * that reftable is of insufficient size, it will be resized to fit.
2461  * This reftable is not written to disk.
2462  *
2463  * (If *on_disk_reftable_ptr is not NULL, the entries within are assumed
2464  * to point to existing valid refblocks that do not need to be allocated
2465  * again.)
2466  *
2467  * Return whether the on-disk reftable array was resized (true/false),
2468  * or -errno on error.
2469  */
2470 static int coroutine_fn GRAPH_RDLOCK
2471 rebuild_refcounts_write_refblocks(
2472         BlockDriverState *bs, void **refcount_table, int64_t *nb_clusters,
2473         int64_t first_cluster, int64_t end_cluster,
2474         uint64_t **on_disk_reftable_ptr, uint32_t *on_disk_reftable_entries_ptr,
2475         Error **errp
2476     )
2477 {
2478     BDRVQcow2State *s = bs->opaque;
2479     int64_t cluster;
2480     int64_t refblock_offset, refblock_start, refblock_index;
2481     int64_t first_free_cluster = 0;
2482     uint64_t *on_disk_reftable = *on_disk_reftable_ptr;
2483     uint32_t on_disk_reftable_entries = *on_disk_reftable_entries_ptr;
2484     void *on_disk_refblock;
2485     bool reftable_grown = false;
2486     int ret;
2487 
2488     for (cluster = first_cluster; cluster < end_cluster; cluster++) {
2489         /* Check all clusters to find refblocks that contain non-zero entries */
2490         if (!s->get_refcount(*refcount_table, cluster)) {
2491             continue;
2492         }
2493 
2494         /*
2495          * This cluster is allocated, so we need to create a refblock
2496          * for it.  The data we will write to disk is just the
2497          * respective slice from *refcount_table, so it will contain
2498          * accurate refcounts for all clusters belonging to this
2499          * refblock.  After we have written it, we will therefore skip
2500          * all remaining clusters in this refblock.
2501          */
2502 
2503         refblock_index = cluster >> s->refcount_block_bits;
2504         refblock_start = refblock_index << s->refcount_block_bits;
2505 
2506         if (on_disk_reftable_entries > refblock_index &&
2507             on_disk_reftable[refblock_index])
2508         {
2509             /*
2510              * We can get here after a `goto write_refblocks`: We have a
2511              * reftable from a previous run, and the refblock is already
2512              * allocated.  No need to allocate it again.
2513              */
2514             refblock_offset = on_disk_reftable[refblock_index];
2515         } else {
2516             int64_t refblock_cluster_index;
2517 
2518             /* Don't allocate a cluster in a refblock already written to disk */
2519             if (first_free_cluster < refblock_start) {
2520                 first_free_cluster = refblock_start;
2521             }
2522             refblock_offset = alloc_clusters_imrt(bs, 1, refcount_table,
2523                                                   nb_clusters,
2524                                                   &first_free_cluster);
2525             if (refblock_offset < 0) {
2526                 error_setg_errno(errp, -refblock_offset,
2527                                  "ERROR allocating refblock");
2528                 return refblock_offset;
2529             }
2530 
2531             refblock_cluster_index = refblock_offset / s->cluster_size;
2532             if (refblock_cluster_index >= end_cluster) {
2533                 /*
2534                  * We must write the refblock that holds this refblock's
2535                  * refcount
2536                  */
2537                 end_cluster = refblock_cluster_index + 1;
2538             }
2539 
2540             if (on_disk_reftable_entries <= refblock_index) {
2541                 on_disk_reftable_entries =
2542                     ROUND_UP((refblock_index + 1) * REFTABLE_ENTRY_SIZE,
2543                              s->cluster_size) / REFTABLE_ENTRY_SIZE;
2544                 on_disk_reftable =
2545                     g_try_realloc(on_disk_reftable,
2546                                   on_disk_reftable_entries *
2547                                   REFTABLE_ENTRY_SIZE);
2548                 if (!on_disk_reftable) {
2549                     error_setg(errp, "ERROR allocating reftable memory");
2550                     return -ENOMEM;
2551                 }
2552 
2553                 memset(on_disk_reftable + *on_disk_reftable_entries_ptr, 0,
2554                        (on_disk_reftable_entries -
2555                         *on_disk_reftable_entries_ptr) *
2556                        REFTABLE_ENTRY_SIZE);
2557 
2558                 *on_disk_reftable_ptr = on_disk_reftable;
2559                 *on_disk_reftable_entries_ptr = on_disk_reftable_entries;
2560 
2561                 reftable_grown = true;
2562             } else {
2563                 assert(on_disk_reftable);
2564             }
2565             on_disk_reftable[refblock_index] = refblock_offset;
2566         }
2567 
2568         /* Refblock is allocated, write it to disk */
2569 
2570         ret = qcow2_pre_write_overlap_check(bs, 0, refblock_offset,
2571                                             s->cluster_size, false);
2572         if (ret < 0) {
2573             error_setg_errno(errp, -ret, "ERROR writing refblock");
2574             return ret;
2575         }
2576 
2577         /*
2578          * The refblock is simply a slice of *refcount_table.
2579          * Note that the size of *refcount_table is always aligned to
2580          * whole clusters, so the write operation will not result in
2581          * out-of-bounds accesses.
2582          */
2583         on_disk_refblock = (void *)((char *) *refcount_table +
2584                                     refblock_index * s->cluster_size);
2585 
2586         ret = bdrv_co_pwrite(bs->file, refblock_offset, s->cluster_size,
2587                              on_disk_refblock, 0);
2588         if (ret < 0) {
2589             error_setg_errno(errp, -ret, "ERROR writing refblock");
2590             return ret;
2591         }
2592 
2593         /* This refblock is done, skip to its end */
2594         cluster = refblock_start + s->refcount_block_size - 1;
2595     }
2596 
2597     return reftable_grown;
2598 }
2599 
2600 /*
2601  * Creates a new refcount structure based solely on the in-memory information
2602  * given through *refcount_table (this in-memory information is basically just
2603  * the concatenation of all refblocks).  All necessary allocations will be
2604  * reflected in that array.
2605  *
2606  * On success, the old refcount structure is leaked (it will be covered by the
2607  * new refcount structure).
2608  */
2609 static int coroutine_fn GRAPH_RDLOCK
2610 rebuild_refcount_structure(BlockDriverState *bs, BdrvCheckResult *res,
2611                            void **refcount_table, int64_t *nb_clusters,
2612                            Error **errp)
2613 {
2614     BDRVQcow2State *s = bs->opaque;
2615     int64_t reftable_offset = -1;
2616     int64_t reftable_length = 0;
2617     int64_t reftable_clusters;
2618     int64_t refblock_index;
2619     uint32_t on_disk_reftable_entries = 0;
2620     uint64_t *on_disk_reftable = NULL;
2621     int ret = 0;
2622     int reftable_size_changed = 0;
2623     struct {
2624         uint64_t reftable_offset;
2625         uint32_t reftable_clusters;
2626     } QEMU_PACKED reftable_offset_and_clusters;
2627 
2628     qcow2_cache_empty(bs, s->refcount_block_cache);
2629 
2630     /*
2631      * For each refblock containing entries, we try to allocate a
2632      * cluster (in the in-memory refcount table) and write its offset
2633      * into on_disk_reftable[].  We then write the whole refblock to
2634      * disk (as a slice of the in-memory refcount table).
2635      * This is done by rebuild_refcounts_write_refblocks().
2636      *
2637      * Once we have scanned all clusters, we try to find space for the
2638      * reftable.  This will dirty the in-memory refcount table (i.e.
2639      * make it differ from the refblocks we have already written), so we
2640      * need to run rebuild_refcounts_write_refblocks() again for the
2641      * range of clusters where the reftable has been allocated.
2642      *
2643      * This second run might make the reftable grow again, in which case
2644      * we will need to allocate another space for it, which is why we
2645      * repeat all this until the reftable stops growing.
2646      *
2647      * (This loop will terminate, because with every cluster the
2648      * reftable grows, it can accommodate a multitude of more refcounts,
2649      * so that at some point this must be able to cover the reftable
2650      * and all refblocks describing it.)
2651      *
2652      * We then convert the reftable to big-endian and write it to disk.
2653      *
2654      * Note that we never free any reftable allocations.  Doing so would
2655      * needlessly complicate the algorithm: The eventual second check
2656      * run we do will clean up all leaks we have caused.
2657      */
2658 
2659     reftable_size_changed =
2660         rebuild_refcounts_write_refblocks(bs, refcount_table, nb_clusters,
2661                                           0, *nb_clusters,
2662                                           &on_disk_reftable,
2663                                           &on_disk_reftable_entries, errp);
2664     if (reftable_size_changed < 0) {
2665         res->check_errors++;
2666         ret = reftable_size_changed;
2667         goto fail;
2668     }
2669 
2670     /*
2671      * There was no reftable before, so rebuild_refcounts_write_refblocks()
2672      * must have increased its size (from 0 to something).
2673      */
2674     assert(reftable_size_changed);
2675 
2676     do {
2677         int64_t reftable_start_cluster, reftable_end_cluster;
2678         int64_t first_free_cluster = 0;
2679 
2680         reftable_length = on_disk_reftable_entries * REFTABLE_ENTRY_SIZE;
2681         reftable_clusters = size_to_clusters(s, reftable_length);
2682 
2683         reftable_offset = alloc_clusters_imrt(bs, reftable_clusters,
2684                                               refcount_table, nb_clusters,
2685                                               &first_free_cluster);
2686         if (reftable_offset < 0) {
2687             error_setg_errno(errp, -reftable_offset,
2688                              "ERROR allocating reftable");
2689             res->check_errors++;
2690             ret = reftable_offset;
2691             goto fail;
2692         }
2693 
2694         /*
2695          * We need to update the affected refblocks, so re-run the
2696          * write_refblocks loop for the reftable's range of clusters.
2697          */
2698         assert(offset_into_cluster(s, reftable_offset) == 0);
2699         reftable_start_cluster = reftable_offset / s->cluster_size;
2700         reftable_end_cluster = reftable_start_cluster + reftable_clusters;
2701         reftable_size_changed =
2702             rebuild_refcounts_write_refblocks(bs, refcount_table, nb_clusters,
2703                                               reftable_start_cluster,
2704                                               reftable_end_cluster,
2705                                               &on_disk_reftable,
2706                                               &on_disk_reftable_entries, errp);
2707         if (reftable_size_changed < 0) {
2708             res->check_errors++;
2709             ret = reftable_size_changed;
2710             goto fail;
2711         }
2712 
2713         /*
2714          * If the reftable size has changed, we will need to find a new
2715          * allocation, repeating the loop.
2716          */
2717     } while (reftable_size_changed);
2718 
2719     /* The above loop must have run at least once */
2720     assert(reftable_offset >= 0);
2721 
2722     /*
2723      * All allocations are done, all refblocks are written, convert the
2724      * reftable to big-endian and write it to disk.
2725      */
2726 
2727     for (refblock_index = 0; refblock_index < on_disk_reftable_entries;
2728          refblock_index++)
2729     {
2730         cpu_to_be64s(&on_disk_reftable[refblock_index]);
2731     }
2732 
2733     ret = qcow2_pre_write_overlap_check(bs, 0, reftable_offset, reftable_length,
2734                                         false);
2735     if (ret < 0) {
2736         error_setg_errno(errp, -ret, "ERROR writing reftable");
2737         goto fail;
2738     }
2739 
2740     assert(reftable_length < INT_MAX);
2741     ret = bdrv_co_pwrite(bs->file, reftable_offset, reftable_length,
2742                          on_disk_reftable, 0);
2743     if (ret < 0) {
2744         error_setg_errno(errp, -ret, "ERROR writing reftable");
2745         goto fail;
2746     }
2747 
2748     /* Enter new reftable into the image header */
2749     reftable_offset_and_clusters.reftable_offset = cpu_to_be64(reftable_offset);
2750     reftable_offset_and_clusters.reftable_clusters =
2751         cpu_to_be32(reftable_clusters);
2752     ret = bdrv_co_pwrite_sync(bs->file,
2753                               offsetof(QCowHeader, refcount_table_offset),
2754                               sizeof(reftable_offset_and_clusters),
2755                               &reftable_offset_and_clusters, 0);
2756     if (ret < 0) {
2757         error_setg_errno(errp, -ret, "ERROR setting reftable");
2758         goto fail;
2759     }
2760 
2761     for (refblock_index = 0; refblock_index < on_disk_reftable_entries;
2762          refblock_index++)
2763     {
2764         be64_to_cpus(&on_disk_reftable[refblock_index]);
2765     }
2766     s->refcount_table = on_disk_reftable;
2767     s->refcount_table_offset = reftable_offset;
2768     s->refcount_table_size = on_disk_reftable_entries;
2769     update_max_refcount_table_index(s);
2770 
2771     return 0;
2772 
2773 fail:
2774     g_free(on_disk_reftable);
2775     return ret;
2776 }
2777 
2778 /*
2779  * Checks an image for refcount consistency.
2780  *
2781  * Returns 0 if no errors are found, the number of errors in case the image is
2782  * detected as corrupted, and -errno when an internal error occurred.
2783  */
2784 int coroutine_fn GRAPH_RDLOCK
2785 qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix)
2786 {
2787     BDRVQcow2State *s = bs->opaque;
2788     BdrvCheckResult pre_compare_res;
2789     int64_t size, highest_cluster, nb_clusters;
2790     void *refcount_table = NULL;
2791     bool rebuild = false;
2792     int ret;
2793 
2794     size = bdrv_co_getlength(bs->file->bs);
2795     if (size < 0) {
2796         res->check_errors++;
2797         return size;
2798     }
2799 
2800     nb_clusters = size_to_clusters(s, size);
2801     if (nb_clusters > INT_MAX) {
2802         res->check_errors++;
2803         return -EFBIG;
2804     }
2805 
2806     res->bfi.total_clusters =
2807         size_to_clusters(s, bs->total_sectors * BDRV_SECTOR_SIZE);
2808 
2809     ret = calculate_refcounts(bs, res, fix, &rebuild, &refcount_table,
2810                               &nb_clusters);
2811     if (ret < 0) {
2812         goto fail;
2813     }
2814 
2815     /* In case we don't need to rebuild the refcount structure (but want to fix
2816      * something), this function is immediately called again, in which case the
2817      * result should be ignored */
2818     pre_compare_res = *res;
2819     compare_refcounts(bs, res, 0, &rebuild, &highest_cluster, refcount_table,
2820                       nb_clusters);
2821 
2822     if (rebuild && (fix & BDRV_FIX_ERRORS)) {
2823         BdrvCheckResult old_res = *res;
2824         int fresh_leaks = 0;
2825         Error *local_err = NULL;
2826 
2827         fprintf(stderr, "Rebuilding refcount structure\n");
2828         ret = rebuild_refcount_structure(bs, res, &refcount_table,
2829                                          &nb_clusters, &local_err);
2830         if (ret < 0) {
2831             error_report_err(local_err);
2832             goto fail;
2833         }
2834 
2835         res->corruptions = 0;
2836         res->leaks = 0;
2837 
2838         /* Because the old reftable has been exchanged for a new one the
2839          * references have to be recalculated */
2840         rebuild = false;
2841         memset(refcount_table, 0, refcount_array_byte_size(s, nb_clusters));
2842         ret = calculate_refcounts(bs, res, 0, &rebuild, &refcount_table,
2843                                   &nb_clusters);
2844         if (ret < 0) {
2845             goto fail;
2846         }
2847 
2848         if (fix & BDRV_FIX_LEAKS) {
2849             /* The old refcount structures are now leaked, fix it; the result
2850              * can be ignored, aside from leaks which were introduced by
2851              * rebuild_refcount_structure() that could not be fixed */
2852             BdrvCheckResult saved_res = *res;
2853             *res = (BdrvCheckResult){ 0 };
2854 
2855             compare_refcounts(bs, res, BDRV_FIX_LEAKS, &rebuild,
2856                               &highest_cluster, refcount_table, nb_clusters);
2857             if (rebuild) {
2858                 fprintf(stderr, "ERROR rebuilt refcount structure is still "
2859                         "broken\n");
2860             }
2861 
2862             /* Any leaks accounted for here were introduced by
2863              * rebuild_refcount_structure() because that function has created a
2864              * new refcount structure from scratch */
2865             fresh_leaks = res->leaks;
2866             *res = saved_res;
2867         }
2868 
2869         if (res->corruptions < old_res.corruptions) {
2870             res->corruptions_fixed += old_res.corruptions - res->corruptions;
2871         }
2872         if (res->leaks < old_res.leaks) {
2873             res->leaks_fixed += old_res.leaks - res->leaks;
2874         }
2875         res->leaks += fresh_leaks;
2876     } else if (fix) {
2877         if (rebuild) {
2878             fprintf(stderr, "ERROR need to rebuild refcount structures\n");
2879             res->check_errors++;
2880             ret = -EIO;
2881             goto fail;
2882         }
2883 
2884         if (res->leaks || res->corruptions) {
2885             *res = pre_compare_res;
2886             compare_refcounts(bs, res, fix, &rebuild, &highest_cluster,
2887                               refcount_table, nb_clusters);
2888         }
2889     }
2890 
2891     /* check OFLAG_COPIED */
2892     ret = check_oflag_copied(bs, res, fix);
2893     if (ret < 0) {
2894         goto fail;
2895     }
2896 
2897     res->image_end_offset = (highest_cluster + 1) * s->cluster_size;
2898     ret = 0;
2899 
2900 fail:
2901     g_free(refcount_table);
2902 
2903     return ret;
2904 }
2905 
2906 #define overlaps_with(ofs, sz) \
2907     ranges_overlap(offset, size, ofs, sz)
2908 
2909 /*
2910  * Checks if the given offset into the image file is actually free to use by
2911  * looking for overlaps with important metadata sections (L1/L2 tables etc.),
2912  * i.e. a sanity check without relying on the refcount tables.
2913  *
2914  * The ign parameter specifies what checks not to perform (being a bitmask of
2915  * QCow2MetadataOverlap values), i.e., what sections to ignore.
2916  *
2917  * Returns:
2918  * - 0 if writing to this offset will not affect the mentioned metadata
2919  * - a positive QCow2MetadataOverlap value indicating one overlapping section
2920  * - a negative value (-errno) indicating an error while performing a check,
2921  *   e.g. when bdrv_pread failed on QCOW2_OL_INACTIVE_L2
2922  */
2923 int qcow2_check_metadata_overlap(BlockDriverState *bs, int ign, int64_t offset,
2924                                  int64_t size)
2925 {
2926     BDRVQcow2State *s = bs->opaque;
2927     int chk = s->overlap_check & ~ign;
2928     int i, j;
2929 
2930     if (!size) {
2931         return 0;
2932     }
2933 
2934     if (chk & QCOW2_OL_MAIN_HEADER) {
2935         if (offset < s->cluster_size) {
2936             return QCOW2_OL_MAIN_HEADER;
2937         }
2938     }
2939 
2940     /* align range to test to cluster boundaries */
2941     size = ROUND_UP(offset_into_cluster(s, offset) + size, s->cluster_size);
2942     offset = start_of_cluster(s, offset);
2943 
2944     if ((chk & QCOW2_OL_ACTIVE_L1) && s->l1_size) {
2945         if (overlaps_with(s->l1_table_offset, s->l1_size * L1E_SIZE)) {
2946             return QCOW2_OL_ACTIVE_L1;
2947         }
2948     }
2949 
2950     if ((chk & QCOW2_OL_REFCOUNT_TABLE) && s->refcount_table_size) {
2951         if (overlaps_with(s->refcount_table_offset,
2952             s->refcount_table_size * REFTABLE_ENTRY_SIZE)) {
2953             return QCOW2_OL_REFCOUNT_TABLE;
2954         }
2955     }
2956 
2957     if ((chk & QCOW2_OL_SNAPSHOT_TABLE) && s->snapshots_size) {
2958         if (overlaps_with(s->snapshots_offset, s->snapshots_size)) {
2959             return QCOW2_OL_SNAPSHOT_TABLE;
2960         }
2961     }
2962 
2963     if ((chk & QCOW2_OL_INACTIVE_L1) && s->snapshots) {
2964         for (i = 0; i < s->nb_snapshots; i++) {
2965             if (s->snapshots[i].l1_size &&
2966                 overlaps_with(s->snapshots[i].l1_table_offset,
2967                 s->snapshots[i].l1_size * L1E_SIZE)) {
2968                 return QCOW2_OL_INACTIVE_L1;
2969             }
2970         }
2971     }
2972 
2973     if ((chk & QCOW2_OL_ACTIVE_L2) && s->l1_table) {
2974         for (i = 0; i < s->l1_size; i++) {
2975             if ((s->l1_table[i] & L1E_OFFSET_MASK) &&
2976                 overlaps_with(s->l1_table[i] & L1E_OFFSET_MASK,
2977                 s->cluster_size)) {
2978                 return QCOW2_OL_ACTIVE_L2;
2979             }
2980         }
2981     }
2982 
2983     if ((chk & QCOW2_OL_REFCOUNT_BLOCK) && s->refcount_table) {
2984         unsigned last_entry = s->max_refcount_table_index;
2985         assert(last_entry < s->refcount_table_size);
2986         assert(last_entry + 1 == s->refcount_table_size ||
2987                (s->refcount_table[last_entry + 1] & REFT_OFFSET_MASK) == 0);
2988         for (i = 0; i <= last_entry; i++) {
2989             if ((s->refcount_table[i] & REFT_OFFSET_MASK) &&
2990                 overlaps_with(s->refcount_table[i] & REFT_OFFSET_MASK,
2991                 s->cluster_size)) {
2992                 return QCOW2_OL_REFCOUNT_BLOCK;
2993             }
2994         }
2995     }
2996 
2997     if ((chk & QCOW2_OL_INACTIVE_L2) && s->snapshots) {
2998         for (i = 0; i < s->nb_snapshots; i++) {
2999             uint64_t l1_ofs = s->snapshots[i].l1_table_offset;
3000             uint32_t l1_sz  = s->snapshots[i].l1_size;
3001             uint64_t l1_sz2 = l1_sz * L1E_SIZE;
3002             uint64_t *l1;
3003             int ret;
3004 
3005             ret = qcow2_validate_table(bs, l1_ofs, l1_sz, L1E_SIZE,
3006                                        QCOW_MAX_L1_SIZE, "", NULL);
3007             if (ret < 0) {
3008                 return ret;
3009             }
3010 
3011             l1 = g_try_malloc(l1_sz2);
3012 
3013             if (l1_sz2 && l1 == NULL) {
3014                 return -ENOMEM;
3015             }
3016 
3017             ret = bdrv_pread(bs->file, l1_ofs, l1_sz2, l1, 0);
3018             if (ret < 0) {
3019                 g_free(l1);
3020                 return ret;
3021             }
3022 
3023             for (j = 0; j < l1_sz; j++) {
3024                 uint64_t l2_ofs = be64_to_cpu(l1[j]) & L1E_OFFSET_MASK;
3025                 if (l2_ofs && overlaps_with(l2_ofs, s->cluster_size)) {
3026                     g_free(l1);
3027                     return QCOW2_OL_INACTIVE_L2;
3028                 }
3029             }
3030 
3031             g_free(l1);
3032         }
3033     }
3034 
3035     if ((chk & QCOW2_OL_BITMAP_DIRECTORY) &&
3036         (s->autoclear_features & QCOW2_AUTOCLEAR_BITMAPS))
3037     {
3038         if (overlaps_with(s->bitmap_directory_offset,
3039                           s->bitmap_directory_size))
3040         {
3041             return QCOW2_OL_BITMAP_DIRECTORY;
3042         }
3043     }
3044 
3045     return 0;
3046 }
3047 
3048 static const char *metadata_ol_names[] = {
3049     [QCOW2_OL_MAIN_HEADER_BITNR]        = "qcow2_header",
3050     [QCOW2_OL_ACTIVE_L1_BITNR]          = "active L1 table",
3051     [QCOW2_OL_ACTIVE_L2_BITNR]          = "active L2 table",
3052     [QCOW2_OL_REFCOUNT_TABLE_BITNR]     = "refcount table",
3053     [QCOW2_OL_REFCOUNT_BLOCK_BITNR]     = "refcount block",
3054     [QCOW2_OL_SNAPSHOT_TABLE_BITNR]     = "snapshot table",
3055     [QCOW2_OL_INACTIVE_L1_BITNR]        = "inactive L1 table",
3056     [QCOW2_OL_INACTIVE_L2_BITNR]        = "inactive L2 table",
3057     [QCOW2_OL_BITMAP_DIRECTORY_BITNR]   = "bitmap directory",
3058 };
3059 QEMU_BUILD_BUG_ON(QCOW2_OL_MAX_BITNR != ARRAY_SIZE(metadata_ol_names));
3060 
3061 /*
3062  * First performs a check for metadata overlaps (through
3063  * qcow2_check_metadata_overlap); if that fails with a negative value (error
3064  * while performing a check), that value is returned. If an impending overlap
3065  * is detected, the BDS will be made unusable, the qcow2 file marked corrupt
3066  * and -EIO returned.
3067  *
3068  * Returns 0 if there were neither overlaps nor errors while checking for
3069  * overlaps; or a negative value (-errno) on error.
3070  */
3071 int qcow2_pre_write_overlap_check(BlockDriverState *bs, int ign, int64_t offset,
3072                                   int64_t size, bool data_file)
3073 {
3074     int ret;
3075 
3076     if (data_file && has_data_file(bs)) {
3077         return 0;
3078     }
3079 
3080     ret = qcow2_check_metadata_overlap(bs, ign, offset, size);
3081     if (ret < 0) {
3082         return ret;
3083     } else if (ret > 0) {
3084         int metadata_ol_bitnr = ctz32(ret);
3085         assert(metadata_ol_bitnr < QCOW2_OL_MAX_BITNR);
3086 
3087         qcow2_signal_corruption(bs, true, offset, size, "Preventing invalid "
3088                                 "write on metadata (overlaps with %s)",
3089                                 metadata_ol_names[metadata_ol_bitnr]);
3090         return -EIO;
3091     }
3092 
3093     return 0;
3094 }
3095 
3096 /* A pointer to a function of this type is given to walk_over_reftable(). That
3097  * function will create refblocks and pass them to a RefblockFinishOp once they
3098  * are completed (@refblock). @refblock_empty is set if the refblock is
3099  * completely empty.
3100  *
3101  * Along with the refblock, a corresponding reftable entry is passed, in the
3102  * reftable @reftable (which may be reallocated) at @reftable_index.
3103  *
3104  * @allocated should be set to true if a new cluster has been allocated.
3105  */
3106 typedef int (RefblockFinishOp)(BlockDriverState *bs, uint64_t **reftable,
3107                                uint64_t reftable_index, uint64_t *reftable_size,
3108                                void *refblock, bool refblock_empty,
3109                                bool *allocated, Error **errp);
3110 
3111 /**
3112  * This "operation" for walk_over_reftable() allocates the refblock on disk (if
3113  * it is not empty) and inserts its offset into the new reftable. The size of
3114  * this new reftable is increased as required.
3115  */
3116 static int alloc_refblock(BlockDriverState *bs, uint64_t **reftable,
3117                           uint64_t reftable_index, uint64_t *reftable_size,
3118                           void *refblock, bool refblock_empty, bool *allocated,
3119                           Error **errp)
3120 {
3121     BDRVQcow2State *s = bs->opaque;
3122     int64_t offset;
3123 
3124     if (!refblock_empty && reftable_index >= *reftable_size) {
3125         uint64_t *new_reftable;
3126         uint64_t new_reftable_size;
3127 
3128         new_reftable_size = ROUND_UP(reftable_index + 1,
3129                                      s->cluster_size / REFTABLE_ENTRY_SIZE);
3130         if (new_reftable_size > QCOW_MAX_REFTABLE_SIZE / REFTABLE_ENTRY_SIZE) {
3131             error_setg(errp,
3132                        "This operation would make the refcount table grow "
3133                        "beyond the maximum size supported by QEMU, aborting");
3134             return -ENOTSUP;
3135         }
3136 
3137         new_reftable = g_try_realloc(*reftable, new_reftable_size *
3138                                                 REFTABLE_ENTRY_SIZE);
3139         if (!new_reftable) {
3140             error_setg(errp, "Failed to increase reftable buffer size");
3141             return -ENOMEM;
3142         }
3143 
3144         memset(new_reftable + *reftable_size, 0,
3145                (new_reftable_size - *reftable_size) * REFTABLE_ENTRY_SIZE);
3146 
3147         *reftable      = new_reftable;
3148         *reftable_size = new_reftable_size;
3149     }
3150 
3151     if (!refblock_empty && !(*reftable)[reftable_index]) {
3152         offset = qcow2_alloc_clusters(bs, s->cluster_size);
3153         if (offset < 0) {
3154             error_setg_errno(errp, -offset, "Failed to allocate refblock");
3155             return offset;
3156         }
3157         (*reftable)[reftable_index] = offset;
3158         *allocated = true;
3159     }
3160 
3161     return 0;
3162 }
3163 
3164 /**
3165  * This "operation" for walk_over_reftable() writes the refblock to disk at the
3166  * offset specified by the new reftable's entry. It does not modify the new
3167  * reftable or change any refcounts.
3168  */
3169 static int flush_refblock(BlockDriverState *bs, uint64_t **reftable,
3170                           uint64_t reftable_index, uint64_t *reftable_size,
3171                           void *refblock, bool refblock_empty, bool *allocated,
3172                           Error **errp)
3173 {
3174     BDRVQcow2State *s = bs->opaque;
3175     int64_t offset;
3176     int ret;
3177 
3178     if (reftable_index < *reftable_size && (*reftable)[reftable_index]) {
3179         offset = (*reftable)[reftable_index];
3180 
3181         ret = qcow2_pre_write_overlap_check(bs, 0, offset, s->cluster_size,
3182                                             false);
3183         if (ret < 0) {
3184             error_setg_errno(errp, -ret, "Overlap check failed");
3185             return ret;
3186         }
3187 
3188         ret = bdrv_pwrite(bs->file, offset, s->cluster_size, refblock, 0);
3189         if (ret < 0) {
3190             error_setg_errno(errp, -ret, "Failed to write refblock");
3191             return ret;
3192         }
3193     } else {
3194         assert(refblock_empty);
3195     }
3196 
3197     return 0;
3198 }
3199 
3200 /**
3201  * This function walks over the existing reftable and every referenced refblock;
3202  * if @new_set_refcount is non-NULL, it is called for every refcount entry to
3203  * create an equal new entry in the passed @new_refblock. Once that
3204  * @new_refblock is completely filled, @operation will be called.
3205  *
3206  * @status_cb and @cb_opaque are used for the amend operation's status callback.
3207  * @index is the index of the walk_over_reftable() calls and @total is the total
3208  * number of walk_over_reftable() calls per amend operation. Both are used for
3209  * calculating the parameters for the status callback.
3210  *
3211  * @allocated is set to true if a new cluster has been allocated.
3212  */
3213 static int walk_over_reftable(BlockDriverState *bs, uint64_t **new_reftable,
3214                               uint64_t *new_reftable_index,
3215                               uint64_t *new_reftable_size,
3216                               void *new_refblock, int new_refblock_size,
3217                               int new_refcount_bits,
3218                               RefblockFinishOp *operation, bool *allocated,
3219                               Qcow2SetRefcountFunc *new_set_refcount,
3220                               BlockDriverAmendStatusCB *status_cb,
3221                               void *cb_opaque, int index, int total,
3222                               Error **errp)
3223 {
3224     BDRVQcow2State *s = bs->opaque;
3225     uint64_t reftable_index;
3226     bool new_refblock_empty = true;
3227     int refblock_index;
3228     int new_refblock_index = 0;
3229     int ret;
3230 
3231     for (reftable_index = 0; reftable_index < s->refcount_table_size;
3232          reftable_index++)
3233     {
3234         uint64_t refblock_offset = s->refcount_table[reftable_index]
3235                                  & REFT_OFFSET_MASK;
3236 
3237         status_cb(bs, (uint64_t)index * s->refcount_table_size + reftable_index,
3238                   (uint64_t)total * s->refcount_table_size, cb_opaque);
3239 
3240         if (refblock_offset) {
3241             void *refblock;
3242 
3243             if (offset_into_cluster(s, refblock_offset)) {
3244                 qcow2_signal_corruption(bs, true, -1, -1, "Refblock offset %#"
3245                                         PRIx64 " unaligned (reftable index: %#"
3246                                         PRIx64 ")", refblock_offset,
3247                                         reftable_index);
3248                 error_setg(errp,
3249                            "Image is corrupt (unaligned refblock offset)");
3250                 return -EIO;
3251             }
3252 
3253             ret = qcow2_cache_get(bs, s->refcount_block_cache, refblock_offset,
3254                                   &refblock);
3255             if (ret < 0) {
3256                 error_setg_errno(errp, -ret, "Failed to retrieve refblock");
3257                 return ret;
3258             }
3259 
3260             for (refblock_index = 0; refblock_index < s->refcount_block_size;
3261                  refblock_index++)
3262             {
3263                 uint64_t refcount;
3264 
3265                 if (new_refblock_index >= new_refblock_size) {
3266                     /* new_refblock is now complete */
3267                     ret = operation(bs, new_reftable, *new_reftable_index,
3268                                     new_reftable_size, new_refblock,
3269                                     new_refblock_empty, allocated, errp);
3270                     if (ret < 0) {
3271                         qcow2_cache_put(s->refcount_block_cache, &refblock);
3272                         return ret;
3273                     }
3274 
3275                     (*new_reftable_index)++;
3276                     new_refblock_index = 0;
3277                     new_refblock_empty = true;
3278                 }
3279 
3280                 refcount = s->get_refcount(refblock, refblock_index);
3281                 if (new_refcount_bits < 64 && refcount >> new_refcount_bits) {
3282                     uint64_t offset;
3283 
3284                     qcow2_cache_put(s->refcount_block_cache, &refblock);
3285 
3286                     offset = ((reftable_index << s->refcount_block_bits)
3287                               + refblock_index) << s->cluster_bits;
3288 
3289                     error_setg(errp, "Cannot decrease refcount entry width to "
3290                                "%i bits: Cluster at offset %#" PRIx64 " has a "
3291                                "refcount of %" PRIu64, new_refcount_bits,
3292                                offset, refcount);
3293                     return -EINVAL;
3294                 }
3295 
3296                 if (new_set_refcount) {
3297                     new_set_refcount(new_refblock, new_refblock_index++,
3298                                      refcount);
3299                 } else {
3300                     new_refblock_index++;
3301                 }
3302                 new_refblock_empty = new_refblock_empty && refcount == 0;
3303             }
3304 
3305             qcow2_cache_put(s->refcount_block_cache, &refblock);
3306         } else {
3307             /* No refblock means every refcount is 0 */
3308             for (refblock_index = 0; refblock_index < s->refcount_block_size;
3309                  refblock_index++)
3310             {
3311                 if (new_refblock_index >= new_refblock_size) {
3312                     /* new_refblock is now complete */
3313                     ret = operation(bs, new_reftable, *new_reftable_index,
3314                                     new_reftable_size, new_refblock,
3315                                     new_refblock_empty, allocated, errp);
3316                     if (ret < 0) {
3317                         return ret;
3318                     }
3319 
3320                     (*new_reftable_index)++;
3321                     new_refblock_index = 0;
3322                     new_refblock_empty = true;
3323                 }
3324 
3325                 if (new_set_refcount) {
3326                     new_set_refcount(new_refblock, new_refblock_index++, 0);
3327                 } else {
3328                     new_refblock_index++;
3329                 }
3330             }
3331         }
3332     }
3333 
3334     if (new_refblock_index > 0) {
3335         /* Complete the potentially existing partially filled final refblock */
3336         if (new_set_refcount) {
3337             for (; new_refblock_index < new_refblock_size;
3338                  new_refblock_index++)
3339             {
3340                 new_set_refcount(new_refblock, new_refblock_index, 0);
3341             }
3342         }
3343 
3344         ret = operation(bs, new_reftable, *new_reftable_index,
3345                         new_reftable_size, new_refblock, new_refblock_empty,
3346                         allocated, errp);
3347         if (ret < 0) {
3348             return ret;
3349         }
3350 
3351         (*new_reftable_index)++;
3352     }
3353 
3354     status_cb(bs, (uint64_t)(index + 1) * s->refcount_table_size,
3355               (uint64_t)total * s->refcount_table_size, cb_opaque);
3356 
3357     return 0;
3358 }
3359 
3360 int qcow2_change_refcount_order(BlockDriverState *bs, int refcount_order,
3361                                 BlockDriverAmendStatusCB *status_cb,
3362                                 void *cb_opaque, Error **errp)
3363 {
3364     BDRVQcow2State *s = bs->opaque;
3365     Qcow2GetRefcountFunc *new_get_refcount;
3366     Qcow2SetRefcountFunc *new_set_refcount;
3367     void *new_refblock = qemu_blockalign(bs->file->bs, s->cluster_size);
3368     uint64_t *new_reftable = NULL, new_reftable_size = 0;
3369     uint64_t *old_reftable, old_reftable_size, old_reftable_offset;
3370     uint64_t new_reftable_index = 0;
3371     uint64_t i;
3372     int64_t new_reftable_offset = 0, allocated_reftable_size = 0;
3373     int new_refblock_size, new_refcount_bits = 1 << refcount_order;
3374     int old_refcount_order;
3375     int walk_index = 0;
3376     int ret;
3377     bool new_allocation;
3378 
3379     assert(s->qcow_version >= 3);
3380     assert(refcount_order >= 0 && refcount_order <= 6);
3381 
3382     /* see qcow2_open() */
3383     new_refblock_size = 1 << (s->cluster_bits - (refcount_order - 3));
3384 
3385     new_get_refcount = get_refcount_funcs[refcount_order];
3386     new_set_refcount = set_refcount_funcs[refcount_order];
3387 
3388 
3389     do {
3390         int total_walks;
3391 
3392         new_allocation = false;
3393 
3394         /* At least we have to do this walk and the one which writes the
3395          * refblocks; also, at least we have to do this loop here at least
3396          * twice (normally), first to do the allocations, and second to
3397          * determine that everything is correctly allocated, this then makes
3398          * three walks in total */
3399         total_walks = MAX(walk_index + 2, 3);
3400 
3401         /* First, allocate the structures so they are present in the refcount
3402          * structures */
3403         ret = walk_over_reftable(bs, &new_reftable, &new_reftable_index,
3404                                  &new_reftable_size, NULL, new_refblock_size,
3405                                  new_refcount_bits, &alloc_refblock,
3406                                  &new_allocation, NULL, status_cb, cb_opaque,
3407                                  walk_index++, total_walks, errp);
3408         if (ret < 0) {
3409             goto done;
3410         }
3411 
3412         new_reftable_index = 0;
3413 
3414         if (new_allocation) {
3415             if (new_reftable_offset) {
3416                 qcow2_free_clusters(
3417                     bs, new_reftable_offset,
3418                     allocated_reftable_size * REFTABLE_ENTRY_SIZE,
3419                     QCOW2_DISCARD_NEVER);
3420             }
3421 
3422             new_reftable_offset = qcow2_alloc_clusters(bs, new_reftable_size *
3423                                                            REFTABLE_ENTRY_SIZE);
3424             if (new_reftable_offset < 0) {
3425                 error_setg_errno(errp, -new_reftable_offset,
3426                                  "Failed to allocate the new reftable");
3427                 ret = new_reftable_offset;
3428                 goto done;
3429             }
3430             allocated_reftable_size = new_reftable_size;
3431         }
3432     } while (new_allocation);
3433 
3434     /* Second, write the new refblocks */
3435     ret = walk_over_reftable(bs, &new_reftable, &new_reftable_index,
3436                              &new_reftable_size, new_refblock,
3437                              new_refblock_size, new_refcount_bits,
3438                              &flush_refblock, &new_allocation, new_set_refcount,
3439                              status_cb, cb_opaque, walk_index, walk_index + 1,
3440                              errp);
3441     if (ret < 0) {
3442         goto done;
3443     }
3444     assert(!new_allocation);
3445 
3446 
3447     /* Write the new reftable */
3448     ret = qcow2_pre_write_overlap_check(bs, 0, new_reftable_offset,
3449                                         new_reftable_size * REFTABLE_ENTRY_SIZE,
3450                                         false);
3451     if (ret < 0) {
3452         error_setg_errno(errp, -ret, "Overlap check failed");
3453         goto done;
3454     }
3455 
3456     for (i = 0; i < new_reftable_size; i++) {
3457         cpu_to_be64s(&new_reftable[i]);
3458     }
3459 
3460     ret = bdrv_pwrite(bs->file, new_reftable_offset,
3461                       new_reftable_size * REFTABLE_ENTRY_SIZE, new_reftable,
3462                       0);
3463 
3464     for (i = 0; i < new_reftable_size; i++) {
3465         be64_to_cpus(&new_reftable[i]);
3466     }
3467 
3468     if (ret < 0) {
3469         error_setg_errno(errp, -ret, "Failed to write the new reftable");
3470         goto done;
3471     }
3472 
3473 
3474     /* Empty the refcount cache */
3475     ret = qcow2_cache_flush(bs, s->refcount_block_cache);
3476     if (ret < 0) {
3477         error_setg_errno(errp, -ret, "Failed to flush the refblock cache");
3478         goto done;
3479     }
3480 
3481     /* Update the image header to point to the new reftable; this only updates
3482      * the fields which are relevant to qcow2_update_header(); other fields
3483      * such as s->refcount_table or s->refcount_bits stay stale for now
3484      * (because we have to restore everything if qcow2_update_header() fails) */
3485     old_refcount_order  = s->refcount_order;
3486     old_reftable_size   = s->refcount_table_size;
3487     old_reftable_offset = s->refcount_table_offset;
3488 
3489     s->refcount_order        = refcount_order;
3490     s->refcount_table_size   = new_reftable_size;
3491     s->refcount_table_offset = new_reftable_offset;
3492 
3493     ret = qcow2_update_header(bs);
3494     if (ret < 0) {
3495         s->refcount_order        = old_refcount_order;
3496         s->refcount_table_size   = old_reftable_size;
3497         s->refcount_table_offset = old_reftable_offset;
3498         error_setg_errno(errp, -ret, "Failed to update the qcow2 header");
3499         goto done;
3500     }
3501 
3502     /* Now update the rest of the in-memory information */
3503     old_reftable = s->refcount_table;
3504     s->refcount_table = new_reftable;
3505     update_max_refcount_table_index(s);
3506 
3507     s->refcount_bits = 1 << refcount_order;
3508     s->refcount_max = UINT64_C(1) << (s->refcount_bits - 1);
3509     s->refcount_max += s->refcount_max - 1;
3510 
3511     s->refcount_block_bits = s->cluster_bits - (refcount_order - 3);
3512     s->refcount_block_size = 1 << s->refcount_block_bits;
3513 
3514     s->get_refcount = new_get_refcount;
3515     s->set_refcount = new_set_refcount;
3516 
3517     /* For cleaning up all old refblocks and the old reftable below the "done"
3518      * label */
3519     new_reftable        = old_reftable;
3520     new_reftable_size   = old_reftable_size;
3521     new_reftable_offset = old_reftable_offset;
3522 
3523 done:
3524     if (new_reftable) {
3525         /* On success, new_reftable actually points to the old reftable (and
3526          * new_reftable_size is the old reftable's size); but that is just
3527          * fine */
3528         for (i = 0; i < new_reftable_size; i++) {
3529             uint64_t offset = new_reftable[i] & REFT_OFFSET_MASK;
3530             if (offset) {
3531                 qcow2_free_clusters(bs, offset, s->cluster_size,
3532                                     QCOW2_DISCARD_OTHER);
3533             }
3534         }
3535         g_free(new_reftable);
3536 
3537         if (new_reftable_offset > 0) {
3538             qcow2_free_clusters(bs, new_reftable_offset,
3539                                 new_reftable_size * REFTABLE_ENTRY_SIZE,
3540                                 QCOW2_DISCARD_OTHER);
3541         }
3542     }
3543 
3544     qemu_vfree(new_refblock);
3545     return ret;
3546 }
3547 
3548 static int64_t coroutine_fn get_refblock_offset(BlockDriverState *bs,
3549                                                 uint64_t offset)
3550 {
3551     BDRVQcow2State *s = bs->opaque;
3552     uint32_t index = offset_to_reftable_index(s, offset);
3553     int64_t covering_refblock_offset = 0;
3554 
3555     if (index < s->refcount_table_size) {
3556         covering_refblock_offset = s->refcount_table[index] & REFT_OFFSET_MASK;
3557     }
3558     if (!covering_refblock_offset) {
3559         qcow2_signal_corruption(bs, true, -1, -1, "Refblock at %#" PRIx64 " is "
3560                                 "not covered by the refcount structures",
3561                                 offset);
3562         return -EIO;
3563     }
3564 
3565     return covering_refblock_offset;
3566 }
3567 
3568 static int coroutine_fn
3569 qcow2_discard_refcount_block(BlockDriverState *bs, uint64_t discard_block_offs)
3570 {
3571     BDRVQcow2State *s = bs->opaque;
3572     int64_t refblock_offs;
3573     uint64_t cluster_index = discard_block_offs >> s->cluster_bits;
3574     uint32_t block_index = cluster_index & (s->refcount_block_size - 1);
3575     void *refblock;
3576     int ret;
3577 
3578     refblock_offs = get_refblock_offset(bs, discard_block_offs);
3579     if (refblock_offs < 0) {
3580         return refblock_offs;
3581     }
3582 
3583     assert(discard_block_offs != 0);
3584 
3585     ret = qcow2_cache_get(bs, s->refcount_block_cache, refblock_offs,
3586                           &refblock);
3587     if (ret < 0) {
3588         return ret;
3589     }
3590 
3591     if (s->get_refcount(refblock, block_index) != 1) {
3592         qcow2_signal_corruption(bs, true, -1, -1, "Invalid refcount:"
3593                                 " refblock offset %#" PRIx64
3594                                 ", reftable index %u"
3595                                 ", block offset %#" PRIx64
3596                                 ", refcount %#" PRIx64,
3597                                 refblock_offs,
3598                                 offset_to_reftable_index(s, discard_block_offs),
3599                                 discard_block_offs,
3600                                 s->get_refcount(refblock, block_index));
3601         qcow2_cache_put(s->refcount_block_cache, &refblock);
3602         return -EINVAL;
3603     }
3604     s->set_refcount(refblock, block_index, 0);
3605 
3606     qcow2_cache_entry_mark_dirty(s->refcount_block_cache, refblock);
3607 
3608     qcow2_cache_put(s->refcount_block_cache, &refblock);
3609 
3610     if (cluster_index < s->free_cluster_index) {
3611         s->free_cluster_index = cluster_index;
3612     }
3613 
3614     refblock = qcow2_cache_is_table_offset(s->refcount_block_cache,
3615                                            discard_block_offs);
3616     if (refblock) {
3617         /* discard refblock from the cache if refblock is cached */
3618         qcow2_cache_discard(s->refcount_block_cache, refblock);
3619     }
3620     update_refcount_discard(bs, discard_block_offs, s->cluster_size);
3621 
3622     return 0;
3623 }
3624 
3625 int coroutine_fn qcow2_shrink_reftable(BlockDriverState *bs)
3626 {
3627     BDRVQcow2State *s = bs->opaque;
3628     uint64_t *reftable_tmp =
3629         g_malloc(s->refcount_table_size * REFTABLE_ENTRY_SIZE);
3630     int i, ret;
3631 
3632     for (i = 0; i < s->refcount_table_size; i++) {
3633         int64_t refblock_offs = s->refcount_table[i] & REFT_OFFSET_MASK;
3634         void *refblock;
3635         bool unused_block;
3636 
3637         if (refblock_offs == 0) {
3638             reftable_tmp[i] = 0;
3639             continue;
3640         }
3641         ret = qcow2_cache_get(bs, s->refcount_block_cache, refblock_offs,
3642                               &refblock);
3643         if (ret < 0) {
3644             goto out;
3645         }
3646 
3647         /* the refblock has own reference */
3648         if (i == offset_to_reftable_index(s, refblock_offs)) {
3649             uint64_t block_index = (refblock_offs >> s->cluster_bits) &
3650                                    (s->refcount_block_size - 1);
3651             uint64_t refcount = s->get_refcount(refblock, block_index);
3652 
3653             s->set_refcount(refblock, block_index, 0);
3654 
3655             unused_block = buffer_is_zero(refblock, s->cluster_size);
3656 
3657             s->set_refcount(refblock, block_index, refcount);
3658         } else {
3659             unused_block = buffer_is_zero(refblock, s->cluster_size);
3660         }
3661         qcow2_cache_put(s->refcount_block_cache, &refblock);
3662 
3663         reftable_tmp[i] = unused_block ? 0 : cpu_to_be64(s->refcount_table[i]);
3664     }
3665 
3666     ret = bdrv_co_pwrite_sync(bs->file, s->refcount_table_offset,
3667                               s->refcount_table_size * REFTABLE_ENTRY_SIZE,
3668                               reftable_tmp, 0);
3669     /*
3670      * If the write in the reftable failed the image may contain a partially
3671      * overwritten reftable. In this case it would be better to clear the
3672      * reftable in memory to avoid possible image corruption.
3673      */
3674     for (i = 0; i < s->refcount_table_size; i++) {
3675         if (s->refcount_table[i] && !reftable_tmp[i]) {
3676             if (ret == 0) {
3677                 ret = qcow2_discard_refcount_block(bs, s->refcount_table[i] &
3678                                                        REFT_OFFSET_MASK);
3679             }
3680             s->refcount_table[i] = 0;
3681         }
3682     }
3683 
3684     if (!s->cache_discards) {
3685         qcow2_process_discards(bs, ret);
3686     }
3687 
3688 out:
3689     g_free(reftable_tmp);
3690     return ret;
3691 }
3692 
3693 int64_t coroutine_fn qcow2_get_last_cluster(BlockDriverState *bs, int64_t size)
3694 {
3695     BDRVQcow2State *s = bs->opaque;
3696     int64_t i;
3697 
3698     for (i = size_to_clusters(s, size) - 1; i >= 0; i--) {
3699         uint64_t refcount;
3700         int ret = qcow2_get_refcount(bs, i, &refcount);
3701         if (ret < 0) {
3702             fprintf(stderr, "Can't get refcount for cluster %" PRId64 ": %s\n",
3703                     i, strerror(-ret));
3704             return ret;
3705         }
3706         if (refcount > 0) {
3707             return i;
3708         }
3709     }
3710     qcow2_signal_corruption(bs, true, -1, -1,
3711                             "There are no references in the refcount table.");
3712     return -EIO;
3713 }
3714 
3715 int coroutine_fn GRAPH_RDLOCK
3716 qcow2_detect_metadata_preallocation(BlockDriverState *bs)
3717 {
3718     BDRVQcow2State *s = bs->opaque;
3719     int64_t i, end_cluster, cluster_count = 0, threshold;
3720     int64_t file_length, real_allocation, real_clusters;
3721 
3722     qemu_co_mutex_assert_locked(&s->lock);
3723 
3724     file_length = bdrv_co_getlength(bs->file->bs);
3725     if (file_length < 0) {
3726         return file_length;
3727     }
3728 
3729     real_allocation = bdrv_co_get_allocated_file_size(bs->file->bs);
3730     if (real_allocation < 0) {
3731         return real_allocation;
3732     }
3733 
3734     real_clusters = real_allocation / s->cluster_size;
3735     threshold = MAX(real_clusters * 10 / 9, real_clusters + 2);
3736 
3737     end_cluster = size_to_clusters(s, file_length);
3738     for (i = 0; i < end_cluster && cluster_count < threshold; i++) {
3739         uint64_t refcount;
3740         int ret = qcow2_get_refcount(bs, i, &refcount);
3741         if (ret < 0) {
3742             return ret;
3743         }
3744         cluster_count += !!refcount;
3745     }
3746 
3747     return cluster_count >= threshold;
3748 }
3749