xref: /openbmc/qemu/block/qcow2-refcount.c (revision fcf5ef2a)
1 /*
2  * Block driver for the QCOW version 2 format
3  *
4  * Copyright (c) 2004-2006 Fabrice Bellard
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to deal
8  * in the Software without restriction, including without limitation the rights
9  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10  * copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in
14  * all copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22  * THE SOFTWARE.
23  */
24 
25 #include "qemu/osdep.h"
26 #include "qapi/error.h"
27 #include "qemu-common.h"
28 #include "block/block_int.h"
29 #include "block/qcow2.h"
30 #include "qemu/range.h"
31 #include "qemu/bswap.h"
32 
33 static int64_t alloc_clusters_noref(BlockDriverState *bs, uint64_t size);
34 static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
35                             int64_t offset, int64_t length, uint64_t addend,
36                             bool decrease, enum qcow2_discard_type type);
37 
38 static uint64_t get_refcount_ro0(const void *refcount_array, uint64_t index);
39 static uint64_t get_refcount_ro1(const void *refcount_array, uint64_t index);
40 static uint64_t get_refcount_ro2(const void *refcount_array, uint64_t index);
41 static uint64_t get_refcount_ro3(const void *refcount_array, uint64_t index);
42 static uint64_t get_refcount_ro4(const void *refcount_array, uint64_t index);
43 static uint64_t get_refcount_ro5(const void *refcount_array, uint64_t index);
44 static uint64_t get_refcount_ro6(const void *refcount_array, uint64_t index);
45 
46 static void set_refcount_ro0(void *refcount_array, uint64_t index,
47                              uint64_t value);
48 static void set_refcount_ro1(void *refcount_array, uint64_t index,
49                              uint64_t value);
50 static void set_refcount_ro2(void *refcount_array, uint64_t index,
51                              uint64_t value);
52 static void set_refcount_ro3(void *refcount_array, uint64_t index,
53                              uint64_t value);
54 static void set_refcount_ro4(void *refcount_array, uint64_t index,
55                              uint64_t value);
56 static void set_refcount_ro5(void *refcount_array, uint64_t index,
57                              uint64_t value);
58 static void set_refcount_ro6(void *refcount_array, uint64_t index,
59                              uint64_t value);
60 
61 
62 static Qcow2GetRefcountFunc *const get_refcount_funcs[] = {
63     &get_refcount_ro0,
64     &get_refcount_ro1,
65     &get_refcount_ro2,
66     &get_refcount_ro3,
67     &get_refcount_ro4,
68     &get_refcount_ro5,
69     &get_refcount_ro6
70 };
71 
72 static Qcow2SetRefcountFunc *const set_refcount_funcs[] = {
73     &set_refcount_ro0,
74     &set_refcount_ro1,
75     &set_refcount_ro2,
76     &set_refcount_ro3,
77     &set_refcount_ro4,
78     &set_refcount_ro5,
79     &set_refcount_ro6
80 };
81 
82 
83 /*********************************************************/
84 /* refcount handling */
85 
86 int qcow2_refcount_init(BlockDriverState *bs)
87 {
88     BDRVQcow2State *s = bs->opaque;
89     unsigned int refcount_table_size2, i;
90     int ret;
91 
92     assert(s->refcount_order >= 0 && s->refcount_order <= 6);
93 
94     s->get_refcount = get_refcount_funcs[s->refcount_order];
95     s->set_refcount = set_refcount_funcs[s->refcount_order];
96 
97     assert(s->refcount_table_size <= INT_MAX / sizeof(uint64_t));
98     refcount_table_size2 = s->refcount_table_size * sizeof(uint64_t);
99     s->refcount_table = g_try_malloc(refcount_table_size2);
100 
101     if (s->refcount_table_size > 0) {
102         if (s->refcount_table == NULL) {
103             ret = -ENOMEM;
104             goto fail;
105         }
106         BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_LOAD);
107         ret = bdrv_pread(bs->file, s->refcount_table_offset,
108                          s->refcount_table, refcount_table_size2);
109         if (ret < 0) {
110             goto fail;
111         }
112         for(i = 0; i < s->refcount_table_size; i++)
113             be64_to_cpus(&s->refcount_table[i]);
114     }
115     return 0;
116  fail:
117     return ret;
118 }
119 
120 void qcow2_refcount_close(BlockDriverState *bs)
121 {
122     BDRVQcow2State *s = bs->opaque;
123     g_free(s->refcount_table);
124 }
125 
126 
127 static uint64_t get_refcount_ro0(const void *refcount_array, uint64_t index)
128 {
129     return (((const uint8_t *)refcount_array)[index / 8] >> (index % 8)) & 0x1;
130 }
131 
132 static void set_refcount_ro0(void *refcount_array, uint64_t index,
133                              uint64_t value)
134 {
135     assert(!(value >> 1));
136     ((uint8_t *)refcount_array)[index / 8] &= ~(0x1 << (index % 8));
137     ((uint8_t *)refcount_array)[index / 8] |= value << (index % 8);
138 }
139 
140 static uint64_t get_refcount_ro1(const void *refcount_array, uint64_t index)
141 {
142     return (((const uint8_t *)refcount_array)[index / 4] >> (2 * (index % 4)))
143            & 0x3;
144 }
145 
146 static void set_refcount_ro1(void *refcount_array, uint64_t index,
147                              uint64_t value)
148 {
149     assert(!(value >> 2));
150     ((uint8_t *)refcount_array)[index / 4] &= ~(0x3 << (2 * (index % 4)));
151     ((uint8_t *)refcount_array)[index / 4] |= value << (2 * (index % 4));
152 }
153 
154 static uint64_t get_refcount_ro2(const void *refcount_array, uint64_t index)
155 {
156     return (((const uint8_t *)refcount_array)[index / 2] >> (4 * (index % 2)))
157            & 0xf;
158 }
159 
160 static void set_refcount_ro2(void *refcount_array, uint64_t index,
161                              uint64_t value)
162 {
163     assert(!(value >> 4));
164     ((uint8_t *)refcount_array)[index / 2] &= ~(0xf << (4 * (index % 2)));
165     ((uint8_t *)refcount_array)[index / 2] |= value << (4 * (index % 2));
166 }
167 
168 static uint64_t get_refcount_ro3(const void *refcount_array, uint64_t index)
169 {
170     return ((const uint8_t *)refcount_array)[index];
171 }
172 
173 static void set_refcount_ro3(void *refcount_array, uint64_t index,
174                              uint64_t value)
175 {
176     assert(!(value >> 8));
177     ((uint8_t *)refcount_array)[index] = value;
178 }
179 
180 static uint64_t get_refcount_ro4(const void *refcount_array, uint64_t index)
181 {
182     return be16_to_cpu(((const uint16_t *)refcount_array)[index]);
183 }
184 
185 static void set_refcount_ro4(void *refcount_array, uint64_t index,
186                              uint64_t value)
187 {
188     assert(!(value >> 16));
189     ((uint16_t *)refcount_array)[index] = cpu_to_be16(value);
190 }
191 
192 static uint64_t get_refcount_ro5(const void *refcount_array, uint64_t index)
193 {
194     return be32_to_cpu(((const uint32_t *)refcount_array)[index]);
195 }
196 
197 static void set_refcount_ro5(void *refcount_array, uint64_t index,
198                              uint64_t value)
199 {
200     assert(!(value >> 32));
201     ((uint32_t *)refcount_array)[index] = cpu_to_be32(value);
202 }
203 
204 static uint64_t get_refcount_ro6(const void *refcount_array, uint64_t index)
205 {
206     return be64_to_cpu(((const uint64_t *)refcount_array)[index]);
207 }
208 
209 static void set_refcount_ro6(void *refcount_array, uint64_t index,
210                              uint64_t value)
211 {
212     ((uint64_t *)refcount_array)[index] = cpu_to_be64(value);
213 }
214 
215 
216 static int load_refcount_block(BlockDriverState *bs,
217                                int64_t refcount_block_offset,
218                                void **refcount_block)
219 {
220     BDRVQcow2State *s = bs->opaque;
221 
222     BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_LOAD);
223     return qcow2_cache_get(bs, s->refcount_block_cache, refcount_block_offset,
224                            refcount_block);
225 }
226 
227 /*
228  * Retrieves the refcount of the cluster given by its index and stores it in
229  * *refcount. Returns 0 on success and -errno on failure.
230  */
231 int qcow2_get_refcount(BlockDriverState *bs, int64_t cluster_index,
232                        uint64_t *refcount)
233 {
234     BDRVQcow2State *s = bs->opaque;
235     uint64_t refcount_table_index, block_index;
236     int64_t refcount_block_offset;
237     int ret;
238     void *refcount_block;
239 
240     refcount_table_index = cluster_index >> s->refcount_block_bits;
241     if (refcount_table_index >= s->refcount_table_size) {
242         *refcount = 0;
243         return 0;
244     }
245     refcount_block_offset =
246         s->refcount_table[refcount_table_index] & REFT_OFFSET_MASK;
247     if (!refcount_block_offset) {
248         *refcount = 0;
249         return 0;
250     }
251 
252     if (offset_into_cluster(s, refcount_block_offset)) {
253         qcow2_signal_corruption(bs, true, -1, -1, "Refblock offset %#" PRIx64
254                                 " unaligned (reftable index: %#" PRIx64 ")",
255                                 refcount_block_offset, refcount_table_index);
256         return -EIO;
257     }
258 
259     ret = qcow2_cache_get(bs, s->refcount_block_cache, refcount_block_offset,
260                           &refcount_block);
261     if (ret < 0) {
262         return ret;
263     }
264 
265     block_index = cluster_index & (s->refcount_block_size - 1);
266     *refcount = s->get_refcount(refcount_block, block_index);
267 
268     qcow2_cache_put(bs, s->refcount_block_cache, &refcount_block);
269 
270     return 0;
271 }
272 
273 /*
274  * Rounds the refcount table size up to avoid growing the table for each single
275  * refcount block that is allocated.
276  */
277 static unsigned int next_refcount_table_size(BDRVQcow2State *s,
278     unsigned int min_size)
279 {
280     unsigned int min_clusters = (min_size >> (s->cluster_bits - 3)) + 1;
281     unsigned int refcount_table_clusters =
282         MAX(1, s->refcount_table_size >> (s->cluster_bits - 3));
283 
284     while (min_clusters > refcount_table_clusters) {
285         refcount_table_clusters = (refcount_table_clusters * 3 + 1) / 2;
286     }
287 
288     return refcount_table_clusters << (s->cluster_bits - 3);
289 }
290 
291 
292 /* Checks if two offsets are described by the same refcount block */
293 static int in_same_refcount_block(BDRVQcow2State *s, uint64_t offset_a,
294     uint64_t offset_b)
295 {
296     uint64_t block_a = offset_a >> (s->cluster_bits + s->refcount_block_bits);
297     uint64_t block_b = offset_b >> (s->cluster_bits + s->refcount_block_bits);
298 
299     return (block_a == block_b);
300 }
301 
302 /*
303  * Loads a refcount block. If it doesn't exist yet, it is allocated first
304  * (including growing the refcount table if needed).
305  *
306  * Returns 0 on success or -errno in error case
307  */
308 static int alloc_refcount_block(BlockDriverState *bs,
309                                 int64_t cluster_index, void **refcount_block)
310 {
311     BDRVQcow2State *s = bs->opaque;
312     unsigned int refcount_table_index;
313     int ret;
314 
315     BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC);
316 
317     /* Find the refcount block for the given cluster */
318     refcount_table_index = cluster_index >> s->refcount_block_bits;
319 
320     if (refcount_table_index < s->refcount_table_size) {
321 
322         uint64_t refcount_block_offset =
323             s->refcount_table[refcount_table_index] & REFT_OFFSET_MASK;
324 
325         /* If it's already there, we're done */
326         if (refcount_block_offset) {
327             if (offset_into_cluster(s, refcount_block_offset)) {
328                 qcow2_signal_corruption(bs, true, -1, -1, "Refblock offset %#"
329                                         PRIx64 " unaligned (reftable index: "
330                                         "%#x)", refcount_block_offset,
331                                         refcount_table_index);
332                 return -EIO;
333             }
334 
335              return load_refcount_block(bs, refcount_block_offset,
336                                         refcount_block);
337         }
338     }
339 
340     /*
341      * If we came here, we need to allocate something. Something is at least
342      * a cluster for the new refcount block. It may also include a new refcount
343      * table if the old refcount table is too small.
344      *
345      * Note that allocating clusters here needs some special care:
346      *
347      * - We can't use the normal qcow2_alloc_clusters(), it would try to
348      *   increase the refcount and very likely we would end up with an endless
349      *   recursion. Instead we must place the refcount blocks in a way that
350      *   they can describe them themselves.
351      *
352      * - We need to consider that at this point we are inside update_refcounts
353      *   and potentially doing an initial refcount increase. This means that
354      *   some clusters have already been allocated by the caller, but their
355      *   refcount isn't accurate yet. If we allocate clusters for metadata, we
356      *   need to return -EAGAIN to signal the caller that it needs to restart
357      *   the search for free clusters.
358      *
359      * - alloc_clusters_noref and qcow2_free_clusters may load a different
360      *   refcount block into the cache
361      */
362 
363     *refcount_block = NULL;
364 
365     /* We write to the refcount table, so we might depend on L2 tables */
366     ret = qcow2_cache_flush(bs, s->l2_table_cache);
367     if (ret < 0) {
368         return ret;
369     }
370 
371     /* Allocate the refcount block itself and mark it as used */
372     int64_t new_block = alloc_clusters_noref(bs, s->cluster_size);
373     if (new_block < 0) {
374         return new_block;
375     }
376 
377 #ifdef DEBUG_ALLOC2
378     fprintf(stderr, "qcow2: Allocate refcount block %d for %" PRIx64
379         " at %" PRIx64 "\n",
380         refcount_table_index, cluster_index << s->cluster_bits, new_block);
381 #endif
382 
383     if (in_same_refcount_block(s, new_block, cluster_index << s->cluster_bits)) {
384         /* Zero the new refcount block before updating it */
385         ret = qcow2_cache_get_empty(bs, s->refcount_block_cache, new_block,
386                                     refcount_block);
387         if (ret < 0) {
388             goto fail_block;
389         }
390 
391         memset(*refcount_block, 0, s->cluster_size);
392 
393         /* The block describes itself, need to update the cache */
394         int block_index = (new_block >> s->cluster_bits) &
395             (s->refcount_block_size - 1);
396         s->set_refcount(*refcount_block, block_index, 1);
397     } else {
398         /* Described somewhere else. This can recurse at most twice before we
399          * arrive at a block that describes itself. */
400         ret = update_refcount(bs, new_block, s->cluster_size, 1, false,
401                               QCOW2_DISCARD_NEVER);
402         if (ret < 0) {
403             goto fail_block;
404         }
405 
406         ret = qcow2_cache_flush(bs, s->refcount_block_cache);
407         if (ret < 0) {
408             goto fail_block;
409         }
410 
411         /* Initialize the new refcount block only after updating its refcount,
412          * update_refcount uses the refcount cache itself */
413         ret = qcow2_cache_get_empty(bs, s->refcount_block_cache, new_block,
414                                     refcount_block);
415         if (ret < 0) {
416             goto fail_block;
417         }
418 
419         memset(*refcount_block, 0, s->cluster_size);
420     }
421 
422     /* Now the new refcount block needs to be written to disk */
423     BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE);
424     qcow2_cache_entry_mark_dirty(bs, s->refcount_block_cache, *refcount_block);
425     ret = qcow2_cache_flush(bs, s->refcount_block_cache);
426     if (ret < 0) {
427         goto fail_block;
428     }
429 
430     /* If the refcount table is big enough, just hook the block up there */
431     if (refcount_table_index < s->refcount_table_size) {
432         uint64_t data64 = cpu_to_be64(new_block);
433         BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_HOOKUP);
434         ret = bdrv_pwrite_sync(bs->file,
435             s->refcount_table_offset + refcount_table_index * sizeof(uint64_t),
436             &data64, sizeof(data64));
437         if (ret < 0) {
438             goto fail_block;
439         }
440 
441         s->refcount_table[refcount_table_index] = new_block;
442 
443         /* The new refcount block may be where the caller intended to put its
444          * data, so let it restart the search. */
445         return -EAGAIN;
446     }
447 
448     qcow2_cache_put(bs, s->refcount_block_cache, refcount_block);
449 
450     /*
451      * If we come here, we need to grow the refcount table. Again, a new
452      * refcount table needs some space and we can't simply allocate to avoid
453      * endless recursion.
454      *
455      * Therefore let's grab new refcount blocks at the end of the image, which
456      * will describe themselves and the new refcount table. This way we can
457      * reference them only in the new table and do the switch to the new
458      * refcount table at once without producing an inconsistent state in
459      * between.
460      */
461     BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_GROW);
462 
463     /* Calculate the number of refcount blocks needed so far; this will be the
464      * basis for calculating the index of the first cluster used for the
465      * self-describing refcount structures which we are about to create.
466      *
467      * Because we reached this point, there cannot be any refcount entries for
468      * cluster_index or higher indices yet. However, because new_block has been
469      * allocated to describe that cluster (and it will assume this role later
470      * on), we cannot use that index; also, new_block may actually have a higher
471      * cluster index than cluster_index, so it needs to be taken into account
472      * here (and 1 needs to be added to its value because that cluster is used).
473      */
474     uint64_t blocks_used = DIV_ROUND_UP(MAX(cluster_index + 1,
475                                             (new_block >> s->cluster_bits) + 1),
476                                         s->refcount_block_size);
477 
478     if (blocks_used > QCOW_MAX_REFTABLE_SIZE / sizeof(uint64_t)) {
479         return -EFBIG;
480     }
481 
482     /* And now we need at least one block more for the new metadata */
483     uint64_t table_size = next_refcount_table_size(s, blocks_used + 1);
484     uint64_t last_table_size;
485     uint64_t blocks_clusters;
486     do {
487         uint64_t table_clusters =
488             size_to_clusters(s, table_size * sizeof(uint64_t));
489         blocks_clusters = 1 +
490             DIV_ROUND_UP(table_clusters, s->refcount_block_size);
491         uint64_t meta_clusters = table_clusters + blocks_clusters;
492 
493         last_table_size = table_size;
494         table_size = next_refcount_table_size(s, blocks_used +
495             DIV_ROUND_UP(meta_clusters, s->refcount_block_size));
496 
497     } while (last_table_size != table_size);
498 
499 #ifdef DEBUG_ALLOC2
500     fprintf(stderr, "qcow2: Grow refcount table %" PRId32 " => %" PRId64 "\n",
501         s->refcount_table_size, table_size);
502 #endif
503 
504     /* Create the new refcount table and blocks */
505     uint64_t meta_offset = (blocks_used * s->refcount_block_size) *
506         s->cluster_size;
507     uint64_t table_offset = meta_offset + blocks_clusters * s->cluster_size;
508     uint64_t *new_table = g_try_new0(uint64_t, table_size);
509     void *new_blocks = g_try_malloc0(blocks_clusters * s->cluster_size);
510 
511     assert(table_size > 0 && blocks_clusters > 0);
512     if (new_table == NULL || new_blocks == NULL) {
513         ret = -ENOMEM;
514         goto fail_table;
515     }
516 
517     /* Fill the new refcount table */
518     memcpy(new_table, s->refcount_table,
519         s->refcount_table_size * sizeof(uint64_t));
520     new_table[refcount_table_index] = new_block;
521 
522     int i;
523     for (i = 0; i < blocks_clusters; i++) {
524         new_table[blocks_used + i] = meta_offset + (i * s->cluster_size);
525     }
526 
527     /* Fill the refcount blocks */
528     uint64_t table_clusters = size_to_clusters(s, table_size * sizeof(uint64_t));
529     int block = 0;
530     for (i = 0; i < table_clusters + blocks_clusters; i++) {
531         s->set_refcount(new_blocks, block++, 1);
532     }
533 
534     /* Write refcount blocks to disk */
535     BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE_BLOCKS);
536     ret = bdrv_pwrite_sync(bs->file, meta_offset, new_blocks,
537         blocks_clusters * s->cluster_size);
538     g_free(new_blocks);
539     new_blocks = NULL;
540     if (ret < 0) {
541         goto fail_table;
542     }
543 
544     /* Write refcount table to disk */
545     for(i = 0; i < table_size; i++) {
546         cpu_to_be64s(&new_table[i]);
547     }
548 
549     BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE_TABLE);
550     ret = bdrv_pwrite_sync(bs->file, table_offset, new_table,
551         table_size * sizeof(uint64_t));
552     if (ret < 0) {
553         goto fail_table;
554     }
555 
556     for(i = 0; i < table_size; i++) {
557         be64_to_cpus(&new_table[i]);
558     }
559 
560     /* Hook up the new refcount table in the qcow2 header */
561     struct QEMU_PACKED {
562         uint64_t d64;
563         uint32_t d32;
564     } data;
565     data.d64 = cpu_to_be64(table_offset);
566     data.d32 = cpu_to_be32(table_clusters);
567     BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_SWITCH_TABLE);
568     ret = bdrv_pwrite_sync(bs->file,
569                            offsetof(QCowHeader, refcount_table_offset),
570                            &data, sizeof(data));
571     if (ret < 0) {
572         goto fail_table;
573     }
574 
575     /* And switch it in memory */
576     uint64_t old_table_offset = s->refcount_table_offset;
577     uint64_t old_table_size = s->refcount_table_size;
578 
579     g_free(s->refcount_table);
580     s->refcount_table = new_table;
581     s->refcount_table_size = table_size;
582     s->refcount_table_offset = table_offset;
583 
584     /* Free old table. */
585     qcow2_free_clusters(bs, old_table_offset, old_table_size * sizeof(uint64_t),
586                         QCOW2_DISCARD_OTHER);
587 
588     ret = load_refcount_block(bs, new_block, refcount_block);
589     if (ret < 0) {
590         return ret;
591     }
592 
593     /* If we were trying to do the initial refcount update for some cluster
594      * allocation, we might have used the same clusters to store newly
595      * allocated metadata. Make the caller search some new space. */
596     return -EAGAIN;
597 
598 fail_table:
599     g_free(new_blocks);
600     g_free(new_table);
601 fail_block:
602     if (*refcount_block != NULL) {
603         qcow2_cache_put(bs, s->refcount_block_cache, refcount_block);
604     }
605     return ret;
606 }
607 
608 void qcow2_process_discards(BlockDriverState *bs, int ret)
609 {
610     BDRVQcow2State *s = bs->opaque;
611     Qcow2DiscardRegion *d, *next;
612 
613     QTAILQ_FOREACH_SAFE(d, &s->discards, next, next) {
614         QTAILQ_REMOVE(&s->discards, d, next);
615 
616         /* Discard is optional, ignore the return value */
617         if (ret >= 0) {
618             bdrv_pdiscard(bs->file->bs, d->offset, d->bytes);
619         }
620 
621         g_free(d);
622     }
623 }
624 
625 static void update_refcount_discard(BlockDriverState *bs,
626                                     uint64_t offset, uint64_t length)
627 {
628     BDRVQcow2State *s = bs->opaque;
629     Qcow2DiscardRegion *d, *p, *next;
630 
631     QTAILQ_FOREACH(d, &s->discards, next) {
632         uint64_t new_start = MIN(offset, d->offset);
633         uint64_t new_end = MAX(offset + length, d->offset + d->bytes);
634 
635         if (new_end - new_start <= length + d->bytes) {
636             /* There can't be any overlap, areas ending up here have no
637              * references any more and therefore shouldn't get freed another
638              * time. */
639             assert(d->bytes + length == new_end - new_start);
640             d->offset = new_start;
641             d->bytes = new_end - new_start;
642             goto found;
643         }
644     }
645 
646     d = g_malloc(sizeof(*d));
647     *d = (Qcow2DiscardRegion) {
648         .bs     = bs,
649         .offset = offset,
650         .bytes  = length,
651     };
652     QTAILQ_INSERT_TAIL(&s->discards, d, next);
653 
654 found:
655     /* Merge discard requests if they are adjacent now */
656     QTAILQ_FOREACH_SAFE(p, &s->discards, next, next) {
657         if (p == d
658             || p->offset > d->offset + d->bytes
659             || d->offset > p->offset + p->bytes)
660         {
661             continue;
662         }
663 
664         /* Still no overlap possible */
665         assert(p->offset == d->offset + d->bytes
666             || d->offset == p->offset + p->bytes);
667 
668         QTAILQ_REMOVE(&s->discards, p, next);
669         d->offset = MIN(d->offset, p->offset);
670         d->bytes += p->bytes;
671         g_free(p);
672     }
673 }
674 
675 /* XXX: cache several refcount block clusters ? */
676 /* @addend is the absolute value of the addend; if @decrease is set, @addend
677  * will be subtracted from the current refcount, otherwise it will be added */
678 static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
679                                                    int64_t offset,
680                                                    int64_t length,
681                                                    uint64_t addend,
682                                                    bool decrease,
683                                                    enum qcow2_discard_type type)
684 {
685     BDRVQcow2State *s = bs->opaque;
686     int64_t start, last, cluster_offset;
687     void *refcount_block = NULL;
688     int64_t old_table_index = -1;
689     int ret;
690 
691 #ifdef DEBUG_ALLOC2
692     fprintf(stderr, "update_refcount: offset=%" PRId64 " size=%" PRId64
693             " addend=%s%" PRIu64 "\n", offset, length, decrease ? "-" : "",
694             addend);
695 #endif
696     if (length < 0) {
697         return -EINVAL;
698     } else if (length == 0) {
699         return 0;
700     }
701 
702     if (decrease) {
703         qcow2_cache_set_dependency(bs, s->refcount_block_cache,
704             s->l2_table_cache);
705     }
706 
707     start = start_of_cluster(s, offset);
708     last = start_of_cluster(s, offset + length - 1);
709     for(cluster_offset = start; cluster_offset <= last;
710         cluster_offset += s->cluster_size)
711     {
712         int block_index;
713         uint64_t refcount;
714         int64_t cluster_index = cluster_offset >> s->cluster_bits;
715         int64_t table_index = cluster_index >> s->refcount_block_bits;
716 
717         /* Load the refcount block and allocate it if needed */
718         if (table_index != old_table_index) {
719             if (refcount_block) {
720                 qcow2_cache_put(bs, s->refcount_block_cache, &refcount_block);
721             }
722             ret = alloc_refcount_block(bs, cluster_index, &refcount_block);
723             if (ret < 0) {
724                 goto fail;
725             }
726         }
727         old_table_index = table_index;
728 
729         qcow2_cache_entry_mark_dirty(bs, s->refcount_block_cache,
730                                      refcount_block);
731 
732         /* we can update the count and save it */
733         block_index = cluster_index & (s->refcount_block_size - 1);
734 
735         refcount = s->get_refcount(refcount_block, block_index);
736         if (decrease ? (refcount - addend > refcount)
737                      : (refcount + addend < refcount ||
738                         refcount + addend > s->refcount_max))
739         {
740             ret = -EINVAL;
741             goto fail;
742         }
743         if (decrease) {
744             refcount -= addend;
745         } else {
746             refcount += addend;
747         }
748         if (refcount == 0 && cluster_index < s->free_cluster_index) {
749             s->free_cluster_index = cluster_index;
750         }
751         s->set_refcount(refcount_block, block_index, refcount);
752 
753         if (refcount == 0 && s->discard_passthrough[type]) {
754             update_refcount_discard(bs, cluster_offset, s->cluster_size);
755         }
756     }
757 
758     ret = 0;
759 fail:
760     if (!s->cache_discards) {
761         qcow2_process_discards(bs, ret);
762     }
763 
764     /* Write last changed block to disk */
765     if (refcount_block) {
766         qcow2_cache_put(bs, s->refcount_block_cache, &refcount_block);
767     }
768 
769     /*
770      * Try do undo any updates if an error is returned (This may succeed in
771      * some cases like ENOSPC for allocating a new refcount block)
772      */
773     if (ret < 0) {
774         int dummy;
775         dummy = update_refcount(bs, offset, cluster_offset - offset, addend,
776                                 !decrease, QCOW2_DISCARD_NEVER);
777         (void)dummy;
778     }
779 
780     return ret;
781 }
782 
783 /*
784  * Increases or decreases the refcount of a given cluster.
785  *
786  * @addend is the absolute value of the addend; if @decrease is set, @addend
787  * will be subtracted from the current refcount, otherwise it will be added.
788  *
789  * On success 0 is returned; on failure -errno is returned.
790  */
791 int qcow2_update_cluster_refcount(BlockDriverState *bs,
792                                   int64_t cluster_index,
793                                   uint64_t addend, bool decrease,
794                                   enum qcow2_discard_type type)
795 {
796     BDRVQcow2State *s = bs->opaque;
797     int ret;
798 
799     ret = update_refcount(bs, cluster_index << s->cluster_bits, 1, addend,
800                           decrease, type);
801     if (ret < 0) {
802         return ret;
803     }
804 
805     return 0;
806 }
807 
808 
809 
810 /*********************************************************/
811 /* cluster allocation functions */
812 
813 
814 
815 /* return < 0 if error */
816 static int64_t alloc_clusters_noref(BlockDriverState *bs, uint64_t size)
817 {
818     BDRVQcow2State *s = bs->opaque;
819     uint64_t i, nb_clusters, refcount;
820     int ret;
821 
822     /* We can't allocate clusters if they may still be queued for discard. */
823     if (s->cache_discards) {
824         qcow2_process_discards(bs, 0);
825     }
826 
827     nb_clusters = size_to_clusters(s, size);
828 retry:
829     for(i = 0; i < nb_clusters; i++) {
830         uint64_t next_cluster_index = s->free_cluster_index++;
831         ret = qcow2_get_refcount(bs, next_cluster_index, &refcount);
832 
833         if (ret < 0) {
834             return ret;
835         } else if (refcount != 0) {
836             goto retry;
837         }
838     }
839 
840     /* Make sure that all offsets in the "allocated" range are representable
841      * in an int64_t */
842     if (s->free_cluster_index > 0 &&
843         s->free_cluster_index - 1 > (INT64_MAX >> s->cluster_bits))
844     {
845         return -EFBIG;
846     }
847 
848 #ifdef DEBUG_ALLOC2
849     fprintf(stderr, "alloc_clusters: size=%" PRId64 " -> %" PRId64 "\n",
850             size,
851             (s->free_cluster_index - nb_clusters) << s->cluster_bits);
852 #endif
853     return (s->free_cluster_index - nb_clusters) << s->cluster_bits;
854 }
855 
856 int64_t qcow2_alloc_clusters(BlockDriverState *bs, uint64_t size)
857 {
858     int64_t offset;
859     int ret;
860 
861     BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC);
862     do {
863         offset = alloc_clusters_noref(bs, size);
864         if (offset < 0) {
865             return offset;
866         }
867 
868         ret = update_refcount(bs, offset, size, 1, false, QCOW2_DISCARD_NEVER);
869     } while (ret == -EAGAIN);
870 
871     if (ret < 0) {
872         return ret;
873     }
874 
875     return offset;
876 }
877 
878 int64_t qcow2_alloc_clusters_at(BlockDriverState *bs, uint64_t offset,
879                                 int64_t nb_clusters)
880 {
881     BDRVQcow2State *s = bs->opaque;
882     uint64_t cluster_index, refcount;
883     uint64_t i;
884     int ret;
885 
886     assert(nb_clusters >= 0);
887     if (nb_clusters == 0) {
888         return 0;
889     }
890 
891     do {
892         /* Check how many clusters there are free */
893         cluster_index = offset >> s->cluster_bits;
894         for(i = 0; i < nb_clusters; i++) {
895             ret = qcow2_get_refcount(bs, cluster_index++, &refcount);
896             if (ret < 0) {
897                 return ret;
898             } else if (refcount != 0) {
899                 break;
900             }
901         }
902 
903         /* And then allocate them */
904         ret = update_refcount(bs, offset, i << s->cluster_bits, 1, false,
905                               QCOW2_DISCARD_NEVER);
906     } while (ret == -EAGAIN);
907 
908     if (ret < 0) {
909         return ret;
910     }
911 
912     return i;
913 }
914 
915 /* only used to allocate compressed sectors. We try to allocate
916    contiguous sectors. size must be <= cluster_size */
917 int64_t qcow2_alloc_bytes(BlockDriverState *bs, int size)
918 {
919     BDRVQcow2State *s = bs->opaque;
920     int64_t offset;
921     size_t free_in_cluster;
922     int ret;
923 
924     BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC_BYTES);
925     assert(size > 0 && size <= s->cluster_size);
926     assert(!s->free_byte_offset || offset_into_cluster(s, s->free_byte_offset));
927 
928     offset = s->free_byte_offset;
929 
930     if (offset) {
931         uint64_t refcount;
932         ret = qcow2_get_refcount(bs, offset >> s->cluster_bits, &refcount);
933         if (ret < 0) {
934             return ret;
935         }
936 
937         if (refcount == s->refcount_max) {
938             offset = 0;
939         }
940     }
941 
942     free_in_cluster = s->cluster_size - offset_into_cluster(s, offset);
943     do {
944         if (!offset || free_in_cluster < size) {
945             int64_t new_cluster = alloc_clusters_noref(bs, s->cluster_size);
946             if (new_cluster < 0) {
947                 return new_cluster;
948             }
949 
950             if (!offset || ROUND_UP(offset, s->cluster_size) != new_cluster) {
951                 offset = new_cluster;
952                 free_in_cluster = s->cluster_size;
953             } else {
954                 free_in_cluster += s->cluster_size;
955             }
956         }
957 
958         assert(offset);
959         ret = update_refcount(bs, offset, size, 1, false, QCOW2_DISCARD_NEVER);
960         if (ret < 0) {
961             offset = 0;
962         }
963     } while (ret == -EAGAIN);
964     if (ret < 0) {
965         return ret;
966     }
967 
968     /* The cluster refcount was incremented; refcount blocks must be flushed
969      * before the caller's L2 table updates. */
970     qcow2_cache_set_dependency(bs, s->l2_table_cache, s->refcount_block_cache);
971 
972     s->free_byte_offset = offset + size;
973     if (!offset_into_cluster(s, s->free_byte_offset)) {
974         s->free_byte_offset = 0;
975     }
976 
977     return offset;
978 }
979 
980 void qcow2_free_clusters(BlockDriverState *bs,
981                           int64_t offset, int64_t size,
982                           enum qcow2_discard_type type)
983 {
984     int ret;
985 
986     BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_FREE);
987     ret = update_refcount(bs, offset, size, 1, true, type);
988     if (ret < 0) {
989         fprintf(stderr, "qcow2_free_clusters failed: %s\n", strerror(-ret));
990         /* TODO Remember the clusters to free them later and avoid leaking */
991     }
992 }
993 
994 /*
995  * Free a cluster using its L2 entry (handles clusters of all types, e.g.
996  * normal cluster, compressed cluster, etc.)
997  */
998 void qcow2_free_any_clusters(BlockDriverState *bs, uint64_t l2_entry,
999                              int nb_clusters, enum qcow2_discard_type type)
1000 {
1001     BDRVQcow2State *s = bs->opaque;
1002 
1003     switch (qcow2_get_cluster_type(l2_entry)) {
1004     case QCOW2_CLUSTER_COMPRESSED:
1005         {
1006             int nb_csectors;
1007             nb_csectors = ((l2_entry >> s->csize_shift) &
1008                            s->csize_mask) + 1;
1009             qcow2_free_clusters(bs,
1010                 (l2_entry & s->cluster_offset_mask) & ~511,
1011                 nb_csectors * 512, type);
1012         }
1013         break;
1014     case QCOW2_CLUSTER_NORMAL:
1015     case QCOW2_CLUSTER_ZERO:
1016         if (l2_entry & L2E_OFFSET_MASK) {
1017             if (offset_into_cluster(s, l2_entry & L2E_OFFSET_MASK)) {
1018                 qcow2_signal_corruption(bs, false, -1, -1,
1019                                         "Cannot free unaligned cluster %#llx",
1020                                         l2_entry & L2E_OFFSET_MASK);
1021             } else {
1022                 qcow2_free_clusters(bs, l2_entry & L2E_OFFSET_MASK,
1023                                     nb_clusters << s->cluster_bits, type);
1024             }
1025         }
1026         break;
1027     case QCOW2_CLUSTER_UNALLOCATED:
1028         break;
1029     default:
1030         abort();
1031     }
1032 }
1033 
1034 
1035 
1036 /*********************************************************/
1037 /* snapshots and image creation */
1038 
1039 
1040 
1041 /* update the refcounts of snapshots and the copied flag */
1042 int qcow2_update_snapshot_refcount(BlockDriverState *bs,
1043     int64_t l1_table_offset, int l1_size, int addend)
1044 {
1045     BDRVQcow2State *s = bs->opaque;
1046     uint64_t *l1_table, *l2_table, l2_offset, offset, l1_size2, refcount;
1047     bool l1_allocated = false;
1048     int64_t old_offset, old_l2_offset;
1049     int i, j, l1_modified = 0, nb_csectors;
1050     int ret;
1051 
1052     assert(addend >= -1 && addend <= 1);
1053 
1054     l2_table = NULL;
1055     l1_table = NULL;
1056     l1_size2 = l1_size * sizeof(uint64_t);
1057 
1058     s->cache_discards = true;
1059 
1060     /* WARNING: qcow2_snapshot_goto relies on this function not using the
1061      * l1_table_offset when it is the current s->l1_table_offset! Be careful
1062      * when changing this! */
1063     if (l1_table_offset != s->l1_table_offset) {
1064         l1_table = g_try_malloc0(align_offset(l1_size2, 512));
1065         if (l1_size2 && l1_table == NULL) {
1066             ret = -ENOMEM;
1067             goto fail;
1068         }
1069         l1_allocated = true;
1070 
1071         ret = bdrv_pread(bs->file, l1_table_offset, l1_table, l1_size2);
1072         if (ret < 0) {
1073             goto fail;
1074         }
1075 
1076         for(i = 0;i < l1_size; i++)
1077             be64_to_cpus(&l1_table[i]);
1078     } else {
1079         assert(l1_size == s->l1_size);
1080         l1_table = s->l1_table;
1081         l1_allocated = false;
1082     }
1083 
1084     for(i = 0; i < l1_size; i++) {
1085         l2_offset = l1_table[i];
1086         if (l2_offset) {
1087             old_l2_offset = l2_offset;
1088             l2_offset &= L1E_OFFSET_MASK;
1089 
1090             if (offset_into_cluster(s, l2_offset)) {
1091                 qcow2_signal_corruption(bs, true, -1, -1, "L2 table offset %#"
1092                                         PRIx64 " unaligned (L1 index: %#x)",
1093                                         l2_offset, i);
1094                 ret = -EIO;
1095                 goto fail;
1096             }
1097 
1098             ret = qcow2_cache_get(bs, s->l2_table_cache, l2_offset,
1099                 (void**) &l2_table);
1100             if (ret < 0) {
1101                 goto fail;
1102             }
1103 
1104             for(j = 0; j < s->l2_size; j++) {
1105                 uint64_t cluster_index;
1106 
1107                 offset = be64_to_cpu(l2_table[j]);
1108                 old_offset = offset;
1109                 offset &= ~QCOW_OFLAG_COPIED;
1110 
1111                 switch (qcow2_get_cluster_type(offset)) {
1112                     case QCOW2_CLUSTER_COMPRESSED:
1113                         nb_csectors = ((offset >> s->csize_shift) &
1114                                        s->csize_mask) + 1;
1115                         if (addend != 0) {
1116                             ret = update_refcount(bs,
1117                                 (offset & s->cluster_offset_mask) & ~511,
1118                                 nb_csectors * 512, abs(addend), addend < 0,
1119                                 QCOW2_DISCARD_SNAPSHOT);
1120                             if (ret < 0) {
1121                                 goto fail;
1122                             }
1123                         }
1124                         /* compressed clusters are never modified */
1125                         refcount = 2;
1126                         break;
1127 
1128                     case QCOW2_CLUSTER_NORMAL:
1129                     case QCOW2_CLUSTER_ZERO:
1130                         if (offset_into_cluster(s, offset & L2E_OFFSET_MASK)) {
1131                             qcow2_signal_corruption(bs, true, -1, -1, "Data "
1132                                                     "cluster offset %#llx "
1133                                                     "unaligned (L2 offset: %#"
1134                                                     PRIx64 ", L2 index: %#x)",
1135                                                     offset & L2E_OFFSET_MASK,
1136                                                     l2_offset, j);
1137                             ret = -EIO;
1138                             goto fail;
1139                         }
1140 
1141                         cluster_index = (offset & L2E_OFFSET_MASK) >> s->cluster_bits;
1142                         if (!cluster_index) {
1143                             /* unallocated */
1144                             refcount = 0;
1145                             break;
1146                         }
1147                         if (addend != 0) {
1148                             ret = qcow2_update_cluster_refcount(bs,
1149                                     cluster_index, abs(addend), addend < 0,
1150                                     QCOW2_DISCARD_SNAPSHOT);
1151                             if (ret < 0) {
1152                                 goto fail;
1153                             }
1154                         }
1155 
1156                         ret = qcow2_get_refcount(bs, cluster_index, &refcount);
1157                         if (ret < 0) {
1158                             goto fail;
1159                         }
1160                         break;
1161 
1162                     case QCOW2_CLUSTER_UNALLOCATED:
1163                         refcount = 0;
1164                         break;
1165 
1166                     default:
1167                         abort();
1168                 }
1169 
1170                 if (refcount == 1) {
1171                     offset |= QCOW_OFLAG_COPIED;
1172                 }
1173                 if (offset != old_offset) {
1174                     if (addend > 0) {
1175                         qcow2_cache_set_dependency(bs, s->l2_table_cache,
1176                             s->refcount_block_cache);
1177                     }
1178                     l2_table[j] = cpu_to_be64(offset);
1179                     qcow2_cache_entry_mark_dirty(bs, s->l2_table_cache,
1180                                                  l2_table);
1181                 }
1182             }
1183 
1184             qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table);
1185 
1186             if (addend != 0) {
1187                 ret = qcow2_update_cluster_refcount(bs, l2_offset >>
1188                                                         s->cluster_bits,
1189                                                     abs(addend), addend < 0,
1190                                                     QCOW2_DISCARD_SNAPSHOT);
1191                 if (ret < 0) {
1192                     goto fail;
1193                 }
1194             }
1195             ret = qcow2_get_refcount(bs, l2_offset >> s->cluster_bits,
1196                                      &refcount);
1197             if (ret < 0) {
1198                 goto fail;
1199             } else if (refcount == 1) {
1200                 l2_offset |= QCOW_OFLAG_COPIED;
1201             }
1202             if (l2_offset != old_l2_offset) {
1203                 l1_table[i] = l2_offset;
1204                 l1_modified = 1;
1205             }
1206         }
1207     }
1208 
1209     ret = bdrv_flush(bs);
1210 fail:
1211     if (l2_table) {
1212         qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
1213     }
1214 
1215     s->cache_discards = false;
1216     qcow2_process_discards(bs, ret);
1217 
1218     /* Update L1 only if it isn't deleted anyway (addend = -1) */
1219     if (ret == 0 && addend >= 0 && l1_modified) {
1220         for (i = 0; i < l1_size; i++) {
1221             cpu_to_be64s(&l1_table[i]);
1222         }
1223 
1224         ret = bdrv_pwrite_sync(bs->file, l1_table_offset,
1225                                l1_table, l1_size2);
1226 
1227         for (i = 0; i < l1_size; i++) {
1228             be64_to_cpus(&l1_table[i]);
1229         }
1230     }
1231     if (l1_allocated)
1232         g_free(l1_table);
1233     return ret;
1234 }
1235 
1236 
1237 
1238 
1239 /*********************************************************/
1240 /* refcount checking functions */
1241 
1242 
1243 static uint64_t refcount_array_byte_size(BDRVQcow2State *s, uint64_t entries)
1244 {
1245     /* This assertion holds because there is no way we can address more than
1246      * 2^(64 - 9) clusters at once (with cluster size 512 = 2^9, and because
1247      * offsets have to be representable in bytes); due to every cluster
1248      * corresponding to one refcount entry, we are well below that limit */
1249     assert(entries < (UINT64_C(1) << (64 - 9)));
1250 
1251     /* Thanks to the assertion this will not overflow, because
1252      * s->refcount_order < 7.
1253      * (note: x << s->refcount_order == x * s->refcount_bits) */
1254     return DIV_ROUND_UP(entries << s->refcount_order, 8);
1255 }
1256 
1257 /**
1258  * Reallocates *array so that it can hold new_size entries. *size must contain
1259  * the current number of entries in *array. If the reallocation fails, *array
1260  * and *size will not be modified and -errno will be returned. If the
1261  * reallocation is successful, *array will be set to the new buffer, *size
1262  * will be set to new_size and 0 will be returned. The size of the reallocated
1263  * refcount array buffer will be aligned to a cluster boundary, and the newly
1264  * allocated area will be zeroed.
1265  */
1266 static int realloc_refcount_array(BDRVQcow2State *s, void **array,
1267                                   int64_t *size, int64_t new_size)
1268 {
1269     int64_t old_byte_size, new_byte_size;
1270     void *new_ptr;
1271 
1272     /* Round to clusters so the array can be directly written to disk */
1273     old_byte_size = size_to_clusters(s, refcount_array_byte_size(s, *size))
1274                     * s->cluster_size;
1275     new_byte_size = size_to_clusters(s, refcount_array_byte_size(s, new_size))
1276                     * s->cluster_size;
1277 
1278     if (new_byte_size == old_byte_size) {
1279         *size = new_size;
1280         return 0;
1281     }
1282 
1283     assert(new_byte_size > 0);
1284 
1285     if (new_byte_size > SIZE_MAX) {
1286         return -ENOMEM;
1287     }
1288 
1289     new_ptr = g_try_realloc(*array, new_byte_size);
1290     if (!new_ptr) {
1291         return -ENOMEM;
1292     }
1293 
1294     if (new_byte_size > old_byte_size) {
1295         memset((char *)new_ptr + old_byte_size, 0,
1296                new_byte_size - old_byte_size);
1297     }
1298 
1299     *array = new_ptr;
1300     *size  = new_size;
1301 
1302     return 0;
1303 }
1304 
1305 /*
1306  * Increases the refcount for a range of clusters in a given refcount table.
1307  * This is used to construct a temporary refcount table out of L1 and L2 tables
1308  * which can be compared to the refcount table saved in the image.
1309  *
1310  * Modifies the number of errors in res.
1311  */
1312 static int inc_refcounts(BlockDriverState *bs,
1313                          BdrvCheckResult *res,
1314                          void **refcount_table,
1315                          int64_t *refcount_table_size,
1316                          int64_t offset, int64_t size)
1317 {
1318     BDRVQcow2State *s = bs->opaque;
1319     uint64_t start, last, cluster_offset, k, refcount;
1320     int ret;
1321 
1322     if (size <= 0) {
1323         return 0;
1324     }
1325 
1326     start = start_of_cluster(s, offset);
1327     last = start_of_cluster(s, offset + size - 1);
1328     for(cluster_offset = start; cluster_offset <= last;
1329         cluster_offset += s->cluster_size) {
1330         k = cluster_offset >> s->cluster_bits;
1331         if (k >= *refcount_table_size) {
1332             ret = realloc_refcount_array(s, refcount_table,
1333                                          refcount_table_size, k + 1);
1334             if (ret < 0) {
1335                 res->check_errors++;
1336                 return ret;
1337             }
1338         }
1339 
1340         refcount = s->get_refcount(*refcount_table, k);
1341         if (refcount == s->refcount_max) {
1342             fprintf(stderr, "ERROR: overflow cluster offset=0x%" PRIx64
1343                     "\n", cluster_offset);
1344             fprintf(stderr, "Use qemu-img amend to increase the refcount entry "
1345                     "width or qemu-img convert to create a clean copy if the "
1346                     "image cannot be opened for writing\n");
1347             res->corruptions++;
1348             continue;
1349         }
1350         s->set_refcount(*refcount_table, k, refcount + 1);
1351     }
1352 
1353     return 0;
1354 }
1355 
1356 /* Flags for check_refcounts_l1() and check_refcounts_l2() */
1357 enum {
1358     CHECK_FRAG_INFO = 0x2,      /* update BlockFragInfo counters */
1359 };
1360 
1361 /*
1362  * Increases the refcount in the given refcount table for the all clusters
1363  * referenced in the L2 table. While doing so, performs some checks on L2
1364  * entries.
1365  *
1366  * Returns the number of errors found by the checks or -errno if an internal
1367  * error occurred.
1368  */
1369 static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res,
1370                               void **refcount_table,
1371                               int64_t *refcount_table_size, int64_t l2_offset,
1372                               int flags)
1373 {
1374     BDRVQcow2State *s = bs->opaque;
1375     uint64_t *l2_table, l2_entry;
1376     uint64_t next_contiguous_offset = 0;
1377     int i, l2_size, nb_csectors, ret;
1378 
1379     /* Read L2 table from disk */
1380     l2_size = s->l2_size * sizeof(uint64_t);
1381     l2_table = g_malloc(l2_size);
1382 
1383     ret = bdrv_pread(bs->file, l2_offset, l2_table, l2_size);
1384     if (ret < 0) {
1385         fprintf(stderr, "ERROR: I/O error in check_refcounts_l2\n");
1386         res->check_errors++;
1387         goto fail;
1388     }
1389 
1390     /* Do the actual checks */
1391     for(i = 0; i < s->l2_size; i++) {
1392         l2_entry = be64_to_cpu(l2_table[i]);
1393 
1394         switch (qcow2_get_cluster_type(l2_entry)) {
1395         case QCOW2_CLUSTER_COMPRESSED:
1396             /* Compressed clusters don't have QCOW_OFLAG_COPIED */
1397             if (l2_entry & QCOW_OFLAG_COPIED) {
1398                 fprintf(stderr, "ERROR: cluster %" PRId64 ": "
1399                     "copied flag must never be set for compressed "
1400                     "clusters\n", l2_entry >> s->cluster_bits);
1401                 l2_entry &= ~QCOW_OFLAG_COPIED;
1402                 res->corruptions++;
1403             }
1404 
1405             /* Mark cluster as used */
1406             nb_csectors = ((l2_entry >> s->csize_shift) &
1407                            s->csize_mask) + 1;
1408             l2_entry &= s->cluster_offset_mask;
1409             ret = inc_refcounts(bs, res, refcount_table, refcount_table_size,
1410                                 l2_entry & ~511, nb_csectors * 512);
1411             if (ret < 0) {
1412                 goto fail;
1413             }
1414 
1415             if (flags & CHECK_FRAG_INFO) {
1416                 res->bfi.allocated_clusters++;
1417                 res->bfi.compressed_clusters++;
1418 
1419                 /* Compressed clusters are fragmented by nature.  Since they
1420                  * take up sub-sector space but we only have sector granularity
1421                  * I/O we need to re-read the same sectors even for adjacent
1422                  * compressed clusters.
1423                  */
1424                 res->bfi.fragmented_clusters++;
1425             }
1426             break;
1427 
1428         case QCOW2_CLUSTER_ZERO:
1429             if ((l2_entry & L2E_OFFSET_MASK) == 0) {
1430                 break;
1431             }
1432             /* fall through */
1433 
1434         case QCOW2_CLUSTER_NORMAL:
1435         {
1436             uint64_t offset = l2_entry & L2E_OFFSET_MASK;
1437 
1438             if (flags & CHECK_FRAG_INFO) {
1439                 res->bfi.allocated_clusters++;
1440                 if (next_contiguous_offset &&
1441                     offset != next_contiguous_offset) {
1442                     res->bfi.fragmented_clusters++;
1443                 }
1444                 next_contiguous_offset = offset + s->cluster_size;
1445             }
1446 
1447             /* Mark cluster as used */
1448             ret = inc_refcounts(bs, res, refcount_table, refcount_table_size,
1449                                 offset, s->cluster_size);
1450             if (ret < 0) {
1451                 goto fail;
1452             }
1453 
1454             /* Correct offsets are cluster aligned */
1455             if (offset_into_cluster(s, offset)) {
1456                 fprintf(stderr, "ERROR offset=%" PRIx64 ": Cluster is not "
1457                     "properly aligned; L2 entry corrupted.\n", offset);
1458                 res->corruptions++;
1459             }
1460             break;
1461         }
1462 
1463         case QCOW2_CLUSTER_UNALLOCATED:
1464             break;
1465 
1466         default:
1467             abort();
1468         }
1469     }
1470 
1471     g_free(l2_table);
1472     return 0;
1473 
1474 fail:
1475     g_free(l2_table);
1476     return ret;
1477 }
1478 
1479 /*
1480  * Increases the refcount for the L1 table, its L2 tables and all referenced
1481  * clusters in the given refcount table. While doing so, performs some checks
1482  * on L1 and L2 entries.
1483  *
1484  * Returns the number of errors found by the checks or -errno if an internal
1485  * error occurred.
1486  */
1487 static int check_refcounts_l1(BlockDriverState *bs,
1488                               BdrvCheckResult *res,
1489                               void **refcount_table,
1490                               int64_t *refcount_table_size,
1491                               int64_t l1_table_offset, int l1_size,
1492                               int flags)
1493 {
1494     BDRVQcow2State *s = bs->opaque;
1495     uint64_t *l1_table = NULL, l2_offset, l1_size2;
1496     int i, ret;
1497 
1498     l1_size2 = l1_size * sizeof(uint64_t);
1499 
1500     /* Mark L1 table as used */
1501     ret = inc_refcounts(bs, res, refcount_table, refcount_table_size,
1502                         l1_table_offset, l1_size2);
1503     if (ret < 0) {
1504         goto fail;
1505     }
1506 
1507     /* Read L1 table entries from disk */
1508     if (l1_size2 > 0) {
1509         l1_table = g_try_malloc(l1_size2);
1510         if (l1_table == NULL) {
1511             ret = -ENOMEM;
1512             res->check_errors++;
1513             goto fail;
1514         }
1515         ret = bdrv_pread(bs->file, l1_table_offset, l1_table, l1_size2);
1516         if (ret < 0) {
1517             fprintf(stderr, "ERROR: I/O error in check_refcounts_l1\n");
1518             res->check_errors++;
1519             goto fail;
1520         }
1521         for(i = 0;i < l1_size; i++)
1522             be64_to_cpus(&l1_table[i]);
1523     }
1524 
1525     /* Do the actual checks */
1526     for(i = 0; i < l1_size; i++) {
1527         l2_offset = l1_table[i];
1528         if (l2_offset) {
1529             /* Mark L2 table as used */
1530             l2_offset &= L1E_OFFSET_MASK;
1531             ret = inc_refcounts(bs, res, refcount_table, refcount_table_size,
1532                                 l2_offset, s->cluster_size);
1533             if (ret < 0) {
1534                 goto fail;
1535             }
1536 
1537             /* L2 tables are cluster aligned */
1538             if (offset_into_cluster(s, l2_offset)) {
1539                 fprintf(stderr, "ERROR l2_offset=%" PRIx64 ": Table is not "
1540                     "cluster aligned; L1 entry corrupted\n", l2_offset);
1541                 res->corruptions++;
1542             }
1543 
1544             /* Process and check L2 entries */
1545             ret = check_refcounts_l2(bs, res, refcount_table,
1546                                      refcount_table_size, l2_offset, flags);
1547             if (ret < 0) {
1548                 goto fail;
1549             }
1550         }
1551     }
1552     g_free(l1_table);
1553     return 0;
1554 
1555 fail:
1556     g_free(l1_table);
1557     return ret;
1558 }
1559 
1560 /*
1561  * Checks the OFLAG_COPIED flag for all L1 and L2 entries.
1562  *
1563  * This function does not print an error message nor does it increment
1564  * check_errors if qcow2_get_refcount fails (this is because such an error will
1565  * have been already detected and sufficiently signaled by the calling function
1566  * (qcow2_check_refcounts) by the time this function is called).
1567  */
1568 static int check_oflag_copied(BlockDriverState *bs, BdrvCheckResult *res,
1569                               BdrvCheckMode fix)
1570 {
1571     BDRVQcow2State *s = bs->opaque;
1572     uint64_t *l2_table = qemu_blockalign(bs, s->cluster_size);
1573     int ret;
1574     uint64_t refcount;
1575     int i, j;
1576 
1577     for (i = 0; i < s->l1_size; i++) {
1578         uint64_t l1_entry = s->l1_table[i];
1579         uint64_t l2_offset = l1_entry & L1E_OFFSET_MASK;
1580         bool l2_dirty = false;
1581 
1582         if (!l2_offset) {
1583             continue;
1584         }
1585 
1586         ret = qcow2_get_refcount(bs, l2_offset >> s->cluster_bits,
1587                                  &refcount);
1588         if (ret < 0) {
1589             /* don't print message nor increment check_errors */
1590             continue;
1591         }
1592         if ((refcount == 1) != ((l1_entry & QCOW_OFLAG_COPIED) != 0)) {
1593             fprintf(stderr, "%s OFLAG_COPIED L2 cluster: l1_index=%d "
1594                     "l1_entry=%" PRIx64 " refcount=%" PRIu64 "\n",
1595                     fix & BDRV_FIX_ERRORS ? "Repairing" :
1596                                             "ERROR",
1597                     i, l1_entry, refcount);
1598             if (fix & BDRV_FIX_ERRORS) {
1599                 s->l1_table[i] = refcount == 1
1600                                ? l1_entry |  QCOW_OFLAG_COPIED
1601                                : l1_entry & ~QCOW_OFLAG_COPIED;
1602                 ret = qcow2_write_l1_entry(bs, i);
1603                 if (ret < 0) {
1604                     res->check_errors++;
1605                     goto fail;
1606                 }
1607                 res->corruptions_fixed++;
1608             } else {
1609                 res->corruptions++;
1610             }
1611         }
1612 
1613         ret = bdrv_pread(bs->file, l2_offset, l2_table,
1614                          s->l2_size * sizeof(uint64_t));
1615         if (ret < 0) {
1616             fprintf(stderr, "ERROR: Could not read L2 table: %s\n",
1617                     strerror(-ret));
1618             res->check_errors++;
1619             goto fail;
1620         }
1621 
1622         for (j = 0; j < s->l2_size; j++) {
1623             uint64_t l2_entry = be64_to_cpu(l2_table[j]);
1624             uint64_t data_offset = l2_entry & L2E_OFFSET_MASK;
1625             int cluster_type = qcow2_get_cluster_type(l2_entry);
1626 
1627             if ((cluster_type == QCOW2_CLUSTER_NORMAL) ||
1628                 ((cluster_type == QCOW2_CLUSTER_ZERO) && (data_offset != 0))) {
1629                 ret = qcow2_get_refcount(bs,
1630                                          data_offset >> s->cluster_bits,
1631                                          &refcount);
1632                 if (ret < 0) {
1633                     /* don't print message nor increment check_errors */
1634                     continue;
1635                 }
1636                 if ((refcount == 1) != ((l2_entry & QCOW_OFLAG_COPIED) != 0)) {
1637                     fprintf(stderr, "%s OFLAG_COPIED data cluster: "
1638                             "l2_entry=%" PRIx64 " refcount=%" PRIu64 "\n",
1639                             fix & BDRV_FIX_ERRORS ? "Repairing" :
1640                                                     "ERROR",
1641                             l2_entry, refcount);
1642                     if (fix & BDRV_FIX_ERRORS) {
1643                         l2_table[j] = cpu_to_be64(refcount == 1
1644                                     ? l2_entry |  QCOW_OFLAG_COPIED
1645                                     : l2_entry & ~QCOW_OFLAG_COPIED);
1646                         l2_dirty = true;
1647                         res->corruptions_fixed++;
1648                     } else {
1649                         res->corruptions++;
1650                     }
1651                 }
1652             }
1653         }
1654 
1655         if (l2_dirty) {
1656             ret = qcow2_pre_write_overlap_check(bs, QCOW2_OL_ACTIVE_L2,
1657                                                 l2_offset, s->cluster_size);
1658             if (ret < 0) {
1659                 fprintf(stderr, "ERROR: Could not write L2 table; metadata "
1660                         "overlap check failed: %s\n", strerror(-ret));
1661                 res->check_errors++;
1662                 goto fail;
1663             }
1664 
1665             ret = bdrv_pwrite(bs->file, l2_offset, l2_table,
1666                               s->cluster_size);
1667             if (ret < 0) {
1668                 fprintf(stderr, "ERROR: Could not write L2 table: %s\n",
1669                         strerror(-ret));
1670                 res->check_errors++;
1671                 goto fail;
1672             }
1673         }
1674     }
1675 
1676     ret = 0;
1677 
1678 fail:
1679     qemu_vfree(l2_table);
1680     return ret;
1681 }
1682 
1683 /*
1684  * Checks consistency of refblocks and accounts for each refblock in
1685  * *refcount_table.
1686  */
1687 static int check_refblocks(BlockDriverState *bs, BdrvCheckResult *res,
1688                            BdrvCheckMode fix, bool *rebuild,
1689                            void **refcount_table, int64_t *nb_clusters)
1690 {
1691     BDRVQcow2State *s = bs->opaque;
1692     int64_t i, size;
1693     int ret;
1694 
1695     for(i = 0; i < s->refcount_table_size; i++) {
1696         uint64_t offset, cluster;
1697         offset = s->refcount_table[i];
1698         cluster = offset >> s->cluster_bits;
1699 
1700         /* Refcount blocks are cluster aligned */
1701         if (offset_into_cluster(s, offset)) {
1702             fprintf(stderr, "ERROR refcount block %" PRId64 " is not "
1703                 "cluster aligned; refcount table entry corrupted\n", i);
1704             res->corruptions++;
1705             *rebuild = true;
1706             continue;
1707         }
1708 
1709         if (cluster >= *nb_clusters) {
1710             fprintf(stderr, "%s refcount block %" PRId64 " is outside image\n",
1711                     fix & BDRV_FIX_ERRORS ? "Repairing" : "ERROR", i);
1712 
1713             if (fix & BDRV_FIX_ERRORS) {
1714                 int64_t new_nb_clusters;
1715 
1716                 if (offset > INT64_MAX - s->cluster_size) {
1717                     ret = -EINVAL;
1718                     goto resize_fail;
1719                 }
1720 
1721                 ret = bdrv_truncate(bs->file->bs, offset + s->cluster_size);
1722                 if (ret < 0) {
1723                     goto resize_fail;
1724                 }
1725                 size = bdrv_getlength(bs->file->bs);
1726                 if (size < 0) {
1727                     ret = size;
1728                     goto resize_fail;
1729                 }
1730 
1731                 new_nb_clusters = size_to_clusters(s, size);
1732                 assert(new_nb_clusters >= *nb_clusters);
1733 
1734                 ret = realloc_refcount_array(s, refcount_table,
1735                                              nb_clusters, new_nb_clusters);
1736                 if (ret < 0) {
1737                     res->check_errors++;
1738                     return ret;
1739                 }
1740 
1741                 if (cluster >= *nb_clusters) {
1742                     ret = -EINVAL;
1743                     goto resize_fail;
1744                 }
1745 
1746                 res->corruptions_fixed++;
1747                 ret = inc_refcounts(bs, res, refcount_table, nb_clusters,
1748                                     offset, s->cluster_size);
1749                 if (ret < 0) {
1750                     return ret;
1751                 }
1752                 /* No need to check whether the refcount is now greater than 1:
1753                  * This area was just allocated and zeroed, so it can only be
1754                  * exactly 1 after inc_refcounts() */
1755                 continue;
1756 
1757 resize_fail:
1758                 res->corruptions++;
1759                 *rebuild = true;
1760                 fprintf(stderr, "ERROR could not resize image: %s\n",
1761                         strerror(-ret));
1762             } else {
1763                 res->corruptions++;
1764             }
1765             continue;
1766         }
1767 
1768         if (offset != 0) {
1769             ret = inc_refcounts(bs, res, refcount_table, nb_clusters,
1770                                 offset, s->cluster_size);
1771             if (ret < 0) {
1772                 return ret;
1773             }
1774             if (s->get_refcount(*refcount_table, cluster) != 1) {
1775                 fprintf(stderr, "ERROR refcount block %" PRId64
1776                         " refcount=%" PRIu64 "\n", i,
1777                         s->get_refcount(*refcount_table, cluster));
1778                 res->corruptions++;
1779                 *rebuild = true;
1780             }
1781         }
1782     }
1783 
1784     return 0;
1785 }
1786 
1787 /*
1788  * Calculates an in-memory refcount table.
1789  */
1790 static int calculate_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
1791                                BdrvCheckMode fix, bool *rebuild,
1792                                void **refcount_table, int64_t *nb_clusters)
1793 {
1794     BDRVQcow2State *s = bs->opaque;
1795     int64_t i;
1796     QCowSnapshot *sn;
1797     int ret;
1798 
1799     if (!*refcount_table) {
1800         int64_t old_size = 0;
1801         ret = realloc_refcount_array(s, refcount_table,
1802                                      &old_size, *nb_clusters);
1803         if (ret < 0) {
1804             res->check_errors++;
1805             return ret;
1806         }
1807     }
1808 
1809     /* header */
1810     ret = inc_refcounts(bs, res, refcount_table, nb_clusters,
1811                         0, s->cluster_size);
1812     if (ret < 0) {
1813         return ret;
1814     }
1815 
1816     /* current L1 table */
1817     ret = check_refcounts_l1(bs, res, refcount_table, nb_clusters,
1818                              s->l1_table_offset, s->l1_size, CHECK_FRAG_INFO);
1819     if (ret < 0) {
1820         return ret;
1821     }
1822 
1823     /* snapshots */
1824     for (i = 0; i < s->nb_snapshots; i++) {
1825         sn = s->snapshots + i;
1826         ret = check_refcounts_l1(bs, res, refcount_table, nb_clusters,
1827                                  sn->l1_table_offset, sn->l1_size, 0);
1828         if (ret < 0) {
1829             return ret;
1830         }
1831     }
1832     ret = inc_refcounts(bs, res, refcount_table, nb_clusters,
1833                         s->snapshots_offset, s->snapshots_size);
1834     if (ret < 0) {
1835         return ret;
1836     }
1837 
1838     /* refcount data */
1839     ret = inc_refcounts(bs, res, refcount_table, nb_clusters,
1840                         s->refcount_table_offset,
1841                         s->refcount_table_size * sizeof(uint64_t));
1842     if (ret < 0) {
1843         return ret;
1844     }
1845 
1846     return check_refblocks(bs, res, fix, rebuild, refcount_table, nb_clusters);
1847 }
1848 
1849 /*
1850  * Compares the actual reference count for each cluster in the image against the
1851  * refcount as reported by the refcount structures on-disk.
1852  */
1853 static void compare_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
1854                               BdrvCheckMode fix, bool *rebuild,
1855                               int64_t *highest_cluster,
1856                               void *refcount_table, int64_t nb_clusters)
1857 {
1858     BDRVQcow2State *s = bs->opaque;
1859     int64_t i;
1860     uint64_t refcount1, refcount2;
1861     int ret;
1862 
1863     for (i = 0, *highest_cluster = 0; i < nb_clusters; i++) {
1864         ret = qcow2_get_refcount(bs, i, &refcount1);
1865         if (ret < 0) {
1866             fprintf(stderr, "Can't get refcount for cluster %" PRId64 ": %s\n",
1867                     i, strerror(-ret));
1868             res->check_errors++;
1869             continue;
1870         }
1871 
1872         refcount2 = s->get_refcount(refcount_table, i);
1873 
1874         if (refcount1 > 0 || refcount2 > 0) {
1875             *highest_cluster = i;
1876         }
1877 
1878         if (refcount1 != refcount2) {
1879             /* Check if we're allowed to fix the mismatch */
1880             int *num_fixed = NULL;
1881             if (refcount1 == 0) {
1882                 *rebuild = true;
1883             } else if (refcount1 > refcount2 && (fix & BDRV_FIX_LEAKS)) {
1884                 num_fixed = &res->leaks_fixed;
1885             } else if (refcount1 < refcount2 && (fix & BDRV_FIX_ERRORS)) {
1886                 num_fixed = &res->corruptions_fixed;
1887             }
1888 
1889             fprintf(stderr, "%s cluster %" PRId64 " refcount=%" PRIu64
1890                     " reference=%" PRIu64 "\n",
1891                    num_fixed != NULL     ? "Repairing" :
1892                    refcount1 < refcount2 ? "ERROR" :
1893                                            "Leaked",
1894                    i, refcount1, refcount2);
1895 
1896             if (num_fixed) {
1897                 ret = update_refcount(bs, i << s->cluster_bits, 1,
1898                                       refcount_diff(refcount1, refcount2),
1899                                       refcount1 > refcount2,
1900                                       QCOW2_DISCARD_ALWAYS);
1901                 if (ret >= 0) {
1902                     (*num_fixed)++;
1903                     continue;
1904                 }
1905             }
1906 
1907             /* And if we couldn't, print an error */
1908             if (refcount1 < refcount2) {
1909                 res->corruptions++;
1910             } else {
1911                 res->leaks++;
1912             }
1913         }
1914     }
1915 }
1916 
1917 /*
1918  * Allocates clusters using an in-memory refcount table (IMRT) in contrast to
1919  * the on-disk refcount structures.
1920  *
1921  * On input, *first_free_cluster tells where to start looking, and need not
1922  * actually be a free cluster; the returned offset will not be before that
1923  * cluster.  On output, *first_free_cluster points to the first gap found, even
1924  * if that gap was too small to be used as the returned offset.
1925  *
1926  * Note that *first_free_cluster is a cluster index whereas the return value is
1927  * an offset.
1928  */
1929 static int64_t alloc_clusters_imrt(BlockDriverState *bs,
1930                                    int cluster_count,
1931                                    void **refcount_table,
1932                                    int64_t *imrt_nb_clusters,
1933                                    int64_t *first_free_cluster)
1934 {
1935     BDRVQcow2State *s = bs->opaque;
1936     int64_t cluster = *first_free_cluster, i;
1937     bool first_gap = true;
1938     int contiguous_free_clusters;
1939     int ret;
1940 
1941     /* Starting at *first_free_cluster, find a range of at least cluster_count
1942      * continuously free clusters */
1943     for (contiguous_free_clusters = 0;
1944          cluster < *imrt_nb_clusters &&
1945          contiguous_free_clusters < cluster_count;
1946          cluster++)
1947     {
1948         if (!s->get_refcount(*refcount_table, cluster)) {
1949             contiguous_free_clusters++;
1950             if (first_gap) {
1951                 /* If this is the first free cluster found, update
1952                  * *first_free_cluster accordingly */
1953                 *first_free_cluster = cluster;
1954                 first_gap = false;
1955             }
1956         } else if (contiguous_free_clusters) {
1957             contiguous_free_clusters = 0;
1958         }
1959     }
1960 
1961     /* If contiguous_free_clusters is greater than zero, it contains the number
1962      * of continuously free clusters until the current cluster; the first free
1963      * cluster in the current "gap" is therefore
1964      * cluster - contiguous_free_clusters */
1965 
1966     /* If no such range could be found, grow the in-memory refcount table
1967      * accordingly to append free clusters at the end of the image */
1968     if (contiguous_free_clusters < cluster_count) {
1969         /* contiguous_free_clusters clusters are already empty at the image end;
1970          * we need cluster_count clusters; therefore, we have to allocate
1971          * cluster_count - contiguous_free_clusters new clusters at the end of
1972          * the image (which is the current value of cluster; note that cluster
1973          * may exceed old_imrt_nb_clusters if *first_free_cluster pointed beyond
1974          * the image end) */
1975         ret = realloc_refcount_array(s, refcount_table, imrt_nb_clusters,
1976                                      cluster + cluster_count
1977                                      - contiguous_free_clusters);
1978         if (ret < 0) {
1979             return ret;
1980         }
1981     }
1982 
1983     /* Go back to the first free cluster */
1984     cluster -= contiguous_free_clusters;
1985     for (i = 0; i < cluster_count; i++) {
1986         s->set_refcount(*refcount_table, cluster + i, 1);
1987     }
1988 
1989     return cluster << s->cluster_bits;
1990 }
1991 
1992 /*
1993  * Creates a new refcount structure based solely on the in-memory information
1994  * given through *refcount_table. All necessary allocations will be reflected
1995  * in that array.
1996  *
1997  * On success, the old refcount structure is leaked (it will be covered by the
1998  * new refcount structure).
1999  */
2000 static int rebuild_refcount_structure(BlockDriverState *bs,
2001                                       BdrvCheckResult *res,
2002                                       void **refcount_table,
2003                                       int64_t *nb_clusters)
2004 {
2005     BDRVQcow2State *s = bs->opaque;
2006     int64_t first_free_cluster = 0, reftable_offset = -1, cluster = 0;
2007     int64_t refblock_offset, refblock_start, refblock_index;
2008     uint32_t reftable_size = 0;
2009     uint64_t *on_disk_reftable = NULL;
2010     void *on_disk_refblock;
2011     int ret = 0;
2012     struct {
2013         uint64_t reftable_offset;
2014         uint32_t reftable_clusters;
2015     } QEMU_PACKED reftable_offset_and_clusters;
2016 
2017     qcow2_cache_empty(bs, s->refcount_block_cache);
2018 
2019 write_refblocks:
2020     for (; cluster < *nb_clusters; cluster++) {
2021         if (!s->get_refcount(*refcount_table, cluster)) {
2022             continue;
2023         }
2024 
2025         refblock_index = cluster >> s->refcount_block_bits;
2026         refblock_start = refblock_index << s->refcount_block_bits;
2027 
2028         /* Don't allocate a cluster in a refblock already written to disk */
2029         if (first_free_cluster < refblock_start) {
2030             first_free_cluster = refblock_start;
2031         }
2032         refblock_offset = alloc_clusters_imrt(bs, 1, refcount_table,
2033                                               nb_clusters, &first_free_cluster);
2034         if (refblock_offset < 0) {
2035             fprintf(stderr, "ERROR allocating refblock: %s\n",
2036                     strerror(-refblock_offset));
2037             res->check_errors++;
2038             ret = refblock_offset;
2039             goto fail;
2040         }
2041 
2042         if (reftable_size <= refblock_index) {
2043             uint32_t old_reftable_size = reftable_size;
2044             uint64_t *new_on_disk_reftable;
2045 
2046             reftable_size = ROUND_UP((refblock_index + 1) * sizeof(uint64_t),
2047                                      s->cluster_size) / sizeof(uint64_t);
2048             new_on_disk_reftable = g_try_realloc(on_disk_reftable,
2049                                                  reftable_size *
2050                                                  sizeof(uint64_t));
2051             if (!new_on_disk_reftable) {
2052                 res->check_errors++;
2053                 ret = -ENOMEM;
2054                 goto fail;
2055             }
2056             on_disk_reftable = new_on_disk_reftable;
2057 
2058             memset(on_disk_reftable + old_reftable_size, 0,
2059                    (reftable_size - old_reftable_size) * sizeof(uint64_t));
2060 
2061             /* The offset we have for the reftable is now no longer valid;
2062              * this will leak that range, but we can easily fix that by running
2063              * a leak-fixing check after this rebuild operation */
2064             reftable_offset = -1;
2065         }
2066         on_disk_reftable[refblock_index] = refblock_offset;
2067 
2068         /* If this is apparently the last refblock (for now), try to squeeze the
2069          * reftable in */
2070         if (refblock_index == (*nb_clusters - 1) >> s->refcount_block_bits &&
2071             reftable_offset < 0)
2072         {
2073             uint64_t reftable_clusters = size_to_clusters(s, reftable_size *
2074                                                           sizeof(uint64_t));
2075             reftable_offset = alloc_clusters_imrt(bs, reftable_clusters,
2076                                                   refcount_table, nb_clusters,
2077                                                   &first_free_cluster);
2078             if (reftable_offset < 0) {
2079                 fprintf(stderr, "ERROR allocating reftable: %s\n",
2080                         strerror(-reftable_offset));
2081                 res->check_errors++;
2082                 ret = reftable_offset;
2083                 goto fail;
2084             }
2085         }
2086 
2087         ret = qcow2_pre_write_overlap_check(bs, 0, refblock_offset,
2088                                             s->cluster_size);
2089         if (ret < 0) {
2090             fprintf(stderr, "ERROR writing refblock: %s\n", strerror(-ret));
2091             goto fail;
2092         }
2093 
2094         /* The size of *refcount_table is always cluster-aligned, therefore the
2095          * write operation will not overflow */
2096         on_disk_refblock = (void *)((char *) *refcount_table +
2097                                     refblock_index * s->cluster_size);
2098 
2099         ret = bdrv_write(bs->file, refblock_offset / BDRV_SECTOR_SIZE,
2100                          on_disk_refblock, s->cluster_sectors);
2101         if (ret < 0) {
2102             fprintf(stderr, "ERROR writing refblock: %s\n", strerror(-ret));
2103             goto fail;
2104         }
2105 
2106         /* Go to the end of this refblock */
2107         cluster = refblock_start + s->refcount_block_size - 1;
2108     }
2109 
2110     if (reftable_offset < 0) {
2111         uint64_t post_refblock_start, reftable_clusters;
2112 
2113         post_refblock_start = ROUND_UP(*nb_clusters, s->refcount_block_size);
2114         reftable_clusters = size_to_clusters(s,
2115                                              reftable_size * sizeof(uint64_t));
2116         /* Not pretty but simple */
2117         if (first_free_cluster < post_refblock_start) {
2118             first_free_cluster = post_refblock_start;
2119         }
2120         reftable_offset = alloc_clusters_imrt(bs, reftable_clusters,
2121                                               refcount_table, nb_clusters,
2122                                               &first_free_cluster);
2123         if (reftable_offset < 0) {
2124             fprintf(stderr, "ERROR allocating reftable: %s\n",
2125                     strerror(-reftable_offset));
2126             res->check_errors++;
2127             ret = reftable_offset;
2128             goto fail;
2129         }
2130 
2131         goto write_refblocks;
2132     }
2133 
2134     assert(on_disk_reftable);
2135 
2136     for (refblock_index = 0; refblock_index < reftable_size; refblock_index++) {
2137         cpu_to_be64s(&on_disk_reftable[refblock_index]);
2138     }
2139 
2140     ret = qcow2_pre_write_overlap_check(bs, 0, reftable_offset,
2141                                         reftable_size * sizeof(uint64_t));
2142     if (ret < 0) {
2143         fprintf(stderr, "ERROR writing reftable: %s\n", strerror(-ret));
2144         goto fail;
2145     }
2146 
2147     assert(reftable_size < INT_MAX / sizeof(uint64_t));
2148     ret = bdrv_pwrite(bs->file, reftable_offset, on_disk_reftable,
2149                       reftable_size * sizeof(uint64_t));
2150     if (ret < 0) {
2151         fprintf(stderr, "ERROR writing reftable: %s\n", strerror(-ret));
2152         goto fail;
2153     }
2154 
2155     /* Enter new reftable into the image header */
2156     reftable_offset_and_clusters.reftable_offset = cpu_to_be64(reftable_offset);
2157     reftable_offset_and_clusters.reftable_clusters =
2158         cpu_to_be32(size_to_clusters(s, reftable_size * sizeof(uint64_t)));
2159     ret = bdrv_pwrite_sync(bs->file,
2160                            offsetof(QCowHeader, refcount_table_offset),
2161                            &reftable_offset_and_clusters,
2162                            sizeof(reftable_offset_and_clusters));
2163     if (ret < 0) {
2164         fprintf(stderr, "ERROR setting reftable: %s\n", strerror(-ret));
2165         goto fail;
2166     }
2167 
2168     for (refblock_index = 0; refblock_index < reftable_size; refblock_index++) {
2169         be64_to_cpus(&on_disk_reftable[refblock_index]);
2170     }
2171     s->refcount_table = on_disk_reftable;
2172     s->refcount_table_offset = reftable_offset;
2173     s->refcount_table_size = reftable_size;
2174 
2175     return 0;
2176 
2177 fail:
2178     g_free(on_disk_reftable);
2179     return ret;
2180 }
2181 
2182 /*
2183  * Checks an image for refcount consistency.
2184  *
2185  * Returns 0 if no errors are found, the number of errors in case the image is
2186  * detected as corrupted, and -errno when an internal error occurred.
2187  */
2188 int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
2189                           BdrvCheckMode fix)
2190 {
2191     BDRVQcow2State *s = bs->opaque;
2192     BdrvCheckResult pre_compare_res;
2193     int64_t size, highest_cluster, nb_clusters;
2194     void *refcount_table = NULL;
2195     bool rebuild = false;
2196     int ret;
2197 
2198     size = bdrv_getlength(bs->file->bs);
2199     if (size < 0) {
2200         res->check_errors++;
2201         return size;
2202     }
2203 
2204     nb_clusters = size_to_clusters(s, size);
2205     if (nb_clusters > INT_MAX) {
2206         res->check_errors++;
2207         return -EFBIG;
2208     }
2209 
2210     res->bfi.total_clusters =
2211         size_to_clusters(s, bs->total_sectors * BDRV_SECTOR_SIZE);
2212 
2213     ret = calculate_refcounts(bs, res, fix, &rebuild, &refcount_table,
2214                               &nb_clusters);
2215     if (ret < 0) {
2216         goto fail;
2217     }
2218 
2219     /* In case we don't need to rebuild the refcount structure (but want to fix
2220      * something), this function is immediately called again, in which case the
2221      * result should be ignored */
2222     pre_compare_res = *res;
2223     compare_refcounts(bs, res, 0, &rebuild, &highest_cluster, refcount_table,
2224                       nb_clusters);
2225 
2226     if (rebuild && (fix & BDRV_FIX_ERRORS)) {
2227         BdrvCheckResult old_res = *res;
2228         int fresh_leaks = 0;
2229 
2230         fprintf(stderr, "Rebuilding refcount structure\n");
2231         ret = rebuild_refcount_structure(bs, res, &refcount_table,
2232                                          &nb_clusters);
2233         if (ret < 0) {
2234             goto fail;
2235         }
2236 
2237         res->corruptions = 0;
2238         res->leaks = 0;
2239 
2240         /* Because the old reftable has been exchanged for a new one the
2241          * references have to be recalculated */
2242         rebuild = false;
2243         memset(refcount_table, 0, refcount_array_byte_size(s, nb_clusters));
2244         ret = calculate_refcounts(bs, res, 0, &rebuild, &refcount_table,
2245                                   &nb_clusters);
2246         if (ret < 0) {
2247             goto fail;
2248         }
2249 
2250         if (fix & BDRV_FIX_LEAKS) {
2251             /* The old refcount structures are now leaked, fix it; the result
2252              * can be ignored, aside from leaks which were introduced by
2253              * rebuild_refcount_structure() that could not be fixed */
2254             BdrvCheckResult saved_res = *res;
2255             *res = (BdrvCheckResult){ 0 };
2256 
2257             compare_refcounts(bs, res, BDRV_FIX_LEAKS, &rebuild,
2258                               &highest_cluster, refcount_table, nb_clusters);
2259             if (rebuild) {
2260                 fprintf(stderr, "ERROR rebuilt refcount structure is still "
2261                         "broken\n");
2262             }
2263 
2264             /* Any leaks accounted for here were introduced by
2265              * rebuild_refcount_structure() because that function has created a
2266              * new refcount structure from scratch */
2267             fresh_leaks = res->leaks;
2268             *res = saved_res;
2269         }
2270 
2271         if (res->corruptions < old_res.corruptions) {
2272             res->corruptions_fixed += old_res.corruptions - res->corruptions;
2273         }
2274         if (res->leaks < old_res.leaks) {
2275             res->leaks_fixed += old_res.leaks - res->leaks;
2276         }
2277         res->leaks += fresh_leaks;
2278     } else if (fix) {
2279         if (rebuild) {
2280             fprintf(stderr, "ERROR need to rebuild refcount structures\n");
2281             res->check_errors++;
2282             ret = -EIO;
2283             goto fail;
2284         }
2285 
2286         if (res->leaks || res->corruptions) {
2287             *res = pre_compare_res;
2288             compare_refcounts(bs, res, fix, &rebuild, &highest_cluster,
2289                               refcount_table, nb_clusters);
2290         }
2291     }
2292 
2293     /* check OFLAG_COPIED */
2294     ret = check_oflag_copied(bs, res, fix);
2295     if (ret < 0) {
2296         goto fail;
2297     }
2298 
2299     res->image_end_offset = (highest_cluster + 1) * s->cluster_size;
2300     ret = 0;
2301 
2302 fail:
2303     g_free(refcount_table);
2304 
2305     return ret;
2306 }
2307 
2308 #define overlaps_with(ofs, sz) \
2309     ranges_overlap(offset, size, ofs, sz)
2310 
2311 /*
2312  * Checks if the given offset into the image file is actually free to use by
2313  * looking for overlaps with important metadata sections (L1/L2 tables etc.),
2314  * i.e. a sanity check without relying on the refcount tables.
2315  *
2316  * The ign parameter specifies what checks not to perform (being a bitmask of
2317  * QCow2MetadataOverlap values), i.e., what sections to ignore.
2318  *
2319  * Returns:
2320  * - 0 if writing to this offset will not affect the mentioned metadata
2321  * - a positive QCow2MetadataOverlap value indicating one overlapping section
2322  * - a negative value (-errno) indicating an error while performing a check,
2323  *   e.g. when bdrv_read failed on QCOW2_OL_INACTIVE_L2
2324  */
2325 int qcow2_check_metadata_overlap(BlockDriverState *bs, int ign, int64_t offset,
2326                                  int64_t size)
2327 {
2328     BDRVQcow2State *s = bs->opaque;
2329     int chk = s->overlap_check & ~ign;
2330     int i, j;
2331 
2332     if (!size) {
2333         return 0;
2334     }
2335 
2336     if (chk & QCOW2_OL_MAIN_HEADER) {
2337         if (offset < s->cluster_size) {
2338             return QCOW2_OL_MAIN_HEADER;
2339         }
2340     }
2341 
2342     /* align range to test to cluster boundaries */
2343     size = align_offset(offset_into_cluster(s, offset) + size, s->cluster_size);
2344     offset = start_of_cluster(s, offset);
2345 
2346     if ((chk & QCOW2_OL_ACTIVE_L1) && s->l1_size) {
2347         if (overlaps_with(s->l1_table_offset, s->l1_size * sizeof(uint64_t))) {
2348             return QCOW2_OL_ACTIVE_L1;
2349         }
2350     }
2351 
2352     if ((chk & QCOW2_OL_REFCOUNT_TABLE) && s->refcount_table_size) {
2353         if (overlaps_with(s->refcount_table_offset,
2354             s->refcount_table_size * sizeof(uint64_t))) {
2355             return QCOW2_OL_REFCOUNT_TABLE;
2356         }
2357     }
2358 
2359     if ((chk & QCOW2_OL_SNAPSHOT_TABLE) && s->snapshots_size) {
2360         if (overlaps_with(s->snapshots_offset, s->snapshots_size)) {
2361             return QCOW2_OL_SNAPSHOT_TABLE;
2362         }
2363     }
2364 
2365     if ((chk & QCOW2_OL_INACTIVE_L1) && s->snapshots) {
2366         for (i = 0; i < s->nb_snapshots; i++) {
2367             if (s->snapshots[i].l1_size &&
2368                 overlaps_with(s->snapshots[i].l1_table_offset,
2369                 s->snapshots[i].l1_size * sizeof(uint64_t))) {
2370                 return QCOW2_OL_INACTIVE_L1;
2371             }
2372         }
2373     }
2374 
2375     if ((chk & QCOW2_OL_ACTIVE_L2) && s->l1_table) {
2376         for (i = 0; i < s->l1_size; i++) {
2377             if ((s->l1_table[i] & L1E_OFFSET_MASK) &&
2378                 overlaps_with(s->l1_table[i] & L1E_OFFSET_MASK,
2379                 s->cluster_size)) {
2380                 return QCOW2_OL_ACTIVE_L2;
2381             }
2382         }
2383     }
2384 
2385     if ((chk & QCOW2_OL_REFCOUNT_BLOCK) && s->refcount_table) {
2386         for (i = 0; i < s->refcount_table_size; i++) {
2387             if ((s->refcount_table[i] & REFT_OFFSET_MASK) &&
2388                 overlaps_with(s->refcount_table[i] & REFT_OFFSET_MASK,
2389                 s->cluster_size)) {
2390                 return QCOW2_OL_REFCOUNT_BLOCK;
2391             }
2392         }
2393     }
2394 
2395     if ((chk & QCOW2_OL_INACTIVE_L2) && s->snapshots) {
2396         for (i = 0; i < s->nb_snapshots; i++) {
2397             uint64_t l1_ofs = s->snapshots[i].l1_table_offset;
2398             uint32_t l1_sz  = s->snapshots[i].l1_size;
2399             uint64_t l1_sz2 = l1_sz * sizeof(uint64_t);
2400             uint64_t *l1 = g_try_malloc(l1_sz2);
2401             int ret;
2402 
2403             if (l1_sz2 && l1 == NULL) {
2404                 return -ENOMEM;
2405             }
2406 
2407             ret = bdrv_pread(bs->file, l1_ofs, l1, l1_sz2);
2408             if (ret < 0) {
2409                 g_free(l1);
2410                 return ret;
2411             }
2412 
2413             for (j = 0; j < l1_sz; j++) {
2414                 uint64_t l2_ofs = be64_to_cpu(l1[j]) & L1E_OFFSET_MASK;
2415                 if (l2_ofs && overlaps_with(l2_ofs, s->cluster_size)) {
2416                     g_free(l1);
2417                     return QCOW2_OL_INACTIVE_L2;
2418                 }
2419             }
2420 
2421             g_free(l1);
2422         }
2423     }
2424 
2425     return 0;
2426 }
2427 
2428 static const char *metadata_ol_names[] = {
2429     [QCOW2_OL_MAIN_HEADER_BITNR]    = "qcow2_header",
2430     [QCOW2_OL_ACTIVE_L1_BITNR]      = "active L1 table",
2431     [QCOW2_OL_ACTIVE_L2_BITNR]      = "active L2 table",
2432     [QCOW2_OL_REFCOUNT_TABLE_BITNR] = "refcount table",
2433     [QCOW2_OL_REFCOUNT_BLOCK_BITNR] = "refcount block",
2434     [QCOW2_OL_SNAPSHOT_TABLE_BITNR] = "snapshot table",
2435     [QCOW2_OL_INACTIVE_L1_BITNR]    = "inactive L1 table",
2436     [QCOW2_OL_INACTIVE_L2_BITNR]    = "inactive L2 table",
2437 };
2438 
2439 /*
2440  * First performs a check for metadata overlaps (through
2441  * qcow2_check_metadata_overlap); if that fails with a negative value (error
2442  * while performing a check), that value is returned. If an impending overlap
2443  * is detected, the BDS will be made unusable, the qcow2 file marked corrupt
2444  * and -EIO returned.
2445  *
2446  * Returns 0 if there were neither overlaps nor errors while checking for
2447  * overlaps; or a negative value (-errno) on error.
2448  */
2449 int qcow2_pre_write_overlap_check(BlockDriverState *bs, int ign, int64_t offset,
2450                                   int64_t size)
2451 {
2452     int ret = qcow2_check_metadata_overlap(bs, ign, offset, size);
2453 
2454     if (ret < 0) {
2455         return ret;
2456     } else if (ret > 0) {
2457         int metadata_ol_bitnr = ctz32(ret);
2458         assert(metadata_ol_bitnr < QCOW2_OL_MAX_BITNR);
2459 
2460         qcow2_signal_corruption(bs, true, offset, size, "Preventing invalid "
2461                                 "write on metadata (overlaps with %s)",
2462                                 metadata_ol_names[metadata_ol_bitnr]);
2463         return -EIO;
2464     }
2465 
2466     return 0;
2467 }
2468 
2469 /* A pointer to a function of this type is given to walk_over_reftable(). That
2470  * function will create refblocks and pass them to a RefblockFinishOp once they
2471  * are completed (@refblock). @refblock_empty is set if the refblock is
2472  * completely empty.
2473  *
2474  * Along with the refblock, a corresponding reftable entry is passed, in the
2475  * reftable @reftable (which may be reallocated) at @reftable_index.
2476  *
2477  * @allocated should be set to true if a new cluster has been allocated.
2478  */
2479 typedef int (RefblockFinishOp)(BlockDriverState *bs, uint64_t **reftable,
2480                                uint64_t reftable_index, uint64_t *reftable_size,
2481                                void *refblock, bool refblock_empty,
2482                                bool *allocated, Error **errp);
2483 
2484 /**
2485  * This "operation" for walk_over_reftable() allocates the refblock on disk (if
2486  * it is not empty) and inserts its offset into the new reftable. The size of
2487  * this new reftable is increased as required.
2488  */
2489 static int alloc_refblock(BlockDriverState *bs, uint64_t **reftable,
2490                           uint64_t reftable_index, uint64_t *reftable_size,
2491                           void *refblock, bool refblock_empty, bool *allocated,
2492                           Error **errp)
2493 {
2494     BDRVQcow2State *s = bs->opaque;
2495     int64_t offset;
2496 
2497     if (!refblock_empty && reftable_index >= *reftable_size) {
2498         uint64_t *new_reftable;
2499         uint64_t new_reftable_size;
2500 
2501         new_reftable_size = ROUND_UP(reftable_index + 1,
2502                                      s->cluster_size / sizeof(uint64_t));
2503         if (new_reftable_size > QCOW_MAX_REFTABLE_SIZE / sizeof(uint64_t)) {
2504             error_setg(errp,
2505                        "This operation would make the refcount table grow "
2506                        "beyond the maximum size supported by QEMU, aborting");
2507             return -ENOTSUP;
2508         }
2509 
2510         new_reftable = g_try_realloc(*reftable, new_reftable_size *
2511                                                 sizeof(uint64_t));
2512         if (!new_reftable) {
2513             error_setg(errp, "Failed to increase reftable buffer size");
2514             return -ENOMEM;
2515         }
2516 
2517         memset(new_reftable + *reftable_size, 0,
2518                (new_reftable_size - *reftable_size) * sizeof(uint64_t));
2519 
2520         *reftable      = new_reftable;
2521         *reftable_size = new_reftable_size;
2522     }
2523 
2524     if (!refblock_empty && !(*reftable)[reftable_index]) {
2525         offset = qcow2_alloc_clusters(bs, s->cluster_size);
2526         if (offset < 0) {
2527             error_setg_errno(errp, -offset, "Failed to allocate refblock");
2528             return offset;
2529         }
2530         (*reftable)[reftable_index] = offset;
2531         *allocated = true;
2532     }
2533 
2534     return 0;
2535 }
2536 
2537 /**
2538  * This "operation" for walk_over_reftable() writes the refblock to disk at the
2539  * offset specified by the new reftable's entry. It does not modify the new
2540  * reftable or change any refcounts.
2541  */
2542 static int flush_refblock(BlockDriverState *bs, uint64_t **reftable,
2543                           uint64_t reftable_index, uint64_t *reftable_size,
2544                           void *refblock, bool refblock_empty, bool *allocated,
2545                           Error **errp)
2546 {
2547     BDRVQcow2State *s = bs->opaque;
2548     int64_t offset;
2549     int ret;
2550 
2551     if (reftable_index < *reftable_size && (*reftable)[reftable_index]) {
2552         offset = (*reftable)[reftable_index];
2553 
2554         ret = qcow2_pre_write_overlap_check(bs, 0, offset, s->cluster_size);
2555         if (ret < 0) {
2556             error_setg_errno(errp, -ret, "Overlap check failed");
2557             return ret;
2558         }
2559 
2560         ret = bdrv_pwrite(bs->file, offset, refblock, s->cluster_size);
2561         if (ret < 0) {
2562             error_setg_errno(errp, -ret, "Failed to write refblock");
2563             return ret;
2564         }
2565     } else {
2566         assert(refblock_empty);
2567     }
2568 
2569     return 0;
2570 }
2571 
2572 /**
2573  * This function walks over the existing reftable and every referenced refblock;
2574  * if @new_set_refcount is non-NULL, it is called for every refcount entry to
2575  * create an equal new entry in the passed @new_refblock. Once that
2576  * @new_refblock is completely filled, @operation will be called.
2577  *
2578  * @status_cb and @cb_opaque are used for the amend operation's status callback.
2579  * @index is the index of the walk_over_reftable() calls and @total is the total
2580  * number of walk_over_reftable() calls per amend operation. Both are used for
2581  * calculating the parameters for the status callback.
2582  *
2583  * @allocated is set to true if a new cluster has been allocated.
2584  */
2585 static int walk_over_reftable(BlockDriverState *bs, uint64_t **new_reftable,
2586                               uint64_t *new_reftable_index,
2587                               uint64_t *new_reftable_size,
2588                               void *new_refblock, int new_refblock_size,
2589                               int new_refcount_bits,
2590                               RefblockFinishOp *operation, bool *allocated,
2591                               Qcow2SetRefcountFunc *new_set_refcount,
2592                               BlockDriverAmendStatusCB *status_cb,
2593                               void *cb_opaque, int index, int total,
2594                               Error **errp)
2595 {
2596     BDRVQcow2State *s = bs->opaque;
2597     uint64_t reftable_index;
2598     bool new_refblock_empty = true;
2599     int refblock_index;
2600     int new_refblock_index = 0;
2601     int ret;
2602 
2603     for (reftable_index = 0; reftable_index < s->refcount_table_size;
2604          reftable_index++)
2605     {
2606         uint64_t refblock_offset = s->refcount_table[reftable_index]
2607                                  & REFT_OFFSET_MASK;
2608 
2609         status_cb(bs, (uint64_t)index * s->refcount_table_size + reftable_index,
2610                   (uint64_t)total * s->refcount_table_size, cb_opaque);
2611 
2612         if (refblock_offset) {
2613             void *refblock;
2614 
2615             if (offset_into_cluster(s, refblock_offset)) {
2616                 qcow2_signal_corruption(bs, true, -1, -1, "Refblock offset %#"
2617                                         PRIx64 " unaligned (reftable index: %#"
2618                                         PRIx64 ")", refblock_offset,
2619                                         reftable_index);
2620                 error_setg(errp,
2621                            "Image is corrupt (unaligned refblock offset)");
2622                 return -EIO;
2623             }
2624 
2625             ret = qcow2_cache_get(bs, s->refcount_block_cache, refblock_offset,
2626                                   &refblock);
2627             if (ret < 0) {
2628                 error_setg_errno(errp, -ret, "Failed to retrieve refblock");
2629                 return ret;
2630             }
2631 
2632             for (refblock_index = 0; refblock_index < s->refcount_block_size;
2633                  refblock_index++)
2634             {
2635                 uint64_t refcount;
2636 
2637                 if (new_refblock_index >= new_refblock_size) {
2638                     /* new_refblock is now complete */
2639                     ret = operation(bs, new_reftable, *new_reftable_index,
2640                                     new_reftable_size, new_refblock,
2641                                     new_refblock_empty, allocated, errp);
2642                     if (ret < 0) {
2643                         qcow2_cache_put(bs, s->refcount_block_cache, &refblock);
2644                         return ret;
2645                     }
2646 
2647                     (*new_reftable_index)++;
2648                     new_refblock_index = 0;
2649                     new_refblock_empty = true;
2650                 }
2651 
2652                 refcount = s->get_refcount(refblock, refblock_index);
2653                 if (new_refcount_bits < 64 && refcount >> new_refcount_bits) {
2654                     uint64_t offset;
2655 
2656                     qcow2_cache_put(bs, s->refcount_block_cache, &refblock);
2657 
2658                     offset = ((reftable_index << s->refcount_block_bits)
2659                               + refblock_index) << s->cluster_bits;
2660 
2661                     error_setg(errp, "Cannot decrease refcount entry width to "
2662                                "%i bits: Cluster at offset %#" PRIx64 " has a "
2663                                "refcount of %" PRIu64, new_refcount_bits,
2664                                offset, refcount);
2665                     return -EINVAL;
2666                 }
2667 
2668                 if (new_set_refcount) {
2669                     new_set_refcount(new_refblock, new_refblock_index++,
2670                                      refcount);
2671                 } else {
2672                     new_refblock_index++;
2673                 }
2674                 new_refblock_empty = new_refblock_empty && refcount == 0;
2675             }
2676 
2677             qcow2_cache_put(bs, s->refcount_block_cache, &refblock);
2678         } else {
2679             /* No refblock means every refcount is 0 */
2680             for (refblock_index = 0; refblock_index < s->refcount_block_size;
2681                  refblock_index++)
2682             {
2683                 if (new_refblock_index >= new_refblock_size) {
2684                     /* new_refblock is now complete */
2685                     ret = operation(bs, new_reftable, *new_reftable_index,
2686                                     new_reftable_size, new_refblock,
2687                                     new_refblock_empty, allocated, errp);
2688                     if (ret < 0) {
2689                         return ret;
2690                     }
2691 
2692                     (*new_reftable_index)++;
2693                     new_refblock_index = 0;
2694                     new_refblock_empty = true;
2695                 }
2696 
2697                 if (new_set_refcount) {
2698                     new_set_refcount(new_refblock, new_refblock_index++, 0);
2699                 } else {
2700                     new_refblock_index++;
2701                 }
2702             }
2703         }
2704     }
2705 
2706     if (new_refblock_index > 0) {
2707         /* Complete the potentially existing partially filled final refblock */
2708         if (new_set_refcount) {
2709             for (; new_refblock_index < new_refblock_size;
2710                  new_refblock_index++)
2711             {
2712                 new_set_refcount(new_refblock, new_refblock_index, 0);
2713             }
2714         }
2715 
2716         ret = operation(bs, new_reftable, *new_reftable_index,
2717                         new_reftable_size, new_refblock, new_refblock_empty,
2718                         allocated, errp);
2719         if (ret < 0) {
2720             return ret;
2721         }
2722 
2723         (*new_reftable_index)++;
2724     }
2725 
2726     status_cb(bs, (uint64_t)(index + 1) * s->refcount_table_size,
2727               (uint64_t)total * s->refcount_table_size, cb_opaque);
2728 
2729     return 0;
2730 }
2731 
2732 int qcow2_change_refcount_order(BlockDriverState *bs, int refcount_order,
2733                                 BlockDriverAmendStatusCB *status_cb,
2734                                 void *cb_opaque, Error **errp)
2735 {
2736     BDRVQcow2State *s = bs->opaque;
2737     Qcow2GetRefcountFunc *new_get_refcount;
2738     Qcow2SetRefcountFunc *new_set_refcount;
2739     void *new_refblock = qemu_blockalign(bs->file->bs, s->cluster_size);
2740     uint64_t *new_reftable = NULL, new_reftable_size = 0;
2741     uint64_t *old_reftable, old_reftable_size, old_reftable_offset;
2742     uint64_t new_reftable_index = 0;
2743     uint64_t i;
2744     int64_t new_reftable_offset = 0, allocated_reftable_size = 0;
2745     int new_refblock_size, new_refcount_bits = 1 << refcount_order;
2746     int old_refcount_order;
2747     int walk_index = 0;
2748     int ret;
2749     bool new_allocation;
2750 
2751     assert(s->qcow_version >= 3);
2752     assert(refcount_order >= 0 && refcount_order <= 6);
2753 
2754     /* see qcow2_open() */
2755     new_refblock_size = 1 << (s->cluster_bits - (refcount_order - 3));
2756 
2757     new_get_refcount = get_refcount_funcs[refcount_order];
2758     new_set_refcount = set_refcount_funcs[refcount_order];
2759 
2760 
2761     do {
2762         int total_walks;
2763 
2764         new_allocation = false;
2765 
2766         /* At least we have to do this walk and the one which writes the
2767          * refblocks; also, at least we have to do this loop here at least
2768          * twice (normally), first to do the allocations, and second to
2769          * determine that everything is correctly allocated, this then makes
2770          * three walks in total */
2771         total_walks = MAX(walk_index + 2, 3);
2772 
2773         /* First, allocate the structures so they are present in the refcount
2774          * structures */
2775         ret = walk_over_reftable(bs, &new_reftable, &new_reftable_index,
2776                                  &new_reftable_size, NULL, new_refblock_size,
2777                                  new_refcount_bits, &alloc_refblock,
2778                                  &new_allocation, NULL, status_cb, cb_opaque,
2779                                  walk_index++, total_walks, errp);
2780         if (ret < 0) {
2781             goto done;
2782         }
2783 
2784         new_reftable_index = 0;
2785 
2786         if (new_allocation) {
2787             if (new_reftable_offset) {
2788                 qcow2_free_clusters(bs, new_reftable_offset,
2789                                     allocated_reftable_size * sizeof(uint64_t),
2790                                     QCOW2_DISCARD_NEVER);
2791             }
2792 
2793             new_reftable_offset = qcow2_alloc_clusters(bs, new_reftable_size *
2794                                                            sizeof(uint64_t));
2795             if (new_reftable_offset < 0) {
2796                 error_setg_errno(errp, -new_reftable_offset,
2797                                  "Failed to allocate the new reftable");
2798                 ret = new_reftable_offset;
2799                 goto done;
2800             }
2801             allocated_reftable_size = new_reftable_size;
2802         }
2803     } while (new_allocation);
2804 
2805     /* Second, write the new refblocks */
2806     ret = walk_over_reftable(bs, &new_reftable, &new_reftable_index,
2807                              &new_reftable_size, new_refblock,
2808                              new_refblock_size, new_refcount_bits,
2809                              &flush_refblock, &new_allocation, new_set_refcount,
2810                              status_cb, cb_opaque, walk_index, walk_index + 1,
2811                              errp);
2812     if (ret < 0) {
2813         goto done;
2814     }
2815     assert(!new_allocation);
2816 
2817 
2818     /* Write the new reftable */
2819     ret = qcow2_pre_write_overlap_check(bs, 0, new_reftable_offset,
2820                                         new_reftable_size * sizeof(uint64_t));
2821     if (ret < 0) {
2822         error_setg_errno(errp, -ret, "Overlap check failed");
2823         goto done;
2824     }
2825 
2826     for (i = 0; i < new_reftable_size; i++) {
2827         cpu_to_be64s(&new_reftable[i]);
2828     }
2829 
2830     ret = bdrv_pwrite(bs->file, new_reftable_offset, new_reftable,
2831                       new_reftable_size * sizeof(uint64_t));
2832 
2833     for (i = 0; i < new_reftable_size; i++) {
2834         be64_to_cpus(&new_reftable[i]);
2835     }
2836 
2837     if (ret < 0) {
2838         error_setg_errno(errp, -ret, "Failed to write the new reftable");
2839         goto done;
2840     }
2841 
2842 
2843     /* Empty the refcount cache */
2844     ret = qcow2_cache_flush(bs, s->refcount_block_cache);
2845     if (ret < 0) {
2846         error_setg_errno(errp, -ret, "Failed to flush the refblock cache");
2847         goto done;
2848     }
2849 
2850     /* Update the image header to point to the new reftable; this only updates
2851      * the fields which are relevant to qcow2_update_header(); other fields
2852      * such as s->refcount_table or s->refcount_bits stay stale for now
2853      * (because we have to restore everything if qcow2_update_header() fails) */
2854     old_refcount_order  = s->refcount_order;
2855     old_reftable_size   = s->refcount_table_size;
2856     old_reftable_offset = s->refcount_table_offset;
2857 
2858     s->refcount_order        = refcount_order;
2859     s->refcount_table_size   = new_reftable_size;
2860     s->refcount_table_offset = new_reftable_offset;
2861 
2862     ret = qcow2_update_header(bs);
2863     if (ret < 0) {
2864         s->refcount_order        = old_refcount_order;
2865         s->refcount_table_size   = old_reftable_size;
2866         s->refcount_table_offset = old_reftable_offset;
2867         error_setg_errno(errp, -ret, "Failed to update the qcow2 header");
2868         goto done;
2869     }
2870 
2871     /* Now update the rest of the in-memory information */
2872     old_reftable = s->refcount_table;
2873     s->refcount_table = new_reftable;
2874 
2875     s->refcount_bits = 1 << refcount_order;
2876     s->refcount_max = UINT64_C(1) << (s->refcount_bits - 1);
2877     s->refcount_max += s->refcount_max - 1;
2878 
2879     s->refcount_block_bits = s->cluster_bits - (refcount_order - 3);
2880     s->refcount_block_size = 1 << s->refcount_block_bits;
2881 
2882     s->get_refcount = new_get_refcount;
2883     s->set_refcount = new_set_refcount;
2884 
2885     /* For cleaning up all old refblocks and the old reftable below the "done"
2886      * label */
2887     new_reftable        = old_reftable;
2888     new_reftable_size   = old_reftable_size;
2889     new_reftable_offset = old_reftable_offset;
2890 
2891 done:
2892     if (new_reftable) {
2893         /* On success, new_reftable actually points to the old reftable (and
2894          * new_reftable_size is the old reftable's size); but that is just
2895          * fine */
2896         for (i = 0; i < new_reftable_size; i++) {
2897             uint64_t offset = new_reftable[i] & REFT_OFFSET_MASK;
2898             if (offset) {
2899                 qcow2_free_clusters(bs, offset, s->cluster_size,
2900                                     QCOW2_DISCARD_OTHER);
2901             }
2902         }
2903         g_free(new_reftable);
2904 
2905         if (new_reftable_offset > 0) {
2906             qcow2_free_clusters(bs, new_reftable_offset,
2907                                 new_reftable_size * sizeof(uint64_t),
2908                                 QCOW2_DISCARD_OTHER);
2909         }
2910     }
2911 
2912     qemu_vfree(new_refblock);
2913     return ret;
2914 }
2915