xref: /openbmc/qemu/block/qcow2-refcount.c (revision 9febfa94b69b7146582c48a868bd2330ac45037f)
1 /*
2  * Block driver for the QCOW version 2 format
3  *
4  * Copyright (c) 2004-2006 Fabrice Bellard
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to deal
8  * in the Software without restriction, including without limitation the rights
9  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10  * copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in
14  * all copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22  * THE SOFTWARE.
23  */
24 
25 #include "qemu/osdep.h"
26 #include "block/block-io.h"
27 #include "qapi/error.h"
28 #include "qcow2.h"
29 #include "qemu/range.h"
30 #include "qemu/bswap.h"
31 #include "qemu/cutils.h"
32 #include "qemu/memalign.h"
33 #include "trace.h"
34 
35 static int64_t alloc_clusters_noref(BlockDriverState *bs, uint64_t size,
36                                     uint64_t max);
37 
38 G_GNUC_WARN_UNUSED_RESULT
39 static int update_refcount(BlockDriverState *bs,
40                            int64_t offset, int64_t length, uint64_t addend,
41                            bool decrease, enum qcow2_discard_type type);
42 
43 static uint64_t get_refcount_ro0(const void *refcount_array, uint64_t index);
44 static uint64_t get_refcount_ro1(const void *refcount_array, uint64_t index);
45 static uint64_t get_refcount_ro2(const void *refcount_array, uint64_t index);
46 static uint64_t get_refcount_ro3(const void *refcount_array, uint64_t index);
47 static uint64_t get_refcount_ro4(const void *refcount_array, uint64_t index);
48 static uint64_t get_refcount_ro5(const void *refcount_array, uint64_t index);
49 static uint64_t get_refcount_ro6(const void *refcount_array, uint64_t index);
50 
51 static void set_refcount_ro0(void *refcount_array, uint64_t index,
52                              uint64_t value);
53 static void set_refcount_ro1(void *refcount_array, uint64_t index,
54                              uint64_t value);
55 static void set_refcount_ro2(void *refcount_array, uint64_t index,
56                              uint64_t value);
57 static void set_refcount_ro3(void *refcount_array, uint64_t index,
58                              uint64_t value);
59 static void set_refcount_ro4(void *refcount_array, uint64_t index,
60                              uint64_t value);
61 static void set_refcount_ro5(void *refcount_array, uint64_t index,
62                              uint64_t value);
63 static void set_refcount_ro6(void *refcount_array, uint64_t index,
64                              uint64_t value);
65 
66 
/* Read accessors for each refcount entry width; indexed by s->refcount_order
 * (entry width is 1 << refcount_order bits), so the order of entries here
 * must not change */
static Qcow2GetRefcountFunc *const get_refcount_funcs[] = {
    &get_refcount_ro0,
    &get_refcount_ro1,
    &get_refcount_ro2,
    &get_refcount_ro3,
    &get_refcount_ro4,
    &get_refcount_ro5,
    &get_refcount_ro6
};
76 
/* Write accessors matching get_refcount_funcs[]; also indexed by
 * s->refcount_order, so the order of entries here must not change */
static Qcow2SetRefcountFunc *const set_refcount_funcs[] = {
    &set_refcount_ro0,
    &set_refcount_ro1,
    &set_refcount_ro2,
    &set_refcount_ro3,
    &set_refcount_ro4,
    &set_refcount_ro5,
    &set_refcount_ro6
};
86 
87 
88 /*********************************************************/
89 /* refcount handling */
90 
91 static void update_max_refcount_table_index(BDRVQcow2State *s)
92 {
93     unsigned i = s->refcount_table_size - 1;
94     while (i > 0 && (s->refcount_table[i] & REFT_OFFSET_MASK) == 0) {
95         i--;
96     }
97     /* Set s->max_refcount_table_index to the index of the last used entry */
98     s->max_refcount_table_index = i;
99 }
100 
/*
 * Reads the refcount table from the image file into s->refcount_table and
 * installs the refcount accessors matching s->refcount_order.
 * Returns 0 on success, -errno on failure.
 */
int coroutine_fn qcow2_refcount_init(BlockDriverState *bs)
{
    BDRVQcow2State *s = bs->opaque;
    unsigned int refcount_table_size2, i;
    int ret;

    /* refcount_order indexes the accessor tables below; 0..6 covers entry
     * widths from 1 to 64 bits */
    assert(s->refcount_order >= 0 && s->refcount_order <= 6);

    s->get_refcount = get_refcount_funcs[s->refcount_order];
    s->set_refcount = set_refcount_funcs[s->refcount_order];

    /* Guard the multiplication below against overflow */
    assert(s->refcount_table_size <= INT_MAX / REFTABLE_ENTRY_SIZE);
    refcount_table_size2 = s->refcount_table_size * REFTABLE_ENTRY_SIZE;
    s->refcount_table = g_try_malloc(refcount_table_size2);

    if (s->refcount_table_size > 0) {
        if (s->refcount_table == NULL) {
            ret = -ENOMEM;
            goto fail;
        }
        BLKDBG_CO_EVENT(bs->file, BLKDBG_REFTABLE_LOAD);
        ret = bdrv_co_pread(bs->file, s->refcount_table_offset,
                            refcount_table_size2, s->refcount_table, 0);
        if (ret < 0) {
            goto fail;
        }
        /* On-disk entries are big-endian; convert in place */
        for(i = 0; i < s->refcount_table_size; i++)
            be64_to_cpus(&s->refcount_table[i]);
        update_max_refcount_table_index(s);
    }
    return 0;
 fail:
    return ret;
}
135 
136 void qcow2_refcount_close(BlockDriverState *bs)
137 {
138     BDRVQcow2State *s = bs->opaque;
139     g_free(s->refcount_table);
140 }
141 
142 
/* Read a 1-bit refcount entry (refcount_order == 0; 8 entries per byte) */
static uint64_t get_refcount_ro0(const void *refcount_array, uint64_t index)
{
    const uint8_t *bytes = refcount_array;
    unsigned bit = index % 8;

    return (bytes[index / 8] >> bit) & 0x1;
}
147 
/* Write a 1-bit refcount entry (refcount_order == 0); @value must fit in
 * one bit */
static void set_refcount_ro0(void *refcount_array, uint64_t index,
                             uint64_t value)
{
    uint8_t *bytes = refcount_array;
    unsigned bit = index % 8;

    assert(!(value >> 1));
    bytes[index / 8] = (bytes[index / 8] & ~(0x1 << bit)) | (value << bit);
}
155 
/* Read a 2-bit refcount entry (refcount_order == 1; 4 entries per byte) */
static uint64_t get_refcount_ro1(const void *refcount_array, uint64_t index)
{
    const uint8_t *bytes = refcount_array;
    unsigned shift = 2 * (index % 4);

    return (bytes[index / 4] >> shift) & 0x3;
}
161 
/* Write a 2-bit refcount entry (refcount_order == 1); @value must fit in
 * two bits */
static void set_refcount_ro1(void *refcount_array, uint64_t index,
                             uint64_t value)
{
    uint8_t *bytes = refcount_array;
    unsigned shift = 2 * (index % 4);

    assert(!(value >> 2));
    bytes[index / 4] = (bytes[index / 4] & ~(0x3 << shift)) | (value << shift);
}
169 
/* Read a 4-bit refcount entry (refcount_order == 2; 2 entries per byte) */
static uint64_t get_refcount_ro2(const void *refcount_array, uint64_t index)
{
    const uint8_t *bytes = refcount_array;
    unsigned shift = 4 * (index % 2);

    return (bytes[index / 2] >> shift) & 0xf;
}
175 
/* Write a 4-bit refcount entry (refcount_order == 2); @value must fit in
 * one nibble */
static void set_refcount_ro2(void *refcount_array, uint64_t index,
                             uint64_t value)
{
    uint8_t *bytes = refcount_array;
    unsigned shift = 4 * (index % 2);

    assert(!(value >> 4));
    bytes[index / 2] = (bytes[index / 2] & ~(0xf << shift)) | (value << shift);
}
183 
/* Read an 8-bit refcount entry (refcount_order == 3; one byte per entry) */
static uint64_t get_refcount_ro3(const void *refcount_array, uint64_t index)
{
    const uint8_t *bytes = refcount_array;

    return bytes[index];
}
188 
/* Write an 8-bit refcount entry (refcount_order == 3); @value must fit in
 * one byte */
static void set_refcount_ro3(void *refcount_array, uint64_t index,
                             uint64_t value)
{
    uint8_t *bytes = refcount_array;

    assert(!(value >> 8));
    bytes[index] = value;
}
195 
/* Read a 16-bit refcount entry (refcount_order == 4); entries are stored
 * big-endian on disk */
static uint64_t get_refcount_ro4(const void *refcount_array, uint64_t index)
{
    const uint16_t *entries = refcount_array;

    return be16_to_cpu(entries[index]);
}
200 
/* Write a 16-bit refcount entry (refcount_order == 4) in big-endian byte
 * order; @value must fit in 16 bits */
static void set_refcount_ro4(void *refcount_array, uint64_t index,
                             uint64_t value)
{
    uint16_t *entries = refcount_array;

    assert(!(value >> 16));
    entries[index] = cpu_to_be16(value);
}
207 
/* Read a 32-bit refcount entry (refcount_order == 5); entries are stored
 * big-endian on disk */
static uint64_t get_refcount_ro5(const void *refcount_array, uint64_t index)
{
    const uint32_t *entries = refcount_array;

    return be32_to_cpu(entries[index]);
}
212 
/* Write a 32-bit refcount entry (refcount_order == 5) in big-endian byte
 * order; @value must fit in 32 bits */
static void set_refcount_ro5(void *refcount_array, uint64_t index,
                             uint64_t value)
{
    uint32_t *entries = refcount_array;

    assert(!(value >> 32));
    entries[index] = cpu_to_be32(value);
}
219 
/* Read a 64-bit refcount entry (refcount_order == 6); entries are stored
 * big-endian on disk */
static uint64_t get_refcount_ro6(const void *refcount_array, uint64_t index)
{
    const uint64_t *entries = refcount_array;

    return be64_to_cpu(entries[index]);
}
224 
/* Write a 64-bit refcount entry (refcount_order == 6) in big-endian byte
 * order; any 64-bit value is representable, so no range assertion needed */
static void set_refcount_ro6(void *refcount_array, uint64_t index,
                             uint64_t value)
{
    uint64_t *entries = refcount_array;

    entries[index] = cpu_to_be64(value);
}
230 
231 
232 static int GRAPH_RDLOCK
233 load_refcount_block(BlockDriverState *bs, int64_t refcount_block_offset,
234                     void **refcount_block)
235 {
236     BDRVQcow2State *s = bs->opaque;
237 
238     BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_LOAD);
239     return qcow2_cache_get(bs, s->refcount_block_cache, refcount_block_offset,
240                            refcount_block);
241 }
242 
243 /*
244  * Retrieves the refcount of the cluster given by its index and stores it in
245  * *refcount. Returns 0 on success and -errno on failure.
246  */
int qcow2_get_refcount(BlockDriverState *bs, int64_t cluster_index,
                       uint64_t *refcount)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t refcount_table_index, block_index;
    int64_t refcount_block_offset;
    int ret;
    void *refcount_block;

    /* Clusters beyond the end of the reftable have no refblock and thus an
     * implicit refcount of 0 */
    refcount_table_index = cluster_index >> s->refcount_block_bits;
    if (refcount_table_index >= s->refcount_table_size) {
        *refcount = 0;
        return 0;
    }
    refcount_block_offset =
        s->refcount_table[refcount_table_index] & REFT_OFFSET_MASK;
    /* An unallocated refblock likewise means refcount 0 for every cluster
     * it would cover */
    if (!refcount_block_offset) {
        *refcount = 0;
        return 0;
    }

    /* Refblocks must be cluster aligned; anything else indicates image
     * corruption */
    if (offset_into_cluster(s, refcount_block_offset)) {
        qcow2_signal_corruption(bs, true, -1, -1, "Refblock offset %#" PRIx64
                                " unaligned (reftable index: %#" PRIx64 ")",
                                refcount_block_offset, refcount_table_index);
        return -EIO;
    }

    ret = qcow2_cache_get(bs, s->refcount_block_cache, refcount_block_offset,
                          &refcount_block);
    if (ret < 0) {
        return ret;
    }

    /* Index of the entry within the refblock */
    block_index = cluster_index & (s->refcount_block_size - 1);
    *refcount = s->get_refcount(refcount_block, block_index);

    /* Drop the cache reference taken by qcow2_cache_get() */
    qcow2_cache_put(s->refcount_block_cache, &refcount_block);

    return 0;
}
288 
289 /* Checks if two offsets are described by the same refcount block */
290 static int in_same_refcount_block(BDRVQcow2State *s, uint64_t offset_a,
291     uint64_t offset_b)
292 {
293     uint64_t block_a = offset_a >> (s->cluster_bits + s->refcount_block_bits);
294     uint64_t block_b = offset_b >> (s->cluster_bits + s->refcount_block_bits);
295 
296     return (block_a == block_b);
297 }
298 
299 /*
300  * Loads a refcount block. If it doesn't exist yet, it is allocated first
301  * (including growing the refcount table if needed).
302  *
303  * Returns 0 on success or -errno in error case
304  */
static int GRAPH_RDLOCK
alloc_refcount_block(BlockDriverState *bs, int64_t cluster_index,
                     void **refcount_block)
{
    BDRVQcow2State *s = bs->opaque;
    unsigned int refcount_table_index;
    int64_t ret;

    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC);

    /* Find the refcount block for the given cluster */
    refcount_table_index = cluster_index >> s->refcount_block_bits;

    if (refcount_table_index < s->refcount_table_size) {

        uint64_t refcount_block_offset =
            s->refcount_table[refcount_table_index] & REFT_OFFSET_MASK;

        /* If it's already there, we're done */
        if (refcount_block_offset) {
            /* Refblocks must be cluster aligned; anything else indicates
             * image corruption */
            if (offset_into_cluster(s, refcount_block_offset)) {
                qcow2_signal_corruption(bs, true, -1, -1, "Refblock offset %#"
                                        PRIx64 " unaligned (reftable index: "
                                        "%#x)", refcount_block_offset,
                                        refcount_table_index);
                return -EIO;
            }

             return load_refcount_block(bs, refcount_block_offset,
                                        refcount_block);
        }
    }

    /*
     * If we came here, we need to allocate something. Something is at least
     * a cluster for the new refcount block. It may also include a new refcount
     * table if the old refcount table is too small.
     *
     * Note that allocating clusters here needs some special care:
     *
     * - We can't use the normal qcow2_alloc_clusters(), it would try to
     *   increase the refcount and very likely we would end up with an endless
     *   recursion. Instead we must place the refcount blocks in a way that
     *   they can describe them themselves.
     *
     * - We need to consider that at this point we are inside update_refcounts
     *   and potentially doing an initial refcount increase. This means that
     *   some clusters have already been allocated by the caller, but their
     *   refcount isn't accurate yet. If we allocate clusters for metadata, we
     *   need to return -EAGAIN to signal the caller that it needs to restart
     *   the search for free clusters.
     *
     * - alloc_clusters_noref and qcow2_free_clusters may load a different
     *   refcount block into the cache
     */

    /* Cleared so the fail path knows whether a cache reference must be
     * dropped */
    *refcount_block = NULL;

    /* We write to the refcount table, so we might depend on L2 tables */
    ret = qcow2_cache_flush(bs, s->l2_table_cache);
    if (ret < 0) {
        return ret;
    }

    /* Allocate the refcount block itself and mark it as used */
    int64_t new_block = alloc_clusters_noref(bs, s->cluster_size, INT64_MAX);
    if (new_block < 0) {
        return new_block;
    }

    /* The offset must fit in the offset field of the refcount table entry */
    assert((new_block & REFT_OFFSET_MASK) == new_block);

    /* If we're allocating the block at offset 0 then something is wrong */
    if (new_block == 0) {
        qcow2_signal_corruption(bs, true, -1, -1, "Preventing invalid "
                                "allocation of refcount block at offset 0");
        return -EIO;
    }

#ifdef DEBUG_ALLOC2
    fprintf(stderr, "qcow2: Allocate refcount block %d for %" PRIx64
        " at %" PRIx64 "\n",
        refcount_table_index, cluster_index << s->cluster_bits, new_block);
#endif

    if (in_same_refcount_block(s, new_block, cluster_index << s->cluster_bits)) {
        /* Zero the new refcount block before updating it */
        ret = qcow2_cache_get_empty(bs, s->refcount_block_cache, new_block,
                                    refcount_block);
        if (ret < 0) {
            goto fail;
        }

        memset(*refcount_block, 0, s->cluster_size);

        /* The block describes itself, need to update the cache */
        int block_index = (new_block >> s->cluster_bits) &
            (s->refcount_block_size - 1);
        s->set_refcount(*refcount_block, block_index, 1);
    } else {
        /* Described somewhere else. This can recurse at most twice before we
         * arrive at a block that describes itself. */
        ret = update_refcount(bs, new_block, s->cluster_size, 1, false,
                              QCOW2_DISCARD_NEVER);
        if (ret < 0) {
            goto fail;
        }

        ret = qcow2_cache_flush(bs, s->refcount_block_cache);
        if (ret < 0) {
            goto fail;
        }

        /* Initialize the new refcount block only after updating its refcount,
         * update_refcount uses the refcount cache itself */
        ret = qcow2_cache_get_empty(bs, s->refcount_block_cache, new_block,
                                    refcount_block);
        if (ret < 0) {
            goto fail;
        }

        memset(*refcount_block, 0, s->cluster_size);
    }

    /* Now the new refcount block needs to be written to disk */
    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE);
    qcow2_cache_entry_mark_dirty(s->refcount_block_cache, *refcount_block);
    ret = qcow2_cache_flush(bs, s->refcount_block_cache);
    if (ret < 0) {
        goto fail;
    }

    /* If the refcount table is big enough, just hook the block up there */
    if (refcount_table_index < s->refcount_table_size) {
        uint64_t data64 = cpu_to_be64(new_block);
        BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_HOOKUP);
        /* Write the reftable entry to disk before updating it in memory, so
         * the on-disk state never references an uninitialized refblock */
        ret = bdrv_pwrite_sync(bs->file, s->refcount_table_offset +
                               refcount_table_index * REFTABLE_ENTRY_SIZE,
            sizeof(data64), &data64, 0);
        if (ret < 0) {
            goto fail;
        }

        s->refcount_table[refcount_table_index] = new_block;
        /* If there's a hole in s->refcount_table then it can happen
         * that refcount_table_index < s->max_refcount_table_index */
        s->max_refcount_table_index =
            MAX(s->max_refcount_table_index, refcount_table_index);

        /* The new refcount block may be where the caller intended to put its
         * data, so let it restart the search. */
        return -EAGAIN;
    }

    qcow2_cache_put(s->refcount_block_cache, refcount_block);

    /*
     * If we come here, we need to grow the refcount table. Again, a new
     * refcount table needs some space and we can't simply allocate to avoid
     * endless recursion.
     *
     * Therefore let's grab new refcount blocks at the end of the image, which
     * will describe themselves and the new refcount table. This way we can
     * reference them only in the new table and do the switch to the new
     * refcount table at once without producing an inconsistent state in
     * between.
     */
    BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_GROW);

    /* Calculate the number of refcount blocks needed so far; this will be the
     * basis for calculating the index of the first cluster used for the
     * self-describing refcount structures which we are about to create.
     *
     * Because we reached this point, there cannot be any refcount entries for
     * cluster_index or higher indices yet. However, because new_block has been
     * allocated to describe that cluster (and it will assume this role later
     * on), we cannot use that index; also, new_block may actually have a higher
     * cluster index than cluster_index, so it needs to be taken into account
     * here (and 1 needs to be added to its value because that cluster is used).
     */
    uint64_t blocks_used = DIV_ROUND_UP(MAX(cluster_index + 1,
                                            (new_block >> s->cluster_bits) + 1),
                                        s->refcount_block_size);

    /* Create the new refcount table and blocks */
    /* meta_offset is the first cluster not covered by the refblocks counted
     * above, i.e. where the new self-describing structures will start */
    uint64_t meta_offset = (blocks_used * s->refcount_block_size) *
        s->cluster_size;

    ret = qcow2_refcount_area(bs, meta_offset, 0, false,
                              refcount_table_index, new_block);
    if (ret < 0) {
        return ret;
    }

    ret = load_refcount_block(bs, new_block, refcount_block);
    if (ret < 0) {
        return ret;
    }

    /* If we were trying to do the initial refcount update for some cluster
     * allocation, we might have used the same clusters to store newly
     * allocated metadata. Make the caller search some new space. */
    return -EAGAIN;

fail:
    if (*refcount_block != NULL) {
        qcow2_cache_put(s->refcount_block_cache, refcount_block);
    }
    return ret;
}
516 
517 /*
518  * Starting at @start_offset, this function creates new self-covering refcount
519  * structures: A new refcount table and refcount blocks which cover all of
520  * themselves, and a number of @additional_clusters beyond their end.
521  * @start_offset must be at the end of the image file, that is, there must be
522  * only empty space beyond it.
523  * If @exact_size is false, the refcount table will have 50 % more entries than
524  * necessary so it will not need to grow again soon.
525  * If @new_refblock_offset is not zero, it contains the offset of a refcount
526  * block that should be entered into the new refcount table at index
527  * @new_refblock_index.
528  *
529  * Returns: The offset after the new refcount structures (i.e. where the
530  *          @additional_clusters may be placed) on success, -errno on error.
531  */
int64_t qcow2_refcount_area(BlockDriverState *bs, uint64_t start_offset,
                            uint64_t additional_clusters, bool exact_size,
                            int new_refblock_index,
                            uint64_t new_refblock_offset)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t total_refblock_count_u64, additional_refblock_count;
    int total_refblock_count, table_size, area_reftable_index, table_clusters;
    int i;
    uint64_t table_offset, block_offset, end_offset;
    int ret;
    uint64_t *new_table;

    /* The caller must hand us a cluster-aligned start offset */
    assert(!(start_offset % s->cluster_size));

    /* How many refblocks are needed to cover the image up to @start_offset
     * plus @additional_clusters (including the metadata we create here) */
    qcow2_refcount_metadata_size(start_offset / s->cluster_size +
                                 additional_clusters,
                                 s->cluster_size, s->refcount_order,
                                 !exact_size, &total_refblock_count_u64);
    if (total_refblock_count_u64 > QCOW_MAX_REFTABLE_SIZE) {
        return -EFBIG;
    }
    total_refblock_count = total_refblock_count_u64;

    /* Index in the refcount table of the first refcount block to cover the area
     * of refcount structures we are about to create; we know that
     * @total_refblock_count can cover @start_offset, so this will definitely
     * fit into an int. */
    area_reftable_index = (start_offset / s->cluster_size) /
                          s->refcount_block_size;

    if (exact_size) {
        table_size = total_refblock_count;
    } else {
        /* Add 50 % slack so the reftable does not need to grow again soon */
        table_size = total_refblock_count +
                     DIV_ROUND_UP(total_refblock_count, 2);
    }
    /* The qcow2 file can only store the reftable size in number of clusters */
    table_size = ROUND_UP(table_size, s->cluster_size / REFTABLE_ENTRY_SIZE);
    table_clusters = (table_size * REFTABLE_ENTRY_SIZE) / s->cluster_size;

    if (table_size > QCOW_MAX_REFTABLE_SIZE) {
        return -EFBIG;
    }

    new_table = g_try_new0(uint64_t, table_size);

    assert(table_size > 0);
    if (new_table == NULL) {
        ret = -ENOMEM;
        goto fail;
    }

    /* Fill the new refcount table */
    if (table_size > s->max_refcount_table_index) {
        /* We're actually growing the reftable */
        memcpy(new_table, s->refcount_table,
               (s->max_refcount_table_index + 1) * REFTABLE_ENTRY_SIZE);
    } else {
        /* Improbable case: We're shrinking the reftable. However, the caller
         * has assured us that there is only empty space beyond @start_offset,
         * so we can simply drop all of the refblocks that won't fit into the
         * new reftable. */
        memcpy(new_table, s->refcount_table, table_size * REFTABLE_ENTRY_SIZE);
    }

    /* Enter the refblock the caller has already allocated (if any) */
    if (new_refblock_offset) {
        assert(new_refblock_index < total_refblock_count);
        new_table[new_refblock_index] = new_refblock_offset;
    }

    /* Count how many new refblocks we have to create */
    additional_refblock_count = 0;
    for (i = area_reftable_index; i < total_refblock_count; i++) {
        if (!new_table[i]) {
            additional_refblock_count++;
        }
    }

    /* Layout: new refblocks at @start_offset, then the new reftable */
    table_offset = start_offset + additional_refblock_count * s->cluster_size;
    end_offset = table_offset + table_clusters * s->cluster_size;

    /* Fill the refcount blocks, and create new ones, if necessary */
    block_offset = start_offset;
    for (i = area_reftable_index; i < total_refblock_count; i++) {
        void *refblock_data;
        uint64_t first_offset_covered;

        /* Reuse an existing refblock if possible, create a new one otherwise */
        if (new_table[i]) {
            ret = qcow2_cache_get(bs, s->refcount_block_cache, new_table[i],
                                  &refblock_data);
            if (ret < 0) {
                goto fail;
            }
        } else {
            ret = qcow2_cache_get_empty(bs, s->refcount_block_cache,
                                        block_offset, &refblock_data);
            if (ret < 0) {
                goto fail;
            }
            memset(refblock_data, 0, s->cluster_size);
            qcow2_cache_entry_mark_dirty(s->refcount_block_cache,
                                         refblock_data);

            new_table[i] = block_offset;
            block_offset += s->cluster_size;
        }

        /* First host offset covered by this refblock */
        first_offset_covered = (uint64_t)i * s->refcount_block_size *
                               s->cluster_size;
        if (first_offset_covered < end_offset) {
            int j, end_index;

            /* Set the refcount of all of the new refcount structures to 1 */

            if (first_offset_covered < start_offset) {
                assert(i == area_reftable_index);
                j = (start_offset - first_offset_covered) / s->cluster_size;
                assert(j < s->refcount_block_size);
            } else {
                j = 0;
            }

            end_index = MIN((end_offset - first_offset_covered) /
                            s->cluster_size,
                            s->refcount_block_size);

            for (; j < end_index; j++) {
                /* The caller guaranteed us this space would be empty */
                assert(s->get_refcount(refblock_data, j) == 0);
                s->set_refcount(refblock_data, j, 1);
            }

            qcow2_cache_entry_mark_dirty(s->refcount_block_cache,
                                         refblock_data);
        }

        qcow2_cache_put(s->refcount_block_cache, &refblock_data);
    }

    assert(block_offset == table_offset);

    /* Write refcount blocks to disk */
    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE_BLOCKS);
    ret = qcow2_cache_flush(bs, s->refcount_block_cache);
    if (ret < 0) {
        goto fail;
    }

    /* Write refcount table to disk (entries are big-endian on disk) */
    for (i = 0; i < total_refblock_count; i++) {
        cpu_to_be64s(&new_table[i]);
    }

    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE_TABLE);
    ret = bdrv_pwrite_sync(bs->file, table_offset,
                           table_size * REFTABLE_ENTRY_SIZE, new_table, 0);
    if (ret < 0) {
        goto fail;
    }

    /* Convert back to CPU byte order; new_table becomes the in-memory
     * reftable below */
    for (i = 0; i < total_refblock_count; i++) {
        be64_to_cpus(&new_table[i]);
    }

    /* Hook up the new refcount table in the qcow2 header */
    struct QEMU_PACKED {
        uint64_t d64;
        uint32_t d32;
    } data;
    data.d64 = cpu_to_be64(table_offset);
    data.d32 = cpu_to_be32(table_clusters);
    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_SWITCH_TABLE);
    /* Single write updates both header fields at once, so the switch to the
     * new reftable is atomic on disk */
    ret = bdrv_pwrite_sync(bs->file,
                           offsetof(QCowHeader, refcount_table_offset),
                           sizeof(data), &data, 0);
    if (ret < 0) {
        goto fail;
    }

    /* And switch it in memory */
    uint64_t old_table_offset = s->refcount_table_offset;
    uint64_t old_table_size = s->refcount_table_size;

    g_free(s->refcount_table);
    s->refcount_table = new_table;
    s->refcount_table_size = table_size;
    s->refcount_table_offset = table_offset;
    update_max_refcount_table_index(s);

    /* Free old table. */
    qcow2_free_clusters(bs, old_table_offset,
                        old_table_size * REFTABLE_ENTRY_SIZE,
                        QCOW2_DISCARD_OTHER);

    return end_offset;

fail:
    g_free(new_table);
    return ret;
}
735 
/*
 * Issue all queued discard requests against the image file and empty the
 * queue. If @ret < 0 (the caller's operation failed), the queued regions are
 * dropped without being discarded.
 */
void qcow2_process_discards(BlockDriverState *bs, int ret)
{
    BDRVQcow2State *s = bs->opaque;
    Qcow2DiscardRegion *d, *next;

    QTAILQ_FOREACH_SAFE(d, &s->discards, next, next) {
        QTAILQ_REMOVE(&s->discards, d, next);

        /* Discard is optional, ignore the return value */
        if (ret >= 0) {
            int r2 = bdrv_pdiscard(bs->file, d->offset, d->bytes);
            if (r2 < 0) {
                /* Only trace the failure; the data is unreferenced either
                 * way */
                trace_qcow2_process_discards_failed_region(d->offset, d->bytes,
                                                           r2);
            }
        }

        g_free(d);
    }
}
756 
/*
 * Queue [offset, offset + length) for a later discard by
 * qcow2_process_discards(). Adjacent regions are merged so the queue holds
 * at most one entry per contiguous area; queued regions must never overlap
 * because their clusters are already unreferenced.
 */
static void queue_discard(BlockDriverState *bs,
                          uint64_t offset, uint64_t length)
{
    BDRVQcow2State *s = bs->opaque;
    Qcow2DiscardRegion *d, *p, *next;

    /* Try to extend an existing region that touches the new range */
    QTAILQ_FOREACH(d, &s->discards, next) {
        uint64_t new_start = MIN(offset, d->offset);
        uint64_t new_end = MAX(offset + length, d->offset + d->bytes);

        if (new_end - new_start <= length + d->bytes) {
            /* There can't be any overlap, areas ending up here have no
             * references any more and therefore shouldn't get freed another
             * time. */
            assert(d->bytes + length == new_end - new_start);
            d->offset = new_start;
            d->bytes = new_end - new_start;
            goto found;
        }
    }

    /* No adjacent region found; queue a new one */
    d = g_malloc(sizeof(*d));
    *d = (Qcow2DiscardRegion) {
        .bs     = bs,
        .offset = offset,
        .bytes  = length,
    };
    QTAILQ_INSERT_TAIL(&s->discards, d, next);

found:
    /* Merge discard requests if they are adjacent now */
    QTAILQ_FOREACH_SAFE(p, &s->discards, next, next) {
        if (p == d
            || p->offset > d->offset + d->bytes
            || d->offset > p->offset + p->bytes)
        {
            continue;
        }

        /* Still no overlap possible */
        assert(p->offset == d->offset + d->bytes
            || d->offset == p->offset + p->bytes);

        /* Fold @p into @d and drop it from the queue */
        QTAILQ_REMOVE(&s->discards, p, next);
        d->offset = MIN(d->offset, p->offset);
        d->bytes += p->bytes;
        g_free(p);
    }
}
806 
/* XXX: cache several refcount block clusters ? */
/* @addend is the absolute value of the addend; if @decrease is set, @addend
 * will be subtracted from the current refcount, otherwise it will be added.
 *
 * Applies the change to every cluster overlapping [@offset, @offset +
 * @length).  Clusters whose refcount drops to zero are queued for discard
 * when the discard settings for @type allow it.
 *
 * Returns 0 on success, -errno on failure; on failure, an attempt is made
 * to roll back the part of the update that had already been applied. */
static int GRAPH_RDLOCK
update_refcount(BlockDriverState *bs, int64_t offset, int64_t length,
                uint64_t addend, bool decrease, enum qcow2_discard_type type)
{
    BDRVQcow2State *s = bs->opaque;
    int64_t start, last, cluster_offset;
    void *refcount_block = NULL;
    int64_t old_table_index = -1;
    int ret;

#ifdef DEBUG_ALLOC2
    fprintf(stderr, "update_refcount: offset=%" PRId64 " size=%" PRId64
            " addend=%s%" PRIu64 "\n", offset, length, decrease ? "-" : "",
            addend);
#endif
    if (length < 0) {
        return -EINVAL;
    } else if (length == 0) {
        return 0;
    }

    if (decrease) {
        /* Make the refcount block cache depend on the L2 table cache, so
         * that the L2 tables are flushed before the refcount updates hit
         * the disk (a freed cluster must not appear free on disk while an
         * on-disk L2 entry still references it) */
        qcow2_cache_set_dependency(bs, s->refcount_block_cache,
            s->l2_table_cache);
    }

    start = start_of_cluster(s, offset);
    last = start_of_cluster(s, offset + length - 1);
    for(cluster_offset = start; cluster_offset <= last;
        cluster_offset += s->cluster_size)
    {
        int block_index;
        uint64_t refcount;
        int64_t cluster_index = cluster_offset >> s->cluster_bits;
        int64_t table_index = cluster_index >> s->refcount_block_bits;

        /* Load the refcount block and allocate it if needed */
        if (table_index != old_table_index) {
            if (refcount_block) {
                qcow2_cache_put(s->refcount_block_cache, &refcount_block);
            }
            ret = alloc_refcount_block(bs, cluster_index, &refcount_block);
            /* If the caller needs to restart the search for free clusters,
             * try the same ones first to see if they're still free. */
            if (ret == -EAGAIN) {
                if (s->free_cluster_index > (start >> s->cluster_bits)) {
                    s->free_cluster_index = (start >> s->cluster_bits);
                }
            }
            if (ret < 0) {
                goto fail;
            }
        }
        old_table_index = table_index;

        qcow2_cache_entry_mark_dirty(s->refcount_block_cache, refcount_block);

        /* we can update the count and save it */
        block_index = cluster_index & (s->refcount_block_size - 1);

        refcount = s->get_refcount(refcount_block, block_index);
        /* Reject underflow as well as overflow beyond the maximum value
         * representable with the image's refcount entry width */
        if (decrease ? (refcount - addend > refcount)
                     : (refcount + addend < refcount ||
                        refcount + addend > s->refcount_max))
        {
            ret = -EINVAL;
            goto fail;
        }
        if (decrease) {
            refcount -= addend;
        } else {
            refcount += addend;
        }
        /* Remember freed clusters so the next allocation scan may start at
         * (or before) them */
        if (refcount == 0 && cluster_index < s->free_cluster_index) {
            s->free_cluster_index = cluster_index;
        }
        s->set_refcount(refcount_block, block_index, refcount);

        if (refcount == 0) {
            void *table;

            /* If cached metadata lives in the freed area, drop the stale
             * cache entries so they cannot be written back later */
            table = qcow2_cache_is_table_offset(s->refcount_block_cache,
                                                offset);
            if (table != NULL) {
                qcow2_cache_put(s->refcount_block_cache, &refcount_block);
                old_table_index = -1;
                qcow2_cache_discard(s->refcount_block_cache, table);
            }

            table = qcow2_cache_is_table_offset(s->l2_table_cache, offset);
            if (table != NULL) {
                qcow2_cache_discard(s->l2_table_cache, table);
            }

            if (s->discard_passthrough[type]) {
                queue_discard(bs, cluster_offset, s->cluster_size);
            }
        }
    }

    ret = 0;
fail:
    if (!s->cache_discards) {
        qcow2_process_discards(bs, ret);
    }

    /* Write last changed block to disk */
    if (refcount_block) {
        qcow2_cache_put(s->refcount_block_cache, &refcount_block);
    }

    /*
     * Try to undo any updates if an error is returned (This may succeed in
     * some cases like ENOSPC for allocating a new refcount block)
     */
    if (ret < 0) {
        int dummy;
        dummy = update_refcount(bs, offset, cluster_offset - offset, addend,
                                !decrease, QCOW2_DISCARD_NEVER);
        (void)dummy;
    }

    return ret;
}
934 
935 /*
936  * Increases or decreases the refcount of a given cluster.
937  *
938  * @addend is the absolute value of the addend; if @decrease is set, @addend
939  * will be subtracted from the current refcount, otherwise it will be added.
940  *
941  * On success 0 is returned; on failure -errno is returned.
942  */
943 int qcow2_update_cluster_refcount(BlockDriverState *bs,
944                                   int64_t cluster_index,
945                                   uint64_t addend, bool decrease,
946                                   enum qcow2_discard_type type)
947 {
948     BDRVQcow2State *s = bs->opaque;
949     int ret;
950 
951     ret = update_refcount(bs, cluster_index << s->cluster_bits, 1, addend,
952                           decrease, type);
953     if (ret < 0) {
954         return ret;
955     }
956 
957     return 0;
958 }
959 
960 
961 
962 /*********************************************************/
963 /* cluster allocation functions */
964 
965 
966 
/* return < 0 if error */
/*
 * Finds a run of contiguous free clusters big enough for @size bytes,
 * scanning forward from s->free_cluster_index.  Refcounts are NOT updated;
 * the caller has to do that itself (hence "noref").
 *
 * Returns the host offset of the first cluster of the run on success;
 * -EFBIG if the resulting offsets would exceed @max; another -errno on
 * refcount read errors.
 */
static int64_t GRAPH_RDLOCK
alloc_clusters_noref(BlockDriverState *bs, uint64_t size, uint64_t max)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t i, nb_clusters, refcount;
    int ret;

    /* We can't allocate clusters if they may still be queued for discard. */
    if (s->cache_discards) {
        qcow2_process_discards(bs, 0);
    }

    nb_clusters = size_to_clusters(s, size);
retry:
    for(i = 0; i < nb_clusters; i++) {
        /* Take the next candidate cluster; if it is in use, restart the
         * whole run right after it */
        uint64_t next_cluster_index = s->free_cluster_index++;
        ret = qcow2_get_refcount(bs, next_cluster_index, &refcount);

        if (ret < 0) {
            return ret;
        } else if (refcount != 0) {
            goto retry;
        }
    }

    /* Make sure that all offsets in the "allocated" range are representable
     * in the requested max */
    if (s->free_cluster_index > 0 &&
        s->free_cluster_index - 1 > (max >> s->cluster_bits))
    {
        return -EFBIG;
    }

#ifdef DEBUG_ALLOC2
    fprintf(stderr, "alloc_clusters: size=%" PRId64 " -> %" PRId64 "\n",
            size,
            (s->free_cluster_index - nb_clusters) << s->cluster_bits);
#endif
    return (s->free_cluster_index - nb_clusters) << s->cluster_bits;
}
1008 
1009 int64_t qcow2_alloc_clusters(BlockDriverState *bs, uint64_t size)
1010 {
1011     int64_t offset;
1012     int ret;
1013 
1014     BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC);
1015     do {
1016         offset = alloc_clusters_noref(bs, size, QCOW_MAX_CLUSTER_OFFSET);
1017         if (offset < 0) {
1018             return offset;
1019         }
1020 
1021         ret = update_refcount(bs, offset, size, 1, false, QCOW2_DISCARD_NEVER);
1022     } while (ret == -EAGAIN);
1023 
1024     if (ret < 0) {
1025         return ret;
1026     }
1027 
1028     return offset;
1029 }
1030 
/*
 * Allocates clusters at a fixed host @offset: counts how many of the
 * @nb_clusters clusters starting there are still free and increments the
 * refcount of exactly those.
 *
 * Returns the number of clusters that were actually allocated (which may be
 * fewer than @nb_clusters when an in-use cluster is encountered), or -errno
 * on error.
 */
int64_t coroutine_fn qcow2_alloc_clusters_at(BlockDriverState *bs, uint64_t offset,
                                             int64_t nb_clusters)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t cluster_index, refcount;
    uint64_t i;
    int ret;

    assert(nb_clusters >= 0);
    if (nb_clusters == 0) {
        return 0;
    }

    /* -EAGAIN from update_refcount() means the free-cluster scan has to be
     * redone, so loop until the refcount update goes through */
    do {
        /* Check how many clusters there are free */
        cluster_index = offset >> s->cluster_bits;
        for(i = 0; i < nb_clusters; i++) {
            ret = qcow2_get_refcount(bs, cluster_index++, &refcount);
            if (ret < 0) {
                return ret;
            } else if (refcount != 0) {
                break;
            }
        }

        /* And then allocate them */
        ret = update_refcount(bs, offset, i << s->cluster_bits, 1, false,
                              QCOW2_DISCARD_NEVER);
    } while (ret == -EAGAIN);

    if (ret < 0) {
        return ret;
    }

    return i;
}
1067 
/* only used to allocate compressed sectors. We try to allocate
   contiguous sectors. size must be <= cluster_size */
int64_t coroutine_fn GRAPH_RDLOCK qcow2_alloc_bytes(BlockDriverState *bs, int size)
{
    BDRVQcow2State *s = bs->opaque;
    int64_t offset;
    size_t free_in_cluster;
    int ret;

    BLKDBG_CO_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC_BYTES);
    assert(size > 0 && size <= s->cluster_size);
    assert(!s->free_byte_offset || offset_into_cluster(s, s->free_byte_offset));

    /* s->free_byte_offset remembers where the previous sub-cluster
     * allocation ended, so consecutive compressed clusters can share a
     * host cluster */
    offset = s->free_byte_offset;

    if (offset) {
        uint64_t refcount;
        ret = qcow2_get_refcount(bs, offset >> s->cluster_bits, &refcount);
        if (ret < 0) {
            return ret;
        }

        /* A refcount already at its maximum cannot be incremented again,
         * so this cluster cannot take another allocation */
        if (refcount == s->refcount_max) {
            offset = 0;
        }
    }

    free_in_cluster = s->cluster_size - offset_into_cluster(s, offset);
    do {
        /* Not enough room left in the current cluster: grab a fresh one */
        if (!offset || free_in_cluster < size) {
            int64_t new_cluster;

            new_cluster = alloc_clusters_noref(bs, s->cluster_size,
                                               MIN(s->cluster_offset_mask,
                                                   QCOW_MAX_CLUSTER_OFFSET));
            if (new_cluster < 0) {
                return new_cluster;
            }

            if (new_cluster == 0) {
                qcow2_signal_corruption(bs, true, -1, -1, "Preventing invalid "
                                        "allocation of compressed cluster "
                                        "at offset 0");
                return -EIO;
            }

            /* If the new cluster directly follows the old free space, keep
             * allocating across the cluster boundary; otherwise start over
             * at the beginning of the new cluster */
            if (!offset || ROUND_UP(offset, s->cluster_size) != new_cluster) {
                offset = new_cluster;
                free_in_cluster = s->cluster_size;
            } else {
                free_in_cluster += s->cluster_size;
            }
        }

        assert(offset);
        /* -EAGAIN makes us retry with a re-done free-cluster scan */
        ret = update_refcount(bs, offset, size, 1, false, QCOW2_DISCARD_NEVER);
        if (ret < 0) {
            offset = 0;
        }
    } while (ret == -EAGAIN);
    if (ret < 0) {
        return ret;
    }

    /* The cluster refcount was incremented; refcount blocks must be flushed
     * before the caller's L2 table updates. */
    qcow2_cache_set_dependency(bs, s->l2_table_cache, s->refcount_block_cache);

    s->free_byte_offset = offset + size;
    if (!offset_into_cluster(s, s->free_byte_offset)) {
        s->free_byte_offset = 0;
    }

    return offset;
}
1143 
1144 void qcow2_free_clusters(BlockDriverState *bs,
1145                           int64_t offset, int64_t size,
1146                           enum qcow2_discard_type type)
1147 {
1148     int ret;
1149 
1150     BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_FREE);
1151     ret = update_refcount(bs, offset, size, 1, true, type);
1152     if (ret < 0) {
1153         fprintf(stderr, "qcow2_free_clusters failed: %s\n", strerror(-ret));
1154         /* TODO Remember the clusters to free them later and avoid leaking */
1155     }
1156 }
1157 
1158 /*
1159  * Free a cluster using its L2 entry (handles clusters of all types, e.g.
1160  * normal cluster, compressed cluster, etc.)
1161  */
1162 void qcow2_free_any_cluster(BlockDriverState *bs, uint64_t l2_entry,
1163                             enum qcow2_discard_type type)
1164 {
1165     BDRVQcow2State *s = bs->opaque;
1166     QCow2ClusterType ctype = qcow2_get_cluster_type(bs, l2_entry);
1167 
1168     if (has_data_file(bs)) {
1169         if (s->discard_passthrough[type] &&
1170             (ctype == QCOW2_CLUSTER_NORMAL ||
1171              ctype == QCOW2_CLUSTER_ZERO_ALLOC))
1172         {
1173             bdrv_pdiscard(s->data_file, l2_entry & L2E_OFFSET_MASK,
1174                           s->cluster_size);
1175         }
1176         return;
1177     }
1178 
1179     switch (ctype) {
1180     case QCOW2_CLUSTER_COMPRESSED:
1181         {
1182             uint64_t coffset;
1183             int csize;
1184 
1185             qcow2_parse_compressed_l2_entry(bs, l2_entry, &coffset, &csize);
1186             qcow2_free_clusters(bs, coffset, csize, type);
1187         }
1188         break;
1189     case QCOW2_CLUSTER_NORMAL:
1190     case QCOW2_CLUSTER_ZERO_ALLOC:
1191         if (offset_into_cluster(s, l2_entry & L2E_OFFSET_MASK)) {
1192             qcow2_signal_corruption(bs, false, -1, -1,
1193                                     "Cannot free unaligned cluster %#llx",
1194                                     l2_entry & L2E_OFFSET_MASK);
1195         } else {
1196             qcow2_free_clusters(bs, l2_entry & L2E_OFFSET_MASK,
1197                                 s->cluster_size, type);
1198         }
1199         break;
1200     case QCOW2_CLUSTER_ZERO_PLAIN:
1201     case QCOW2_CLUSTER_UNALLOCATED:
1202         break;
1203     default:
1204         abort();
1205     }
1206 }
1207 
1208 void qcow2_discard_cluster(BlockDriverState *bs, uint64_t offset,
1209                            uint64_t length, QCow2ClusterType ctype,
1210                            enum qcow2_discard_type dtype)
1211 {
1212     BDRVQcow2State *s = bs->opaque;
1213 
1214     if (s->discard_passthrough[dtype] &&
1215         (ctype == QCOW2_CLUSTER_NORMAL ||
1216          ctype == QCOW2_CLUSTER_ZERO_ALLOC)) {
1217         if (has_data_file(bs)) {
1218             bdrv_pdiscard(s->data_file, offset, length);
1219         } else {
1220             queue_discard(bs, offset, length);
1221         }
1222     }
1223 }
1224 
1225 int qcow2_write_caches(BlockDriverState *bs)
1226 {
1227     BDRVQcow2State *s = bs->opaque;
1228     int ret;
1229 
1230     ret = qcow2_cache_write(bs, s->l2_table_cache);
1231     if (ret < 0) {
1232         return ret;
1233     }
1234 
1235     if (qcow2_need_accurate_refcounts(s)) {
1236         ret = qcow2_cache_write(bs, s->refcount_block_cache);
1237         if (ret < 0) {
1238             return ret;
1239         }
1240     }
1241 
1242     return 0;
1243 }
1244 
1245 int qcow2_flush_caches(BlockDriverState *bs)
1246 {
1247     int ret = qcow2_write_caches(bs);
1248     if (ret < 0) {
1249         return ret;
1250     }
1251 
1252     return bdrv_flush(bs->file->bs);
1253 }
1254 
1255 /*********************************************************/
1256 /* snapshots and image creation */
1257 
1258 
1259 
/* update the refcounts of snapshots and the copied flag */
/*
 * Adjusts by @addend (-1, 0 or +1) the refcount of every cluster referenced
 * by the L1 table at @l1_table_offset (@l1_size entries), including the L2
 * tables themselves and compressed clusters.  With addend == 0 only the
 * QCOW_OFLAG_COPIED flags in the L1/L2 entries are recalculated.
 *
 * Returns 0 on success, -errno on error.
 */
int qcow2_update_snapshot_refcount(BlockDriverState *bs,
    int64_t l1_table_offset, int l1_size, int addend)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t *l1_table, *l2_slice, l2_offset, entry, l1_size2, refcount;
    bool l1_allocated = false;
    int64_t old_entry, old_l2_offset;
    unsigned slice, slice_size2, n_slices;
    int i, j, l1_modified = 0;
    int ret;

    assert(addend >= -1 && addend <= 1);

    l2_slice = NULL;
    l1_table = NULL;
    l1_size2 = l1_size * L1E_SIZE;
    slice_size2 = s->l2_slice_size * l2_entry_size(s);
    n_slices = s->cluster_size / slice_size2;

    /* Batch discards so they are all issued in one go at the end */
    s->cache_discards = true;

    /* WARNING: qcow2_snapshot_goto relies on this function not using the
     * l1_table_offset when it is the current s->l1_table_offset! Be careful
     * when changing this! */
    if (l1_table_offset != s->l1_table_offset) {
        /* Inactive L1 table: read a temporary copy from disk */
        l1_table = g_try_malloc0(l1_size2);
        if (l1_size2 && l1_table == NULL) {
            ret = -ENOMEM;
            goto fail;
        }
        l1_allocated = true;

        ret = bdrv_pread(bs->file, l1_table_offset, l1_size2, l1_table, 0);
        if (ret < 0) {
            goto fail;
        }

        for (i = 0; i < l1_size; i++) {
            be64_to_cpus(&l1_table[i]);
        }
    } else {
        /* Active L1 table: operate on the in-memory copy */
        assert(l1_size == s->l1_size);
        l1_table = s->l1_table;
        l1_allocated = false;
    }

    for (i = 0; i < l1_size; i++) {
        l2_offset = l1_table[i];
        if (l2_offset) {
            old_l2_offset = l2_offset;
            l2_offset &= L1E_OFFSET_MASK;

            if (offset_into_cluster(s, l2_offset)) {
                qcow2_signal_corruption(bs, true, -1, -1, "L2 table offset %#"
                                        PRIx64 " unaligned (L1 index: %#x)",
                                        l2_offset, i);
                ret = -EIO;
                goto fail;
            }

            /* Walk the L2 table one cached slice at a time */
            for (slice = 0; slice < n_slices; slice++) {
                ret = qcow2_cache_get(bs, s->l2_table_cache,
                                      l2_offset + slice * slice_size2,
                                      (void **) &l2_slice);
                if (ret < 0) {
                    goto fail;
                }

                for (j = 0; j < s->l2_slice_size; j++) {
                    uint64_t cluster_index;
                    uint64_t offset;

                    entry = get_l2_entry(s, l2_slice, j);
                    old_entry = entry;
                    /* COPIED is recalculated from the refcount below */
                    entry &= ~QCOW_OFLAG_COPIED;
                    offset = entry & L2E_OFFSET_MASK;

                    switch (qcow2_get_cluster_type(bs, entry)) {
                    case QCOW2_CLUSTER_COMPRESSED:
                        if (addend != 0) {
                            uint64_t coffset;
                            int csize;

                            qcow2_parse_compressed_l2_entry(bs, entry,
                                                            &coffset, &csize);
                            ret = update_refcount(
                                bs, coffset, csize,
                                abs(addend), addend < 0,
                                QCOW2_DISCARD_SNAPSHOT);
                            if (ret < 0) {
                                goto fail;
                            }
                        }
                        /* compressed clusters are never modified */
                        refcount = 2;
                        break;

                    case QCOW2_CLUSTER_NORMAL:
                    case QCOW2_CLUSTER_ZERO_ALLOC:
                        if (offset_into_cluster(s, offset)) {
                            /* Here l2_index means table (not slice) index */
                            int l2_index = slice * s->l2_slice_size + j;
                            qcow2_signal_corruption(
                                bs, true, -1, -1, "Cluster "
                                "allocation offset %#" PRIx64
                                " unaligned (L2 offset: %#"
                                PRIx64 ", L2 index: %#x)",
                                offset, l2_offset, l2_index);
                            ret = -EIO;
                            goto fail;
                        }

                        cluster_index = offset >> s->cluster_bits;
                        assert(cluster_index);
                        if (addend != 0) {
                            ret = qcow2_update_cluster_refcount(
                                bs, cluster_index, abs(addend), addend < 0,
                                QCOW2_DISCARD_SNAPSHOT);
                            if (ret < 0) {
                                goto fail;
                            }
                        }

                        ret = qcow2_get_refcount(bs, cluster_index, &refcount);
                        if (ret < 0) {
                            goto fail;
                        }
                        break;

                    case QCOW2_CLUSTER_ZERO_PLAIN:
                    case QCOW2_CLUSTER_UNALLOCATED:
                        refcount = 0;
                        break;

                    default:
                        abort();
                    }

                    /* refcount == 1 means the cluster is not shared, so the
                     * COPIED flag may be set */
                    if (refcount == 1) {
                        entry |= QCOW_OFLAG_COPIED;
                    }
                    if (entry != old_entry) {
                        if (addend > 0) {
                            qcow2_cache_set_dependency(bs, s->l2_table_cache,
                                                       s->refcount_block_cache);
                        }
                        set_l2_entry(s, l2_slice, j, entry);
                        qcow2_cache_entry_mark_dirty(s->l2_table_cache,
                                                     l2_slice);
                    }
                }

                qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
            }

            /* Also update the refcount of the L2 table cluster itself */
            if (addend != 0) {
                ret = qcow2_update_cluster_refcount(bs, l2_offset >>
                                                        s->cluster_bits,
                                                    abs(addend), addend < 0,
                                                    QCOW2_DISCARD_SNAPSHOT);
                if (ret < 0) {
                    goto fail;
                }
            }
            ret = qcow2_get_refcount(bs, l2_offset >> s->cluster_bits,
                                     &refcount);
            if (ret < 0) {
                goto fail;
            } else if (refcount == 1) {
                l2_offset |= QCOW_OFLAG_COPIED;
            }
            if (l2_offset != old_l2_offset) {
                l1_table[i] = l2_offset;
                l1_modified = 1;
            }
        }
    }

    ret = bdrv_flush(bs);
fail:
    if (l2_slice) {
        qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
    }

    /* Issue all discards accumulated above */
    s->cache_discards = false;
    qcow2_process_discards(bs, ret);

    /* Update L1 only if it isn't deleted anyway (addend = -1) */
    if (ret == 0 && addend >= 0 && l1_modified) {
        for (i = 0; i < l1_size; i++) {
            cpu_to_be64s(&l1_table[i]);
        }

        ret = bdrv_pwrite_sync(bs->file, l1_table_offset, l1_size2, l1_table,
                               0);

        /* Convert back: the active L1 table stays in CPU byte order */
        for (i = 0; i < l1_size; i++) {
            be64_to_cpus(&l1_table[i]);
        }
    }
    if (l1_allocated)
        g_free(l1_table);
    return ret;
}
1465 
1466 
1467 
1468 
1469 /*********************************************************/
1470 /* refcount checking functions */
1471 
1472 
1473 static uint64_t refcount_array_byte_size(BDRVQcow2State *s, uint64_t entries)
1474 {
1475     /* This assertion holds because there is no way we can address more than
1476      * 2^(64 - 9) clusters at once (with cluster size 512 = 2^9, and because
1477      * offsets have to be representable in bytes); due to every cluster
1478      * corresponding to one refcount entry, we are well below that limit */
1479     assert(entries < (UINT64_C(1) << (64 - 9)));
1480 
1481     /* Thanks to the assertion this will not overflow, because
1482      * s->refcount_order < 7.
1483      * (note: x << s->refcount_order == x * s->refcount_bits) */
1484     return DIV_ROUND_UP(entries << s->refcount_order, 8);
1485 }
1486 
1487 /**
1488  * Reallocates *array so that it can hold new_size entries. *size must contain
1489  * the current number of entries in *array. If the reallocation fails, *array
1490  * and *size will not be modified and -errno will be returned. If the
1491  * reallocation is successful, *array will be set to the new buffer, *size
1492  * will be set to new_size and 0 will be returned. The size of the reallocated
1493  * refcount array buffer will be aligned to a cluster boundary, and the newly
1494  * allocated area will be zeroed.
1495  */
1496 static int realloc_refcount_array(BDRVQcow2State *s, void **array,
1497                                   int64_t *size, int64_t new_size)
1498 {
1499     int64_t old_byte_size, new_byte_size;
1500     void *new_ptr;
1501 
1502     /* Round to clusters so the array can be directly written to disk */
1503     old_byte_size = size_to_clusters(s, refcount_array_byte_size(s, *size))
1504                     * s->cluster_size;
1505     new_byte_size = size_to_clusters(s, refcount_array_byte_size(s, new_size))
1506                     * s->cluster_size;
1507 
1508     if (new_byte_size == old_byte_size) {
1509         *size = new_size;
1510         return 0;
1511     }
1512 
1513     assert(new_byte_size > 0);
1514 
1515     if (new_byte_size > SIZE_MAX) {
1516         return -ENOMEM;
1517     }
1518 
1519     new_ptr = g_try_realloc(*array, new_byte_size);
1520     if (!new_ptr) {
1521         return -ENOMEM;
1522     }
1523 
1524     if (new_byte_size > old_byte_size) {
1525         memset((char *)new_ptr + old_byte_size, 0,
1526                new_byte_size - old_byte_size);
1527     }
1528 
1529     *array = new_ptr;
1530     *size  = new_size;
1531 
1532     return 0;
1533 }
1534 
1535 /*
1536  * Increases the refcount for a range of clusters in a given refcount table.
1537  * This is used to construct a temporary refcount table out of L1 and L2 tables
1538  * which can be compared to the refcount table saved in the image.
1539  *
1540  * Modifies the number of errors in res.
1541  */
1542 int coroutine_fn GRAPH_RDLOCK
1543 qcow2_inc_refcounts_imrt(BlockDriverState *bs, BdrvCheckResult *res,
1544                          void **refcount_table,
1545                          int64_t *refcount_table_size,
1546                          int64_t offset, int64_t size)
1547 {
1548     BDRVQcow2State *s = bs->opaque;
1549     uint64_t start, last, cluster_offset, k, refcount;
1550     int64_t file_len;
1551     int ret;
1552 
1553     if (size <= 0) {
1554         return 0;
1555     }
1556 
1557     file_len = bdrv_co_getlength(bs->file->bs);
1558     if (file_len < 0) {
1559         return file_len;
1560     }
1561 
1562     /*
1563      * Last cluster of qcow2 image may be semi-allocated, so it may be OK to
1564      * reference some space after file end but it should be less than one
1565      * cluster.
1566      */
1567     if (offset + size - file_len >= s->cluster_size) {
1568         fprintf(stderr, "ERROR: counting reference for region exceeding the "
1569                 "end of the file by one cluster or more: offset 0x%" PRIx64
1570                 " size 0x%" PRIx64 "\n", offset, size);
1571         res->corruptions++;
1572         return 0;
1573     }
1574 
1575     start = start_of_cluster(s, offset);
1576     last = start_of_cluster(s, offset + size - 1);
1577     for(cluster_offset = start; cluster_offset <= last;
1578         cluster_offset += s->cluster_size) {
1579         k = cluster_offset >> s->cluster_bits;
1580         if (k >= *refcount_table_size) {
1581             ret = realloc_refcount_array(s, refcount_table,
1582                                          refcount_table_size, k + 1);
1583             if (ret < 0) {
1584                 res->check_errors++;
1585                 return ret;
1586             }
1587         }
1588 
1589         refcount = s->get_refcount(*refcount_table, k);
1590         if (refcount == s->refcount_max) {
1591             fprintf(stderr, "ERROR: overflow cluster offset=0x%" PRIx64
1592                     "\n", cluster_offset);
1593             fprintf(stderr, "Use qemu-img amend to increase the refcount entry "
1594                     "width or qemu-img convert to create a clean copy if the "
1595                     "image cannot be opened for writing\n");
1596             res->corruptions++;
1597             continue;
1598         }
1599         s->set_refcount(*refcount_table, k, refcount + 1);
1600     }
1601 
1602     return 0;
1603 }
1604 
/* Flags for check_refcounts_l1() and check_refcounts_l2(), passed via their
 * 'flags' argument */
enum {
    CHECK_FRAG_INFO = 0x2,      /* update BlockFragInfo counters */
};
1609 
1610 /*
1611  * Fix L2 entry by making it QCOW2_CLUSTER_ZERO_PLAIN (or making all its present
1612  * subclusters QCOW2_SUBCLUSTER_ZERO_PLAIN).
1613  *
1614  * This function decrements res->corruptions on success, so the caller is
1615  * responsible to increment res->corruptions prior to the call.
1616  *
1617  * On failure in-memory @l2_table may be modified.
1618  */
1619 static int coroutine_fn GRAPH_RDLOCK
1620 fix_l2_entry_by_zero(BlockDriverState *bs, BdrvCheckResult *res,
1621                      uint64_t l2_offset, uint64_t *l2_table,
1622                      int l2_index, bool active,
1623                      bool *metadata_overlap)
1624 {
1625     BDRVQcow2State *s = bs->opaque;
1626     int ret;
1627     int idx = l2_index * (l2_entry_size(s) / sizeof(uint64_t));
1628     uint64_t l2e_offset = l2_offset + (uint64_t)l2_index * l2_entry_size(s);
1629     int ign = active ? QCOW2_OL_ACTIVE_L2 : QCOW2_OL_INACTIVE_L2;
1630 
1631     if (has_subclusters(s)) {
1632         uint64_t l2_bitmap = get_l2_bitmap(s, l2_table, l2_index);
1633 
1634         /* Allocated subclusters become zero */
1635         l2_bitmap |= l2_bitmap << 32;
1636         l2_bitmap &= QCOW_L2_BITMAP_ALL_ZEROES;
1637 
1638         set_l2_bitmap(s, l2_table, l2_index, l2_bitmap);
1639         set_l2_entry(s, l2_table, l2_index, 0);
1640     } else {
1641         set_l2_entry(s, l2_table, l2_index, QCOW_OFLAG_ZERO);
1642     }
1643 
1644     ret = qcow2_pre_write_overlap_check(bs, ign, l2e_offset, l2_entry_size(s),
1645                                         false);
1646     if (metadata_overlap) {
1647         *metadata_overlap = ret < 0;
1648     }
1649     if (ret < 0) {
1650         fprintf(stderr, "ERROR: Overlap check failed\n");
1651         goto fail;
1652     }
1653 
1654     ret = bdrv_co_pwrite_sync(bs->file, l2e_offset, l2_entry_size(s),
1655                               &l2_table[idx], 0);
1656     if (ret < 0) {
1657         fprintf(stderr, "ERROR: Failed to overwrite L2 "
1658                 "table entry: %s\n", strerror(-ret));
1659         goto fail;
1660     }
1661 
1662     res->corruptions--;
1663     res->corruptions_fixed++;
1664     return 0;
1665 
1666 fail:
1667     res->check_errors++;
1668     return ret;
1669 }
1670 
1671 /*
1672  * Increases the refcount in the given refcount table for the all clusters
1673  * referenced in the L2 table. While doing so, performs some checks on L2
1674  * entries.
1675  *
1676  * Returns the number of errors found by the checks or -errno if an internal
1677  * error occurred.
1678  */
static int coroutine_fn GRAPH_RDLOCK
check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res,
                   void **refcount_table,
                   int64_t *refcount_table_size, int64_t l2_offset,
                   int flags, BdrvCheckMode fix, bool active)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t l2_entry, l2_bitmap;
    uint64_t next_contiguous_offset = 0;
    int i, ret;
    size_t l2_size_bytes = s->l2_size * l2_entry_size(s);
    g_autofree uint64_t *l2_table = g_malloc(l2_size_bytes);
    /* Set by fix_l2_entry_by_zero() before being read below */
    bool metadata_overlap;

    /* Read L2 table from disk */
    ret = bdrv_co_pread(bs->file, l2_offset, l2_size_bytes, l2_table, 0);
    if (ret < 0) {
        fprintf(stderr, "ERROR: I/O error in check_refcounts_l2\n");
        res->check_errors++;
        return ret;
    }

    /* Do the actual checks */
    for (i = 0; i < s->l2_size; i++) {
        uint64_t coffset;
        int csize;
        QCow2ClusterType type;

        l2_entry = get_l2_entry(s, l2_table, i);
        l2_bitmap = get_l2_bitmap(s, l2_table, i);
        type = qcow2_get_cluster_type(bs, l2_entry);

        if (type != QCOW2_CLUSTER_COMPRESSED) {
            /* Check reserved bits of Standard Cluster Descriptor */
            if (l2_entry & L2E_STD_RESERVED_MASK) {
                fprintf(stderr, "ERROR found l2 entry with reserved bits set: "
                        "%" PRIx64 "\n", l2_entry);
                res->corruptions++;
            }
        }

        switch (type) {
        case QCOW2_CLUSTER_COMPRESSED:
            /* Compressed clusters don't have QCOW_OFLAG_COPIED */
            if (l2_entry & QCOW_OFLAG_COPIED) {
                fprintf(stderr, "ERROR: coffset=0x%" PRIx64 ": "
                    "copied flag must never be set for compressed "
                    "clusters\n", l2_entry & s->cluster_offset_mask);
                l2_entry &= ~QCOW_OFLAG_COPIED;
                res->corruptions++;
            }

            /* External data files cannot hold compressed clusters at all */
            if (has_data_file(bs)) {
                fprintf(stderr, "ERROR compressed cluster %d with data file, "
                        "entry=0x%" PRIx64 "\n", i, l2_entry);
                res->corruptions++;
                break;
            }

            /* The subcluster bitmap must be unused for compressed clusters */
            if (l2_bitmap) {
                fprintf(stderr, "ERROR compressed cluster %d with non-zero "
                        "subcluster allocation bitmap, entry=0x%" PRIx64 "\n",
                        i, l2_entry);
                res->corruptions++;
                break;
            }

            /* Mark cluster as used */
            qcow2_parse_compressed_l2_entry(bs, l2_entry, &coffset, &csize);
            ret = qcow2_inc_refcounts_imrt(
                bs, res, refcount_table, refcount_table_size, coffset, csize);
            if (ret < 0) {
                return ret;
            }

            if (flags & CHECK_FRAG_INFO) {
                res->bfi.allocated_clusters++;
                res->bfi.compressed_clusters++;

                /*
                 * Compressed clusters are fragmented by nature.  Since they
                 * take up sub-sector space but we only have sector granularity
                 * I/O we need to re-read the same sectors even for adjacent
                 * compressed clusters.
                 */
                res->bfi.fragmented_clusters++;
            }
            break;

        case QCOW2_CLUSTER_ZERO_ALLOC:
        case QCOW2_CLUSTER_NORMAL:
        {
            uint64_t offset = l2_entry & L2E_OFFSET_MASK;

            /*
             * A subcluster must not be both allocated ("X" bits, low half)
             * and zero ("Z" bits, high half) at the same time.
             */
            if ((l2_bitmap >> 32) & l2_bitmap) {
                res->corruptions++;
                fprintf(stderr, "ERROR offset=%" PRIx64 ": Allocated "
                        "cluster has corrupted subcluster allocation bitmap\n",
                        offset);
            }

            /* Correct offsets are cluster aligned */
            if (offset_into_cluster(s, offset)) {
                bool contains_data;
                res->corruptions++;

                /*
                 * Only entries without any guest-visible data may be
                 * repaired by zeroing; otherwise we would lose data.
                 */
                if (has_subclusters(s)) {
                    contains_data = (l2_bitmap & QCOW_L2_BITMAP_ALL_ALLOC);
                } else {
                    contains_data = !(l2_entry & QCOW_OFLAG_ZERO);
                }

                if (!contains_data) {
                    fprintf(stderr, "%s offset=%" PRIx64 ": Preallocated "
                            "cluster is not properly aligned; L2 entry "
                            "corrupted.\n",
                            fix & BDRV_FIX_ERRORS ? "Repairing" : "ERROR",
                            offset);
                    if (fix & BDRV_FIX_ERRORS) {
                        ret = fix_l2_entry_by_zero(bs, res, l2_offset,
                                                   l2_table, i, active,
                                                   &metadata_overlap);
                        if (metadata_overlap) {
                            /*
                             * Something is seriously wrong, so abort checking
                             * this L2 table.
                             */
                            return ret;
                        }

                        if (ret == 0) {
                            /*
                             * Skip marking the cluster as used
                             * (it is unused now).
                             */
                            continue;
                        }

                        /*
                         * Failed to fix.
                         * Do not abort, continue checking the rest of this
                         * L2 table's entries.
                         */
                    }
                } else {
                    fprintf(stderr, "ERROR offset=%" PRIx64 ": Data cluster is "
                        "not properly aligned; L2 entry corrupted.\n", offset);
                }
            }

            /* Track fragmentation: a gap between clusters counts as a seek */
            if (flags & CHECK_FRAG_INFO) {
                res->bfi.allocated_clusters++;
                if (next_contiguous_offset &&
                    offset != next_contiguous_offset) {
                    res->bfi.fragmented_clusters++;
                }
                next_contiguous_offset = offset + s->cluster_size;
            }

            /* Mark cluster as used */
            if (!has_data_file(bs)) {
                ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table,
                                               refcount_table_size,
                                               offset, s->cluster_size);
                if (ret < 0) {
                    return ret;
                }
            }
            break;
        }

        case QCOW2_CLUSTER_ZERO_PLAIN:
            /* Impossible when image has subclusters */
            assert(!l2_bitmap);
            break;

        case QCOW2_CLUSTER_UNALLOCATED:
            /* Unallocated clusters must not have allocated subclusters */
            if (l2_bitmap & QCOW_L2_BITMAP_ALL_ALLOC) {
                res->corruptions++;
                fprintf(stderr, "ERROR: Unallocated "
                        "cluster has non-zero subcluster allocation map\n");
            }
            break;

        default:
            abort();
        }
    }

    return 0;
}
1870 
1871 /*
1872  * Increases the refcount for the L1 table, its L2 tables and all referenced
1873  * clusters in the given refcount table. While doing so, performs some checks
1874  * on L1 and L2 entries.
1875  *
1876  * Returns the number of errors found by the checks or -errno if an internal
1877  * error occurred.
1878  */
1879 static int coroutine_fn GRAPH_RDLOCK
1880 check_refcounts_l1(BlockDriverState *bs, BdrvCheckResult *res,
1881                    void **refcount_table, int64_t *refcount_table_size,
1882                    int64_t l1_table_offset, int l1_size,
1883                    int flags, BdrvCheckMode fix, bool active)
1884 {
1885     BDRVQcow2State *s = bs->opaque;
1886     size_t l1_size_bytes = l1_size * L1E_SIZE;
1887     g_autofree uint64_t *l1_table = NULL;
1888     uint64_t l2_offset;
1889     int i, ret;
1890 
1891     if (!l1_size) {
1892         return 0;
1893     }
1894 
1895     /* Mark L1 table as used */
1896     ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, refcount_table_size,
1897                                    l1_table_offset, l1_size_bytes);
1898     if (ret < 0) {
1899         return ret;
1900     }
1901 
1902     l1_table = g_try_malloc(l1_size_bytes);
1903     if (l1_table == NULL) {
1904         res->check_errors++;
1905         return -ENOMEM;
1906     }
1907 
1908     /* Read L1 table entries from disk */
1909     ret = bdrv_co_pread(bs->file, l1_table_offset, l1_size_bytes, l1_table, 0);
1910     if (ret < 0) {
1911         fprintf(stderr, "ERROR: I/O error in check_refcounts_l1\n");
1912         res->check_errors++;
1913         return ret;
1914     }
1915 
1916     for (i = 0; i < l1_size; i++) {
1917         be64_to_cpus(&l1_table[i]);
1918     }
1919 
1920     /* Do the actual checks */
1921     for (i = 0; i < l1_size; i++) {
1922         if (!l1_table[i]) {
1923             continue;
1924         }
1925 
1926         if (l1_table[i] & L1E_RESERVED_MASK) {
1927             fprintf(stderr, "ERROR found L1 entry with reserved bits set: "
1928                     "%" PRIx64 "\n", l1_table[i]);
1929             res->corruptions++;
1930         }
1931 
1932         l2_offset = l1_table[i] & L1E_OFFSET_MASK;
1933 
1934         /* Mark L2 table as used */
1935         ret = qcow2_inc_refcounts_imrt(bs, res,
1936                                        refcount_table, refcount_table_size,
1937                                        l2_offset, s->cluster_size);
1938         if (ret < 0) {
1939             return ret;
1940         }
1941 
1942         /* L2 tables are cluster aligned */
1943         if (offset_into_cluster(s, l2_offset)) {
1944             fprintf(stderr, "ERROR l2_offset=%" PRIx64 ": Table is not "
1945                 "cluster aligned; L1 entry corrupted\n", l2_offset);
1946             res->corruptions++;
1947         }
1948 
1949         /* Process and check L2 entries */
1950         ret = check_refcounts_l2(bs, res, refcount_table,
1951                                  refcount_table_size, l2_offset, flags,
1952                                  fix, active);
1953         if (ret < 0) {
1954             return ret;
1955         }
1956     }
1957 
1958     return 0;
1959 }
1960 
1961 /*
1962  * Checks the OFLAG_COPIED flag for all L1 and L2 entries.
1963  *
1964  * This function does not print an error message nor does it increment
1965  * check_errors if qcow2_get_refcount fails (this is because such an error will
1966  * have been already detected and sufficiently signaled by the calling function
1967  * (qcow2_check_refcounts) by the time this function is called).
1968  */
static int coroutine_fn GRAPH_RDLOCK
check_oflag_copied(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t *l2_table = qemu_blockalign(bs, s->cluster_size);
    int ret;
    uint64_t refcount;
    int i, j;
    bool repair;

    if (fix & BDRV_FIX_ERRORS) {
        /* Always repair */
        repair = true;
    } else if (fix & BDRV_FIX_LEAKS) {
        /* Repair only if that seems safe: This function is always
         * called after the refcounts have been fixed, so the refcount
         * is accurate if that repair was successful */
        repair = !res->check_errors && !res->corruptions && !res->leaks;
    } else {
        repair = false;
    }

    /* Walk the active L1 table and every L2 table it references */
    for (i = 0; i < s->l1_size; i++) {
        uint64_t l1_entry = s->l1_table[i];
        uint64_t l2_offset = l1_entry & L1E_OFFSET_MASK;
        /* Number of L2 entries whose COPIED flag was repaired in memory */
        int l2_dirty = 0;

        if (!l2_offset) {
            continue;
        }

        /* OFLAG_COPIED in the L1 entry must match refcount == 1 */
        ret = qcow2_get_refcount(bs, l2_offset >> s->cluster_bits,
                                 &refcount);
        if (ret < 0) {
            /* don't print message nor increment check_errors */
            continue;
        }
        if ((refcount == 1) != ((l1_entry & QCOW_OFLAG_COPIED) != 0)) {
            res->corruptions++;
            fprintf(stderr, "%s OFLAG_COPIED L2 cluster: l1_index=%d "
                    "l1_entry=%" PRIx64 " refcount=%" PRIu64 "\n",
                    repair ? "Repairing" : "ERROR", i, l1_entry, refcount);
            if (repair) {
                /* Set or clear the flag so it agrees with the refcount */
                s->l1_table[i] = refcount == 1
                               ? l1_entry |  QCOW_OFLAG_COPIED
                               : l1_entry & ~QCOW_OFLAG_COPIED;
                ret = qcow2_write_l1_entry(bs, i);
                if (ret < 0) {
                    res->check_errors++;
                    goto fail;
                }
                res->corruptions--;
                res->corruptions_fixed++;
            }
        }

        /* Now check every data-cluster entry within this L2 table */
        ret = bdrv_co_pread(bs->file, l2_offset, s->l2_size * l2_entry_size(s),
                            l2_table, 0);
        if (ret < 0) {
            fprintf(stderr, "ERROR: Could not read L2 table: %s\n",
                    strerror(-ret));
            res->check_errors++;
            goto fail;
        }

        for (j = 0; j < s->l2_size; j++) {
            uint64_t l2_entry = get_l2_entry(s, l2_table, j);
            uint64_t data_offset = l2_entry & L2E_OFFSET_MASK;
            QCow2ClusterType cluster_type = qcow2_get_cluster_type(bs, l2_entry);

            if (cluster_type == QCOW2_CLUSTER_NORMAL ||
                cluster_type == QCOW2_CLUSTER_ZERO_ALLOC) {
                if (has_data_file(bs)) {
                    /* External data file clusters are never shared */
                    refcount = 1;
                } else {
                    ret = qcow2_get_refcount(bs,
                                             data_offset >> s->cluster_bits,
                                             &refcount);
                    if (ret < 0) {
                        /* don't print message nor increment check_errors */
                        continue;
                    }
                }
                if ((refcount == 1) != ((l2_entry & QCOW_OFLAG_COPIED) != 0)) {
                    res->corruptions++;
                    fprintf(stderr, "%s OFLAG_COPIED data cluster: "
                            "l2_entry=%" PRIx64 " refcount=%" PRIu64 "\n",
                            repair ? "Repairing" : "ERROR", l2_entry, refcount);
                    if (repair) {
                        /* Fix in memory only; written back below if dirty */
                        set_l2_entry(s, l2_table, j,
                                     refcount == 1 ?
                                     l2_entry |  QCOW_OFLAG_COPIED :
                                     l2_entry & ~QCOW_OFLAG_COPIED);
                        l2_dirty++;
                    }
                }
            }
        }

        /* Write the whole L2 table back if any entry was repaired */
        if (l2_dirty > 0) {
            ret = qcow2_pre_write_overlap_check(bs, QCOW2_OL_ACTIVE_L2,
                                                l2_offset, s->cluster_size,
                                                false);
            if (ret < 0) {
                fprintf(stderr, "ERROR: Could not write L2 table; metadata "
                        "overlap check failed: %s\n", strerror(-ret));
                res->check_errors++;
                goto fail;
            }

            ret = bdrv_co_pwrite(bs->file, l2_offset, s->cluster_size, l2_table, 0);
            if (ret < 0) {
                fprintf(stderr, "ERROR: Could not write L2 table: %s\n",
                        strerror(-ret));
                res->check_errors++;
                goto fail;
            }
            /* Only count the repairs once they actually hit the disk */
            res->corruptions -= l2_dirty;
            res->corruptions_fixed += l2_dirty;
        }
    }

    ret = 0;

fail:
    qemu_vfree(l2_table);
    return ret;
}
2097 
2098 /*
2099  * Checks consistency of refblocks and accounts for each refblock in
2100  * *refcount_table.
2101  */
static int coroutine_fn GRAPH_RDLOCK
check_refblocks(BlockDriverState *bs, BdrvCheckResult *res,
                BdrvCheckMode fix, bool *rebuild,
                void **refcount_table, int64_t *nb_clusters)
{
    BDRVQcow2State *s = bs->opaque;
    int64_t i, size;
    int ret;

    for(i = 0; i < s->refcount_table_size; i++) {
        uint64_t offset, cluster;
        offset = s->refcount_table[i] & REFT_OFFSET_MASK;
        cluster = offset >> s->cluster_bits;

        /* Refcount table entries must not use reserved bits */
        if (s->refcount_table[i] & REFT_RESERVED_MASK) {
            fprintf(stderr, "ERROR refcount table entry %" PRId64 " has "
                    "reserved bits set\n", i);
            res->corruptions++;
            *rebuild = true;
            continue;
        }

        /* Refcount blocks are cluster aligned */
        if (offset_into_cluster(s, offset)) {
            fprintf(stderr, "ERROR refcount block %" PRId64 " is not "
                "cluster aligned; refcount table entry corrupted\n", i);
            res->corruptions++;
            *rebuild = true;
            continue;
        }

        /*
         * A refblock beyond the end of the image can be repaired by
         * growing the file so that the block lies inside it again.
         */
        if (cluster >= *nb_clusters) {
            res->corruptions++;
            fprintf(stderr, "%s refcount block %" PRId64 " is outside image\n",
                    fix & BDRV_FIX_ERRORS ? "Repairing" : "ERROR", i);

            if (fix & BDRV_FIX_ERRORS) {
                int64_t new_nb_clusters;
                Error *local_err = NULL;

                /* Guard against overflow in the truncate target below */
                if (offset > INT64_MAX - s->cluster_size) {
                    ret = -EINVAL;
                    goto resize_fail;
                }

                ret = bdrv_co_truncate(bs->file, offset + s->cluster_size, false,
                                       PREALLOC_MODE_OFF, 0, &local_err);
                if (ret < 0) {
                    error_report_err(local_err);
                    goto resize_fail;
                }
                size = bdrv_co_getlength(bs->file->bs);
                if (size < 0) {
                    ret = size;
                    goto resize_fail;
                }

                /* Grow the in-memory refcount table to the new image size */
                new_nb_clusters = size_to_clusters(s, size);
                assert(new_nb_clusters >= *nb_clusters);

                ret = realloc_refcount_array(s, refcount_table,
                                             nb_clusters, new_nb_clusters);
                if (ret < 0) {
                    res->check_errors++;
                    return ret;
                }

                /* The refblock must lie within the image now */
                if (cluster >= *nb_clusters) {
                    ret = -EINVAL;
                    goto resize_fail;
                }

                res->corruptions--;
                res->corruptions_fixed++;
                ret = qcow2_inc_refcounts_imrt(bs, res,
                                               refcount_table, nb_clusters,
                                               offset, s->cluster_size);
                if (ret < 0) {
                    return ret;
                }
                /* No need to check whether the refcount is now greater than 1:
                 * This area was just allocated and zeroed, so it can only be
                 * exactly 1 after qcow2_inc_refcounts_imrt() */
                continue;

resize_fail:
                /* Could not repair; the refcount structure must be rebuilt */
                *rebuild = true;
                fprintf(stderr, "ERROR could not resize image: %s\n",
                        strerror(-ret));
            }
            continue;
        }

        /* Account for the refblock cluster; it must be referenced exactly once */
        if (offset != 0) {
            ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, nb_clusters,
                                           offset, s->cluster_size);
            if (ret < 0) {
                return ret;
            }
            if (s->get_refcount(*refcount_table, cluster) != 1) {
                fprintf(stderr, "ERROR refcount block %" PRId64
                        " refcount=%" PRIu64 "\n", i,
                        s->get_refcount(*refcount_table, cluster));
                res->corruptions++;
                *rebuild = true;
            }
        }
    }

    return 0;
}
2213 
2214 /*
2215  * Calculates an in-memory refcount table.
2216  */
2217 static int coroutine_fn GRAPH_RDLOCK
2218 calculate_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
2219                     BdrvCheckMode fix, bool *rebuild,
2220                     void **refcount_table, int64_t *nb_clusters)
2221 {
2222     BDRVQcow2State *s = bs->opaque;
2223     int64_t i;
2224     QCowSnapshot *sn;
2225     int ret;
2226 
2227     if (!*refcount_table) {
2228         int64_t old_size = 0;
2229         ret = realloc_refcount_array(s, refcount_table,
2230                                      &old_size, *nb_clusters);
2231         if (ret < 0) {
2232             res->check_errors++;
2233             return ret;
2234         }
2235     }
2236 
2237     /* header */
2238     ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, nb_clusters,
2239                                    0, s->cluster_size);
2240     if (ret < 0) {
2241         return ret;
2242     }
2243 
2244     /* current L1 table */
2245     ret = check_refcounts_l1(bs, res, refcount_table, nb_clusters,
2246                              s->l1_table_offset, s->l1_size, CHECK_FRAG_INFO,
2247                              fix, true);
2248     if (ret < 0) {
2249         return ret;
2250     }
2251 
2252     /* snapshots */
2253     if (has_data_file(bs) && s->nb_snapshots) {
2254         fprintf(stderr, "ERROR %d snapshots in image with data file\n",
2255                 s->nb_snapshots);
2256         res->corruptions++;
2257     }
2258 
2259     for (i = 0; i < s->nb_snapshots; i++) {
2260         sn = s->snapshots + i;
2261         if (offset_into_cluster(s, sn->l1_table_offset)) {
2262             fprintf(stderr, "ERROR snapshot %s (%s) l1_offset=%#" PRIx64 ": "
2263                     "L1 table is not cluster aligned; snapshot table entry "
2264                     "corrupted\n", sn->id_str, sn->name, sn->l1_table_offset);
2265             res->corruptions++;
2266             continue;
2267         }
2268         if (sn->l1_size > QCOW_MAX_L1_SIZE / L1E_SIZE) {
2269             fprintf(stderr, "ERROR snapshot %s (%s) l1_size=%#" PRIx32 ": "
2270                     "L1 table is too large; snapshot table entry corrupted\n",
2271                     sn->id_str, sn->name, sn->l1_size);
2272             res->corruptions++;
2273             continue;
2274         }
2275         ret = check_refcounts_l1(bs, res, refcount_table, nb_clusters,
2276                                  sn->l1_table_offset, sn->l1_size, 0, fix,
2277                                  false);
2278         if (ret < 0) {
2279             return ret;
2280         }
2281     }
2282     ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, nb_clusters,
2283                                    s->snapshots_offset, s->snapshots_size);
2284     if (ret < 0) {
2285         return ret;
2286     }
2287 
2288     /* refcount data */
2289     ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, nb_clusters,
2290                                    s->refcount_table_offset,
2291                                    s->refcount_table_size *
2292                                    REFTABLE_ENTRY_SIZE);
2293     if (ret < 0) {
2294         return ret;
2295     }
2296 
2297     /* encryption */
2298     if (s->crypto_header.length) {
2299         ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, nb_clusters,
2300                                        s->crypto_header.offset,
2301                                        s->crypto_header.length);
2302         if (ret < 0) {
2303             return ret;
2304         }
2305     }
2306 
2307     /* bitmaps */
2308     ret = qcow2_check_bitmaps_refcounts(bs, res, refcount_table, nb_clusters);
2309     if (ret < 0) {
2310         return ret;
2311     }
2312 
2313     return check_refblocks(bs, res, fix, rebuild, refcount_table, nb_clusters);
2314 }
2315 
2316 /*
2317  * Compares the actual reference count for each cluster in the image against the
2318  * refcount as reported by the refcount structures on-disk.
2319  */
2320 static void coroutine_fn GRAPH_RDLOCK
2321 compare_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
2322                   BdrvCheckMode fix, bool *rebuild,
2323                   int64_t *highest_cluster,
2324                   void *refcount_table, int64_t nb_clusters)
2325 {
2326     BDRVQcow2State *s = bs->opaque;
2327     int64_t i;
2328     uint64_t refcount1, refcount2;
2329     int ret;
2330 
2331     for (i = 0, *highest_cluster = 0; i < nb_clusters; i++) {
2332         ret = qcow2_get_refcount(bs, i, &refcount1);
2333         if (ret < 0) {
2334             fprintf(stderr, "Can't get refcount for cluster %" PRId64 ": %s\n",
2335                     i, strerror(-ret));
2336             res->check_errors++;
2337             continue;
2338         }
2339 
2340         refcount2 = s->get_refcount(refcount_table, i);
2341 
2342         if (refcount1 > 0 || refcount2 > 0) {
2343             *highest_cluster = i;
2344         }
2345 
2346         if (refcount1 != refcount2) {
2347             /* Check if we're allowed to fix the mismatch */
2348             int *num_fixed = NULL;
2349             if (refcount1 == 0) {
2350                 *rebuild = true;
2351             } else if (refcount1 > refcount2 && (fix & BDRV_FIX_LEAKS)) {
2352                 num_fixed = &res->leaks_fixed;
2353             } else if (refcount1 < refcount2 && (fix & BDRV_FIX_ERRORS)) {
2354                 num_fixed = &res->corruptions_fixed;
2355             }
2356 
2357             fprintf(stderr, "%s cluster %" PRId64 " refcount=%" PRIu64
2358                     " reference=%" PRIu64 "\n",
2359                    num_fixed != NULL     ? "Repairing" :
2360                    refcount1 < refcount2 ? "ERROR" :
2361                                            "Leaked",
2362                    i, refcount1, refcount2);
2363 
2364             if (num_fixed) {
2365                 ret = update_refcount(bs, i << s->cluster_bits, 1,
2366                                       refcount_diff(refcount1, refcount2),
2367                                       refcount1 > refcount2,
2368                                       QCOW2_DISCARD_ALWAYS);
2369                 if (ret >= 0) {
2370                     (*num_fixed)++;
2371                     continue;
2372                 }
2373             }
2374 
2375             /* And if we couldn't, print an error */
2376             if (refcount1 < refcount2) {
2377                 res->corruptions++;
2378             } else {
2379                 res->leaks++;
2380             }
2381         }
2382     }
2383 }
2384 
2385 /*
2386  * Allocates clusters using an in-memory refcount table (IMRT) in contrast to
2387  * the on-disk refcount structures.
2388  *
2389  * On input, *first_free_cluster tells where to start looking, and need not
2390  * actually be a free cluster; the returned offset will not be before that
2391  * cluster.  On output, *first_free_cluster points to the first gap found, even
2392  * if that gap was too small to be used as the returned offset.
2393  *
2394  * Note that *first_free_cluster is a cluster index whereas the return value is
2395  * an offset.
2396  */
2397 static int64_t alloc_clusters_imrt(BlockDriverState *bs,
2398                                    int cluster_count,
2399                                    void **refcount_table,
2400                                    int64_t *imrt_nb_clusters,
2401                                    int64_t *first_free_cluster)
2402 {
2403     BDRVQcow2State *s = bs->opaque;
2404     int64_t cluster = *first_free_cluster, i;
2405     bool first_gap = true;
2406     int contiguous_free_clusters;
2407     int ret;
2408 
2409     /* Starting at *first_free_cluster, find a range of at least cluster_count
2410      * continuously free clusters */
2411     for (contiguous_free_clusters = 0;
2412          cluster < *imrt_nb_clusters &&
2413          contiguous_free_clusters < cluster_count;
2414          cluster++)
2415     {
2416         if (!s->get_refcount(*refcount_table, cluster)) {
2417             contiguous_free_clusters++;
2418             if (first_gap) {
2419                 /* If this is the first free cluster found, update
2420                  * *first_free_cluster accordingly */
2421                 *first_free_cluster = cluster;
2422                 first_gap = false;
2423             }
2424         } else if (contiguous_free_clusters) {
2425             contiguous_free_clusters = 0;
2426         }
2427     }
2428 
2429     /* If contiguous_free_clusters is greater than zero, it contains the number
2430      * of continuously free clusters until the current cluster; the first free
2431      * cluster in the current "gap" is therefore
2432      * cluster - contiguous_free_clusters */
2433 
2434     /* If no such range could be found, grow the in-memory refcount table
2435      * accordingly to append free clusters at the end of the image */
2436     if (contiguous_free_clusters < cluster_count) {
2437         /* contiguous_free_clusters clusters are already empty at the image end;
2438          * we need cluster_count clusters; therefore, we have to allocate
2439          * cluster_count - contiguous_free_clusters new clusters at the end of
2440          * the image (which is the current value of cluster; note that cluster
2441          * may exceed old_imrt_nb_clusters if *first_free_cluster pointed beyond
2442          * the image end) */
2443         ret = realloc_refcount_array(s, refcount_table, imrt_nb_clusters,
2444                                      cluster + cluster_count
2445                                      - contiguous_free_clusters);
2446         if (ret < 0) {
2447             return ret;
2448         }
2449     }
2450 
2451     /* Go back to the first free cluster */
2452     cluster -= contiguous_free_clusters;
2453     for (i = 0; i < cluster_count; i++) {
2454         s->set_refcount(*refcount_table, cluster + i, 1);
2455     }
2456 
2457     return cluster << s->cluster_bits;
2458 }
2459 
2460 /*
2461  * Helper function for rebuild_refcount_structure().
2462  *
2463  * Scan the range of clusters [first_cluster, end_cluster) for allocated
2464  * clusters and write all corresponding refblocks to disk.  The refblock
2465  * and allocation data is taken from the in-memory refcount table
2466  * *refcount_table[] (of size *nb_clusters), which is basically one big
2467  * (unlimited size) refblock for the whole image.
2468  *
2469  * For these refblocks, clusters are allocated using said in-memory
2470  * refcount table.  Care is taken that these allocations are reflected
2471  * in the refblocks written to disk.
2472  *
2473  * The refblocks' offsets are written into a reftable, which is
2474  * *on_disk_reftable_ptr[] (of size *on_disk_reftable_entries_ptr).  If
2475  * that reftable is of insufficient size, it will be resized to fit.
2476  * This reftable is not written to disk.
2477  *
2478  * (If *on_disk_reftable_ptr is not NULL, the entries within are assumed
2479  * to point to existing valid refblocks that do not need to be allocated
2480  * again.)
2481  *
2482  * Return whether the on-disk reftable array was resized (true/false),
2483  * or -errno on error.
2484  */
2485 static int coroutine_fn GRAPH_RDLOCK
2486 rebuild_refcounts_write_refblocks(
2487         BlockDriverState *bs, void **refcount_table, int64_t *nb_clusters,
2488         int64_t first_cluster, int64_t end_cluster,
2489         uint64_t **on_disk_reftable_ptr, uint32_t *on_disk_reftable_entries_ptr,
2490         Error **errp
2491     )
2492 {
2493     BDRVQcow2State *s = bs->opaque;
2494     int64_t cluster;
2495     int64_t refblock_offset, refblock_start, refblock_index;
2496     int64_t first_free_cluster = 0;
2497     uint64_t *on_disk_reftable = *on_disk_reftable_ptr;
2498     uint32_t on_disk_reftable_entries = *on_disk_reftable_entries_ptr;
2499     void *on_disk_refblock;
2500     bool reftable_grown = false;
2501     int ret;
2502 
2503     for (cluster = first_cluster; cluster < end_cluster; cluster++) {
2504         /* Check all clusters to find refblocks that contain non-zero entries */
2505         if (!s->get_refcount(*refcount_table, cluster)) {
2506             continue;
2507         }
2508 
2509         /*
2510          * This cluster is allocated, so we need to create a refblock
2511          * for it.  The data we will write to disk is just the
2512          * respective slice from *refcount_table, so it will contain
2513          * accurate refcounts for all clusters belonging to this
2514          * refblock.  After we have written it, we will therefore skip
2515          * all remaining clusters in this refblock.
2516          */
2517 
2518         refblock_index = cluster >> s->refcount_block_bits;
2519         refblock_start = refblock_index << s->refcount_block_bits;
2520 
2521         if (on_disk_reftable_entries > refblock_index &&
2522             on_disk_reftable[refblock_index])
2523         {
2524             /*
2525              * We can get here after a `goto write_refblocks`: We have a
2526              * reftable from a previous run, and the refblock is already
2527              * allocated.  No need to allocate it again.
2528              */
2529             refblock_offset = on_disk_reftable[refblock_index];
2530         } else {
2531             int64_t refblock_cluster_index;
2532 
2533             /* Don't allocate a cluster in a refblock already written to disk */
2534             if (first_free_cluster < refblock_start) {
2535                 first_free_cluster = refblock_start;
2536             }
2537             refblock_offset = alloc_clusters_imrt(bs, 1, refcount_table,
2538                                                   nb_clusters,
2539                                                   &first_free_cluster);
2540             if (refblock_offset < 0) {
2541                 error_setg_errno(errp, -refblock_offset,
2542                                  "ERROR allocating refblock");
2543                 return refblock_offset;
2544             }
2545 
2546             refblock_cluster_index = refblock_offset / s->cluster_size;
2547             if (refblock_cluster_index >= end_cluster) {
2548                 /*
2549                  * We must write the refblock that holds this refblock's
2550                  * refcount
2551                  */
2552                 end_cluster = refblock_cluster_index + 1;
2553             }
2554 
2555             if (on_disk_reftable_entries <= refblock_index) {
2556                 on_disk_reftable_entries =
2557                     ROUND_UP((refblock_index + 1) * REFTABLE_ENTRY_SIZE,
2558                              s->cluster_size) / REFTABLE_ENTRY_SIZE;
2559                 on_disk_reftable =
2560                     g_try_realloc(on_disk_reftable,
2561                                   on_disk_reftable_entries *
2562                                   REFTABLE_ENTRY_SIZE);
2563                 if (!on_disk_reftable) {
2564                     error_setg(errp, "ERROR allocating reftable memory");
2565                     return -ENOMEM;
2566                 }
2567 
2568                 memset(on_disk_reftable + *on_disk_reftable_entries_ptr, 0,
2569                        (on_disk_reftable_entries -
2570                         *on_disk_reftable_entries_ptr) *
2571                        REFTABLE_ENTRY_SIZE);
2572 
2573                 *on_disk_reftable_ptr = on_disk_reftable;
2574                 *on_disk_reftable_entries_ptr = on_disk_reftable_entries;
2575 
2576                 reftable_grown = true;
2577             } else {
2578                 assert(on_disk_reftable);
2579             }
2580             on_disk_reftable[refblock_index] = refblock_offset;
2581         }
2582 
2583         /* Refblock is allocated, write it to disk */
2584 
2585         ret = qcow2_pre_write_overlap_check(bs, 0, refblock_offset,
2586                                             s->cluster_size, false);
2587         if (ret < 0) {
2588             error_setg_errno(errp, -ret, "ERROR writing refblock");
2589             return ret;
2590         }
2591 
2592         /*
2593          * The refblock is simply a slice of *refcount_table.
2594          * Note that the size of *refcount_table is always aligned to
2595          * whole clusters, so the write operation will not result in
2596          * out-of-bounds accesses.
2597          */
2598         on_disk_refblock = (void *)((char *) *refcount_table +
2599                                     refblock_index * s->cluster_size);
2600 
2601         ret = bdrv_co_pwrite(bs->file, refblock_offset, s->cluster_size,
2602                              on_disk_refblock, 0);
2603         if (ret < 0) {
2604             error_setg_errno(errp, -ret, "ERROR writing refblock");
2605             return ret;
2606         }
2607 
2608         /* This refblock is done, skip to its end */
2609         cluster = refblock_start + s->refcount_block_size - 1;
2610     }
2611 
2612     return reftable_grown;
2613 }
2614 
2615 /*
2616  * Creates a new refcount structure based solely on the in-memory information
2617  * given through *refcount_table (this in-memory information is basically just
2618  * the concatenation of all refblocks).  All necessary allocations will be
2619  * reflected in that array.
2620  *
2621  * On success, the old refcount structure is leaked (it will be covered by the
2622  * new refcount structure).
2623  */
2624 static int coroutine_fn GRAPH_RDLOCK
2625 rebuild_refcount_structure(BlockDriverState *bs, BdrvCheckResult *res,
2626                            void **refcount_table, int64_t *nb_clusters,
2627                            Error **errp)
2628 {
2629     BDRVQcow2State *s = bs->opaque;
2630     int64_t reftable_offset = -1;
2631     int64_t reftable_length = 0;
2632     int64_t reftable_clusters;
2633     int64_t refblock_index;
2634     uint32_t on_disk_reftable_entries = 0;
2635     uint64_t *on_disk_reftable = NULL;
2636     int ret = 0;
2637     int reftable_size_changed = 0;
2638     struct {
2639         uint64_t reftable_offset;
2640         uint32_t reftable_clusters;
2641     } QEMU_PACKED reftable_offset_and_clusters;
2642 
2643     qcow2_cache_empty(bs, s->refcount_block_cache);
2644 
2645     /*
2646      * For each refblock containing entries, we try to allocate a
2647      * cluster (in the in-memory refcount table) and write its offset
2648      * into on_disk_reftable[].  We then write the whole refblock to
2649      * disk (as a slice of the in-memory refcount table).
2650      * This is done by rebuild_refcounts_write_refblocks().
2651      *
2652      * Once we have scanned all clusters, we try to find space for the
2653      * reftable.  This will dirty the in-memory refcount table (i.e.
2654      * make it differ from the refblocks we have already written), so we
2655      * need to run rebuild_refcounts_write_refblocks() again for the
2656      * range of clusters where the reftable has been allocated.
2657      *
2658      * This second run might make the reftable grow again, in which case
2659      * we will need to allocate another space for it, which is why we
2660      * repeat all this until the reftable stops growing.
2661      *
2662      * (This loop will terminate, because with every cluster the
2663      * reftable grows, it can accommodate a multitude of more refcounts,
2664      * so that at some point this must be able to cover the reftable
2665      * and all refblocks describing it.)
2666      *
2667      * We then convert the reftable to big-endian and write it to disk.
2668      *
2669      * Note that we never free any reftable allocations.  Doing so would
2670      * needlessly complicate the algorithm: The eventual second check
2671      * run we do will clean up all leaks we have caused.
2672      */
2673 
2674     reftable_size_changed =
2675         rebuild_refcounts_write_refblocks(bs, refcount_table, nb_clusters,
2676                                           0, *nb_clusters,
2677                                           &on_disk_reftable,
2678                                           &on_disk_reftable_entries, errp);
2679     if (reftable_size_changed < 0) {
2680         res->check_errors++;
2681         ret = reftable_size_changed;
2682         goto fail;
2683     }
2684 
2685     /*
2686      * There was no reftable before, so rebuild_refcounts_write_refblocks()
2687      * must have increased its size (from 0 to something).
2688      */
2689     assert(reftable_size_changed);
2690 
2691     do {
2692         int64_t reftable_start_cluster, reftable_end_cluster;
2693         int64_t first_free_cluster = 0;
2694 
2695         reftable_length = on_disk_reftable_entries * REFTABLE_ENTRY_SIZE;
2696         reftable_clusters = size_to_clusters(s, reftable_length);
2697 
2698         reftable_offset = alloc_clusters_imrt(bs, reftable_clusters,
2699                                               refcount_table, nb_clusters,
2700                                               &first_free_cluster);
2701         if (reftable_offset < 0) {
2702             error_setg_errno(errp, -reftable_offset,
2703                              "ERROR allocating reftable");
2704             res->check_errors++;
2705             ret = reftable_offset;
2706             goto fail;
2707         }
2708 
2709         /*
2710          * We need to update the affected refblocks, so re-run the
2711          * write_refblocks loop for the reftable's range of clusters.
2712          */
2713         assert(offset_into_cluster(s, reftable_offset) == 0);
2714         reftable_start_cluster = reftable_offset / s->cluster_size;
2715         reftable_end_cluster = reftable_start_cluster + reftable_clusters;
2716         reftable_size_changed =
2717             rebuild_refcounts_write_refblocks(bs, refcount_table, nb_clusters,
2718                                               reftable_start_cluster,
2719                                               reftable_end_cluster,
2720                                               &on_disk_reftable,
2721                                               &on_disk_reftable_entries, errp);
2722         if (reftable_size_changed < 0) {
2723             res->check_errors++;
2724             ret = reftable_size_changed;
2725             goto fail;
2726         }
2727 
2728         /*
2729          * If the reftable size has changed, we will need to find a new
2730          * allocation, repeating the loop.
2731          */
2732     } while (reftable_size_changed);
2733 
2734     /* The above loop must have run at least once */
2735     assert(reftable_offset >= 0);
2736 
2737     /*
2738      * All allocations are done, all refblocks are written, convert the
2739      * reftable to big-endian and write it to disk.
2740      */
2741 
2742     for (refblock_index = 0; refblock_index < on_disk_reftable_entries;
2743          refblock_index++)
2744     {
2745         cpu_to_be64s(&on_disk_reftable[refblock_index]);
2746     }
2747 
2748     ret = qcow2_pre_write_overlap_check(bs, 0, reftable_offset, reftable_length,
2749                                         false);
2750     if (ret < 0) {
2751         error_setg_errno(errp, -ret, "ERROR writing reftable");
2752         goto fail;
2753     }
2754 
2755     assert(reftable_length < INT_MAX);
2756     ret = bdrv_co_pwrite(bs->file, reftable_offset, reftable_length,
2757                          on_disk_reftable, 0);
2758     if (ret < 0) {
2759         error_setg_errno(errp, -ret, "ERROR writing reftable");
2760         goto fail;
2761     }
2762 
2763     /* Enter new reftable into the image header */
2764     reftable_offset_and_clusters.reftable_offset = cpu_to_be64(reftable_offset);
2765     reftable_offset_and_clusters.reftable_clusters =
2766         cpu_to_be32(reftable_clusters);
2767     ret = bdrv_co_pwrite_sync(bs->file,
2768                               offsetof(QCowHeader, refcount_table_offset),
2769                               sizeof(reftable_offset_and_clusters),
2770                               &reftable_offset_and_clusters, 0);
2771     if (ret < 0) {
2772         error_setg_errno(errp, -ret, "ERROR setting reftable");
2773         goto fail;
2774     }
2775 
2776     for (refblock_index = 0; refblock_index < on_disk_reftable_entries;
2777          refblock_index++)
2778     {
2779         be64_to_cpus(&on_disk_reftable[refblock_index]);
2780     }
2781     s->refcount_table = on_disk_reftable;
2782     s->refcount_table_offset = reftable_offset;
2783     s->refcount_table_size = on_disk_reftable_entries;
2784     update_max_refcount_table_index(s);
2785 
2786     return 0;
2787 
2788 fail:
2789     g_free(on_disk_reftable);
2790     return ret;
2791 }
2792 
2793 /*
2794  * Checks an image for refcount consistency.
2795  *
2796  * Returns 0 if no errors are found, the number of errors in case the image is
2797  * detected as corrupted, and -errno when an internal error occurred.
2798  */
2799 int coroutine_fn GRAPH_RDLOCK
2800 qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix)
2801 {
2802     BDRVQcow2State *s = bs->opaque;
2803     BdrvCheckResult pre_compare_res;
2804     int64_t size, highest_cluster, nb_clusters;
2805     void *refcount_table = NULL;
2806     bool rebuild = false;
2807     int ret;
2808 
2809     size = bdrv_co_getlength(bs->file->bs);
2810     if (size < 0) {
2811         res->check_errors++;
2812         return size;
2813     }
2814 
2815     nb_clusters = size_to_clusters(s, size);
2816     if (nb_clusters > INT_MAX) {
2817         res->check_errors++;
2818         return -EFBIG;
2819     }
2820 
2821     res->bfi.total_clusters =
2822         size_to_clusters(s, bs->total_sectors * BDRV_SECTOR_SIZE);
2823 
2824     ret = calculate_refcounts(bs, res, fix, &rebuild, &refcount_table,
2825                               &nb_clusters);
2826     if (ret < 0) {
2827         goto fail;
2828     }
2829 
2830     /* In case we don't need to rebuild the refcount structure (but want to fix
2831      * something), this function is immediately called again, in which case the
2832      * result should be ignored */
2833     pre_compare_res = *res;
2834     compare_refcounts(bs, res, 0, &rebuild, &highest_cluster, refcount_table,
2835                       nb_clusters);
2836 
2837     if (rebuild && (fix & BDRV_FIX_ERRORS)) {
2838         BdrvCheckResult old_res = *res;
2839         int fresh_leaks = 0;
2840         Error *local_err = NULL;
2841 
2842         fprintf(stderr, "Rebuilding refcount structure\n");
2843         ret = rebuild_refcount_structure(bs, res, &refcount_table,
2844                                          &nb_clusters, &local_err);
2845         if (ret < 0) {
2846             error_report_err(local_err);
2847             goto fail;
2848         }
2849 
2850         res->corruptions = 0;
2851         res->leaks = 0;
2852 
2853         /* Because the old reftable has been exchanged for a new one the
2854          * references have to be recalculated */
2855         rebuild = false;
2856         memset(refcount_table, 0, refcount_array_byte_size(s, nb_clusters));
2857         ret = calculate_refcounts(bs, res, 0, &rebuild, &refcount_table,
2858                                   &nb_clusters);
2859         if (ret < 0) {
2860             goto fail;
2861         }
2862 
2863         if (fix & BDRV_FIX_LEAKS) {
2864             /* The old refcount structures are now leaked, fix it; the result
2865              * can be ignored, aside from leaks which were introduced by
2866              * rebuild_refcount_structure() that could not be fixed */
2867             BdrvCheckResult saved_res = *res;
2868             *res = (BdrvCheckResult){ 0 };
2869 
2870             compare_refcounts(bs, res, BDRV_FIX_LEAKS, &rebuild,
2871                               &highest_cluster, refcount_table, nb_clusters);
2872             if (rebuild) {
2873                 fprintf(stderr, "ERROR rebuilt refcount structure is still "
2874                         "broken\n");
2875             }
2876 
2877             /* Any leaks accounted for here were introduced by
2878              * rebuild_refcount_structure() because that function has created a
2879              * new refcount structure from scratch */
2880             fresh_leaks = res->leaks;
2881             *res = saved_res;
2882         }
2883 
2884         if (res->corruptions < old_res.corruptions) {
2885             res->corruptions_fixed += old_res.corruptions - res->corruptions;
2886         }
2887         if (res->leaks < old_res.leaks) {
2888             res->leaks_fixed += old_res.leaks - res->leaks;
2889         }
2890         res->leaks += fresh_leaks;
2891     } else if (fix) {
2892         if (rebuild) {
2893             fprintf(stderr, "ERROR need to rebuild refcount structures\n");
2894             res->check_errors++;
2895             ret = -EIO;
2896             goto fail;
2897         }
2898 
2899         if (res->leaks || res->corruptions) {
2900             *res = pre_compare_res;
2901             compare_refcounts(bs, res, fix, &rebuild, &highest_cluster,
2902                               refcount_table, nb_clusters);
2903         }
2904     }
2905 
2906     /* check OFLAG_COPIED */
2907     ret = check_oflag_copied(bs, res, fix);
2908     if (ret < 0) {
2909         goto fail;
2910     }
2911 
2912     res->image_end_offset = (highest_cluster + 1) * s->cluster_size;
2913     ret = 0;
2914 
2915 fail:
2916     g_free(refcount_table);
2917 
2918     return ret;
2919 }
2920 
/* Does [ofs, ofs + sz) overlap the (offset, size) range under test?
 * (Relies on the local variables of qcow2_check_metadata_overlap().) */
#define overlaps_with(ofs, sz) \
    ranges_overlap(offset, size, ofs, sz)
2923 
2924 /*
2925  * Checks if the given offset into the image file is actually free to use by
2926  * looking for overlaps with important metadata sections (L1/L2 tables etc.),
2927  * i.e. a sanity check without relying on the refcount tables.
2928  *
2929  * The ign parameter specifies what checks not to perform (being a bitmask of
2930  * QCow2MetadataOverlap values), i.e., what sections to ignore.
2931  *
2932  * Returns:
2933  * - 0 if writing to this offset will not affect the mentioned metadata
2934  * - a positive QCow2MetadataOverlap value indicating one overlapping section
2935  * - a negative value (-errno) indicating an error while performing a check,
2936  *   e.g. when bdrv_pread failed on QCOW2_OL_INACTIVE_L2
2937  */
2938 int qcow2_check_metadata_overlap(BlockDriverState *bs, int ign, int64_t offset,
2939                                  int64_t size)
2940 {
2941     BDRVQcow2State *s = bs->opaque;
2942     int chk = s->overlap_check & ~ign;
2943     int i, j;
2944 
2945     if (!size) {
2946         return 0;
2947     }
2948 
2949     if (chk & QCOW2_OL_MAIN_HEADER) {
2950         if (offset < s->cluster_size) {
2951             return QCOW2_OL_MAIN_HEADER;
2952         }
2953     }
2954 
2955     /* align range to test to cluster boundaries */
2956     size = ROUND_UP(offset_into_cluster(s, offset) + size, s->cluster_size);
2957     offset = start_of_cluster(s, offset);
2958 
2959     if ((chk & QCOW2_OL_ACTIVE_L1) && s->l1_size) {
2960         if (overlaps_with(s->l1_table_offset, s->l1_size * L1E_SIZE)) {
2961             return QCOW2_OL_ACTIVE_L1;
2962         }
2963     }
2964 
2965     if ((chk & QCOW2_OL_REFCOUNT_TABLE) && s->refcount_table_size) {
2966         if (overlaps_with(s->refcount_table_offset,
2967             s->refcount_table_size * REFTABLE_ENTRY_SIZE)) {
2968             return QCOW2_OL_REFCOUNT_TABLE;
2969         }
2970     }
2971 
2972     if ((chk & QCOW2_OL_SNAPSHOT_TABLE) && s->snapshots_size) {
2973         if (overlaps_with(s->snapshots_offset, s->snapshots_size)) {
2974             return QCOW2_OL_SNAPSHOT_TABLE;
2975         }
2976     }
2977 
2978     if ((chk & QCOW2_OL_INACTIVE_L1) && s->snapshots) {
2979         for (i = 0; i < s->nb_snapshots; i++) {
2980             if (s->snapshots[i].l1_size &&
2981                 overlaps_with(s->snapshots[i].l1_table_offset,
2982                 s->snapshots[i].l1_size * L1E_SIZE)) {
2983                 return QCOW2_OL_INACTIVE_L1;
2984             }
2985         }
2986     }
2987 
2988     if ((chk & QCOW2_OL_ACTIVE_L2) && s->l1_table) {
2989         for (i = 0; i < s->l1_size; i++) {
2990             if ((s->l1_table[i] & L1E_OFFSET_MASK) &&
2991                 overlaps_with(s->l1_table[i] & L1E_OFFSET_MASK,
2992                 s->cluster_size)) {
2993                 return QCOW2_OL_ACTIVE_L2;
2994             }
2995         }
2996     }
2997 
2998     if ((chk & QCOW2_OL_REFCOUNT_BLOCK) && s->refcount_table) {
2999         unsigned last_entry = s->max_refcount_table_index;
3000         assert(last_entry < s->refcount_table_size);
3001         assert(last_entry + 1 == s->refcount_table_size ||
3002                (s->refcount_table[last_entry + 1] & REFT_OFFSET_MASK) == 0);
3003         for (i = 0; i <= last_entry; i++) {
3004             if ((s->refcount_table[i] & REFT_OFFSET_MASK) &&
3005                 overlaps_with(s->refcount_table[i] & REFT_OFFSET_MASK,
3006                 s->cluster_size)) {
3007                 return QCOW2_OL_REFCOUNT_BLOCK;
3008             }
3009         }
3010     }
3011 
3012     if ((chk & QCOW2_OL_INACTIVE_L2) && s->snapshots) {
3013         for (i = 0; i < s->nb_snapshots; i++) {
3014             uint64_t l1_ofs = s->snapshots[i].l1_table_offset;
3015             uint32_t l1_sz  = s->snapshots[i].l1_size;
3016             uint64_t l1_sz2 = l1_sz * L1E_SIZE;
3017             uint64_t *l1;
3018             int ret;
3019 
3020             ret = qcow2_validate_table(bs, l1_ofs, l1_sz, L1E_SIZE,
3021                                        QCOW_MAX_L1_SIZE, "", NULL);
3022             if (ret < 0) {
3023                 return ret;
3024             }
3025 
3026             l1 = g_try_malloc(l1_sz2);
3027 
3028             if (l1_sz2 && l1 == NULL) {
3029                 return -ENOMEM;
3030             }
3031 
3032             ret = bdrv_pread(bs->file, l1_ofs, l1_sz2, l1, 0);
3033             if (ret < 0) {
3034                 g_free(l1);
3035                 return ret;
3036             }
3037 
3038             for (j = 0; j < l1_sz; j++) {
3039                 uint64_t l2_ofs = be64_to_cpu(l1[j]) & L1E_OFFSET_MASK;
3040                 if (l2_ofs && overlaps_with(l2_ofs, s->cluster_size)) {
3041                     g_free(l1);
3042                     return QCOW2_OL_INACTIVE_L2;
3043                 }
3044             }
3045 
3046             g_free(l1);
3047         }
3048     }
3049 
3050     if ((chk & QCOW2_OL_BITMAP_DIRECTORY) &&
3051         (s->autoclear_features & QCOW2_AUTOCLEAR_BITMAPS))
3052     {
3053         if (overlaps_with(s->bitmap_directory_offset,
3054                           s->bitmap_directory_size))
3055         {
3056             return QCOW2_OL_BITMAP_DIRECTORY;
3057         }
3058     }
3059 
3060     return 0;
3061 }
3062 
/* Human-readable names for the metadata sections, indexed by their
 * QCOW2_OL_*_BITNR value; used in corruption messages */
static const char *metadata_ol_names[] = {
    [QCOW2_OL_MAIN_HEADER_BITNR]        = "qcow2_header",
    [QCOW2_OL_ACTIVE_L1_BITNR]          = "active L1 table",
    [QCOW2_OL_ACTIVE_L2_BITNR]          = "active L2 table",
    [QCOW2_OL_REFCOUNT_TABLE_BITNR]     = "refcount table",
    [QCOW2_OL_REFCOUNT_BLOCK_BITNR]     = "refcount block",
    [QCOW2_OL_SNAPSHOT_TABLE_BITNR]     = "snapshot table",
    [QCOW2_OL_INACTIVE_L1_BITNR]        = "inactive L1 table",
    [QCOW2_OL_INACTIVE_L2_BITNR]        = "inactive L2 table",
    [QCOW2_OL_BITMAP_DIRECTORY_BITNR]   = "bitmap directory",
};
QEMU_BUILD_BUG_ON(QCOW2_OL_MAX_BITNR != ARRAY_SIZE(metadata_ol_names));
3075 
3076 /*
3077  * First performs a check for metadata overlaps (through
3078  * qcow2_check_metadata_overlap); if that fails with a negative value (error
3079  * while performing a check), that value is returned. If an impending overlap
3080  * is detected, the BDS will be made unusable, the qcow2 file marked corrupt
3081  * and -EIO returned.
3082  *
3083  * Returns 0 if there were neither overlaps nor errors while checking for
3084  * overlaps; or a negative value (-errno) on error.
3085  */
3086 int qcow2_pre_write_overlap_check(BlockDriverState *bs, int ign, int64_t offset,
3087                                   int64_t size, bool data_file)
3088 {
3089     int ret;
3090 
3091     if (data_file && has_data_file(bs)) {
3092         return 0;
3093     }
3094 
3095     ret = qcow2_check_metadata_overlap(bs, ign, offset, size);
3096     if (ret < 0) {
3097         return ret;
3098     } else if (ret > 0) {
3099         int metadata_ol_bitnr = ctz32(ret);
3100         assert(metadata_ol_bitnr < QCOW2_OL_MAX_BITNR);
3101 
3102         qcow2_signal_corruption(bs, true, offset, size, "Preventing invalid "
3103                                 "write on metadata (overlaps with %s)",
3104                                 metadata_ol_names[metadata_ol_bitnr]);
3105         return -EIO;
3106     }
3107 
3108     return 0;
3109 }
3110 
/* A pointer to a function of this type is given to walk_over_reftable(). That
 * function will create refblocks and pass them to a RefblockFinishOp once they
 * are completed (@refblock). @refblock_empty is set if the refblock is
 * completely empty.
 *
 * Along with the refblock, a corresponding reftable entry is passed, in the
 * reftable @reftable (which may be reallocated) at @reftable_index.
 * @reftable_size is the reftable's current entry count.
 *
 * @allocated should be set to true if a new cluster has been allocated.
 *
 * Returns 0 on success, -errno (with @errp set) on error.
 */
typedef int /* GRAPH_RDLOCK_PTR */
    (RefblockFinishOp)(BlockDriverState *bs, uint64_t **reftable,
                       uint64_t reftable_index, uint64_t *reftable_size,
                       void *refblock, bool refblock_empty,
                       bool *allocated, Error **errp);
3126 
3127 /**
3128  * This "operation" for walk_over_reftable() allocates the refblock on disk (if
3129  * it is not empty) and inserts its offset into the new reftable. The size of
3130  * this new reftable is increased as required.
3131  */
3132 static int GRAPH_RDLOCK
3133 alloc_refblock(BlockDriverState *bs, uint64_t **reftable,
3134                uint64_t reftable_index, uint64_t *reftable_size,
3135                void *refblock, bool refblock_empty, bool *allocated,
3136                Error **errp)
3137 {
3138     BDRVQcow2State *s = bs->opaque;
3139     int64_t offset;
3140 
3141     if (!refblock_empty && reftable_index >= *reftable_size) {
3142         uint64_t *new_reftable;
3143         uint64_t new_reftable_size;
3144 
3145         new_reftable_size = ROUND_UP(reftable_index + 1,
3146                                      s->cluster_size / REFTABLE_ENTRY_SIZE);
3147         if (new_reftable_size > QCOW_MAX_REFTABLE_SIZE / REFTABLE_ENTRY_SIZE) {
3148             error_setg(errp,
3149                        "This operation would make the refcount table grow "
3150                        "beyond the maximum size supported by QEMU, aborting");
3151             return -ENOTSUP;
3152         }
3153 
3154         new_reftable = g_try_realloc(*reftable, new_reftable_size *
3155                                                 REFTABLE_ENTRY_SIZE);
3156         if (!new_reftable) {
3157             error_setg(errp, "Failed to increase reftable buffer size");
3158             return -ENOMEM;
3159         }
3160 
3161         memset(new_reftable + *reftable_size, 0,
3162                (new_reftable_size - *reftable_size) * REFTABLE_ENTRY_SIZE);
3163 
3164         *reftable      = new_reftable;
3165         *reftable_size = new_reftable_size;
3166     }
3167 
3168     if (!refblock_empty && !(*reftable)[reftable_index]) {
3169         offset = qcow2_alloc_clusters(bs, s->cluster_size);
3170         if (offset < 0) {
3171             error_setg_errno(errp, -offset, "Failed to allocate refblock");
3172             return offset;
3173         }
3174         (*reftable)[reftable_index] = offset;
3175         *allocated = true;
3176     }
3177 
3178     return 0;
3179 }
3180 
3181 /**
3182  * This "operation" for walk_over_reftable() writes the refblock to disk at the
3183  * offset specified by the new reftable's entry. It does not modify the new
3184  * reftable or change any refcounts.
3185  */
3186 static int GRAPH_RDLOCK
3187 flush_refblock(BlockDriverState *bs, uint64_t **reftable,
3188                uint64_t reftable_index, uint64_t *reftable_size,
3189                void *refblock, bool refblock_empty, bool *allocated,
3190                Error **errp)
3191 {
3192     BDRVQcow2State *s = bs->opaque;
3193     int64_t offset;
3194     int ret;
3195 
3196     if (reftable_index < *reftable_size && (*reftable)[reftable_index]) {
3197         offset = (*reftable)[reftable_index];
3198 
3199         ret = qcow2_pre_write_overlap_check(bs, 0, offset, s->cluster_size,
3200                                             false);
3201         if (ret < 0) {
3202             error_setg_errno(errp, -ret, "Overlap check failed");
3203             return ret;
3204         }
3205 
3206         ret = bdrv_pwrite(bs->file, offset, s->cluster_size, refblock, 0);
3207         if (ret < 0) {
3208             error_setg_errno(errp, -ret, "Failed to write refblock");
3209             return ret;
3210         }
3211     } else {
3212         assert(refblock_empty);
3213     }
3214 
3215     return 0;
3216 }
3217 
3218 /**
3219  * This function walks over the existing reftable and every referenced refblock;
3220  * if @new_set_refcount is non-NULL, it is called for every refcount entry to
3221  * create an equal new entry in the passed @new_refblock. Once that
3222  * @new_refblock is completely filled, @operation will be called.
3223  *
3224  * @status_cb and @cb_opaque are used for the amend operation's status callback.
3225  * @index is the index of the walk_over_reftable() calls and @total is the total
3226  * number of walk_over_reftable() calls per amend operation. Both are used for
3227  * calculating the parameters for the status callback.
3228  *
3229  * @allocated is set to true if a new cluster has been allocated.
3230  */
3231 static int GRAPH_RDLOCK
3232 walk_over_reftable(BlockDriverState *bs, uint64_t **new_reftable,
3233                    uint64_t *new_reftable_index,
3234                    uint64_t *new_reftable_size,
3235                    void *new_refblock, int new_refblock_size,
3236                    int new_refcount_bits,
3237                    RefblockFinishOp *operation, bool *allocated,
3238                    Qcow2SetRefcountFunc *new_set_refcount,
3239                    BlockDriverAmendStatusCB *status_cb,
3240                    void *cb_opaque, int index, int total,
3241                    Error **errp)
3242 {
3243     BDRVQcow2State *s = bs->opaque;
3244     uint64_t reftable_index;
3245     bool new_refblock_empty = true;
3246     int refblock_index;
3247     int new_refblock_index = 0;
3248     int ret;
3249 
3250     for (reftable_index = 0; reftable_index < s->refcount_table_size;
3251          reftable_index++)
3252     {
3253         uint64_t refblock_offset = s->refcount_table[reftable_index]
3254                                  & REFT_OFFSET_MASK;
3255 
3256         status_cb(bs, (uint64_t)index * s->refcount_table_size + reftable_index,
3257                   (uint64_t)total * s->refcount_table_size, cb_opaque);
3258 
3259         if (refblock_offset) {
3260             void *refblock;
3261 
3262             if (offset_into_cluster(s, refblock_offset)) {
3263                 qcow2_signal_corruption(bs, true, -1, -1, "Refblock offset %#"
3264                                         PRIx64 " unaligned (reftable index: %#"
3265                                         PRIx64 ")", refblock_offset,
3266                                         reftable_index);
3267                 error_setg(errp,
3268                            "Image is corrupt (unaligned refblock offset)");
3269                 return -EIO;
3270             }
3271 
3272             ret = qcow2_cache_get(bs, s->refcount_block_cache, refblock_offset,
3273                                   &refblock);
3274             if (ret < 0) {
3275                 error_setg_errno(errp, -ret, "Failed to retrieve refblock");
3276                 return ret;
3277             }
3278 
3279             for (refblock_index = 0; refblock_index < s->refcount_block_size;
3280                  refblock_index++)
3281             {
3282                 uint64_t refcount;
3283 
3284                 if (new_refblock_index >= new_refblock_size) {
3285                     /* new_refblock is now complete */
3286                     ret = operation(bs, new_reftable, *new_reftable_index,
3287                                     new_reftable_size, new_refblock,
3288                                     new_refblock_empty, allocated, errp);
3289                     if (ret < 0) {
3290                         qcow2_cache_put(s->refcount_block_cache, &refblock);
3291                         return ret;
3292                     }
3293 
3294                     (*new_reftable_index)++;
3295                     new_refblock_index = 0;
3296                     new_refblock_empty = true;
3297                 }
3298 
3299                 refcount = s->get_refcount(refblock, refblock_index);
3300                 if (new_refcount_bits < 64 && refcount >> new_refcount_bits) {
3301                     uint64_t offset;
3302 
3303                     qcow2_cache_put(s->refcount_block_cache, &refblock);
3304 
3305                     offset = ((reftable_index << s->refcount_block_bits)
3306                               + refblock_index) << s->cluster_bits;
3307 
3308                     error_setg(errp, "Cannot decrease refcount entry width to "
3309                                "%i bits: Cluster at offset %#" PRIx64 " has a "
3310                                "refcount of %" PRIu64, new_refcount_bits,
3311                                offset, refcount);
3312                     return -EINVAL;
3313                 }
3314 
3315                 if (new_set_refcount) {
3316                     new_set_refcount(new_refblock, new_refblock_index++,
3317                                      refcount);
3318                 } else {
3319                     new_refblock_index++;
3320                 }
3321                 new_refblock_empty = new_refblock_empty && refcount == 0;
3322             }
3323 
3324             qcow2_cache_put(s->refcount_block_cache, &refblock);
3325         } else {
3326             /* No refblock means every refcount is 0 */
3327             for (refblock_index = 0; refblock_index < s->refcount_block_size;
3328                  refblock_index++)
3329             {
3330                 if (new_refblock_index >= new_refblock_size) {
3331                     /* new_refblock is now complete */
3332                     ret = operation(bs, new_reftable, *new_reftable_index,
3333                                     new_reftable_size, new_refblock,
3334                                     new_refblock_empty, allocated, errp);
3335                     if (ret < 0) {
3336                         return ret;
3337                     }
3338 
3339                     (*new_reftable_index)++;
3340                     new_refblock_index = 0;
3341                     new_refblock_empty = true;
3342                 }
3343 
3344                 if (new_set_refcount) {
3345                     new_set_refcount(new_refblock, new_refblock_index++, 0);
3346                 } else {
3347                     new_refblock_index++;
3348                 }
3349             }
3350         }
3351     }
3352 
3353     if (new_refblock_index > 0) {
3354         /* Complete the potentially existing partially filled final refblock */
3355         if (new_set_refcount) {
3356             for (; new_refblock_index < new_refblock_size;
3357                  new_refblock_index++)
3358             {
3359                 new_set_refcount(new_refblock, new_refblock_index, 0);
3360             }
3361         }
3362 
3363         ret = operation(bs, new_reftable, *new_reftable_index,
3364                         new_reftable_size, new_refblock, new_refblock_empty,
3365                         allocated, errp);
3366         if (ret < 0) {
3367             return ret;
3368         }
3369 
3370         (*new_reftable_index)++;
3371     }
3372 
3373     status_cb(bs, (uint64_t)(index + 1) * s->refcount_table_size,
3374               (uint64_t)total * s->refcount_table_size, cb_opaque);
3375 
3376     return 0;
3377 }
3378 
/**
 * Change the width of the image's refcount entries to 2^@refcount_order bits.
 *
 * The conversion is done in multiple walks over the existing refcount
 * structures (see walk_over_reftable()): first, new refblocks and a new
 * reftable are allocated (repeatedly, until a walk performs no further
 * allocation); then the new refblocks are filled with the converted refcounts
 * and written out; finally the new reftable is written, the image header is
 * updated to point to the new structures, the in-memory state is switched
 * over, and the old structures are freed.
 *
 * @status_cb and @cb_opaque are used for the amend operation's status
 * callback. Returns 0 on success, -errno on error (with @errp set).
 */
int qcow2_change_refcount_order(BlockDriverState *bs, int refcount_order,
                                BlockDriverAmendStatusCB *status_cb,
                                void *cb_opaque, Error **errp)
{
    BDRVQcow2State *s = bs->opaque;
    Qcow2GetRefcountFunc *new_get_refcount;
    Qcow2SetRefcountFunc *new_set_refcount;
    void *new_refblock = qemu_blockalign(bs->file->bs, s->cluster_size);
    uint64_t *new_reftable = NULL, new_reftable_size = 0;
    uint64_t *old_reftable, old_reftable_size, old_reftable_offset;
    uint64_t new_reftable_index = 0;
    uint64_t i;
    int64_t new_reftable_offset = 0, allocated_reftable_size = 0;
    int new_refblock_size, new_refcount_bits = 1 << refcount_order;
    int old_refcount_order;
    int walk_index = 0;
    int ret;
    bool new_allocation;

    assert(s->qcow_version >= 3);
    assert(refcount_order >= 0 && refcount_order <= 6);

    /* see qcow2_open() */
    new_refblock_size = 1 << (s->cluster_bits - (refcount_order - 3));

    new_get_refcount = get_refcount_funcs[refcount_order];
    new_set_refcount = set_refcount_funcs[refcount_order];


    do {
        int total_walks;

        new_allocation = false;

        /*
         * We need at least this allocation walk plus the final walk that
         * writes the refblocks; and this loop normally runs at least twice
         * (once to perform the allocations, once more to verify that nothing
         * new needs allocating), which makes three walks in total.
         */
        total_walks = MAX(walk_index + 2, 3);

        /* First, allocate the structures so they are present in the refcount
         * structures */
        ret = walk_over_reftable(bs, &new_reftable, &new_reftable_index,
                                 &new_reftable_size, NULL, new_refblock_size,
                                 new_refcount_bits, &alloc_refblock,
                                 &new_allocation, NULL, status_cb, cb_opaque,
                                 walk_index++, total_walks, errp);
        if (ret < 0) {
            goto done;
        }

        new_reftable_index = 0;

        if (new_allocation) {
            if (new_reftable_offset) {
                /* Allocations may have grown the required reftable size;
                 * drop the previously allocated new reftable clusters and
                 * allocate fresh ones matching the current size */
                qcow2_free_clusters(
                    bs, new_reftable_offset,
                    allocated_reftable_size * REFTABLE_ENTRY_SIZE,
                    QCOW2_DISCARD_NEVER);
            }

            new_reftable_offset = qcow2_alloc_clusters(bs, new_reftable_size *
                                                           REFTABLE_ENTRY_SIZE);
            if (new_reftable_offset < 0) {
                error_setg_errno(errp, -new_reftable_offset,
                                 "Failed to allocate the new reftable");
                ret = new_reftable_offset;
                goto done;
            }
            allocated_reftable_size = new_reftable_size;
        }
    } while (new_allocation);

    /* Second, write the new refblocks */
    ret = walk_over_reftable(bs, &new_reftable, &new_reftable_index,
                             &new_reftable_size, new_refblock,
                             new_refblock_size, new_refcount_bits,
                             &flush_refblock, &new_allocation, new_set_refcount,
                             status_cb, cb_opaque, walk_index, walk_index + 1,
                             errp);
    if (ret < 0) {
        goto done;
    }
    assert(!new_allocation);


    /* Write the new reftable */
    ret = qcow2_pre_write_overlap_check(bs, 0, new_reftable_offset,
                                        new_reftable_size * REFTABLE_ENTRY_SIZE,
                                        false);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Overlap check failed");
        goto done;
    }

    /* The on-disk reftable is in big-endian byte order */
    for (i = 0; i < new_reftable_size; i++) {
        cpu_to_be64s(&new_reftable[i]);
    }

    ret = bdrv_pwrite(bs->file, new_reftable_offset,
                      new_reftable_size * REFTABLE_ENTRY_SIZE, new_reftable,
                      0);

    /* Convert back to host byte order before checking the write result so
     * the in-memory table stays usable on both paths */
    for (i = 0; i < new_reftable_size; i++) {
        be64_to_cpus(&new_reftable[i]);
    }

    if (ret < 0) {
        error_setg_errno(errp, -ret, "Failed to write the new reftable");
        goto done;
    }


    /* Empty the refcount cache */
    ret = qcow2_cache_flush(bs, s->refcount_block_cache);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Failed to flush the refblock cache");
        goto done;
    }

    /* Update the image header to point to the new reftable; this only updates
     * the fields which are relevant to qcow2_update_header(); other fields
     * such as s->refcount_table or s->refcount_bits stay stale for now
     * (because we have to restore everything if qcow2_update_header() fails) */
    old_refcount_order  = s->refcount_order;
    old_reftable_size   = s->refcount_table_size;
    old_reftable_offset = s->refcount_table_offset;

    s->refcount_order        = refcount_order;
    s->refcount_table_size   = new_reftable_size;
    s->refcount_table_offset = new_reftable_offset;

    ret = qcow2_update_header(bs);
    if (ret < 0) {
        /* Roll back the in-memory fields so they match the on-disk header */
        s->refcount_order        = old_refcount_order;
        s->refcount_table_size   = old_reftable_size;
        s->refcount_table_offset = old_reftable_offset;
        error_setg_errno(errp, -ret, "Failed to update the qcow2 header");
        goto done;
    }

    /* Now update the rest of the in-memory information */
    old_reftable = s->refcount_table;
    s->refcount_table = new_reftable;
    update_max_refcount_table_index(s);

    s->refcount_bits = 1 << refcount_order;
    /* Computed in two steps to avoid shifting by the full type width */
    s->refcount_max = UINT64_C(1) << (s->refcount_bits - 1);
    s->refcount_max += s->refcount_max - 1;

    s->refcount_block_bits = s->cluster_bits - (refcount_order - 3);
    s->refcount_block_size = 1 << s->refcount_block_bits;

    s->get_refcount = new_get_refcount;
    s->set_refcount = new_set_refcount;

    /* For cleaning up all old refblocks and the old reftable below the "done"
     * label */
    new_reftable        = old_reftable;
    new_reftable_size   = old_reftable_size;
    new_reftable_offset = old_reftable_offset;

done:
    if (new_reftable) {
        /* On success, new_reftable actually points to the old reftable (and
         * new_reftable_size is the old reftable's size); but that is just
         * fine */
        for (i = 0; i < new_reftable_size; i++) {
            uint64_t offset = new_reftable[i] & REFT_OFFSET_MASK;
            if (offset) {
                qcow2_free_clusters(bs, offset, s->cluster_size,
                                    QCOW2_DISCARD_OTHER);
            }
        }
        g_free(new_reftable);

        if (new_reftable_offset > 0) {
            qcow2_free_clusters(bs, new_reftable_offset,
                                new_reftable_size * REFTABLE_ENTRY_SIZE,
                                QCOW2_DISCARD_OTHER);
        }
    }

    qemu_vfree(new_refblock);
    return ret;
}
3566 
3567 static int64_t coroutine_fn GRAPH_RDLOCK
3568 get_refblock_offset(BlockDriverState *bs, uint64_t offset)
3569 {
3570     BDRVQcow2State *s = bs->opaque;
3571     uint32_t index = offset_to_reftable_index(s, offset);
3572     int64_t covering_refblock_offset = 0;
3573 
3574     if (index < s->refcount_table_size) {
3575         covering_refblock_offset = s->refcount_table[index] & REFT_OFFSET_MASK;
3576     }
3577     if (!covering_refblock_offset) {
3578         qcow2_signal_corruption(bs, true, -1, -1, "Refblock at %#" PRIx64 " is "
3579                                 "not covered by the refcount structures",
3580                                 offset);
3581         return -EIO;
3582     }
3583 
3584     return covering_refblock_offset;
3585 }
3586 
/**
 * Free the cluster at @discard_block_offs, which holds a refblock that is no
 * longer needed: set its refcount (which must currently be exactly 1, i.e.
 * only its own self-reference) to 0, evict any cached copy of the discarded
 * refblock, and queue a discard request for its cluster.
 *
 * Returns 0 on success, -errno on error.
 */
static int coroutine_fn GRAPH_RDLOCK
qcow2_discard_refcount_block(BlockDriverState *bs, uint64_t discard_block_offs)
{
    BDRVQcow2State *s = bs->opaque;
    int64_t refblock_offs;
    uint64_t cluster_index = discard_block_offs >> s->cluster_bits;
    /* Index of the discarded cluster's entry within its covering refblock */
    uint32_t block_index = cluster_index & (s->refcount_block_size - 1);
    void *refblock;
    int ret;

    /* Find the refblock that holds the refcount for the discarded cluster */
    refblock_offs = get_refblock_offset(bs, discard_block_offs);
    if (refblock_offs < 0) {
        return refblock_offs;
    }

    assert(discard_block_offs != 0);

    ret = qcow2_cache_get(bs, s->refcount_block_cache, refblock_offs,
                          &refblock);
    if (ret < 0) {
        return ret;
    }

    if (s->get_refcount(refblock, block_index) != 1) {
        /* A refblock being discarded must only be referenced by itself */
        qcow2_signal_corruption(bs, true, -1, -1, "Invalid refcount:"
                                " refblock offset %#" PRIx64
                                ", reftable index %u"
                                ", block offset %#" PRIx64
                                ", refcount %#" PRIx64,
                                refblock_offs,
                                offset_to_reftable_index(s, discard_block_offs),
                                discard_block_offs,
                                s->get_refcount(refblock, block_index));
        qcow2_cache_put(s->refcount_block_cache, &refblock);
        return -EINVAL;
    }
    s->set_refcount(refblock, block_index, 0);

    qcow2_cache_entry_mark_dirty(s->refcount_block_cache, refblock);

    qcow2_cache_put(s->refcount_block_cache, &refblock);

    /* The freed cluster may now be the lowest free one */
    if (cluster_index < s->free_cluster_index) {
        s->free_cluster_index = cluster_index;
    }

    refblock = qcow2_cache_is_table_offset(s->refcount_block_cache,
                                           discard_block_offs);
    if (refblock) {
        /* discard refblock from the cache if refblock is cached */
        qcow2_cache_discard(s->refcount_block_cache, refblock);
    }
    queue_discard(bs, discard_block_offs, s->cluster_size);

    return 0;
}
3643 
/**
 * Remove unused refblocks from the reftable: build a temporary reftable in
 * which every entry whose refblock holds no references (disregarding the
 * refblock's own self-reference) is zeroed, write it to disk, and then
 * discard those refblocks and clear the corresponding in-memory entries.
 *
 * Returns 0 on success, -errno on error.
 */
int coroutine_fn qcow2_shrink_reftable(BlockDriverState *bs)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t *reftable_tmp =
        g_malloc(s->refcount_table_size * REFTABLE_ENTRY_SIZE);
    int i, ret;

    for (i = 0; i < s->refcount_table_size; i++) {
        int64_t refblock_offs = s->refcount_table[i] & REFT_OFFSET_MASK;
        void *refblock;
        bool unused_block;

        if (refblock_offs == 0) {
            reftable_tmp[i] = 0;
            continue;
        }
        ret = qcow2_cache_get(bs, s->refcount_block_cache, refblock_offs,
                              &refblock);
        if (ret < 0) {
            goto out;
        }

        /* the refblock has own reference */
        if (i == offset_to_reftable_index(s, refblock_offs)) {
            uint64_t block_index = (refblock_offs >> s->cluster_bits) &
                                   (s->refcount_block_size - 1);
            uint64_t refcount = s->get_refcount(refblock, block_index);

            /* Temporarily clear the self-reference so buffer_is_zero() tells
             * us whether anything *else* is referenced by this refblock */
            s->set_refcount(refblock, block_index, 0);

            unused_block = buffer_is_zero(refblock, s->cluster_size);

            /* Restore the self-reference */
            s->set_refcount(refblock, block_index, refcount);
        } else {
            unused_block = buffer_is_zero(refblock, s->cluster_size);
        }
        qcow2_cache_put(s->refcount_block_cache, &refblock);

        /* Keep the entry (in on-disk big-endian order) only if the refblock
         * is still in use */
        reftable_tmp[i] = unused_block ? 0 : cpu_to_be64(s->refcount_table[i]);
    }

    ret = bdrv_co_pwrite_sync(bs->file, s->refcount_table_offset,
                              s->refcount_table_size * REFTABLE_ENTRY_SIZE,
                              reftable_tmp, 0);
    /*
     * If the write in the reftable failed the image may contain a partially
     * overwritten reftable. In this case it would be better to clear the
     * reftable in memory to avoid possible image corruption.
     */
    for (i = 0; i < s->refcount_table_size; i++) {
        if (s->refcount_table[i] && !reftable_tmp[i]) {
            if (ret == 0) {
                /* Only free the refblock cluster itself if the reftable
                 * update made it to disk */
                ret = qcow2_discard_refcount_block(bs, s->refcount_table[i] &
                                                       REFT_OFFSET_MASK);
            }
            s->refcount_table[i] = 0;
        }
    }

    if (!s->cache_discards) {
        qcow2_process_discards(bs, ret);
    }

out:
    g_free(reftable_tmp);
    return ret;
}
3711 
3712 int64_t coroutine_fn qcow2_get_last_cluster(BlockDriverState *bs, int64_t size)
3713 {
3714     BDRVQcow2State *s = bs->opaque;
3715     int64_t i;
3716 
3717     for (i = size_to_clusters(s, size) - 1; i >= 0; i--) {
3718         uint64_t refcount;
3719         int ret = qcow2_get_refcount(bs, i, &refcount);
3720         if (ret < 0) {
3721             fprintf(stderr, "Can't get refcount for cluster %" PRId64 ": %s\n",
3722                     i, strerror(-ret));
3723             return ret;
3724         }
3725         if (refcount > 0) {
3726             return i;
3727         }
3728     }
3729     qcow2_signal_corruption(bs, true, -1, -1,
3730                             "There are no references in the refcount table.");
3731     return -EIO;
3732 }
3733 
3734 int coroutine_fn GRAPH_RDLOCK
3735 qcow2_detect_metadata_preallocation(BlockDriverState *bs)
3736 {
3737     BDRVQcow2State *s = bs->opaque;
3738     int64_t i, end_cluster, cluster_count = 0, threshold;
3739     int64_t file_length, real_allocation, real_clusters;
3740 
3741     qemu_co_mutex_assert_locked(&s->lock);
3742 
3743     file_length = bdrv_co_getlength(bs->file->bs);
3744     if (file_length < 0) {
3745         return file_length;
3746     }
3747 
3748     real_allocation = bdrv_co_get_allocated_file_size(bs->file->bs);
3749     if (real_allocation < 0) {
3750         return real_allocation;
3751     }
3752 
3753     real_clusters = real_allocation / s->cluster_size;
3754     threshold = MAX(real_clusters * 10 / 9, real_clusters + 2);
3755 
3756     end_cluster = size_to_clusters(s, file_length);
3757     for (i = 0; i < end_cluster && cluster_count < threshold; i++) {
3758         uint64_t refcount;
3759         int ret = qcow2_get_refcount(bs, i, &refcount);
3760         if (ret < 0) {
3761             return ret;
3762         }
3763         cluster_count += !!refcount;
3764     }
3765 
3766     return cluster_count >= threshold;
3767 }
3768