xref: /openbmc/qemu/block/qcow2-refcount.c (revision 41a1a9c4)
1 /*
2  * Block driver for the QCOW version 2 format
3  *
4  * Copyright (c) 2004-2006 Fabrice Bellard
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to deal
8  * in the Software without restriction, including without limitation the rights
9  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10  * copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in
14  * all copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22  * THE SOFTWARE.
23  */
24 
25 #include "qemu-common.h"
26 #include "block/block_int.h"
27 #include "block/qcow2.h"
28 #include "qemu/range.h"
29 #include "qapi/qmp/types.h"
30 #include "qapi-event.h"
31 
32 static int64_t alloc_clusters_noref(BlockDriverState *bs, uint64_t size);
33 static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
34                             int64_t offset, int64_t length,
35                             int addend, enum qcow2_discard_type type);
36 
37 
38 /*********************************************************/
39 /* refcount handling */
40 
41 int qcow2_refcount_init(BlockDriverState *bs)
42 {
43     BDRVQcowState *s = bs->opaque;
44     unsigned int refcount_table_size2, i;
45     int ret;
46 
47     assert(s->refcount_table_size <= INT_MAX / sizeof(uint64_t));
48     refcount_table_size2 = s->refcount_table_size * sizeof(uint64_t);
49     s->refcount_table = g_malloc(refcount_table_size2);
50     if (s->refcount_table_size > 0) {
51         BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_LOAD);
52         ret = bdrv_pread(bs->file, s->refcount_table_offset,
53                          s->refcount_table, refcount_table_size2);
54         if (ret != refcount_table_size2)
55             goto fail;
56         for(i = 0; i < s->refcount_table_size; i++)
57             be64_to_cpus(&s->refcount_table[i]);
58     }
59     return 0;
60  fail:
61     return -ENOMEM;
62 }
63 
64 void qcow2_refcount_close(BlockDriverState *bs)
65 {
66     BDRVQcowState *s = bs->opaque;
67     g_free(s->refcount_table);
68 }
69 
70 
71 static int load_refcount_block(BlockDriverState *bs,
72                                int64_t refcount_block_offset,
73                                void **refcount_block)
74 {
75     BDRVQcowState *s = bs->opaque;
76     int ret;
77 
78     BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_LOAD);
79     ret = qcow2_cache_get(bs, s->refcount_block_cache, refcount_block_offset,
80         refcount_block);
81 
82     return ret;
83 }
84 
85 /*
86  * Returns the refcount of the cluster given by its index. Any non-negative
87  * return value is the refcount of the cluster, negative values are -errno
88  * and indicate an error.
89  */
90 static int get_refcount(BlockDriverState *bs, int64_t cluster_index)
91 {
92     BDRVQcowState *s = bs->opaque;
93     uint64_t refcount_table_index, block_index;
94     int64_t refcount_block_offset;
95     int ret;
96     uint16_t *refcount_block;
97     uint16_t refcount;
98 
99     refcount_table_index = cluster_index >> (s->cluster_bits - REFCOUNT_SHIFT);
100     if (refcount_table_index >= s->refcount_table_size)
101         return 0;
102     refcount_block_offset =
103         s->refcount_table[refcount_table_index] & REFT_OFFSET_MASK;
104     if (!refcount_block_offset)
105         return 0;
106 
107     ret = qcow2_cache_get(bs, s->refcount_block_cache, refcount_block_offset,
108         (void**) &refcount_block);
109     if (ret < 0) {
110         return ret;
111     }
112 
113     block_index = cluster_index &
114         ((1 << (s->cluster_bits - REFCOUNT_SHIFT)) - 1);
115     refcount = be16_to_cpu(refcount_block[block_index]);
116 
117     ret = qcow2_cache_put(bs, s->refcount_block_cache,
118         (void**) &refcount_block);
119     if (ret < 0) {
120         return ret;
121     }
122 
123     return refcount;
124 }
125 
126 /*
127  * Rounds the refcount table size up to avoid growing the table for each single
128  * refcount block that is allocated.
129  */
130 static unsigned int next_refcount_table_size(BDRVQcowState *s,
131     unsigned int min_size)
132 {
133     unsigned int min_clusters = (min_size >> (s->cluster_bits - 3)) + 1;
134     unsigned int refcount_table_clusters =
135         MAX(1, s->refcount_table_size >> (s->cluster_bits - 3));
136 
137     while (min_clusters > refcount_table_clusters) {
138         refcount_table_clusters = (refcount_table_clusters * 3 + 1) / 2;
139     }
140 
141     return refcount_table_clusters << (s->cluster_bits - 3);
142 }
143 
144 
145 /* Checks if two offsets are described by the same refcount block */
146 static int in_same_refcount_block(BDRVQcowState *s, uint64_t offset_a,
147     uint64_t offset_b)
148 {
149     uint64_t block_a = offset_a >> (2 * s->cluster_bits - REFCOUNT_SHIFT);
150     uint64_t block_b = offset_b >> (2 * s->cluster_bits - REFCOUNT_SHIFT);
151 
152     return (block_a == block_b);
153 }
154 
155 /*
156  * Loads a refcount block. If it doesn't exist yet, it is allocated first
157  * (including growing the refcount table if needed).
158  *
159  * Returns 0 on success or -errno in error case
160  */
161 static int alloc_refcount_block(BlockDriverState *bs,
162     int64_t cluster_index, uint16_t **refcount_block)
163 {
164     BDRVQcowState *s = bs->opaque;
165     unsigned int refcount_table_index;
166     int ret;
167 
168     BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC);
169 
170     /* Find the refcount block for the given cluster */
171     refcount_table_index = cluster_index >> (s->cluster_bits - REFCOUNT_SHIFT);
172 
173     if (refcount_table_index < s->refcount_table_size) {
174 
175         uint64_t refcount_block_offset =
176             s->refcount_table[refcount_table_index] & REFT_OFFSET_MASK;
177 
178         /* If it's already there, we're done */
179         if (refcount_block_offset) {
180              return load_refcount_block(bs, refcount_block_offset,
181                  (void**) refcount_block);
182         }
183     }
184 
185     /*
186      * If we came here, we need to allocate something. Something is at least
187      * a cluster for the new refcount block. It may also include a new refcount
188      * table if the old refcount table is too small.
189      *
190      * Note that allocating clusters here needs some special care:
191      *
192      * - We can't use the normal qcow2_alloc_clusters(), it would try to
193      *   increase the refcount and very likely we would end up with an endless
194      *   recursion. Instead we must place the refcount blocks in a way that
195      *   they can describe them themselves.
196      *
197      * - We need to consider that at this point we are inside update_refcounts
198      *   and potentially doing an initial refcount increase. This means that
199      *   some clusters have already been allocated by the caller, but their
200      *   refcount isn't accurate yet. If we allocate clusters for metadata, we
201      *   need to return -EAGAIN to signal the caller that it needs to restart
202      *   the search for free clusters.
203      *
204      * - alloc_clusters_noref and qcow2_free_clusters may load a different
205      *   refcount block into the cache
206      */
207 
208     *refcount_block = NULL;
209 
210     /* We write to the refcount table, so we might depend on L2 tables */
211     ret = qcow2_cache_flush(bs, s->l2_table_cache);
212     if (ret < 0) {
213         return ret;
214     }
215 
216     /* Allocate the refcount block itself and mark it as used */
217     int64_t new_block = alloc_clusters_noref(bs, s->cluster_size);
218     if (new_block < 0) {
219         return new_block;
220     }
221 
222 #ifdef DEBUG_ALLOC2
223     fprintf(stderr, "qcow2: Allocate refcount block %d for %" PRIx64
224         " at %" PRIx64 "\n",
225         refcount_table_index, cluster_index << s->cluster_bits, new_block);
226 #endif
227 
228     if (in_same_refcount_block(s, new_block, cluster_index << s->cluster_bits)) {
229         /* Zero the new refcount block before updating it */
230         ret = qcow2_cache_get_empty(bs, s->refcount_block_cache, new_block,
231             (void**) refcount_block);
232         if (ret < 0) {
233             goto fail_block;
234         }
235 
236         memset(*refcount_block, 0, s->cluster_size);
237 
238         /* The block describes itself, need to update the cache */
239         int block_index = (new_block >> s->cluster_bits) &
240             ((1 << (s->cluster_bits - REFCOUNT_SHIFT)) - 1);
241         (*refcount_block)[block_index] = cpu_to_be16(1);
242     } else {
243         /* Described somewhere else. This can recurse at most twice before we
244          * arrive at a block that describes itself. */
245         ret = update_refcount(bs, new_block, s->cluster_size, 1,
246                               QCOW2_DISCARD_NEVER);
247         if (ret < 0) {
248             goto fail_block;
249         }
250 
251         ret = qcow2_cache_flush(bs, s->refcount_block_cache);
252         if (ret < 0) {
253             goto fail_block;
254         }
255 
256         /* Initialize the new refcount block only after updating its refcount,
257          * update_refcount uses the refcount cache itself */
258         ret = qcow2_cache_get_empty(bs, s->refcount_block_cache, new_block,
259             (void**) refcount_block);
260         if (ret < 0) {
261             goto fail_block;
262         }
263 
264         memset(*refcount_block, 0, s->cluster_size);
265     }
266 
267     /* Now the new refcount block needs to be written to disk */
268     BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE);
269     qcow2_cache_entry_mark_dirty(s->refcount_block_cache, *refcount_block);
270     ret = qcow2_cache_flush(bs, s->refcount_block_cache);
271     if (ret < 0) {
272         goto fail_block;
273     }
274 
275     /* If the refcount table is big enough, just hook the block up there */
276     if (refcount_table_index < s->refcount_table_size) {
277         uint64_t data64 = cpu_to_be64(new_block);
278         BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_HOOKUP);
279         ret = bdrv_pwrite_sync(bs->file,
280             s->refcount_table_offset + refcount_table_index * sizeof(uint64_t),
281             &data64, sizeof(data64));
282         if (ret < 0) {
283             goto fail_block;
284         }
285 
286         s->refcount_table[refcount_table_index] = new_block;
287 
288         /* The new refcount block may be where the caller intended to put its
289          * data, so let it restart the search. */
290         return -EAGAIN;
291     }
292 
293     ret = qcow2_cache_put(bs, s->refcount_block_cache, (void**) refcount_block);
294     if (ret < 0) {
295         goto fail_block;
296     }
297 
298     /*
299      * If we come here, we need to grow the refcount table. Again, a new
300      * refcount table needs some space and we can't simply allocate to avoid
301      * endless recursion.
302      *
303      * Therefore let's grab new refcount blocks at the end of the image, which
304      * will describe themselves and the new refcount table. This way we can
305      * reference them only in the new table and do the switch to the new
306      * refcount table at once without producing an inconsistent state in
307      * between.
308      */
309     BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_GROW);
310 
311     /* Calculate the number of refcount blocks needed so far */
312     uint64_t refcount_block_clusters = 1 << (s->cluster_bits - REFCOUNT_SHIFT);
313     uint64_t blocks_used = DIV_ROUND_UP(cluster_index, refcount_block_clusters);
314 
315     if (blocks_used > QCOW_MAX_REFTABLE_SIZE / sizeof(uint64_t)) {
316         return -EFBIG;
317     }
318 
319     /* And now we need at least one block more for the new metadata */
320     uint64_t table_size = next_refcount_table_size(s, blocks_used + 1);
321     uint64_t last_table_size;
322     uint64_t blocks_clusters;
323     do {
324         uint64_t table_clusters =
325             size_to_clusters(s, table_size * sizeof(uint64_t));
326         blocks_clusters = 1 +
327             ((table_clusters + refcount_block_clusters - 1)
328             / refcount_block_clusters);
329         uint64_t meta_clusters = table_clusters + blocks_clusters;
330 
331         last_table_size = table_size;
332         table_size = next_refcount_table_size(s, blocks_used +
333             ((meta_clusters + refcount_block_clusters - 1)
334             / refcount_block_clusters));
335 
336     } while (last_table_size != table_size);
337 
338 #ifdef DEBUG_ALLOC2
339     fprintf(stderr, "qcow2: Grow refcount table %" PRId32 " => %" PRId64 "\n",
340         s->refcount_table_size, table_size);
341 #endif
342 
343     /* Create the new refcount table and blocks */
344     uint64_t meta_offset = (blocks_used * refcount_block_clusters) *
345         s->cluster_size;
346     uint64_t table_offset = meta_offset + blocks_clusters * s->cluster_size;
347     uint16_t *new_blocks = g_malloc0(blocks_clusters * s->cluster_size);
348     uint64_t *new_table = g_malloc0(table_size * sizeof(uint64_t));
349 
350     /* Fill the new refcount table */
351     memcpy(new_table, s->refcount_table,
352         s->refcount_table_size * sizeof(uint64_t));
353     new_table[refcount_table_index] = new_block;
354 
355     int i;
356     for (i = 0; i < blocks_clusters; i++) {
357         new_table[blocks_used + i] = meta_offset + (i * s->cluster_size);
358     }
359 
360     /* Fill the refcount blocks */
361     uint64_t table_clusters = size_to_clusters(s, table_size * sizeof(uint64_t));
362     int block = 0;
363     for (i = 0; i < table_clusters + blocks_clusters; i++) {
364         new_blocks[block++] = cpu_to_be16(1);
365     }
366 
367     /* Write refcount blocks to disk */
368     BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE_BLOCKS);
369     ret = bdrv_pwrite_sync(bs->file, meta_offset, new_blocks,
370         blocks_clusters * s->cluster_size);
371     g_free(new_blocks);
372     if (ret < 0) {
373         goto fail_table;
374     }
375 
376     /* Write refcount table to disk */
377     for(i = 0; i < table_size; i++) {
378         cpu_to_be64s(&new_table[i]);
379     }
380 
381     BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE_TABLE);
382     ret = bdrv_pwrite_sync(bs->file, table_offset, new_table,
383         table_size * sizeof(uint64_t));
384     if (ret < 0) {
385         goto fail_table;
386     }
387 
388     for(i = 0; i < table_size; i++) {
389         be64_to_cpus(&new_table[i]);
390     }
391 
392     /* Hook up the new refcount table in the qcow2 header */
393     uint8_t data[12];
394     cpu_to_be64w((uint64_t*)data, table_offset);
395     cpu_to_be32w((uint32_t*)(data + 8), table_clusters);
396     BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_SWITCH_TABLE);
397     ret = bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, refcount_table_offset),
398         data, sizeof(data));
399     if (ret < 0) {
400         goto fail_table;
401     }
402 
403     /* And switch it in memory */
404     uint64_t old_table_offset = s->refcount_table_offset;
405     uint64_t old_table_size = s->refcount_table_size;
406 
407     g_free(s->refcount_table);
408     s->refcount_table = new_table;
409     s->refcount_table_size = table_size;
410     s->refcount_table_offset = table_offset;
411 
412     /* Free old table. */
413     qcow2_free_clusters(bs, old_table_offset, old_table_size * sizeof(uint64_t),
414                         QCOW2_DISCARD_OTHER);
415 
416     ret = load_refcount_block(bs, new_block, (void**) refcount_block);
417     if (ret < 0) {
418         return ret;
419     }
420 
421     /* If we were trying to do the initial refcount update for some cluster
422      * allocation, we might have used the same clusters to store newly
423      * allocated metadata. Make the caller search some new space. */
424     return -EAGAIN;
425 
426 fail_table:
427     g_free(new_table);
428 fail_block:
429     if (*refcount_block != NULL) {
430         qcow2_cache_put(bs, s->refcount_block_cache, (void**) refcount_block);
431     }
432     return ret;
433 }
434 
435 void qcow2_process_discards(BlockDriverState *bs, int ret)
436 {
437     BDRVQcowState *s = bs->opaque;
438     Qcow2DiscardRegion *d, *next;
439 
440     QTAILQ_FOREACH_SAFE(d, &s->discards, next, next) {
441         QTAILQ_REMOVE(&s->discards, d, next);
442 
443         /* Discard is optional, ignore the return value */
444         if (ret >= 0) {
445             bdrv_discard(bs->file,
446                          d->offset >> BDRV_SECTOR_BITS,
447                          d->bytes >> BDRV_SECTOR_BITS);
448         }
449 
450         g_free(d);
451     }
452 }
453 
/*
 * Queues the byte range [offset, offset + length) for a later discard
 * (issued by qcow2_process_discards). The range is merged into an existing
 * queued region when they touch or overlap-by-adjacency, and afterwards any
 * regions that have become adjacent are coalesced into one entry.
 */
static void update_refcount_discard(BlockDriverState *bs,
                                    uint64_t offset, uint64_t length)
{
    BDRVQcowState *s = bs->opaque;
    Qcow2DiscardRegion *d, *p, *next;

    QTAILQ_FOREACH(d, &s->discards, next) {
        uint64_t new_start = MIN(offset, d->offset);
        uint64_t new_end = MAX(offset + length, d->offset + d->bytes);

        /* The combined span is no larger than the two lengths added, so the
         * ranges are adjacent (or would overlap, which cannot happen). */
        if (new_end - new_start <= length + d->bytes) {
            /* There can't be any overlap, areas ending up here have no
             * references any more and therefore shouldn't get freed another
             * time. */
            assert(d->bytes + length == new_end - new_start);
            d->offset = new_start;
            d->bytes = new_end - new_start;
            goto found;
        }
    }

    /* No adjacent region: queue a fresh entry */
    d = g_malloc(sizeof(*d));
    *d = (Qcow2DiscardRegion) {
        .bs     = bs,
        .offset = offset,
        .bytes  = length,
    };
    QTAILQ_INSERT_TAIL(&s->discards, d, next);

found:
    /* Merge discard requests if they are adjacent now */
    QTAILQ_FOREACH_SAFE(p, &s->discards, next, next) {
        if (p == d
            || p->offset > d->offset + d->bytes
            || d->offset > p->offset + p->bytes)
        {
            continue;
        }

        /* Still no overlap possible */
        assert(p->offset == d->offset + d->bytes
            || d->offset == p->offset + p->bytes);

        QTAILQ_REMOVE(&s->discards, p, next);
        d->offset = MIN(d->offset, p->offset);
        d->bytes += p->bytes;
    }
}
502 
/* XXX: cache several refcount block clusters ? */
/*
 * Adds @addend to the refcount of every cluster intersecting the byte range
 * [@offset, @offset + @length). Returns 0 on success, -errno on error
 * (including -EAGAIN propagated from alloc_refcount_block after a metadata
 * allocation); on error, a best-effort rollback of the updates already
 * applied is attempted. Clusters whose refcount drops to 0 are queued for
 * discard when the discard policy for @type allows it.
 */
static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
    int64_t offset, int64_t length, int addend, enum qcow2_discard_type type)
{
    BDRVQcowState *s = bs->opaque;
    int64_t start, last, cluster_offset;
    uint16_t *refcount_block = NULL;
    int64_t old_table_index = -1;
    int ret;

#ifdef DEBUG_ALLOC2
    fprintf(stderr, "update_refcount: offset=%" PRId64 " size=%" PRId64 " addend=%d\n",
           offset, length, addend);
#endif
    if (length < 0) {
        return -EINVAL;
    } else if (length == 0) {
        return 0;
    }

    /* When decreasing refcounts, L2 tables must hit the disk before the
     * refcount update, or clusters could be freed while still referenced. */
    if (addend < 0) {
        qcow2_cache_set_dependency(bs, s->refcount_block_cache,
            s->l2_table_cache);
    }

    start = start_of_cluster(s, offset);
    last = start_of_cluster(s, offset + length - 1);
    for(cluster_offset = start; cluster_offset <= last;
        cluster_offset += s->cluster_size)
    {
        int block_index, refcount;
        int64_t cluster_index = cluster_offset >> s->cluster_bits;
        int64_t table_index =
            cluster_index >> (s->cluster_bits - REFCOUNT_SHIFT);

        /* Load the refcount block and allocate it if needed */
        if (table_index != old_table_index) {
            /* Release the previously held block before fetching the next */
            if (refcount_block) {
                ret = qcow2_cache_put(bs, s->refcount_block_cache,
                    (void**) &refcount_block);
                if (ret < 0) {
                    goto fail;
                }
            }

            ret = alloc_refcount_block(bs, cluster_index, &refcount_block);
            if (ret < 0) {
                goto fail;
            }
        }
        old_table_index = table_index;

        qcow2_cache_entry_mark_dirty(s->refcount_block_cache, refcount_block);

        /* we can update the count and save it */
        block_index = cluster_index &
            ((1 << (s->cluster_bits - REFCOUNT_SHIFT)) - 1);

        refcount = be16_to_cpu(refcount_block[block_index]);
        refcount += addend;
        /* Refcounts are 16 bit on disk; reject underflow and overflow */
        if (refcount < 0 || refcount > 0xffff) {
            ret = -EINVAL;
            goto fail;
        }
        /* A freed cluster may become the new allocation scan start point */
        if (refcount == 0 && cluster_index < s->free_cluster_index) {
            s->free_cluster_index = cluster_index;
        }
        refcount_block[block_index] = cpu_to_be16(refcount);

        if (refcount == 0 && s->discard_passthrough[type]) {
            update_refcount_discard(bs, cluster_offset, s->cluster_size);
        }
    }

    ret = 0;
fail:
    if (!s->cache_discards) {
        qcow2_process_discards(bs, ret);
    }

    /* Write last changed block to disk */
    if (refcount_block) {
        int wret;
        wret = qcow2_cache_put(bs, s->refcount_block_cache,
            (void**) &refcount_block);
        if (wret < 0) {
            /* Keep the original error if there was one */
            return ret < 0 ? ret : wret;
        }
    }

    /*
     * Try to undo any updates if an error is returned (This may succeed in
     * some cases like ENOSPC for allocating a new refcount block)
     */
    if (ret < 0) {
        int dummy;
        dummy = update_refcount(bs, offset, cluster_offset - offset, -addend,
                                QCOW2_DISCARD_NEVER);
        (void)dummy;
    }

    return ret;
}
606 
607 /*
608  * Increases or decreases the refcount of a given cluster by one.
609  * addend must be 1 or -1.
610  *
611  * If the return value is non-negative, it is the new refcount of the cluster.
612  * If it is negative, it is -errno and indicates an error.
613  */
614 int qcow2_update_cluster_refcount(BlockDriverState *bs,
615                                   int64_t cluster_index,
616                                   int addend,
617                                   enum qcow2_discard_type type)
618 {
619     BDRVQcowState *s = bs->opaque;
620     int ret;
621 
622     ret = update_refcount(bs, cluster_index << s->cluster_bits, 1, addend,
623                           type);
624     if (ret < 0) {
625         return ret;
626     }
627 
628     return get_refcount(bs, cluster_index);
629 }
630 
631 
632 
633 /*********************************************************/
634 /* cluster allocation functions */
635 
636 
637 
638 /* return < 0 if error */
639 static int64_t alloc_clusters_noref(BlockDriverState *bs, uint64_t size)
640 {
641     BDRVQcowState *s = bs->opaque;
642     uint64_t i, nb_clusters;
643     int refcount;
644 
645     nb_clusters = size_to_clusters(s, size);
646 retry:
647     for(i = 0; i < nb_clusters; i++) {
648         uint64_t next_cluster_index = s->free_cluster_index++;
649         refcount = get_refcount(bs, next_cluster_index);
650 
651         if (refcount < 0) {
652             return refcount;
653         } else if (refcount != 0) {
654             goto retry;
655         }
656     }
657 
658     /* Make sure that all offsets in the "allocated" range are representable
659      * in an int64_t */
660     if (s->free_cluster_index > 0 &&
661         s->free_cluster_index - 1 > (INT64_MAX >> s->cluster_bits))
662     {
663         return -EFBIG;
664     }
665 
666 #ifdef DEBUG_ALLOC2
667     fprintf(stderr, "alloc_clusters: size=%" PRId64 " -> %" PRId64 "\n",
668             size,
669             (s->free_cluster_index - nb_clusters) << s->cluster_bits);
670 #endif
671     return (s->free_cluster_index - nb_clusters) << s->cluster_bits;
672 }
673 
674 int64_t qcow2_alloc_clusters(BlockDriverState *bs, uint64_t size)
675 {
676     int64_t offset;
677     int ret;
678 
679     BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC);
680     do {
681         offset = alloc_clusters_noref(bs, size);
682         if (offset < 0) {
683             return offset;
684         }
685 
686         ret = update_refcount(bs, offset, size, 1, QCOW2_DISCARD_NEVER);
687     } while (ret == -EAGAIN);
688 
689     if (ret < 0) {
690         return ret;
691     }
692 
693     return offset;
694 }
695 
696 int qcow2_alloc_clusters_at(BlockDriverState *bs, uint64_t offset,
697     int nb_clusters)
698 {
699     BDRVQcowState *s = bs->opaque;
700     uint64_t cluster_index;
701     uint64_t i;
702     int refcount, ret;
703 
704     assert(nb_clusters >= 0);
705     if (nb_clusters == 0) {
706         return 0;
707     }
708 
709     do {
710         /* Check how many clusters there are free */
711         cluster_index = offset >> s->cluster_bits;
712         for(i = 0; i < nb_clusters; i++) {
713             refcount = get_refcount(bs, cluster_index++);
714 
715             if (refcount < 0) {
716                 return refcount;
717             } else if (refcount != 0) {
718                 break;
719             }
720         }
721 
722         /* And then allocate them */
723         ret = update_refcount(bs, offset, i << s->cluster_bits, 1,
724                               QCOW2_DISCARD_NEVER);
725     } while (ret == -EAGAIN);
726 
727     if (ret < 0) {
728         return ret;
729     }
730 
731     return i;
732 }
733 
/* only used to allocate compressed sectors. We try to allocate
   contiguous sectors. size must be <= cluster_size */
/*
 * Sub-cluster byte allocator: returns the file offset of a @size-byte area,
 * or -errno. State is kept in s->free_byte_offset, the next free byte inside
 * the current partially-used cluster (0 means no such cluster).
 *
 * Refcount accounting: a freshly allocated cluster already has refcount 1
 * from qcow2_alloc_clusters(); every further sub-allocation that starts
 * mid-cluster bumps the cluster's refcount once more via
 * qcow2_update_cluster_refcount().
 */
int64_t qcow2_alloc_bytes(BlockDriverState *bs, int size)
{
    BDRVQcowState *s = bs->opaque;
    int64_t offset, cluster_offset;
    int free_in_cluster;

    BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC_BYTES);
    assert(size > 0 && size <= s->cluster_size);
    if (s->free_byte_offset == 0) {
        /* No partially-used cluster yet: start a new one */
        offset = qcow2_alloc_clusters(bs, s->cluster_size);
        if (offset < 0) {
            return offset;
        }
        s->free_byte_offset = offset;
    }
 redo:
    free_in_cluster = s->cluster_size -
        offset_into_cluster(s, s->free_byte_offset);
    if (size <= free_in_cluster) {
        /* enough space in current cluster */
        offset = s->free_byte_offset;
        s->free_byte_offset += size;
        free_in_cluster -= size;
        if (free_in_cluster == 0)
            s->free_byte_offset = 0;
        /* Mid-cluster allocation: add one more reference to the cluster */
        if (offset_into_cluster(s, offset) != 0)
            qcow2_update_cluster_refcount(bs, offset >> s->cluster_bits, 1,
                                          QCOW2_DISCARD_NEVER);
    } else {
        offset = qcow2_alloc_clusters(bs, s->cluster_size);
        if (offset < 0) {
            return offset;
        }
        cluster_offset = start_of_cluster(s, s->free_byte_offset);
        if ((cluster_offset + s->cluster_size) == offset) {
            /* we are lucky: contiguous data */
            offset = s->free_byte_offset;
            qcow2_update_cluster_refcount(bs, offset >> s->cluster_bits, 1,
                                          QCOW2_DISCARD_NEVER);
            s->free_byte_offset += size;
        } else {
            /* Not contiguous: abandon the tail of the old cluster and carve
             * the allocation out of the new one instead */
            s->free_byte_offset = offset;
            goto redo;
        }
    }

    /* The cluster refcount was incremented, either by qcow2_alloc_clusters()
     * or explicitly by qcow2_update_cluster_refcount().  Refcount blocks must
     * be flushed before the caller's L2 table updates.
     */
    qcow2_cache_set_dependency(bs, s->l2_table_cache, s->refcount_block_cache);
    return offset;
}
789 
790 void qcow2_free_clusters(BlockDriverState *bs,
791                           int64_t offset, int64_t size,
792                           enum qcow2_discard_type type)
793 {
794     int ret;
795 
796     BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_FREE);
797     ret = update_refcount(bs, offset, size, -1, type);
798     if (ret < 0) {
799         fprintf(stderr, "qcow2_free_clusters failed: %s\n", strerror(-ret));
800         /* TODO Remember the clusters to free them later and avoid leaking */
801     }
802 }
803 
804 /*
805  * Free a cluster using its L2 entry (handles clusters of all types, e.g.
806  * normal cluster, compressed cluster, etc.)
807  */
808 void qcow2_free_any_clusters(BlockDriverState *bs, uint64_t l2_entry,
809                              int nb_clusters, enum qcow2_discard_type type)
810 {
811     BDRVQcowState *s = bs->opaque;
812 
813     switch (qcow2_get_cluster_type(l2_entry)) {
814     case QCOW2_CLUSTER_COMPRESSED:
815         {
816             int nb_csectors;
817             nb_csectors = ((l2_entry >> s->csize_shift) &
818                            s->csize_mask) + 1;
819             qcow2_free_clusters(bs,
820                 (l2_entry & s->cluster_offset_mask) & ~511,
821                 nb_csectors * 512, type);
822         }
823         break;
824     case QCOW2_CLUSTER_NORMAL:
825     case QCOW2_CLUSTER_ZERO:
826         if (l2_entry & L2E_OFFSET_MASK) {
827             qcow2_free_clusters(bs, l2_entry & L2E_OFFSET_MASK,
828                                 nb_clusters << s->cluster_bits, type);
829         }
830         break;
831     case QCOW2_CLUSTER_UNALLOCATED:
832         break;
833     default:
834         abort();
835     }
836 }
837 
838 
839 
840 /*********************************************************/
841 /* snapshots and image creation */
842 
843 
844 
/*
 * update the refcounts of snapshots and the copied flag
 *
 * Walks the L1 table at l1_table_offset (l1_size entries) and applies
 * 'addend' (typically +1, -1 or 0) to the refcount of every cluster it
 * references, including the L2 tables themselves.  While doing so, the
 * QCOW_OFLAG_COPIED flag of each L1/L2 entry is recomputed: it is set
 * exactly when the refcount of the referenced cluster is 1.
 *
 * With addend == 0 only the COPIED flags are refreshed.
 *
 * Returns 0 on success, -errno on failure.
 */
int qcow2_update_snapshot_refcount(BlockDriverState *bs,
    int64_t l1_table_offset, int l1_size, int addend)
{
    BDRVQcowState *s = bs->opaque;
    uint64_t *l1_table, *l2_table, l2_offset, offset, l1_size2, l1_allocated;
    int64_t old_offset, old_l2_offset;
    int i, j, l1_modified = 0, nb_csectors, refcount;
    int ret;

    l2_table = NULL;
    l1_table = NULL;
    l1_size2 = l1_size * sizeof(uint64_t);

    /* Batch up discards generated by the refcount updates; they are
     * processed in one go before returning (see below). */
    s->cache_discards = true;

    /* WARNING: qcow2_snapshot_goto relies on this function not using the
     * l1_table_offset when it is the current s->l1_table_offset! Be careful
     * when changing this! */
    if (l1_table_offset != s->l1_table_offset) {
        /* Inactive L1 table: read a private, byte-swapped copy from disk. */
        l1_table = g_malloc0(align_offset(l1_size2, 512));
        l1_allocated = 1;

        ret = bdrv_pread(bs->file, l1_table_offset, l1_table, l1_size2);
        if (ret < 0) {
            goto fail;
        }

        for(i = 0;i < l1_size; i++)
            be64_to_cpus(&l1_table[i]);
    } else {
        /* Active L1 table: work on the in-memory copy directly. */
        assert(l1_size == s->l1_size);
        l1_table = s->l1_table;
        l1_allocated = 0;
    }

    for(i = 0; i < l1_size; i++) {
        l2_offset = l1_table[i];
        if (l2_offset) {
            old_l2_offset = l2_offset;
            l2_offset &= L1E_OFFSET_MASK;

            ret = qcow2_cache_get(bs, s->l2_table_cache, l2_offset,
                (void**) &l2_table);
            if (ret < 0) {
                goto fail;
            }

            for(j = 0; j < s->l2_size; j++) {
                uint64_t cluster_index;

                offset = be64_to_cpu(l2_table[j]);
                /* Strip COPIED so the flag can be recomputed from the new
                 * refcount further down. */
                old_offset = offset;
                offset &= ~QCOW_OFLAG_COPIED;

                switch (qcow2_get_cluster_type(offset)) {
                    case QCOW2_CLUSTER_COMPRESSED:
                        nb_csectors = ((offset >> s->csize_shift) &
                                       s->csize_mask) + 1;
                        if (addend != 0) {
                            ret = update_refcount(bs,
                                (offset & s->cluster_offset_mask) & ~511,
                                nb_csectors * 512, addend,
                                QCOW2_DISCARD_SNAPSHOT);
                            if (ret < 0) {
                                goto fail;
                            }
                        }
                        /* compressed clusters are never modified */
                        refcount = 2;
                        break;

                    case QCOW2_CLUSTER_NORMAL:
                    case QCOW2_CLUSTER_ZERO:
                        cluster_index = (offset & L2E_OFFSET_MASK) >> s->cluster_bits;
                        if (!cluster_index) {
                            /* unallocated */
                            refcount = 0;
                            break;
                        }
                        if (addend != 0) {
                            refcount = qcow2_update_cluster_refcount(bs,
                                    cluster_index, addend,
                                    QCOW2_DISCARD_SNAPSHOT);
                        } else {
                            refcount = get_refcount(bs, cluster_index);
                        }

                        if (refcount < 0) {
                            ret = refcount;
                            goto fail;
                        }
                        break;

                    case QCOW2_CLUSTER_UNALLOCATED:
                        refcount = 0;
                        break;

                    default:
                        abort();
                }

                /* COPIED means "refcount is exactly 1". */
                if (refcount == 1) {
                    offset |= QCOW_OFLAG_COPIED;
                }
                if (offset != old_offset) {
                    if (addend > 0) {
                        /* Make sure refcount blocks hit the disk before the
                         * L2 entry that references the new count. */
                        qcow2_cache_set_dependency(bs, s->l2_table_cache,
                            s->refcount_block_cache);
                    }
                    l2_table[j] = cpu_to_be64(offset);
                    qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table);
                }
            }

            ret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
            if (ret < 0) {
                goto fail;
            }


            /* Now account for the L2 table cluster itself and refresh the
             * COPIED flag of its L1 entry. */
            if (addend != 0) {
                refcount = qcow2_update_cluster_refcount(bs, l2_offset >>
                        s->cluster_bits, addend, QCOW2_DISCARD_SNAPSHOT);
            } else {
                refcount = get_refcount(bs, l2_offset >> s->cluster_bits);
            }
            if (refcount < 0) {
                ret = refcount;
                goto fail;
            } else if (refcount == 1) {
                l2_offset |= QCOW_OFLAG_COPIED;
            }
            if (l2_offset != old_l2_offset) {
                l1_table[i] = l2_offset;
                l1_modified = 1;
            }
        }
    }

    ret = bdrv_flush(bs);
fail:
    if (l2_table) {
        qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
    }

    /* Flush the batched discards collected above. */
    s->cache_discards = false;
    qcow2_process_discards(bs, ret);

    /* Update L1 only if it isn't deleted anyway (addend = -1) */
    if (ret == 0 && addend >= 0 && l1_modified) {
        /* Write the table back in big-endian, then restore native order
         * (it may be the live s->l1_table). */
        for (i = 0; i < l1_size; i++) {
            cpu_to_be64s(&l1_table[i]);
        }

        ret = bdrv_pwrite_sync(bs->file, l1_table_offset, l1_table, l1_size2);

        for (i = 0; i < l1_size; i++) {
            be64_to_cpus(&l1_table[i]);
        }
    }
    if (l1_allocated)
        g_free(l1_table);
    return ret;
}
1010 
1011 
1012 
1013 
1014 /*********************************************************/
1015 /* refcount checking functions */
1016 
1017 
1018 
1019 /*
1020  * Increases the refcount for a range of clusters in a given refcount table.
1021  * This is used to construct a temporary refcount table out of L1 and L2 tables
1022  * which can be compared the the refcount table saved in the image.
1023  *
1024  * Modifies the number of errors in res.
1025  */
1026 static void inc_refcounts(BlockDriverState *bs,
1027                           BdrvCheckResult *res,
1028                           uint16_t *refcount_table,
1029                           int refcount_table_size,
1030                           int64_t offset, int64_t size)
1031 {
1032     BDRVQcowState *s = bs->opaque;
1033     uint64_t start, last, cluster_offset, k;
1034 
1035     if (size <= 0)
1036         return;
1037 
1038     start = start_of_cluster(s, offset);
1039     last = start_of_cluster(s, offset + size - 1);
1040     for(cluster_offset = start; cluster_offset <= last;
1041         cluster_offset += s->cluster_size) {
1042         k = cluster_offset >> s->cluster_bits;
1043         if (k >= refcount_table_size) {
1044             fprintf(stderr, "Warning: cluster offset=0x%" PRIx64 " is after "
1045                 "the end of the image file, can't properly check refcounts.\n",
1046                 cluster_offset);
1047             res->check_errors++;
1048         } else {
1049             if (++refcount_table[k] == 0) {
1050                 fprintf(stderr, "ERROR: overflow cluster offset=0x%" PRIx64
1051                     "\n", cluster_offset);
1052                 res->corruptions++;
1053             }
1054         }
1055     }
1056 }
1057 
/* Flags for check_refcounts_l1() and check_refcounts_l2() */
enum {
    /* While counting references, also accumulate allocation/fragmentation
     * statistics into res->bfi. */
    CHECK_FRAG_INFO = 0x2,      /* update BlockFragInfo counters */
};
1062 
1063 /*
1064  * Increases the refcount in the given refcount table for the all clusters
1065  * referenced in the L2 table. While doing so, performs some checks on L2
1066  * entries.
1067  *
1068  * Returns the number of errors found by the checks or -errno if an internal
1069  * error occurred.
1070  */
1071 static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res,
1072     uint16_t *refcount_table, int refcount_table_size, int64_t l2_offset,
1073     int flags)
1074 {
1075     BDRVQcowState *s = bs->opaque;
1076     uint64_t *l2_table, l2_entry;
1077     uint64_t next_contiguous_offset = 0;
1078     int i, l2_size, nb_csectors;
1079 
1080     /* Read L2 table from disk */
1081     l2_size = s->l2_size * sizeof(uint64_t);
1082     l2_table = g_malloc(l2_size);
1083 
1084     if (bdrv_pread(bs->file, l2_offset, l2_table, l2_size) != l2_size)
1085         goto fail;
1086 
1087     /* Do the actual checks */
1088     for(i = 0; i < s->l2_size; i++) {
1089         l2_entry = be64_to_cpu(l2_table[i]);
1090 
1091         switch (qcow2_get_cluster_type(l2_entry)) {
1092         case QCOW2_CLUSTER_COMPRESSED:
1093             /* Compressed clusters don't have QCOW_OFLAG_COPIED */
1094             if (l2_entry & QCOW_OFLAG_COPIED) {
1095                 fprintf(stderr, "ERROR: cluster %" PRId64 ": "
1096                     "copied flag must never be set for compressed "
1097                     "clusters\n", l2_entry >> s->cluster_bits);
1098                 l2_entry &= ~QCOW_OFLAG_COPIED;
1099                 res->corruptions++;
1100             }
1101 
1102             /* Mark cluster as used */
1103             nb_csectors = ((l2_entry >> s->csize_shift) &
1104                            s->csize_mask) + 1;
1105             l2_entry &= s->cluster_offset_mask;
1106             inc_refcounts(bs, res, refcount_table, refcount_table_size,
1107                 l2_entry & ~511, nb_csectors * 512);
1108 
1109             if (flags & CHECK_FRAG_INFO) {
1110                 res->bfi.allocated_clusters++;
1111                 res->bfi.compressed_clusters++;
1112 
1113                 /* Compressed clusters are fragmented by nature.  Since they
1114                  * take up sub-sector space but we only have sector granularity
1115                  * I/O we need to re-read the same sectors even for adjacent
1116                  * compressed clusters.
1117                  */
1118                 res->bfi.fragmented_clusters++;
1119             }
1120             break;
1121 
1122         case QCOW2_CLUSTER_ZERO:
1123             if ((l2_entry & L2E_OFFSET_MASK) == 0) {
1124                 break;
1125             }
1126             /* fall through */
1127 
1128         case QCOW2_CLUSTER_NORMAL:
1129         {
1130             uint64_t offset = l2_entry & L2E_OFFSET_MASK;
1131 
1132             if (flags & CHECK_FRAG_INFO) {
1133                 res->bfi.allocated_clusters++;
1134                 if (next_contiguous_offset &&
1135                     offset != next_contiguous_offset) {
1136                     res->bfi.fragmented_clusters++;
1137                 }
1138                 next_contiguous_offset = offset + s->cluster_size;
1139             }
1140 
1141             /* Mark cluster as used */
1142             inc_refcounts(bs, res, refcount_table,refcount_table_size,
1143                 offset, s->cluster_size);
1144 
1145             /* Correct offsets are cluster aligned */
1146             if (offset_into_cluster(s, offset)) {
1147                 fprintf(stderr, "ERROR offset=%" PRIx64 ": Cluster is not "
1148                     "properly aligned; L2 entry corrupted.\n", offset);
1149                 res->corruptions++;
1150             }
1151             break;
1152         }
1153 
1154         case QCOW2_CLUSTER_UNALLOCATED:
1155             break;
1156 
1157         default:
1158             abort();
1159         }
1160     }
1161 
1162     g_free(l2_table);
1163     return 0;
1164 
1165 fail:
1166     fprintf(stderr, "ERROR: I/O error in check_refcounts_l2\n");
1167     g_free(l2_table);
1168     return -EIO;
1169 }
1170 
1171 /*
1172  * Increases the refcount for the L1 table, its L2 tables and all referenced
1173  * clusters in the given refcount table. While doing so, performs some checks
1174  * on L1 and L2 entries.
1175  *
1176  * Returns the number of errors found by the checks or -errno if an internal
1177  * error occurred.
1178  */
1179 static int check_refcounts_l1(BlockDriverState *bs,
1180                               BdrvCheckResult *res,
1181                               uint16_t *refcount_table,
1182                               int refcount_table_size,
1183                               int64_t l1_table_offset, int l1_size,
1184                               int flags)
1185 {
1186     BDRVQcowState *s = bs->opaque;
1187     uint64_t *l1_table, l2_offset, l1_size2;
1188     int i, ret;
1189 
1190     l1_size2 = l1_size * sizeof(uint64_t);
1191 
1192     /* Mark L1 table as used */
1193     inc_refcounts(bs, res, refcount_table, refcount_table_size,
1194         l1_table_offset, l1_size2);
1195 
1196     /* Read L1 table entries from disk */
1197     if (l1_size2 == 0) {
1198         l1_table = NULL;
1199     } else {
1200         l1_table = g_malloc(l1_size2);
1201         if (bdrv_pread(bs->file, l1_table_offset,
1202                        l1_table, l1_size2) != l1_size2)
1203             goto fail;
1204         for(i = 0;i < l1_size; i++)
1205             be64_to_cpus(&l1_table[i]);
1206     }
1207 
1208     /* Do the actual checks */
1209     for(i = 0; i < l1_size; i++) {
1210         l2_offset = l1_table[i];
1211         if (l2_offset) {
1212             /* Mark L2 table as used */
1213             l2_offset &= L1E_OFFSET_MASK;
1214             inc_refcounts(bs, res, refcount_table, refcount_table_size,
1215                 l2_offset, s->cluster_size);
1216 
1217             /* L2 tables are cluster aligned */
1218             if (offset_into_cluster(s, l2_offset)) {
1219                 fprintf(stderr, "ERROR l2_offset=%" PRIx64 ": Table is not "
1220                     "cluster aligned; L1 entry corrupted\n", l2_offset);
1221                 res->corruptions++;
1222             }
1223 
1224             /* Process and check L2 entries */
1225             ret = check_refcounts_l2(bs, res, refcount_table,
1226                                      refcount_table_size, l2_offset, flags);
1227             if (ret < 0) {
1228                 goto fail;
1229             }
1230         }
1231     }
1232     g_free(l1_table);
1233     return 0;
1234 
1235 fail:
1236     fprintf(stderr, "ERROR: I/O error in check_refcounts_l1\n");
1237     res->check_errors++;
1238     g_free(l1_table);
1239     return -EIO;
1240 }
1241 
1242 /*
1243  * Checks the OFLAG_COPIED flag for all L1 and L2 entries.
1244  *
1245  * This function does not print an error message nor does it increment
1246  * check_errors if get_refcount fails (this is because such an error will have
1247  * been already detected and sufficiently signaled by the calling function
1248  * (qcow2_check_refcounts) by the time this function is called).
1249  */
1250 static int check_oflag_copied(BlockDriverState *bs, BdrvCheckResult *res,
1251                               BdrvCheckMode fix)
1252 {
1253     BDRVQcowState *s = bs->opaque;
1254     uint64_t *l2_table = qemu_blockalign(bs, s->cluster_size);
1255     int ret;
1256     int refcount;
1257     int i, j;
1258 
1259     for (i = 0; i < s->l1_size; i++) {
1260         uint64_t l1_entry = s->l1_table[i];
1261         uint64_t l2_offset = l1_entry & L1E_OFFSET_MASK;
1262         bool l2_dirty = false;
1263 
1264         if (!l2_offset) {
1265             continue;
1266         }
1267 
1268         refcount = get_refcount(bs, l2_offset >> s->cluster_bits);
1269         if (refcount < 0) {
1270             /* don't print message nor increment check_errors */
1271             continue;
1272         }
1273         if ((refcount == 1) != ((l1_entry & QCOW_OFLAG_COPIED) != 0)) {
1274             fprintf(stderr, "%s OFLAG_COPIED L2 cluster: l1_index=%d "
1275                     "l1_entry=%" PRIx64 " refcount=%d\n",
1276                     fix & BDRV_FIX_ERRORS ? "Repairing" :
1277                                             "ERROR",
1278                     i, l1_entry, refcount);
1279             if (fix & BDRV_FIX_ERRORS) {
1280                 s->l1_table[i] = refcount == 1
1281                                ? l1_entry |  QCOW_OFLAG_COPIED
1282                                : l1_entry & ~QCOW_OFLAG_COPIED;
1283                 ret = qcow2_write_l1_entry(bs, i);
1284                 if (ret < 0) {
1285                     res->check_errors++;
1286                     goto fail;
1287                 }
1288                 res->corruptions_fixed++;
1289             } else {
1290                 res->corruptions++;
1291             }
1292         }
1293 
1294         ret = bdrv_pread(bs->file, l2_offset, l2_table,
1295                          s->l2_size * sizeof(uint64_t));
1296         if (ret < 0) {
1297             fprintf(stderr, "ERROR: Could not read L2 table: %s\n",
1298                     strerror(-ret));
1299             res->check_errors++;
1300             goto fail;
1301         }
1302 
1303         for (j = 0; j < s->l2_size; j++) {
1304             uint64_t l2_entry = be64_to_cpu(l2_table[j]);
1305             uint64_t data_offset = l2_entry & L2E_OFFSET_MASK;
1306             int cluster_type = qcow2_get_cluster_type(l2_entry);
1307 
1308             if ((cluster_type == QCOW2_CLUSTER_NORMAL) ||
1309                 ((cluster_type == QCOW2_CLUSTER_ZERO) && (data_offset != 0))) {
1310                 refcount = get_refcount(bs, data_offset >> s->cluster_bits);
1311                 if (refcount < 0) {
1312                     /* don't print message nor increment check_errors */
1313                     continue;
1314                 }
1315                 if ((refcount == 1) != ((l2_entry & QCOW_OFLAG_COPIED) != 0)) {
1316                     fprintf(stderr, "%s OFLAG_COPIED data cluster: "
1317                             "l2_entry=%" PRIx64 " refcount=%d\n",
1318                             fix & BDRV_FIX_ERRORS ? "Repairing" :
1319                                                     "ERROR",
1320                             l2_entry, refcount);
1321                     if (fix & BDRV_FIX_ERRORS) {
1322                         l2_table[j] = cpu_to_be64(refcount == 1
1323                                     ? l2_entry |  QCOW_OFLAG_COPIED
1324                                     : l2_entry & ~QCOW_OFLAG_COPIED);
1325                         l2_dirty = true;
1326                         res->corruptions_fixed++;
1327                     } else {
1328                         res->corruptions++;
1329                     }
1330                 }
1331             }
1332         }
1333 
1334         if (l2_dirty) {
1335             ret = qcow2_pre_write_overlap_check(bs, QCOW2_OL_ACTIVE_L2,
1336                                                 l2_offset, s->cluster_size);
1337             if (ret < 0) {
1338                 fprintf(stderr, "ERROR: Could not write L2 table; metadata "
1339                         "overlap check failed: %s\n", strerror(-ret));
1340                 res->check_errors++;
1341                 goto fail;
1342             }
1343 
1344             ret = bdrv_pwrite(bs->file, l2_offset, l2_table, s->cluster_size);
1345             if (ret < 0) {
1346                 fprintf(stderr, "ERROR: Could not write L2 table: %s\n",
1347                         strerror(-ret));
1348                 res->check_errors++;
1349                 goto fail;
1350             }
1351         }
1352     }
1353 
1354     ret = 0;
1355 
1356 fail:
1357     qemu_vfree(l2_table);
1358     return ret;
1359 }
1360 
1361 /*
1362  * Writes one sector of the refcount table to the disk
1363  */
1364 #define RT_ENTRIES_PER_SECTOR (512 / sizeof(uint64_t))
1365 static int write_reftable_entry(BlockDriverState *bs, int rt_index)
1366 {
1367     BDRVQcowState *s = bs->opaque;
1368     uint64_t buf[RT_ENTRIES_PER_SECTOR];
1369     int rt_start_index;
1370     int i, ret;
1371 
1372     rt_start_index = rt_index & ~(RT_ENTRIES_PER_SECTOR - 1);
1373     for (i = 0; i < RT_ENTRIES_PER_SECTOR; i++) {
1374         buf[i] = cpu_to_be64(s->refcount_table[rt_start_index + i]);
1375     }
1376 
1377     ret = qcow2_pre_write_overlap_check(bs, QCOW2_OL_REFCOUNT_TABLE,
1378             s->refcount_table_offset + rt_start_index * sizeof(uint64_t),
1379             sizeof(buf));
1380     if (ret < 0) {
1381         return ret;
1382     }
1383 
1384     BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_UPDATE);
1385     ret = bdrv_pwrite_sync(bs->file, s->refcount_table_offset +
1386             rt_start_index * sizeof(uint64_t), buf, sizeof(buf));
1387     if (ret < 0) {
1388         return ret;
1389     }
1390 
1391     return 0;
1392 }
1393 
1394 /*
1395  * Allocates a new cluster for the given refcount block (represented by its
1396  * offset in the image file) and copies the current content there. This function
1397  * does _not_ decrement the reference count for the currently occupied cluster.
1398  *
1399  * This function prints an informative message to stderr on error (and returns
1400  * -errno); on success, the offset of the newly allocated cluster is returned.
1401  */
1402 static int64_t realloc_refcount_block(BlockDriverState *bs, int reftable_index,
1403                                       uint64_t offset)
1404 {
1405     BDRVQcowState *s = bs->opaque;
1406     int64_t new_offset = 0;
1407     void *refcount_block = NULL;
1408     int ret;
1409 
1410     /* allocate new refcount block */
1411     new_offset = qcow2_alloc_clusters(bs, s->cluster_size);
1412     if (new_offset < 0) {
1413         fprintf(stderr, "Could not allocate new cluster: %s\n",
1414                 strerror(-new_offset));
1415         ret = new_offset;
1416         goto done;
1417     }
1418 
1419     /* fetch current refcount block content */
1420     ret = qcow2_cache_get(bs, s->refcount_block_cache, offset, &refcount_block);
1421     if (ret < 0) {
1422         fprintf(stderr, "Could not fetch refcount block: %s\n", strerror(-ret));
1423         goto fail_free_cluster;
1424     }
1425 
1426     /* new block has not yet been entered into refcount table, therefore it is
1427      * no refcount block yet (regarding this check) */
1428     ret = qcow2_pre_write_overlap_check(bs, 0, new_offset, s->cluster_size);
1429     if (ret < 0) {
1430         fprintf(stderr, "Could not write refcount block; metadata overlap "
1431                 "check failed: %s\n", strerror(-ret));
1432         /* the image will be marked corrupt, so don't even attempt on freeing
1433          * the cluster */
1434         goto done;
1435     }
1436 
1437     /* write to new block */
1438     ret = bdrv_write(bs->file, new_offset / BDRV_SECTOR_SIZE, refcount_block,
1439             s->cluster_sectors);
1440     if (ret < 0) {
1441         fprintf(stderr, "Could not write refcount block: %s\n", strerror(-ret));
1442         goto fail_free_cluster;
1443     }
1444 
1445     /* update refcount table */
1446     assert(!offset_into_cluster(s, new_offset));
1447     s->refcount_table[reftable_index] = new_offset;
1448     ret = write_reftable_entry(bs, reftable_index);
1449     if (ret < 0) {
1450         fprintf(stderr, "Could not update refcount table: %s\n",
1451                 strerror(-ret));
1452         goto fail_free_cluster;
1453     }
1454 
1455     goto done;
1456 
1457 fail_free_cluster:
1458     qcow2_free_clusters(bs, new_offset, s->cluster_size, QCOW2_DISCARD_OTHER);
1459 
1460 done:
1461     if (refcount_block) {
1462         /* This should never fail, as it would only do so if the given refcount
1463          * block cannot be found in the cache. As this is impossible as long as
1464          * there are no bugs, assert the success. */
1465         int tmp = qcow2_cache_put(bs, s->refcount_block_cache, &refcount_block);
1466         assert(tmp == 0);
1467     }
1468 
1469     if (ret < 0) {
1470         return ret;
1471     }
1472 
1473     return new_offset;
1474 }
1475 
1476 /*
1477  * Checks an image for refcount consistency.
1478  *
1479  * Returns 0 if no errors are found, the number of errors in case the image is
1480  * detected as corrupted, and -errno when an internal error occurred.
1481  */
1482 int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
1483                           BdrvCheckMode fix)
1484 {
1485     BDRVQcowState *s = bs->opaque;
1486     int64_t size, i, highest_cluster, nb_clusters;
1487     int refcount1, refcount2;
1488     QCowSnapshot *sn;
1489     uint16_t *refcount_table;
1490     int ret;
1491 
1492     size = bdrv_getlength(bs->file);
1493     if (size < 0) {
1494         res->check_errors++;
1495         return size;
1496     }
1497 
1498     nb_clusters = size_to_clusters(s, size);
1499     if (nb_clusters > INT_MAX) {
1500         res->check_errors++;
1501         return -EFBIG;
1502     }
1503 
1504     refcount_table = g_malloc0(nb_clusters * sizeof(uint16_t));
1505 
1506     res->bfi.total_clusters =
1507         size_to_clusters(s, bs->total_sectors * BDRV_SECTOR_SIZE);
1508 
1509     /* header */
1510     inc_refcounts(bs, res, refcount_table, nb_clusters,
1511         0, s->cluster_size);
1512 
1513     /* current L1 table */
1514     ret = check_refcounts_l1(bs, res, refcount_table, nb_clusters,
1515                              s->l1_table_offset, s->l1_size, CHECK_FRAG_INFO);
1516     if (ret < 0) {
1517         goto fail;
1518     }
1519 
1520     /* snapshots */
1521     for(i = 0; i < s->nb_snapshots; i++) {
1522         sn = s->snapshots + i;
1523         ret = check_refcounts_l1(bs, res, refcount_table, nb_clusters,
1524             sn->l1_table_offset, sn->l1_size, 0);
1525         if (ret < 0) {
1526             goto fail;
1527         }
1528     }
1529     inc_refcounts(bs, res, refcount_table, nb_clusters,
1530         s->snapshots_offset, s->snapshots_size);
1531 
1532     /* refcount data */
1533     inc_refcounts(bs, res, refcount_table, nb_clusters,
1534         s->refcount_table_offset,
1535         s->refcount_table_size * sizeof(uint64_t));
1536 
1537     for(i = 0; i < s->refcount_table_size; i++) {
1538         uint64_t offset, cluster;
1539         offset = s->refcount_table[i];
1540         cluster = offset >> s->cluster_bits;
1541 
1542         /* Refcount blocks are cluster aligned */
1543         if (offset_into_cluster(s, offset)) {
1544             fprintf(stderr, "ERROR refcount block %" PRId64 " is not "
1545                 "cluster aligned; refcount table entry corrupted\n", i);
1546             res->corruptions++;
1547             continue;
1548         }
1549 
1550         if (cluster >= nb_clusters) {
1551             fprintf(stderr, "ERROR refcount block %" PRId64
1552                     " is outside image\n", i);
1553             res->corruptions++;
1554             continue;
1555         }
1556 
1557         if (offset != 0) {
1558             inc_refcounts(bs, res, refcount_table, nb_clusters,
1559                 offset, s->cluster_size);
1560             if (refcount_table[cluster] != 1) {
1561                 fprintf(stderr, "%s refcount block %" PRId64
1562                     " refcount=%d\n",
1563                     fix & BDRV_FIX_ERRORS ? "Repairing" :
1564                                             "ERROR",
1565                     i, refcount_table[cluster]);
1566 
1567                 if (fix & BDRV_FIX_ERRORS) {
1568                     int64_t new_offset;
1569 
1570                     new_offset = realloc_refcount_block(bs, i, offset);
1571                     if (new_offset < 0) {
1572                         res->corruptions++;
1573                         continue;
1574                     }
1575 
1576                     /* update refcounts */
1577                     if ((new_offset >> s->cluster_bits) >= nb_clusters) {
1578                         /* increase refcount_table size if necessary */
1579                         int old_nb_clusters = nb_clusters;
1580                         nb_clusters = (new_offset >> s->cluster_bits) + 1;
1581                         refcount_table = g_realloc(refcount_table,
1582                                 nb_clusters * sizeof(uint16_t));
1583                         memset(&refcount_table[old_nb_clusters], 0, (nb_clusters
1584                                 - old_nb_clusters) * sizeof(uint16_t));
1585                     }
1586                     refcount_table[cluster]--;
1587                     inc_refcounts(bs, res, refcount_table, nb_clusters,
1588                             new_offset, s->cluster_size);
1589 
1590                     res->corruptions_fixed++;
1591                 } else {
1592                     res->corruptions++;
1593                 }
1594             }
1595         }
1596     }
1597 
1598     /* compare ref counts */
1599     for (i = 0, highest_cluster = 0; i < nb_clusters; i++) {
1600         refcount1 = get_refcount(bs, i);
1601         if (refcount1 < 0) {
1602             fprintf(stderr, "Can't get refcount for cluster %" PRId64 ": %s\n",
1603                 i, strerror(-refcount1));
1604             res->check_errors++;
1605             continue;
1606         }
1607 
1608         refcount2 = refcount_table[i];
1609 
1610         if (refcount1 > 0 || refcount2 > 0) {
1611             highest_cluster = i;
1612         }
1613 
1614         if (refcount1 != refcount2) {
1615 
1616             /* Check if we're allowed to fix the mismatch */
1617             int *num_fixed = NULL;
1618             if (refcount1 > refcount2 && (fix & BDRV_FIX_LEAKS)) {
1619                 num_fixed = &res->leaks_fixed;
1620             } else if (refcount1 < refcount2 && (fix & BDRV_FIX_ERRORS)) {
1621                 num_fixed = &res->corruptions_fixed;
1622             }
1623 
1624             fprintf(stderr, "%s cluster %" PRId64 " refcount=%d reference=%d\n",
1625                    num_fixed != NULL     ? "Repairing" :
1626                    refcount1 < refcount2 ? "ERROR" :
1627                                            "Leaked",
1628                    i, refcount1, refcount2);
1629 
1630             if (num_fixed) {
1631                 ret = update_refcount(bs, i << s->cluster_bits, 1,
1632                                       refcount2 - refcount1,
1633                                       QCOW2_DISCARD_ALWAYS);
1634                 if (ret >= 0) {
1635                     (*num_fixed)++;
1636                     continue;
1637                 }
1638             }
1639 
1640             /* And if we couldn't, print an error */
1641             if (refcount1 < refcount2) {
1642                 res->corruptions++;
1643             } else {
1644                 res->leaks++;
1645             }
1646         }
1647     }
1648 
1649     /* check OFLAG_COPIED */
1650     ret = check_oflag_copied(bs, res, fix);
1651     if (ret < 0) {
1652         goto fail;
1653     }
1654 
1655     res->image_end_offset = (highest_cluster + 1) * s->cluster_size;
1656     ret = 0;
1657 
1658 fail:
1659     g_free(refcount_table);
1660 
1661     return ret;
1662 }
1663 
/* Helper for qcow2_check_metadata_overlap(): tests the range [ofs, ofs + sz)
 * against the 'offset' and 'size' variables captured from the enclosing
 * function scope. */
#define overlaps_with(ofs, sz) \
    ranges_overlap(offset, size, ofs, sz)
1666 
1667 /*
1668  * Checks if the given offset into the image file is actually free to use by
1669  * looking for overlaps with important metadata sections (L1/L2 tables etc.),
1670  * i.e. a sanity check without relying on the refcount tables.
1671  *
1672  * The ign parameter specifies what checks not to perform (being a bitmask of
1673  * QCow2MetadataOverlap values), i.e., what sections to ignore.
1674  *
1675  * Returns:
1676  * - 0 if writing to this offset will not affect the mentioned metadata
1677  * - a positive QCow2MetadataOverlap value indicating one overlapping section
1678  * - a negative value (-errno) indicating an error while performing a check,
1679  *   e.g. when bdrv_read failed on QCOW2_OL_INACTIVE_L2
1680  */
1681 int qcow2_check_metadata_overlap(BlockDriverState *bs, int ign, int64_t offset,
1682                                  int64_t size)
1683 {
1684     BDRVQcowState *s = bs->opaque;
1685     int chk = s->overlap_check & ~ign;
1686     int i, j;
1687 
1688     if (!size) {
1689         return 0;
1690     }
1691 
1692     if (chk & QCOW2_OL_MAIN_HEADER) {
1693         if (offset < s->cluster_size) {
1694             return QCOW2_OL_MAIN_HEADER;
1695         }
1696     }
1697 
1698     /* align range to test to cluster boundaries */
1699     size = align_offset(offset_into_cluster(s, offset) + size, s->cluster_size);
1700     offset = start_of_cluster(s, offset);
1701 
1702     if ((chk & QCOW2_OL_ACTIVE_L1) && s->l1_size) {
1703         if (overlaps_with(s->l1_table_offset, s->l1_size * sizeof(uint64_t))) {
1704             return QCOW2_OL_ACTIVE_L1;
1705         }
1706     }
1707 
1708     if ((chk & QCOW2_OL_REFCOUNT_TABLE) && s->refcount_table_size) {
1709         if (overlaps_with(s->refcount_table_offset,
1710             s->refcount_table_size * sizeof(uint64_t))) {
1711             return QCOW2_OL_REFCOUNT_TABLE;
1712         }
1713     }
1714 
1715     if ((chk & QCOW2_OL_SNAPSHOT_TABLE) && s->snapshots_size) {
1716         if (overlaps_with(s->snapshots_offset, s->snapshots_size)) {
1717             return QCOW2_OL_SNAPSHOT_TABLE;
1718         }
1719     }
1720 
1721     if ((chk & QCOW2_OL_INACTIVE_L1) && s->snapshots) {
1722         for (i = 0; i < s->nb_snapshots; i++) {
1723             if (s->snapshots[i].l1_size &&
1724                 overlaps_with(s->snapshots[i].l1_table_offset,
1725                 s->snapshots[i].l1_size * sizeof(uint64_t))) {
1726                 return QCOW2_OL_INACTIVE_L1;
1727             }
1728         }
1729     }
1730 
1731     if ((chk & QCOW2_OL_ACTIVE_L2) && s->l1_table) {
1732         for (i = 0; i < s->l1_size; i++) {
1733             if ((s->l1_table[i] & L1E_OFFSET_MASK) &&
1734                 overlaps_with(s->l1_table[i] & L1E_OFFSET_MASK,
1735                 s->cluster_size)) {
1736                 return QCOW2_OL_ACTIVE_L2;
1737             }
1738         }
1739     }
1740 
1741     if ((chk & QCOW2_OL_REFCOUNT_BLOCK) && s->refcount_table) {
1742         for (i = 0; i < s->refcount_table_size; i++) {
1743             if ((s->refcount_table[i] & REFT_OFFSET_MASK) &&
1744                 overlaps_with(s->refcount_table[i] & REFT_OFFSET_MASK,
1745                 s->cluster_size)) {
1746                 return QCOW2_OL_REFCOUNT_BLOCK;
1747             }
1748         }
1749     }
1750 
1751     if ((chk & QCOW2_OL_INACTIVE_L2) && s->snapshots) {
1752         for (i = 0; i < s->nb_snapshots; i++) {
1753             uint64_t l1_ofs = s->snapshots[i].l1_table_offset;
1754             uint32_t l1_sz  = s->snapshots[i].l1_size;
1755             uint64_t l1_sz2 = l1_sz * sizeof(uint64_t);
1756             uint64_t *l1 = g_malloc(l1_sz2);
1757             int ret;
1758 
1759             ret = bdrv_pread(bs->file, l1_ofs, l1, l1_sz2);
1760             if (ret < 0) {
1761                 g_free(l1);
1762                 return ret;
1763             }
1764 
1765             for (j = 0; j < l1_sz; j++) {
1766                 uint64_t l2_ofs = be64_to_cpu(l1[j]) & L1E_OFFSET_MASK;
1767                 if (l2_ofs && overlaps_with(l2_ofs, s->cluster_size)) {
1768                     g_free(l1);
1769                     return QCOW2_OL_INACTIVE_L2;
1770                 }
1771             }
1772 
1773             g_free(l1);
1774         }
1775     }
1776 
1777     return 0;
1778 }
1779 
/* Human-readable names for the metadata sections, indexed by the
 * QCOW2_OL_*_BITNR bit numbers of the QCow2MetadataOverlap flags; used for
 * error reporting in qcow2_pre_write_overlap_check() */
static const char *metadata_ol_names[] = {
    [QCOW2_OL_MAIN_HEADER_BITNR]    = "qcow2_header",
    [QCOW2_OL_ACTIVE_L1_BITNR]      = "active L1 table",
    [QCOW2_OL_ACTIVE_L2_BITNR]      = "active L2 table",
    [QCOW2_OL_REFCOUNT_TABLE_BITNR] = "refcount table",
    [QCOW2_OL_REFCOUNT_BLOCK_BITNR] = "refcount block",
    [QCOW2_OL_SNAPSHOT_TABLE_BITNR] = "snapshot table",
    [QCOW2_OL_INACTIVE_L1_BITNR]    = "inactive L1 table",
    [QCOW2_OL_INACTIVE_L2_BITNR]    = "inactive L2 table",
};
1790 
1791 /*
1792  * First performs a check for metadata overlaps (through
1793  * qcow2_check_metadata_overlap); if that fails with a negative value (error
1794  * while performing a check), that value is returned. If an impending overlap
1795  * is detected, the BDS will be made unusable, the qcow2 file marked corrupt
1796  * and -EIO returned.
1797  *
1798  * Returns 0 if there were neither overlaps nor errors while checking for
1799  * overlaps; or a negative value (-errno) on error.
1800  */
1801 int qcow2_pre_write_overlap_check(BlockDriverState *bs, int ign, int64_t offset,
1802                                   int64_t size)
1803 {
1804     int ret = qcow2_check_metadata_overlap(bs, ign, offset, size);
1805 
1806     if (ret < 0) {
1807         return ret;
1808     } else if (ret > 0) {
1809         int metadata_ol_bitnr = ffs(ret) - 1;
1810         char *message;
1811 
1812         assert(metadata_ol_bitnr < QCOW2_OL_MAX_BITNR);
1813 
1814         fprintf(stderr, "qcow2: Preventing invalid write on metadata (overlaps "
1815                 "with %s); image marked as corrupt.\n",
1816                 metadata_ol_names[metadata_ol_bitnr]);
1817         message = g_strdup_printf("Prevented %s overwrite",
1818                 metadata_ol_names[metadata_ol_bitnr]);
1819         qapi_event_send_block_image_corrupted(bdrv_get_device_name(bs),
1820                                               message,
1821                                               true,
1822                                               offset,
1823                                               true,
1824                                               size,
1825                                               &error_abort);
1826         g_free(message);
1827 
1828         qcow2_mark_corrupt(bs);
1829         bs->drv = NULL; /* make BDS unusable */
1830         return -EIO;
1831     }
1832 
1833     return 0;
1834 }
1835