xref: /openbmc/qemu/block/qcow2-refcount.c (revision 35d08458)
1 /*
2  * Block driver for the QCOW version 2 format
3  *
4  * Copyright (c) 2004-2006 Fabrice Bellard
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to deal
8  * in the Software without restriction, including without limitation the rights
9  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10  * copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in
14  * all copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22  * THE SOFTWARE.
23  */
24 
25 #include "qemu-common.h"
26 #include "block/block_int.h"
27 #include "block/qcow2.h"
28 #include "qemu/range.h"
29 #include "qapi/qmp/types.h"
30 
31 static int64_t alloc_clusters_noref(BlockDriverState *bs, uint64_t size);
32 static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
33                             int64_t offset, int64_t length,
34                             int addend, enum qcow2_discard_type type);
35 
36 
37 /*********************************************************/
38 /* refcount handling */
39 
40 int qcow2_refcount_init(BlockDriverState *bs)
41 {
42     BDRVQcowState *s = bs->opaque;
43     unsigned int refcount_table_size2, i;
44     int ret;
45 
46     assert(s->refcount_table_size <= INT_MAX / sizeof(uint64_t));
47     refcount_table_size2 = s->refcount_table_size * sizeof(uint64_t);
48     s->refcount_table = g_malloc(refcount_table_size2);
49     if (s->refcount_table_size > 0) {
50         BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_LOAD);
51         ret = bdrv_pread(bs->file, s->refcount_table_offset,
52                          s->refcount_table, refcount_table_size2);
53         if (ret != refcount_table_size2)
54             goto fail;
55         for(i = 0; i < s->refcount_table_size; i++)
56             be64_to_cpus(&s->refcount_table[i]);
57     }
58     return 0;
59  fail:
60     return -ENOMEM;
61 }
62 
63 void qcow2_refcount_close(BlockDriverState *bs)
64 {
65     BDRVQcowState *s = bs->opaque;
66     g_free(s->refcount_table);
67 }
68 
69 
70 static int load_refcount_block(BlockDriverState *bs,
71                                int64_t refcount_block_offset,
72                                void **refcount_block)
73 {
74     BDRVQcowState *s = bs->opaque;
75     int ret;
76 
77     BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_LOAD);
78     ret = qcow2_cache_get(bs, s->refcount_block_cache, refcount_block_offset,
79         refcount_block);
80 
81     return ret;
82 }
83 
84 /*
85  * Returns the refcount of the cluster given by its index. Any non-negative
86  * return value is the refcount of the cluster, negative values are -errno
87  * and indicate an error.
88  */
89 static int get_refcount(BlockDriverState *bs, int64_t cluster_index)
90 {
91     BDRVQcowState *s = bs->opaque;
92     uint64_t refcount_table_index, block_index;
93     int64_t refcount_block_offset;
94     int ret;
95     uint16_t *refcount_block;
96     uint16_t refcount;
97 
98     refcount_table_index = cluster_index >> (s->cluster_bits - REFCOUNT_SHIFT);
99     if (refcount_table_index >= s->refcount_table_size)
100         return 0;
101     refcount_block_offset =
102         s->refcount_table[refcount_table_index] & REFT_OFFSET_MASK;
103     if (!refcount_block_offset)
104         return 0;
105 
106     ret = qcow2_cache_get(bs, s->refcount_block_cache, refcount_block_offset,
107         (void**) &refcount_block);
108     if (ret < 0) {
109         return ret;
110     }
111 
112     block_index = cluster_index &
113         ((1 << (s->cluster_bits - REFCOUNT_SHIFT)) - 1);
114     refcount = be16_to_cpu(refcount_block[block_index]);
115 
116     ret = qcow2_cache_put(bs, s->refcount_block_cache,
117         (void**) &refcount_block);
118     if (ret < 0) {
119         return ret;
120     }
121 
122     return refcount;
123 }
124 
125 /*
126  * Rounds the refcount table size up to avoid growing the table for each single
127  * refcount block that is allocated.
128  */
129 static unsigned int next_refcount_table_size(BDRVQcowState *s,
130     unsigned int min_size)
131 {
132     unsigned int min_clusters = (min_size >> (s->cluster_bits - 3)) + 1;
133     unsigned int refcount_table_clusters =
134         MAX(1, s->refcount_table_size >> (s->cluster_bits - 3));
135 
136     while (min_clusters > refcount_table_clusters) {
137         refcount_table_clusters = (refcount_table_clusters * 3 + 1) / 2;
138     }
139 
140     return refcount_table_clusters << (s->cluster_bits - 3);
141 }
142 
143 
144 /* Checks if two offsets are described by the same refcount block */
145 static int in_same_refcount_block(BDRVQcowState *s, uint64_t offset_a,
146     uint64_t offset_b)
147 {
148     uint64_t block_a = offset_a >> (2 * s->cluster_bits - REFCOUNT_SHIFT);
149     uint64_t block_b = offset_b >> (2 * s->cluster_bits - REFCOUNT_SHIFT);
150 
151     return (block_a == block_b);
152 }
153 
154 /*
155  * Loads a refcount block. If it doesn't exist yet, it is allocated first
156  * (including growing the refcount table if needed).
157  *
158  * Returns 0 on success or -errno in error case
159  */
160 static int alloc_refcount_block(BlockDriverState *bs,
161     int64_t cluster_index, uint16_t **refcount_block)
162 {
163     BDRVQcowState *s = bs->opaque;
164     unsigned int refcount_table_index;
165     int ret;
166 
167     BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC);
168 
169     /* Find the refcount block for the given cluster */
170     refcount_table_index = cluster_index >> (s->cluster_bits - REFCOUNT_SHIFT);
171 
172     if (refcount_table_index < s->refcount_table_size) {
173 
174         uint64_t refcount_block_offset =
175             s->refcount_table[refcount_table_index] & REFT_OFFSET_MASK;
176 
177         /* If it's already there, we're done */
178         if (refcount_block_offset) {
179              return load_refcount_block(bs, refcount_block_offset,
180                  (void**) refcount_block);
181         }
182     }
183 
184     /*
185      * If we came here, we need to allocate something. Something is at least
186      * a cluster for the new refcount block. It may also include a new refcount
187      * table if the old refcount table is too small.
188      *
189      * Note that allocating clusters here needs some special care:
190      *
191      * - We can't use the normal qcow2_alloc_clusters(), it would try to
192      *   increase the refcount and very likely we would end up with an endless
193      *   recursion. Instead we must place the refcount blocks in a way that
194      *   they can describe them themselves.
195      *
196      * - We need to consider that at this point we are inside update_refcounts
197      *   and potentially doing an initial refcount increase. This means that
198      *   some clusters have already been allocated by the caller, but their
199      *   refcount isn't accurate yet. If we allocate clusters for metadata, we
200      *   need to return -EAGAIN to signal the caller that it needs to restart
201      *   the search for free clusters.
202      *
203      * - alloc_clusters_noref and qcow2_free_clusters may load a different
204      *   refcount block into the cache
205      */
206 
207     *refcount_block = NULL;
208 
209     /* We write to the refcount table, so we might depend on L2 tables */
210     ret = qcow2_cache_flush(bs, s->l2_table_cache);
211     if (ret < 0) {
212         return ret;
213     }
214 
215     /* Allocate the refcount block itself and mark it as used */
216     int64_t new_block = alloc_clusters_noref(bs, s->cluster_size);
217     if (new_block < 0) {
218         return new_block;
219     }
220 
221 #ifdef DEBUG_ALLOC2
222     fprintf(stderr, "qcow2: Allocate refcount block %d for %" PRIx64
223         " at %" PRIx64 "\n",
224         refcount_table_index, cluster_index << s->cluster_bits, new_block);
225 #endif
226 
227     if (in_same_refcount_block(s, new_block, cluster_index << s->cluster_bits)) {
228         /* Zero the new refcount block before updating it */
229         ret = qcow2_cache_get_empty(bs, s->refcount_block_cache, new_block,
230             (void**) refcount_block);
231         if (ret < 0) {
232             goto fail_block;
233         }
234 
235         memset(*refcount_block, 0, s->cluster_size);
236 
237         /* The block describes itself, need to update the cache */
238         int block_index = (new_block >> s->cluster_bits) &
239             ((1 << (s->cluster_bits - REFCOUNT_SHIFT)) - 1);
240         (*refcount_block)[block_index] = cpu_to_be16(1);
241     } else {
242         /* Described somewhere else. This can recurse at most twice before we
243          * arrive at a block that describes itself. */
244         ret = update_refcount(bs, new_block, s->cluster_size, 1,
245                               QCOW2_DISCARD_NEVER);
246         if (ret < 0) {
247             goto fail_block;
248         }
249 
250         ret = qcow2_cache_flush(bs, s->refcount_block_cache);
251         if (ret < 0) {
252             goto fail_block;
253         }
254 
255         /* Initialize the new refcount block only after updating its refcount,
256          * update_refcount uses the refcount cache itself */
257         ret = qcow2_cache_get_empty(bs, s->refcount_block_cache, new_block,
258             (void**) refcount_block);
259         if (ret < 0) {
260             goto fail_block;
261         }
262 
263         memset(*refcount_block, 0, s->cluster_size);
264     }
265 
266     /* Now the new refcount block needs to be written to disk */
267     BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE);
268     qcow2_cache_entry_mark_dirty(s->refcount_block_cache, *refcount_block);
269     ret = qcow2_cache_flush(bs, s->refcount_block_cache);
270     if (ret < 0) {
271         goto fail_block;
272     }
273 
274     /* If the refcount table is big enough, just hook the block up there */
275     if (refcount_table_index < s->refcount_table_size) {
276         uint64_t data64 = cpu_to_be64(new_block);
277         BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_HOOKUP);
278         ret = bdrv_pwrite_sync(bs->file,
279             s->refcount_table_offset + refcount_table_index * sizeof(uint64_t),
280             &data64, sizeof(data64));
281         if (ret < 0) {
282             goto fail_block;
283         }
284 
285         s->refcount_table[refcount_table_index] = new_block;
286 
287         /* The new refcount block may be where the caller intended to put its
288          * data, so let it restart the search. */
289         return -EAGAIN;
290     }
291 
292     ret = qcow2_cache_put(bs, s->refcount_block_cache, (void**) refcount_block);
293     if (ret < 0) {
294         goto fail_block;
295     }
296 
297     /*
298      * If we come here, we need to grow the refcount table. Again, a new
299      * refcount table needs some space and we can't simply allocate to avoid
300      * endless recursion.
301      *
302      * Therefore let's grab new refcount blocks at the end of the image, which
303      * will describe themselves and the new refcount table. This way we can
304      * reference them only in the new table and do the switch to the new
305      * refcount table at once without producing an inconsistent state in
306      * between.
307      */
308     BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_GROW);
309 
310     /* Calculate the number of refcount blocks needed so far */
311     uint64_t refcount_block_clusters = 1 << (s->cluster_bits - REFCOUNT_SHIFT);
312     uint64_t blocks_used = DIV_ROUND_UP(cluster_index, refcount_block_clusters);
313 
314     if (blocks_used > QCOW_MAX_REFTABLE_SIZE / sizeof(uint64_t)) {
315         return -EFBIG;
316     }
317 
318     /* And now we need at least one block more for the new metadata */
319     uint64_t table_size = next_refcount_table_size(s, blocks_used + 1);
320     uint64_t last_table_size;
321     uint64_t blocks_clusters;
322     do {
323         uint64_t table_clusters =
324             size_to_clusters(s, table_size * sizeof(uint64_t));
325         blocks_clusters = 1 +
326             ((table_clusters + refcount_block_clusters - 1)
327             / refcount_block_clusters);
328         uint64_t meta_clusters = table_clusters + blocks_clusters;
329 
330         last_table_size = table_size;
331         table_size = next_refcount_table_size(s, blocks_used +
332             ((meta_clusters + refcount_block_clusters - 1)
333             / refcount_block_clusters));
334 
335     } while (last_table_size != table_size);
336 
337 #ifdef DEBUG_ALLOC2
338     fprintf(stderr, "qcow2: Grow refcount table %" PRId32 " => %" PRId64 "\n",
339         s->refcount_table_size, table_size);
340 #endif
341 
342     /* Create the new refcount table and blocks */
343     uint64_t meta_offset = (blocks_used * refcount_block_clusters) *
344         s->cluster_size;
345     uint64_t table_offset = meta_offset + blocks_clusters * s->cluster_size;
346     uint16_t *new_blocks = g_malloc0(blocks_clusters * s->cluster_size);
347     uint64_t *new_table = g_malloc0(table_size * sizeof(uint64_t));
348 
349     /* Fill the new refcount table */
350     memcpy(new_table, s->refcount_table,
351         s->refcount_table_size * sizeof(uint64_t));
352     new_table[refcount_table_index] = new_block;
353 
354     int i;
355     for (i = 0; i < blocks_clusters; i++) {
356         new_table[blocks_used + i] = meta_offset + (i * s->cluster_size);
357     }
358 
359     /* Fill the refcount blocks */
360     uint64_t table_clusters = size_to_clusters(s, table_size * sizeof(uint64_t));
361     int block = 0;
362     for (i = 0; i < table_clusters + blocks_clusters; i++) {
363         new_blocks[block++] = cpu_to_be16(1);
364     }
365 
366     /* Write refcount blocks to disk */
367     BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE_BLOCKS);
368     ret = bdrv_pwrite_sync(bs->file, meta_offset, new_blocks,
369         blocks_clusters * s->cluster_size);
370     g_free(new_blocks);
371     if (ret < 0) {
372         goto fail_table;
373     }
374 
375     /* Write refcount table to disk */
376     for(i = 0; i < table_size; i++) {
377         cpu_to_be64s(&new_table[i]);
378     }
379 
380     BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE_TABLE);
381     ret = bdrv_pwrite_sync(bs->file, table_offset, new_table,
382         table_size * sizeof(uint64_t));
383     if (ret < 0) {
384         goto fail_table;
385     }
386 
387     for(i = 0; i < table_size; i++) {
388         be64_to_cpus(&new_table[i]);
389     }
390 
391     /* Hook up the new refcount table in the qcow2 header */
392     uint8_t data[12];
393     cpu_to_be64w((uint64_t*)data, table_offset);
394     cpu_to_be32w((uint32_t*)(data + 8), table_clusters);
395     BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_SWITCH_TABLE);
396     ret = bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, refcount_table_offset),
397         data, sizeof(data));
398     if (ret < 0) {
399         goto fail_table;
400     }
401 
402     /* And switch it in memory */
403     uint64_t old_table_offset = s->refcount_table_offset;
404     uint64_t old_table_size = s->refcount_table_size;
405 
406     g_free(s->refcount_table);
407     s->refcount_table = new_table;
408     s->refcount_table_size = table_size;
409     s->refcount_table_offset = table_offset;
410 
411     /* Free old table. */
412     qcow2_free_clusters(bs, old_table_offset, old_table_size * sizeof(uint64_t),
413                         QCOW2_DISCARD_OTHER);
414 
415     ret = load_refcount_block(bs, new_block, (void**) refcount_block);
416     if (ret < 0) {
417         return ret;
418     }
419 
420     /* If we were trying to do the initial refcount update for some cluster
421      * allocation, we might have used the same clusters to store newly
422      * allocated metadata. Make the caller search some new space. */
423     return -EAGAIN;
424 
425 fail_table:
426     g_free(new_table);
427 fail_block:
428     if (*refcount_block != NULL) {
429         qcow2_cache_put(bs, s->refcount_block_cache, (void**) refcount_block);
430     }
431     return ret;
432 }
433 
434 void qcow2_process_discards(BlockDriverState *bs, int ret)
435 {
436     BDRVQcowState *s = bs->opaque;
437     Qcow2DiscardRegion *d, *next;
438 
439     QTAILQ_FOREACH_SAFE(d, &s->discards, next, next) {
440         QTAILQ_REMOVE(&s->discards, d, next);
441 
442         /* Discard is optional, ignore the return value */
443         if (ret >= 0) {
444             bdrv_discard(bs->file,
445                          d->offset >> BDRV_SECTOR_BITS,
446                          d->bytes >> BDRV_SECTOR_BITS);
447         }
448 
449         g_free(d);
450     }
451 }
452 
453 static void update_refcount_discard(BlockDriverState *bs,
454                                     uint64_t offset, uint64_t length)
455 {
456     BDRVQcowState *s = bs->opaque;
457     Qcow2DiscardRegion *d, *p, *next;
458 
459     QTAILQ_FOREACH(d, &s->discards, next) {
460         uint64_t new_start = MIN(offset, d->offset);
461         uint64_t new_end = MAX(offset + length, d->offset + d->bytes);
462 
463         if (new_end - new_start <= length + d->bytes) {
464             /* There can't be any overlap, areas ending up here have no
465              * references any more and therefore shouldn't get freed another
466              * time. */
467             assert(d->bytes + length == new_end - new_start);
468             d->offset = new_start;
469             d->bytes = new_end - new_start;
470             goto found;
471         }
472     }
473 
474     d = g_malloc(sizeof(*d));
475     *d = (Qcow2DiscardRegion) {
476         .bs     = bs,
477         .offset = offset,
478         .bytes  = length,
479     };
480     QTAILQ_INSERT_TAIL(&s->discards, d, next);
481 
482 found:
483     /* Merge discard requests if they are adjacent now */
484     QTAILQ_FOREACH_SAFE(p, &s->discards, next, next) {
485         if (p == d
486             || p->offset > d->offset + d->bytes
487             || d->offset > p->offset + p->bytes)
488         {
489             continue;
490         }
491 
492         /* Still no overlap possible */
493         assert(p->offset == d->offset + d->bytes
494             || d->offset == p->offset + p->bytes);
495 
496         QTAILQ_REMOVE(&s->discards, p, next);
497         d->offset = MIN(d->offset, p->offset);
498         d->bytes += p->bytes;
499     }
500 }
501 
502 /* XXX: cache several refcount block clusters ? */
503 static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
504     int64_t offset, int64_t length, int addend, enum qcow2_discard_type type)
505 {
506     BDRVQcowState *s = bs->opaque;
507     int64_t start, last, cluster_offset;
508     uint16_t *refcount_block = NULL;
509     int64_t old_table_index = -1;
510     int ret;
511 
512 #ifdef DEBUG_ALLOC2
513     fprintf(stderr, "update_refcount: offset=%" PRId64 " size=%" PRId64 " addend=%d\n",
514            offset, length, addend);
515 #endif
516     if (length < 0) {
517         return -EINVAL;
518     } else if (length == 0) {
519         return 0;
520     }
521 
522     if (addend < 0) {
523         qcow2_cache_set_dependency(bs, s->refcount_block_cache,
524             s->l2_table_cache);
525     }
526 
527     start = start_of_cluster(s, offset);
528     last = start_of_cluster(s, offset + length - 1);
529     for(cluster_offset = start; cluster_offset <= last;
530         cluster_offset += s->cluster_size)
531     {
532         int block_index, refcount;
533         int64_t cluster_index = cluster_offset >> s->cluster_bits;
534         int64_t table_index =
535             cluster_index >> (s->cluster_bits - REFCOUNT_SHIFT);
536 
537         /* Load the refcount block and allocate it if needed */
538         if (table_index != old_table_index) {
539             if (refcount_block) {
540                 ret = qcow2_cache_put(bs, s->refcount_block_cache,
541                     (void**) &refcount_block);
542                 if (ret < 0) {
543                     goto fail;
544                 }
545             }
546 
547             ret = alloc_refcount_block(bs, cluster_index, &refcount_block);
548             if (ret < 0) {
549                 goto fail;
550             }
551         }
552         old_table_index = table_index;
553 
554         qcow2_cache_entry_mark_dirty(s->refcount_block_cache, refcount_block);
555 
556         /* we can update the count and save it */
557         block_index = cluster_index &
558             ((1 << (s->cluster_bits - REFCOUNT_SHIFT)) - 1);
559 
560         refcount = be16_to_cpu(refcount_block[block_index]);
561         refcount += addend;
562         if (refcount < 0 || refcount > 0xffff) {
563             ret = -EINVAL;
564             goto fail;
565         }
566         if (refcount == 0 && cluster_index < s->free_cluster_index) {
567             s->free_cluster_index = cluster_index;
568         }
569         refcount_block[block_index] = cpu_to_be16(refcount);
570 
571         if (refcount == 0 && s->discard_passthrough[type]) {
572             update_refcount_discard(bs, cluster_offset, s->cluster_size);
573         }
574     }
575 
576     ret = 0;
577 fail:
578     if (!s->cache_discards) {
579         qcow2_process_discards(bs, ret);
580     }
581 
582     /* Write last changed block to disk */
583     if (refcount_block) {
584         int wret;
585         wret = qcow2_cache_put(bs, s->refcount_block_cache,
586             (void**) &refcount_block);
587         if (wret < 0) {
588             return ret < 0 ? ret : wret;
589         }
590     }
591 
592     /*
593      * Try do undo any updates if an error is returned (This may succeed in
594      * some cases like ENOSPC for allocating a new refcount block)
595      */
596     if (ret < 0) {
597         int dummy;
598         dummy = update_refcount(bs, offset, cluster_offset - offset, -addend,
599                                 QCOW2_DISCARD_NEVER);
600         (void)dummy;
601     }
602 
603     return ret;
604 }
605 
606 /*
607  * Increases or decreases the refcount of a given cluster by one.
608  * addend must be 1 or -1.
609  *
610  * If the return value is non-negative, it is the new refcount of the cluster.
611  * If it is negative, it is -errno and indicates an error.
612  */
613 int qcow2_update_cluster_refcount(BlockDriverState *bs,
614                                   int64_t cluster_index,
615                                   int addend,
616                                   enum qcow2_discard_type type)
617 {
618     BDRVQcowState *s = bs->opaque;
619     int ret;
620 
621     ret = update_refcount(bs, cluster_index << s->cluster_bits, 1, addend,
622                           type);
623     if (ret < 0) {
624         return ret;
625     }
626 
627     return get_refcount(bs, cluster_index);
628 }
629 
630 
631 
632 /*********************************************************/
633 /* cluster allocation functions */
634 
635 
636 
637 /* return < 0 if error */
638 static int64_t alloc_clusters_noref(BlockDriverState *bs, uint64_t size)
639 {
640     BDRVQcowState *s = bs->opaque;
641     uint64_t i, nb_clusters;
642     int refcount;
643 
644     nb_clusters = size_to_clusters(s, size);
645 retry:
646     for(i = 0; i < nb_clusters; i++) {
647         uint64_t next_cluster_index = s->free_cluster_index++;
648         refcount = get_refcount(bs, next_cluster_index);
649 
650         if (refcount < 0) {
651             return refcount;
652         } else if (refcount != 0) {
653             goto retry;
654         }
655     }
656 
657     /* Make sure that all offsets in the "allocated" range are representable
658      * in an int64_t */
659     if (s->free_cluster_index > 0 &&
660         s->free_cluster_index - 1 > (INT64_MAX >> s->cluster_bits))
661     {
662         return -EFBIG;
663     }
664 
665 #ifdef DEBUG_ALLOC2
666     fprintf(stderr, "alloc_clusters: size=%" PRId64 " -> %" PRId64 "\n",
667             size,
668             (s->free_cluster_index - nb_clusters) << s->cluster_bits);
669 #endif
670     return (s->free_cluster_index - nb_clusters) << s->cluster_bits;
671 }
672 
673 int64_t qcow2_alloc_clusters(BlockDriverState *bs, uint64_t size)
674 {
675     int64_t offset;
676     int ret;
677 
678     BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC);
679     do {
680         offset = alloc_clusters_noref(bs, size);
681         if (offset < 0) {
682             return offset;
683         }
684 
685         ret = update_refcount(bs, offset, size, 1, QCOW2_DISCARD_NEVER);
686     } while (ret == -EAGAIN);
687 
688     if (ret < 0) {
689         return ret;
690     }
691 
692     return offset;
693 }
694 
695 int qcow2_alloc_clusters_at(BlockDriverState *bs, uint64_t offset,
696     int nb_clusters)
697 {
698     BDRVQcowState *s = bs->opaque;
699     uint64_t cluster_index;
700     uint64_t i;
701     int refcount, ret;
702 
703     assert(nb_clusters >= 0);
704     if (nb_clusters == 0) {
705         return 0;
706     }
707 
708     do {
709         /* Check how many clusters there are free */
710         cluster_index = offset >> s->cluster_bits;
711         for(i = 0; i < nb_clusters; i++) {
712             refcount = get_refcount(bs, cluster_index++);
713 
714             if (refcount < 0) {
715                 return refcount;
716             } else if (refcount != 0) {
717                 break;
718             }
719         }
720 
721         /* And then allocate them */
722         ret = update_refcount(bs, offset, i << s->cluster_bits, 1,
723                               QCOW2_DISCARD_NEVER);
724     } while (ret == -EAGAIN);
725 
726     if (ret < 0) {
727         return ret;
728     }
729 
730     return i;
731 }
732 
733 /* only used to allocate compressed sectors. We try to allocate
734    contiguous sectors. size must be <= cluster_size */
735 int64_t qcow2_alloc_bytes(BlockDriverState *bs, int size)
736 {
737     BDRVQcowState *s = bs->opaque;
738     int64_t offset, cluster_offset;
739     int free_in_cluster;
740 
741     BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC_BYTES);
742     assert(size > 0 && size <= s->cluster_size);
743     if (s->free_byte_offset == 0) {
744         offset = qcow2_alloc_clusters(bs, s->cluster_size);
745         if (offset < 0) {
746             return offset;
747         }
748         s->free_byte_offset = offset;
749     }
750  redo:
751     free_in_cluster = s->cluster_size -
752         offset_into_cluster(s, s->free_byte_offset);
753     if (size <= free_in_cluster) {
754         /* enough space in current cluster */
755         offset = s->free_byte_offset;
756         s->free_byte_offset += size;
757         free_in_cluster -= size;
758         if (free_in_cluster == 0)
759             s->free_byte_offset = 0;
760         if (offset_into_cluster(s, offset) != 0)
761             qcow2_update_cluster_refcount(bs, offset >> s->cluster_bits, 1,
762                                           QCOW2_DISCARD_NEVER);
763     } else {
764         offset = qcow2_alloc_clusters(bs, s->cluster_size);
765         if (offset < 0) {
766             return offset;
767         }
768         cluster_offset = start_of_cluster(s, s->free_byte_offset);
769         if ((cluster_offset + s->cluster_size) == offset) {
770             /* we are lucky: contiguous data */
771             offset = s->free_byte_offset;
772             qcow2_update_cluster_refcount(bs, offset >> s->cluster_bits, 1,
773                                           QCOW2_DISCARD_NEVER);
774             s->free_byte_offset += size;
775         } else {
776             s->free_byte_offset = offset;
777             goto redo;
778         }
779     }
780 
781     /* The cluster refcount was incremented, either by qcow2_alloc_clusters()
782      * or explicitly by qcow2_update_cluster_refcount().  Refcount blocks must
783      * be flushed before the caller's L2 table updates.
784      */
785     qcow2_cache_set_dependency(bs, s->l2_table_cache, s->refcount_block_cache);
786     return offset;
787 }
788 
789 void qcow2_free_clusters(BlockDriverState *bs,
790                           int64_t offset, int64_t size,
791                           enum qcow2_discard_type type)
792 {
793     int ret;
794 
795     BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_FREE);
796     ret = update_refcount(bs, offset, size, -1, type);
797     if (ret < 0) {
798         fprintf(stderr, "qcow2_free_clusters failed: %s\n", strerror(-ret));
799         /* TODO Remember the clusters to free them later and avoid leaking */
800     }
801 }
802 
803 /*
804  * Free a cluster using its L2 entry (handles clusters of all types, e.g.
805  * normal cluster, compressed cluster, etc.)
806  */
807 void qcow2_free_any_clusters(BlockDriverState *bs, uint64_t l2_entry,
808                              int nb_clusters, enum qcow2_discard_type type)
809 {
810     BDRVQcowState *s = bs->opaque;
811 
812     switch (qcow2_get_cluster_type(l2_entry)) {
813     case QCOW2_CLUSTER_COMPRESSED:
814         {
815             int nb_csectors;
816             nb_csectors = ((l2_entry >> s->csize_shift) &
817                            s->csize_mask) + 1;
818             qcow2_free_clusters(bs,
819                 (l2_entry & s->cluster_offset_mask) & ~511,
820                 nb_csectors * 512, type);
821         }
822         break;
823     case QCOW2_CLUSTER_NORMAL:
824     case QCOW2_CLUSTER_ZERO:
825         if (l2_entry & L2E_OFFSET_MASK) {
826             qcow2_free_clusters(bs, l2_entry & L2E_OFFSET_MASK,
827                                 nb_clusters << s->cluster_bits, type);
828         }
829         break;
830     case QCOW2_CLUSTER_UNALLOCATED:
831         break;
832     default:
833         abort();
834     }
835 }
836 
837 
838 
839 /*********************************************************/
840 /* snapshots and image creation */
841 
842 
843 
844 /* update the refcounts of snapshots and the copied flag */
845 int qcow2_update_snapshot_refcount(BlockDriverState *bs,
846     int64_t l1_table_offset, int l1_size, int addend)
847 {
848     BDRVQcowState *s = bs->opaque;
849     uint64_t *l1_table, *l2_table, l2_offset, offset, l1_size2, l1_allocated;
850     int64_t old_offset, old_l2_offset;
851     int i, j, l1_modified = 0, nb_csectors, refcount;
852     int ret;
853 
854     l2_table = NULL;
855     l1_table = NULL;
856     l1_size2 = l1_size * sizeof(uint64_t);
857 
858     s->cache_discards = true;
859 
860     /* WARNING: qcow2_snapshot_goto relies on this function not using the
861      * l1_table_offset when it is the current s->l1_table_offset! Be careful
862      * when changing this! */
863     if (l1_table_offset != s->l1_table_offset) {
864         l1_table = g_malloc0(align_offset(l1_size2, 512));
865         l1_allocated = 1;
866 
867         ret = bdrv_pread(bs->file, l1_table_offset, l1_table, l1_size2);
868         if (ret < 0) {
869             goto fail;
870         }
871 
872         for(i = 0;i < l1_size; i++)
873             be64_to_cpus(&l1_table[i]);
874     } else {
875         assert(l1_size == s->l1_size);
876         l1_table = s->l1_table;
877         l1_allocated = 0;
878     }
879 
880     for(i = 0; i < l1_size; i++) {
881         l2_offset = l1_table[i];
882         if (l2_offset) {
883             old_l2_offset = l2_offset;
884             l2_offset &= L1E_OFFSET_MASK;
885 
886             ret = qcow2_cache_get(bs, s->l2_table_cache, l2_offset,
887                 (void**) &l2_table);
888             if (ret < 0) {
889                 goto fail;
890             }
891 
892             for(j = 0; j < s->l2_size; j++) {
893                 uint64_t cluster_index;
894 
895                 offset = be64_to_cpu(l2_table[j]);
896                 old_offset = offset;
897                 offset &= ~QCOW_OFLAG_COPIED;
898 
899                 switch (qcow2_get_cluster_type(offset)) {
900                     case QCOW2_CLUSTER_COMPRESSED:
901                         nb_csectors = ((offset >> s->csize_shift) &
902                                        s->csize_mask) + 1;
903                         if (addend != 0) {
904                             ret = update_refcount(bs,
905                                 (offset & s->cluster_offset_mask) & ~511,
906                                 nb_csectors * 512, addend,
907                                 QCOW2_DISCARD_SNAPSHOT);
908                             if (ret < 0) {
909                                 goto fail;
910                             }
911                         }
912                         /* compressed clusters are never modified */
913                         refcount = 2;
914                         break;
915 
916                     case QCOW2_CLUSTER_NORMAL:
917                     case QCOW2_CLUSTER_ZERO:
918                         cluster_index = (offset & L2E_OFFSET_MASK) >> s->cluster_bits;
919                         if (!cluster_index) {
920                             /* unallocated */
921                             refcount = 0;
922                             break;
923                         }
924                         if (addend != 0) {
925                             refcount = qcow2_update_cluster_refcount(bs,
926                                     cluster_index, addend,
927                                     QCOW2_DISCARD_SNAPSHOT);
928                         } else {
929                             refcount = get_refcount(bs, cluster_index);
930                         }
931 
932                         if (refcount < 0) {
933                             ret = refcount;
934                             goto fail;
935                         }
936                         break;
937 
938                     case QCOW2_CLUSTER_UNALLOCATED:
939                         refcount = 0;
940                         break;
941 
942                     default:
943                         abort();
944                 }
945 
946                 if (refcount == 1) {
947                     offset |= QCOW_OFLAG_COPIED;
948                 }
949                 if (offset != old_offset) {
950                     if (addend > 0) {
951                         qcow2_cache_set_dependency(bs, s->l2_table_cache,
952                             s->refcount_block_cache);
953                     }
954                     l2_table[j] = cpu_to_be64(offset);
955                     qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table);
956                 }
957             }
958 
959             ret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
960             if (ret < 0) {
961                 goto fail;
962             }
963 
964 
965             if (addend != 0) {
966                 refcount = qcow2_update_cluster_refcount(bs, l2_offset >>
967                         s->cluster_bits, addend, QCOW2_DISCARD_SNAPSHOT);
968             } else {
969                 refcount = get_refcount(bs, l2_offset >> s->cluster_bits);
970             }
971             if (refcount < 0) {
972                 ret = refcount;
973                 goto fail;
974             } else if (refcount == 1) {
975                 l2_offset |= QCOW_OFLAG_COPIED;
976             }
977             if (l2_offset != old_l2_offset) {
978                 l1_table[i] = l2_offset;
979                 l1_modified = 1;
980             }
981         }
982     }
983 
984     ret = bdrv_flush(bs);
985 fail:
986     if (l2_table) {
987         qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
988     }
989 
990     s->cache_discards = false;
991     qcow2_process_discards(bs, ret);
992 
993     /* Update L1 only if it isn't deleted anyway (addend = -1) */
994     if (ret == 0 && addend >= 0 && l1_modified) {
995         for (i = 0; i < l1_size; i++) {
996             cpu_to_be64s(&l1_table[i]);
997         }
998 
999         ret = bdrv_pwrite_sync(bs->file, l1_table_offset, l1_table, l1_size2);
1000 
1001         for (i = 0; i < l1_size; i++) {
1002             be64_to_cpus(&l1_table[i]);
1003         }
1004     }
1005     if (l1_allocated)
1006         g_free(l1_table);
1007     return ret;
1008 }
1009 
1010 
1011 
1012 
1013 /*********************************************************/
1014 /* refcount checking functions */
1015 
1016 
1017 
1018 /*
1019  * Increases the refcount for a range of clusters in a given refcount table.
1020  * This is used to construct a temporary refcount table out of L1 and L2 tables
1021  * which can be compared the the refcount table saved in the image.
1022  *
1023  * Modifies the number of errors in res.
1024  */
1025 static void inc_refcounts(BlockDriverState *bs,
1026                           BdrvCheckResult *res,
1027                           uint16_t *refcount_table,
1028                           int refcount_table_size,
1029                           int64_t offset, int64_t size)
1030 {
1031     BDRVQcowState *s = bs->opaque;
1032     uint64_t start, last, cluster_offset, k;
1033 
1034     if (size <= 0)
1035         return;
1036 
1037     start = start_of_cluster(s, offset);
1038     last = start_of_cluster(s, offset + size - 1);
1039     for(cluster_offset = start; cluster_offset <= last;
1040         cluster_offset += s->cluster_size) {
1041         k = cluster_offset >> s->cluster_bits;
1042         if (k >= refcount_table_size) {
1043             fprintf(stderr, "Warning: cluster offset=0x%" PRIx64 " is after "
1044                 "the end of the image file, can't properly check refcounts.\n",
1045                 cluster_offset);
1046             res->check_errors++;
1047         } else {
1048             if (++refcount_table[k] == 0) {
1049                 fprintf(stderr, "ERROR: overflow cluster offset=0x%" PRIx64
1050                     "\n", cluster_offset);
1051                 res->corruptions++;
1052             }
1053         }
1054     }
1055 }
1056 
/* Flag bits accepted by check_refcounts_l1() and check_refcounts_l2() */
enum {
    /* update BlockFragInfo counters */
    CHECK_FRAG_INFO = (1 << 1),
};
1061 
1062 /*
1063  * Increases the refcount in the given refcount table for the all clusters
1064  * referenced in the L2 table. While doing so, performs some checks on L2
1065  * entries.
1066  *
1067  * Returns the number of errors found by the checks or -errno if an internal
1068  * error occurred.
1069  */
1070 static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res,
1071     uint16_t *refcount_table, int refcount_table_size, int64_t l2_offset,
1072     int flags)
1073 {
1074     BDRVQcowState *s = bs->opaque;
1075     uint64_t *l2_table, l2_entry;
1076     uint64_t next_contiguous_offset = 0;
1077     int i, l2_size, nb_csectors;
1078 
1079     /* Read L2 table from disk */
1080     l2_size = s->l2_size * sizeof(uint64_t);
1081     l2_table = g_malloc(l2_size);
1082 
1083     if (bdrv_pread(bs->file, l2_offset, l2_table, l2_size) != l2_size)
1084         goto fail;
1085 
1086     /* Do the actual checks */
1087     for(i = 0; i < s->l2_size; i++) {
1088         l2_entry = be64_to_cpu(l2_table[i]);
1089 
1090         switch (qcow2_get_cluster_type(l2_entry)) {
1091         case QCOW2_CLUSTER_COMPRESSED:
1092             /* Compressed clusters don't have QCOW_OFLAG_COPIED */
1093             if (l2_entry & QCOW_OFLAG_COPIED) {
1094                 fprintf(stderr, "ERROR: cluster %" PRId64 ": "
1095                     "copied flag must never be set for compressed "
1096                     "clusters\n", l2_entry >> s->cluster_bits);
1097                 l2_entry &= ~QCOW_OFLAG_COPIED;
1098                 res->corruptions++;
1099             }
1100 
1101             /* Mark cluster as used */
1102             nb_csectors = ((l2_entry >> s->csize_shift) &
1103                            s->csize_mask) + 1;
1104             l2_entry &= s->cluster_offset_mask;
1105             inc_refcounts(bs, res, refcount_table, refcount_table_size,
1106                 l2_entry & ~511, nb_csectors * 512);
1107 
1108             if (flags & CHECK_FRAG_INFO) {
1109                 res->bfi.allocated_clusters++;
1110                 res->bfi.compressed_clusters++;
1111 
1112                 /* Compressed clusters are fragmented by nature.  Since they
1113                  * take up sub-sector space but we only have sector granularity
1114                  * I/O we need to re-read the same sectors even for adjacent
1115                  * compressed clusters.
1116                  */
1117                 res->bfi.fragmented_clusters++;
1118             }
1119             break;
1120 
1121         case QCOW2_CLUSTER_ZERO:
1122             if ((l2_entry & L2E_OFFSET_MASK) == 0) {
1123                 break;
1124             }
1125             /* fall through */
1126 
1127         case QCOW2_CLUSTER_NORMAL:
1128         {
1129             uint64_t offset = l2_entry & L2E_OFFSET_MASK;
1130 
1131             if (flags & CHECK_FRAG_INFO) {
1132                 res->bfi.allocated_clusters++;
1133                 if (next_contiguous_offset &&
1134                     offset != next_contiguous_offset) {
1135                     res->bfi.fragmented_clusters++;
1136                 }
1137                 next_contiguous_offset = offset + s->cluster_size;
1138             }
1139 
1140             /* Mark cluster as used */
1141             inc_refcounts(bs, res, refcount_table,refcount_table_size,
1142                 offset, s->cluster_size);
1143 
1144             /* Correct offsets are cluster aligned */
1145             if (offset_into_cluster(s, offset)) {
1146                 fprintf(stderr, "ERROR offset=%" PRIx64 ": Cluster is not "
1147                     "properly aligned; L2 entry corrupted.\n", offset);
1148                 res->corruptions++;
1149             }
1150             break;
1151         }
1152 
1153         case QCOW2_CLUSTER_UNALLOCATED:
1154             break;
1155 
1156         default:
1157             abort();
1158         }
1159     }
1160 
1161     g_free(l2_table);
1162     return 0;
1163 
1164 fail:
1165     fprintf(stderr, "ERROR: I/O error in check_refcounts_l2\n");
1166     g_free(l2_table);
1167     return -EIO;
1168 }
1169 
1170 /*
1171  * Increases the refcount for the L1 table, its L2 tables and all referenced
1172  * clusters in the given refcount table. While doing so, performs some checks
1173  * on L1 and L2 entries.
1174  *
1175  * Returns the number of errors found by the checks or -errno if an internal
1176  * error occurred.
1177  */
1178 static int check_refcounts_l1(BlockDriverState *bs,
1179                               BdrvCheckResult *res,
1180                               uint16_t *refcount_table,
1181                               int refcount_table_size,
1182                               int64_t l1_table_offset, int l1_size,
1183                               int flags)
1184 {
1185     BDRVQcowState *s = bs->opaque;
1186     uint64_t *l1_table, l2_offset, l1_size2;
1187     int i, ret;
1188 
1189     l1_size2 = l1_size * sizeof(uint64_t);
1190 
1191     /* Mark L1 table as used */
1192     inc_refcounts(bs, res, refcount_table, refcount_table_size,
1193         l1_table_offset, l1_size2);
1194 
1195     /* Read L1 table entries from disk */
1196     if (l1_size2 == 0) {
1197         l1_table = NULL;
1198     } else {
1199         l1_table = g_malloc(l1_size2);
1200         if (bdrv_pread(bs->file, l1_table_offset,
1201                        l1_table, l1_size2) != l1_size2)
1202             goto fail;
1203         for(i = 0;i < l1_size; i++)
1204             be64_to_cpus(&l1_table[i]);
1205     }
1206 
1207     /* Do the actual checks */
1208     for(i = 0; i < l1_size; i++) {
1209         l2_offset = l1_table[i];
1210         if (l2_offset) {
1211             /* Mark L2 table as used */
1212             l2_offset &= L1E_OFFSET_MASK;
1213             inc_refcounts(bs, res, refcount_table, refcount_table_size,
1214                 l2_offset, s->cluster_size);
1215 
1216             /* L2 tables are cluster aligned */
1217             if (offset_into_cluster(s, l2_offset)) {
1218                 fprintf(stderr, "ERROR l2_offset=%" PRIx64 ": Table is not "
1219                     "cluster aligned; L1 entry corrupted\n", l2_offset);
1220                 res->corruptions++;
1221             }
1222 
1223             /* Process and check L2 entries */
1224             ret = check_refcounts_l2(bs, res, refcount_table,
1225                                      refcount_table_size, l2_offset, flags);
1226             if (ret < 0) {
1227                 goto fail;
1228             }
1229         }
1230     }
1231     g_free(l1_table);
1232     return 0;
1233 
1234 fail:
1235     fprintf(stderr, "ERROR: I/O error in check_refcounts_l1\n");
1236     res->check_errors++;
1237     g_free(l1_table);
1238     return -EIO;
1239 }
1240 
1241 /*
1242  * Checks the OFLAG_COPIED flag for all L1 and L2 entries.
1243  *
1244  * This function does not print an error message nor does it increment
1245  * check_errors if get_refcount fails (this is because such an error will have
1246  * been already detected and sufficiently signaled by the calling function
1247  * (qcow2_check_refcounts) by the time this function is called).
1248  */
1249 static int check_oflag_copied(BlockDriverState *bs, BdrvCheckResult *res,
1250                               BdrvCheckMode fix)
1251 {
1252     BDRVQcowState *s = bs->opaque;
1253     uint64_t *l2_table = qemu_blockalign(bs, s->cluster_size);
1254     int ret;
1255     int refcount;
1256     int i, j;
1257 
1258     for (i = 0; i < s->l1_size; i++) {
1259         uint64_t l1_entry = s->l1_table[i];
1260         uint64_t l2_offset = l1_entry & L1E_OFFSET_MASK;
1261         bool l2_dirty = false;
1262 
1263         if (!l2_offset) {
1264             continue;
1265         }
1266 
1267         refcount = get_refcount(bs, l2_offset >> s->cluster_bits);
1268         if (refcount < 0) {
1269             /* don't print message nor increment check_errors */
1270             continue;
1271         }
1272         if ((refcount == 1) != ((l1_entry & QCOW_OFLAG_COPIED) != 0)) {
1273             fprintf(stderr, "%s OFLAG_COPIED L2 cluster: l1_index=%d "
1274                     "l1_entry=%" PRIx64 " refcount=%d\n",
1275                     fix & BDRV_FIX_ERRORS ? "Repairing" :
1276                                             "ERROR",
1277                     i, l1_entry, refcount);
1278             if (fix & BDRV_FIX_ERRORS) {
1279                 s->l1_table[i] = refcount == 1
1280                                ? l1_entry |  QCOW_OFLAG_COPIED
1281                                : l1_entry & ~QCOW_OFLAG_COPIED;
1282                 ret = qcow2_write_l1_entry(bs, i);
1283                 if (ret < 0) {
1284                     res->check_errors++;
1285                     goto fail;
1286                 }
1287                 res->corruptions_fixed++;
1288             } else {
1289                 res->corruptions++;
1290             }
1291         }
1292 
1293         ret = bdrv_pread(bs->file, l2_offset, l2_table,
1294                          s->l2_size * sizeof(uint64_t));
1295         if (ret < 0) {
1296             fprintf(stderr, "ERROR: Could not read L2 table: %s\n",
1297                     strerror(-ret));
1298             res->check_errors++;
1299             goto fail;
1300         }
1301 
1302         for (j = 0; j < s->l2_size; j++) {
1303             uint64_t l2_entry = be64_to_cpu(l2_table[j]);
1304             uint64_t data_offset = l2_entry & L2E_OFFSET_MASK;
1305             int cluster_type = qcow2_get_cluster_type(l2_entry);
1306 
1307             if ((cluster_type == QCOW2_CLUSTER_NORMAL) ||
1308                 ((cluster_type == QCOW2_CLUSTER_ZERO) && (data_offset != 0))) {
1309                 refcount = get_refcount(bs, data_offset >> s->cluster_bits);
1310                 if (refcount < 0) {
1311                     /* don't print message nor increment check_errors */
1312                     continue;
1313                 }
1314                 if ((refcount == 1) != ((l2_entry & QCOW_OFLAG_COPIED) != 0)) {
1315                     fprintf(stderr, "%s OFLAG_COPIED data cluster: "
1316                             "l2_entry=%" PRIx64 " refcount=%d\n",
1317                             fix & BDRV_FIX_ERRORS ? "Repairing" :
1318                                                     "ERROR",
1319                             l2_entry, refcount);
1320                     if (fix & BDRV_FIX_ERRORS) {
1321                         l2_table[j] = cpu_to_be64(refcount == 1
1322                                     ? l2_entry |  QCOW_OFLAG_COPIED
1323                                     : l2_entry & ~QCOW_OFLAG_COPIED);
1324                         l2_dirty = true;
1325                         res->corruptions_fixed++;
1326                     } else {
1327                         res->corruptions++;
1328                     }
1329                 }
1330             }
1331         }
1332 
1333         if (l2_dirty) {
1334             ret = qcow2_pre_write_overlap_check(bs, QCOW2_OL_ACTIVE_L2,
1335                                                 l2_offset, s->cluster_size);
1336             if (ret < 0) {
1337                 fprintf(stderr, "ERROR: Could not write L2 table; metadata "
1338                         "overlap check failed: %s\n", strerror(-ret));
1339                 res->check_errors++;
1340                 goto fail;
1341             }
1342 
1343             ret = bdrv_pwrite(bs->file, l2_offset, l2_table, s->cluster_size);
1344             if (ret < 0) {
1345                 fprintf(stderr, "ERROR: Could not write L2 table: %s\n",
1346                         strerror(-ret));
1347                 res->check_errors++;
1348                 goto fail;
1349             }
1350         }
1351     }
1352 
1353     ret = 0;
1354 
1355 fail:
1356     qemu_vfree(l2_table);
1357     return ret;
1358 }
1359 
1360 /*
1361  * Writes one sector of the refcount table to the disk
1362  */
1363 #define RT_ENTRIES_PER_SECTOR (512 / sizeof(uint64_t))
1364 static int write_reftable_entry(BlockDriverState *bs, int rt_index)
1365 {
1366     BDRVQcowState *s = bs->opaque;
1367     uint64_t buf[RT_ENTRIES_PER_SECTOR];
1368     int rt_start_index;
1369     int i, ret;
1370 
1371     rt_start_index = rt_index & ~(RT_ENTRIES_PER_SECTOR - 1);
1372     for (i = 0; i < RT_ENTRIES_PER_SECTOR; i++) {
1373         buf[i] = cpu_to_be64(s->refcount_table[rt_start_index + i]);
1374     }
1375 
1376     ret = qcow2_pre_write_overlap_check(bs, QCOW2_OL_REFCOUNT_TABLE,
1377             s->refcount_table_offset + rt_start_index * sizeof(uint64_t),
1378             sizeof(buf));
1379     if (ret < 0) {
1380         return ret;
1381     }
1382 
1383     BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_UPDATE);
1384     ret = bdrv_pwrite_sync(bs->file, s->refcount_table_offset +
1385             rt_start_index * sizeof(uint64_t), buf, sizeof(buf));
1386     if (ret < 0) {
1387         return ret;
1388     }
1389 
1390     return 0;
1391 }
1392 
1393 /*
1394  * Allocates a new cluster for the given refcount block (represented by its
1395  * offset in the image file) and copies the current content there. This function
1396  * does _not_ decrement the reference count for the currently occupied cluster.
1397  *
1398  * This function prints an informative message to stderr on error (and returns
1399  * -errno); on success, the offset of the newly allocated cluster is returned.
1400  */
1401 static int64_t realloc_refcount_block(BlockDriverState *bs, int reftable_index,
1402                                       uint64_t offset)
1403 {
1404     BDRVQcowState *s = bs->opaque;
1405     int64_t new_offset = 0;
1406     void *refcount_block = NULL;
1407     int ret;
1408 
1409     /* allocate new refcount block */
1410     new_offset = qcow2_alloc_clusters(bs, s->cluster_size);
1411     if (new_offset < 0) {
1412         fprintf(stderr, "Could not allocate new cluster: %s\n",
1413                 strerror(-new_offset));
1414         ret = new_offset;
1415         goto done;
1416     }
1417 
1418     /* fetch current refcount block content */
1419     ret = qcow2_cache_get(bs, s->refcount_block_cache, offset, &refcount_block);
1420     if (ret < 0) {
1421         fprintf(stderr, "Could not fetch refcount block: %s\n", strerror(-ret));
1422         goto fail_free_cluster;
1423     }
1424 
1425     /* new block has not yet been entered into refcount table, therefore it is
1426      * no refcount block yet (regarding this check) */
1427     ret = qcow2_pre_write_overlap_check(bs, 0, new_offset, s->cluster_size);
1428     if (ret < 0) {
1429         fprintf(stderr, "Could not write refcount block; metadata overlap "
1430                 "check failed: %s\n", strerror(-ret));
1431         /* the image will be marked corrupt, so don't even attempt on freeing
1432          * the cluster */
1433         goto done;
1434     }
1435 
1436     /* write to new block */
1437     ret = bdrv_write(bs->file, new_offset / BDRV_SECTOR_SIZE, refcount_block,
1438             s->cluster_sectors);
1439     if (ret < 0) {
1440         fprintf(stderr, "Could not write refcount block: %s\n", strerror(-ret));
1441         goto fail_free_cluster;
1442     }
1443 
1444     /* update refcount table */
1445     assert(!offset_into_cluster(s, new_offset));
1446     s->refcount_table[reftable_index] = new_offset;
1447     ret = write_reftable_entry(bs, reftable_index);
1448     if (ret < 0) {
1449         fprintf(stderr, "Could not update refcount table: %s\n",
1450                 strerror(-ret));
1451         goto fail_free_cluster;
1452     }
1453 
1454     goto done;
1455 
1456 fail_free_cluster:
1457     qcow2_free_clusters(bs, new_offset, s->cluster_size, QCOW2_DISCARD_OTHER);
1458 
1459 done:
1460     if (refcount_block) {
1461         /* This should never fail, as it would only do so if the given refcount
1462          * block cannot be found in the cache. As this is impossible as long as
1463          * there are no bugs, assert the success. */
1464         int tmp = qcow2_cache_put(bs, s->refcount_block_cache, &refcount_block);
1465         assert(tmp == 0);
1466     }
1467 
1468     if (ret < 0) {
1469         return ret;
1470     }
1471 
1472     return new_offset;
1473 }
1474 
1475 /*
1476  * Checks an image for refcount consistency.
1477  *
1478  * Returns 0 if no errors are found, the number of errors in case the image is
1479  * detected as corrupted, and -errno when an internal error occurred.
1480  */
1481 int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
1482                           BdrvCheckMode fix)
1483 {
1484     BDRVQcowState *s = bs->opaque;
1485     int64_t size, i, highest_cluster, nb_clusters;
1486     int refcount1, refcount2;
1487     QCowSnapshot *sn;
1488     uint16_t *refcount_table;
1489     int ret;
1490 
1491     size = bdrv_getlength(bs->file);
1492     if (size < 0) {
1493         res->check_errors++;
1494         return size;
1495     }
1496 
1497     nb_clusters = size_to_clusters(s, size);
1498     if (nb_clusters > INT_MAX) {
1499         res->check_errors++;
1500         return -EFBIG;
1501     }
1502 
1503     refcount_table = g_malloc0(nb_clusters * sizeof(uint16_t));
1504 
1505     res->bfi.total_clusters =
1506         size_to_clusters(s, bs->total_sectors * BDRV_SECTOR_SIZE);
1507 
1508     /* header */
1509     inc_refcounts(bs, res, refcount_table, nb_clusters,
1510         0, s->cluster_size);
1511 
1512     /* current L1 table */
1513     ret = check_refcounts_l1(bs, res, refcount_table, nb_clusters,
1514                              s->l1_table_offset, s->l1_size, CHECK_FRAG_INFO);
1515     if (ret < 0) {
1516         goto fail;
1517     }
1518 
1519     /* snapshots */
1520     for(i = 0; i < s->nb_snapshots; i++) {
1521         sn = s->snapshots + i;
1522         ret = check_refcounts_l1(bs, res, refcount_table, nb_clusters,
1523             sn->l1_table_offset, sn->l1_size, 0);
1524         if (ret < 0) {
1525             goto fail;
1526         }
1527     }
1528     inc_refcounts(bs, res, refcount_table, nb_clusters,
1529         s->snapshots_offset, s->snapshots_size);
1530 
1531     /* refcount data */
1532     inc_refcounts(bs, res, refcount_table, nb_clusters,
1533         s->refcount_table_offset,
1534         s->refcount_table_size * sizeof(uint64_t));
1535 
1536     for(i = 0; i < s->refcount_table_size; i++) {
1537         uint64_t offset, cluster;
1538         offset = s->refcount_table[i];
1539         cluster = offset >> s->cluster_bits;
1540 
1541         /* Refcount blocks are cluster aligned */
1542         if (offset_into_cluster(s, offset)) {
1543             fprintf(stderr, "ERROR refcount block %" PRId64 " is not "
1544                 "cluster aligned; refcount table entry corrupted\n", i);
1545             res->corruptions++;
1546             continue;
1547         }
1548 
1549         if (cluster >= nb_clusters) {
1550             fprintf(stderr, "ERROR refcount block %" PRId64
1551                     " is outside image\n", i);
1552             res->corruptions++;
1553             continue;
1554         }
1555 
1556         if (offset != 0) {
1557             inc_refcounts(bs, res, refcount_table, nb_clusters,
1558                 offset, s->cluster_size);
1559             if (refcount_table[cluster] != 1) {
1560                 fprintf(stderr, "%s refcount block %" PRId64
1561                     " refcount=%d\n",
1562                     fix & BDRV_FIX_ERRORS ? "Repairing" :
1563                                             "ERROR",
1564                     i, refcount_table[cluster]);
1565 
1566                 if (fix & BDRV_FIX_ERRORS) {
1567                     int64_t new_offset;
1568 
1569                     new_offset = realloc_refcount_block(bs, i, offset);
1570                     if (new_offset < 0) {
1571                         res->corruptions++;
1572                         continue;
1573                     }
1574 
1575                     /* update refcounts */
1576                     if ((new_offset >> s->cluster_bits) >= nb_clusters) {
1577                         /* increase refcount_table size if necessary */
1578                         int old_nb_clusters = nb_clusters;
1579                         nb_clusters = (new_offset >> s->cluster_bits) + 1;
1580                         refcount_table = g_realloc(refcount_table,
1581                                 nb_clusters * sizeof(uint16_t));
1582                         memset(&refcount_table[old_nb_clusters], 0, (nb_clusters
1583                                 - old_nb_clusters) * sizeof(uint16_t));
1584                     }
1585                     refcount_table[cluster]--;
1586                     inc_refcounts(bs, res, refcount_table, nb_clusters,
1587                             new_offset, s->cluster_size);
1588 
1589                     res->corruptions_fixed++;
1590                 } else {
1591                     res->corruptions++;
1592                 }
1593             }
1594         }
1595     }
1596 
1597     /* compare ref counts */
1598     for (i = 0, highest_cluster = 0; i < nb_clusters; i++) {
1599         refcount1 = get_refcount(bs, i);
1600         if (refcount1 < 0) {
1601             fprintf(stderr, "Can't get refcount for cluster %" PRId64 ": %s\n",
1602                 i, strerror(-refcount1));
1603             res->check_errors++;
1604             continue;
1605         }
1606 
1607         refcount2 = refcount_table[i];
1608 
1609         if (refcount1 > 0 || refcount2 > 0) {
1610             highest_cluster = i;
1611         }
1612 
1613         if (refcount1 != refcount2) {
1614 
1615             /* Check if we're allowed to fix the mismatch */
1616             int *num_fixed = NULL;
1617             if (refcount1 > refcount2 && (fix & BDRV_FIX_LEAKS)) {
1618                 num_fixed = &res->leaks_fixed;
1619             } else if (refcount1 < refcount2 && (fix & BDRV_FIX_ERRORS)) {
1620                 num_fixed = &res->corruptions_fixed;
1621             }
1622 
1623             fprintf(stderr, "%s cluster %" PRId64 " refcount=%d reference=%d\n",
1624                    num_fixed != NULL     ? "Repairing" :
1625                    refcount1 < refcount2 ? "ERROR" :
1626                                            "Leaked",
1627                    i, refcount1, refcount2);
1628 
1629             if (num_fixed) {
1630                 ret = update_refcount(bs, i << s->cluster_bits, 1,
1631                                       refcount2 - refcount1,
1632                                       QCOW2_DISCARD_ALWAYS);
1633                 if (ret >= 0) {
1634                     (*num_fixed)++;
1635                     continue;
1636                 }
1637             }
1638 
1639             /* And if we couldn't, print an error */
1640             if (refcount1 < refcount2) {
1641                 res->corruptions++;
1642             } else {
1643                 res->leaks++;
1644             }
1645         }
1646     }
1647 
1648     /* check OFLAG_COPIED */
1649     ret = check_oflag_copied(bs, res, fix);
1650     if (ret < 0) {
1651         goto fail;
1652     }
1653 
1654     res->image_end_offset = (highest_cluster + 1) * s->cluster_size;
1655     ret = 0;
1656 
1657 fail:
1658     g_free(refcount_table);
1659 
1660     return ret;
1661 }
1662 
/* Shorthand that passes the `offset` and `size` variables visible at the
 * point of expansion, together with (ofs, sz), to ranges_overlap() */
#define overlaps_with(ofs, sz) ranges_overlap(offset, size, ofs, sz)
1665 
1666 /*
1667  * Checks if the given offset into the image file is actually free to use by
1668  * looking for overlaps with important metadata sections (L1/L2 tables etc.),
1669  * i.e. a sanity check without relying on the refcount tables.
1670  *
1671  * The ign parameter specifies what checks not to perform (being a bitmask of
1672  * QCow2MetadataOverlap values), i.e., what sections to ignore.
1673  *
1674  * Returns:
1675  * - 0 if writing to this offset will not affect the mentioned metadata
1676  * - a positive QCow2MetadataOverlap value indicating one overlapping section
1677  * - a negative value (-errno) indicating an error while performing a check,
1678  *   e.g. when bdrv_read failed on QCOW2_OL_INACTIVE_L2
1679  */
1680 int qcow2_check_metadata_overlap(BlockDriverState *bs, int ign, int64_t offset,
1681                                  int64_t size)
1682 {
1683     BDRVQcowState *s = bs->opaque;
1684     int chk = s->overlap_check & ~ign;
1685     int i, j;
1686 
1687     if (!size) {
1688         return 0;
1689     }
1690 
1691     if (chk & QCOW2_OL_MAIN_HEADER) {
1692         if (offset < s->cluster_size) {
1693             return QCOW2_OL_MAIN_HEADER;
1694         }
1695     }
1696 
1697     /* align range to test to cluster boundaries */
1698     size = align_offset(offset_into_cluster(s, offset) + size, s->cluster_size);
1699     offset = start_of_cluster(s, offset);
1700 
1701     if ((chk & QCOW2_OL_ACTIVE_L1) && s->l1_size) {
1702         if (overlaps_with(s->l1_table_offset, s->l1_size * sizeof(uint64_t))) {
1703             return QCOW2_OL_ACTIVE_L1;
1704         }
1705     }
1706 
1707     if ((chk & QCOW2_OL_REFCOUNT_TABLE) && s->refcount_table_size) {
1708         if (overlaps_with(s->refcount_table_offset,
1709             s->refcount_table_size * sizeof(uint64_t))) {
1710             return QCOW2_OL_REFCOUNT_TABLE;
1711         }
1712     }
1713 
1714     if ((chk & QCOW2_OL_SNAPSHOT_TABLE) && s->snapshots_size) {
1715         if (overlaps_with(s->snapshots_offset, s->snapshots_size)) {
1716             return QCOW2_OL_SNAPSHOT_TABLE;
1717         }
1718     }
1719 
1720     if ((chk & QCOW2_OL_INACTIVE_L1) && s->snapshots) {
1721         for (i = 0; i < s->nb_snapshots; i++) {
1722             if (s->snapshots[i].l1_size &&
1723                 overlaps_with(s->snapshots[i].l1_table_offset,
1724                 s->snapshots[i].l1_size * sizeof(uint64_t))) {
1725                 return QCOW2_OL_INACTIVE_L1;
1726             }
1727         }
1728     }
1729 
1730     if ((chk & QCOW2_OL_ACTIVE_L2) && s->l1_table) {
1731         for (i = 0; i < s->l1_size; i++) {
1732             if ((s->l1_table[i] & L1E_OFFSET_MASK) &&
1733                 overlaps_with(s->l1_table[i] & L1E_OFFSET_MASK,
1734                 s->cluster_size)) {
1735                 return QCOW2_OL_ACTIVE_L2;
1736             }
1737         }
1738     }
1739 
1740     if ((chk & QCOW2_OL_REFCOUNT_BLOCK) && s->refcount_table) {
1741         for (i = 0; i < s->refcount_table_size; i++) {
1742             if ((s->refcount_table[i] & REFT_OFFSET_MASK) &&
1743                 overlaps_with(s->refcount_table[i] & REFT_OFFSET_MASK,
1744                 s->cluster_size)) {
1745                 return QCOW2_OL_REFCOUNT_BLOCK;
1746             }
1747         }
1748     }
1749 
1750     if ((chk & QCOW2_OL_INACTIVE_L2) && s->snapshots) {
1751         for (i = 0; i < s->nb_snapshots; i++) {
1752             uint64_t l1_ofs = s->snapshots[i].l1_table_offset;
1753             uint32_t l1_sz  = s->snapshots[i].l1_size;
1754             uint64_t l1_sz2 = l1_sz * sizeof(uint64_t);
1755             uint64_t *l1 = g_malloc(l1_sz2);
1756             int ret;
1757 
1758             ret = bdrv_pread(bs->file, l1_ofs, l1, l1_sz2);
1759             if (ret < 0) {
1760                 g_free(l1);
1761                 return ret;
1762             }
1763 
1764             for (j = 0; j < l1_sz; j++) {
1765                 uint64_t l2_ofs = be64_to_cpu(l1[j]) & L1E_OFFSET_MASK;
1766                 if (l2_ofs && overlaps_with(l2_ofs, s->cluster_size)) {
1767                     g_free(l1);
1768                     return QCOW2_OL_INACTIVE_L2;
1769                 }
1770             }
1771 
1772             g_free(l1);
1773         }
1774     }
1775 
1776     return 0;
1777 }
1778 
1779 static const char *metadata_ol_names[] = {
1780     [QCOW2_OL_MAIN_HEADER_BITNR]    = "qcow2_header",
1781     [QCOW2_OL_ACTIVE_L1_BITNR]      = "active L1 table",
1782     [QCOW2_OL_ACTIVE_L2_BITNR]      = "active L2 table",
1783     [QCOW2_OL_REFCOUNT_TABLE_BITNR] = "refcount table",
1784     [QCOW2_OL_REFCOUNT_BLOCK_BITNR] = "refcount block",
1785     [QCOW2_OL_SNAPSHOT_TABLE_BITNR] = "snapshot table",
1786     [QCOW2_OL_INACTIVE_L1_BITNR]    = "inactive L1 table",
1787     [QCOW2_OL_INACTIVE_L2_BITNR]    = "inactive L2 table",
1788 };
1789 
1790 /*
1791  * First performs a check for metadata overlaps (through
1792  * qcow2_check_metadata_overlap); if that fails with a negative value (error
1793  * while performing a check), that value is returned. If an impending overlap
1794  * is detected, the BDS will be made unusable, the qcow2 file marked corrupt
1795  * and -EIO returned.
1796  *
1797  * Returns 0 if there were neither overlaps nor errors while checking for
1798  * overlaps; or a negative value (-errno) on error.
1799  */
1800 int qcow2_pre_write_overlap_check(BlockDriverState *bs, int ign, int64_t offset,
1801                                   int64_t size)
1802 {
1803     int ret = qcow2_check_metadata_overlap(bs, ign, offset, size);
1804 
1805     if (ret < 0) {
1806         return ret;
1807     } else if (ret > 0) {
1808         int metadata_ol_bitnr = ffs(ret) - 1;
1809         char *message;
1810         QObject *data;
1811 
1812         assert(metadata_ol_bitnr < QCOW2_OL_MAX_BITNR);
1813 
1814         fprintf(stderr, "qcow2: Preventing invalid write on metadata (overlaps "
1815                 "with %s); image marked as corrupt.\n",
1816                 metadata_ol_names[metadata_ol_bitnr]);
1817         message = g_strdup_printf("Prevented %s overwrite",
1818                 metadata_ol_names[metadata_ol_bitnr]);
1819         data = qobject_from_jsonf("{ 'device': %s, 'msg': %s, 'offset': %"
1820                 PRId64 ", 'size': %" PRId64 " }", bs->device_name, message,
1821                 offset, size);
1822         monitor_protocol_event(QEVENT_BLOCK_IMAGE_CORRUPTED, data);
1823         g_free(message);
1824         qobject_decref(data);
1825 
1826         qcow2_mark_corrupt(bs);
1827         bs->drv = NULL; /* make BDS unusable */
1828         return -EIO;
1829     }
1830 
1831     return 0;
1832 }
1833