xref: /openbmc/qemu/block/qcow2-refcount.c (revision dffacd46)
1 /*
2  * Block driver for the QCOW version 2 format
3  *
4  * Copyright (c) 2004-2006 Fabrice Bellard
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to deal
8  * in the Software without restriction, including without limitation the rights
9  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10  * copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in
14  * all copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22  * THE SOFTWARE.
23  */
24 
25 #include "qemu-common.h"
26 #include "block/block_int.h"
27 #include "block/qcow2.h"
28 #include "qemu/range.h"
29 #include "qapi/qmp/types.h"
30 
31 static int64_t alloc_clusters_noref(BlockDriverState *bs, uint64_t size);
32 static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
33                             int64_t offset, int64_t length,
34                             int addend, enum qcow2_discard_type type);
35 
36 
37 /*********************************************************/
38 /* refcount handling */
39 
40 int qcow2_refcount_init(BlockDriverState *bs)
41 {
42     BDRVQcowState *s = bs->opaque;
43     unsigned int refcount_table_size2, i;
44     int ret;
45 
46     assert(s->refcount_table_size <= INT_MAX / sizeof(uint64_t));
47     refcount_table_size2 = s->refcount_table_size * sizeof(uint64_t);
48     s->refcount_table = g_malloc(refcount_table_size2);
49     if (s->refcount_table_size > 0) {
50         BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_LOAD);
51         ret = bdrv_pread(bs->file, s->refcount_table_offset,
52                          s->refcount_table, refcount_table_size2);
53         if (ret != refcount_table_size2)
54             goto fail;
55         for(i = 0; i < s->refcount_table_size; i++)
56             be64_to_cpus(&s->refcount_table[i]);
57     }
58     return 0;
59  fail:
60     return -ENOMEM;
61 }
62 
63 void qcow2_refcount_close(BlockDriverState *bs)
64 {
65     BDRVQcowState *s = bs->opaque;
66     g_free(s->refcount_table);
67 }
68 
69 
70 static int load_refcount_block(BlockDriverState *bs,
71                                int64_t refcount_block_offset,
72                                void **refcount_block)
73 {
74     BDRVQcowState *s = bs->opaque;
75     int ret;
76 
77     BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_LOAD);
78     ret = qcow2_cache_get(bs, s->refcount_block_cache, refcount_block_offset,
79         refcount_block);
80 
81     return ret;
82 }
83 
84 /*
85  * Returns the refcount of the cluster given by its index. Any non-negative
86  * return value is the refcount of the cluster, negative values are -errno
87  * and indicate an error.
88  */
89 static int get_refcount(BlockDriverState *bs, int64_t cluster_index)
90 {
91     BDRVQcowState *s = bs->opaque;
92     uint64_t refcount_table_index, block_index;
93     int64_t refcount_block_offset;
94     int ret;
95     uint16_t *refcount_block;
96     uint16_t refcount;
97 
98     refcount_table_index = cluster_index >> (s->cluster_bits - REFCOUNT_SHIFT);
99     if (refcount_table_index >= s->refcount_table_size)
100         return 0;
101     refcount_block_offset =
102         s->refcount_table[refcount_table_index] & REFT_OFFSET_MASK;
103     if (!refcount_block_offset)
104         return 0;
105 
106     ret = qcow2_cache_get(bs, s->refcount_block_cache, refcount_block_offset,
107         (void**) &refcount_block);
108     if (ret < 0) {
109         return ret;
110     }
111 
112     block_index = cluster_index &
113         ((1 << (s->cluster_bits - REFCOUNT_SHIFT)) - 1);
114     refcount = be16_to_cpu(refcount_block[block_index]);
115 
116     ret = qcow2_cache_put(bs, s->refcount_block_cache,
117         (void**) &refcount_block);
118     if (ret < 0) {
119         return ret;
120     }
121 
122     return refcount;
123 }
124 
125 /*
126  * Rounds the refcount table size up to avoid growing the table for each single
127  * refcount block that is allocated.
128  */
129 static unsigned int next_refcount_table_size(BDRVQcowState *s,
130     unsigned int min_size)
131 {
132     unsigned int min_clusters = (min_size >> (s->cluster_bits - 3)) + 1;
133     unsigned int refcount_table_clusters =
134         MAX(1, s->refcount_table_size >> (s->cluster_bits - 3));
135 
136     while (min_clusters > refcount_table_clusters) {
137         refcount_table_clusters = (refcount_table_clusters * 3 + 1) / 2;
138     }
139 
140     return refcount_table_clusters << (s->cluster_bits - 3);
141 }
142 
143 
144 /* Checks if two offsets are described by the same refcount block */
145 static int in_same_refcount_block(BDRVQcowState *s, uint64_t offset_a,
146     uint64_t offset_b)
147 {
148     uint64_t block_a = offset_a >> (2 * s->cluster_bits - REFCOUNT_SHIFT);
149     uint64_t block_b = offset_b >> (2 * s->cluster_bits - REFCOUNT_SHIFT);
150 
151     return (block_a == block_b);
152 }
153 
154 /*
155  * Loads a refcount block. If it doesn't exist yet, it is allocated first
156  * (including growing the refcount table if needed).
157  *
158  * Returns 0 on success or -errno in error case
159  */
160 static int alloc_refcount_block(BlockDriverState *bs,
161     int64_t cluster_index, uint16_t **refcount_block)
162 {
163     BDRVQcowState *s = bs->opaque;
164     unsigned int refcount_table_index;
165     int ret;
166 
167     BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC);
168 
169     /* Find the refcount block for the given cluster */
170     refcount_table_index = cluster_index >> (s->cluster_bits - REFCOUNT_SHIFT);
171 
172     if (refcount_table_index < s->refcount_table_size) {
173 
174         uint64_t refcount_block_offset =
175             s->refcount_table[refcount_table_index] & REFT_OFFSET_MASK;
176 
177         /* If it's already there, we're done */
178         if (refcount_block_offset) {
179              return load_refcount_block(bs, refcount_block_offset,
180                  (void**) refcount_block);
181         }
182     }
183 
184     /*
185      * If we came here, we need to allocate something. Something is at least
186      * a cluster for the new refcount block. It may also include a new refcount
187      * table if the old refcount table is too small.
188      *
189      * Note that allocating clusters here needs some special care:
190      *
191      * - We can't use the normal qcow2_alloc_clusters(), it would try to
192      *   increase the refcount and very likely we would end up with an endless
193      *   recursion. Instead we must place the refcount blocks in a way that
194      *   they can describe them themselves.
195      *
196      * - We need to consider that at this point we are inside update_refcounts
197      *   and potentially doing an initial refcount increase. This means that
198      *   some clusters have already been allocated by the caller, but their
199      *   refcount isn't accurate yet. If we allocate clusters for metadata, we
200      *   need to return -EAGAIN to signal the caller that it needs to restart
201      *   the search for free clusters.
202      *
203      * - alloc_clusters_noref and qcow2_free_clusters may load a different
204      *   refcount block into the cache
205      */
206 
207     *refcount_block = NULL;
208 
209     /* We write to the refcount table, so we might depend on L2 tables */
210     ret = qcow2_cache_flush(bs, s->l2_table_cache);
211     if (ret < 0) {
212         return ret;
213     }
214 
215     /* Allocate the refcount block itself and mark it as used */
216     int64_t new_block = alloc_clusters_noref(bs, s->cluster_size);
217     if (new_block < 0) {
218         return new_block;
219     }
220 
221 #ifdef DEBUG_ALLOC2
222     fprintf(stderr, "qcow2: Allocate refcount block %d for %" PRIx64
223         " at %" PRIx64 "\n",
224         refcount_table_index, cluster_index << s->cluster_bits, new_block);
225 #endif
226 
227     if (in_same_refcount_block(s, new_block, cluster_index << s->cluster_bits)) {
228         /* Zero the new refcount block before updating it */
229         ret = qcow2_cache_get_empty(bs, s->refcount_block_cache, new_block,
230             (void**) refcount_block);
231         if (ret < 0) {
232             goto fail_block;
233         }
234 
235         memset(*refcount_block, 0, s->cluster_size);
236 
237         /* The block describes itself, need to update the cache */
238         int block_index = (new_block >> s->cluster_bits) &
239             ((1 << (s->cluster_bits - REFCOUNT_SHIFT)) - 1);
240         (*refcount_block)[block_index] = cpu_to_be16(1);
241     } else {
242         /* Described somewhere else. This can recurse at most twice before we
243          * arrive at a block that describes itself. */
244         ret = update_refcount(bs, new_block, s->cluster_size, 1,
245                               QCOW2_DISCARD_NEVER);
246         if (ret < 0) {
247             goto fail_block;
248         }
249 
250         ret = qcow2_cache_flush(bs, s->refcount_block_cache);
251         if (ret < 0) {
252             goto fail_block;
253         }
254 
255         /* Initialize the new refcount block only after updating its refcount,
256          * update_refcount uses the refcount cache itself */
257         ret = qcow2_cache_get_empty(bs, s->refcount_block_cache, new_block,
258             (void**) refcount_block);
259         if (ret < 0) {
260             goto fail_block;
261         }
262 
263         memset(*refcount_block, 0, s->cluster_size);
264     }
265 
266     /* Now the new refcount block needs to be written to disk */
267     BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE);
268     qcow2_cache_entry_mark_dirty(s->refcount_block_cache, *refcount_block);
269     ret = qcow2_cache_flush(bs, s->refcount_block_cache);
270     if (ret < 0) {
271         goto fail_block;
272     }
273 
274     /* If the refcount table is big enough, just hook the block up there */
275     if (refcount_table_index < s->refcount_table_size) {
276         uint64_t data64 = cpu_to_be64(new_block);
277         BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_HOOKUP);
278         ret = bdrv_pwrite_sync(bs->file,
279             s->refcount_table_offset + refcount_table_index * sizeof(uint64_t),
280             &data64, sizeof(data64));
281         if (ret < 0) {
282             goto fail_block;
283         }
284 
285         s->refcount_table[refcount_table_index] = new_block;
286 
287         /* The new refcount block may be where the caller intended to put its
288          * data, so let it restart the search. */
289         return -EAGAIN;
290     }
291 
292     ret = qcow2_cache_put(bs, s->refcount_block_cache, (void**) refcount_block);
293     if (ret < 0) {
294         goto fail_block;
295     }
296 
297     /*
298      * If we come here, we need to grow the refcount table. Again, a new
299      * refcount table needs some space and we can't simply allocate to avoid
300      * endless recursion.
301      *
302      * Therefore let's grab new refcount blocks at the end of the image, which
303      * will describe themselves and the new refcount table. This way we can
304      * reference them only in the new table and do the switch to the new
305      * refcount table at once without producing an inconsistent state in
306      * between.
307      */
308     BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_GROW);
309 
310     /* Calculate the number of refcount blocks needed so far */
311     uint64_t refcount_block_clusters = 1 << (s->cluster_bits - REFCOUNT_SHIFT);
312     uint64_t blocks_used = DIV_ROUND_UP(cluster_index, refcount_block_clusters);
313 
314     if (blocks_used > QCOW_MAX_REFTABLE_SIZE / sizeof(uint64_t)) {
315         return -EFBIG;
316     }
317 
318     /* And now we need at least one block more for the new metadata */
319     uint64_t table_size = next_refcount_table_size(s, blocks_used + 1);
320     uint64_t last_table_size;
321     uint64_t blocks_clusters;
322     do {
323         uint64_t table_clusters =
324             size_to_clusters(s, table_size * sizeof(uint64_t));
325         blocks_clusters = 1 +
326             ((table_clusters + refcount_block_clusters - 1)
327             / refcount_block_clusters);
328         uint64_t meta_clusters = table_clusters + blocks_clusters;
329 
330         last_table_size = table_size;
331         table_size = next_refcount_table_size(s, blocks_used +
332             ((meta_clusters + refcount_block_clusters - 1)
333             / refcount_block_clusters));
334 
335     } while (last_table_size != table_size);
336 
337 #ifdef DEBUG_ALLOC2
338     fprintf(stderr, "qcow2: Grow refcount table %" PRId32 " => %" PRId64 "\n",
339         s->refcount_table_size, table_size);
340 #endif
341 
342     /* Create the new refcount table and blocks */
343     uint64_t meta_offset = (blocks_used * refcount_block_clusters) *
344         s->cluster_size;
345     uint64_t table_offset = meta_offset + blocks_clusters * s->cluster_size;
346     uint16_t *new_blocks = g_malloc0(blocks_clusters * s->cluster_size);
347     uint64_t *new_table = g_malloc0(table_size * sizeof(uint64_t));
348 
349     /* Fill the new refcount table */
350     memcpy(new_table, s->refcount_table,
351         s->refcount_table_size * sizeof(uint64_t));
352     new_table[refcount_table_index] = new_block;
353 
354     int i;
355     for (i = 0; i < blocks_clusters; i++) {
356         new_table[blocks_used + i] = meta_offset + (i * s->cluster_size);
357     }
358 
359     /* Fill the refcount blocks */
360     uint64_t table_clusters = size_to_clusters(s, table_size * sizeof(uint64_t));
361     int block = 0;
362     for (i = 0; i < table_clusters + blocks_clusters; i++) {
363         new_blocks[block++] = cpu_to_be16(1);
364     }
365 
366     /* Write refcount blocks to disk */
367     BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE_BLOCKS);
368     ret = bdrv_pwrite_sync(bs->file, meta_offset, new_blocks,
369         blocks_clusters * s->cluster_size);
370     g_free(new_blocks);
371     if (ret < 0) {
372         goto fail_table;
373     }
374 
375     /* Write refcount table to disk */
376     for(i = 0; i < table_size; i++) {
377         cpu_to_be64s(&new_table[i]);
378     }
379 
380     BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE_TABLE);
381     ret = bdrv_pwrite_sync(bs->file, table_offset, new_table,
382         table_size * sizeof(uint64_t));
383     if (ret < 0) {
384         goto fail_table;
385     }
386 
387     for(i = 0; i < table_size; i++) {
388         be64_to_cpus(&new_table[i]);
389     }
390 
391     /* Hook up the new refcount table in the qcow2 header */
392     uint8_t data[12];
393     cpu_to_be64w((uint64_t*)data, table_offset);
394     cpu_to_be32w((uint32_t*)(data + 8), table_clusters);
395     BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_SWITCH_TABLE);
396     ret = bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, refcount_table_offset),
397         data, sizeof(data));
398     if (ret < 0) {
399         goto fail_table;
400     }
401 
402     /* And switch it in memory */
403     uint64_t old_table_offset = s->refcount_table_offset;
404     uint64_t old_table_size = s->refcount_table_size;
405 
406     g_free(s->refcount_table);
407     s->refcount_table = new_table;
408     s->refcount_table_size = table_size;
409     s->refcount_table_offset = table_offset;
410 
411     /* Free old table. */
412     qcow2_free_clusters(bs, old_table_offset, old_table_size * sizeof(uint64_t),
413                         QCOW2_DISCARD_OTHER);
414 
415     ret = load_refcount_block(bs, new_block, (void**) refcount_block);
416     if (ret < 0) {
417         return ret;
418     }
419 
420     /* If we were trying to do the initial refcount update for some cluster
421      * allocation, we might have used the same clusters to store newly
422      * allocated metadata. Make the caller search some new space. */
423     return -EAGAIN;
424 
425 fail_table:
426     g_free(new_table);
427 fail_block:
428     if (*refcount_block != NULL) {
429         qcow2_cache_put(bs, s->refcount_block_cache, (void**) refcount_block);
430     }
431     return ret;
432 }
433 
434 void qcow2_process_discards(BlockDriverState *bs, int ret)
435 {
436     BDRVQcowState *s = bs->opaque;
437     Qcow2DiscardRegion *d, *next;
438 
439     QTAILQ_FOREACH_SAFE(d, &s->discards, next, next) {
440         QTAILQ_REMOVE(&s->discards, d, next);
441 
442         /* Discard is optional, ignore the return value */
443         if (ret >= 0) {
444             bdrv_discard(bs->file,
445                          d->offset >> BDRV_SECTOR_BITS,
446                          d->bytes >> BDRV_SECTOR_BITS);
447         }
448 
449         g_free(d);
450     }
451 }
452 
453 static void update_refcount_discard(BlockDriverState *bs,
454                                     uint64_t offset, uint64_t length)
455 {
456     BDRVQcowState *s = bs->opaque;
457     Qcow2DiscardRegion *d, *p, *next;
458 
459     QTAILQ_FOREACH(d, &s->discards, next) {
460         uint64_t new_start = MIN(offset, d->offset);
461         uint64_t new_end = MAX(offset + length, d->offset + d->bytes);
462 
463         if (new_end - new_start <= length + d->bytes) {
464             /* There can't be any overlap, areas ending up here have no
465              * references any more and therefore shouldn't get freed another
466              * time. */
467             assert(d->bytes + length == new_end - new_start);
468             d->offset = new_start;
469             d->bytes = new_end - new_start;
470             goto found;
471         }
472     }
473 
474     d = g_malloc(sizeof(*d));
475     *d = (Qcow2DiscardRegion) {
476         .bs     = bs,
477         .offset = offset,
478         .bytes  = length,
479     };
480     QTAILQ_INSERT_TAIL(&s->discards, d, next);
481 
482 found:
483     /* Merge discard requests if they are adjacent now */
484     QTAILQ_FOREACH_SAFE(p, &s->discards, next, next) {
485         if (p == d
486             || p->offset > d->offset + d->bytes
487             || d->offset > p->offset + p->bytes)
488         {
489             continue;
490         }
491 
492         /* Still no overlap possible */
493         assert(p->offset == d->offset + d->bytes
494             || d->offset == p->offset + p->bytes);
495 
496         QTAILQ_REMOVE(&s->discards, p, next);
497         d->offset = MIN(d->offset, p->offset);
498         d->bytes += p->bytes;
499     }
500 }
501 
502 /* XXX: cache several refcount block clusters ? */
503 static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
504     int64_t offset, int64_t length, int addend, enum qcow2_discard_type type)
505 {
506     BDRVQcowState *s = bs->opaque;
507     int64_t start, last, cluster_offset;
508     uint16_t *refcount_block = NULL;
509     int64_t old_table_index = -1;
510     int ret;
511 
512 #ifdef DEBUG_ALLOC2
513     fprintf(stderr, "update_refcount: offset=%" PRId64 " size=%" PRId64 " addend=%d\n",
514            offset, length, addend);
515 #endif
516     if (length < 0) {
517         return -EINVAL;
518     } else if (length == 0) {
519         return 0;
520     }
521 
522     if (addend < 0) {
523         qcow2_cache_set_dependency(bs, s->refcount_block_cache,
524             s->l2_table_cache);
525     }
526 
527     start = start_of_cluster(s, offset);
528     last = start_of_cluster(s, offset + length - 1);
529     for(cluster_offset = start; cluster_offset <= last;
530         cluster_offset += s->cluster_size)
531     {
532         int block_index, refcount;
533         int64_t cluster_index = cluster_offset >> s->cluster_bits;
534         int64_t table_index =
535             cluster_index >> (s->cluster_bits - REFCOUNT_SHIFT);
536 
537         /* Load the refcount block and allocate it if needed */
538         if (table_index != old_table_index) {
539             if (refcount_block) {
540                 ret = qcow2_cache_put(bs, s->refcount_block_cache,
541                     (void**) &refcount_block);
542                 if (ret < 0) {
543                     goto fail;
544                 }
545             }
546 
547             ret = alloc_refcount_block(bs, cluster_index, &refcount_block);
548             if (ret < 0) {
549                 goto fail;
550             }
551         }
552         old_table_index = table_index;
553 
554         qcow2_cache_entry_mark_dirty(s->refcount_block_cache, refcount_block);
555 
556         /* we can update the count and save it */
557         block_index = cluster_index &
558             ((1 << (s->cluster_bits - REFCOUNT_SHIFT)) - 1);
559 
560         refcount = be16_to_cpu(refcount_block[block_index]);
561         refcount += addend;
562         if (refcount < 0 || refcount > 0xffff) {
563             ret = -EINVAL;
564             goto fail;
565         }
566         if (refcount == 0 && cluster_index < s->free_cluster_index) {
567             s->free_cluster_index = cluster_index;
568         }
569         refcount_block[block_index] = cpu_to_be16(refcount);
570 
571         if (refcount == 0 && s->discard_passthrough[type]) {
572             update_refcount_discard(bs, cluster_offset, s->cluster_size);
573         }
574     }
575 
576     ret = 0;
577 fail:
578     if (!s->cache_discards) {
579         qcow2_process_discards(bs, ret);
580     }
581 
582     /* Write last changed block to disk */
583     if (refcount_block) {
584         int wret;
585         wret = qcow2_cache_put(bs, s->refcount_block_cache,
586             (void**) &refcount_block);
587         if (wret < 0) {
588             return ret < 0 ? ret : wret;
589         }
590     }
591 
592     /*
593      * Try do undo any updates if an error is returned (This may succeed in
594      * some cases like ENOSPC for allocating a new refcount block)
595      */
596     if (ret < 0) {
597         int dummy;
598         dummy = update_refcount(bs, offset, cluster_offset - offset, -addend,
599                                 QCOW2_DISCARD_NEVER);
600         (void)dummy;
601     }
602 
603     return ret;
604 }
605 
606 /*
607  * Increases or decreases the refcount of a given cluster by one.
608  * addend must be 1 or -1.
609  *
610  * If the return value is non-negative, it is the new refcount of the cluster.
611  * If it is negative, it is -errno and indicates an error.
612  */
613 int qcow2_update_cluster_refcount(BlockDriverState *bs,
614                                   int64_t cluster_index,
615                                   int addend,
616                                   enum qcow2_discard_type type)
617 {
618     BDRVQcowState *s = bs->opaque;
619     int ret;
620 
621     ret = update_refcount(bs, cluster_index << s->cluster_bits, 1, addend,
622                           type);
623     if (ret < 0) {
624         return ret;
625     }
626 
627     return get_refcount(bs, cluster_index);
628 }
629 
630 
631 
632 /*********************************************************/
633 /* cluster allocation functions */
634 
635 
636 
637 /* return < 0 if error */
638 static int64_t alloc_clusters_noref(BlockDriverState *bs, uint64_t size)
639 {
640     BDRVQcowState *s = bs->opaque;
641     uint64_t i, nb_clusters;
642     int refcount;
643 
644     nb_clusters = size_to_clusters(s, size);
645 retry:
646     for(i = 0; i < nb_clusters; i++) {
647         uint64_t next_cluster_index = s->free_cluster_index++;
648         refcount = get_refcount(bs, next_cluster_index);
649 
650         if (refcount < 0) {
651             return refcount;
652         } else if (refcount != 0) {
653             goto retry;
654         }
655     }
656 #ifdef DEBUG_ALLOC2
657     fprintf(stderr, "alloc_clusters: size=%" PRId64 " -> %" PRId64 "\n",
658             size,
659             (s->free_cluster_index - nb_clusters) << s->cluster_bits);
660 #endif
661     return (s->free_cluster_index - nb_clusters) << s->cluster_bits;
662 }
663 
664 int64_t qcow2_alloc_clusters(BlockDriverState *bs, uint64_t size)
665 {
666     int64_t offset;
667     int ret;
668 
669     BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC);
670     do {
671         offset = alloc_clusters_noref(bs, size);
672         if (offset < 0) {
673             return offset;
674         }
675 
676         ret = update_refcount(bs, offset, size, 1, QCOW2_DISCARD_NEVER);
677     } while (ret == -EAGAIN);
678 
679     if (ret < 0) {
680         return ret;
681     }
682 
683     return offset;
684 }
685 
686 int qcow2_alloc_clusters_at(BlockDriverState *bs, uint64_t offset,
687     int nb_clusters)
688 {
689     BDRVQcowState *s = bs->opaque;
690     uint64_t cluster_index;
691     uint64_t i;
692     int refcount, ret;
693 
694     assert(nb_clusters >= 0);
695     if (nb_clusters == 0) {
696         return 0;
697     }
698 
699     do {
700         /* Check how many clusters there are free */
701         cluster_index = offset >> s->cluster_bits;
702         for(i = 0; i < nb_clusters; i++) {
703             refcount = get_refcount(bs, cluster_index++);
704 
705             if (refcount < 0) {
706                 return refcount;
707             } else if (refcount != 0) {
708                 break;
709             }
710         }
711 
712         /* And then allocate them */
713         ret = update_refcount(bs, offset, i << s->cluster_bits, 1,
714                               QCOW2_DISCARD_NEVER);
715     } while (ret == -EAGAIN);
716 
717     if (ret < 0) {
718         return ret;
719     }
720 
721     return i;
722 }
723 
724 /* only used to allocate compressed sectors. We try to allocate
725    contiguous sectors. size must be <= cluster_size */
726 int64_t qcow2_alloc_bytes(BlockDriverState *bs, int size)
727 {
728     BDRVQcowState *s = bs->opaque;
729     int64_t offset, cluster_offset;
730     int free_in_cluster;
731 
732     BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC_BYTES);
733     assert(size > 0 && size <= s->cluster_size);
734     if (s->free_byte_offset == 0) {
735         offset = qcow2_alloc_clusters(bs, s->cluster_size);
736         if (offset < 0) {
737             return offset;
738         }
739         s->free_byte_offset = offset;
740     }
741  redo:
742     free_in_cluster = s->cluster_size -
743         offset_into_cluster(s, s->free_byte_offset);
744     if (size <= free_in_cluster) {
745         /* enough space in current cluster */
746         offset = s->free_byte_offset;
747         s->free_byte_offset += size;
748         free_in_cluster -= size;
749         if (free_in_cluster == 0)
750             s->free_byte_offset = 0;
751         if (offset_into_cluster(s, offset) != 0)
752             qcow2_update_cluster_refcount(bs, offset >> s->cluster_bits, 1,
753                                           QCOW2_DISCARD_NEVER);
754     } else {
755         offset = qcow2_alloc_clusters(bs, s->cluster_size);
756         if (offset < 0) {
757             return offset;
758         }
759         cluster_offset = start_of_cluster(s, s->free_byte_offset);
760         if ((cluster_offset + s->cluster_size) == offset) {
761             /* we are lucky: contiguous data */
762             offset = s->free_byte_offset;
763             qcow2_update_cluster_refcount(bs, offset >> s->cluster_bits, 1,
764                                           QCOW2_DISCARD_NEVER);
765             s->free_byte_offset += size;
766         } else {
767             s->free_byte_offset = offset;
768             goto redo;
769         }
770     }
771 
772     /* The cluster refcount was incremented, either by qcow2_alloc_clusters()
773      * or explicitly by qcow2_update_cluster_refcount().  Refcount blocks must
774      * be flushed before the caller's L2 table updates.
775      */
776     qcow2_cache_set_dependency(bs, s->l2_table_cache, s->refcount_block_cache);
777     return offset;
778 }
779 
780 void qcow2_free_clusters(BlockDriverState *bs,
781                           int64_t offset, int64_t size,
782                           enum qcow2_discard_type type)
783 {
784     int ret;
785 
786     BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_FREE);
787     ret = update_refcount(bs, offset, size, -1, type);
788     if (ret < 0) {
789         fprintf(stderr, "qcow2_free_clusters failed: %s\n", strerror(-ret));
790         /* TODO Remember the clusters to free them later and avoid leaking */
791     }
792 }
793 
794 /*
795  * Free a cluster using its L2 entry (handles clusters of all types, e.g.
796  * normal cluster, compressed cluster, etc.)
797  */
798 void qcow2_free_any_clusters(BlockDriverState *bs, uint64_t l2_entry,
799                              int nb_clusters, enum qcow2_discard_type type)
800 {
801     BDRVQcowState *s = bs->opaque;
802 
803     switch (qcow2_get_cluster_type(l2_entry)) {
804     case QCOW2_CLUSTER_COMPRESSED:
805         {
806             int nb_csectors;
807             nb_csectors = ((l2_entry >> s->csize_shift) &
808                            s->csize_mask) + 1;
809             qcow2_free_clusters(bs,
810                 (l2_entry & s->cluster_offset_mask) & ~511,
811                 nb_csectors * 512, type);
812         }
813         break;
814     case QCOW2_CLUSTER_NORMAL:
815     case QCOW2_CLUSTER_ZERO:
816         if (l2_entry & L2E_OFFSET_MASK) {
817             qcow2_free_clusters(bs, l2_entry & L2E_OFFSET_MASK,
818                                 nb_clusters << s->cluster_bits, type);
819         }
820         break;
821     case QCOW2_CLUSTER_UNALLOCATED:
822         break;
823     default:
824         abort();
825     }
826 }
827 
828 
829 
830 /*********************************************************/
831 /* snapshots and image creation */
832 
833 
834 
835 /* update the refcounts of snapshots and the copied flag */
836 int qcow2_update_snapshot_refcount(BlockDriverState *bs,
837     int64_t l1_table_offset, int l1_size, int addend)
838 {
839     BDRVQcowState *s = bs->opaque;
840     uint64_t *l1_table, *l2_table, l2_offset, offset, l1_size2, l1_allocated;
841     int64_t old_offset, old_l2_offset;
842     int i, j, l1_modified = 0, nb_csectors, refcount;
843     int ret;
844 
845     l2_table = NULL;
846     l1_table = NULL;
847     l1_size2 = l1_size * sizeof(uint64_t);
848 
849     s->cache_discards = true;
850 
851     /* WARNING: qcow2_snapshot_goto relies on this function not using the
852      * l1_table_offset when it is the current s->l1_table_offset! Be careful
853      * when changing this! */
854     if (l1_table_offset != s->l1_table_offset) {
855         l1_table = g_malloc0(align_offset(l1_size2, 512));
856         l1_allocated = 1;
857 
858         ret = bdrv_pread(bs->file, l1_table_offset, l1_table, l1_size2);
859         if (ret < 0) {
860             goto fail;
861         }
862 
863         for(i = 0;i < l1_size; i++)
864             be64_to_cpus(&l1_table[i]);
865     } else {
866         assert(l1_size == s->l1_size);
867         l1_table = s->l1_table;
868         l1_allocated = 0;
869     }
870 
871     for(i = 0; i < l1_size; i++) {
872         l2_offset = l1_table[i];
873         if (l2_offset) {
874             old_l2_offset = l2_offset;
875             l2_offset &= L1E_OFFSET_MASK;
876 
877             ret = qcow2_cache_get(bs, s->l2_table_cache, l2_offset,
878                 (void**) &l2_table);
879             if (ret < 0) {
880                 goto fail;
881             }
882 
883             for(j = 0; j < s->l2_size; j++) {
884                 uint64_t cluster_index;
885 
886                 offset = be64_to_cpu(l2_table[j]);
887                 old_offset = offset;
888                 offset &= ~QCOW_OFLAG_COPIED;
889 
890                 switch (qcow2_get_cluster_type(offset)) {
891                     case QCOW2_CLUSTER_COMPRESSED:
892                         nb_csectors = ((offset >> s->csize_shift) &
893                                        s->csize_mask) + 1;
894                         if (addend != 0) {
895                             ret = update_refcount(bs,
896                                 (offset & s->cluster_offset_mask) & ~511,
897                                 nb_csectors * 512, addend,
898                                 QCOW2_DISCARD_SNAPSHOT);
899                             if (ret < 0) {
900                                 goto fail;
901                             }
902                         }
903                         /* compressed clusters are never modified */
904                         refcount = 2;
905                         break;
906 
907                     case QCOW2_CLUSTER_NORMAL:
908                     case QCOW2_CLUSTER_ZERO:
909                         cluster_index = (offset & L2E_OFFSET_MASK) >> s->cluster_bits;
910                         if (!cluster_index) {
911                             /* unallocated */
912                             refcount = 0;
913                             break;
914                         }
915                         if (addend != 0) {
916                             refcount = qcow2_update_cluster_refcount(bs,
917                                     cluster_index, addend,
918                                     QCOW2_DISCARD_SNAPSHOT);
919                         } else {
920                             refcount = get_refcount(bs, cluster_index);
921                         }
922 
923                         if (refcount < 0) {
924                             ret = refcount;
925                             goto fail;
926                         }
927                         break;
928 
929                     case QCOW2_CLUSTER_UNALLOCATED:
930                         refcount = 0;
931                         break;
932 
933                     default:
934                         abort();
935                 }
936 
937                 if (refcount == 1) {
938                     offset |= QCOW_OFLAG_COPIED;
939                 }
940                 if (offset != old_offset) {
941                     if (addend > 0) {
942                         qcow2_cache_set_dependency(bs, s->l2_table_cache,
943                             s->refcount_block_cache);
944                     }
945                     l2_table[j] = cpu_to_be64(offset);
946                     qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table);
947                 }
948             }
949 
950             ret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
951             if (ret < 0) {
952                 goto fail;
953             }
954 
955 
956             if (addend != 0) {
957                 refcount = qcow2_update_cluster_refcount(bs, l2_offset >>
958                         s->cluster_bits, addend, QCOW2_DISCARD_SNAPSHOT);
959             } else {
960                 refcount = get_refcount(bs, l2_offset >> s->cluster_bits);
961             }
962             if (refcount < 0) {
963                 ret = refcount;
964                 goto fail;
965             } else if (refcount == 1) {
966                 l2_offset |= QCOW_OFLAG_COPIED;
967             }
968             if (l2_offset != old_l2_offset) {
969                 l1_table[i] = l2_offset;
970                 l1_modified = 1;
971             }
972         }
973     }
974 
975     ret = bdrv_flush(bs);
976 fail:
977     if (l2_table) {
978         qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
979     }
980 
981     s->cache_discards = false;
982     qcow2_process_discards(bs, ret);
983 
984     /* Update L1 only if it isn't deleted anyway (addend = -1) */
985     if (ret == 0 && addend >= 0 && l1_modified) {
986         for (i = 0; i < l1_size; i++) {
987             cpu_to_be64s(&l1_table[i]);
988         }
989 
990         ret = bdrv_pwrite_sync(bs->file, l1_table_offset, l1_table, l1_size2);
991 
992         for (i = 0; i < l1_size; i++) {
993             be64_to_cpus(&l1_table[i]);
994         }
995     }
996     if (l1_allocated)
997         g_free(l1_table);
998     return ret;
999 }
1000 
1001 
1002 
1003 
1004 /*********************************************************/
1005 /* refcount checking functions */
1006 
1007 
1008 
1009 /*
1010  * Increases the refcount for a range of clusters in a given refcount table.
1011  * This is used to construct a temporary refcount table out of L1 and L2 tables
1012  * which can be compared the the refcount table saved in the image.
1013  *
1014  * Modifies the number of errors in res.
1015  */
1016 static void inc_refcounts(BlockDriverState *bs,
1017                           BdrvCheckResult *res,
1018                           uint16_t *refcount_table,
1019                           int refcount_table_size,
1020                           int64_t offset, int64_t size)
1021 {
1022     BDRVQcowState *s = bs->opaque;
1023     uint64_t start, last, cluster_offset, k;
1024 
1025     if (size <= 0)
1026         return;
1027 
1028     start = start_of_cluster(s, offset);
1029     last = start_of_cluster(s, offset + size - 1);
1030     for(cluster_offset = start; cluster_offset <= last;
1031         cluster_offset += s->cluster_size) {
1032         k = cluster_offset >> s->cluster_bits;
1033         if (k >= refcount_table_size) {
1034             fprintf(stderr, "Warning: cluster offset=0x%" PRIx64 " is after "
1035                 "the end of the image file, can't properly check refcounts.\n",
1036                 cluster_offset);
1037             res->check_errors++;
1038         } else {
1039             if (++refcount_table[k] == 0) {
1040                 fprintf(stderr, "ERROR: overflow cluster offset=0x%" PRIx64
1041                     "\n", cluster_offset);
1042                 res->corruptions++;
1043             }
1044         }
1045     }
1046 }
1047 
/* Bit flags accepted by check_refcounts_l1() and check_refcounts_l2() */
enum {
    /* update the BlockFragInfo counters while walking the tables */
    CHECK_FRAG_INFO = 1 << 1,
};
1052 
1053 /*
1054  * Increases the refcount in the given refcount table for the all clusters
1055  * referenced in the L2 table. While doing so, performs some checks on L2
1056  * entries.
1057  *
1058  * Returns the number of errors found by the checks or -errno if an internal
1059  * error occurred.
1060  */
1061 static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res,
1062     uint16_t *refcount_table, int refcount_table_size, int64_t l2_offset,
1063     int flags)
1064 {
1065     BDRVQcowState *s = bs->opaque;
1066     uint64_t *l2_table, l2_entry;
1067     uint64_t next_contiguous_offset = 0;
1068     int i, l2_size, nb_csectors;
1069 
1070     /* Read L2 table from disk */
1071     l2_size = s->l2_size * sizeof(uint64_t);
1072     l2_table = g_malloc(l2_size);
1073 
1074     if (bdrv_pread(bs->file, l2_offset, l2_table, l2_size) != l2_size)
1075         goto fail;
1076 
1077     /* Do the actual checks */
1078     for(i = 0; i < s->l2_size; i++) {
1079         l2_entry = be64_to_cpu(l2_table[i]);
1080 
1081         switch (qcow2_get_cluster_type(l2_entry)) {
1082         case QCOW2_CLUSTER_COMPRESSED:
1083             /* Compressed clusters don't have QCOW_OFLAG_COPIED */
1084             if (l2_entry & QCOW_OFLAG_COPIED) {
1085                 fprintf(stderr, "ERROR: cluster %" PRId64 ": "
1086                     "copied flag must never be set for compressed "
1087                     "clusters\n", l2_entry >> s->cluster_bits);
1088                 l2_entry &= ~QCOW_OFLAG_COPIED;
1089                 res->corruptions++;
1090             }
1091 
1092             /* Mark cluster as used */
1093             nb_csectors = ((l2_entry >> s->csize_shift) &
1094                            s->csize_mask) + 1;
1095             l2_entry &= s->cluster_offset_mask;
1096             inc_refcounts(bs, res, refcount_table, refcount_table_size,
1097                 l2_entry & ~511, nb_csectors * 512);
1098 
1099             if (flags & CHECK_FRAG_INFO) {
1100                 res->bfi.allocated_clusters++;
1101                 res->bfi.compressed_clusters++;
1102 
1103                 /* Compressed clusters are fragmented by nature.  Since they
1104                  * take up sub-sector space but we only have sector granularity
1105                  * I/O we need to re-read the same sectors even for adjacent
1106                  * compressed clusters.
1107                  */
1108                 res->bfi.fragmented_clusters++;
1109             }
1110             break;
1111 
1112         case QCOW2_CLUSTER_ZERO:
1113             if ((l2_entry & L2E_OFFSET_MASK) == 0) {
1114                 break;
1115             }
1116             /* fall through */
1117 
1118         case QCOW2_CLUSTER_NORMAL:
1119         {
1120             uint64_t offset = l2_entry & L2E_OFFSET_MASK;
1121 
1122             if (flags & CHECK_FRAG_INFO) {
1123                 res->bfi.allocated_clusters++;
1124                 if (next_contiguous_offset &&
1125                     offset != next_contiguous_offset) {
1126                     res->bfi.fragmented_clusters++;
1127                 }
1128                 next_contiguous_offset = offset + s->cluster_size;
1129             }
1130 
1131             /* Mark cluster as used */
1132             inc_refcounts(bs, res, refcount_table,refcount_table_size,
1133                 offset, s->cluster_size);
1134 
1135             /* Correct offsets are cluster aligned */
1136             if (offset_into_cluster(s, offset)) {
1137                 fprintf(stderr, "ERROR offset=%" PRIx64 ": Cluster is not "
1138                     "properly aligned; L2 entry corrupted.\n", offset);
1139                 res->corruptions++;
1140             }
1141             break;
1142         }
1143 
1144         case QCOW2_CLUSTER_UNALLOCATED:
1145             break;
1146 
1147         default:
1148             abort();
1149         }
1150     }
1151 
1152     g_free(l2_table);
1153     return 0;
1154 
1155 fail:
1156     fprintf(stderr, "ERROR: I/O error in check_refcounts_l2\n");
1157     g_free(l2_table);
1158     return -EIO;
1159 }
1160 
1161 /*
1162  * Increases the refcount for the L1 table, its L2 tables and all referenced
1163  * clusters in the given refcount table. While doing so, performs some checks
1164  * on L1 and L2 entries.
1165  *
1166  * Returns the number of errors found by the checks or -errno if an internal
1167  * error occurred.
1168  */
1169 static int check_refcounts_l1(BlockDriverState *bs,
1170                               BdrvCheckResult *res,
1171                               uint16_t *refcount_table,
1172                               int refcount_table_size,
1173                               int64_t l1_table_offset, int l1_size,
1174                               int flags)
1175 {
1176     BDRVQcowState *s = bs->opaque;
1177     uint64_t *l1_table, l2_offset, l1_size2;
1178     int i, ret;
1179 
1180     l1_size2 = l1_size * sizeof(uint64_t);
1181 
1182     /* Mark L1 table as used */
1183     inc_refcounts(bs, res, refcount_table, refcount_table_size,
1184         l1_table_offset, l1_size2);
1185 
1186     /* Read L1 table entries from disk */
1187     if (l1_size2 == 0) {
1188         l1_table = NULL;
1189     } else {
1190         l1_table = g_malloc(l1_size2);
1191         if (bdrv_pread(bs->file, l1_table_offset,
1192                        l1_table, l1_size2) != l1_size2)
1193             goto fail;
1194         for(i = 0;i < l1_size; i++)
1195             be64_to_cpus(&l1_table[i]);
1196     }
1197 
1198     /* Do the actual checks */
1199     for(i = 0; i < l1_size; i++) {
1200         l2_offset = l1_table[i];
1201         if (l2_offset) {
1202             /* Mark L2 table as used */
1203             l2_offset &= L1E_OFFSET_MASK;
1204             inc_refcounts(bs, res, refcount_table, refcount_table_size,
1205                 l2_offset, s->cluster_size);
1206 
1207             /* L2 tables are cluster aligned */
1208             if (offset_into_cluster(s, l2_offset)) {
1209                 fprintf(stderr, "ERROR l2_offset=%" PRIx64 ": Table is not "
1210                     "cluster aligned; L1 entry corrupted\n", l2_offset);
1211                 res->corruptions++;
1212             }
1213 
1214             /* Process and check L2 entries */
1215             ret = check_refcounts_l2(bs, res, refcount_table,
1216                                      refcount_table_size, l2_offset, flags);
1217             if (ret < 0) {
1218                 goto fail;
1219             }
1220         }
1221     }
1222     g_free(l1_table);
1223     return 0;
1224 
1225 fail:
1226     fprintf(stderr, "ERROR: I/O error in check_refcounts_l1\n");
1227     res->check_errors++;
1228     g_free(l1_table);
1229     return -EIO;
1230 }
1231 
1232 /*
1233  * Checks the OFLAG_COPIED flag for all L1 and L2 entries.
1234  *
1235  * This function does not print an error message nor does it increment
1236  * check_errors if get_refcount fails (this is because such an error will have
1237  * been already detected and sufficiently signaled by the calling function
1238  * (qcow2_check_refcounts) by the time this function is called).
1239  */
1240 static int check_oflag_copied(BlockDriverState *bs, BdrvCheckResult *res,
1241                               BdrvCheckMode fix)
1242 {
1243     BDRVQcowState *s = bs->opaque;
1244     uint64_t *l2_table = qemu_blockalign(bs, s->cluster_size);
1245     int ret;
1246     int refcount;
1247     int i, j;
1248 
1249     for (i = 0; i < s->l1_size; i++) {
1250         uint64_t l1_entry = s->l1_table[i];
1251         uint64_t l2_offset = l1_entry & L1E_OFFSET_MASK;
1252         bool l2_dirty = false;
1253 
1254         if (!l2_offset) {
1255             continue;
1256         }
1257 
1258         refcount = get_refcount(bs, l2_offset >> s->cluster_bits);
1259         if (refcount < 0) {
1260             /* don't print message nor increment check_errors */
1261             continue;
1262         }
1263         if ((refcount == 1) != ((l1_entry & QCOW_OFLAG_COPIED) != 0)) {
1264             fprintf(stderr, "%s OFLAG_COPIED L2 cluster: l1_index=%d "
1265                     "l1_entry=%" PRIx64 " refcount=%d\n",
1266                     fix & BDRV_FIX_ERRORS ? "Repairing" :
1267                                             "ERROR",
1268                     i, l1_entry, refcount);
1269             if (fix & BDRV_FIX_ERRORS) {
1270                 s->l1_table[i] = refcount == 1
1271                                ? l1_entry |  QCOW_OFLAG_COPIED
1272                                : l1_entry & ~QCOW_OFLAG_COPIED;
1273                 ret = qcow2_write_l1_entry(bs, i);
1274                 if (ret < 0) {
1275                     res->check_errors++;
1276                     goto fail;
1277                 }
1278                 res->corruptions_fixed++;
1279             } else {
1280                 res->corruptions++;
1281             }
1282         }
1283 
1284         ret = bdrv_pread(bs->file, l2_offset, l2_table,
1285                          s->l2_size * sizeof(uint64_t));
1286         if (ret < 0) {
1287             fprintf(stderr, "ERROR: Could not read L2 table: %s\n",
1288                     strerror(-ret));
1289             res->check_errors++;
1290             goto fail;
1291         }
1292 
1293         for (j = 0; j < s->l2_size; j++) {
1294             uint64_t l2_entry = be64_to_cpu(l2_table[j]);
1295             uint64_t data_offset = l2_entry & L2E_OFFSET_MASK;
1296             int cluster_type = qcow2_get_cluster_type(l2_entry);
1297 
1298             if ((cluster_type == QCOW2_CLUSTER_NORMAL) ||
1299                 ((cluster_type == QCOW2_CLUSTER_ZERO) && (data_offset != 0))) {
1300                 refcount = get_refcount(bs, data_offset >> s->cluster_bits);
1301                 if (refcount < 0) {
1302                     /* don't print message nor increment check_errors */
1303                     continue;
1304                 }
1305                 if ((refcount == 1) != ((l2_entry & QCOW_OFLAG_COPIED) != 0)) {
1306                     fprintf(stderr, "%s OFLAG_COPIED data cluster: "
1307                             "l2_entry=%" PRIx64 " refcount=%d\n",
1308                             fix & BDRV_FIX_ERRORS ? "Repairing" :
1309                                                     "ERROR",
1310                             l2_entry, refcount);
1311                     if (fix & BDRV_FIX_ERRORS) {
1312                         l2_table[j] = cpu_to_be64(refcount == 1
1313                                     ? l2_entry |  QCOW_OFLAG_COPIED
1314                                     : l2_entry & ~QCOW_OFLAG_COPIED);
1315                         l2_dirty = true;
1316                         res->corruptions_fixed++;
1317                     } else {
1318                         res->corruptions++;
1319                     }
1320                 }
1321             }
1322         }
1323 
1324         if (l2_dirty) {
1325             ret = qcow2_pre_write_overlap_check(bs, QCOW2_OL_ACTIVE_L2,
1326                                                 l2_offset, s->cluster_size);
1327             if (ret < 0) {
1328                 fprintf(stderr, "ERROR: Could not write L2 table; metadata "
1329                         "overlap check failed: %s\n", strerror(-ret));
1330                 res->check_errors++;
1331                 goto fail;
1332             }
1333 
1334             ret = bdrv_pwrite(bs->file, l2_offset, l2_table, s->cluster_size);
1335             if (ret < 0) {
1336                 fprintf(stderr, "ERROR: Could not write L2 table: %s\n",
1337                         strerror(-ret));
1338                 res->check_errors++;
1339                 goto fail;
1340             }
1341         }
1342     }
1343 
1344     ret = 0;
1345 
1346 fail:
1347     qemu_vfree(l2_table);
1348     return ret;
1349 }
1350 
1351 /*
1352  * Writes one sector of the refcount table to the disk
1353  */
1354 #define RT_ENTRIES_PER_SECTOR (512 / sizeof(uint64_t))
1355 static int write_reftable_entry(BlockDriverState *bs, int rt_index)
1356 {
1357     BDRVQcowState *s = bs->opaque;
1358     uint64_t buf[RT_ENTRIES_PER_SECTOR];
1359     int rt_start_index;
1360     int i, ret;
1361 
1362     rt_start_index = rt_index & ~(RT_ENTRIES_PER_SECTOR - 1);
1363     for (i = 0; i < RT_ENTRIES_PER_SECTOR; i++) {
1364         buf[i] = cpu_to_be64(s->refcount_table[rt_start_index + i]);
1365     }
1366 
1367     ret = qcow2_pre_write_overlap_check(bs, QCOW2_OL_REFCOUNT_TABLE,
1368             s->refcount_table_offset + rt_start_index * sizeof(uint64_t),
1369             sizeof(buf));
1370     if (ret < 0) {
1371         return ret;
1372     }
1373 
1374     BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_UPDATE);
1375     ret = bdrv_pwrite_sync(bs->file, s->refcount_table_offset +
1376             rt_start_index * sizeof(uint64_t), buf, sizeof(buf));
1377     if (ret < 0) {
1378         return ret;
1379     }
1380 
1381     return 0;
1382 }
1383 
1384 /*
1385  * Allocates a new cluster for the given refcount block (represented by its
1386  * offset in the image file) and copies the current content there. This function
1387  * does _not_ decrement the reference count for the currently occupied cluster.
1388  *
1389  * This function prints an informative message to stderr on error (and returns
1390  * -errno); on success, the offset of the newly allocated cluster is returned.
1391  */
1392 static int64_t realloc_refcount_block(BlockDriverState *bs, int reftable_index,
1393                                       uint64_t offset)
1394 {
1395     BDRVQcowState *s = bs->opaque;
1396     int64_t new_offset = 0;
1397     void *refcount_block = NULL;
1398     int ret;
1399 
1400     /* allocate new refcount block */
1401     new_offset = qcow2_alloc_clusters(bs, s->cluster_size);
1402     if (new_offset < 0) {
1403         fprintf(stderr, "Could not allocate new cluster: %s\n",
1404                 strerror(-new_offset));
1405         ret = new_offset;
1406         goto done;
1407     }
1408 
1409     /* fetch current refcount block content */
1410     ret = qcow2_cache_get(bs, s->refcount_block_cache, offset, &refcount_block);
1411     if (ret < 0) {
1412         fprintf(stderr, "Could not fetch refcount block: %s\n", strerror(-ret));
1413         goto fail_free_cluster;
1414     }
1415 
1416     /* new block has not yet been entered into refcount table, therefore it is
1417      * no refcount block yet (regarding this check) */
1418     ret = qcow2_pre_write_overlap_check(bs, 0, new_offset, s->cluster_size);
1419     if (ret < 0) {
1420         fprintf(stderr, "Could not write refcount block; metadata overlap "
1421                 "check failed: %s\n", strerror(-ret));
1422         /* the image will be marked corrupt, so don't even attempt on freeing
1423          * the cluster */
1424         goto done;
1425     }
1426 
1427     /* write to new block */
1428     ret = bdrv_write(bs->file, new_offset / BDRV_SECTOR_SIZE, refcount_block,
1429             s->cluster_sectors);
1430     if (ret < 0) {
1431         fprintf(stderr, "Could not write refcount block: %s\n", strerror(-ret));
1432         goto fail_free_cluster;
1433     }
1434 
1435     /* update refcount table */
1436     assert(!offset_into_cluster(s, new_offset));
1437     s->refcount_table[reftable_index] = new_offset;
1438     ret = write_reftable_entry(bs, reftable_index);
1439     if (ret < 0) {
1440         fprintf(stderr, "Could not update refcount table: %s\n",
1441                 strerror(-ret));
1442         goto fail_free_cluster;
1443     }
1444 
1445     goto done;
1446 
1447 fail_free_cluster:
1448     qcow2_free_clusters(bs, new_offset, s->cluster_size, QCOW2_DISCARD_OTHER);
1449 
1450 done:
1451     if (refcount_block) {
1452         /* This should never fail, as it would only do so if the given refcount
1453          * block cannot be found in the cache. As this is impossible as long as
1454          * there are no bugs, assert the success. */
1455         int tmp = qcow2_cache_put(bs, s->refcount_block_cache, &refcount_block);
1456         assert(tmp == 0);
1457     }
1458 
1459     if (ret < 0) {
1460         return ret;
1461     }
1462 
1463     return new_offset;
1464 }
1465 
1466 /*
1467  * Checks an image for refcount consistency.
1468  *
1469  * Returns 0 if no errors are found, the number of errors in case the image is
1470  * detected as corrupted, and -errno when an internal error occurred.
1471  */
1472 int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
1473                           BdrvCheckMode fix)
1474 {
1475     BDRVQcowState *s = bs->opaque;
1476     int64_t size, i, highest_cluster, nb_clusters;
1477     int refcount1, refcount2;
1478     QCowSnapshot *sn;
1479     uint16_t *refcount_table;
1480     int ret;
1481 
1482     size = bdrv_getlength(bs->file);
1483     nb_clusters = size_to_clusters(s, size);
1484     if (nb_clusters > INT_MAX) {
1485         res->check_errors++;
1486         return -EFBIG;
1487     }
1488 
1489     refcount_table = g_malloc0(nb_clusters * sizeof(uint16_t));
1490 
1491     res->bfi.total_clusters =
1492         size_to_clusters(s, bs->total_sectors * BDRV_SECTOR_SIZE);
1493 
1494     /* header */
1495     inc_refcounts(bs, res, refcount_table, nb_clusters,
1496         0, s->cluster_size);
1497 
1498     /* current L1 table */
1499     ret = check_refcounts_l1(bs, res, refcount_table, nb_clusters,
1500                              s->l1_table_offset, s->l1_size, CHECK_FRAG_INFO);
1501     if (ret < 0) {
1502         goto fail;
1503     }
1504 
1505     /* snapshots */
1506     for(i = 0; i < s->nb_snapshots; i++) {
1507         sn = s->snapshots + i;
1508         ret = check_refcounts_l1(bs, res, refcount_table, nb_clusters,
1509             sn->l1_table_offset, sn->l1_size, 0);
1510         if (ret < 0) {
1511             goto fail;
1512         }
1513     }
1514     inc_refcounts(bs, res, refcount_table, nb_clusters,
1515         s->snapshots_offset, s->snapshots_size);
1516 
1517     /* refcount data */
1518     inc_refcounts(bs, res, refcount_table, nb_clusters,
1519         s->refcount_table_offset,
1520         s->refcount_table_size * sizeof(uint64_t));
1521 
1522     for(i = 0; i < s->refcount_table_size; i++) {
1523         uint64_t offset, cluster;
1524         offset = s->refcount_table[i];
1525         cluster = offset >> s->cluster_bits;
1526 
1527         /* Refcount blocks are cluster aligned */
1528         if (offset_into_cluster(s, offset)) {
1529             fprintf(stderr, "ERROR refcount block %" PRId64 " is not "
1530                 "cluster aligned; refcount table entry corrupted\n", i);
1531             res->corruptions++;
1532             continue;
1533         }
1534 
1535         if (cluster >= nb_clusters) {
1536             fprintf(stderr, "ERROR refcount block %" PRId64
1537                     " is outside image\n", i);
1538             res->corruptions++;
1539             continue;
1540         }
1541 
1542         if (offset != 0) {
1543             inc_refcounts(bs, res, refcount_table, nb_clusters,
1544                 offset, s->cluster_size);
1545             if (refcount_table[cluster] != 1) {
1546                 fprintf(stderr, "%s refcount block %" PRId64
1547                     " refcount=%d\n",
1548                     fix & BDRV_FIX_ERRORS ? "Repairing" :
1549                                             "ERROR",
1550                     i, refcount_table[cluster]);
1551 
1552                 if (fix & BDRV_FIX_ERRORS) {
1553                     int64_t new_offset;
1554 
1555                     new_offset = realloc_refcount_block(bs, i, offset);
1556                     if (new_offset < 0) {
1557                         res->corruptions++;
1558                         continue;
1559                     }
1560 
1561                     /* update refcounts */
1562                     if ((new_offset >> s->cluster_bits) >= nb_clusters) {
1563                         /* increase refcount_table size if necessary */
1564                         int old_nb_clusters = nb_clusters;
1565                         nb_clusters = (new_offset >> s->cluster_bits) + 1;
1566                         refcount_table = g_realloc(refcount_table,
1567                                 nb_clusters * sizeof(uint16_t));
1568                         memset(&refcount_table[old_nb_clusters], 0, (nb_clusters
1569                                 - old_nb_clusters) * sizeof(uint16_t));
1570                     }
1571                     refcount_table[cluster]--;
1572                     inc_refcounts(bs, res, refcount_table, nb_clusters,
1573                             new_offset, s->cluster_size);
1574 
1575                     res->corruptions_fixed++;
1576                 } else {
1577                     res->corruptions++;
1578                 }
1579             }
1580         }
1581     }
1582 
1583     /* compare ref counts */
1584     for (i = 0, highest_cluster = 0; i < nb_clusters; i++) {
1585         refcount1 = get_refcount(bs, i);
1586         if (refcount1 < 0) {
1587             fprintf(stderr, "Can't get refcount for cluster %" PRId64 ": %s\n",
1588                 i, strerror(-refcount1));
1589             res->check_errors++;
1590             continue;
1591         }
1592 
1593         refcount2 = refcount_table[i];
1594 
1595         if (refcount1 > 0 || refcount2 > 0) {
1596             highest_cluster = i;
1597         }
1598 
1599         if (refcount1 != refcount2) {
1600 
1601             /* Check if we're allowed to fix the mismatch */
1602             int *num_fixed = NULL;
1603             if (refcount1 > refcount2 && (fix & BDRV_FIX_LEAKS)) {
1604                 num_fixed = &res->leaks_fixed;
1605             } else if (refcount1 < refcount2 && (fix & BDRV_FIX_ERRORS)) {
1606                 num_fixed = &res->corruptions_fixed;
1607             }
1608 
1609             fprintf(stderr, "%s cluster %" PRId64 " refcount=%d reference=%d\n",
1610                    num_fixed != NULL     ? "Repairing" :
1611                    refcount1 < refcount2 ? "ERROR" :
1612                                            "Leaked",
1613                    i, refcount1, refcount2);
1614 
1615             if (num_fixed) {
1616                 ret = update_refcount(bs, i << s->cluster_bits, 1,
1617                                       refcount2 - refcount1,
1618                                       QCOW2_DISCARD_ALWAYS);
1619                 if (ret >= 0) {
1620                     (*num_fixed)++;
1621                     continue;
1622                 }
1623             }
1624 
1625             /* And if we couldn't, print an error */
1626             if (refcount1 < refcount2) {
1627                 res->corruptions++;
1628             } else {
1629                 res->leaks++;
1630             }
1631         }
1632     }
1633 
1634     /* check OFLAG_COPIED */
1635     ret = check_oflag_copied(bs, res, fix);
1636     if (ret < 0) {
1637         goto fail;
1638     }
1639 
1640     res->image_end_offset = (highest_cluster + 1) * s->cluster_size;
1641     ret = 0;
1642 
1643 fail:
1644     g_free(refcount_table);
1645 
1646     return ret;
1647 }
1648 
/* Tests the range (ofs, sz) against the `offset` and `size` variables that
 * are in scope where the macro is expanded. */
#define overlaps_with(ofs, sz) ranges_overlap(offset, size, (ofs), (sz))
1651 
1652 /*
1653  * Checks if the given offset into the image file is actually free to use by
1654  * looking for overlaps with important metadata sections (L1/L2 tables etc.),
1655  * i.e. a sanity check without relying on the refcount tables.
1656  *
1657  * The ign parameter specifies what checks not to perform (being a bitmask of
1658  * QCow2MetadataOverlap values), i.e., what sections to ignore.
1659  *
1660  * Returns:
1661  * - 0 if writing to this offset will not affect the mentioned metadata
1662  * - a positive QCow2MetadataOverlap value indicating one overlapping section
1663  * - a negative value (-errno) indicating an error while performing a check,
1664  *   e.g. when bdrv_read failed on QCOW2_OL_INACTIVE_L2
1665  */
1666 int qcow2_check_metadata_overlap(BlockDriverState *bs, int ign, int64_t offset,
1667                                  int64_t size)
1668 {
1669     BDRVQcowState *s = bs->opaque;
1670     int chk = s->overlap_check & ~ign;
1671     int i, j;
1672 
1673     if (!size) {
1674         return 0;
1675     }
1676 
1677     if (chk & QCOW2_OL_MAIN_HEADER) {
1678         if (offset < s->cluster_size) {
1679             return QCOW2_OL_MAIN_HEADER;
1680         }
1681     }
1682 
1683     /* align range to test to cluster boundaries */
1684     size = align_offset(offset_into_cluster(s, offset) + size, s->cluster_size);
1685     offset = start_of_cluster(s, offset);
1686 
1687     if ((chk & QCOW2_OL_ACTIVE_L1) && s->l1_size) {
1688         if (overlaps_with(s->l1_table_offset, s->l1_size * sizeof(uint64_t))) {
1689             return QCOW2_OL_ACTIVE_L1;
1690         }
1691     }
1692 
1693     if ((chk & QCOW2_OL_REFCOUNT_TABLE) && s->refcount_table_size) {
1694         if (overlaps_with(s->refcount_table_offset,
1695             s->refcount_table_size * sizeof(uint64_t))) {
1696             return QCOW2_OL_REFCOUNT_TABLE;
1697         }
1698     }
1699 
1700     if ((chk & QCOW2_OL_SNAPSHOT_TABLE) && s->snapshots_size) {
1701         if (overlaps_with(s->snapshots_offset, s->snapshots_size)) {
1702             return QCOW2_OL_SNAPSHOT_TABLE;
1703         }
1704     }
1705 
1706     if ((chk & QCOW2_OL_INACTIVE_L1) && s->snapshots) {
1707         for (i = 0; i < s->nb_snapshots; i++) {
1708             if (s->snapshots[i].l1_size &&
1709                 overlaps_with(s->snapshots[i].l1_table_offset,
1710                 s->snapshots[i].l1_size * sizeof(uint64_t))) {
1711                 return QCOW2_OL_INACTIVE_L1;
1712             }
1713         }
1714     }
1715 
1716     if ((chk & QCOW2_OL_ACTIVE_L2) && s->l1_table) {
1717         for (i = 0; i < s->l1_size; i++) {
1718             if ((s->l1_table[i] & L1E_OFFSET_MASK) &&
1719                 overlaps_with(s->l1_table[i] & L1E_OFFSET_MASK,
1720                 s->cluster_size)) {
1721                 return QCOW2_OL_ACTIVE_L2;
1722             }
1723         }
1724     }
1725 
1726     if ((chk & QCOW2_OL_REFCOUNT_BLOCK) && s->refcount_table) {
1727         for (i = 0; i < s->refcount_table_size; i++) {
1728             if ((s->refcount_table[i] & REFT_OFFSET_MASK) &&
1729                 overlaps_with(s->refcount_table[i] & REFT_OFFSET_MASK,
1730                 s->cluster_size)) {
1731                 return QCOW2_OL_REFCOUNT_BLOCK;
1732             }
1733         }
1734     }
1735 
1736     if ((chk & QCOW2_OL_INACTIVE_L2) && s->snapshots) {
1737         for (i = 0; i < s->nb_snapshots; i++) {
1738             uint64_t l1_ofs = s->snapshots[i].l1_table_offset;
1739             uint32_t l1_sz  = s->snapshots[i].l1_size;
1740             uint64_t l1_sz2 = l1_sz * sizeof(uint64_t);
1741             uint64_t *l1 = g_malloc(l1_sz2);
1742             int ret;
1743 
1744             ret = bdrv_pread(bs->file, l1_ofs, l1, l1_sz2);
1745             if (ret < 0) {
1746                 g_free(l1);
1747                 return ret;
1748             }
1749 
1750             for (j = 0; j < l1_sz; j++) {
1751                 uint64_t l2_ofs = be64_to_cpu(l1[j]) & L1E_OFFSET_MASK;
1752                 if (l2_ofs && overlaps_with(l2_ofs, s->cluster_size)) {
1753                     g_free(l1);
1754                     return QCOW2_OL_INACTIVE_L2;
1755                 }
1756             }
1757 
1758             g_free(l1);
1759         }
1760     }
1761 
1762     return 0;
1763 }
1764 
1765 static const char *metadata_ol_names[] = {
1766     [QCOW2_OL_MAIN_HEADER_BITNR]    = "qcow2_header",
1767     [QCOW2_OL_ACTIVE_L1_BITNR]      = "active L1 table",
1768     [QCOW2_OL_ACTIVE_L2_BITNR]      = "active L2 table",
1769     [QCOW2_OL_REFCOUNT_TABLE_BITNR] = "refcount table",
1770     [QCOW2_OL_REFCOUNT_BLOCK_BITNR] = "refcount block",
1771     [QCOW2_OL_SNAPSHOT_TABLE_BITNR] = "snapshot table",
1772     [QCOW2_OL_INACTIVE_L1_BITNR]    = "inactive L1 table",
1773     [QCOW2_OL_INACTIVE_L2_BITNR]    = "inactive L2 table",
1774 };
1775 
1776 /*
1777  * First performs a check for metadata overlaps (through
1778  * qcow2_check_metadata_overlap); if that fails with a negative value (error
1779  * while performing a check), that value is returned. If an impending overlap
1780  * is detected, the BDS will be made unusable, the qcow2 file marked corrupt
1781  * and -EIO returned.
1782  *
1783  * Returns 0 if there were neither overlaps nor errors while checking for
1784  * overlaps; or a negative value (-errno) on error.
1785  */
1786 int qcow2_pre_write_overlap_check(BlockDriverState *bs, int ign, int64_t offset,
1787                                   int64_t size)
1788 {
1789     int ret = qcow2_check_metadata_overlap(bs, ign, offset, size);
1790 
1791     if (ret < 0) {
1792         return ret;
1793     } else if (ret > 0) {
1794         int metadata_ol_bitnr = ffs(ret) - 1;
1795         char *message;
1796         QObject *data;
1797 
1798         assert(metadata_ol_bitnr < QCOW2_OL_MAX_BITNR);
1799 
1800         fprintf(stderr, "qcow2: Preventing invalid write on metadata (overlaps "
1801                 "with %s); image marked as corrupt.\n",
1802                 metadata_ol_names[metadata_ol_bitnr]);
1803         message = g_strdup_printf("Prevented %s overwrite",
1804                 metadata_ol_names[metadata_ol_bitnr]);
1805         data = qobject_from_jsonf("{ 'device': %s, 'msg': %s, 'offset': %"
1806                 PRId64 ", 'size': %" PRId64 " }", bs->device_name, message,
1807                 offset, size);
1808         monitor_protocol_event(QEVENT_BLOCK_IMAGE_CORRUPTED, data);
1809         g_free(message);
1810         qobject_decref(data);
1811 
1812         qcow2_mark_corrupt(bs);
1813         bs->drv = NULL; /* make BDS unusable */
1814         return -EIO;
1815     }
1816 
1817     return 0;
1818 }
1819