xref: /openbmc/qemu/block/qcow2-refcount.c (revision a719a27c)
1 /*
2  * Block driver for the QCOW version 2 format
3  *
4  * Copyright (c) 2004-2006 Fabrice Bellard
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to deal
8  * in the Software without restriction, including without limitation the rights
9  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10  * copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in
14  * all copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22  * THE SOFTWARE.
23  */
24 
25 #include "qemu-common.h"
26 #include "block/block_int.h"
27 #include "block/qcow2.h"
28 #include "qemu/range.h"
29 #include "qapi/qmp/types.h"
30 
31 static int64_t alloc_clusters_noref(BlockDriverState *bs, uint64_t size);
32 static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
33                             int64_t offset, int64_t length,
34                             int addend, enum qcow2_discard_type type);
35 
36 
37 /*********************************************************/
38 /* refcount handling */
39 
40 int qcow2_refcount_init(BlockDriverState *bs)
41 {
42     BDRVQcowState *s = bs->opaque;
43     unsigned int refcount_table_size2, i;
44     int ret;
45 
46     assert(s->refcount_table_size <= INT_MAX / sizeof(uint64_t));
47     refcount_table_size2 = s->refcount_table_size * sizeof(uint64_t);
48     s->refcount_table = g_malloc(refcount_table_size2);
49     if (s->refcount_table_size > 0) {
50         BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_LOAD);
51         ret = bdrv_pread(bs->file, s->refcount_table_offset,
52                          s->refcount_table, refcount_table_size2);
53         if (ret != refcount_table_size2)
54             goto fail;
55         for(i = 0; i < s->refcount_table_size; i++)
56             be64_to_cpus(&s->refcount_table[i]);
57     }
58     return 0;
59  fail:
60     return -ENOMEM;
61 }
62 
63 void qcow2_refcount_close(BlockDriverState *bs)
64 {
65     BDRVQcowState *s = bs->opaque;
66     g_free(s->refcount_table);
67 }
68 
69 
70 static int load_refcount_block(BlockDriverState *bs,
71                                int64_t refcount_block_offset,
72                                void **refcount_block)
73 {
74     BDRVQcowState *s = bs->opaque;
75     int ret;
76 
77     BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_LOAD);
78     ret = qcow2_cache_get(bs, s->refcount_block_cache, refcount_block_offset,
79         refcount_block);
80 
81     return ret;
82 }
83 
84 /*
85  * Returns the refcount of the cluster given by its index. Any non-negative
86  * return value is the refcount of the cluster, negative values are -errno
87  * and indicate an error.
88  */
89 static int get_refcount(BlockDriverState *bs, int64_t cluster_index)
90 {
91     BDRVQcowState *s = bs->opaque;
92     uint64_t refcount_table_index, block_index;
93     int64_t refcount_block_offset;
94     int ret;
95     uint16_t *refcount_block;
96     uint16_t refcount;
97 
98     refcount_table_index = cluster_index >> (s->cluster_bits - REFCOUNT_SHIFT);
99     if (refcount_table_index >= s->refcount_table_size)
100         return 0;
101     refcount_block_offset =
102         s->refcount_table[refcount_table_index] & REFT_OFFSET_MASK;
103     if (!refcount_block_offset)
104         return 0;
105 
106     ret = qcow2_cache_get(bs, s->refcount_block_cache, refcount_block_offset,
107         (void**) &refcount_block);
108     if (ret < 0) {
109         return ret;
110     }
111 
112     block_index = cluster_index &
113         ((1 << (s->cluster_bits - REFCOUNT_SHIFT)) - 1);
114     refcount = be16_to_cpu(refcount_block[block_index]);
115 
116     ret = qcow2_cache_put(bs, s->refcount_block_cache,
117         (void**) &refcount_block);
118     if (ret < 0) {
119         return ret;
120     }
121 
122     return refcount;
123 }
124 
125 /*
126  * Rounds the refcount table size up to avoid growing the table for each single
127  * refcount block that is allocated.
128  */
129 static unsigned int next_refcount_table_size(BDRVQcowState *s,
130     unsigned int min_size)
131 {
132     unsigned int min_clusters = (min_size >> (s->cluster_bits - 3)) + 1;
133     unsigned int refcount_table_clusters =
134         MAX(1, s->refcount_table_size >> (s->cluster_bits - 3));
135 
136     while (min_clusters > refcount_table_clusters) {
137         refcount_table_clusters = (refcount_table_clusters * 3 + 1) / 2;
138     }
139 
140     return refcount_table_clusters << (s->cluster_bits - 3);
141 }
142 
143 
144 /* Checks if two offsets are described by the same refcount block */
145 static int in_same_refcount_block(BDRVQcowState *s, uint64_t offset_a,
146     uint64_t offset_b)
147 {
148     uint64_t block_a = offset_a >> (2 * s->cluster_bits - REFCOUNT_SHIFT);
149     uint64_t block_b = offset_b >> (2 * s->cluster_bits - REFCOUNT_SHIFT);
150 
151     return (block_a == block_b);
152 }
153 
154 /*
155  * Loads a refcount block. If it doesn't exist yet, it is allocated first
156  * (including growing the refcount table if needed).
157  *
158  * Returns 0 on success or -errno in error case
 *
 * cluster_index:  index of the cluster whose refcount block is requested
 * refcount_block: receives the cache entry of the refcount block
 *
 * Return value in detail:
 *   0        the refcount block already existed and was loaded
 *   -EAGAIN  a new refcount block was allocated (with or without growing
 *            the refcount table); the caller has to restart its search
 *            for free clusters. On the path that only hooks the block into
 *            the existing table, *refcount_block is still held when
 *            -EAGAIN is returned and has to be released by the caller.
 *   -EFBIG   the number of refcount blocks needed would exceed
 *            QCOW_MAX_REFTABLE_SIZE
 *   other    -errno from the cache or from writing to bs->file
159  */
160 static int alloc_refcount_block(BlockDriverState *bs,
161     int64_t cluster_index, uint16_t **refcount_block)
162 {
163     BDRVQcowState *s = bs->opaque;
164     unsigned int refcount_table_index;
165     int ret;
166 
167     BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC);
168 
169     /* Find the refcount block for the given cluster */
170     refcount_table_index = cluster_index >> (s->cluster_bits - REFCOUNT_SHIFT);
171 
172     if (refcount_table_index < s->refcount_table_size) {
173 
174         uint64_t refcount_block_offset =
175             s->refcount_table[refcount_table_index] & REFT_OFFSET_MASK;
176 
177         /* If it's already there, we're done */
178         if (refcount_block_offset) {
179              return load_refcount_block(bs, refcount_block_offset,
180                  (void**) refcount_block);
181         }
182     }
183 
184     /*
185      * If we came here, we need to allocate something. Something is at least
186      * a cluster for the new refcount block. It may also include a new refcount
187      * table if the old refcount table is too small.
188      *
189      * Note that allocating clusters here needs some special care:
190      *
191      * - We can't use the normal qcow2_alloc_clusters(), it would try to
192      *   increase the refcount and very likely we would end up with an endless
193      *   recursion. Instead we must place the refcount blocks in a way that
194      *   they can describe them themselves.
195      *
196      * - We need to consider that at this point we are inside update_refcounts
197      *   and potentially doing an initial refcount increase. This means that
198      *   some clusters have already been allocated by the caller, but their
199      *   refcount isn't accurate yet. If we allocate clusters for metadata, we
200      *   need to return -EAGAIN to signal the caller that it needs to restart
201      *   the search for free clusters.
202      *
203      * - alloc_clusters_noref and qcow2_free_clusters may load a different
204      *   refcount block into the cache
205      */
206 
        /* NULL marks "no cache entry held" for the fail_block cleanup below */
207     *refcount_block = NULL;
208 
209     /* We write to the refcount table, so we might depend on L2 tables */
210     ret = qcow2_cache_flush(bs, s->l2_table_cache);
211     if (ret < 0) {
212         return ret;
213     }
214 
215     /* Allocate the refcount block itself and mark it as used */
216     int64_t new_block = alloc_clusters_noref(bs, s->cluster_size);
217     if (new_block < 0) {
218         return new_block;
219     }
220 
221 #ifdef DEBUG_ALLOC2
222     fprintf(stderr, "qcow2: Allocate refcount block %d for %" PRIx64
223         " at %" PRIx64 "\n",
224         refcount_table_index, cluster_index << s->cluster_bits, new_block);
225 #endif
226 
227     if (in_same_refcount_block(s, new_block, cluster_index << s->cluster_bits)) {
228         /* Zero the new refcount block before updating it */
229         ret = qcow2_cache_get_empty(bs, s->refcount_block_cache, new_block,
230             (void**) refcount_block);
231         if (ret < 0) {
232             goto fail_block;
233         }
234 
235         memset(*refcount_block, 0, s->cluster_size);
236 
237         /* The block describes itself, need to update the cache */
238         int block_index = (new_block >> s->cluster_bits) &
239             ((1 << (s->cluster_bits - REFCOUNT_SHIFT)) - 1);
240         (*refcount_block)[block_index] = cpu_to_be16(1);
241     } else {
242         /* Described somewhere else. This can recurse at most twice before we
243          * arrive at a block that describes itself. */
244         ret = update_refcount(bs, new_block, s->cluster_size, 1,
245                               QCOW2_DISCARD_NEVER);
246         if (ret < 0) {
247             goto fail_block;
248         }
249 
250         ret = qcow2_cache_flush(bs, s->refcount_block_cache);
251         if (ret < 0) {
252             goto fail_block;
253         }
254 
255         /* Initialize the new refcount block only after updating its refcount,
256          * update_refcount uses the refcount cache itself */
257         ret = qcow2_cache_get_empty(bs, s->refcount_block_cache, new_block,
258             (void**) refcount_block);
259         if (ret < 0) {
260             goto fail_block;
261         }
262 
263         memset(*refcount_block, 0, s->cluster_size);
264     }
265 
266     /* Now the new refcount block needs to be written to disk */
267     BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE);
268     qcow2_cache_entry_mark_dirty(s->refcount_block_cache, *refcount_block);
269     ret = qcow2_cache_flush(bs, s->refcount_block_cache);
270     if (ret < 0) {
271         goto fail_block;
272     }
273 
274     /* If the refcount table is big enough, just hook the block up there */
275     if (refcount_table_index < s->refcount_table_size) {
276         uint64_t data64 = cpu_to_be64(new_block);
277         BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_HOOKUP);
            /* Update the single table entry on disk first, then in memory */
278         ret = bdrv_pwrite_sync(bs->file,
279             s->refcount_table_offset + refcount_table_index * sizeof(uint64_t),
280             &data64, sizeof(data64));
281         if (ret < 0) {
282             goto fail_block;
283         }
284 
285         s->refcount_table[refcount_table_index] = new_block;
286 
287         /* The new refcount block may be where the caller intended to put its
288          * data, so let it restart the search. */
            /* Note: *refcount_block is not put back on this path */
289         return -EAGAIN;
290     }
291 
        /* The table has to grow: release the cache entry before continuing */
292     ret = qcow2_cache_put(bs, s->refcount_block_cache, (void**) refcount_block);
293     if (ret < 0) {
294         goto fail_block;
295     }
296 
297     /*
298      * If we come here, we need to grow the refcount table. Again, a new
299      * refcount table needs some space and we can't simply allocate to avoid
300      * endless recursion.
301      *
302      * Therefore let's grab new refcount blocks at the end of the image, which
303      * will describe themselves and the new refcount table. This way we can
304      * reference them only in the new table and do the switch to the new
305      * refcount table at once without producing an inconsistent state in
306      * between.
307      */
308     BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_GROW);
309 
310     /* Calculate the number of refcount blocks needed so far */
        /* refcount_block_clusters: number of clusters one refcount block covers */
311     uint64_t refcount_block_clusters = 1 << (s->cluster_bits - REFCOUNT_SHIFT);
312     uint64_t blocks_used = DIV_ROUND_UP(cluster_index, refcount_block_clusters);
313 
314     if (blocks_used > QCOW_MAX_REFTABLE_SIZE / sizeof(uint64_t)) {
315         return -EFBIG;
316     }
317 
318     /* And now we need at least one block more for the new metadata */
319     uint64_t table_size = next_refcount_table_size(s, blocks_used + 1);
320     uint64_t last_table_size;
321     uint64_t blocks_clusters;
        /*
         * The table size determines how many metadata clusters (table plus
         * new refcount blocks) are needed, and that number in turn feeds back
         * into the required table size. Iterate until the size is stable.
         */
322     do {
323         uint64_t table_clusters =
324             size_to_clusters(s, table_size * sizeof(uint64_t));
325         blocks_clusters = 1 +
326             ((table_clusters + refcount_block_clusters - 1)
327             / refcount_block_clusters);
328         uint64_t meta_clusters = table_clusters + blocks_clusters;
329 
330         last_table_size = table_size;
331         table_size = next_refcount_table_size(s, blocks_used +
332             ((meta_clusters + refcount_block_clusters - 1)
333             / refcount_block_clusters));
334 
335     } while (last_table_size != table_size);
336 
337 #ifdef DEBUG_ALLOC2
338     fprintf(stderr, "qcow2: Grow refcount table %" PRId32 " => %" PRId64 "\n",
339         s->refcount_table_size, table_size);
340 #endif
341 
342     /* Create the new refcount table and blocks */
        /*
         * Layout: the new refcount blocks start at the first cluster that is
         * not covered by the blocks_used existing blocks; the new table
         * follows directly after the new refcount blocks.
         */
343     uint64_t meta_offset = (blocks_used * refcount_block_clusters) *
344         s->cluster_size;
345     uint64_t table_offset = meta_offset + blocks_clusters * s->cluster_size;
346     uint16_t *new_blocks = g_malloc0(blocks_clusters * s->cluster_size);
347     uint64_t *new_table = g_malloc0(table_size * sizeof(uint64_t));
348 
349     /* Fill the new refcount table */
350     memcpy(new_table, s->refcount_table,
351         s->refcount_table_size * sizeof(uint64_t));
352     new_table[refcount_table_index] = new_block;
353 
354     int i;
355     for (i = 0; i < blocks_clusters; i++) {
356         new_table[blocks_used + i] = meta_offset + (i * s->cluster_size);
357     }
358 
359     /* Fill the refcount blocks */
        /*
         * Every new metadata cluster (refcount blocks and table) gets a
         * refcount of 1. The entries are consecutive from index 0 because
         * meta_offset is a multiple of the area covered by one refcount block.
         */
360     uint64_t table_clusters = size_to_clusters(s, table_size * sizeof(uint64_t));
361     int block = 0;
362     for (i = 0; i < table_clusters + blocks_clusters; i++) {
363         new_blocks[block++] = cpu_to_be16(1);
364     }
365 
366     /* Write refcount blocks to disk */
367     BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE_BLOCKS);
368     ret = bdrv_pwrite_sync(bs->file, meta_offset, new_blocks,
369         blocks_clusters * s->cluster_size);
        /* The staging buffer is no longer needed, whether the write worked or not */
370     g_free(new_blocks);
371     if (ret < 0) {
372         goto fail_table;
373     }
374 
375     /* Write refcount table to disk */
        /* Convert to big-endian for the write, and back to host order after it */
376     for(i = 0; i < table_size; i++) {
377         cpu_to_be64s(&new_table[i]);
378     }
379 
380     BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE_TABLE);
381     ret = bdrv_pwrite_sync(bs->file, table_offset, new_table,
382         table_size * sizeof(uint64_t));
383     if (ret < 0) {
384         goto fail_table;
385     }
386 
387     for(i = 0; i < table_size; i++) {
388         be64_to_cpus(&new_table[i]);
389     }
390 
391     /* Hook up the new refcount table in the qcow2 header */
        /*
         * 12 bytes: 64-bit table offset followed by 32-bit table size in
         * clusters, written in one go starting at the refcount_table_offset
         * header field.
         * NOTE(review): assumes the cluster-count field directly follows
         * refcount_table_offset in QCowHeader - confirm against the header
         * definition.
         */
392     uint8_t data[12];
393     cpu_to_be64w((uint64_t*)data, table_offset);
394     cpu_to_be32w((uint32_t*)(data + 8), table_clusters);
395     BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_SWITCH_TABLE);
396     ret = bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, refcount_table_offset),
397         data, sizeof(data));
398     if (ret < 0) {
399         goto fail_table;
400     }
401 
402     /* And switch it in memory */
403     uint64_t old_table_offset = s->refcount_table_offset;
404     uint64_t old_table_size = s->refcount_table_size;
405 
406     g_free(s->refcount_table);
407     s->refcount_table = new_table;
408     s->refcount_table_size = table_size;
409     s->refcount_table_offset = table_offset;
410 
411     /* Free old table. */
412     qcow2_free_clusters(bs, old_table_offset, old_table_size * sizeof(uint64_t),
413                         QCOW2_DISCARD_OTHER);
414 
        /* Hand the block for the requested cluster back to the caller */
415     ret = load_refcount_block(bs, new_block, (void**) refcount_block);
416     if (ret < 0) {
417         return ret;
418     }
419 
420     /* If we were trying to do the initial refcount update for some cluster
421      * allocation, we might have used the same clusters to store newly
422      * allocated metadata. Make the caller search some new space. */
423     return -EAGAIN;
424 
        /* Error exits: fail_table additionally frees the new table and then
         * falls through to fail_block, which releases a held cache entry */
425 fail_table:
426     g_free(new_table);
427 fail_block:
428     if (*refcount_block != NULL) {
429         qcow2_cache_put(bs, s->refcount_block_cache, (void**) refcount_block);
430     }
431     return ret;
432 }
433 
434 void qcow2_process_discards(BlockDriverState *bs, int ret)
435 {
436     BDRVQcowState *s = bs->opaque;
437     Qcow2DiscardRegion *d, *next;
438 
439     QTAILQ_FOREACH_SAFE(d, &s->discards, next, next) {
440         QTAILQ_REMOVE(&s->discards, d, next);
441 
442         /* Discard is optional, ignore the return value */
443         if (ret >= 0) {
444             bdrv_discard(bs->file,
445                          d->offset >> BDRV_SECTOR_BITS,
446                          d->bytes >> BDRV_SECTOR_BITS);
447         }
448 
449         g_free(d);
450     }
451 }
452 
453 static void update_refcount_discard(BlockDriverState *bs,
454                                     uint64_t offset, uint64_t length)
455 {
456     BDRVQcowState *s = bs->opaque;
457     Qcow2DiscardRegion *d, *p, *next;
458 
459     QTAILQ_FOREACH(d, &s->discards, next) {
460         uint64_t new_start = MIN(offset, d->offset);
461         uint64_t new_end = MAX(offset + length, d->offset + d->bytes);
462 
463         if (new_end - new_start <= length + d->bytes) {
464             /* There can't be any overlap, areas ending up here have no
465              * references any more and therefore shouldn't get freed another
466              * time. */
467             assert(d->bytes + length == new_end - new_start);
468             d->offset = new_start;
469             d->bytes = new_end - new_start;
470             goto found;
471         }
472     }
473 
474     d = g_malloc(sizeof(*d));
475     *d = (Qcow2DiscardRegion) {
476         .bs     = bs,
477         .offset = offset,
478         .bytes  = length,
479     };
480     QTAILQ_INSERT_TAIL(&s->discards, d, next);
481 
482 found:
483     /* Merge discard requests if they are adjacent now */
484     QTAILQ_FOREACH_SAFE(p, &s->discards, next, next) {
485         if (p == d
486             || p->offset > d->offset + d->bytes
487             || d->offset > p->offset + p->bytes)
488         {
489             continue;
490         }
491 
492         /* Still no overlap possible */
493         assert(p->offset == d->offset + d->bytes
494             || d->offset == p->offset + p->bytes);
495 
496         QTAILQ_REMOVE(&s->discards, p, next);
497         d->offset = MIN(d->offset, p->offset);
498         d->bytes += p->bytes;
499     }
500 }
501 
502 /* XXX: cache several refcount block clusters ? */
/*
 * Adds addend to the refcount of every cluster touched by the byte range
 * [offset, offset + length).
 *
 * type selects, via s->discard_passthrough[], whether clusters whose
 * refcount drops to zero are queued for discard.
 *
 * Returns 0 on success (also when length is 0) and -errno on failure:
 * -EINVAL for a negative length or when a resulting refcount would leave
 * the range 0..0xffff, otherwise the error of the failing helper. This
 * includes -EAGAIN passed through from alloc_refcount_block(). After a
 * failure the function tries to revert the updates already made.
 */
503 static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
504     int64_t offset, int64_t length, int addend, enum qcow2_discard_type type)
505 {
506     BDRVQcowState *s = bs->opaque;
507     int64_t start, last, cluster_offset;
508     uint16_t *refcount_block = NULL;
509     int64_t old_table_index = -1;
510     int ret;
511 
512 #ifdef DEBUG_ALLOC2
513     fprintf(stderr, "update_refcount: offset=%" PRId64 " size=%" PRId64 " addend=%d\n",
514            offset, length, addend);
515 #endif
516     if (length < 0) {
517         return -EINVAL;
518     } else if (length == 0) {
519         return 0;
520     }
521 
        /* For decrements, register the L2 table cache as a dependency of the
         * refcount block cache */
522     if (addend < 0) {
523         qcow2_cache_set_dependency(bs, s->refcount_block_cache,
524             s->l2_table_cache);
525     }
526 
        /* Walk the range cluster by cluster, from the cluster containing the
         * first byte to the cluster containing the last byte */
527     start = start_of_cluster(s, offset);
528     last = start_of_cluster(s, offset + length - 1);
529     for(cluster_offset = start; cluster_offset <= last;
530         cluster_offset += s->cluster_size)
531     {
532         int block_index, refcount;
533         int64_t cluster_index = cluster_offset >> s->cluster_bits;
534         int64_t table_index =
535             cluster_index >> (s->cluster_bits - REFCOUNT_SHIFT);
536 
537         /* Load the refcount block and allocate it if needed */
            /* Only done when this cluster belongs to a different refcount
             * block than the previous one; the old block is put back first */
538         if (table_index != old_table_index) {
539             if (refcount_block) {
540                 ret = qcow2_cache_put(bs, s->refcount_block_cache,
541                     (void**) &refcount_block);
542                 if (ret < 0) {
543                     goto fail;
544                 }
545             }
546 
547             ret = alloc_refcount_block(bs, cluster_index, &refcount_block);
548             if (ret < 0) {
549                 goto fail;
550             }
551         }
552         old_table_index = table_index;
553 
554         qcow2_cache_entry_mark_dirty(s->refcount_block_cache, refcount_block);
555 
556         /* we can update the count and save it */
557         block_index = cluster_index &
558             ((1 << (s->cluster_bits - REFCOUNT_SHIFT)) - 1);
559 
560         refcount = be16_to_cpu(refcount_block[block_index]);
561         refcount += addend;
            /* Refcounts are stored as 16-bit values; reject under/overflow */
562         if (refcount < 0 || refcount > 0xffff) {
563             ret = -EINVAL;
564             goto fail;
565         }
            /* A newly free cluster below the current search position moves
             * the free cluster search start down */
566         if (refcount == 0 && cluster_index < s->free_cluster_index) {
567             s->free_cluster_index = cluster_index;
568         }
569         refcount_block[block_index] = cpu_to_be16(refcount);
570 
571         if (refcount == 0 && s->discard_passthrough[type]) {
572             update_refcount_discard(bs, cluster_offset, s->cluster_size);
573         }
574     }
575 
576     ret = 0;
        /* Reached on success as well; ret tells the two cases apart */
577 fail:
        /* Unless discards are being collected (s->cache_discards), handle the
         * queued discards now; with ret < 0 they are dropped, not issued */
578     if (!s->cache_discards) {
579         qcow2_process_discards(bs, ret);
580     }
581 
582     /* Write last changed block to disk */
583     if (refcount_block) {
584         int wret;
585         wret = qcow2_cache_put(bs, s->refcount_block_cache,
586             (void**) &refcount_block);
            /* An earlier error takes precedence over the put error; note that
             * this early return skips the undo step below */
587         if (wret < 0) {
588             return ret < 0 ? ret : wret;
589         }
590     }
591 
592     /*
593      * Try do undo any updates if an error is returned (This may succeed in
594      * some cases like ENOSPC for allocating a new refcount block)
595      */
596     if (ret < 0) {
597         int dummy;
            /* Reverts [offset, cluster_offset), i.e. the part of the range
             * before the cluster at which the loop stopped; the result of the
             * undo attempt is deliberately ignored */
598         dummy = update_refcount(bs, offset, cluster_offset - offset, -addend,
599                                 QCOW2_DISCARD_NEVER);
600         (void)dummy;
601     }
602 
603     return ret;
604 }
605 
606 /*
607  * Increases or decreases the refcount of a given cluster by one.
608  * addend must be 1 or -1.
609  *
610  * If the return value is non-negative, it is the new refcount of the cluster.
611  * If it is negative, it is -errno and indicates an error.
612  */
613 int qcow2_update_cluster_refcount(BlockDriverState *bs,
614                                   int64_t cluster_index,
615                                   int addend,
616                                   enum qcow2_discard_type type)
617 {
618     BDRVQcowState *s = bs->opaque;
619     int ret;
620 
621     ret = update_refcount(bs, cluster_index << s->cluster_bits, 1, addend,
622                           type);
623     if (ret < 0) {
624         return ret;
625     }
626 
627     return get_refcount(bs, cluster_index);
628 }
629 
630 
631 
632 /*********************************************************/
633 /* cluster allocation functions */
634 
635 
636 
637 /* return < 0 if error */
638 static int64_t alloc_clusters_noref(BlockDriverState *bs, uint64_t size)
639 {
640     BDRVQcowState *s = bs->opaque;
641     uint64_t i, nb_clusters;
642     int refcount;
643 
644     nb_clusters = size_to_clusters(s, size);
645 retry:
646     for(i = 0; i < nb_clusters; i++) {
647         uint64_t next_cluster_index = s->free_cluster_index++;
648         refcount = get_refcount(bs, next_cluster_index);
649 
650         if (refcount < 0) {
651             return refcount;
652         } else if (refcount != 0) {
653             goto retry;
654         }
655     }
656 
657     /* Make sure that all offsets in the "allocated" range are representable
658      * in an int64_t */
659     if (s->free_cluster_index - 1 > (INT64_MAX >> s->cluster_bits)) {
660         return -EFBIG;
661     }
662 
663 #ifdef DEBUG_ALLOC2
664     fprintf(stderr, "alloc_clusters: size=%" PRId64 " -> %" PRId64 "\n",
665             size,
666             (s->free_cluster_index - nb_clusters) << s->cluster_bits);
667 #endif
668     return (s->free_cluster_index - nb_clusters) << s->cluster_bits;
669 }
670 
671 int64_t qcow2_alloc_clusters(BlockDriverState *bs, uint64_t size)
672 {
673     int64_t offset;
674     int ret;
675 
676     BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC);
677     do {
678         offset = alloc_clusters_noref(bs, size);
679         if (offset < 0) {
680             return offset;
681         }
682 
683         ret = update_refcount(bs, offset, size, 1, QCOW2_DISCARD_NEVER);
684     } while (ret == -EAGAIN);
685 
686     if (ret < 0) {
687         return ret;
688     }
689 
690     return offset;
691 }
692 
693 int qcow2_alloc_clusters_at(BlockDriverState *bs, uint64_t offset,
694     int nb_clusters)
695 {
696     BDRVQcowState *s = bs->opaque;
697     uint64_t cluster_index;
698     uint64_t i;
699     int refcount, ret;
700 
701     assert(nb_clusters >= 0);
702     if (nb_clusters == 0) {
703         return 0;
704     }
705 
706     do {
707         /* Check how many clusters there are free */
708         cluster_index = offset >> s->cluster_bits;
709         for(i = 0; i < nb_clusters; i++) {
710             refcount = get_refcount(bs, cluster_index++);
711 
712             if (refcount < 0) {
713                 return refcount;
714             } else if (refcount != 0) {
715                 break;
716             }
717         }
718 
719         /* And then allocate them */
720         ret = update_refcount(bs, offset, i << s->cluster_bits, 1,
721                               QCOW2_DISCARD_NEVER);
722     } while (ret == -EAGAIN);
723 
724     if (ret < 0) {
725         return ret;
726     }
727 
728     return i;
729 }
730 
731 /* only used to allocate compressed sectors. We try to allocate
732    contiguous sectors. size must be <= cluster_size */
733 int64_t qcow2_alloc_bytes(BlockDriverState *bs, int size)
734 {
735     BDRVQcowState *s = bs->opaque;
736     int64_t offset, cluster_offset;
737     int free_in_cluster;
738 
739     BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC_BYTES);
740     assert(size > 0 && size <= s->cluster_size);
741     if (s->free_byte_offset == 0) {
742         offset = qcow2_alloc_clusters(bs, s->cluster_size);
743         if (offset < 0) {
744             return offset;
745         }
746         s->free_byte_offset = offset;
747     }
748  redo:
749     free_in_cluster = s->cluster_size -
750         offset_into_cluster(s, s->free_byte_offset);
751     if (size <= free_in_cluster) {
752         /* enough space in current cluster */
753         offset = s->free_byte_offset;
754         s->free_byte_offset += size;
755         free_in_cluster -= size;
756         if (free_in_cluster == 0)
757             s->free_byte_offset = 0;
758         if (offset_into_cluster(s, offset) != 0)
759             qcow2_update_cluster_refcount(bs, offset >> s->cluster_bits, 1,
760                                           QCOW2_DISCARD_NEVER);
761     } else {
762         offset = qcow2_alloc_clusters(bs, s->cluster_size);
763         if (offset < 0) {
764             return offset;
765         }
766         cluster_offset = start_of_cluster(s, s->free_byte_offset);
767         if ((cluster_offset + s->cluster_size) == offset) {
768             /* we are lucky: contiguous data */
769             offset = s->free_byte_offset;
770             qcow2_update_cluster_refcount(bs, offset >> s->cluster_bits, 1,
771                                           QCOW2_DISCARD_NEVER);
772             s->free_byte_offset += size;
773         } else {
774             s->free_byte_offset = offset;
775             goto redo;
776         }
777     }
778 
779     /* The cluster refcount was incremented, either by qcow2_alloc_clusters()
780      * or explicitly by qcow2_update_cluster_refcount().  Refcount blocks must
781      * be flushed before the caller's L2 table updates.
782      */
783     qcow2_cache_set_dependency(bs, s->l2_table_cache, s->refcount_block_cache);
784     return offset;
785 }
786 
787 void qcow2_free_clusters(BlockDriverState *bs,
788                           int64_t offset, int64_t size,
789                           enum qcow2_discard_type type)
790 {
791     int ret;
792 
793     BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_FREE);
794     ret = update_refcount(bs, offset, size, -1, type);
795     if (ret < 0) {
796         fprintf(stderr, "qcow2_free_clusters failed: %s\n", strerror(-ret));
797         /* TODO Remember the clusters to free them later and avoid leaking */
798     }
799 }
800 
801 /*
802  * Free a cluster using its L2 entry (handles clusters of all types, e.g.
803  * normal cluster, compressed cluster, etc.)
804  */
805 void qcow2_free_any_clusters(BlockDriverState *bs, uint64_t l2_entry,
806                              int nb_clusters, enum qcow2_discard_type type)
807 {
808     BDRVQcowState *s = bs->opaque;
809 
810     switch (qcow2_get_cluster_type(l2_entry)) {
811     case QCOW2_CLUSTER_COMPRESSED:
812         {
813             int nb_csectors;
814             nb_csectors = ((l2_entry >> s->csize_shift) &
815                            s->csize_mask) + 1;
816             qcow2_free_clusters(bs,
817                 (l2_entry & s->cluster_offset_mask) & ~511,
818                 nb_csectors * 512, type);
819         }
820         break;
821     case QCOW2_CLUSTER_NORMAL:
822     case QCOW2_CLUSTER_ZERO:
823         if (l2_entry & L2E_OFFSET_MASK) {
824             qcow2_free_clusters(bs, l2_entry & L2E_OFFSET_MASK,
825                                 nb_clusters << s->cluster_bits, type);
826         }
827         break;
828     case QCOW2_CLUSTER_UNALLOCATED:
829         break;
830     default:
831         abort();
832     }
833 }
834 
835 
836 
837 /*********************************************************/
838 /* snapshots and image creation */
839 
840 
841 
842 /* update the refcounts of snapshots and the copied flag */
843 int qcow2_update_snapshot_refcount(BlockDriverState *bs,
844     int64_t l1_table_offset, int l1_size, int addend)
845 {
846     BDRVQcowState *s = bs->opaque;
847     uint64_t *l1_table, *l2_table, l2_offset, offset, l1_size2, l1_allocated;
848     int64_t old_offset, old_l2_offset;
849     int i, j, l1_modified = 0, nb_csectors, refcount;
850     int ret;
851 
852     l2_table = NULL;
853     l1_table = NULL;
854     l1_size2 = l1_size * sizeof(uint64_t);
855 
856     s->cache_discards = true;
857 
858     /* WARNING: qcow2_snapshot_goto relies on this function not using the
859      * l1_table_offset when it is the current s->l1_table_offset! Be careful
860      * when changing this! */
861     if (l1_table_offset != s->l1_table_offset) {
862         l1_table = g_malloc0(align_offset(l1_size2, 512));
863         l1_allocated = 1;
864 
865         ret = bdrv_pread(bs->file, l1_table_offset, l1_table, l1_size2);
866         if (ret < 0) {
867             goto fail;
868         }
869 
870         for(i = 0;i < l1_size; i++)
871             be64_to_cpus(&l1_table[i]);
872     } else {
873         assert(l1_size == s->l1_size);
874         l1_table = s->l1_table;
875         l1_allocated = 0;
876     }
877 
878     for(i = 0; i < l1_size; i++) {
879         l2_offset = l1_table[i];
880         if (l2_offset) {
881             old_l2_offset = l2_offset;
882             l2_offset &= L1E_OFFSET_MASK;
883 
884             ret = qcow2_cache_get(bs, s->l2_table_cache, l2_offset,
885                 (void**) &l2_table);
886             if (ret < 0) {
887                 goto fail;
888             }
889 
890             for(j = 0; j < s->l2_size; j++) {
891                 uint64_t cluster_index;
892 
893                 offset = be64_to_cpu(l2_table[j]);
894                 old_offset = offset;
895                 offset &= ~QCOW_OFLAG_COPIED;
896 
897                 switch (qcow2_get_cluster_type(offset)) {
898                     case QCOW2_CLUSTER_COMPRESSED:
899                         nb_csectors = ((offset >> s->csize_shift) &
900                                        s->csize_mask) + 1;
901                         if (addend != 0) {
902                             ret = update_refcount(bs,
903                                 (offset & s->cluster_offset_mask) & ~511,
904                                 nb_csectors * 512, addend,
905                                 QCOW2_DISCARD_SNAPSHOT);
906                             if (ret < 0) {
907                                 goto fail;
908                             }
909                         }
910                         /* compressed clusters are never modified */
911                         refcount = 2;
912                         break;
913 
914                     case QCOW2_CLUSTER_NORMAL:
915                     case QCOW2_CLUSTER_ZERO:
916                         cluster_index = (offset & L2E_OFFSET_MASK) >> s->cluster_bits;
917                         if (!cluster_index) {
918                             /* unallocated */
919                             refcount = 0;
920                             break;
921                         }
922                         if (addend != 0) {
923                             refcount = qcow2_update_cluster_refcount(bs,
924                                     cluster_index, addend,
925                                     QCOW2_DISCARD_SNAPSHOT);
926                         } else {
927                             refcount = get_refcount(bs, cluster_index);
928                         }
929 
930                         if (refcount < 0) {
931                             ret = refcount;
932                             goto fail;
933                         }
934                         break;
935 
936                     case QCOW2_CLUSTER_UNALLOCATED:
937                         refcount = 0;
938                         break;
939 
940                     default:
941                         abort();
942                 }
943 
944                 if (refcount == 1) {
945                     offset |= QCOW_OFLAG_COPIED;
946                 }
947                 if (offset != old_offset) {
948                     if (addend > 0) {
949                         qcow2_cache_set_dependency(bs, s->l2_table_cache,
950                             s->refcount_block_cache);
951                     }
952                     l2_table[j] = cpu_to_be64(offset);
953                     qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table);
954                 }
955             }
956 
957             ret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
958             if (ret < 0) {
959                 goto fail;
960             }
961 
962 
963             if (addend != 0) {
964                 refcount = qcow2_update_cluster_refcount(bs, l2_offset >>
965                         s->cluster_bits, addend, QCOW2_DISCARD_SNAPSHOT);
966             } else {
967                 refcount = get_refcount(bs, l2_offset >> s->cluster_bits);
968             }
969             if (refcount < 0) {
970                 ret = refcount;
971                 goto fail;
972             } else if (refcount == 1) {
973                 l2_offset |= QCOW_OFLAG_COPIED;
974             }
975             if (l2_offset != old_l2_offset) {
976                 l1_table[i] = l2_offset;
977                 l1_modified = 1;
978             }
979         }
980     }
981 
982     ret = bdrv_flush(bs);
983 fail:
984     if (l2_table) {
985         qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
986     }
987 
988     s->cache_discards = false;
989     qcow2_process_discards(bs, ret);
990 
991     /* Update L1 only if it isn't deleted anyway (addend = -1) */
992     if (ret == 0 && addend >= 0 && l1_modified) {
993         for (i = 0; i < l1_size; i++) {
994             cpu_to_be64s(&l1_table[i]);
995         }
996 
997         ret = bdrv_pwrite_sync(bs->file, l1_table_offset, l1_table, l1_size2);
998 
999         for (i = 0; i < l1_size; i++) {
1000             be64_to_cpus(&l1_table[i]);
1001         }
1002     }
1003     if (l1_allocated)
1004         g_free(l1_table);
1005     return ret;
1006 }
1007 
1008 
1009 
1010 
1011 /*********************************************************/
1012 /* refcount checking functions */
1013 
1014 
1015 
1016 /*
1017  * Increases the refcount for a range of clusters in a given refcount table.
1018  * This is used to construct a temporary refcount table out of L1 and L2 tables
1019  * which can be compared the the refcount table saved in the image.
1020  *
1021  * Modifies the number of errors in res.
1022  */
1023 static void inc_refcounts(BlockDriverState *bs,
1024                           BdrvCheckResult *res,
1025                           uint16_t *refcount_table,
1026                           int refcount_table_size,
1027                           int64_t offset, int64_t size)
1028 {
1029     BDRVQcowState *s = bs->opaque;
1030     uint64_t start, last, cluster_offset, k;
1031 
1032     if (size <= 0)
1033         return;
1034 
1035     start = start_of_cluster(s, offset);
1036     last = start_of_cluster(s, offset + size - 1);
1037     for(cluster_offset = start; cluster_offset <= last;
1038         cluster_offset += s->cluster_size) {
1039         k = cluster_offset >> s->cluster_bits;
1040         if (k >= refcount_table_size) {
1041             fprintf(stderr, "Warning: cluster offset=0x%" PRIx64 " is after "
1042                 "the end of the image file, can't properly check refcounts.\n",
1043                 cluster_offset);
1044             res->check_errors++;
1045         } else {
1046             if (++refcount_table[k] == 0) {
1047                 fprintf(stderr, "ERROR: overflow cluster offset=0x%" PRIx64
1048                     "\n", cluster_offset);
1049                 res->corruptions++;
1050             }
1051         }
1052     }
1053 }
1054 
/* Flag bits accepted by check_refcounts_l1() and check_refcounts_l2() */
enum {
    /* update the BlockFragInfo counters while walking the tables */
    CHECK_FRAG_INFO = 1 << 1,
};
1059 
1060 /*
1061  * Increases the refcount in the given refcount table for the all clusters
1062  * referenced in the L2 table. While doing so, performs some checks on L2
1063  * entries.
1064  *
1065  * Returns the number of errors found by the checks or -errno if an internal
1066  * error occurred.
1067  */
1068 static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res,
1069     uint16_t *refcount_table, int refcount_table_size, int64_t l2_offset,
1070     int flags)
1071 {
1072     BDRVQcowState *s = bs->opaque;
1073     uint64_t *l2_table, l2_entry;
1074     uint64_t next_contiguous_offset = 0;
1075     int i, l2_size, nb_csectors;
1076 
1077     /* Read L2 table from disk */
1078     l2_size = s->l2_size * sizeof(uint64_t);
1079     l2_table = g_malloc(l2_size);
1080 
1081     if (bdrv_pread(bs->file, l2_offset, l2_table, l2_size) != l2_size)
1082         goto fail;
1083 
1084     /* Do the actual checks */
1085     for(i = 0; i < s->l2_size; i++) {
1086         l2_entry = be64_to_cpu(l2_table[i]);
1087 
1088         switch (qcow2_get_cluster_type(l2_entry)) {
1089         case QCOW2_CLUSTER_COMPRESSED:
1090             /* Compressed clusters don't have QCOW_OFLAG_COPIED */
1091             if (l2_entry & QCOW_OFLAG_COPIED) {
1092                 fprintf(stderr, "ERROR: cluster %" PRId64 ": "
1093                     "copied flag must never be set for compressed "
1094                     "clusters\n", l2_entry >> s->cluster_bits);
1095                 l2_entry &= ~QCOW_OFLAG_COPIED;
1096                 res->corruptions++;
1097             }
1098 
1099             /* Mark cluster as used */
1100             nb_csectors = ((l2_entry >> s->csize_shift) &
1101                            s->csize_mask) + 1;
1102             l2_entry &= s->cluster_offset_mask;
1103             inc_refcounts(bs, res, refcount_table, refcount_table_size,
1104                 l2_entry & ~511, nb_csectors * 512);
1105 
1106             if (flags & CHECK_FRAG_INFO) {
1107                 res->bfi.allocated_clusters++;
1108                 res->bfi.compressed_clusters++;
1109 
1110                 /* Compressed clusters are fragmented by nature.  Since they
1111                  * take up sub-sector space but we only have sector granularity
1112                  * I/O we need to re-read the same sectors even for adjacent
1113                  * compressed clusters.
1114                  */
1115                 res->bfi.fragmented_clusters++;
1116             }
1117             break;
1118 
1119         case QCOW2_CLUSTER_ZERO:
1120             if ((l2_entry & L2E_OFFSET_MASK) == 0) {
1121                 break;
1122             }
1123             /* fall through */
1124 
1125         case QCOW2_CLUSTER_NORMAL:
1126         {
1127             uint64_t offset = l2_entry & L2E_OFFSET_MASK;
1128 
1129             if (flags & CHECK_FRAG_INFO) {
1130                 res->bfi.allocated_clusters++;
1131                 if (next_contiguous_offset &&
1132                     offset != next_contiguous_offset) {
1133                     res->bfi.fragmented_clusters++;
1134                 }
1135                 next_contiguous_offset = offset + s->cluster_size;
1136             }
1137 
1138             /* Mark cluster as used */
1139             inc_refcounts(bs, res, refcount_table,refcount_table_size,
1140                 offset, s->cluster_size);
1141 
1142             /* Correct offsets are cluster aligned */
1143             if (offset_into_cluster(s, offset)) {
1144                 fprintf(stderr, "ERROR offset=%" PRIx64 ": Cluster is not "
1145                     "properly aligned; L2 entry corrupted.\n", offset);
1146                 res->corruptions++;
1147             }
1148             break;
1149         }
1150 
1151         case QCOW2_CLUSTER_UNALLOCATED:
1152             break;
1153 
1154         default:
1155             abort();
1156         }
1157     }
1158 
1159     g_free(l2_table);
1160     return 0;
1161 
1162 fail:
1163     fprintf(stderr, "ERROR: I/O error in check_refcounts_l2\n");
1164     g_free(l2_table);
1165     return -EIO;
1166 }
1167 
1168 /*
1169  * Increases the refcount for the L1 table, its L2 tables and all referenced
1170  * clusters in the given refcount table. While doing so, performs some checks
1171  * on L1 and L2 entries.
1172  *
1173  * Returns the number of errors found by the checks or -errno if an internal
1174  * error occurred.
1175  */
1176 static int check_refcounts_l1(BlockDriverState *bs,
1177                               BdrvCheckResult *res,
1178                               uint16_t *refcount_table,
1179                               int refcount_table_size,
1180                               int64_t l1_table_offset, int l1_size,
1181                               int flags)
1182 {
1183     BDRVQcowState *s = bs->opaque;
1184     uint64_t *l1_table, l2_offset, l1_size2;
1185     int i, ret;
1186 
1187     l1_size2 = l1_size * sizeof(uint64_t);
1188 
1189     /* Mark L1 table as used */
1190     inc_refcounts(bs, res, refcount_table, refcount_table_size,
1191         l1_table_offset, l1_size2);
1192 
1193     /* Read L1 table entries from disk */
1194     if (l1_size2 == 0) {
1195         l1_table = NULL;
1196     } else {
1197         l1_table = g_malloc(l1_size2);
1198         if (bdrv_pread(bs->file, l1_table_offset,
1199                        l1_table, l1_size2) != l1_size2)
1200             goto fail;
1201         for(i = 0;i < l1_size; i++)
1202             be64_to_cpus(&l1_table[i]);
1203     }
1204 
1205     /* Do the actual checks */
1206     for(i = 0; i < l1_size; i++) {
1207         l2_offset = l1_table[i];
1208         if (l2_offset) {
1209             /* Mark L2 table as used */
1210             l2_offset &= L1E_OFFSET_MASK;
1211             inc_refcounts(bs, res, refcount_table, refcount_table_size,
1212                 l2_offset, s->cluster_size);
1213 
1214             /* L2 tables are cluster aligned */
1215             if (offset_into_cluster(s, l2_offset)) {
1216                 fprintf(stderr, "ERROR l2_offset=%" PRIx64 ": Table is not "
1217                     "cluster aligned; L1 entry corrupted\n", l2_offset);
1218                 res->corruptions++;
1219             }
1220 
1221             /* Process and check L2 entries */
1222             ret = check_refcounts_l2(bs, res, refcount_table,
1223                                      refcount_table_size, l2_offset, flags);
1224             if (ret < 0) {
1225                 goto fail;
1226             }
1227         }
1228     }
1229     g_free(l1_table);
1230     return 0;
1231 
1232 fail:
1233     fprintf(stderr, "ERROR: I/O error in check_refcounts_l1\n");
1234     res->check_errors++;
1235     g_free(l1_table);
1236     return -EIO;
1237 }
1238 
1239 /*
1240  * Checks the OFLAG_COPIED flag for all L1 and L2 entries.
1241  *
1242  * This function does not print an error message nor does it increment
1243  * check_errors if get_refcount fails (this is because such an error will have
1244  * been already detected and sufficiently signaled by the calling function
1245  * (qcow2_check_refcounts) by the time this function is called).
1246  */
1247 static int check_oflag_copied(BlockDriverState *bs, BdrvCheckResult *res,
1248                               BdrvCheckMode fix)
1249 {
1250     BDRVQcowState *s = bs->opaque;
1251     uint64_t *l2_table = qemu_blockalign(bs, s->cluster_size);
1252     int ret;
1253     int refcount;
1254     int i, j;
1255 
1256     for (i = 0; i < s->l1_size; i++) {
1257         uint64_t l1_entry = s->l1_table[i];
1258         uint64_t l2_offset = l1_entry & L1E_OFFSET_MASK;
1259         bool l2_dirty = false;
1260 
1261         if (!l2_offset) {
1262             continue;
1263         }
1264 
1265         refcount = get_refcount(bs, l2_offset >> s->cluster_bits);
1266         if (refcount < 0) {
1267             /* don't print message nor increment check_errors */
1268             continue;
1269         }
1270         if ((refcount == 1) != ((l1_entry & QCOW_OFLAG_COPIED) != 0)) {
1271             fprintf(stderr, "%s OFLAG_COPIED L2 cluster: l1_index=%d "
1272                     "l1_entry=%" PRIx64 " refcount=%d\n",
1273                     fix & BDRV_FIX_ERRORS ? "Repairing" :
1274                                             "ERROR",
1275                     i, l1_entry, refcount);
1276             if (fix & BDRV_FIX_ERRORS) {
1277                 s->l1_table[i] = refcount == 1
1278                                ? l1_entry |  QCOW_OFLAG_COPIED
1279                                : l1_entry & ~QCOW_OFLAG_COPIED;
1280                 ret = qcow2_write_l1_entry(bs, i);
1281                 if (ret < 0) {
1282                     res->check_errors++;
1283                     goto fail;
1284                 }
1285                 res->corruptions_fixed++;
1286             } else {
1287                 res->corruptions++;
1288             }
1289         }
1290 
1291         ret = bdrv_pread(bs->file, l2_offset, l2_table,
1292                          s->l2_size * sizeof(uint64_t));
1293         if (ret < 0) {
1294             fprintf(stderr, "ERROR: Could not read L2 table: %s\n",
1295                     strerror(-ret));
1296             res->check_errors++;
1297             goto fail;
1298         }
1299 
1300         for (j = 0; j < s->l2_size; j++) {
1301             uint64_t l2_entry = be64_to_cpu(l2_table[j]);
1302             uint64_t data_offset = l2_entry & L2E_OFFSET_MASK;
1303             int cluster_type = qcow2_get_cluster_type(l2_entry);
1304 
1305             if ((cluster_type == QCOW2_CLUSTER_NORMAL) ||
1306                 ((cluster_type == QCOW2_CLUSTER_ZERO) && (data_offset != 0))) {
1307                 refcount = get_refcount(bs, data_offset >> s->cluster_bits);
1308                 if (refcount < 0) {
1309                     /* don't print message nor increment check_errors */
1310                     continue;
1311                 }
1312                 if ((refcount == 1) != ((l2_entry & QCOW_OFLAG_COPIED) != 0)) {
1313                     fprintf(stderr, "%s OFLAG_COPIED data cluster: "
1314                             "l2_entry=%" PRIx64 " refcount=%d\n",
1315                             fix & BDRV_FIX_ERRORS ? "Repairing" :
1316                                                     "ERROR",
1317                             l2_entry, refcount);
1318                     if (fix & BDRV_FIX_ERRORS) {
1319                         l2_table[j] = cpu_to_be64(refcount == 1
1320                                     ? l2_entry |  QCOW_OFLAG_COPIED
1321                                     : l2_entry & ~QCOW_OFLAG_COPIED);
1322                         l2_dirty = true;
1323                         res->corruptions_fixed++;
1324                     } else {
1325                         res->corruptions++;
1326                     }
1327                 }
1328             }
1329         }
1330 
1331         if (l2_dirty) {
1332             ret = qcow2_pre_write_overlap_check(bs, QCOW2_OL_ACTIVE_L2,
1333                                                 l2_offset, s->cluster_size);
1334             if (ret < 0) {
1335                 fprintf(stderr, "ERROR: Could not write L2 table; metadata "
1336                         "overlap check failed: %s\n", strerror(-ret));
1337                 res->check_errors++;
1338                 goto fail;
1339             }
1340 
1341             ret = bdrv_pwrite(bs->file, l2_offset, l2_table, s->cluster_size);
1342             if (ret < 0) {
1343                 fprintf(stderr, "ERROR: Could not write L2 table: %s\n",
1344                         strerror(-ret));
1345                 res->check_errors++;
1346                 goto fail;
1347             }
1348         }
1349     }
1350 
1351     ret = 0;
1352 
1353 fail:
1354     qemu_vfree(l2_table);
1355     return ret;
1356 }
1357 
1358 /*
1359  * Writes one sector of the refcount table to the disk
1360  */
1361 #define RT_ENTRIES_PER_SECTOR (512 / sizeof(uint64_t))
1362 static int write_reftable_entry(BlockDriverState *bs, int rt_index)
1363 {
1364     BDRVQcowState *s = bs->opaque;
1365     uint64_t buf[RT_ENTRIES_PER_SECTOR];
1366     int rt_start_index;
1367     int i, ret;
1368 
1369     rt_start_index = rt_index & ~(RT_ENTRIES_PER_SECTOR - 1);
1370     for (i = 0; i < RT_ENTRIES_PER_SECTOR; i++) {
1371         buf[i] = cpu_to_be64(s->refcount_table[rt_start_index + i]);
1372     }
1373 
1374     ret = qcow2_pre_write_overlap_check(bs, QCOW2_OL_REFCOUNT_TABLE,
1375             s->refcount_table_offset + rt_start_index * sizeof(uint64_t),
1376             sizeof(buf));
1377     if (ret < 0) {
1378         return ret;
1379     }
1380 
1381     BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_UPDATE);
1382     ret = bdrv_pwrite_sync(bs->file, s->refcount_table_offset +
1383             rt_start_index * sizeof(uint64_t), buf, sizeof(buf));
1384     if (ret < 0) {
1385         return ret;
1386     }
1387 
1388     return 0;
1389 }
1390 
1391 /*
1392  * Allocates a new cluster for the given refcount block (represented by its
1393  * offset in the image file) and copies the current content there. This function
1394  * does _not_ decrement the reference count for the currently occupied cluster.
1395  *
1396  * This function prints an informative message to stderr on error (and returns
1397  * -errno); on success, the offset of the newly allocated cluster is returned.
1398  */
1399 static int64_t realloc_refcount_block(BlockDriverState *bs, int reftable_index,
1400                                       uint64_t offset)
1401 {
1402     BDRVQcowState *s = bs->opaque;
1403     int64_t new_offset = 0;
1404     void *refcount_block = NULL;
1405     int ret;
1406 
1407     /* allocate new refcount block */
1408     new_offset = qcow2_alloc_clusters(bs, s->cluster_size);
1409     if (new_offset < 0) {
1410         fprintf(stderr, "Could not allocate new cluster: %s\n",
1411                 strerror(-new_offset));
1412         ret = new_offset;
1413         goto done;
1414     }
1415 
1416     /* fetch current refcount block content */
1417     ret = qcow2_cache_get(bs, s->refcount_block_cache, offset, &refcount_block);
1418     if (ret < 0) {
1419         fprintf(stderr, "Could not fetch refcount block: %s\n", strerror(-ret));
1420         goto fail_free_cluster;
1421     }
1422 
1423     /* new block has not yet been entered into refcount table, therefore it is
1424      * no refcount block yet (regarding this check) */
1425     ret = qcow2_pre_write_overlap_check(bs, 0, new_offset, s->cluster_size);
1426     if (ret < 0) {
1427         fprintf(stderr, "Could not write refcount block; metadata overlap "
1428                 "check failed: %s\n", strerror(-ret));
1429         /* the image will be marked corrupt, so don't even attempt on freeing
1430          * the cluster */
1431         goto done;
1432     }
1433 
1434     /* write to new block */
1435     ret = bdrv_write(bs->file, new_offset / BDRV_SECTOR_SIZE, refcount_block,
1436             s->cluster_sectors);
1437     if (ret < 0) {
1438         fprintf(stderr, "Could not write refcount block: %s\n", strerror(-ret));
1439         goto fail_free_cluster;
1440     }
1441 
1442     /* update refcount table */
1443     assert(!offset_into_cluster(s, new_offset));
1444     s->refcount_table[reftable_index] = new_offset;
1445     ret = write_reftable_entry(bs, reftable_index);
1446     if (ret < 0) {
1447         fprintf(stderr, "Could not update refcount table: %s\n",
1448                 strerror(-ret));
1449         goto fail_free_cluster;
1450     }
1451 
1452     goto done;
1453 
1454 fail_free_cluster:
1455     qcow2_free_clusters(bs, new_offset, s->cluster_size, QCOW2_DISCARD_OTHER);
1456 
1457 done:
1458     if (refcount_block) {
1459         /* This should never fail, as it would only do so if the given refcount
1460          * block cannot be found in the cache. As this is impossible as long as
1461          * there are no bugs, assert the success. */
1462         int tmp = qcow2_cache_put(bs, s->refcount_block_cache, &refcount_block);
1463         assert(tmp == 0);
1464     }
1465 
1466     if (ret < 0) {
1467         return ret;
1468     }
1469 
1470     return new_offset;
1471 }
1472 
1473 /*
1474  * Checks an image for refcount consistency.
1475  *
1476  * Returns 0 if no errors are found, the number of errors in case the image is
1477  * detected as corrupted, and -errno when an internal error occurred.
1478  */
1479 int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
1480                           BdrvCheckMode fix)
1481 {
1482     BDRVQcowState *s = bs->opaque;
1483     int64_t size, i, highest_cluster, nb_clusters;
1484     int refcount1, refcount2;
1485     QCowSnapshot *sn;
1486     uint16_t *refcount_table;
1487     int ret;
1488 
1489     size = bdrv_getlength(bs->file);
1490     if (size < 0) {
1491         res->check_errors++;
1492         return size;
1493     }
1494 
1495     nb_clusters = size_to_clusters(s, size);
1496     if (nb_clusters > INT_MAX) {
1497         res->check_errors++;
1498         return -EFBIG;
1499     }
1500 
1501     refcount_table = g_malloc0(nb_clusters * sizeof(uint16_t));
1502 
1503     res->bfi.total_clusters =
1504         size_to_clusters(s, bs->total_sectors * BDRV_SECTOR_SIZE);
1505 
1506     /* header */
1507     inc_refcounts(bs, res, refcount_table, nb_clusters,
1508         0, s->cluster_size);
1509 
1510     /* current L1 table */
1511     ret = check_refcounts_l1(bs, res, refcount_table, nb_clusters,
1512                              s->l1_table_offset, s->l1_size, CHECK_FRAG_INFO);
1513     if (ret < 0) {
1514         goto fail;
1515     }
1516 
1517     /* snapshots */
1518     for(i = 0; i < s->nb_snapshots; i++) {
1519         sn = s->snapshots + i;
1520         ret = check_refcounts_l1(bs, res, refcount_table, nb_clusters,
1521             sn->l1_table_offset, sn->l1_size, 0);
1522         if (ret < 0) {
1523             goto fail;
1524         }
1525     }
1526     inc_refcounts(bs, res, refcount_table, nb_clusters,
1527         s->snapshots_offset, s->snapshots_size);
1528 
1529     /* refcount data */
1530     inc_refcounts(bs, res, refcount_table, nb_clusters,
1531         s->refcount_table_offset,
1532         s->refcount_table_size * sizeof(uint64_t));
1533 
1534     for(i = 0; i < s->refcount_table_size; i++) {
1535         uint64_t offset, cluster;
1536         offset = s->refcount_table[i];
1537         cluster = offset >> s->cluster_bits;
1538 
1539         /* Refcount blocks are cluster aligned */
1540         if (offset_into_cluster(s, offset)) {
1541             fprintf(stderr, "ERROR refcount block %" PRId64 " is not "
1542                 "cluster aligned; refcount table entry corrupted\n", i);
1543             res->corruptions++;
1544             continue;
1545         }
1546 
1547         if (cluster >= nb_clusters) {
1548             fprintf(stderr, "ERROR refcount block %" PRId64
1549                     " is outside image\n", i);
1550             res->corruptions++;
1551             continue;
1552         }
1553 
1554         if (offset != 0) {
1555             inc_refcounts(bs, res, refcount_table, nb_clusters,
1556                 offset, s->cluster_size);
1557             if (refcount_table[cluster] != 1) {
1558                 fprintf(stderr, "%s refcount block %" PRId64
1559                     " refcount=%d\n",
1560                     fix & BDRV_FIX_ERRORS ? "Repairing" :
1561                                             "ERROR",
1562                     i, refcount_table[cluster]);
1563 
1564                 if (fix & BDRV_FIX_ERRORS) {
1565                     int64_t new_offset;
1566 
1567                     new_offset = realloc_refcount_block(bs, i, offset);
1568                     if (new_offset < 0) {
1569                         res->corruptions++;
1570                         continue;
1571                     }
1572 
1573                     /* update refcounts */
1574                     if ((new_offset >> s->cluster_bits) >= nb_clusters) {
1575                         /* increase refcount_table size if necessary */
1576                         int old_nb_clusters = nb_clusters;
1577                         nb_clusters = (new_offset >> s->cluster_bits) + 1;
1578                         refcount_table = g_realloc(refcount_table,
1579                                 nb_clusters * sizeof(uint16_t));
1580                         memset(&refcount_table[old_nb_clusters], 0, (nb_clusters
1581                                 - old_nb_clusters) * sizeof(uint16_t));
1582                     }
1583                     refcount_table[cluster]--;
1584                     inc_refcounts(bs, res, refcount_table, nb_clusters,
1585                             new_offset, s->cluster_size);
1586 
1587                     res->corruptions_fixed++;
1588                 } else {
1589                     res->corruptions++;
1590                 }
1591             }
1592         }
1593     }
1594 
1595     /* compare ref counts */
1596     for (i = 0, highest_cluster = 0; i < nb_clusters; i++) {
1597         refcount1 = get_refcount(bs, i);
1598         if (refcount1 < 0) {
1599             fprintf(stderr, "Can't get refcount for cluster %" PRId64 ": %s\n",
1600                 i, strerror(-refcount1));
1601             res->check_errors++;
1602             continue;
1603         }
1604 
1605         refcount2 = refcount_table[i];
1606 
1607         if (refcount1 > 0 || refcount2 > 0) {
1608             highest_cluster = i;
1609         }
1610 
1611         if (refcount1 != refcount2) {
1612 
1613             /* Check if we're allowed to fix the mismatch */
1614             int *num_fixed = NULL;
1615             if (refcount1 > refcount2 && (fix & BDRV_FIX_LEAKS)) {
1616                 num_fixed = &res->leaks_fixed;
1617             } else if (refcount1 < refcount2 && (fix & BDRV_FIX_ERRORS)) {
1618                 num_fixed = &res->corruptions_fixed;
1619             }
1620 
1621             fprintf(stderr, "%s cluster %" PRId64 " refcount=%d reference=%d\n",
1622                    num_fixed != NULL     ? "Repairing" :
1623                    refcount1 < refcount2 ? "ERROR" :
1624                                            "Leaked",
1625                    i, refcount1, refcount2);
1626 
1627             if (num_fixed) {
1628                 ret = update_refcount(bs, i << s->cluster_bits, 1,
1629                                       refcount2 - refcount1,
1630                                       QCOW2_DISCARD_ALWAYS);
1631                 if (ret >= 0) {
1632                     (*num_fixed)++;
1633                     continue;
1634                 }
1635             }
1636 
1637             /* And if we couldn't, print an error */
1638             if (refcount1 < refcount2) {
1639                 res->corruptions++;
1640             } else {
1641                 res->leaks++;
1642             }
1643         }
1644     }
1645 
1646     /* check OFLAG_COPIED */
1647     ret = check_oflag_copied(bs, res, fix);
1648     if (ret < 0) {
1649         goto fail;
1650     }
1651 
1652     res->image_end_offset = (highest_cluster + 1) * s->cluster_size;
1653     ret = 0;
1654 
1655 fail:
1656     g_free(refcount_table);
1657 
1658     return ret;
1659 }
1660 
/* Tests the range (ofs, sz) against the variables named offset and size that
 * are in scope where the macro is expanded */
#define overlaps_with(ofs, sz) ranges_overlap(offset, size, ofs, sz)
1663 
1664 /*
1665  * Checks if the given offset into the image file is actually free to use by
1666  * looking for overlaps with important metadata sections (L1/L2 tables etc.),
1667  * i.e. a sanity check without relying on the refcount tables.
1668  *
1669  * The ign parameter specifies what checks not to perform (being a bitmask of
1670  * QCow2MetadataOverlap values), i.e., what sections to ignore.
1671  *
1672  * Returns:
1673  * - 0 if writing to this offset will not affect the mentioned metadata
1674  * - a positive QCow2MetadataOverlap value indicating one overlapping section
1675  * - a negative value (-errno) indicating an error while performing a check,
1676  *   e.g. when bdrv_read failed on QCOW2_OL_INACTIVE_L2
1677  */
1678 int qcow2_check_metadata_overlap(BlockDriverState *bs, int ign, int64_t offset,
1679                                  int64_t size)
1680 {
1681     BDRVQcowState *s = bs->opaque;
1682     int chk = s->overlap_check & ~ign;
1683     int i, j;
1684 
1685     if (!size) {
1686         return 0;
1687     }
1688 
1689     if (chk & QCOW2_OL_MAIN_HEADER) {
1690         if (offset < s->cluster_size) {
1691             return QCOW2_OL_MAIN_HEADER;
1692         }
1693     }
1694 
1695     /* align range to test to cluster boundaries */
1696     size = align_offset(offset_into_cluster(s, offset) + size, s->cluster_size);
1697     offset = start_of_cluster(s, offset);
1698 
1699     if ((chk & QCOW2_OL_ACTIVE_L1) && s->l1_size) {
1700         if (overlaps_with(s->l1_table_offset, s->l1_size * sizeof(uint64_t))) {
1701             return QCOW2_OL_ACTIVE_L1;
1702         }
1703     }
1704 
1705     if ((chk & QCOW2_OL_REFCOUNT_TABLE) && s->refcount_table_size) {
1706         if (overlaps_with(s->refcount_table_offset,
1707             s->refcount_table_size * sizeof(uint64_t))) {
1708             return QCOW2_OL_REFCOUNT_TABLE;
1709         }
1710     }
1711 
1712     if ((chk & QCOW2_OL_SNAPSHOT_TABLE) && s->snapshots_size) {
1713         if (overlaps_with(s->snapshots_offset, s->snapshots_size)) {
1714             return QCOW2_OL_SNAPSHOT_TABLE;
1715         }
1716     }
1717 
1718     if ((chk & QCOW2_OL_INACTIVE_L1) && s->snapshots) {
1719         for (i = 0; i < s->nb_snapshots; i++) {
1720             if (s->snapshots[i].l1_size &&
1721                 overlaps_with(s->snapshots[i].l1_table_offset,
1722                 s->snapshots[i].l1_size * sizeof(uint64_t))) {
1723                 return QCOW2_OL_INACTIVE_L1;
1724             }
1725         }
1726     }
1727 
1728     if ((chk & QCOW2_OL_ACTIVE_L2) && s->l1_table) {
1729         for (i = 0; i < s->l1_size; i++) {
1730             if ((s->l1_table[i] & L1E_OFFSET_MASK) &&
1731                 overlaps_with(s->l1_table[i] & L1E_OFFSET_MASK,
1732                 s->cluster_size)) {
1733                 return QCOW2_OL_ACTIVE_L2;
1734             }
1735         }
1736     }
1737 
1738     if ((chk & QCOW2_OL_REFCOUNT_BLOCK) && s->refcount_table) {
1739         for (i = 0; i < s->refcount_table_size; i++) {
1740             if ((s->refcount_table[i] & REFT_OFFSET_MASK) &&
1741                 overlaps_with(s->refcount_table[i] & REFT_OFFSET_MASK,
1742                 s->cluster_size)) {
1743                 return QCOW2_OL_REFCOUNT_BLOCK;
1744             }
1745         }
1746     }
1747 
1748     if ((chk & QCOW2_OL_INACTIVE_L2) && s->snapshots) {
1749         for (i = 0; i < s->nb_snapshots; i++) {
1750             uint64_t l1_ofs = s->snapshots[i].l1_table_offset;
1751             uint32_t l1_sz  = s->snapshots[i].l1_size;
1752             uint64_t l1_sz2 = l1_sz * sizeof(uint64_t);
1753             uint64_t *l1 = g_malloc(l1_sz2);
1754             int ret;
1755 
1756             ret = bdrv_pread(bs->file, l1_ofs, l1, l1_sz2);
1757             if (ret < 0) {
1758                 g_free(l1);
1759                 return ret;
1760             }
1761 
1762             for (j = 0; j < l1_sz; j++) {
1763                 uint64_t l2_ofs = be64_to_cpu(l1[j]) & L1E_OFFSET_MASK;
1764                 if (l2_ofs && overlaps_with(l2_ofs, s->cluster_size)) {
1765                     g_free(l1);
1766                     return QCOW2_OL_INACTIVE_L2;
1767                 }
1768             }
1769 
1770             g_free(l1);
1771         }
1772     }
1773 
1774     return 0;
1775 }
1776 
1777 static const char *metadata_ol_names[] = {
1778     [QCOW2_OL_MAIN_HEADER_BITNR]    = "qcow2_header",
1779     [QCOW2_OL_ACTIVE_L1_BITNR]      = "active L1 table",
1780     [QCOW2_OL_ACTIVE_L2_BITNR]      = "active L2 table",
1781     [QCOW2_OL_REFCOUNT_TABLE_BITNR] = "refcount table",
1782     [QCOW2_OL_REFCOUNT_BLOCK_BITNR] = "refcount block",
1783     [QCOW2_OL_SNAPSHOT_TABLE_BITNR] = "snapshot table",
1784     [QCOW2_OL_INACTIVE_L1_BITNR]    = "inactive L1 table",
1785     [QCOW2_OL_INACTIVE_L2_BITNR]    = "inactive L2 table",
1786 };
1787 
1788 /*
1789  * First performs a check for metadata overlaps (through
1790  * qcow2_check_metadata_overlap); if that fails with a negative value (error
1791  * while performing a check), that value is returned. If an impending overlap
1792  * is detected, the BDS will be made unusable, the qcow2 file marked corrupt
1793  * and -EIO returned.
1794  *
1795  * Returns 0 if there were neither overlaps nor errors while checking for
1796  * overlaps; or a negative value (-errno) on error.
1797  */
1798 int qcow2_pre_write_overlap_check(BlockDriverState *bs, int ign, int64_t offset,
1799                                   int64_t size)
1800 {
1801     int ret = qcow2_check_metadata_overlap(bs, ign, offset, size);
1802 
1803     if (ret < 0) {
1804         return ret;
1805     } else if (ret > 0) {
1806         int metadata_ol_bitnr = ffs(ret) - 1;
1807         char *message;
1808         QObject *data;
1809 
1810         assert(metadata_ol_bitnr < QCOW2_OL_MAX_BITNR);
1811 
1812         fprintf(stderr, "qcow2: Preventing invalid write on metadata (overlaps "
1813                 "with %s); image marked as corrupt.\n",
1814                 metadata_ol_names[metadata_ol_bitnr]);
1815         message = g_strdup_printf("Prevented %s overwrite",
1816                 metadata_ol_names[metadata_ol_bitnr]);
1817         data = qobject_from_jsonf("{ 'device': %s, 'msg': %s, 'offset': %"
1818                 PRId64 ", 'size': %" PRId64 " }", bs->device_name, message,
1819                 offset, size);
1820         monitor_protocol_event(QEVENT_BLOCK_IMAGE_CORRUPTED, data);
1821         g_free(message);
1822         qobject_decref(data);
1823 
1824         qcow2_mark_corrupt(bs);
1825         bs->drv = NULL; /* make BDS unusable */
1826         return -EIO;
1827     }
1828 
1829     return 0;
1830 }
1831