xref: /openbmc/qemu/block/qcow2-cluster.c (revision 99d46107)
1 /*
2  * Block driver for the QCOW version 2 format
3  *
4  * Copyright (c) 2004-2006 Fabrice Bellard
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to deal
8  * in the Software without restriction, including without limitation the rights
9  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10  * copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in
14  * all copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22  * THE SOFTWARE.
23  */
24 
25 #include "qemu/osdep.h"
26 #include <zlib.h>
27 
28 #include "qapi/error.h"
29 #include "qemu-common.h"
30 #include "block/block_int.h"
31 #include "qcow2.h"
32 #include "qemu/bswap.h"
33 #include "trace.h"
34 
35 int qcow2_shrink_l1_table(BlockDriverState *bs, uint64_t exact_size)
36 {
37     BDRVQcow2State *s = bs->opaque;
38     int new_l1_size, i, ret;
39 
40     if (exact_size >= s->l1_size) {
41         return 0;
42     }
43 
44     new_l1_size = exact_size;
45 
46 #ifdef DEBUG_ALLOC2
47     fprintf(stderr, "shrink l1_table from %d to %d\n", s->l1_size, new_l1_size);
48 #endif
49 
50     BLKDBG_EVENT(bs->file, BLKDBG_L1_SHRINK_WRITE_TABLE);
51     ret = bdrv_pwrite_zeroes(bs->file, s->l1_table_offset +
52                                        new_l1_size * sizeof(uint64_t),
53                              (s->l1_size - new_l1_size) * sizeof(uint64_t), 0);
54     if (ret < 0) {
55         goto fail;
56     }
57 
58     ret = bdrv_flush(bs->file->bs);
59     if (ret < 0) {
60         goto fail;
61     }
62 
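    /*
     * At this point the zeroed tail of the L1 table is stable on disk, so
     * the L2 clusters it used to reference can be freed (and possibly
     * reused) without leaving any L1 entry pointing at them.
     */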
63     BLKDBG_EVENT(bs->file, BLKDBG_L1_SHRINK_FREE_L2_CLUSTERS);
64     for (i = s->l1_size - 1; i > new_l1_size - 1; i--) {
65         if ((s->l1_table[i] & L1E_OFFSET_MASK) == 0) {
66             continue;
67         }
68         qcow2_free_clusters(bs, s->l1_table[i] & L1E_OFFSET_MASK,
69                             s->cluster_size, QCOW2_DISCARD_ALWAYS);
70         s->l1_table[i] = 0;
71     }
72     return 0;
73 
74 fail:
75     /*
76      * If the write to the l1_table failed, the image may contain a partially
77      * overwritten l1_table. In this case it would be better to clear the
78      * l1_table in memory to avoid possible image corruption.
79      */
80     memset(s->l1_table + new_l1_size, 0,
81            (s->l1_size - new_l1_size) * sizeof(uint64_t));
82     return ret;
83 }
84 
85 int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size,
86                         bool exact_size)
87 {
88     BDRVQcow2State *s = bs->opaque;
89     int new_l1_size2, ret, i;
90     uint64_t *new_l1_table;
91     int64_t old_l1_table_offset, old_l1_size;
92     int64_t new_l1_table_offset, new_l1_size;
93     uint8_t data[12];
94 
95     if (min_size <= s->l1_size)
96         return 0;
97 
98     /* Do a sanity check on min_size before trying to calculate new_l1_size
99      * (this prevents overflows during the while loop for the calculation of
100      * new_l1_size) */
101     if (min_size > INT_MAX / sizeof(uint64_t)) {
102         return -EFBIG;
103     }
104 
105     if (exact_size) {
106         new_l1_size = min_size;
107     } else {
108         /* Bump size up to reduce the number of times we have to grow */
109         new_l1_size = s->l1_size;
110         if (new_l1_size == 0) {
111             new_l1_size = 1;
112         }
113         while (min_size > new_l1_size) {
114             new_l1_size = DIV_ROUND_UP(new_l1_size * 3, 2);
115         }
116     }
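    /*
     * Example: growing by 3/2 from an empty table yields the sequence
     * 1, 2, 3, 5, 8, 12, 18, 27, ..., so a min_size of 20 entries ends up
     * with new_l1_size == 27.
     */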
117 
118     QEMU_BUILD_BUG_ON(QCOW_MAX_L1_SIZE > INT_MAX);
119     if (new_l1_size > QCOW_MAX_L1_SIZE / sizeof(uint64_t)) {
120         return -EFBIG;
121     }
122 
123 #ifdef DEBUG_ALLOC2
124     fprintf(stderr, "grow l1_table from %d to %" PRId64 "\n",
125             s->l1_size, new_l1_size);
126 #endif
127 
128     new_l1_size2 = sizeof(uint64_t) * new_l1_size;
129     new_l1_table = qemu_try_blockalign(bs->file->bs,
130                                        ROUND_UP(new_l1_size2, 512));
131     if (new_l1_table == NULL) {
132         return -ENOMEM;
133     }
134     memset(new_l1_table, 0, ROUND_UP(new_l1_size2, 512));
135 
136     if (s->l1_size) {
137         memcpy(new_l1_table, s->l1_table, s->l1_size * sizeof(uint64_t));
138     }
139 
140     /* write new table (align to cluster) */
141     BLKDBG_EVENT(bs->file, BLKDBG_L1_GROW_ALLOC_TABLE);
142     new_l1_table_offset = qcow2_alloc_clusters(bs, new_l1_size2);
143     if (new_l1_table_offset < 0) {
144         qemu_vfree(new_l1_table);
145         return new_l1_table_offset;
146     }
147 
148     ret = qcow2_cache_flush(bs, s->refcount_block_cache);
149     if (ret < 0) {
150         goto fail;
151     }
152 
153     /* the L1 position has not yet been updated, so these clusters must
154      * indeed be completely free */
155     ret = qcow2_pre_write_overlap_check(bs, 0, new_l1_table_offset,
156                                         new_l1_size2);
157     if (ret < 0) {
158         goto fail;
159     }
160 
161     BLKDBG_EVENT(bs->file, BLKDBG_L1_GROW_WRITE_TABLE);
162     for(i = 0; i < s->l1_size; i++)
163         new_l1_table[i] = cpu_to_be64(new_l1_table[i]);
164     ret = bdrv_pwrite_sync(bs->file, new_l1_table_offset,
165                            new_l1_table, new_l1_size2);
166     if (ret < 0)
167         goto fail;
168     for(i = 0; i < s->l1_size; i++)
169         new_l1_table[i] = be64_to_cpu(new_l1_table[i]);
170 
171     /* set new table */
172     BLKDBG_EVENT(bs->file, BLKDBG_L1_GROW_ACTIVATE_TABLE);
173     stl_be_p(data, new_l1_size);
174     stq_be_p(data + 4, new_l1_table_offset);
175     ret = bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, l1_size),
176                            data, sizeof(data));
177     if (ret < 0) {
178         goto fail;
179     }
180     qemu_vfree(s->l1_table);
181     old_l1_table_offset = s->l1_table_offset;
182     s->l1_table_offset = new_l1_table_offset;
183     s->l1_table = new_l1_table;
184     old_l1_size = s->l1_size;
185     s->l1_size = new_l1_size;
186     qcow2_free_clusters(bs, old_l1_table_offset, old_l1_size * sizeof(uint64_t),
187                         QCOW2_DISCARD_OTHER);
188     return 0;
189  fail:
190     qemu_vfree(new_l1_table);
191     qcow2_free_clusters(bs, new_l1_table_offset, new_l1_size2,
192                         QCOW2_DISCARD_OTHER);
193     return ret;
194 }
195 
196 /*
197  * l2_load
198  *
199  * @bs: The BlockDriverState
200  * @offset: A guest offset, used to calculate what slice of the L2
201  *          table to load.
202  * @l2_offset: Offset to the L2 table in the image file.
203  * @l2_slice: Location to store the pointer to the L2 slice.
204  *
205  * Loads an L2 slice into memory (L2 slices are the parts of L2 tables
206  * that are loaded by the qcow2 cache). If the slice is in the cache,
207  * the cache is used; otherwise the L2 slice is loaded from the image
208  * file.
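 *
 * For example (the slice size is illustrative; it depends on the cluster
 * size and the configured cache entry size): with slices of 1024 entries,
 * a guest offset whose L2 index is 3000 has an index of 952 within its
 * slice, so start_of_slice is 8 * (3000 - 952) = 16384 bytes, the offset
 * of the third slice within the L2 table.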
209  */
210 static int l2_load(BlockDriverState *bs, uint64_t offset,
211                    uint64_t l2_offset, uint64_t **l2_slice)
212 {
213     BDRVQcow2State *s = bs->opaque;
214     int start_of_slice = sizeof(uint64_t) *
215         (offset_to_l2_index(s, offset) - offset_to_l2_slice_index(s, offset));
216 
217     return qcow2_cache_get(bs, s->l2_table_cache, l2_offset + start_of_slice,
218                            (void **)l2_slice);
219 }
220 
221 /*
222  * Writes one sector of the L1 table to the disk (we can't update single entries,
223  * and we really don't want to do a read-modify-write with bdrv_pread)
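 *
 * For example, with L1_ENTRIES_PER_SECTOR == 64, updating l1_index 100
 * rounds down to l1_start_index 64, converts entries 64..127 (capped at
 * l1_size) to big endian and rewrites that whole 512-byte sector.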
224  */
225 #define L1_ENTRIES_PER_SECTOR (512 / 8)
226 int qcow2_write_l1_entry(BlockDriverState *bs, int l1_index)
227 {
228     BDRVQcow2State *s = bs->opaque;
229     uint64_t buf[L1_ENTRIES_PER_SECTOR] = { 0 };
230     int l1_start_index;
231     int i, ret;
232 
233     l1_start_index = l1_index & ~(L1_ENTRIES_PER_SECTOR - 1);
234     for (i = 0; i < L1_ENTRIES_PER_SECTOR && l1_start_index + i < s->l1_size;
235          i++)
236     {
237         buf[i] = cpu_to_be64(s->l1_table[l1_start_index + i]);
238     }
239 
240     ret = qcow2_pre_write_overlap_check(bs, QCOW2_OL_ACTIVE_L1,
241             s->l1_table_offset + 8 * l1_start_index, sizeof(buf));
242     if (ret < 0) {
243         return ret;
244     }
245 
246     BLKDBG_EVENT(bs->file, BLKDBG_L1_UPDATE);
247     ret = bdrv_pwrite_sync(bs->file,
248                            s->l1_table_offset + 8 * l1_start_index,
249                            buf, sizeof(buf));
250     if (ret < 0) {
251         return ret;
252     }
253 
254     return 0;
255 }
256 
257 /*
258  * l2_allocate
259  *
260  * Allocate a new L2 table in the file. If l1_index points to an already
261  * used entry in the L1 table (i.e. we are doing a copy on write for the L2
262  * table), copy the contents of the old L2 table into the newly allocated one.
263  * Otherwise the new table is initialized with zeros.
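 *
 * As a sketch of the sizes involved: an L2 table always occupies exactly one
 * cluster (s->l2_size * sizeof(uint64_t) == cluster_size), and it is written
 * out slice by slice through the L2 table cache before the L1 entry is
 * updated to point at it.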
264  *
265  */
266 
267 static int l2_allocate(BlockDriverState *bs, int l1_index)
268 {
269     BDRVQcow2State *s = bs->opaque;
270     uint64_t old_l2_offset;
271     uint64_t *l2_slice = NULL;
272     unsigned slice, slice_size2, n_slices;
273     int64_t l2_offset;
274     int ret;
275 
276     old_l2_offset = s->l1_table[l1_index];
277 
278     trace_qcow2_l2_allocate(bs, l1_index);
279 
280     /* allocate a new l2 entry */
281 
282     l2_offset = qcow2_alloc_clusters(bs, s->l2_size * sizeof(uint64_t));
283     if (l2_offset < 0) {
284         ret = l2_offset;
285         goto fail;
286     }
287 
288     /* The offset must fit in the offset field of the L1 table entry */
289     assert((l2_offset & L1E_OFFSET_MASK) == l2_offset);
290 
291     /* If we're allocating the table at offset 0 then something is wrong */
292     if (l2_offset == 0) {
293         qcow2_signal_corruption(bs, true, -1, -1, "Preventing invalid "
294                                 "allocation of L2 table at offset 0");
295         ret = -EIO;
296         goto fail;
297     }
298 
299     ret = qcow2_cache_flush(bs, s->refcount_block_cache);
300     if (ret < 0) {
301         goto fail;
302     }
303 
304     /* allocate a new entry in the l2 cache */
305 
306     slice_size2 = s->l2_slice_size * sizeof(uint64_t);
307     n_slices = s->cluster_size / slice_size2;
308 
309     trace_qcow2_l2_allocate_get_empty(bs, l1_index);
310     for (slice = 0; slice < n_slices; slice++) {
311         ret = qcow2_cache_get_empty(bs, s->l2_table_cache,
312                                     l2_offset + slice * slice_size2,
313                                     (void **) &l2_slice);
314         if (ret < 0) {
315             goto fail;
316         }
317 
318         if ((old_l2_offset & L1E_OFFSET_MASK) == 0) {
319             /* if there was no old l2 table, clear the new slice */
320             memset(l2_slice, 0, slice_size2);
321         } else {
322             uint64_t *old_slice;
323             uint64_t old_l2_slice_offset =
324                 (old_l2_offset & L1E_OFFSET_MASK) + slice * slice_size2;
325 
326             /* if there was an old l2 table, read a slice from the disk */
327             BLKDBG_EVENT(bs->file, BLKDBG_L2_ALLOC_COW_READ);
328             ret = qcow2_cache_get(bs, s->l2_table_cache, old_l2_slice_offset,
329                                   (void **) &old_slice);
330             if (ret < 0) {
331                 goto fail;
332             }
333 
334             memcpy(l2_slice, old_slice, slice_size2);
335 
336             qcow2_cache_put(s->l2_table_cache, (void **) &old_slice);
337         }
338 
339         /* write the l2 slice to the file */
340         BLKDBG_EVENT(bs->file, BLKDBG_L2_ALLOC_WRITE);
341 
342         trace_qcow2_l2_allocate_write_l2(bs, l1_index);
343         qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_slice);
344         qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
345     }
346 
347     ret = qcow2_cache_flush(bs, s->l2_table_cache);
348     if (ret < 0) {
349         goto fail;
350     }
351 
352     /* update the L1 entry */
353     trace_qcow2_l2_allocate_write_l1(bs, l1_index);
354     s->l1_table[l1_index] = l2_offset | QCOW_OFLAG_COPIED;
355     ret = qcow2_write_l1_entry(bs, l1_index);
356     if (ret < 0) {
357         goto fail;
358     }
359 
360     trace_qcow2_l2_allocate_done(bs, l1_index, 0);
361     return 0;
362 
363 fail:
364     trace_qcow2_l2_allocate_done(bs, l1_index, ret);
365     if (l2_slice != NULL) {
366         qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
367     }
368     s->l1_table[l1_index] = old_l2_offset;
369     if (l2_offset > 0) {
370         qcow2_free_clusters(bs, l2_offset, s->l2_size * sizeof(uint64_t),
371                             QCOW2_DISCARD_ALWAYS);
372     }
373     return ret;
374 }
375 
376 /*
377  * Checks how many clusters in a given L2 slice are contiguous in the image
378  * file. As soon as one of the flags in the bitmask stop_flags changes compared
379  * to the first cluster, the search is stopped and the cluster is not counted
380  * as contiguous. (This allows the caller, for example, to stop at the first
381  * compressed cluster, which may require different handling.)
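 *
 * For example, when called with QCOW_OFLAG_ZERO in stop_flags, a run of
 * normal clusters ends as soon as an entry with the zero flag set is
 * encountered, because that bit then differs from the first entry.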
382  */
383 static int count_contiguous_clusters(int nb_clusters, int cluster_size,
384         uint64_t *l2_slice, uint64_t stop_flags)
385 {
386     int i;
387     QCow2ClusterType first_cluster_type;
388     uint64_t mask = stop_flags | L2E_OFFSET_MASK | QCOW_OFLAG_COMPRESSED;
389     uint64_t first_entry = be64_to_cpu(l2_slice[0]);
390     uint64_t offset = first_entry & mask;
391 
392     if (!offset) {
393         return 0;
394     }
395 
396     /* must be allocated */
397     first_cluster_type = qcow2_get_cluster_type(first_entry);
398     assert(first_cluster_type == QCOW2_CLUSTER_NORMAL ||
399            first_cluster_type == QCOW2_CLUSTER_ZERO_ALLOC);
400 
401     for (i = 0; i < nb_clusters; i++) {
402         uint64_t l2_entry = be64_to_cpu(l2_slice[i]) & mask;
403         if (offset + (uint64_t) i * cluster_size != l2_entry) {
404             break;
405         }
406     }
407 
408     return i;
409 }
410 
411 /*
412  * Checks how many consecutive unallocated clusters in a given L2
413  * slice have the same cluster type.
414  */
415 static int count_contiguous_clusters_unallocated(int nb_clusters,
416                                                  uint64_t *l2_slice,
417                                                  QCow2ClusterType wanted_type)
418 {
419     int i;
420 
421     assert(wanted_type == QCOW2_CLUSTER_ZERO_PLAIN ||
422            wanted_type == QCOW2_CLUSTER_UNALLOCATED);
423     for (i = 0; i < nb_clusters; i++) {
424         uint64_t entry = be64_to_cpu(l2_slice[i]);
425         QCow2ClusterType type = qcow2_get_cluster_type(entry);
426 
427         if (type != wanted_type) {
428             break;
429         }
430     }
431 
432     return i;
433 }
434 
435 static int coroutine_fn do_perform_cow_read(BlockDriverState *bs,
436                                             uint64_t src_cluster_offset,
437                                             unsigned offset_in_cluster,
438                                             QEMUIOVector *qiov)
439 {
440     int ret;
441 
442     if (qiov->size == 0) {
443         return 0;
444     }
445 
446     BLKDBG_EVENT(bs->file, BLKDBG_COW_READ);
447 
448     if (!bs->drv) {
449         return -ENOMEDIUM;
450     }
451 
452     /* Call .bdrv_co_preadv() directly instead of using the public block-layer
453      * interface.  This avoids double I/O throttling and request tracking,
454      * which can lead to deadlock when block layer copy-on-read is enabled.
455      */
456     ret = bs->drv->bdrv_co_preadv(bs, src_cluster_offset + offset_in_cluster,
457                                   qiov->size, qiov, 0);
458     if (ret < 0) {
459         return ret;
460     }
461 
462     return 0;
463 }
464 
465 static bool coroutine_fn do_perform_cow_encrypt(BlockDriverState *bs,
466                                                 uint64_t src_cluster_offset,
467                                                 uint64_t cluster_offset,
468                                                 unsigned offset_in_cluster,
469                                                 uint8_t *buffer,
470                                                 unsigned bytes)
471 {
472     if (bytes && bs->encrypted) {
473         BDRVQcow2State *s = bs->opaque;
474         int64_t offset = (s->crypt_physical_offset ?
475                           (cluster_offset + offset_in_cluster) :
476                           (src_cluster_offset + offset_in_cluster));
477         assert((offset_in_cluster & ~BDRV_SECTOR_MASK) == 0);
478         assert((bytes & ~BDRV_SECTOR_MASK) == 0);
479         assert(s->crypto);
480         if (qcrypto_block_encrypt(s->crypto, offset, buffer, bytes, NULL) < 0) {
481             return false;
482         }
483     }
484     return true;
485 }
486 
487 static int coroutine_fn do_perform_cow_write(BlockDriverState *bs,
488                                              uint64_t cluster_offset,
489                                              unsigned offset_in_cluster,
490                                              QEMUIOVector *qiov)
491 {
492     int ret;
493 
494     if (qiov->size == 0) {
495         return 0;
496     }
497 
498     ret = qcow2_pre_write_overlap_check(bs, 0,
499             cluster_offset + offset_in_cluster, qiov->size);
500     if (ret < 0) {
501         return ret;
502     }
503 
504     BLKDBG_EVENT(bs->file, BLKDBG_COW_WRITE);
505     ret = bdrv_co_pwritev(bs->file, cluster_offset + offset_in_cluster,
506                           qiov->size, qiov, 0);
507     if (ret < 0) {
508         return ret;
509     }
510 
511     return 0;
512 }
513 
514 
515 /*
516  * get_cluster_offset
517  *
518  * For a given offset of the virtual disk, find the cluster type and offset in
519  * the qcow2 file. The offset is stored in *cluster_offset.
520  *
521  * On entry, *bytes is the maximum number of contiguous bytes starting at
522  * offset that we are interested in.
523  *
524  * On exit, *bytes is the number of bytes starting at offset that have the same
525  * cluster type and (if applicable) are stored contiguously in the image file.
526  * Compressed clusters are always returned one by one.
527  *
528  * Returns the cluster type (QCOW2_CLUSTER_*) on success, -errno in error
529  * cases.
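 *
 * For example, a request with *bytes == 1 MiB starting 1 KiB into a 64 KiB
 * cluster is limited to the run of clusters with the same type; if only four
 * contiguous normal clusters are found, *bytes comes back as
 * 4 * 64 KiB - 1 KiB = 255 KiB.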
530  */
531 int qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset,
532                              unsigned int *bytes, uint64_t *cluster_offset)
533 {
534     BDRVQcow2State *s = bs->opaque;
535     unsigned int l2_index;
536     uint64_t l1_index, l2_offset, *l2_slice;
537     int c;
538     unsigned int offset_in_cluster;
539     uint64_t bytes_available, bytes_needed, nb_clusters;
540     QCow2ClusterType type;
541     int ret;
542 
543     offset_in_cluster = offset_into_cluster(s, offset);
544     bytes_needed = (uint64_t) *bytes + offset_in_cluster;
545 
546     /* compute how many bytes there are between the start of the cluster
547      * containing offset and the end of the l2 slice that contains
548      * the entry pointing to it */
549     bytes_available =
550         ((uint64_t) (s->l2_slice_size - offset_to_l2_slice_index(s, offset)))
551         << s->cluster_bits;
552 
553     if (bytes_needed > bytes_available) {
554         bytes_needed = bytes_available;
555     }
556 
557     *cluster_offset = 0;
558 
559     /* seek to the l2 offset in the l1 table */
560 
561     l1_index = offset_to_l1_index(s, offset);
562     if (l1_index >= s->l1_size) {
563         type = QCOW2_CLUSTER_UNALLOCATED;
564         goto out;
565     }
566 
567     l2_offset = s->l1_table[l1_index] & L1E_OFFSET_MASK;
568     if (!l2_offset) {
569         type = QCOW2_CLUSTER_UNALLOCATED;
570         goto out;
571     }
572 
573     if (offset_into_cluster(s, l2_offset)) {
574         qcow2_signal_corruption(bs, true, -1, -1, "L2 table offset %#" PRIx64
575                                 " unaligned (L1 index: %#" PRIx64 ")",
576                                 l2_offset, l1_index);
577         return -EIO;
578     }
579 
580     /* load the l2 slice in memory */
581 
582     ret = l2_load(bs, offset, l2_offset, &l2_slice);
583     if (ret < 0) {
584         return ret;
585     }
586 
587     /* find the cluster offset for the given disk offset */
588 
589     l2_index = offset_to_l2_slice_index(s, offset);
590     *cluster_offset = be64_to_cpu(l2_slice[l2_index]);
591 
592     nb_clusters = size_to_clusters(s, bytes_needed);
593     /* bytes_needed <= *bytes + offset_in_cluster, both of which are unsigned
594      * integers; the minimum cluster size is 512, so this assertion is always
595      * true */
596     assert(nb_clusters <= INT_MAX);
597 
598     type = qcow2_get_cluster_type(*cluster_offset);
599     if (s->qcow_version < 3 && (type == QCOW2_CLUSTER_ZERO_PLAIN ||
600                                 type == QCOW2_CLUSTER_ZERO_ALLOC)) {
601         qcow2_signal_corruption(bs, true, -1, -1, "Zero cluster entry found"
602                                 " in pre-v3 image (L2 offset: %#" PRIx64
603                                 ", L2 index: %#x)", l2_offset, l2_index);
604         ret = -EIO;
605         goto fail;
606     }
607     switch (type) {
608     case QCOW2_CLUSTER_COMPRESSED:
609         /* Compressed clusters can only be processed one by one */
610         c = 1;
611         *cluster_offset &= L2E_COMPRESSED_OFFSET_SIZE_MASK;
612         break;
613     case QCOW2_CLUSTER_ZERO_PLAIN:
614     case QCOW2_CLUSTER_UNALLOCATED:
615         /* how many empty clusters ? */
616         c = count_contiguous_clusters_unallocated(nb_clusters,
617                                                   &l2_slice[l2_index], type);
618         *cluster_offset = 0;
619         break;
620     case QCOW2_CLUSTER_ZERO_ALLOC:
621     case QCOW2_CLUSTER_NORMAL:
622         /* how many allocated clusters ? */
623         c = count_contiguous_clusters(nb_clusters, s->cluster_size,
624                                       &l2_slice[l2_index], QCOW_OFLAG_ZERO);
625         *cluster_offset &= L2E_OFFSET_MASK;
626         if (offset_into_cluster(s, *cluster_offset)) {
627             qcow2_signal_corruption(bs, true, -1, -1,
628                                     "Cluster allocation offset %#"
629                                     PRIx64 " unaligned (L2 offset: %#" PRIx64
630                                     ", L2 index: %#x)", *cluster_offset,
631                                     l2_offset, l2_index);
632             ret = -EIO;
633             goto fail;
634         }
635         break;
636     default:
637         abort();
638     }
639 
640     qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
641 
642     bytes_available = (int64_t)c * s->cluster_size;
643 
644 out:
645     if (bytes_available > bytes_needed) {
646         bytes_available = bytes_needed;
647     }
648 
649     /* bytes_available <= bytes_needed <= *bytes + offset_in_cluster;
650      * subtracting offset_in_cluster will therefore definitely yield something
651      * not exceeding UINT_MAX */
652     assert(bytes_available - offset_in_cluster <= UINT_MAX);
653     *bytes = bytes_available - offset_in_cluster;
654 
655     return type;
656 
657 fail:
658     qcow2_cache_put(s->l2_table_cache, (void **)&l2_slice);
659     return ret;
660 }
661 
662 /*
663  * get_cluster_table
664  *
665  * for a given disk offset, load (and allocate if needed)
666  * the appropriate slice of its l2 table.
667  *
668  * the cluster index in the l2 slice is given to the caller.
669  *
670  * Returns 0 on success, -errno in failure case
671  */
672 static int get_cluster_table(BlockDriverState *bs, uint64_t offset,
673                              uint64_t **new_l2_slice,
674                              int *new_l2_index)
675 {
676     BDRVQcow2State *s = bs->opaque;
677     unsigned int l2_index;
678     uint64_t l1_index, l2_offset;
679     uint64_t *l2_slice = NULL;
680     int ret;
681 
682     /* seek to the l2 offset in the l1 table */
683 
684     l1_index = offset_to_l1_index(s, offset);
685     if (l1_index >= s->l1_size) {
686         ret = qcow2_grow_l1_table(bs, l1_index + 1, false);
687         if (ret < 0) {
688             return ret;
689         }
690     }
691 
692     assert(l1_index < s->l1_size);
693     l2_offset = s->l1_table[l1_index] & L1E_OFFSET_MASK;
694     if (offset_into_cluster(s, l2_offset)) {
695         qcow2_signal_corruption(bs, true, -1, -1, "L2 table offset %#" PRIx64
696                                 " unaligned (L1 index: %#" PRIx64 ")",
697                                 l2_offset, l1_index);
698         return -EIO;
699     }
700 
701     if (!(s->l1_table[l1_index] & QCOW_OFLAG_COPIED)) {
702         /* First allocate a new L2 table (and do COW if needed) */
703         ret = l2_allocate(bs, l1_index);
704         if (ret < 0) {
705             return ret;
706         }
707 
708         /* Then decrease the refcount of the old table */
709         if (l2_offset) {
710             qcow2_free_clusters(bs, l2_offset, s->l2_size * sizeof(uint64_t),
711                                 QCOW2_DISCARD_OTHER);
712         }
713 
714         /* Get the offset of the newly-allocated l2 table */
715         l2_offset = s->l1_table[l1_index] & L1E_OFFSET_MASK;
716         assert(offset_into_cluster(s, l2_offset) == 0);
717     }
718 
719     /* load the l2 slice in memory */
720     ret = l2_load(bs, offset, l2_offset, &l2_slice);
721     if (ret < 0) {
722         return ret;
723     }
724 
725     /* find the cluster offset for the given disk offset */
726 
727     l2_index = offset_to_l2_slice_index(s, offset);
728 
729     *new_l2_slice = l2_slice;
730     *new_l2_index = l2_index;
731 
732     return 0;
733 }
734 
735 /*
736  * alloc_compressed_cluster_offset
737  *
738  * For a given offset of the disk image, return cluster offset in
739  * qcow2 file.
740  *
741  * If the offset is not found, allocate a new compressed cluster.
742  *
743  * Return the cluster offset if successful,
744  * Return 0 otherwise.
745  *
746  */
747 
748 uint64_t qcow2_alloc_compressed_cluster_offset(BlockDriverState *bs,
749                                                uint64_t offset,
750                                                int compressed_size)
751 {
752     BDRVQcow2State *s = bs->opaque;
753     int l2_index, ret;
754     uint64_t *l2_slice;
755     int64_t cluster_offset;
756     int nb_csectors;
757 
758     ret = get_cluster_table(bs, offset, &l2_slice, &l2_index);
759     if (ret < 0) {
760         return 0;
761     }
762 
763     /* Compression can't overwrite anything. Fail if the cluster was already
764      * allocated. */
765     cluster_offset = be64_to_cpu(l2_slice[l2_index]);
766     if (cluster_offset & L2E_OFFSET_MASK) {
767         qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
768         return 0;
769     }
770 
771     cluster_offset = qcow2_alloc_bytes(bs, compressed_size);
772     if (cluster_offset < 0) {
773         qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
774         return 0;
775     }
776 
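    /*
     * Number of additional 512-byte sectors covered by the compressed data
     * beyond the sector that contains cluster_offset itself; e.g. 5000 bytes
     * starting at host offset 0x50200 start in sector 641 and end in sector
     * 650, giving nb_csectors = 9.
     */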
777     nb_csectors = ((cluster_offset + compressed_size - 1) >> 9) -
778                   (cluster_offset >> 9);
779 
780     cluster_offset |= QCOW_OFLAG_COMPRESSED |
781                       ((uint64_t)nb_csectors << s->csize_shift);
782 
783     /* update L2 table */
784 
785     /* compressed clusters never have the copied flag */
786 
787     BLKDBG_EVENT(bs->file, BLKDBG_L2_UPDATE_COMPRESSED);
788     qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_slice);
789     l2_slice[l2_index] = cpu_to_be64(cluster_offset);
790     qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
791 
792     return cluster_offset;
793 }
794 
795 static int perform_cow(BlockDriverState *bs, QCowL2Meta *m)
796 {
797     BDRVQcow2State *s = bs->opaque;
798     Qcow2COWRegion *start = &m->cow_start;
799     Qcow2COWRegion *end = &m->cow_end;
800     unsigned buffer_size;
801     unsigned data_bytes = end->offset - (start->offset + start->nb_bytes);
802     bool merge_reads;
803     uint8_t *start_buffer, *end_buffer;
804     QEMUIOVector qiov;
805     int ret;
806 
807     assert(start->nb_bytes <= UINT_MAX - end->nb_bytes);
808     assert(start->nb_bytes + end->nb_bytes <= UINT_MAX - data_bytes);
809     assert(start->offset + start->nb_bytes <= end->offset);
810     assert(!m->data_qiov || m->data_qiov->size == data_bytes);
811 
812     if (start->nb_bytes == 0 && end->nb_bytes == 0) {
813         return 0;
814     }
815 
816     /* If we have to read both the start and end COW regions and the
817      * middle region is not too large then perform just one read
818      * operation */
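    /*
     * For example, a 4 KiB head region, 8 KiB of guest data and a 4 KiB tail
     * region fit into a single 16 KiB read; if the middle part exceeded
     * 16384 bytes, the two COW regions would be read separately instead.
     */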
819     merge_reads = start->nb_bytes && end->nb_bytes && data_bytes <= 16384;
820     if (merge_reads) {
821         buffer_size = start->nb_bytes + data_bytes + end->nb_bytes;
822     } else {
823         /* If we have to do two reads, add some padding in the middle
824          * if necessary to make sure that the end region is optimally
825          * aligned. */
826         size_t align = bdrv_opt_mem_align(bs);
827         assert(align > 0 && align <= UINT_MAX);
828         assert(QEMU_ALIGN_UP(start->nb_bytes, align) <=
829                UINT_MAX - end->nb_bytes);
830         buffer_size = QEMU_ALIGN_UP(start->nb_bytes, align) + end->nb_bytes;
831     }
832 
833     /* Reserve a buffer large enough to store all the data that we're
834      * going to read */
835     start_buffer = qemu_try_blockalign(bs, buffer_size);
836     if (start_buffer == NULL) {
837         return -ENOMEM;
838     }
839     /* The part of the buffer where the end region is located */
840     end_buffer = start_buffer + buffer_size - end->nb_bytes;
841 
842     qemu_iovec_init(&qiov, 2 + (m->data_qiov ? m->data_qiov->niov : 0));
843 
844     qemu_co_mutex_unlock(&s->lock);
845     /* First we read the existing data from both COW regions. We
846      * either read the whole region in one go, or the start and end
847      * regions separately. */
848     if (merge_reads) {
849         qemu_iovec_add(&qiov, start_buffer, buffer_size);
850         ret = do_perform_cow_read(bs, m->offset, start->offset, &qiov);
851     } else {
852         qemu_iovec_add(&qiov, start_buffer, start->nb_bytes);
853         ret = do_perform_cow_read(bs, m->offset, start->offset, &qiov);
854         if (ret < 0) {
855             goto fail;
856         }
857 
858         qemu_iovec_reset(&qiov);
859         qemu_iovec_add(&qiov, end_buffer, end->nb_bytes);
860         ret = do_perform_cow_read(bs, m->offset, end->offset, &qiov);
861     }
862     if (ret < 0) {
863         goto fail;
864     }
865 
866     /* Encrypt the data if necessary before writing it */
867     if (bs->encrypted) {
868         if (!do_perform_cow_encrypt(bs, m->offset, m->alloc_offset,
869                                     start->offset, start_buffer,
870                                     start->nb_bytes) ||
871             !do_perform_cow_encrypt(bs, m->offset, m->alloc_offset,
872                                     end->offset, end_buffer, end->nb_bytes)) {
873             ret = -EIO;
874             goto fail;
875         }
876     }
877 
878     /* And now we can write everything. If we have the guest data we
879      * can write everything in one single operation */
880     if (m->data_qiov) {
881         qemu_iovec_reset(&qiov);
882         if (start->nb_bytes) {
883             qemu_iovec_add(&qiov, start_buffer, start->nb_bytes);
884         }
885         qemu_iovec_concat(&qiov, m->data_qiov, 0, data_bytes);
886         if (end->nb_bytes) {
887             qemu_iovec_add(&qiov, end_buffer, end->nb_bytes);
888         }
889         /* NOTE: we have a write_aio blkdebug event here followed by
890          * a cow_write one in do_perform_cow_write(), but there's only
891          * one single I/O operation */
892         BLKDBG_EVENT(bs->file, BLKDBG_WRITE_AIO);
893         ret = do_perform_cow_write(bs, m->alloc_offset, start->offset, &qiov);
894     } else {
895         /* If there's no guest data then write both COW regions separately */
896         qemu_iovec_reset(&qiov);
897         qemu_iovec_add(&qiov, start_buffer, start->nb_bytes);
898         ret = do_perform_cow_write(bs, m->alloc_offset, start->offset, &qiov);
899         if (ret < 0) {
900             goto fail;
901         }
902 
903         qemu_iovec_reset(&qiov);
904         qemu_iovec_add(&qiov, end_buffer, end->nb_bytes);
905         ret = do_perform_cow_write(bs, m->alloc_offset, end->offset, &qiov);
906     }
907 
908 fail:
909     qemu_co_mutex_lock(&s->lock);
910 
911     /*
912      * Before we update the L2 table to actually point to the new cluster, we
913      * need to be sure that the refcounts have been increased and COW was
914      * handled.
915      */
916     if (ret == 0) {
917         qcow2_cache_depends_on_flush(s->l2_table_cache);
918     }
919 
920     qemu_vfree(start_buffer);
921     qemu_iovec_destroy(&qiov);
922     return ret;
923 }
924 
925 int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m)
926 {
927     BDRVQcow2State *s = bs->opaque;
928     int i, j = 0, l2_index, ret;
929     uint64_t *old_cluster, *l2_slice;
930     uint64_t cluster_offset = m->alloc_offset;
931 
932     trace_qcow2_cluster_link_l2(qemu_coroutine_self(), m->nb_clusters);
933     assert(m->nb_clusters > 0);
934 
935     old_cluster = g_try_new(uint64_t, m->nb_clusters);
936     if (old_cluster == NULL) {
937         ret = -ENOMEM;
938         goto err;
939     }
940 
941     /* copy content of unmodified sectors */
942     ret = perform_cow(bs, m);
943     if (ret < 0) {
944         goto err;
945     }
946 
947     /* Update L2 table. */
948     if (s->use_lazy_refcounts) {
949         qcow2_mark_dirty(bs);
950     }
951     if (qcow2_need_accurate_refcounts(s)) {
952         qcow2_cache_set_dependency(bs, s->l2_table_cache,
953                                    s->refcount_block_cache);
954     }
955 
956     ret = get_cluster_table(bs, m->offset, &l2_slice, &l2_index);
957     if (ret < 0) {
958         goto err;
959     }
960     qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_slice);
961 
962     assert(l2_index + m->nb_clusters <= s->l2_slice_size);
963     for (i = 0; i < m->nb_clusters; i++) {
964         /* If two concurrent writes happen to the same unallocated cluster,
965          * each write allocates a separate cluster and writes its data
966          * concurrently. The first one to complete updates the L2 table with a
967          * pointer to its cluster; the second one has to do RMW (which is done
968          * above by perform_cow()), update the L2 table with its own cluster
969          * pointer and free the old cluster. This is what this loop does. */
970         if (l2_slice[l2_index + i] != 0) {
971             old_cluster[j++] = l2_slice[l2_index + i];
972         }
973 
974         l2_slice[l2_index + i] = cpu_to_be64((cluster_offset +
975                     (i << s->cluster_bits)) | QCOW_OFLAG_COPIED);
976      }
977 
978 
979     qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
980 
981     /*
982      * If this was a COW, we need to decrease the refcount of the old cluster.
983      *
984      * Don't discard clusters that reach a refcount of 0 (e.g. compressed
985      * clusters), the next write will reuse them anyway.
986      */
987     if (!m->keep_old_clusters && j != 0) {
988         for (i = 0; i < j; i++) {
989             qcow2_free_any_clusters(bs, be64_to_cpu(old_cluster[i]), 1,
990                                     QCOW2_DISCARD_NEVER);
991         }
992     }
993 
994     ret = 0;
995 err:
996     g_free(old_cluster);
997     return ret;
998  }
999 
1000 /**
1001  * Frees the allocated clusters because the request failed and they won't
1002  * actually be linked.
1003  */
1004 void qcow2_alloc_cluster_abort(BlockDriverState *bs, QCowL2Meta *m)
1005 {
1006     BDRVQcow2State *s = bs->opaque;
1007     qcow2_free_clusters(bs, m->alloc_offset, m->nb_clusters << s->cluster_bits,
1008                         QCOW2_DISCARD_NEVER);
1009 }
1010 
1011 /*
1012  * Returns the number of contiguous clusters that can be used for an allocating
1013  * write, but require COW to be performed (this includes not yet allocated
1014  * space, which must be copied from the backing file)
1015  */
1016 static int count_cow_clusters(BDRVQcow2State *s, int nb_clusters,
1017     uint64_t *l2_slice, int l2_index)
1018 {
1019     int i;
1020 
1021     for (i = 0; i < nb_clusters; i++) {
1022         uint64_t l2_entry = be64_to_cpu(l2_slice[l2_index + i]);
1023         QCow2ClusterType cluster_type = qcow2_get_cluster_type(l2_entry);
1024 
1025         switch(cluster_type) {
1026         case QCOW2_CLUSTER_NORMAL:
1027             if (l2_entry & QCOW_OFLAG_COPIED) {
1028                 goto out;
1029             }
1030             break;
1031         case QCOW2_CLUSTER_UNALLOCATED:
1032         case QCOW2_CLUSTER_COMPRESSED:
1033         case QCOW2_CLUSTER_ZERO_PLAIN:
1034         case QCOW2_CLUSTER_ZERO_ALLOC:
1035             break;
1036         default:
1037             abort();
1038         }
1039     }
1040 
1041 out:
1042     assert(i <= nb_clusters);
1043     return i;
1044 }
1045 
1046 /*
1047  * Check if there already is an AIO write request in flight which allocates
1048  * the same cluster. In this case we need to wait until the previous
1049  * request has completed and updated the L2 table accordingly.
1050  *
1051  * Returns:
1052  *   0       if there was no dependency. *cur_bytes indicates the number of
1053  *           bytes from guest_offset that can be read before the next
1054  *           dependency must be processed (or the request is complete)
1055  *
1056  *   -EAGAIN if we had to wait for another request, previously gathered
1057  *           information on cluster allocation may be invalid now. The caller
1058  *           must start over anyway, so consider *cur_bytes undefined.
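 *
 * For example, if another request is currently allocating guest clusters
 * 4..7 and this request covers clusters 2..5, it is shortened so that it
 * ends right before cluster 4; if it instead started inside 4..7, it
 * would either stop gathering here (when an l2meta already exists) or
 * wait for the other request and return -EAGAIN.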
1059  */
1060 static int handle_dependencies(BlockDriverState *bs, uint64_t guest_offset,
1061     uint64_t *cur_bytes, QCowL2Meta **m)
1062 {
1063     BDRVQcow2State *s = bs->opaque;
1064     QCowL2Meta *old_alloc;
1065     uint64_t bytes = *cur_bytes;
1066 
1067     QLIST_FOREACH(old_alloc, &s->cluster_allocs, next_in_flight) {
1068 
1069         uint64_t start = guest_offset;
1070         uint64_t end = start + bytes;
1071         uint64_t old_start = l2meta_cow_start(old_alloc);
1072         uint64_t old_end = l2meta_cow_end(old_alloc);
1073 
1074         if (end <= old_start || start >= old_end) {
1075             /* No intersection */
1076         } else {
1077             if (start < old_start) {
1078                 /* Stop at the start of a running allocation */
1079                 bytes = old_start - start;
1080             } else {
1081                 bytes = 0;
1082             }
1083 
1084             /* Stop if an l2meta already exists. After yielding, it wouldn't
1085              * be valid any more, so we'd have to clean up the old L2Metas
1086              * and deal with requests depending on them before starting to
1087              * gather new ones. Not worth the trouble. */
1088             if (bytes == 0 && *m) {
1089                 *cur_bytes = 0;
1090                 return 0;
1091             }
1092 
1093             if (bytes == 0) {
1094                 /* Wait for the dependency to complete. We need to recheck
1095                  * the free/allocated clusters when we continue. */
1096                 qemu_co_queue_wait(&old_alloc->dependent_requests, &s->lock);
1097                 return -EAGAIN;
1098             }
1099         }
1100     }
1101 
1102     /* Make sure that existing clusters and new allocations are only used up to
1103      * the next dependency if we shortened the request above */
1104     *cur_bytes = bytes;
1105 
1106     return 0;
1107 }
1108 
1109 /*
1110  * Checks how many already allocated clusters that don't require a copy on
1111  * write exist at the given guest_offset (up to *bytes). If
1112  * *host_offset is not zero, only physically contiguous clusters beginning at
1113  * this host offset are counted.
1114  *
1115  * Note that guest_offset may not be cluster aligned. In this case, the
1116  * returned *host_offset points to the exact byte referenced by guest_offset
1117  * and therefore isn't cluster aligned either.
1118  *
1119  * Returns:
1120  *   0:     if no allocated clusters are available at the given offset.
1121  *          *bytes is normally unchanged. It is set to 0 if the cluster
1122  *          is allocated and doesn't need COW, but doesn't have the right
1123  *          physical offset.
1124  *
1125  *   1:     if allocated clusters that don't require a COW are available at
1126  *          the requested offset. *bytes may have decreased and describes
1127  *          the length of the area that can be written to.
1128  *
1129  *  -errno: in error cases
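 *
 * For example, a request starting 512 bytes into a normal cluster whose L2
 * entry has QCOW_OFLAG_COPIED set returns 1, with *host_offset pointing 512
 * bytes into that cluster's host offset and *bytes trimmed to the contiguous
 * run of COPIED clusters found in the slice.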
1130  */
1131 static int handle_copied(BlockDriverState *bs, uint64_t guest_offset,
1132     uint64_t *host_offset, uint64_t *bytes, QCowL2Meta **m)
1133 {
1134     BDRVQcow2State *s = bs->opaque;
1135     int l2_index;
1136     uint64_t cluster_offset;
1137     uint64_t *l2_slice;
1138     uint64_t nb_clusters;
1139     unsigned int keep_clusters;
1140     int ret;
1141 
1142     trace_qcow2_handle_copied(qemu_coroutine_self(), guest_offset, *host_offset,
1143                               *bytes);
1144 
1145     assert(*host_offset == 0 ||    offset_into_cluster(s, guest_offset)
1146                                 == offset_into_cluster(s, *host_offset));
1147 
1148     /*
1149      * Calculate the number of clusters to look for. We stop at L2 slice
1150      * boundaries to keep things simple.
1151      */
1152     nb_clusters =
1153         size_to_clusters(s, offset_into_cluster(s, guest_offset) + *bytes);
1154 
1155     l2_index = offset_to_l2_slice_index(s, guest_offset);
1156     nb_clusters = MIN(nb_clusters, s->l2_slice_size - l2_index);
1157     assert(nb_clusters <= INT_MAX);
1158 
1159     /* Find L2 entry for the first involved cluster */
1160     ret = get_cluster_table(bs, guest_offset, &l2_slice, &l2_index);
1161     if (ret < 0) {
1162         return ret;
1163     }
1164 
1165     cluster_offset = be64_to_cpu(l2_slice[l2_index]);
1166 
1167     /* Check how many clusters are already allocated and don't need COW */
1168     if (qcow2_get_cluster_type(cluster_offset) == QCOW2_CLUSTER_NORMAL
1169         && (cluster_offset & QCOW_OFLAG_COPIED))
1170     {
1171         /* If a specific host_offset is required, check it */
1172         bool offset_matches =
1173             (cluster_offset & L2E_OFFSET_MASK) == *host_offset;
1174 
1175         if (offset_into_cluster(s, cluster_offset & L2E_OFFSET_MASK)) {
1176             qcow2_signal_corruption(bs, true, -1, -1, "Data cluster offset "
1177                                     "%#llx unaligned (guest offset: %#" PRIx64
1178                                     ")", cluster_offset & L2E_OFFSET_MASK,
1179                                     guest_offset);
1180             ret = -EIO;
1181             goto out;
1182         }
1183 
1184         if (*host_offset != 0 && !offset_matches) {
1185             *bytes = 0;
1186             ret = 0;
1187             goto out;
1188         }
1189 
1190         /* We keep all QCOW_OFLAG_COPIED clusters */
1191         keep_clusters =
1192             count_contiguous_clusters(nb_clusters, s->cluster_size,
1193                                       &l2_slice[l2_index],
1194                                       QCOW_OFLAG_COPIED | QCOW_OFLAG_ZERO);
1195         assert(keep_clusters <= nb_clusters);
1196 
1197         *bytes = MIN(*bytes,
1198                  keep_clusters * s->cluster_size
1199                  - offset_into_cluster(s, guest_offset));
1200 
1201         ret = 1;
1202     } else {
1203         ret = 0;
1204     }
1205 
1206     /* Cleanup */
1207 out:
1208     qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
1209 
1210     /* Only return a host offset if we actually made progress. Otherwise we
1211      * would make requirements for handle_alloc() that it can't fulfill */
1212     if (ret > 0) {
1213         *host_offset = (cluster_offset & L2E_OFFSET_MASK)
1214                      + offset_into_cluster(s, guest_offset);
1215     }
1216 
1217     return ret;
1218 }
1219 
1220 /*
1221  * Allocates new clusters for the given guest_offset.
1222  *
1223  * At most *nb_clusters are allocated, and on return *nb_clusters is updated to
1224  * contain the number of clusters that have been allocated and are contiguous
1225  * in the image file.
1226  *
1227  * If *host_offset is non-zero, it specifies the offset in the image file at
1228  * which the new clusters must start. *nb_clusters can be 0 on return in this
1229  * case if the cluster at host_offset is already in use. If *host_offset is
1230  * zero, the clusters can be allocated anywhere in the image file.
1231  *
1232  * *host_offset is updated to contain the offset into the image file at which
1233  * the first allocated cluster starts.
1234  *
1235  * Return 0 on success and -errno in error cases. -EAGAIN means that the
1236  * function has been waiting for another request and the allocation must be
1237  * restarted, but the whole request should not be failed.
1238  */
1239 static int do_alloc_cluster_offset(BlockDriverState *bs, uint64_t guest_offset,
1240                                    uint64_t *host_offset, uint64_t *nb_clusters)
1241 {
1242     BDRVQcow2State *s = bs->opaque;
1243 
1244     trace_qcow2_do_alloc_clusters_offset(qemu_coroutine_self(), guest_offset,
1245                                          *host_offset, *nb_clusters);
1246 
1247     /* Allocate new clusters */
1248     trace_qcow2_cluster_alloc_phys(qemu_coroutine_self());
1249     if (*host_offset == 0) {
1250         int64_t cluster_offset =
1251             qcow2_alloc_clusters(bs, *nb_clusters * s->cluster_size);
1252         if (cluster_offset < 0) {
1253             return cluster_offset;
1254         }
1255         *host_offset = cluster_offset;
1256         return 0;
1257     } else {
1258         int64_t ret = qcow2_alloc_clusters_at(bs, *host_offset, *nb_clusters);
1259         if (ret < 0) {
1260             return ret;
1261         }
1262         *nb_clusters = ret;
1263         return 0;
1264     }
1265 }
1266 
1267 /*
1268  * Allocates new clusters for an area that either is yet unallocated or needs a
1269  * copy on write. If *host_offset is non-zero, clusters are only allocated if
1270  * the new allocation can match the specified host offset.
1271  *
1272  * Note that guest_offset may not be cluster aligned. In this case, the
1273  * returned *host_offset points to the exact byte referenced by guest_offset
1274  * and therefore isn't cluster aligned either.
1275  *
1276  * Returns:
1277  *   0:     if no clusters could be allocated. *bytes is set to 0,
1278  *          *host_offset is left unchanged.
1279  *
1280  *   1:     if new clusters were allocated. *bytes may be decreased if the
1281  *          new allocation doesn't cover all of the requested area.
1282  *          *host_offset is updated to contain the host offset of the first
1283  *          newly allocated cluster.
1284  *
1285  *  -errno: in error cases
1286  */
1287 static int handle_alloc(BlockDriverState *bs, uint64_t guest_offset,
1288     uint64_t *host_offset, uint64_t *bytes, QCowL2Meta **m)
1289 {
1290     BDRVQcow2State *s = bs->opaque;
1291     int l2_index;
1292     uint64_t *l2_slice;
1293     uint64_t entry;
1294     uint64_t nb_clusters;
1295     int ret;
1296     bool keep_old_clusters = false;
1297 
1298     uint64_t alloc_cluster_offset = 0;
1299 
1300     trace_qcow2_handle_alloc(qemu_coroutine_self(), guest_offset, *host_offset,
1301                              *bytes);
1302     assert(*bytes > 0);
1303 
1304     /*
1305      * Calculate the number of clusters to look for. We stop at L2 slice
1306      * boundaries to keep things simple.
1307      */
1308     nb_clusters =
1309         size_to_clusters(s, offset_into_cluster(s, guest_offset) + *bytes);
1310 
1311     l2_index = offset_to_l2_slice_index(s, guest_offset);
1312     nb_clusters = MIN(nb_clusters, s->l2_slice_size - l2_index);
1313     assert(nb_clusters <= INT_MAX);
1314 
1315     /* Find L2 entry for the first involved cluster */
1316     ret = get_cluster_table(bs, guest_offset, &l2_slice, &l2_index);
1317     if (ret < 0) {
1318         return ret;
1319     }
1320 
1321     entry = be64_to_cpu(l2_slice[l2_index]);
1322 
1323     /* For the moment, overwrite compressed clusters one by one */
1324     if (entry & QCOW_OFLAG_COMPRESSED) {
1325         nb_clusters = 1;
1326     } else {
1327         nb_clusters = count_cow_clusters(s, nb_clusters, l2_slice, l2_index);
1328     }
1329 
1330     /* This function is only called when there were no non-COW clusters, so if
1331      * we can't find any unallocated or COW clusters either, something is
1332      * wrong with our code. */
1333     assert(nb_clusters > 0);
1334 
1335     if (qcow2_get_cluster_type(entry) == QCOW2_CLUSTER_ZERO_ALLOC &&
1336         (entry & QCOW_OFLAG_COPIED) &&
1337         (!*host_offset ||
1338          start_of_cluster(s, *host_offset) == (entry & L2E_OFFSET_MASK)))
1339     {
1340         int preallocated_nb_clusters;
1341 
1342         if (offset_into_cluster(s, entry & L2E_OFFSET_MASK)) {
1343             qcow2_signal_corruption(bs, true, -1, -1, "Preallocated zero "
1344                                     "cluster offset %#llx unaligned (guest "
1345                                     "offset: %#" PRIx64 ")",
1346                                     entry & L2E_OFFSET_MASK, guest_offset);
1347             ret = -EIO;
1348             goto fail;
1349         }
1350 
1351         /* Try to reuse preallocated zero clusters; contiguous normal clusters
1352          * would be fine, too, but count_cow_clusters() above has already
1353          * limited nb_clusters to a range of COW clusters */
1354         preallocated_nb_clusters =
1355             count_contiguous_clusters(nb_clusters, s->cluster_size,
1356                                       &l2_slice[l2_index], QCOW_OFLAG_COPIED);
1357         assert(preallocated_nb_clusters > 0);
1358 
1359         nb_clusters = preallocated_nb_clusters;
1360         alloc_cluster_offset = entry & L2E_OFFSET_MASK;
1361 
1362         /* We want to reuse these clusters, so qcow2_alloc_cluster_link_l2()
1363          * should not free them. */
1364         keep_old_clusters = true;
1365     }
1366 
1367     qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
1368 
1369     if (!alloc_cluster_offset) {
1370         /* Allocate, if necessary at a given offset in the image file */
1371         alloc_cluster_offset = start_of_cluster(s, *host_offset);
1372         ret = do_alloc_cluster_offset(bs, guest_offset, &alloc_cluster_offset,
1373                                       &nb_clusters);
1374         if (ret < 0) {
1375             goto fail;
1376         }
1377 
1378         /* Can't extend contiguous allocation */
1379         if (nb_clusters == 0) {
1380             *bytes = 0;
1381             return 0;
1382         }
1383 
1384         /* !*host_offset would overwrite the image header and is reserved for
1385          * "no host offset preferred". If 0 was a valid host offset, it'd
1386          * trigger the following overlap check; do that now to avoid having an
1387          * invalid value in *host_offset. */
1388         if (!alloc_cluster_offset) {
1389             ret = qcow2_pre_write_overlap_check(bs, 0, alloc_cluster_offset,
1390                                                 nb_clusters * s->cluster_size);
1391             assert(ret < 0);
1392             goto fail;
1393         }
1394     }
1395 
1396     /*
1397      * Save info needed for the metadata update.
1398      *
1399      * requested_bytes: Number of bytes from the start of the first
1400      * newly allocated cluster to the end of the (possibly previously
1401      * shortened) write request.
1402      *
1403      * avail_bytes: Number of bytes from the start of the first newly
1404      * allocated cluster to the end of the last newly allocated cluster.
1405      *
1406      * nb_bytes: The number of bytes from the start of the first
1407      * newly allocated cluster to the end of the area that the write
1408      * request actually writes to (excluding COW at the end)
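     *
     * Worked example with 64 KiB clusters: a 100 KiB write starting 1 KiB
     * into the first of three newly allocated clusters gives
     * requested_bytes = 101 KiB, avail_bytes = 192 KiB, nb_bytes = 101 KiB;
     * cow_start then covers the first 1 KiB and cow_end the remaining
     * 91 KiB up to the end of the allocation.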
1409      */
1410     uint64_t requested_bytes = *bytes + offset_into_cluster(s, guest_offset);
1411     int avail_bytes = MIN(INT_MAX, nb_clusters << s->cluster_bits);
1412     int nb_bytes = MIN(requested_bytes, avail_bytes);
1413     QCowL2Meta *old_m = *m;
1414 
1415     *m = g_malloc0(sizeof(**m));
1416 
1417     **m = (QCowL2Meta) {
1418         .next           = old_m,
1419 
1420         .alloc_offset   = alloc_cluster_offset,
1421         .offset         = start_of_cluster(s, guest_offset),
1422         .nb_clusters    = nb_clusters,
1423 
1424         .keep_old_clusters  = keep_old_clusters,
1425 
1426         .cow_start = {
1427             .offset     = 0,
1428             .nb_bytes   = offset_into_cluster(s, guest_offset),
1429         },
1430         .cow_end = {
1431             .offset     = nb_bytes,
1432             .nb_bytes   = avail_bytes - nb_bytes,
1433         },
1434     };
1435     qemu_co_queue_init(&(*m)->dependent_requests);
1436     QLIST_INSERT_HEAD(&s->cluster_allocs, *m, next_in_flight);
1437 
1438     *host_offset = alloc_cluster_offset + offset_into_cluster(s, guest_offset);
1439     *bytes = MIN(*bytes, nb_bytes - offset_into_cluster(s, guest_offset));
1440     assert(*bytes != 0);
1441 
1442     return 1;
1443 
1444 fail:
1445     if (*m && (*m)->nb_clusters > 0) {
1446         QLIST_REMOVE(*m, next_in_flight);
1447     }
1448     return ret;
1449 }
1450 
1451 /*
1452  * alloc_cluster_offset
1453  *
1454  * For a given offset on the virtual disk, find the cluster offset in qcow2
1455  * file. If the offset is not found, allocate a new cluster.
1456  *
1457  * If the cluster was already allocated, m->nb_clusters is set to 0 and
1458  * other fields in m are meaningless.
1459  *
1460  * If the cluster is newly allocated, m->nb_clusters is set to the number of
1461  * contiguous clusters that have been allocated. In this case, the other
1462  * fields of m are valid and contain information about the first allocated
1463  * cluster.
1464  *
1465  * If the request conflicts with another write request in flight, the coroutine
1466  * is queued and will be reentered when the dependency has completed.
1467  *
1468  * Return 0 on success and -errno in error cases
1469  */
1470 int qcow2_alloc_cluster_offset(BlockDriverState *bs, uint64_t offset,
1471                                unsigned int *bytes, uint64_t *host_offset,
1472                                QCowL2Meta **m)
1473 {
1474     BDRVQcow2State *s = bs->opaque;
1475     uint64_t start, remaining;
1476     uint64_t cluster_offset;
1477     uint64_t cur_bytes;
1478     int ret;
1479 
1480     trace_qcow2_alloc_clusters_offset(qemu_coroutine_self(), offset, *bytes);
1481 
1482 again:
1483     start = offset;
1484     remaining = *bytes;
1485     cluster_offset = 0;
1486     *host_offset = 0;
1487     cur_bytes = 0;
1488     *m = NULL;
1489 
1490     while (true) {
1491 
1492         if (!*host_offset) {
1493             *host_offset = start_of_cluster(s, cluster_offset);
1494         }
1495 
1496         assert(remaining >= cur_bytes);
1497 
1498         start           += cur_bytes;
1499         remaining       -= cur_bytes;
1500         cluster_offset  += cur_bytes;
1501 
1502         if (remaining == 0) {
1503             break;
1504         }
1505 
1506         cur_bytes = remaining;
1507 
1508         /*
1509          * Now start gathering as many contiguous clusters as possible:
1510          *
1511          * 1. Check for overlaps with in-flight allocations
1512          *
1513          *      a) Overlap not in the first cluster -> shorten this request and
1514          *         let the caller handle the rest in its next loop iteration.
1515          *
1516          *      b) Real overlaps of two requests. Yield and restart the search
1517          *         for contiguous clusters (the situation could have changed
1518          *         while we were sleeping)
1519          *
1520          *      c) TODO: Request starts in the same cluster as the in-flight
1521          *         allocation ends. Shorten the COW of the in-flight allocation,
1522          *         set cluster_offset to write to the same cluster and set up
1523          *         the right synchronisation between the in-flight request and
1524          *         the new one.
1525          */
1526         ret = handle_dependencies(bs, start, &cur_bytes, m);
1527         if (ret == -EAGAIN) {
1528             /* Currently handle_dependencies() doesn't yield if we already had
1529              * an allocation. If it did, we would have to clean up the L2Meta
1530              * structs before starting over. */
1531             assert(*m == NULL);
1532             goto again;
1533         } else if (ret < 0) {
1534             return ret;
1535         } else if (cur_bytes == 0) {
1536             break;
1537         } else {
1538             /* handle_dependencies() may have decreased cur_bytes (shortened
1539             /* handle_dependencies() may have decreased cur_bytes (shortening
1540              * the allocations performed below) so that the next dependency is
1541              * processed correctly during the next loop iteration. */
1542 
1543         /*
1544          * 2. Count contiguous COPIED clusters.
1545          */
1546         ret = handle_copied(bs, start, &cluster_offset, &cur_bytes, m);
1547         if (ret < 0) {
1548             return ret;
1549         } else if (ret) {
1550             continue;
1551         } else if (cur_bytes == 0) {
1552             break;
1553         }
1554 
1555         /*
1556          * 3. If the request still hasn't completed, allocate new clusters,
1557          *    considering any cluster_offset of steps 1c or 2.
1558          */
1559         ret = handle_alloc(bs, start, &cluster_offset, &cur_bytes, m);
1560         if (ret < 0) {
1561             return ret;
1562         } else if (ret) {
1563             continue;
1564         } else {
1565             assert(cur_bytes == 0);
1566             break;
1567         }
1568     }
1569 
1570     *bytes -= remaining;
1571     assert(*bytes > 0);
1572     assert(*host_offset != 0);
1573 
1574     return 0;
1575 }
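
/*
 * A simplified sketch of how a write path drives qcow2_alloc_cluster_offset();
 * the main user is qcow2_co_pwritev() in qcow2.c, which additionally handles
 * encryption, qiov bookkeeping and error cleanup, so the details below are
 * illustrative only:
 *
 *     while (bytes != 0) {
 *         unsigned int cur_bytes = MIN(bytes, INT_MAX);
 *         uint64_t cluster_offset;
 *         QCowL2Meta *l2meta = NULL;
 *
 *         ret = qcow2_alloc_cluster_offset(bs, offset, &cur_bytes,
 *                                          &cluster_offset, &l2meta);
 *         if (ret < 0) {
 *             goto fail;
 *         }
 *
 *         ... write cur_bytes of guest data to bs->file at cluster_offset ...
 *
 *         for (; l2meta != NULL; l2meta = l2meta->next) {
 *             ... perform the COW described by l2meta->cow_start/cow_end,
 *                 then call qcow2_alloc_cluster_link_l2(bs, l2meta) ...
 *         }
 *
 *         offset += cur_bytes;
 *         bytes  -= cur_bytes;
 *     }
 */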
1576 
1577 /*
1578  * This discards as many clusters of nb_clusters as possible at once (i.e.
1579  * all clusters in the same L2 slice) and returns the number of discarded
1580  * clusters.
1581  */
1582 static int discard_in_l2_slice(BlockDriverState *bs, uint64_t offset,
1583                                uint64_t nb_clusters,
1584                                enum qcow2_discard_type type, bool full_discard)
1585 {
1586     BDRVQcow2State *s = bs->opaque;
1587     uint64_t *l2_slice;
1588     int l2_index;
1589     int ret;
1590     int i;
1591 
1592     ret = get_cluster_table(bs, offset, &l2_slice, &l2_index);
1593     if (ret < 0) {
1594         return ret;
1595     }
1596 
1597     /* Limit nb_clusters to one L2 slice */
1598     nb_clusters = MIN(nb_clusters, s->l2_slice_size - l2_index);
1599     assert(nb_clusters <= INT_MAX);
1600 
1601     for (i = 0; i < nb_clusters; i++) {
1602         uint64_t old_l2_entry;
1603 
1604         old_l2_entry = be64_to_cpu(l2_slice[l2_index + i]);
1605 
1606         /*
1607          * If full_discard is false, make sure that a discarded area reads back
1608          * as zeroes for v3 images (we cannot do it for v2 without actually
1609          * writing a zero-filled buffer). We can skip the operation if the
1610          * cluster is already marked as zero, or if it's unallocated and we
1611          * don't have a backing file.
1612          *
1613          * TODO We might want to use bdrv_block_status(bs) here, but we're
1614          * holding s->lock, so that doesn't work today.
1615          *
1616          * If full_discard is true, the cluster should not read back as zeroes,
1617          * but rather fall through to the backing file.
1618          */
1619         switch (qcow2_get_cluster_type(old_l2_entry)) {
1620         case QCOW2_CLUSTER_UNALLOCATED:
1621             if (full_discard || !bs->backing) {
1622                 continue;
1623             }
1624             break;
1625 
1626         case QCOW2_CLUSTER_ZERO_PLAIN:
1627             if (!full_discard) {
1628                 continue;
1629             }
1630             break;
1631 
1632         case QCOW2_CLUSTER_ZERO_ALLOC:
1633         case QCOW2_CLUSTER_NORMAL:
1634         case QCOW2_CLUSTER_COMPRESSED:
1635             break;
1636 
1637         default:
1638             abort();
1639         }
1640 
1641         /* First remove L2 entries */
1642         qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_slice);
1643         if (!full_discard && s->qcow_version >= 3) {
1644             l2_slice[l2_index + i] = cpu_to_be64(QCOW_OFLAG_ZERO);
1645         } else {
1646             l2_slice[l2_index + i] = cpu_to_be64(0);
1647         }
1648 
1649         /* Then decrease the refcount */
1650         qcow2_free_any_clusters(bs, old_l2_entry, 1, type);
1651     }
1652 
1653     qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
1654 
1655     return nb_clusters;
1656 }
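
/*
 * Worked example of the per-slice limit above (assuming the L2 slice covers a
 * full L2 table, i.e. 8192 entries with 64 KiB clusters): if the first cluster
 * maps to l2_index 8000 and nb_clusters is 500, this call discards at most
 * MIN(500, 8192 - 8000) = 192 clusters and returns 192; qcow2_cluster_discard()
 * below then advances offset by 192 clusters and calls in again for the
 * remaining 308.
 */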
1657 
1658 int qcow2_cluster_discard(BlockDriverState *bs, uint64_t offset,
1659                           uint64_t bytes, enum qcow2_discard_type type,
1660                           bool full_discard)
1661 {
1662     BDRVQcow2State *s = bs->opaque;
1663     uint64_t end_offset = offset + bytes;
1664     uint64_t nb_clusters;
1665     int64_t cleared;
1666     int ret;
1667 
1668     /* Caller must pass aligned values, except at image end */
1669     assert(QEMU_IS_ALIGNED(offset, s->cluster_size));
1670     assert(QEMU_IS_ALIGNED(end_offset, s->cluster_size) ||
1671            end_offset == bs->total_sectors << BDRV_SECTOR_BITS);
1672 
1673     nb_clusters = size_to_clusters(s, bytes);
1674 
1675     s->cache_discards = true;
1676 
1677     /* Each L2 slice is handled by its own loop iteration */
1678     while (nb_clusters > 0) {
1679         cleared = discard_in_l2_slice(bs, offset, nb_clusters, type,
1680                                       full_discard);
1681         if (cleared < 0) {
1682             ret = cleared;
1683             goto fail;
1684         }
1685 
1686         nb_clusters -= cleared;
1687         offset += (cleared * s->cluster_size);
1688     }
1689 
1690     ret = 0;
1691 fail:
1692     s->cache_discards = false;
1693     qcow2_process_discards(bs, ret);
1694 
1695     return ret;
1696 }
1697 
1698 /*
1699  * This zeroes as many clusters of nb_clusters as possible at once (i.e.
1700  * all clusters in the same L2 slice) and returns the number of zeroed
1701  * clusters.
1702  */
1703 static int zero_in_l2_slice(BlockDriverState *bs, uint64_t offset,
1704                             uint64_t nb_clusters, int flags)
1705 {
1706     BDRVQcow2State *s = bs->opaque;
1707     uint64_t *l2_slice;
1708     int l2_index;
1709     int ret;
1710     int i;
1711     bool unmap = !!(flags & BDRV_REQ_MAY_UNMAP);
1712 
1713     ret = get_cluster_table(bs, offset, &l2_slice, &l2_index);
1714     if (ret < 0) {
1715         return ret;
1716     }
1717 
1718     /* Limit nb_clusters to one L2 slice */
1719     nb_clusters = MIN(nb_clusters, s->l2_slice_size - l2_index);
1720     assert(nb_clusters <= INT_MAX);
1721 
1722     for (i = 0; i < nb_clusters; i++) {
1723         uint64_t old_offset;
1724         QCow2ClusterType cluster_type;
1725 
1726         old_offset = be64_to_cpu(l2_slice[l2_index + i]);
1727 
1728         /*
1729          * Minimize L2 changes if the cluster already reads back as
1730          * zeroes with correct allocation.
1731          */
1732         cluster_type = qcow2_get_cluster_type(old_offset);
1733         if (cluster_type == QCOW2_CLUSTER_ZERO_PLAIN ||
1734             (cluster_type == QCOW2_CLUSTER_ZERO_ALLOC && !unmap)) {
1735             continue;
1736         }
1737 
1738         qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_slice);
1739         if (cluster_type == QCOW2_CLUSTER_COMPRESSED || unmap) {
1740             l2_slice[l2_index + i] = cpu_to_be64(QCOW_OFLAG_ZERO);
1741             qcow2_free_any_clusters(bs, old_offset, 1, QCOW2_DISCARD_REQUEST);
1742         } else {
1743             l2_slice[l2_index + i] |= cpu_to_be64(QCOW_OFLAG_ZERO);
1744         }
1745     }
1746 
1747     qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
1748 
1749     return nb_clusters;
1750 }
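
/*
 * Example of the two strategies above at the L2 entry level (bit values as
 * defined in qcow2.h: QCOW_OFLAG_COPIED is bit 63, QCOW_OFLAG_ZERO is bit 0):
 * a normal allocated entry 0x8000000000050000 (host cluster at 0x50000)
 * becomes
 *
 *     0x8000000000050001 (ZERO_ALLOC)  without BDRV_REQ_MAY_UNMAP: only the
 *                                      zero flag is ORed in; the host cluster
 *                                      and its preallocation are kept
 *     0x0000000000000001 (ZERO_PLAIN)  with BDRV_REQ_MAY_UNMAP (or for
 *                                      compressed clusters): the mapping is
 *                                      dropped and the old cluster is freed
 */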
1751 
1752 int qcow2_cluster_zeroize(BlockDriverState *bs, uint64_t offset,
1753                           uint64_t bytes, int flags)
1754 {
1755     BDRVQcow2State *s = bs->opaque;
1756     uint64_t end_offset = offset + bytes;
1757     uint64_t nb_clusters;
1758     int64_t cleared;
1759     int ret;
1760 
1761     /* Caller must pass aligned values, except at image end */
1762     assert(QEMU_IS_ALIGNED(offset, s->cluster_size));
1763     assert(QEMU_IS_ALIGNED(end_offset, s->cluster_size) ||
1764            end_offset == bs->total_sectors << BDRV_SECTOR_BITS);
1765 
1766     /* The zero flag is only supported by version 3 and newer */
1767     if (s->qcow_version < 3) {
1768         return -ENOTSUP;
1769     }
1770 
1771     /* Each L2 slice is handled by its own loop iteration */
1772     nb_clusters = size_to_clusters(s, bytes);
1773 
1774     s->cache_discards = true;
1775 
1776     while (nb_clusters > 0) {
1777         cleared = zero_in_l2_slice(bs, offset, nb_clusters, flags);
1778         if (cleared < 0) {
1779             ret = cleared;
1780             goto fail;
1781         }
1782 
1783         nb_clusters -= cleared;
1784         offset += (cleared * s->cluster_size);
1785     }
1786 
1787     ret = 0;
1788 fail:
1789     s->cache_discards = false;
1790     qcow2_process_discards(bs, ret);
1791 
1792     return ret;
1793 }
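
/*
 * The caller must pass cluster-aligned values (except at the end of the image,
 * see the assertions above); the qcow2 write-zeroes path in qcow2.c
 * (qcow2_co_pwrite_zeroes()) aligns or rejects unaligned requests before
 * calling in. A minimal sketch of such a call, assuming s->lock is taken as in
 * that caller:
 *
 *     qemu_co_mutex_lock(&s->lock);
 *     ret = qcow2_cluster_zeroize(bs, offset, bytes, flags);
 *     qemu_co_mutex_unlock(&s->lock);
 */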
1794 
1795 /*
1796  * Expands all zero clusters in a specific L1 table (or deallocates them, for
1797  * non-backed non-pre-allocated zero clusters).
1798  *
1799  * l1_entries and *visited_l1_entries are used to keep track of progress for
1800  * status_cb(). l1_entries contains the total number of L1 entries and
1801  * *visited_l1_entries counts all visited L1 entries.
1802  */
1803 static int expand_zero_clusters_in_l1(BlockDriverState *bs, uint64_t *l1_table,
1804                                       int l1_size, int64_t *visited_l1_entries,
1805                                       int64_t l1_entries,
1806                                       BlockDriverAmendStatusCB *status_cb,
1807                                       void *cb_opaque)
1808 {
1809     BDRVQcow2State *s = bs->opaque;
1810     bool is_active_l1 = (l1_table == s->l1_table);
1811     uint64_t *l2_slice = NULL;
1812     unsigned slice, slice_size2, n_slices;
1813     int ret;
1814     int i, j;
1815 
1816     slice_size2 = s->l2_slice_size * sizeof(uint64_t);
1817     n_slices = s->cluster_size / slice_size2;
1818 
1819     if (!is_active_l1) {
1820         /* Inactive L2 tables have to be read from disk into a temporary
1821          * buffer; active tables are accessed through the L2 cache instead. */
1822         l2_slice = qemu_try_blockalign(bs->file->bs, slice_size2);
1823         if (l2_slice == NULL) {
1824             return -ENOMEM;
1825         }
1826     }
1827 
1828     for (i = 0; i < l1_size; i++) {
1829         uint64_t l2_offset = l1_table[i] & L1E_OFFSET_MASK;
1830         uint64_t l2_refcount;
1831 
1832         if (!l2_offset) {
1833             /* unallocated */
1834             (*visited_l1_entries)++;
1835             if (status_cb) {
1836                 status_cb(bs, *visited_l1_entries, l1_entries, cb_opaque);
1837             }
1838             continue;
1839         }
1840 
1841         if (offset_into_cluster(s, l2_offset)) {
1842             qcow2_signal_corruption(bs, true, -1, -1, "L2 table offset %#"
1843                                     PRIx64 " unaligned (L1 index: %#x)",
1844                                     l2_offset, i);
1845             ret = -EIO;
1846             goto fail;
1847         }
1848 
1849         ret = qcow2_get_refcount(bs, l2_offset >> s->cluster_bits,
1850                                  &l2_refcount);
1851         if (ret < 0) {
1852             goto fail;
1853         }
1854 
1855         for (slice = 0; slice < n_slices; slice++) {
1856             uint64_t slice_offset = l2_offset + slice * slice_size2;
1857             bool l2_dirty = false;
1858             if (is_active_l1) {
1859                 /* get active L2 tables from cache */
1860                 ret = qcow2_cache_get(bs, s->l2_table_cache, slice_offset,
1861                                       (void **)&l2_slice);
1862             } else {
1863                 /* load inactive L2 tables from disk */
1864                 ret = bdrv_pread(bs->file, slice_offset, l2_slice, slice_size2);
1865             }
1866             if (ret < 0) {
1867                 goto fail;
1868             }
1869 
1870             for (j = 0; j < s->l2_slice_size; j++) {
1871                 uint64_t l2_entry = be64_to_cpu(l2_slice[j]);
1872                 int64_t offset = l2_entry & L2E_OFFSET_MASK;
1873                 QCow2ClusterType cluster_type =
1874                     qcow2_get_cluster_type(l2_entry);
1875 
1876                 if (cluster_type != QCOW2_CLUSTER_ZERO_PLAIN &&
1877                     cluster_type != QCOW2_CLUSTER_ZERO_ALLOC) {
1878                     continue;
1879                 }
1880 
1881                 if (cluster_type == QCOW2_CLUSTER_ZERO_PLAIN) {
1882                     if (!bs->backing) {
1883                         /* not backed; therefore we can simply deallocate the
1884                          * cluster */
1885                         l2_slice[j] = 0;
1886                         l2_dirty = true;
1887                         continue;
1888                     }
1889 
1890                     offset = qcow2_alloc_clusters(bs, s->cluster_size);
1891                     if (offset < 0) {
1892                         ret = offset;
1893                         goto fail;
1894                     }
1895 
1896                     if (l2_refcount > 1) {
1897                         /* For shared L2 tables, set the refcount accordingly
1898                          * (it is already 1 and needs to be l2_refcount) */
1899                         ret = qcow2_update_cluster_refcount(
1900                             bs, offset >> s->cluster_bits,
1901                             refcount_diff(1, l2_refcount), false,
1902                             QCOW2_DISCARD_OTHER);
1903                         if (ret < 0) {
1904                             qcow2_free_clusters(bs, offset, s->cluster_size,
1905                                                 QCOW2_DISCARD_OTHER);
1906                             goto fail;
1907                         }
1908                     }
1909                 }
1910 
1911                 if (offset_into_cluster(s, offset)) {
1912                     int l2_index = slice * s->l2_slice_size + j;
1913                     qcow2_signal_corruption(
1914                         bs, true, -1, -1,
1915                         "Cluster allocation offset "
1916                         "%#" PRIx64 " unaligned (L2 offset: %#"
1917                         PRIx64 ", L2 index: %#x)", offset,
1918                         l2_offset, l2_index);
1919                     if (cluster_type == QCOW2_CLUSTER_ZERO_PLAIN) {
1920                         qcow2_free_clusters(bs, offset, s->cluster_size,
1921                                             QCOW2_DISCARD_ALWAYS);
1922                     }
1923                     ret = -EIO;
1924                     goto fail;
1925                 }
1926 
1927                 ret = qcow2_pre_write_overlap_check(bs, 0, offset,
1928                                                     s->cluster_size);
1929                 if (ret < 0) {
1930                     if (cluster_type == QCOW2_CLUSTER_ZERO_PLAIN) {
1931                         qcow2_free_clusters(bs, offset, s->cluster_size,
1932                                             QCOW2_DISCARD_ALWAYS);
1933                     }
1934                     goto fail;
1935                 }
1936 
1937                 ret = bdrv_pwrite_zeroes(bs->file, offset, s->cluster_size, 0);
1938                 if (ret < 0) {
1939                     if (cluster_type == QCOW2_CLUSTER_ZERO_PLAIN) {
1940                         qcow2_free_clusters(bs, offset, s->cluster_size,
1941                                             QCOW2_DISCARD_ALWAYS);
1942                     }
1943                     goto fail;
1944                 }
1945 
1946                 if (l2_refcount == 1) {
1947                     l2_slice[j] = cpu_to_be64(offset | QCOW_OFLAG_COPIED);
1948                 } else {
1949                     l2_slice[j] = cpu_to_be64(offset);
1950                 }
1951                 l2_dirty = true;
1952             }
1953 
1954             if (is_active_l1) {
1955                 if (l2_dirty) {
1956                     qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_slice);
1957                     qcow2_cache_depends_on_flush(s->l2_table_cache);
1958                 }
1959                 qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
1960             } else {
1961                 if (l2_dirty) {
1962                     ret = qcow2_pre_write_overlap_check(
1963                         bs, QCOW2_OL_INACTIVE_L2 | QCOW2_OL_ACTIVE_L2,
1964                         slice_offset, slice_size2);
1965                     if (ret < 0) {
1966                         goto fail;
1967                     }
1968 
1969                     ret = bdrv_pwrite(bs->file, slice_offset,
1970                                       l2_slice, slice_size2);
1971                     if (ret < 0) {
1972                         goto fail;
1973                     }
1974                 }
1975             }
1976         }
1977 
1978         (*visited_l1_entries)++;
1979         if (status_cb) {
1980             status_cb(bs, *visited_l1_entries, l1_entries, cb_opaque);
1981         }
1982     }
1983 
1984     ret = 0;
1985 
1986 fail:
1987     if (l2_slice) {
1988         if (!is_active_l1) {
1989             qemu_vfree(l2_slice);
1990         } else {
1991             qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
1992         }
1993     }
1994     return ret;
1995 }
1996 
1997 /*
1998  * For backed images, expands all zero clusters on the image. For non-backed
1999  * images, deallocates all non-pre-allocated zero clusters (and claims the
2000  * allocation for pre-allocated ones). This is important for downgrading to a
2001  * qcow2 version which doesn't yet support metadata zero clusters.
2002  */
2003 int qcow2_expand_zero_clusters(BlockDriverState *bs,
2004                                BlockDriverAmendStatusCB *status_cb,
2005                                void *cb_opaque)
2006 {
2007     BDRVQcow2State *s = bs->opaque;
2008     uint64_t *l1_table = NULL;
2009     int64_t l1_entries = 0, visited_l1_entries = 0;
2010     int ret;
2011     int i, j;
2012 
2013     if (status_cb) {
2014         l1_entries = s->l1_size;
2015         for (i = 0; i < s->nb_snapshots; i++) {
2016             l1_entries += s->snapshots[i].l1_size;
2017         }
2018     }
2019 
2020     ret = expand_zero_clusters_in_l1(bs, s->l1_table, s->l1_size,
2021                                      &visited_l1_entries, l1_entries,
2022                                      status_cb, cb_opaque);
2023     if (ret < 0) {
2024         goto fail;
2025     }
2026 
2027     /* Inactive L1 tables may point to active L2 tables - therefore it is
2028      * necessary to flush the L2 table cache before trying to access the L2
2029      * tables pointed to by inactive L1 entries (else we might try to expand
2030      * zero clusters that have already been expanded); furthermore, it is also
2031      * necessary to empty the L2 table cache, since it may contain tables which
2032      * are now going to be modified directly on disk, bypassing the cache.
2033      * qcow2_cache_empty() does both for us. */
2034     ret = qcow2_cache_empty(bs, s->l2_table_cache);
2035     if (ret < 0) {
2036         goto fail;
2037     }
2038 
2039     for (i = 0; i < s->nb_snapshots; i++) {
2040         int l1_size2;
2041         uint64_t *new_l1_table;
2042         Error *local_err = NULL;
2043 
2044         ret = qcow2_validate_table(bs, s->snapshots[i].l1_table_offset,
2045                                    s->snapshots[i].l1_size, sizeof(uint64_t),
2046                                    QCOW_MAX_L1_SIZE, "Snapshot L1 table",
2047                                    &local_err);
2048         if (ret < 0) {
2049             error_report_err(local_err);
2050             goto fail;
2051         }
2052 
2053         l1_size2 = s->snapshots[i].l1_size * sizeof(uint64_t);
2054         new_l1_table = g_try_realloc(l1_table, l1_size2);
2055 
2056         if (!new_l1_table) {
2057             ret = -ENOMEM;
2058             goto fail;
2059         }
2060 
2061         l1_table = new_l1_table;
2062 
2063         ret = bdrv_pread(bs->file, s->snapshots[i].l1_table_offset,
2064                          l1_table, l1_size2);
2065         if (ret < 0) {
2066             goto fail;
2067         }
2068 
2069         for (j = 0; j < s->snapshots[i].l1_size; j++) {
2070             be64_to_cpus(&l1_table[j]);
2071         }
2072 
2073         ret = expand_zero_clusters_in_l1(bs, l1_table, s->snapshots[i].l1_size,
2074                                          &visited_l1_entries, l1_entries,
2075                                          status_cb, cb_opaque);
2076         if (ret < 0) {
2077             goto fail;
2078         }
2079     }
2080 
2081     ret = 0;
2082 
2083 fail:
2084     g_free(l1_table);
2085     return ret;
2086 }
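
/*
 * This is used when amending an image to an older qcow2 version: zero clusters
 * only exist in v3 (compat=1.1), so the downgrade code in qcow2.c
 * (qcow2_downgrade()) calls qcow2_expand_zero_clusters() before rewriting the
 * header, e.g. for
 *
 *     qemu-img amend -o compat=0.10 test.qcow2
 */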
2087