xref: /openbmc/qemu/block/qcow2-cluster.c (revision 878096ee)
1 /*
2  * Block driver for the QCOW version 2 format
3  *
4  * Copyright (c) 2004-2006 Fabrice Bellard
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to deal
8  * in the Software without restriction, including without limitation the rights
9  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10  * copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in
14  * all copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22  * THE SOFTWARE.
23  */
24 
25 #include <zlib.h>
26 
27 #include "qemu-common.h"
28 #include "block/block_int.h"
29 #include "block/qcow2.h"
30 #include "trace.h"
31 
32 int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size,
33                         bool exact_size)
34 {
35     BDRVQcowState *s = bs->opaque;
36     int new_l1_size2, ret, i;
37     uint64_t *new_l1_table;
38     int64_t new_l1_table_offset, new_l1_size;
39     uint8_t data[12];
40 
41     if (min_size <= s->l1_size)
42         return 0;
43 
44     if (exact_size) {
45         new_l1_size = min_size;
46     } else {
47         /* Bump size up to reduce the number of times we have to grow */
48         new_l1_size = s->l1_size;
49         if (new_l1_size == 0) {
50             new_l1_size = 1;
51         }
52         while (min_size > new_l1_size) {
53             new_l1_size = (new_l1_size * 3 + 1) / 2;
54         }
55     }
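    /*
     * For illustration, with the bump formula above an L1 table growing from
     * size 1 passes through 2, 3, 5, 8, 12, 18, 27, ..., i.e. roughly a factor
     * of 1.5 per step, so repeated small size increases only cause a
     * logarithmic number of actual table relocations.
     */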
56 
57     if (new_l1_size > INT_MAX) {
58         return -EFBIG;
59     }
60 
61 #ifdef DEBUG_ALLOC2
62     fprintf(stderr, "grow l1_table from %d to %" PRId64 "\n",
63             s->l1_size, new_l1_size);
64 #endif
65 
66     new_l1_size2 = sizeof(uint64_t) * new_l1_size;
67     new_l1_table = g_malloc0(align_offset(new_l1_size2, 512));
68     memcpy(new_l1_table, s->l1_table, s->l1_size * sizeof(uint64_t));
69 
70     /* write new table (align to cluster) */
71     BLKDBG_EVENT(bs->file, BLKDBG_L1_GROW_ALLOC_TABLE);
72     new_l1_table_offset = qcow2_alloc_clusters(bs, new_l1_size2);
73     if (new_l1_table_offset < 0) {
74         g_free(new_l1_table);
75         return new_l1_table_offset;
76     }
77 
78     ret = qcow2_cache_flush(bs, s->refcount_block_cache);
79     if (ret < 0) {
80         goto fail;
81     }
82 
83     BLKDBG_EVENT(bs->file, BLKDBG_L1_GROW_WRITE_TABLE);
84     for (i = 0; i < s->l1_size; i++)
85         new_l1_table[i] = cpu_to_be64(new_l1_table[i]);
86     ret = bdrv_pwrite_sync(bs->file, new_l1_table_offset, new_l1_table, new_l1_size2);
87     if (ret < 0)
88         goto fail;
89     for (i = 0; i < s->l1_size; i++)
90         new_l1_table[i] = be64_to_cpu(new_l1_table[i]);
91 
92     /* set new table */
93     BLKDBG_EVENT(bs->file, BLKDBG_L1_GROW_ACTIVATE_TABLE);
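    /*
     * The 12-byte buffer below covers the two adjacent header fields l1_size
     * and l1_table_offset: bytes 0-3 carry the new L1 size (big-endian 32 bit)
     * and bytes 4-11 the new L1 table offset (big-endian 64 bit, unaligned,
     * hence the *wu helper), so both fields are updated with a single write
     * request.
     */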
94     cpu_to_be32w((uint32_t*)data, new_l1_size);
95     cpu_to_be64wu((uint64_t*)(data + 4), new_l1_table_offset);
96     ret = bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, l1_size), data, sizeof(data));
97     if (ret < 0) {
98         goto fail;
99     }
100     g_free(s->l1_table);
101     qcow2_free_clusters(bs, s->l1_table_offset, s->l1_size * sizeof(uint64_t),
102                         QCOW2_DISCARD_OTHER);
103     s->l1_table_offset = new_l1_table_offset;
104     s->l1_table = new_l1_table;
105     s->l1_size = new_l1_size;
106     return 0;
107  fail:
108     g_free(new_l1_table);
109     qcow2_free_clusters(bs, new_l1_table_offset, new_l1_size2,
110                         QCOW2_DISCARD_OTHER);
111     return ret;
112 }
113 
114 /*
115  * l2_load
116  *
117  * Loads an L2 table into memory. If the table is in the cache, the cache
118  * is used; otherwise the L2 table is loaded from the image file.
119  *
120  * Returns 0 on success and stores a pointer to the L2 table in *l2_table;
121  * returns a negative errno value if reading from the image file failed.
122  */
123 
124 static int l2_load(BlockDriverState *bs, uint64_t l2_offset,
125     uint64_t **l2_table)
126 {
127     BDRVQcowState *s = bs->opaque;
128     int ret;
129 
130     ret = qcow2_cache_get(bs, s->l2_table_cache, l2_offset, (void**) l2_table);
131 
132     return ret;
133 }
134 
135 /*
136  * Writes one sector of the L1 table to the disk (can't update single entries
137  * and we really don't want bdrv_pwrite to perform a read-modify-write)
138  */
139 #define L1_ENTRIES_PER_SECTOR (512 / 8)
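/*
 * Example of the rounding done below (illustrative values only): for
 * l1_index == 70, l1_start_index becomes 64, so the 512-byte write covers L1
 * entries 64..127 at file offset s->l1_table_offset + 64 * 8.
 */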
140 static int write_l1_entry(BlockDriverState *bs, int l1_index)
141 {
142     BDRVQcowState *s = bs->opaque;
143     uint64_t buf[L1_ENTRIES_PER_SECTOR];
144     int l1_start_index;
145     int i, ret;
146 
147     l1_start_index = l1_index & ~(L1_ENTRIES_PER_SECTOR - 1);
148     for (i = 0; i < L1_ENTRIES_PER_SECTOR; i++) {
149         buf[i] = cpu_to_be64(s->l1_table[l1_start_index + i]);
150     }
151 
152     BLKDBG_EVENT(bs->file, BLKDBG_L1_UPDATE);
153     ret = bdrv_pwrite_sync(bs->file, s->l1_table_offset + 8 * l1_start_index,
154         buf, sizeof(buf));
155     if (ret < 0) {
156         return ret;
157     }
158 
159     return 0;
160 }
161 
162 /*
163  * l2_allocate
164  *
165  * Allocate a new L2 table in the file. If l1_index points to an already
166  * used entry in the L1 table (i.e. we are doing a copy on write for the L2
167  * table), copy the contents of the old L2 table into the newly allocated one.
168  * Otherwise the new table is initialized with zeros.
169  *
170  */
171 
172 static int l2_allocate(BlockDriverState *bs, int l1_index, uint64_t **table)
173 {
174     BDRVQcowState *s = bs->opaque;
175     uint64_t old_l2_offset;
176     uint64_t *l2_table;
177     int64_t l2_offset;
178     int ret;
179 
180     old_l2_offset = s->l1_table[l1_index];
181 
182     trace_qcow2_l2_allocate(bs, l1_index);
183 
184     /* allocate a new l2 entry */
185 
186     l2_offset = qcow2_alloc_clusters(bs, s->l2_size * sizeof(uint64_t));
187     if (l2_offset < 0) {
188         return l2_offset;
189     }
190 
191     ret = qcow2_cache_flush(bs, s->refcount_block_cache);
192     if (ret < 0) {
193         goto fail;
194     }
195 
196     /* allocate a new entry in the l2 cache */
197 
198     trace_qcow2_l2_allocate_get_empty(bs, l1_index);
199     ret = qcow2_cache_get_empty(bs, s->l2_table_cache, l2_offset, (void**) table);
200     if (ret < 0) {
201         return ret;
202     }
203 
204     l2_table = *table;
205 
206     if ((old_l2_offset & L1E_OFFSET_MASK) == 0) {
207         /* if there was no old l2 table, clear the new table */
208         memset(l2_table, 0, s->l2_size * sizeof(uint64_t));
209     } else {
210         uint64_t* old_table;
211 
212         /* if there was an old l2 table, read it from the disk */
213         BLKDBG_EVENT(bs->file, BLKDBG_L2_ALLOC_COW_READ);
214         ret = qcow2_cache_get(bs, s->l2_table_cache,
215             old_l2_offset & L1E_OFFSET_MASK,
216             (void**) &old_table);
217         if (ret < 0) {
218             goto fail;
219         }
220 
221         memcpy(l2_table, old_table, s->cluster_size);
222 
223         ret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &old_table);
224         if (ret < 0) {
225             goto fail;
226         }
227     }
228 
229     /* write the l2 table to the file */
230     BLKDBG_EVENT(bs->file, BLKDBG_L2_ALLOC_WRITE);
231 
232     trace_qcow2_l2_allocate_write_l2(bs, l1_index);
233     qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table);
234     ret = qcow2_cache_flush(bs, s->l2_table_cache);
235     if (ret < 0) {
236         goto fail;
237     }
238 
239     /* update the L1 entry */
240     trace_qcow2_l2_allocate_write_l1(bs, l1_index);
241     s->l1_table[l1_index] = l2_offset | QCOW_OFLAG_COPIED;
242     ret = write_l1_entry(bs, l1_index);
243     if (ret < 0) {
244         goto fail;
245     }
246 
247     *table = l2_table;
248     trace_qcow2_l2_allocate_done(bs, l1_index, 0);
249     return 0;
250 
251 fail:
252     trace_qcow2_l2_allocate_done(bs, l1_index, ret);
253     qcow2_cache_put(bs, s->l2_table_cache, (void**) table);
254     s->l1_table[l1_index] = old_l2_offset;
255     return ret;
256 }
257 
258 /*
259  * Checks how many clusters in a given L2 table are contiguous in the image
260  * file. As soon as one of the flags in the bitmask stop_flags changes compared
261  * to the first cluster, the search is stopped and the cluster is not counted
262  * as contiguous. (This allows the search, for example, to stop at the first
263  * compressed cluster, which may require different handling.)
264  */
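/*
 * Worked example (illustrative offsets, not taken from a real image): with
 * 64k clusters and L2 entries pointing to host offsets 0x50000, 0x60000,
 * 0x70000 and 0xa0000, a search starting at the first entry returns 3, since
 * the fourth entry breaks the contiguous run.
 */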
265 static int count_contiguous_clusters(uint64_t nb_clusters, int cluster_size,
266         uint64_t *l2_table, uint64_t start, uint64_t stop_flags)
267 {
268     int i;
269     uint64_t mask = stop_flags | L2E_OFFSET_MASK;
270     uint64_t offset = be64_to_cpu(l2_table[0]) & mask;
271 
272     if (!offset)
273         return 0;
274 
275     for (i = start; i < start + nb_clusters; i++) {
276         uint64_t l2_entry = be64_to_cpu(l2_table[i]) & mask;
277         if (offset + (uint64_t) i * cluster_size != l2_entry) {
278             break;
279         }
280     }
281 
282     return (i - start);
283 }
284 
285 static int count_contiguous_free_clusters(uint64_t nb_clusters, uint64_t *l2_table)
286 {
287     int i;
288 
289     for (i = 0; i < nb_clusters; i++) {
290         int type = qcow2_get_cluster_type(be64_to_cpu(l2_table[i]));
291 
292         if (type != QCOW2_CLUSTER_UNALLOCATED) {
293             break;
294         }
295     }
296 
297     return i;
298 }
299 
300 /* The crypt function is compatible with the Linux cryptoloop
301    algorithm for < 4 GB images. NOTE: out_buf == in_buf is
302    supported */
303 void qcow2_encrypt_sectors(BDRVQcowState *s, int64_t sector_num,
304                            uint8_t *out_buf, const uint8_t *in_buf,
305                            int nb_sectors, int enc,
306                            const AES_KEY *key)
307 {
308     union {
309         uint64_t ll[2];
310         uint8_t b[16];
311     } ivec;
312     int i;
313 
314     for (i = 0; i < nb_sectors; i++) {
315         ivec.ll[0] = cpu_to_le64(sector_num);
316         ivec.ll[1] = 0;
317         AES_cbc_encrypt(in_buf, out_buf, 512, key,
318                         ivec.b, enc);
319         sector_num++;
320         in_buf += 512;
321         out_buf += 512;
322     }
323 }
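/*
 * The IV built above is simply the little-endian sector number padded with
 * zeroes, e.g. sector 5 yields 05 00 00 00 00 00 00 00 | 00 .. 00. For
 * decryption the same routine can be called with enc == 0 and the
 * corresponding AES decrypt key, which is presumably how the qcow2 read path
 * in qcow2.c uses it.
 */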
324 
325 static int coroutine_fn copy_sectors(BlockDriverState *bs,
326                                      uint64_t start_sect,
327                                      uint64_t cluster_offset,
328                                      int n_start, int n_end)
329 {
330     BDRVQcowState *s = bs->opaque;
331     QEMUIOVector qiov;
332     struct iovec iov;
333     int n, ret;
334 
335     /*
336      * If this is the last cluster and it is only partially used, we must only
337      * copy until the end of the image, or bdrv_check_request will fail for the
338      * bdrv_read/write calls below.
339      */
340     if (start_sect + n_end > bs->total_sectors) {
341         n_end = bs->total_sectors - start_sect;
342     }
343 
344     n = n_end - n_start;
345     if (n <= 0) {
346         return 0;
347     }
348 
349     iov.iov_len = n * BDRV_SECTOR_SIZE;
350     iov.iov_base = qemu_blockalign(bs, iov.iov_len);
351 
352     qemu_iovec_init_external(&qiov, &iov, 1);
353 
354     BLKDBG_EVENT(bs->file, BLKDBG_COW_READ);
355 
356     /* Call .bdrv_co_readv() directly instead of using the public block-layer
357      * interface.  This avoids double I/O throttling and request tracking,
358      * which can lead to deadlock when block layer copy-on-read is enabled.
359      */
360     ret = bs->drv->bdrv_co_readv(bs, start_sect + n_start, n, &qiov);
361     if (ret < 0) {
362         goto out;
363     }
364 
365     if (s->crypt_method) {
366         qcow2_encrypt_sectors(s, start_sect + n_start,
367                         iov.iov_base, iov.iov_base, n, 1,
368                         &s->aes_encrypt_key);
369     }
370 
371     BLKDBG_EVENT(bs->file, BLKDBG_COW_WRITE);
372     ret = bdrv_co_writev(bs->file, (cluster_offset >> 9) + n_start, n, &qiov);
373     if (ret < 0) {
374         goto out;
375     }
376 
377     ret = 0;
378 out:
379     qemu_vfree(iov.iov_base);
380     return ret;
381 }
382 
383 
384 /*
385  * get_cluster_offset
386  *
387  * For a given offset of the disk image, find the cluster offset in
388  * qcow2 file. The offset is stored in *cluster_offset.
389  *
390  * on entry, *num is the number of contiguous sectors we'd like to
391  * access following offset.
392  *
393  * on exit, *num is the number of contiguous sectors we can read.
394  *
395  * Returns the cluster type (QCOW2_CLUSTER_*) on success, -errno in error
396  * cases.
397  */
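/*
 * A minimal caller sketch (assumed usage, mirroring what the read path in
 * qcow2.c does):
 *
 *     int nb = remaining_sectors;
 *     uint64_t host;
 *     ret = qcow2_get_cluster_offset(bs, sector_num << 9, &nb, &host);
 *     // ret is the cluster type; for QCOW2_CLUSTER_NORMAL the guest data
 *     // starts at host + index_in_cluster * 512 and spans nb sectors
 */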
398 int qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset,
399     int *num, uint64_t *cluster_offset)
400 {
401     BDRVQcowState *s = bs->opaque;
402     unsigned int l2_index;
403     uint64_t l1_index, l2_offset, *l2_table;
404     int l1_bits, c;
405     unsigned int index_in_cluster, nb_clusters;
406     uint64_t nb_available, nb_needed;
407     int ret;
408 
409     index_in_cluster = (offset >> 9) & (s->cluster_sectors - 1);
410     nb_needed = *num + index_in_cluster;
411 
412     l1_bits = s->l2_bits + s->cluster_bits;
413 
414     /* compute how many bytes there are between the offset and
415      * the end of the l1 entry
416      */
417 
418     nb_available = (1ULL << l1_bits) - (offset & ((1ULL << l1_bits) - 1));
419 
420     /* compute the number of available sectors */
421 
422     nb_available = (nb_available >> 9) + index_in_cluster;
423 
424     if (nb_needed > nb_available) {
425         nb_needed = nb_available;
426     }
427 
428     *cluster_offset = 0;
429 
430     /* seek the l2 offset in the l1 table */
431 
432     l1_index = offset >> l1_bits;
433     if (l1_index >= s->l1_size) {
434         ret = QCOW2_CLUSTER_UNALLOCATED;
435         goto out;
436     }
437 
438     l2_offset = s->l1_table[l1_index] & L1E_OFFSET_MASK;
439     if (!l2_offset) {
440         ret = QCOW2_CLUSTER_UNALLOCATED;
441         goto out;
442     }
443 
444     /* load the l2 table in memory */
445 
446     ret = l2_load(bs, l2_offset, &l2_table);
447     if (ret < 0) {
448         return ret;
449     }
450 
451     /* find the cluster offset for the given disk offset */
452 
453     l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1);
454     *cluster_offset = be64_to_cpu(l2_table[l2_index]);
455     nb_clusters = size_to_clusters(s, nb_needed << 9);
456 
457     ret = qcow2_get_cluster_type(*cluster_offset);
458     switch (ret) {
459     case QCOW2_CLUSTER_COMPRESSED:
460         /* Compressed clusters can only be processed one by one */
461         c = 1;
462         *cluster_offset &= L2E_COMPRESSED_OFFSET_SIZE_MASK;
463         break;
464     case QCOW2_CLUSTER_ZERO:
465         if (s->qcow_version < 3) {
466             return -EIO;
467         }
468         c = count_contiguous_clusters(nb_clusters, s->cluster_size,
469                 &l2_table[l2_index], 0,
470                 QCOW_OFLAG_COMPRESSED | QCOW_OFLAG_ZERO);
471         *cluster_offset = 0;
472         break;
473     case QCOW2_CLUSTER_UNALLOCATED:
474         /* how many empty clusters ? */
475         c = count_contiguous_free_clusters(nb_clusters, &l2_table[l2_index]);
476         *cluster_offset = 0;
477         break;
478     case QCOW2_CLUSTER_NORMAL:
479         /* how many allocated clusters ? */
480         c = count_contiguous_clusters(nb_clusters, s->cluster_size,
481                 &l2_table[l2_index], 0,
482                 QCOW_OFLAG_COMPRESSED | QCOW_OFLAG_ZERO);
483         *cluster_offset &= L2E_OFFSET_MASK;
484         break;
485     default:
486         abort();
487     }
488 
489     qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
490 
491     nb_available = (c * s->cluster_sectors);
492 
493 out:
494     if (nb_available > nb_needed)
495         nb_available = nb_needed;
496 
497     *num = nb_available - index_in_cluster;
498 
499     return ret;
500 }
501 
502 /*
503  * get_cluster_table
504  *
505  * For a given disk offset, load (and allocate if needed)
506  * the l2 table.
507  *
508  * The l2 table offset in the qcow2 file and the cluster index
509  * in the l2 table are returned to the caller.
510  *
511  * Returns 0 on success, -errno in failure case
512  */
513 static int get_cluster_table(BlockDriverState *bs, uint64_t offset,
514                              uint64_t **new_l2_table,
515                              int *new_l2_index)
516 {
517     BDRVQcowState *s = bs->opaque;
518     unsigned int l2_index;
519     uint64_t l1_index, l2_offset;
520     uint64_t *l2_table = NULL;
521     int ret;
522 
523     /* seek the l2 offset in the l1 table */
524 
525     l1_index = offset >> (s->l2_bits + s->cluster_bits);
526     if (l1_index >= s->l1_size) {
527         ret = qcow2_grow_l1_table(bs, l1_index + 1, false);
528         if (ret < 0) {
529             return ret;
530         }
531     }
532 
533     assert(l1_index < s->l1_size);
534     l2_offset = s->l1_table[l1_index] & L1E_OFFSET_MASK;
535 
536     /* seek the l2 table of the given l2 offset */
537 
538     if (s->l1_table[l1_index] & QCOW_OFLAG_COPIED) {
539         /* load the l2 table in memory */
540         ret = l2_load(bs, l2_offset, &l2_table);
541         if (ret < 0) {
542             return ret;
543         }
544     } else {
545         /* First allocate a new L2 table (and do COW if needed) */
546         ret = l2_allocate(bs, l1_index, &l2_table);
547         if (ret < 0) {
548             return ret;
549         }
550 
551         /* Then decrease the refcount of the old table */
552         if (l2_offset) {
553             qcow2_free_clusters(bs, l2_offset, s->l2_size * sizeof(uint64_t),
554                                 QCOW2_DISCARD_OTHER);
555         }
556     }
557 
558     /* find the cluster offset for the given disk offset */
559 
560     l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1);
561 
562     *new_l2_table = l2_table;
563     *new_l2_index = l2_index;
564 
565     return 0;
566 }
567 
568 /*
569  * alloc_compressed_cluster_offset
570  *
571  * For a given offset of the disk image, return cluster offset in
572  * For a given offset on the virtual disk, return the cluster offset in the
573  * qcow2 file.
574  * If the offset is not found, allocate a new compressed cluster.
575  *
576  * Return the cluster offset if successful,
577  * Return the cluster offset if successful;
578  * return 0 otherwise.
579  */
580 
581 uint64_t qcow2_alloc_compressed_cluster_offset(BlockDriverState *bs,
582                                                uint64_t offset,
583                                                int compressed_size)
584 {
585     BDRVQcowState *s = bs->opaque;
586     int l2_index, ret;
587     uint64_t *l2_table;
588     int64_t cluster_offset;
589     int nb_csectors;
590 
591     ret = get_cluster_table(bs, offset, &l2_table, &l2_index);
592     if (ret < 0) {
593         return 0;
594     }
595 
596     /* Compression can't overwrite anything. Fail if the cluster was already
597      * allocated. */
598     cluster_offset = be64_to_cpu(l2_table[l2_index]);
599     if (cluster_offset & L2E_OFFSET_MASK) {
600         qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
601         return 0;
602     }
603 
604     cluster_offset = qcow2_alloc_bytes(bs, compressed_size);
605     if (cluster_offset < 0) {
606         qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
607         return 0;
608     }
609 
610     nb_csectors = ((cluster_offset + compressed_size - 1) >> 9) -
611                   (cluster_offset >> 9);
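    /* nb_csectors counts how many additional 512-byte sector boundaries the
     * compressed data crosses beyond its first sector; the read side in
     * qcow2_decompress_cluster() adds one back when computing how many sectors
     * to load. */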
612 
613     cluster_offset |= QCOW_OFLAG_COMPRESSED |
614                       ((uint64_t)nb_csectors << s->csize_shift);
615 
616     /* update L2 table */
617 
618     /* compressed clusters never have the copied flag */
619 
620     BLKDBG_EVENT(bs->file, BLKDBG_L2_UPDATE_COMPRESSED);
621     qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table);
622     l2_table[l2_index] = cpu_to_be64(cluster_offset);
623     ret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
624     if (ret < 0) {
625         return 0;
626     }
627 
628     return cluster_offset;
629 }
630 
631 static int perform_cow(BlockDriverState *bs, QCowL2Meta *m, Qcow2COWRegion *r)
632 {
633     BDRVQcowState *s = bs->opaque;
634     int ret;
635 
636     if (r->nb_sectors == 0) {
637         return 0;
638     }
639 
640     qemu_co_mutex_unlock(&s->lock);
641     ret = copy_sectors(bs, m->offset / BDRV_SECTOR_SIZE, m->alloc_offset,
642                        r->offset / BDRV_SECTOR_SIZE,
643                        r->offset / BDRV_SECTOR_SIZE + r->nb_sectors);
644     qemu_co_mutex_lock(&s->lock);
645 
646     if (ret < 0) {
647         return ret;
648     }
649 
650     /*
651      * Before we update the L2 table to actually point to the new cluster, we
652      * need to be sure that the refcounts have been increased and COW was
653      * handled.
654      */
655     qcow2_cache_depends_on_flush(s->l2_table_cache);
656 
657     return 0;
658 }
659 
660 int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m)
661 {
662     BDRVQcowState *s = bs->opaque;
663     int i, j = 0, l2_index, ret;
664     uint64_t *old_cluster, *l2_table;
665     uint64_t cluster_offset = m->alloc_offset;
666 
667     trace_qcow2_cluster_link_l2(qemu_coroutine_self(), m->nb_clusters);
668     assert(m->nb_clusters > 0);
669 
670     old_cluster = g_malloc(m->nb_clusters * sizeof(uint64_t));
671 
672     /* copy content of unmodified sectors */
673     ret = perform_cow(bs, m, &m->cow_start);
674     if (ret < 0) {
675         goto err;
676     }
677 
678     ret = perform_cow(bs, m, &m->cow_end);
679     if (ret < 0) {
680         goto err;
681     }
682 
683     /* Update L2 table. */
684     if (s->use_lazy_refcounts) {
685         qcow2_mark_dirty(bs);
686     }
687     if (qcow2_need_accurate_refcounts(s)) {
688         qcow2_cache_set_dependency(bs, s->l2_table_cache,
689                                    s->refcount_block_cache);
690     }
691 
692     ret = get_cluster_table(bs, m->offset, &l2_table, &l2_index);
693     if (ret < 0) {
694         goto err;
695     }
696     qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table);
697 
698     for (i = 0; i < m->nb_clusters; i++) {
699         /* If two concurrent writes happen to the same unallocated cluster,
700          * each write allocates a separate cluster and writes the data
701          * concurrently. The first one to complete updates the L2 table with a
702          * pointer to its cluster; the second one has to do RMW (which is done
703          * above by copy_sectors()), update the L2 table with its own cluster
704          * pointer and free the old cluster. This is what this loop does. */
705         if (l2_table[l2_index + i] != 0)
706             old_cluster[j++] = l2_table[l2_index + i];
707 
708         l2_table[l2_index + i] = cpu_to_be64((cluster_offset +
709                     (i << s->cluster_bits)) | QCOW_OFLAG_COPIED);
710     }
711 
712 
713     ret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
714     if (ret < 0) {
715         goto err;
716     }
717 
718     /*
719      * If this was a COW, we need to decrease the refcount of the old cluster.
720      * Also flush bs->file to get the right order for L2 and refcount update.
721      *
722      * Don't discard clusters that reach a refcount of 0 (e.g. compressed
723      * clusters), the next write will reuse them anyway.
724      */
725     if (j != 0) {
726         for (i = 0; i < j; i++) {
727             qcow2_free_any_clusters(bs, be64_to_cpu(old_cluster[i]), 1,
728                                     QCOW2_DISCARD_NEVER);
729         }
730     }
731 
732     ret = 0;
733 err:
734     g_free(old_cluster);
735     return ret;
736 }
737 
738 /*
739  * Returns the number of contiguous clusters that can be used for an allocating
740  * write, but require COW to be performed (this includes yet unallocated space,
741  * which must be copied from the backing file)
742  */
743 static int count_cow_clusters(BDRVQcowState *s, int nb_clusters,
744     uint64_t *l2_table, int l2_index)
745 {
746     int i;
747 
748     for (i = 0; i < nb_clusters; i++) {
749         uint64_t l2_entry = be64_to_cpu(l2_table[l2_index + i]);
750         int cluster_type = qcow2_get_cluster_type(l2_entry);
751 
752         switch (cluster_type) {
753         case QCOW2_CLUSTER_NORMAL:
754             if (l2_entry & QCOW_OFLAG_COPIED) {
755                 goto out;
756             }
757             break;
758         case QCOW2_CLUSTER_UNALLOCATED:
759         case QCOW2_CLUSTER_COMPRESSED:
760         case QCOW2_CLUSTER_ZERO:
761             break;
762         default:
763             abort();
764         }
765     }
766 
767 out:
768     assert(i <= nb_clusters);
769     return i;
770 }
771 
772 /*
773  * Check if there already is an AIO write request in flight which allocates
774  * the same cluster. In this case we need to wait until the previous
775  * request has completed and updated the L2 table accordingly.
776  *
777  * Returns:
778  *   0       if there was no dependency. *cur_bytes indicates the number of
779  *           bytes from guest_offset that can be read before the next
780  *           dependency must be processed (or the request is complete)
781  *
782  *   -EAGAIN if we had to wait for another request, previously gathered
783  *           information on cluster allocation may be invalid now. The caller
784  *           must start over anyway, so consider *cur_bytes undefined.
785  */
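/*
 * Illustrative overlap cases (offsets made up): if an in-flight allocation
 * covers guest bytes [64k, 192k) and the new request starts at 32k, *cur_bytes
 * is shortened to 32k so the request stops right before the running
 * allocation. If the new request started at 100k instead, bytes would become 0
 * and, unless some L2Meta had already been gathered, we would wait for the
 * other request and return -EAGAIN.
 */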
786 static int handle_dependencies(BlockDriverState *bs, uint64_t guest_offset,
787     uint64_t *cur_bytes, QCowL2Meta **m)
788 {
789     BDRVQcowState *s = bs->opaque;
790     QCowL2Meta *old_alloc;
791     uint64_t bytes = *cur_bytes;
792 
793     QLIST_FOREACH(old_alloc, &s->cluster_allocs, next_in_flight) {
794 
795         uint64_t start = guest_offset;
796         uint64_t end = start + bytes;
797         uint64_t old_start = l2meta_cow_start(old_alloc);
798         uint64_t old_end = l2meta_cow_end(old_alloc);
799 
800         if (end <= old_start || start >= old_end) {
801             /* No intersection */
802         } else {
803             if (start < old_start) {
804                 /* Stop at the start of a running allocation */
805                 bytes = old_start - start;
806             } else {
807                 bytes = 0;
808             }
809 
810             /* Stop if already an l2meta exists. After yielding, it wouldn't
811              * be valid any more, so we'd have to clean up the old L2Metas
812              * and deal with requests depending on them before starting to
813              * gather new ones. Not worth the trouble. */
814             if (bytes == 0 && *m) {
815                 *cur_bytes = 0;
816                 return 0;
817             }
818 
819             if (bytes == 0) {
820                 /* Wait for the dependency to complete. We need to recheck
821                  * the free/allocated clusters when we continue. */
822                 qemu_co_mutex_unlock(&s->lock);
823                 qemu_co_queue_wait(&old_alloc->dependent_requests);
824                 qemu_co_mutex_lock(&s->lock);
825                 return -EAGAIN;
826             }
827         }
828     }
829 
830     /* Make sure that existing clusters and new allocations are only used up to
831      * the next dependency if we shortened the request above */
832     *cur_bytes = bytes;
833 
834     return 0;
835 }
836 
837 /*
838  * Checks how many clusters that are already allocated and don't require a
839  * copy on write are present at the given guest_offset (up to *bytes). If
840  * *host_offset is not zero, only physically contiguous clusters beginning at
841  * this host offset are counted.
842  *
843  * Note that guest_offset may not be cluster aligned. In this case, the
844  * returned *host_offset points to the exact byte referenced by guest_offset
845  * and therefore isn't cluster aligned either.
846  *
847  * Returns:
848  *   0:     if no allocated clusters are available at the given offset.
849  *          *bytes is normally unchanged. It is set to 0 if the cluster
850  *          is allocated and doesn't need COW, but doesn't have the right
851  *          physical offset.
852  *
853  *   1:     if allocated clusters that don't require a COW are available at
854  *          the requested offset. *bytes may have decreased and describes
855  *          the length of the area that can be written to.
856  *
857  *  -errno: in error cases
858  */
859 static int handle_copied(BlockDriverState *bs, uint64_t guest_offset,
860     uint64_t *host_offset, uint64_t *bytes, QCowL2Meta **m)
861 {
862     BDRVQcowState *s = bs->opaque;
863     int l2_index;
864     uint64_t cluster_offset;
865     uint64_t *l2_table;
866     unsigned int nb_clusters;
867     unsigned int keep_clusters;
868     int ret, pret;
869 
870     trace_qcow2_handle_copied(qemu_coroutine_self(), guest_offset, *host_offset,
871                               *bytes);
872 
873     assert(*host_offset == 0 ||    offset_into_cluster(s, guest_offset)
874                                 == offset_into_cluster(s, *host_offset));
875 
876     /*
877      * Calculate the number of clusters to look for. We stop at L2 table
878      * boundaries to keep things simple.
879      */
880     nb_clusters =
881         size_to_clusters(s, offset_into_cluster(s, guest_offset) + *bytes);
882 
883     l2_index = offset_to_l2_index(s, guest_offset);
884     nb_clusters = MIN(nb_clusters, s->l2_size - l2_index);
885 
886     /* Find L2 entry for the first involved cluster */
887     ret = get_cluster_table(bs, guest_offset, &l2_table, &l2_index);
888     if (ret < 0) {
889         return ret;
890     }
891 
892     cluster_offset = be64_to_cpu(l2_table[l2_index]);
893 
894     /* Check how many clusters are already allocated and don't need COW */
895     if (qcow2_get_cluster_type(cluster_offset) == QCOW2_CLUSTER_NORMAL
896         && (cluster_offset & QCOW_OFLAG_COPIED))
897     {
898         /* If a specific host_offset is required, check it */
899         bool offset_matches =
900             (cluster_offset & L2E_OFFSET_MASK) == *host_offset;
901 
902         if (*host_offset != 0 && !offset_matches) {
903             *bytes = 0;
904             ret = 0;
905             goto out;
906         }
907 
908         /* We keep all QCOW_OFLAG_COPIED clusters */
909         keep_clusters =
910             count_contiguous_clusters(nb_clusters, s->cluster_size,
911                                       &l2_table[l2_index], 0,
912                                       QCOW_OFLAG_COPIED | QCOW_OFLAG_ZERO);
913         assert(keep_clusters <= nb_clusters);
914 
915         *bytes = MIN(*bytes,
916                  keep_clusters * s->cluster_size
917                  - offset_into_cluster(s, guest_offset));
918 
919         ret = 1;
920     } else {
921         ret = 0;
922     }
923 
924     /* Cleanup */
925 out:
926     pret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
927     if (pret < 0) {
928         return pret;
929     }
930 
931     /* Only return a host offset if we actually made progress. Otherwise we
932      * would make requirements for handle_alloc() that it can't fulfill */
933     if (ret) {
934         *host_offset = (cluster_offset & L2E_OFFSET_MASK)
935                      + offset_into_cluster(s, guest_offset);
936     }
937 
938     return ret;
939 }
940 
941 /*
942  * Allocates new clusters for the given guest_offset.
943  *
944  * At most *nb_clusters are allocated, and on return *nb_clusters is updated to
945  * contain the number of clusters that have been allocated and are contiguous
946  * in the image file.
947  *
948  * If *host_offset is non-zero, it specifies the offset in the image file at
949  * which the new clusters must start. *nb_clusters can be 0 on return in this
950  * case if the cluster at host_offset is already in use. If *host_offset is
951  * zero, the clusters can be allocated anywhere in the image file.
952  *
953  * *host_offset is updated to contain the offset into the image file at which
954  * the first allocated cluster starts.
955  *
956  * Return 0 on success and -errno in error cases. -EAGAIN means that the
957  * function has been waiting for another request and the allocation must be
958  * restarted, but the whole request should not be failed.
959  */
960 static int do_alloc_cluster_offset(BlockDriverState *bs, uint64_t guest_offset,
961     uint64_t *host_offset, unsigned int *nb_clusters)
962 {
963     BDRVQcowState *s = bs->opaque;
964 
965     trace_qcow2_do_alloc_clusters_offset(qemu_coroutine_self(), guest_offset,
966                                          *host_offset, *nb_clusters);
967 
968     /* Allocate new clusters */
969     trace_qcow2_cluster_alloc_phys(qemu_coroutine_self());
970     if (*host_offset == 0) {
971         int64_t cluster_offset =
972             qcow2_alloc_clusters(bs, *nb_clusters * s->cluster_size);
973         if (cluster_offset < 0) {
974             return cluster_offset;
975         }
976         *host_offset = cluster_offset;
977         return 0;
978     } else {
979         int ret = qcow2_alloc_clusters_at(bs, *host_offset, *nb_clusters);
980         if (ret < 0) {
981             return ret;
982         }
983         *nb_clusters = ret;
984         return 0;
985     }
986 }
987 
988 /*
989  * Allocates new clusters for an area that is either still unallocated or needs a
990  * copy on write. If *host_offset is non-zero, clusters are only allocated if
991  * the new allocation can match the specified host offset.
992  *
993  * Note that guest_offset may not be cluster aligned. In this case, the
994  * returned *host_offset points to the exact byte referenced by guest_offset
995  * and therefore isn't cluster aligned either.
996  *
997  * Returns:
998  *   0:     if no clusters could be allocated. *bytes is set to 0,
999  *          *host_offset is left unchanged.
1000  *
1001  *   1:     if new clusters were allocated. *bytes may be decreased if the
1002  *          new allocation doesn't cover all of the requested area.
1003  *          *host_offset is updated to contain the host offset of the first
1004  *          newly allocated cluster.
1005  *
1006  *  -errno: in error cases
1007  */
1008 static int handle_alloc(BlockDriverState *bs, uint64_t guest_offset,
1009     uint64_t *host_offset, uint64_t *bytes, QCowL2Meta **m)
1010 {
1011     BDRVQcowState *s = bs->opaque;
1012     int l2_index;
1013     uint64_t *l2_table;
1014     uint64_t entry;
1015     unsigned int nb_clusters;
1016     int ret;
1017 
1018     uint64_t alloc_cluster_offset;
1019 
1020     trace_qcow2_handle_alloc(qemu_coroutine_self(), guest_offset, *host_offset,
1021                              *bytes);
1022     assert(*bytes > 0);
1023 
1024     /*
1025      * Calculate the number of clusters to look for. We stop at L2 table
1026      * boundaries to keep things simple.
1027      */
1028     nb_clusters =
1029         size_to_clusters(s, offset_into_cluster(s, guest_offset) + *bytes);
1030 
1031     l2_index = offset_to_l2_index(s, guest_offset);
1032     nb_clusters = MIN(nb_clusters, s->l2_size - l2_index);
1033 
1034     /* Find L2 entry for the first involved cluster */
1035     ret = get_cluster_table(bs, guest_offset, &l2_table, &l2_index);
1036     if (ret < 0) {
1037         return ret;
1038     }
1039 
1040     entry = be64_to_cpu(l2_table[l2_index]);
1041 
1042     /* For the moment, overwrite compressed clusters one by one */
1043     if (entry & QCOW_OFLAG_COMPRESSED) {
1044         nb_clusters = 1;
1045     } else {
1046         nb_clusters = count_cow_clusters(s, nb_clusters, l2_table, l2_index);
1047     }
1048 
1049     /* This function is only called when there were no non-COW clusters, so if
1050      * we can't find any unallocated or COW clusters either, something is
1051      * wrong with our code. */
1052     assert(nb_clusters > 0);
1053 
1054     ret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
1055     if (ret < 0) {
1056         return ret;
1057     }
1058 
1059     /* Allocate, if necessary at a given offset in the image file */
1060     alloc_cluster_offset = start_of_cluster(s, *host_offset);
1061     ret = do_alloc_cluster_offset(bs, guest_offset, &alloc_cluster_offset,
1062                                   &nb_clusters);
1063     if (ret < 0) {
1064         goto fail;
1065     }
1066 
1067     /* Can't extend contiguous allocation */
1068     if (nb_clusters == 0) {
1069         *bytes = 0;
1070         return 0;
1071     }
1072 
1073     /*
1074      * Save info needed for meta data update.
1075      * Save info needed for metadata update.
1076      * requested_sectors: Number of sectors from the start of the first
1077      * newly allocated cluster to the end of the (possibly shortened
1078      * before) write request.
1079      *
1080      * avail_sectors: Number of sectors from the start of the first
1081      * newly allocated to the end of the last newly allocated cluster.
1082      *
1083      * nb_sectors: The number of sectors from the start of the first
1084      * newly allocated cluster to the end of the area that the write
1085      * request actually writes to (excluding COW at the end)
1086      */
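    /*
     * Example with made-up numbers: with 64k clusters (128 sectors), a write
     * of 1536 bytes starting 512 bytes into a single newly allocated cluster
     * gives requested_sectors = 4, avail_sectors = 128 and nb_sectors = 4;
     * cow_start then covers sector 0 of the cluster and cow_end covers
     * sectors 4..127.
     */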
1087     int requested_sectors =
1088         (*bytes + offset_into_cluster(s, guest_offset))
1089         >> BDRV_SECTOR_BITS;
1090     int avail_sectors = nb_clusters
1091                         << (s->cluster_bits - BDRV_SECTOR_BITS);
1092     int alloc_n_start = offset_into_cluster(s, guest_offset)
1093                         >> BDRV_SECTOR_BITS;
1094     int nb_sectors = MIN(requested_sectors, avail_sectors);
1095     QCowL2Meta *old_m = *m;
1096 
1097     *m = g_malloc0(sizeof(**m));
1098 
1099     **m = (QCowL2Meta) {
1100         .next           = old_m,
1101 
1102         .alloc_offset   = alloc_cluster_offset,
1103         .offset         = start_of_cluster(s, guest_offset),
1104         .nb_clusters    = nb_clusters,
1105         .nb_available   = nb_sectors,
1106 
1107         .cow_start = {
1108             .offset     = 0,
1109             .nb_sectors = alloc_n_start,
1110         },
1111         .cow_end = {
1112             .offset     = nb_sectors * BDRV_SECTOR_SIZE,
1113             .nb_sectors = avail_sectors - nb_sectors,
1114         },
1115     };
1116     qemu_co_queue_init(&(*m)->dependent_requests);
1117     QLIST_INSERT_HEAD(&s->cluster_allocs, *m, next_in_flight);
1118 
1119     *host_offset = alloc_cluster_offset + offset_into_cluster(s, guest_offset);
1120     *bytes = MIN(*bytes, (nb_sectors * BDRV_SECTOR_SIZE)
1121                          - offset_into_cluster(s, guest_offset));
1122     assert(*bytes != 0);
1123 
1124     return 1;
1125 
1126 fail:
1127     if (*m && (*m)->nb_clusters > 0) {
1128         QLIST_REMOVE(*m, next_in_flight);
1129     }
1130     return ret;
1131 }
1132 
1133 /*
1134  * alloc_cluster_offset
1135  *
1136  * For a given offset on the virtual disk, find the cluster offset in qcow2
1137  * file. If the offset is not found, allocate a new cluster.
1138  *
1139  * If the cluster was already allocated, m->nb_clusters is set to 0 and
1140  * other fields in m are meaningless.
1141  *
1142  * If the cluster is newly allocated, m->nb_clusters is set to the number of
1143  * contiguous clusters that have been allocated. In this case, the other
1144  * fields of m are valid and contain information about the first allocated
1145  * cluster.
1146  *
1147  * If the request conflicts with another write request in flight, the coroutine
1148  * is queued and will be reentered when the dependency has completed.
1149  *
1150  * Return 0 on success and -errno in error cases
1151  */
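/*
 * A rough caller sketch (assumed usage; the real caller is the write path in
 * qcow2.c):
 *
 *     QCowL2Meta *l2meta = NULL;
 *     ret = qcow2_alloc_cluster_offset(bs, sector_num << 9, index_in_cluster,
 *                                      n_end, &cur_nr_sectors, &host_offset,
 *                                      &l2meta);
 *     // write the guest data at host_offset, then walk the l2meta list and
 *     // call qcow2_alloc_cluster_link_l2() for each element once the data
 *     // has reached the image file
 */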
1152 int qcow2_alloc_cluster_offset(BlockDriverState *bs, uint64_t offset,
1153     int n_start, int n_end, int *num, uint64_t *host_offset, QCowL2Meta **m)
1154 {
1155     BDRVQcowState *s = bs->opaque;
1156     uint64_t start, remaining;
1157     uint64_t cluster_offset;
1158     uint64_t cur_bytes;
1159     int ret;
1160 
1161     trace_qcow2_alloc_clusters_offset(qemu_coroutine_self(), offset,
1162                                       n_start, n_end);
1163 
1164     assert(n_start * BDRV_SECTOR_SIZE == offset_into_cluster(s, offset));
1165     offset = start_of_cluster(s, offset);
1166 
1167 again:
1168     start = offset + (n_start << BDRV_SECTOR_BITS);
1169     remaining = (n_end - n_start) << BDRV_SECTOR_BITS;
1170     cluster_offset = 0;
1171     *host_offset = 0;
1172     cur_bytes = 0;
1173     *m = NULL;
1174 
1175     while (true) {
1176 
1177         if (!*host_offset) {
1178             *host_offset = start_of_cluster(s, cluster_offset);
1179         }
1180 
1181         assert(remaining >= cur_bytes);
1182 
1183         start           += cur_bytes;
1184         remaining       -= cur_bytes;
1185         cluster_offset  += cur_bytes;
1186 
1187         if (remaining == 0) {
1188             break;
1189         }
1190 
1191         cur_bytes = remaining;
1192 
1193         /*
1194          * Now start gathering as many contiguous clusters as possible:
1195          *
1196          * 1. Check for overlaps with in-flight allocations
1197          *
1198          *      a) Overlap not in the first cluster -> shorten this request and
1199          *         let the caller handle the rest in its next loop iteration.
1200          *
1201          *      b) Real overlaps of two requests. Yield and restart the search
1202          *         for contiguous clusters (the situation could have changed
1203          *         while we were sleeping)
1204          *
1205          *      c) TODO: Request starts in the same cluster as the in-flight
1206  *         allocation ends. Shorten the COW of the in-flight allocation,
1207          *         set cluster_offset to write to the same cluster and set up
1208          *         the right synchronisation between the in-flight request and
1209          *         the new one.
1210          */
1211         ret = handle_dependencies(bs, start, &cur_bytes, m);
1212         if (ret == -EAGAIN) {
1213             /* Currently handle_dependencies() doesn't yield if we already had
1214              * an allocation. If it did, we would have to clean up the L2Meta
1215              * structs before starting over. */
1216             assert(*m == NULL);
1217             goto again;
1218         } else if (ret < 0) {
1219             return ret;
1220         } else if (cur_bytes == 0) {
1221             break;
1222         } else {
1223             /* handle_dependencies() may have decreased cur_bytes (shortened
1224              * the allocations below) so that the next dependency is processed
1225              * correctly during the next loop iteration. */
1226         }
1227 
1228         /*
1229          * 2. Count contiguous COPIED clusters.
1230          */
1231         ret = handle_copied(bs, start, &cluster_offset, &cur_bytes, m);
1232         if (ret < 0) {
1233             return ret;
1234         } else if (ret) {
1235             continue;
1236         } else if (cur_bytes == 0) {
1237             break;
1238         }
1239 
1240         /*
1241          * 3. If the request still hasn't completed, allocate new clusters,
1242          *    considering any cluster_offset of steps 1c or 2.
1243          */
1244         ret = handle_alloc(bs, start, &cluster_offset, &cur_bytes, m);
1245         if (ret < 0) {
1246             return ret;
1247         } else if (ret) {
1248             continue;
1249         } else {
1250             assert(cur_bytes == 0);
1251             break;
1252         }
1253     }
1254 
1255     *num = (n_end - n_start) - (remaining >> BDRV_SECTOR_BITS);
1256     assert(*num > 0);
1257     assert(*host_offset != 0);
1258 
1259     return 0;
1260 }
1261 
1262 static int decompress_buffer(uint8_t *out_buf, int out_buf_size,
1263                              const uint8_t *buf, int buf_size)
1264 {
1265     z_stream strm1, *strm = &strm1;
1266     int ret, out_len;
1267 
1268     memset(strm, 0, sizeof(*strm));
1269 
1270     strm->next_in = (uint8_t *)buf;
1271     strm->avail_in = buf_size;
1272     strm->next_out = out_buf;
1273     strm->avail_out = out_buf_size;
1274 
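    /* A negative windowBits value tells zlib to expect raw deflate data with
     * no zlib header or trailer; 12 is the log2 of the window size, which is
     * expected to match the -12 used by the compression side when the cluster
     * was written. */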
1275     ret = inflateInit2(strm, -12);
1276     if (ret != Z_OK)
1277         return -1;
1278     ret = inflate(strm, Z_FINISH);
1279     out_len = strm->next_out - out_buf;
1280     if ((ret != Z_STREAM_END && ret != Z_BUF_ERROR) ||
1281         out_len != out_buf_size) {
1282         inflateEnd(strm);
1283         return -1;
1284     }
1285     inflateEnd(strm);
1286     return 0;
1287 }
1288 
1289 int qcow2_decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset)
1290 {
1291     BDRVQcowState *s = bs->opaque;
1292     int ret, csize, nb_csectors, sector_offset;
1293     uint64_t coffset;
1294 
1295     coffset = cluster_offset & s->cluster_offset_mask;
1296     if (s->cluster_cache_offset != coffset) {
1297         nb_csectors = ((cluster_offset >> s->csize_shift) & s->csize_mask) + 1;
1298         sector_offset = coffset & 511;
1299         csize = nb_csectors * 512 - sector_offset;
1300         BLKDBG_EVENT(bs->file, BLKDBG_READ_COMPRESSED);
1301         ret = bdrv_read(bs->file, coffset >> 9, s->cluster_data, nb_csectors);
1302         if (ret < 0) {
1303             return ret;
1304         }
1305         if (decompress_buffer(s->cluster_cache, s->cluster_size,
1306                               s->cluster_data + sector_offset, csize) < 0) {
1307             return -EIO;
1308         }
1309         s->cluster_cache_offset = coffset;
1310     }
1311     return 0;
1312 }
1313 
1314 /*
1315  * This discards as many clusters of nb_clusters as possible at once (i.e.
1316  * all clusters in the same L2 table) and returns the number of discarded
1317  * clusters.
1318  */
1319 static int discard_single_l2(BlockDriverState *bs, uint64_t offset,
1320     unsigned int nb_clusters)
1321 {
1322     BDRVQcowState *s = bs->opaque;
1323     uint64_t *l2_table;
1324     int l2_index;
1325     int ret;
1326     int i;
1327 
1328     ret = get_cluster_table(bs, offset, &l2_table, &l2_index);
1329     if (ret < 0) {
1330         return ret;
1331     }
1332 
1333     /* Limit nb_clusters to one L2 table */
1334     nb_clusters = MIN(nb_clusters, s->l2_size - l2_index);
1335 
1336     for (i = 0; i < nb_clusters; i++) {
1337         uint64_t old_offset;
1338 
1339         old_offset = be64_to_cpu(l2_table[l2_index + i]);
1340         if ((old_offset & L2E_OFFSET_MASK) == 0) {
1341             continue;
1342         }
1343 
1344         /* First remove L2 entries */
1345         qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table);
1346         l2_table[l2_index + i] = cpu_to_be64(0);
1347 
1348         /* Then decrease the refcount */
1349         qcow2_free_any_clusters(bs, old_offset, 1, QCOW2_DISCARD_REQUEST);
1350     }
1351 
1352     ret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
1353     if (ret < 0) {
1354         return ret;
1355     }
1356 
1357     return nb_clusters;
1358 }
1359 
1360 int qcow2_discard_clusters(BlockDriverState *bs, uint64_t offset,
1361     int nb_sectors)
1362 {
1363     BDRVQcowState *s = bs->opaque;
1364     uint64_t end_offset;
1365     unsigned int nb_clusters;
1366     int ret;
1367 
1368     end_offset = offset + (nb_sectors << BDRV_SECTOR_BITS);
1369 
1370     /* Round start up and end down */
1371     offset = align_offset(offset, s->cluster_size);
1372     end_offset &= ~(s->cluster_size - 1);
1373 
1374     if (offset > end_offset) {
1375         return 0;
1376     }
1377 
1378     nb_clusters = size_to_clusters(s, end_offset - offset);
1379 
1380     s->cache_discards = true;
1381 
1382     /* Each L2 table is handled by its own loop iteration */
1383     while (nb_clusters > 0) {
1384         ret = discard_single_l2(bs, offset, nb_clusters);
1385         if (ret < 0) {
1386             goto fail;
1387         }
1388 
1389         nb_clusters -= ret;
1390         offset += (ret * s->cluster_size);
1391     }
1392 
1393     ret = 0;
1394 fail:
1395     s->cache_discards = false;
1396     qcow2_process_discards(bs, ret);
1397 
1398     return ret;
1399 }
1400 
1401 /*
1402  * This zeroes as many clusters of nb_clusters as possible at once (i.e.
1403  * all clusters in the same L2 table) and returns the number of zeroed
1404  * clusters.
1405  */
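/*
 * Note on the semantics below: a compressed cluster is freed and replaced by a
 * plain zero entry, while a normal cluster keeps its allocation and merely
 * gains QCOW_OFLAG_ZERO, so guest reads return zeroes in both cases.
 */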
1406 static int zero_single_l2(BlockDriverState *bs, uint64_t offset,
1407     unsigned int nb_clusters)
1408 {
1409     BDRVQcowState *s = bs->opaque;
1410     uint64_t *l2_table;
1411     int l2_index;
1412     int ret;
1413     int i;
1414 
1415     ret = get_cluster_table(bs, offset, &l2_table, &l2_index);
1416     if (ret < 0) {
1417         return ret;
1418     }
1419 
1420     /* Limit nb_clusters to one L2 table */
1421     nb_clusters = MIN(nb_clusters, s->l2_size - l2_index);
1422 
1423     for (i = 0; i < nb_clusters; i++) {
1424         uint64_t old_offset;
1425 
1426         old_offset = be64_to_cpu(l2_table[l2_index + i]);
1427 
1428         /* Update L2 entries */
1429         qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table);
1430         if (old_offset & QCOW_OFLAG_COMPRESSED) {
1431             l2_table[l2_index + i] = cpu_to_be64(QCOW_OFLAG_ZERO);
1432             qcow2_free_any_clusters(bs, old_offset, 1, QCOW2_DISCARD_REQUEST);
1433         } else {
1434             l2_table[l2_index + i] |= cpu_to_be64(QCOW_OFLAG_ZERO);
1435         }
1436     }
1437 
1438     ret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
1439     if (ret < 0) {
1440         return ret;
1441     }
1442 
1443     return nb_clusters;
1444 }
1445 
1446 int qcow2_zero_clusters(BlockDriverState *bs, uint64_t offset, int nb_sectors)
1447 {
1448     BDRVQcowState *s = bs->opaque;
1449     unsigned int nb_clusters;
1450     int ret;
1451 
1452     /* The zero flag is only supported by version 3 and newer */
1453     if (s->qcow_version < 3) {
1454         return -ENOTSUP;
1455     }
1456 
1457     /* Each L2 table is handled by its own loop iteration */
1458     nb_clusters = size_to_clusters(s, nb_sectors << BDRV_SECTOR_BITS);
1459 
1460     s->cache_discards = true;
1461 
1462     while (nb_clusters > 0) {
1463         ret = zero_single_l2(bs, offset, nb_clusters);
1464         if (ret < 0) {
1465             goto fail;
1466         }
1467 
1468         nb_clusters -= ret;
1469         offset += (ret * s->cluster_size);
1470     }
1471 
1472     ret = 0;
1473 fail:
1474     s->cache_discards = false;
1475     qcow2_process_discards(bs, ret);
1476 
1477     return ret;
1478 }
1479