xref: /openbmc/qemu/block/qcow2-cluster.c (revision 71af014f1451bec3244e086298813b5aa7b2a0ee)
/*
 * Block driver for the QCOW version 2 format
 *
 * Copyright (c) 2004-2006 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include <zlib.h>

#include "qemu-common.h"
#include "block_int.h"
#include "block/qcow2.h"

int qcow2_grow_l1_table(BlockDriverState *bs, int min_size, bool exact_size)
{
    BDRVQcowState *s = bs->opaque;
    int new_l1_size, new_l1_size2, ret, i;
    uint64_t *new_l1_table;
    int64_t new_l1_table_offset;
    uint8_t data[12];

    if (min_size <= s->l1_size)
        return 0;

    if (exact_size) {
        new_l1_size = min_size;
    } else {
        /* Bump size up to reduce the number of times we have to grow */
        new_l1_size = s->l1_size;
        if (new_l1_size == 0) {
            new_l1_size = 1;
        }
        while (min_size > new_l1_size) {
            new_l1_size = (new_l1_size * 3 + 1) / 2;
        }
    }
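    /*
     * For illustration: starting from an l1_size of 1, the 3/2 growth rule
     * above yields 1, 2, 3, 5, 8, 12, 18, 27, ... so repeated grows cost
     * O(log(min_size)) reallocations rather than one per added entry.
     */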

#ifdef DEBUG_ALLOC2
    printf("grow l1_table from %d to %d\n", s->l1_size, new_l1_size);
#endif

    new_l1_size2 = sizeof(uint64_t) * new_l1_size;
    new_l1_table = qemu_mallocz(align_offset(new_l1_size2, 512));
    memcpy(new_l1_table, s->l1_table, s->l1_size * sizeof(uint64_t));

    /* write new table (align to cluster) */
    BLKDBG_EVENT(bs->file, BLKDBG_L1_GROW_ALLOC_TABLE);
    new_l1_table_offset = qcow2_alloc_clusters(bs, new_l1_size2);
    if (new_l1_table_offset < 0) {
        qemu_free(new_l1_table);
        return new_l1_table_offset;
    }
    bdrv_flush(bs->file);

    BLKDBG_EVENT(bs->file, BLKDBG_L1_GROW_WRITE_TABLE);
    for(i = 0; i < s->l1_size; i++)
        new_l1_table[i] = cpu_to_be64(new_l1_table[i]);
    ret = bdrv_pwrite_sync(bs->file, new_l1_table_offset, new_l1_table,
        new_l1_size2);
    if (ret < 0)
        goto fail;
    for(i = 0; i < s->l1_size; i++)
        new_l1_table[i] = be64_to_cpu(new_l1_table[i]);

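    /*
     * Note on the 12-byte buffer below: l1_size (4 bytes) is immediately
     * followed by l1_table_offset (8 bytes) in QCowHeader, so one write
     * starting at offsetof(QCowHeader, l1_size) updates both fields in a
     * single request, both stored big-endian.
     */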
    /* set new table */
    BLKDBG_EVENT(bs->file, BLKDBG_L1_GROW_ACTIVATE_TABLE);
    cpu_to_be32w((uint32_t*)data, new_l1_size);
    cpu_to_be64w((uint64_t*)(data + 4), new_l1_table_offset);
    ret = bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, l1_size), data,
        sizeof(data));
    if (ret < 0) {
        goto fail;
    }
    qemu_free(s->l1_table);
    qcow2_free_clusters(bs, s->l1_table_offset, s->l1_size * sizeof(uint64_t));
    s->l1_table_offset = new_l1_table_offset;
    s->l1_table = new_l1_table;
    s->l1_size = new_l1_size;
    return 0;
fail:
    qemu_free(new_l1_table);
    qcow2_free_clusters(bs, new_l1_table_offset, new_l1_size2);
    return ret;
}

void qcow2_l2_cache_reset(BlockDriverState *bs)
{
    BDRVQcowState *s = bs->opaque;

    memset(s->l2_cache, 0, s->l2_size * L2_CACHE_SIZE * sizeof(uint64_t));
    memset(s->l2_cache_offsets, 0, L2_CACHE_SIZE * sizeof(uint64_t));
    memset(s->l2_cache_counts, 0, L2_CACHE_SIZE * sizeof(uint32_t));
}

static inline int l2_cache_new_entry(BlockDriverState *bs)
{
    BDRVQcowState *s = bs->opaque;
    uint32_t min_count;
    int min_index, i;

    /* reuse the least used cache entry */
    min_index = 0;
    min_count = 0xffffffff;
    for(i = 0; i < L2_CACHE_SIZE; i++) {
        if (s->l2_cache_counts[i] < min_count) {
            min_count = s->l2_cache_counts[i];
            min_index = i;
        }
    }
    return min_index;
}

/*
 * seek_l2_table
 *
 * Seeks l2_offset in the l2_cache table.
 * If it is not found, NULL is returned.
 * If it is found, the hit count of the entry is incremented (if a counter
 * reaches 0xffffffff, all counters are halved) and a pointer to the cached
 * L2 table is returned.
 */

static uint64_t *seek_l2_table(BDRVQcowState *s, uint64_t l2_offset)
{
    int i, j;

    for(i = 0; i < L2_CACHE_SIZE; i++) {
        if (l2_offset == s->l2_cache_offsets[i]) {
            /* increment the hit count */
            if (++s->l2_cache_counts[i] == 0xffffffff) {
                for(j = 0; j < L2_CACHE_SIZE; j++) {
                    s->l2_cache_counts[j] >>= 1;
                }
            }
            return s->l2_cache + (i << s->l2_bits);
        }
    }
    return NULL;
}
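
/*
 * A worked example of the aging above: when any hit counter reaches
 * 0xffffffff, every counter is halved, so counters of 100 and 10 become
 * 50 and 5. Relative order is preserved and the cache keeps favouring
 * frequently used L2 tables while old hits slowly decay.
 */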

/*
 * l2_load
 *
 * Loads an L2 table into memory. If the table is in the cache, the cache
 * is used; otherwise the L2 table is loaded from the image file.
 *
 * Returns 0 on success. On failure, a negative errno value is returned
 * and the L2 cache is reset.
 */

static int l2_load(BlockDriverState *bs, uint64_t l2_offset,
    uint64_t **l2_table)
{
    BDRVQcowState *s = bs->opaque;
    int min_index;
    int ret;

    /* check if the table for the given offset is in the cache */

    *l2_table = seek_l2_table(s, l2_offset);
    if (*l2_table != NULL) {
        return 0;
    }

    /* not found: load it into the least used cache entry */

    min_index = l2_cache_new_entry(bs);
    *l2_table = s->l2_cache + (min_index << s->l2_bits);

    BLKDBG_EVENT(bs->file, BLKDBG_L2_LOAD);
    ret = bdrv_pread(bs->file, l2_offset, *l2_table,
        s->l2_size * sizeof(uint64_t));
    if (ret < 0) {
        qcow2_l2_cache_reset(bs);
        return ret;
    }

    s->l2_cache_offsets[min_index] = l2_offset;
    s->l2_cache_counts[min_index] = 1;

    return 0;
}

/*
 * Writes one sector of the L1 table to the disk (can't update single entries
 * and we really don't want bdrv_pwrite to perform a read-modify-write)
 */
#define L1_ENTRIES_PER_SECTOR (512 / 8)
static int write_l1_entry(BlockDriverState *bs, int l1_index)
{
    BDRVQcowState *s = bs->opaque;
    uint64_t buf[L1_ENTRIES_PER_SECTOR];
    int l1_start_index;
    int i, ret;

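    /*
     * Worked example: L1_ENTRIES_PER_SECTOR is 64, so for l1_index 75 the
     * mask below yields l1_start_index 64 and the write covers entries
     * 64..127, i.e. exactly the 512-byte sector containing entry 75.
     */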
    l1_start_index = l1_index & ~(L1_ENTRIES_PER_SECTOR - 1);
    for (i = 0; i < L1_ENTRIES_PER_SECTOR; i++) {
        buf[i] = cpu_to_be64(s->l1_table[l1_start_index + i]);
    }

    BLKDBG_EVENT(bs->file, BLKDBG_L1_UPDATE);
    ret = bdrv_pwrite_sync(bs->file, s->l1_table_offset + 8 * l1_start_index,
        buf, sizeof(buf));
    if (ret < 0) {
        return ret;
    }

    return 0;
}

/*
 * l2_allocate
 *
 * Allocates a new L2 table in the file. If the L1 entry at l1_index already
 * points to an L2 table (i.e. we are doing a copy on write for the L2
 * table), the contents of the old L2 table are copied into the newly
 * allocated one. Otherwise the new table is initialized with zeros.
 */

static int l2_allocate(BlockDriverState *bs, int l1_index, uint64_t **table)
{
    BDRVQcowState *s = bs->opaque;
    int min_index;
    uint64_t old_l2_offset;
    uint64_t *l2_table;
    int64_t l2_offset;
    int ret;

    old_l2_offset = s->l1_table[l1_index];

    /* allocate a new l2 entry */

    l2_offset = qcow2_alloc_clusters(bs, s->l2_size * sizeof(uint64_t));
    if (l2_offset < 0) {
        return l2_offset;
    }
    bdrv_flush(bs->file);

    /* allocate a new entry in the l2 cache */

    min_index = l2_cache_new_entry(bs);
    l2_table = s->l2_cache + (min_index << s->l2_bits);

    if (old_l2_offset == 0) {
        /* if there was no old l2 table, clear the new table */
        memset(l2_table, 0, s->l2_size * sizeof(uint64_t));
    } else {
        /* if there was an old l2 table, read it from the disk */
        BLKDBG_EVENT(bs->file, BLKDBG_L2_ALLOC_COW_READ);
        ret = bdrv_pread(bs->file, old_l2_offset, l2_table,
            s->l2_size * sizeof(uint64_t));
        if (ret < 0) {
            goto fail;
        }
    }
    /* write the l2 table to the file */
    BLKDBG_EVENT(bs->file, BLKDBG_L2_ALLOC_WRITE);
    ret = bdrv_pwrite_sync(bs->file, l2_offset, l2_table,
        s->l2_size * sizeof(uint64_t));
    if (ret < 0) {
        goto fail;
    }

    /* update the L1 entry */
    s->l1_table[l1_index] = l2_offset | QCOW_OFLAG_COPIED;
    ret = write_l1_entry(bs, l1_index);
    if (ret < 0) {
        goto fail;
    }

    /* update the l2 cache entry */

    s->l2_cache_offsets[min_index] = l2_offset;
    s->l2_cache_counts[min_index] = 1;

    *table = l2_table;
    return 0;

fail:
    s->l1_table[l1_index] = old_l2_offset;
    qcow2_l2_cache_reset(bs);
    return ret;
}

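/*
 * For illustration: with mask = QCOW_OFLAG_COPIED and start = 0, three L2
 * entries whose masked offsets are X, X + cluster_size and
 * X + 3 * cluster_size make count_contiguous_clusters() return 2; the run
 * breaks at the hole before the third entry.
 */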
static int count_contiguous_clusters(uint64_t nb_clusters, int cluster_size,
        uint64_t *l2_table, uint64_t start, uint64_t mask)
{
    int i;
    uint64_t offset = be64_to_cpu(l2_table[0]) & ~mask;

    if (!offset)
        return 0;

    for (i = start; i < start + nb_clusters; i++)
        if (offset + (uint64_t) i * cluster_size !=
                (be64_to_cpu(l2_table[i]) & ~mask))
            break;

    return (i - start);
}

static int count_contiguous_free_clusters(uint64_t nb_clusters,
        uint64_t *l2_table)
{
    int i = 0;

    while(nb_clusters-- && l2_table[i] == 0)
        i++;

    return i;
}

/* The crypt function is compatible with the Linux cryptoloop
   algorithm for < 4 GB images. NOTE: out_buf == in_buf is
   supported */
void qcow2_encrypt_sectors(BDRVQcowState *s, int64_t sector_num,
                           uint8_t *out_buf, const uint8_t *in_buf,
                           int nb_sectors, int enc,
                           const AES_KEY *key)
{
    union {
        uint64_t ll[2];
        uint8_t b[16];
    } ivec;
    int i;

    for(i = 0; i < nb_sectors; i++) {
        ivec.ll[0] = cpu_to_le64(sector_num);
        ivec.ll[1] = 0;
        AES_cbc_encrypt(in_buf, out_buf, 512, key,
                        ivec.b, enc);
        sector_num++;
        in_buf += 512;
        out_buf += 512;
    }
}


static int qcow_read(BlockDriverState *bs, int64_t sector_num,
                     uint8_t *buf, int nb_sectors)
{
    BDRVQcowState *s = bs->opaque;
    int ret, index_in_cluster, n, n1;
    uint64_t cluster_offset;
    struct iovec iov;
    QEMUIOVector qiov;

    while (nb_sectors > 0) {
        n = nb_sectors;

        ret = qcow2_get_cluster_offset(bs, sector_num << 9, &n,
            &cluster_offset);
        if (ret < 0) {
            return ret;
        }

        index_in_cluster = sector_num & (s->cluster_sectors - 1);
        if (!cluster_offset) {
            if (bs->backing_hd) {
                /* read from the base image */
                iov.iov_base = buf;
                iov.iov_len = n * 512;
                qemu_iovec_init_external(&qiov, &iov, 1);

                n1 = qcow2_backing_read1(bs->backing_hd, &qiov, sector_num, n);
                if (n1 > 0) {
                    BLKDBG_EVENT(bs->file, BLKDBG_READ_BACKING);
                    ret = bdrv_read(bs->backing_hd, sector_num, buf, n1);
                    if (ret < 0)
                        return -1;
                }
            } else {
                memset(buf, 0, 512 * n);
            }
        } else if (cluster_offset & QCOW_OFLAG_COMPRESSED) {
            if (qcow2_decompress_cluster(bs, cluster_offset) < 0)
                return -1;
            memcpy(buf, s->cluster_cache + index_in_cluster * 512, 512 * n);
        } else {
            BLKDBG_EVENT(bs->file, BLKDBG_READ);
            ret = bdrv_pread(bs->file, cluster_offset + index_in_cluster * 512,
                buf, n * 512);
            if (ret != n * 512)
                return -1;
            if (s->crypt_method) {
                qcow2_encrypt_sectors(s, sector_num, buf, buf, n, 0,
                                &s->aes_decrypt_key);
            }
        }
        nb_sectors -= n;
        sector_num += n;
        buf += n * 512;
    }
    return 0;
}

static int copy_sectors(BlockDriverState *bs, uint64_t start_sect,
                        uint64_t cluster_offset, int n_start, int n_end)
{
    BDRVQcowState *s = bs->opaque;
    int n, ret;

    n = n_end - n_start;
    if (n <= 0)
        return 0;
    BLKDBG_EVENT(bs->file, BLKDBG_COW_READ);
    ret = qcow_read(bs, start_sect + n_start, s->cluster_data, n);
    if (ret < 0)
        return ret;
    if (s->crypt_method) {
        qcow2_encrypt_sectors(s, start_sect + n_start,
                        s->cluster_data,
                        s->cluster_data, n, 1,
                        &s->aes_encrypt_key);
    }
    BLKDBG_EVENT(bs->file, BLKDBG_COW_WRITE);
    ret = bdrv_write(bs->file, (cluster_offset >> 9) + n_start,
        s->cluster_data, n);
    if (ret < 0)
        return ret;
    return 0;
}


/*
 * get_cluster_offset
 *
 * For a given offset of the disk image, find the cluster offset in the
 * qcow2 file. The offset is stored in *cluster_offset.
 *
 * On entry, *num is the number of contiguous sectors we'd like to
 * access following offset.
 *
 * On exit, *num is the number of contiguous sectors we can read.
 *
 * Returns 0 if the offset is found, -errno otherwise.
 */
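
/*
 * A worked example, assuming 64 KB clusters (cluster_bits = 16, and thus
 * l2_bits = 13, since an L2 table holds cluster_size / 8 entries):
 *
 *   l1_index         = offset >> (13 + 16)
 *   l2_index         = (offset >> 16) & ((1 << 13) - 1)
 *   index_in_cluster = (offset >> 9) & (s->cluster_sectors - 1)
 *
 * Each L1 entry then covers 1 << 29 bytes (512 MB) of guest address space.
 */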

int qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset,
    int *num, uint64_t *cluster_offset)
{
    BDRVQcowState *s = bs->opaque;
    unsigned int l1_index, l2_index;
    uint64_t l2_offset, *l2_table;
    int l1_bits, c;
    unsigned int index_in_cluster, nb_clusters;
    uint64_t nb_available, nb_needed;
    int ret;

    index_in_cluster = (offset >> 9) & (s->cluster_sectors - 1);
    nb_needed = *num + index_in_cluster;

    l1_bits = s->l2_bits + s->cluster_bits;

    /* compute how many bytes there are between the offset and
     * the end of the l1 entry
     */

    nb_available = (1ULL << l1_bits) - (offset & ((1ULL << l1_bits) - 1));

    /* compute the number of available sectors */

    nb_available = (nb_available >> 9) + index_in_cluster;

    if (nb_needed > nb_available) {
        nb_needed = nb_available;
    }

    *cluster_offset = 0;

    /* seek the l2 offset in the l1 table */

    l1_index = offset >> l1_bits;
    if (l1_index >= s->l1_size)
        goto out;

    l2_offset = s->l1_table[l1_index];

    /* seek the l2 table of the given l2 offset */

    if (!l2_offset)
        goto out;

    /* load the l2 table in memory */

    l2_offset &= ~QCOW_OFLAG_COPIED;
    ret = l2_load(bs, l2_offset, &l2_table);
    if (ret < 0) {
        return ret;
    }

    /* find the cluster offset for the given disk offset */

    l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1);
    *cluster_offset = be64_to_cpu(l2_table[l2_index]);
    nb_clusters = size_to_clusters(s, nb_needed << 9);

    if (!*cluster_offset) {
        /* how many empty clusters? */
        c = count_contiguous_free_clusters(nb_clusters, &l2_table[l2_index]);
    } else {
        /* how many allocated clusters? */
        c = count_contiguous_clusters(nb_clusters, s->cluster_size,
                &l2_table[l2_index], 0, QCOW_OFLAG_COPIED);
    }

    nb_available = (c * s->cluster_sectors);
out:
    if (nb_available > nb_needed)
        nb_available = nb_needed;

    *num = nb_available - index_in_cluster;

    *cluster_offset &= ~QCOW_OFLAG_COPIED;
    return 0;
}

/*
 * get_cluster_table
 *
 * For a given disk offset, load (and allocate if needed) the L2 table.
 *
 * The L2 table offset in the qcow2 file and the cluster index in the L2
 * table are given to the caller.
 *
 * Returns 0 on success, -errno in failure case
 */
static int get_cluster_table(BlockDriverState *bs, uint64_t offset,
                             uint64_t **new_l2_table,
                             uint64_t *new_l2_offset,
                             int *new_l2_index)
{
    BDRVQcowState *s = bs->opaque;
    unsigned int l1_index, l2_index;
    uint64_t l2_offset;
    uint64_t *l2_table = NULL;
    int ret;

    /* seek the l2 offset in the l1 table */

    l1_index = offset >> (s->l2_bits + s->cluster_bits);
    if (l1_index >= s->l1_size) {
        ret = qcow2_grow_l1_table(bs, l1_index + 1, false);
        if (ret < 0) {
            return ret;
        }
    }
    l2_offset = s->l1_table[l1_index];

    /* seek the l2 table of the given l2 offset */

    if (l2_offset & QCOW_OFLAG_COPIED) {
        /* load the l2 table in memory */
        l2_offset &= ~QCOW_OFLAG_COPIED;
        ret = l2_load(bs, l2_offset, &l2_table);
        if (ret < 0) {
            return ret;
        }
    } else {
        if (l2_offset)
            qcow2_free_clusters(bs, l2_offset, s->l2_size * sizeof(uint64_t));
        ret = l2_allocate(bs, l1_index, &l2_table);
        if (ret < 0) {
            return ret;
        }
        l2_offset = s->l1_table[l1_index] & ~QCOW_OFLAG_COPIED;
    }

    /* find the cluster offset for the given disk offset */

    l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1);

    *new_l2_table = l2_table;
    *new_l2_offset = l2_offset;
    *new_l2_index = l2_index;

    return 0;
}

/*
 * alloc_compressed_cluster_offset
 *
 * For a given offset of the disk image, return the cluster offset in the
 * qcow2 file.
 *
 * If the offset is not found, allocate a new compressed cluster.
 *
 * Returns the cluster offset if successful, 0 otherwise.
 */

uint64_t qcow2_alloc_compressed_cluster_offset(BlockDriverState *bs,
                                               uint64_t offset,
                                               int compressed_size)
{
    BDRVQcowState *s = bs->opaque;
    int l2_index, ret;
    uint64_t l2_offset, *l2_table;
    int64_t cluster_offset;
    int nb_csectors;

    ret = get_cluster_table(bs, offset, &l2_table, &l2_offset, &l2_index);
    if (ret < 0) {
        return 0;
    }

    cluster_offset = be64_to_cpu(l2_table[l2_index]);
    if (cluster_offset & QCOW_OFLAG_COPIED)
        return cluster_offset & ~QCOW_OFLAG_COPIED;

    if (cluster_offset)
        qcow2_free_any_clusters(bs, cluster_offset, 1);

    cluster_offset = qcow2_alloc_bytes(bs, compressed_size);
    if (cluster_offset < 0) {
        return 0;
    }

    nb_csectors = ((cluster_offset + compressed_size - 1) >> 9) -
                  (cluster_offset >> 9);

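    /*
     * Note the encoding: nb_csectors as computed above is the difference
     * of the first and last 512-byte sector indexes spanned, i.e. the
     * sector count minus one, which is why qcow2_decompress_cluster()
     * adds 1 back when reading the field. csize_shift itself is derived
     * from cluster_bits when the image is opened (62 - (cluster_bits - 8)
     * in this qcow2 version).
     */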
    cluster_offset |= QCOW_OFLAG_COMPRESSED |
                      ((uint64_t)nb_csectors << s->csize_shift);

    /* update L2 table */

    /* compressed clusters never have the copied flag */

    BLKDBG_EVENT(bs->file, BLKDBG_L2_UPDATE_COMPRESSED);
    l2_table[l2_index] = cpu_to_be64(cluster_offset);
    if (bdrv_pwrite_sync(bs->file,
                    l2_offset + l2_index * sizeof(uint64_t),
                    l2_table + l2_index,
                    sizeof(uint64_t)) < 0)
        return 0;

    return cluster_offset;
}

/*
 * Write L2 table updates to disk, writing whole sectors to avoid a
 * read-modify-write in bdrv_pwrite
 */
#define L2_ENTRIES_PER_SECTOR (512 / 8)
static int write_l2_entries(BlockDriverState *bs, uint64_t *l2_table,
    uint64_t l2_offset, int l2_index, int num)
{
    int l2_start_index = l2_index & ~(L2_ENTRIES_PER_SECTOR - 1);
    int start_offset = (8 * l2_index) & ~511;
    int end_offset = (8 * (l2_index + num) + 511) & ~511;
    size_t len = end_offset - start_offset;
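    /*
     * Worked example: for l2_index 100 and num 10, start_offset is 512 and
     * end_offset is 1024, so the single sector holding entries 64..127 is
     * rewritten; a run crossing a sector boundary simply widens len to
     * cover every touched sector.
     */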
    int ret;

    BLKDBG_EVENT(bs->file, BLKDBG_L2_UPDATE);
    ret = bdrv_pwrite(bs->file, l2_offset + start_offset,
        &l2_table[l2_start_index], len);
    if (ret < 0) {
        return ret;
    }

    return 0;
}

int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m)
{
    BDRVQcowState *s = bs->opaque;
    int i, j = 0, l2_index, ret;
    uint64_t *old_cluster, start_sect, l2_offset, *l2_table;
    uint64_t cluster_offset = m->cluster_offset;

    if (m->nb_clusters == 0)
        return 0;

    old_cluster = qemu_malloc(m->nb_clusters * sizeof(uint64_t));

    /* copy content of unmodified sectors */
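    /*
     * For illustration, with 128-sector clusters: a write of sectors 3..4
     * of an unallocated cluster arrives here with n_start = 3 and
     * nb_available = 5, so sectors 0..2 are copied below and sectors
     * 5..127 by the tail copy that follows.
     */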
    start_sect = (m->offset & ~(s->cluster_size - 1)) >> 9;
    if (m->n_start) {
        ret = copy_sectors(bs, start_sect, cluster_offset, 0, m->n_start);
        if (ret < 0)
            goto err;
    }

    if (m->nb_available & (s->cluster_sectors - 1)) {
        uint64_t end = m->nb_available & ~(uint64_t)(s->cluster_sectors - 1);
        ret = copy_sectors(bs, start_sect + end, cluster_offset + (end << 9),
                m->nb_available - end, s->cluster_sectors);
        if (ret < 0)
            goto err;
    }

    /* update L2 table */
    ret = get_cluster_table(bs, m->offset, &l2_table, &l2_offset, &l2_index);
    if (ret < 0) {
        goto err;
    }

    for (i = 0; i < m->nb_clusters; i++) {
        /* If two concurrent writes happen to the same unallocated cluster,
         * each write allocates a separate cluster and writes its data
         * concurrently. The first one to complete updates the L2 table with
         * a pointer to its cluster; the second one has to do a RMW (done
         * above by copy_sectors()), update the L2 table with its own cluster
         * pointer, and free the old cluster. This is what this loop does. */
        if (l2_table[l2_index + i] != 0)
            old_cluster[j++] = l2_table[l2_index + i];

        l2_table[l2_index + i] = cpu_to_be64((cluster_offset +
                    (i << s->cluster_bits)) | QCOW_OFLAG_COPIED);
    }

    /*
     * Before we update the L2 table to actually point to the new cluster, we
     * need to be sure that the refcounts have been increased and COW was
     * handled.
     */
    bdrv_flush(bs->file);

    ret = write_l2_entries(bs, l2_table, l2_offset, l2_index, m->nb_clusters);
    if (ret < 0) {
        qcow2_l2_cache_reset(bs);
        goto err;
    }

    /*
     * If this was a COW, we need to decrease the refcount of the old cluster.
     * Also flush bs->file to get the right order for L2 and refcount update.
     */
    if (j != 0) {
        bdrv_flush(bs->file);
        for (i = 0; i < j; i++) {
            qcow2_free_any_clusters(bs,
                be64_to_cpu(old_cluster[i]) & ~QCOW_OFLAG_COPIED, 1);
        }
    }

    ret = 0;
err:
    qemu_free(old_cluster);
    return ret;
}

/*
 * alloc_cluster_offset
 *
 * For a given offset of the disk image, return the cluster offset in the
 * qcow2 file. If the offset is not found, allocate a new cluster.
 *
 * If the cluster was already allocated, m->nb_clusters is set to 0,
 * m->depends_on is set to NULL and the other fields in m are meaningless.
 *
 * If the cluster is newly allocated, m->nb_clusters is set to the number of
 * contiguous clusters that have been allocated. This may be 0 if the request
 * conflicts with another write request in flight; in this case, m->depends_on
 * is set and the remaining fields of m are meaningless.
 *
 * If m->nb_clusters is non-zero, the other fields of m are valid and contain
 * information about the first allocated cluster.
 *
 * Returns 0 on success and -errno in error cases
 */
int qcow2_alloc_cluster_offset(BlockDriverState *bs, uint64_t offset,
    int n_start, int n_end, int *num, QCowL2Meta *m)
{
    BDRVQcowState *s = bs->opaque;
    int l2_index, ret;
    uint64_t l2_offset, *l2_table;
    int64_t cluster_offset;
    unsigned int nb_clusters, i = 0;
    QCowL2Meta *old_alloc;

    ret = get_cluster_table(bs, offset, &l2_table, &l2_offset, &l2_index);
    if (ret < 0) {
        return ret;
    }

    nb_clusters = size_to_clusters(s, n_end << 9);

    nb_clusters = MIN(nb_clusters, s->l2_size - l2_index);

    cluster_offset = be64_to_cpu(l2_table[l2_index]);

    /* We keep all QCOW_OFLAG_COPIED clusters */

    if (cluster_offset & QCOW_OFLAG_COPIED) {
        nb_clusters = count_contiguous_clusters(nb_clusters, s->cluster_size,
                &l2_table[l2_index], 0, 0);

        cluster_offset &= ~QCOW_OFLAG_COPIED;
        m->nb_clusters = 0;
        m->depends_on = NULL;

        goto out;
    }

    /* for the moment, multiple compressed clusters are not managed */

    if (cluster_offset & QCOW_OFLAG_COMPRESSED)
        nb_clusters = 1;

    /* how many available clusters? */

    while (i < nb_clusters) {
        i += count_contiguous_clusters(nb_clusters - i, s->cluster_size,
                &l2_table[l2_index], i, 0);
        if ((i >= nb_clusters) || be64_to_cpu(l2_table[l2_index + i])) {
            break;
        }

        i += count_contiguous_free_clusters(nb_clusters - i,
                &l2_table[l2_index + i]);
        if (i >= nb_clusters) {
            break;
        }

        cluster_offset = be64_to_cpu(l2_table[l2_index + i]);

        if ((cluster_offset & QCOW_OFLAG_COPIED) ||
                (cluster_offset & QCOW_OFLAG_COMPRESSED))
            break;
    }
    assert(i <= nb_clusters);
    nb_clusters = i;

    /*
     * Check if there already is an AIO write request in flight which allocates
     * the same cluster. In this case we need to wait until the previous
     * request has completed and updated the L2 table accordingly.
     */
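    /*
     * For illustration: if a running allocation covers guest offsets
     * [4 MB, 6 MB) and this request starts at 3 MB, the loop below trims
     * nb_clusters so that the new allocation stops at 4 MB. If the request
     * instead starts inside [4 MB, 6 MB), nb_clusters becomes 0 and we
     * return with m->depends_on set so the caller waits for the other
     * request to complete.
     */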
    QLIST_FOREACH(old_alloc, &s->cluster_allocs, next_in_flight) {

        uint64_t end_offset = offset + nb_clusters * s->cluster_size;
        uint64_t old_offset = old_alloc->offset;
        uint64_t old_end_offset = old_alloc->offset +
            old_alloc->nb_clusters * s->cluster_size;

        if (end_offset < old_offset || offset > old_end_offset) {
            /* No intersection */
        } else {
            if (offset < old_offset) {
                /* Stop at the start of a running allocation */
                nb_clusters = (old_offset - offset) >> s->cluster_bits;
            } else {
                nb_clusters = 0;
            }

            if (nb_clusters == 0) {
                /* Set dependency and wait for a callback */
                m->depends_on = old_alloc;
                m->nb_clusters = 0;
                *num = 0;
                return 0;
            }
        }
    }

    if (!nb_clusters) {
        abort();
    }

    QLIST_INSERT_HEAD(&s->cluster_allocs, m, next_in_flight);

    /* allocate a new cluster */

    cluster_offset = qcow2_alloc_clusters(bs, nb_clusters * s->cluster_size);
    if (cluster_offset < 0) {
        QLIST_REMOVE(m, next_in_flight);
        return cluster_offset;
    }

    /* save info needed for meta data update */
    m->offset = offset;
    m->n_start = n_start;
    m->nb_clusters = nb_clusters;

out:
    m->nb_available = MIN(nb_clusters << (s->cluster_bits - 9), n_end);
    m->cluster_offset = cluster_offset;

    *num = m->nb_available - n_start;

    return 0;
}

static int decompress_buffer(uint8_t *out_buf, int out_buf_size,
                             const uint8_t *buf, int buf_size)
{
    z_stream strm1, *strm = &strm1;
    int ret, out_len;

    memset(strm, 0, sizeof(*strm));

    strm->next_in = (uint8_t *)buf;
    strm->avail_in = buf_size;
    strm->next_out = out_buf;
    strm->avail_out = out_buf_size;

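    /*
     * A negative windowBits value selects a raw deflate stream with no
     * zlib header or checksum; -12 implies a 4 KB window and matches the
     * parameters the qcow2 compression path passes to deflateInit2().
     */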
    ret = inflateInit2(strm, -12);
    if (ret != Z_OK)
        return -1;
    ret = inflate(strm, Z_FINISH);
    out_len = strm->next_out - out_buf;
    if ((ret != Z_STREAM_END && ret != Z_BUF_ERROR) ||
        out_len != out_buf_size) {
        inflateEnd(strm);
        return -1;
    }
    inflateEnd(strm);
    return 0;
}

int qcow2_decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset)
{
    BDRVQcowState *s = bs->opaque;
    int ret, csize, nb_csectors, sector_offset;
    uint64_t coffset;

    coffset = cluster_offset & s->cluster_offset_mask;
    if (s->cluster_cache_offset != coffset) {
        nb_csectors = ((cluster_offset >> s->csize_shift) & s->csize_mask) + 1;
        sector_offset = coffset & 511;
        csize = nb_csectors * 512 - sector_offset;
        BLKDBG_EVENT(bs->file, BLKDBG_READ_COMPRESSED);
        ret = bdrv_read(bs->file, coffset >> 9, s->cluster_data, nb_csectors);
        if (ret < 0) {
            return -1;
        }
        if (decompress_buffer(s->cluster_cache, s->cluster_size,
                              s->cluster_data + sector_offset, csize) < 0) {
            return -1;
        }
        s->cluster_cache_offset = coffset;
    }
    return 0;
}
955