/*
 * Block driver for the QCOW version 2 format
 *
 * Copyright (c) 2004-2006 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "qemu/osdep.h"
#include <zlib.h>

#include "block/block-io.h"
#include "qapi/error.h"
#include "qcow2.h"
#include "qemu/bswap.h"
#include "qemu/memalign.h"
#include "trace.h"

int coroutine_fn qcow2_shrink_l1_table(BlockDriverState *bs,
                                       uint64_t exact_size)
{
    BDRVQcow2State *s = bs->opaque;
    int new_l1_size, i, ret;

    if (exact_size >= s->l1_size) {
        return 0;
    }

    new_l1_size = exact_size;

#ifdef DEBUG_ALLOC2
    fprintf(stderr, "shrink l1_table from %d to %d\n", s->l1_size, new_l1_size);
#endif

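    /*
     * Ordering note: the on-disk tail of the L1 table is zeroed and flushed
     * first, and only then are the now-unreferenced L2 clusters freed. A
     * crash in between can leak clusters, but can never leave an L1 entry
     * pointing at a freed cluster.
     */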
    BLKDBG_CO_EVENT(bs->file, BLKDBG_L1_SHRINK_WRITE_TABLE);
    ret = bdrv_co_pwrite_zeroes(bs->file,
                                s->l1_table_offset + new_l1_size * L1E_SIZE,
                                (s->l1_size - new_l1_size) * L1E_SIZE, 0);
    if (ret < 0) {
        goto fail;
    }

    ret = bdrv_co_flush(bs->file->bs);
    if (ret < 0) {
        goto fail;
    }

    BLKDBG_CO_EVENT(bs->file, BLKDBG_L1_SHRINK_FREE_L2_CLUSTERS);
    for (i = s->l1_size - 1; i > new_l1_size - 1; i--) {
        if ((s->l1_table[i] & L1E_OFFSET_MASK) == 0) {
            continue;
        }
        qcow2_free_clusters(bs, s->l1_table[i] & L1E_OFFSET_MASK,
                            s->cluster_size, QCOW2_DISCARD_ALWAYS);
        s->l1_table[i] = 0;
    }
    return 0;

fail:
    /*
     * If writing the L1 table failed, the image may contain a partially
     * overwritten L1 table. In this case it is better to clear the
     * in-memory L1 entries as well, to avoid possible image corruption.
     */
    memset(s->l1_table + new_l1_size, 0,
           (s->l1_size - new_l1_size) * L1E_SIZE);
    return ret;
}

int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size,
                        bool exact_size)
{
    BDRVQcow2State *s = bs->opaque;
    int new_l1_size2, ret, i;
    uint64_t *new_l1_table;
    int64_t old_l1_table_offset, old_l1_size;
    int64_t new_l1_table_offset, new_l1_size;
    uint8_t data[12];

    if (min_size <= s->l1_size)
        return 0;

    /* Do a sanity check on min_size before trying to calculate new_l1_size
     * (this prevents overflows during the while loop for the calculation of
     * new_l1_size) */
    if (min_size > INT_MAX / L1E_SIZE) {
        return -EFBIG;
    }

    if (exact_size) {
        new_l1_size = min_size;
    } else {
        /* Bump size up to reduce the number of times we have to grow */
        new_l1_size = s->l1_size;
        if (new_l1_size == 0) {
            new_l1_size = 1;
        }
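        /* Grow by roughly 3/2 per iteration: 1, 2, 3, 5, 8, 12, ... */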
        while (min_size > new_l1_size) {
            new_l1_size = DIV_ROUND_UP(new_l1_size * 3, 2);
        }
    }

    QEMU_BUILD_BUG_ON(QCOW_MAX_L1_SIZE > INT_MAX);
    if (new_l1_size > QCOW_MAX_L1_SIZE / L1E_SIZE) {
        return -EFBIG;
    }

#ifdef DEBUG_ALLOC2
    fprintf(stderr, "grow l1_table from %d to %" PRId64 "\n",
            s->l1_size, new_l1_size);
#endif

    new_l1_size2 = L1E_SIZE * new_l1_size;
    new_l1_table = qemu_try_blockalign(bs->file->bs, new_l1_size2);
    if (new_l1_table == NULL) {
        return -ENOMEM;
    }
    memset(new_l1_table, 0, new_l1_size2);

    if (s->l1_size) {
        memcpy(new_l1_table, s->l1_table, s->l1_size * L1E_SIZE);
    }

    /* write new table (align to cluster) */
    BLKDBG_EVENT(bs->file, BLKDBG_L1_GROW_ALLOC_TABLE);
    new_l1_table_offset = qcow2_alloc_clusters(bs, new_l1_size2);
    if (new_l1_table_offset < 0) {
        qemu_vfree(new_l1_table);
        return new_l1_table_offset;
    }

    ret = qcow2_cache_flush(bs, s->refcount_block_cache);
    if (ret < 0) {
        goto fail;
    }

    /* the L1 position has not yet been updated, so these clusters must
     * indeed be completely free */
    ret = qcow2_pre_write_overlap_check(bs, 0, new_l1_table_offset,
                                        new_l1_size2, false);
    if (ret < 0) {
        goto fail;
    }

    BLKDBG_EVENT(bs->file, BLKDBG_L1_GROW_WRITE_TABLE);
    for(i = 0; i < s->l1_size; i++)
        new_l1_table[i] = cpu_to_be64(new_l1_table[i]);
    ret = bdrv_pwrite_sync(bs->file, new_l1_table_offset, new_l1_size2,
                           new_l1_table, 0);
    if (ret < 0)
        goto fail;
    for(i = 0; i < s->l1_size; i++)
        new_l1_table[i] = be64_to_cpu(new_l1_table[i]);

    /* set new table */
    BLKDBG_EVENT(bs->file, BLKDBG_L1_GROW_ACTIVATE_TABLE);
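    /*
     * data[] carries the new l1_size (32-bit) immediately followed by the
     * new l1_table_offset (64-bit), both big-endian, matching the layout of
     * those two fields in QCowHeader so a single 12-byte write updates both.
     */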
    stl_be_p(data, new_l1_size);
    stq_be_p(data + 4, new_l1_table_offset);
    ret = bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, l1_size),
                           sizeof(data), data, 0);
    if (ret < 0) {
        goto fail;
    }
    qemu_vfree(s->l1_table);
    old_l1_table_offset = s->l1_table_offset;
    s->l1_table_offset = new_l1_table_offset;
    s->l1_table = new_l1_table;
    old_l1_size = s->l1_size;
    s->l1_size = new_l1_size;
    qcow2_free_clusters(bs, old_l1_table_offset, old_l1_size * L1E_SIZE,
                        QCOW2_DISCARD_OTHER);
    return 0;
fail:
    qemu_vfree(new_l1_table);
    qcow2_free_clusters(bs, new_l1_table_offset, new_l1_size2,
                        QCOW2_DISCARD_OTHER);
    return ret;
}

/*
 * l2_load
 *
 * @bs: The BlockDriverState
 * @offset: A guest offset, used to calculate what slice of the L2
 *          table to load.
 * @l2_offset: Offset to the L2 table in the image file.
 * @l2_slice: Location to store the pointer to the L2 slice.
 *
 * Loads an L2 slice into memory (L2 slices are the parts of L2 tables
 * that are loaded by the qcow2 cache). If the slice is in the cache,
 * the cache is used; otherwise the L2 slice is loaded from the image
 * file.
 */
static int GRAPH_RDLOCK
l2_load(BlockDriverState *bs, uint64_t offset,
        uint64_t l2_offset, uint64_t **l2_slice)
{
    BDRVQcow2State *s = bs->opaque;
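    /*
     * Byte offset of the slice within the L2 table: subtracting the
     * within-slice index from the full-table index leaves the index of the
     * first entry of the slice that contains @offset.
     */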
    int start_of_slice = l2_entry_size(s) *
        (offset_to_l2_index(s, offset) - offset_to_l2_slice_index(s, offset));

    return qcow2_cache_get(bs, s->l2_table_cache, l2_offset + start_of_slice,
                           (void **)l2_slice);
}

/*
 * Writes an L1 entry to disk (note that depending on the alignment
 * requirements this function may write more than just one entry in
 * order to prevent bdrv_pwrite from performing a read-modify-write)
 */
int qcow2_write_l1_entry(BlockDriverState *bs, int l1_index)
{
    BDRVQcow2State *s = bs->opaque;
    int l1_start_index;
    int i, ret;
    int bufsize = MAX(L1E_SIZE,
                      MIN(bs->file->bs->bl.request_alignment, s->cluster_size));
    int nentries = bufsize / L1E_SIZE;
    g_autofree uint64_t *buf = g_try_new0(uint64_t, nentries);

    if (buf == NULL) {
        return -ENOMEM;
    }

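    /*
     * Align the first entry down to the alignment block that contains
     * l1_index; buf is zero-initialized, so any slots past the end of the
     * L1 table are written out as zeroes.
     */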
    l1_start_index = QEMU_ALIGN_DOWN(l1_index, nentries);
    for (i = 0; i < MIN(nentries, s->l1_size - l1_start_index); i++) {
        buf[i] = cpu_to_be64(s->l1_table[l1_start_index + i]);
    }

    ret = qcow2_pre_write_overlap_check(bs, QCOW2_OL_ACTIVE_L1,
            s->l1_table_offset + L1E_SIZE * l1_start_index, bufsize, false);
    if (ret < 0) {
        return ret;
    }

    BLKDBG_EVENT(bs->file, BLKDBG_L1_UPDATE);
    ret = bdrv_pwrite_sync(bs->file,
                           s->l1_table_offset + L1E_SIZE * l1_start_index,
                           bufsize, buf, 0);
    if (ret < 0) {
        return ret;
    }

    return 0;
}

/*
 * l2_allocate
 *
 * Allocate a new l2 entry in the file. If l1_index points to an already
 * used entry in the L1 table (i.e. we are doing a copy on write for the L2
 * table) copy the contents of the old L2 table into the newly allocated one.
 * Otherwise the new table is initialized with zeros.
 *
 */

static int GRAPH_RDLOCK l2_allocate(BlockDriverState *bs, int l1_index)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t old_l2_offset;
    uint64_t *l2_slice = NULL;
    unsigned slice, slice_size2, n_slices;
    int64_t l2_offset;
    int ret;

    old_l2_offset = s->l1_table[l1_index];

    trace_qcow2_l2_allocate(bs, l1_index);

    /* allocate a new l2 entry */

    l2_offset = qcow2_alloc_clusters(bs, s->l2_size * l2_entry_size(s));
    if (l2_offset < 0) {
        ret = l2_offset;
        goto fail;
    }

    /* The offset must fit in the offset field of the L1 table entry */
    assert((l2_offset & L1E_OFFSET_MASK) == l2_offset);

    /* If we're allocating the table at offset 0 then something is wrong */
    if (l2_offset == 0) {
        qcow2_signal_corruption(bs, true, -1, -1, "Preventing invalid "
                                "allocation of L2 table at offset 0");
        ret = -EIO;
        goto fail;
    }

    ret = qcow2_cache_flush(bs, s->refcount_block_cache);
    if (ret < 0) {
        goto fail;
    }

    /* allocate a new entry in the l2 cache */

    slice_size2 = s->l2_slice_size * l2_entry_size(s);
    n_slices = s->cluster_size / slice_size2;

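    /*
     * An L2 table can span several cache slices, so the new table is
     * initialized (or copied from the old table) one slice at a time.
     */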
    trace_qcow2_l2_allocate_get_empty(bs, l1_index);
    for (slice = 0; slice < n_slices; slice++) {
        ret = qcow2_cache_get_empty(bs, s->l2_table_cache,
                                    l2_offset + slice * slice_size2,
                                    (void **) &l2_slice);
        if (ret < 0) {
            goto fail;
        }

        if ((old_l2_offset & L1E_OFFSET_MASK) == 0) {
            /* if there was no old l2 table, clear the new slice */
            memset(l2_slice, 0, slice_size2);
        } else {
            uint64_t *old_slice;
            uint64_t old_l2_slice_offset =
                (old_l2_offset & L1E_OFFSET_MASK) + slice * slice_size2;

            /* if there was an old l2 table, read a slice from the disk */
            BLKDBG_EVENT(bs->file, BLKDBG_L2_ALLOC_COW_READ);
            ret = qcow2_cache_get(bs, s->l2_table_cache, old_l2_slice_offset,
                                  (void **) &old_slice);
            if (ret < 0) {
                goto fail;
            }

            memcpy(l2_slice, old_slice, slice_size2);

            qcow2_cache_put(s->l2_table_cache, (void **) &old_slice);
        }

        /* write the l2 slice to the file */
        BLKDBG_EVENT(bs->file, BLKDBG_L2_ALLOC_WRITE);

        trace_qcow2_l2_allocate_write_l2(bs, l1_index);
        qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_slice);
        qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
    }

    ret = qcow2_cache_flush(bs, s->l2_table_cache);
    if (ret < 0) {
        goto fail;
    }

    /* update the L1 entry */
    trace_qcow2_l2_allocate_write_l1(bs, l1_index);
    s->l1_table[l1_index] = l2_offset | QCOW_OFLAG_COPIED;
    ret = qcow2_write_l1_entry(bs, l1_index);
    if (ret < 0) {
        goto fail;
    }

    trace_qcow2_l2_allocate_done(bs, l1_index, 0);
    return 0;

fail:
    trace_qcow2_l2_allocate_done(bs, l1_index, ret);
    if (l2_slice != NULL) {
        qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
    }
    s->l1_table[l1_index] = old_l2_offset;
    if (l2_offset > 0) {
        qcow2_free_clusters(bs, l2_offset, s->l2_size * l2_entry_size(s),
                            QCOW2_DISCARD_ALWAYS);
    }
    return ret;
}

/*
 * For a given L2 entry, count the number of contiguous subclusters of
 * the same type starting from @sc_from. Compressed clusters are
 * treated as if they were divided into subclusters of size
 * s->subcluster_size.
 *
 * Return the number of contiguous subclusters and set @type to the
 * subcluster type.
 *
 * If the L2 entry is invalid return -errno and set @type to
 * QCOW2_SUBCLUSTER_INVALID.
 */
static int GRAPH_RDLOCK
qcow2_get_subcluster_range_type(BlockDriverState *bs, uint64_t l2_entry,
                                uint64_t l2_bitmap, unsigned sc_from,
                                QCow2SubclusterType *type)
{
    BDRVQcow2State *s = bs->opaque;
    uint32_t val;

    *type = qcow2_get_subcluster_type(bs, l2_entry, l2_bitmap, sc_from);

    if (*type == QCOW2_SUBCLUSTER_INVALID) {
        return -EINVAL;
    } else if (!has_subclusters(s) || *type == QCOW2_SUBCLUSTER_COMPRESSED) {
        return s->subclusters_per_cluster - sc_from;
    }

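    /*
     * In an extended L2 entry the low 32 bits of l2_bitmap are the
     * "allocated" flags and the high 32 bits the "reads as zero" flags,
     * one bit per subcluster. Setting the bits below sc_from to the type
     * being counted lets cto32()/ctz32() return the length of the run
     * starting at sc_from in a single operation.
     */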
    switch (*type) {
    case QCOW2_SUBCLUSTER_NORMAL:
        val = l2_bitmap | QCOW_OFLAG_SUB_ALLOC_RANGE(0, sc_from);
        return cto32(val) - sc_from;

    case QCOW2_SUBCLUSTER_ZERO_PLAIN:
    case QCOW2_SUBCLUSTER_ZERO_ALLOC:
        val = (l2_bitmap | QCOW_OFLAG_SUB_ZERO_RANGE(0, sc_from)) >> 32;
        return cto32(val) - sc_from;

    case QCOW2_SUBCLUSTER_UNALLOCATED_PLAIN:
    case QCOW2_SUBCLUSTER_UNALLOCATED_ALLOC:
        val = ((l2_bitmap >> 32) | l2_bitmap)
            & ~QCOW_OFLAG_SUB_ALLOC_RANGE(0, sc_from);
        return ctz32(val) - sc_from;

    default:
        g_assert_not_reached();
    }
}

/*
 * Return the number of contiguous subclusters of the exact same type
 * in a given L2 slice, starting from cluster @l2_index, subcluster
 * @sc_index. Allocated subclusters are required to be contiguous in
 * the image file.
 * At most @nb_clusters are checked (note that this means clusters,
 * not subclusters).
 * Compressed clusters are always processed one by one but for the
 * purpose of this count they are treated as if they were divided into
 * subclusters of size s->subcluster_size.
 * On failure return -errno and update @l2_index to point to the
 * invalid entry.
 */
static int GRAPH_RDLOCK
count_contiguous_subclusters(BlockDriverState *bs, int nb_clusters,
                             unsigned sc_index, uint64_t *l2_slice,
                             unsigned *l2_index)
{
    BDRVQcow2State *s = bs->opaque;
    int i, count = 0;
    bool check_offset = false;
    uint64_t expected_offset = 0;
    QCow2SubclusterType expected_type = QCOW2_SUBCLUSTER_NORMAL, type;

    assert(*l2_index + nb_clusters <= s->l2_slice_size);

    for (i = 0; i < nb_clusters; i++) {
        unsigned first_sc = (i == 0) ? sc_index : 0;
        uint64_t l2_entry = get_l2_entry(s, l2_slice, *l2_index + i);
        uint64_t l2_bitmap = get_l2_bitmap(s, l2_slice, *l2_index + i);
        int ret = qcow2_get_subcluster_range_type(bs, l2_entry, l2_bitmap,
                                                  first_sc, &type);
        if (ret < 0) {
            *l2_index += i; /* Point to the invalid entry */
            return -EIO;
        }
        if (i == 0) {
            if (type == QCOW2_SUBCLUSTER_COMPRESSED) {
                /* Compressed clusters are always processed one by one */
                return ret;
            }
            expected_type = type;
            expected_offset = l2_entry & L2E_OFFSET_MASK;
            check_offset = (type == QCOW2_SUBCLUSTER_NORMAL ||
                            type == QCOW2_SUBCLUSTER_ZERO_ALLOC ||
                            type == QCOW2_SUBCLUSTER_UNALLOCATED_ALLOC);
        } else if (type != expected_type) {
            break;
        } else if (check_offset) {
            expected_offset += s->cluster_size;
            if (expected_offset != (l2_entry & L2E_OFFSET_MASK)) {
                break;
            }
        }
        count += ret;
        /* Stop if there are type changes before the end of the cluster */
        if (first_sc + ret < s->subclusters_per_cluster) {
            break;
        }
    }

    return count;
}

static int coroutine_fn GRAPH_RDLOCK
do_perform_cow_read(BlockDriverState *bs, uint64_t src_cluster_offset,
                    unsigned offset_in_cluster, QEMUIOVector *qiov)
{
    int ret;

    if (qiov->size == 0) {
        return 0;
    }

    BLKDBG_CO_EVENT(bs->file, BLKDBG_COW_READ);

    if (!bs->drv) {
        return -ENOMEDIUM;
    }

    /*
     * We never deal with requests that don't satisfy
     * bdrv_check_qiov_request(), and aligning requests to clusters never
     * breaks this condition. So, do some assertions before calling
     * bs->drv->bdrv_co_preadv_part() which has int64_t arguments.
     */
    assert(src_cluster_offset <= INT64_MAX);
    assert(src_cluster_offset + offset_in_cluster <= INT64_MAX);
    /* Cast qiov->size to uint64_t to silence a compiler warning on -m32 */
    assert((uint64_t)qiov->size <= INT64_MAX);
    bdrv_check_qiov_request(src_cluster_offset + offset_in_cluster, qiov->size,
                            qiov, 0, &error_abort);
    /*
     * Call .bdrv_co_preadv_part() directly instead of using the public
     * block-layer interface. This avoids double I/O throttling and request
     * tracking, which can lead to deadlock when block layer copy-on-read is
     * enabled.
     */
    ret = bs->drv->bdrv_co_preadv_part(bs,
                                       src_cluster_offset + offset_in_cluster,
                                       qiov->size, qiov, 0, 0);
    if (ret < 0) {
        return ret;
    }

    return 0;
}

static int coroutine_fn GRAPH_RDLOCK
do_perform_cow_write(BlockDriverState *bs, uint64_t cluster_offset,
                     unsigned offset_in_cluster, QEMUIOVector *qiov)
{
    BDRVQcow2State *s = bs->opaque;
    int ret;

    if (qiov->size == 0) {
        return 0;
    }

    ret = qcow2_pre_write_overlap_check(bs, 0,
            cluster_offset + offset_in_cluster, qiov->size, true);
    if (ret < 0) {
        return ret;
    }

    BLKDBG_CO_EVENT(bs->file, BLKDBG_COW_WRITE);
    ret = bdrv_co_pwritev(s->data_file, cluster_offset + offset_in_cluster,
                          qiov->size, qiov, 0);
    if (ret < 0) {
        return ret;
    }

    return 0;
}


/*
 * get_host_offset
 *
 * For a given offset of the virtual disk find the equivalent host
 * offset in the qcow2 file and store it in *host_offset. Neither
 * offset needs to be aligned to a cluster boundary.
 *
 * If the cluster is unallocated then *host_offset will be 0.
 * If the cluster is compressed then *host_offset will contain the l2 entry.
 *
 * On entry, *bytes is the maximum number of contiguous bytes starting at
 * offset that we are interested in.
 *
 * On exit, *bytes is the number of bytes starting at offset that have the same
 * subcluster type and (if applicable) are stored contiguously in the image
 * file. The subcluster type is stored in *subcluster_type.
 * Compressed clusters are always processed one by one.
 *
 * Returns 0 on success, -errno in error cases.
 */
int qcow2_get_host_offset(BlockDriverState *bs, uint64_t offset,
                          unsigned int *bytes, uint64_t *host_offset,
                          QCow2SubclusterType *subcluster_type)
{
    BDRVQcow2State *s = bs->opaque;
    unsigned int l2_index, sc_index;
    uint64_t l1_index, l2_offset, *l2_slice, l2_entry, l2_bitmap;
    int sc;
    unsigned int offset_in_cluster;
    uint64_t bytes_available, bytes_needed, nb_clusters;
    QCow2SubclusterType type;
    int ret;

    offset_in_cluster = offset_into_cluster(s, offset);
    bytes_needed = (uint64_t) *bytes + offset_in_cluster;

    /* compute how many bytes there are between the start of the cluster
     * containing offset and the end of the l2 slice that contains
     * the entry pointing to it */
    bytes_available =
        ((uint64_t) (s->l2_slice_size - offset_to_l2_slice_index(s, offset)))
        << s->cluster_bits;

    if (bytes_needed > bytes_available) {
        bytes_needed = bytes_available;
    }

    *host_offset = 0;

    /* seek to the l2 offset in the l1 table */

    l1_index = offset_to_l1_index(s, offset);
    if (l1_index >= s->l1_size) {
        type = QCOW2_SUBCLUSTER_UNALLOCATED_PLAIN;
        goto out;
    }

    l2_offset = s->l1_table[l1_index] & L1E_OFFSET_MASK;
    if (!l2_offset) {
        type = QCOW2_SUBCLUSTER_UNALLOCATED_PLAIN;
        goto out;
    }

    if (offset_into_cluster(s, l2_offset)) {
        qcow2_signal_corruption(bs, true, -1, -1, "L2 table offset %#" PRIx64
                                " unaligned (L1 index: %#" PRIx64 ")",
                                l2_offset, l1_index);
        return -EIO;
    }

    /* load the l2 slice in memory */

    ret = l2_load(bs, offset, l2_offset, &l2_slice);
    if (ret < 0) {
        return ret;
    }

    /* find the cluster offset for the given disk offset */

    l2_index = offset_to_l2_slice_index(s, offset);
    sc_index = offset_to_sc_index(s, offset);
    l2_entry = get_l2_entry(s, l2_slice, l2_index);
    l2_bitmap = get_l2_bitmap(s, l2_slice, l2_index);

    nb_clusters = size_to_clusters(s, bytes_needed);
    /* bytes_needed <= *bytes + offset_in_cluster, both of which are unsigned
     * integers; the minimum cluster size is 512, so this assertion is always
     * true */
    assert(nb_clusters <= INT_MAX);

    type = qcow2_get_subcluster_type(bs, l2_entry, l2_bitmap, sc_index);
    if (s->qcow_version < 3 && (type == QCOW2_SUBCLUSTER_ZERO_PLAIN ||
                                type == QCOW2_SUBCLUSTER_ZERO_ALLOC)) {
        qcow2_signal_corruption(bs, true, -1, -1, "Zero cluster entry found"
                                " in pre-v3 image (L2 offset: %#" PRIx64
                                ", L2 index: %#x)", l2_offset, l2_index);
        ret = -EIO;
        goto fail;
    }
    switch (type) {
    case QCOW2_SUBCLUSTER_INVALID:
        break; /* This is handled by count_contiguous_subclusters() below */
    case QCOW2_SUBCLUSTER_COMPRESSED:
        if (has_data_file(bs)) {
            qcow2_signal_corruption(bs, true, -1, -1, "Compressed cluster "
                                    "entry found in image with external data "
                                    "file (L2 offset: %#" PRIx64 ", L2 index: "
                                    "%#x)", l2_offset, l2_index);
            ret = -EIO;
            goto fail;
        }
        *host_offset = l2_entry;
        break;
    case QCOW2_SUBCLUSTER_ZERO_PLAIN:
    case QCOW2_SUBCLUSTER_UNALLOCATED_PLAIN:
        break;
    case QCOW2_SUBCLUSTER_ZERO_ALLOC:
    case QCOW2_SUBCLUSTER_NORMAL:
    case QCOW2_SUBCLUSTER_UNALLOCATED_ALLOC: {
        uint64_t host_cluster_offset = l2_entry & L2E_OFFSET_MASK;
        *host_offset = host_cluster_offset + offset_in_cluster;
        if (offset_into_cluster(s, host_cluster_offset)) {
            qcow2_signal_corruption(bs, true, -1, -1,
                                    "Cluster allocation offset %#"
                                    PRIx64 " unaligned (L2 offset: %#" PRIx64
                                    ", L2 index: %#x)", host_cluster_offset,
                                    l2_offset, l2_index);
            ret = -EIO;
            goto fail;
        }
        if (has_data_file(bs) && *host_offset != offset) {
            qcow2_signal_corruption(bs, true, -1, -1,
                                    "External data file host cluster offset %#"
                                    PRIx64 " does not match guest cluster "
                                    "offset: %#" PRIx64
                                    ", L2 index: %#x)", host_cluster_offset,
                                    offset - offset_in_cluster, l2_index);
            ret = -EIO;
            goto fail;
        }
        break;
    }
    default:
        abort();
    }

    sc = count_contiguous_subclusters(bs, nb_clusters, sc_index,
                                      l2_slice, &l2_index);
    if (sc < 0) {
        qcow2_signal_corruption(bs, true, -1, -1, "Invalid cluster entry found "
                                " (L2 offset: %#" PRIx64 ", L2 index: %#x)",
                                l2_offset, l2_index);
        ret = -EIO;
        goto fail;
    }
    qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);

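    /*
     * sc subclusters were counted starting at sc_index, so sc_index + sc is
     * the run length measured from the start of the cluster; convert it to
     * bytes (offset_in_cluster is subtracted below).
     */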
    bytes_available = ((int64_t)sc + sc_index) << s->subcluster_bits;

out:
    if (bytes_available > bytes_needed) {
        bytes_available = bytes_needed;
    }

    /* bytes_available <= bytes_needed <= *bytes + offset_in_cluster;
     * subtracting offset_in_cluster will therefore definitely yield something
     * not exceeding UINT_MAX */
    assert(bytes_available - offset_in_cluster <= UINT_MAX);
    *bytes = bytes_available - offset_in_cluster;

    *subcluster_type = type;

    return 0;

fail:
    qcow2_cache_put(s->l2_table_cache, (void **)&l2_slice);
    return ret;
}

/*
 * get_cluster_table
 *
 * for a given disk offset, load (and allocate if needed)
 * the appropriate slice of its l2 table.
 *
 * the cluster index in the l2 slice is given to the caller.
 *
 * Returns 0 on success, -errno in failure case
 */
static int GRAPH_RDLOCK
get_cluster_table(BlockDriverState *bs, uint64_t offset,
                  uint64_t **new_l2_slice, int *new_l2_index)
{
    BDRVQcow2State *s = bs->opaque;
    unsigned int l2_index;
    uint64_t l1_index, l2_offset;
    uint64_t *l2_slice = NULL;
    int ret;

    /* seek to the l2 offset in the l1 table */

    l1_index = offset_to_l1_index(s, offset);
    if (l1_index >= s->l1_size) {
        ret = qcow2_grow_l1_table(bs, l1_index + 1, false);
        if (ret < 0) {
            return ret;
        }
    }

    assert(l1_index < s->l1_size);
    l2_offset = s->l1_table[l1_index] & L1E_OFFSET_MASK;
    if (offset_into_cluster(s, l2_offset)) {
        qcow2_signal_corruption(bs, true, -1, -1, "L2 table offset %#" PRIx64
                                " unaligned (L1 index: %#" PRIx64 ")",
                                l2_offset, l1_index);
        return -EIO;
    }

    if (!(s->l1_table[l1_index] & QCOW_OFLAG_COPIED)) {
        /* First allocate a new L2 table (and do COW if needed) */
        ret = l2_allocate(bs, l1_index);
        if (ret < 0) {
            return ret;
        }

        /* Then decrease the refcount of the old table */
        if (l2_offset) {
            qcow2_free_clusters(bs, l2_offset, s->l2_size * l2_entry_size(s),
                                QCOW2_DISCARD_OTHER);
        }

        /* Get the offset of the newly-allocated l2 table */
        l2_offset = s->l1_table[l1_index] & L1E_OFFSET_MASK;
        assert(offset_into_cluster(s, l2_offset) == 0);
    }

    /* load the l2 slice in memory */
    ret = l2_load(bs, offset, l2_offset, &l2_slice);
    if (ret < 0) {
        return ret;
    }

    /* find the cluster offset for the given disk offset */

    l2_index = offset_to_l2_slice_index(s, offset);

    *new_l2_slice = l2_slice;
    *new_l2_index = l2_index;

    return 0;
}

/*
 * alloc_compressed_cluster_offset
 *
 * For a given offset on the virtual disk, allocate a new compressed cluster
 * and put the host offset of the cluster into *host_offset. If a cluster is
 * already allocated at the offset, return an error.
 *
 * Return 0 on success and -errno in error cases
 */
int coroutine_fn GRAPH_RDLOCK
qcow2_alloc_compressed_cluster_offset(BlockDriverState *bs, uint64_t offset,
                                      int compressed_size, uint64_t *host_offset)
{
    BDRVQcow2State *s = bs->opaque;
    int l2_index, ret;
    uint64_t *l2_slice;
    int64_t cluster_offset;
    int nb_csectors;

    if (has_data_file(bs)) {
        return 0;
    }

    ret = get_cluster_table(bs, offset, &l2_slice, &l2_index);
    if (ret < 0) {
        return ret;
    }

    /* Compression can't overwrite anything. Fail if the cluster was already
     * allocated. */
    cluster_offset = get_l2_entry(s, l2_slice, l2_index);
    if (cluster_offset & L2E_OFFSET_MASK) {
        qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
        return -EIO;
    }

    cluster_offset = qcow2_alloc_bytes(bs, compressed_size);
    if (cluster_offset < 0) {
        qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
        return cluster_offset;
    }

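    /*
     * In a compressed cluster descriptor the low bits hold the host offset
     * of the data and the bits above it (s->csize_shift and up) hold the
     * number of additional 512-byte sectors occupied beyond the sector
     * that contains the start of the data.
     */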
    nb_csectors =
        (cluster_offset + compressed_size - 1) / QCOW2_COMPRESSED_SECTOR_SIZE -
        (cluster_offset / QCOW2_COMPRESSED_SECTOR_SIZE);

    /* The offset and size must fit in their fields of the L2 table entry */
    assert((cluster_offset & s->cluster_offset_mask) == cluster_offset);
    assert((nb_csectors & s->csize_mask) == nb_csectors);

    cluster_offset |= QCOW_OFLAG_COMPRESSED |
                      ((uint64_t)nb_csectors << s->csize_shift);

    /* update L2 table */

    /* compressed clusters never have the copied flag */

    BLKDBG_CO_EVENT(bs->file, BLKDBG_L2_UPDATE_COMPRESSED);
    qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_slice);
    set_l2_entry(s, l2_slice, l2_index, cluster_offset);
    if (has_subclusters(s)) {
        set_l2_bitmap(s, l2_slice, l2_index, 0);
    }
    qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);

    *host_offset = cluster_offset & s->cluster_offset_mask;
    return 0;
}

static int coroutine_fn GRAPH_RDLOCK
perform_cow(BlockDriverState *bs, QCowL2Meta *m)
{
    BDRVQcow2State *s = bs->opaque;
    Qcow2COWRegion *start = &m->cow_start;
    Qcow2COWRegion *end = &m->cow_end;
    unsigned buffer_size;
    unsigned data_bytes = end->offset - (start->offset + start->nb_bytes);
    bool merge_reads;
    uint8_t *start_buffer, *end_buffer;
    QEMUIOVector qiov;
    int ret;

    assert(start->nb_bytes <= UINT_MAX - end->nb_bytes);
    assert(start->nb_bytes + end->nb_bytes <= UINT_MAX - data_bytes);
    assert(start->offset + start->nb_bytes <= end->offset);

    if ((start->nb_bytes == 0 && end->nb_bytes == 0) || m->skip_cow) {
        return 0;
    }

    /* If we have to read both the start and end COW regions and the
     * middle region is not too large then perform just one read
     * operation */
    merge_reads = start->nb_bytes && end->nb_bytes && data_bytes <= 16384;
    if (merge_reads) {
        buffer_size = start->nb_bytes + data_bytes + end->nb_bytes;
    } else {
        /* If we have to do two reads, add some padding in the middle
         * if necessary to make sure that the end region is optimally
         * aligned. */
        size_t align = bdrv_opt_mem_align(bs);
        assert(align > 0 && align <= UINT_MAX);
        assert(QEMU_ALIGN_UP(start->nb_bytes, align) <=
               UINT_MAX - end->nb_bytes);
        buffer_size = QEMU_ALIGN_UP(start->nb_bytes, align) + end->nb_bytes;
    }

    /* Reserve a buffer large enough to store all the data that we're
     * going to read */
    start_buffer = qemu_try_blockalign(bs, buffer_size);
    if (start_buffer == NULL) {
        return -ENOMEM;
    }
    /* The part of the buffer where the end region is located */
    end_buffer = start_buffer + buffer_size - end->nb_bytes;
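    /*
     * Resulting buffer layout (the end region always sits at the tail):
     *   merge_reads: [ start region | middle data | end region ]
     *   two reads:   [ start region | padding     | end region ]
     */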

    qemu_iovec_init(&qiov, 2 + (m->data_qiov ?
                                qemu_iovec_subvec_niov(m->data_qiov,
                                                       m->data_qiov_offset,
                                                       data_bytes)
                                : 0));

    qemu_co_mutex_unlock(&s->lock);
    /* First we read the existing data from both COW regions. We
     * either read the whole region in one go, or the start and end
     * regions separately. */
    if (merge_reads) {
        qemu_iovec_add(&qiov, start_buffer, buffer_size);
        ret = do_perform_cow_read(bs, m->offset, start->offset, &qiov);
    } else {
        qemu_iovec_add(&qiov, start_buffer, start->nb_bytes);
        ret = do_perform_cow_read(bs, m->offset, start->offset, &qiov);
        if (ret < 0) {
            goto fail;
        }

        qemu_iovec_reset(&qiov);
        qemu_iovec_add(&qiov, end_buffer, end->nb_bytes);
        ret = do_perform_cow_read(bs, m->offset, end->offset, &qiov);
    }
    if (ret < 0) {
        goto fail;
    }

    /* Encrypt the data if necessary before writing it */
    if (bs->encrypted) {
        ret = qcow2_co_encrypt(bs,
                               m->alloc_offset + start->offset,
                               m->offset + start->offset,
                               start_buffer, start->nb_bytes);
        if (ret < 0) {
            goto fail;
        }

        ret = qcow2_co_encrypt(bs,
                               m->alloc_offset + end->offset,
                               m->offset + end->offset,
                               end_buffer, end->nb_bytes);
        if (ret < 0) {
            goto fail;
        }
    }

    /* And now we can write everything. If we have the guest data we
     * can write everything in one single operation */
    if (m->data_qiov) {
        qemu_iovec_reset(&qiov);
        if (start->nb_bytes) {
            qemu_iovec_add(&qiov, start_buffer, start->nb_bytes);
        }
        qemu_iovec_concat(&qiov, m->data_qiov, m->data_qiov_offset, data_bytes);
        if (end->nb_bytes) {
            qemu_iovec_add(&qiov, end_buffer, end->nb_bytes);
        }
        /* NOTE: we have a write_aio blkdebug event here followed by
         * a cow_write one in do_perform_cow_write(), but there's only
         * one single I/O operation */
        BLKDBG_CO_EVENT(bs->file, BLKDBG_WRITE_AIO);
        ret = do_perform_cow_write(bs, m->alloc_offset, start->offset, &qiov);
    } else {
        /* If there's no guest data then write both COW regions separately */
        qemu_iovec_reset(&qiov);
        qemu_iovec_add(&qiov, start_buffer, start->nb_bytes);
        ret = do_perform_cow_write(bs, m->alloc_offset, start->offset, &qiov);
        if (ret < 0) {
            goto fail;
        }

        qemu_iovec_reset(&qiov);
        qemu_iovec_add(&qiov, end_buffer, end->nb_bytes);
        ret = do_perform_cow_write(bs, m->alloc_offset, end->offset, &qiov);
    }

fail:
    qemu_co_mutex_lock(&s->lock);

    /*
     * Before we update the L2 table to actually point to the new cluster, we
     * need to be sure that the refcounts have been increased and COW was
     * handled.
     */
    if (ret == 0) {
        qcow2_cache_depends_on_flush(s->l2_table_cache);
    }

    qemu_vfree(start_buffer);
    qemu_iovec_destroy(&qiov);
    return ret;
}

int coroutine_fn qcow2_alloc_cluster_link_l2(BlockDriverState *bs,
                                             QCowL2Meta *m)
{
    BDRVQcow2State *s = bs->opaque;
    int i, j = 0, l2_index, ret;
    uint64_t *old_cluster, *l2_slice;
    uint64_t cluster_offset = m->alloc_offset;

    trace_qcow2_cluster_link_l2(qemu_coroutine_self(), m->nb_clusters);
    assert(m->nb_clusters > 0);

    old_cluster = g_try_new(uint64_t, m->nb_clusters);
    if (old_cluster == NULL) {
        ret = -ENOMEM;
        goto err;
    }

    /* copy content of unmodified sectors */
    ret = perform_cow(bs, m);
    if (ret < 0) {
        goto err;
    }

    /* Update L2 table. */
    if (s->use_lazy_refcounts) {
        qcow2_mark_dirty(bs);
    }
    if (qcow2_need_accurate_refcounts(s)) {
        qcow2_cache_set_dependency(bs, s->l2_table_cache,
                                   s->refcount_block_cache);
    }

    ret = get_cluster_table(bs, m->offset, &l2_slice, &l2_index);
    if (ret < 0) {
        goto err;
    }
    qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_slice);

    assert(l2_index + m->nb_clusters <= s->l2_slice_size);
    assert(m->cow_end.offset + m->cow_end.nb_bytes <=
           m->nb_clusters << s->cluster_bits);
    for (i = 0; i < m->nb_clusters; i++) {
        uint64_t offset = cluster_offset + ((uint64_t)i << s->cluster_bits);
107145aba42fSKevin Wolf /* If two concurrent writes happen to the same unallocated cluster,
107245aba42fSKevin Wolf * each write allocates a separate cluster and writes data concurrently.
107345aba42fSKevin Wolf * The first one to complete updates the L2 table with a pointer to its
107445aba42fSKevin Wolf * cluster; the second one has to do RMW (which is done above by
1075aaa4d20bSKevin Wolf * perform_cow()), update the L2 table with its own cluster pointer and
107645aba42fSKevin Wolf * free the old cluster. This is what this loop does. */
107712c6aebeSAlberto Garcia if (get_l2_entry(s, l2_slice, l2_index + i) != 0) {
107812c6aebeSAlberto Garcia old_cluster[j++] = get_l2_entry(s, l2_slice, l2_index + i);
1079aaa4d20bSKevin Wolf }
108045aba42fSKevin Wolf
10813a75a870SAlberto Garcia /* The offset must fit in the offset field of the L2 table entry */
10823a75a870SAlberto Garcia assert((offset & L2E_OFFSET_MASK) == offset);
10833a75a870SAlberto Garcia
108412c6aebeSAlberto Garcia set_l2_entry(s, l2_slice, l2_index + i, offset | QCOW_OFLAG_COPIED);
1085aca00cd9SAlberto Garcia
1086aca00cd9SAlberto Garcia /* Update bitmap with the subclusters that were just written */
108740dee943SAlberto Garcia if (has_subclusters(s) && !m->prealloc) {
1088aca00cd9SAlberto Garcia uint64_t l2_bitmap = get_l2_bitmap(s, l2_slice, l2_index + i);
1089aca00cd9SAlberto Garcia unsigned written_from = m->cow_start.offset;
10903441ad4bSAlberto Garcia unsigned written_to = m->cow_end.offset + m->cow_end.nb_bytes;
1091aca00cd9SAlberto Garcia int first_sc, last_sc;
1092aca00cd9SAlberto Garcia /* Narrow written_from and written_to down to the current cluster */
1093aca00cd9SAlberto Garcia written_from = MAX(written_from, i << s->cluster_bits);
1094aca00cd9SAlberto Garcia written_to = MIN(written_to, (i + 1) << s->cluster_bits);
1095aca00cd9SAlberto Garcia assert(written_from < written_to);
1096aca00cd9SAlberto Garcia first_sc = offset_to_sc_index(s, written_from);
1097aca00cd9SAlberto Garcia last_sc = offset_to_sc_index(s, written_to - 1);
1098aca00cd9SAlberto Garcia l2_bitmap |= QCOW_OFLAG_SUB_ALLOC_RANGE(first_sc, last_sc + 1);
1099aca00cd9SAlberto Garcia l2_bitmap &= ~QCOW_OFLAG_SUB_ZERO_RANGE(first_sc, last_sc + 1);
1100aca00cd9SAlberto Garcia set_l2_bitmap(s, l2_slice, l2_index + i, l2_bitmap);
1101aca00cd9SAlberto Garcia }
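/*
 * Worked example for the bitmap update above (a sketch; the default
 * extended-L2 layout has 32 subclusters per cluster): if this request
 * wrote subclusters 3..10 of the current cluster, then first_sc = 3
 * and last_sc = 10, so bits 3..10 of the "allocated" half of the
 * bitmap are set and the same bits of the "all zeroes" half are
 * cleared.
 */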
110245aba42fSKevin Wolf }
110345aba42fSKevin Wolf
1105a002c0b0SAlberto Garcia qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
110645aba42fSKevin Wolf
11077ec5e6a4SKevin Wolf /*
11087ec5e6a4SKevin Wolf * If this was a COW, we need to decrease the refcount of the old cluster.
11096cfcb9b8SKevin Wolf *
11106cfcb9b8SKevin Wolf * Don't discard clusters that reach a refcount of 0 (e.g. compressed
11116cfcb9b8SKevin Wolf * clusters); the next write will reuse them anyway.
11127ec5e6a4SKevin Wolf */
1113564a6b69SMax Reitz if (!m->keep_old_clusters && j != 0) {
11147ec5e6a4SKevin Wolf for (i = 0; i < j; i++) {
11153fec237fSAlberto Garcia qcow2_free_any_cluster(bs, old_cluster[i], QCOW2_DISCARD_NEVER);
11167ec5e6a4SKevin Wolf }
11177ec5e6a4SKevin Wolf }
111845aba42fSKevin Wolf
111945aba42fSKevin Wolf ret = 0;
112045aba42fSKevin Wolf err:
11217267c094SAnthony Liguori g_free(old_cluster);
112245aba42fSKevin Wolf return ret;
112345aba42fSKevin Wolf }
112445aba42fSKevin Wolf
11258b24cd14SKevin Wolf /**
11268b24cd14SKevin Wolf * Frees the allocated clusters because the request failed and they won't
11278b24cd14SKevin Wolf * actually be linked.
11288b24cd14SKevin Wolf */
1129a39bae4eSPaolo Bonzini void coroutine_fn qcow2_alloc_cluster_abort(BlockDriverState *bs, QCowL2Meta *m)
11308b24cd14SKevin Wolf {
11318b24cd14SKevin Wolf BDRVQcow2State *s = bs->opaque;
11323ede935fSMax Reitz if (!has_data_file(bs) && !m->keep_old_clusters) {
1133c3b6658cSKevin Wolf qcow2_free_clusters(bs, m->alloc_offset,
1134c3b6658cSKevin Wolf m->nb_clusters << s->cluster_bits,
11358b24cd14SKevin Wolf QCOW2_DISCARD_NEVER);
11368b24cd14SKevin Wolf }
1137c3b6658cSKevin Wolf }
11388b24cd14SKevin Wolf
113945aba42fSKevin Wolf /*
11408f91d690SAlberto Garcia * For a given write request, create a new QCowL2Meta structure, add
114157538c86SAlberto Garcia * it to @m and the BDRVQcow2State.cluster_allocs list. If the write
114257538c86SAlberto Garcia * request does not need copy-on-write or changes to the L2 metadata
114357538c86SAlberto Garcia * then this function does nothing.
11448f91d690SAlberto Garcia *
11458f91d690SAlberto Garcia * @host_cluster_offset points to the beginning of the first cluster.
11468f91d690SAlberto Garcia *
11478f91d690SAlberto Garcia * @guest_offset and @bytes indicate the offset and length of the
11488f91d690SAlberto Garcia * request.
11498f91d690SAlberto Garcia *
115057538c86SAlberto Garcia * @l2_slice contains the L2 entries of all clusters involved in this
115157538c86SAlberto Garcia * write request.
115257538c86SAlberto Garcia *
11538f91d690SAlberto Garcia * If @keep_old is true it means that the clusters were already
11548f91d690SAlberto Garcia * allocated and will be overwritten. If false then the clusters are
11558f91d690SAlberto Garcia * new and we have to decrease the reference count of the old ones.
1156d53ec3d8SAlberto Garcia *
1157d53ec3d8SAlberto Garcia * Returns 0 on success, -errno on failure.
11588f91d690SAlberto Garcia */
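/*
 * Example (a sketch with made-up numbers; 64 KiB clusters, no
 * subclusters, cluster unallocated, keep_old == false): a 4 KiB write
 * at guest offset 68 KiB has cow_start = { .offset = 0, .nb_bytes =
 * 4 KiB } and cow_end = { .offset = 8 KiB, .nb_bytes = 56 KiB }, both
 * relative to the start of the first cluster, with the guest data
 * filling the 4 KiB gap in between.
 */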
11590bb79c97SKevin Wolf static int coroutine_fn GRAPH_RDLOCK
11600bb79c97SKevin Wolf calculate_l2_meta(BlockDriverState *bs, uint64_t host_cluster_offset,
11610bb79c97SKevin Wolf uint64_t guest_offset, unsigned bytes, uint64_t *l2_slice,
11620bb79c97SKevin Wolf QCowL2Meta **m, bool keep_old)
11638f91d690SAlberto Garcia {
11648f91d690SAlberto Garcia BDRVQcow2State *s = bs->opaque;
1165d53ec3d8SAlberto Garcia int sc_index, l2_index = offset_to_l2_slice_index(s, guest_offset);
1166d53ec3d8SAlberto Garcia uint64_t l2_entry, l2_bitmap;
116757538c86SAlberto Garcia unsigned cow_start_from, cow_end_to;
11688f91d690SAlberto Garcia unsigned cow_start_to = offset_into_cluster(s, guest_offset);
11698f91d690SAlberto Garcia unsigned cow_end_from = cow_start_to + bytes;
11708f91d690SAlberto Garcia unsigned nb_clusters = size_to_clusters(s, cow_end_from);
11718f91d690SAlberto Garcia QCowL2Meta *old_m = *m;
1172d53ec3d8SAlberto Garcia QCow2SubclusterType type;
1173d53ec3d8SAlberto Garcia int i;
1174d53ec3d8SAlberto Garcia bool skip_cow = keep_old;
117557538c86SAlberto Garcia
117657538c86SAlberto Garcia assert(nb_clusters <= s->l2_slice_size - l2_index);
117757538c86SAlberto Garcia
1178d53ec3d8SAlberto Garcia /* Check the type of all affected subclusters */
117957538c86SAlberto Garcia for (i = 0; i < nb_clusters; i++) {
118012c6aebeSAlberto Garcia l2_entry = get_l2_entry(s, l2_slice, l2_index + i);
1181d53ec3d8SAlberto Garcia l2_bitmap = get_l2_bitmap(s, l2_slice, l2_index + i);
1182d53ec3d8SAlberto Garcia if (skip_cow) {
1183d53ec3d8SAlberto Garcia unsigned write_from = MAX(cow_start_to, i << s->cluster_bits);
1184d53ec3d8SAlberto Garcia unsigned write_to = MIN(cow_end_from, (i + 1) << s->cluster_bits);
1185d53ec3d8SAlberto Garcia int first_sc = offset_to_sc_index(s, write_from);
1186d53ec3d8SAlberto Garcia int last_sc = offset_to_sc_index(s, write_to - 1);
1187d53ec3d8SAlberto Garcia int cnt = qcow2_get_subcluster_range_type(bs, l2_entry, l2_bitmap,
1188d53ec3d8SAlberto Garcia first_sc, &type);
1189d53ec3d8SAlberto Garcia /* Are any of the subclusters of a type other than QCOW2_SUBCLUSTER_NORMAL? */
1190d53ec3d8SAlberto Garcia if (type != QCOW2_SUBCLUSTER_NORMAL || first_sc + cnt <= last_sc) {
1191d53ec3d8SAlberto Garcia skip_cow = false;
1192d53ec3d8SAlberto Garcia }
1193d53ec3d8SAlberto Garcia } else {
1194d53ec3d8SAlberto Garcia /* If we can't skip the COW, we can still look for invalid entries */
1195d53ec3d8SAlberto Garcia type = qcow2_get_subcluster_type(bs, l2_entry, l2_bitmap, 0);
1196d53ec3d8SAlberto Garcia }
1197d53ec3d8SAlberto Garcia if (type == QCOW2_SUBCLUSTER_INVALID) {
1198d53ec3d8SAlberto Garcia int l1_index = offset_to_l1_index(s, guest_offset);
1199d53ec3d8SAlberto Garcia uint64_t l2_offset = s->l1_table[l1_index] & L1E_OFFSET_MASK;
1200d53ec3d8SAlberto Garcia qcow2_signal_corruption(bs, true, -1, -1, "Invalid cluster "
1201d53ec3d8SAlberto Garcia "entry found (L2 offset: %#" PRIx64
1202d53ec3d8SAlberto Garcia ", L2 index: %#x)",
1203d53ec3d8SAlberto Garcia l2_offset, l2_index + i);
1204d53ec3d8SAlberto Garcia return -EIO;
120557538c86SAlberto Garcia }
120657538c86SAlberto Garcia }
1207d53ec3d8SAlberto Garcia
1208d53ec3d8SAlberto Garcia if (skip_cow) {
1209d53ec3d8SAlberto Garcia return 0;
121057538c86SAlberto Garcia }
121157538c86SAlberto Garcia
121257538c86SAlberto Garcia /* Get the L2 entry of the first cluster */
121312c6aebeSAlberto Garcia l2_entry = get_l2_entry(s, l2_slice, l2_index);
1214d53ec3d8SAlberto Garcia l2_bitmap = get_l2_bitmap(s, l2_slice, l2_index);
1215d53ec3d8SAlberto Garcia sc_index = offset_to_sc_index(s, guest_offset);
1216d53ec3d8SAlberto Garcia type = qcow2_get_subcluster_type(bs, l2_entry, l2_bitmap, sc_index);
121757538c86SAlberto Garcia
1218d53ec3d8SAlberto Garcia if (!keep_old) {
1219d53ec3d8SAlberto Garcia switch (type) {
1220d53ec3d8SAlberto Garcia case QCOW2_SUBCLUSTER_COMPRESSED:
1221d53ec3d8SAlberto Garcia cow_start_from = 0;
1222d53ec3d8SAlberto Garcia break;
1223d53ec3d8SAlberto Garcia case QCOW2_SUBCLUSTER_NORMAL:
1224d53ec3d8SAlberto Garcia case QCOW2_SUBCLUSTER_ZERO_ALLOC:
1225d53ec3d8SAlberto Garcia case QCOW2_SUBCLUSTER_UNALLOCATED_ALLOC:
1226d53ec3d8SAlberto Garcia if (has_subclusters(s)) {
1227d53ec3d8SAlberto Garcia /* Skip all leading zero and unallocated subclusters */
1228d53ec3d8SAlberto Garcia uint32_t alloc_bitmap = l2_bitmap & QCOW_L2_BITMAP_ALL_ALLOC;
1229d53ec3d8SAlberto Garcia cow_start_from =
1230d53ec3d8SAlberto Garcia MIN(sc_index, ctz32(alloc_bitmap)) << s->subcluster_bits;
123157538c86SAlberto Garcia } else {
123257538c86SAlberto Garcia cow_start_from = 0;
123357538c86SAlberto Garcia }
1234d53ec3d8SAlberto Garcia break;
1235d53ec3d8SAlberto Garcia case QCOW2_SUBCLUSTER_ZERO_PLAIN:
1236d53ec3d8SAlberto Garcia case QCOW2_SUBCLUSTER_UNALLOCATED_PLAIN:
1237d53ec3d8SAlberto Garcia cow_start_from = sc_index << s->subcluster_bits;
1238d53ec3d8SAlberto Garcia break;
1239d53ec3d8SAlberto Garcia default:
1240d53ec3d8SAlberto Garcia g_assert_not_reached();
1241d53ec3d8SAlberto Garcia }
1242d53ec3d8SAlberto Garcia } else {
1243d53ec3d8SAlberto Garcia switch (type) {
1244d53ec3d8SAlberto Garcia case QCOW2_SUBCLUSTER_NORMAL:
1245d53ec3d8SAlberto Garcia cow_start_from = cow_start_to;
1246d53ec3d8SAlberto Garcia break;
1247d53ec3d8SAlberto Garcia case QCOW2_SUBCLUSTER_ZERO_ALLOC:
1248d53ec3d8SAlberto Garcia case QCOW2_SUBCLUSTER_UNALLOCATED_ALLOC:
1249d53ec3d8SAlberto Garcia cow_start_from = sc_index << s->subcluster_bits;
1250d53ec3d8SAlberto Garcia break;
1251d53ec3d8SAlberto Garcia default:
1252d53ec3d8SAlberto Garcia g_assert_not_reached();
1253d53ec3d8SAlberto Garcia }
1254d53ec3d8SAlberto Garcia }
125557538c86SAlberto Garcia
125657538c86SAlberto Garcia /* Get the L2 entry of the last cluster */
1257d53ec3d8SAlberto Garcia l2_index += nb_clusters - 1;
1258d53ec3d8SAlberto Garcia l2_entry = get_l2_entry(s, l2_slice, l2_index);
1259d53ec3d8SAlberto Garcia l2_bitmap = get_l2_bitmap(s, l2_slice, l2_index);
1260d53ec3d8SAlberto Garcia sc_index = offset_to_sc_index(s, guest_offset + bytes - 1);
1261d53ec3d8SAlberto Garcia type = qcow2_get_subcluster_type(bs, l2_entry, l2_bitmap, sc_index);
126257538c86SAlberto Garcia
1263d53ec3d8SAlberto Garcia if (!keep_old) {
1264d53ec3d8SAlberto Garcia switch (type) {
1265d53ec3d8SAlberto Garcia case QCOW2_SUBCLUSTER_COMPRESSED:
126657538c86SAlberto Garcia cow_end_to = ROUND_UP(cow_end_from, s->cluster_size);
1267d53ec3d8SAlberto Garcia break;
1268d53ec3d8SAlberto Garcia case QCOW2_SUBCLUSTER_NORMAL:
1269d53ec3d8SAlberto Garcia case QCOW2_SUBCLUSTER_ZERO_ALLOC:
1270d53ec3d8SAlberto Garcia case QCOW2_SUBCLUSTER_UNALLOCATED_ALLOC:
1271d53ec3d8SAlberto Garcia cow_end_to = ROUND_UP(cow_end_from, s->cluster_size);
1272d53ec3d8SAlberto Garcia if (has_subclusters(s)) {
1273d53ec3d8SAlberto Garcia /* Skip all trailing zero and unallocated subclusters */
1274d53ec3d8SAlberto Garcia uint32_t alloc_bitmap = l2_bitmap & QCOW_L2_BITMAP_ALL_ALLOC;
1275d53ec3d8SAlberto Garcia cow_end_to -=
1276d53ec3d8SAlberto Garcia MIN(s->subclusters_per_cluster - sc_index - 1,
1277d53ec3d8SAlberto Garcia clz32(alloc_bitmap)) << s->subcluster_bits;
1278d53ec3d8SAlberto Garcia }
1279d53ec3d8SAlberto Garcia break;
1280d53ec3d8SAlberto Garcia case QCOW2_SUBCLUSTER_ZERO_PLAIN:
1281d53ec3d8SAlberto Garcia case QCOW2_SUBCLUSTER_UNALLOCATED_PLAIN:
1282d53ec3d8SAlberto Garcia cow_end_to = ROUND_UP(cow_end_from, s->subcluster_size);
1283d53ec3d8SAlberto Garcia break;
1284d53ec3d8SAlberto Garcia default:
1285d53ec3d8SAlberto Garcia g_assert_not_reached();
1286d53ec3d8SAlberto Garcia }
1287d53ec3d8SAlberto Garcia } else {
1288d53ec3d8SAlberto Garcia switch (type) {
1289d53ec3d8SAlberto Garcia case QCOW2_SUBCLUSTER_NORMAL:
1290d53ec3d8SAlberto Garcia cow_end_to = cow_end_from;
1291d53ec3d8SAlberto Garcia break;
1292d53ec3d8SAlberto Garcia case QCOW2_SUBCLUSTER_ZERO_ALLOC:
1293d53ec3d8SAlberto Garcia case QCOW2_SUBCLUSTER_UNALLOCATED_ALLOC:
1294d53ec3d8SAlberto Garcia cow_end_to = ROUND_UP(cow_end_from, s->subcluster_size);
1295d53ec3d8SAlberto Garcia break;
1296d53ec3d8SAlberto Garcia default:
1297d53ec3d8SAlberto Garcia g_assert_not_reached();
1298d53ec3d8SAlberto Garcia }
129957538c86SAlberto Garcia }
13008f91d690SAlberto Garcia
13018f91d690SAlberto Garcia *m = g_malloc0(sizeof(**m));
13028f91d690SAlberto Garcia **m = (QCowL2Meta) {
13038f91d690SAlberto Garcia .next = old_m,
13048f91d690SAlberto Garcia
13058f91d690SAlberto Garcia .alloc_offset = host_cluster_offset,
13068f91d690SAlberto Garcia .offset = start_of_cluster(s, guest_offset),
13078f91d690SAlberto Garcia .nb_clusters = nb_clusters,
13088f91d690SAlberto Garcia
13098f91d690SAlberto Garcia .keep_old_clusters = keep_old,
13108f91d690SAlberto Garcia
13118f91d690SAlberto Garcia .cow_start = {
13128f91d690SAlberto Garcia .offset = cow_start_from,
13138f91d690SAlberto Garcia .nb_bytes = cow_start_to - cow_start_from,
13148f91d690SAlberto Garcia },
13158f91d690SAlberto Garcia .cow_end = {
13168f91d690SAlberto Garcia .offset = cow_end_from,
13178f91d690SAlberto Garcia .nb_bytes = cow_end_to - cow_end_from,
13188f91d690SAlberto Garcia },
13198f91d690SAlberto Garcia };
13208f91d690SAlberto Garcia
13218f91d690SAlberto Garcia qemu_co_queue_init(&(*m)->dependent_requests);
13228f91d690SAlberto Garcia QLIST_INSERT_HEAD(&s->cluster_allocs, *m, next_in_flight);
1323d53ec3d8SAlberto Garcia
1324d53ec3d8SAlberto Garcia return 0;
13258f91d690SAlberto Garcia }
13268f91d690SAlberto Garcia
132757538c86SAlberto Garcia /*
132857538c86SAlberto Garcia * Returns true if writing to the cluster pointed to by @l2_entry
132957538c86SAlberto Garcia * requires a new allocation (that is, if the cluster is unallocated
133057538c86SAlberto Garcia * or has refcount > 1 and therefore cannot be written in-place).
133157538c86SAlberto Garcia */
1332*8f897341SKevin Wolf static bool GRAPH_RDLOCK
1333*8f897341SKevin Wolf cluster_needs_new_alloc(BlockDriverState *bs, uint64_t l2_entry)
1334c1587d87SAlberto Garcia {
1335c1587d87SAlberto Garcia switch (qcow2_get_cluster_type(bs, l2_entry)) {
1336c1587d87SAlberto Garcia case QCOW2_CLUSTER_NORMAL:
133757538c86SAlberto Garcia case QCOW2_CLUSTER_ZERO_ALLOC:
1338c1587d87SAlberto Garcia if (l2_entry & QCOW_OFLAG_COPIED) {
1339c1587d87SAlberto Garcia return false;
1340c1587d87SAlberto Garcia }
1341b9be6faeSThomas Huth /* fallthrough */
1342c1587d87SAlberto Garcia case QCOW2_CLUSTER_UNALLOCATED:
1343c1587d87SAlberto Garcia case QCOW2_CLUSTER_COMPRESSED:
1344c1587d87SAlberto Garcia case QCOW2_CLUSTER_ZERO_PLAIN:
1345c1587d87SAlberto Garcia return true;
1346c1587d87SAlberto Garcia default:
1347c1587d87SAlberto Garcia abort();
1348c1587d87SAlberto Garcia }
1349c1587d87SAlberto Garcia }
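/*
 * Put differently (a summary of the mapping above): a NORMAL or
 * ZERO_ALLOC cluster with QCOW_OFLAG_COPIED set (i.e. refcount == 1)
 * can be written in place; without the flag (e.g. the cluster is
 * shared with a snapshot) it needs a new allocation plus COW, as do
 * unallocated, compressed and ZERO_PLAIN clusters.
 */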
1350c1587d87SAlberto Garcia
13518f91d690SAlberto Garcia /*
135257538c86SAlberto Garcia * Returns the number of contiguous clusters that can be written to
135357538c86SAlberto Garcia * using one single write request, starting from @l2_index.
135457538c86SAlberto Garcia * At most @nb_clusters are checked.
135557538c86SAlberto Garcia *
135657538c86SAlberto Garcia * If @new_alloc is true this counts clusters that are either
135757538c86SAlberto Garcia * unallocated, or allocated but with refcount > 1 (so they need to be
135857538c86SAlberto Garcia * newly allocated and COWed).
135957538c86SAlberto Garcia *
136057538c86SAlberto Garcia * If @new_alloc is false this counts clusters that are already
136157538c86SAlberto Garcia * allocated and can be overwritten in-place (this includes clusters
136257538c86SAlberto Garcia * of type QCOW2_CLUSTER_ZERO_ALLOC).
1363bf319eceSKevin Wolf */
1364*8f897341SKevin Wolf static int GRAPH_RDLOCK
1365*8f897341SKevin Wolf count_single_write_clusters(BlockDriverState *bs, int nb_clusters,
1366*8f897341SKevin Wolf uint64_t *l2_slice, int l2_index, bool new_alloc)
1367bf319eceSKevin Wolf {
136857538c86SAlberto Garcia BDRVQcow2State *s = bs->opaque;
136912c6aebeSAlberto Garcia uint64_t l2_entry = get_l2_entry(s, l2_slice, l2_index);
137057538c86SAlberto Garcia uint64_t expected_offset = l2_entry & L2E_OFFSET_MASK;
1371143550a8SKevin Wolf int i;
1372bf319eceSKevin Wolf
1373143550a8SKevin Wolf for (i = 0; i < nb_clusters; i++) {
137412c6aebeSAlberto Garcia l2_entry = get_l2_entry(s, l2_slice, l2_index + i);
137557538c86SAlberto Garcia if (cluster_needs_new_alloc(bs, l2_entry) != new_alloc) {
1376bf319eceSKevin Wolf break;
1377143550a8SKevin Wolf }
137857538c86SAlberto Garcia if (!new_alloc) {
137957538c86SAlberto Garcia if (expected_offset != (l2_entry & L2E_OFFSET_MASK)) {
138057538c86SAlberto Garcia break;
138157538c86SAlberto Garcia }
138257538c86SAlberto Garcia expected_offset += s->cluster_size;
138357538c86SAlberto Garcia }
1384bf319eceSKevin Wolf }
1385bf319eceSKevin Wolf
1386bf319eceSKevin Wolf assert(i <= nb_clusters);
1387bf319eceSKevin Wolf return i;
1388bf319eceSKevin Wolf }
1389bf319eceSKevin Wolf
1390bf319eceSKevin Wolf /*
1391250196f1SKevin Wolf * Check if there already is an AIO write request in flight which allocates
1392250196f1SKevin Wolf * the same cluster. In this case we need to wait until the previous
1393250196f1SKevin Wolf * request has completed and updated the L2 table accordingly.
139465eb2e35SKevin Wolf *
139565eb2e35SKevin Wolf * Returns:
139665eb2e35SKevin Wolf * 0 if there was no dependency. *cur_bytes indicates the number of
139765eb2e35SKevin Wolf * bytes from guest_offset that can be read before the next
139865eb2e35SKevin Wolf * dependency must be processed (or the request is complete)
139965eb2e35SKevin Wolf *
140065eb2e35SKevin Wolf * -EAGAIN if we had to wait for another request, previously gathered
140165eb2e35SKevin Wolf * information on cluster allocation may be invalid now. The caller
140265eb2e35SKevin Wolf * must start over anyway, so consider *cur_bytes undefined.
1403250196f1SKevin Wolf */
1404050ed2e7SPaolo Bonzini static int coroutine_fn handle_dependencies(BlockDriverState *bs,
1405050ed2e7SPaolo Bonzini uint64_t guest_offset,
1406ecdd5333SKevin Wolf uint64_t *cur_bytes, QCowL2Meta **m)
1407226c3c26SKevin Wolf {
1408ff99129aSKevin Wolf BDRVQcow2State *s = bs->opaque;
1409226c3c26SKevin Wolf QCowL2Meta *old_alloc;
141065eb2e35SKevin Wolf uint64_t bytes = *cur_bytes;
1411226c3c26SKevin Wolf
1412250196f1SKevin Wolf QLIST_FOREACH(old_alloc, &s->cluster_allocs, next_in_flight) {
1413250196f1SKevin Wolf
141465eb2e35SKevin Wolf uint64_t start = guest_offset;
141565eb2e35SKevin Wolf uint64_t end = start + bytes;
1416d53ec3d8SAlberto Garcia uint64_t old_start = start_of_cluster(s, l2meta_cow_start(old_alloc));
1417d53ec3d8SAlberto Garcia uint64_t old_end = ROUND_UP(l2meta_cow_end(old_alloc), s->cluster_size);
1418250196f1SKevin Wolf
1419d9d74f41SKevin Wolf if (end <= old_start || start >= old_end) {
1420250196f1SKevin Wolf /* No intersection */
14216d207d35SVladimir Sementsov-Ogievskiy continue;
14226d207d35SVladimir Sementsov-Ogievskiy }
14236d207d35SVladimir Sementsov-Ogievskiy
1424ff812c55SVladimir Sementsov-Ogievskiy if (old_alloc->keep_old_clusters &&
1425ff812c55SVladimir Sementsov-Ogievskiy (end <= l2meta_cow_start(old_alloc) ||
1426ff812c55SVladimir Sementsov-Ogievskiy start >= l2meta_cow_end(old_alloc)))
1427ff812c55SVladimir Sementsov-Ogievskiy {
1428ff812c55SVladimir Sementsov-Ogievskiy /*
1429ff812c55SVladimir Sementsov-Ogievskiy * The clusters intersect, but the COW areas don't, and the cluster
1430ff812c55SVladimir Sementsov-Ogievskiy * itself is already allocated, so there is no actual conflict.
1431ff812c55SVladimir Sementsov-Ogievskiy */
1432ff812c55SVladimir Sementsov-Ogievskiy continue;
1433ff812c55SVladimir Sementsov-Ogievskiy }
1434ff812c55SVladimir Sementsov-Ogievskiy
14356d207d35SVladimir Sementsov-Ogievskiy /* Conflict */
14366d207d35SVladimir Sementsov-Ogievskiy
1437250196f1SKevin Wolf if (start < old_start) {
1438250196f1SKevin Wolf /* Stop at the start of a running allocation */
143965eb2e35SKevin Wolf bytes = old_start - start;
1440250196f1SKevin Wolf } else {
144165eb2e35SKevin Wolf bytes = 0;
1442250196f1SKevin Wolf }
1443250196f1SKevin Wolf
14446d207d35SVladimir Sementsov-Ogievskiy /*
14456d207d35SVladimir Sementsov-Ogievskiy * Stop if an l2meta already exists. After yielding, it wouldn't
1446ecdd5333SKevin Wolf * be valid any more, so we'd have to clean up the old L2Metas
1447ecdd5333SKevin Wolf * and deal with requests depending on them before starting to
14486d207d35SVladimir Sementsov-Ogievskiy * gather new ones. Not worth the trouble.
14496d207d35SVladimir Sementsov-Ogievskiy */
1450ecdd5333SKevin Wolf if (bytes == 0 && *m) {
1451ecdd5333SKevin Wolf *cur_bytes = 0;
1452ecdd5333SKevin Wolf return 0;
1453ecdd5333SKevin Wolf }
1454ecdd5333SKevin Wolf
145565eb2e35SKevin Wolf if (bytes == 0) {
14566d207d35SVladimir Sementsov-Ogievskiy /*
14576d207d35SVladimir Sementsov-Ogievskiy * Wait for the dependency to complete. We need to recheck
14586d207d35SVladimir Sementsov-Ogievskiy * the free/allocated clusters when we continue.
14596d207d35SVladimir Sementsov-Ogievskiy */
14601ace7ceaSPaolo Bonzini qemu_co_queue_wait(&old_alloc->dependent_requests, &s->lock);
1461250196f1SKevin Wolf return -EAGAIN;
1462250196f1SKevin Wolf }
1463250196f1SKevin Wolf }
1464250196f1SKevin Wolf
146565eb2e35SKevin Wolf /* Make sure that existing clusters and new allocations are only used up to
146665eb2e35SKevin Wolf * the next dependency if we shortened the request above */
146765eb2e35SKevin Wolf *cur_bytes = bytes;
1468250196f1SKevin Wolf
1469226c3c26SKevin Wolf return 0;
1470226c3c26SKevin Wolf }
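/*
 * Example (made-up offsets, 64 KiB clusters): if a request covers
 * bytes 0..128 KiB and an in-flight allocation covers the cluster at
 * 64 KiB, *cur_bytes is shortened to 64 KiB so the first half can
 * proceed independently. A request that starts inside the in-flight
 * range (and has gathered no L2Meta yet) instead waits on
 * dependent_requests and returns -EAGAIN.
 */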
1471226c3c26SKevin Wolf
1472226c3c26SKevin Wolf /*
147357538c86SAlberto Garcia * Checks how many clusters at the given guest_offset (up to *bytes) are
147457538c86SAlberto Garcia * already allocated and don't require a new allocation.
147557538c86SAlberto Garcia * If *host_offset is not INV_OFFSET, only physically contiguous clusters
147657538c86SAlberto Garcia * beginning at this host offset are counted.
14770af729ecSKevin Wolf *
1478411d62b0SKevin Wolf * Note that guest_offset may not be cluster aligned. In this case, the
1479411d62b0SKevin Wolf * returned *host_offset points to the exact byte referenced by guest_offset
1480411d62b0SKevin Wolf * and therefore isn't cluster aligned either.
14810af729ecSKevin Wolf *
14820af729ecSKevin Wolf * Returns:
14830af729ecSKevin Wolf * 0: if no allocated clusters are available at the given offset.
14840af729ecSKevin Wolf * *bytes is normally unchanged. It is set to 0 if the cluster
148557538c86SAlberto Garcia * is allocated and can be overwritten in-place but doesn't have
148657538c86SAlberto Garcia * the right physical offset.
14870af729ecSKevin Wolf *
148857538c86SAlberto Garcia * 1: if allocated clusters that can be overwritten in place are
148957538c86SAlberto Garcia * available at the requested offset. *bytes may have decreased
149057538c86SAlberto Garcia * and describes the length of the area that can be written to.
14910af729ecSKevin Wolf *
14920af729ecSKevin Wolf * -errno: in error cases
14930af729ecSKevin Wolf */
14940bb79c97SKevin Wolf static int coroutine_fn GRAPH_RDLOCK
14950bb79c97SKevin Wolf handle_copied(BlockDriverState *bs, uint64_t guest_offset,
14960bb79c97SKevin Wolf uint64_t *host_offset, uint64_t *bytes, QCowL2Meta **m)
14970af729ecSKevin Wolf {
1498ff99129aSKevin Wolf BDRVQcow2State *s = bs->opaque;
14990af729ecSKevin Wolf int l2_index;
150057538c86SAlberto Garcia uint64_t l2_entry, cluster_offset;
1501cde91766SAlberto Garcia uint64_t *l2_slice;
1502b6d36defSMax Reitz uint64_t nb_clusters;
1503c53ede9fSKevin Wolf unsigned int keep_clusters;
1504a3f1afb4SAlberto Garcia int ret;
15050af729ecSKevin Wolf
15060af729ecSKevin Wolf trace_qcow2_handle_copied(qemu_coroutine_self(), guest_offset, *host_offset,
15070af729ecSKevin Wolf *bytes);
15080af729ecSKevin Wolf
1509c6d619ccSKevin Wolf assert(*host_offset == INV_OFFSET || offset_into_cluster(s, guest_offset)
1510411d62b0SKevin Wolf == offset_into_cluster(s, *host_offset));
1511411d62b0SKevin Wolf
1512acb0467fSKevin Wolf /*
1513cde91766SAlberto Garcia * Calculate the number of clusters to look for. We stop at L2 slice
1514acb0467fSKevin Wolf * boundaries to keep things simple.
1515acb0467fSKevin Wolf */
1516acb0467fSKevin Wolf nb_clusters =
1517acb0467fSKevin Wolf size_to_clusters(s, offset_into_cluster(s, guest_offset) + *bytes);
1518acb0467fSKevin Wolf
1519cde91766SAlberto Garcia l2_index = offset_to_l2_slice_index(s, guest_offset);
1520cde91766SAlberto Garcia nb_clusters = MIN(nb_clusters, s->l2_slice_size - l2_index);
152157538c86SAlberto Garcia /* Limit total byte count to BDRV_REQUEST_MAX_BYTES */
152257538c86SAlberto Garcia nb_clusters = MIN(nb_clusters, BDRV_REQUEST_MAX_BYTES >> s->cluster_bits);
1523acb0467fSKevin Wolf
15240af729ecSKevin Wolf /* Find L2 entry for the first involved cluster */
1525cde91766SAlberto Garcia ret = get_cluster_table(bs, guest_offset, &l2_slice, &l2_index);
15260af729ecSKevin Wolf if (ret < 0) {
15270af729ecSKevin Wolf return ret;
15280af729ecSKevin Wolf }
15290af729ecSKevin Wolf
153012c6aebeSAlberto Garcia l2_entry = get_l2_entry(s, l2_slice, l2_index);
153157538c86SAlberto Garcia cluster_offset = l2_entry & L2E_OFFSET_MASK;
15320af729ecSKevin Wolf
153357538c86SAlberto Garcia if (!cluster_needs_new_alloc(bs, l2_entry)) {
153457538c86SAlberto Garcia if (offset_into_cluster(s, cluster_offset)) {
153557538c86SAlberto Garcia qcow2_signal_corruption(bs, true, -1, -1, "%s cluster offset "
153657538c86SAlberto Garcia "%#" PRIx64 " unaligned (guest offset: %#"
153757538c86SAlberto Garcia PRIx64 ")", l2_entry & QCOW_OFLAG_ZERO ?
153857538c86SAlberto Garcia "Preallocated zero" : "Data",
153957538c86SAlberto Garcia cluster_offset, guest_offset);
1540a97c67eeSMax Reitz ret = -EIO;
1541a97c67eeSMax Reitz goto out;
1542a97c67eeSMax Reitz }
1543a97c67eeSMax Reitz
154457538c86SAlberto Garcia /* If a specific host_offset is required, check it */
154557538c86SAlberto Garcia if (*host_offset != INV_OFFSET && cluster_offset != *host_offset) {
1546e62daaf6SKevin Wolf *bytes = 0;
1547e62daaf6SKevin Wolf ret = 0;
1548e62daaf6SKevin Wolf goto out;
1549e62daaf6SKevin Wolf }
1550e62daaf6SKevin Wolf
15510af729ecSKevin Wolf /* We keep all QCOW_OFLAG_COPIED clusters */
155257538c86SAlberto Garcia keep_clusters = count_single_write_clusters(bs, nb_clusters, l2_slice,
155357538c86SAlberto Garcia l2_index, false);
1554c53ede9fSKevin Wolf assert(keep_clusters <= nb_clusters);
1555c53ede9fSKevin Wolf
1556c53ede9fSKevin Wolf *bytes = MIN(*bytes,
1557c53ede9fSKevin Wolf keep_clusters * s->cluster_size
1558c53ede9fSKevin Wolf - offset_into_cluster(s, guest_offset));
155957538c86SAlberto Garcia assert(*bytes != 0);
156057538c86SAlberto Garcia
1561d53ec3d8SAlberto Garcia ret = calculate_l2_meta(bs, cluster_offset, guest_offset,
156257538c86SAlberto Garcia *bytes, l2_slice, m, true);
1563d53ec3d8SAlberto Garcia if (ret < 0) {
1564d53ec3d8SAlberto Garcia goto out;
1565d53ec3d8SAlberto Garcia }
15660af729ecSKevin Wolf
15670af729ecSKevin Wolf ret = 1;
15680af729ecSKevin Wolf } else {
15690af729ecSKevin Wolf ret = 0;
15700af729ecSKevin Wolf }
15710af729ecSKevin Wolf
15720af729ecSKevin Wolf /* Cleanup */
1573e62daaf6SKevin Wolf out:
1574cde91766SAlberto Garcia qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
15750af729ecSKevin Wolf
1576e62daaf6SKevin Wolf /* Only return a host offset if we actually made progress. Otherwise we
1577e62daaf6SKevin Wolf * would impose requirements on handle_alloc() that it can't fulfill */
1578a97c67eeSMax Reitz if (ret > 0) {
157957538c86SAlberto Garcia *host_offset = cluster_offset + offset_into_cluster(s, guest_offset);
1580e62daaf6SKevin Wolf }
1581e62daaf6SKevin Wolf
15820af729ecSKevin Wolf return ret;
15830af729ecSKevin Wolf }
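/*
 * Example (made-up numbers, 64 KiB clusters): for a 256 KiB request at
 * guest offset 68 KiB where only the first two clusters are contiguous
 * and carry QCOW_OFLAG_COPIED, this returns 1 with *bytes shortened to
 * 2 * 64 KiB - 4 KiB = 124 KiB and *host_offset pointing 4 KiB into
 * the first host cluster.
 */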
15840af729ecSKevin Wolf
15850af729ecSKevin Wolf /*
1586226c3c26SKevin Wolf * Allocates new clusters for the given guest_offset.
1587226c3c26SKevin Wolf *
1588226c3c26SKevin Wolf * At most *nb_clusters are allocated, and on return *nb_clusters is updated to
1589226c3c26SKevin Wolf * contain the number of clusters that have been allocated and are contiguous
1590226c3c26SKevin Wolf * in the image file.
1591226c3c26SKevin Wolf *
1592c6d619ccSKevin Wolf * If *host_offset is not INV_OFFSET, it specifies the offset in the image file
1593c6d619ccSKevin Wolf * at which the new clusters must start. *nb_clusters can be 0 on return in
1594c6d619ccSKevin Wolf * this case if the cluster at host_offset is already in use. If *host_offset
1595c6d619ccSKevin Wolf * is INV_OFFSET, the clusters can be allocated anywhere in the image file.
1596226c3c26SKevin Wolf *
1597226c3c26SKevin Wolf * *host_offset is updated to contain the offset into the image file at which
1598226c3c26SKevin Wolf * the first allocated cluster starts.
1599226c3c26SKevin Wolf *
1600226c3c26SKevin Wolf * Return 0 on success and -errno in error cases. -EAGAIN means that the
1601226c3c26SKevin Wolf * function has been waiting for another request and the allocation must be
1602226c3c26SKevin Wolf * restarted, but the whole request should not be failed.
1603226c3c26SKevin Wolf */
16040bb79c97SKevin Wolf static int coroutine_fn GRAPH_RDLOCK
16050bb79c97SKevin Wolf do_alloc_cluster_offset(BlockDriverState *bs, uint64_t guest_offset,
16060bb79c97SKevin Wolf uint64_t *host_offset, uint64_t *nb_clusters)
1607226c3c26SKevin Wolf {
1608ff99129aSKevin Wolf BDRVQcow2State *s = bs->opaque;
1609226c3c26SKevin Wolf
1610226c3c26SKevin Wolf trace_qcow2_do_alloc_clusters_offset(qemu_coroutine_self(), guest_offset,
1611226c3c26SKevin Wolf *host_offset, *nb_clusters);
1612226c3c26SKevin Wolf
1613966b000fSKevin Wolf if (has_data_file(bs)) {
1614966b000fSKevin Wolf assert(*host_offset == INV_OFFSET ||
1615966b000fSKevin Wolf *host_offset == start_of_cluster(s, guest_offset));
1616966b000fSKevin Wolf *host_offset = start_of_cluster(s, guest_offset);
1617966b000fSKevin Wolf return 0;
1618966b000fSKevin Wolf }
1619966b000fSKevin Wolf
1620250196f1SKevin Wolf /* Allocate new clusters */
1621250196f1SKevin Wolf trace_qcow2_cluster_alloc_phys(qemu_coroutine_self());
1622c6d619ccSKevin Wolf if (*host_offset == INV_OFFSET) {
1623df021791SKevin Wolf int64_t cluster_offset =
1624df021791SKevin Wolf qcow2_alloc_clusters(bs, *nb_clusters * s->cluster_size);
1625250196f1SKevin Wolf if (cluster_offset < 0) {
1626250196f1SKevin Wolf return cluster_offset;
1627250196f1SKevin Wolf }
1628250196f1SKevin Wolf *host_offset = cluster_offset;
1629250196f1SKevin Wolf return 0;
1630df021791SKevin Wolf } else {
1631b6d36defSMax Reitz int64_t ret = qcow2_alloc_clusters_at(bs, *host_offset, *nb_clusters);
1632df021791SKevin Wolf if (ret < 0) {
1633df021791SKevin Wolf return ret;
1634df021791SKevin Wolf }
1635df021791SKevin Wolf *nb_clusters = ret;
1636df021791SKevin Wolf return 0;
1637df021791SKevin Wolf }
1638250196f1SKevin Wolf }
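/*
 * In short (a summary of the two modes above): with *host_offset ==
 * INV_OFFSET the clusters come from qcow2_alloc_clusters() wherever
 * free space is found; with a fixed *host_offset,
 * qcow2_alloc_clusters_at() may return fewer clusters than requested
 * (possibly zero) if the area at that offset is partially in use.
 */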
1639250196f1SKevin Wolf
1640250196f1SKevin Wolf /*
164157538c86SAlberto Garcia * Allocates new clusters for an area that is either still unallocated or
164257538c86SAlberto Garcia * cannot be overwritten in-place. If *host_offset is not INV_OFFSET,
164357538c86SAlberto Garcia * clusters are only allocated if the new allocation can match the specified
164457538c86SAlberto Garcia * host offset.
164510f0ed8bSKevin Wolf *
1646411d62b0SKevin Wolf * Note that guest_offset may not be cluster aligned. In this case, the
1647411d62b0SKevin Wolf * returned *host_offset points to the exact byte referenced by guest_offset
1648411d62b0SKevin Wolf * and therefore isn't cluster aligned either.
164910f0ed8bSKevin Wolf *
165010f0ed8bSKevin Wolf * Returns:
165110f0ed8bSKevin Wolf * 0: if no clusters could be allocated. *bytes is set to 0,
165210f0ed8bSKevin Wolf * *host_offset is left unchanged.
165310f0ed8bSKevin Wolf *
165410f0ed8bSKevin Wolf * 1: if new clusters were allocated. *bytes may be decreased if the
165510f0ed8bSKevin Wolf * new allocation doesn't cover all of the requested area.
165610f0ed8bSKevin Wolf * *host_offset is updated to contain the host offset of the first
165710f0ed8bSKevin Wolf * newly allocated cluster.
165810f0ed8bSKevin Wolf *
165910f0ed8bSKevin Wolf * -errno: in error cases
166010f0ed8bSKevin Wolf */
16610bb79c97SKevin Wolf static int coroutine_fn GRAPH_RDLOCK
16620bb79c97SKevin Wolf handle_alloc(BlockDriverState *bs, uint64_t guest_offset,
16630bb79c97SKevin Wolf uint64_t *host_offset, uint64_t *bytes, QCowL2Meta **m)
166410f0ed8bSKevin Wolf {
1665ff99129aSKevin Wolf BDRVQcow2State *s = bs->opaque;
166610f0ed8bSKevin Wolf int l2_index;
16676d99a344SAlberto Garcia uint64_t *l2_slice;
1668b6d36defSMax Reitz uint64_t nb_clusters;
166910f0ed8bSKevin Wolf int ret;
167010f0ed8bSKevin Wolf
167157538c86SAlberto Garcia uint64_t alloc_cluster_offset;
167210f0ed8bSKevin Wolf
167310f0ed8bSKevin Wolf trace_qcow2_handle_alloc(qemu_coroutine_self(), guest_offset, *host_offset,
167410f0ed8bSKevin Wolf *bytes);
167510f0ed8bSKevin Wolf assert(*bytes > 0);
167610f0ed8bSKevin Wolf
1677f5bc6350SKevin Wolf /*
16786d99a344SAlberto Garcia * Calculate the number of clusters to look for. We stop at L2 slice
1679f5bc6350SKevin Wolf * boundaries to keep things simple.
1680f5bc6350SKevin Wolf */
1681c37f4cd7SKevin Wolf nb_clusters =
1682c37f4cd7SKevin Wolf size_to_clusters(s, offset_into_cluster(s, guest_offset) + *bytes);
1683c37f4cd7SKevin Wolf
16846d99a344SAlberto Garcia l2_index = offset_to_l2_slice_index(s, guest_offset);
16856d99a344SAlberto Garcia nb_clusters = MIN(nb_clusters, s->l2_slice_size - l2_index);
168657538c86SAlberto Garcia /* Limit total allocation byte count to BDRV_REQUEST_MAX_BYTES */
168757538c86SAlberto Garcia nb_clusters = MIN(nb_clusters, BDRV_REQUEST_MAX_BYTES >> s->cluster_bits);
1688d1b9d19fSMax Reitz
168910f0ed8bSKevin Wolf /* Find L2 entry for the first involved cluster */
16906d99a344SAlberto Garcia ret = get_cluster_table(bs, guest_offset, &l2_slice, &l2_index);
169110f0ed8bSKevin Wolf if (ret < 0) {
169210f0ed8bSKevin Wolf return ret;
169310f0ed8bSKevin Wolf }
169410f0ed8bSKevin Wolf
169557538c86SAlberto Garcia nb_clusters = count_single_write_clusters(bs, nb_clusters,
169657538c86SAlberto Garcia l2_slice, l2_index, true);
169710f0ed8bSKevin Wolf
1698ecdd5333SKevin Wolf /* This function is only called when there were no non-COW clusters, so if
1699ecdd5333SKevin Wolf * we can't find any unallocated or COW clusters either, something is
1700ecdd5333SKevin Wolf * wrong with our code. */
1701ecdd5333SKevin Wolf assert(nb_clusters > 0);
1702ecdd5333SKevin Wolf
170357538c86SAlberto Garcia /* Allocate at a given offset in the image file */
1704c6d619ccSKevin Wolf alloc_cluster_offset = *host_offset == INV_OFFSET ? INV_OFFSET :
1705c6d619ccSKevin Wolf start_of_cluster(s, *host_offset);
170683baa9a4SKevin Wolf ret = do_alloc_cluster_offset(bs, guest_offset, &alloc_cluster_offset,
170710f0ed8bSKevin Wolf &nb_clusters);
170810f0ed8bSKevin Wolf if (ret < 0) {
170957538c86SAlberto Garcia goto out;
171010f0ed8bSKevin Wolf }
171110f0ed8bSKevin Wolf
171283baa9a4SKevin Wolf /* Can't extend contiguous allocation */
171383baa9a4SKevin Wolf if (nb_clusters == 0) {
171483baa9a4SKevin Wolf *bytes = 0;
171557538c86SAlberto Garcia ret = 0;
171657538c86SAlberto Garcia goto out;
171783baa9a4SKevin Wolf }
171883baa9a4SKevin Wolf
1719c6d619ccSKevin Wolf assert(alloc_cluster_offset != INV_OFFSET);
1720ff52aab2SMax Reitz
172110f0ed8bSKevin Wolf /*
172283baa9a4SKevin Wolf * Save info needed for meta data update.
172383baa9a4SKevin Wolf *
172485567393SKevin Wolf * requested_bytes: Number of bytes from the start of the first
172510f0ed8bSKevin Wolf * newly allocated cluster to the end of the (possibly shortened
172610f0ed8bSKevin Wolf * before) write request.
172710f0ed8bSKevin Wolf *
172885567393SKevin Wolf * avail_bytes: Number of bytes from the start of the first
172910f0ed8bSKevin Wolf * newly allocated to the end of the last newly allocated cluster.
173010f0ed8bSKevin Wolf *
173185567393SKevin Wolf * nb_bytes: The number of bytes from the start of the first
173283baa9a4SKevin Wolf * newly allocated cluster to the end of the area that the write
173310f0ed8bSKevin Wolf * request actually writes to (excluding COW at the end)
173410f0ed8bSKevin Wolf */
173585567393SKevin Wolf uint64_t requested_bytes = *bytes + offset_into_cluster(s, guest_offset);
1736d1b9d19fSMax Reitz int avail_bytes = nb_clusters << s->cluster_bits;
173785567393SKevin Wolf int nb_bytes = MIN(requested_bytes, avail_bytes);
173810f0ed8bSKevin Wolf
1739411d62b0SKevin Wolf *host_offset = alloc_cluster_offset + offset_into_cluster(s, guest_offset);
174085567393SKevin Wolf *bytes = MIN(*bytes, nb_bytes - offset_into_cluster(s, guest_offset));
1741c37f4cd7SKevin Wolf assert(*bytes != 0);
174210f0ed8bSKevin Wolf
1743d53ec3d8SAlberto Garcia ret = calculate_l2_meta(bs, alloc_cluster_offset, guest_offset, *bytes,
1744d53ec3d8SAlberto Garcia l2_slice, m, false);
1745d53ec3d8SAlberto Garcia if (ret < 0) {
1746d53ec3d8SAlberto Garcia goto out;
1747d53ec3d8SAlberto Garcia }
17488f91d690SAlberto Garcia
174957538c86SAlberto Garcia ret = 1;
175010f0ed8bSKevin Wolf
175157538c86SAlberto Garcia out:
175257538c86SAlberto Garcia qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
175310f0ed8bSKevin Wolf return ret;
175410f0ed8bSKevin Wolf }
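/*
 * Worked example for the byte accounting above (made-up numbers,
 * 64 KiB clusters): guest_offset = 68 KiB and *bytes = 200 KiB give
 * requested_bytes = 204 KiB, measured from the start of the first
 * cluster. If only two clusters could be allocated, avail_bytes =
 * 128 KiB, nb_bytes = MIN(204 KiB, 128 KiB) = 128 KiB, and *bytes is
 * shortened to 128 KiB - 4 KiB = 124 KiB.
 */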
175510f0ed8bSKevin Wolf
175610f0ed8bSKevin Wolf /*
17572b60c5b9SAlberto Garcia * For a given area on the virtual disk defined by @offset and @bytes,
17582b60c5b9SAlberto Garcia * find the corresponding area on the qcow2 image, allocating new
17592b60c5b9SAlberto Garcia * clusters (or subclusters) if necessary. The result can span a
17602b60c5b9SAlberto Garcia * combination of allocated and previously unallocated clusters.
176145aba42fSKevin Wolf *
1762bfd0989aSAlberto Garcia * Note that offset may not be cluster aligned. In this case, the returned
1763bfd0989aSAlberto Garcia * *host_offset points to the exact byte referenced by offset and therefore
1764bfd0989aSAlberto Garcia * isn't cluster aligned either.
1765bfd0989aSAlberto Garcia *
17662b60c5b9SAlberto Garcia * On return, @host_offset is set to the beginning of the requested
17672b60c5b9SAlberto Garcia * area. This area is guaranteed to be contiguous on the qcow2 file
17682b60c5b9SAlberto Garcia * but it can be smaller than initially requested. In this case @bytes
17692b60c5b9SAlberto Garcia * is updated with the actual size.
177045aba42fSKevin Wolf *
17712b60c5b9SAlberto Garcia * If any clusters or subclusters were allocated then @m contains a
17722b60c5b9SAlberto Garcia * list with the information of all the affected regions. Note that
17732b60c5b9SAlberto Garcia * this can happen regardless of whether this function succeeds or
17742b60c5b9SAlberto Garcia * not. The caller is responsible for updating the L2 metadata of the
17752b60c5b9SAlberto Garcia * allocated clusters (on success) or freeing them (on failure), and
17762b60c5b9SAlberto Garcia * for clearing the contents of @m afterwards in both cases.
1777148da7eaSKevin Wolf *
177868d100e9SKevin Wolf * If the request conflicts with another write request in flight, the coroutine
177968d100e9SKevin Wolf * is queued and will be reentered when the dependency has completed.
1780148da7eaSKevin Wolf *
1781148da7eaSKevin Wolf * Return 0 on success and -errno in error cases
178245aba42fSKevin Wolf */
1783050ed2e7SPaolo Bonzini int coroutine_fn qcow2_alloc_host_offset(BlockDriverState *bs, uint64_t offset,
1784050ed2e7SPaolo Bonzini unsigned int *bytes,
1785050ed2e7SPaolo Bonzini uint64_t *host_offset,
1786d46a0bb2SKevin Wolf QCowL2Meta **m)
178745aba42fSKevin Wolf {
1788ff99129aSKevin Wolf BDRVQcow2State *s = bs->opaque;
1789710c2496SKevin Wolf uint64_t start, remaining;
1790250196f1SKevin Wolf uint64_t cluster_offset;
179165eb2e35SKevin Wolf uint64_t cur_bytes;
1792710c2496SKevin Wolf int ret;
179345aba42fSKevin Wolf
1794d46a0bb2SKevin Wolf trace_qcow2_alloc_clusters_offset(qemu_coroutine_self(), offset, *bytes);
1795710c2496SKevin Wolf
179672424114SKevin Wolf again:
179716f0587eSHu Tao start = offset;
1798d46a0bb2SKevin Wolf remaining = *bytes;
1799c6d619ccSKevin Wolf cluster_offset = INV_OFFSET;
1800c6d619ccSKevin Wolf *host_offset = INV_OFFSET;
1801ecdd5333SKevin Wolf cur_bytes = 0;
1802ecdd5333SKevin Wolf *m = NULL;
18030af729ecSKevin Wolf
18042c3b32d2SKevin Wolf while (true) {
1805ecdd5333SKevin Wolf
1806c6d619ccSKevin Wolf if (*host_offset == INV_OFFSET && cluster_offset != INV_OFFSET) {
1807bfd0989aSAlberto Garcia *host_offset = cluster_offset;
1808ecdd5333SKevin Wolf }
1809ecdd5333SKevin Wolf
1810ecdd5333SKevin Wolf assert(remaining >= cur_bytes);
1811ecdd5333SKevin Wolf
1812ecdd5333SKevin Wolf start += cur_bytes;
1813ecdd5333SKevin Wolf remaining -= cur_bytes;
1814c6d619ccSKevin Wolf
1815c6d619ccSKevin Wolf if (cluster_offset != INV_OFFSET) {
1816ecdd5333SKevin Wolf cluster_offset += cur_bytes;
1817c6d619ccSKevin Wolf }
1818ecdd5333SKevin Wolf
1819ecdd5333SKevin Wolf if (remaining == 0) {
1820ecdd5333SKevin Wolf break;
1821ecdd5333SKevin Wolf }
1822ecdd5333SKevin Wolf
1823ecdd5333SKevin Wolf cur_bytes = remaining;
1824ecdd5333SKevin Wolf
1825250196f1SKevin Wolf /*
182617a71e58SKevin Wolf * Now start gathering as many contiguous clusters as possible:
182717a71e58SKevin Wolf *
182817a71e58SKevin Wolf * 1. Check for overlaps with in-flight allocations
182917a71e58SKevin Wolf *
18302c3b32d2SKevin Wolf * a) Overlap not in the first cluster -> shorten this request and
18312c3b32d2SKevin Wolf * let the caller handle the rest in its next loop iteration.
183217a71e58SKevin Wolf *
18332c3b32d2SKevin Wolf * b) Real overlaps of two requests. Yield and restart the search
18342c3b32d2SKevin Wolf * for contiguous clusters (the situation could have changed
18352c3b32d2SKevin Wolf * while we were sleeping)
183617a71e58SKevin Wolf *
183717a71e58SKevin Wolf * c) TODO: Request starts in the same cluster as the in-flight
18382c3b32d2SKevin Wolf * allocation ends. Shorten the COW of the in-flight allocation,
18392c3b32d2SKevin Wolf * set cluster_offset to write to the same cluster and set up
18402c3b32d2SKevin Wolf * the right synchronisation between the in-flight request and
18412c3b32d2SKevin Wolf * the new one.
184217a71e58SKevin Wolf */
1843ecdd5333SKevin Wolf ret = handle_dependencies(bs, start, &cur_bytes, m);
184417a71e58SKevin Wolf if (ret == -EAGAIN) {
1845ecdd5333SKevin Wolf /* Currently handle_dependencies() doesn't yield if we already had
1846ecdd5333SKevin Wolf * an allocation. If it did, we would have to clean up the L2Meta
1847ecdd5333SKevin Wolf * structs before starting over. */
1848ecdd5333SKevin Wolf assert(*m == NULL);
184917a71e58SKevin Wolf goto again;
185017a71e58SKevin Wolf } else if (ret < 0) {
185117a71e58SKevin Wolf return ret;
1852ecdd5333SKevin Wolf } else if (cur_bytes == 0) {
1853ecdd5333SKevin Wolf break;
185417a71e58SKevin Wolf } else {
185517a71e58SKevin Wolf /* handle_dependencies() may have decreased cur_bytes (shortened
185617a71e58SKevin Wolf * the allocations below) so that the next dependency is processed
185717a71e58SKevin Wolf * correctly during the next loop iteration. */
185817a71e58SKevin Wolf }
185917a71e58SKevin Wolf
186072424114SKevin Wolf /*
18610af729ecSKevin Wolf * 2. Count contiguous COPIED clusters.
186272424114SKevin Wolf */
1863710c2496SKevin Wolf ret = handle_copied(bs, start, &cluster_offset, &cur_bytes, m);
186472424114SKevin Wolf if (ret < 0) {
186572424114SKevin Wolf return ret;
18660af729ecSKevin Wolf } else if (ret) {
1867ecdd5333SKevin Wolf continue;
1868e62daaf6SKevin Wolf } else if (cur_bytes == 0) {
18692c3b32d2SKevin Wolf break;
187072424114SKevin Wolf }
187172424114SKevin Wolf
18720af729ecSKevin Wolf /*
18730af729ecSKevin Wolf * 3. If the request still hasn't completed, allocate new clusters,
18740af729ecSKevin Wolf * considering any cluster_offset of steps 1c or 2.
18750af729ecSKevin Wolf */
1876710c2496SKevin Wolf ret = handle_alloc(bs, start, &cluster_offset, &cur_bytes, m);
1877037689d8SKevin Wolf if (ret < 0) {
1878037689d8SKevin Wolf return ret;
1879710c2496SKevin Wolf } else if (ret) {
1880ecdd5333SKevin Wolf continue;
18812c3b32d2SKevin Wolf } else {
18822c3b32d2SKevin Wolf assert(cur_bytes == 0);
18832c3b32d2SKevin Wolf break;
18842c3b32d2SKevin Wolf }
1885710c2496SKevin Wolf }
1886250196f1SKevin Wolf
1887d46a0bb2SKevin Wolf *bytes -= remaining;
1888d46a0bb2SKevin Wolf assert(*bytes > 0);
1889c6d619ccSKevin Wolf assert(*host_offset != INV_OFFSET);
1890bfd0989aSAlberto Garcia assert(offset_into_cluster(s, *host_offset) ==
1891bfd0989aSAlberto Garcia offset_into_cluster(s, offset));
189245aba42fSKevin Wolf
1893148da7eaSKevin Wolf return 0;
189445aba42fSKevin Wolf }
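/*
 * Typical caller flow (a hedged sketch, not the exact qcow2.c code):
 *
 *     ret = qcow2_alloc_host_offset(bs, offset, &bytes, &host_offset,
 *                                   &l2meta);
 *     if (ret == 0) {
 *         // write the guest data to host_offset, then commit or roll
 *         // back the allocations:
 *         for (m = l2meta; m != NULL; m = m->next) {
 *             qcow2_alloc_cluster_link_l2(bs, m);  // on I/O failure:
 *                                                  // qcow2_alloc_cluster_abort()
 *         }
 *     }
 */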
189545aba42fSKevin Wolf
18965ea929e3SKevin Wolf /*
18975ea929e3SKevin Wolf * This discards as many of the nb_clusters clusters as possible at once (i.e.
189821ab3addSAlberto Garcia * all clusters in the same L2 slice) and returns the number of discarded
18995ea929e3SKevin Wolf * clusters.
19005ea929e3SKevin Wolf */
19010bb79c97SKevin Wolf static int GRAPH_RDLOCK
19020bb79c97SKevin Wolf discard_in_l2_slice(BlockDriverState *bs, uint64_t offset, uint64_t nb_clusters,
190321ab3addSAlberto Garcia enum qcow2_discard_type type, bool full_discard)
19045ea929e3SKevin Wolf {
1905ff99129aSKevin Wolf BDRVQcow2State *s = bs->opaque;
190621ab3addSAlberto Garcia uint64_t *l2_slice;
19075ea929e3SKevin Wolf int l2_index;
19085ea929e3SKevin Wolf int ret;
19095ea929e3SKevin Wolf int i;
19105ea929e3SKevin Wolf
191121ab3addSAlberto Garcia ret = get_cluster_table(bs, offset, &l2_slice, &l2_index);
19125ea929e3SKevin Wolf if (ret < 0) {
19135ea929e3SKevin Wolf return ret;
19145ea929e3SKevin Wolf }
19155ea929e3SKevin Wolf
191621ab3addSAlberto Garcia /* Limit nb_clusters to one L2 slice */
191721ab3addSAlberto Garcia nb_clusters = MIN(nb_clusters, s->l2_slice_size - l2_index);
1918b6d36defSMax Reitz assert(nb_clusters <= INT_MAX);
19195ea929e3SKevin Wolf
19205ea929e3SKevin Wolf for (i = 0; i < nb_clusters; i++) {
1921a68cd703SAlberto Garcia uint64_t old_l2_entry = get_l2_entry(s, l2_slice, l2_index + i);
1922a68cd703SAlberto Garcia uint64_t old_l2_bitmap = get_l2_bitmap(s, l2_slice, l2_index + i);
1923a68cd703SAlberto Garcia uint64_t new_l2_entry = old_l2_entry;
1924a68cd703SAlberto Garcia uint64_t new_l2_bitmap = old_l2_bitmap;
1925a68cd703SAlberto Garcia QCow2ClusterType cluster_type =
1926a68cd703SAlberto Garcia qcow2_get_cluster_type(bs, old_l2_entry);
192742a2890aSJean-Louis Dupond bool keep_reference = (cluster_type != QCOW2_CLUSTER_COMPRESSED) &&
192842a2890aSJean-Louis Dupond !full_discard &&
192942a2890aSJean-Louis Dupond (s->discard_no_unref &&
193042a2890aSJean-Louis Dupond type == QCOW2_DISCARD_REQUEST);
1931a71835a0SKevin Wolf
1932a71835a0SKevin Wolf /*
1933a68cd703SAlberto Garcia * If full_discard is true, the cluster should not read back as zeroes,
1934a68cd703SAlberto Garcia * but rather fall through to the backing file.
1935a68cd703SAlberto Garcia *
1936808c4b6fSMax Reitz * If full_discard is false, make sure that a discarded area reads back
1937808c4b6fSMax Reitz * as zeroes for v3 images (we cannot do it for v2 without actually
1938808c4b6fSMax Reitz * writing a zero-filled buffer). We can skip the operation if the
1939808c4b6fSMax Reitz * cluster is already marked as zero, or if it's unallocated and we
1940808c4b6fSMax Reitz * don't have a backing file.
1941a71835a0SKevin Wolf *
1942237d78f8SEric Blake * TODO We might want to use bdrv_block_status(bs) here, but we're
1943a71835a0SKevin Wolf * holding s->lock, so that doesn't work today.
1944a71835a0SKevin Wolf */
1945a68cd703SAlberto Garcia if (full_discard) {
1946a68cd703SAlberto Garcia new_l2_entry = new_l2_bitmap = 0;
1947a68cd703SAlberto Garcia } else if (bs->backing || qcow2_cluster_is_allocated(cluster_type)) {
1948a68cd703SAlberto Garcia if (has_subclusters(s)) {
194942a2890aSJean-Louis Dupond if (keep_reference) {
195042a2890aSJean-Louis Dupond new_l2_entry = old_l2_entry;
195142a2890aSJean-Louis Dupond } else {
1952a68cd703SAlberto Garcia new_l2_entry = 0;
195342a2890aSJean-Louis Dupond }
1954a68cd703SAlberto Garcia new_l2_bitmap = QCOW_L2_BITMAP_ALL_ZEROES;
1955a68cd703SAlberto Garcia } else {
195642a2890aSJean-Louis Dupond if (s->qcow_version >= 3) {
195742a2890aSJean-Louis Dupond if (keep_reference) {
195842a2890aSJean-Louis Dupond new_l2_entry |= QCOW_OFLAG_ZERO;
195942a2890aSJean-Louis Dupond } else {
196042a2890aSJean-Louis Dupond new_l2_entry = QCOW_OFLAG_ZERO;
196142a2890aSJean-Louis Dupond }
196242a2890aSJean-Louis Dupond } else {
196342a2890aSJean-Louis Dupond new_l2_entry = 0;
196442a2890aSJean-Louis Dupond }
1965a71835a0SKevin Wolf }
1966808c4b6fSMax Reitz }
1967c883db0dSMax Reitz
1968a68cd703SAlberto Garcia if (old_l2_entry == new_l2_entry && old_l2_bitmap == new_l2_bitmap) {
1969a68cd703SAlberto Garcia continue;
19705ea929e3SKevin Wolf }
19715ea929e3SKevin Wolf
19725ea929e3SKevin Wolf /* First remove L2 entries */
197321ab3addSAlberto Garcia qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_slice);
1974a68cd703SAlberto Garcia set_l2_entry(s, l2_slice, l2_index + i, new_l2_entry);
1975a68cd703SAlberto Garcia if (has_subclusters(s)) {
1976a68cd703SAlberto Garcia set_l2_bitmap(s, l2_slice, l2_index + i, new_l2_bitmap);
1977a71835a0SKevin Wolf }
197842a2890aSJean-Louis Dupond if (!keep_reference) {
19795ea929e3SKevin Wolf /* Then decrease the refcount */
19803fec237fSAlberto Garcia qcow2_free_any_cluster(bs, old_l2_entry, type);
198142a2890aSJean-Louis Dupond } else if (s->discard_passthrough[type] &&
198242a2890aSJean-Louis Dupond (cluster_type == QCOW2_CLUSTER_NORMAL ||
198342a2890aSJean-Louis Dupond cluster_type == QCOW2_CLUSTER_ZERO_ALLOC)) {
198442a2890aSJean-Louis Dupond /* If we keep the reference, pass on the discard still */
198542a2890aSJean-Louis Dupond bdrv_pdiscard(s->data_file, old_l2_entry & L2E_OFFSET_MASK,
198642a2890aSJean-Louis Dupond s->cluster_size);
198742a2890aSJean-Louis Dupond }
19885ea929e3SKevin Wolf }
19895ea929e3SKevin Wolf
199021ab3addSAlberto Garcia qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
19915ea929e3SKevin Wolf
19925ea929e3SKevin Wolf return nb_clusters;
19935ea929e3SKevin Wolf }
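/*
 * Behaviour sketch for discard-no-unref (assuming a v3 image, as
 * implemented above): on a guest discard request the L2 entry keeps
 * its host cluster offset, i.e. the reference is not dropped; the
 * cluster is merely marked as reading back zeroes, and the discard is
 * still passed down to the data file so the host blocks can be
 * unmapped.
 */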
19945ea929e3SKevin Wolf
1995d2cb36afSEric Blake int qcow2_cluster_discard(BlockDriverState *bs, uint64_t offset,
1996d2cb36afSEric Blake uint64_t bytes, enum qcow2_discard_type type,
1997d2cb36afSEric Blake bool full_discard)
19985ea929e3SKevin Wolf {
1999ff99129aSKevin Wolf BDRVQcow2State *s = bs->opaque;
2000d2cb36afSEric Blake uint64_t end_offset = offset + bytes;
2001b6d36defSMax Reitz uint64_t nb_clusters;
2002d2cb36afSEric Blake int64_t cleared;
20035ea929e3SKevin Wolf int ret;
20045ea929e3SKevin Wolf
2005f10ee139SEric Blake /* Caller must pass aligned values, except at image end */
20060c1bd469SEric Blake assert(QEMU_IS_ALIGNED(offset, s->cluster_size));
2007f10ee139SEric Blake assert(QEMU_IS_ALIGNED(end_offset, s->cluster_size) ||
2008f10ee139SEric Blake end_offset == bs->total_sectors << BDRV_SECTOR_BITS);
20095ea929e3SKevin Wolf
2010d2cb36afSEric Blake nb_clusters = size_to_clusters(s, bytes);
20115ea929e3SKevin Wolf
20120b919faeSKevin Wolf s->cache_discards = true;
20130b919faeSKevin Wolf
201421ab3addSAlberto Garcia /* Each L2 slice is handled by its own loop iteration */
20155ea929e3SKevin Wolf while (nb_clusters > 0) {
201621ab3addSAlberto Garcia cleared = discard_in_l2_slice(bs, offset, nb_clusters, type,
2017d2cb36afSEric Blake full_discard);
2018d2cb36afSEric Blake if (cleared < 0) {
2019d2cb36afSEric Blake ret = cleared;
20200b919faeSKevin Wolf goto fail;
20215ea929e3SKevin Wolf }
20225ea929e3SKevin Wolf
2023d2cb36afSEric Blake nb_clusters -= cleared;
2024d2cb36afSEric Blake offset += (cleared * s->cluster_size);
20255ea929e3SKevin Wolf }
20265ea929e3SKevin Wolf
20270b919faeSKevin Wolf ret = 0;
20280b919faeSKevin Wolf fail:
20290b919faeSKevin Wolf s->cache_discards = false;
20300b919faeSKevin Wolf qcow2_process_discards(bs, ret);
20310b919faeSKevin Wolf
20320b919faeSKevin Wolf return ret;
20335ea929e3SKevin Wolf }
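/*
 * Example of the alignment contract above (made-up sizes, 64 KiB
 * clusters): qcow2_cluster_discard(bs, 128 KiB, 192 KiB, ...) is
 * valid; callers must crop sub-cluster fragments beforehand, and only
 * a request ending exactly at an unaligned image size may have an
 * unaligned end offset.
 */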
2034621f0589SKevin Wolf
2035621f0589SKevin Wolf /*
2036621f0589SKevin Wolf * This zeroes as many of the nb_clusters clusters as possible at once (i.e.
2037a9a9f8f0SAlberto Garcia * all clusters in the same L2 slice) and returns the number of zeroed
2038621f0589SKevin Wolf * clusters.
2039621f0589SKevin Wolf */
20400bb79c97SKevin Wolf static int coroutine_fn GRAPH_RDLOCK
204170bacc44SPaolo Bonzini zero_in_l2_slice(BlockDriverState *bs, uint64_t offset,
2042170f4b2eSFam Zheng uint64_t nb_clusters, int flags)
2043621f0589SKevin Wolf {
2044ff99129aSKevin Wolf BDRVQcow2State *s = bs->opaque;
2045a9a9f8f0SAlberto Garcia uint64_t *l2_slice;
2046621f0589SKevin Wolf int l2_index;
2047621f0589SKevin Wolf int ret;
2048621f0589SKevin Wolf int i;
2049621f0589SKevin Wolf
2050a9a9f8f0SAlberto Garcia ret = get_cluster_table(bs, offset, &l2_slice, &l2_index);
2051621f0589SKevin Wolf if (ret < 0) {
2052621f0589SKevin Wolf return ret;
2053621f0589SKevin Wolf }
2054621f0589SKevin Wolf
2055a9a9f8f0SAlberto Garcia /* Limit nb_clusters to one L2 slice */
2056a9a9f8f0SAlberto Garcia nb_clusters = MIN(nb_clusters, s->l2_slice_size - l2_index);
2057b6d36defSMax Reitz assert(nb_clusters <= INT_MAX);
2058621f0589SKevin Wolf
2059621f0589SKevin Wolf for (i = 0; i < nb_clusters; i++) {
2060205fa507SAlberto Garcia uint64_t old_l2_entry = get_l2_entry(s, l2_slice, l2_index + i);
2061205fa507SAlberto Garcia uint64_t old_l2_bitmap = get_l2_bitmap(s, l2_slice, l2_index + i);
2062205fa507SAlberto Garcia QCow2ClusterType type = qcow2_get_cluster_type(bs, old_l2_entry);
2063205fa507SAlberto Garcia bool unmap = (type == QCOW2_CLUSTER_COMPRESSED) ||
2064205fa507SAlberto Garcia ((flags & BDRV_REQ_MAY_UNMAP) && qcow2_cluster_is_allocated(type));
2065b2b10904SJean-Louis Dupond bool keep_reference =
2066b2b10904SJean-Louis Dupond (s->discard_no_unref && type != QCOW2_CLUSTER_COMPRESSED);
2067b2b10904SJean-Louis Dupond uint64_t new_l2_entry = old_l2_entry;
2068205fa507SAlberto Garcia uint64_t new_l2_bitmap = old_l2_bitmap;
2069621f0589SKevin Wolf
2070b2b10904SJean-Louis Dupond if (unmap && !keep_reference) {
2071b2b10904SJean-Louis Dupond new_l2_entry = 0;
2072b2b10904SJean-Louis Dupond }
2073b2b10904SJean-Louis Dupond
2074205fa507SAlberto Garcia if (has_subclusters(s)) {
2075205fa507SAlberto Garcia new_l2_bitmap = QCOW_L2_BITMAP_ALL_ZEROES;
2076205fa507SAlberto Garcia } else {
2077205fa507SAlberto Garcia new_l2_entry |= QCOW_OFLAG_ZERO;
2078205fa507SAlberto Garcia }
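
        /*
         * Concrete example (invented values): without subclusters, a
         * normal entry such as 0x8000000000050000 (QCOW_OFLAG_COPIED |
         * host offset 0x50000) either keeps its allocation and becomes
         * 0x8000000000050001 (QCOW_OFLAG_ZERO set), or, when unmapped
         * without keeping the reference, turns into the plain zero
         * entry 0x1.
         */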

        if (old_l2_entry == new_l2_entry && old_l2_bitmap == new_l2_bitmap) {
            continue;
        }

        /* First update L2 entries */
        qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_slice);
        set_l2_entry(s, l2_slice, l2_index + i, new_l2_entry);
        if (has_subclusters(s)) {
            set_l2_bitmap(s, l2_slice, l2_index + i, new_l2_bitmap);
        }

        if (unmap) {
            if (!keep_reference) {
                /* Then decrease the refcount */
                qcow2_free_any_cluster(bs, old_l2_entry, QCOW2_DISCARD_REQUEST);
            } else if (s->discard_passthrough[QCOW2_DISCARD_REQUEST] &&
                       (type == QCOW2_CLUSTER_NORMAL ||
                        type == QCOW2_CLUSTER_ZERO_ALLOC)) {
                /* If we keep the reference, pass on the discard still */
                bdrv_pdiscard(s->data_file, old_l2_entry & L2E_OFFSET_MASK,
                              s->cluster_size);
            }
        }
    }

    qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);

    return nb_clusters;
}

static int coroutine_fn GRAPH_RDLOCK
zero_l2_subclusters(BlockDriverState *bs, uint64_t offset,
                    unsigned nb_subclusters)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t *l2_slice;
    uint64_t old_l2_bitmap, l2_bitmap;
    int l2_index, ret, sc = offset_to_sc_index(s, offset);

    /* For full clusters use zero_in_l2_slice() instead */
    assert(nb_subclusters > 0 && nb_subclusters < s->subclusters_per_cluster);
    assert(sc + nb_subclusters <= s->subclusters_per_cluster);
    assert(offset_into_subcluster(s, offset) == 0);

    ret = get_cluster_table(bs, offset, &l2_slice, &l2_index);
    if (ret < 0) {
        return ret;
    }

    switch (qcow2_get_cluster_type(bs, get_l2_entry(s, l2_slice, l2_index))) {
    case QCOW2_CLUSTER_COMPRESSED:
        ret = -ENOTSUP; /* We cannot partially zeroize compressed clusters */
        goto out;
    case QCOW2_CLUSTER_NORMAL:
    case QCOW2_CLUSTER_UNALLOCATED:
        break;
    default:
        g_assert_not_reached();
    }

    old_l2_bitmap = l2_bitmap = get_l2_bitmap(s, l2_slice, l2_index);

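    /*
     * Illustrative example (invented numbers): with 32 subclusters per
     * cluster, sc = 4 and nb_subclusters = 4 give
     *
     *     QCOW_OFLAG_SUB_ZERO_RANGE(4, 8)  == 0x000000f000000000
     *     QCOW_OFLAG_SUB_ALLOC_RANGE(4, 8) == 0x00000000000000f0
     *
     * so the two updates below set "reads as zero" bits 36-39 and clear
     * "allocated" bits 4-7 of the subcluster bitmap.
     */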
    l2_bitmap |= QCOW_OFLAG_SUB_ZERO_RANGE(sc, sc + nb_subclusters);
    l2_bitmap &= ~QCOW_OFLAG_SUB_ALLOC_RANGE(sc, sc + nb_subclusters);

    if (old_l2_bitmap != l2_bitmap) {
        set_l2_bitmap(s, l2_slice, l2_index, l2_bitmap);
        qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_slice);
    }

    ret = 0;
out:
    qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);

    return ret;
}

int coroutine_fn qcow2_subcluster_zeroize(BlockDriverState *bs, uint64_t offset,
                                          uint64_t bytes, int flags)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t end_offset = offset + bytes;
    uint64_t nb_clusters;
    unsigned head, tail;
    int64_t cleared;
    int ret;

    /* If we have to stay in sync with an external data file, zero out
     * s->data_file first. */
    if (data_file_is_raw(bs)) {
        assert(has_data_file(bs));
        ret = bdrv_co_pwrite_zeroes(s->data_file, offset, bytes, flags);
        if (ret < 0) {
            return ret;
        }
    }
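
    /*
     * Illustrative example: the path above is taken for images whose
     * external data file must stay valid as a standalone raw image,
     * e.g. one created with (command line invented for the example):
     *
     *     qemu-img create -f qcow2 \
     *         -o data_file=data.raw,data_file_raw=on image.qcow2 1G
     */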

    /* Caller must pass aligned values, except at image end */
    assert(offset_into_subcluster(s, offset) == 0);
    assert(offset_into_subcluster(s, end_offset) == 0 ||
           end_offset >= bs->total_sectors << BDRV_SECTOR_BITS);

    /*
     * The zero flag is only supported by version 3 and newer. However, if we
     * have no backing file, we can resort to discard in version 2.
     */
    if (s->qcow_version < 3) {
        if (!bs->backing) {
            return qcow2_cluster_discard(bs, offset, bytes,
                                         QCOW2_DISCARD_REQUEST, false);
        }
        return -ENOTSUP;
    }

    head = MIN(end_offset, ROUND_UP(offset, s->cluster_size)) - offset;
    offset += head;

    tail = (end_offset >= bs->total_sectors << BDRV_SECTOR_BITS) ? 0 :
        end_offset - MAX(offset, start_of_cluster(s, end_offset));
    end_offset -= tail;
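
    /*
     * Worked example (invented numbers): with 64 KiB clusters and 2 KiB
     * subclusters, offset = 0xf800 and bytes = 0x2000 give end_offset =
     * 0x11800.  head is then 0x10000 - 0xf800 = 0x800 (one subcluster
     * before the cluster boundary) and tail is 0x11800 - 0x10000 =
     * 0x1800 (three subclusters after it), so no whole cluster remains
     * and both ends are handled by zero_l2_subclusters() below.
     */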

    s->cache_discards = true;

    if (head) {
        ret = zero_l2_subclusters(bs, offset - head,
                                  size_to_subclusters(s, head));
        if (ret < 0) {
            goto fail;
        }
    }

    /* Each L2 slice is handled by its own loop iteration */
    nb_clusters = size_to_clusters(s, end_offset - offset);

    while (nb_clusters > 0) {
        cleared = zero_in_l2_slice(bs, offset, nb_clusters, flags);
        if (cleared < 0) {
            ret = cleared;
            goto fail;
        }

        nb_clusters -= cleared;
        offset += (cleared * s->cluster_size);
    }

    if (tail) {
        ret = zero_l2_subclusters(bs, end_offset, size_to_subclusters(s, tail));
        if (ret < 0) {
            goto fail;
        }
    }

    ret = 0;
fail:
    s->cache_discards = false;
    qcow2_process_discards(bs, ret);

    return ret;
}
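
/*
 * Note: in the driver this is reached from the write-zeroes path (see
 * qcow2_co_pwrite_zeroes() in block/qcow2.c), which only passes down
 * requests it can handle at subcluster granularity and otherwise lets
 * the generic block layer fall back to explicit zero writes.
 */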

/*
 * Expands all zero clusters in a specific L1 table (or deallocates them, for
 * non-backed non-pre-allocated zero clusters).
 *
 * l1_entries and *visited_l1_entries are used to keep track of progress for
 * status_cb(). l1_entries contains the total number of L1 entries and
 * *visited_l1_entries counts all visited L1 entries.
 */
static int GRAPH_RDLOCK
expand_zero_clusters_in_l1(BlockDriverState *bs, uint64_t *l1_table,
                           int l1_size, int64_t *visited_l1_entries,
                           int64_t l1_entries,
                           BlockDriverAmendStatusCB *status_cb,
                           void *cb_opaque)
{
    BDRVQcow2State *s = bs->opaque;
    bool is_active_l1 = (l1_table == s->l1_table);
    uint64_t *l2_slice = NULL;
    unsigned slice, slice_size2, n_slices;
    int ret;
    int i, j;

    /* qcow2_downgrade() is not allowed in images with subclusters */
    assert(!has_subclusters(s));

    slice_size2 = s->l2_slice_size * l2_entry_size(s);
    n_slices = s->cluster_size / slice_size2;

    if (!is_active_l1) {
        /* inactive L2 tables require a buffer to load them into when
         * reading them from disk */
        l2_slice = qemu_try_blockalign(bs->file->bs, slice_size2);
        if (l2_slice == NULL) {
            return -ENOMEM;
        }
    }

    for (i = 0; i < l1_size; i++) {
        uint64_t l2_offset = l1_table[i] & L1E_OFFSET_MASK;
        uint64_t l2_refcount;

        if (!l2_offset) {
            /* unallocated */
            (*visited_l1_entries)++;
            if (status_cb) {
                status_cb(bs, *visited_l1_entries, l1_entries, cb_opaque);
            }
            continue;
        }

        if (offset_into_cluster(s, l2_offset)) {
            qcow2_signal_corruption(bs, true, -1, -1, "L2 table offset %#"
                                    PRIx64 " unaligned (L1 index: %#x)",
                                    l2_offset, i);
            ret = -EIO;
            goto fail;
        }

        ret = qcow2_get_refcount(bs, l2_offset >> s->cluster_bits,
                                 &l2_refcount);
        if (ret < 0) {
            goto fail;
        }

        for (slice = 0; slice < n_slices; slice++) {
            uint64_t slice_offset = l2_offset + slice * slice_size2;
            bool l2_dirty = false;
            if (is_active_l1) {
                /* get active L2 tables from cache */
                ret = qcow2_cache_get(bs, s->l2_table_cache, slice_offset,
                                      (void **)&l2_slice);
            } else {
                /* load inactive L2 tables from disk */
                ret = bdrv_pread(bs->file, slice_offset, slice_size2,
                                 l2_slice, 0);
            }
            if (ret < 0) {
                goto fail;
            }

            for (j = 0; j < s->l2_slice_size; j++) {
                uint64_t l2_entry = get_l2_entry(s, l2_slice, j);
                int64_t offset = l2_entry & L2E_OFFSET_MASK;
                QCow2ClusterType cluster_type =
                    qcow2_get_cluster_type(bs, l2_entry);

                if (cluster_type != QCOW2_CLUSTER_ZERO_PLAIN &&
                    cluster_type != QCOW2_CLUSTER_ZERO_ALLOC) {
                    continue;
                }

                if (cluster_type == QCOW2_CLUSTER_ZERO_PLAIN) {
                    if (!bs->backing) {
                        /*
                         * not backed; therefore we can simply deallocate the
                         * cluster. No need to call set_l2_bitmap(), this
                         * function doesn't support images with subclusters.
                         */
                        set_l2_entry(s, l2_slice, j, 0);
                        l2_dirty = true;
                        continue;
                    }

                    offset = qcow2_alloc_clusters(bs, s->cluster_size);
                    if (offset < 0) {
                        ret = offset;
                        goto fail;
                    }

                    /* The offset must fit in the offset field */
                    assert((offset & L2E_OFFSET_MASK) == offset);

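                    /*
                     * Illustrative example: if this L2 table is shared by
                     * three snapshots, l2_refcount is 3 and the refcount
                     * of the cluster allocated above is raised from 1 to
                     * 3 by the update below.
                     */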
                    if (l2_refcount > 1) {
                        /* For shared L2 tables, set the refcount accordingly
                         * (it is already 1 and needs to be l2_refcount) */
                        ret = qcow2_update_cluster_refcount(
                            bs, offset >> s->cluster_bits,
                            refcount_diff(1, l2_refcount), false,
                            QCOW2_DISCARD_OTHER);
                        if (ret < 0) {
                            qcow2_free_clusters(bs, offset, s->cluster_size,
                                                QCOW2_DISCARD_OTHER);
                            goto fail;
                        }
                    }
                }

                if (offset_into_cluster(s, offset)) {
                    int l2_index = slice * s->l2_slice_size + j;
                    qcow2_signal_corruption(
                        bs, true, -1, -1,
                        "Cluster allocation offset "
                        "%#" PRIx64 " unaligned (L2 offset: %#"
                        PRIx64 ", L2 index: %#x)", offset,
                        l2_offset, l2_index);
                    if (cluster_type == QCOW2_CLUSTER_ZERO_PLAIN) {
                        qcow2_free_clusters(bs, offset, s->cluster_size,
                                            QCOW2_DISCARD_ALWAYS);
                    }
                    ret = -EIO;
                    goto fail;
                }

                ret = qcow2_pre_write_overlap_check(bs, 0, offset,
                                                    s->cluster_size, true);
                if (ret < 0) {
                    if (cluster_type == QCOW2_CLUSTER_ZERO_PLAIN) {
                        qcow2_free_clusters(bs, offset, s->cluster_size,
                                            QCOW2_DISCARD_ALWAYS);
                    }
                    goto fail;
                }

                ret = bdrv_pwrite_zeroes(s->data_file, offset,
                                         s->cluster_size, 0);
                if (ret < 0) {
                    if (cluster_type == QCOW2_CLUSTER_ZERO_PLAIN) {
                        qcow2_free_clusters(bs, offset, s->cluster_size,
                                            QCOW2_DISCARD_ALWAYS);
                    }
                    goto fail;
                }

                if (l2_refcount == 1) {
                    set_l2_entry(s, l2_slice, j, offset | QCOW_OFLAG_COPIED);
                } else {
                    set_l2_entry(s, l2_slice, j, offset);
                }
                /*
                 * No need to call set_l2_bitmap() after set_l2_entry() because
                 * this function doesn't support images with subclusters.
                 */
                l2_dirty = true;
            }

            if (is_active_l1) {
                if (l2_dirty) {
                    qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_slice);
                    qcow2_cache_depends_on_flush(s->l2_table_cache);
                }
                qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
            } else {
                if (l2_dirty) {
                    ret = qcow2_pre_write_overlap_check(
                        bs, QCOW2_OL_INACTIVE_L2 | QCOW2_OL_ACTIVE_L2,
                        slice_offset, slice_size2, false);
                    if (ret < 0) {
                        goto fail;
                    }

                    ret = bdrv_pwrite(bs->file, slice_offset, slice_size2,
                                      l2_slice, 0);
                    if (ret < 0) {
                        goto fail;
                    }
                }
            }
        }

        (*visited_l1_entries)++;
        if (status_cb) {
            status_cb(bs, *visited_l1_entries, l1_entries, cb_opaque);
        }
    }

    ret = 0;

fail:
    if (l2_slice) {
        if (!is_active_l1) {
            qemu_vfree(l2_slice);
        } else {
            qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
        }
    }
    return ret;
}
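
/*
 * Illustrative progress callback sketch (not part of the driver): a
 * BlockDriverAmendStatusCB receives the number of visited L1 entries
 * and the total, so a caller could report progress like this (names
 * invented for the example):
 *
 *     static void example_status_cb(BlockDriverState *bs, int64_t visited,
 *                                   int64_t total, void *opaque)
 *     {
 *         fprintf(stderr, "expanded %" PRId64 "/%" PRId64 " L1 entries\n",
 *                 visited, total);
 *     }
 */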

/*
 * For backed images, expands all zero clusters on the image. For non-backed
 * images, deallocates all non-pre-allocated zero clusters (and claims the
 * allocation for pre-allocated ones). This is important for downgrading to a
 * qcow2 version which doesn't yet support metadata zero clusters.
 */
int qcow2_expand_zero_clusters(BlockDriverState *bs,
                               BlockDriverAmendStatusCB *status_cb,
                               void *cb_opaque)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t *l1_table = NULL;
    int64_t l1_entries = 0, visited_l1_entries = 0;
    int ret;
    int i, j;

    if (status_cb) {
        l1_entries = s->l1_size;
        for (i = 0; i < s->nb_snapshots; i++) {
            l1_entries += s->snapshots[i].l1_size;
        }
    }

    ret = expand_zero_clusters_in_l1(bs, s->l1_table, s->l1_size,
                                     &visited_l1_entries, l1_entries,
                                     status_cb, cb_opaque);
    if (ret < 0) {
        goto fail;
    }

    /* Inactive L1 tables may point to active L2 tables - therefore it is
     * necessary to flush the L2 table cache before trying to access the L2
     * tables pointed to by inactive L1 entries (else we might try to expand
     * zero clusters that have already been expanded); furthermore, it is also
     * necessary to empty the L2 table cache, since it may contain tables which
     * are now going to be modified directly on disk, bypassing the cache.
     * qcow2_cache_empty() does both for us. */
    ret = qcow2_cache_empty(bs, s->l2_table_cache);
    if (ret < 0) {
        goto fail;
    }

    for (i = 0; i < s->nb_snapshots; i++) {
        int l1_size2;
        uint64_t *new_l1_table;
        Error *local_err = NULL;

        ret = qcow2_validate_table(bs, s->snapshots[i].l1_table_offset,
                                   s->snapshots[i].l1_size, L1E_SIZE,
                                   QCOW_MAX_L1_SIZE, "Snapshot L1 table",
                                   &local_err);
        if (ret < 0) {
            error_report_err(local_err);
            goto fail;
        }

        l1_size2 = s->snapshots[i].l1_size * L1E_SIZE;
        new_l1_table = g_try_realloc(l1_table, l1_size2);

        if (!new_l1_table) {
            ret = -ENOMEM;
            goto fail;
        }

        l1_table = new_l1_table;

        ret = bdrv_pread(bs->file, s->snapshots[i].l1_table_offset, l1_size2,
                         l1_table, 0);
        if (ret < 0) {
            goto fail;
        }

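        /* Snapshot L1 tables are stored big-endian on disk, so convert
         * each entry to host byte order in place before use. */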
        for (j = 0; j < s->snapshots[i].l1_size; j++) {
            be64_to_cpus(&l1_table[j]);
        }

        ret = expand_zero_clusters_in_l1(bs, l1_table, s->snapshots[i].l1_size,
                                         &visited_l1_entries, l1_entries,
                                         status_cb, cb_opaque);
        if (ret < 0) {
            goto fail;
        }
    }

    ret = 0;

fail:
    g_free(l1_table);
    return ret;
}

void qcow2_parse_compressed_l2_entry(BlockDriverState *bs, uint64_t l2_entry,
                                     uint64_t *coffset, int *csize)
{
    BDRVQcow2State *s = bs->opaque;
    int nb_csectors;

    assert(qcow2_get_cluster_type(bs, l2_entry) == QCOW2_CLUSTER_COMPRESSED);

    *coffset = l2_entry & s->cluster_offset_mask;

    nb_csectors = ((l2_entry >> s->csize_shift) & s->csize_mask) + 1;
    *csize = nb_csectors * QCOW2_COMPRESSED_SECTOR_SIZE -
             (*coffset & (QCOW2_COMPRESSED_SECTOR_SIZE - 1));
}
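
/*
 * Worked example (invented numbers): with 64 KiB clusters (cluster_bits
 * = 16), csize_shift is 62 - (cluster_bits - 8) = 54 and csize_mask is
 * 0xff.  An entry whose compressed-size field holds 3 thus describes
 * nb_csectors = 4; if *coffset happens to be 256 bytes into a 512-byte
 * sector, *csize = 4 * 512 - 256 = 1792 bytes.
 */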