/*
 * Block driver for the QCOW version 2 format
 *
 * Copyright (c) 2004-2006 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "qemu/osdep.h"
#include <zlib.h>

#include "qapi/error.h"
#include "qemu-common.h"
#include "block/block_int.h"
#include "block/qcow2.h"
#include "qemu/bswap.h"
#include "trace.h"

int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size,
                        bool exact_size)
{
    BDRVQcow2State *s = bs->opaque;
    int new_l1_size2, ret, i;
    uint64_t *new_l1_table;
    int64_t old_l1_table_offset, old_l1_size;
    int64_t new_l1_table_offset, new_l1_size;
    uint8_t data[12];

    if (min_size <= s->l1_size)
        return 0;

    /* Do a sanity check on min_size before trying to calculate new_l1_size
     * (this prevents overflows during the while loop for the calculation of
     * new_l1_size) */
    if (min_size > INT_MAX / sizeof(uint64_t)) {
        return -EFBIG;
    }

    if (exact_size) {
        new_l1_size = min_size;
    } else {
        /* Bump size up to reduce the number of times we have to grow */
        new_l1_size = s->l1_size;
        if (new_l1_size == 0) {
            new_l1_size = 1;
        }
        while (min_size > new_l1_size) {
            new_l1_size = (new_l1_size * 3 + 1) / 2;
        }
    }

    QEMU_BUILD_BUG_ON(QCOW_MAX_L1_SIZE > INT_MAX);
    if (new_l1_size > QCOW_MAX_L1_SIZE / sizeof(uint64_t)) {
        return -EFBIG;
    }

#ifdef DEBUG_ALLOC2
    fprintf(stderr, "grow l1_table from %d to %" PRId64 "\n",
            s->l1_size, new_l1_size);
#endif

    new_l1_size2 = sizeof(uint64_t) * new_l1_size;
    new_l1_table = qemu_try_blockalign(bs->file->bs,
                                       align_offset(new_l1_size2, 512));
    if (new_l1_table == NULL) {
        return -ENOMEM;
    }
    memset(new_l1_table, 0, align_offset(new_l1_size2, 512));

    if (s->l1_size) {
        memcpy(new_l1_table, s->l1_table, s->l1_size * sizeof(uint64_t));
    }

    /* write new table (align to cluster) */
    BLKDBG_EVENT(bs->file, BLKDBG_L1_GROW_ALLOC_TABLE);
    new_l1_table_offset = qcow2_alloc_clusters(bs, new_l1_size2);
    if (new_l1_table_offset < 0) {
        qemu_vfree(new_l1_table);
        return new_l1_table_offset;
    }

    ret = qcow2_cache_flush(bs, s->refcount_block_cache);
    if (ret < 0) {
        goto fail;
    }

    /* the L1 position has not yet been updated, so these clusters must
     * indeed be completely free */
    ret = qcow2_pre_write_overlap_check(bs, 0, new_l1_table_offset,
                                        new_l1_size2);
    if (ret < 0) {
        goto fail;
    }

    BLKDBG_EVENT(bs->file, BLKDBG_L1_GROW_WRITE_TABLE);
    for(i = 0; i < s->l1_size; i++)
        new_l1_table[i] = cpu_to_be64(new_l1_table[i]);
    ret = bdrv_pwrite_sync(bs->file, new_l1_table_offset,
                           new_l1_table, new_l1_size2);
    if (ret < 0)
        goto fail;
    for(i = 0; i < s->l1_size; i++)
        new_l1_table[i] = be64_to_cpu(new_l1_table[i]);

    /* set new table */
    BLKDBG_EVENT(bs->file, BLKDBG_L1_GROW_ACTIVATE_TABLE);
    stl_be_p(data, new_l1_size);
    stq_be_p(data + 4, new_l1_table_offset);
    ret = bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, l1_size),
                           data, sizeof(data));
    if (ret < 0) {
        goto fail;
    }
    qemu_vfree(s->l1_table);
    old_l1_table_offset = s->l1_table_offset;
    s->l1_table_offset = new_l1_table_offset;
    s->l1_table = new_l1_table;
    old_l1_size = s->l1_size;
    s->l1_size = new_l1_size;
    qcow2_free_clusters(bs, old_l1_table_offset, old_l1_size * sizeof(uint64_t),
                        QCOW2_DISCARD_OTHER);
    return 0;
 fail:
    qemu_vfree(new_l1_table);
    qcow2_free_clusters(bs, new_l1_table_offset, new_l1_size2,
                        QCOW2_DISCARD_OTHER);
    return ret;
}
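
/*
 * Worked example (for illustration only, derived from the code above): with
 * exact_size == false the loop grows the L1 size by roughly 1.5x per
 * iteration, so starting from an empty table it goes 1, 2, 3, 5, 8, 12, 18,
 * 27, ... until min_size is covered.  The final 12-byte header update is the
 * big-endian 32-bit l1_size immediately followed by the big-endian 64-bit
 * l1_table_offset, i.e. it relies on l1_table_offset directly following
 * l1_size in QCowHeader.
 */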

/*
 * l2_load
 *
 * Loads a L2 table into memory. If the table is in the cache, the cache
 * is used; otherwise the L2 table is loaded from the image file.
 *
 * Returns 0 on success, -errno on failure. On success, *l2_table points
 * to the table (owned by the L2 table cache).
 */

static int l2_load(BlockDriverState *bs, uint64_t l2_offset,
                   uint64_t **l2_table)
{
    BDRVQcow2State *s = bs->opaque;

    return qcow2_cache_get(bs, s->l2_table_cache, l2_offset,
                           (void **)l2_table);
}

/*
 * Writes one sector of the L1 table to the disk (can't update single entries
 * and we really don't want bdrv_pread to perform a read-modify-write)
 */
#define L1_ENTRIES_PER_SECTOR (512 / 8)
int qcow2_write_l1_entry(BlockDriverState *bs, int l1_index)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t buf[L1_ENTRIES_PER_SECTOR] = { 0 };
    int l1_start_index;
    int i, ret;

    l1_start_index = l1_index & ~(L1_ENTRIES_PER_SECTOR - 1);
    for (i = 0; i < L1_ENTRIES_PER_SECTOR && l1_start_index + i < s->l1_size;
         i++)
    {
        buf[i] = cpu_to_be64(s->l1_table[l1_start_index + i]);
    }

    ret = qcow2_pre_write_overlap_check(bs, QCOW2_OL_ACTIVE_L1,
            s->l1_table_offset + 8 * l1_start_index, sizeof(buf));
    if (ret < 0) {
        return ret;
    }

    BLKDBG_EVENT(bs->file, BLKDBG_L1_UPDATE);
    ret = bdrv_pwrite_sync(bs->file,
                           s->l1_table_offset + 8 * l1_start_index,
                           buf, sizeof(buf));
    if (ret < 0) {
        return ret;
    }

    return 0;
}
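
/*
 * For example, with L1_ENTRIES_PER_SECTOR == 64, updating l1_index 70 masks
 * down to l1_start_index 64 and rewrites the whole 512-byte sector covering
 * entries 64..127 (clamped to l1_size), converting each entry to big-endian
 * on the way out.
 */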

/*
 * l2_allocate
 *
 * Allocate a new l2 entry in the file. If l1_index points to an already
 * used entry in the L1 table (i.e. we are doing a copy on write for the L2
 * table) copy the contents of the old L2 table into the newly allocated one.
 * Otherwise the new table is initialized with zeros.
 */

static int l2_allocate(BlockDriverState *bs, int l1_index, uint64_t **table)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t old_l2_offset;
    uint64_t *l2_table = NULL;
    int64_t l2_offset;
    int ret;

    old_l2_offset = s->l1_table[l1_index];

    trace_qcow2_l2_allocate(bs, l1_index);

    /* allocate a new l2 entry */

    l2_offset = qcow2_alloc_clusters(bs, s->l2_size * sizeof(uint64_t));
    if (l2_offset < 0) {
        ret = l2_offset;
        goto fail;
    }

    ret = qcow2_cache_flush(bs, s->refcount_block_cache);
    if (ret < 0) {
        goto fail;
    }

    /* allocate a new entry in the l2 cache */

    trace_qcow2_l2_allocate_get_empty(bs, l1_index);
    ret = qcow2_cache_get_empty(bs, s->l2_table_cache, l2_offset, (void**) table);
    if (ret < 0) {
        goto fail;
    }

    l2_table = *table;

    if ((old_l2_offset & L1E_OFFSET_MASK) == 0) {
        /* if there was no old l2 table, clear the new table */
        memset(l2_table, 0, s->l2_size * sizeof(uint64_t));
    } else {
        uint64_t* old_table;

        /* if there was an old l2 table, read it from the disk */
        BLKDBG_EVENT(bs->file, BLKDBG_L2_ALLOC_COW_READ);
        ret = qcow2_cache_get(bs, s->l2_table_cache,
                              old_l2_offset & L1E_OFFSET_MASK,
                              (void**) &old_table);
        if (ret < 0) {
            goto fail;
        }

        memcpy(l2_table, old_table, s->cluster_size);

        qcow2_cache_put(bs, s->l2_table_cache, (void **) &old_table);
    }

    /* write the l2 table to the file */
    BLKDBG_EVENT(bs->file, BLKDBG_L2_ALLOC_WRITE);

    trace_qcow2_l2_allocate_write_l2(bs, l1_index);
    qcow2_cache_entry_mark_dirty(bs, s->l2_table_cache, l2_table);
    ret = qcow2_cache_flush(bs, s->l2_table_cache);
    if (ret < 0) {
        goto fail;
    }

    /* update the L1 entry */
    trace_qcow2_l2_allocate_write_l1(bs, l1_index);
    s->l1_table[l1_index] = l2_offset | QCOW_OFLAG_COPIED;
    ret = qcow2_write_l1_entry(bs, l1_index);
    if (ret < 0) {
        goto fail;
    }

    *table = l2_table;
    trace_qcow2_l2_allocate_done(bs, l1_index, 0);
    return 0;

fail:
    trace_qcow2_l2_allocate_done(bs, l1_index, ret);
    if (l2_table != NULL) {
        qcow2_cache_put(bs, s->l2_table_cache, (void**) table);
    }
    s->l1_table[l1_index] = old_l2_offset;
    if (l2_offset > 0) {
        qcow2_free_clusters(bs, l2_offset, s->l2_size * sizeof(uint64_t),
                            QCOW2_DISCARD_ALWAYS);
    }
    return ret;
}

/*
 * Checks how many clusters in a given L2 table are contiguous in the image
 * file. As soon as one of the flags in the bitmask stop_flags changes compared
 * to the first cluster, the search is stopped and the cluster is not counted
 * as contiguous. (This allows it, for example, to stop at the first compressed
 * cluster, which may require different handling.)
 */
static int count_contiguous_clusters(int nb_clusters, int cluster_size,
        uint64_t *l2_table, uint64_t stop_flags)
{
    int i;
    QCow2ClusterType first_cluster_type;
    uint64_t mask = stop_flags | L2E_OFFSET_MASK | QCOW_OFLAG_COMPRESSED;
    uint64_t first_entry = be64_to_cpu(l2_table[0]);
    uint64_t offset = first_entry & mask;

    if (!offset) {
        return 0;
    }

    /* must be allocated */
    first_cluster_type = qcow2_get_cluster_type(first_entry);
    assert(first_cluster_type == QCOW2_CLUSTER_NORMAL ||
           first_cluster_type == QCOW2_CLUSTER_ZERO_ALLOC);

    for (i = 0; i < nb_clusters; i++) {
        uint64_t l2_entry = be64_to_cpu(l2_table[i]) & mask;
        if (offset + (uint64_t) i * cluster_size != l2_entry) {
            break;
        }
    }

    return i;
}
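
/*
 * Worked example (assuming 64 KiB clusters): if l2_table[0] refers to host
 * offset 0x50000 and stop_flags is QCOW_OFLAG_ZERO, the run continues while
 * entry i satisfies (entry & mask) == 0x50000 + i * 0x10000.  A compressed
 * entry, an unallocated hole, or a change of the zero flag therefore ends
 * the run, because those bits are part of the mask.
 */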

/*
 * Checks how many consecutive unallocated clusters in a given L2
 * table have the same cluster type.
 */
static int count_contiguous_clusters_unallocated(int nb_clusters,
                                                 uint64_t *l2_table,
                                                 QCow2ClusterType wanted_type)
{
    int i;

    assert(wanted_type == QCOW2_CLUSTER_ZERO_PLAIN ||
           wanted_type == QCOW2_CLUSTER_UNALLOCATED);
    for (i = 0; i < nb_clusters; i++) {
        uint64_t entry = be64_to_cpu(l2_table[i]);
        QCow2ClusterType type = qcow2_get_cluster_type(entry);

        if (type != wanted_type) {
            break;
        }
    }

    return i;
}

/* The crypt function is compatible with the linux cryptoloop
   algorithm for < 4 GB images. NOTE: out_buf == in_buf is
   supported */
int qcow2_encrypt_sectors(BDRVQcow2State *s, int64_t sector_num,
                          uint8_t *out_buf, const uint8_t *in_buf,
                          int nb_sectors, bool enc,
                          Error **errp)
{
    union {
        uint64_t ll[2];
        uint8_t b[16];
    } ivec;
    int i;
    int ret;

    for(i = 0; i < nb_sectors; i++) {
        ivec.ll[0] = cpu_to_le64(sector_num);
        ivec.ll[1] = 0;
        if (qcrypto_cipher_setiv(s->cipher,
                                 ivec.b, G_N_ELEMENTS(ivec.b),
                                 errp) < 0) {
            return -1;
        }
        if (enc) {
            ret = qcrypto_cipher_encrypt(s->cipher,
                                         in_buf,
                                         out_buf,
                                         512,
                                         errp);
        } else {
            ret = qcrypto_cipher_decrypt(s->cipher,
                                         in_buf,
                                         out_buf,
                                         512,
                                         errp);
        }
        if (ret < 0) {
            return -1;
        }
        sector_num++;
        in_buf += 512;
        out_buf += 512;
    }
    return 0;
}
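
/*
 * For example, sector_num == 5 yields the 16-byte IV 05 00 00 00 00 00 00 00
 * 00 00 00 00 00 00 00 00 (the little-endian 64-bit sector number followed
 * by eight zero bytes); every 512-byte sector is then en/decrypted with its
 * own IV, which is what makes the scheme cryptoloop-compatible.
 */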

static int coroutine_fn do_perform_cow(BlockDriverState *bs,
                                       uint64_t src_cluster_offset,
                                       uint64_t cluster_offset,
                                       int offset_in_cluster,
                                       int bytes)
{
    BDRVQcow2State *s = bs->opaque;
    QEMUIOVector qiov;
    struct iovec iov;
    int ret;

    iov.iov_len = bytes;
    iov.iov_base = qemu_try_blockalign(bs, iov.iov_len);
    if (iov.iov_base == NULL) {
        return -ENOMEM;
    }

    qemu_iovec_init_external(&qiov, &iov, 1);

    BLKDBG_EVENT(bs->file, BLKDBG_COW_READ);

    if (!bs->drv) {
        ret = -ENOMEDIUM;
        goto out;
    }

    /* Call .bdrv_co_preadv() directly instead of using the public block-layer
     * interface.  This avoids double I/O throttling and request tracking,
     * which can lead to deadlock when block layer copy-on-read is enabled.
     */
    ret = bs->drv->bdrv_co_preadv(bs, src_cluster_offset + offset_in_cluster,
                                  bytes, &qiov, 0);
    if (ret < 0) {
        goto out;
    }

    if (bs->encrypted) {
        Error *err = NULL;
        int64_t sector = (src_cluster_offset + offset_in_cluster)
                         >> BDRV_SECTOR_BITS;
        assert(s->cipher);
        assert((offset_in_cluster & ~BDRV_SECTOR_MASK) == 0);
        assert((bytes & ~BDRV_SECTOR_MASK) == 0);
        if (qcow2_encrypt_sectors(s, sector, iov.iov_base, iov.iov_base,
                                  bytes >> BDRV_SECTOR_BITS, true, &err) < 0) {
            ret = -EIO;
            error_free(err);
            goto out;
        }
    }

    ret = qcow2_pre_write_overlap_check(bs, 0,
            cluster_offset + offset_in_cluster, bytes);
    if (ret < 0) {
        goto out;
    }

    BLKDBG_EVENT(bs->file, BLKDBG_COW_WRITE);
    ret = bdrv_co_pwritev(bs->file, cluster_offset + offset_in_cluster,
                          bytes, &qiov, 0);
    if (ret < 0) {
        goto out;
    }

    ret = 0;
out:
    qemu_vfree(iov.iov_base);
    return ret;
}


/*
 * get_cluster_offset
 *
 * For a given offset of the virtual disk, find the cluster type and offset in
 * the qcow2 file. The offset is stored in *cluster_offset.
 *
 * On entry, *bytes is the maximum number of contiguous bytes starting at
 * offset that we are interested in.
 *
 * On exit, *bytes is the number of bytes starting at offset that have the same
 * cluster type and (if applicable) are stored contiguously in the image file.
 * Compressed clusters are always returned one by one.
 *
 * Returns the cluster type (QCOW2_CLUSTER_*) on success, -errno in error
 * cases.
 */
int qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset,
                             unsigned int *bytes, uint64_t *cluster_offset)
{
    BDRVQcow2State *s = bs->opaque;
    unsigned int l2_index;
    uint64_t l1_index, l2_offset, *l2_table;
    int l1_bits, c;
    unsigned int offset_in_cluster;
    uint64_t bytes_available, bytes_needed, nb_clusters;
    QCow2ClusterType type;
    int ret;

    offset_in_cluster = offset_into_cluster(s, offset);
    bytes_needed = (uint64_t) *bytes + offset_in_cluster;

    l1_bits = s->l2_bits + s->cluster_bits;

    /* compute how many bytes there are between the start of the cluster
     * containing offset and the end of the l1 entry */
    bytes_available = (1ULL << l1_bits) - (offset & ((1ULL << l1_bits) - 1))
                    + offset_in_cluster;

    if (bytes_needed > bytes_available) {
        bytes_needed = bytes_available;
    }

    *cluster_offset = 0;

    /* seek to the l2 offset in the l1 table */

    l1_index = offset >> l1_bits;
    if (l1_index >= s->l1_size) {
        type = QCOW2_CLUSTER_UNALLOCATED;
        goto out;
    }

    l2_offset = s->l1_table[l1_index] & L1E_OFFSET_MASK;
    if (!l2_offset) {
        type = QCOW2_CLUSTER_UNALLOCATED;
        goto out;
    }

    if (offset_into_cluster(s, l2_offset)) {
        qcow2_signal_corruption(bs, true, -1, -1, "L2 table offset %#" PRIx64
                                " unaligned (L1 index: %#" PRIx64 ")",
                                l2_offset, l1_index);
        return -EIO;
    }

    /* load the l2 table in memory */

    ret = l2_load(bs, l2_offset, &l2_table);
    if (ret < 0) {
        return ret;
    }

    /* find the cluster offset for the given disk offset */

    l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1);
    *cluster_offset = be64_to_cpu(l2_table[l2_index]);

    nb_clusters = size_to_clusters(s, bytes_needed);
    /* bytes_needed <= *bytes + offset_in_cluster, both of which are unsigned
     * integers; the minimum cluster size is 512, so this assertion is always
     * true */
    assert(nb_clusters <= INT_MAX);

    type = qcow2_get_cluster_type(*cluster_offset);
    if (s->qcow_version < 3 && (type == QCOW2_CLUSTER_ZERO_PLAIN ||
                                type == QCOW2_CLUSTER_ZERO_ALLOC)) {
        qcow2_signal_corruption(bs, true, -1, -1, "Zero cluster entry found"
                                " in pre-v3 image (L2 offset: %#" PRIx64
                                ", L2 index: %#x)", l2_offset, l2_index);
        ret = -EIO;
        goto fail;
    }
    switch (type) {
    case QCOW2_CLUSTER_COMPRESSED:
        /* Compressed clusters can only be processed one by one */
        c = 1;
        *cluster_offset &= L2E_COMPRESSED_OFFSET_SIZE_MASK;
        break;
    case QCOW2_CLUSTER_ZERO_PLAIN:
    case QCOW2_CLUSTER_UNALLOCATED:
        /* how many empty clusters ? */
        c = count_contiguous_clusters_unallocated(nb_clusters,
                                                  &l2_table[l2_index], type);
        *cluster_offset = 0;
        break;
    case QCOW2_CLUSTER_ZERO_ALLOC:
    case QCOW2_CLUSTER_NORMAL:
        /* how many allocated clusters ? */
        c = count_contiguous_clusters(nb_clusters, s->cluster_size,
                                      &l2_table[l2_index], QCOW_OFLAG_ZERO);
        *cluster_offset &= L2E_OFFSET_MASK;
        if (offset_into_cluster(s, *cluster_offset)) {
            qcow2_signal_corruption(bs, true, -1, -1,
                                    "Cluster allocation offset %#"
                                    PRIx64 " unaligned (L2 offset: %#" PRIx64
                                    ", L2 index: %#x)", *cluster_offset,
                                    l2_offset, l2_index);
            ret = -EIO;
            goto fail;
        }
        break;
    default:
        abort();
    }

    qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);

    bytes_available = (int64_t)c * s->cluster_size;

out:
    if (bytes_available > bytes_needed) {
        bytes_available = bytes_needed;
    }

    /* bytes_available <= bytes_needed <= *bytes + offset_in_cluster;
     * subtracting offset_in_cluster will therefore definitely yield something
     * not exceeding UINT_MAX */
    assert(bytes_available - offset_in_cluster <= UINT_MAX);
    *bytes = bytes_available - offset_in_cluster;

    return type;

fail:
    qcow2_cache_put(bs, s->l2_table_cache, (void **)&l2_table);
    return ret;
}
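
/*
 * Worked example of the index math above (assuming the default 64 KiB
 * clusters, i.e. cluster_bits == 16 and l2_bits == 13, so one L2 table holds
 * 8192 entries and each L1 entry covers 512 MiB): for the guest offset
 * 0x23456789, l1_index = 0x23456789 >> 29 = 1, l2_index =
 * (0x23456789 >> 16) & 0x1fff = 0x345, and offset_in_cluster = 0x6789.
 */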

/*
 * get_cluster_table
 *
 * for a given disk offset, load (and allocate if needed)
 * the l2 table.
 *
 * the l2 table offset in the qcow2 file and the cluster index
 * in the l2 table are given to the caller.
 *
 * Returns 0 on success, -errno in failure case
 */
static int get_cluster_table(BlockDriverState *bs, uint64_t offset,
                             uint64_t **new_l2_table,
                             int *new_l2_index)
{
    BDRVQcow2State *s = bs->opaque;
    unsigned int l2_index;
    uint64_t l1_index, l2_offset;
    uint64_t *l2_table = NULL;
    int ret;

    /* seek to the l2 offset in the l1 table */

    l1_index = offset >> (s->l2_bits + s->cluster_bits);
    if (l1_index >= s->l1_size) {
        ret = qcow2_grow_l1_table(bs, l1_index + 1, false);
        if (ret < 0) {
            return ret;
        }
    }

    assert(l1_index < s->l1_size);
    l2_offset = s->l1_table[l1_index] & L1E_OFFSET_MASK;
    if (offset_into_cluster(s, l2_offset)) {
        qcow2_signal_corruption(bs, true, -1, -1, "L2 table offset %#" PRIx64
                                " unaligned (L1 index: %#" PRIx64 ")",
                                l2_offset, l1_index);
        return -EIO;
    }

    /* seek the l2 table of the given l2 offset */

    if (s->l1_table[l1_index] & QCOW_OFLAG_COPIED) {
        /* load the l2 table in memory */
        ret = l2_load(bs, l2_offset, &l2_table);
        if (ret < 0) {
            return ret;
        }
    } else {
        /* First allocate a new L2 table (and do COW if needed) */
        ret = l2_allocate(bs, l1_index, &l2_table);
        if (ret < 0) {
            return ret;
        }

        /* Then decrease the refcount of the old table */
        if (l2_offset) {
            qcow2_free_clusters(bs, l2_offset, s->l2_size * sizeof(uint64_t),
                                QCOW2_DISCARD_OTHER);
        }
    }

    /* find the cluster offset for the given disk offset */

    l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1);

    *new_l2_table = l2_table;
    *new_l2_index = l2_index;

    return 0;
}

/*
 * alloc_compressed_cluster_offset
 *
 * For a given offset of the disk image, return cluster offset in
 * qcow2 file.
 *
 * If the offset is not found, allocate a new compressed cluster.
 *
 * Return the cluster offset if successful; return 0 otherwise.
 */

uint64_t qcow2_alloc_compressed_cluster_offset(BlockDriverState *bs,
                                               uint64_t offset,
                                               int compressed_size)
{
    BDRVQcow2State *s = bs->opaque;
    int l2_index, ret;
    uint64_t *l2_table;
    int64_t cluster_offset;
    int nb_csectors;

    ret = get_cluster_table(bs, offset, &l2_table, &l2_index);
    if (ret < 0) {
        return 0;
    }

    /* Compression can't overwrite anything. Fail if the cluster was already
     * allocated. */
    cluster_offset = be64_to_cpu(l2_table[l2_index]);
    if (cluster_offset & L2E_OFFSET_MASK) {
        qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
        return 0;
    }

    cluster_offset = qcow2_alloc_bytes(bs, compressed_size);
    if (cluster_offset < 0) {
        qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
        return 0;
    }

    nb_csectors = ((cluster_offset + compressed_size - 1) >> 9) -
                  (cluster_offset >> 9);

    cluster_offset |= QCOW_OFLAG_COMPRESSED |
                      ((uint64_t)nb_csectors << s->csize_shift);

    /* update L2 table */

    /* compressed clusters never have the copied flag */

    BLKDBG_EVENT(bs->file, BLKDBG_L2_UPDATE_COMPRESSED);
    qcow2_cache_entry_mark_dirty(bs, s->l2_table_cache, l2_table);
    l2_table[l2_index] = cpu_to_be64(cluster_offset);
    qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table);

    return cluster_offset;
}
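
/*
 * Example of the arithmetic above: if qcow2_alloc_bytes() returns host
 * offset 66048 (0x10200) and compressed_size is 1536, then nb_csectors =
 * ((66048 + 1535) >> 9) - (66048 >> 9) = 131 - 129 = 2, i.e. the distance in
 * 512-byte sectors between the first and the last host sector touched by the
 * compressed data; this count is stored in the L2 entry shifted by
 * csize_shift, together with the host offset and QCOW_OFLAG_COMPRESSED.
 */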

static int perform_cow(BlockDriverState *bs, QCowL2Meta *m, Qcow2COWRegion *r)
{
    BDRVQcow2State *s = bs->opaque;
    int ret;

    if (r->nb_bytes == 0) {
        return 0;
    }

    qemu_co_mutex_unlock(&s->lock);
    ret = do_perform_cow(bs, m->offset, m->alloc_offset, r->offset, r->nb_bytes);
    qemu_co_mutex_lock(&s->lock);

    if (ret < 0) {
        return ret;
    }

    /*
     * Before we update the L2 table to actually point to the new cluster, we
     * need to be sure that the refcounts have been increased and COW was
     * handled.
     */
    qcow2_cache_depends_on_flush(s->l2_table_cache);

    return 0;
}

int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m)
{
    BDRVQcow2State *s = bs->opaque;
    int i, j = 0, l2_index, ret;
    uint64_t *old_cluster, *l2_table;
    uint64_t cluster_offset = m->alloc_offset;

    trace_qcow2_cluster_link_l2(qemu_coroutine_self(), m->nb_clusters);
    assert(m->nb_clusters > 0);

    old_cluster = g_try_new(uint64_t, m->nb_clusters);
    if (old_cluster == NULL) {
        ret = -ENOMEM;
        goto err;
    }

    /* copy content of unmodified sectors */
    ret = perform_cow(bs, m, &m->cow_start);
    if (ret < 0) {
        goto err;
    }

    ret = perform_cow(bs, m, &m->cow_end);
    if (ret < 0) {
        goto err;
    }

    /* Update L2 table. */
    if (s->use_lazy_refcounts) {
        qcow2_mark_dirty(bs);
    }
    if (qcow2_need_accurate_refcounts(s)) {
        qcow2_cache_set_dependency(bs, s->l2_table_cache,
                                   s->refcount_block_cache);
    }

    ret = get_cluster_table(bs, m->offset, &l2_table, &l2_index);
    if (ret < 0) {
        goto err;
    }
    qcow2_cache_entry_mark_dirty(bs, s->l2_table_cache, l2_table);

    assert(l2_index + m->nb_clusters <= s->l2_size);
    for (i = 0; i < m->nb_clusters; i++) {
        /* If two concurrent writes happen to the same unallocated cluster,
         * each one allocates a separate cluster and writes its data
         * concurrently. The first one to complete updates the L2 table with a
         * pointer to its cluster; the second one has to do a RMW (which is
         * done above by perform_cow()), update the L2 table with its cluster
         * pointer, and free the old cluster. This is what this loop does. */
        if (l2_table[l2_index + i] != 0) {
            old_cluster[j++] = l2_table[l2_index + i];
        }

        l2_table[l2_index + i] = cpu_to_be64((cluster_offset +
                    (i << s->cluster_bits)) | QCOW_OFLAG_COPIED);
    }


    qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table);

    /*
     * If this was a COW, we need to decrease the refcount of the old cluster.
     *
     * Don't discard clusters that reach a refcount of 0 (e.g. compressed
     * clusters), the next write will reuse them anyway.
     */
    if (!m->keep_old_clusters && j != 0) {
        for (i = 0; i < j; i++) {
            qcow2_free_any_clusters(bs, be64_to_cpu(old_cluster[i]), 1,
                                    QCOW2_DISCARD_NEVER);
        }
    }

    ret = 0;
err:
    g_free(old_cluster);
    return ret;
}

/*
 * Returns the number of contiguous clusters that can be used for an allocating
 * write, but require COW to be performed (this includes yet unallocated space,
 * which must copy from the backing file)
 */
static int count_cow_clusters(BDRVQcow2State *s, int nb_clusters,
    uint64_t *l2_table, int l2_index)
{
    int i;

    for (i = 0; i < nb_clusters; i++) {
        uint64_t l2_entry = be64_to_cpu(l2_table[l2_index + i]);
        QCow2ClusterType cluster_type = qcow2_get_cluster_type(l2_entry);

        switch(cluster_type) {
        case QCOW2_CLUSTER_NORMAL:
            if (l2_entry & QCOW_OFLAG_COPIED) {
                goto out;
            }
            break;
        case QCOW2_CLUSTER_UNALLOCATED:
        case QCOW2_CLUSTER_COMPRESSED:
        case QCOW2_CLUSTER_ZERO_PLAIN:
        case QCOW2_CLUSTER_ZERO_ALLOC:
            break;
        default:
            abort();
        }
    }

out:
    assert(i <= nb_clusters);
    return i;
}

/*
 * Check if there already is an AIO write request in flight which allocates
 * the same cluster. In this case we need to wait until the previous
 * request has completed and updated the L2 table accordingly.
 *
 * Returns:
 *   0       if there was no dependency. *cur_bytes indicates the number of
 *           bytes from guest_offset that can be read before the next
 *           dependency must be processed (or the request is complete)
 *
 *   -EAGAIN if we had to wait for another request, previously gathered
 *           information on cluster allocation may be invalid now. The caller
 *           must start over anyway, so consider *cur_bytes undefined.
 */
static int handle_dependencies(BlockDriverState *bs, uint64_t guest_offset,
    uint64_t *cur_bytes, QCowL2Meta **m)
{
    BDRVQcow2State *s = bs->opaque;
    QCowL2Meta *old_alloc;
    uint64_t bytes = *cur_bytes;

    QLIST_FOREACH(old_alloc, &s->cluster_allocs, next_in_flight) {

        uint64_t start = guest_offset;
        uint64_t end = start + bytes;
        uint64_t old_start = l2meta_cow_start(old_alloc);
        uint64_t old_end = l2meta_cow_end(old_alloc);

        if (end <= old_start || start >= old_end) {
            /* No intersection */
        } else {
            if (start < old_start) {
                /* Stop at the start of a running allocation */
                bytes = old_start - start;
            } else {
                bytes = 0;
            }

            /* Stop if already an l2meta exists. After yielding, it wouldn't
             * be valid any more, so we'd have to clean up the old L2Metas
             * and deal with requests depending on them before starting to
             * gather new ones. Not worth the trouble. */
            if (bytes == 0 && *m) {
                *cur_bytes = 0;
                return 0;
            }

            if (bytes == 0) {
                /* Wait for the dependency to complete. We need to recheck
                 * the free/allocated clusters when we continue. */
                qemu_co_queue_wait(&old_alloc->dependent_requests, &s->lock);
                return -EAGAIN;
            }
        }
    }

    /* Make sure that existing clusters and new allocations are only used up to
     * the next dependency if we shortened the request above */
    *cur_bytes = bytes;

    return 0;
}
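
/*
 * Example of the overlap handling above: for a request covering
 * [0x30000, 0x50000) while another allocation with a COW range of
 * [0x40000, 0x60000) is in flight, start < old_start, so bytes is shortened
 * to 0x10000 and the caller continues with the non-overlapping prefix.  If
 * the request instead started inside the old range, bytes would become 0 and
 * the coroutine either returns with *cur_bytes == 0 (if *m already holds
 * gathered allocations) or waits for the dependency and returns -EAGAIN.
 */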

/*
 * Checks how many clusters starting at guest_offset (up to *bytes) are
 * already allocated and don't require a copy on write. If *host_offset is
 * not zero, only physically contiguous clusters beginning at this host
 * offset are counted.
 *
 * Note that guest_offset may not be cluster aligned. In this case, the
 * returned *host_offset points to the exact byte referenced by guest_offset
 * and therefore isn't cluster aligned either.
 *
 * Returns:
 *   0:     if no allocated clusters are available at the given offset.
 *          *bytes is normally unchanged. It is set to 0 if the cluster
 *          is allocated and doesn't need COW, but doesn't have the right
 *          physical offset.
 *
 *   1:     if allocated clusters that don't require a COW are available at
 *          the requested offset. *bytes may have decreased and describes
 *          the length of the area that can be written to.
 *
 *  -errno: in error cases
 */
static int handle_copied(BlockDriverState *bs, uint64_t guest_offset,
    uint64_t *host_offset, uint64_t *bytes, QCowL2Meta **m)
{
    BDRVQcow2State *s = bs->opaque;
    int l2_index;
    uint64_t cluster_offset;
    uint64_t *l2_table;
    uint64_t nb_clusters;
    unsigned int keep_clusters;
    int ret;

    trace_qcow2_handle_copied(qemu_coroutine_self(), guest_offset, *host_offset,
                              *bytes);

    assert(*host_offset == 0 || offset_into_cluster(s, guest_offset)
                                == offset_into_cluster(s, *host_offset));

    /*
     * Calculate the number of clusters to look for. We stop at L2 table
     * boundaries to keep things simple.
     */
    nb_clusters =
        size_to_clusters(s, offset_into_cluster(s, guest_offset) + *bytes);

    l2_index = offset_to_l2_index(s, guest_offset);
    nb_clusters = MIN(nb_clusters, s->l2_size - l2_index);
    assert(nb_clusters <= INT_MAX);

    /* Find L2 entry for the first involved cluster */
    ret = get_cluster_table(bs, guest_offset, &l2_table, &l2_index);
    if (ret < 0) {
        return ret;
    }

    cluster_offset = be64_to_cpu(l2_table[l2_index]);

    /* Check how many clusters are already allocated and don't need COW */
    if (qcow2_get_cluster_type(cluster_offset) == QCOW2_CLUSTER_NORMAL
        && (cluster_offset & QCOW_OFLAG_COPIED))
    {
        /* If a specific host_offset is required, check it */
        bool offset_matches =
            (cluster_offset & L2E_OFFSET_MASK) == *host_offset;

        if (offset_into_cluster(s, cluster_offset & L2E_OFFSET_MASK)) {
            qcow2_signal_corruption(bs, true, -1, -1, "Data cluster offset "
                                    "%#llx unaligned (guest offset: %#" PRIx64
                                    ")", cluster_offset & L2E_OFFSET_MASK,
                                    guest_offset);
            ret = -EIO;
            goto out;
        }

        if (*host_offset != 0 && !offset_matches) {
            *bytes = 0;
            ret = 0;
            goto out;
        }

        /* We keep all QCOW_OFLAG_COPIED clusters */
        keep_clusters =
            count_contiguous_clusters(nb_clusters, s->cluster_size,
                                      &l2_table[l2_index],
                                      QCOW_OFLAG_COPIED | QCOW_OFLAG_ZERO);
        assert(keep_clusters <= nb_clusters);

        *bytes = MIN(*bytes,
                 keep_clusters * s->cluster_size
                 - offset_into_cluster(s, guest_offset));

        ret = 1;
    } else {
        ret = 0;
    }

    /* Cleanup */
out:
    qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table);

    /* Only return a host offset if we actually made progress. Otherwise we
     * would make requirements for handle_alloc() that it can't fulfill */
    if (ret > 0) {
        *host_offset = (cluster_offset & L2E_OFFSET_MASK)
                     + offset_into_cluster(s, guest_offset);
    }

    return ret;
}
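
/*
 * Example (assuming 64 KiB clusters): for guest_offset 0x18000 (so
 * offset_into_cluster is 0x8000) with two contiguous QCOW_OFLAG_COPIED
 * clusters found, *bytes is capped at 2 * 0x10000 - 0x8000 = 0x18000 and
 * *host_offset becomes the first cluster's host offset plus 0x8000.
 */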

/*
 * Allocates new clusters for the given guest_offset.
 *
 * At most *nb_clusters are allocated, and on return *nb_clusters is updated to
 * contain the number of clusters that have been allocated and are contiguous
 * in the image file.
 *
 * If *host_offset is non-zero, it specifies the offset in the image file at
 * which the new clusters must start. *nb_clusters can be 0 on return in this
 * case if the cluster at host_offset is already in use. If *host_offset is
 * zero, the clusters can be allocated anywhere in the image file.
 *
 * *host_offset is updated to contain the offset into the image file at which
 * the first allocated cluster starts.
 *
 * Return 0 on success and -errno in error cases. -EAGAIN means that the
 * function has been waiting for another request and the allocation must be
 * restarted, but the whole request should not be failed.
 */
static int do_alloc_cluster_offset(BlockDriverState *bs, uint64_t guest_offset,
                                   uint64_t *host_offset, uint64_t *nb_clusters)
{
    BDRVQcow2State *s = bs->opaque;

    trace_qcow2_do_alloc_clusters_offset(qemu_coroutine_self(), guest_offset,
                                         *host_offset, *nb_clusters);

    /* Allocate new clusters */
    trace_qcow2_cluster_alloc_phys(qemu_coroutine_self());
    if (*host_offset == 0) {
        int64_t cluster_offset =
            qcow2_alloc_clusters(bs, *nb_clusters * s->cluster_size);
        if (cluster_offset < 0) {
            return cluster_offset;
        }
        *host_offset = cluster_offset;
        return 0;
    } else {
        int64_t ret = qcow2_alloc_clusters_at(bs, *host_offset, *nb_clusters);
        if (ret < 0) {
            return ret;
        }
        *nb_clusters = ret;
        return 0;
    }
}

/*
 * Allocates new clusters for an area that either is yet unallocated or needs a
 * copy on write. If *host_offset is non-zero, clusters are only allocated if
 * the new allocation can match the specified host offset.
 *
 * Note that guest_offset may not be cluster aligned. In this case, the
 * returned *host_offset points to the exact byte referenced by guest_offset
 * and therefore isn't cluster aligned either.
 *
 * Returns:
 *   0:     if no clusters could be allocated. *bytes is set to 0,
 *          *host_offset is left unchanged.
 *
 *   1:     if new clusters were allocated. *bytes may be decreased if the
 *          new allocation doesn't cover all of the requested area.
 *          *host_offset is updated to contain the host offset of the first
 *          newly allocated cluster.
 *
 *  -errno: in error cases
 */
static int handle_alloc(BlockDriverState *bs, uint64_t guest_offset,
    uint64_t *host_offset, uint64_t *bytes, QCowL2Meta **m)
{
    BDRVQcow2State *s = bs->opaque;
    int l2_index;
    uint64_t *l2_table;
    uint64_t entry;
    uint64_t nb_clusters;
    int ret;
    bool keep_old_clusters = false;

    uint64_t alloc_cluster_offset = 0;

    trace_qcow2_handle_alloc(qemu_coroutine_self(), guest_offset, *host_offset,
                             *bytes);
    assert(*bytes > 0);

    /*
     * Calculate the number of clusters to look for. We stop at L2 table
     * boundaries to keep things simple.
     */
    nb_clusters =
        size_to_clusters(s, offset_into_cluster(s, guest_offset) + *bytes);

    l2_index = offset_to_l2_index(s, guest_offset);
    nb_clusters = MIN(nb_clusters, s->l2_size - l2_index);
    assert(nb_clusters <= INT_MAX);

    /* Find L2 entry for the first involved cluster */
    ret = get_cluster_table(bs, guest_offset, &l2_table, &l2_index);
    if (ret < 0) {
        return ret;
    }

    entry = be64_to_cpu(l2_table[l2_index]);

    /* For the moment, overwrite compressed clusters one by one */
    if (entry & QCOW_OFLAG_COMPRESSED) {
        nb_clusters = 1;
    } else {
        nb_clusters = count_cow_clusters(s, nb_clusters, l2_table, l2_index);
    }

    /* This function is only called when there were no non-COW clusters, so if
     * we can't find any unallocated or COW clusters either, something is
     * wrong with our code. */
    assert(nb_clusters > 0);

    if (qcow2_get_cluster_type(entry) == QCOW2_CLUSTER_ZERO_ALLOC &&
        (entry & QCOW_OFLAG_COPIED) &&
        (!*host_offset ||
         start_of_cluster(s, *host_offset) == (entry & L2E_OFFSET_MASK)))
    {
        /* Try to reuse preallocated zero clusters; contiguous normal clusters
         * would be fine, too, but count_cow_clusters() above has limited
         * nb_clusters already to a range of COW clusters */
        int preallocated_nb_clusters =
            count_contiguous_clusters(nb_clusters, s->cluster_size,
                                      &l2_table[l2_index], QCOW_OFLAG_COPIED);
        assert(preallocated_nb_clusters > 0);

        nb_clusters = preallocated_nb_clusters;
        alloc_cluster_offset = entry & L2E_OFFSET_MASK;

        /* We want to reuse these clusters, so qcow2_alloc_cluster_link_l2()
         * should not free them. */
        keep_old_clusters = true;
    }

    qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table);

    if (!alloc_cluster_offset) {
        /* Allocate, if necessary at a given offset in the image file */
        alloc_cluster_offset = start_of_cluster(s, *host_offset);
        ret = do_alloc_cluster_offset(bs, guest_offset, &alloc_cluster_offset,
                                      &nb_clusters);
        if (ret < 0) {
            goto fail;
        }

        /* Can't extend contiguous allocation */
        if (nb_clusters == 0) {
            *bytes = 0;
            return 0;
        }

        /* !*host_offset would overwrite the image header and is reserved for
         * "no host offset preferred". If 0 was a valid host offset, it'd
         * trigger the following overlap check; do that now to avoid having an
         * invalid value in *host_offset. */
        if (!alloc_cluster_offset) {
            ret = qcow2_pre_write_overlap_check(bs, 0, alloc_cluster_offset,
                                                nb_clusters * s->cluster_size);
            assert(ret < 0);
            goto fail;
        }
    }

    /*
     * Save info needed for meta data update.
     *
     * requested_bytes: Number of bytes from the start of the first
     * newly allocated cluster to the end of the (possibly shortened
     * before) write request.
     *
     * avail_bytes: Number of bytes from the start of the first
     * newly allocated to the end of the last newly allocated cluster.
124610f0ed8bSKevin Wolf * 124785567393SKevin Wolf * nb_bytes: The number of bytes from the start of the first 124883baa9a4SKevin Wolf * newly allocated cluster to the end of the area that the write 124910f0ed8bSKevin Wolf * request actually writes to (excluding COW at the end) 125010f0ed8bSKevin Wolf */ 125185567393SKevin Wolf uint64_t requested_bytes = *bytes + offset_into_cluster(s, guest_offset); 125285567393SKevin Wolf int avail_bytes = MIN(INT_MAX, nb_clusters << s->cluster_bits); 125385567393SKevin Wolf int nb_bytes = MIN(requested_bytes, avail_bytes); 125488c6588cSKevin Wolf QCowL2Meta *old_m = *m; 125510f0ed8bSKevin Wolf 125610f0ed8bSKevin Wolf *m = g_malloc0(sizeof(**m)); 125710f0ed8bSKevin Wolf 125810f0ed8bSKevin Wolf **m = (QCowL2Meta) { 125988c6588cSKevin Wolf .next = old_m, 126088c6588cSKevin Wolf 1261411d62b0SKevin Wolf .alloc_offset = alloc_cluster_offset, 126283baa9a4SKevin Wolf .offset = start_of_cluster(s, guest_offset), 126310f0ed8bSKevin Wolf .nb_clusters = nb_clusters, 126410f0ed8bSKevin Wolf 1265564a6b69SMax Reitz .keep_old_clusters = keep_old_clusters, 1266564a6b69SMax Reitz 126710f0ed8bSKevin Wolf .cow_start = { 126810f0ed8bSKevin Wolf .offset = 0, 126985567393SKevin Wolf .nb_bytes = offset_into_cluster(s, guest_offset), 127010f0ed8bSKevin Wolf }, 127110f0ed8bSKevin Wolf .cow_end = { 127285567393SKevin Wolf .offset = nb_bytes, 127385567393SKevin Wolf .nb_bytes = avail_bytes - nb_bytes, 127410f0ed8bSKevin Wolf }, 127510f0ed8bSKevin Wolf }; 127610f0ed8bSKevin Wolf qemu_co_queue_init(&(*m)->dependent_requests); 127710f0ed8bSKevin Wolf QLIST_INSERT_HEAD(&s->cluster_allocs, *m, next_in_flight); 127810f0ed8bSKevin Wolf 1279411d62b0SKevin Wolf *host_offset = alloc_cluster_offset + offset_into_cluster(s, guest_offset); 128085567393SKevin Wolf *bytes = MIN(*bytes, nb_bytes - offset_into_cluster(s, guest_offset)); 1281c37f4cd7SKevin Wolf assert(*bytes != 0); 128210f0ed8bSKevin Wolf 128310f0ed8bSKevin Wolf return 1; 128410f0ed8bSKevin Wolf 128510f0ed8bSKevin Wolf fail: 128610f0ed8bSKevin Wolf if (*m && (*m)->nb_clusters > 0) { 128710f0ed8bSKevin Wolf QLIST_REMOVE(*m, next_in_flight); 128810f0ed8bSKevin Wolf } 128910f0ed8bSKevin Wolf return ret; 129010f0ed8bSKevin Wolf } 129110f0ed8bSKevin Wolf 129210f0ed8bSKevin Wolf /* 129345aba42fSKevin Wolf * alloc_cluster_offset 129445aba42fSKevin Wolf * 1295250196f1SKevin Wolf * For a given offset on the virtual disk, find the cluster offset in qcow2 1296250196f1SKevin Wolf * file. If the offset is not found, allocate a new cluster. 129745aba42fSKevin Wolf * 1298250196f1SKevin Wolf * If the cluster was already allocated, m->nb_clusters is set to 0 and 1299a7912369SFrediano Ziglio * other fields in m are meaningless. 130045aba42fSKevin Wolf * 1301148da7eaSKevin Wolf * If the cluster is newly allocated, m->nb_clusters is set to the number of 130268d100e9SKevin Wolf * contiguous clusters that have been allocated. In this case, the other 130368d100e9SKevin Wolf * fields of m are valid and contain information about the first allocated 130468d100e9SKevin Wolf * cluster. 1305148da7eaSKevin Wolf * 130668d100e9SKevin Wolf * If the request conflicts with another write request in flight, the coroutine 130768d100e9SKevin Wolf * is queued and will be reentered when the dependency has completed. 
1308148da7eaSKevin Wolf * 1309148da7eaSKevin Wolf * Return 0 on success and -errno in error cases 131045aba42fSKevin Wolf */ 1311f4f0d391SKevin Wolf int qcow2_alloc_cluster_offset(BlockDriverState *bs, uint64_t offset, 1312d46a0bb2SKevin Wolf unsigned int *bytes, uint64_t *host_offset, 1313d46a0bb2SKevin Wolf QCowL2Meta **m) 131445aba42fSKevin Wolf { 1315ff99129aSKevin Wolf BDRVQcow2State *s = bs->opaque; 1316710c2496SKevin Wolf uint64_t start, remaining; 1317250196f1SKevin Wolf uint64_t cluster_offset; 131865eb2e35SKevin Wolf uint64_t cur_bytes; 1319710c2496SKevin Wolf int ret; 132045aba42fSKevin Wolf 1321d46a0bb2SKevin Wolf trace_qcow2_alloc_clusters_offset(qemu_coroutine_self(), offset, *bytes); 1322710c2496SKevin Wolf 132372424114SKevin Wolf again: 132416f0587eSHu Tao start = offset; 1325d46a0bb2SKevin Wolf remaining = *bytes; 13260af729ecSKevin Wolf cluster_offset = 0; 13270af729ecSKevin Wolf *host_offset = 0; 1328ecdd5333SKevin Wolf cur_bytes = 0; 1329ecdd5333SKevin Wolf *m = NULL; 13300af729ecSKevin Wolf 13312c3b32d2SKevin Wolf while (true) { 1332ecdd5333SKevin Wolf 1333ecdd5333SKevin Wolf if (!*host_offset) { 1334ecdd5333SKevin Wolf *host_offset = start_of_cluster(s, cluster_offset); 1335ecdd5333SKevin Wolf } 1336ecdd5333SKevin Wolf 1337ecdd5333SKevin Wolf assert(remaining >= cur_bytes); 1338ecdd5333SKevin Wolf 1339ecdd5333SKevin Wolf start += cur_bytes; 1340ecdd5333SKevin Wolf remaining -= cur_bytes; 1341ecdd5333SKevin Wolf cluster_offset += cur_bytes; 1342ecdd5333SKevin Wolf 1343ecdd5333SKevin Wolf if (remaining == 0) { 1344ecdd5333SKevin Wolf break; 1345ecdd5333SKevin Wolf } 1346ecdd5333SKevin Wolf 1347ecdd5333SKevin Wolf cur_bytes = remaining; 1348ecdd5333SKevin Wolf 1349250196f1SKevin Wolf /* 135017a71e58SKevin Wolf * Now start gathering as many contiguous clusters as possible: 135117a71e58SKevin Wolf * 135217a71e58SKevin Wolf * 1. Check for overlaps with in-flight allocations 135317a71e58SKevin Wolf * 13542c3b32d2SKevin Wolf * a) Overlap not in the first cluster -> shorten this request and 13552c3b32d2SKevin Wolf * let the caller handle the rest in its next loop iteration. 135617a71e58SKevin Wolf * 13572c3b32d2SKevin Wolf * b) Real overlaps of two requests. Yield and restart the search 13582c3b32d2SKevin Wolf * for contiguous clusters (the situation could have changed 13592c3b32d2SKevin Wolf * while we were sleeping) 136017a71e58SKevin Wolf * 136117a71e58SKevin Wolf * c) TODO: Request starts in the same cluster as the in-flight 13622c3b32d2SKevin Wolf * allocation ends. Shorten the COW of the in-flight allocation, 13632c3b32d2SKevin Wolf * set cluster_offset to write to the same cluster and set up 13642c3b32d2SKevin Wolf * the right synchronisation between the in-flight request and 13652c3b32d2SKevin Wolf * the new one. 136617a71e58SKevin Wolf */ 1367ecdd5333SKevin Wolf ret = handle_dependencies(bs, start, &cur_bytes, m); 136817a71e58SKevin Wolf if (ret == -EAGAIN) { 1369ecdd5333SKevin Wolf /* Currently handle_dependencies() doesn't yield if we already had 1370ecdd5333SKevin Wolf * an allocation. If it did, we would have to clean up the L2Meta 1371ecdd5333SKevin Wolf * structs before starting over.
*/ 1372ecdd5333SKevin Wolf assert(*m == NULL); 137317a71e58SKevin Wolf goto again; 137417a71e58SKevin Wolf } else if (ret < 0) { 137517a71e58SKevin Wolf return ret; 1376ecdd5333SKevin Wolf } else if (cur_bytes == 0) { 1377ecdd5333SKevin Wolf break; 137817a71e58SKevin Wolf } else { 137917a71e58SKevin Wolf /* handle_dependencies() may have decreased cur_bytes (shortened 138017a71e58SKevin Wolf * the allocations below) so that the next dependency is processed 138117a71e58SKevin Wolf * correctly during the next loop iteration. */ 138217a71e58SKevin Wolf } 138317a71e58SKevin Wolf 138472424114SKevin Wolf /* 13850af729ecSKevin Wolf * 2. Count contiguous COPIED clusters. 138672424114SKevin Wolf */ 1387710c2496SKevin Wolf ret = handle_copied(bs, start, &cluster_offset, &cur_bytes, m); 138872424114SKevin Wolf if (ret < 0) { 138972424114SKevin Wolf return ret; 13900af729ecSKevin Wolf } else if (ret) { 1391ecdd5333SKevin Wolf continue; 1392e62daaf6SKevin Wolf } else if (cur_bytes == 0) { 13932c3b32d2SKevin Wolf break; 139472424114SKevin Wolf } 139572424114SKevin Wolf 13960af729ecSKevin Wolf /* 13970af729ecSKevin Wolf * 3. If the request still hasn't completed, allocate new clusters, 13980af729ecSKevin Wolf * considering any cluster_offset of steps 1c or 2. 13990af729ecSKevin Wolf */ 1400710c2496SKevin Wolf ret = handle_alloc(bs, start, &cluster_offset, &cur_bytes, m); 1401037689d8SKevin Wolf if (ret < 0) { 1402037689d8SKevin Wolf return ret; 1403710c2496SKevin Wolf } else if (ret) { 1404ecdd5333SKevin Wolf continue; 14052c3b32d2SKevin Wolf } else { 14062c3b32d2SKevin Wolf assert(cur_bytes == 0); 14072c3b32d2SKevin Wolf break; 14082c3b32d2SKevin Wolf } 1409710c2496SKevin Wolf } 1410250196f1SKevin Wolf 1411d46a0bb2SKevin Wolf *bytes -= remaining; 1412d46a0bb2SKevin Wolf assert(*bytes > 0); 1413710c2496SKevin Wolf assert(*host_offset != 0); 141445aba42fSKevin Wolf 1415148da7eaSKevin Wolf return 0; 141645aba42fSKevin Wolf } 141745aba42fSKevin Wolf 141845aba42fSKevin Wolf static int decompress_buffer(uint8_t *out_buf, int out_buf_size, 141945aba42fSKevin Wolf const uint8_t *buf, int buf_size) 142045aba42fSKevin Wolf { 142145aba42fSKevin Wolf z_stream strm1, *strm = &strm1; 142245aba42fSKevin Wolf int ret, out_len; 142345aba42fSKevin Wolf 142445aba42fSKevin Wolf memset(strm, 0, sizeof(*strm)); 142545aba42fSKevin Wolf 142645aba42fSKevin Wolf strm->next_in = (uint8_t *)buf; 142745aba42fSKevin Wolf strm->avail_in = buf_size; 142845aba42fSKevin Wolf strm->next_out = out_buf; 142945aba42fSKevin Wolf strm->avail_out = out_buf_size; 143045aba42fSKevin Wolf 143145aba42fSKevin Wolf ret = inflateInit2(strm, -12); 143245aba42fSKevin Wolf if (ret != Z_OK) 143345aba42fSKevin Wolf return -1; 143445aba42fSKevin Wolf ret = inflate(strm, Z_FINISH); 143545aba42fSKevin Wolf out_len = strm->next_out - out_buf; 143645aba42fSKevin Wolf if ((ret != Z_STREAM_END && ret != Z_BUF_ERROR) || 143745aba42fSKevin Wolf out_len != out_buf_size) { 143845aba42fSKevin Wolf inflateEnd(strm); 143945aba42fSKevin Wolf return -1; 144045aba42fSKevin Wolf } 144145aba42fSKevin Wolf inflateEnd(strm); 144245aba42fSKevin Wolf return 0; 144345aba42fSKevin Wolf } 144445aba42fSKevin Wolf 144566f82ceeSKevin Wolf int qcow2_decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset) 144645aba42fSKevin Wolf { 1447ff99129aSKevin Wolf BDRVQcow2State *s = bs->opaque; 144845aba42fSKevin Wolf int ret, csize, nb_csectors, sector_offset; 144945aba42fSKevin Wolf uint64_t coffset; 145045aba42fSKevin Wolf 145145aba42fSKevin Wolf coffset = cluster_offset & 
s->cluster_offset_mask; 145245aba42fSKevin Wolf if (s->cluster_cache_offset != coffset) { 145345aba42fSKevin Wolf nb_csectors = ((cluster_offset >> s->csize_shift) & s->csize_mask) + 1; 145445aba42fSKevin Wolf sector_offset = coffset & 511; 145545aba42fSKevin Wolf csize = nb_csectors * 512 - sector_offset; 145666f82ceeSKevin Wolf BLKDBG_EVENT(bs->file, BLKDBG_READ_COMPRESSED); 1457fbcbbf4eSKevin Wolf ret = bdrv_read(bs->file, coffset >> 9, s->cluster_data, 14589a4f4c31SKevin Wolf nb_csectors); 145945aba42fSKevin Wolf if (ret < 0) { 14608af36488SKevin Wolf return ret; 146145aba42fSKevin Wolf } 146245aba42fSKevin Wolf if (decompress_buffer(s->cluster_cache, s->cluster_size, 146345aba42fSKevin Wolf s->cluster_data + sector_offset, csize) < 0) { 14648af36488SKevin Wolf return -EIO; 146545aba42fSKevin Wolf } 146645aba42fSKevin Wolf s->cluster_cache_offset = coffset; 146745aba42fSKevin Wolf } 146845aba42fSKevin Wolf return 0; 146945aba42fSKevin Wolf } 14705ea929e3SKevin Wolf 14715ea929e3SKevin Wolf /* 14725ea929e3SKevin Wolf * This discards as many clusters of nb_clusters as possible at once (i.e. 14735ea929e3SKevin Wolf * all clusters in the same L2 table) and returns the number of discarded 14745ea929e3SKevin Wolf * clusters. 14755ea929e3SKevin Wolf */ 14765ea929e3SKevin Wolf static int discard_single_l2(BlockDriverState *bs, uint64_t offset, 1477b6d36defSMax Reitz uint64_t nb_clusters, enum qcow2_discard_type type, 1478b6d36defSMax Reitz bool full_discard) 14795ea929e3SKevin Wolf { 1480ff99129aSKevin Wolf BDRVQcow2State *s = bs->opaque; 14813948d1d4SKevin Wolf uint64_t *l2_table; 14825ea929e3SKevin Wolf int l2_index; 14835ea929e3SKevin Wolf int ret; 14845ea929e3SKevin Wolf int i; 14855ea929e3SKevin Wolf 14863948d1d4SKevin Wolf ret = get_cluster_table(bs, offset, &l2_table, &l2_index); 14875ea929e3SKevin Wolf if (ret < 0) { 14885ea929e3SKevin Wolf return ret; 14895ea929e3SKevin Wolf } 14905ea929e3SKevin Wolf 14915ea929e3SKevin Wolf /* Limit nb_clusters to one L2 table */ 14925ea929e3SKevin Wolf nb_clusters = MIN(nb_clusters, s->l2_size - l2_index); 1493b6d36defSMax Reitz assert(nb_clusters <= INT_MAX); 14945ea929e3SKevin Wolf 14955ea929e3SKevin Wolf for (i = 0; i < nb_clusters; i++) { 1496c883db0dSMax Reitz uint64_t old_l2_entry; 14975ea929e3SKevin Wolf 1498c883db0dSMax Reitz old_l2_entry = be64_to_cpu(l2_table[l2_index + i]); 1499a71835a0SKevin Wolf 1500a71835a0SKevin Wolf /* 1501808c4b6fSMax Reitz * If full_discard is false, make sure that a discarded area reads back 1502808c4b6fSMax Reitz * as zeroes for v3 images (we cannot do it for v2 without actually 1503808c4b6fSMax Reitz * writing a zero-filled buffer). We can skip the operation if the 1504808c4b6fSMax Reitz * cluster is already marked as zero, or if it's unallocated and we 1505808c4b6fSMax Reitz * don't have a backing file. 1506a71835a0SKevin Wolf * 1507a71835a0SKevin Wolf * TODO We might want to use bdrv_get_block_status(bs) here, but we're 1508a71835a0SKevin Wolf * holding s->lock, so that doesn't work today. 1509808c4b6fSMax Reitz * 1510808c4b6fSMax Reitz * If full_discard is true, the sector should not read back as zeroes, 1511808c4b6fSMax Reitz * but rather fall through to the backing file. 
1512a71835a0SKevin Wolf */ 1513c883db0dSMax Reitz switch (qcow2_get_cluster_type(old_l2_entry)) { 1514c883db0dSMax Reitz case QCOW2_CLUSTER_UNALLOCATED: 1515760e0063SKevin Wolf if (full_discard || !bs->backing) { 1516a71835a0SKevin Wolf continue; 1517a71835a0SKevin Wolf } 1518c883db0dSMax Reitz break; 1519a71835a0SKevin Wolf 1520fdfab37dSEric Blake case QCOW2_CLUSTER_ZERO_PLAIN: 1521fdfab37dSEric Blake if (!full_discard) { 15225ea929e3SKevin Wolf continue; 1523808c4b6fSMax Reitz } 1524808c4b6fSMax Reitz break; 1525c883db0dSMax Reitz 1526fdfab37dSEric Blake case QCOW2_CLUSTER_ZERO_ALLOC: 1527c883db0dSMax Reitz case QCOW2_CLUSTER_NORMAL: 1528c883db0dSMax Reitz case QCOW2_CLUSTER_COMPRESSED: 1529c883db0dSMax Reitz break; 1530c883db0dSMax Reitz 1531c883db0dSMax Reitz default: 1532c883db0dSMax Reitz abort(); 15335ea929e3SKevin Wolf } 15345ea929e3SKevin Wolf 15355ea929e3SKevin Wolf /* First remove L2 entries */ 153672e80b89SAlberto Garcia qcow2_cache_entry_mark_dirty(bs, s->l2_table_cache, l2_table); 1537808c4b6fSMax Reitz if (!full_discard && s->qcow_version >= 3) { 1538a71835a0SKevin Wolf l2_table[l2_index + i] = cpu_to_be64(QCOW_OFLAG_ZERO); 1539a71835a0SKevin Wolf } else { 15405ea929e3SKevin Wolf l2_table[l2_index + i] = cpu_to_be64(0); 1541a71835a0SKevin Wolf } 15425ea929e3SKevin Wolf 15435ea929e3SKevin Wolf /* Then decrease the refcount */ 1544c883db0dSMax Reitz qcow2_free_any_clusters(bs, old_l2_entry, 1, type); 15455ea929e3SKevin Wolf } 15465ea929e3SKevin Wolf 1547a3f1afb4SAlberto Garcia qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table); 15485ea929e3SKevin Wolf 15495ea929e3SKevin Wolf return nb_clusters; 15505ea929e3SKevin Wolf } 15515ea929e3SKevin Wolf 1552*d2cb36afSEric Blake int qcow2_cluster_discard(BlockDriverState *bs, uint64_t offset, 1553*d2cb36afSEric Blake uint64_t bytes, enum qcow2_discard_type type, 1554*d2cb36afSEric Blake bool full_discard) 15555ea929e3SKevin Wolf { 1556ff99129aSKevin Wolf BDRVQcow2State *s = bs->opaque; 1557*d2cb36afSEric Blake uint64_t end_offset = offset + bytes; 1558b6d36defSMax Reitz uint64_t nb_clusters; 1559*d2cb36afSEric Blake int64_t cleared; 15605ea929e3SKevin Wolf int ret; 15615ea929e3SKevin Wolf 1562f10ee139SEric Blake /* Caller must pass aligned values, except at image end */ 15630c1bd469SEric Blake assert(QEMU_IS_ALIGNED(offset, s->cluster_size)); 1564f10ee139SEric Blake assert(QEMU_IS_ALIGNED(end_offset, s->cluster_size) || 1565f10ee139SEric Blake end_offset == bs->total_sectors << BDRV_SECTOR_BITS); 15665ea929e3SKevin Wolf 1567*d2cb36afSEric Blake nb_clusters = size_to_clusters(s, bytes); 15685ea929e3SKevin Wolf 15690b919faeSKevin Wolf s->cache_discards = true; 15700b919faeSKevin Wolf 15715ea929e3SKevin Wolf /* Each L2 table is handled by its own loop iteration */ 15725ea929e3SKevin Wolf while (nb_clusters > 0) { 1573*d2cb36afSEric Blake cleared = discard_single_l2(bs, offset, nb_clusters, type, 1574*d2cb36afSEric Blake full_discard); 1575*d2cb36afSEric Blake if (cleared < 0) { 1576*d2cb36afSEric Blake ret = cleared; 15770b919faeSKevin Wolf goto fail; 15785ea929e3SKevin Wolf } 15795ea929e3SKevin Wolf 1580*d2cb36afSEric Blake nb_clusters -= cleared; 1581*d2cb36afSEric Blake offset += (cleared * s->cluster_size); 15825ea929e3SKevin Wolf } 15835ea929e3SKevin Wolf 15840b919faeSKevin Wolf ret = 0; 15850b919faeSKevin Wolf fail: 15860b919faeSKevin Wolf s->cache_discards = false; 15870b919faeSKevin Wolf qcow2_process_discards(bs, ret); 15880b919faeSKevin Wolf 15890b919faeSKevin Wolf return ret; 15905ea929e3SKevin Wolf } 1591621f0589SKevin 
Wolf 1592621f0589SKevin Wolf /* 1593621f0589SKevin Wolf * This zeroes as many clusters of nb_clusters as possible at once (i.e. 1594621f0589SKevin Wolf * all clusters in the same L2 table) and returns the number of zeroed 1595621f0589SKevin Wolf * clusters. 1596621f0589SKevin Wolf */ 1597621f0589SKevin Wolf static int zero_single_l2(BlockDriverState *bs, uint64_t offset, 1598170f4b2eSFam Zheng uint64_t nb_clusters, int flags) 1599621f0589SKevin Wolf { 1600ff99129aSKevin Wolf BDRVQcow2State *s = bs->opaque; 1601621f0589SKevin Wolf uint64_t *l2_table; 1602621f0589SKevin Wolf int l2_index; 1603621f0589SKevin Wolf int ret; 1604621f0589SKevin Wolf int i; 160506cc5e2bSEric Blake bool unmap = !!(flags & BDRV_REQ_MAY_UNMAP); 1606621f0589SKevin Wolf 1607621f0589SKevin Wolf ret = get_cluster_table(bs, offset, &l2_table, &l2_index); 1608621f0589SKevin Wolf if (ret < 0) { 1609621f0589SKevin Wolf return ret; 1610621f0589SKevin Wolf } 1611621f0589SKevin Wolf 1612621f0589SKevin Wolf /* Limit nb_clusters to one L2 table */ 1613621f0589SKevin Wolf nb_clusters = MIN(nb_clusters, s->l2_size - l2_index); 1614b6d36defSMax Reitz assert(nb_clusters <= INT_MAX); 1615621f0589SKevin Wolf 1616621f0589SKevin Wolf for (i = 0; i < nb_clusters; i++) { 1617621f0589SKevin Wolf uint64_t old_offset; 161806cc5e2bSEric Blake QCow2ClusterType cluster_type; 1619621f0589SKevin Wolf 1620621f0589SKevin Wolf old_offset = be64_to_cpu(l2_table[l2_index + i]); 1621621f0589SKevin Wolf 162206cc5e2bSEric Blake /* 162306cc5e2bSEric Blake * Minimize L2 changes if the cluster already reads back as 162406cc5e2bSEric Blake * zeroes with correct allocation. 162506cc5e2bSEric Blake */ 162606cc5e2bSEric Blake cluster_type = qcow2_get_cluster_type(old_offset); 162706cc5e2bSEric Blake if (cluster_type == QCOW2_CLUSTER_ZERO_PLAIN || 162806cc5e2bSEric Blake (cluster_type == QCOW2_CLUSTER_ZERO_ALLOC && !unmap)) { 162906cc5e2bSEric Blake continue; 163006cc5e2bSEric Blake } 163106cc5e2bSEric Blake 163272e80b89SAlberto Garcia qcow2_cache_entry_mark_dirty(bs, s->l2_table_cache, l2_table); 163306cc5e2bSEric Blake if (cluster_type == QCOW2_CLUSTER_COMPRESSED || unmap) { 1634621f0589SKevin Wolf l2_table[l2_index + i] = cpu_to_be64(QCOW_OFLAG_ZERO); 16356cfcb9b8SKevin Wolf qcow2_free_any_clusters(bs, old_offset, 1, QCOW2_DISCARD_REQUEST); 1636621f0589SKevin Wolf } else { 1637621f0589SKevin Wolf l2_table[l2_index + i] |= cpu_to_be64(QCOW_OFLAG_ZERO); 1638621f0589SKevin Wolf } 1639621f0589SKevin Wolf } 1640621f0589SKevin Wolf 1641a3f1afb4SAlberto Garcia qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table); 1642621f0589SKevin Wolf 1643621f0589SKevin Wolf return nb_clusters; 1644621f0589SKevin Wolf } 1645621f0589SKevin Wolf 1646*d2cb36afSEric Blake int qcow2_cluster_zeroize(BlockDriverState *bs, uint64_t offset, 1647*d2cb36afSEric Blake uint64_t bytes, int flags) 1648621f0589SKevin Wolf { 1649ff99129aSKevin Wolf BDRVQcow2State *s = bs->opaque; 1650*d2cb36afSEric Blake uint64_t end_offset = offset + bytes; 1651b6d36defSMax Reitz uint64_t nb_clusters; 1652*d2cb36afSEric Blake int64_t cleared; 1653621f0589SKevin Wolf int ret; 1654621f0589SKevin Wolf 1655f10ee139SEric Blake /* Caller must pass aligned values, except at image end */ 1656f10ee139SEric Blake assert(QEMU_IS_ALIGNED(offset, s->cluster_size)); 1657f10ee139SEric Blake assert(QEMU_IS_ALIGNED(end_offset, s->cluster_size) || 1658f10ee139SEric Blake end_offset == bs->total_sectors << BDRV_SECTOR_BITS); 1659f10ee139SEric Blake 1660621f0589SKevin Wolf /* The zero flag is only supported by version 3 and 
newer */ 1661621f0589SKevin Wolf if (s->qcow_version < 3) { 1662621f0589SKevin Wolf return -ENOTSUP; 1663621f0589SKevin Wolf } 1664621f0589SKevin Wolf 1665621f0589SKevin Wolf /* Each L2 table is handled by its own loop iteration */ 1666*d2cb36afSEric Blake nb_clusters = size_to_clusters(s, bytes); 1667621f0589SKevin Wolf 16680b919faeSKevin Wolf s->cache_discards = true; 16690b919faeSKevin Wolf 1670621f0589SKevin Wolf while (nb_clusters > 0) { 1671*d2cb36afSEric Blake cleared = zero_single_l2(bs, offset, nb_clusters, flags); 1672*d2cb36afSEric Blake if (cleared < 0) { 1673*d2cb36afSEric Blake ret = cleared; 16740b919faeSKevin Wolf goto fail; 1675621f0589SKevin Wolf } 1676621f0589SKevin Wolf 1677*d2cb36afSEric Blake nb_clusters -= cleared; 1678*d2cb36afSEric Blake offset += (cleared * s->cluster_size); 1679621f0589SKevin Wolf } 1680621f0589SKevin Wolf 16810b919faeSKevin Wolf ret = 0; 16820b919faeSKevin Wolf fail: 16830b919faeSKevin Wolf s->cache_discards = false; 16840b919faeSKevin Wolf qcow2_process_discards(bs, ret); 16850b919faeSKevin Wolf 16860b919faeSKevin Wolf return ret; 1687621f0589SKevin Wolf } 168832b6444dSMax Reitz 168932b6444dSMax Reitz /* 169032b6444dSMax Reitz * Expands all zero clusters in a specific L1 table (or deallocates them, for 169132b6444dSMax Reitz * non-backed non-pre-allocated zero clusters). 169232b6444dSMax Reitz * 16934057a2b2SMax Reitz * l1_entries and *visited_l1_entries are used to keep track of progress for 16944057a2b2SMax Reitz * status_cb(). l1_entries contains the total number of L1 entries and 16954057a2b2SMax Reitz * *visited_l1_entries counts all visited L1 entries. 169632b6444dSMax Reitz */ 169732b6444dSMax Reitz static int expand_zero_clusters_in_l1(BlockDriverState *bs, uint64_t *l1_table, 1698ecf58777SMax Reitz int l1_size, int64_t *visited_l1_entries, 16994057a2b2SMax Reitz int64_t l1_entries, 17008b13976dSMax Reitz BlockDriverAmendStatusCB *status_cb, 17018b13976dSMax Reitz void *cb_opaque) 170232b6444dSMax Reitz { 1703ff99129aSKevin Wolf BDRVQcow2State *s = bs->opaque; 170432b6444dSMax Reitz bool is_active_l1 = (l1_table == s->l1_table); 170532b6444dSMax Reitz uint64_t *l2_table = NULL; 170632b6444dSMax Reitz int ret; 170732b6444dSMax Reitz int i, j; 170832b6444dSMax Reitz 170932b6444dSMax Reitz if (!is_active_l1) { 171032b6444dSMax Reitz /* inactive L2 tables require a buffer to be stored in when loading 171132b6444dSMax Reitz * them from disk */ 17129a4f4c31SKevin Wolf l2_table = qemu_try_blockalign(bs->file->bs, s->cluster_size); 1713de82815dSKevin Wolf if (l2_table == NULL) { 1714de82815dSKevin Wolf return -ENOMEM; 1715de82815dSKevin Wolf } 171632b6444dSMax Reitz } 171732b6444dSMax Reitz 171832b6444dSMax Reitz for (i = 0; i < l1_size; i++) { 171932b6444dSMax Reitz uint64_t l2_offset = l1_table[i] & L1E_OFFSET_MASK; 172032b6444dSMax Reitz bool l2_dirty = false; 17210e06528eSMax Reitz uint64_t l2_refcount; 172232b6444dSMax Reitz 172332b6444dSMax Reitz if (!l2_offset) { 172432b6444dSMax Reitz /* unallocated */ 17254057a2b2SMax Reitz (*visited_l1_entries)++; 17264057a2b2SMax Reitz if (status_cb) { 17278b13976dSMax Reitz status_cb(bs, *visited_l1_entries, l1_entries, cb_opaque); 17284057a2b2SMax Reitz } 172932b6444dSMax Reitz continue; 173032b6444dSMax Reitz } 173132b6444dSMax Reitz 17328dd93d93SMax Reitz if (offset_into_cluster(s, l2_offset)) { 17338dd93d93SMax Reitz qcow2_signal_corruption(bs, true, -1, -1, "L2 table offset %#" 17348dd93d93SMax Reitz PRIx64 " unaligned (L1 index: %#x)", 17358dd93d93SMax Reitz l2_offset, i); 17368dd93d93SMax 
Reitz ret = -EIO; 17378dd93d93SMax Reitz goto fail; 17388dd93d93SMax Reitz } 17398dd93d93SMax Reitz 174032b6444dSMax Reitz if (is_active_l1) { 174132b6444dSMax Reitz /* get active L2 tables from cache */ 174232b6444dSMax Reitz ret = qcow2_cache_get(bs, s->l2_table_cache, l2_offset, 174332b6444dSMax Reitz (void **)&l2_table); 174432b6444dSMax Reitz } else { 174532b6444dSMax Reitz /* load inactive L2 tables from disk */ 1746fbcbbf4eSKevin Wolf ret = bdrv_read(bs->file, l2_offset / BDRV_SECTOR_SIZE, 174732b6444dSMax Reitz (void *)l2_table, s->cluster_sectors); 174832b6444dSMax Reitz } 174932b6444dSMax Reitz if (ret < 0) { 175032b6444dSMax Reitz goto fail; 175132b6444dSMax Reitz } 175232b6444dSMax Reitz 17537324c10fSMax Reitz ret = qcow2_get_refcount(bs, l2_offset >> s->cluster_bits, 17547324c10fSMax Reitz &l2_refcount); 17557324c10fSMax Reitz if (ret < 0) { 1756ecf58777SMax Reitz goto fail; 1757ecf58777SMax Reitz } 1758ecf58777SMax Reitz 175932b6444dSMax Reitz for (j = 0; j < s->l2_size; j++) { 176032b6444dSMax Reitz uint64_t l2_entry = be64_to_cpu(l2_table[j]); 1761ecf58777SMax Reitz int64_t offset = l2_entry & L2E_OFFSET_MASK; 17623ef95218SEric Blake QCow2ClusterType cluster_type = qcow2_get_cluster_type(l2_entry); 176332b6444dSMax Reitz 1764fdfab37dSEric Blake if (cluster_type != QCOW2_CLUSTER_ZERO_PLAIN && 1765fdfab37dSEric Blake cluster_type != QCOW2_CLUSTER_ZERO_ALLOC) { 176632b6444dSMax Reitz continue; 176732b6444dSMax Reitz } 176832b6444dSMax Reitz 1769fdfab37dSEric Blake if (cluster_type == QCOW2_CLUSTER_ZERO_PLAIN) { 1770760e0063SKevin Wolf if (!bs->backing) { 177132b6444dSMax Reitz /* not backed; therefore we can simply deallocate the 177232b6444dSMax Reitz * cluster */ 177332b6444dSMax Reitz l2_table[j] = 0; 177432b6444dSMax Reitz l2_dirty = true; 177532b6444dSMax Reitz continue; 177632b6444dSMax Reitz } 177732b6444dSMax Reitz 177832b6444dSMax Reitz offset = qcow2_alloc_clusters(bs, s->cluster_size); 177932b6444dSMax Reitz if (offset < 0) { 178032b6444dSMax Reitz ret = offset; 178132b6444dSMax Reitz goto fail; 178232b6444dSMax Reitz } 1783ecf58777SMax Reitz 1784ecf58777SMax Reitz if (l2_refcount > 1) { 1785ecf58777SMax Reitz /* For shared L2 tables, set the refcount accordingly (it is 1786ecf58777SMax Reitz * already 1 and needs to be l2_refcount) */ 1787ecf58777SMax Reitz ret = qcow2_update_cluster_refcount(bs, 17882aabe7c7SMax Reitz offset >> s->cluster_bits, 17892aabe7c7SMax Reitz refcount_diff(1, l2_refcount), false, 1790ecf58777SMax Reitz QCOW2_DISCARD_OTHER); 1791ecf58777SMax Reitz if (ret < 0) { 1792ecf58777SMax Reitz qcow2_free_clusters(bs, offset, s->cluster_size, 1793ecf58777SMax Reitz QCOW2_DISCARD_OTHER); 1794ecf58777SMax Reitz goto fail; 1795ecf58777SMax Reitz } 1796ecf58777SMax Reitz } 179732b6444dSMax Reitz } 179832b6444dSMax Reitz 17998dd93d93SMax Reitz if (offset_into_cluster(s, offset)) { 18008dd93d93SMax Reitz qcow2_signal_corruption(bs, true, -1, -1, "Data cluster offset " 18018dd93d93SMax Reitz "%#" PRIx64 " unaligned (L2 offset: %#" 18028dd93d93SMax Reitz PRIx64 ", L2 index: %#x)", offset, 18038dd93d93SMax Reitz l2_offset, j); 1804fdfab37dSEric Blake if (cluster_type == QCOW2_CLUSTER_ZERO_PLAIN) { 18058dd93d93SMax Reitz qcow2_free_clusters(bs, offset, s->cluster_size, 18068dd93d93SMax Reitz QCOW2_DISCARD_ALWAYS); 18078dd93d93SMax Reitz } 18088dd93d93SMax Reitz ret = -EIO; 18098dd93d93SMax Reitz goto fail; 18108dd93d93SMax Reitz } 18118dd93d93SMax Reitz 1812231bb267SMax Reitz ret = qcow2_pre_write_overlap_check(bs, 0, offset, s->cluster_size); 181332b6444dSMax 
Reitz if (ret < 0) { 1814fdfab37dSEric Blake if (cluster_type == QCOW2_CLUSTER_ZERO_PLAIN) { 181532b6444dSMax Reitz qcow2_free_clusters(bs, offset, s->cluster_size, 181632b6444dSMax Reitz QCOW2_DISCARD_ALWAYS); 1817320c7066SMax Reitz } 181832b6444dSMax Reitz goto fail; 181932b6444dSMax Reitz } 182032b6444dSMax Reitz 1821720ff280SKevin Wolf ret = bdrv_pwrite_zeroes(bs->file, offset, s->cluster_size, 0); 182232b6444dSMax Reitz if (ret < 0) { 1823fdfab37dSEric Blake if (cluster_type == QCOW2_CLUSTER_ZERO_PLAIN) { 182432b6444dSMax Reitz qcow2_free_clusters(bs, offset, s->cluster_size, 182532b6444dSMax Reitz QCOW2_DISCARD_ALWAYS); 1826320c7066SMax Reitz } 182732b6444dSMax Reitz goto fail; 182832b6444dSMax Reitz } 182932b6444dSMax Reitz 1830ecf58777SMax Reitz if (l2_refcount == 1) { 183132b6444dSMax Reitz l2_table[j] = cpu_to_be64(offset | QCOW_OFLAG_COPIED); 1832ecf58777SMax Reitz } else { 1833ecf58777SMax Reitz l2_table[j] = cpu_to_be64(offset); 1834e390cf5aSMax Reitz } 1835ecf58777SMax Reitz l2_dirty = true; 183632b6444dSMax Reitz } 183732b6444dSMax Reitz 183832b6444dSMax Reitz if (is_active_l1) { 183932b6444dSMax Reitz if (l2_dirty) { 184072e80b89SAlberto Garcia qcow2_cache_entry_mark_dirty(bs, s->l2_table_cache, l2_table); 184132b6444dSMax Reitz qcow2_cache_depends_on_flush(s->l2_table_cache); 184232b6444dSMax Reitz } 1843a3f1afb4SAlberto Garcia qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table); 184432b6444dSMax Reitz } else { 184532b6444dSMax Reitz if (l2_dirty) { 1846231bb267SMax Reitz ret = qcow2_pre_write_overlap_check(bs, 1847231bb267SMax Reitz QCOW2_OL_INACTIVE_L2 | QCOW2_OL_ACTIVE_L2, l2_offset, 184832b6444dSMax Reitz s->cluster_size); 184932b6444dSMax Reitz if (ret < 0) { 185032b6444dSMax Reitz goto fail; 185132b6444dSMax Reitz } 185232b6444dSMax Reitz 185318d51c4bSKevin Wolf ret = bdrv_write(bs->file, l2_offset / BDRV_SECTOR_SIZE, 185432b6444dSMax Reitz (void *)l2_table, s->cluster_sectors); 185532b6444dSMax Reitz if (ret < 0) { 185632b6444dSMax Reitz goto fail; 185732b6444dSMax Reitz } 185832b6444dSMax Reitz } 185932b6444dSMax Reitz } 18604057a2b2SMax Reitz 18614057a2b2SMax Reitz (*visited_l1_entries)++; 18624057a2b2SMax Reitz if (status_cb) { 18638b13976dSMax Reitz status_cb(bs, *visited_l1_entries, l1_entries, cb_opaque); 18644057a2b2SMax Reitz } 186532b6444dSMax Reitz } 186632b6444dSMax Reitz 186732b6444dSMax Reitz ret = 0; 186832b6444dSMax Reitz 186932b6444dSMax Reitz fail: 187032b6444dSMax Reitz if (l2_table) { 187132b6444dSMax Reitz if (!is_active_l1) { 187232b6444dSMax Reitz qemu_vfree(l2_table); 187332b6444dSMax Reitz } else { 187432b6444dSMax Reitz qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table); 187532b6444dSMax Reitz } 187632b6444dSMax Reitz } 187732b6444dSMax Reitz return ret; 187832b6444dSMax Reitz } 187932b6444dSMax Reitz 188032b6444dSMax Reitz /* 188132b6444dSMax Reitz * For backed images, expands all zero clusters on the image. For non-backed 188232b6444dSMax Reitz * images, deallocates all non-pre-allocated zero clusters (and claims the 188332b6444dSMax Reitz * allocation for pre-allocated ones). This is important for downgrading to a 188432b6444dSMax Reitz * qcow2 version which doesn't yet support metadata zero clusters. 
188532b6444dSMax Reitz */ 18864057a2b2SMax Reitz int qcow2_expand_zero_clusters(BlockDriverState *bs, 18878b13976dSMax Reitz BlockDriverAmendStatusCB *status_cb, 18888b13976dSMax Reitz void *cb_opaque) 188932b6444dSMax Reitz { 1890ff99129aSKevin Wolf BDRVQcow2State *s = bs->opaque; 189132b6444dSMax Reitz uint64_t *l1_table = NULL; 18924057a2b2SMax Reitz int64_t l1_entries = 0, visited_l1_entries = 0; 189332b6444dSMax Reitz int ret; 189432b6444dSMax Reitz int i, j; 189532b6444dSMax Reitz 18964057a2b2SMax Reitz if (status_cb) { 18974057a2b2SMax Reitz l1_entries = s->l1_size; 18984057a2b2SMax Reitz for (i = 0; i < s->nb_snapshots; i++) { 18994057a2b2SMax Reitz l1_entries += s->snapshots[i].l1_size; 19004057a2b2SMax Reitz } 19014057a2b2SMax Reitz } 19024057a2b2SMax Reitz 190332b6444dSMax Reitz ret = expand_zero_clusters_in_l1(bs, s->l1_table, s->l1_size, 19044057a2b2SMax Reitz &visited_l1_entries, l1_entries, 19058b13976dSMax Reitz status_cb, cb_opaque); 190632b6444dSMax Reitz if (ret < 0) { 190732b6444dSMax Reitz goto fail; 190832b6444dSMax Reitz } 190932b6444dSMax Reitz 191032b6444dSMax Reitz /* Inactive L1 tables may point to active L2 tables - therefore it is 191132b6444dSMax Reitz * necessary to flush the L2 table cache before trying to access the L2 191232b6444dSMax Reitz * tables pointed to by inactive L1 entries (else we might try to expand 191332b6444dSMax Reitz * zero clusters that have already been expanded); furthermore, it is also 191432b6444dSMax Reitz * necessary to empty the L2 table cache, since it may contain tables which 191532b6444dSMax Reitz * are now going to be modified directly on disk, bypassing the cache. 191632b6444dSMax Reitz * qcow2_cache_empty() does both for us. */ 191732b6444dSMax Reitz ret = qcow2_cache_empty(bs, s->l2_table_cache); 191832b6444dSMax Reitz if (ret < 0) { 191932b6444dSMax Reitz goto fail; 192032b6444dSMax Reitz } 192132b6444dSMax Reitz 192232b6444dSMax Reitz for (i = 0; i < s->nb_snapshots; i++) { 1923d737b78cSLaurent Vivier int l1_sectors = DIV_ROUND_UP(s->snapshots[i].l1_size * 1924d737b78cSLaurent Vivier sizeof(uint64_t), BDRV_SECTOR_SIZE); 192532b6444dSMax Reitz 192632b6444dSMax Reitz l1_table = g_realloc(l1_table, l1_sectors * BDRV_SECTOR_SIZE); 192732b6444dSMax Reitz 1928fbcbbf4eSKevin Wolf ret = bdrv_read(bs->file, 19299a4f4c31SKevin Wolf s->snapshots[i].l1_table_offset / BDRV_SECTOR_SIZE, 19309a4f4c31SKevin Wolf (void *)l1_table, l1_sectors); 193132b6444dSMax Reitz if (ret < 0) { 193232b6444dSMax Reitz goto fail; 193332b6444dSMax Reitz } 193432b6444dSMax Reitz 193532b6444dSMax Reitz for (j = 0; j < s->snapshots[i].l1_size; j++) { 193632b6444dSMax Reitz be64_to_cpus(&l1_table[j]); 193732b6444dSMax Reitz } 193832b6444dSMax Reitz 193932b6444dSMax Reitz ret = expand_zero_clusters_in_l1(bs, l1_table, s->snapshots[i].l1_size, 19404057a2b2SMax Reitz &visited_l1_entries, l1_entries, 19418b13976dSMax Reitz status_cb, cb_opaque); 194232b6444dSMax Reitz if (ret < 0) { 194332b6444dSMax Reitz goto fail; 194432b6444dSMax Reitz } 194532b6444dSMax Reitz } 194632b6444dSMax Reitz 194732b6444dSMax Reitz ret = 0; 194832b6444dSMax Reitz 194932b6444dSMax Reitz fail: 195032b6444dSMax Reitz g_free(l1_table); 195132b6444dSMax Reitz return ret; 195232b6444dSMax Reitz } 1953
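/*
 * A minimal standalone sketch (not part of the driver) of the compressed
 * cluster descriptor arithmetic that qcow2_decompress_cluster() above applies
 * to an L2 entry.  The derivation of csize_shift, csize_mask and
 * cluster_offset_mask from cluster_bits, and the QCOW_OFLAG_COMPRESSED bit,
 * are assumptions based on how the driver sets these up elsewhere; only the
 * decode steps mirror this file, and the sample entry below is hypothetical.
 */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    unsigned cluster_bits = 16;                     /* assumed: 64 KiB clusters */
    int csize_shift = 62 - (cluster_bits - 8);      /* assumed driver derivation */
    uint64_t csize_mask = (1ULL << (cluster_bits - 8)) - 1;
    uint64_t cluster_offset_mask = (1ULL << csize_shift) - 1;

    /* Hypothetical compressed L2 entry: data starts at host byte 0x50340 and
     * the "additional sectors" field is 3, i.e. the compressed data is
     * covered by four 512-byte sectors in total. */
    uint64_t l2_entry = (1ULL << 62)                /* assumed QCOW_OFLAG_COMPRESSED */
                        | (3ULL << csize_shift)
                        | 0x50340ULL;

    /* Decode the descriptor the same way qcow2_decompress_cluster() does */
    uint64_t coffset = l2_entry & cluster_offset_mask;
    int nb_csectors = ((l2_entry >> csize_shift) & csize_mask) + 1;
    int sector_offset = coffset & 511;
    int csize = nb_csectors * 512 - sector_offset;

    printf("read %d sectors starting at sector %" PRIu64
           ", compressed data begins at byte %d of that buffer, %d bytes long\n",
           nb_csectors, coffset >> 9, sector_offset, csize);
    return 0;
}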