1 /* 2 * L2/refcount table cache for the QCOW2 format 3 * 4 * Copyright (c) 2010 Kevin Wolf <kwolf@redhat.com> 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a copy 7 * of this software and associated documentation files (the "Software"), to deal 8 * in the Software without restriction, including without limitation the rights 9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 * copies of the Software, and to permit persons to whom the Software is 11 * furnished to do so, subject to the following conditions: 12 * 13 * The above copyright notice and this permission notice shall be included in 14 * all copies or substantial portions of the Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 22 * THE SOFTWARE. 23 */ 24 25 #include "qemu/osdep.h" 26 #include "qemu/memalign.h" 27 #include "qcow2.h" 28 #include "trace.h" 29 30 typedef struct Qcow2CachedTable { 31 int64_t offset; 32 uint64_t lru_counter; 33 int ref; 34 bool dirty; 35 } Qcow2CachedTable; 36 37 struct Qcow2Cache { 38 Qcow2CachedTable *entries; 39 struct Qcow2Cache *depends; 40 int size; 41 int table_size; 42 bool depends_on_flush; 43 void *table_array; 44 uint64_t lru_counter; 45 uint64_t cache_clean_lru_counter; 46 }; 47 48 static inline void *qcow2_cache_get_table_addr(Qcow2Cache *c, int table) 49 { 50 return (uint8_t *) c->table_array + (size_t) table * c->table_size; 51 } 52 53 static inline int qcow2_cache_get_table_idx(Qcow2Cache *c, void *table) 54 { 55 ptrdiff_t table_offset = (uint8_t *) table - (uint8_t *) c->table_array; 56 int idx = table_offset / c->table_size; 57 assert(idx >= 0 && idx < c->size && table_offset % c->table_size == 0); 58 return idx; 59 } 60 61 static inline const char *qcow2_cache_get_name(BDRVQcow2State *s, Qcow2Cache *c) 62 { 63 if (c == s->refcount_block_cache) { 64 return "refcount block"; 65 } else if (c == s->l2_table_cache) { 66 return "L2 table"; 67 } else { 68 /* Do not abort, because this is not critical */ 69 return "unknown"; 70 } 71 } 72 73 static void qcow2_cache_table_release(Qcow2Cache *c, int i, int num_tables) 74 { 75 /* Using MADV_DONTNEED to discard memory is a Linux-specific feature */ 76 #ifdef CONFIG_LINUX 77 void *t = qcow2_cache_get_table_addr(c, i); 78 int align = qemu_real_host_page_size(); 79 size_t mem_size = (size_t) c->table_size * num_tables; 80 size_t offset = QEMU_ALIGN_UP((uintptr_t) t, align) - (uintptr_t) t; 81 size_t length = QEMU_ALIGN_DOWN(mem_size - offset, align); 82 if (mem_size > offset && length > 0) { 83 madvise((uint8_t *) t + offset, length, MADV_DONTNEED); 84 } 85 #endif 86 } 87 88 static inline bool can_clean_entry(Qcow2Cache *c, int i) 89 { 90 Qcow2CachedTable *t = &c->entries[i]; 91 return t->ref == 0 && !t->dirty && t->offset != 0 && 92 t->lru_counter <= c->cache_clean_lru_counter; 93 } 94 95 void qcow2_cache_clean_unused(Qcow2Cache *c) 96 { 97 int i = 0; 98 while (i < c->size) { 99 int to_clean = 0; 100 101 /* Skip the entries that we don't need to clean */ 102 while (i < c->size && !can_clean_entry(c, i)) { 103 i++; 104 } 105 106 /* And count how many we can clean in a row */ 107 while (i < c->size && can_clean_entry(c, i)) { 108 c->entries[i].offset = 0; 109 c->entries[i].lru_counter = 0; 110 i++; 111 to_clean++; 112 } 113 114 if (to_clean > 0) { 115 qcow2_cache_table_release(c, i - to_clean, to_clean); 116 } 117 } 118 119 c->cache_clean_lru_counter = c->lru_counter; 120 } 121 122 Qcow2Cache *qcow2_cache_create(BlockDriverState *bs, int num_tables, 123 unsigned table_size) 124 { 125 BDRVQcow2State *s = bs->opaque; 126 Qcow2Cache *c; 127 128 assert(num_tables > 0); 129 assert(is_power_of_2(table_size)); 130 assert(table_size >= (1 << MIN_CLUSTER_BITS)); 131 assert(table_size <= s->cluster_size); 132 133 c = g_new0(Qcow2Cache, 1); 134 c->size = num_tables; 135 c->table_size = table_size; 136 c->entries = g_try_new0(Qcow2CachedTable, num_tables); 137 c->table_array = qemu_try_blockalign(bs->file->bs, 138 (size_t) num_tables * c->table_size); 139 140 if (!c->entries || !c->table_array) { 141 qemu_vfree(c->table_array); 142 g_free(c->entries); 143 g_free(c); 144 c = NULL; 145 } 146 147 return c; 148 } 149 150 int qcow2_cache_destroy(Qcow2Cache *c) 151 { 152 int i; 153 154 for (i = 0; i < c->size; i++) { 155 assert(c->entries[i].ref == 0); 156 } 157 158 qemu_vfree(c->table_array); 159 g_free(c->entries); 160 g_free(c); 161 162 return 0; 163 } 164 165 static int qcow2_cache_flush_dependency(BlockDriverState *bs, Qcow2Cache *c) 166 { 167 int ret; 168 169 ret = qcow2_cache_flush(bs, c->depends); 170 if (ret < 0) { 171 return ret; 172 } 173 174 c->depends = NULL; 175 c->depends_on_flush = false; 176 177 return 0; 178 } 179 180 static int qcow2_cache_entry_flush(BlockDriverState *bs, Qcow2Cache *c, int i) 181 { 182 BDRVQcow2State *s = bs->opaque; 183 int ret = 0; 184 185 if (!c->entries[i].dirty || !c->entries[i].offset) { 186 return 0; 187 } 188 189 trace_qcow2_cache_entry_flush(qemu_coroutine_self(), 190 c == s->l2_table_cache, i); 191 192 if (c->depends) { 193 ret = qcow2_cache_flush_dependency(bs, c); 194 } else if (c->depends_on_flush) { 195 ret = bdrv_flush(bs->file->bs); 196 if (ret >= 0) { 197 c->depends_on_flush = false; 198 } 199 } 200 201 if (ret < 0) { 202 return ret; 203 } 204 205 if (c == s->refcount_block_cache) { 206 ret = qcow2_pre_write_overlap_check(bs, QCOW2_OL_REFCOUNT_BLOCK, 207 c->entries[i].offset, c->table_size, false); 208 } else if (c == s->l2_table_cache) { 209 ret = qcow2_pre_write_overlap_check(bs, QCOW2_OL_ACTIVE_L2, 210 c->entries[i].offset, c->table_size, false); 211 } else { 212 ret = qcow2_pre_write_overlap_check(bs, 0, 213 c->entries[i].offset, c->table_size, false); 214 } 215 216 if (ret < 0) { 217 return ret; 218 } 219 220 if (c == s->refcount_block_cache) { 221 BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_UPDATE_PART); 222 } else if (c == s->l2_table_cache) { 223 BLKDBG_EVENT(bs->file, BLKDBG_L2_UPDATE); 224 } 225 226 ret = bdrv_pwrite(bs->file, c->entries[i].offset, c->table_size, 227 qcow2_cache_get_table_addr(c, i), 0); 228 if (ret < 0) { 229 return ret; 230 } 231 232 c->entries[i].dirty = false; 233 234 return 0; 235 } 236 237 int qcow2_cache_write(BlockDriverState *bs, Qcow2Cache *c) 238 { 239 BDRVQcow2State *s = bs->opaque; 240 int result = 0; 241 int ret; 242 int i; 243 244 trace_qcow2_cache_flush(qemu_coroutine_self(), c == s->l2_table_cache); 245 246 for (i = 0; i < c->size; i++) { 247 ret = qcow2_cache_entry_flush(bs, c, i); 248 if (ret < 0 && result != -ENOSPC) { 249 result = ret; 250 } 251 } 252 253 return result; 254 } 255 256 int qcow2_cache_flush(BlockDriverState *bs, Qcow2Cache *c) 257 { 258 int result = qcow2_cache_write(bs, c); 259 260 if (result == 0) { 261 int ret = bdrv_flush(bs->file->bs); 262 if (ret < 0) { 263 result = ret; 264 } 265 } 266 267 return result; 268 } 269 270 int qcow2_cache_set_dependency(BlockDriverState *bs, Qcow2Cache *c, 271 Qcow2Cache *dependency) 272 { 273 int ret; 274 275 if (dependency->depends) { 276 ret = qcow2_cache_flush_dependency(bs, dependency); 277 if (ret < 0) { 278 return ret; 279 } 280 } 281 282 if (c->depends && (c->depends != dependency)) { 283 ret = qcow2_cache_flush_dependency(bs, c); 284 if (ret < 0) { 285 return ret; 286 } 287 } 288 289 c->depends = dependency; 290 return 0; 291 } 292 293 void qcow2_cache_depends_on_flush(Qcow2Cache *c) 294 { 295 c->depends_on_flush = true; 296 } 297 298 int qcow2_cache_empty(BlockDriverState *bs, Qcow2Cache *c) 299 { 300 int ret, i; 301 302 ret = qcow2_cache_flush(bs, c); 303 if (ret < 0) { 304 return ret; 305 } 306 307 for (i = 0; i < c->size; i++) { 308 assert(c->entries[i].ref == 0); 309 c->entries[i].offset = 0; 310 c->entries[i].lru_counter = 0; 311 } 312 313 qcow2_cache_table_release(c, 0, c->size); 314 315 c->lru_counter = 0; 316 317 return 0; 318 } 319 320 static int qcow2_cache_do_get(BlockDriverState *bs, Qcow2Cache *c, 321 uint64_t offset, void **table, bool read_from_disk) 322 { 323 BDRVQcow2State *s = bs->opaque; 324 int i; 325 int ret; 326 int lookup_index; 327 uint64_t min_lru_counter = UINT64_MAX; 328 int min_lru_index = -1; 329 330 assert(offset != 0); 331 332 trace_qcow2_cache_get(qemu_coroutine_self(), c == s->l2_table_cache, 333 offset, read_from_disk); 334 335 if (!QEMU_IS_ALIGNED(offset, c->table_size)) { 336 qcow2_signal_corruption(bs, true, -1, -1, "Cannot get entry from %s " 337 "cache: Offset %#" PRIx64 " is unaligned", 338 qcow2_cache_get_name(s, c), offset); 339 return -EIO; 340 } 341 342 /* Check if the table is already cached */ 343 i = lookup_index = (offset / c->table_size * 4) % c->size; 344 do { 345 const Qcow2CachedTable *t = &c->entries[i]; 346 if (t->offset == offset) { 347 goto found; 348 } 349 if (t->ref == 0 && t->lru_counter < min_lru_counter) { 350 min_lru_counter = t->lru_counter; 351 min_lru_index = i; 352 } 353 if (++i == c->size) { 354 i = 0; 355 } 356 } while (i != lookup_index); 357 358 if (min_lru_index == -1) { 359 /* This can't happen in current synchronous code, but leave the check 360 * here as a reminder for whoever starts using AIO with the cache */ 361 abort(); 362 } 363 364 /* Cache miss: write a table back and replace it */ 365 i = min_lru_index; 366 trace_qcow2_cache_get_replace_entry(qemu_coroutine_self(), 367 c == s->l2_table_cache, i); 368 369 ret = qcow2_cache_entry_flush(bs, c, i); 370 if (ret < 0) { 371 return ret; 372 } 373 374 trace_qcow2_cache_get_read(qemu_coroutine_self(), 375 c == s->l2_table_cache, i); 376 c->entries[i].offset = 0; 377 if (read_from_disk) { 378 if (c == s->l2_table_cache) { 379 BLKDBG_EVENT(bs->file, BLKDBG_L2_LOAD); 380 } 381 382 ret = bdrv_pread(bs->file, offset, c->table_size, 383 qcow2_cache_get_table_addr(c, i), 0); 384 if (ret < 0) { 385 return ret; 386 } 387 } 388 389 c->entries[i].offset = offset; 390 391 /* And return the right table */ 392 found: 393 c->entries[i].ref++; 394 *table = qcow2_cache_get_table_addr(c, i); 395 396 trace_qcow2_cache_get_done(qemu_coroutine_self(), 397 c == s->l2_table_cache, i); 398 399 return 0; 400 } 401 402 int qcow2_cache_get(BlockDriverState *bs, Qcow2Cache *c, uint64_t offset, 403 void **table) 404 { 405 return qcow2_cache_do_get(bs, c, offset, table, true); 406 } 407 408 int qcow2_cache_get_empty(BlockDriverState *bs, Qcow2Cache *c, uint64_t offset, 409 void **table) 410 { 411 return qcow2_cache_do_get(bs, c, offset, table, false); 412 } 413 414 void qcow2_cache_put(Qcow2Cache *c, void **table) 415 { 416 int i = qcow2_cache_get_table_idx(c, *table); 417 418 c->entries[i].ref--; 419 *table = NULL; 420 421 if (c->entries[i].ref == 0) { 422 c->entries[i].lru_counter = ++c->lru_counter; 423 } 424 425 assert(c->entries[i].ref >= 0); 426 } 427 428 void qcow2_cache_entry_mark_dirty(Qcow2Cache *c, void *table) 429 { 430 int i = qcow2_cache_get_table_idx(c, table); 431 assert(c->entries[i].offset != 0); 432 c->entries[i].dirty = true; 433 } 434 435 void *qcow2_cache_is_table_offset(Qcow2Cache *c, uint64_t offset) 436 { 437 int i; 438 439 for (i = 0; i < c->size; i++) { 440 if (c->entries[i].offset == offset) { 441 return qcow2_cache_get_table_addr(c, i); 442 } 443 } 444 return NULL; 445 } 446 447 void qcow2_cache_discard(Qcow2Cache *c, void *table) 448 { 449 int i = qcow2_cache_get_table_idx(c, table); 450 451 assert(c->entries[i].ref == 0); 452 453 c->entries[i].offset = 0; 454 c->entries[i].lru_counter = 0; 455 c->entries[i].dirty = false; 456 457 qcow2_cache_table_release(c, i, 1); 458 } 459