1 /* 2 * L2/refcount table cache for the QCOW2 format 3 * 4 * Copyright (c) 2010 Kevin Wolf <kwolf@redhat.com> 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a copy 7 * of this software and associated documentation files (the "Software"), to deal 8 * in the Software without restriction, including without limitation the rights 9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 * copies of the Software, and to permit persons to whom the Software is 11 * furnished to do so, subject to the following conditions: 12 * 13 * The above copyright notice and this permission notice shall be included in 14 * all copies or substantial portions of the Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 22 * THE SOFTWARE. 23 */ 24 25 #include "qemu/osdep.h" 26 #include "block/block_int.h" 27 #include "qemu-common.h" 28 #include "qcow2.h" 29 #include "trace.h" 30 31 typedef struct Qcow2CachedTable { 32 int64_t offset; 33 uint64_t lru_counter; 34 int ref; 35 bool dirty; 36 } Qcow2CachedTable; 37 38 struct Qcow2Cache { 39 Qcow2CachedTable *entries; 40 struct Qcow2Cache *depends; 41 int size; 42 bool depends_on_flush; 43 void *table_array; 44 uint64_t lru_counter; 45 uint64_t cache_clean_lru_counter; 46 }; 47 48 static inline void *qcow2_cache_get_table_addr(BlockDriverState *bs, 49 Qcow2Cache *c, int table) 50 { 51 BDRVQcow2State *s = bs->opaque; 52 return (uint8_t *) c->table_array + (size_t) table * s->cluster_size; 53 } 54 55 static inline int qcow2_cache_get_table_idx(BlockDriverState *bs, 56 Qcow2Cache *c, void *table) 57 { 58 BDRVQcow2State *s = bs->opaque; 59 ptrdiff_t table_offset = (uint8_t *) table - (uint8_t *) c->table_array; 60 int idx = table_offset / s->cluster_size; 61 assert(idx >= 0 && idx < c->size && table_offset % s->cluster_size == 0); 62 return idx; 63 } 64 65 static void qcow2_cache_table_release(BlockDriverState *bs, Qcow2Cache *c, 66 int i, int num_tables) 67 { 68 /* Using MADV_DONTNEED to discard memory is a Linux-specific feature */ 69 #ifdef CONFIG_LINUX 70 BDRVQcow2State *s = bs->opaque; 71 void *t = qcow2_cache_get_table_addr(bs, c, i); 72 int align = getpagesize(); 73 size_t mem_size = (size_t) s->cluster_size * num_tables; 74 size_t offset = QEMU_ALIGN_UP((uintptr_t) t, align) - (uintptr_t) t; 75 size_t length = QEMU_ALIGN_DOWN(mem_size - offset, align); 76 if (length > 0) { 77 madvise((uint8_t *) t + offset, length, MADV_DONTNEED); 78 } 79 #endif 80 } 81 82 static inline bool can_clean_entry(Qcow2Cache *c, int i) 83 { 84 Qcow2CachedTable *t = &c->entries[i]; 85 return t->ref == 0 && !t->dirty && t->offset != 0 && 86 t->lru_counter <= c->cache_clean_lru_counter; 87 } 88 89 void qcow2_cache_clean_unused(BlockDriverState *bs, Qcow2Cache *c) 90 { 91 int i = 0; 92 while (i < c->size) { 93 int to_clean = 0; 94 95 /* Skip the entries that we don't need to clean */ 96 while (i < c->size && !can_clean_entry(c, i)) { 97 i++; 98 } 99 100 /* And count how many we can clean in a row */ 101 while (i < c->size && can_clean_entry(c, i)) { 102 c->entries[i].offset = 0; 103 c->entries[i].lru_counter = 0; 104 i++; 105 to_clean++; 106 } 107 108 if (to_clean > 0) { 109 qcow2_cache_table_release(bs, c, i - to_clean, to_clean); 110 } 111 } 112 113 c->cache_clean_lru_counter = c->lru_counter; 114 } 115 116 Qcow2Cache *qcow2_cache_create(BlockDriverState *bs, int num_tables) 117 { 118 BDRVQcow2State *s = bs->opaque; 119 Qcow2Cache *c; 120 121 c = g_new0(Qcow2Cache, 1); 122 c->size = num_tables; 123 c->entries = g_try_new0(Qcow2CachedTable, num_tables); 124 c->table_array = qemu_try_blockalign(bs->file->bs, 125 (size_t) num_tables * s->cluster_size); 126 127 if (!c->entries || !c->table_array) { 128 qemu_vfree(c->table_array); 129 g_free(c->entries); 130 g_free(c); 131 c = NULL; 132 } 133 134 return c; 135 } 136 137 int qcow2_cache_destroy(BlockDriverState *bs, Qcow2Cache *c) 138 { 139 int i; 140 141 for (i = 0; i < c->size; i++) { 142 assert(c->entries[i].ref == 0); 143 } 144 145 qemu_vfree(c->table_array); 146 g_free(c->entries); 147 g_free(c); 148 149 return 0; 150 } 151 152 static int qcow2_cache_flush_dependency(BlockDriverState *bs, Qcow2Cache *c) 153 { 154 int ret; 155 156 ret = qcow2_cache_flush(bs, c->depends); 157 if (ret < 0) { 158 return ret; 159 } 160 161 c->depends = NULL; 162 c->depends_on_flush = false; 163 164 return 0; 165 } 166 167 static int qcow2_cache_entry_flush(BlockDriverState *bs, Qcow2Cache *c, int i) 168 { 169 BDRVQcow2State *s = bs->opaque; 170 int ret = 0; 171 172 if (!c->entries[i].dirty || !c->entries[i].offset) { 173 return 0; 174 } 175 176 trace_qcow2_cache_entry_flush(qemu_coroutine_self(), 177 c == s->l2_table_cache, i); 178 179 if (c->depends) { 180 ret = qcow2_cache_flush_dependency(bs, c); 181 } else if (c->depends_on_flush) { 182 ret = bdrv_flush(bs->file->bs); 183 if (ret >= 0) { 184 c->depends_on_flush = false; 185 } 186 } 187 188 if (ret < 0) { 189 return ret; 190 } 191 192 if (c == s->refcount_block_cache) { 193 ret = qcow2_pre_write_overlap_check(bs, QCOW2_OL_REFCOUNT_BLOCK, 194 c->entries[i].offset, s->cluster_size); 195 } else if (c == s->l2_table_cache) { 196 ret = qcow2_pre_write_overlap_check(bs, QCOW2_OL_ACTIVE_L2, 197 c->entries[i].offset, s->cluster_size); 198 } else { 199 ret = qcow2_pre_write_overlap_check(bs, 0, 200 c->entries[i].offset, s->cluster_size); 201 } 202 203 if (ret < 0) { 204 return ret; 205 } 206 207 if (c == s->refcount_block_cache) { 208 BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_UPDATE_PART); 209 } else if (c == s->l2_table_cache) { 210 BLKDBG_EVENT(bs->file, BLKDBG_L2_UPDATE); 211 } 212 213 ret = bdrv_pwrite(bs->file, c->entries[i].offset, 214 qcow2_cache_get_table_addr(bs, c, i), s->cluster_size); 215 if (ret < 0) { 216 return ret; 217 } 218 219 c->entries[i].dirty = false; 220 221 return 0; 222 } 223 224 int qcow2_cache_write(BlockDriverState *bs, Qcow2Cache *c) 225 { 226 BDRVQcow2State *s = bs->opaque; 227 int result = 0; 228 int ret; 229 int i; 230 231 trace_qcow2_cache_flush(qemu_coroutine_self(), c == s->l2_table_cache); 232 233 for (i = 0; i < c->size; i++) { 234 ret = qcow2_cache_entry_flush(bs, c, i); 235 if (ret < 0 && result != -ENOSPC) { 236 result = ret; 237 } 238 } 239 240 return result; 241 } 242 243 int qcow2_cache_flush(BlockDriverState *bs, Qcow2Cache *c) 244 { 245 int result = qcow2_cache_write(bs, c); 246 247 if (result == 0) { 248 int ret = bdrv_flush(bs->file->bs); 249 if (ret < 0) { 250 result = ret; 251 } 252 } 253 254 return result; 255 } 256 257 int qcow2_cache_set_dependency(BlockDriverState *bs, Qcow2Cache *c, 258 Qcow2Cache *dependency) 259 { 260 int ret; 261 262 if (dependency->depends) { 263 ret = qcow2_cache_flush_dependency(bs, dependency); 264 if (ret < 0) { 265 return ret; 266 } 267 } 268 269 if (c->depends && (c->depends != dependency)) { 270 ret = qcow2_cache_flush_dependency(bs, c); 271 if (ret < 0) { 272 return ret; 273 } 274 } 275 276 c->depends = dependency; 277 return 0; 278 } 279 280 void qcow2_cache_depends_on_flush(Qcow2Cache *c) 281 { 282 c->depends_on_flush = true; 283 } 284 285 int qcow2_cache_empty(BlockDriverState *bs, Qcow2Cache *c) 286 { 287 int ret, i; 288 289 ret = qcow2_cache_flush(bs, c); 290 if (ret < 0) { 291 return ret; 292 } 293 294 for (i = 0; i < c->size; i++) { 295 assert(c->entries[i].ref == 0); 296 c->entries[i].offset = 0; 297 c->entries[i].lru_counter = 0; 298 } 299 300 qcow2_cache_table_release(bs, c, 0, c->size); 301 302 c->lru_counter = 0; 303 304 return 0; 305 } 306 307 static int qcow2_cache_do_get(BlockDriverState *bs, Qcow2Cache *c, 308 uint64_t offset, void **table, bool read_from_disk) 309 { 310 BDRVQcow2State *s = bs->opaque; 311 int i; 312 int ret; 313 int lookup_index; 314 uint64_t min_lru_counter = UINT64_MAX; 315 int min_lru_index = -1; 316 317 trace_qcow2_cache_get(qemu_coroutine_self(), c == s->l2_table_cache, 318 offset, read_from_disk); 319 320 /* Check if the table is already cached */ 321 i = lookup_index = (offset / s->cluster_size * 4) % c->size; 322 do { 323 const Qcow2CachedTable *t = &c->entries[i]; 324 if (t->offset == offset) { 325 goto found; 326 } 327 if (t->ref == 0 && t->lru_counter < min_lru_counter) { 328 min_lru_counter = t->lru_counter; 329 min_lru_index = i; 330 } 331 if (++i == c->size) { 332 i = 0; 333 } 334 } while (i != lookup_index); 335 336 if (min_lru_index == -1) { 337 /* This can't happen in current synchronous code, but leave the check 338 * here as a reminder for whoever starts using AIO with the cache */ 339 abort(); 340 } 341 342 /* Cache miss: write a table back and replace it */ 343 i = min_lru_index; 344 trace_qcow2_cache_get_replace_entry(qemu_coroutine_self(), 345 c == s->l2_table_cache, i); 346 347 ret = qcow2_cache_entry_flush(bs, c, i); 348 if (ret < 0) { 349 return ret; 350 } 351 352 trace_qcow2_cache_get_read(qemu_coroutine_self(), 353 c == s->l2_table_cache, i); 354 c->entries[i].offset = 0; 355 if (read_from_disk) { 356 if (c == s->l2_table_cache) { 357 BLKDBG_EVENT(bs->file, BLKDBG_L2_LOAD); 358 } 359 360 ret = bdrv_pread(bs->file, offset, 361 qcow2_cache_get_table_addr(bs, c, i), 362 s->cluster_size); 363 if (ret < 0) { 364 return ret; 365 } 366 } 367 368 c->entries[i].offset = offset; 369 370 /* And return the right table */ 371 found: 372 c->entries[i].ref++; 373 *table = qcow2_cache_get_table_addr(bs, c, i); 374 375 trace_qcow2_cache_get_done(qemu_coroutine_self(), 376 c == s->l2_table_cache, i); 377 378 return 0; 379 } 380 381 int qcow2_cache_get(BlockDriverState *bs, Qcow2Cache *c, uint64_t offset, 382 void **table) 383 { 384 return qcow2_cache_do_get(bs, c, offset, table, true); 385 } 386 387 int qcow2_cache_get_empty(BlockDriverState *bs, Qcow2Cache *c, uint64_t offset, 388 void **table) 389 { 390 return qcow2_cache_do_get(bs, c, offset, table, false); 391 } 392 393 void qcow2_cache_put(BlockDriverState *bs, Qcow2Cache *c, void **table) 394 { 395 int i = qcow2_cache_get_table_idx(bs, c, *table); 396 397 c->entries[i].ref--; 398 *table = NULL; 399 400 if (c->entries[i].ref == 0) { 401 c->entries[i].lru_counter = ++c->lru_counter; 402 } 403 404 assert(c->entries[i].ref >= 0); 405 } 406 407 void qcow2_cache_entry_mark_dirty(BlockDriverState *bs, Qcow2Cache *c, 408 void *table) 409 { 410 int i = qcow2_cache_get_table_idx(bs, c, table); 411 assert(c->entries[i].offset != 0); 412 c->entries[i].dirty = true; 413 } 414 415 void *qcow2_cache_is_table_offset(BlockDriverState *bs, Qcow2Cache *c, 416 uint64_t offset) 417 { 418 int i; 419 420 for (i = 0; i < c->size; i++) { 421 if (c->entries[i].offset == offset) { 422 return qcow2_cache_get_table_addr(bs, c, i); 423 } 424 } 425 return NULL; 426 } 427 428 void qcow2_cache_discard(BlockDriverState *bs, Qcow2Cache *c, void *table) 429 { 430 int i = qcow2_cache_get_table_idx(bs, c, table); 431 432 assert(c->entries[i].ref == 0); 433 434 c->entries[i].offset = 0; 435 c->entries[i].lru_counter = 0; 436 c->entries[i].dirty = false; 437 438 qcow2_cache_table_release(bs, c, i, 1); 439 } 440