/*
 * linux/fs/mbcache.c
 * (C) 2001-2002 Andreas Gruenbacher, <a.gruenbacher@computer.org>
 */

/*
 * Filesystem Meta Information Block Cache (mbcache)
 *
 * The mbcache caches blocks of block devices that need to be located
 * by their device/block number, as well as by other criteria (such
 * as the block's contents).
 *
 * There can only be one cache entry in a cache per device and block number.
 * Additional indexes need not be unique in this sense. The number of
 * additional indexes (=other criteria) can be hardwired at compile time
 * or specified at cache create time.
 *
 * Each cache entry is of fixed size. An entry may be `valid' or `invalid'
 * in the cache. A valid entry is in the main hash tables of the cache,
 * and may also be in the lru list. An invalid entry is not in any hashes
 * or lists.
 *
 * A valid cache entry is only in the lru list if no handles refer to it.
 * Invalid cache entries will be freed when the last handle to the cache
 * entry is released. Entries that cannot be freed immediately are put
 * back on the lru list.
 */

#include <linux/kernel.h>
#include <linux/module.h>

#include <linux/hash.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/init.h>
#include <linux/mbcache.h>


#ifdef MB_CACHE_DEBUG
# define mb_debug(f...) do { \
		printk(KERN_DEBUG f); \
		printk("\n"); \
	} while (0)
# define mb_assert(c) do { if (!(c)) \
		printk(KERN_ERR "assertion " #c " failed\n"); \
	} while (0)
#else
# define mb_debug(f...) do { } while (0)
# define mb_assert(c) do { } while (0)
#endif
#define mb_error(f...) do { \
		printk(KERN_ERR f); \
		printk("\n"); \
	} while (0)

#define MB_CACHE_WRITER ((unsigned short)~0U >> 1)

static DECLARE_WAIT_QUEUE_HEAD(mb_cache_queue);

MODULE_AUTHOR("Andreas Gruenbacher <a.gruenbacher@computer.org>");
MODULE_DESCRIPTION("Meta block cache (for extended attributes)");
MODULE_LICENSE("GPL");

EXPORT_SYMBOL(mb_cache_create);
EXPORT_SYMBOL(mb_cache_shrink);
EXPORT_SYMBOL(mb_cache_destroy);
EXPORT_SYMBOL(mb_cache_entry_alloc);
EXPORT_SYMBOL(mb_cache_entry_insert);
EXPORT_SYMBOL(mb_cache_entry_release);
EXPORT_SYMBOL(mb_cache_entry_free);
EXPORT_SYMBOL(mb_cache_entry_get);
#if !defined(MB_CACHE_INDEXES_COUNT) || (MB_CACHE_INDEXES_COUNT > 0)
EXPORT_SYMBOL(mb_cache_entry_find_first);
EXPORT_SYMBOL(mb_cache_entry_find_next);
#endif

/*
 * Global data: list of all mbcache's, lru list, and a spinlock for
 * accessing cache data structures on SMP machines. The lru list is
 * global across all mbcaches.
 */

static LIST_HEAD(mb_cache_list);
static LIST_HEAD(mb_cache_lru_list);
static DEFINE_SPINLOCK(mb_cache_spinlock);
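/*
 * Handle accounting convention (derived from the code below, stated here
 * for clarity): e_used counts the handles that refer to an entry.  A
 * reader handle contributes 1 and the single writer handle contributes
 * MB_CACHE_WRITER + 1, so "e_used >= MB_CACHE_WRITER" means a writer
 * currently holds the entry, while "0 < e_used < MB_CACHE_WRITER" means
 * only readers do.
 */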
/*
 * The shrinker that the mbcache registers with the kernel so that the
 * cache can be shrunk dynamically under memory pressure.
 */

static int mb_cache_shrink_fn(struct shrinker *shrink,
			      struct shrink_control *sc);

static struct shrinker mb_cache_shrinker = {
	.shrink = mb_cache_shrink_fn,
	.seeks = DEFAULT_SEEKS,
};

static inline int
__mb_cache_entry_is_hashed(struct mb_cache_entry *ce)
{
	return !list_empty(&ce->e_block_list);
}


static void
__mb_cache_entry_unhash(struct mb_cache_entry *ce)
{
	if (__mb_cache_entry_is_hashed(ce)) {
		list_del_init(&ce->e_block_list);
		list_del(&ce->e_index.o_list);
	}
}


static void
__mb_cache_entry_forget(struct mb_cache_entry *ce, gfp_t gfp_mask)
{
	struct mb_cache *cache = ce->e_cache;

	mb_assert(!(ce->e_used || ce->e_queued));
	kmem_cache_free(cache->c_entry_cache, ce);
	atomic_dec(&cache->c_entry_count);
}


static void
__mb_cache_entry_release_unlock(struct mb_cache_entry *ce)
	__releases(mb_cache_spinlock)
{
	/* Wake up all processes queuing for this cache entry. */
	if (ce->e_queued)
		wake_up_all(&mb_cache_queue);
	if (ce->e_used >= MB_CACHE_WRITER)
		ce->e_used -= MB_CACHE_WRITER;
	ce->e_used--;
	if (!(ce->e_used || ce->e_queued)) {
		if (!__mb_cache_entry_is_hashed(ce))
			goto forget;
		mb_assert(list_empty(&ce->e_lru_list));
		list_add_tail(&ce->e_lru_list, &mb_cache_lru_list);
	}
	spin_unlock(&mb_cache_spinlock);
	return;
forget:
	spin_unlock(&mb_cache_spinlock);
	__mb_cache_entry_forget(ce, GFP_KERNEL);
}


/*
 * mb_cache_shrink_fn()  memory pressure callback
 *
 * This function is called by the kernel memory management when memory
 * gets low.
 *
 * @shrink: (ignored)
 * @sc: shrink_control passed from reclaim
 *
 * Returns the number of objects present in all caches, scaled by
 * sysctl_vfs_cache_pressure.
 */
static int
mb_cache_shrink_fn(struct shrinker *shrink, struct shrink_control *sc)
{
	LIST_HEAD(free_list);
	struct mb_cache *cache;
	struct mb_cache_entry *entry, *tmp;
	int count = 0;
	int nr_to_scan = sc->nr_to_scan;
	gfp_t gfp_mask = sc->gfp_mask;

	mb_debug("trying to free %d entries", nr_to_scan);
	spin_lock(&mb_cache_spinlock);
	while (nr_to_scan-- && !list_empty(&mb_cache_lru_list)) {
		struct mb_cache_entry *ce =
			list_entry(mb_cache_lru_list.next,
				   struct mb_cache_entry, e_lru_list);
		list_move_tail(&ce->e_lru_list, &free_list);
		__mb_cache_entry_unhash(ce);
	}
	list_for_each_entry(cache, &mb_cache_list, c_cache_list) {
		mb_debug("cache %s (%d)", cache->c_name,
			 atomic_read(&cache->c_entry_count));
		count += atomic_read(&cache->c_entry_count);
	}
	spin_unlock(&mb_cache_spinlock);
	list_for_each_entry_safe(entry, tmp, &free_list, e_lru_list) {
		__mb_cache_entry_forget(entry, gfp_mask);
	}
	return (count / 100) * sysctl_vfs_cache_pressure;
}
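/*
 * For illustration: with 300 entries across all caches and the default
 * sysctl_vfs_cache_pressure of 100, the callback above reports
 * (300 / 100) * 100 = 300 objects; lowering vfs_cache_pressure to 50
 * would make it report 150, so this cache would be shrunk less
 * aggressively.
 */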
/*
 * mb_cache_create()  create a new cache
 *
 * All entries in one cache are equal size. Cache entries may be from
 * multiple devices. If this is the first mbcache created, registers
 * the cache with kernel memory management. Returns NULL if no more
 * memory was available.
 *
 * @name: name of the cache (informal)
 * @bucket_bits: log2(number of hash buckets)
 */
struct mb_cache *
mb_cache_create(const char *name, int bucket_bits)
{
	int n, bucket_count = 1 << bucket_bits;
	struct mb_cache *cache = NULL;

	cache = kmalloc(sizeof(struct mb_cache), GFP_KERNEL);
	if (!cache)
		return NULL;
	cache->c_name = name;
	atomic_set(&cache->c_entry_count, 0);
	cache->c_bucket_bits = bucket_bits;
	cache->c_block_hash = kmalloc(bucket_count * sizeof(struct list_head),
				      GFP_KERNEL);
	if (!cache->c_block_hash)
		goto fail;
	for (n = 0; n < bucket_count; n++)
		INIT_LIST_HEAD(&cache->c_block_hash[n]);
	cache->c_index_hash = kmalloc(bucket_count * sizeof(struct list_head),
				      GFP_KERNEL);
	if (!cache->c_index_hash)
		goto fail;
	for (n = 0; n < bucket_count; n++)
		INIT_LIST_HEAD(&cache->c_index_hash[n]);
	cache->c_entry_cache = kmem_cache_create(name,
		sizeof(struct mb_cache_entry), 0,
		SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, NULL);
	if (!cache->c_entry_cache)
		goto fail2;

	/*
	 * Set an upper limit on the number of cache entries so that the hash
	 * chains won't grow too long.
	 */
	cache->c_max_entries = bucket_count << 4;

	spin_lock(&mb_cache_spinlock);
	list_add(&cache->c_cache_list, &mb_cache_list);
	spin_unlock(&mb_cache_spinlock);
	return cache;

fail2:
	kfree(cache->c_index_hash);

fail:
	kfree(cache->c_block_hash);
	kfree(cache);
	return NULL;
}


/*
 * mb_cache_shrink()
 *
 * Removes all cache entries of a device from the cache. Entries that are
 * currently in use cannot be freed and thus remain in the cache; all
 * others are freed.
 *
 * @bdev: which device's cache entries to shrink
 */
void
mb_cache_shrink(struct block_device *bdev)
{
	LIST_HEAD(free_list);
	struct list_head *l, *ltmp;

	spin_lock(&mb_cache_spinlock);
	list_for_each_safe(l, ltmp, &mb_cache_lru_list) {
		struct mb_cache_entry *ce =
			list_entry(l, struct mb_cache_entry, e_lru_list);
		if (ce->e_bdev == bdev) {
			list_move_tail(&ce->e_lru_list, &free_list);
			__mb_cache_entry_unhash(ce);
		}
	}
	spin_unlock(&mb_cache_spinlock);
	list_for_each_safe(l, ltmp, &free_list) {
		__mb_cache_entry_forget(list_entry(l, struct mb_cache_entry,
						   e_lru_list), GFP_KERNEL);
	}
}
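/*
 * Example (not compiled): a filesystem that uses an mb_cache would
 * typically call mb_cache_shrink() when it stops using a device, e.g.
 * from its put_super path, so that no unused entries for that bdev
 * linger on the lru list.  This is only a sketch; "example_put_super"
 * is a hypothetical hook, not part of this file.
 */
#if 0
static void example_put_super(struct super_block *sb)
{
	mb_cache_shrink(sb->s_bdev);
}
#endif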
/*
 * mb_cache_destroy()
 *
 * Shrinks the cache to its minimum possible size (hopefully 0 entries),
 * and then destroys it. If this was the last mbcache, un-registers the
 * mbcache from kernel memory management.
 */
void
mb_cache_destroy(struct mb_cache *cache)
{
	LIST_HEAD(free_list);
	struct list_head *l, *ltmp;

	spin_lock(&mb_cache_spinlock);
	list_for_each_safe(l, ltmp, &mb_cache_lru_list) {
		struct mb_cache_entry *ce =
			list_entry(l, struct mb_cache_entry, e_lru_list);
		if (ce->e_cache == cache) {
			list_move_tail(&ce->e_lru_list, &free_list);
			__mb_cache_entry_unhash(ce);
		}
	}
	list_del(&cache->c_cache_list);
	spin_unlock(&mb_cache_spinlock);

	list_for_each_safe(l, ltmp, &free_list) {
		__mb_cache_entry_forget(list_entry(l, struct mb_cache_entry,
						   e_lru_list), GFP_KERNEL);
	}

	if (atomic_read(&cache->c_entry_count) > 0) {
		mb_error("cache %s: %d orphaned entries",
			 cache->c_name,
			 atomic_read(&cache->c_entry_count));
	}

	kmem_cache_destroy(cache->c_entry_cache);

	kfree(cache->c_index_hash);
	kfree(cache->c_block_hash);
	kfree(cache);
}

/*
 * mb_cache_entry_alloc()
 *
 * Allocates a new cache entry. The new entry will not be valid initially,
 * and thus cannot be looked up yet. It should be filled with data, and
 * then inserted into the cache using mb_cache_entry_insert(). Returns NULL
 * if no more memory was available.
 */
struct mb_cache_entry *
mb_cache_entry_alloc(struct mb_cache *cache, gfp_t gfp_flags)
{
	struct mb_cache_entry *ce = NULL;

	if (atomic_read(&cache->c_entry_count) >= cache->c_max_entries) {
		spin_lock(&mb_cache_spinlock);
		if (!list_empty(&mb_cache_lru_list)) {
			ce = list_entry(mb_cache_lru_list.next,
					struct mb_cache_entry, e_lru_list);
			list_del_init(&ce->e_lru_list);
			__mb_cache_entry_unhash(ce);
		}
		spin_unlock(&mb_cache_spinlock);
	}
	if (!ce) {
		ce = kmem_cache_alloc(cache->c_entry_cache, gfp_flags);
		if (!ce)
			return NULL;
		atomic_inc(&cache->c_entry_count);
		INIT_LIST_HEAD(&ce->e_lru_list);
		INIT_LIST_HEAD(&ce->e_block_list);
		ce->e_cache = cache;
		ce->e_queued = 0;
	}
	ce->e_used = 1 + MB_CACHE_WRITER;
	return ce;
}
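/*
 * Example (not compiled): the allocate/insert/release sequence described
 * in the comment above mb_cache_entry_alloc(), roughly as a filesystem
 * would run it after writing out a new disk block it wants to find again
 * later.  "example_cache_block" and its arguments are placeholders.
 */
#if 0
static void example_cache_block(struct mb_cache *cache,
				struct block_device *bdev,
				sector_t block, unsigned int key)
{
	struct mb_cache_entry *ce;
	int error;

	ce = mb_cache_entry_alloc(cache, GFP_NOFS);
	if (!ce)
		return;
	error = mb_cache_entry_insert(ce, bdev, block, key);
	if (error == -EBUSY)
		mb_debug("block already cached by another process");
	/* Drop our handle; a valid (hashed) entry goes onto the lru list. */
	mb_cache_entry_release(ce);
}
#endif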
/*
 * mb_cache_entry_insert()
 *
 * Inserts an entry that was allocated using mb_cache_entry_alloc() into
 * the cache. After this, the cache entry can be looked up, but is not yet
 * in the lru list as the caller still holds a handle to it. Returns 0 on
 * success, or -EBUSY if a cache entry for that device + block already
 * exists (this may happen after a failed lookup, if another process has
 * inserted the same cache entry in the meantime).
 *
 * @bdev: device the cache entry belongs to
 * @block: block number
 * @key: lookup key
 */
int
mb_cache_entry_insert(struct mb_cache_entry *ce, struct block_device *bdev,
		      sector_t block, unsigned int key)
{
	struct mb_cache *cache = ce->e_cache;
	unsigned int bucket;
	struct list_head *l;
	int error = -EBUSY;

	bucket = hash_long((unsigned long)bdev + (block & 0xffffffff),
			   cache->c_bucket_bits);
	spin_lock(&mb_cache_spinlock);
	list_for_each_prev(l, &cache->c_block_hash[bucket]) {
		struct mb_cache_entry *ce =
			list_entry(l, struct mb_cache_entry, e_block_list);
		if (ce->e_bdev == bdev && ce->e_block == block)
			goto out;
	}
	__mb_cache_entry_unhash(ce);
	ce->e_bdev = bdev;
	ce->e_block = block;
	list_add(&ce->e_block_list, &cache->c_block_hash[bucket]);
	ce->e_index.o_key = key;
	bucket = hash_long(key, cache->c_bucket_bits);
	list_add(&ce->e_index.o_list, &cache->c_index_hash[bucket]);
	error = 0;
out:
	spin_unlock(&mb_cache_spinlock);
	return error;
}


/*
 * mb_cache_entry_release()
 *
 * Release a handle to a cache entry. When the last handle to a cache entry
 * is released it is either freed (if it is invalid) or otherwise inserted
 * into the lru list.
 */
void
mb_cache_entry_release(struct mb_cache_entry *ce)
{
	spin_lock(&mb_cache_spinlock);
	__mb_cache_entry_release_unlock(ce);
}


/*
 * mb_cache_entry_free()
 *
 * Removes ("invalidates") the entry from the cache's hash tables and then
 * drops the handle; equivalent to unhashing the entry followed by
 * mb_cache_entry_release().
 */
void
mb_cache_entry_free(struct mb_cache_entry *ce)
{
	spin_lock(&mb_cache_spinlock);
	mb_assert(list_empty(&ce->e_lru_list));
	__mb_cache_entry_unhash(ce);
	__mb_cache_entry_release_unlock(ce);
}
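/*
 * Example (not compiled): invalidating the entry for a block that is
 * about to be freed on disk, combining mb_cache_entry_get() (defined
 * below) with mb_cache_entry_free().  "example_forget_block" is a
 * placeholder name.
 */
#if 0
static void example_forget_block(struct mb_cache *cache,
				 struct block_device *bdev, sector_t block)
{
	struct mb_cache_entry *ce;

	ce = mb_cache_entry_get(cache, bdev, block);
	if (ce)
		mb_cache_entry_free(ce);	/* unhash and drop the handle */
}
#endif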
/*
 * mb_cache_entry_get()
 *
 * Get a cache entry by device / block number. (There can only be one entry
 * in the cache per device and block.) Returns NULL if no such cache entry
 * exists. The returned cache entry is locked for exclusive access ("single
 * writer").
 */
struct mb_cache_entry *
mb_cache_entry_get(struct mb_cache *cache, struct block_device *bdev,
		   sector_t block)
{
	unsigned int bucket;
	struct list_head *l;
	struct mb_cache_entry *ce;

	bucket = hash_long((unsigned long)bdev + (block & 0xffffffff),
			   cache->c_bucket_bits);
	spin_lock(&mb_cache_spinlock);
	list_for_each(l, &cache->c_block_hash[bucket]) {
		ce = list_entry(l, struct mb_cache_entry, e_block_list);
		if (ce->e_bdev == bdev && ce->e_block == block) {
			DEFINE_WAIT(wait);

			if (!list_empty(&ce->e_lru_list))
				list_del_init(&ce->e_lru_list);

			while (ce->e_used > 0) {
				ce->e_queued++;
				prepare_to_wait(&mb_cache_queue, &wait,
						TASK_UNINTERRUPTIBLE);
				spin_unlock(&mb_cache_spinlock);
				schedule();
				spin_lock(&mb_cache_spinlock);
				ce->e_queued--;
			}
			finish_wait(&mb_cache_queue, &wait);
			ce->e_used += 1 + MB_CACHE_WRITER;

			if (!__mb_cache_entry_is_hashed(ce)) {
				__mb_cache_entry_release_unlock(ce);
				return NULL;
			}
			goto cleanup;
		}
	}
	ce = NULL;

cleanup:
	spin_unlock(&mb_cache_spinlock);
	return ce;
}

#if !defined(MB_CACHE_INDEXES_COUNT) || (MB_CACHE_INDEXES_COUNT > 0)

static struct mb_cache_entry *
__mb_cache_entry_find(struct list_head *l, struct list_head *head,
		      struct block_device *bdev, unsigned int key)
{
	while (l != head) {
		struct mb_cache_entry *ce =
			list_entry(l, struct mb_cache_entry, e_index.o_list);
		if (ce->e_bdev == bdev && ce->e_index.o_key == key) {
			DEFINE_WAIT(wait);

			if (!list_empty(&ce->e_lru_list))
				list_del_init(&ce->e_lru_list);

			/* Incrementing before holding the lock gives readers
			   priority over writers. */
			ce->e_used++;
			while (ce->e_used >= MB_CACHE_WRITER) {
				ce->e_queued++;
				prepare_to_wait(&mb_cache_queue, &wait,
						TASK_UNINTERRUPTIBLE);
				spin_unlock(&mb_cache_spinlock);
				schedule();
				spin_lock(&mb_cache_spinlock);
				ce->e_queued--;
			}
			finish_wait(&mb_cache_queue, &wait);

			if (!__mb_cache_entry_is_hashed(ce)) {
				__mb_cache_entry_release_unlock(ce);
				spin_lock(&mb_cache_spinlock);
				return ERR_PTR(-EAGAIN);
			}
			return ce;
		}
		l = l->next;
	}
	return NULL;
}


/*
 * mb_cache_entry_find_first()
 *
 * Find the first cache entry on a given device with a certain key in
 * an additional index. Additional matches can be found with
 * mb_cache_entry_find_next(). Returns NULL if no match was found. The
 * returned cache entry is locked for shared access ("multiple readers").
 *
 * @cache: the cache to search
 * @bdev: the device the cache entry should belong to
 * @key: the key in the index
 */
struct mb_cache_entry *
mb_cache_entry_find_first(struct mb_cache *cache, struct block_device *bdev,
			  unsigned int key)
{
	unsigned int bucket = hash_long(key, cache->c_bucket_bits);
	struct list_head *l;
	struct mb_cache_entry *ce;

	spin_lock(&mb_cache_spinlock);
	l = cache->c_index_hash[bucket].next;
	ce = __mb_cache_entry_find(l, &cache->c_index_hash[bucket], bdev, key);
	spin_unlock(&mb_cache_spinlock);
	return ce;
}
/*
 * mb_cache_entry_find_next()
 *
 * Find the next cache entry on a given device with a certain key in an
 * additional index. Returns NULL if no match could be found. The previous
 * entry is automatically released, so that mb_cache_entry_find_next() can
 * be called like this:
 *
 * entry = mb_cache_entry_find_first();
 * while (entry) {
 *	...
 *	entry = mb_cache_entry_find_next(entry, ...);
 * }
 *
 * @prev: The previous match
 * @bdev: the device the cache entry should belong to
 * @key: the key in the index
 */
struct mb_cache_entry *
mb_cache_entry_find_next(struct mb_cache_entry *prev,
			 struct block_device *bdev, unsigned int key)
{
	struct mb_cache *cache = prev->e_cache;
	unsigned int bucket = hash_long(key, cache->c_bucket_bits);
	struct list_head *l;
	struct mb_cache_entry *ce;

	spin_lock(&mb_cache_spinlock);
	l = prev->e_index.o_list.next;
	ce = __mb_cache_entry_find(l, &cache->c_index_hash[bucket], bdev, key);
	__mb_cache_entry_release_unlock(prev);
	return ce;
}

#endif  /* !defined(MB_CACHE_INDEXES_COUNT) || (MB_CACHE_INDEXES_COUNT > 0) */

static int __init init_mbcache(void)
{
	register_shrinker(&mb_cache_shrinker);
	return 0;
}

static void __exit exit_mbcache(void)
{
	unregister_shrinker(&mb_cache_shrinker);
}

module_init(init_mbcache)
module_exit(exit_mbcache)
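/*
 * Example (not compiled): a client of this cache, such as an extended
 * attribute implementation, would typically create one mb_cache when its
 * own module is initialized and destroy it on unload.  The cache name
 * "example_xattr" and the bucket count of 2^6 are placeholder choices.
 */
#if 0
static struct mb_cache *example_xattr_cache;

static int __init example_client_init(void)
{
	example_xattr_cache = mb_cache_create("example_xattr", 6);
	if (!example_xattr_cache)
		return -ENOMEM;
	return 0;
}

static void __exit example_client_exit(void)
{
	mb_cache_destroy(example_xattr_cache);
}
#endif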