xref: /openbmc/linux/fs/mbcache.c (revision 1f3e55fe)
1 /*
2  * linux/fs/mbcache.c
3  * (C) 2001-2002 Andreas Gruenbacher, <a.gruenbacher@computer.org>
4  */
5 
6 /*
7  * Filesystem Meta Information Block Cache (mbcache)
8  *
9  * The mbcache caches blocks of block devices that need to be located
10  * by their device/block number, as well as by other criteria (such
11  * as the block's contents).
12  *
13  * There can only be one cache entry in a cache per device and block number.
14  * Additional indexes need not be unique in this sense. The number of
15  * additional indexes (=other criteria) can be hardwired at compile time
16  * or specified at cache create time.
17  *
18  * Each cache entry is of fixed size. An entry may be `valid' or `invalid'
19  * in the cache. A valid entry is in the main hash tables of the cache,
20  * and may also be in the lru list. An invalid entry is not in any hashes
21  * or lists.
22  *
23  * A valid cache entry is only in the lru list if no handles refer to it.
24  * Invalid cache entries will be freed when the last handle to the cache
25  * entry is released. Entries that cannot be freed immediately are put
26  * back on the lru list.
27  */
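
/*
 * A sketch of the state transitions described above, in terms of the calls
 * defined in this file (illustrative only; cache, bdev, block and key are
 * assumed to come from the caller):
 *
 * ce = mb_cache_entry_alloc(cache, GFP_NOFS);      (invalid, handle held)
 * mb_cache_entry_insert(ce, bdev, block, key);     (valid, handle held)
 * mb_cache_entry_release(ce);                      (valid, unused, on lru list)
 * ce = mb_cache_entry_get(cache, bdev, block);     (valid, handle held again)
 * mb_cache_entry_free(ce);                         (invalid, freed when unreferenced)
 */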
28 
29 /*
30  * Lock descriptions and usage:
31  *
32  * Each hash chain of both the block and index hash tables now contains
33  * a built-in lock used to serialize accesses to the hash chain.
34  *
35  * Accesses to global data structures mb_cache_list and mb_cache_lru_list
36  * are serialized via the global spinlock mb_cache_spinlock.
37  *
38  * Each mb_cache_entry contains a spinlock, e_entry_lock, to serialize
39  * accesses to its local data, such as e_used and e_queued.
40  *
41  * Lock ordering:
42  *
43  * Each block hash chain's lock has the highest lock order, followed by an
44  * index hash chain's lock, mb_cache_bg_lock (used to implement mb_cache_entry's
45  * lock), and mb_cache_spinlock, with the lowest order.  While holding
46  * either a block or index hash chain lock, a thread can acquire
47  * mb_cache_bg_lock, which in turn can also acquire mb_cache_spinlock.
48  *
49  * Synchronization:
50  *
51  * Since both mb_cache_entry_get and mb_cache_entry_find scan the block and
52  * index hash chains, they need to lock the corresponding hash chain.  For each
53  * mb_cache_entry within the chain, they also need to lock the mb_cache_entry to
54  * prevent a simultaneous release or free of the entry and to
55  * serialize accesses to the e_used and e_queued members of the entry.
56  *
57  * To avoid having a dangling reference to an already freed
58  * mb_cache_entry, an mb_cache_entry is only freed when it is not on a
59  * block hash chain and also no longer being referenced, i.e. both e_used
60  * and e_queued are zero.  When an mb_cache_entry is explicitly freed it is
61  * first removed from its block hash chain.
62  */
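
/*
 * The permitted nesting, outermost to innermost, sketched as code (no single
 * code path below actually takes all four locks at once; this only restates
 * the ordering described above):
 *
 * hlist_bl_lock(ce->e_block_hash_p);
 * hlist_bl_lock(ce->e_index_hash_p);
 * __spin_lock_mb_cache_entry(ce);          (takes mb_cache_bg_lock)
 * spin_lock(&mb_cache_spinlock);
 * ...
 * spin_unlock(&mb_cache_spinlock);
 * __spin_unlock_mb_cache_entry(ce);
 * hlist_bl_unlock(ce->e_index_hash_p);
 * hlist_bl_unlock(ce->e_block_hash_p);
 */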
63 
64 #include <linux/kernel.h>
65 #include <linux/module.h>
66 
67 #include <linux/hash.h>
68 #include <linux/fs.h>
69 #include <linux/mm.h>
70 #include <linux/slab.h>
71 #include <linux/sched.h>
72 #include <linux/list_bl.h>
73 #include <linux/mbcache.h>
74 #include <linux/init.h>
75 #include <linux/blockgroup_lock.h>
76 
77 #ifdef MB_CACHE_DEBUG
78 # define mb_debug(f...) do { \
79 		printk(KERN_DEBUG f); \
80 		printk("\n"); \
81 	} while (0)
82 #define mb_assert(c) do { if (!(c)) \
83 		printk(KERN_ERR "assertion " #c " failed\n"); \
84 	} while(0)
85 #else
86 # define mb_debug(f...) do { } while(0)
87 # define mb_assert(c) do { } while(0)
88 #endif
89 #define mb_error(f...) do { \
90 		printk(KERN_ERR f); \
91 		printk("\n"); \
92 	} while(0)
93 
94 #define MB_CACHE_WRITER ((unsigned short)~0U >> 1)
95 
96 #define MB_CACHE_ENTRY_LOCK_BITS	__builtin_log2(NR_BG_LOCKS)
97 #define	MB_CACHE_ENTRY_LOCK_INDEX(ce)			\
98 	(hash_long((unsigned long)ce, MB_CACHE_ENTRY_LOCK_BITS))
99 
100 static DECLARE_WAIT_QUEUE_HEAD(mb_cache_queue);
101 static struct blockgroup_lock *mb_cache_bg_lock;
102 
103 MODULE_AUTHOR("Andreas Gruenbacher <a.gruenbacher@computer.org>");
104 MODULE_DESCRIPTION("Meta block cache (for extended attributes)");
105 MODULE_LICENSE("GPL");
106 
107 EXPORT_SYMBOL(mb_cache_create);
108 EXPORT_SYMBOL(mb_cache_shrink);
109 EXPORT_SYMBOL(mb_cache_destroy);
110 EXPORT_SYMBOL(mb_cache_entry_alloc);
111 EXPORT_SYMBOL(mb_cache_entry_insert);
112 EXPORT_SYMBOL(mb_cache_entry_release);
113 EXPORT_SYMBOL(mb_cache_entry_free);
114 EXPORT_SYMBOL(mb_cache_entry_get);
115 #if !defined(MB_CACHE_INDEXES_COUNT) || (MB_CACHE_INDEXES_COUNT > 0)
116 EXPORT_SYMBOL(mb_cache_entry_find_first);
117 EXPORT_SYMBOL(mb_cache_entry_find_next);
118 #endif
119 
120 /*
121  * Global data: list of all mbcache's, lru list, and a spinlock for
122  * accessing cache data structures on SMP machines. The lru list is
123  * global across all mbcaches.
124  */
125 
126 static LIST_HEAD(mb_cache_list);
127 static LIST_HEAD(mb_cache_lru_list);
128 static DEFINE_SPINLOCK(mb_cache_spinlock);
129 
130 static inline void
131 __spin_lock_mb_cache_entry(struct mb_cache_entry *ce)
132 {
133 	spin_lock(bgl_lock_ptr(mb_cache_bg_lock,
134 		MB_CACHE_ENTRY_LOCK_INDEX(ce)));
135 }
136 
137 static inline void
138 __spin_unlock_mb_cache_entry(struct mb_cache_entry *ce)
139 {
140 	spin_unlock(bgl_lock_ptr(mb_cache_bg_lock,
141 		MB_CACHE_ENTRY_LOCK_INDEX(ce)));
142 }
143 
144 static inline int
145 __mb_cache_entry_is_block_hashed(struct mb_cache_entry *ce)
146 {
147 	return !hlist_bl_unhashed(&ce->e_block_list);
148 }
149 
150 
151 static inline void
152 __mb_cache_entry_unhash_block(struct mb_cache_entry *ce)
153 {
154 	if (__mb_cache_entry_is_block_hashed(ce))
155 		hlist_bl_del_init(&ce->e_block_list);
156 }
157 
158 static inline int
159 __mb_cache_entry_is_index_hashed(struct mb_cache_entry *ce)
160 {
161 	return !hlist_bl_unhashed(&ce->e_index.o_list);
162 }
163 
164 static inline void
165 __mb_cache_entry_unhash_index(struct mb_cache_entry *ce)
166 {
167 	if (__mb_cache_entry_is_index_hashed(ce))
168 		hlist_bl_del_init(&ce->e_index.o_list);
169 }
170 
171 /*
172  * __mb_cache_entry_unhash_unlock()
173  *
174  * This function is called to unhash the entry from both the block and
175  * index hash chains.
176  * It assumes both the block and index hash chains are locked upon entry.
177  * It also unlocks both hash chains upon exit.
178  */
179 static inline void
180 __mb_cache_entry_unhash_unlock(struct mb_cache_entry *ce)
181 {
182 	__mb_cache_entry_unhash_index(ce);
183 	hlist_bl_unlock(ce->e_index_hash_p);
184 	__mb_cache_entry_unhash_block(ce);
185 	hlist_bl_unlock(ce->e_block_hash_p);
186 }
187 
188 static void
189 __mb_cache_entry_forget(struct mb_cache_entry *ce, gfp_t gfp_mask)
190 {
191 	struct mb_cache *cache = ce->e_cache;
192 
193 	mb_assert(!(ce->e_used || ce->e_queued || atomic_read(&ce->e_refcnt)));
194 	kmem_cache_free(cache->c_entry_cache, ce);
195 	atomic_dec(&cache->c_entry_count);
196 }
197 
198 static void
199 __mb_cache_entry_release(struct mb_cache_entry *ce)
200 {
201 	/* First lock the entry to serialize access to its local data. */
202 	__spin_lock_mb_cache_entry(ce);
203 	/* Wake up all processes queuing for this cache entry. */
204 	if (ce->e_queued)
205 		wake_up_all(&mb_cache_queue);
206 	if (ce->e_used >= MB_CACHE_WRITER)
207 		ce->e_used -= MB_CACHE_WRITER;
208 	/*
209 	 * Make sure that all cache entries on the lru list have
210 	 * both e_used and e_queued equal to zero.
211 	 */
212 	ce->e_used--;
213 	if (!(ce->e_used || ce->e_queued || atomic_read(&ce->e_refcnt))) {
214 		if (!__mb_cache_entry_is_block_hashed(ce)) {
215 			__spin_unlock_mb_cache_entry(ce);
216 			goto forget;
217 		}
218 		/*
219 		 * Need access to lru list, first drop entry lock,
220 		 * then reacquire the lock in the proper order.
221 		 */
222 		spin_lock(&mb_cache_spinlock);
223 		if (list_empty(&ce->e_lru_list))
224 			list_add_tail(&ce->e_lru_list, &mb_cache_lru_list);
225 		spin_unlock(&mb_cache_spinlock);
226 	}
227 	__spin_unlock_mb_cache_entry(ce);
228 	return;
229 forget:
230 	mb_assert(list_empty(&ce->e_lru_list));
231 	__mb_cache_entry_forget(ce, GFP_KERNEL);
232 }
233 
234 /*
235  * mb_cache_shrink_scan()  memory pressure callback
236  *
237  * This function is called by the kernel memory management when memory
238  * gets low.
239  *
240  * @shrink: (ignored)
241  * @sc: shrink_control passed from reclaim
242  *
243  * Returns the number of objects freed.
244  */
245 static unsigned long
246 mb_cache_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
247 {
248 	LIST_HEAD(free_list);
249 	struct mb_cache_entry *entry, *tmp;
250 	int nr_to_scan = sc->nr_to_scan;
251 	gfp_t gfp_mask = sc->gfp_mask;
252 	unsigned long freed = 0;
253 
254 	mb_debug("trying to free %d entries", nr_to_scan);
255 	spin_lock(&mb_cache_spinlock);
256 	while ((nr_to_scan-- > 0) && !list_empty(&mb_cache_lru_list)) {
257 		struct mb_cache_entry *ce =
258 			list_entry(mb_cache_lru_list.next,
259 				struct mb_cache_entry, e_lru_list);
260 		list_del_init(&ce->e_lru_list);
261 		if (ce->e_used || ce->e_queued || atomic_read(&ce->e_refcnt))
262 			continue;
263 		spin_unlock(&mb_cache_spinlock);
264 		/* Prevent any find or get operation on the entry */
265 		hlist_bl_lock(ce->e_block_hash_p);
266 		hlist_bl_lock(ce->e_index_hash_p);
267 		/* Ignore if it is touched by a find/get */
268 		if (ce->e_used || ce->e_queued || atomic_read(&ce->e_refcnt) ||
269 			!list_empty(&ce->e_lru_list)) {
270 			hlist_bl_unlock(ce->e_index_hash_p);
271 			hlist_bl_unlock(ce->e_block_hash_p);
272 			spin_lock(&mb_cache_spinlock);
273 			continue;
274 		}
275 		__mb_cache_entry_unhash_unlock(ce);
276 		list_add_tail(&ce->e_lru_list, &free_list);
277 		spin_lock(&mb_cache_spinlock);
278 	}
279 	spin_unlock(&mb_cache_spinlock);
280 
281 	list_for_each_entry_safe(entry, tmp, &free_list, e_lru_list) {
282 		__mb_cache_entry_forget(entry, gfp_mask);
283 		freed++;
284 	}
285 	return freed;
286 }
287 
288 static unsigned long
289 mb_cache_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
290 {
291 	struct mb_cache *cache;
292 	unsigned long count = 0;
293 
294 	spin_lock(&mb_cache_spinlock);
295 	list_for_each_entry(cache, &mb_cache_list, c_cache_list) {
296 		mb_debug("cache %s (%d)", cache->c_name,
297 			  atomic_read(&cache->c_entry_count));
298 		count += atomic_read(&cache->c_entry_count);
299 	}
300 	spin_unlock(&mb_cache_spinlock);
301 
302 	return vfs_pressure_ratio(count);
303 }
304 
305 static struct shrinker mb_cache_shrinker = {
306 	.count_objects = mb_cache_shrink_count,
307 	.scan_objects = mb_cache_shrink_scan,
308 	.seeks = DEFAULT_SEEKS,
309 };
310 
311 /*
312  * mb_cache_create()  create a new cache
313  *
314  * All entries in one cache are of equal size. Cache entries may be from
315  * multiple devices. If this is the first mbcache created, the cache is
316  * registered with kernel memory management. Returns NULL if no more
317  * memory was available.
318  *
319  * @name: name of the cache (informal)
320  * @bucket_bits: log2(number of hash buckets)
321  */
322 struct mb_cache *
323 mb_cache_create(const char *name, int bucket_bits)
324 {
325 	int n, bucket_count = 1 << bucket_bits;
326 	struct mb_cache *cache = NULL;
327 
328 	if (!mb_cache_bg_lock) {
329 		mb_cache_bg_lock = kmalloc(sizeof(struct blockgroup_lock),
330 			GFP_KERNEL);
331 		if (!mb_cache_bg_lock)
332 			return NULL;
333 		bgl_lock_init(mb_cache_bg_lock);
334 	}
335 
336 	cache = kmalloc(sizeof(struct mb_cache), GFP_KERNEL);
337 	if (!cache)
338 		return NULL;
339 	cache->c_name = name;
340 	atomic_set(&cache->c_entry_count, 0);
341 	cache->c_bucket_bits = bucket_bits;
342 	cache->c_block_hash = kmalloc(bucket_count *
343 		sizeof(struct hlist_bl_head), GFP_KERNEL);
344 	if (!cache->c_block_hash)
345 		goto fail;
346 	for (n=0; n<bucket_count; n++)
347 		INIT_HLIST_BL_HEAD(&cache->c_block_hash[n]);
348 	cache->c_index_hash = kmalloc(bucket_count *
349 		sizeof(struct hlist_bl_head), GFP_KERNEL);
350 	if (!cache->c_index_hash)
351 		goto fail;
352 	for (n=0; n<bucket_count; n++)
353 		INIT_HLIST_BL_HEAD(&cache->c_index_hash[n]);
354 	cache->c_entry_cache = kmem_cache_create(name,
355 		sizeof(struct mb_cache_entry), 0,
356 		SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, NULL);
357 	if (!cache->c_entry_cache)
358 		goto fail2;
359 
360 	/*
361 	 * Set an upper limit on the number of cache entries so that the hash
362 	 * chains won't grow too long.
363 	 */
364 	cache->c_max_entries = bucket_count << 4;
365 
366 	spin_lock(&mb_cache_spinlock);
367 	list_add(&cache->c_cache_list, &mb_cache_list);
368 	spin_unlock(&mb_cache_spinlock);
369 	return cache;
370 
371 fail2:
372 	kfree(cache->c_index_hash);
373 
374 fail:
375 	kfree(cache->c_block_hash);
376 	kfree(cache);
377 	return NULL;
378 }
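
/*
 * Example (an illustrative sketch only; the cache name and bucket_bits value
 * are placeholders, not taken from any real caller):
 *
 * struct mb_cache *cache;
 *
 * cache = mb_cache_create("demo_cache", 10);     (1 << 10 = 1024 hash buckets)
 * if (!cache)
 *	return -ENOMEM;
 */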
379 
380 
381 /*
382  * mb_cache_shrink()
383  *
384  * Removes all cache entries of a device from the cache. All cache entries
385  * currently in use cannot be freed, and thus remain in the cache. All others
386  * are freed.
387  *
388  * @bdev: which device's cache entries to shrink
389  */
390 void
391 mb_cache_shrink(struct block_device *bdev)
392 {
393 	LIST_HEAD(free_list);
394 	struct list_head *l;
395 	struct mb_cache_entry *ce, *tmp;
396 
397 	l = &mb_cache_lru_list;
398 	spin_lock(&mb_cache_spinlock);
399 	while (!list_is_last(l, &mb_cache_lru_list)) {
400 		l = l->next;
401 		ce = list_entry(l, struct mb_cache_entry, e_lru_list);
402 		if (ce->e_bdev == bdev) {
403 			list_del_init(&ce->e_lru_list);
404 			if (ce->e_used || ce->e_queued ||
405 				atomic_read(&ce->e_refcnt))
406 				continue;
407 			spin_unlock(&mb_cache_spinlock);
408 			/*
409 			 * Prevent any find or get operation on the entry.
410 			 */
411 			hlist_bl_lock(ce->e_block_hash_p);
412 			hlist_bl_lock(ce->e_index_hash_p);
413 			/* Ignore if it is touched by a find/get */
414 			if (ce->e_used || ce->e_queued ||
415 				atomic_read(&ce->e_refcnt) ||
416 				!list_empty(&ce->e_lru_list)) {
417 				hlist_bl_unlock(ce->e_index_hash_p);
418 				hlist_bl_unlock(ce->e_block_hash_p);
419 				l = &mb_cache_lru_list;
420 				spin_lock(&mb_cache_spinlock);
421 				continue;
422 			}
423 			__mb_cache_entry_unhash_unlock(ce);
424 			mb_assert(!(ce->e_used || ce->e_queued ||
425 				atomic_read(&ce->e_refcnt)));
426 			list_add_tail(&ce->e_lru_list, &free_list);
427 			l = &mb_cache_lru_list;
428 			spin_lock(&mb_cache_spinlock);
429 		}
430 	}
431 	spin_unlock(&mb_cache_spinlock);
432 
433 	list_for_each_entry_safe(ce, tmp, &free_list, e_lru_list) {
434 		__mb_cache_entry_forget(ce, GFP_KERNEL);
435 	}
436 }
437 
438 
439 /*
440  * mb_cache_destroy()
441  *
442  * Shrinks the cache to its minimum possible size (hopefully 0 entries),
443  * and then destroys it. If this was the last mbcache, un-registers the
444  * mbcache from kernel memory management.
445  */
446 void
447 mb_cache_destroy(struct mb_cache *cache)
448 {
449 	LIST_HEAD(free_list);
450 	struct mb_cache_entry *ce, *tmp;
451 
452 	spin_lock(&mb_cache_spinlock);
453 	list_for_each_entry_safe(ce, tmp, &mb_cache_lru_list, e_lru_list) {
454 		if (ce->e_cache == cache)
455 			list_move_tail(&ce->e_lru_list, &free_list);
456 	}
457 	list_del(&cache->c_cache_list);
458 	spin_unlock(&mb_cache_spinlock);
459 
460 	list_for_each_entry_safe(ce, tmp, &free_list, e_lru_list) {
461 		list_del_init(&ce->e_lru_list);
462 		/*
463 		 * Prevent any find or get operation on the entry.
464 		 */
465 		hlist_bl_lock(ce->e_block_hash_p);
466 		hlist_bl_lock(ce->e_index_hash_p);
467 		mb_assert(!(ce->e_used || ce->e_queued ||
468 			atomic_read(&ce->e_refcnt)));
469 		__mb_cache_entry_unhash_unlock(ce);
470 		__mb_cache_entry_forget(ce, GFP_KERNEL);
471 	}
472 
473 	if (atomic_read(&cache->c_entry_count) > 0) {
474 		mb_error("cache %s: %d orphaned entries",
475 			  cache->c_name,
476 			  atomic_read(&cache->c_entry_count));
477 	}
478 
479 	kfree(cache->c_index_hash);
480 	kfree(cache->c_block_hash);
481 	kfree(cache);
482 }
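
/*
 * Typical teardown, sketched under the assumption that "cache" and "bdev"
 * come from the caller: shrink away one device's unused entries when that
 * device goes away, and destroy the cache itself last.
 *
 * mb_cache_shrink(bdev);		(e.g. at unmount time)
 * ...
 * mb_cache_destroy(cache);		(e.g. at module unload time)
 */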
483 
484 /*
485  * mb_cache_entry_alloc()
486  *
487  * Allocates a new cache entry. The new entry will not be valid initially,
488  * and thus cannot be looked up yet. It should be filled with data, and
489  * then inserted into the cache using mb_cache_entry_insert(). Returns NULL
490  * if no more memory was available.
491  */
492 struct mb_cache_entry *
493 mb_cache_entry_alloc(struct mb_cache *cache, gfp_t gfp_flags)
494 {
495 	struct mb_cache_entry *ce;
496 
497 	if (atomic_read(&cache->c_entry_count) >= cache->c_max_entries) {
498 		struct list_head *l;
499 
500 		l = &mb_cache_lru_list;
501 		spin_lock(&mb_cache_spinlock);
502 		while (!list_is_last(l, &mb_cache_lru_list)) {
503 			l = l->next;
504 			ce = list_entry(l, struct mb_cache_entry, e_lru_list);
505 			if (ce->e_cache == cache) {
506 				list_del_init(&ce->e_lru_list);
507 				if (ce->e_used || ce->e_queued ||
508 					atomic_read(&ce->e_refcnt))
509 					continue;
510 				spin_unlock(&mb_cache_spinlock);
511 				/*
512 				 * Prevent any find or get operation on the
513 				 * entry.
514 				 */
515 				hlist_bl_lock(ce->e_block_hash_p);
516 				hlist_bl_lock(ce->e_index_hash_p);
517 				/* Ignore if it is touched by a find/get */
518 				if (ce->e_used || ce->e_queued ||
519 					atomic_read(&ce->e_refcnt) ||
520 					!list_empty(&ce->e_lru_list)) {
521 					hlist_bl_unlock(ce->e_index_hash_p);
522 					hlist_bl_unlock(ce->e_block_hash_p);
523 					l = &mb_cache_lru_list;
524 					spin_lock(&mb_cache_spinlock);
525 					continue;
526 				}
527 				mb_assert(list_empty(&ce->e_lru_list));
528 				mb_assert(!(ce->e_used || ce->e_queued ||
529 					atomic_read(&ce->e_refcnt)));
530 				__mb_cache_entry_unhash_unlock(ce);
531 				goto found;
532 			}
533 		}
534 		spin_unlock(&mb_cache_spinlock);
535 	}
536 
537 	ce = kmem_cache_alloc(cache->c_entry_cache, gfp_flags);
538 	if (!ce)
539 		return NULL;
540 	atomic_inc(&cache->c_entry_count);
541 	INIT_LIST_HEAD(&ce->e_lru_list);
542 	INIT_HLIST_BL_NODE(&ce->e_block_list);
543 	INIT_HLIST_BL_NODE(&ce->e_index.o_list);
544 	ce->e_cache = cache;
545 	ce->e_queued = 0;
546 	atomic_set(&ce->e_refcnt, 0);
547 found:
548 	ce->e_block_hash_p = &cache->c_block_hash[0];
549 	ce->e_index_hash_p = &cache->c_index_hash[0];
550 	ce->e_used = 1 + MB_CACHE_WRITER;
551 	return ce;
552 }
553 
554 
555 /*
556  * mb_cache_entry_insert()
557  *
558  * Inserts an entry that was allocated using mb_cache_entry_alloc() into
559  * the cache. After this, the cache entry can be looked up, but is not yet
560  * in the lru list as the caller still holds a handle to it. Returns 0 on
561  * success, or -EBUSY if a cache entry for that device + block exists
562  * already (this may happen after a failed lookup, if another process
563  * has inserted the same cache entry in the meantime).
564  *
565  * @bdev: device the cache entry belongs to
566  * @block: block number
567  * @key: lookup key
568  */
569 int
570 mb_cache_entry_insert(struct mb_cache_entry *ce, struct block_device *bdev,
571 		      sector_t block, unsigned int key)
572 {
573 	struct mb_cache *cache = ce->e_cache;
574 	unsigned int bucket;
575 	struct hlist_bl_node *l;
576 	struct hlist_bl_head *block_hash_p;
577 	struct hlist_bl_head *index_hash_p;
578 	struct mb_cache_entry *lce;
579 
580 	mb_assert(ce);
581 	bucket = hash_long((unsigned long)bdev + (block & 0xffffffff),
582 			   cache->c_bucket_bits);
583 	block_hash_p = &cache->c_block_hash[bucket];
584 	hlist_bl_lock(block_hash_p);
585 	hlist_bl_for_each_entry(lce, l, block_hash_p, e_block_list) {
586 		if (lce->e_bdev == bdev && lce->e_block == block) {
587 			hlist_bl_unlock(block_hash_p);
588 			return -EBUSY;
589 		}
590 	}
591 	mb_assert(!__mb_cache_entry_is_block_hashed(ce));
592 	__mb_cache_entry_unhash_block(ce);
593 	__mb_cache_entry_unhash_index(ce);
594 	ce->e_bdev = bdev;
595 	ce->e_block = block;
596 	ce->e_block_hash_p = block_hash_p;
597 	ce->e_index.o_key = key;
598 	hlist_bl_add_head(&ce->e_block_list, block_hash_p);
599 	hlist_bl_unlock(block_hash_p);
600 	bucket = hash_long(key, cache->c_bucket_bits);
601 	index_hash_p = &cache->c_index_hash[bucket];
602 	hlist_bl_lock(index_hash_p);
603 	ce->e_index_hash_p = index_hash_p;
604 	hlist_bl_add_head(&ce->e_index.o_list, index_hash_p);
605 	hlist_bl_unlock(index_hash_p);
606 	return 0;
607 }
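
/*
 * One plausible caller pattern (a sketch only, not lifted from any particular
 * filesystem; cache, bh and key are assumed to exist in the caller):
 *
 * ce = mb_cache_entry_alloc(cache, GFP_NOFS);
 * if (!ce)
 *	return -ENOMEM;
 * error = mb_cache_entry_insert(ce, bh->b_bdev, bh->b_blocknr, key);
 * if (error) {
 *	mb_cache_entry_free(ce);	(-EBUSY: another process won the race)
 *	if (error != -EBUSY)
 *		return error;
 * } else
 *	mb_cache_entry_release(ce);	(entry is now valid and findable)
 * return 0;
 */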
608 
609 
610 /*
611  * mb_cache_entry_release()
612  *
613  * Release a handle to a cache entry. When the last handle to a cache entry
614  * is released it is either freed (if it is invalid) or otherwise inserted
615  * in to the lru list.
616  */
617 void
618 mb_cache_entry_release(struct mb_cache_entry *ce)
619 {
620 	__mb_cache_entry_release(ce);
621 }
622 
623 
624 /*
625  * mb_cache_entry_free()
626  * Invalidates a cache entry: unhashes it and releases the caller's handle.
627  */
628 void
629 mb_cache_entry_free(struct mb_cache_entry *ce)
630 {
631 	mb_assert(ce);
632 	mb_assert(list_empty(&ce->e_lru_list));
633 	hlist_bl_lock(ce->e_index_hash_p);
634 	__mb_cache_entry_unhash_index(ce);
635 	hlist_bl_unlock(ce->e_index_hash_p);
636 	hlist_bl_lock(ce->e_block_hash_p);
637 	__mb_cache_entry_unhash_block(ce);
638 	hlist_bl_unlock(ce->e_block_hash_p);
639 	__mb_cache_entry_release(ce);
640 }
641 
642 
643 /*
644  * mb_cache_entry_get()
645  *
646  * Get a cache entry by device / block number. (There can only be one entry
647  * in the cache per device and block.) Returns NULL if no such cache entry
648  * exists. The returned cache entry is locked for exclusive access ("single
649  * writer").
650  */
651 struct mb_cache_entry *
652 mb_cache_entry_get(struct mb_cache *cache, struct block_device *bdev,
653 		   sector_t block)
654 {
655 	unsigned int bucket;
656 	struct hlist_bl_node *l;
657 	struct mb_cache_entry *ce;
658 	struct hlist_bl_head *block_hash_p;
659 
660 	bucket = hash_long((unsigned long)bdev + (block & 0xffffffff),
661 			   cache->c_bucket_bits);
662 	block_hash_p = &cache->c_block_hash[bucket];
663 	/* First serialize access to the hash chain corresponding to the block. */
664 	hlist_bl_lock(block_hash_p);
665 	hlist_bl_for_each_entry(ce, l, block_hash_p, e_block_list) {
666 		mb_assert(ce->e_block_hash_p == block_hash_p);
667 		if (ce->e_bdev == bdev && ce->e_block == block) {
668 			/*
669 			 * Prevent a free from removing the entry.
670 			 */
671 			atomic_inc(&ce->e_refcnt);
672 			hlist_bl_unlock(block_hash_p);
673 			__spin_lock_mb_cache_entry(ce);
674 			atomic_dec(&ce->e_refcnt);
675 			if (ce->e_used > 0) {
676 				DEFINE_WAIT(wait);
677 				while (ce->e_used > 0) {
678 					ce->e_queued++;
679 					prepare_to_wait(&mb_cache_queue, &wait,
680 							TASK_UNINTERRUPTIBLE);
681 					__spin_unlock_mb_cache_entry(ce);
682 					schedule();
683 					__spin_lock_mb_cache_entry(ce);
684 					ce->e_queued--;
685 				}
686 				finish_wait(&mb_cache_queue, &wait);
687 			}
688 			ce->e_used += 1 + MB_CACHE_WRITER;
689 			__spin_unlock_mb_cache_entry(ce);
690 
691 			if (!list_empty(&ce->e_lru_list)) {
692 				spin_lock(&mb_cache_spinlock);
693 				list_del_init(&ce->e_lru_list);
694 				spin_unlock(&mb_cache_spinlock);
695 			}
696 			if (!__mb_cache_entry_is_block_hashed(ce)) {
697 				__mb_cache_entry_release(ce);
698 				return NULL;
699 			}
700 			return ce;
701 		}
702 	}
703 	hlist_bl_unlock(block_hash_p);
704 	return NULL;
705 }
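
/*
 * A lookup-and-invalidate sketch (illustrative only; cache, bdev and block
 * are assumed to come from the caller, e.g. when the block is about to be
 * reused for something else):
 *
 * ce = mb_cache_entry_get(cache, bdev, block);
 * if (ce)
 *	mb_cache_entry_free(ce);	(or mb_cache_entry_release(ce) to keep it)
 */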
706 
707 #if !defined(MB_CACHE_INDEXES_COUNT) || (MB_CACHE_INDEXES_COUNT > 0)
708 
709 static struct mb_cache_entry *
710 __mb_cache_entry_find(struct hlist_bl_node *l, struct hlist_bl_head *head,
711 		      struct block_device *bdev, unsigned int key)
712 {
713 
714 	/* The index hash chain is already locked by the caller. */
715 	while (l != NULL) {
716 		struct mb_cache_entry *ce =
717 			hlist_bl_entry(l, struct mb_cache_entry,
718 				e_index.o_list);
719 		mb_assert(ce->e_index_hash_p == head);
720 		if (ce->e_bdev == bdev && ce->e_index.o_key == key) {
721 			/*
722 			 * Prevent a free from removing the entry.
723 			 */
724 			atomic_inc(&ce->e_refcnt);
725 			hlist_bl_unlock(head);
726 			__spin_lock_mb_cache_entry(ce);
727 			atomic_dec(&ce->e_refcnt);
728 			ce->e_used++;
729 			/* Incrementing before holding the lock gives readers
730 			   priority over writers. */
731 			if (ce->e_used >= MB_CACHE_WRITER) {
732 				DEFINE_WAIT(wait);
733 
734 				while (ce->e_used >= MB_CACHE_WRITER) {
735 					ce->e_queued++;
736 					prepare_to_wait(&mb_cache_queue, &wait,
737 							TASK_UNINTERRUPTIBLE);
738 					__spin_unlock_mb_cache_entry(ce);
739 					schedule();
740 					__spin_lock_mb_cache_entry(ce);
741 					ce->e_queued--;
742 				}
743 				finish_wait(&mb_cache_queue, &wait);
744 			}
745 			__spin_unlock_mb_cache_entry(ce);
746 			if (!list_empty(&ce->e_lru_list)) {
747 				spin_lock(&mb_cache_spinlock);
748 				list_del_init(&ce->e_lru_list);
749 				spin_unlock(&mb_cache_spinlock);
750 			}
751 			if (!__mb_cache_entry_is_block_hashed(ce)) {
752 				__mb_cache_entry_release(ce);
753 				return ERR_PTR(-EAGAIN);
754 			}
755 			return ce;
756 		}
757 		l = l->next;
758 	}
759 	hlist_bl_unlock(head);
760 	return NULL;
761 }
762 
763 
764 /*
765  * mb_cache_entry_find_first()
766  *
767  * Find the first cache entry on a given device with a certain key in
768  * an additional index. Additional matches can be found with
769  * mb_cache_entry_find_next(). Returns NULL if no match was found. The
770  * returned cache entry is locked for shared access ("multiple readers").
771  *
772  * @cache: the cache to search
773  * @bdev: the device the cache entry should belong to
774  * @key: the key in the index
775  */
776 struct mb_cache_entry *
777 mb_cache_entry_find_first(struct mb_cache *cache, struct block_device *bdev,
778 			  unsigned int key)
779 {
780 	unsigned int bucket = hash_long(key, cache->c_bucket_bits);
781 	struct hlist_bl_node *l;
782 	struct mb_cache_entry *ce = NULL;
783 	struct hlist_bl_head *index_hash_p;
784 
785 	index_hash_p = &cache->c_index_hash[bucket];
786 	hlist_bl_lock(index_hash_p);
787 	if (!hlist_bl_empty(index_hash_p)) {
788 		l = hlist_bl_first(index_hash_p);
789 		ce = __mb_cache_entry_find(l, index_hash_p, bdev, key);
790 	} else
791 		hlist_bl_unlock(index_hash_p);
792 	return ce;
793 }
794 
795 
796 /*
797  * mb_cache_entry_find_next()
798  *
799  * Find the next cache entry on a given device with a certain key in an
800  * additional index. Returns NULL if no match could be found. The previous
801  * entry is automatically released, so that mb_cache_entry_find_next() can
802  * be called like this:
803  *
804  * entry = mb_cache_entry_find_first();
805  * while (entry) {
806  * 	...
807  *	entry = mb_cache_entry_find_next(entry, ...);
808  * }
809  *
810  * @prev: The previous match
811  * @bdev: the device the cache entry should belong to
812  * @key: the key in the index
813  */
814 struct mb_cache_entry *
815 mb_cache_entry_find_next(struct mb_cache_entry *prev,
816 			 struct block_device *bdev, unsigned int key)
817 {
818 	struct mb_cache *cache = prev->e_cache;
819 	unsigned int bucket = hash_long(key, cache->c_bucket_bits);
820 	struct hlist_bl_node *l;
821 	struct mb_cache_entry *ce;
822 	struct hlist_bl_head *index_hash_p;
823 
824 	index_hash_p = &cache->c_index_hash[bucket];
825 	mb_assert(prev->e_index_hash_p == index_hash_p);
826 	hlist_bl_lock(index_hash_p);
827 	mb_assert(!hlist_bl_empty(index_hash_p));
828 	l = prev->e_index.o_list.next;
829 	ce = __mb_cache_entry_find(l, index_hash_p, bdev, key);
830 	__mb_cache_entry_release(prev);
831 	return ce;
832 }
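
/*
 * A variant of the search loop from the comment above that also handles the
 * ERR_PTR(-EAGAIN) returned when a matching entry is being invalidated
 * concurrently (illustrative only; cache, bdev and key come from the caller):
 *
 * ce = mb_cache_entry_find_first(cache, bdev, key);
 * while (ce && !IS_ERR(ce)) {
 *	...
 *	ce = mb_cache_entry_find_next(ce, bdev, key);
 * }
 * if (IS_ERR(ce) && PTR_ERR(ce) == -EAGAIN)
 *	...				(retry the search or give up)
 */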
833 
834 #endif  /* !defined(MB_CACHE_INDEXES_COUNT) || (MB_CACHE_INDEXES_COUNT > 0) */
835 
836 static int __init init_mbcache(void)
837 {
838 	register_shrinker(&mb_cache_shrinker);
839 	return 0;
840 }
841 
842 static void __exit exit_mbcache(void)
843 {
844 	unregister_shrinker(&mb_cache_shrinker);
845 }
846 
847 module_init(init_mbcache)
848 module_exit(exit_mbcache)
849 
850