/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
   lru_cache.h

   This file is part of DRBD by Philipp Reisner and Lars Ellenberg.

   Copyright (C) 2003-2008, LINBIT Information Technologies GmbH.
   Copyright (C) 2003-2008, Philipp Reisner <philipp.reisner@linbit.com>.
   Copyright (C) 2003-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.


 */

#ifndef LRU_CACHE_H
#define LRU_CACHE_H

#include <linux/list.h>
#include <linux/slab.h>
#include <linux/bitops.h>
#include <linux/string.h> /* for memset */
#include <linux/seq_file.h>

/*
This header file (and its .c file; see there for the kernel-doc of the
  functions) defines a helper framework to easily keep track of index:label
  associations, and changes to an "active set" of objects, as well as pending
  transactions, to persistently record those changes.

  We use an LRU policy if it is necessary to "cool down" a region currently in
  the active set before we can "heat" a previously unused region.

  Because of this latter property, it is called "lru_cache".
  As it actually Tracks Objects in an Active SeT, we could also call it
  toast (incidentally that is what may happen to the data on the
  backend storage upon next resync, if we don't get it right).

What for?

We replicate IO (more or less synchronously) to local and remote disk.

For crash recovery after replication node failure,
  we need to resync all regions that have been the target of in-flight WRITE IO
  (in use, or "hot", regions), as we don't know whether or not those WRITEs
  have made it to stable storage.

  To avoid a "full resync", we need to persistently track these regions.

  This is known as a "write intent log", and can be implemented as an on-disk
  (coarse or fine grained) bitmap, or other meta data.

  To avoid the overhead of frequent extra writes to this meta data area,
  usually the condition is softened to regions that _may_ have been the target
  of in-flight WRITE IO, e.g. by only lazily clearing the on-disk write-intent
  bitmap, trading frequency of meta data transactions against amount of
  (possibly unnecessary) resync traffic.

  If we set a hard limit on the area that may be "hot" at any given time, we
  limit the amount of resync traffic needed for crash recovery.

For recovery after replication link failure,
  we need to resync all blocks that have been changed on the other replica
  in the meantime, or, if both replicas have been changed independently [*],
  all blocks that have been changed on either replica in the meantime.
  [*] usually as a result of a cluster split-brain and insufficient protection,
      but there are valid use cases to do this on purpose.

  Tracking those blocks can be implemented as a "dirty bitmap".
  Having it fine-grained reduces the amount of resync traffic.
  It should also be persistent, to allow for reboots (or crashes)
  while the replication link is down.

There are various possible implementations for persistently storing
write intent log information, three of which are mentioned here.

"Chunk dirtying"
  The on-disk "dirty bitmap" may be re-used as "write-intent" bitmap as well.
  To reduce the frequency of bitmap updates for write-intent log purposes,
  one could dirty "chunks" (of some size) of the (fine grained) on-disk bitmap
  at a time, while keeping the in-memory "dirty" bitmap as clean as
  possible, flushing it to disk again when a previously "hot" (and on-disk
  dirtied as full chunk) area "cools down" again (no IO in flight anymore,
  and none expected in the near future either).

"Explicit (coarse) write intent bitmap"
  Another implementation could choose a (probably coarse) explicit bitmap
  for write-intent log purposes, in addition to the fine grained dirty bitmap.

"Activity log"
  Yet another implementation may keep track of the hot regions, by starting
  with an empty set, and writing down a journal of region numbers that have
  become "hot", or have "cooled down" again.

  To be able to use a ring buffer for this journal of changes to the active
  set, we not only record the actual changes to that set, but also record the
  unchanged members of the set in a round robin fashion. To do so, we use a
  fixed (but configurable) number of slots which we can identify by index, and
  associate region numbers (labels) with these indices.
  For each transaction recording a change to the active set, we record the
  change itself (index: -old_label, +new_label), and which index is associated
  with which label (index: current_label) within a certain sliding window that
  is moved further over the available indices with each such transaction.

  Thus, for crash recovery, if the ring buffer is sufficiently large, we can
  accurately reconstruct the active set.

  "Sufficiently large" depends only on the maximum number of active objects,
  and on the size of the sliding window recording "index: current_label"
  associations within each transaction.
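
  As an illustration (with made-up numbers, not the ones DRBD uses): with 1024
  slots and a sliding window of 64 "index: current_label" pairs per
  transaction, any 16 consecutive transactions mention every index at least
  once; replaying the most recent 16 transactions (taking, per index, the
  latest association or explicit change found in them) therefore rebuilds the
  complete index:label mapping, and with it the active set.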

  This is what we call the "activity log".

  Currently we need one activity log transaction per single label change, which
  does not give much benefit over the "dirty chunks of bitmap" approach, other
  than potentially fewer seeks.

  We plan to change the transaction format to support multiple changes per
  transaction, which would then reduce several (disjoint, "random") updates to
  the bitmap into one transaction to the activity log ring buffer.
*/

/* this defines an element in a tracked set
 * .colision is for hash table lookup.
 * When we process a new IO request, we know its sector and thus can deduce the
 * region number (label) easily.  To do the label -> object lookup without a
 * full list walk, we use a simple hash table.
 *
 * .list is on one of three lists:
 *  in_use: currently in use (refcnt > 0, lc_number != LC_FREE)
 *     lru: unused but ready to be reused or recycled
 *          (lc_refcnt == 0, lc_number != LC_FREE),
 *    free: unused but ready to be recycled
 *          (lc_refcnt == 0, lc_number == LC_FREE),
 *
 * an element is said to be "in the active set"
 * if it is on either "in_use" or "lru", i.e. lc_number != LC_FREE.
 *
 * DRBD currently (May 2009) only uses 61 elements on the resync lru_cache
 * (total memory usage 2 pages), and up to 3833 elements on the act_log
 * lru_cache, totalling ~215 kB on a 64bit architecture, ~53 pages.
 *
 * We usually do not actually free these objects again, but only "recycle"
 * them, as the change "index: -old_label, +LC_FREE" would need a transaction
 * as well.  This also means that using a kmem_cache to allocate the objects
 * from wastes some resources, but it avoids high order page allocations in
 * kmalloc.
 */
struct lc_element {
	struct hlist_node colision;
	struct list_head list;		 /* LRU list or free list */
	unsigned refcnt;
	/* back "pointer" into lc_cache->element[index],
	 * for paranoia, and for "lc_element_to_index" */
	unsigned lc_index;
	/* if we want to track a larger set of objects,
	 * it needs to become an architecture independent u64 */
	unsigned lc_number;
	/* special label when on free list */
#define LC_FREE (~0U)

	/* for pending changes */
	unsigned lc_new_number;
};

struct lru_cache {
	/* the least recently used item is kept at lru->prev */
	struct list_head lru;
	struct list_head free;
	struct list_head in_use;
	struct list_head to_be_changed;

	/* the pre-created kmem cache to allocate the objects from */
	struct kmem_cache *lc_cache;

	/* size of tracked objects, used to memset(,0,) them in lc_reset */
	size_t element_size;
	/* offset of struct lc_element member in the tracked object */
	size_t element_off;

	/* number of elements (indices) */
	unsigned int nr_elements;
	/* Arbitrary limit on maximum tracked objects. Practical limit is much
	 * lower due to allocation failures, probably. For typical use cases,
	 * nr_elements should be a few thousand at most.
	 * This also limits the maximum value of lc_element.lc_index, allowing the
	 * 8 high bits of .lc_index to be overloaded with flags in the future. */
#define LC_MAX_ACTIVE	(1<<24)

	/* allow to accumulate a few (index:label) changes,
	 * but no more than max_pending_changes */
	unsigned int max_pending_changes;
	/* number of elements currently on to_be_changed list */
	unsigned int pending_changes;

	/* statistics */
	unsigned used; /* number of elements currently on in_use list */
	unsigned long hits, misses, starving, locked, changed;

	/* see below: flag-bits for lru_cache */
	unsigned long flags;


	const char *name;

	/* nr_elements hash slots and nr_elements element pointers */
	struct hlist_head *lc_slot;
	struct lc_element **lc_element;
};


/* flag-bits for lru_cache */
enum {
	/* debugging aid, to catch concurrent access early.
	 * user needs to guarantee exclusive access by proper locking! */
	__LC_PARANOIA,

	/* annotate that the set is "dirty", possibly accumulating further
	 * changes, until a transaction is finally triggered */
	__LC_DIRTY,

	/* Locked, no further changes allowed.
	 * Also used to serialize changing transactions. */
	__LC_LOCKED,

	/* if we need to change the set, but currently there is neither a free
	 * nor an unused element available, we are "starving", and must not give
	 * out further references, to guarantee that eventually some refcnt will
	 * drop to zero and we will be able to make progress again, changing
	 * the set, writing the transaction.
	 * if the statistics say we are frequently starving,
	 * nr_elements is too small. */
	__LC_STARVING,
};
#define LC_PARANOIA (1<<__LC_PARANOIA)
#define LC_DIRTY    (1<<__LC_DIRTY)
#define LC_LOCKED   (1<<__LC_LOCKED)
#define LC_STARVING (1<<__LC_STARVING)

extern struct lru_cache *lc_create(const char *name, struct kmem_cache *cache,
		unsigned max_pending_changes,
		unsigned e_count, size_t e_size, size_t e_off);
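
/*
 * Hypothetical setup sketch (struct demo_extent, demo_cache and demo_lc are
 * made-up names for illustration only, and error handling is omitted; see
 * lib/lru_cache.c for the authoritative kernel-doc).  The tracked object
 * embeds a struct lc_element; e_size is the size of the tracked object,
 * e_off the offset of the embedded element within it, and the arguments
 * after the cache are max_pending_changes and e_count (i.e. nr_elements):
 *
 *	struct demo_extent {
 *		struct lc_element lce;
 *		unsigned long flags;
 *	};
 *
 *	struct kmem_cache *demo_cache;
 *	struct lru_cache *demo_lc;
 *
 *	demo_cache = kmem_cache_create("demo_extents",
 *				       sizeof(struct demo_extent), 0, 0, NULL);
 *	demo_lc = lc_create("demo", demo_cache, 32, 64,
 *			    sizeof(struct demo_extent),
 *			    offsetof(struct demo_extent, lce));
 *
 *	...
 *
 *	lc_destroy(demo_lc);
 *	kmem_cache_destroy(demo_cache);
 */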
extern void lc_reset(struct lru_cache *lc);
extern void lc_destroy(struct lru_cache *lc);
extern void lc_del(struct lru_cache *lc, struct lc_element *element);

extern struct lc_element *lc_get_cumulative(struct lru_cache *lc, unsigned int enr);
extern struct lc_element *lc_try_get(struct lru_cache *lc, unsigned int enr);
extern struct lc_element *lc_find(struct lru_cache *lc, unsigned int enr);
extern struct lc_element *lc_get(struct lru_cache *lc, unsigned int enr);
extern unsigned int lc_put(struct lru_cache *lc, struct lc_element *e);
extern void lc_committed(struct lru_cache *lc);

struct seq_file;
extern void lc_seq_printf_stats(struct seq_file *seq, struct lru_cache *lc);

extern void lc_seq_dump_details(struct seq_file *seq, struct lru_cache *lc, char *utext,
				void (*detail) (struct seq_file *, struct lc_element *));
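
/*
 * Hypothetical debugfs/procfs usage sketch (demo_lc and the demo_* names are
 * assumptions, not part of this API): a seq_file "show" callback could dump
 * the statistics and then the per-element details, with a caller-supplied
 * callback printing the caller-private part of each element:
 *
 *	static void demo_detail(struct seq_file *seq, struct lc_element *e)
 *	{
 *		seq_printf(seq, "%5u %s\n", e->lc_index,
 *			   e->refcnt ? "in use" : "idle");
 *	}
 *
 *	static int demo_show(struct seq_file *seq, void *unused)
 *	{
 *		lc_seq_printf_stats(seq, demo_lc);
 *		lc_seq_dump_details(seq, demo_lc, "state", demo_detail);
 *		return 0;
 *	}
 */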

/**
 * lc_try_lock_for_transaction - can be used to stop lc_get() from changing the tracked set
 * @lc: the lru cache to operate on
 *
 * Allows (expects) the set to be "dirty".  Note that the reference counts and
 * order on the active and lru lists may still change.  Used to serialize
 * changing transactions.  Returns true if we acquired the lock.
 */
static inline int lc_try_lock_for_transaction(struct lru_cache *lc)
{
	return !test_and_set_bit(__LC_LOCKED, &lc->flags);
}

/**
 * lc_try_lock - variant to stop lc_get() from changing the tracked set
 * @lc: the lru cache to operate on
 *
 * Note that the reference counts and order on the active and lru lists may
 * still change.  Only works on a "clean" set.  Returns true if we acquired the
 * lock, which means there are no pending changes, and any further attempt to
 * change the set will not succeed until the next lc_unlock().
 */
extern int lc_try_lock(struct lru_cache *lc);

/**
 * lc_unlock - unlock @lc, allow lc_get() to change the set again
 * @lc: the lru cache to operate on
 */
static inline void lc_unlock(struct lru_cache *lc)
{
	clear_bit(__LC_DIRTY, &lc->flags);
	clear_bit_unlock(__LC_LOCKED, &lc->flags);
}
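
/*
 * Hypothetical transaction flow (a sketch only; the label variable "enr",
 * the demo_lc cache, write_transaction_for() and all error handling are
 * assumptions, and drivers/block/drbd remains the authoritative user):
 *
 *	struct lc_element *e = lc_get(demo_lc, enr);
 *
 *	if (e && e->lc_number != enr) {
 *		if (lc_try_lock_for_transaction(demo_lc)) {
 *			write_transaction_for(demo_lc->to_be_changed);
 *			lc_committed(demo_lc);
 *			lc_unlock(demo_lc);
 *		}
 *	}
 *	...
 *	lc_put(demo_lc, e);
 *
 * Here lc_get() either returns the element already labeled "enr", or an
 * element pending relabeling (then still carrying its old lc_number), or
 * NULL when no change is currently possible (e.g. the cache is starving or
 * locked).  write_transaction_for() stands in for whatever persistent
 * write-intent/activity-log update the caller performs before
 * lc_committed() finalizes the pending changes.
 */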

extern bool lc_is_used(struct lru_cache *lc, unsigned int enr);

#define lc_entry(ptr, type, member) \
	container_of(ptr, type, member)
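
/*
 * Given an object that embeds a struct lc_element (such as the hypothetical
 * struct demo_extent sketched above), the containing object is recovered
 * from a looked-up element with lc_entry():
 *
 *	struct lc_element *e = lc_find(demo_lc, enr);
 *	struct demo_extent *ext = e ? lc_entry(e, struct demo_extent, lce) : NULL;
 */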

extern struct lc_element *lc_element_by_index(struct lru_cache *lc, unsigned i);

#endif