// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2009-2011 Red Hat, Inc.
 *
 * Author: Mikulas Patocka <mpatocka@redhat.com>
 *
 * This file is released under the GPL.
 */

#include <linux/dm-bufio.h>

#include <linux/device-mapper.h>
#include <linux/dm-io.h>
#include <linux/slab.h>
#include <linux/sched/mm.h>
#include <linux/jiffies.h>
#include <linux/vmalloc.h>
#include <linux/shrinker.h>
#include <linux/module.h>
#include <linux/rbtree.h>
#include <linux/stacktrace.h>
#include <linux/jump_label.h>

#include "dm.h"

#define DM_MSG_PREFIX "bufio"

/*
 * Memory management policy:
 *	Limit the number of buffers to DM_BUFIO_MEMORY_PERCENT of main memory
 *	or DM_BUFIO_VMALLOC_PERCENT of vmalloc memory (whichever is lower).
 *	Always allocate at least DM_BUFIO_MIN_BUFFERS buffers.
 *	Start background writeback when there are DM_BUFIO_WRITEBACK_PERCENT
 *	dirty buffers.
 */
#define DM_BUFIO_MIN_BUFFERS		8

#define DM_BUFIO_MEMORY_PERCENT		2
#define DM_BUFIO_VMALLOC_PERCENT	25
#define DM_BUFIO_WRITEBACK_RATIO	3
#define DM_BUFIO_LOW_WATERMARK_RATIO	16

/*
 * Check buffer ages in this interval (seconds)
 */
#define DM_BUFIO_WORK_TIMER_SECS	30

/*
 * Free buffers when they are older than this (seconds)
 */
#define DM_BUFIO_DEFAULT_AGE_SECS	300

/*
 * The nr of bytes of cached data to keep around.
 */
#define DM_BUFIO_DEFAULT_RETAIN_BYTES   (256 * 1024)

/*
 * Align buffer writes to this boundary.
 * Tests show that SSDs have the highest IOPS when using 4k writes.
 */
#define DM_BUFIO_WRITE_ALIGN		4096

/*
 * dm_buffer->list_mode
 */
#define LIST_CLEAN	0
#define LIST_DIRTY	1
#define LIST_SIZE	2

/*--------------------------------------------------------------*/

/*
 * Rather than use an LRU list, we use a clock algorithm where entries
 * are held in a circular list.  When an entry is 'hit' a reference bit
 * is set.  The least recently used entry is approximated by running a
 * cursor around the list selecting unreferenced entries. Referenced
 * entries have their reference bit cleared as the cursor passes them.
 */
struct lru_entry {
	struct list_head list;
	atomic_t referenced;
};

struct lru_iter {
	struct lru *lru;
	struct list_head list;
	struct lru_entry *stop;
	struct lru_entry *e;
};

struct lru {
	struct list_head *cursor;
	unsigned long count;

	struct list_head iterators;
};

/*--------------*/

static void lru_init(struct lru *lru)
{
	lru->cursor = NULL;
	lru->count = 0;
	INIT_LIST_HEAD(&lru->iterators);
}

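/*
 * The lru must be empty (and have no active iterators) by the time it is
 * destroyed; this only sanity-checks that.
 */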
static void lru_destroy(struct lru *lru)
{
	WARN_ON_ONCE(lru->cursor);
	WARN_ON_ONCE(!list_empty(&lru->iterators));
}

/*
 * Insert a new entry into the lru.
 */
static void lru_insert(struct lru *lru, struct lru_entry *le)
{
	/*
	 * Don't be tempted to set to 1, makes the lru aspect
	 * perform poorly.
	 */
	atomic_set(&le->referenced, 0);

	if (lru->cursor) {
		list_add_tail(&le->list, lru->cursor);
	} else {
		INIT_LIST_HEAD(&le->list);
		lru->cursor = &le->list;
	}
	lru->count++;
}

/*--------------*/

/*
 * Convert a list_head pointer to an lru_entry pointer.
 */
static inline struct lru_entry *to_le(struct list_head *l)
{
	return container_of(l, struct lru_entry, list);
}

/*
 * Initialize an lru_iter and add it to the list of cursors in the lru.
 */
static void lru_iter_begin(struct lru *lru, struct lru_iter *it)
{
	it->lru = lru;
	it->stop = lru->cursor ? to_le(lru->cursor->prev) : NULL;
	it->e = lru->cursor ? to_le(lru->cursor) : NULL;
	list_add(&it->list, &lru->iterators);
}

/*
 * Remove an lru_iter from the list of cursors in the lru.
 */
static inline void lru_iter_end(struct lru_iter *it)
{
	list_del(&it->list);
}

/* Predicate function type to be used with lru_iter_next */
typedef bool (*iter_predicate)(struct lru_entry *le, void *context);

/*
 * Advance the cursor to the next entry that passes the
 * predicate, and return that entry.  Returns NULL if the
 * iteration is complete.
 */
static struct lru_entry *lru_iter_next(struct lru_iter *it,
				       iter_predicate pred, void *context)
{
	struct lru_entry *e;

	while (it->e) {
		e = it->e;

		/* advance the cursor */
		if (it->e == it->stop)
			it->e = NULL;
		else
			it->e = to_le(it->e->list.next);

		if (pred(e, context))
			return e;
	}

	return NULL;
}

/*
 * Invalidate a specific lru_entry and update all cursors in
 * the lru accordingly.
 */
static void lru_iter_invalidate(struct lru *lru, struct lru_entry *e)
{
	struct lru_iter *it;

	list_for_each_entry(it, &lru->iterators, list) {
		/* Move it->e forwards if necessary. */
		if (it->e == e) {
			it->e = to_le(it->e->list.next);
			if (it->e == e)
				it->e = NULL;
		}

		/* Move it->stop backwards if necessary. */
		if (it->stop == e) {
			it->stop = to_le(it->stop->list.prev);
			if (it->stop == e)
				it->stop = NULL;
		}
	}
}

/*--------------*/

/*
 * Remove a specific entry from the lru.
 */
static void lru_remove(struct lru *lru, struct lru_entry *le)
{
	lru_iter_invalidate(lru, le);
	if (lru->count == 1) {
		lru->cursor = NULL;
	} else {
		if (lru->cursor == &le->list)
			lru->cursor = lru->cursor->next;
		list_del(&le->list);
	}
	lru->count--;
}

/*
 * Mark as referenced.
 */
static inline void lru_reference(struct lru_entry *le)
{
	atomic_set(&le->referenced, 1);
}

/*--------------*/

/*
 * Remove the least recently used entry (approx), that passes the predicate.
 * Returns NULL on failure.
 */
enum evict_result {
	ER_EVICT,
	ER_DONT_EVICT,
	ER_STOP, /* stop looking for something to evict */
};

typedef enum evict_result (*le_predicate)(struct lru_entry *le, void *context);

static struct lru_entry *lru_evict(struct lru *lru, le_predicate pred, void *context, bool no_sleep)
{
	unsigned long tested = 0;
	struct list_head *h = lru->cursor;
	struct lru_entry *le;

	if (!h)
		return NULL;
	/*
	 * In the worst case we have to loop around twice. Once to clear
	 * the reference flags, and then again to discover the predicate
	 * fails for all entries.
	 */
	while (tested < lru->count) {
		le = container_of(h, struct lru_entry, list);

		if (atomic_read(&le->referenced)) {
			atomic_set(&le->referenced, 0);
		} else {
			tested++;
			switch (pred(le, context)) {
			case ER_EVICT:
				/*
				 * Adjust the cursor, so we start the next
				 * search from here.
				 */
				lru->cursor = le->list.next;
				lru_remove(lru, le);
				return le;

			case ER_DONT_EVICT:
				break;

			case ER_STOP:
				lru->cursor = le->list.next;
				return NULL;
			}
		}

		h = h->next;

		if (!no_sleep)
			cond_resched();
	}

	return NULL;
}

/*--------------------------------------------------------------*/

/*
 * Buffer state bits.
 */
#define B_READING	0
#define B_WRITING	1
#define B_DIRTY		2

/*
 * Describes how the block was allocated:
 * kmem_cache_alloc(), __get_free_pages() or vmalloc().
 * See the comment at alloc_buffer_data.
 */
enum data_mode {
	DATA_MODE_SLAB = 0,
	DATA_MODE_GET_FREE_PAGES = 1,
	DATA_MODE_VMALLOC = 2,
	DATA_MODE_LIMIT = 3
};

struct dm_buffer {
	/* protected by the locks in dm_buffer_cache */
	struct rb_node node;

	/* immutable, so don't need protecting */
	sector_t block;
	void *data;
	unsigned char data_mode;		/* DATA_MODE_* */

	/*
	 * These two fields are used in isolation, so do not need
	 * a surrounding lock.
	 */
	atomic_t hold_count;
	unsigned long last_accessed;

	/*
	 * Everything else is protected by the mutex in
	 * dm_bufio_client
	 */
	unsigned long state;
	struct lru_entry lru;
	unsigned char list_mode;		/* LIST_* */
	blk_status_t read_error;
	blk_status_t write_error;
	unsigned int dirty_start;
	unsigned int dirty_end;
	unsigned int write_start;
	unsigned int write_end;
	struct list_head write_list;
	struct dm_bufio_client *c;
	void (*end_io)(struct dm_buffer *b, blk_status_t bs);
#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
#define MAX_STACK 10
	unsigned int stack_len;
	unsigned long stack_entries[MAX_STACK];
#endif
};

/*--------------------------------------------------------------*/

/*
 * The buffer cache manages buffers, particularly:
 *  - inc/dec of holder count
 *  - setting the last_accessed field
 *  - maintains clean/dirty state along with lru
 *  - selecting buffers that match predicates
 *
 * It does *not* handle:
 *  - allocation/freeing of buffers.
 *  - IO
 *  - Eviction or cache sizing.
 *
 * cache_get() and cache_put() are threadsafe, you do not need to
 * protect these calls with a surrounding mutex.  All the other
 * methods are not threadsafe; they do use locking primitives, but
 * only enough to ensure get/put are threadsafe.
 */

struct buffer_tree {
	union {
		struct rw_semaphore lock;
		rwlock_t spinlock;
	} u;
	struct rb_root root;
} ____cacheline_aligned_in_smp;

struct dm_buffer_cache {
	struct lru lru[LIST_SIZE];
	/*
	 * We spread entries across multiple trees to reduce contention
	 * on the locks.
	 */
	unsigned int num_locks;
	bool no_sleep;
	struct buffer_tree trees[];
};

static DEFINE_STATIC_KEY_FALSE(no_sleep_enabled);

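/*
 * The cache is split across several rb trees, each with its own lock.
 * cache_index() hashes a block to the tree that holds it; the helpers
 * below take that tree's rw_semaphore, or its spinlock (with bottom
 * halves disabled) for no_sleep clients.
 */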
static inline unsigned int cache_index(sector_t block, unsigned int num_locks)
{
	return dm_hash_locks_index(block, num_locks);
}

static inline void cache_read_lock(struct dm_buffer_cache *bc, sector_t block)
{
	if (static_branch_unlikely(&no_sleep_enabled) && bc->no_sleep)
		read_lock_bh(&bc->trees[cache_index(block, bc->num_locks)].u.spinlock);
	else
		down_read(&bc->trees[cache_index(block, bc->num_locks)].u.lock);
}

static inline void cache_read_unlock(struct dm_buffer_cache *bc, sector_t block)
{
	if (static_branch_unlikely(&no_sleep_enabled) && bc->no_sleep)
		read_unlock_bh(&bc->trees[cache_index(block, bc->num_locks)].u.spinlock);
	else
		up_read(&bc->trees[cache_index(block, bc->num_locks)].u.lock);
}

static inline void cache_write_lock(struct dm_buffer_cache *bc, sector_t block)
{
	if (static_branch_unlikely(&no_sleep_enabled) && bc->no_sleep)
		write_lock_bh(&bc->trees[cache_index(block, bc->num_locks)].u.spinlock);
	else
		down_write(&bc->trees[cache_index(block, bc->num_locks)].u.lock);
}

static inline void cache_write_unlock(struct dm_buffer_cache *bc, sector_t block)
{
	if (static_branch_unlikely(&no_sleep_enabled) && bc->no_sleep)
		write_unlock_bh(&bc->trees[cache_index(block, bc->num_locks)].u.spinlock);
	else
		up_write(&bc->trees[cache_index(block, bc->num_locks)].u.lock);
}

/*
 * Sometimes we want to repeatedly get and drop locks as part of an iteration.
 * This struct helps avoid redundant drop and gets of the same lock.
 */
struct lock_history {
	struct dm_buffer_cache *cache;
	bool write;
	unsigned int previous;
	unsigned int no_previous;
};

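/*
 * 'no_previous' is both the number of locks and the marker used to
 * indicate that no lock is currently held.
 */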
static void lh_init(struct lock_history *lh, struct dm_buffer_cache *cache, bool write)
{
	lh->cache = cache;
	lh->write = write;
	lh->no_previous = cache->num_locks;
	lh->previous = lh->no_previous;
}

static void __lh_lock(struct lock_history *lh, unsigned int index)
{
	if (lh->write) {
		if (static_branch_unlikely(&no_sleep_enabled) && lh->cache->no_sleep)
			write_lock_bh(&lh->cache->trees[index].u.spinlock);
		else
			down_write(&lh->cache->trees[index].u.lock);
	} else {
		if (static_branch_unlikely(&no_sleep_enabled) && lh->cache->no_sleep)
			read_lock_bh(&lh->cache->trees[index].u.spinlock);
		else
			down_read(&lh->cache->trees[index].u.lock);
	}
}

static void __lh_unlock(struct lock_history *lh, unsigned int index)
{
	if (lh->write) {
		if (static_branch_unlikely(&no_sleep_enabled) && lh->cache->no_sleep)
			write_unlock_bh(&lh->cache->trees[index].u.spinlock);
		else
			up_write(&lh->cache->trees[index].u.lock);
	} else {
		if (static_branch_unlikely(&no_sleep_enabled) && lh->cache->no_sleep)
			read_unlock_bh(&lh->cache->trees[index].u.spinlock);
		else
			up_read(&lh->cache->trees[index].u.lock);
	}
}

/*
 * Make sure you call this since it will unlock the final lock.
 */
static void lh_exit(struct lock_history *lh)
{
	if (lh->previous != lh->no_previous) {
		__lh_unlock(lh, lh->previous);
		lh->previous = lh->no_previous;
	}
}

/*
 * Named 'next' because there is no corresponding
 * 'up/unlock' call since it's done automatically.
 */
static void lh_next(struct lock_history *lh, sector_t b)
{
	unsigned int index = cache_index(b, lh->no_previous); /* no_previous is num_locks */

	if (lh->previous != lh->no_previous) {
		if (lh->previous != index) {
			__lh_unlock(lh, lh->previous);
			__lh_lock(lh, index);
			lh->previous = index;
		}
	} else {
		__lh_lock(lh, index);
		lh->previous = index;
	}
}

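/*
 * Helpers to convert from the embedded lru_entry/list_head back to the
 * dm_buffer that contains it.
 */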
static inline struct dm_buffer *le_to_buffer(struct lru_entry *le)
{
	return container_of(le, struct dm_buffer, lru);
}

static struct dm_buffer *list_to_buffer(struct list_head *l)
{
	struct lru_entry *le = list_entry(l, struct lru_entry, list);

	if (!le)
		return NULL;

	return le_to_buffer(le);
}

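/* Set up the per-tree locks and rb roots, and the clean/dirty lrus. */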
static void cache_init(struct dm_buffer_cache *bc, unsigned int num_locks, bool no_sleep)
{
	unsigned int i;

	bc->num_locks = num_locks;
	bc->no_sleep = no_sleep;

	for (i = 0; i < bc->num_locks; i++) {
		if (no_sleep)
			rwlock_init(&bc->trees[i].u.spinlock);
		else
			init_rwsem(&bc->trees[i].u.lock);
		bc->trees[i].root = RB_ROOT;
	}

	lru_init(&bc->lru[LIST_CLEAN]);
	lru_init(&bc->lru[LIST_DIRTY]);
}

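/* The cache must be empty: every buffer should have been removed already. */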
static void cache_destroy(struct dm_buffer_cache *bc)
{
	unsigned int i;

	for (i = 0; i < bc->num_locks; i++)
		WARN_ON_ONCE(!RB_EMPTY_ROOT(&bc->trees[i].root));

	lru_destroy(&bc->lru[LIST_CLEAN]);
	lru_destroy(&bc->lru[LIST_DIRTY]);
}

/*--------------*/

/*
 * not threadsafe, or racy depending how you look at it
 */
static inline unsigned long cache_count(struct dm_buffer_cache *bc, int list_mode)
{
	return bc->lru[list_mode].count;
}

static inline unsigned long cache_total(struct dm_buffer_cache *bc)
{
	return cache_count(bc, LIST_CLEAN) + cache_count(bc, LIST_DIRTY);
}

/*--------------*/

/*
 * Gets a specific buffer, indexed by block.
 * If the buffer is found then its holder count will be incremented and
 * lru_reference will be called.
 *
 * threadsafe
 */
static struct dm_buffer *__cache_get(const struct rb_root *root, sector_t block)
{
	struct rb_node *n = root->rb_node;
	struct dm_buffer *b;

	while (n) {
		b = container_of(n, struct dm_buffer, node);

		if (b->block == block)
			return b;

		n = block < b->block ? n->rb_left : n->rb_right;
	}

	return NULL;
}

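/* Take a reference on the buffer and note when it was last accessed. */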
static void __cache_inc_buffer(struct dm_buffer *b)
{
	atomic_inc(&b->hold_count);
	WRITE_ONCE(b->last_accessed, jiffies);
}

static struct dm_buffer *cache_get(struct dm_buffer_cache *bc, sector_t block)
{
	struct dm_buffer *b;

	cache_read_lock(bc, block);
	b = __cache_get(&bc->trees[cache_index(block, bc->num_locks)].root, block);
	if (b) {
		lru_reference(&b->lru);
		__cache_inc_buffer(b);
	}
	cache_read_unlock(bc, block);

	return b;
}

/*--------------*/

/*
 * Returns true if the hold count hits zero.
 * threadsafe
 */
static bool cache_put(struct dm_buffer_cache *bc, struct dm_buffer *b)
{
	bool r;

	cache_read_lock(bc, b->block);
	BUG_ON(!atomic_read(&b->hold_count));
	r = atomic_dec_and_test(&b->hold_count);
	cache_read_unlock(bc, b->block);

	return r;
}

/*--------------*/

typedef enum evict_result (*b_predicate)(struct dm_buffer *, void *);

/*
 * Evicts a buffer based on a predicate.  The oldest buffer that
 * matches the predicate will be selected.  In addition to the
 * predicate the hold_count of the selected buffer will be zero.
 */
struct evict_wrapper {
	struct lock_history *lh;
	b_predicate pred;
	void *context;
};

/*
 * Wraps the buffer predicate turning it into an lru predicate.  Adds
 * extra test for hold_count.
 */
static enum evict_result __evict_pred(struct lru_entry *le, void *context)
{
	struct evict_wrapper *w = context;
	struct dm_buffer *b = le_to_buffer(le);

	lh_next(w->lh, b->block);

	if (atomic_read(&b->hold_count))
		return ER_DONT_EVICT;

	return w->pred(b, w->context);
}

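/*
 * Picks an unheld buffer from the given list using the caller's predicate,
 * unlinks it from both the rb tree and the lru, and returns it.  The buffer
 * itself is not freed.
 */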
static struct dm_buffer *__cache_evict(struct dm_buffer_cache *bc, int list_mode,
				       b_predicate pred, void *context,
				       struct lock_history *lh)
{
	struct evict_wrapper w = {.lh = lh, .pred = pred, .context = context};
	struct lru_entry *le;
	struct dm_buffer *b;

	le = lru_evict(&bc->lru[list_mode], __evict_pred, &w, bc->no_sleep);
	if (!le)
		return NULL;

	b = le_to_buffer(le);
	/* __evict_pred will have locked the appropriate tree. */
	rb_erase(&b->node, &bc->trees[cache_index(b->block, bc->num_locks)].root);

	return b;
}

static struct dm_buffer *cache_evict(struct dm_buffer_cache *bc, int list_mode,
				     b_predicate pred, void *context)
{
	struct dm_buffer *b;
	struct lock_history lh;

	lh_init(&lh, bc, true);
	b = __cache_evict(bc, list_mode, pred, context, &lh);
	lh_exit(&lh);

	return b;
}

/*--------------*/

/*
 * Mark a buffer as clean or dirty. Not threadsafe.
 */
static void cache_mark(struct dm_buffer_cache *bc, struct dm_buffer *b, int list_mode)
{
	cache_write_lock(bc, b->block);
	if (list_mode != b->list_mode) {
		lru_remove(&bc->lru[b->list_mode], &b->lru);
		b->list_mode = list_mode;
		lru_insert(&bc->lru[b->list_mode], &b->lru);
	}
	cache_write_unlock(bc, b->block);
}

/*--------------*/

/*
 * Runs through the lru associated with 'old_mode', if the predicate matches then
 * it moves them to 'new_mode'.  Not threadsafe.
 */
static void __cache_mark_many(struct dm_buffer_cache *bc, int old_mode, int new_mode,
			      b_predicate pred, void *context, struct lock_history *lh)
{
	struct lru_entry *le;
	struct dm_buffer *b;
	struct evict_wrapper w = {.lh = lh, .pred = pred, .context = context};

	while (true) {
		le = lru_evict(&bc->lru[old_mode], __evict_pred, &w, bc->no_sleep);
		if (!le)
			break;

		b = le_to_buffer(le);
		b->list_mode = new_mode;
		lru_insert(&bc->lru[b->list_mode], &b->lru);
	}
}

static void cache_mark_many(struct dm_buffer_cache *bc, int old_mode, int new_mode,
			    b_predicate pred, void *context)
{
	struct lock_history lh;

	lh_init(&lh, bc, true);
	__cache_mark_many(bc, old_mode, new_mode, pred, context, &lh);
	lh_exit(&lh);
}

/*--------------*/

/*
 * Iterates through all clean or dirty entries calling a function for each
 * entry.  The callback may terminate the iteration early.  Not threadsafe.
 */

/*
 * Iterator functions should return one of these actions to indicate
 * how the iteration should proceed.
 */
enum it_action {
	IT_NEXT,
	IT_COMPLETE,
};

typedef enum it_action (*iter_fn)(struct dm_buffer *b, void *context);

static void __cache_iterate(struct dm_buffer_cache *bc, int list_mode,
			    iter_fn fn, void *context, struct lock_history *lh)
{
	struct lru *lru = &bc->lru[list_mode];
	struct lru_entry *le, *first;

	if (!lru->cursor)
		return;

	first = le = to_le(lru->cursor);
	do {
		struct dm_buffer *b = le_to_buffer(le);

		lh_next(lh, b->block);

		switch (fn(b, context)) {
		case IT_NEXT:
			break;

		case IT_COMPLETE:
			return;
		}
		cond_resched();

		le = to_le(le->list.next);
	} while (le != first);
}

static void cache_iterate(struct dm_buffer_cache *bc, int list_mode,
			  iter_fn fn, void *context)
{
	struct lock_history lh;

	lh_init(&lh, bc, false);
	__cache_iterate(bc, list_mode, fn, context, &lh);
	lh_exit(&lh);
}

/*--------------*/

/*
 * Passes ownership of the buffer to the cache. Returns false if the
 * buffer was already present (in which case ownership does not pass).
 * eg, a race with another thread.
 *
 * Holder count should be 1 on insertion.
 *
 * Not threadsafe.
 */
static bool __cache_insert(struct rb_root *root, struct dm_buffer *b)
{
	struct rb_node **new = &root->rb_node, *parent = NULL;
	struct dm_buffer *found;

	while (*new) {
		found = container_of(*new, struct dm_buffer, node);

		if (found->block == b->block)
			return false;

		parent = *new;
		new = b->block < found->block ?
			&found->node.rb_left : &found->node.rb_right;
	}

	rb_link_node(&b->node, parent, new);
	rb_insert_color(&b->node, root);

	return true;
}

static bool cache_insert(struct dm_buffer_cache *bc, struct dm_buffer *b)
{
	bool r;

	if (WARN_ON_ONCE(b->list_mode >= LIST_SIZE))
		return false;

	cache_write_lock(bc, b->block);
	BUG_ON(atomic_read(&b->hold_count) != 1);
	r = __cache_insert(&bc->trees[cache_index(b->block, bc->num_locks)].root, b);
	if (r)
		lru_insert(&bc->lru[b->list_mode], &b->lru);
	cache_write_unlock(bc, b->block);

	return r;
}

/*--------------*/

/*
 * Removes buffer from cache, ownership of the buffer passes back to the caller.
 * Fails unless the hold_count is one (i.e. the caller holds the only reference).
 *
 * Not threadsafe.
 */
static bool cache_remove(struct dm_buffer_cache *bc, struct dm_buffer *b)
{
	bool r;

	cache_write_lock(bc, b->block);

	if (atomic_read(&b->hold_count) != 1) {
		r = false;
	} else {
		r = true;
		rb_erase(&b->node, &bc->trees[cache_index(b->block, bc->num_locks)].root);
		lru_remove(&bc->lru[b->list_mode], &b->lru);
	}

	cache_write_unlock(bc, b->block);

	return r;
}

/*--------------*/

typedef void (*b_release)(struct dm_buffer *);

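/*
 * Returns the buffer at 'block', or failing that the buffer with the
 * smallest block greater than it (NULL if there is none).
 */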
static struct dm_buffer *__find_next(struct rb_root *root, sector_t block)
{
	struct rb_node *n = root->rb_node;
	struct dm_buffer *b;
	struct dm_buffer *best = NULL;

	while (n) {
		b = container_of(n, struct dm_buffer, node);

		if (b->block == block)
			return b;

		if (block <= b->block) {
			n = n->rb_left;
			best = b;
		} else {
			n = n->rb_right;
		}
	}

	return best;
}

static void __remove_range(struct dm_buffer_cache *bc,
			   struct rb_root *root,
			   sector_t begin, sector_t end,
			   b_predicate pred, b_release release)
{
	struct dm_buffer *b;

	while (true) {
		cond_resched();

		b = __find_next(root, begin);
		if (!b || (b->block >= end))
			break;

		begin = b->block + 1;

		if (atomic_read(&b->hold_count))
			continue;

		if (pred(b, NULL) == ER_EVICT) {
			rb_erase(&b->node, root);
			lru_remove(&bc->lru[b->list_mode], &b->lru);
			release(b);
		}
	}
}

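/*
 * Removes and releases all unheld buffers in [begin, end) for which the
 * predicate returns ER_EVICT, walking every tree.  Only valid for clients
 * that are allowed to sleep.
 */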
static void cache_remove_range(struct dm_buffer_cache *bc,
			       sector_t begin, sector_t end,
			       b_predicate pred, b_release release)
{
	unsigned int i;

	BUG_ON(bc->no_sleep);
	for (i = 0; i < bc->num_locks; i++) {
		down_write(&bc->trees[i].u.lock);
		__remove_range(bc, &bc->trees[i].root, begin, end, pred, release);
		up_write(&bc->trees[i].u.lock);
	}
}

/*----------------------------------------------------------------*/

/*
 * Linking of buffers:
 *	All buffers are linked to buffer_cache with their node field.
 *
 *	Clean buffers that are not being written (B_WRITING not set)
 *	are linked to lru[LIST_CLEAN] with their lru_list field.
 *
 *	Dirty and clean buffers that are being written are linked to
 *	lru[LIST_DIRTY] with their lru_list field. When the write
 *	finishes, the buffer cannot be relinked immediately (because we
 *	are in an interrupt context and relinking requires process
 *	context), so some clean-not-writing buffers can be held on
 *	dirty_lru too.  They are later added to lru in the process
 *	context.
 */
struct dm_bufio_client {
	struct block_device *bdev;
	unsigned int block_size;
	s8 sectors_per_block_bits;

	bool no_sleep;
	struct mutex lock;
	spinlock_t spinlock;

	int async_write_error;

	void (*alloc_callback)(struct dm_buffer *buf);
	void (*write_callback)(struct dm_buffer *buf);
	struct kmem_cache *slab_buffer;
	struct kmem_cache *slab_cache;
	struct dm_io_client *dm_io;

	struct list_head reserved_buffers;
	unsigned int need_reserved_buffers;

	unsigned int minimum_buffers;

	sector_t start;

	struct shrinker shrinker;
	struct work_struct shrink_work;
	atomic_long_t need_shrink;

	wait_queue_head_t free_buffer_wait;

	struct list_head client_list;

	/*
	 * Used by global_cleanup to sort the clients list.
	 */
	unsigned long oldest_buffer;

	struct dm_buffer_cache cache; /* must be last member */
};

/*----------------------------------------------------------------*/

#define dm_bufio_in_request()	(!!current->bio_list)

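/*
 * Serialize access to the client.  no_sleep clients use a spinlock with
 * bottom halves disabled, everything else takes the mutex.
 */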
static void dm_bufio_lock(struct dm_bufio_client *c)
{
	if (static_branch_unlikely(&no_sleep_enabled) && c->no_sleep)
		spin_lock_bh(&c->spinlock);
	else
		mutex_lock_nested(&c->lock, dm_bufio_in_request());
}

static void dm_bufio_unlock(struct dm_bufio_client *c)
{
	if (static_branch_unlikely(&no_sleep_enabled) && c->no_sleep)
		spin_unlock_bh(&c->spinlock);
	else
		mutex_unlock(&c->lock);
}

/*----------------------------------------------------------------*/

/*
 * Default cache size: available memory divided by the ratio.
 */
static unsigned long dm_bufio_default_cache_size;

/*
 * Total cache size set by the user.
 */
static unsigned long dm_bufio_cache_size;

/*
 * A copy of dm_bufio_cache_size because dm_bufio_cache_size can change
 * at any time.  If it disagrees, the user has changed cache size.
 */
static unsigned long dm_bufio_cache_size_latch;

static DEFINE_SPINLOCK(global_spinlock);

/*
 * Buffers are freed after this timeout
 */
static unsigned int dm_bufio_max_age = DM_BUFIO_DEFAULT_AGE_SECS;
static unsigned long dm_bufio_retain_bytes = DM_BUFIO_DEFAULT_RETAIN_BYTES;

static unsigned long dm_bufio_peak_allocated;
static unsigned long dm_bufio_allocated_kmem_cache;
static unsigned long dm_bufio_allocated_get_free_pages;
static unsigned long dm_bufio_allocated_vmalloc;
static unsigned long dm_bufio_current_allocated;

/*----------------------------------------------------------------*/

/*
 * The current number of clients.
 */
static int dm_bufio_client_count;

/*
 * The list of all clients.
 */
static LIST_HEAD(dm_bufio_all_clients);

/*
 * This mutex protects dm_bufio_cache_size_latch and dm_bufio_client_count
 */
static DEFINE_MUTEX(dm_bufio_clients_lock);

static struct workqueue_struct *dm_bufio_wq;
static struct delayed_work dm_bufio_cleanup_old_work;
static struct work_struct dm_bufio_replacement_work;


#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
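/* Debug aid: record the current call stack in the buffer. */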
static void buffer_record_stack(struct dm_buffer *b)
{
	b->stack_len = stack_trace_save(b->stack_entries, MAX_STACK, 2);
}
#endif

/*----------------------------------------------------------------*/

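/*
 * Account a buffer's data in the global allocation counters and, when the
 * cache grows past dm_bufio_cache_size, kick the replacement worker.
 */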
adjust_total_allocated(struct dm_buffer * b,bool unlink)1103d0a328a3SMikulas Patocka static void adjust_total_allocated(struct dm_buffer *b, bool unlink)
110495d402f0SMikulas Patocka {
1105d0a328a3SMikulas Patocka 	unsigned char data_mode;
1106d0a328a3SMikulas Patocka 	long diff;
1107d0a328a3SMikulas Patocka 
110895d402f0SMikulas Patocka 	static unsigned long * const class_ptr[DATA_MODE_LIMIT] = {
110995d402f0SMikulas Patocka 		&dm_bufio_allocated_kmem_cache,
111095d402f0SMikulas Patocka 		&dm_bufio_allocated_get_free_pages,
111195d402f0SMikulas Patocka 		&dm_bufio_allocated_vmalloc,
111295d402f0SMikulas Patocka 	};
111395d402f0SMikulas Patocka 
1114d0a328a3SMikulas Patocka 	data_mode = b->data_mode;
1115d0a328a3SMikulas Patocka 	diff = (long)b->c->block_size;
1116d0a328a3SMikulas Patocka 	if (unlink)
1117d0a328a3SMikulas Patocka 		diff = -diff;
1118d0a328a3SMikulas Patocka 
1119af53badcSMikulas Patocka 	spin_lock(&global_spinlock);
112095d402f0SMikulas Patocka 
112195d402f0SMikulas Patocka 	*class_ptr[data_mode] += diff;
112295d402f0SMikulas Patocka 
112395d402f0SMikulas Patocka 	dm_bufio_current_allocated += diff;
112495d402f0SMikulas Patocka 
112595d402f0SMikulas Patocka 	if (dm_bufio_current_allocated > dm_bufio_peak_allocated)
112695d402f0SMikulas Patocka 		dm_bufio_peak_allocated = dm_bufio_current_allocated;
112795d402f0SMikulas Patocka 
1128af53badcSMikulas Patocka 	if (!unlink) {
11296e913b28SMikulas Patocka 		if (dm_bufio_current_allocated > dm_bufio_cache_size)
11306e913b28SMikulas Patocka 			queue_work(dm_bufio_wq, &dm_bufio_replacement_work);
1131af53badcSMikulas Patocka 	}
1132af53badcSMikulas Patocka 
1133af53badcSMikulas Patocka 	spin_unlock(&global_spinlock);
113495d402f0SMikulas Patocka }
113595d402f0SMikulas Patocka 
113695d402f0SMikulas Patocka /*
113795d402f0SMikulas Patocka  * Change the number of clients and recalculate per-client limit.
113895d402f0SMikulas Patocka  */
113995d402f0SMikulas Patocka static void __cache_size_refresh(void)
114095d402f0SMikulas Patocka {
1141b75a80f4SMike Snitzer 	if (WARN_ON(!mutex_is_locked(&dm_bufio_clients_lock)))
1142b75a80f4SMike Snitzer 		return;
1143b75a80f4SMike Snitzer 	if (WARN_ON(dm_bufio_client_count < 0))
1144b75a80f4SMike Snitzer 		return;
114595d402f0SMikulas Patocka 
11466aa7de05SMark Rutland 	dm_bufio_cache_size_latch = READ_ONCE(dm_bufio_cache_size);
114795d402f0SMikulas Patocka 
114895d402f0SMikulas Patocka 	/*
114995d402f0SMikulas Patocka 	 * Use default if set to 0 and report the actual cache size used.
115095d402f0SMikulas Patocka 	 */
115195d402f0SMikulas Patocka 	if (!dm_bufio_cache_size_latch) {
115295d402f0SMikulas Patocka 		(void)cmpxchg(&dm_bufio_cache_size, 0,
115395d402f0SMikulas Patocka 			      dm_bufio_default_cache_size);
115495d402f0SMikulas Patocka 		dm_bufio_cache_size_latch = dm_bufio_default_cache_size;
115595d402f0SMikulas Patocka 	}
115695d402f0SMikulas Patocka }
115795d402f0SMikulas Patocka 
115895d402f0SMikulas Patocka /*
115995d402f0SMikulas Patocka  * Allocating buffer data.
116095d402f0SMikulas Patocka  *
116195d402f0SMikulas Patocka  * Small buffers are allocated with kmem_cache, to use space optimally.
116295d402f0SMikulas Patocka  *
116395d402f0SMikulas Patocka  * For large buffers, we choose between get_free_pages and vmalloc.
116495d402f0SMikulas Patocka  * Each has advantages and disadvantages.
116595d402f0SMikulas Patocka  *
116695d402f0SMikulas Patocka  * __get_free_pages can randomly fail if the memory is fragmented.
116795d402f0SMikulas Patocka  * __vmalloc won't randomly fail, but vmalloc space is limited (it may be
116895d402f0SMikulas Patocka  * as low as 128M) so using it for caching is not appropriate.
116995d402f0SMikulas Patocka  *
117095d402f0SMikulas Patocka  * If the allocation may fail we use __get_free_pages. Memory fragmentation
117195d402f0SMikulas Patocka  * won't have a fatal effect here, but it just causes flushes of some other
117295d402f0SMikulas Patocka  * buffers and more I/O will be performed. Don't use __get_free_pages if it
117323baf831SKirill A. Shutemov  * would always fail (i.e. order > MAX_ORDER).
117495d402f0SMikulas Patocka  *
117595d402f0SMikulas Patocka  * If the allocation shouldn't fail we use __vmalloc. This is only for the
117695d402f0SMikulas Patocka  * initial reserve allocation, so there's no risk of wasting all vmalloc
117795d402f0SMikulas Patocka  * space.
117895d402f0SMikulas Patocka  */
117995d402f0SMikulas Patocka static void *alloc_buffer_data(struct dm_bufio_client *c, gfp_t gfp_mask,
118003b02939SMikulas Patocka 			       unsigned char *data_mode)
118195d402f0SMikulas Patocka {
118221bb1327SMikulas Patocka 	if (unlikely(c->slab_cache != NULL)) {
118395d402f0SMikulas Patocka 		*data_mode = DATA_MODE_SLAB;
118421bb1327SMikulas Patocka 		return kmem_cache_alloc(c->slab_cache, gfp_mask);
118595d402f0SMikulas Patocka 	}
118695d402f0SMikulas Patocka 
1187f51f2e0aSMikulas Patocka 	if (c->block_size <= KMALLOC_MAX_SIZE &&
118895d402f0SMikulas Patocka 	    gfp_mask & __GFP_NORETRY) {
118995d402f0SMikulas Patocka 		*data_mode = DATA_MODE_GET_FREE_PAGES;
119095d402f0SMikulas Patocka 		return (void *)__get_free_pages(gfp_mask,
1191f51f2e0aSMikulas Patocka 						c->sectors_per_block_bits - (PAGE_SHIFT - SECTOR_SHIFT));
119295d402f0SMikulas Patocka 	}
119395d402f0SMikulas Patocka 
119495d402f0SMikulas Patocka 	*data_mode = DATA_MODE_VMALLOC;
1195502624bdSMikulas Patocka 
119688dca4caSChristoph Hellwig 	return __vmalloc(c->block_size, gfp_mask);
1197590347e4SArnd Bergmann }
1198590347e4SArnd Bergmann 
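/*
 * Example (sketch, not taken verbatim from the code above): how the three
 * data modes fall out of alloc_buffer_data(), and the order passed to
 * __get_free_pages(), assuming 4 KiB pages.
 *
 *	c->slab_cache set                                 -> DATA_MODE_SLAB
 *	block_size <= KMALLOC_MAX_SIZE and __GFP_NORETRY  -> DATA_MODE_GET_FREE_PAGES
 *	otherwise (e.g. the reserve allocation)           -> DATA_MODE_VMALLOC
 *
 * For a 64 KiB block: sectors_per_block_bits = ilog2(65536 / 512) = 7 and
 * PAGE_SHIFT - SECTOR_SHIFT = 12 - 9 = 3, so the order is 7 - 3 = 4,
 * i.e. 2^4 = 16 pages = 64 KiB.
 */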
119995d402f0SMikulas Patocka /*
120095d402f0SMikulas Patocka  * Free buffer's data.
120195d402f0SMikulas Patocka  */
120295d402f0SMikulas Patocka static void free_buffer_data(struct dm_bufio_client *c,
120303b02939SMikulas Patocka 			     void *data, unsigned char data_mode)
120495d402f0SMikulas Patocka {
120595d402f0SMikulas Patocka 	switch (data_mode) {
120695d402f0SMikulas Patocka 	case DATA_MODE_SLAB:
120721bb1327SMikulas Patocka 		kmem_cache_free(c->slab_cache, data);
120895d402f0SMikulas Patocka 		break;
120995d402f0SMikulas Patocka 
121095d402f0SMikulas Patocka 	case DATA_MODE_GET_FREE_PAGES:
1211f51f2e0aSMikulas Patocka 		free_pages((unsigned long)data,
1212f51f2e0aSMikulas Patocka 			   c->sectors_per_block_bits - (PAGE_SHIFT - SECTOR_SHIFT));
121395d402f0SMikulas Patocka 		break;
121495d402f0SMikulas Patocka 
121595d402f0SMikulas Patocka 	case DATA_MODE_VMALLOC:
121695d402f0SMikulas Patocka 		vfree(data);
121795d402f0SMikulas Patocka 		break;
121895d402f0SMikulas Patocka 
121995d402f0SMikulas Patocka 	default:
122095d402f0SMikulas Patocka 		DMCRIT("dm_bufio_free_buffer_data: bad data mode: %d",
122195d402f0SMikulas Patocka 		       data_mode);
122295d402f0SMikulas Patocka 		BUG();
122395d402f0SMikulas Patocka 	}
122495d402f0SMikulas Patocka }
122595d402f0SMikulas Patocka 
122695d402f0SMikulas Patocka /*
122795d402f0SMikulas Patocka  * Allocate buffer and its data.
122895d402f0SMikulas Patocka  */
122995d402f0SMikulas Patocka static struct dm_buffer *alloc_buffer(struct dm_bufio_client *c, gfp_t gfp_mask)
123095d402f0SMikulas Patocka {
1231359dbf19SMikulas Patocka 	struct dm_buffer *b = kmem_cache_alloc(c->slab_buffer, gfp_mask);
123295d402f0SMikulas Patocka 
123395d402f0SMikulas Patocka 	if (!b)
123495d402f0SMikulas Patocka 		return NULL;
123595d402f0SMikulas Patocka 
123695d402f0SMikulas Patocka 	b->c = c;
123795d402f0SMikulas Patocka 
123895d402f0SMikulas Patocka 	b->data = alloc_buffer_data(c, gfp_mask, &b->data_mode);
123995d402f0SMikulas Patocka 	if (!b->data) {
1240359dbf19SMikulas Patocka 		kmem_cache_free(c->slab_buffer, b);
124195d402f0SMikulas Patocka 		return NULL;
124295d402f0SMikulas Patocka 	}
1243450e8deeSJoe Thornber 	adjust_total_allocated(b, false);
124495d402f0SMikulas Patocka 
124586bad0c7SMikulas Patocka #ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
1246741b58f3SThomas Gleixner 	b->stack_len = 0;
124786bad0c7SMikulas Patocka #endif
124895d402f0SMikulas Patocka 	return b;
124995d402f0SMikulas Patocka }
125095d402f0SMikulas Patocka 
125195d402f0SMikulas Patocka /*
125295d402f0SMikulas Patocka  * Free buffer and its data.
125395d402f0SMikulas Patocka  */
125495d402f0SMikulas Patocka static void free_buffer(struct dm_buffer *b)
125595d402f0SMikulas Patocka {
125695d402f0SMikulas Patocka 	struct dm_bufio_client *c = b->c;
125795d402f0SMikulas Patocka 
1258450e8deeSJoe Thornber 	adjust_total_allocated(b, true);
125995d402f0SMikulas Patocka 	free_buffer_data(c, b->data, b->data_mode);
1260359dbf19SMikulas Patocka 	kmem_cache_free(c->slab_buffer, b);
126195d402f0SMikulas Patocka }
126295d402f0SMikulas Patocka 
126395d402f0SMikulas Patocka /*
1264a4a82ce3SHeinz Mauelshagen  *--------------------------------------------------------------------------
126595d402f0SMikulas Patocka  * Submit I/O on the buffer.
126695d402f0SMikulas Patocka  *
126795d402f0SMikulas Patocka  * Bio interface is faster but it has some problems:
126895d402f0SMikulas Patocka  *	the vector list is limited (increasing this limit increases
126995d402f0SMikulas Patocka  *	memory consumption per buffer, so it is not viable);
127095d402f0SMikulas Patocka  *
127195d402f0SMikulas Patocka  *	the memory must be direct-mapped, not vmalloced;
127295d402f0SMikulas Patocka  *
127395d402f0SMikulas Patocka  * If the buffer is small enough (up to DM_BUFIO_INLINE_VECS pages) and
127495d402f0SMikulas Patocka  * it is not vmalloced, try using the bio interface.
127595d402f0SMikulas Patocka  *
127695d402f0SMikulas Patocka  * If the buffer is big, if it is vmalloced or if the underlying device
127795d402f0SMikulas Patocka  * rejects the bio because it is too large, use dm-io layer to do the I/O.
127895d402f0SMikulas Patocka  * The dm-io layer splits the I/O into multiple requests, avoiding the above
127995d402f0SMikulas Patocka  * shortcomings.
1280a4a82ce3SHeinz Mauelshagen  *--------------------------------------------------------------------------
1281a4a82ce3SHeinz Mauelshagen  */
128295d402f0SMikulas Patocka 
128395d402f0SMikulas Patocka /*
128495d402f0SMikulas Patocka  * dm-io completion routine. It just calls b->end_io, pretending
128595d402f0SMikulas Patocka  * that the request was handled directly with the bio interface.
128695d402f0SMikulas Patocka  */
128795d402f0SMikulas Patocka static void dmio_complete(unsigned long error, void *context)
128895d402f0SMikulas Patocka {
128995d402f0SMikulas Patocka 	struct dm_buffer *b = context;
129095d402f0SMikulas Patocka 
129145354f1eSMikulas Patocka 	b->end_io(b, unlikely(error != 0) ? BLK_STS_IOERR : 0);
129295d402f0SMikulas Patocka }
129395d402f0SMikulas Patocka 
1294a3282b43SBart Van Assche static void use_dmio(struct dm_buffer *b, enum req_op op, sector_t sector,
129586a3238cSHeinz Mauelshagen 		     unsigned int n_sectors, unsigned int offset)
129695d402f0SMikulas Patocka {
129795d402f0SMikulas Patocka 	int r;
129895d402f0SMikulas Patocka 	struct dm_io_request io_req = {
1299a3282b43SBart Van Assche 		.bi_opf = op,
130095d402f0SMikulas Patocka 		.notify.fn = dmio_complete,
130195d402f0SMikulas Patocka 		.notify.context = b,
130295d402f0SMikulas Patocka 		.client = b->c->dm_io,
130395d402f0SMikulas Patocka 	};
130495d402f0SMikulas Patocka 	struct dm_io_region region = {
130595d402f0SMikulas Patocka 		.bdev = b->c->bdev,
1306400a0befSMikulas Patocka 		.sector = sector,
1307400a0befSMikulas Patocka 		.count = n_sectors,
130895d402f0SMikulas Patocka 	};
130995d402f0SMikulas Patocka 
131095d402f0SMikulas Patocka 	if (b->data_mode != DATA_MODE_VMALLOC) {
131195d402f0SMikulas Patocka 		io_req.mem.type = DM_IO_KMEM;
13121e3b21c6SMikulas Patocka 		io_req.mem.ptr.addr = (char *)b->data + offset;
131395d402f0SMikulas Patocka 	} else {
131495d402f0SMikulas Patocka 		io_req.mem.type = DM_IO_VMA;
13151e3b21c6SMikulas Patocka 		io_req.mem.ptr.vma = (char *)b->data + offset;
131695d402f0SMikulas Patocka 	}
131795d402f0SMikulas Patocka 
1318*5cfcea64SHongyu Jin 	r = dm_io(&io_req, 1, &region, NULL, IOPRIO_DEFAULT);
131945354f1eSMikulas Patocka 	if (unlikely(r))
132045354f1eSMikulas Patocka 		b->end_io(b, errno_to_blk_status(r));
132195d402f0SMikulas Patocka }
132295d402f0SMikulas Patocka 
132345354f1eSMikulas Patocka static void bio_complete(struct bio *bio)
1324445559cdSDarrick J. Wong {
132545354f1eSMikulas Patocka 	struct dm_buffer *b = bio->bi_private;
13264e4cbee9SChristoph Hellwig 	blk_status_t status = bio->bi_status;
13270ef0b471SHeinz Mauelshagen 
1328066ff571SChristoph Hellwig 	bio_uninit(bio);
1329066ff571SChristoph Hellwig 	kfree(bio);
133045354f1eSMikulas Patocka 	b->end_io(b, status);
1331445559cdSDarrick J. Wong }
1332445559cdSDarrick J. Wong 
1333a3282b43SBart Van Assche static void use_bio(struct dm_buffer *b, enum req_op op, sector_t sector,
133486a3238cSHeinz Mauelshagen 		    unsigned int n_sectors, unsigned int offset)
133595d402f0SMikulas Patocka {
133645354f1eSMikulas Patocka 	struct bio *bio;
133795d402f0SMikulas Patocka 	char *ptr;
133856c5de44SMikulas Patocka 	unsigned int len;
133995d402f0SMikulas Patocka 
134056c5de44SMikulas Patocka 	bio = bio_kmalloc(1, GFP_NOWAIT | __GFP_NORETRY | __GFP_NOWARN);
134145354f1eSMikulas Patocka 	if (!bio) {
1342a3282b43SBart Van Assche 		use_dmio(b, op, sector, n_sectors, offset);
134345354f1eSMikulas Patocka 		return;
134445354f1eSMikulas Patocka 	}
134556c5de44SMikulas Patocka 	bio_init(bio, b->c->bdev, bio->bi_inline_vecs, 1, op);
134645354f1eSMikulas Patocka 	bio->bi_iter.bi_sector = sector;
134745354f1eSMikulas Patocka 	bio->bi_end_io = bio_complete;
134845354f1eSMikulas Patocka 	bio->bi_private = b;
134995d402f0SMikulas Patocka 
13501e3b21c6SMikulas Patocka 	ptr = (char *)b->data + offset;
1351400a0befSMikulas Patocka 	len = n_sectors << SECTOR_SHIFT;
135295d402f0SMikulas Patocka 
135356c5de44SMikulas Patocka 	__bio_add_page(bio, virt_to_page(ptr), len, offset_in_page(ptr));
135495d402f0SMikulas Patocka 
135545354f1eSMikulas Patocka 	submit_bio(bio);
135695d402f0SMikulas Patocka }
135795d402f0SMikulas Patocka 
13586fbeb004SMikulas Patocka static inline sector_t block_to_sector(struct dm_bufio_client *c, sector_t block)
13596fbeb004SMikulas Patocka {
13606fbeb004SMikulas Patocka 	sector_t sector;
13616fbeb004SMikulas Patocka 
13626fbeb004SMikulas Patocka 	if (likely(c->sectors_per_block_bits >= 0))
13636fbeb004SMikulas Patocka 		sector = block << c->sectors_per_block_bits;
13646fbeb004SMikulas Patocka 	else
13656fbeb004SMikulas Patocka 		sector = block * (c->block_size >> SECTOR_SHIFT);
13666fbeb004SMikulas Patocka 	sector += c->start;
13676fbeb004SMikulas Patocka 
13686fbeb004SMikulas Patocka 	return sector;
13696fbeb004SMikulas Patocka }
13706fbeb004SMikulas Patocka 
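/*
 * Example (sketch): with 4 KiB blocks, sectors_per_block_bits is 3, so
 * block 100 maps to sector (100 << 3) + c->start = 800 + c->start.  For a
 * hypothetical non-power-of-two block size such as 12 KiB,
 * sectors_per_block_bits is negative and the multiply path is used:
 * block 100 maps to 100 * (12288 >> SECTOR_SHIFT) + c->start = 2400 + c->start.
 */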
1371a3282b43SBart Van Assche static void submit_io(struct dm_buffer *b, enum req_op op,
1372a3282b43SBart Van Assche 		      void (*end_io)(struct dm_buffer *, blk_status_t))
137395d402f0SMikulas Patocka {
137486a3238cSHeinz Mauelshagen 	unsigned int n_sectors;
1375400a0befSMikulas Patocka 	sector_t sector;
137686a3238cSHeinz Mauelshagen 	unsigned int offset, end;
137795d402f0SMikulas Patocka 
137845354f1eSMikulas Patocka 	b->end_io = end_io;
137945354f1eSMikulas Patocka 
13806fbeb004SMikulas Patocka 	sector = block_to_sector(b->c, b->block);
13811e3b21c6SMikulas Patocka 
1382a3282b43SBart Van Assche 	if (op != REQ_OP_WRITE) {
1383f51f2e0aSMikulas Patocka 		n_sectors = b->c->block_size >> SECTOR_SHIFT;
13841e3b21c6SMikulas Patocka 		offset = 0;
13851e3b21c6SMikulas Patocka 	} else {
13861e3b21c6SMikulas Patocka 		if (b->c->write_callback)
13871e3b21c6SMikulas Patocka 			b->c->write_callback(b);
13881e3b21c6SMikulas Patocka 		offset = b->write_start;
13891e3b21c6SMikulas Patocka 		end = b->write_end;
13901e3b21c6SMikulas Patocka 		offset &= -DM_BUFIO_WRITE_ALIGN;
13911e3b21c6SMikulas Patocka 		end += DM_BUFIO_WRITE_ALIGN - 1;
13921e3b21c6SMikulas Patocka 		end &= -DM_BUFIO_WRITE_ALIGN;
13931e3b21c6SMikulas Patocka 		if (unlikely(end > b->c->block_size))
13941e3b21c6SMikulas Patocka 			end = b->c->block_size;
13951e3b21c6SMikulas Patocka 
13961e3b21c6SMikulas Patocka 		sector += offset >> SECTOR_SHIFT;
13971e3b21c6SMikulas Patocka 		n_sectors = (end - offset) >> SECTOR_SHIFT;
13981e3b21c6SMikulas Patocka 	}
1399400a0befSMikulas Patocka 
140045354f1eSMikulas Patocka 	if (b->data_mode != DATA_MODE_VMALLOC)
1401a3282b43SBart Van Assche 		use_bio(b, op, sector, n_sectors, offset);
140295d402f0SMikulas Patocka 	else
1403a3282b43SBart Van Assche 		use_dmio(b, op, sector, n_sectors, offset);
140495d402f0SMikulas Patocka }
140595d402f0SMikulas Patocka 
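/*
 * Example (sketch): with DM_BUFIO_WRITE_ALIGN = 4096 and a 16 KiB block
 * whose dirty range is [100, 5000), the write is widened to the aligned
 * range [0, 8192): offset = 100 & -4096 = 0, end = (5000 + 4095) & -4096 =
 * 8192, so the start sector is unchanged and n_sectors = 8192 >> SECTOR_SHIFT
 * = 16.  Reads always cover the whole block.
 */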
1406a4a82ce3SHeinz Mauelshagen /*
1407a4a82ce3SHeinz Mauelshagen  *--------------------------------------------------------------
140895d402f0SMikulas Patocka  * Writing dirty buffers
1409a4a82ce3SHeinz Mauelshagen  *--------------------------------------------------------------
1410a4a82ce3SHeinz Mauelshagen  */
141195d402f0SMikulas Patocka 
141295d402f0SMikulas Patocka /*
141395d402f0SMikulas Patocka  * The endio routine for write.
141495d402f0SMikulas Patocka  *
141595d402f0SMikulas Patocka  * Set the error, clear B_WRITING bit and wake anyone who was waiting on
141695d402f0SMikulas Patocka  * it.
141795d402f0SMikulas Patocka  */
141845354f1eSMikulas Patocka static void write_endio(struct dm_buffer *b, blk_status_t status)
141995d402f0SMikulas Patocka {
142045354f1eSMikulas Patocka 	b->write_error = status;
142145354f1eSMikulas Patocka 	if (unlikely(status)) {
142295d402f0SMikulas Patocka 		struct dm_bufio_client *c = b->c;
14234e4cbee9SChristoph Hellwig 
14244e4cbee9SChristoph Hellwig 		(void)cmpxchg(&c->async_write_error, 0,
142545354f1eSMikulas Patocka 				blk_status_to_errno(status));
142695d402f0SMikulas Patocka 	}
142795d402f0SMikulas Patocka 
142895d402f0SMikulas Patocka 	BUG_ON(!test_bit(B_WRITING, &b->state));
142995d402f0SMikulas Patocka 
14304e857c58SPeter Zijlstra 	smp_mb__before_atomic();
143195d402f0SMikulas Patocka 	clear_bit(B_WRITING, &b->state);
14324e857c58SPeter Zijlstra 	smp_mb__after_atomic();
143395d402f0SMikulas Patocka 
143495d402f0SMikulas Patocka 	wake_up_bit(&b->state, B_WRITING);
143595d402f0SMikulas Patocka }
143695d402f0SMikulas Patocka 
143795d402f0SMikulas Patocka /*
143895d402f0SMikulas Patocka  * Initiate a write on a dirty buffer, but don't wait for it.
143995d402f0SMikulas Patocka  *
144095d402f0SMikulas Patocka  * - If the buffer is not dirty, exit.
144195d402f0SMikulas Patocka  * - If there is some previous write going on, wait for it to finish (we can't
144295d402f0SMikulas Patocka  *   have two writes on the same buffer simultaneously).
144395d402f0SMikulas Patocka  * - Submit our write and don't wait on it. We set B_WRITING indicating
144495d402f0SMikulas Patocka  *   that there is a write in progress.
144595d402f0SMikulas Patocka  */
14462480945cSMikulas Patocka static void __write_dirty_buffer(struct dm_buffer *b,
14472480945cSMikulas Patocka 				 struct list_head *write_list)
144895d402f0SMikulas Patocka {
144995d402f0SMikulas Patocka 	if (!test_bit(B_DIRTY, &b->state))
145095d402f0SMikulas Patocka 		return;
145195d402f0SMikulas Patocka 
145295d402f0SMikulas Patocka 	clear_bit(B_DIRTY, &b->state);
145374316201SNeilBrown 	wait_on_bit_lock_io(&b->state, B_WRITING, TASK_UNINTERRUPTIBLE);
145495d402f0SMikulas Patocka 
14551e3b21c6SMikulas Patocka 	b->write_start = b->dirty_start;
14561e3b21c6SMikulas Patocka 	b->write_end = b->dirty_end;
14571e3b21c6SMikulas Patocka 
14582480945cSMikulas Patocka 	if (!write_list)
1459905be0a1SMikulas Patocka 		submit_io(b, REQ_OP_WRITE, write_endio);
14602480945cSMikulas Patocka 	else
14612480945cSMikulas Patocka 		list_add_tail(&b->write_list, write_list);
14622480945cSMikulas Patocka }
14632480945cSMikulas Patocka 
14642480945cSMikulas Patocka static void __flush_write_list(struct list_head *write_list)
14652480945cSMikulas Patocka {
14662480945cSMikulas Patocka 	struct blk_plug plug;
14670ef0b471SHeinz Mauelshagen 
14682480945cSMikulas Patocka 	blk_start_plug(&plug);
14692480945cSMikulas Patocka 	while (!list_empty(write_list)) {
14702480945cSMikulas Patocka 		struct dm_buffer *b =
14712480945cSMikulas Patocka 			list_entry(write_list->next, struct dm_buffer, write_list);
14722480945cSMikulas Patocka 		list_del(&b->write_list);
1473905be0a1SMikulas Patocka 		submit_io(b, REQ_OP_WRITE, write_endio);
14747cd32674SPeter Zijlstra 		cond_resched();
14752480945cSMikulas Patocka 	}
14762480945cSMikulas Patocka 	blk_finish_plug(&plug);
147795d402f0SMikulas Patocka }
147895d402f0SMikulas Patocka 
147995d402f0SMikulas Patocka /*
148095d402f0SMikulas Patocka  * Wait until any activity on the buffer finishes.  Possibly write the
148195d402f0SMikulas Patocka  * buffer if it is dirty.  When this function finishes, there is no I/O
148295d402f0SMikulas Patocka  * running on the buffer and the buffer is not dirty.
148395d402f0SMikulas Patocka  */
148495d402f0SMikulas Patocka static void __make_buffer_clean(struct dm_buffer *b)
148595d402f0SMikulas Patocka {
1486450e8deeSJoe Thornber 	BUG_ON(atomic_read(&b->hold_count));
148795d402f0SMikulas Patocka 
1488141b3523SMikulas Patocka 	/* smp_load_acquire() pairs with read_endio()'s smp_mb__before_atomic() */
1489141b3523SMikulas Patocka 	if (!smp_load_acquire(&b->state))	/* fast case */
149095d402f0SMikulas Patocka 		return;
149195d402f0SMikulas Patocka 
149274316201SNeilBrown 	wait_on_bit_io(&b->state, B_READING, TASK_UNINTERRUPTIBLE);
14932480945cSMikulas Patocka 	__write_dirty_buffer(b, NULL);
149474316201SNeilBrown 	wait_on_bit_io(&b->state, B_WRITING, TASK_UNINTERRUPTIBLE);
149595d402f0SMikulas Patocka }
149695d402f0SMikulas Patocka 
1497450e8deeSJoe Thornber static enum evict_result is_clean(struct dm_buffer *b, void *context)
1498450e8deeSJoe Thornber {
1499450e8deeSJoe Thornber 	struct dm_bufio_client *c = context;
1500450e8deeSJoe Thornber 
1501450e8deeSJoe Thornber 	/* These should never happen */
1502450e8deeSJoe Thornber 	if (WARN_ON_ONCE(test_bit(B_WRITING, &b->state)))
1503450e8deeSJoe Thornber 		return ER_DONT_EVICT;
1504450e8deeSJoe Thornber 	if (WARN_ON_ONCE(test_bit(B_DIRTY, &b->state)))
1505450e8deeSJoe Thornber 		return ER_DONT_EVICT;
1506450e8deeSJoe Thornber 	if (WARN_ON_ONCE(b->list_mode != LIST_CLEAN))
1507450e8deeSJoe Thornber 		return ER_DONT_EVICT;
1508450e8deeSJoe Thornber 
1509450e8deeSJoe Thornber 	if (static_branch_unlikely(&no_sleep_enabled) && c->no_sleep &&
1510450e8deeSJoe Thornber 	    unlikely(test_bit(B_READING, &b->state)))
1511450e8deeSJoe Thornber 		return ER_DONT_EVICT;
1512450e8deeSJoe Thornber 
1513450e8deeSJoe Thornber 	return ER_EVICT;
1514450e8deeSJoe Thornber }
1515450e8deeSJoe Thornber 
1516450e8deeSJoe Thornber static enum evict_result is_dirty(struct dm_buffer *b, void *context)
1517450e8deeSJoe Thornber {
1518450e8deeSJoe Thornber 	/* These should never happen */
1519450e8deeSJoe Thornber 	if (WARN_ON_ONCE(test_bit(B_READING, &b->state)))
1520450e8deeSJoe Thornber 		return ER_DONT_EVICT;
1521450e8deeSJoe Thornber 	if (WARN_ON_ONCE(b->list_mode != LIST_DIRTY))
1522450e8deeSJoe Thornber 		return ER_DONT_EVICT;
1523450e8deeSJoe Thornber 
1524450e8deeSJoe Thornber 	return ER_EVICT;
1525450e8deeSJoe Thornber }
1526450e8deeSJoe Thornber 
152795d402f0SMikulas Patocka /*
152895d402f0SMikulas Patocka  * Find some buffer that is not held by anybody, clean it, unlink it and
152995d402f0SMikulas Patocka  * return it.
153095d402f0SMikulas Patocka  */
153195d402f0SMikulas Patocka static struct dm_buffer *__get_unclaimed_buffer(struct dm_bufio_client *c)
153295d402f0SMikulas Patocka {
153395d402f0SMikulas Patocka 	struct dm_buffer *b;
153495d402f0SMikulas Patocka 
1535450e8deeSJoe Thornber 	b = cache_evict(&c->cache, LIST_CLEAN, is_clean, c);
1536450e8deeSJoe Thornber 	if (b) {
1537450e8deeSJoe Thornber 		/* this also waits for pending reads */
153895d402f0SMikulas Patocka 		__make_buffer_clean(b);
153995d402f0SMikulas Patocka 		return b;
154095d402f0SMikulas Patocka 	}
154195d402f0SMikulas Patocka 
1542e3a7c294SMikulas Patocka 	if (static_branch_unlikely(&no_sleep_enabled) && c->no_sleep)
1543e3a7c294SMikulas Patocka 		return NULL;
1544e3a7c294SMikulas Patocka 
1545450e8deeSJoe Thornber 	b = cache_evict(&c->cache, LIST_DIRTY, is_dirty, NULL);
1546450e8deeSJoe Thornber 	if (b) {
154795d402f0SMikulas Patocka 		__make_buffer_clean(b);
154895d402f0SMikulas Patocka 		return b;
154995d402f0SMikulas Patocka 	}
155095d402f0SMikulas Patocka 
155195d402f0SMikulas Patocka 	return NULL;
155295d402f0SMikulas Patocka }
155395d402f0SMikulas Patocka 
155495d402f0SMikulas Patocka /*
155595d402f0SMikulas Patocka  * Wait until some other threads free some buffer or release their hold count on
155695d402f0SMikulas Patocka  * some buffer.
155795d402f0SMikulas Patocka  *
155895d402f0SMikulas Patocka  * This function is entered with c->lock held, drops it and regains it
155995d402f0SMikulas Patocka  * before exiting.
156095d402f0SMikulas Patocka  */
156195d402f0SMikulas Patocka static void __wait_for_free_buffer(struct dm_bufio_client *c)
156295d402f0SMikulas Patocka {
156395d402f0SMikulas Patocka 	DECLARE_WAITQUEUE(wait, current);
156495d402f0SMikulas Patocka 
156595d402f0SMikulas Patocka 	add_wait_queue(&c->free_buffer_wait, &wait);
1566642fa448SDavidlohr Bueso 	set_current_state(TASK_UNINTERRUPTIBLE);
156795d402f0SMikulas Patocka 	dm_bufio_unlock(c);
156895d402f0SMikulas Patocka 
1569450e8deeSJoe Thornber 	/*
1570450e8deeSJoe Thornber 	 * It's possible to miss a wake up event since we don't always
1571450e8deeSJoe Thornber 	 * hold c->lock when wake_up is called.  So we have a timeout here,
1572450e8deeSJoe Thornber 	 * just in case.
1573450e8deeSJoe Thornber 	 */
1574450e8deeSJoe Thornber 	io_schedule_timeout(5 * HZ);
157595d402f0SMikulas Patocka 
157695d402f0SMikulas Patocka 	remove_wait_queue(&c->free_buffer_wait, &wait);
157795d402f0SMikulas Patocka 
157895d402f0SMikulas Patocka 	dm_bufio_lock(c);
157995d402f0SMikulas Patocka }
158095d402f0SMikulas Patocka 
1581a66cc28fSMikulas Patocka enum new_flag {
1582a66cc28fSMikulas Patocka 	NF_FRESH = 0,
1583a66cc28fSMikulas Patocka 	NF_READ = 1,
1584a66cc28fSMikulas Patocka 	NF_GET = 2,
1585a66cc28fSMikulas Patocka 	NF_PREFETCH = 3
1586a66cc28fSMikulas Patocka };
1587a66cc28fSMikulas Patocka 
158895d402f0SMikulas Patocka /*
158995d402f0SMikulas Patocka  * Allocate a new buffer. If the allocation is not possible, wait until
159095d402f0SMikulas Patocka  * some other thread frees a buffer.
159195d402f0SMikulas Patocka  *
159295d402f0SMikulas Patocka  * May drop the lock and regain it.
159395d402f0SMikulas Patocka  */
1594a66cc28fSMikulas Patocka static struct dm_buffer *__alloc_buffer_wait_no_callback(struct dm_bufio_client *c, enum new_flag nf)
159595d402f0SMikulas Patocka {
159695d402f0SMikulas Patocka 	struct dm_buffer *b;
159741c73a49SMikulas Patocka 	bool tried_noio_alloc = false;
159895d402f0SMikulas Patocka 
159995d402f0SMikulas Patocka 	/*
160095d402f0SMikulas Patocka 	 * dm-bufio is resistant to allocation failures (it just keeps
160195d402f0SMikulas Patocka 	 * one buffer reserved in case all the allocations fail).
160295d402f0SMikulas Patocka 	 * So set flags to not try too hard:
16039ea61cacSDouglas Anderson 	 *	GFP_NOWAIT: don't wait; if we need to sleep we'll release our
16049ea61cacSDouglas Anderson 	 *		    mutex and wait ourselves.
160595d402f0SMikulas Patocka 	 *	__GFP_NORETRY: don't retry and rather return failure
160695d402f0SMikulas Patocka 	 *	__GFP_NOMEMALLOC: don't use emergency reserves
160795d402f0SMikulas Patocka 	 *	__GFP_NOWARN: don't print a warning in case of failure
160895d402f0SMikulas Patocka 	 *
160995d402f0SMikulas Patocka 	 * For debugging, if we set the cache size to 1, no new buffers will
161095d402f0SMikulas Patocka 	 * be allocated.
161195d402f0SMikulas Patocka 	 */
161295d402f0SMikulas Patocka 	while (1) {
161395d402f0SMikulas Patocka 		if (dm_bufio_cache_size_latch != 1) {
16149ea61cacSDouglas Anderson 			b = alloc_buffer(c, GFP_NOWAIT | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
161595d402f0SMikulas Patocka 			if (b)
161695d402f0SMikulas Patocka 				return b;
161795d402f0SMikulas Patocka 		}
161895d402f0SMikulas Patocka 
1619a66cc28fSMikulas Patocka 		if (nf == NF_PREFETCH)
1620a66cc28fSMikulas Patocka 			return NULL;
1621a66cc28fSMikulas Patocka 
162241c73a49SMikulas Patocka 		if (dm_bufio_cache_size_latch != 1 && !tried_noio_alloc) {
162341c73a49SMikulas Patocka 			dm_bufio_unlock(c);
162441c73a49SMikulas Patocka 			b = alloc_buffer(c, GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
162541c73a49SMikulas Patocka 			dm_bufio_lock(c);
162641c73a49SMikulas Patocka 			if (b)
162741c73a49SMikulas Patocka 				return b;
162841c73a49SMikulas Patocka 			tried_noio_alloc = true;
162941c73a49SMikulas Patocka 		}
163041c73a49SMikulas Patocka 
163195d402f0SMikulas Patocka 		if (!list_empty(&c->reserved_buffers)) {
1632450e8deeSJoe Thornber 			b = list_to_buffer(c->reserved_buffers.next);
1633450e8deeSJoe Thornber 			list_del(&b->lru.list);
163495d402f0SMikulas Patocka 			c->need_reserved_buffers++;
163595d402f0SMikulas Patocka 
163695d402f0SMikulas Patocka 			return b;
163795d402f0SMikulas Patocka 		}
163895d402f0SMikulas Patocka 
163995d402f0SMikulas Patocka 		b = __get_unclaimed_buffer(c);
164095d402f0SMikulas Patocka 		if (b)
164195d402f0SMikulas Patocka 			return b;
164295d402f0SMikulas Patocka 
164395d402f0SMikulas Patocka 		__wait_for_free_buffer(c);
164495d402f0SMikulas Patocka 	}
164595d402f0SMikulas Patocka }
164695d402f0SMikulas Patocka 
1647a66cc28fSMikulas Patocka static struct dm_buffer *__alloc_buffer_wait(struct dm_bufio_client *c, enum new_flag nf)
164895d402f0SMikulas Patocka {
1649a66cc28fSMikulas Patocka 	struct dm_buffer *b = __alloc_buffer_wait_no_callback(c, nf);
1650a66cc28fSMikulas Patocka 
1651a66cc28fSMikulas Patocka 	if (!b)
1652a66cc28fSMikulas Patocka 		return NULL;
165395d402f0SMikulas Patocka 
165495d402f0SMikulas Patocka 	if (c->alloc_callback)
165595d402f0SMikulas Patocka 		c->alloc_callback(b);
165695d402f0SMikulas Patocka 
165795d402f0SMikulas Patocka 	return b;
165895d402f0SMikulas Patocka }
165995d402f0SMikulas Patocka 
166095d402f0SMikulas Patocka /*
166195d402f0SMikulas Patocka  * Free a buffer and wake other threads waiting for free buffers.
166295d402f0SMikulas Patocka  */
166395d402f0SMikulas Patocka static void __free_buffer_wake(struct dm_buffer *b)
166495d402f0SMikulas Patocka {
166595d402f0SMikulas Patocka 	struct dm_bufio_client *c = b->c;
166695d402f0SMikulas Patocka 
1667450e8deeSJoe Thornber 	b->block = -1;
166895d402f0SMikulas Patocka 	if (!c->need_reserved_buffers)
166995d402f0SMikulas Patocka 		free_buffer(b);
167095d402f0SMikulas Patocka 	else {
1671450e8deeSJoe Thornber 		list_add(&b->lru.list, &c->reserved_buffers);
167295d402f0SMikulas Patocka 		c->need_reserved_buffers--;
167395d402f0SMikulas Patocka 	}
167495d402f0SMikulas Patocka 
1675f5f93541SMikulas Patocka 	/*
1676f5f93541SMikulas Patocka 	 * We hold the bufio lock here, so no one can add entries to the
1677f5f93541SMikulas Patocka 	 * wait queue anyway.
1678f5f93541SMikulas Patocka 	 */
1679f5f93541SMikulas Patocka 	if (unlikely(waitqueue_active(&c->free_buffer_wait)))
168095d402f0SMikulas Patocka 		wake_up(&c->free_buffer_wait);
168195d402f0SMikulas Patocka }
168295d402f0SMikulas Patocka 
1683450e8deeSJoe Thornber static enum evict_result cleaned(struct dm_buffer *b, void *context)
1684450e8deeSJoe Thornber {
1685450e8deeSJoe Thornber 	if (WARN_ON_ONCE(test_bit(B_READING, &b->state)))
1686450e8deeSJoe Thornber 		return ER_DONT_EVICT; /* should never happen */
1687450e8deeSJoe Thornber 
1688450e8deeSJoe Thornber 	if (test_bit(B_DIRTY, &b->state) || test_bit(B_WRITING, &b->state))
1689450e8deeSJoe Thornber 		return ER_DONT_EVICT;
1690450e8deeSJoe Thornber 	else
1691450e8deeSJoe Thornber 		return ER_EVICT;
1692450e8deeSJoe Thornber }
1693450e8deeSJoe Thornber 
1694450e8deeSJoe Thornber static void __move_clean_buffers(struct dm_bufio_client *c)
1695450e8deeSJoe Thornber {
1696450e8deeSJoe Thornber 	cache_mark_many(&c->cache, LIST_DIRTY, LIST_CLEAN, cleaned, NULL);
1697450e8deeSJoe Thornber }
1698450e8deeSJoe Thornber 
1699450e8deeSJoe Thornber struct write_context {
1700450e8deeSJoe Thornber 	int no_wait;
1701450e8deeSJoe Thornber 	struct list_head *write_list;
1702450e8deeSJoe Thornber };
1703450e8deeSJoe Thornber 
1704450e8deeSJoe Thornber static enum it_action write_one(struct dm_buffer *b, void *context)
1705450e8deeSJoe Thornber {
1706450e8deeSJoe Thornber 	struct write_context *wc = context;
1707450e8deeSJoe Thornber 
1708450e8deeSJoe Thornber 	if (wc->no_wait && test_bit(B_WRITING, &b->state))
1709450e8deeSJoe Thornber 		return IT_COMPLETE;
1710450e8deeSJoe Thornber 
1711450e8deeSJoe Thornber 	__write_dirty_buffer(b, wc->write_list);
1712450e8deeSJoe Thornber 	return IT_NEXT;
1713450e8deeSJoe Thornber }
1714450e8deeSJoe Thornber 
17152480945cSMikulas Patocka static void __write_dirty_buffers_async(struct dm_bufio_client *c, int no_wait,
17162480945cSMikulas Patocka 					struct list_head *write_list)
171795d402f0SMikulas Patocka {
1718450e8deeSJoe Thornber 	struct write_context wc = {.no_wait = no_wait, .write_list = write_list};
171995d402f0SMikulas Patocka 
1720450e8deeSJoe Thornber 	__move_clean_buffers(c);
1721450e8deeSJoe Thornber 	cache_iterate(&c->cache, LIST_DIRTY, write_one, &wc);
172295d402f0SMikulas Patocka }
172395d402f0SMikulas Patocka 
172495d402f0SMikulas Patocka /*
172595d402f0SMikulas Patocka  * Check if we're over the watermark.
172695d402f0SMikulas Patocka  * If the number of dirty buffers exceeds DM_BUFIO_WRITEBACK_RATIO times the
172795d402f0SMikulas Patocka  * number of clean buffers, start writing the dirty ones back asynchronously.
172895d402f0SMikulas Patocka  */
17292480945cSMikulas Patocka static void __check_watermark(struct dm_bufio_client *c,
17302480945cSMikulas Patocka 			      struct list_head *write_list)
173195d402f0SMikulas Patocka {
1732450e8deeSJoe Thornber 	if (cache_count(&c->cache, LIST_DIRTY) >
1733450e8deeSJoe Thornber 	    cache_count(&c->cache, LIST_CLEAN) * DM_BUFIO_WRITEBACK_RATIO)
17342480945cSMikulas Patocka 		__write_dirty_buffers_async(c, 1, write_list);
173595d402f0SMikulas Patocka }
173695d402f0SMikulas Patocka 
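/*
 * Example (sketch): with DM_BUFIO_WRITEBACK_RATIO = 3, a client holding
 * 100 clean and 301 dirty buffers is over the watermark and asynchronous
 * writeback of the dirty buffers is started; with 300 dirty buffers it is
 * not.
 */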
1737a4a82ce3SHeinz Mauelshagen /*
1738a4a82ce3SHeinz Mauelshagen  *--------------------------------------------------------------
173995d402f0SMikulas Patocka  * Getting a buffer
1740a4a82ce3SHeinz Mauelshagen  *--------------------------------------------------------------
1741a4a82ce3SHeinz Mauelshagen  */
174295d402f0SMikulas Patocka 
1743450e8deeSJoe Thornber static void cache_put_and_wake(struct dm_bufio_client *c, struct dm_buffer *b)
1744450e8deeSJoe Thornber {
1745450e8deeSJoe Thornber 	/*
1746450e8deeSJoe Thornber 	 * Relying on waitqueue_active() is racy, but we sleep
1747450e8deeSJoe Thornber 	 * with schedule_timeout anyway.
1748450e8deeSJoe Thornber 	 */
1749450e8deeSJoe Thornber 	if (cache_put(&c->cache, b) &&
1750450e8deeSJoe Thornber 	    unlikely(waitqueue_active(&c->free_buffer_wait)))
1751450e8deeSJoe Thornber 		wake_up(&c->free_buffer_wait);
1752450e8deeSJoe Thornber }
1753450e8deeSJoe Thornber 
1754450e8deeSJoe Thornber /*
1755450e8deeSJoe Thornber  * This assumes you have already checked the cache to see if the buffer
1756450e8deeSJoe Thornber  * is already present (it will recheck after dropping the lock for allocation).
1757450e8deeSJoe Thornber  */
175895d402f0SMikulas Patocka static struct dm_buffer *__bufio_new(struct dm_bufio_client *c, sector_t block,
17592480945cSMikulas Patocka 				     enum new_flag nf, int *need_submit,
17602480945cSMikulas Patocka 				     struct list_head *write_list)
176195d402f0SMikulas Patocka {
176295d402f0SMikulas Patocka 	struct dm_buffer *b, *new_b = NULL;
176395d402f0SMikulas Patocka 
176495d402f0SMikulas Patocka 	*need_submit = 0;
176595d402f0SMikulas Patocka 
1766450e8deeSJoe Thornber 	/* This can't be called with NF_GET */
1767450e8deeSJoe Thornber 	if (WARN_ON_ONCE(nf == NF_GET))
176895d402f0SMikulas Patocka 		return NULL;
176995d402f0SMikulas Patocka 
1770a66cc28fSMikulas Patocka 	new_b = __alloc_buffer_wait(c, nf);
1771a66cc28fSMikulas Patocka 	if (!new_b)
1772a66cc28fSMikulas Patocka 		return NULL;
177395d402f0SMikulas Patocka 
177495d402f0SMikulas Patocka 	/*
177595d402f0SMikulas Patocka 	 * We've had a period where the mutex was unlocked, so we need to
1776ef992373SShenghui Wang 	 * recheck the buffer tree.
177795d402f0SMikulas Patocka 	 */
1778450e8deeSJoe Thornber 	b = cache_get(&c->cache, block);
177995d402f0SMikulas Patocka 	if (b) {
178095d402f0SMikulas Patocka 		__free_buffer_wake(new_b);
1781a66cc28fSMikulas Patocka 		goto found_buffer;
178295d402f0SMikulas Patocka 	}
178395d402f0SMikulas Patocka 
17842480945cSMikulas Patocka 	__check_watermark(c, write_list);
178595d402f0SMikulas Patocka 
178695d402f0SMikulas Patocka 	b = new_b;
1787450e8deeSJoe Thornber 	atomic_set(&b->hold_count, 1);
1788450e8deeSJoe Thornber 	WRITE_ONCE(b->last_accessed, jiffies);
1789450e8deeSJoe Thornber 	b->block = block;
179095d402f0SMikulas Patocka 	b->read_error = 0;
179195d402f0SMikulas Patocka 	b->write_error = 0;
1792450e8deeSJoe Thornber 	b->list_mode = LIST_CLEAN;
179395d402f0SMikulas Patocka 
1794450e8deeSJoe Thornber 	if (nf == NF_FRESH)
179595d402f0SMikulas Patocka 		b->state = 0;
1796450e8deeSJoe Thornber 	else {
179795d402f0SMikulas Patocka 		b->state = 1 << B_READING;
179895d402f0SMikulas Patocka 		*need_submit = 1;
1799450e8deeSJoe Thornber 	}
1800450e8deeSJoe Thornber 
1801450e8deeSJoe Thornber 	/*
1802450e8deeSJoe Thornber 	 * We mustn't insert into the cache until the B_READING state
1803450e8deeSJoe Thornber 	 * is set.  Otherwise another thread could get it and use
1804450e8deeSJoe Thornber 	 * it before it had been read.
1805450e8deeSJoe Thornber 	 */
1806450e8deeSJoe Thornber 	cache_insert(&c->cache, b);
180795d402f0SMikulas Patocka 
180895d402f0SMikulas Patocka 	return b;
1809a66cc28fSMikulas Patocka 
1810a66cc28fSMikulas Patocka found_buffer:
1811450e8deeSJoe Thornber 	if (nf == NF_PREFETCH) {
1812450e8deeSJoe Thornber 		cache_put_and_wake(c, b);
1813a66cc28fSMikulas Patocka 		return NULL;
1814450e8deeSJoe Thornber 	}
1815450e8deeSJoe Thornber 
1816a66cc28fSMikulas Patocka 	/*
1817a66cc28fSMikulas Patocka 	 * Note: it is essential that we don't wait for the buffer to be
1818a66cc28fSMikulas Patocka 	 * read if dm_bufio_get function is used. Both dm_bufio_get and
1819a66cc28fSMikulas Patocka 	 * dm_bufio_prefetch can be used in the driver request routine.
1820a66cc28fSMikulas Patocka 	 * If the user called both dm_bufio_prefetch and dm_bufio_get on
1821a66cc28fSMikulas Patocka 	 * the same buffer, it would deadlock if we waited.
1822a66cc28fSMikulas Patocka 	 */
1823450e8deeSJoe Thornber 	if (nf == NF_GET && unlikely(test_bit_acquire(B_READING, &b->state))) {
1824450e8deeSJoe Thornber 		cache_put_and_wake(c, b);
1825a66cc28fSMikulas Patocka 		return NULL;
1826450e8deeSJoe Thornber 	}
1827a66cc28fSMikulas Patocka 
1828a66cc28fSMikulas Patocka 	return b;
182995d402f0SMikulas Patocka }
183095d402f0SMikulas Patocka 
183195d402f0SMikulas Patocka /*
183295d402f0SMikulas Patocka  * The endio routine for reading: set the error, clear the bit and wake up
183395d402f0SMikulas Patocka  * anyone waiting on the buffer.
183495d402f0SMikulas Patocka  */
183545354f1eSMikulas Patocka static void read_endio(struct dm_buffer *b, blk_status_t status)
183695d402f0SMikulas Patocka {
183745354f1eSMikulas Patocka 	b->read_error = status;
183895d402f0SMikulas Patocka 
183995d402f0SMikulas Patocka 	BUG_ON(!test_bit(B_READING, &b->state));
184095d402f0SMikulas Patocka 
18414e857c58SPeter Zijlstra 	smp_mb__before_atomic();
184295d402f0SMikulas Patocka 	clear_bit(B_READING, &b->state);
18434e857c58SPeter Zijlstra 	smp_mb__after_atomic();
184495d402f0SMikulas Patocka 
184595d402f0SMikulas Patocka 	wake_up_bit(&b->state, B_READING);
184695d402f0SMikulas Patocka }
184795d402f0SMikulas Patocka 
184895d402f0SMikulas Patocka /*
184995d402f0SMikulas Patocka  * A common routine for dm_bufio_new and dm_bufio_read.  Operation of these
185095d402f0SMikulas Patocka  * functions is similar except that dm_bufio_new doesn't read the
185195d402f0SMikulas Patocka  * buffer from the disk (assuming that the caller overwrites all the data
185295d402f0SMikulas Patocka  * and uses dm_bufio_mark_buffer_dirty to write new data back).
185395d402f0SMikulas Patocka  */
185495d402f0SMikulas Patocka static void *new_read(struct dm_bufio_client *c, sector_t block,
185595d402f0SMikulas Patocka 		      enum new_flag nf, struct dm_buffer **bp)
185695d402f0SMikulas Patocka {
1857450e8deeSJoe Thornber 	int need_submit = 0;
185895d402f0SMikulas Patocka 	struct dm_buffer *b;
185995d402f0SMikulas Patocka 
18602480945cSMikulas Patocka 	LIST_HEAD(write_list);
18612480945cSMikulas Patocka 
1862450e8deeSJoe Thornber 	*bp = NULL;
1863450e8deeSJoe Thornber 
1864450e8deeSJoe Thornber 	/*
1865450e8deeSJoe Thornber 	 * Fast path, hopefully the block is already in the cache.  No need
1866450e8deeSJoe Thornber 	 * to get the client lock for this.
1867450e8deeSJoe Thornber 	 */
1868450e8deeSJoe Thornber 	b = cache_get(&c->cache, block);
1869450e8deeSJoe Thornber 	if (b) {
1870450e8deeSJoe Thornber 		if (nf == NF_PREFETCH) {
1871450e8deeSJoe Thornber 			cache_put_and_wake(c, b);
1872450e8deeSJoe Thornber 			return NULL;
1873450e8deeSJoe Thornber 		}
1874450e8deeSJoe Thornber 
1875450e8deeSJoe Thornber 		/*
1876450e8deeSJoe Thornber 		 * Note: it is essential that we don't wait for the buffer to be
1877450e8deeSJoe Thornber 		 * read if dm_bufio_get function is used. Both dm_bufio_get and
1878450e8deeSJoe Thornber 		 * dm_bufio_prefetch can be used in the driver request routine.
1879450e8deeSJoe Thornber 		 * If the user called both dm_bufio_prefetch and dm_bufio_get on
1880450e8deeSJoe Thornber 		 * the same buffer, it would deadlock if we waited.
1881450e8deeSJoe Thornber 		 */
1882450e8deeSJoe Thornber 		if (nf == NF_GET && unlikely(test_bit_acquire(B_READING, &b->state))) {
1883450e8deeSJoe Thornber 			cache_put_and_wake(c, b);
1884450e8deeSJoe Thornber 			return NULL;
1885450e8deeSJoe Thornber 		}
1886450e8deeSJoe Thornber 	}
1887450e8deeSJoe Thornber 
1888450e8deeSJoe Thornber 	if (!b) {
1889450e8deeSJoe Thornber 		if (nf == NF_GET)
1890450e8deeSJoe Thornber 			return NULL;
1891450e8deeSJoe Thornber 
189295d402f0SMikulas Patocka 		dm_bufio_lock(c);
18932480945cSMikulas Patocka 		b = __bufio_new(c, block, nf, &need_submit, &write_list);
1894450e8deeSJoe Thornber 		dm_bufio_unlock(c);
1895450e8deeSJoe Thornber 	}
1896450e8deeSJoe Thornber 
189786bad0c7SMikulas Patocka #ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
1898450e8deeSJoe Thornber 	if (b && (atomic_read(&b->hold_count) == 1))
189986bad0c7SMikulas Patocka 		buffer_record_stack(b);
190086bad0c7SMikulas Patocka #endif
190195d402f0SMikulas Patocka 
19022480945cSMikulas Patocka 	__flush_write_list(&write_list);
19032480945cSMikulas Patocka 
1904a66cc28fSMikulas Patocka 	if (!b)
1905f98c8f79SMikulas Patocka 		return NULL;
190695d402f0SMikulas Patocka 
190795d402f0SMikulas Patocka 	if (need_submit)
1908905be0a1SMikulas Patocka 		submit_io(b, REQ_OP_READ, read_endio);
190995d402f0SMikulas Patocka 
19105be21d65SMikulas Patocka 	if (nf != NF_GET)	/* we already tested this condition above */
191174316201SNeilBrown 		wait_on_bit_io(&b->state, B_READING, TASK_UNINTERRUPTIBLE);
191295d402f0SMikulas Patocka 
191395d402f0SMikulas Patocka 	if (b->read_error) {
19144e4cbee9SChristoph Hellwig 		int error = blk_status_to_errno(b->read_error);
191595d402f0SMikulas Patocka 
191695d402f0SMikulas Patocka 		dm_bufio_release(b);
191795d402f0SMikulas Patocka 
191895d402f0SMikulas Patocka 		return ERR_PTR(error);
191995d402f0SMikulas Patocka 	}
192095d402f0SMikulas Patocka 
192195d402f0SMikulas Patocka 	*bp = b;
192295d402f0SMikulas Patocka 
192395d402f0SMikulas Patocka 	return b->data;
192495d402f0SMikulas Patocka }
192595d402f0SMikulas Patocka 
192695d402f0SMikulas Patocka void *dm_bufio_get(struct dm_bufio_client *c, sector_t block,
192795d402f0SMikulas Patocka 		   struct dm_buffer **bp)
192895d402f0SMikulas Patocka {
192995d402f0SMikulas Patocka 	return new_read(c, block, NF_GET, bp);
193095d402f0SMikulas Patocka }
193195d402f0SMikulas Patocka EXPORT_SYMBOL_GPL(dm_bufio_get);
193295d402f0SMikulas Patocka 
193395d402f0SMikulas Patocka void *dm_bufio_read(struct dm_bufio_client *c, sector_t block,
193495d402f0SMikulas Patocka 		    struct dm_buffer **bp)
193595d402f0SMikulas Patocka {
193605112287SMike Snitzer 	if (WARN_ON_ONCE(dm_bufio_in_request()))
193705112287SMike Snitzer 		return ERR_PTR(-EINVAL);
193895d402f0SMikulas Patocka 
193995d402f0SMikulas Patocka 	return new_read(c, block, NF_READ, bp);
194095d402f0SMikulas Patocka }
194195d402f0SMikulas Patocka EXPORT_SYMBOL_GPL(dm_bufio_read);
194295d402f0SMikulas Patocka 
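/*
 * Example (sketch): a typical read by a hypothetical dm-bufio user.  The
 * returned pointer is either the block contents or an ERR_PTR; on error
 * the buffer has already been released.
 *
 *	struct dm_buffer *b;
 *	void *data;
 *
 *	data = dm_bufio_read(c, block, &b);
 *	if (IS_ERR(data))
 *		return PTR_ERR(data);
 *	... use the block contents ...
 *	dm_bufio_release(b);
 */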
194395d402f0SMikulas Patocka void *dm_bufio_new(struct dm_bufio_client *c, sector_t block,
194495d402f0SMikulas Patocka 		   struct dm_buffer **bp)
194595d402f0SMikulas Patocka {
194605112287SMike Snitzer 	if (WARN_ON_ONCE(dm_bufio_in_request()))
194705112287SMike Snitzer 		return ERR_PTR(-EINVAL);
194895d402f0SMikulas Patocka 
194995d402f0SMikulas Patocka 	return new_read(c, block, NF_FRESH, bp);
195095d402f0SMikulas Patocka }
195195d402f0SMikulas Patocka EXPORT_SYMBOL_GPL(dm_bufio_new);
195295d402f0SMikulas Patocka 
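/*
 * Example (sketch): overwriting a whole block.  dm_bufio_new() skips the
 * read from disk, so the caller must fill the entire block before marking
 * it dirty.
 *
 *	struct dm_buffer *b;
 *	void *data;
 *
 *	data = dm_bufio_new(c, block, &b);
 *	if (IS_ERR(data))
 *		return PTR_ERR(data);
 *	memset(data, 0, dm_bufio_get_block_size(c));
 *	dm_bufio_mark_buffer_dirty(b);
 *	dm_bufio_release(b);
 */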
1953a66cc28fSMikulas Patocka void dm_bufio_prefetch(struct dm_bufio_client *c,
195486a3238cSHeinz Mauelshagen 		       sector_t block, unsigned int n_blocks)
1955a66cc28fSMikulas Patocka {
1956a66cc28fSMikulas Patocka 	struct blk_plug plug;
1957a66cc28fSMikulas Patocka 
19582480945cSMikulas Patocka 	LIST_HEAD(write_list);
19592480945cSMikulas Patocka 
196005112287SMike Snitzer 	if (WARN_ON_ONCE(dm_bufio_in_request()))
196105112287SMike Snitzer 		return; /* should never happen */
19623b6b7813SMikulas Patocka 
1963a66cc28fSMikulas Patocka 	blk_start_plug(&plug);
1964a66cc28fSMikulas Patocka 
1965a66cc28fSMikulas Patocka 	for (; n_blocks--; block++) {
1966a66cc28fSMikulas Patocka 		int need_submit;
1967a66cc28fSMikulas Patocka 		struct dm_buffer *b;
19680ef0b471SHeinz Mauelshagen 
1969450e8deeSJoe Thornber 		b = cache_get(&c->cache, block);
1970450e8deeSJoe Thornber 		if (b) {
1971450e8deeSJoe Thornber 			/* already in cache */
1972450e8deeSJoe Thornber 			cache_put_and_wake(c, b);
1973450e8deeSJoe Thornber 			continue;
1974450e8deeSJoe Thornber 		}
1975450e8deeSJoe Thornber 
1976450e8deeSJoe Thornber 		dm_bufio_lock(c);
19772480945cSMikulas Patocka 		b = __bufio_new(c, block, NF_PREFETCH, &need_submit,
19782480945cSMikulas Patocka 				&write_list);
19792480945cSMikulas Patocka 		if (unlikely(!list_empty(&write_list))) {
19802480945cSMikulas Patocka 			dm_bufio_unlock(c);
19812480945cSMikulas Patocka 			blk_finish_plug(&plug);
19822480945cSMikulas Patocka 			__flush_write_list(&write_list);
19832480945cSMikulas Patocka 			blk_start_plug(&plug);
19842480945cSMikulas Patocka 			dm_bufio_lock(c);
19852480945cSMikulas Patocka 		}
1986a66cc28fSMikulas Patocka 		if (unlikely(b != NULL)) {
1987a66cc28fSMikulas Patocka 			dm_bufio_unlock(c);
1988a66cc28fSMikulas Patocka 
1989a66cc28fSMikulas Patocka 			if (need_submit)
1990905be0a1SMikulas Patocka 				submit_io(b, REQ_OP_READ, read_endio);
1991a66cc28fSMikulas Patocka 			dm_bufio_release(b);
1992a66cc28fSMikulas Patocka 
19937cd32674SPeter Zijlstra 			cond_resched();
1994a66cc28fSMikulas Patocka 
1995a66cc28fSMikulas Patocka 			if (!n_blocks)
1996a66cc28fSMikulas Patocka 				goto flush_plug;
1997a66cc28fSMikulas Patocka 			dm_bufio_lock(c);
1998a66cc28fSMikulas Patocka 		}
1999a66cc28fSMikulas Patocka 		dm_bufio_unlock(c);
2000450e8deeSJoe Thornber 	}
2001a66cc28fSMikulas Patocka 
2002a66cc28fSMikulas Patocka flush_plug:
2003a66cc28fSMikulas Patocka 	blk_finish_plug(&plug);
2004a66cc28fSMikulas Patocka }
2005a66cc28fSMikulas Patocka EXPORT_SYMBOL_GPL(dm_bufio_prefetch);
2006a66cc28fSMikulas Patocka 
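/*
 * Example (sketch): prefetch a range from a request routine and pick the
 * blocks up later with dm_bufio_get(), which never waits for the read.
 * dm_bufio_get() returns NULL if the block is not cached or is still being
 * read; a caller that may sleep can then fall back to dm_bufio_read().
 *
 *	dm_bufio_prefetch(c, first_block, n_blocks);
 *	...
 *	data = dm_bufio_get(c, first_block, &b);
 *	if (!IS_ERR_OR_NULL(data)) {
 *		... use the cached contents ...
 *		dm_bufio_release(b);
 *	}
 */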
200795d402f0SMikulas Patocka void dm_bufio_release(struct dm_buffer *b)
200895d402f0SMikulas Patocka {
200995d402f0SMikulas Patocka 	struct dm_bufio_client *c = b->c;
201095d402f0SMikulas Patocka 
201195d402f0SMikulas Patocka 	/*
201295d402f0SMikulas Patocka 	 * If there were errors on the buffer, and the buffer is not
201395d402f0SMikulas Patocka 	 * to be written, free the buffer. There is no point in caching
201495d402f0SMikulas Patocka 	 * invalid buffer.
201595d402f0SMikulas Patocka 	 * an invalid buffer.
201695d402f0SMikulas Patocka 	if ((b->read_error || b->write_error) &&
2017141b3523SMikulas Patocka 	    !test_bit_acquire(B_READING, &b->state) &&
201895d402f0SMikulas Patocka 	    !test_bit(B_WRITING, &b->state) &&
201995d402f0SMikulas Patocka 	    !test_bit(B_DIRTY, &b->state)) {
2020450e8deeSJoe Thornber 		dm_bufio_lock(c);
2021450e8deeSJoe Thornber 
2022450e8deeSJoe Thornber 		/* cache remove can fail if there are other holders */
2023450e8deeSJoe Thornber 		if (cache_remove(&c->cache, b)) {
202495d402f0SMikulas Patocka 			__free_buffer_wake(b);
2025450e8deeSJoe Thornber 			dm_bufio_unlock(c);
2026450e8deeSJoe Thornber 			return;
202795d402f0SMikulas Patocka 		}
202895d402f0SMikulas Patocka 
202995d402f0SMikulas Patocka 		dm_bufio_unlock(c);
203095d402f0SMikulas Patocka 	}
2031450e8deeSJoe Thornber 
2032450e8deeSJoe Thornber 	cache_put_and_wake(c, b);
2033450e8deeSJoe Thornber }
203495d402f0SMikulas Patocka EXPORT_SYMBOL_GPL(dm_bufio_release);
203595d402f0SMikulas Patocka 
20361e3b21c6SMikulas Patocka void dm_bufio_mark_partial_buffer_dirty(struct dm_buffer *b,
203786a3238cSHeinz Mauelshagen 					unsigned int start, unsigned int end)
203895d402f0SMikulas Patocka {
203995d402f0SMikulas Patocka 	struct dm_bufio_client *c = b->c;
204095d402f0SMikulas Patocka 
20411e3b21c6SMikulas Patocka 	BUG_ON(start >= end);
20421e3b21c6SMikulas Patocka 	BUG_ON(end > b->c->block_size);
20431e3b21c6SMikulas Patocka 
204495d402f0SMikulas Patocka 	dm_bufio_lock(c);
204595d402f0SMikulas Patocka 
2046a66cc28fSMikulas Patocka 	BUG_ON(test_bit(B_READING, &b->state));
2047a66cc28fSMikulas Patocka 
20481e3b21c6SMikulas Patocka 	if (!test_and_set_bit(B_DIRTY, &b->state)) {
20491e3b21c6SMikulas Patocka 		b->dirty_start = start;
20501e3b21c6SMikulas Patocka 		b->dirty_end = end;
2051450e8deeSJoe Thornber 		cache_mark(&c->cache, b, LIST_DIRTY);
20521e3b21c6SMikulas Patocka 	} else {
20531e3b21c6SMikulas Patocka 		if (start < b->dirty_start)
20541e3b21c6SMikulas Patocka 			b->dirty_start = start;
20551e3b21c6SMikulas Patocka 		if (end > b->dirty_end)
20561e3b21c6SMikulas Patocka 			b->dirty_end = end;
20571e3b21c6SMikulas Patocka 	}
205895d402f0SMikulas Patocka 
205995d402f0SMikulas Patocka 	dm_bufio_unlock(c);
206095d402f0SMikulas Patocka }
20611e3b21c6SMikulas Patocka EXPORT_SYMBOL_GPL(dm_bufio_mark_partial_buffer_dirty);
20621e3b21c6SMikulas Patocka 
20631e3b21c6SMikulas Patocka void dm_bufio_mark_buffer_dirty(struct dm_buffer *b)
20641e3b21c6SMikulas Patocka {
20651e3b21c6SMikulas Patocka 	dm_bufio_mark_partial_buffer_dirty(b, 0, b->c->block_size);
20661e3b21c6SMikulas Patocka }
206795d402f0SMikulas Patocka EXPORT_SYMBOL_GPL(dm_bufio_mark_buffer_dirty);
206895d402f0SMikulas Patocka 
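/*
 * Example (sketch): two partial updates to the same buffer merge into a
 * single dirty byte range, here [512, 4096), instead of dirtying the whole
 * block; the range is widened to DM_BUFIO_WRITE_ALIGN boundaries when the
 * write is submitted.
 *
 *	memcpy((char *)data + 512, src1, 512);
 *	dm_bufio_mark_partial_buffer_dirty(b, 512, 1024);
 *	memcpy((char *)data + 2048, src2, 2048);
 *	dm_bufio_mark_partial_buffer_dirty(b, 2048, 4096);
 *	dm_bufio_release(b);
 */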
206995d402f0SMikulas Patocka void dm_bufio_write_dirty_buffers_async(struct dm_bufio_client *c)
207095d402f0SMikulas Patocka {
20712480945cSMikulas Patocka 	LIST_HEAD(write_list);
20722480945cSMikulas Patocka 
207305112287SMike Snitzer 	if (WARN_ON_ONCE(dm_bufio_in_request()))
207405112287SMike Snitzer 		return; /* should never happen */
207595d402f0SMikulas Patocka 
207695d402f0SMikulas Patocka 	dm_bufio_lock(c);
20772480945cSMikulas Patocka 	__write_dirty_buffers_async(c, 0, &write_list);
207895d402f0SMikulas Patocka 	dm_bufio_unlock(c);
20792480945cSMikulas Patocka 	__flush_write_list(&write_list);
208095d402f0SMikulas Patocka }
208195d402f0SMikulas Patocka EXPORT_SYMBOL_GPL(dm_bufio_write_dirty_buffers_async);
208295d402f0SMikulas Patocka 
208395d402f0SMikulas Patocka /*
208495d402f0SMikulas Patocka  * For performance, it is essential that the buffers are written asynchronously
208595d402f0SMikulas Patocka  * and simultaneously (so that the block layer can merge the writes) and then
208695d402f0SMikulas Patocka  * waited upon.
208795d402f0SMikulas Patocka  *
208895d402f0SMikulas Patocka  * Finally, we flush hardware disk cache.
208995d402f0SMikulas Patocka  */
2090450e8deeSJoe Thornber static bool is_writing(struct lru_entry *e, void *context)
2091450e8deeSJoe Thornber {
2092450e8deeSJoe Thornber 	struct dm_buffer *b = le_to_buffer(e);
2093450e8deeSJoe Thornber 
2094450e8deeSJoe Thornber 	return test_bit(B_WRITING, &b->state);
2095450e8deeSJoe Thornber }
2096450e8deeSJoe Thornber 
209795d402f0SMikulas Patocka int dm_bufio_write_dirty_buffers(struct dm_bufio_client *c)
209895d402f0SMikulas Patocka {
2099edc11d49SDan Carpenter 	int a, f;
2100450e8deeSJoe Thornber 	unsigned long nr_buffers;
2101450e8deeSJoe Thornber 	struct lru_entry *e;
2102450e8deeSJoe Thornber 	struct lru_iter it;
210395d402f0SMikulas Patocka 
21042480945cSMikulas Patocka 	LIST_HEAD(write_list);
21052480945cSMikulas Patocka 
210695d402f0SMikulas Patocka 	dm_bufio_lock(c);
21072480945cSMikulas Patocka 	__write_dirty_buffers_async(c, 0, &write_list);
21082480945cSMikulas Patocka 	dm_bufio_unlock(c);
21092480945cSMikulas Patocka 	__flush_write_list(&write_list);
21102480945cSMikulas Patocka 	dm_bufio_lock(c);
211195d402f0SMikulas Patocka 
2112450e8deeSJoe Thornber 	nr_buffers = cache_count(&c->cache, LIST_DIRTY);
2113450e8deeSJoe Thornber 	lru_iter_begin(&c->cache.lru[LIST_DIRTY], &it);
2114450e8deeSJoe Thornber 	while ((e = lru_iter_next(&it, is_writing, c))) {
2115450e8deeSJoe Thornber 		struct dm_buffer *b = le_to_buffer(e);
2116450e8deeSJoe Thornber 		__cache_inc_buffer(b);
211795d402f0SMikulas Patocka 
211895d402f0SMikulas Patocka 		BUG_ON(test_bit(B_READING, &b->state));
211995d402f0SMikulas Patocka 
2120450e8deeSJoe Thornber 		if (nr_buffers) {
2121450e8deeSJoe Thornber 			nr_buffers--;
212295d402f0SMikulas Patocka 			dm_bufio_unlock(c);
2123450e8deeSJoe Thornber 			wait_on_bit_io(&b->state, B_WRITING, TASK_UNINTERRUPTIBLE);
212495d402f0SMikulas Patocka 			dm_bufio_lock(c);
2125450e8deeSJoe Thornber 		} else {
2126450e8deeSJoe Thornber 			wait_on_bit_io(&b->state, B_WRITING, TASK_UNINTERRUPTIBLE);
212795d402f0SMikulas Patocka 		}
212895d402f0SMikulas Patocka 
2129450e8deeSJoe Thornber 		if (!test_bit(B_DIRTY, &b->state) && !test_bit(B_WRITING, &b->state))
2130450e8deeSJoe Thornber 			cache_mark(&c->cache, b, LIST_CLEAN);
2131450e8deeSJoe Thornber 
2132450e8deeSJoe Thornber 		cache_put_and_wake(c, b);
213395d402f0SMikulas Patocka 
21347cd32674SPeter Zijlstra 		cond_resched();
213595d402f0SMikulas Patocka 	}
2136450e8deeSJoe Thornber 	lru_iter_end(&it);
2137450e8deeSJoe Thornber 
213895d402f0SMikulas Patocka 	wake_up(&c->free_buffer_wait);
213995d402f0SMikulas Patocka 	dm_bufio_unlock(c);
214095d402f0SMikulas Patocka 
214195d402f0SMikulas Patocka 	a = xchg(&c->async_write_error, 0);
214295d402f0SMikulas Patocka 	f = dm_bufio_issue_flush(c);
214395d402f0SMikulas Patocka 	if (a)
214495d402f0SMikulas Patocka 		return a;
214595d402f0SMikulas Patocka 
214695d402f0SMikulas Patocka 	return f;
214795d402f0SMikulas Patocka }
214895d402f0SMikulas Patocka EXPORT_SYMBOL_GPL(dm_bufio_write_dirty_buffers);
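/*
 * Illustrative caller-side sketch of the pattern described above (this is
 * not a caller in this file; the client "c" and block 0 are assumptions).
 * Dirty a cached block, then let dm_bufio_write_dirty_buffers() submit all
 * dirty buffers in one batch, wait for them and issue the flush:
 *
 *	struct dm_buffer *b;
 *	void *data = dm_bufio_read(c, 0, &b);
 *
 *	if (!IS_ERR(data)) {
 *		memset(data, 0, dm_bufio_get_block_size(c));
 *		dm_bufio_mark_buffer_dirty(b);
 *		dm_bufio_release(b);
 *	}
 *	if (dm_bufio_write_dirty_buffers(c))
 *		DMERR("metadata commit failed");
 */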
214995d402f0SMikulas Patocka 
215095d402f0SMikulas Patocka /*
2151ef992373SShenghui Wang  * Use dm-io to send an empty barrier to flush the device.
215295d402f0SMikulas Patocka  */
215395d402f0SMikulas Patocka int dm_bufio_issue_flush(struct dm_bufio_client *c)
215495d402f0SMikulas Patocka {
215595d402f0SMikulas Patocka 	struct dm_io_request io_req = {
2156581075e4SBart Van Assche 		.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC,
215795d402f0SMikulas Patocka 		.mem.type = DM_IO_KMEM,
215895d402f0SMikulas Patocka 		.mem.ptr.addr = NULL,
215995d402f0SMikulas Patocka 		.client = c->dm_io,
216095d402f0SMikulas Patocka 	};
216195d402f0SMikulas Patocka 	struct dm_io_region io_reg = {
216295d402f0SMikulas Patocka 		.bdev = c->bdev,
216395d402f0SMikulas Patocka 		.sector = 0,
216495d402f0SMikulas Patocka 		.count = 0,
216595d402f0SMikulas Patocka 	};
216695d402f0SMikulas Patocka 
216705112287SMike Snitzer 	if (WARN_ON_ONCE(dm_bufio_in_request()))
216805112287SMike Snitzer 		return -EINVAL;
216995d402f0SMikulas Patocka 
2170*5cfcea64SHongyu Jin 	return dm_io(&io_req, 1, &io_reg, NULL, IOPRIO_DEFAULT);
217195d402f0SMikulas Patocka }
217295d402f0SMikulas Patocka EXPORT_SYMBOL_GPL(dm_bufio_issue_flush);
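/*
 * Hedged usage sketch (not a caller in this file; "c" and "bio" are assumed
 * caller-side variables).  A target's worker thread, not the bio map path
 * that would trip the dm_bufio_in_request() check above, can force the
 * device's volatile cache to stable storage before completing a flush bio:
 *
 *	if (bio->bi_opf & REQ_PREFLUSH) {
 *		int r = dm_bufio_issue_flush(c);
 *
 *		if (r)
 *			return r;
 *	}
 */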
217395d402f0SMikulas Patocka 
217495d402f0SMikulas Patocka /*
21756fbeb004SMikulas Patocka  * Use dm-io to send a discard request to the device.
21766fbeb004SMikulas Patocka  */
21776fbeb004SMikulas Patocka int dm_bufio_issue_discard(struct dm_bufio_client *c, sector_t block, sector_t count)
21786fbeb004SMikulas Patocka {
21796fbeb004SMikulas Patocka 	struct dm_io_request io_req = {
2180581075e4SBart Van Assche 		.bi_opf = REQ_OP_DISCARD | REQ_SYNC,
21816fbeb004SMikulas Patocka 		.mem.type = DM_IO_KMEM,
21826fbeb004SMikulas Patocka 		.mem.ptr.addr = NULL,
21836fbeb004SMikulas Patocka 		.client = c->dm_io,
21846fbeb004SMikulas Patocka 	};
21856fbeb004SMikulas Patocka 	struct dm_io_region io_reg = {
21866fbeb004SMikulas Patocka 		.bdev = c->bdev,
21876fbeb004SMikulas Patocka 		.sector = block_to_sector(c, block),
21886fbeb004SMikulas Patocka 		.count = block_to_sector(c, count),
21896fbeb004SMikulas Patocka 	};
21906fbeb004SMikulas Patocka 
219105112287SMike Snitzer 	if (WARN_ON_ONCE(dm_bufio_in_request()))
219205112287SMike Snitzer 		return -EINVAL; /* discards are optional */
21936fbeb004SMikulas Patocka 
2194*5cfcea64SHongyu Jin 	return dm_io(&io_req, 1, &io_reg, NULL, IOPRIO_DEFAULT);
21956fbeb004SMikulas Patocka }
21966fbeb004SMikulas Patocka EXPORT_SYMBOL_GPL(dm_bufio_issue_discard);
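/*
 * Illustrative sketch ("c", "block" and "count" are assumed caller-side
 * variables describing a range the caller no longer needs).  Discards are
 * advisory, so the return value is often ignored, but cached copies should
 * also be dropped so stale data is not served from the cache later:
 *
 *	(void) dm_bufio_issue_discard(c, block, count);
 *	dm_bufio_forget_buffers(c, block, count);
 */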
21976fbeb004SMikulas Patocka 
2198450e8deeSJoe Thornber static bool forget_buffer(struct dm_bufio_client *c, sector_t block)
219933a18062SMikulas Patocka {
2200450e8deeSJoe Thornber 	struct dm_buffer *b;
2201450e8deeSJoe Thornber 
2202450e8deeSJoe Thornber 	b = cache_get(&c->cache, block);
2203450e8deeSJoe Thornber 	if (b) {
2204450e8deeSJoe Thornber 		if (likely(!smp_load_acquire(&b->state))) {
2205450e8deeSJoe Thornber 			if (cache_remove(&c->cache, b))
220633a18062SMikulas Patocka 				__free_buffer_wake(b);
2207450e8deeSJoe Thornber 			else
2208450e8deeSJoe Thornber 				cache_put_and_wake(c, b);
2209450e8deeSJoe Thornber 		} else {
2210450e8deeSJoe Thornber 			cache_put_and_wake(c, b);
221133a18062SMikulas Patocka 		}
221233a18062SMikulas Patocka 	}
221333a18062SMikulas Patocka 
2214450e8deeSJoe Thornber 	return b ? true : false;
2215450e8deeSJoe Thornber }
2216450e8deeSJoe Thornber 
221755494bf2SMikulas Patocka /*
221855494bf2SMikulas Patocka  * Free the given buffer.
221955494bf2SMikulas Patocka  *
222055494bf2SMikulas Patocka  * This is just a hint; if the buffer is in use or dirty, this function
222155494bf2SMikulas Patocka  * does nothing.
222255494bf2SMikulas Patocka  */
222355494bf2SMikulas Patocka void dm_bufio_forget(struct dm_bufio_client *c, sector_t block)
222455494bf2SMikulas Patocka {
222555494bf2SMikulas Patocka 	dm_bufio_lock(c);
2226450e8deeSJoe Thornber 	forget_buffer(c, block);
222755494bf2SMikulas Patocka 	dm_bufio_unlock(c);
222855494bf2SMikulas Patocka }
2229afa53df8SMikulas Patocka EXPORT_SYMBOL_GPL(dm_bufio_forget);
223055494bf2SMikulas Patocka 
2231450e8deeSJoe Thornber static enum evict_result idle(struct dm_buffer *b, void *context)
2232450e8deeSJoe Thornber {
2233450e8deeSJoe Thornber 	return b->state ? ER_DONT_EVICT : ER_EVICT;
2234450e8deeSJoe Thornber }
2235450e8deeSJoe Thornber 
223633a18062SMikulas Patocka void dm_bufio_forget_buffers(struct dm_bufio_client *c, sector_t block, sector_t n_blocks)
223733a18062SMikulas Patocka {
223833a18062SMikulas Patocka 	dm_bufio_lock(c);
2239450e8deeSJoe Thornber 	cache_remove_range(&c->cache, block, block + n_blocks, idle, __free_buffer_wake);
224033a18062SMikulas Patocka 	dm_bufio_unlock(c);
224133a18062SMikulas Patocka }
224233a18062SMikulas Patocka EXPORT_SYMBOL_GPL(dm_bufio_forget_buffers);
224333a18062SMikulas Patocka 
224486a3238cSHeinz Mauelshagen void dm_bufio_set_minimum_buffers(struct dm_bufio_client *c, unsigned int n)
224555b082e6SMikulas Patocka {
224655b082e6SMikulas Patocka 	c->minimum_buffers = n;
224755b082e6SMikulas Patocka }
2248afa53df8SMikulas Patocka EXPORT_SYMBOL_GPL(dm_bufio_set_minimum_buffers);
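/*
 * Illustrative only: a client whose commit path always needs a few metadata
 * blocks resident can raise the floor above the default
 * DM_BUFIO_MIN_BUFFERS.  The value 16 is an arbitrary assumption, not a
 * recommendation:
 *
 *	dm_bufio_set_minimum_buffers(c, 16);
 */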
224955b082e6SMikulas Patocka 
225086a3238cSHeinz Mauelshagen unsigned int dm_bufio_get_block_size(struct dm_bufio_client *c)
225195d402f0SMikulas Patocka {
225295d402f0SMikulas Patocka 	return c->block_size;
225395d402f0SMikulas Patocka }
225495d402f0SMikulas Patocka EXPORT_SYMBOL_GPL(dm_bufio_get_block_size);
225595d402f0SMikulas Patocka 
225695d402f0SMikulas Patocka sector_t dm_bufio_get_device_size(struct dm_bufio_client *c)
225795d402f0SMikulas Patocka {
22586dcbb52cSChristoph Hellwig 	sector_t s = bdev_nr_sectors(c->bdev);
22590ef0b471SHeinz Mauelshagen 
2260a14e5ec6SMikulas Patocka 	if (s >= c->start)
2261a14e5ec6SMikulas Patocka 		s -= c->start;
2262a14e5ec6SMikulas Patocka 	else
2263a14e5ec6SMikulas Patocka 		s = 0;
2264f51f2e0aSMikulas Patocka 	if (likely(c->sectors_per_block_bits >= 0))
2265f51f2e0aSMikulas Patocka 		s >>= c->sectors_per_block_bits;
2266f51f2e0aSMikulas Patocka 	else
2267f51f2e0aSMikulas Patocka 		sector_div(s, c->block_size >> SECTOR_SHIFT);
2268f51f2e0aSMikulas Patocka 	return s;
226995d402f0SMikulas Patocka }
227095d402f0SMikulas Patocka EXPORT_SYMBOL_GPL(dm_bufio_get_device_size);
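/*
 * Worked example of the conversion above (numbers are illustrative): for a
 * 1 GiB device (2097152 sectors), a 4096-byte block size
 * (sectors_per_block_bits == 3) and a start offset of 0, the result is
 * 2097152 >> 3 == 262144 blocks.
 */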
227195d402f0SMikulas Patocka 
22729b594826SMikulas Patocka struct dm_io_client *dm_bufio_get_dm_io_client(struct dm_bufio_client *c)
22739b594826SMikulas Patocka {
22749b594826SMikulas Patocka 	return c->dm_io;
22759b594826SMikulas Patocka }
22769b594826SMikulas Patocka EXPORT_SYMBOL_GPL(dm_bufio_get_dm_io_client);
22779b594826SMikulas Patocka 
227895d402f0SMikulas Patocka sector_t dm_bufio_get_block_number(struct dm_buffer *b)
227995d402f0SMikulas Patocka {
228095d402f0SMikulas Patocka 	return b->block;
228195d402f0SMikulas Patocka }
228295d402f0SMikulas Patocka EXPORT_SYMBOL_GPL(dm_bufio_get_block_number);
228395d402f0SMikulas Patocka 
228495d402f0SMikulas Patocka void *dm_bufio_get_block_data(struct dm_buffer *b)
228595d402f0SMikulas Patocka {
228695d402f0SMikulas Patocka 	return b->data;
228795d402f0SMikulas Patocka }
228895d402f0SMikulas Patocka EXPORT_SYMBOL_GPL(dm_bufio_get_block_data);
228995d402f0SMikulas Patocka 
229095d402f0SMikulas Patocka void *dm_bufio_get_aux_data(struct dm_buffer *b)
229195d402f0SMikulas Patocka {
229295d402f0SMikulas Patocka 	return b + 1;
229395d402f0SMikulas Patocka }
229495d402f0SMikulas Patocka EXPORT_SYMBOL_GPL(dm_bufio_get_aux_data);
229595d402f0SMikulas Patocka 
229695d402f0SMikulas Patocka struct dm_bufio_client *dm_bufio_get_client(struct dm_buffer *b)
229795d402f0SMikulas Patocka {
229895d402f0SMikulas Patocka 	return b->c;
229995d402f0SMikulas Patocka }
230095d402f0SMikulas Patocka EXPORT_SYMBOL_GPL(dm_bufio_get_client);
230195d402f0SMikulas Patocka 
2302450e8deeSJoe Thornber static enum it_action warn_leak(struct dm_buffer *b, void *context)
2303450e8deeSJoe Thornber {
2304450e8deeSJoe Thornber 	bool *warned = context;
2305450e8deeSJoe Thornber 
2306450e8deeSJoe Thornber 	WARN_ON(!(*warned));
2307450e8deeSJoe Thornber 	*warned = true;
2308450e8deeSJoe Thornber 	DMERR("leaked buffer %llx, hold count %u, list %d",
2309450e8deeSJoe Thornber 	      (unsigned long long)b->block, atomic_read(&b->hold_count), b->list_mode);
2310450e8deeSJoe Thornber #ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
2311450e8deeSJoe Thornber 	stack_trace_print(b->stack_entries, b->stack_len, 1);
2312450e8deeSJoe Thornber 	/* mark unclaimed to avoid WARN_ON at end of drop_buffers() */
2313450e8deeSJoe Thornber 	atomic_set(&b->hold_count, 0);
2314450e8deeSJoe Thornber #endif
2315450e8deeSJoe Thornber 	return IT_NEXT;
2316450e8deeSJoe Thornber }
2317450e8deeSJoe Thornber 
231895d402f0SMikulas Patocka static void drop_buffers(struct dm_bufio_client *c)
231995d402f0SMikulas Patocka {
232095d402f0SMikulas Patocka 	int i;
2321450e8deeSJoe Thornber 	struct dm_buffer *b;
232295d402f0SMikulas Patocka 
2323b75a80f4SMike Snitzer 	if (WARN_ON(dm_bufio_in_request()))
2324b75a80f4SMike Snitzer 		return; /* should never happen */
232595d402f0SMikulas Patocka 
232695d402f0SMikulas Patocka 	/*
232795d402f0SMikulas Patocka 	 * An optimization so that the buffers are not written one-by-one.
232895d402f0SMikulas Patocka 	 */
232995d402f0SMikulas Patocka 	dm_bufio_write_dirty_buffers_async(c);
233095d402f0SMikulas Patocka 
233195d402f0SMikulas Patocka 	dm_bufio_lock(c);
233295d402f0SMikulas Patocka 
233395d402f0SMikulas Patocka 	while ((b = __get_unclaimed_buffer(c)))
233495d402f0SMikulas Patocka 		__free_buffer_wake(b);
233595d402f0SMikulas Patocka 
2336450e8deeSJoe Thornber 	for (i = 0; i < LIST_SIZE; i++) {
2337450e8deeSJoe Thornber 		bool warned = false;
2338450e8deeSJoe Thornber 
2339450e8deeSJoe Thornber 		cache_iterate(&c->cache, i, warn_leak, &warned);
234086bad0c7SMikulas Patocka 	}
234186bad0c7SMikulas Patocka 
234286bad0c7SMikulas Patocka #ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
234386bad0c7SMikulas Patocka 	while ((b = __get_unclaimed_buffer(c)))
234486bad0c7SMikulas Patocka 		__free_buffer_wake(b);
234586bad0c7SMikulas Patocka #endif
234695d402f0SMikulas Patocka 
234795d402f0SMikulas Patocka 	for (i = 0; i < LIST_SIZE; i++)
2348450e8deeSJoe Thornber 		WARN_ON(cache_count(&c->cache, i));
234995d402f0SMikulas Patocka 
235095d402f0SMikulas Patocka 	dm_bufio_unlock(c);
235195d402f0SMikulas Patocka }
235295d402f0SMikulas Patocka 
235313840d38SMikulas Patocka static unsigned long get_retain_buffers(struct dm_bufio_client *c)
235433096a78SJoe Thornber {
23556aa7de05SMark Rutland 	unsigned long retain_bytes = READ_ONCE(dm_bufio_retain_bytes);
23560ef0b471SHeinz Mauelshagen 
2357f51f2e0aSMikulas Patocka 	if (likely(c->sectors_per_block_bits >= 0))
2358f51f2e0aSMikulas Patocka 		retain_bytes >>= c->sectors_per_block_bits + SECTOR_SHIFT;
2359f51f2e0aSMikulas Patocka 	else
2360f51f2e0aSMikulas Patocka 		retain_bytes /= c->block_size;
23610ef0b471SHeinz Mauelshagen 
2362f51f2e0aSMikulas Patocka 	return retain_bytes;
236333096a78SJoe Thornber }
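/*
 * Worked example (illustrative): with the default
 * DM_BUFIO_DEFAULT_RETAIN_BYTES of 256 KiB and a 4096-byte block size,
 * sectors_per_block_bits is 3, so 262144 >> (3 + SECTOR_SHIFT) == 64
 * buffers are kept around.
 */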
236433096a78SJoe Thornber 
236570704c33SMikulas Patocka static void __scan(struct dm_bufio_client *c)
236695d402f0SMikulas Patocka {
236795d402f0SMikulas Patocka 	int l;
2368450e8deeSJoe Thornber 	struct dm_buffer *b;
236933096a78SJoe Thornber 	unsigned long freed = 0;
237013840d38SMikulas Patocka 	unsigned long retain_target = get_retain_buffers(c);
2371450e8deeSJoe Thornber 	unsigned long count = cache_total(&c->cache);
237295d402f0SMikulas Patocka 
237395d402f0SMikulas Patocka 	for (l = 0; l < LIST_SIZE; l++) {
2374450e8deeSJoe Thornber 		while (true) {
237570704c33SMikulas Patocka 			if (count - freed <= retain_target)
237670704c33SMikulas Patocka 				atomic_long_set(&c->need_shrink, 0);
237770704c33SMikulas Patocka 			if (!atomic_long_read(&c->need_shrink))
2378450e8deeSJoe Thornber 				break;
2379450e8deeSJoe Thornber 
2380450e8deeSJoe Thornber 			b = cache_evict(&c->cache, l,
2381450e8deeSJoe Thornber 					l == LIST_CLEAN ? is_clean : is_dirty, c);
2382450e8deeSJoe Thornber 			if (!b)
2383450e8deeSJoe Thornber 				break;
2384450e8deeSJoe Thornber 
2385450e8deeSJoe Thornber 			__make_buffer_clean(b);
2386450e8deeSJoe Thornber 			__free_buffer_wake(b);
2387450e8deeSJoe Thornber 
238870704c33SMikulas Patocka 			atomic_long_dec(&c->need_shrink);
238933096a78SJoe Thornber 			freed++;
23907cd32674SPeter Zijlstra 			cond_resched();
239195d402f0SMikulas Patocka 		}
23920e825862SMikulas Patocka 	}
239395d402f0SMikulas Patocka }
239495d402f0SMikulas Patocka 
239570704c33SMikulas Patocka static void shrink_work(struct work_struct *w)
239670704c33SMikulas Patocka {
239770704c33SMikulas Patocka 	struct dm_bufio_client *c = container_of(w, struct dm_bufio_client, shrink_work);
239870704c33SMikulas Patocka 
239970704c33SMikulas Patocka 	dm_bufio_lock(c);
240070704c33SMikulas Patocka 	__scan(c);
240170704c33SMikulas Patocka 	dm_bufio_unlock(c);
240270704c33SMikulas Patocka }
240370704c33SMikulas Patocka 
240470704c33SMikulas Patocka static unsigned long dm_bufio_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
240595d402f0SMikulas Patocka {
24067dc19d5aSDave Chinner 	struct dm_bufio_client *c;
240795d402f0SMikulas Patocka 
24087dc19d5aSDave Chinner 	c = container_of(shrink, struct dm_bufio_client, shrinker);
240970704c33SMikulas Patocka 	atomic_long_add(sc->nr_to_scan, &c->need_shrink);
241070704c33SMikulas Patocka 	queue_work(dm_bufio_wq, &c->shrink_work);
241195d402f0SMikulas Patocka 
241270704c33SMikulas Patocka 	return sc->nr_to_scan;
24137dc19d5aSDave Chinner }
241495d402f0SMikulas Patocka 
241570704c33SMikulas Patocka static unsigned long dm_bufio_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
24167dc19d5aSDave Chinner {
2417d12067f4SMikulas Patocka 	struct dm_bufio_client *c = container_of(shrink, struct dm_bufio_client, shrinker);
2418450e8deeSJoe Thornber 	unsigned long count = cache_total(&c->cache);
2419fbc7c07eSSuren Baghdasaryan 	unsigned long retain_target = get_retain_buffers(c);
242070704c33SMikulas Patocka 	unsigned long queued_for_cleanup = atomic_long_read(&c->need_shrink);
24217dc19d5aSDave Chinner 
242270704c33SMikulas Patocka 	if (unlikely(count < retain_target))
242370704c33SMikulas Patocka 		count = 0;
242470704c33SMikulas Patocka 	else
242570704c33SMikulas Patocka 		count -= retain_target;
242670704c33SMikulas Patocka 
242770704c33SMikulas Patocka 	if (unlikely(count < queued_for_cleanup))
242870704c33SMikulas Patocka 		count = 0;
242970704c33SMikulas Patocka 	else
243070704c33SMikulas Patocka 		count -= queued_for_cleanup;
243170704c33SMikulas Patocka 
243270704c33SMikulas Patocka 	return count;
243395d402f0SMikulas Patocka }
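/*
 * Illustrative accounting for the function above: with 1000 cached buffers,
 * a retain target of 64 and 100 buffers already queued for cleanup, the
 * shrinker reports 1000 - 64 - 100 == 836 reclaimable objects.
 */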
243495d402f0SMikulas Patocka 
243595d402f0SMikulas Patocka /*
243695d402f0SMikulas Patocka  * Create the buffering interface
243795d402f0SMikulas Patocka  */
243886a3238cSHeinz Mauelshagen struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsigned int block_size,
243986a3238cSHeinz Mauelshagen 					       unsigned int reserved_buffers, unsigned int aux_size,
244095d402f0SMikulas Patocka 					       void (*alloc_callback)(struct dm_buffer *),
24410fcb100dSNathan Huckleberry 					       void (*write_callback)(struct dm_buffer *),
24420fcb100dSNathan Huckleberry 					       unsigned int flags)
244395d402f0SMikulas Patocka {
244495d402f0SMikulas Patocka 	int r;
24451e84c4b7SMike Snitzer 	unsigned int num_locks;
244695d402f0SMikulas Patocka 	struct dm_bufio_client *c;
2447359dbf19SMikulas Patocka 	char slab_name[27];
244895d402f0SMikulas Patocka 
2449f51f2e0aSMikulas Patocka 	if (!block_size || block_size & ((1 << SECTOR_SHIFT) - 1)) {
2450f51f2e0aSMikulas Patocka 		DMERR("%s: block size not specified or is not multiple of 512b", __func__);
2451f51f2e0aSMikulas Patocka 		r = -EINVAL;
2452f51f2e0aSMikulas Patocka 		goto bad_client;
2453f51f2e0aSMikulas Patocka 	}
245495d402f0SMikulas Patocka 
24551e84c4b7SMike Snitzer 	num_locks = dm_num_hash_locks();
24561e84c4b7SMike Snitzer 	c = kzalloc(sizeof(*c) + (num_locks * sizeof(struct buffer_tree)), GFP_KERNEL);
245795d402f0SMikulas Patocka 	if (!c) {
245895d402f0SMikulas Patocka 		r = -ENOMEM;
245995d402f0SMikulas Patocka 		goto bad_client;
246095d402f0SMikulas Patocka 	}
24615be21d65SMikulas Patocka 	cache_init(&c->cache, num_locks, (flags & DM_BUFIO_CLIENT_NO_SLEEP) != 0);
246295d402f0SMikulas Patocka 
246395d402f0SMikulas Patocka 	c->bdev = bdev;
246495d402f0SMikulas Patocka 	c->block_size = block_size;
2465f51f2e0aSMikulas Patocka 	if (is_power_of_2(block_size))
2466a3d939aeSMikulas Patocka 		c->sectors_per_block_bits = __ffs(block_size) - SECTOR_SHIFT;
2467f51f2e0aSMikulas Patocka 	else
2468f51f2e0aSMikulas Patocka 		c->sectors_per_block_bits = -1;
246995d402f0SMikulas Patocka 
247095d402f0SMikulas Patocka 	c->alloc_callback = alloc_callback;
247195d402f0SMikulas Patocka 	c->write_callback = write_callback;
247295d402f0SMikulas Patocka 
24733c1c875dSMike Snitzer 	if (flags & DM_BUFIO_CLIENT_NO_SLEEP) {
2474b32d4582SNathan Huckleberry 		c->no_sleep = true;
24753c1c875dSMike Snitzer 		static_branch_inc(&no_sleep_enabled);
24763c1c875dSMike Snitzer 	}
2477b32d4582SNathan Huckleberry 
247895d402f0SMikulas Patocka 	mutex_init(&c->lock);
2479b32d4582SNathan Huckleberry 	spin_lock_init(&c->spinlock);
248095d402f0SMikulas Patocka 	INIT_LIST_HEAD(&c->reserved_buffers);
248195d402f0SMikulas Patocka 	c->need_reserved_buffers = reserved_buffers;
248295d402f0SMikulas Patocka 
2483afa53df8SMikulas Patocka 	dm_bufio_set_minimum_buffers(c, DM_BUFIO_MIN_BUFFERS);
248455b082e6SMikulas Patocka 
248595d402f0SMikulas Patocka 	init_waitqueue_head(&c->free_buffer_wait);
248695d402f0SMikulas Patocka 	c->async_write_error = 0;
248795d402f0SMikulas Patocka 
248895d402f0SMikulas Patocka 	c->dm_io = dm_io_client_create();
248995d402f0SMikulas Patocka 	if (IS_ERR(c->dm_io)) {
249095d402f0SMikulas Patocka 		r = PTR_ERR(c->dm_io);
249195d402f0SMikulas Patocka 		goto bad_dm_io;
249295d402f0SMikulas Patocka 	}
249395d402f0SMikulas Patocka 
2494f51f2e0aSMikulas Patocka 	if (block_size <= KMALLOC_MAX_SIZE &&
2495f51f2e0aSMikulas Patocka 	    (block_size < PAGE_SIZE || !is_power_of_2(block_size))) {
249686a3238cSHeinz Mauelshagen 		unsigned int align = min(1U << __ffs(block_size), (unsigned int)PAGE_SIZE);
24970ef0b471SHeinz Mauelshagen 
24988d1058fbSHeinz Mauelshagen 		snprintf(slab_name, sizeof(slab_name), "dm_bufio_cache-%u", block_size);
2499f7879b4cSMikulas Patocka 		c->slab_cache = kmem_cache_create(slab_name, block_size, align,
25006b5e718cSMikulas Patocka 						  SLAB_RECLAIM_ACCOUNT, NULL);
250121bb1327SMikulas Patocka 		if (!c->slab_cache) {
250295d402f0SMikulas Patocka 			r = -ENOMEM;
25030e696d38SMike Snitzer 			goto bad;
250495d402f0SMikulas Patocka 		}
250595d402f0SMikulas Patocka 	}
2506359dbf19SMikulas Patocka 	if (aux_size)
25078d1058fbSHeinz Mauelshagen 		snprintf(slab_name, sizeof(slab_name), "dm_bufio_buffer-%u", aux_size);
2508359dbf19SMikulas Patocka 	else
25098d1058fbSHeinz Mauelshagen 		snprintf(slab_name, sizeof(slab_name), "dm_bufio_buffer");
2510359dbf19SMikulas Patocka 	c->slab_buffer = kmem_cache_create(slab_name, sizeof(struct dm_buffer) + aux_size,
2511359dbf19SMikulas Patocka 					   0, SLAB_RECLAIM_ACCOUNT, NULL);
2512359dbf19SMikulas Patocka 	if (!c->slab_buffer) {
2513359dbf19SMikulas Patocka 		r = -ENOMEM;
2514359dbf19SMikulas Patocka 		goto bad;
2515359dbf19SMikulas Patocka 	}
251695d402f0SMikulas Patocka 
251795d402f0SMikulas Patocka 	while (c->need_reserved_buffers) {
251895d402f0SMikulas Patocka 		struct dm_buffer *b = alloc_buffer(c, GFP_KERNEL);
251995d402f0SMikulas Patocka 
252095d402f0SMikulas Patocka 		if (!b) {
252195d402f0SMikulas Patocka 			r = -ENOMEM;
25220e696d38SMike Snitzer 			goto bad;
252395d402f0SMikulas Patocka 		}
252495d402f0SMikulas Patocka 		__free_buffer_wake(b);
252595d402f0SMikulas Patocka 	}
252695d402f0SMikulas Patocka 
252770704c33SMikulas Patocka 	INIT_WORK(&c->shrink_work, shrink_work);
252870704c33SMikulas Patocka 	atomic_long_set(&c->need_shrink, 0);
252970704c33SMikulas Patocka 
253046898e9aSAliaksei Karaliou 	c->shrinker.count_objects = dm_bufio_shrink_count;
253146898e9aSAliaksei Karaliou 	c->shrinker.scan_objects = dm_bufio_shrink_scan;
253246898e9aSAliaksei Karaliou 	c->shrinker.seeks = 1;
253346898e9aSAliaksei Karaliou 	c->shrinker.batch = 0;
2534c87791bcSMike Snitzer 	r = register_shrinker(&c->shrinker, "dm-bufio:(%u:%u)",
2535e33c267aSRoman Gushchin 			      MAJOR(bdev->bd_dev), MINOR(bdev->bd_dev));
253646898e9aSAliaksei Karaliou 	if (r)
25370e696d38SMike Snitzer 		goto bad;
253846898e9aSAliaksei Karaliou 
253995d402f0SMikulas Patocka 	mutex_lock(&dm_bufio_clients_lock);
254095d402f0SMikulas Patocka 	dm_bufio_client_count++;
254195d402f0SMikulas Patocka 	list_add(&c->client_list, &dm_bufio_all_clients);
254295d402f0SMikulas Patocka 	__cache_size_refresh();
254395d402f0SMikulas Patocka 	mutex_unlock(&dm_bufio_clients_lock);
254495d402f0SMikulas Patocka 
254595d402f0SMikulas Patocka 	return c;
254695d402f0SMikulas Patocka 
25470e696d38SMike Snitzer bad:
254895d402f0SMikulas Patocka 	while (!list_empty(&c->reserved_buffers)) {
2549450e8deeSJoe Thornber 		struct dm_buffer *b = list_to_buffer(c->reserved_buffers.next);
2550450e8deeSJoe Thornber 
2551450e8deeSJoe Thornber 		list_del(&b->lru.list);
255295d402f0SMikulas Patocka 		free_buffer(b);
255395d402f0SMikulas Patocka 	}
255421bb1327SMikulas Patocka 	kmem_cache_destroy(c->slab_cache);
2555359dbf19SMikulas Patocka 	kmem_cache_destroy(c->slab_buffer);
255695d402f0SMikulas Patocka 	dm_io_client_destroy(c->dm_io);
255795d402f0SMikulas Patocka bad_dm_io:
2558bde14184SAliaksei Karaliou 	mutex_destroy(&c->lock);
25590dfc1f4cSZhihao Cheng 	if (c->no_sleep)
25600dfc1f4cSZhihao Cheng 		static_branch_dec(&no_sleep_enabled);
256195d402f0SMikulas Patocka 	kfree(c);
256295d402f0SMikulas Patocka bad_client:
256395d402f0SMikulas Patocka 	return ERR_PTR(r);
256495d402f0SMikulas Patocka }
256595d402f0SMikulas Patocka EXPORT_SYMBOL_GPL(dm_bufio_client_create);
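/*
 * Minimal creation/teardown sketch with assumed values ("bdev" stands for
 * the target's underlying block device): 4096-byte blocks, one reserved
 * buffer, no per-buffer aux data, no callbacks and default flags.
 *
 *	struct dm_bufio_client *c;
 *
 *	c = dm_bufio_client_create(bdev, 4096, 1, 0, NULL, NULL, 0);
 *	if (IS_ERR(c))
 *		return PTR_ERR(c);
 *	...
 *	dm_bufio_client_destroy(c);
 */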
256695d402f0SMikulas Patocka 
256795d402f0SMikulas Patocka /*
256895d402f0SMikulas Patocka  * Free the buffering interface.
256995d402f0SMikulas Patocka  * It is required that there are no references on any buffers.
257095d402f0SMikulas Patocka  */
257195d402f0SMikulas Patocka void dm_bufio_client_destroy(struct dm_bufio_client *c)
257295d402f0SMikulas Patocka {
257386a3238cSHeinz Mauelshagen 	unsigned int i;
257495d402f0SMikulas Patocka 
257595d402f0SMikulas Patocka 	drop_buffers(c);
257695d402f0SMikulas Patocka 
257795d402f0SMikulas Patocka 	unregister_shrinker(&c->shrinker);
257870704c33SMikulas Patocka 	flush_work(&c->shrink_work);
257995d402f0SMikulas Patocka 
258095d402f0SMikulas Patocka 	mutex_lock(&dm_bufio_clients_lock);
258195d402f0SMikulas Patocka 
258295d402f0SMikulas Patocka 	list_del(&c->client_list);
258395d402f0SMikulas Patocka 	dm_bufio_client_count--;
258495d402f0SMikulas Patocka 	__cache_size_refresh();
258595d402f0SMikulas Patocka 
258695d402f0SMikulas Patocka 	mutex_unlock(&dm_bufio_clients_lock);
258795d402f0SMikulas Patocka 
2588555977ddSMike Snitzer 	WARN_ON(c->need_reserved_buffers);
258995d402f0SMikulas Patocka 
259095d402f0SMikulas Patocka 	while (!list_empty(&c->reserved_buffers)) {
2591450e8deeSJoe Thornber 		struct dm_buffer *b = list_to_buffer(c->reserved_buffers.next);
2592450e8deeSJoe Thornber 
2593450e8deeSJoe Thornber 		list_del(&b->lru.list);
259495d402f0SMikulas Patocka 		free_buffer(b);
259595d402f0SMikulas Patocka 	}
259695d402f0SMikulas Patocka 
259795d402f0SMikulas Patocka 	for (i = 0; i < LIST_SIZE; i++)
2598450e8deeSJoe Thornber 		if (cache_count(&c->cache, i))
2599450e8deeSJoe Thornber 			DMERR("leaked buffer count %d: %lu", i, cache_count(&c->cache, i));
260095d402f0SMikulas Patocka 
260195d402f0SMikulas Patocka 	for (i = 0; i < LIST_SIZE; i++)
2602450e8deeSJoe Thornber 		WARN_ON(cache_count(&c->cache, i));
260395d402f0SMikulas Patocka 
2604450e8deeSJoe Thornber 	cache_destroy(&c->cache);
260521bb1327SMikulas Patocka 	kmem_cache_destroy(c->slab_cache);
2606359dbf19SMikulas Patocka 	kmem_cache_destroy(c->slab_buffer);
260795d402f0SMikulas Patocka 	dm_io_client_destroy(c->dm_io);
2608bde14184SAliaksei Karaliou 	mutex_destroy(&c->lock);
26093c1c875dSMike Snitzer 	if (c->no_sleep)
26103c1c875dSMike Snitzer 		static_branch_dec(&no_sleep_enabled);
261195d402f0SMikulas Patocka 	kfree(c);
261295d402f0SMikulas Patocka }
261395d402f0SMikulas Patocka EXPORT_SYMBOL_GPL(dm_bufio_client_destroy);
261495d402f0SMikulas Patocka 
2615d4830012SLi Lingfeng void dm_bufio_client_reset(struct dm_bufio_client *c)
2616d4830012SLi Lingfeng {
2617d4830012SLi Lingfeng 	drop_buffers(c);
2618d4830012SLi Lingfeng 	flush_work(&c->shrink_work);
2619d4830012SLi Lingfeng }
2620d4830012SLi Lingfeng EXPORT_SYMBOL_GPL(dm_bufio_client_reset);
2621d4830012SLi Lingfeng 
2622400a0befSMikulas Patocka void dm_bufio_set_sector_offset(struct dm_bufio_client *c, sector_t start)
2623400a0befSMikulas Patocka {
2624400a0befSMikulas Patocka 	c->start = start;
2625400a0befSMikulas Patocka }
2626400a0befSMikulas Patocka EXPORT_SYMBOL_GPL(dm_bufio_set_sector_offset);
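/*
 * Illustrative use with an assumed layout: if the first 4 KiB of the device
 * holds a superblock managed outside bufio, the client can be told that its
 * block 0 starts 8 sectors into the device.
 *
 *	dm_bufio_set_sector_offset(c, 8);
 */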
2627400a0befSMikulas Patocka 
2628450e8deeSJoe Thornber /*--------------------------------------------------------------*/
2629450e8deeSJoe Thornber 
263086a3238cSHeinz Mauelshagen static unsigned int get_max_age_hz(void)
263195d402f0SMikulas Patocka {
263286a3238cSHeinz Mauelshagen 	unsigned int max_age = READ_ONCE(dm_bufio_max_age);
263395d402f0SMikulas Patocka 
263433096a78SJoe Thornber 	if (max_age > UINT_MAX / HZ)
263533096a78SJoe Thornber 		max_age = UINT_MAX / HZ;
263695d402f0SMikulas Patocka 
263733096a78SJoe Thornber 	return max_age * HZ;
263833096a78SJoe Thornber }
263995d402f0SMikulas Patocka 
264033096a78SJoe Thornber static bool older_than(struct dm_buffer *b, unsigned long age_hz)
264133096a78SJoe Thornber {
2642450e8deeSJoe Thornber 	return time_after_eq(jiffies, READ_ONCE(b->last_accessed) + age_hz);
264333096a78SJoe Thornber }
264433096a78SJoe Thornber 
2645450e8deeSJoe Thornber struct evict_params {
2646450e8deeSJoe Thornber 	gfp_t gfp;
2647450e8deeSJoe Thornber 	unsigned long age_hz;
2648450e8deeSJoe Thornber 
2649450e8deeSJoe Thornber 	/*
2650450e8deeSJoe Thornber 	 * This gets updated with the largest last_accessed (i.e. most
2651450e8deeSJoe Thornber 	 * recently used) of the evicted buffers.  It will not be reinitialised
2652450e8deeSJoe Thornber 	 * by __evict_many(), so you can use it across multiple invocations.
2653450e8deeSJoe Thornber 	 */
2654450e8deeSJoe Thornber 	unsigned long last_accessed;
2655450e8deeSJoe Thornber };
2656450e8deeSJoe Thornber 
2657450e8deeSJoe Thornber /*
2658450e8deeSJoe Thornber  * We may not be able to evict this buffer if IO is pending or the client
2659450e8deeSJoe Thornber  * is still using it.
2660450e8deeSJoe Thornber  *
2661450e8deeSJoe Thornber  * And if GFP_NOFS is used, we must not do any I/O because we hold
2662450e8deeSJoe Thornber  * dm_bufio_clients_lock and we would risk deadlock if the I/O gets
2663450e8deeSJoe Thornber  * rerouted to a different bufio client.
2664450e8deeSJoe Thornber  */
2665450e8deeSJoe Thornber static enum evict_result select_for_evict(struct dm_buffer *b, void *context)
266633096a78SJoe Thornber {
2667450e8deeSJoe Thornber 	struct evict_params *params = context;
2668450e8deeSJoe Thornber 
2669450e8deeSJoe Thornber 	if (!(params->gfp & __GFP_FS) ||
2670450e8deeSJoe Thornber 	    (static_branch_unlikely(&no_sleep_enabled) && b->c->no_sleep)) {
2671450e8deeSJoe Thornber 		if (test_bit_acquire(B_READING, &b->state) ||
2672450e8deeSJoe Thornber 		    test_bit(B_WRITING, &b->state) ||
2673450e8deeSJoe Thornber 		    test_bit(B_DIRTY, &b->state))
2674450e8deeSJoe Thornber 			return ER_DONT_EVICT;
2675450e8deeSJoe Thornber 	}
2676450e8deeSJoe Thornber 
2677450e8deeSJoe Thornber 	return older_than(b, params->age_hz) ? ER_EVICT : ER_STOP;
2678450e8deeSJoe Thornber }
2679450e8deeSJoe Thornber 
2680450e8deeSJoe Thornber static unsigned long __evict_many(struct dm_bufio_client *c,
2681450e8deeSJoe Thornber 				  struct evict_params *params,
2682450e8deeSJoe Thornber 				  int list_mode, unsigned long max_count)
2683450e8deeSJoe Thornber {
2684450e8deeSJoe Thornber 	unsigned long count;
2685450e8deeSJoe Thornber 	unsigned long last_accessed;
2686450e8deeSJoe Thornber 	struct dm_buffer *b;
2687450e8deeSJoe Thornber 
2688450e8deeSJoe Thornber 	for (count = 0; count < max_count; count++) {
2689450e8deeSJoe Thornber 		b = cache_evict(&c->cache, list_mode, select_for_evict, params);
2690450e8deeSJoe Thornber 		if (!b)
2691450e8deeSJoe Thornber 			break;
2692450e8deeSJoe Thornber 
2693450e8deeSJoe Thornber 		last_accessed = READ_ONCE(b->last_accessed);
2694450e8deeSJoe Thornber 		if (time_after_eq(params->last_accessed, last_accessed))
2695450e8deeSJoe Thornber 			params->last_accessed = last_accessed;
2696450e8deeSJoe Thornber 
2697450e8deeSJoe Thornber 		__make_buffer_clean(b);
2698450e8deeSJoe Thornber 		__free_buffer_wake(b);
2699450e8deeSJoe Thornber 
2700450e8deeSJoe Thornber 		cond_resched();
2701450e8deeSJoe Thornber 	}
2702450e8deeSJoe Thornber 
2703450e8deeSJoe Thornber 	return count;
2704450e8deeSJoe Thornber }
2705450e8deeSJoe Thornber 
2706450e8deeSJoe Thornber static void evict_old_buffers(struct dm_bufio_client *c, unsigned long age_hz)
2707450e8deeSJoe Thornber {
2708450e8deeSJoe Thornber 	struct evict_params params = {.gfp = 0, .age_hz = age_hz, .last_accessed = 0};
2709450e8deeSJoe Thornber 	unsigned long retain = get_retain_buffers(c);
271013840d38SMikulas Patocka 	unsigned long count;
2711390020adSMikulas Patocka 	LIST_HEAD(write_list);
271233096a78SJoe Thornber 
271333096a78SJoe Thornber 	dm_bufio_lock(c);
271433096a78SJoe Thornber 
2715390020adSMikulas Patocka 	__check_watermark(c, &write_list);
2716390020adSMikulas Patocka 	if (unlikely(!list_empty(&write_list))) {
2717390020adSMikulas Patocka 		dm_bufio_unlock(c);
2718390020adSMikulas Patocka 		__flush_write_list(&write_list);
2719390020adSMikulas Patocka 		dm_bufio_lock(c);
2720390020adSMikulas Patocka 	}
2721390020adSMikulas Patocka 
2722450e8deeSJoe Thornber 	count = cache_total(&c->cache);
2723450e8deeSJoe Thornber 	if (count > retain)
2724450e8deeSJoe Thornber 		__evict_many(c, &params, LIST_CLEAN, count - retain);
272595d402f0SMikulas Patocka 
272695d402f0SMikulas Patocka 	dm_bufio_unlock(c);
272795d402f0SMikulas Patocka }
272833096a78SJoe Thornber 
272933096a78SJoe Thornber static void cleanup_old_buffers(void)
273033096a78SJoe Thornber {
273133096a78SJoe Thornber 	unsigned long max_age_hz = get_max_age_hz();
273233096a78SJoe Thornber 	struct dm_bufio_client *c;
273333096a78SJoe Thornber 
273433096a78SJoe Thornber 	mutex_lock(&dm_bufio_clients_lock);
273533096a78SJoe Thornber 
2736390020adSMikulas Patocka 	__cache_size_refresh();
2737390020adSMikulas Patocka 
273833096a78SJoe Thornber 	list_for_each_entry(c, &dm_bufio_all_clients, client_list)
2739450e8deeSJoe Thornber 		evict_old_buffers(c, max_age_hz);
274033096a78SJoe Thornber 
274195d402f0SMikulas Patocka 	mutex_unlock(&dm_bufio_clients_lock);
274295d402f0SMikulas Patocka }
274395d402f0SMikulas Patocka 
274495d402f0SMikulas Patocka static void work_fn(struct work_struct *w)
274595d402f0SMikulas Patocka {
274695d402f0SMikulas Patocka 	cleanup_old_buffers();
274795d402f0SMikulas Patocka 
27486e913b28SMikulas Patocka 	queue_delayed_work(dm_bufio_wq, &dm_bufio_cleanup_old_work,
274995d402f0SMikulas Patocka 			   DM_BUFIO_WORK_TIMER_SECS * HZ);
275095d402f0SMikulas Patocka }
275195d402f0SMikulas Patocka 
2752450e8deeSJoe Thornber /*--------------------------------------------------------------*/
2753450e8deeSJoe Thornber 
2754450e8deeSJoe Thornber /*
2755450e8deeSJoe Thornber  * Global cleanup tries to evict the oldest buffers from across _all_
2756450e8deeSJoe Thornber  * the clients.  It does this by repeatedly evicting a few buffers from
2757450e8deeSJoe Thornber  * the client that holds the oldest buffer.  It's approximate, but hopefully
2758450e8deeSJoe Thornber  * good enough.
2759450e8deeSJoe Thornber  */
2760450e8deeSJoe Thornber static struct dm_bufio_client *__pop_client(void)
2761450e8deeSJoe Thornber {
2762450e8deeSJoe Thornber 	struct list_head *h;
2763450e8deeSJoe Thornber 
2764450e8deeSJoe Thornber 	if (list_empty(&dm_bufio_all_clients))
2765450e8deeSJoe Thornber 		return NULL;
2766450e8deeSJoe Thornber 
2767450e8deeSJoe Thornber 	h = dm_bufio_all_clients.next;
2768450e8deeSJoe Thornber 	list_del(h);
2769450e8deeSJoe Thornber 	return container_of(h, struct dm_bufio_client, client_list);
2770450e8deeSJoe Thornber }
2771450e8deeSJoe Thornber 
2772450e8deeSJoe Thornber /*
2773450e8deeSJoe Thornber  * Inserts the client in the global client list based on its
2774450e8deeSJoe Thornber  * 'oldest_buffer' field.
2775450e8deeSJoe Thornber  */
2776450e8deeSJoe Thornber static void __insert_client(struct dm_bufio_client *new_client)
2777450e8deeSJoe Thornber {
2778450e8deeSJoe Thornber 	struct dm_bufio_client *c;
2779450e8deeSJoe Thornber 	struct list_head *h = dm_bufio_all_clients.next;
2780450e8deeSJoe Thornber 
2781450e8deeSJoe Thornber 	while (h != &dm_bufio_all_clients) {
2782450e8deeSJoe Thornber 		c = container_of(h, struct dm_bufio_client, client_list);
2783450e8deeSJoe Thornber 		if (time_after_eq(c->oldest_buffer, new_client->oldest_buffer))
2784450e8deeSJoe Thornber 			break;
2785450e8deeSJoe Thornber 		h = h->next;
2786450e8deeSJoe Thornber 	}
2787450e8deeSJoe Thornber 
2788450e8deeSJoe Thornber 	list_add_tail(&new_client->client_list, h);
2789450e8deeSJoe Thornber }
2790450e8deeSJoe Thornber 
2791450e8deeSJoe Thornber static unsigned long __evict_a_few(unsigned long nr_buffers)
2792450e8deeSJoe Thornber {
2793450e8deeSJoe Thornber 	unsigned long count;
2794450e8deeSJoe Thornber 	struct dm_bufio_client *c;
2795450e8deeSJoe Thornber 	struct evict_params params = {
2796450e8deeSJoe Thornber 		.gfp = GFP_KERNEL,
2797450e8deeSJoe Thornber 		.age_hz = 0,
2798450e8deeSJoe Thornber 		/* set to jiffies in case there are no buffers in this client */
2799450e8deeSJoe Thornber 		.last_accessed = jiffies
2800450e8deeSJoe Thornber 	};
2801450e8deeSJoe Thornber 
2802450e8deeSJoe Thornber 	c = __pop_client();
2803450e8deeSJoe Thornber 	if (!c)
2804450e8deeSJoe Thornber 		return 0;
2805450e8deeSJoe Thornber 
2806450e8deeSJoe Thornber 	dm_bufio_lock(c);
2807450e8deeSJoe Thornber 	count = __evict_many(c, &params, LIST_CLEAN, nr_buffers);
2808450e8deeSJoe Thornber 	dm_bufio_unlock(c);
2809450e8deeSJoe Thornber 
2810450e8deeSJoe Thornber 	if (count)
2811450e8deeSJoe Thornber 		c->oldest_buffer = params.last_accessed;
2812450e8deeSJoe Thornber 	__insert_client(c);
2813450e8deeSJoe Thornber 
2814450e8deeSJoe Thornber 	return count;
2815450e8deeSJoe Thornber }
2816450e8deeSJoe Thornber 
2817450e8deeSJoe Thornber static void check_watermarks(void)
2818450e8deeSJoe Thornber {
2819450e8deeSJoe Thornber 	LIST_HEAD(write_list);
2820450e8deeSJoe Thornber 	struct dm_bufio_client *c;
2821450e8deeSJoe Thornber 
2822450e8deeSJoe Thornber 	mutex_lock(&dm_bufio_clients_lock);
2823450e8deeSJoe Thornber 	list_for_each_entry(c, &dm_bufio_all_clients, client_list) {
2824450e8deeSJoe Thornber 		dm_bufio_lock(c);
2825450e8deeSJoe Thornber 		__check_watermark(c, &write_list);
2826450e8deeSJoe Thornber 		dm_bufio_unlock(c);
2827450e8deeSJoe Thornber 	}
2828450e8deeSJoe Thornber 	mutex_unlock(&dm_bufio_clients_lock);
2829450e8deeSJoe Thornber 
2830450e8deeSJoe Thornber 	__flush_write_list(&write_list);
2831450e8deeSJoe Thornber }
2832450e8deeSJoe Thornber 
2833450e8deeSJoe Thornber static void evict_old(void)
2834450e8deeSJoe Thornber {
2835450e8deeSJoe Thornber 	unsigned long threshold = dm_bufio_cache_size -
2836450e8deeSJoe Thornber 		dm_bufio_cache_size / DM_BUFIO_LOW_WATERMARK_RATIO;
2837450e8deeSJoe Thornber 
2838450e8deeSJoe Thornber 	mutex_lock(&dm_bufio_clients_lock);
2839450e8deeSJoe Thornber 	while (dm_bufio_current_allocated > threshold) {
2840450e8deeSJoe Thornber 		if (!__evict_a_few(64))
2841450e8deeSJoe Thornber 			break;
2842450e8deeSJoe Thornber 		cond_resched();
2843450e8deeSJoe Thornber 	}
2844450e8deeSJoe Thornber 	mutex_unlock(&dm_bufio_clients_lock);
2845450e8deeSJoe Thornber }
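/*
 * Worked example of the threshold above (illustrative): with
 * dm_bufio_cache_size set to 64 MiB and DM_BUFIO_LOW_WATERMARK_RATIO of 16,
 * eviction continues while dm_bufio_current_allocated exceeds
 * 64 MiB - 64 MiB / 16 == 60 MiB.
 */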
2846450e8deeSJoe Thornber 
2847450e8deeSJoe Thornber static void do_global_cleanup(struct work_struct *w)
2848450e8deeSJoe Thornber {
2849450e8deeSJoe Thornber 	check_watermarks();
2850450e8deeSJoe Thornber 	evict_old();
2851450e8deeSJoe Thornber }
2852450e8deeSJoe Thornber 
2853a4a82ce3SHeinz Mauelshagen /*
2854a4a82ce3SHeinz Mauelshagen  *--------------------------------------------------------------
285595d402f0SMikulas Patocka  * Module setup
2856a4a82ce3SHeinz Mauelshagen  *--------------------------------------------------------------
2857a4a82ce3SHeinz Mauelshagen  */
285895d402f0SMikulas Patocka 
285995d402f0SMikulas Patocka /*
286095d402f0SMikulas Patocka  * This is called only once for the whole dm_bufio module.
286195d402f0SMikulas Patocka  * It initializes the memory limit.
286295d402f0SMikulas Patocka  */
286395d402f0SMikulas Patocka static int __init dm_bufio_init(void)
286495d402f0SMikulas Patocka {
286595d402f0SMikulas Patocka 	__u64 mem;
286695d402f0SMikulas Patocka 
28674cb57ab4SMikulas Patocka 	dm_bufio_allocated_kmem_cache = 0;
28684cb57ab4SMikulas Patocka 	dm_bufio_allocated_get_free_pages = 0;
28694cb57ab4SMikulas Patocka 	dm_bufio_allocated_vmalloc = 0;
28704cb57ab4SMikulas Patocka 	dm_bufio_current_allocated = 0;
28714cb57ab4SMikulas Patocka 
2872ca79b0c2SArun KS 	mem = (__u64)mult_frac(totalram_pages() - totalhigh_pages(),
287374d4108dSEric Biggers 			       DM_BUFIO_MEMORY_PERCENT, 100) << PAGE_SHIFT;
287495d402f0SMikulas Patocka 
287595d402f0SMikulas Patocka 	if (mem > ULONG_MAX)
287695d402f0SMikulas Patocka 		mem = ULONG_MAX;
287795d402f0SMikulas Patocka 
287895d402f0SMikulas Patocka #ifdef CONFIG_MMU
287974d4108dSEric Biggers 	if (mem > mult_frac(VMALLOC_TOTAL, DM_BUFIO_VMALLOC_PERCENT, 100))
288074d4108dSEric Biggers 		mem = mult_frac(VMALLOC_TOTAL, DM_BUFIO_VMALLOC_PERCENT, 100);
288195d402f0SMikulas Patocka #endif
288295d402f0SMikulas Patocka 
288395d402f0SMikulas Patocka 	dm_bufio_default_cache_size = mem;
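	/*
	 * Worked example (illustrative): on a machine with 16 GiB of low
	 * memory, DM_BUFIO_MEMORY_PERCENT of 2 yields a default cache size of
	 * roughly 328 MiB, unless 25% of the vmalloc space is smaller, in
	 * which case that lower cap applies.
	 */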
288495d402f0SMikulas Patocka 
288595d402f0SMikulas Patocka 	mutex_lock(&dm_bufio_clients_lock);
288695d402f0SMikulas Patocka 	__cache_size_refresh();
288795d402f0SMikulas Patocka 	mutex_unlock(&dm_bufio_clients_lock);
288895d402f0SMikulas Patocka 
2889edd1ea2aSBhaktipriya Shridhar 	dm_bufio_wq = alloc_workqueue("dm_bufio_cache", WQ_MEM_RECLAIM, 0);
289095d402f0SMikulas Patocka 	if (!dm_bufio_wq)
289195d402f0SMikulas Patocka 		return -ENOMEM;
289295d402f0SMikulas Patocka 
28936e913b28SMikulas Patocka 	INIT_DELAYED_WORK(&dm_bufio_cleanup_old_work, work_fn);
28946e913b28SMikulas Patocka 	INIT_WORK(&dm_bufio_replacement_work, do_global_cleanup);
28956e913b28SMikulas Patocka 	queue_delayed_work(dm_bufio_wq, &dm_bufio_cleanup_old_work,
289695d402f0SMikulas Patocka 			   DM_BUFIO_WORK_TIMER_SECS * HZ);
289795d402f0SMikulas Patocka 
289895d402f0SMikulas Patocka 	return 0;
289995d402f0SMikulas Patocka }
290095d402f0SMikulas Patocka 
290195d402f0SMikulas Patocka /*
290295d402f0SMikulas Patocka  * This is called once when unloading the dm_bufio module.
290395d402f0SMikulas Patocka  */
290495d402f0SMikulas Patocka static void __exit dm_bufio_exit(void)
290595d402f0SMikulas Patocka {
290695d402f0SMikulas Patocka 	int bug = 0;
290795d402f0SMikulas Patocka 
29086e913b28SMikulas Patocka 	cancel_delayed_work_sync(&dm_bufio_cleanup_old_work);
290995d402f0SMikulas Patocka 	destroy_workqueue(dm_bufio_wq);
291095d402f0SMikulas Patocka 
291195d402f0SMikulas Patocka 	if (dm_bufio_client_count) {
291295d402f0SMikulas Patocka 		DMCRIT("%s: dm_bufio_client_count leaked: %d",
291395d402f0SMikulas Patocka 			__func__, dm_bufio_client_count);
291495d402f0SMikulas Patocka 		bug = 1;
291595d402f0SMikulas Patocka 	}
291695d402f0SMikulas Patocka 
291795d402f0SMikulas Patocka 	if (dm_bufio_current_allocated) {
291895d402f0SMikulas Patocka 		DMCRIT("%s: dm_bufio_current_allocated leaked: %lu",
291995d402f0SMikulas Patocka 			__func__, dm_bufio_current_allocated);
292095d402f0SMikulas Patocka 		bug = 1;
292195d402f0SMikulas Patocka 	}
292295d402f0SMikulas Patocka 
292395d402f0SMikulas Patocka 	if (dm_bufio_allocated_get_free_pages) {
292495d402f0SMikulas Patocka 		DMCRIT("%s: dm_bufio_allocated_get_free_pages leaked: %lu",
292595d402f0SMikulas Patocka 		       __func__, dm_bufio_allocated_get_free_pages);
292695d402f0SMikulas Patocka 		bug = 1;
292795d402f0SMikulas Patocka 	}
292895d402f0SMikulas Patocka 
292995d402f0SMikulas Patocka 	if (dm_bufio_allocated_vmalloc) {
293095d402f0SMikulas Patocka 		DMCRIT("%s: dm_bufio_vmalloc leaked: %lu",
293195d402f0SMikulas Patocka 		       __func__, dm_bufio_allocated_vmalloc);
293295d402f0SMikulas Patocka 		bug = 1;
293395d402f0SMikulas Patocka 	}
293495d402f0SMikulas Patocka 
2935555977ddSMike Snitzer 	WARN_ON(bug); /* leaks are not worth crashing the system */
293695d402f0SMikulas Patocka }
293795d402f0SMikulas Patocka 
293895d402f0SMikulas Patocka module_init(dm_bufio_init)
293995d402f0SMikulas Patocka module_exit(dm_bufio_exit)
294095d402f0SMikulas Patocka 
29416a808034SHeinz Mauelshagen module_param_named(max_cache_size_bytes, dm_bufio_cache_size, ulong, 0644);
294295d402f0SMikulas Patocka MODULE_PARM_DESC(max_cache_size_bytes, "Size of metadata cache");
294395d402f0SMikulas Patocka 
29446a808034SHeinz Mauelshagen module_param_named(max_age_seconds, dm_bufio_max_age, uint, 0644);
294595d402f0SMikulas Patocka MODULE_PARM_DESC(max_age_seconds, "Max age of a buffer in seconds");
294695d402f0SMikulas Patocka 
29476a808034SHeinz Mauelshagen module_param_named(retain_bytes, dm_bufio_retain_bytes, ulong, 0644);
294833096a78SJoe Thornber MODULE_PARM_DESC(retain_bytes, "Try to keep at least this many bytes cached in memory");
294933096a78SJoe Thornber 
29506a808034SHeinz Mauelshagen module_param_named(peak_allocated_bytes, dm_bufio_peak_allocated, ulong, 0644);
295195d402f0SMikulas Patocka MODULE_PARM_DESC(peak_allocated_bytes, "Tracks the maximum allocated memory");
295295d402f0SMikulas Patocka 
29536a808034SHeinz Mauelshagen module_param_named(allocated_kmem_cache_bytes, dm_bufio_allocated_kmem_cache, ulong, 0444);
295495d402f0SMikulas Patocka MODULE_PARM_DESC(allocated_kmem_cache_bytes, "Memory allocated with kmem_cache_alloc");
295595d402f0SMikulas Patocka 
29566a808034SHeinz Mauelshagen module_param_named(allocated_get_free_pages_bytes, dm_bufio_allocated_get_free_pages, ulong, 0444);
295795d402f0SMikulas Patocka MODULE_PARM_DESC(allocated_get_free_pages_bytes, "Memory allocated with get_free_pages");
295895d402f0SMikulas Patocka 
29596a808034SHeinz Mauelshagen module_param_named(allocated_vmalloc_bytes, dm_bufio_allocated_vmalloc, ulong, 0444);
296095d402f0SMikulas Patocka MODULE_PARM_DESC(allocated_vmalloc_bytes, "Memory allocated with vmalloc");
296195d402f0SMikulas Patocka 
29626a808034SHeinz Mauelshagen module_param_named(current_allocated_bytes, dm_bufio_current_allocated, ulong, 0444);
296395d402f0SMikulas Patocka MODULE_PARM_DESC(current_allocated_bytes, "Memory currently used by the cache");
296495d402f0SMikulas Patocka 
296595d402f0SMikulas Patocka MODULE_AUTHOR("Mikulas Patocka <dm-devel@redhat.com>");
296695d402f0SMikulas Patocka MODULE_DESCRIPTION(DM_NAME " buffered I/O library");
296795d402f0SMikulas Patocka MODULE_LICENSE("GPL");
2968