xref: /openbmc/linux/drivers/md/dm-bufio.c (revision c4c3c32d)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright (C) 2009-2011 Red Hat, Inc.
4  *
5  * Author: Mikulas Patocka <mpatocka@redhat.com>
6  *
7  * This file is released under the GPL.
8  */
9 
10 #include <linux/dm-bufio.h>
11 
12 #include <linux/device-mapper.h>
13 #include <linux/dm-io.h>
14 #include <linux/slab.h>
15 #include <linux/sched/mm.h>
16 #include <linux/jiffies.h>
17 #include <linux/vmalloc.h>
18 #include <linux/shrinker.h>
19 #include <linux/module.h>
20 #include <linux/rbtree.h>
21 #include <linux/stacktrace.h>
22 #include <linux/jump_label.h>
23 
24 #include "dm.h"
25 
26 #define DM_MSG_PREFIX "bufio"
27 
/*
 * Memory management policy:
 *	Limit the number of buffers to DM_BUFIO_MEMORY_PERCENT of main memory
 *	or DM_BUFIO_VMALLOC_PERCENT of vmalloc memory (whichever is lower).
 *	Always allocate at least DM_BUFIO_MIN_BUFFERS buffers.
 *	Start background writeback when the proportion of dirty buffers
 *	exceeds the threshold given by DM_BUFIO_WRITEBACK_RATIO.
 */
36 #define DM_BUFIO_MIN_BUFFERS		8
37 
38 #define DM_BUFIO_MEMORY_PERCENT		2
39 #define DM_BUFIO_VMALLOC_PERCENT	25
40 #define DM_BUFIO_WRITEBACK_RATIO	3
41 #define DM_BUFIO_LOW_WATERMARK_RATIO	16
42 
43 /*
44  * Check buffer ages in this interval (seconds)
45  */
46 #define DM_BUFIO_WORK_TIMER_SECS	30
47 
48 /*
49  * Free buffers when they are older than this (seconds)
50  */
51 #define DM_BUFIO_DEFAULT_AGE_SECS	300
52 
53 /*
54  * The nr of bytes of cached data to keep around.
55  */
56 #define DM_BUFIO_DEFAULT_RETAIN_BYTES   (256 * 1024)
57 
58 /*
59  * Align buffer writes to this boundary.
60  * Tests show that SSDs have the highest IOPS when using 4k writes.
61  */
62 #define DM_BUFIO_WRITE_ALIGN		4096
63 
64 /*
65  * dm_buffer->list_mode
66  */
67 #define LIST_CLEAN	0
68 #define LIST_DIRTY	1
69 #define LIST_SIZE	2
70 
71 /*--------------------------------------------------------------*/
72 
73 /*
74  * Rather than use an LRU list, we use a clock algorithm where entries
75  * are held in a circular list.  When an entry is 'hit' a reference bit
76  * is set.  The least recently used entry is approximated by running a
77  * cursor around the list selecting unreferenced entries. Referenced
78  * entries have their reference bit cleared as the cursor passes them.
79  */
80 struct lru_entry {
81 	struct list_head list;
82 	atomic_t referenced;
83 };
84 
85 struct lru_iter {
86 	struct lru *lru;
87 	struct list_head list;
88 	struct lru_entry *stop;
89 	struct lru_entry *e;
90 };
91 
92 struct lru {
93 	struct list_head *cursor;
94 	unsigned long count;
95 
96 	struct list_head iterators;
97 };
98 
99 /*--------------*/
100 
101 static void lru_init(struct lru *lru)
102 {
103 	lru->cursor = NULL;
104 	lru->count = 0;
105 	INIT_LIST_HEAD(&lru->iterators);
106 }
107 
108 static void lru_destroy(struct lru *lru)
109 {
110 	WARN_ON_ONCE(lru->cursor);
111 	WARN_ON_ONCE(!list_empty(&lru->iterators));
112 }
113 
114 /*
115  * Insert a new entry into the lru.
116  */
117 static void lru_insert(struct lru *lru, struct lru_entry *le)
118 {
119 	/*
120 	 * Don't be tempted to set to 1, makes the lru aspect
121 	 * perform poorly.
122 	 */
123 	atomic_set(&le->referenced, 0);
124 
125 	if (lru->cursor) {
126 		list_add_tail(&le->list, lru->cursor);
127 	} else {
128 		INIT_LIST_HEAD(&le->list);
129 		lru->cursor = &le->list;
130 	}
131 	lru->count++;
132 }
133 
134 /*--------------*/
135 
136 /*
137  * Convert a list_head pointer to an lru_entry pointer.
138  */
139 static inline struct lru_entry *to_le(struct list_head *l)
140 {
141 	return container_of(l, struct lru_entry, list);
142 }
143 
144 /*
145  * Initialize an lru_iter and add it to the list of cursors in the lru.
146  */
147 static void lru_iter_begin(struct lru *lru, struct lru_iter *it)
148 {
149 	it->lru = lru;
150 	it->stop = lru->cursor ? to_le(lru->cursor->prev) : NULL;
151 	it->e = lru->cursor ? to_le(lru->cursor) : NULL;
152 	list_add(&it->list, &lru->iterators);
153 }
154 
155 /*
156  * Remove an lru_iter from the list of cursors in the lru.
157  */
158 static inline void lru_iter_end(struct lru_iter *it)
159 {
160 	list_del(&it->list);
161 }
162 
163 /* Predicate function type to be used with lru_iter_next */
164 typedef bool (*iter_predicate)(struct lru_entry *le, void *context);
165 
166 /*
167  * Advance the cursor to the next entry that passes the
168  * predicate, and return that entry.  Returns NULL if the
169  * iteration is complete.
170  */
171 static struct lru_entry *lru_iter_next(struct lru_iter *it,
172 				       iter_predicate pred, void *context)
173 {
174 	struct lru_entry *e;
175 
176 	while (it->e) {
177 		e = it->e;
178 
179 		/* advance the cursor */
180 		if (it->e == it->stop)
181 			it->e = NULL;
182 		else
183 			it->e = to_le(it->e->list.next);
184 
185 		if (pred(e, context))
186 			return e;
187 	}
188 
189 	return NULL;
190 }
191 
192 /*
193  * Invalidate a specific lru_entry and update all cursors in
194  * the lru accordingly.
195  */
196 static void lru_iter_invalidate(struct lru *lru, struct lru_entry *e)
197 {
198 	struct lru_iter *it;
199 
200 	list_for_each_entry(it, &lru->iterators, list) {
201 		/* Move c->e forwards if necc. */
202 		if (it->e == e) {
203 			it->e = to_le(it->e->list.next);
204 			if (it->e == e)
205 				it->e = NULL;
206 		}
207 
208 		/* Move it->stop backwards if necc. */
209 		if (it->stop == e) {
210 			it->stop = to_le(it->stop->list.prev);
211 			if (it->stop == e)
212 				it->stop = NULL;
213 		}
214 	}
215 }
216 
217 /*--------------*/
218 
219 /*
220  * Remove a specific entry from the lru.
221  */
222 static void lru_remove(struct lru *lru, struct lru_entry *le)
223 {
224 	lru_iter_invalidate(lru, le);
225 	if (lru->count == 1) {
226 		lru->cursor = NULL;
227 	} else {
228 		if (lru->cursor == &le->list)
229 			lru->cursor = lru->cursor->next;
230 		list_del(&le->list);
231 	}
232 	lru->count--;
233 }
234 
235 /*
236  * Mark as referenced.
237  */
238 static inline void lru_reference(struct lru_entry *le)
239 {
240 	atomic_set(&le->referenced, 1);
241 }
242 
243 /*--------------*/
244 
/*
 * Verdict returned by the predicate handed to lru_evict() for each
 * unreferenced entry it examines.
 */
enum evict_result {
	ER_EVICT,	/* remove this entry and return it */
	ER_DONT_EVICT,	/* leave this entry alone and keep scanning */
	ER_STOP, /* stop looking for something to evict */
};
254 
255 typedef enum evict_result (*le_predicate)(struct lru_entry *le, void *context);
256 
257 static struct lru_entry *lru_evict(struct lru *lru, le_predicate pred, void *context)
258 {
259 	unsigned long tested = 0;
260 	struct list_head *h = lru->cursor;
261 	struct lru_entry *le;
262 
263 	if (!h)
264 		return NULL;
265 	/*
266 	 * In the worst case we have to loop around twice. Once to clear
267 	 * the reference flags, and then again to discover the predicate
268 	 * fails for all entries.
269 	 */
270 	while (tested < lru->count) {
271 		le = container_of(h, struct lru_entry, list);
272 
273 		if (atomic_read(&le->referenced)) {
274 			atomic_set(&le->referenced, 0);
275 		} else {
276 			tested++;
277 			switch (pred(le, context)) {
278 			case ER_EVICT:
279 				/*
280 				 * Adjust the cursor, so we start the next
281 				 * search from here.
282 				 */
283 				lru->cursor = le->list.next;
284 				lru_remove(lru, le);
285 				return le;
286 
287 			case ER_DONT_EVICT:
288 				break;
289 
290 			case ER_STOP:
291 				lru->cursor = le->list.next;
292 				return NULL;
293 			}
294 		}
295 
296 		h = h->next;
297 
298 		cond_resched();
299 	}
300 
301 	return NULL;
302 }
303 
304 /*--------------------------------------------------------------*/
305 
306 /*
307  * Buffer state bits.
308  */
309 #define B_READING	0
310 #define B_WRITING	1
311 #define B_DIRTY		2
312 
/*
 * Records which allocator produced a buffer's data, so that
 * free_buffer_data() can release it the matching way.  See the comment
 * at alloc_buffer_data.
 */
enum data_mode {
	DATA_MODE_SLAB = 0,		/* kmem_cache_alloc() */
	DATA_MODE_GET_FREE_PAGES = 1,	/* __get_free_pages() */
	DATA_MODE_VMALLOC = 2,		/* __vmalloc() */
	DATA_MODE_LIMIT = 3		/* number of modes; sizes per-mode arrays */
};
324 
325 struct dm_buffer {
326 	/* protected by the locks in dm_buffer_cache */
327 	struct rb_node node;
328 
329 	/* immutable, so don't need protecting */
330 	sector_t block;
331 	void *data;
332 	unsigned char data_mode;		/* DATA_MODE_* */
333 
334 	/*
335 	 * These two fields are used in isolation, so do not need
336 	 * a surrounding lock.
337 	 */
338 	atomic_t hold_count;
339 	unsigned long last_accessed;
340 
341 	/*
342 	 * Everything else is protected by the mutex in
343 	 * dm_bufio_client
344 	 */
345 	unsigned long state;
346 	struct lru_entry lru;
347 	unsigned char list_mode;		/* LIST_* */
348 	blk_status_t read_error;
349 	blk_status_t write_error;
350 	unsigned int dirty_start;
351 	unsigned int dirty_end;
352 	unsigned int write_start;
353 	unsigned int write_end;
354 	struct list_head write_list;
355 	struct dm_bufio_client *c;
356 	void (*end_io)(struct dm_buffer *b, blk_status_t bs);
357 #ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
358 #define MAX_STACK 10
359 	unsigned int stack_len;
360 	unsigned long stack_entries[MAX_STACK];
361 #endif
362 };
363 
364 /*--------------------------------------------------------------*/
365 
366 /*
367  * The buffer cache manages buffers, particularly:
368  *  - inc/dec of holder count
369  *  - setting the last_accessed field
370  *  - maintains clean/dirty state along with lru
371  *  - selecting buffers that match predicates
372  *
373  * It does *not* handle:
374  *  - allocation/freeing of buffers.
375  *  - IO
376  *  - Eviction or cache sizing.
377  *
378  * cache_get() and cache_put() are threadsafe, you do not need to
379  * protect these calls with a surrounding mutex.  All the other
380  * methods are not threadsafe; they do use locking primitives, but
381  * only enough to ensure get/put are threadsafe.
382  */
383 
384 struct buffer_tree {
385 	struct rw_semaphore lock;
386 	struct rb_root root;
387 } ____cacheline_aligned_in_smp;
388 
389 struct dm_buffer_cache {
390 	struct lru lru[LIST_SIZE];
391 	/*
392 	 * We spread entries across multiple trees to reduce contention
393 	 * on the locks.
394 	 */
395 	unsigned int num_locks;
396 	struct buffer_tree trees[];
397 };
398 
399 static inline unsigned int cache_index(sector_t block, unsigned int num_locks)
400 {
401 	return dm_hash_locks_index(block, num_locks);
402 }
403 
404 static inline void cache_read_lock(struct dm_buffer_cache *bc, sector_t block)
405 {
406 	down_read(&bc->trees[cache_index(block, bc->num_locks)].lock);
407 }
408 
409 static inline void cache_read_unlock(struct dm_buffer_cache *bc, sector_t block)
410 {
411 	up_read(&bc->trees[cache_index(block, bc->num_locks)].lock);
412 }
413 
414 static inline void cache_write_lock(struct dm_buffer_cache *bc, sector_t block)
415 {
416 	down_write(&bc->trees[cache_index(block, bc->num_locks)].lock);
417 }
418 
419 static inline void cache_write_unlock(struct dm_buffer_cache *bc, sector_t block)
420 {
421 	up_write(&bc->trees[cache_index(block, bc->num_locks)].lock);
422 }
423 
424 /*
425  * Sometimes we want to repeatedly get and drop locks as part of an iteration.
426  * This struct helps avoid redundant drop and gets of the same lock.
427  */
428 struct lock_history {
429 	struct dm_buffer_cache *cache;
430 	bool write;
431 	unsigned int previous;
432 	unsigned int no_previous;
433 };
434 
435 static void lh_init(struct lock_history *lh, struct dm_buffer_cache *cache, bool write)
436 {
437 	lh->cache = cache;
438 	lh->write = write;
439 	lh->no_previous = cache->num_locks;
440 	lh->previous = lh->no_previous;
441 }
442 
443 static void __lh_lock(struct lock_history *lh, unsigned int index)
444 {
445 	if (lh->write)
446 		down_write(&lh->cache->trees[index].lock);
447 	else
448 		down_read(&lh->cache->trees[index].lock);
449 }
450 
451 static void __lh_unlock(struct lock_history *lh, unsigned int index)
452 {
453 	if (lh->write)
454 		up_write(&lh->cache->trees[index].lock);
455 	else
456 		up_read(&lh->cache->trees[index].lock);
457 }
458 
459 /*
460  * Make sure you call this since it will unlock the final lock.
461  */
462 static void lh_exit(struct lock_history *lh)
463 {
464 	if (lh->previous != lh->no_previous) {
465 		__lh_unlock(lh, lh->previous);
466 		lh->previous = lh->no_previous;
467 	}
468 }
469 
470 /*
471  * Named 'next' because there is no corresponding
472  * 'up/unlock' call since it's done automatically.
473  */
474 static void lh_next(struct lock_history *lh, sector_t b)
475 {
476 	unsigned int index = cache_index(b, lh->no_previous); /* no_previous is num_locks */
477 
478 	if (lh->previous != lh->no_previous) {
479 		if (lh->previous != index) {
480 			__lh_unlock(lh, lh->previous);
481 			__lh_lock(lh, index);
482 			lh->previous = index;
483 		}
484 	} else {
485 		__lh_lock(lh, index);
486 		lh->previous = index;
487 	}
488 }
489 
490 static inline struct dm_buffer *le_to_buffer(struct lru_entry *le)
491 {
492 	return container_of(le, struct dm_buffer, lru);
493 }
494 
495 static struct dm_buffer *list_to_buffer(struct list_head *l)
496 {
497 	struct lru_entry *le = list_entry(l, struct lru_entry, list);
498 
499 	if (!le)
500 		return NULL;
501 
502 	return le_to_buffer(le);
503 }
504 
505 static void cache_init(struct dm_buffer_cache *bc, unsigned int num_locks)
506 {
507 	unsigned int i;
508 
509 	bc->num_locks = num_locks;
510 
511 	for (i = 0; i < bc->num_locks; i++) {
512 		init_rwsem(&bc->trees[i].lock);
513 		bc->trees[i].root = RB_ROOT;
514 	}
515 
516 	lru_init(&bc->lru[LIST_CLEAN]);
517 	lru_init(&bc->lru[LIST_DIRTY]);
518 }
519 
520 static void cache_destroy(struct dm_buffer_cache *bc)
521 {
522 	unsigned int i;
523 
524 	for (i = 0; i < bc->num_locks; i++)
525 		WARN_ON_ONCE(!RB_EMPTY_ROOT(&bc->trees[i].root));
526 
527 	lru_destroy(&bc->lru[LIST_CLEAN]);
528 	lru_destroy(&bc->lru[LIST_DIRTY]);
529 }
530 
531 /*--------------*/
532 
533 /*
534  * not threadsafe, or racey depending how you look at it
535  */
536 static inline unsigned long cache_count(struct dm_buffer_cache *bc, int list_mode)
537 {
538 	return bc->lru[list_mode].count;
539 }
540 
541 static inline unsigned long cache_total(struct dm_buffer_cache *bc)
542 {
543 	return cache_count(bc, LIST_CLEAN) + cache_count(bc, LIST_DIRTY);
544 }
545 
546 /*--------------*/
547 
548 /*
549  * Gets a specific buffer, indexed by block.
550  * If the buffer is found then its holder count will be incremented and
551  * lru_reference will be called.
552  *
553  * threadsafe
554  */
555 static struct dm_buffer *__cache_get(const struct rb_root *root, sector_t block)
556 {
557 	struct rb_node *n = root->rb_node;
558 	struct dm_buffer *b;
559 
560 	while (n) {
561 		b = container_of(n, struct dm_buffer, node);
562 
563 		if (b->block == block)
564 			return b;
565 
566 		n = block < b->block ? n->rb_left : n->rb_right;
567 	}
568 
569 	return NULL;
570 }
571 
572 static void __cache_inc_buffer(struct dm_buffer *b)
573 {
574 	atomic_inc(&b->hold_count);
575 	WRITE_ONCE(b->last_accessed, jiffies);
576 }
577 
578 static struct dm_buffer *cache_get(struct dm_buffer_cache *bc, sector_t block)
579 {
580 	struct dm_buffer *b;
581 
582 	cache_read_lock(bc, block);
583 	b = __cache_get(&bc->trees[cache_index(block, bc->num_locks)].root, block);
584 	if (b) {
585 		lru_reference(&b->lru);
586 		__cache_inc_buffer(b);
587 	}
588 	cache_read_unlock(bc, block);
589 
590 	return b;
591 }
592 
593 /*--------------*/
594 
595 /*
596  * Returns true if the hold count hits zero.
597  * threadsafe
598  */
599 static bool cache_put(struct dm_buffer_cache *bc, struct dm_buffer *b)
600 {
601 	bool r;
602 
603 	cache_read_lock(bc, b->block);
604 	BUG_ON(!atomic_read(&b->hold_count));
605 	r = atomic_dec_and_test(&b->hold_count);
606 	cache_read_unlock(bc, b->block);
607 
608 	return r;
609 }
610 
611 /*--------------*/
612 
613 typedef enum evict_result (*b_predicate)(struct dm_buffer *, void *);
614 
615 /*
616  * Evicts a buffer based on a predicate.  The oldest buffer that
617  * matches the predicate will be selected.  In addition to the
618  * predicate the hold_count of the selected buffer will be zero.
619  */
620 struct evict_wrapper {
621 	struct lock_history *lh;
622 	b_predicate pred;
623 	void *context;
624 };
625 
626 /*
627  * Wraps the buffer predicate turning it into an lru predicate.  Adds
628  * extra test for hold_count.
629  */
630 static enum evict_result __evict_pred(struct lru_entry *le, void *context)
631 {
632 	struct evict_wrapper *w = context;
633 	struct dm_buffer *b = le_to_buffer(le);
634 
635 	lh_next(w->lh, b->block);
636 
637 	if (atomic_read(&b->hold_count))
638 		return ER_DONT_EVICT;
639 
640 	return w->pred(b, w->context);
641 }
642 
643 static struct dm_buffer *__cache_evict(struct dm_buffer_cache *bc, int list_mode,
644 				       b_predicate pred, void *context,
645 				       struct lock_history *lh)
646 {
647 	struct evict_wrapper w = {.lh = lh, .pred = pred, .context = context};
648 	struct lru_entry *le;
649 	struct dm_buffer *b;
650 
651 	le = lru_evict(&bc->lru[list_mode], __evict_pred, &w);
652 	if (!le)
653 		return NULL;
654 
655 	b = le_to_buffer(le);
656 	/* __evict_pred will have locked the appropriate tree. */
657 	rb_erase(&b->node, &bc->trees[cache_index(b->block, bc->num_locks)].root);
658 
659 	return b;
660 }
661 
662 static struct dm_buffer *cache_evict(struct dm_buffer_cache *bc, int list_mode,
663 				     b_predicate pred, void *context)
664 {
665 	struct dm_buffer *b;
666 	struct lock_history lh;
667 
668 	lh_init(&lh, bc, true);
669 	b = __cache_evict(bc, list_mode, pred, context, &lh);
670 	lh_exit(&lh);
671 
672 	return b;
673 }
674 
675 /*--------------*/
676 
677 /*
678  * Mark a buffer as clean or dirty. Not threadsafe.
679  */
680 static void cache_mark(struct dm_buffer_cache *bc, struct dm_buffer *b, int list_mode)
681 {
682 	cache_write_lock(bc, b->block);
683 	if (list_mode != b->list_mode) {
684 		lru_remove(&bc->lru[b->list_mode], &b->lru);
685 		b->list_mode = list_mode;
686 		lru_insert(&bc->lru[b->list_mode], &b->lru);
687 	}
688 	cache_write_unlock(bc, b->block);
689 }
690 
691 /*--------------*/
692 
693 /*
694  * Runs through the lru associated with 'old_mode', if the predicate matches then
695  * it moves them to 'new_mode'.  Not threadsafe.
696  */
697 static void __cache_mark_many(struct dm_buffer_cache *bc, int old_mode, int new_mode,
698 			      b_predicate pred, void *context, struct lock_history *lh)
699 {
700 	struct lru_entry *le;
701 	struct dm_buffer *b;
702 	struct evict_wrapper w = {.lh = lh, .pred = pred, .context = context};
703 
704 	while (true) {
705 		le = lru_evict(&bc->lru[old_mode], __evict_pred, &w);
706 		if (!le)
707 			break;
708 
709 		b = le_to_buffer(le);
710 		b->list_mode = new_mode;
711 		lru_insert(&bc->lru[b->list_mode], &b->lru);
712 	}
713 }
714 
715 static void cache_mark_many(struct dm_buffer_cache *bc, int old_mode, int new_mode,
716 			    b_predicate pred, void *context)
717 {
718 	struct lock_history lh;
719 
720 	lh_init(&lh, bc, true);
721 	__cache_mark_many(bc, old_mode, new_mode, pred, context, &lh);
722 	lh_exit(&lh);
723 }
724 
725 /*--------------*/
726 
727 /*
728  * Iterates through all clean or dirty entries calling a function for each
729  * entry.  The callback may terminate the iteration early.  Not threadsafe.
730  */
731 
/*
 * Returned by an iterator callback to tell the iteration how to
 * proceed.
 */
enum it_action {
	IT_NEXT,	/* carry on with the next buffer */
	IT_COMPLETE,	/* stop the iteration now */
};
740 
741 typedef enum it_action (*iter_fn)(struct dm_buffer *b, void *context);
742 
743 static void __cache_iterate(struct dm_buffer_cache *bc, int list_mode,
744 			    iter_fn fn, void *context, struct lock_history *lh)
745 {
746 	struct lru *lru = &bc->lru[list_mode];
747 	struct lru_entry *le, *first;
748 
749 	if (!lru->cursor)
750 		return;
751 
752 	first = le = to_le(lru->cursor);
753 	do {
754 		struct dm_buffer *b = le_to_buffer(le);
755 
756 		lh_next(lh, b->block);
757 
758 		switch (fn(b, context)) {
759 		case IT_NEXT:
760 			break;
761 
762 		case IT_COMPLETE:
763 			return;
764 		}
765 		cond_resched();
766 
767 		le = to_le(le->list.next);
768 	} while (le != first);
769 }
770 
771 static void cache_iterate(struct dm_buffer_cache *bc, int list_mode,
772 			  iter_fn fn, void *context)
773 {
774 	struct lock_history lh;
775 
776 	lh_init(&lh, bc, false);
777 	__cache_iterate(bc, list_mode, fn, context, &lh);
778 	lh_exit(&lh);
779 }
780 
781 /*--------------*/
782 
783 /*
784  * Passes ownership of the buffer to the cache. Returns false if the
785  * buffer was already present (in which case ownership does not pass).
786  * eg, a race with another thread.
787  *
788  * Holder count should be 1 on insertion.
789  *
790  * Not threadsafe.
791  */
792 static bool __cache_insert(struct rb_root *root, struct dm_buffer *b)
793 {
794 	struct rb_node **new = &root->rb_node, *parent = NULL;
795 	struct dm_buffer *found;
796 
797 	while (*new) {
798 		found = container_of(*new, struct dm_buffer, node);
799 
800 		if (found->block == b->block)
801 			return false;
802 
803 		parent = *new;
804 		new = b->block < found->block ?
805 			&found->node.rb_left : &found->node.rb_right;
806 	}
807 
808 	rb_link_node(&b->node, parent, new);
809 	rb_insert_color(&b->node, root);
810 
811 	return true;
812 }
813 
814 static bool cache_insert(struct dm_buffer_cache *bc, struct dm_buffer *b)
815 {
816 	bool r;
817 
818 	if (WARN_ON_ONCE(b->list_mode >= LIST_SIZE))
819 		return false;
820 
821 	cache_write_lock(bc, b->block);
822 	BUG_ON(atomic_read(&b->hold_count) != 1);
823 	r = __cache_insert(&bc->trees[cache_index(b->block, bc->num_locks)].root, b);
824 	if (r)
825 		lru_insert(&bc->lru[b->list_mode], &b->lru);
826 	cache_write_unlock(bc, b->block);
827 
828 	return r;
829 }
830 
831 /*--------------*/
832 
833 /*
834  * Removes buffer from cache, ownership of the buffer passes back to the caller.
835  * Fails if the hold_count is not one (ie. the caller holds the only reference).
836  *
837  * Not threadsafe.
838  */
839 static bool cache_remove(struct dm_buffer_cache *bc, struct dm_buffer *b)
840 {
841 	bool r;
842 
843 	cache_write_lock(bc, b->block);
844 
845 	if (atomic_read(&b->hold_count) != 1) {
846 		r = false;
847 	} else {
848 		r = true;
849 		rb_erase(&b->node, &bc->trees[cache_index(b->block, bc->num_locks)].root);
850 		lru_remove(&bc->lru[b->list_mode], &b->lru);
851 	}
852 
853 	cache_write_unlock(bc, b->block);
854 
855 	return r;
856 }
857 
858 /*--------------*/
859 
860 typedef void (*b_release)(struct dm_buffer *);
861 
862 static struct dm_buffer *__find_next(struct rb_root *root, sector_t block)
863 {
864 	struct rb_node *n = root->rb_node;
865 	struct dm_buffer *b;
866 	struct dm_buffer *best = NULL;
867 
868 	while (n) {
869 		b = container_of(n, struct dm_buffer, node);
870 
871 		if (b->block == block)
872 			return b;
873 
874 		if (block <= b->block) {
875 			n = n->rb_left;
876 			best = b;
877 		} else {
878 			n = n->rb_right;
879 		}
880 	}
881 
882 	return best;
883 }
884 
885 static void __remove_range(struct dm_buffer_cache *bc,
886 			   struct rb_root *root,
887 			   sector_t begin, sector_t end,
888 			   b_predicate pred, b_release release)
889 {
890 	struct dm_buffer *b;
891 
892 	while (true) {
893 		cond_resched();
894 
895 		b = __find_next(root, begin);
896 		if (!b || (b->block >= end))
897 			break;
898 
899 		begin = b->block + 1;
900 
901 		if (atomic_read(&b->hold_count))
902 			continue;
903 
904 		if (pred(b, NULL) == ER_EVICT) {
905 			rb_erase(&b->node, root);
906 			lru_remove(&bc->lru[b->list_mode], &b->lru);
907 			release(b);
908 		}
909 	}
910 }
911 
912 static void cache_remove_range(struct dm_buffer_cache *bc,
913 			       sector_t begin, sector_t end,
914 			       b_predicate pred, b_release release)
915 {
916 	unsigned int i;
917 
918 	for (i = 0; i < bc->num_locks; i++) {
919 		down_write(&bc->trees[i].lock);
920 		__remove_range(bc, &bc->trees[i].root, begin, end, pred, release);
921 		up_write(&bc->trees[i].lock);
922 	}
923 }
924 
925 /*----------------------------------------------------------------*/
926 
/*
 * Linking of buffers:
 *	All buffers are linked to the dm_buffer_cache trees with their
 *	node field.
 *
 *	Clean buffers that are not being written (B_WRITING not set)
 *	are linked to lru[LIST_CLEAN] with their lru field.
 *
 *	Dirty and clean buffers that are being written are linked to
 *	lru[LIST_DIRTY] with their lru field. When the write
 *	finishes, the buffer cannot be relinked immediately (because we
 *	are in an interrupt context and relinking requires process
 *	context), so some clean-not-writing buffers can be held on
 *	lru[LIST_DIRTY] too.  They are later relinked to lru[LIST_CLEAN]
 *	in the process context.
 */
942 struct dm_bufio_client {
943 	struct block_device *bdev;
944 	unsigned int block_size;
945 	s8 sectors_per_block_bits;
946 
947 	bool no_sleep;
948 	struct mutex lock;
949 	spinlock_t spinlock;
950 
951 	int async_write_error;
952 
953 	void (*alloc_callback)(struct dm_buffer *buf);
954 	void (*write_callback)(struct dm_buffer *buf);
955 	struct kmem_cache *slab_buffer;
956 	struct kmem_cache *slab_cache;
957 	struct dm_io_client *dm_io;
958 
959 	struct list_head reserved_buffers;
960 	unsigned int need_reserved_buffers;
961 
962 	unsigned int minimum_buffers;
963 
964 	sector_t start;
965 
966 	struct shrinker shrinker;
967 	struct work_struct shrink_work;
968 	atomic_long_t need_shrink;
969 
970 	wait_queue_head_t free_buffer_wait;
971 
972 	struct list_head client_list;
973 
974 	/*
975 	 * Used by global_cleanup to sort the clients list.
976 	 */
977 	unsigned long oldest_buffer;
978 
979 	struct dm_buffer_cache cache; /* must be last member */
980 };
981 
982 static DEFINE_STATIC_KEY_FALSE(no_sleep_enabled);
983 
984 /*----------------------------------------------------------------*/
985 
986 #define dm_bufio_in_request()	(!!current->bio_list)
987 
988 static void dm_bufio_lock(struct dm_bufio_client *c)
989 {
990 	if (static_branch_unlikely(&no_sleep_enabled) && c->no_sleep)
991 		spin_lock_bh(&c->spinlock);
992 	else
993 		mutex_lock_nested(&c->lock, dm_bufio_in_request());
994 }
995 
996 static void dm_bufio_unlock(struct dm_bufio_client *c)
997 {
998 	if (static_branch_unlikely(&no_sleep_enabled) && c->no_sleep)
999 		spin_unlock_bh(&c->spinlock);
1000 	else
1001 		mutex_unlock(&c->lock);
1002 }
1003 
1004 /*----------------------------------------------------------------*/
1005 
1006 /*
1007  * Default cache size: available memory divided by the ratio.
1008  */
1009 static unsigned long dm_bufio_default_cache_size;
1010 
1011 /*
1012  * Total cache size set by the user.
1013  */
1014 static unsigned long dm_bufio_cache_size;
1015 
1016 /*
1017  * A copy of dm_bufio_cache_size because dm_bufio_cache_size can change
1018  * at any time.  If it disagrees, the user has changed cache size.
1019  */
1020 static unsigned long dm_bufio_cache_size_latch;
1021 
1022 static DEFINE_SPINLOCK(global_spinlock);
1023 
1024 /*
1025  * Buffers are freed after this timeout
1026  */
1027 static unsigned int dm_bufio_max_age = DM_BUFIO_DEFAULT_AGE_SECS;
1028 static unsigned long dm_bufio_retain_bytes = DM_BUFIO_DEFAULT_RETAIN_BYTES;
1029 
1030 static unsigned long dm_bufio_peak_allocated;
1031 static unsigned long dm_bufio_allocated_kmem_cache;
1032 static unsigned long dm_bufio_allocated_get_free_pages;
1033 static unsigned long dm_bufio_allocated_vmalloc;
1034 static unsigned long dm_bufio_current_allocated;
1035 
1036 /*----------------------------------------------------------------*/
1037 
1038 /*
1039  * The current number of clients.
1040  */
1041 static int dm_bufio_client_count;
1042 
1043 /*
1044  * The list of all clients.
1045  */
1046 static LIST_HEAD(dm_bufio_all_clients);
1047 
1048 /*
1049  * This mutex protects dm_bufio_cache_size_latch and dm_bufio_client_count
1050  */
1051 static DEFINE_MUTEX(dm_bufio_clients_lock);
1052 
1053 static struct workqueue_struct *dm_bufio_wq;
1054 static struct delayed_work dm_bufio_cleanup_old_work;
1055 static struct work_struct dm_bufio_replacement_work;
1056 
1057 
#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
/*
 * Save the current call stack (at most MAX_STACK entries, with a skip
 * count of 2) into the buffer and record how many entries were saved.
 */
static void buffer_record_stack(struct dm_buffer *b)
{
	unsigned int nr_saved;

	nr_saved = stack_trace_save(b->stack_entries, MAX_STACK, 2);
	b->stack_len = nr_saved;
}
#endif
1064 
1065 /*----------------------------------------------------------------*/
1066 
1067 static void adjust_total_allocated(struct dm_buffer *b, bool unlink)
1068 {
1069 	unsigned char data_mode;
1070 	long diff;
1071 
1072 	static unsigned long * const class_ptr[DATA_MODE_LIMIT] = {
1073 		&dm_bufio_allocated_kmem_cache,
1074 		&dm_bufio_allocated_get_free_pages,
1075 		&dm_bufio_allocated_vmalloc,
1076 	};
1077 
1078 	data_mode = b->data_mode;
1079 	diff = (long)b->c->block_size;
1080 	if (unlink)
1081 		diff = -diff;
1082 
1083 	spin_lock(&global_spinlock);
1084 
1085 	*class_ptr[data_mode] += diff;
1086 
1087 	dm_bufio_current_allocated += diff;
1088 
1089 	if (dm_bufio_current_allocated > dm_bufio_peak_allocated)
1090 		dm_bufio_peak_allocated = dm_bufio_current_allocated;
1091 
1092 	if (!unlink) {
1093 		if (dm_bufio_current_allocated > dm_bufio_cache_size)
1094 			queue_work(dm_bufio_wq, &dm_bufio_replacement_work);
1095 	}
1096 
1097 	spin_unlock(&global_spinlock);
1098 }
1099 
1100 /*
1101  * Change the number of clients and recalculate per-client limit.
1102  */
1103 static void __cache_size_refresh(void)
1104 {
1105 	if (WARN_ON(!mutex_is_locked(&dm_bufio_clients_lock)))
1106 		return;
1107 	if (WARN_ON(dm_bufio_client_count < 0))
1108 		return;
1109 
1110 	dm_bufio_cache_size_latch = READ_ONCE(dm_bufio_cache_size);
1111 
1112 	/*
1113 	 * Use default if set to 0 and report the actual cache size used.
1114 	 */
1115 	if (!dm_bufio_cache_size_latch) {
1116 		(void)cmpxchg(&dm_bufio_cache_size, 0,
1117 			      dm_bufio_default_cache_size);
1118 		dm_bufio_cache_size_latch = dm_bufio_default_cache_size;
1119 	}
1120 }
1121 
1122 /*
1123  * Allocating buffer data.
1124  *
1125  * Small buffers are allocated with kmem_cache, to use space optimally.
1126  *
1127  * For large buffers, we choose between get_free_pages and vmalloc.
1128  * Each has advantages and disadvantages.
1129  *
1130  * __get_free_pages can randomly fail if the memory is fragmented.
1131  * __vmalloc won't randomly fail, but vmalloc space is limited (it may be
1132  * as low as 128M) so using it for caching is not appropriate.
1133  *
1134  * If the allocation may fail we use __get_free_pages. Memory fragmentation
1135  * won't have a fatal effect here, but it just causes flushes of some other
1136  * buffers and more I/O will be performed. Don't use __get_free_pages if it
1137  * always fails (i.e. order > MAX_ORDER).
1138  *
1139  * If the allocation shouldn't fail we use __vmalloc. This is only for the
1140  * initial reserve allocation, so there's no risk of wasting all vmalloc
1141  * space.
1142  */
1143 static void *alloc_buffer_data(struct dm_bufio_client *c, gfp_t gfp_mask,
1144 			       unsigned char *data_mode)
1145 {
1146 	if (unlikely(c->slab_cache != NULL)) {
1147 		*data_mode = DATA_MODE_SLAB;
1148 		return kmem_cache_alloc(c->slab_cache, gfp_mask);
1149 	}
1150 
1151 	if (c->block_size <= KMALLOC_MAX_SIZE &&
1152 	    gfp_mask & __GFP_NORETRY) {
1153 		*data_mode = DATA_MODE_GET_FREE_PAGES;
1154 		return (void *)__get_free_pages(gfp_mask,
1155 						c->sectors_per_block_bits - (PAGE_SHIFT - SECTOR_SHIFT));
1156 	}
1157 
1158 	*data_mode = DATA_MODE_VMALLOC;
1159 
1160 	return __vmalloc(c->block_size, gfp_mask);
1161 }
1162 
1163 /*
1164  * Free buffer's data.
1165  */
1166 static void free_buffer_data(struct dm_bufio_client *c,
1167 			     void *data, unsigned char data_mode)
1168 {
1169 	switch (data_mode) {
1170 	case DATA_MODE_SLAB:
1171 		kmem_cache_free(c->slab_cache, data);
1172 		break;
1173 
1174 	case DATA_MODE_GET_FREE_PAGES:
1175 		free_pages((unsigned long)data,
1176 			   c->sectors_per_block_bits - (PAGE_SHIFT - SECTOR_SHIFT));
1177 		break;
1178 
1179 	case DATA_MODE_VMALLOC:
1180 		vfree(data);
1181 		break;
1182 
1183 	default:
1184 		DMCRIT("dm_bufio_free_buffer_data: bad data mode: %d",
1185 		       data_mode);
1186 		BUG();
1187 	}
1188 }
1189 
1190 /*
1191  * Allocate buffer and its data.
1192  */
1193 static struct dm_buffer *alloc_buffer(struct dm_bufio_client *c, gfp_t gfp_mask)
1194 {
1195 	struct dm_buffer *b = kmem_cache_alloc(c->slab_buffer, gfp_mask);
1196 
1197 	if (!b)
1198 		return NULL;
1199 
1200 	b->c = c;
1201 
1202 	b->data = alloc_buffer_data(c, gfp_mask, &b->data_mode);
1203 	if (!b->data) {
1204 		kmem_cache_free(c->slab_buffer, b);
1205 		return NULL;
1206 	}
1207 	adjust_total_allocated(b, false);
1208 
1209 #ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
1210 	b->stack_len = 0;
1211 #endif
1212 	return b;
1213 }
1214 
1215 /*
1216  * Free buffer and its data.
1217  */
1218 static void free_buffer(struct dm_buffer *b)
1219 {
1220 	struct dm_bufio_client *c = b->c;
1221 
1222 	adjust_total_allocated(b, true);
1223 	free_buffer_data(c, b->data, b->data_mode);
1224 	kmem_cache_free(c->slab_buffer, b);
1225 }
1226 
1227 /*
1228  *--------------------------------------------------------------------------
1229  * Submit I/O on the buffer.
1230  *
1231  * Bio interface is faster but it has some problems:
1232  *	the vector list is limited (increasing this limit increases
1233  *	memory-consumption per buffer, so it is not viable);
1234  *
1235  *	the memory must be direct-mapped, not vmalloced;
1236  *
1237  * If the buffer is small enough (up to DM_BUFIO_INLINE_VECS pages) and
1238  * it is not vmalloced, try using the bio interface.
1239  *
1240  * If the buffer is big, if it is vmalloced or if the underlying device
1241  * rejects the bio because it is too large, use dm-io layer to do the I/O.
1242  * The dm-io layer splits the I/O into multiple requests, avoiding the above
1243  * shortcomings.
1244  *--------------------------------------------------------------------------
1245  */
1246 
1247 /*
1248  * dm-io completion routine. It just calls b->bio.bi_end_io, pretending
1249  * that the request was handled directly with bio interface.
1250  */
1251 static void dmio_complete(unsigned long error, void *context)
1252 {
1253 	struct dm_buffer *b = context;
1254 
1255 	b->end_io(b, unlikely(error != 0) ? BLK_STS_IOERR : 0);
1256 }
1257 
1258 static void use_dmio(struct dm_buffer *b, enum req_op op, sector_t sector,
1259 		     unsigned int n_sectors, unsigned int offset)
1260 {
1261 	int r;
1262 	struct dm_io_request io_req = {
1263 		.bi_opf = op,
1264 		.notify.fn = dmio_complete,
1265 		.notify.context = b,
1266 		.client = b->c->dm_io,
1267 	};
1268 	struct dm_io_region region = {
1269 		.bdev = b->c->bdev,
1270 		.sector = sector,
1271 		.count = n_sectors,
1272 	};
1273 
1274 	if (b->data_mode != DATA_MODE_VMALLOC) {
1275 		io_req.mem.type = DM_IO_KMEM;
1276 		io_req.mem.ptr.addr = (char *)b->data + offset;
1277 	} else {
1278 		io_req.mem.type = DM_IO_VMA;
1279 		io_req.mem.ptr.vma = (char *)b->data + offset;
1280 	}
1281 
1282 	r = dm_io(&io_req, 1, &region, NULL);
1283 	if (unlikely(r))
1284 		b->end_io(b, errno_to_blk_status(r));
1285 }
1286 
1287 static void bio_complete(struct bio *bio)
1288 {
1289 	struct dm_buffer *b = bio->bi_private;
1290 	blk_status_t status = bio->bi_status;
1291 
1292 	bio_uninit(bio);
1293 	kfree(bio);
1294 	b->end_io(b, status);
1295 }
1296 
1297 static void use_bio(struct dm_buffer *b, enum req_op op, sector_t sector,
1298 		    unsigned int n_sectors, unsigned int offset)
1299 {
1300 	struct bio *bio;
1301 	char *ptr;
1302 	unsigned int len;
1303 
1304 	bio = bio_kmalloc(1, GFP_NOWAIT | __GFP_NORETRY | __GFP_NOWARN);
1305 	if (!bio) {
1306 		use_dmio(b, op, sector, n_sectors, offset);
1307 		return;
1308 	}
1309 	bio_init(bio, b->c->bdev, bio->bi_inline_vecs, 1, op);
1310 	bio->bi_iter.bi_sector = sector;
1311 	bio->bi_end_io = bio_complete;
1312 	bio->bi_private = b;
1313 
1314 	ptr = (char *)b->data + offset;
1315 	len = n_sectors << SECTOR_SHIFT;
1316 
1317 	__bio_add_page(bio, virt_to_page(ptr), len, offset_in_page(ptr));
1318 
1319 	submit_bio(bio);
1320 }
1321 
1322 static inline sector_t block_to_sector(struct dm_bufio_client *c, sector_t block)
1323 {
1324 	sector_t sector;
1325 
1326 	if (likely(c->sectors_per_block_bits >= 0))
1327 		sector = block << c->sectors_per_block_bits;
1328 	else
1329 		sector = block * (c->block_size >> SECTOR_SHIFT);
1330 	sector += c->start;
1331 
1332 	return sector;
1333 }
1334 
1335 static void submit_io(struct dm_buffer *b, enum req_op op,
1336 		      void (*end_io)(struct dm_buffer *, blk_status_t))
1337 {
1338 	unsigned int n_sectors;
1339 	sector_t sector;
1340 	unsigned int offset, end;
1341 
1342 	b->end_io = end_io;
1343 
1344 	sector = block_to_sector(b->c, b->block);
1345 
1346 	if (op != REQ_OP_WRITE) {
1347 		n_sectors = b->c->block_size >> SECTOR_SHIFT;
1348 		offset = 0;
1349 	} else {
1350 		if (b->c->write_callback)
1351 			b->c->write_callback(b);
1352 		offset = b->write_start;
1353 		end = b->write_end;
1354 		offset &= -DM_BUFIO_WRITE_ALIGN;
1355 		end += DM_BUFIO_WRITE_ALIGN - 1;
1356 		end &= -DM_BUFIO_WRITE_ALIGN;
1357 		if (unlikely(end > b->c->block_size))
1358 			end = b->c->block_size;
1359 
1360 		sector += offset >> SECTOR_SHIFT;
1361 		n_sectors = (end - offset) >> SECTOR_SHIFT;
1362 	}
1363 
1364 	if (b->data_mode != DATA_MODE_VMALLOC)
1365 		use_bio(b, op, sector, n_sectors, offset);
1366 	else
1367 		use_dmio(b, op, sector, n_sectors, offset);
1368 }
1369 
1370 /*
1371  *--------------------------------------------------------------
1372  * Writing dirty buffers
1373  *--------------------------------------------------------------
1374  */
1375 
/*
 * The endio routine for write.
 *
 * Set the error, clear B_WRITING bit and wake anyone who was waiting on
 * it.
 *
 * @b:      buffer whose write has completed
 * @status: completion status; non-zero is treated as a failure
 */
static void write_endio(struct dm_buffer *b, blk_status_t status)
{
	b->write_error = status;
	if (unlikely(status)) {
		struct dm_bufio_client *c = b->c;

		/*
		 * Keep only the first error on the client: the cmpxchg stores
		 * the errno only while async_write_error is still 0.
		 */
		(void)cmpxchg(&c->async_write_error, 0,
				blk_status_to_errno(status));
	}

	/* A write completion must find the buffer marked as being written. */
	BUG_ON(!test_bit(B_WRITING, &b->state));

	/*
	 * Full barriers on both sides of the clear_bit; presumably so that
	 * write_error is visible before B_WRITING is seen clear and the clear
	 * is visible before the wake-up below.
	 */
	smp_mb__before_atomic();
	clear_bit(B_WRITING, &b->state);
	smp_mb__after_atomic();

	wake_up_bit(&b->state, B_WRITING);
}
1400 
/*
 * Initiate a write on a dirty buffer, but don't wait for it.
 *
 * - If the buffer is not dirty, exit.
 * - If there some previous write going on, wait for it to finish (we can't
 *   have two writes on the same buffer simultaneously).
 * - Submit our write and don't wait on it. We set B_WRITING indicating
 *   that there is a write in progress.
 *
 * @b:          buffer to write
 * @write_list: if NULL the write is submitted immediately; otherwise the
 *              buffer is appended to this list and the caller is expected
 *              to submit it later (see __flush_write_list()).
 */
static void __write_dirty_buffer(struct dm_buffer *b,
				 struct list_head *write_list)
{
	if (!test_bit(B_DIRTY, &b->state))
		return;

	clear_bit(B_DIRTY, &b->state);
	/* Waits for any previous write, then takes B_WRITING for this one. */
	wait_on_bit_lock_io(&b->state, B_WRITING, TASK_UNINTERRUPTIBLE);

	/* Snapshot the dirty range; submit_io() writes [write_start, write_end). */
	b->write_start = b->dirty_start;
	b->write_end = b->dirty_end;

	if (!write_list)
		submit_io(b, REQ_OP_WRITE, write_endio);
	else
		list_add_tail(&b->write_list, write_list);
}
1427 
1428 static void __flush_write_list(struct list_head *write_list)
1429 {
1430 	struct blk_plug plug;
1431 
1432 	blk_start_plug(&plug);
1433 	while (!list_empty(write_list)) {
1434 		struct dm_buffer *b =
1435 			list_entry(write_list->next, struct dm_buffer, write_list);
1436 		list_del(&b->write_list);
1437 		submit_io(b, REQ_OP_WRITE, write_endio);
1438 		cond_resched();
1439 	}
1440 	blk_finish_plug(&plug);
1441 }
1442 
/*
 * Wait until any activity on the buffer finishes.  Possibly write the
 * buffer if it is dirty.  When this function finishes, there is no I/O
 * running on the buffer and the buffer is not dirty.
 *
 * The buffer must not be held by anybody (hold_count == 0); this is
 * enforced with BUG_ON.
 */
static void __make_buffer_clean(struct dm_buffer *b)
{
	BUG_ON(atomic_read(&b->hold_count));

	/* smp_load_acquire() pairs with read_endio()'s smp_mb__before_atomic() */
	if (!smp_load_acquire(&b->state))	/* fast case */
		return;

	/* Slow path: wait for a read, write out dirty data, wait for the write. */
	wait_on_bit_io(&b->state, B_READING, TASK_UNINTERRUPTIBLE);
	__write_dirty_buffer(b, NULL);
	wait_on_bit_io(&b->state, B_WRITING, TASK_UNINTERRUPTIBLE);
}
1460 
/*
 * Eviction predicate for buffers on the LIST_CLEAN list.
 *
 * @b:       candidate buffer
 * @context: the owning struct dm_bufio_client
 *
 * Returns ER_EVICT if the buffer may be evicted, ER_DONT_EVICT otherwise.
 */
static enum evict_result is_clean(struct dm_buffer *b, void *context)
{
	struct dm_bufio_client *c = context;

	/* These should never happen */
	if (WARN_ON_ONCE(test_bit(B_WRITING, &b->state)))
		return ER_DONT_EVICT;
	if (WARN_ON_ONCE(test_bit(B_DIRTY, &b->state)))
		return ER_DONT_EVICT;
	if (WARN_ON_ONCE(b->list_mode != LIST_CLEAN))
		return ER_DONT_EVICT;

	/*
	 * A no_sleep client skips buffers that are still being read;
	 * presumably because the evictor would otherwise have to wait for
	 * the read to finish (see __make_buffer_clean()).
	 */
	if (static_branch_unlikely(&no_sleep_enabled) && c->no_sleep &&
	    unlikely(test_bit(B_READING, &b->state)))
		return ER_DONT_EVICT;

	return ER_EVICT;
}
1479 
/*
 * Eviction predicate for buffers on the LIST_DIRTY list.
 *
 * @b:       candidate buffer
 * @context: unused
 *
 * Returns ER_EVICT unless one of the sanity checks below fires.
 */
static enum evict_result is_dirty(struct dm_buffer *b, void *context)
{
	/* These should never happen */
	if (WARN_ON_ONCE(test_bit(B_READING, &b->state)))
		return ER_DONT_EVICT;
	if (WARN_ON_ONCE(b->list_mode != LIST_DIRTY))
		return ER_DONT_EVICT;

	return ER_EVICT;
}
1490 
1491 /*
1492  * Find some buffer that is not held by anybody, clean it, unlink it and
1493  * return it.
1494  */
1495 static struct dm_buffer *__get_unclaimed_buffer(struct dm_bufio_client *c)
1496 {
1497 	struct dm_buffer *b;
1498 
1499 	b = cache_evict(&c->cache, LIST_CLEAN, is_clean, c);
1500 	if (b) {
1501 		/* this also waits for pending reads */
1502 		__make_buffer_clean(b);
1503 		return b;
1504 	}
1505 
1506 	if (static_branch_unlikely(&no_sleep_enabled) && c->no_sleep)
1507 		return NULL;
1508 
1509 	b = cache_evict(&c->cache, LIST_DIRTY, is_dirty, NULL);
1510 	if (b) {
1511 		__make_buffer_clean(b);
1512 		return b;
1513 	}
1514 
1515 	return NULL;
1516 }
1517 
/*
 * Wait until some other threads free some buffer or release hold count on
 * some buffer.
 *
 * This function is entered with c->lock held, drops it and regains it
 * before exiting.
 *
 * The wait is bounded: the function returns after at most 5 seconds even
 * without a wake-up, so callers must re-check their condition.
 */
static void __wait_for_free_buffer(struct dm_bufio_client *c)
{
	DECLARE_WAITQUEUE(wait, current);

	/* Queue ourselves and set the task state before the lock is dropped. */
	add_wait_queue(&c->free_buffer_wait, &wait);
	set_current_state(TASK_UNINTERRUPTIBLE);
	dm_bufio_unlock(c);

	/*
	 * It's possible to miss a wake up event since we don't always
	 * hold c->lock when wake_up is called.  So we have a timeout here,
	 * just in case.
	 */
	io_schedule_timeout(5 * HZ);

	remove_wait_queue(&c->free_buffer_wait, &wait);

	dm_bufio_lock(c);
}
1544 
/*
 * Selects how __bufio_new() / new_read() treat a block.
 */
enum new_flag {
	/* New buffer starts with state 0: no read is submitted for it. */
	NF_FRESH = 0,
	/* New buffer is read from the device and the caller waits for it. */
	NF_READ = 1,
	/* Cache lookup only: never allocates, gives up if a read is in flight. */
	NF_GET = 2,
	/* Start a read if needed but return no buffer; allocation never waits. */
	NF_PREFETCH = 3
};
1551 
/*
 * Allocate a new buffer. If the allocation is not possible, wait until
 * some other thread frees a buffer.
 *
 * May drop the lock and regain it.
 *
 * @c:  client to allocate for
 * @nf: only NF_PREFETCH changes behaviour here: a prefetch never waits and
 *      gets NULL as soon as the first non-blocking allocation fails.
 *
 * For every other @nf the function does not return until it has a buffer.
 */
static struct dm_buffer *__alloc_buffer_wait_no_callback(struct dm_bufio_client *c, enum new_flag nf)
{
	struct dm_buffer *b;
	bool tried_noio_alloc = false;

	/*
	 * dm-bufio is resistant to allocation failures (it just keeps
	 * one buffer reserved in cases all the allocations fail).
	 * So set flags to not try too hard:
	 *	GFP_NOWAIT: don't wait; if we need to sleep we'll release our
	 *		    mutex and wait ourselves.
	 *	__GFP_NORETRY: don't retry and rather return failure
	 *	__GFP_NOMEMALLOC: don't use emergency reserves
	 *	__GFP_NOWARN: don't print a warning in case of failure
	 *
	 * For debugging, if we set the cache size to 1, no new buffers will
	 * be allocated.
	 */
	while (1) {
		/* Step 1: cheap attempt that cannot sleep, lock still held. */
		if (dm_bufio_cache_size_latch != 1) {
			b = alloc_buffer(c, GFP_NOWAIT | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
			if (b)
				return b;
		}

		if (nf == NF_PREFETCH)
			return NULL;

		/*
		 * Step 2: one GFP_NOIO attempt per call, made with the client
		 * lock dropped.
		 */
		if (dm_bufio_cache_size_latch != 1 && !tried_noio_alloc) {
			dm_bufio_unlock(c);
			b = alloc_buffer(c, GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
			dm_bufio_lock(c);
			if (b)
				return b;
			tried_noio_alloc = true;
		}

		/*
		 * Step 3: take a reserved buffer; need_reserved_buffers
		 * records that the reserve must be refilled later (see
		 * __free_buffer_wake()).
		 */
		if (!list_empty(&c->reserved_buffers)) {
			b = list_to_buffer(c->reserved_buffers.next);
			list_del(&b->lru.list);
			c->need_reserved_buffers++;

			return b;
		}

		/* Step 4: recycle an existing buffer that nobody holds. */
		b = __get_unclaimed_buffer(c);
		if (b)
			return b;

		/* Step 5: nothing available; sleep (lock dropped) and retry. */
		__wait_for_free_buffer(c);
	}
}
1610 
1611 static struct dm_buffer *__alloc_buffer_wait(struct dm_bufio_client *c, enum new_flag nf)
1612 {
1613 	struct dm_buffer *b = __alloc_buffer_wait_no_callback(c, nf);
1614 
1615 	if (!b)
1616 		return NULL;
1617 
1618 	if (c->alloc_callback)
1619 		c->alloc_callback(b);
1620 
1621 	return b;
1622 }
1623 
1624 /*
1625  * Free a buffer and wake other threads waiting for free buffers.
1626  */
1627 static void __free_buffer_wake(struct dm_buffer *b)
1628 {
1629 	struct dm_bufio_client *c = b->c;
1630 
1631 	b->block = -1;
1632 	if (!c->need_reserved_buffers)
1633 		free_buffer(b);
1634 	else {
1635 		list_add(&b->lru.list, &c->reserved_buffers);
1636 		c->need_reserved_buffers--;
1637 	}
1638 
1639 	/*
1640 	 * We hold the bufio lock here, so no one can add entries to the
1641 	 * wait queue anyway.
1642 	 */
1643 	if (unlikely(waitqueue_active(&c->free_buffer_wait)))
1644 		wake_up(&c->free_buffer_wait);
1645 }
1646 
1647 static enum evict_result cleaned(struct dm_buffer *b, void *context)
1648 {
1649 	if (WARN_ON_ONCE(test_bit(B_READING, &b->state)))
1650 		return ER_DONT_EVICT; /* should never happen */
1651 
1652 	if (test_bit(B_DIRTY, &b->state) || test_bit(B_WRITING, &b->state))
1653 		return ER_DONT_EVICT;
1654 	else
1655 		return ER_EVICT;
1656 }
1657 
/*
 * Re-mark as LIST_CLEAN every buffer on LIST_DIRTY that cleaned() selects,
 * i.e. buffers that are no longer dirty and have no write in progress.
 */
static void __move_clean_buffers(struct dm_bufio_client *c)
{
	cache_mark_many(&c->cache, LIST_DIRTY, LIST_CLEAN, cleaned, NULL);
}
1662 
/*
 * Context handed to write_one() through cache_iterate().
 */
struct write_context {
	/* non-zero: stop iterating at the first buffer already being written */
	int no_wait;
	/* passed on to __write_dirty_buffer(); NULL means submit immediately */
	struct list_head *write_list;
};
1667 
1668 static enum it_action write_one(struct dm_buffer *b, void *context)
1669 {
1670 	struct write_context *wc = context;
1671 
1672 	if (wc->no_wait && test_bit(B_WRITING, &b->state))
1673 		return IT_COMPLETE;
1674 
1675 	__write_dirty_buffer(b, wc->write_list);
1676 	return IT_NEXT;
1677 }
1678 
1679 static void __write_dirty_buffers_async(struct dm_bufio_client *c, int no_wait,
1680 					struct list_head *write_list)
1681 {
1682 	struct write_context wc = {.no_wait = no_wait, .write_list = write_list};
1683 
1684 	__move_clean_buffers(c);
1685 	cache_iterate(&c->cache, LIST_DIRTY, write_one, &wc);
1686 }
1687 
1688 /*
1689  * Check if we're over watermark.
1690  * If we are over threshold_buffers, start freeing buffers.
1691  * If we're over "limit_buffers", block until we get under the limit.
1692  */
1693 static void __check_watermark(struct dm_bufio_client *c,
1694 			      struct list_head *write_list)
1695 {
1696 	if (cache_count(&c->cache, LIST_DIRTY) >
1697 	    cache_count(&c->cache, LIST_CLEAN) * DM_BUFIO_WRITEBACK_RATIO)
1698 		__write_dirty_buffers_async(c, 1, write_list);
1699 }
1700 
1701 /*
1702  *--------------------------------------------------------------
1703  * Getting a buffer
1704  *--------------------------------------------------------------
1705  */
1706 
1707 static void cache_put_and_wake(struct dm_bufio_client *c, struct dm_buffer *b)
1708 {
1709 	/*
1710 	 * Relying on waitqueue_active() is racey, but we sleep
1711 	 * with schedule_timeout anyway.
1712 	 */
1713 	if (cache_put(&c->cache, b) &&
1714 	    unlikely(waitqueue_active(&c->free_buffer_wait)))
1715 		wake_up(&c->free_buffer_wait);
1716 }
1717 
/*
 * This assumes you have already checked the cache to see if the buffer
 * is already present (it will recheck after dropping the lock for allocation).
 *
 * @c:           client
 * @block:       block number wanted
 * @nf:          NF_FRESH, NF_READ or NF_PREFETCH (NF_GET is rejected)
 * @need_submit: set to 1 when a new buffer was inserted with B_READING set,
 *               in which case the caller must submit the read; 0 otherwise
 * @write_list:  receives writes queued by __check_watermark()
 *
 * Returns the buffer with a reference held, or NULL when no buffer is
 * handed out (allocation failed for a prefetch, or a prefetch found the
 * block already cached).
 */
static struct dm_buffer *__bufio_new(struct dm_bufio_client *c, sector_t block,
				     enum new_flag nf, int *need_submit,
				     struct list_head *write_list)
{
	struct dm_buffer *b, *new_b = NULL;

	*need_submit = 0;

	/* This can't be called with NF_GET */
	if (WARN_ON_ONCE(nf == NF_GET))
		return NULL;

	new_b = __alloc_buffer_wait(c, nf);
	if (!new_b)
		return NULL;

	/*
	 * We've had a period where the mutex was unlocked, so need to
	 * recheck the buffer tree.
	 */
	b = cache_get(&c->cache, block);
	if (b) {
		/* Somebody else inserted the block meanwhile; drop our spare. */
		__free_buffer_wake(new_b);
		goto found_buffer;
	}

	__check_watermark(c, write_list);

	b = new_b;
	atomic_set(&b->hold_count, 1);
	WRITE_ONCE(b->last_accessed, jiffies);
	b->block = block;
	b->read_error = 0;
	b->write_error = 0;
	b->list_mode = LIST_CLEAN;

	if (nf == NF_FRESH)
		b->state = 0;
	else {
		b->state = 1 << B_READING;
		*need_submit = 1;
	}

	/*
	 * We mustn't insert into the cache until the B_READING state
	 * is set.  Otherwise another thread could get it and use
	 * it before it had been read.
	 */
	cache_insert(&c->cache, b);

	return b;

found_buffer:
	if (nf == NF_PREFETCH) {
		cache_put_and_wake(c, b);
		return NULL;
	}

	/*
	 * Note: it is essential that we don't wait for the buffer to be
	 * read if dm_bufio_get function is used. Both dm_bufio_get and
	 * dm_bufio_prefetch can be used in the driver request routine.
	 * If the user called both dm_bufio_prefetch and dm_bufio_get on
	 * the same buffer, it would deadlock if we waited.
	 *
	 * NOTE(review): nf == NF_GET was already rejected at the top of this
	 * function, so this branch looks unreachable from here.
	 */
	if (nf == NF_GET && unlikely(test_bit_acquire(B_READING, &b->state))) {
		cache_put_and_wake(c, b);
		return NULL;
	}

	return b;
}
1794 
/*
 * The endio routine for reading: set the error, clear the bit and wake up
 * anyone waiting on the buffer.
 *
 * @b:      buffer whose read has completed
 * @status: completion status, stored in b->read_error
 */
static void read_endio(struct dm_buffer *b, blk_status_t status)
{
	b->read_error = status;

	/* A read completion must find the buffer marked as being read. */
	BUG_ON(!test_bit(B_READING, &b->state));

	/*
	 * The barrier before the clear is what the smp_load_acquire() in
	 * __make_buffer_clean() pairs with (see the comment there).
	 */
	smp_mb__before_atomic();
	clear_bit(B_READING, &b->state);
	smp_mb__after_atomic();

	wake_up_bit(&b->state, B_READING);
}
1811 
/*
 * A common routine for dm_bufio_new and dm_bufio_read.  Operation of these
 * functions is similar except that dm_bufio_new doesn't read the
 * buffer from the disk (assuming that the caller overwrites all the data
 * and uses dm_bufio_mark_buffer_dirty to write new data back).
 *
 * It also serves dm_bufio_get (NF_GET), which only looks in the cache.
 *
 * @c:     client
 * @block: block number
 * @nf:    requested mode
 * @bp:    set to the buffer on success, to NULL otherwise
 *
 * Returns the buffer's data pointer on success, NULL when no buffer is
 * handed out, or an ERR_PTR() carrying the errno of a failed read.
 */
static void *new_read(struct dm_bufio_client *c, sector_t block,
		      enum new_flag nf, struct dm_buffer **bp)
{
	int need_submit = 0;
	struct dm_buffer *b;

	LIST_HEAD(write_list);

	*bp = NULL;

	/*
	 * Fast path, hopefully the block is already in the cache.  No need
	 * to get the client lock for this.
	 */
	b = cache_get(&c->cache, block);
	if (b) {
		if (nf == NF_PREFETCH) {
			cache_put_and_wake(c, b);
			return NULL;
		}

		/*
		 * Note: it is essential that we don't wait for the buffer to be
		 * read if dm_bufio_get function is used. Both dm_bufio_get and
		 * dm_bufio_prefetch can be used in the driver request routine.
		 * If the user called both dm_bufio_prefetch and dm_bufio_get on
		 * the same buffer, it would deadlock if we waited.
		 */
		if (nf == NF_GET && unlikely(test_bit_acquire(B_READING, &b->state))) {
			cache_put_and_wake(c, b);
			return NULL;
		}
	}

	if (!b) {
		/* A pure lookup never allocates. */
		if (nf == NF_GET)
			return NULL;

		dm_bufio_lock(c);
		b = __bufio_new(c, block, nf, &need_submit, &write_list);
		dm_bufio_unlock(c);
	}

#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
	if (b && (atomic_read(&b->hold_count) == 1))
		buffer_record_stack(b);
#endif

	/* Submit writes queued by __bufio_new(), now that the lock is released. */
	__flush_write_list(&write_list);

	if (!b)
		return NULL;

	if (need_submit)
		submit_io(b, REQ_OP_READ, read_endio);

	/* Wait for the read to finish, whoever submitted it. */
	wait_on_bit_io(&b->state, B_READING, TASK_UNINTERRUPTIBLE);

	if (b->read_error) {
		int error = blk_status_to_errno(b->read_error);

		dm_bufio_release(b);

		return ERR_PTR(error);
	}

	*bp = b;

	return b->data;
}
1888 
/*
 * Look a block up in the cache without doing any I/O for it.
 *
 * Calls new_read() with NF_GET: returns NULL when the block is not cached
 * or is still being read, an ERR_PTR() when the cached buffer carries a
 * read error, and otherwise the data pointer with *bp set to the buffer.
 */
void *dm_bufio_get(struct dm_bufio_client *c, sector_t block,
		   struct dm_buffer **bp)
{
	return new_read(c, block, NF_GET, bp);
}
EXPORT_SYMBOL_GPL(dm_bufio_get);
1895 
/*
 * Get a block, reading it from the device if it is not cached, and wait
 * until the data is available (new_read() with NF_READ).
 *
 * Returns ERR_PTR(-EINVAL), with a one-time warning, when
 * dm_bufio_in_request() is true; otherwise whatever new_read() returns.
 */
void *dm_bufio_read(struct dm_bufio_client *c, sector_t block,
		    struct dm_buffer **bp)
{
	if (WARN_ON_ONCE(dm_bufio_in_request()))
		return ERR_PTR(-EINVAL);

	return new_read(c, block, NF_READ, bp);
}
EXPORT_SYMBOL_GPL(dm_bufio_read);
1905 
/*
 * Get a buffer for a block without reading it from the device when it is
 * not already cached (new_read() with NF_FRESH).
 *
 * Returns ERR_PTR(-EINVAL), with a one-time warning, when
 * dm_bufio_in_request() is true; otherwise whatever new_read() returns.
 */
void *dm_bufio_new(struct dm_bufio_client *c, sector_t block,
		   struct dm_buffer **bp)
{
	if (WARN_ON_ONCE(dm_bufio_in_request()))
		return ERR_PTR(-EINVAL);

	return new_read(c, block, NF_FRESH, bp);
}
EXPORT_SYMBOL_GPL(dm_bufio_new);
1915 
/*
 * Start reads for @n_blocks blocks beginning at @block, without waiting for
 * them and without keeping references on the buffers.
 *
 * Blocks already in the cache are skipped. The reads are submitted under a
 * block plug. Does nothing (after a one-time warning) when
 * dm_bufio_in_request() is true.
 */
void dm_bufio_prefetch(struct dm_bufio_client *c,
		       sector_t block, unsigned int n_blocks)
{
	struct blk_plug plug;

	LIST_HEAD(write_list);

	if (WARN_ON_ONCE(dm_bufio_in_request()))
		return; /* should never happen */

	blk_start_plug(&plug);

	for (; n_blocks--; block++) {
		int need_submit;
		struct dm_buffer *b;

		b = cache_get(&c->cache, block);
		if (b) {
			/* already in cache */
			cache_put_and_wake(c, b);
			continue;
		}

		dm_bufio_lock(c);
		b = __bufio_new(c, block, NF_PREFETCH, &need_submit,
				&write_list);
		if (unlikely(!list_empty(&write_list))) {
			/*
			 * __bufio_new() queued writeback: submit it with the
			 * lock dropped and outside the current plug.
			 */
			dm_bufio_unlock(c);
			blk_finish_plug(&plug);
			__flush_write_list(&write_list);
			blk_start_plug(&plug);
			dm_bufio_lock(c);
		}
		if (unlikely(b != NULL)) {
			dm_bufio_unlock(c);

			if (need_submit)
				submit_io(b, REQ_OP_READ, read_endio);
			/* Prefetch keeps no reference on the buffer. */
			dm_bufio_release(b);

			cond_resched();

			/* Last block: leave without re-taking the lock. */
			if (!n_blocks)
				goto flush_plug;
			dm_bufio_lock(c);
		}
		dm_bufio_unlock(c);
	}

flush_plug:
	blk_finish_plug(&plug);
}
EXPORT_SYMBOL_GPL(dm_bufio_prefetch);
1969 
/*
 * Release a reference on a buffer.
 *
 * A buffer that carries a read or write error and has no I/O pending and
 * no dirty data is removed from the cache and freed instead, provided
 * cache_remove() succeeds.
 */
void dm_bufio_release(struct dm_buffer *b)
{
	struct dm_bufio_client *c = b->c;

	/*
	 * If there were errors on the buffer, and the buffer is not
	 * to be written, free the buffer. There is no point in caching
	 * invalid buffer.
	 */
	if ((b->read_error || b->write_error) &&
	    !test_bit_acquire(B_READING, &b->state) &&
	    !test_bit(B_WRITING, &b->state) &&
	    !test_bit(B_DIRTY, &b->state)) {
		dm_bufio_lock(c);

		/* cache remove can fail if there are other holders */
		if (cache_remove(&c->cache, b)) {
			__free_buffer_wake(b);
			dm_bufio_unlock(c);
			return;
		}

		dm_bufio_unlock(c);
	}

	/* Normal case, or removal failed: just drop our reference. */
	cache_put_and_wake(c, b);
}
EXPORT_SYMBOL_GPL(dm_bufio_release);
1998 
1999 void dm_bufio_mark_partial_buffer_dirty(struct dm_buffer *b,
2000 					unsigned int start, unsigned int end)
2001 {
2002 	struct dm_bufio_client *c = b->c;
2003 
2004 	BUG_ON(start >= end);
2005 	BUG_ON(end > b->c->block_size);
2006 
2007 	dm_bufio_lock(c);
2008 
2009 	BUG_ON(test_bit(B_READING, &b->state));
2010 
2011 	if (!test_and_set_bit(B_DIRTY, &b->state)) {
2012 		b->dirty_start = start;
2013 		b->dirty_end = end;
2014 		cache_mark(&c->cache, b, LIST_DIRTY);
2015 	} else {
2016 		if (start < b->dirty_start)
2017 			b->dirty_start = start;
2018 		if (end > b->dirty_end)
2019 			b->dirty_end = end;
2020 	}
2021 
2022 	dm_bufio_unlock(c);
2023 }
2024 EXPORT_SYMBOL_GPL(dm_bufio_mark_partial_buffer_dirty);
2025 
/*
 * Mark the whole buffer dirty, i.e. the range [0, block_size).
 */
void dm_bufio_mark_buffer_dirty(struct dm_buffer *b)
{
	dm_bufio_mark_partial_buffer_dirty(b, 0, b->c->block_size);
}
EXPORT_SYMBOL_GPL(dm_bufio_mark_buffer_dirty);
2031 
/*
 * Start writing all dirty buffers of the client without waiting for the
 * writes to finish.
 *
 * The buffers are collected on a local list under the client lock and
 * submitted after the lock is released. Does nothing (after a one-time
 * warning) when dm_bufio_in_request() is true.
 */
void dm_bufio_write_dirty_buffers_async(struct dm_bufio_client *c)
{
	LIST_HEAD(write_list);

	if (WARN_ON_ONCE(dm_bufio_in_request()))
		return; /* should never happen */

	dm_bufio_lock(c);
	__write_dirty_buffers_async(c, 0, &write_list);
	dm_bufio_unlock(c);
	__flush_write_list(&write_list);
}
EXPORT_SYMBOL_GPL(dm_bufio_write_dirty_buffers_async);
2045 
/*
 * (This comment describes dm_bufio_write_dirty_buffers(), which is defined
 * after the is_writing() helper below.)
 *
 * For performance, it is essential that the buffers are written asynchronously
 * and simultaneously (so that the block layer can merge the writes) and then
 * waited upon.
 *
 * Finally, we flush hardware disk cache.
 */
/*
 * lru_iter_next() predicate: true when the buffer behind @e has a write in
 * progress (B_WRITING set). @context is unused.
 */
static bool is_writing(struct lru_entry *e, void *context)
{
	struct dm_buffer *b = le_to_buffer(e);

	return test_bit(B_WRITING, &b->state);
}
2059 
/*
 * Write all dirty buffers, wait for the writes and flush the device.
 *
 * Returns the asynchronous write error recorded on the client if there is
 * one (clearing it in the process), otherwise the result of
 * dm_bufio_issue_flush(). The flush is issued in either case.
 */
int dm_bufio_write_dirty_buffers(struct dm_bufio_client *c)
{
	int a, f;
	unsigned long nr_buffers;
	struct lru_entry *e;
	struct lru_iter it;

	LIST_HEAD(write_list);

	/* Phase 1: start all the writes, submitting them without the lock. */
	dm_bufio_lock(c);
	__write_dirty_buffers_async(c, 0, &write_list);
	dm_bufio_unlock(c);
	__flush_write_list(&write_list);
	dm_bufio_lock(c);

	/* Phase 2: wait for every buffer on the dirty list that is being written. */
	nr_buffers = cache_count(&c->cache, LIST_DIRTY);
	lru_iter_begin(&c->cache.lru[LIST_DIRTY], &it);
	while ((e = lru_iter_next(&it, is_writing, c))) {
		struct dm_buffer *b = le_to_buffer(e);
		/* Hold a reference while we wait on the buffer. */
		__cache_inc_buffer(b);

		BUG_ON(test_bit(B_READING, &b->state));

		/*
		 * For as many buffers as were dirty at the start, wait with
		 * the lock dropped; after that, wait with the lock held
		 * (presumably to bound the loop when buffers keep being
		 * dirtied concurrently).
		 */
		if (nr_buffers) {
			nr_buffers--;
			dm_bufio_unlock(c);
			wait_on_bit_io(&b->state, B_WRITING, TASK_UNINTERRUPTIBLE);
			dm_bufio_lock(c);
		} else {
			wait_on_bit_io(&b->state, B_WRITING, TASK_UNINTERRUPTIBLE);
		}

		/* Not re-dirtied and not being rewritten: it is clean now. */
		if (!test_bit(B_DIRTY, &b->state) && !test_bit(B_WRITING, &b->state))
			cache_mark(&c->cache, b, LIST_CLEAN);

		cache_put_and_wake(c, b);

		cond_resched();
	}
	lru_iter_end(&it);

	wake_up(&c->free_buffer_wait);
	dm_bufio_unlock(c);

	/* Phase 3: collect (and reset) the async error, then flush the device. */
	a = xchg(&c->async_write_error, 0);
	f = dm_bufio_issue_flush(c);
	if (a)
		return a;

	return f;
}
EXPORT_SYMBOL_GPL(dm_bufio_write_dirty_buffers);
2112 
/*
 * Use dm-io to send an empty barrier to flush the device.
 *
 * The request is a zero-length REQ_OP_WRITE with REQ_PREFLUSH | REQ_SYNC
 * and no data. No notify callback is set on the request (presumably making
 * dm_io() synchronous — confirm against dm-io).
 *
 * Returns -EINVAL (after a one-time warning) when dm_bufio_in_request() is
 * true, otherwise the result of dm_io().
 */
int dm_bufio_issue_flush(struct dm_bufio_client *c)
{
	struct dm_io_request io_req = {
		.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC,
		.mem.type = DM_IO_KMEM,
		.mem.ptr.addr = NULL,
		.client = c->dm_io,
	};
	struct dm_io_region io_reg = {
		.bdev = c->bdev,
		.sector = 0,
		.count = 0,
	};

	if (WARN_ON_ONCE(dm_bufio_in_request()))
		return -EINVAL;

	return dm_io(&io_req, 1, &io_reg, NULL);
}
EXPORT_SYMBOL_GPL(dm_bufio_issue_flush);
2136 
2137 /*
2138  * Use dm-io to send a discard request to flush the device.
2139  */
2140 int dm_bufio_issue_discard(struct dm_bufio_client *c, sector_t block, sector_t count)
2141 {
2142 	struct dm_io_request io_req = {
2143 		.bi_opf = REQ_OP_DISCARD | REQ_SYNC,
2144 		.mem.type = DM_IO_KMEM,
2145 		.mem.ptr.addr = NULL,
2146 		.client = c->dm_io,
2147 	};
2148 	struct dm_io_region io_reg = {
2149 		.bdev = c->bdev,
2150 		.sector = block_to_sector(c, block),
2151 		.count = block_to_sector(c, count),
2152 	};
2153 
2154 	if (WARN_ON_ONCE(dm_bufio_in_request()))
2155 		return -EINVAL; /* discards are optional */
2156 
2157 	return dm_io(&io_req, 1, &io_reg, NULL);
2158 }
2159 EXPORT_SYMBOL_GPL(dm_bufio_issue_discard);
2160 
2161 static bool forget_buffer(struct dm_bufio_client *c, sector_t block)
2162 {
2163 	struct dm_buffer *b;
2164 
2165 	b = cache_get(&c->cache, block);
2166 	if (b) {
2167 		if (likely(!smp_load_acquire(&b->state))) {
2168 			if (cache_remove(&c->cache, b))
2169 				__free_buffer_wake(b);
2170 			else
2171 				cache_put_and_wake(c, b);
2172 		} else {
2173 			cache_put_and_wake(c, b);
2174 		}
2175 	}
2176 
2177 	return b ? true : false;
2178 }
2179 
/*
 * Free the given buffer.
 *
 * This is just a hint, if the buffer is in use or dirty, this function
 * does nothing.
 *
 * Takes the client lock around forget_buffer().
 */
void dm_bufio_forget(struct dm_bufio_client *c, sector_t block)
{
	dm_bufio_lock(c);
	forget_buffer(c, block);
	dm_bufio_unlock(c);
}
EXPORT_SYMBOL_GPL(dm_bufio_forget);
2193 
2194 static enum evict_result idle(struct dm_buffer *b, void *context)
2195 {
2196 	return b->state ? ER_DONT_EVICT : ER_EVICT;
2197 }
2198 
/*
 * Drop cached buffers for the @n_blocks blocks starting at @block.
 *
 * Under the client lock, calls cache_remove_range() with the bounds @block
 * and @block + @n_blocks, using idle() as the predicate and
 * __free_buffer_wake() to dispose of each removed buffer.
 */
void dm_bufio_forget_buffers(struct dm_bufio_client *c, sector_t block, sector_t n_blocks)
{
	dm_bufio_lock(c);
	cache_remove_range(&c->cache, block, block + n_blocks, idle, __free_buffer_wake);
	dm_bufio_unlock(c);
}
EXPORT_SYMBOL_GPL(dm_bufio_forget_buffers);
2206 
/*
 * Set the client's minimum_buffers value to @n. The store is a plain
 * assignment; no lock is taken here.
 */
void dm_bufio_set_minimum_buffers(struct dm_bufio_client *c, unsigned int n)
{
	c->minimum_buffers = n;
}
EXPORT_SYMBOL_GPL(dm_bufio_set_minimum_buffers);
2212 
/*
 * Return the client's block size (c->block_size; used as a byte count
 * elsewhere in this file).
 */
unsigned int dm_bufio_get_block_size(struct dm_bufio_client *c)
{
	return c->block_size;
}
EXPORT_SYMBOL_GPL(dm_bufio_get_block_size);
2218 
2219 sector_t dm_bufio_get_device_size(struct dm_bufio_client *c)
2220 {
2221 	sector_t s = bdev_nr_sectors(c->bdev);
2222 
2223 	if (s >= c->start)
2224 		s -= c->start;
2225 	else
2226 		s = 0;
2227 	if (likely(c->sectors_per_block_bits >= 0))
2228 		s >>= c->sectors_per_block_bits;
2229 	else
2230 		sector_div(s, c->block_size >> SECTOR_SHIFT);
2231 	return s;
2232 }
2233 EXPORT_SYMBOL_GPL(dm_bufio_get_device_size);
2234 
/*
 * Return the dm-io client this bufio client uses for its dm_io() requests.
 */
struct dm_io_client *dm_bufio_get_dm_io_client(struct dm_bufio_client *c)
{
	return c->dm_io;
}
EXPORT_SYMBOL_GPL(dm_bufio_get_dm_io_client);
2240 
/*
 * Return the block number the buffer currently holds (b->block).
 */
sector_t dm_bufio_get_block_number(struct dm_buffer *b)
{
	return b->block;
}
EXPORT_SYMBOL_GPL(dm_bufio_get_block_number);
2246 
/*
 * Return the buffer's data area (b->data).
 */
void *dm_bufio_get_block_data(struct dm_buffer *b)
{
	return b->data;
}
EXPORT_SYMBOL_GPL(dm_bufio_get_block_data);
2252 
/*
 * Return the address immediately following the struct dm_buffer.
 * Presumably the client's per-buffer auxiliary data is laid out there; the
 * allocation that provides the space is not visible here — confirm.
 */
void *dm_bufio_get_aux_data(struct dm_buffer *b)
{
	return b + 1;
}
EXPORT_SYMBOL_GPL(dm_bufio_get_aux_data);
2258 
/*
 * Return the client that owns the buffer (b->c).
 */
struct dm_bufio_client *dm_bufio_get_client(struct dm_buffer *b)
{
	return b->c;
}
EXPORT_SYMBOL_GPL(dm_bufio_get_client);
2264 
/*
 * cache_iterate() callback used by drop_buffers() to report a buffer that
 * is still in the cache.
 *
 * @context points to a bool that limits the WARN_ON to the first buffer
 * reported through it; the DMERR line is printed for every buffer.
 */
static enum it_action warn_leak(struct dm_buffer *b, void *context)
{
	bool *warned = context;

	/* Fires only while *warned is still false, i.e. once per flag. */
	WARN_ON(!(*warned));
	*warned = true;
	DMERR("leaked buffer %llx, hold count %u, list %d",
	      (unsigned long long)b->block, atomic_read(&b->hold_count), b->list_mode);
#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
	stack_trace_print(b->stack_entries, b->stack_len, 1);
	/* mark unclaimed to avoid WARN_ON at end of drop_buffers() */
	atomic_set(&b->hold_count, 0);
#endif
	return IT_NEXT;
}
2280 
/*
 * Free every buffer of the client and complain about any that cannot be
 * freed.
 *
 * Dirty buffers are written out first. Buffers that remain in the cache
 * after all unclaimed ones were freed are reported through warn_leak().
 * Does nothing (after a warning) when dm_bufio_in_request() is true.
 */
static void drop_buffers(struct dm_bufio_client *c)
{
	int i;
	struct dm_buffer *b;

	if (WARN_ON(dm_bufio_in_request()))
		return; /* should never happen */

	/*
	 * An optimization so that the buffers are not written one-by-one.
	 */
	dm_bufio_write_dirty_buffers_async(c);

	dm_bufio_lock(c);

	while ((b = __get_unclaimed_buffer(c)))
		__free_buffer_wake(b);

	/* Whatever is left is reported; the flag is reset for each list. */
	for (i = 0; i < LIST_SIZE; i++) {
		bool warned = false;

		cache_iterate(&c->cache, i, warn_leak, &warned);
	}

#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
	/* warn_leak() zeroed the hold counts, so these can be reclaimed now. */
	while ((b = __get_unclaimed_buffer(c)))
		__free_buffer_wake(b);
#endif

	for (i = 0; i < LIST_SIZE; i++)
		WARN_ON(cache_count(&c->cache, i));

	dm_bufio_unlock(c);
}
2315 
2316 static unsigned long get_retain_buffers(struct dm_bufio_client *c)
2317 {
2318 	unsigned long retain_bytes = READ_ONCE(dm_bufio_retain_bytes);
2319 
2320 	if (likely(c->sectors_per_block_bits >= 0))
2321 		retain_bytes >>= c->sectors_per_block_bits + SECTOR_SHIFT;
2322 	else
2323 		retain_bytes /= c->block_size;
2324 
2325 	return retain_bytes;
2326 }
2327 
2328 static void __scan(struct dm_bufio_client *c)
2329 {
2330 	int l;
2331 	struct dm_buffer *b;
2332 	unsigned long freed = 0;
2333 	unsigned long retain_target = get_retain_buffers(c);
2334 	unsigned long count = cache_total(&c->cache);
2335 
2336 	for (l = 0; l < LIST_SIZE; l++) {
2337 		while (true) {
2338 			if (count - freed <= retain_target)
2339 				atomic_long_set(&c->need_shrink, 0);
2340 			if (!atomic_long_read(&c->need_shrink))
2341 				break;
2342 
2343 			b = cache_evict(&c->cache, l,
2344 					l == LIST_CLEAN ? is_clean : is_dirty, c);
2345 			if (!b)
2346 				break;
2347 
2348 			__make_buffer_clean(b);
2349 			__free_buffer_wake(b);
2350 
2351 			atomic_long_dec(&c->need_shrink);
2352 			freed++;
2353 			cond_resched();
2354 		}
2355 	}
2356 }
2357 
2358 static void shrink_work(struct work_struct *w)
2359 {
2360 	struct dm_bufio_client *c = container_of(w, struct dm_bufio_client, shrink_work);
2361 
2362 	dm_bufio_lock(c);
2363 	__scan(c);
2364 	dm_bufio_unlock(c);
2365 }
2366 
2367 static unsigned long dm_bufio_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
2368 {
2369 	struct dm_bufio_client *c;
2370 
2371 	c = container_of(shrink, struct dm_bufio_client, shrinker);
2372 	atomic_long_add(sc->nr_to_scan, &c->need_shrink);
2373 	queue_work(dm_bufio_wq, &c->shrink_work);
2374 
2375 	return sc->nr_to_scan;
2376 }
2377 
2378 static unsigned long dm_bufio_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
2379 {
2380 	struct dm_bufio_client *c = container_of(shrink, struct dm_bufio_client, shrinker);
2381 	unsigned long count = cache_total(&c->cache);
2382 	unsigned long retain_target = get_retain_buffers(c);
2383 	unsigned long queued_for_cleanup = atomic_long_read(&c->need_shrink);
2384 
2385 	if (unlikely(count < retain_target))
2386 		count = 0;
2387 	else
2388 		count -= retain_target;
2389 
2390 	if (unlikely(count < queued_for_cleanup))
2391 		count = 0;
2392 	else
2393 		count -= queued_for_cleanup;
2394 
2395 	return count;
2396 }
2397 
/*
 * Create the buffering interface
 *
 * @bdev:             block device the client operates on (stored in c->bdev)
 * @block_size:       buffer size in bytes; must be non-zero and a multiple
 *                    of 1 << SECTOR_SHIFT
 * @reserved_buffers: number of buffers allocated up front
 * @aux_size:         extra bytes allocated behind every struct dm_buffer
 *                    (see dm_bufio_get_aux_data())
 * @alloc_callback:   stored in the client; invoked elsewhere in this file
 * @write_callback:   stored in the client; invoked elsewhere in this file
 * @flags:            DM_BUFIO_CLIENT_NO_SLEEP marks the client no_sleep
 *
 * Returns the new client, or an ERR_PTR(): -EINVAL for a bad block size,
 * -ENOMEM for allocation failures, or the error reported by
 * dm_io_client_create() / register_shrinker().
 */
struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsigned int block_size,
					       unsigned int reserved_buffers, unsigned int aux_size,
					       void (*alloc_callback)(struct dm_buffer *),
					       void (*write_callback)(struct dm_buffer *),
					       unsigned int flags)
{
	int r;
	unsigned int num_locks;
	struct dm_bufio_client *c;
	/* "dm_bufio_buffer-" (16 chars) + up to 10 digits of a %u + NUL */
	char slab_name[27];

	if (!block_size || block_size & ((1 << SECTOR_SHIFT) - 1)) {
		DMERR("%s: block size not specified or is not multiple of 512b", __func__);
		r = -EINVAL;
		goto bad_client;
	}

	/* The client is allocated together with num_locks trailing buffer_tree slots. */
	num_locks = dm_num_hash_locks();
	c = kzalloc(sizeof(*c) + (num_locks * sizeof(struct buffer_tree)), GFP_KERNEL);
	if (!c) {
		r = -ENOMEM;
		goto bad_client;
	}
	cache_init(&c->cache, num_locks);

	c->bdev = bdev;
	c->block_size = block_size;
	/* -1 flags a non power-of-two block size; users then divide instead of shifting. */
	if (is_power_of_2(block_size))
		c->sectors_per_block_bits = __ffs(block_size) - SECTOR_SHIFT;
	else
		c->sectors_per_block_bits = -1;

	c->alloc_callback = alloc_callback;
	c->write_callback = write_callback;

	if (flags & DM_BUFIO_CLIENT_NO_SLEEP) {
		c->no_sleep = true;
		/* Undone by static_branch_dec() on the error path and in client_destroy. */
		static_branch_inc(&no_sleep_enabled);
	}

	mutex_init(&c->lock);
	spin_lock_init(&c->spinlock);
	INIT_LIST_HEAD(&c->reserved_buffers);
	c->need_reserved_buffers = reserved_buffers;

	dm_bufio_set_minimum_buffers(c, DM_BUFIO_MIN_BUFFERS);

	init_waitqueue_head(&c->free_buffer_wait);
	c->async_write_error = 0;

	c->dm_io = dm_io_client_create();
	if (IS_ERR(c->dm_io)) {
		r = PTR_ERR(c->dm_io);
		goto bad_dm_io;
	}

	/*
	 * A dedicated slab for the block data is created only for block sizes
	 * up to KMALLOC_MAX_SIZE that are smaller than a page or not a power
	 * of two.  Otherwise c->slab_cache keeps the NULL it got from kzalloc().
	 */
	if (block_size <= KMALLOC_MAX_SIZE &&
	    (block_size < PAGE_SIZE || !is_power_of_2(block_size))) {
		/* Align to the largest power of two dividing block_size, capped at a page. */
		unsigned int align = min(1U << __ffs(block_size), (unsigned int)PAGE_SIZE);

		snprintf(slab_name, sizeof(slab_name), "dm_bufio_cache-%u", block_size);
		c->slab_cache = kmem_cache_create(slab_name, block_size, align,
						  SLAB_RECLAIM_ACCOUNT, NULL);
		if (!c->slab_cache) {
			r = -ENOMEM;
			goto bad;
		}
	}
	/* Slab for the buffer headers, each followed by aux_size bytes. */
	if (aux_size)
		snprintf(slab_name, sizeof(slab_name), "dm_bufio_buffer-%u", aux_size);
	else
		snprintf(slab_name, sizeof(slab_name), "dm_bufio_buffer");
	c->slab_buffer = kmem_cache_create(slab_name, sizeof(struct dm_buffer) + aux_size,
					   0, SLAB_RECLAIM_ACCOUNT, NULL);
	if (!c->slab_buffer) {
		r = -ENOMEM;
		goto bad;
	}

	/*
	 * Preallocate the reserved buffers.
	 * NOTE(review): the loop relies on __free_buffer_wake() (defined
	 * outside this section) parking the buffer on c->reserved_buffers and
	 * decrementing c->need_reserved_buffers - confirm there.
	 */
	while (c->need_reserved_buffers) {
		struct dm_buffer *b = alloc_buffer(c, GFP_KERNEL);

		if (!b) {
			r = -ENOMEM;
			goto bad;
		}
		__free_buffer_wake(b);
	}

	INIT_WORK(&c->shrink_work, shrink_work);
	atomic_long_set(&c->need_shrink, 0);

	c->shrinker.count_objects = dm_bufio_shrink_count;
	c->shrinker.scan_objects = dm_bufio_shrink_scan;
	c->shrinker.seeks = 1;
	c->shrinker.batch = 0;
	r = register_shrinker(&c->shrinker, "dm-bufio:(%u:%u)",
			      MAJOR(bdev->bd_dev), MINOR(bdev->bd_dev));
	if (r)
		goto bad;

	/* Publish the client on the global list and recompute the cache size limits. */
	mutex_lock(&dm_bufio_clients_lock);
	dm_bufio_client_count++;
	list_add(&c->client_list, &dm_bufio_all_clients);
	__cache_size_refresh();
	mutex_unlock(&dm_bufio_clients_lock);

	return c;

	/* Error unwinding: each label undoes what was set up before the jump to it. */
bad:
	while (!list_empty(&c->reserved_buffers)) {
		struct dm_buffer *b = list_to_buffer(c->reserved_buffers.next);

		list_del(&b->lru.list);
		free_buffer(b);
	}
	/* Either slab pointer may still be NULL here, depending on where we failed. */
	kmem_cache_destroy(c->slab_cache);
	kmem_cache_destroy(c->slab_buffer);
	dm_io_client_destroy(c->dm_io);
bad_dm_io:
	mutex_destroy(&c->lock);
	if (c->no_sleep)
		static_branch_dec(&no_sleep_enabled);
	kfree(c);
bad_client:
	return ERR_PTR(r);
}
EXPORT_SYMBOL_GPL(dm_bufio_client_create);
2529 
2530 /*
2531  * Free the buffering interface.
2532  * It is required that there are no references on any buffers.
2533  */
2534 void dm_bufio_client_destroy(struct dm_bufio_client *c)
2535 {
2536 	unsigned int i;
2537 
2538 	drop_buffers(c);
2539 
2540 	unregister_shrinker(&c->shrinker);
2541 	flush_work(&c->shrink_work);
2542 
2543 	mutex_lock(&dm_bufio_clients_lock);
2544 
2545 	list_del(&c->client_list);
2546 	dm_bufio_client_count--;
2547 	__cache_size_refresh();
2548 
2549 	mutex_unlock(&dm_bufio_clients_lock);
2550 
2551 	WARN_ON(c->need_reserved_buffers);
2552 
2553 	while (!list_empty(&c->reserved_buffers)) {
2554 		struct dm_buffer *b = list_to_buffer(c->reserved_buffers.next);
2555 
2556 		list_del(&b->lru.list);
2557 		free_buffer(b);
2558 	}
2559 
2560 	for (i = 0; i < LIST_SIZE; i++)
2561 		if (cache_count(&c->cache, i))
2562 			DMERR("leaked buffer count %d: %lu", i, cache_count(&c->cache, i));
2563 
2564 	for (i = 0; i < LIST_SIZE; i++)
2565 		WARN_ON(cache_count(&c->cache, i));
2566 
2567 	cache_destroy(&c->cache);
2568 	kmem_cache_destroy(c->slab_cache);
2569 	kmem_cache_destroy(c->slab_buffer);
2570 	dm_io_client_destroy(c->dm_io);
2571 	mutex_destroy(&c->lock);
2572 	if (c->no_sleep)
2573 		static_branch_dec(&no_sleep_enabled);
2574 	kfree(c);
2575 }
2576 EXPORT_SYMBOL_GPL(dm_bufio_client_destroy);
2577 
2578 void dm_bufio_client_reset(struct dm_bufio_client *c)
2579 {
2580 	drop_buffers(c);
2581 	flush_work(&c->shrink_work);
2582 }
2583 EXPORT_SYMBOL_GPL(dm_bufio_client_reset);
2584 
2585 void dm_bufio_set_sector_offset(struct dm_bufio_client *c, sector_t start)
2586 {
2587 	c->start = start;
2588 }
2589 EXPORT_SYMBOL_GPL(dm_bufio_set_sector_offset);
2590 
2591 /*--------------------------------------------------------------*/
2592 
2593 static unsigned int get_max_age_hz(void)
2594 {
2595 	unsigned int max_age = READ_ONCE(dm_bufio_max_age);
2596 
2597 	if (max_age > UINT_MAX / HZ)
2598 		max_age = UINT_MAX / HZ;
2599 
2600 	return max_age * HZ;
2601 }
2602 
2603 static bool older_than(struct dm_buffer *b, unsigned long age_hz)
2604 {
2605 	return time_after_eq(jiffies, READ_ONCE(b->last_accessed) + age_hz);
2606 }
2607 
/*
 * Context handed to select_for_evict() through cache_evict(); filled in by
 * the callers of __evict_many().
 */
struct evict_params {
	/* select_for_evict() refuses busy/dirty buffers when __GFP_FS is not set here. */
	gfp_t gfp;
	/* Minimum age, in jiffies, a buffer must have to be evicted (see older_than()). */
	unsigned long age_hz;

	/*
	 * Updated by __evict_many() for each evicted buffer: whenever the
	 * current value is not earlier than the buffer's last_accessed it is
	 * replaced by it, so it ends up as the earliest last_accessed among
	 * the caller's initial value and the evicted buffers.  It will not be
	 * reinitialised by __evict_many(), so you can use it across multiple
	 * invocations.
	 *
	 * NOTE(review): this comment used to say "largest last_accessed (ie.
	 * most recently used)", which does not match the comparison in
	 * __evict_many() - confirm which behaviour is intended.
	 */
	unsigned long last_accessed;
};
2619 
2620 /*
2621  * We may not be able to evict this buffer if IO pending or the client
2622  * is still using it.
2623  *
2624  * And if GFP_NOFS is used, we must not do any I/O because we hold
2625  * dm_bufio_clients_lock and we would risk deadlock if the I/O gets
2626  * rerouted to different bufio client.
2627  */
2628 static enum evict_result select_for_evict(struct dm_buffer *b, void *context)
2629 {
2630 	struct evict_params *params = context;
2631 
2632 	if (!(params->gfp & __GFP_FS) ||
2633 	    (static_branch_unlikely(&no_sleep_enabled) && b->c->no_sleep)) {
2634 		if (test_bit_acquire(B_READING, &b->state) ||
2635 		    test_bit(B_WRITING, &b->state) ||
2636 		    test_bit(B_DIRTY, &b->state))
2637 			return ER_DONT_EVICT;
2638 	}
2639 
2640 	return older_than(b, params->age_hz) ? ER_EVICT : ER_STOP;
2641 }
2642 
2643 static unsigned long __evict_many(struct dm_bufio_client *c,
2644 				  struct evict_params *params,
2645 				  int list_mode, unsigned long max_count)
2646 {
2647 	unsigned long count;
2648 	unsigned long last_accessed;
2649 	struct dm_buffer *b;
2650 
2651 	for (count = 0; count < max_count; count++) {
2652 		b = cache_evict(&c->cache, list_mode, select_for_evict, params);
2653 		if (!b)
2654 			break;
2655 
2656 		last_accessed = READ_ONCE(b->last_accessed);
2657 		if (time_after_eq(params->last_accessed, last_accessed))
2658 			params->last_accessed = last_accessed;
2659 
2660 		__make_buffer_clean(b);
2661 		__free_buffer_wake(b);
2662 
2663 		cond_resched();
2664 	}
2665 
2666 	return count;
2667 }
2668 
2669 static void evict_old_buffers(struct dm_bufio_client *c, unsigned long age_hz)
2670 {
2671 	struct evict_params params = {.gfp = 0, .age_hz = age_hz, .last_accessed = 0};
2672 	unsigned long retain = get_retain_buffers(c);
2673 	unsigned long count;
2674 	LIST_HEAD(write_list);
2675 
2676 	dm_bufio_lock(c);
2677 
2678 	__check_watermark(c, &write_list);
2679 	if (unlikely(!list_empty(&write_list))) {
2680 		dm_bufio_unlock(c);
2681 		__flush_write_list(&write_list);
2682 		dm_bufio_lock(c);
2683 	}
2684 
2685 	count = cache_total(&c->cache);
2686 	if (count > retain)
2687 		__evict_many(c, &params, LIST_CLEAN, count - retain);
2688 
2689 	dm_bufio_unlock(c);
2690 }
2691 
2692 static void cleanup_old_buffers(void)
2693 {
2694 	unsigned long max_age_hz = get_max_age_hz();
2695 	struct dm_bufio_client *c;
2696 
2697 	mutex_lock(&dm_bufio_clients_lock);
2698 
2699 	__cache_size_refresh();
2700 
2701 	list_for_each_entry(c, &dm_bufio_all_clients, client_list)
2702 		evict_old_buffers(c, max_age_hz);
2703 
2704 	mutex_unlock(&dm_bufio_clients_lock);
2705 }
2706 
2707 static void work_fn(struct work_struct *w)
2708 {
2709 	cleanup_old_buffers();
2710 
2711 	queue_delayed_work(dm_bufio_wq, &dm_bufio_cleanup_old_work,
2712 			   DM_BUFIO_WORK_TIMER_SECS * HZ);
2713 }
2714 
2715 /*--------------------------------------------------------------*/
2716 
2717 /*
2718  * Global cleanup tries to evict the oldest buffers from across _all_
2719  * the clients.  It does this by repeatedly evicting a few buffers from
2720  * the client that holds the oldest buffer.  It's approximate, but hopefully
2721  * good enough.
2722  */
2723 static struct dm_bufio_client *__pop_client(void)
2724 {
2725 	struct list_head *h;
2726 
2727 	if (list_empty(&dm_bufio_all_clients))
2728 		return NULL;
2729 
2730 	h = dm_bufio_all_clients.next;
2731 	list_del(h);
2732 	return container_of(h, struct dm_bufio_client, client_list);
2733 }
2734 
2735 /*
2736  * Inserts the client in the global client list based on its
2737  * 'oldest_buffer' field.
2738  */
2739 static void __insert_client(struct dm_bufio_client *new_client)
2740 {
2741 	struct dm_bufio_client *c;
2742 	struct list_head *h = dm_bufio_all_clients.next;
2743 
2744 	while (h != &dm_bufio_all_clients) {
2745 		c = container_of(h, struct dm_bufio_client, client_list);
2746 		if (time_after_eq(c->oldest_buffer, new_client->oldest_buffer))
2747 			break;
2748 		h = h->next;
2749 	}
2750 
2751 	list_add_tail(&new_client->client_list, h);
2752 }
2753 
2754 static unsigned long __evict_a_few(unsigned long nr_buffers)
2755 {
2756 	unsigned long count;
2757 	struct dm_bufio_client *c;
2758 	struct evict_params params = {
2759 		.gfp = GFP_KERNEL,
2760 		.age_hz = 0,
2761 		/* set to jiffies in case there are no buffers in this client */
2762 		.last_accessed = jiffies
2763 	};
2764 
2765 	c = __pop_client();
2766 	if (!c)
2767 		return 0;
2768 
2769 	dm_bufio_lock(c);
2770 	count = __evict_many(c, &params, LIST_CLEAN, nr_buffers);
2771 	dm_bufio_unlock(c);
2772 
2773 	if (count)
2774 		c->oldest_buffer = params.last_accessed;
2775 	__insert_client(c);
2776 
2777 	return count;
2778 }
2779 
2780 static void check_watermarks(void)
2781 {
2782 	LIST_HEAD(write_list);
2783 	struct dm_bufio_client *c;
2784 
2785 	mutex_lock(&dm_bufio_clients_lock);
2786 	list_for_each_entry(c, &dm_bufio_all_clients, client_list) {
2787 		dm_bufio_lock(c);
2788 		__check_watermark(c, &write_list);
2789 		dm_bufio_unlock(c);
2790 	}
2791 	mutex_unlock(&dm_bufio_clients_lock);
2792 
2793 	__flush_write_list(&write_list);
2794 }
2795 
2796 static void evict_old(void)
2797 {
2798 	unsigned long threshold = dm_bufio_cache_size -
2799 		dm_bufio_cache_size / DM_BUFIO_LOW_WATERMARK_RATIO;
2800 
2801 	mutex_lock(&dm_bufio_clients_lock);
2802 	while (dm_bufio_current_allocated > threshold) {
2803 		if (!__evict_a_few(64))
2804 			break;
2805 		cond_resched();
2806 	}
2807 	mutex_unlock(&dm_bufio_clients_lock);
2808 }
2809 
/*
 * Handler of dm_bufio_replacement_work: check the watermarks of all
 * clients, then evict old buffers globally.
 */
static void do_global_cleanup(struct work_struct *w)
{
	check_watermarks();
	evict_old();
}
2815 
2816 /*
2817  *--------------------------------------------------------------
2818  * Module setup
2819  *--------------------------------------------------------------
2820  */
2821 
2822 /*
2823  * This is called only once for the whole dm_bufio module.
2824  * It initializes memory limit.
2825  */
2826 static int __init dm_bufio_init(void)
2827 {
2828 	__u64 mem;
2829 
2830 	dm_bufio_allocated_kmem_cache = 0;
2831 	dm_bufio_allocated_get_free_pages = 0;
2832 	dm_bufio_allocated_vmalloc = 0;
2833 	dm_bufio_current_allocated = 0;
2834 
2835 	mem = (__u64)mult_frac(totalram_pages() - totalhigh_pages(),
2836 			       DM_BUFIO_MEMORY_PERCENT, 100) << PAGE_SHIFT;
2837 
2838 	if (mem > ULONG_MAX)
2839 		mem = ULONG_MAX;
2840 
2841 #ifdef CONFIG_MMU
2842 	if (mem > mult_frac(VMALLOC_TOTAL, DM_BUFIO_VMALLOC_PERCENT, 100))
2843 		mem = mult_frac(VMALLOC_TOTAL, DM_BUFIO_VMALLOC_PERCENT, 100);
2844 #endif
2845 
2846 	dm_bufio_default_cache_size = mem;
2847 
2848 	mutex_lock(&dm_bufio_clients_lock);
2849 	__cache_size_refresh();
2850 	mutex_unlock(&dm_bufio_clients_lock);
2851 
2852 	dm_bufio_wq = alloc_workqueue("dm_bufio_cache", WQ_MEM_RECLAIM, 0);
2853 	if (!dm_bufio_wq)
2854 		return -ENOMEM;
2855 
2856 	INIT_DELAYED_WORK(&dm_bufio_cleanup_old_work, work_fn);
2857 	INIT_WORK(&dm_bufio_replacement_work, do_global_cleanup);
2858 	queue_delayed_work(dm_bufio_wq, &dm_bufio_cleanup_old_work,
2859 			   DM_BUFIO_WORK_TIMER_SECS * HZ);
2860 
2861 	return 0;
2862 }
2863 
2864 /*
2865  * This is called once when unloading the dm_bufio module.
2866  */
2867 static void __exit dm_bufio_exit(void)
2868 {
2869 	int bug = 0;
2870 
2871 	cancel_delayed_work_sync(&dm_bufio_cleanup_old_work);
2872 	destroy_workqueue(dm_bufio_wq);
2873 
2874 	if (dm_bufio_client_count) {
2875 		DMCRIT("%s: dm_bufio_client_count leaked: %d",
2876 			__func__, dm_bufio_client_count);
2877 		bug = 1;
2878 	}
2879 
2880 	if (dm_bufio_current_allocated) {
2881 		DMCRIT("%s: dm_bufio_current_allocated leaked: %lu",
2882 			__func__, dm_bufio_current_allocated);
2883 		bug = 1;
2884 	}
2885 
2886 	if (dm_bufio_allocated_get_free_pages) {
2887 		DMCRIT("%s: dm_bufio_allocated_get_free_pages leaked: %lu",
2888 		       __func__, dm_bufio_allocated_get_free_pages);
2889 		bug = 1;
2890 	}
2891 
2892 	if (dm_bufio_allocated_vmalloc) {
2893 		DMCRIT("%s: dm_bufio_vmalloc leaked: %lu",
2894 		       __func__, dm_bufio_allocated_vmalloc);
2895 		bug = 1;
2896 	}
2897 
2898 	WARN_ON(bug); /* leaks are not worth crashing the system */
2899 }
2900 
/* Module entry and exit points. */
module_init(dm_bufio_init)
module_exit(dm_bufio_exit)

/* Tunables: registered with mode 0644, i.e. writable by the owner. */
module_param_named(max_cache_size_bytes, dm_bufio_cache_size, ulong, 0644);
MODULE_PARM_DESC(max_cache_size_bytes, "Size of metadata cache");

module_param_named(max_age_seconds, dm_bufio_max_age, uint, 0644);
MODULE_PARM_DESC(max_age_seconds, "Max age of a buffer in seconds");

module_param_named(retain_bytes, dm_bufio_retain_bytes, ulong, 0644);
MODULE_PARM_DESC(retain_bytes, "Try to keep at least this many bytes cached in memory");

/* A statistic, but registered 0644 like the tunables above. */
module_param_named(peak_allocated_bytes, dm_bufio_peak_allocated, ulong, 0644);
MODULE_PARM_DESC(peak_allocated_bytes, "Tracks the maximum allocated memory");

/* Allocation counters: registered read-only (0444). */
module_param_named(allocated_kmem_cache_bytes, dm_bufio_allocated_kmem_cache, ulong, 0444);
MODULE_PARM_DESC(allocated_kmem_cache_bytes, "Memory allocated with kmem_cache_alloc");

module_param_named(allocated_get_free_pages_bytes, dm_bufio_allocated_get_free_pages, ulong, 0444);
MODULE_PARM_DESC(allocated_get_free_pages_bytes, "Memory allocated with get_free_pages");

module_param_named(allocated_vmalloc_bytes, dm_bufio_allocated_vmalloc, ulong, 0444);
MODULE_PARM_DESC(allocated_vmalloc_bytes, "Memory allocated with vmalloc");

module_param_named(current_allocated_bytes, dm_bufio_current_allocated, ulong, 0444);
MODULE_PARM_DESC(current_allocated_bytes, "Memory currently used by the cache");

MODULE_AUTHOR("Mikulas Patocka <dm-devel@redhat.com>");
MODULE_DESCRIPTION(DM_NAME " buffered I/O library");
MODULE_LICENSE("GPL");
2931