xref: /openbmc/linux/drivers/md/dm-bufio.c (revision 86a3238c7b9b759cb864f4f768ab2e24687dc0e6)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright (C) 2009-2011 Red Hat, Inc.
4  *
5  * Author: Mikulas Patocka <mpatocka@redhat.com>
6  *
7  * This file is released under the GPL.
8  */
9 
10 #include <linux/dm-bufio.h>
11 
12 #include <linux/device-mapper.h>
13 #include <linux/dm-io.h>
14 #include <linux/slab.h>
15 #include <linux/sched/mm.h>
16 #include <linux/jiffies.h>
17 #include <linux/vmalloc.h>
18 #include <linux/shrinker.h>
19 #include <linux/module.h>
20 #include <linux/rbtree.h>
21 #include <linux/stacktrace.h>
22 #include <linux/jump_label.h>
23 
24 #define DM_MSG_PREFIX "bufio"
25 
26 /*
27  * Memory management policy:
28  *	Limit the number of buffers to DM_BUFIO_MEMORY_PERCENT of main memory
29  *	or DM_BUFIO_VMALLOC_PERCENT of vmalloc memory (whichever is lower).
30  *	Always allocate at least DM_BUFIO_MIN_BUFFERS buffers.
31  *	Start background writeback when the number of dirty buffers exceeds
32  *	DM_BUFIO_WRITEBACK_RATIO times the number of clean buffers.
33  */
34 #define DM_BUFIO_MIN_BUFFERS		8
35 
36 #define DM_BUFIO_MEMORY_PERCENT		2
37 #define DM_BUFIO_VMALLOC_PERCENT	25
38 #define DM_BUFIO_WRITEBACK_RATIO	3
39 #define DM_BUFIO_LOW_WATERMARK_RATIO	16
40 
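/*
 * Illustrative arithmetic for the limits above (numbers are hypothetical):
 * on a 64-bit machine with 8 GiB of low memory, DM_BUFIO_MEMORY_PERCENT = 2
 * gives a default cache size of roughly 160 MiB; on a 32-bit machine with a
 * 128 MiB vmalloc arena, DM_BUFIO_VMALLOC_PERCENT = 25 caps it at 32 MiB
 * instead.  With DM_BUFIO_WRITEBACK_RATIO = 3, background writeback starts
 * once dirty buffers outnumber clean buffers three to one.
 */
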
41 /*
42  * Check buffer ages in this interval (seconds)
43  */
44 #define DM_BUFIO_WORK_TIMER_SECS	30
45 
46 /*
47  * Free buffers when they are older than this (seconds)
48  */
49 #define DM_BUFIO_DEFAULT_AGE_SECS	300
50 
51 /*
52  * The nr of bytes of cached data to keep around.
53  */
54 #define DM_BUFIO_DEFAULT_RETAIN_BYTES   (256 * 1024)
55 
56 /*
57  * Align buffer writes to this boundary.
58  * Tests show that SSDs have the highest IOPS when using 4k writes.
59  */
60 #define DM_BUFIO_WRITE_ALIGN		4096
61 
62 /*
63  * dm_buffer->list_mode
64  */
65 #define LIST_CLEAN	0
66 #define LIST_DIRTY	1
67 #define LIST_SIZE	2
68 
69 /*
70  * Linking of buffers:
71  *	All buffers are linked to buffer_tree with their node field.
72  *
73  *	Clean buffers that are not being written (B_WRITING not set)
74  *	are linked to lru[LIST_CLEAN] with their lru_list field.
75  *
76  *	Dirty and clean buffers that are being written are linked to
77  *	lru[LIST_DIRTY] with their lru_list field. When the write
78  *	finishes, the buffer cannot be relinked immediately (because we
79  *	are in an interrupt context and relinking requires process
80  *	context), so some clean-not-writing buffers can be held on
81  *	lru[LIST_DIRTY] too.  They are later moved to lru[LIST_CLEAN] in
82  *	process context.
83  */
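/*
 * A compact restatement of the rules above (state -> lru list):
 *
 *	clean, B_WRITING clear	-> lru[LIST_CLEAN] (newly-written buffers may
 *				   sit on lru[LIST_DIRTY] until relinked from
 *				   process context)
 *	clean, B_WRITING set	-> lru[LIST_DIRTY]
 *	dirty			-> lru[LIST_DIRTY]
 */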
84 struct dm_bufio_client {
85 	struct mutex lock;
86 	spinlock_t spinlock;
87 	bool no_sleep;
88 
89 	struct list_head lru[LIST_SIZE];
90 	unsigned long n_buffers[LIST_SIZE];
91 
92 	struct block_device *bdev;
93 	unsigned int block_size;
94 	s8 sectors_per_block_bits;
95 	void (*alloc_callback)(struct dm_buffer *);
96 	void (*write_callback)(struct dm_buffer *);
97 	struct kmem_cache *slab_buffer;
98 	struct kmem_cache *slab_cache;
99 	struct dm_io_client *dm_io;
100 
101 	struct list_head reserved_buffers;
102 	unsigned int need_reserved_buffers;
103 
104 	unsigned int minimum_buffers;
105 
106 	struct rb_root buffer_tree;
107 	wait_queue_head_t free_buffer_wait;
108 
109 	sector_t start;
110 
111 	int async_write_error;
112 
113 	struct list_head client_list;
114 
115 	struct shrinker shrinker;
116 	struct work_struct shrink_work;
117 	atomic_long_t need_shrink;
118 };
119 
120 /*
121  * Buffer state bits.
122  */
123 #define B_READING	0
124 #define B_WRITING	1
125 #define B_DIRTY		2
126 
127 /*
128  * Describes how the block was allocated:
129  * kmem_cache_alloc(), __get_free_pages() or vmalloc().
130  * See the comment at alloc_buffer_data.
131  */
132 enum data_mode {
133 	DATA_MODE_SLAB = 0,
134 	DATA_MODE_GET_FREE_PAGES = 1,
135 	DATA_MODE_VMALLOC = 2,
136 	DATA_MODE_LIMIT = 3
137 };
138 
139 struct dm_buffer {
140 	struct rb_node node;
141 	struct list_head lru_list;
142 	struct list_head global_list;
143 	sector_t block;
144 	void *data;
145 	unsigned char data_mode;		/* DATA_MODE_* */
146 	unsigned char list_mode;		/* LIST_* */
147 	blk_status_t read_error;
148 	blk_status_t write_error;
149 	unsigned int accessed;
150 	unsigned int hold_count;
151 	unsigned long state;
152 	unsigned long last_accessed;
153 	unsigned int dirty_start;
154 	unsigned int dirty_end;
155 	unsigned int write_start;
156 	unsigned int write_end;
157 	struct dm_bufio_client *c;
158 	struct list_head write_list;
159 	void (*end_io)(struct dm_buffer *, blk_status_t);
160 #ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
161 #define MAX_STACK 10
162 	unsigned int stack_len;
163 	unsigned long stack_entries[MAX_STACK];
164 #endif
165 };
166 
167 static DEFINE_STATIC_KEY_FALSE(no_sleep_enabled);
168 
169 /*----------------------------------------------------------------*/
170 
171 #define dm_bufio_in_request()	(!!current->bio_list)
172 
173 static void dm_bufio_lock(struct dm_bufio_client *c)
174 {
175 	if (static_branch_unlikely(&no_sleep_enabled) && c->no_sleep)
176 		spin_lock_bh(&c->spinlock);
177 	else
178 		mutex_lock_nested(&c->lock, dm_bufio_in_request());
179 }
180 
181 static int dm_bufio_trylock(struct dm_bufio_client *c)
182 {
183 	if (static_branch_unlikely(&no_sleep_enabled) && c->no_sleep)
184 		return spin_trylock_bh(&c->spinlock);
185 	else
186 		return mutex_trylock(&c->lock);
187 }
188 
189 static void dm_bufio_unlock(struct dm_bufio_client *c)
190 {
191 	if (static_branch_unlikely(&no_sleep_enabled) && c->no_sleep)
192 		spin_unlock_bh(&c->spinlock);
193 	else
194 		mutex_unlock(&c->lock);
195 }
196 
197 /*----------------------------------------------------------------*/
198 
199 /*
200  * Default cache size: available memory divided by the ratio.
201  */
202 static unsigned long dm_bufio_default_cache_size;
203 
204 /*
205  * Total cache size set by the user.
206  */
207 static unsigned long dm_bufio_cache_size;
208 
209 /*
210  * A copy of dm_bufio_cache_size because dm_bufio_cache_size can change
211  * at any time.  If it disagrees, the user has changed cache size.
212  */
213 static unsigned long dm_bufio_cache_size_latch;
214 
215 static DEFINE_SPINLOCK(global_spinlock);
216 
217 static LIST_HEAD(global_queue);
218 
219 static unsigned long global_num = 0;
220 
221 /*
222  * Buffers are freed after this timeout
223  */
224 static unsigned int dm_bufio_max_age = DM_BUFIO_DEFAULT_AGE_SECS;
225 static unsigned long dm_bufio_retain_bytes = DM_BUFIO_DEFAULT_RETAIN_BYTES;
226 
227 static unsigned long dm_bufio_peak_allocated;
228 static unsigned long dm_bufio_allocated_kmem_cache;
229 static unsigned long dm_bufio_allocated_get_free_pages;
230 static unsigned long dm_bufio_allocated_vmalloc;
231 static unsigned long dm_bufio_current_allocated;
232 
233 /*----------------------------------------------------------------*/
234 
235 /*
236  * The current number of clients.
237  */
238 static int dm_bufio_client_count;
239 
240 /*
241  * The list of all clients.
242  */
243 static LIST_HEAD(dm_bufio_all_clients);
244 
245 /*
246  * This mutex protects dm_bufio_cache_size_latch and dm_bufio_client_count
247  */
248 static DEFINE_MUTEX(dm_bufio_clients_lock);
249 
250 static struct workqueue_struct *dm_bufio_wq;
251 static struct delayed_work dm_bufio_cleanup_old_work;
252 static struct work_struct dm_bufio_replacement_work;
253 
254 
255 #ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
256 static void buffer_record_stack(struct dm_buffer *b)
257 {
258 	b->stack_len = stack_trace_save(b->stack_entries, MAX_STACK, 2);
259 }
260 #endif
261 
262 /*----------------------------------------------------------------
263  * A red/black tree acts as an index for all the buffers.
264  *--------------------------------------------------------------*/
265 static struct dm_buffer *__find(struct dm_bufio_client *c, sector_t block)
266 {
267 	struct rb_node *n = c->buffer_tree.rb_node;
268 	struct dm_buffer *b;
269 
270 	while (n) {
271 		b = container_of(n, struct dm_buffer, node);
272 
273 		if (b->block == block)
274 			return b;
275 
276 		n = block < b->block ? n->rb_left : n->rb_right;
277 	}
278 
279 	return NULL;
280 }
281 
282 static struct dm_buffer *__find_next(struct dm_bufio_client *c, sector_t block)
283 {
284 	struct rb_node *n = c->buffer_tree.rb_node;
285 	struct dm_buffer *b;
286 	struct dm_buffer *best = NULL;
287 
288 	while (n) {
289 		b = container_of(n, struct dm_buffer, node);
290 
291 		if (b->block == block)
292 			return b;
293 
294 		if (block <= b->block) {
295 			n = n->rb_left;
296 			best = b;
297 		} else {
298 			n = n->rb_right;
299 		}
300 	}
301 
302 	return best;
303 }
304 
305 static void __insert(struct dm_bufio_client *c, struct dm_buffer *b)
306 {
307 	struct rb_node **new = &c->buffer_tree.rb_node, *parent = NULL;
308 	struct dm_buffer *found;
309 
310 	while (*new) {
311 		found = container_of(*new, struct dm_buffer, node);
312 
313 		if (found->block == b->block) {
314 			BUG_ON(found != b);
315 			return;
316 		}
317 
318 		parent = *new;
319 		new = b->block < found->block ?
320 			&found->node.rb_left : &found->node.rb_right;
321 	}
322 
323 	rb_link_node(&b->node, parent, new);
324 	rb_insert_color(&b->node, &c->buffer_tree);
325 }
326 
327 static void __remove(struct dm_bufio_client *c, struct dm_buffer *b)
328 {
329 	rb_erase(&b->node, &c->buffer_tree);
330 }
331 
332 /*----------------------------------------------------------------*/
333 
334 static void adjust_total_allocated(struct dm_buffer *b, bool unlink)
335 {
336 	unsigned char data_mode;
337 	long diff;
338 
339 	static unsigned long * const class_ptr[DATA_MODE_LIMIT] = {
340 		&dm_bufio_allocated_kmem_cache,
341 		&dm_bufio_allocated_get_free_pages,
342 		&dm_bufio_allocated_vmalloc,
343 	};
344 
345 	data_mode = b->data_mode;
346 	diff = (long)b->c->block_size;
347 	if (unlink)
348 		diff = -diff;
349 
350 	spin_lock(&global_spinlock);
351 
352 	*class_ptr[data_mode] += diff;
353 
354 	dm_bufio_current_allocated += diff;
355 
356 	if (dm_bufio_current_allocated > dm_bufio_peak_allocated)
357 		dm_bufio_peak_allocated = dm_bufio_current_allocated;
358 
359 	b->accessed = 1;
360 
361 	if (!unlink) {
362 		list_add(&b->global_list, &global_queue);
363 		global_num++;
364 		if (dm_bufio_current_allocated > dm_bufio_cache_size)
365 			queue_work(dm_bufio_wq, &dm_bufio_replacement_work);
366 	} else {
367 		list_del(&b->global_list);
368 		global_num--;
369 	}
370 
371 	spin_unlock(&global_spinlock);
372 }
373 
374 /*
375  * Change the number of clients and recalculate per-client limit.
376  */
377 static void __cache_size_refresh(void)
378 {
379 	BUG_ON(!mutex_is_locked(&dm_bufio_clients_lock));
380 	BUG_ON(dm_bufio_client_count < 0);
381 
382 	dm_bufio_cache_size_latch = READ_ONCE(dm_bufio_cache_size);
383 
384 	/*
385 	 * Use default if set to 0 and report the actual cache size used.
386 	 */
387 	if (!dm_bufio_cache_size_latch) {
388 		(void)cmpxchg(&dm_bufio_cache_size, 0,
389 			      dm_bufio_default_cache_size);
390 		dm_bufio_cache_size_latch = dm_bufio_default_cache_size;
391 	}
392 }
393 
394 /*
395  * Allocating buffer data.
396  *
397  * Small buffers are allocated with kmem_cache, to use space optimally.
398  *
399  * For large buffers, we choose between get_free_pages and vmalloc.
400  * Each has advantages and disadvantages.
401  *
402  * __get_free_pages can randomly fail if the memory is fragmented.
403  * __vmalloc won't randomly fail, but vmalloc space is limited (it may be
404  * as low as 128M) so using it for caching is not appropriate.
405  *
406  * If the allocation may fail we use __get_free_pages. Memory fragmentation
407  * won't have a fatal effect here; it just causes flushes of some other
408  * buffers and more I/O. Don't use __get_free_pages if it
409  * always fails (i.e. order >= MAX_ORDER).
410  *
411  * If the allocation shouldn't fail we use __vmalloc. This is only for the
412  * initial reserve allocation, so there's no risk of wasting all vmalloc
413  * space.
414  */
415 static void *alloc_buffer_data(struct dm_bufio_client *c, gfp_t gfp_mask,
416 			       unsigned char *data_mode)
417 {
418 	if (unlikely(c->slab_cache != NULL)) {
419 		*data_mode = DATA_MODE_SLAB;
420 		return kmem_cache_alloc(c->slab_cache, gfp_mask);
421 	}
422 
423 	if (c->block_size <= KMALLOC_MAX_SIZE &&
424 	    gfp_mask & __GFP_NORETRY) {
425 		*data_mode = DATA_MODE_GET_FREE_PAGES;
426 		return (void *)__get_free_pages(gfp_mask,
427 						c->sectors_per_block_bits - (PAGE_SHIFT - SECTOR_SHIFT));
428 	}
429 
430 	*data_mode = DATA_MODE_VMALLOC;
431 
432 	/*
433 	 * __vmalloc allocates the data pages and auxiliary structures with
434 	 * gfp_flags that were specified, but pagetables are always allocated
435 	 * with GFP_KERNEL, no matter what was specified as gfp_mask.
436 	 *
437 	 * Consequently, we must set per-process flag PF_MEMALLOC_NOIO so that
438 	 * all allocations done by this process (including pagetables) are done
439 	 * as if GFP_NOIO was specified.
440 	 */
441 	if (gfp_mask & __GFP_NORETRY) {
442 		unsigned int noio_flag = memalloc_noio_save();
443 		void *ptr = __vmalloc(c->block_size, gfp_mask);
444 
445 		memalloc_noio_restore(noio_flag);
446 		return ptr;
447 	}
448 
449 	return __vmalloc(c->block_size, gfp_mask);
450 }
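
/*
 * A sketch of how the policy above typically plays out (illustrative only,
 * assuming 4 KiB pages and a 64 KiB block size that fits in KMALLOC_MAX_SIZE):
 *
 *	512-byte blocks			-> c->slab_cache, DATA_MODE_SLAB
 *	64 KiB blocks, __GFP_NORETRY	-> __get_free_pages(),
 *					   DATA_MODE_GET_FREE_PAGES
 *	64 KiB blocks, GFP_KERNEL	-> __vmalloc(), DATA_MODE_VMALLOC
 *					   (the initial reserve allocations)
 */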
451 
452 /*
453  * Free buffer's data.
454  */
455 static void free_buffer_data(struct dm_bufio_client *c,
456 			     void *data, unsigned char data_mode)
457 {
458 	switch (data_mode) {
459 	case DATA_MODE_SLAB:
460 		kmem_cache_free(c->slab_cache, data);
461 		break;
462 
463 	case DATA_MODE_GET_FREE_PAGES:
464 		free_pages((unsigned long)data,
465 			   c->sectors_per_block_bits - (PAGE_SHIFT - SECTOR_SHIFT));
466 		break;
467 
468 	case DATA_MODE_VMALLOC:
469 		vfree(data);
470 		break;
471 
472 	default:
473 		DMCRIT("dm_bufio_free_buffer_data: bad data mode: %d",
474 		       data_mode);
475 		BUG();
476 	}
477 }
478 
479 /*
480  * Allocate buffer and its data.
481  */
482 static struct dm_buffer *alloc_buffer(struct dm_bufio_client *c, gfp_t gfp_mask)
483 {
484 	struct dm_buffer *b = kmem_cache_alloc(c->slab_buffer, gfp_mask);
485 
486 	if (!b)
487 		return NULL;
488 
489 	b->c = c;
490 
491 	b->data = alloc_buffer_data(c, gfp_mask, &b->data_mode);
492 	if (!b->data) {
493 		kmem_cache_free(c->slab_buffer, b);
494 		return NULL;
495 	}
496 
497 #ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
498 	b->stack_len = 0;
499 #endif
500 	return b;
501 }
502 
503 /*
504  * Free buffer and its data.
505  */
506 static void free_buffer(struct dm_buffer *b)
507 {
508 	struct dm_bufio_client *c = b->c;
509 
510 	free_buffer_data(c, b->data, b->data_mode);
511 	kmem_cache_free(c->slab_buffer, b);
512 }
513 
514 /*
515  * Link buffer to the buffer tree and clean or dirty queue.
516  */
517 static void __link_buffer(struct dm_buffer *b, sector_t block, int dirty)
518 {
519 	struct dm_bufio_client *c = b->c;
520 
521 	c->n_buffers[dirty]++;
522 	b->block = block;
523 	b->list_mode = dirty;
524 	list_add(&b->lru_list, &c->lru[dirty]);
525 	__insert(b->c, b);
526 	b->last_accessed = jiffies;
527 
528 	adjust_total_allocated(b, false);
529 }
530 
531 /*
532  * Unlink buffer from the buffer tree and dirty or clean queue.
533  */
534 static void __unlink_buffer(struct dm_buffer *b)
535 {
536 	struct dm_bufio_client *c = b->c;
537 
538 	BUG_ON(!c->n_buffers[b->list_mode]);
539 
540 	c->n_buffers[b->list_mode]--;
541 	__remove(b->c, b);
542 	list_del(&b->lru_list);
543 
544 	adjust_total_allocated(b, true);
545 }
546 
547 /*
548  * Place the buffer at the head of the dirty or clean LRU queue.
549  */
550 static void __relink_lru(struct dm_buffer *b, int dirty)
551 {
552 	struct dm_bufio_client *c = b->c;
553 
554 	b->accessed = 1;
555 
556 	BUG_ON(!c->n_buffers[b->list_mode]);
557 
558 	c->n_buffers[b->list_mode]--;
559 	c->n_buffers[dirty]++;
560 	b->list_mode = dirty;
561 	list_move(&b->lru_list, &c->lru[dirty]);
562 	b->last_accessed = jiffies;
563 }
564 
565 /*----------------------------------------------------------------
566  * Submit I/O on the buffer.
567  *
568  * Bio interface is faster but it has some problems:
569  *	the vector list is limited (increasing this limit increases
570  *	memory-consumption per buffer, so it is not viable);
571  *
572  *	the memory must be direct-mapped, not vmalloced;
573  *
574  * If the buffer is not vmalloced, try using the bio interface.
575  *
576  * If the buffer is vmalloced, or if building the bio fails (for example
577  * because the bio or its vector list cannot be allocated, or a page
578  * cannot be added to it), fall back to the dm-io layer.  The dm-io layer
579  * splits the I/O into multiple requests, avoiding the above
580  * shortcomings.
581  *--------------------------------------------------------------*/
582 
583 /*
584  * dm-io completion routine. It just calls b->end_io, pretending that the
585  * request was handled directly with the bio interface.
586  */
587 static void dmio_complete(unsigned long error, void *context)
588 {
589 	struct dm_buffer *b = context;
590 
591 	b->end_io(b, unlikely(error != 0) ? BLK_STS_IOERR : 0);
592 }
593 
594 static void use_dmio(struct dm_buffer *b, enum req_op op, sector_t sector,
595 		     unsigned int n_sectors, unsigned int offset)
596 {
597 	int r;
598 	struct dm_io_request io_req = {
599 		.bi_opf = op,
600 		.notify.fn = dmio_complete,
601 		.notify.context = b,
602 		.client = b->c->dm_io,
603 	};
604 	struct dm_io_region region = {
605 		.bdev = b->c->bdev,
606 		.sector = sector,
607 		.count = n_sectors,
608 	};
609 
610 	if (b->data_mode != DATA_MODE_VMALLOC) {
611 		io_req.mem.type = DM_IO_KMEM;
612 		io_req.mem.ptr.addr = (char *)b->data + offset;
613 	} else {
614 		io_req.mem.type = DM_IO_VMA;
615 		io_req.mem.ptr.vma = (char *)b->data + offset;
616 	}
617 
618 	r = dm_io(&io_req, 1, &region, NULL);
619 	if (unlikely(r))
620 		b->end_io(b, errno_to_blk_status(r));
621 }
622 
623 static void bio_complete(struct bio *bio)
624 {
625 	struct dm_buffer *b = bio->bi_private;
626 	blk_status_t status = bio->bi_status;
627 	bio_uninit(bio);
628 	kfree(bio);
629 	b->end_io(b, status);
630 }
631 
632 static void use_bio(struct dm_buffer *b, enum req_op op, sector_t sector,
633 		    unsigned int n_sectors, unsigned int offset)
634 {
635 	struct bio *bio;
636 	char *ptr;
637 	unsigned int vec_size, len;
638 
639 	vec_size = b->c->block_size >> PAGE_SHIFT;
640 	if (unlikely(b->c->sectors_per_block_bits < PAGE_SHIFT - SECTOR_SHIFT))
641 		vec_size += 2;
642 
643 	bio = bio_kmalloc(vec_size, GFP_NOWAIT | __GFP_NORETRY | __GFP_NOWARN);
644 	if (!bio) {
645 dmio:
646 		use_dmio(b, op, sector, n_sectors, offset);
647 		return;
648 	}
649 	bio_init(bio, b->c->bdev, bio->bi_inline_vecs, vec_size, op);
650 	bio->bi_iter.bi_sector = sector;
651 	bio->bi_end_io = bio_complete;
652 	bio->bi_private = b;
653 
654 	ptr = (char *)b->data + offset;
655 	len = n_sectors << SECTOR_SHIFT;
656 
657 	do {
658 		unsigned int this_step = min((unsigned int)(PAGE_SIZE - offset_in_page(ptr)), len);
659 		if (!bio_add_page(bio, virt_to_page(ptr), this_step,
660 				  offset_in_page(ptr))) {
661 			bio_put(bio);
662 			goto dmio;
663 		}
664 
665 		len -= this_step;
666 		ptr += this_step;
667 	} while (len > 0);
668 
669 	submit_bio(bio);
670 }
671 
672 static inline sector_t block_to_sector(struct dm_bufio_client *c, sector_t block)
673 {
674 	sector_t sector;
675 
676 	if (likely(c->sectors_per_block_bits >= 0))
677 		sector = block << c->sectors_per_block_bits;
678 	else
679 		sector = block * (c->block_size >> SECTOR_SHIFT);
680 	sector += c->start;
681 
682 	return sector;
683 }
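
/*
 * Worked example (illustrative values): with 4 KiB blocks,
 * sectors_per_block_bits is 3, so block 100 maps to sector
 * (100 << 3) + c->start = 800 + c->start.  A non-power-of-two 12 KiB block
 * size takes the division path instead: block 100 maps to
 * sector 100 * 24 + c->start = 2400 + c->start.
 */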
684 
685 static void submit_io(struct dm_buffer *b, enum req_op op,
686 		      void (*end_io)(struct dm_buffer *, blk_status_t))
687 {
688 	unsigned int n_sectors;
689 	sector_t sector;
690 	unsigned int offset, end;
691 
692 	b->end_io = end_io;
693 
694 	sector = block_to_sector(b->c, b->block);
695 
696 	if (op != REQ_OP_WRITE) {
697 		n_sectors = b->c->block_size >> SECTOR_SHIFT;
698 		offset = 0;
699 	} else {
700 		if (b->c->write_callback)
701 			b->c->write_callback(b);
702 		offset = b->write_start;
703 		end = b->write_end;
704 		offset &= -DM_BUFIO_WRITE_ALIGN;
705 		end += DM_BUFIO_WRITE_ALIGN - 1;
706 		end &= -DM_BUFIO_WRITE_ALIGN;
707 		if (unlikely(end > b->c->block_size))
708 			end = b->c->block_size;
709 
710 		sector += offset >> SECTOR_SHIFT;
711 		n_sectors = (end - offset) >> SECTOR_SHIFT;
712 	}
713 
714 	if (b->data_mode != DATA_MODE_VMALLOC)
715 		use_bio(b, op, sector, n_sectors, offset);
716 	else
717 		use_dmio(b, op, sector, n_sectors, offset);
718 }
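
/*
 * Worked example of the write rounding above (illustrative values): for an
 * 8 KiB block with bytes 100..5000 dirty, offset is rounded down to 0 and
 * end is rounded up to 8192, so the write covers sectors 0..15 of the block;
 * the 4 KiB granularity comes from DM_BUFIO_WRITE_ALIGN.
 */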
719 
720 /*----------------------------------------------------------------
721  * Writing dirty buffers
722  *--------------------------------------------------------------*/
723 
724 /*
725  * The endio routine for write.
726  *
727  * Set the error, clear B_WRITING bit and wake anyone who was waiting on
728  * it.
729  */
730 static void write_endio(struct dm_buffer *b, blk_status_t status)
731 {
732 	b->write_error = status;
733 	if (unlikely(status)) {
734 		struct dm_bufio_client *c = b->c;
735 
736 		(void)cmpxchg(&c->async_write_error, 0,
737 				blk_status_to_errno(status));
738 	}
739 
740 	BUG_ON(!test_bit(B_WRITING, &b->state));
741 
742 	smp_mb__before_atomic();
743 	clear_bit(B_WRITING, &b->state);
744 	smp_mb__after_atomic();
745 
746 	wake_up_bit(&b->state, B_WRITING);
747 }
748 
749 /*
750  * Initiate a write on a dirty buffer, but don't wait for it.
751  *
752  * - If the buffer is not dirty, exit.
753  * - If there is some previous write going on, wait for it to finish (we can't
754  *   have two writes on the same buffer simultaneously).
755  * - Submit our write and don't wait on it. We set B_WRITING indicating
756  *   that there is a write in progress.
757  */
758 static void __write_dirty_buffer(struct dm_buffer *b,
759 				 struct list_head *write_list)
760 {
761 	if (!test_bit(B_DIRTY, &b->state))
762 		return;
763 
764 	clear_bit(B_DIRTY, &b->state);
765 	wait_on_bit_lock_io(&b->state, B_WRITING, TASK_UNINTERRUPTIBLE);
766 
767 	b->write_start = b->dirty_start;
768 	b->write_end = b->dirty_end;
769 
770 	if (!write_list)
771 		submit_io(b, REQ_OP_WRITE, write_endio);
772 	else
773 		list_add_tail(&b->write_list, write_list);
774 }
775 
776 static void __flush_write_list(struct list_head *write_list)
777 {
778 	struct blk_plug plug;
779 	blk_start_plug(&plug);
780 	while (!list_empty(write_list)) {
781 		struct dm_buffer *b =
782 			list_entry(write_list->next, struct dm_buffer, write_list);
783 		list_del(&b->write_list);
784 		submit_io(b, REQ_OP_WRITE, write_endio);
785 		cond_resched();
786 	}
787 	blk_finish_plug(&plug);
788 }
789 
790 /*
791  * Wait until any activity on the buffer finishes.  Possibly write the
792  * buffer if it is dirty.  When this function finishes, there is no I/O
793  * running on the buffer and the buffer is not dirty.
794  */
795 static void __make_buffer_clean(struct dm_buffer *b)
796 {
797 	BUG_ON(b->hold_count);
798 
799 	/* smp_load_acquire() pairs with read_endio()'s smp_mb__before_atomic() */
800 	if (!smp_load_acquire(&b->state))	/* fast case */
801 		return;
802 
803 	wait_on_bit_io(&b->state, B_READING, TASK_UNINTERRUPTIBLE);
804 	__write_dirty_buffer(b, NULL);
805 	wait_on_bit_io(&b->state, B_WRITING, TASK_UNINTERRUPTIBLE);
806 }
807 
808 /*
809  * Find some buffer that is not held by anybody, clean it, unlink it and
810  * return it.
811  */
812 static struct dm_buffer *__get_unclaimed_buffer(struct dm_bufio_client *c)
813 {
814 	struct dm_buffer *b;
815 
816 	list_for_each_entry_reverse(b, &c->lru[LIST_CLEAN], lru_list) {
817 		BUG_ON(test_bit(B_WRITING, &b->state));
818 		BUG_ON(test_bit(B_DIRTY, &b->state));
819 
820 		if (static_branch_unlikely(&no_sleep_enabled) && c->no_sleep &&
821 		    unlikely(test_bit_acquire(B_READING, &b->state)))
822 			continue;
823 
824 		if (!b->hold_count) {
825 			__make_buffer_clean(b);
826 			__unlink_buffer(b);
827 			return b;
828 		}
829 		cond_resched();
830 	}
831 
832 	if (static_branch_unlikely(&no_sleep_enabled) && c->no_sleep)
833 		return NULL;
834 
835 	list_for_each_entry_reverse(b, &c->lru[LIST_DIRTY], lru_list) {
836 		BUG_ON(test_bit(B_READING, &b->state));
837 
838 		if (!b->hold_count) {
839 			__make_buffer_clean(b);
840 			__unlink_buffer(b);
841 			return b;
842 		}
843 		cond_resched();
844 	}
845 
846 	return NULL;
847 }
848 
849 /*
850  * Wait until some other thread frees a buffer or releases its hold count
851  * on some buffer.
852  *
853  * This function is entered with c->lock held, drops it and regains it
854  * before exiting.
855  */
856 static void __wait_for_free_buffer(struct dm_bufio_client *c)
857 {
858 	DECLARE_WAITQUEUE(wait, current);
859 
860 	add_wait_queue(&c->free_buffer_wait, &wait);
861 	set_current_state(TASK_UNINTERRUPTIBLE);
862 	dm_bufio_unlock(c);
863 
864 	io_schedule();
865 
866 	remove_wait_queue(&c->free_buffer_wait, &wait);
867 
868 	dm_bufio_lock(c);
869 }
870 
871 enum new_flag {
872 	NF_FRESH = 0,
873 	NF_READ = 1,
874 	NF_GET = 2,
875 	NF_PREFETCH = 3
876 };
877 
878 /*
879  * Allocate a new buffer. If the allocation is not possible, wait until
880  * some other thread frees a buffer.
881  *
882  * May drop the lock and regain it.
883  */
884 static struct dm_buffer *__alloc_buffer_wait_no_callback(struct dm_bufio_client *c, enum new_flag nf)
885 {
886 	struct dm_buffer *b;
887 	bool tried_noio_alloc = false;
888 
889 	/*
890 	 * dm-bufio is resistant to allocation failures (it just keeps
891 	 * one buffer reserved in case all the allocations fail).
892 	 * So set flags to not try too hard:
893 	 *	GFP_NOWAIT: don't wait; if we need to sleep we'll release our
894 	 *		    mutex and wait ourselves.
895 	 *	__GFP_NORETRY: don't retry and rather return failure
896 	 *	__GFP_NOMEMALLOC: don't use emergency reserves
897 	 *	__GFP_NOWARN: don't print a warning in case of failure
898 	 *
899 	 * For debugging, if we set the cache size to 1, no new buffers will
900 	 * be allocated.
901 	 */
902 	while (1) {
903 		if (dm_bufio_cache_size_latch != 1) {
904 			b = alloc_buffer(c, GFP_NOWAIT | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
905 			if (b)
906 				return b;
907 		}
908 
909 		if (nf == NF_PREFETCH)
910 			return NULL;
911 
912 		if (dm_bufio_cache_size_latch != 1 && !tried_noio_alloc) {
913 			dm_bufio_unlock(c);
914 			b = alloc_buffer(c, GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
915 			dm_bufio_lock(c);
916 			if (b)
917 				return b;
918 			tried_noio_alloc = true;
919 		}
920 
921 		if (!list_empty(&c->reserved_buffers)) {
922 			b = list_entry(c->reserved_buffers.next,
923 				       struct dm_buffer, lru_list);
924 			list_del(&b->lru_list);
925 			c->need_reserved_buffers++;
926 
927 			return b;
928 		}
929 
930 		b = __get_unclaimed_buffer(c);
931 		if (b)
932 			return b;
933 
934 		__wait_for_free_buffer(c);
935 	}
936 }
937 
938 static struct dm_buffer *__alloc_buffer_wait(struct dm_bufio_client *c, enum new_flag nf)
939 {
940 	struct dm_buffer *b = __alloc_buffer_wait_no_callback(c, nf);
941 
942 	if (!b)
943 		return NULL;
944 
945 	if (c->alloc_callback)
946 		c->alloc_callback(b);
947 
948 	return b;
949 }
950 
951 /*
952  * Free a buffer and wake other threads waiting for free buffers.
953  */
954 static void __free_buffer_wake(struct dm_buffer *b)
955 {
956 	struct dm_bufio_client *c = b->c;
957 
958 	if (!c->need_reserved_buffers)
959 		free_buffer(b);
960 	else {
961 		list_add(&b->lru_list, &c->reserved_buffers);
962 		c->need_reserved_buffers--;
963 	}
964 
965 	wake_up(&c->free_buffer_wait);
966 }
967 
968 static void __write_dirty_buffers_async(struct dm_bufio_client *c, int no_wait,
969 					struct list_head *write_list)
970 {
971 	struct dm_buffer *b, *tmp;
972 
973 	list_for_each_entry_safe_reverse(b, tmp, &c->lru[LIST_DIRTY], lru_list) {
974 		BUG_ON(test_bit(B_READING, &b->state));
975 
976 		if (!test_bit(B_DIRTY, &b->state) &&
977 		    !test_bit(B_WRITING, &b->state)) {
978 			__relink_lru(b, LIST_CLEAN);
979 			continue;
980 		}
981 
982 		if (no_wait && test_bit(B_WRITING, &b->state))
983 			return;
984 
985 		__write_dirty_buffer(b, write_list);
986 		cond_resched();
987 	}
988 }
989 
990 /*
991  * Check if we're over the writeback watermark.
992  * If the number of dirty buffers exceeds DM_BUFIO_WRITEBACK_RATIO times
993  * the number of clean buffers, start writing them back asynchronously.
994  */
995 static void __check_watermark(struct dm_bufio_client *c,
996 			      struct list_head *write_list)
997 {
998 	if (c->n_buffers[LIST_DIRTY] > c->n_buffers[LIST_CLEAN] * DM_BUFIO_WRITEBACK_RATIO)
999 		__write_dirty_buffers_async(c, 1, write_list);
1000 }
1001 
1002 /*----------------------------------------------------------------
1003  * Getting a buffer
1004  *--------------------------------------------------------------*/
1005 
1006 static struct dm_buffer *__bufio_new(struct dm_bufio_client *c, sector_t block,
1007 				     enum new_flag nf, int *need_submit,
1008 				     struct list_head *write_list)
1009 {
1010 	struct dm_buffer *b, *new_b = NULL;
1011 
1012 	*need_submit = 0;
1013 
1014 	b = __find(c, block);
1015 	if (b)
1016 		goto found_buffer;
1017 
1018 	if (nf == NF_GET)
1019 		return NULL;
1020 
1021 	new_b = __alloc_buffer_wait(c, nf);
1022 	if (!new_b)
1023 		return NULL;
1024 
1025 	/*
1026 	 * We've had a period where the mutex was unlocked, so need to
1027 	 * recheck the buffer tree.
1028 	 */
1029 	b = __find(c, block);
1030 	if (b) {
1031 		__free_buffer_wake(new_b);
1032 		goto found_buffer;
1033 	}
1034 
1035 	__check_watermark(c, write_list);
1036 
1037 	b = new_b;
1038 	b->hold_count = 1;
1039 	b->read_error = 0;
1040 	b->write_error = 0;
1041 	__link_buffer(b, block, LIST_CLEAN);
1042 
1043 	if (nf == NF_FRESH) {
1044 		b->state = 0;
1045 		return b;
1046 	}
1047 
1048 	b->state = 1 << B_READING;
1049 	*need_submit = 1;
1050 
1051 	return b;
1052 
1053 found_buffer:
1054 	if (nf == NF_PREFETCH)
1055 		return NULL;
1056 	/*
1057 	 * Note: it is essential that we don't wait for the buffer to be
1058 	 * read if the dm_bufio_get function is used. Both dm_bufio_get and
1059 	 * dm_bufio_prefetch can be used in the driver request routine.
1060 	 * If the user called both dm_bufio_prefetch and dm_bufio_get on
1061 	 * the same buffer, it would deadlock if we waited.
1062 	 */
1063 	if (nf == NF_GET && unlikely(test_bit_acquire(B_READING, &b->state)))
1064 		return NULL;
1065 
1066 	b->hold_count++;
1067 	__relink_lru(b, test_bit(B_DIRTY, &b->state) ||
1068 		     test_bit(B_WRITING, &b->state));
1069 	return b;
1070 }
1071 
1072 /*
1073  * The endio routine for reading: set the error, clear the bit and wake up
1074  * anyone waiting on the buffer.
1075  */
1076 static void read_endio(struct dm_buffer *b, blk_status_t status)
1077 {
1078 	b->read_error = status;
1079 
1080 	BUG_ON(!test_bit(B_READING, &b->state));
1081 
1082 	smp_mb__before_atomic();
1083 	clear_bit(B_READING, &b->state);
1084 	smp_mb__after_atomic();
1085 
1086 	wake_up_bit(&b->state, B_READING);
1087 }
1088 
1089 /*
1090  * A common routine for dm_bufio_new and dm_bufio_read.  Operation of these
1091  * functions is similar except that dm_bufio_new doesn't read the
1092  * buffer from the disk (assuming that the caller overwrites all the data
1093  * and uses dm_bufio_mark_buffer_dirty to write new data back).
1094  */
1095 static void *new_read(struct dm_bufio_client *c, sector_t block,
1096 		      enum new_flag nf, struct dm_buffer **bp)
1097 {
1098 	int need_submit;
1099 	struct dm_buffer *b;
1100 
1101 	LIST_HEAD(write_list);
1102 
1103 	dm_bufio_lock(c);
1104 	b = __bufio_new(c, block, nf, &need_submit, &write_list);
1105 #ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
1106 	if (b && b->hold_count == 1)
1107 		buffer_record_stack(b);
1108 #endif
1109 	dm_bufio_unlock(c);
1110 
1111 	__flush_write_list(&write_list);
1112 
1113 	if (!b)
1114 		return NULL;
1115 
1116 	if (need_submit)
1117 		submit_io(b, REQ_OP_READ, read_endio);
1118 
1119 	wait_on_bit_io(&b->state, B_READING, TASK_UNINTERRUPTIBLE);
1120 
1121 	if (b->read_error) {
1122 		int error = blk_status_to_errno(b->read_error);
1123 
1124 		dm_bufio_release(b);
1125 
1126 		return ERR_PTR(error);
1127 	}
1128 
1129 	*bp = b;
1130 
1131 	return b->data;
1132 }
1133 
1134 void *dm_bufio_get(struct dm_bufio_client *c, sector_t block,
1135 		   struct dm_buffer **bp)
1136 {
1137 	return new_read(c, block, NF_GET, bp);
1138 }
1139 EXPORT_SYMBOL_GPL(dm_bufio_get);
1140 
1141 void *dm_bufio_read(struct dm_bufio_client *c, sector_t block,
1142 		    struct dm_buffer **bp)
1143 {
1144 	BUG_ON(dm_bufio_in_request());
1145 
1146 	return new_read(c, block, NF_READ, bp);
1147 }
1148 EXPORT_SYMBOL_GPL(dm_bufio_read);
1149 
1150 void *dm_bufio_new(struct dm_bufio_client *c, sector_t block,
1151 		   struct dm_buffer **bp)
1152 {
1153 	BUG_ON(dm_bufio_in_request());
1154 
1155 	return new_read(c, block, NF_FRESH, bp);
1156 }
1157 EXPORT_SYMBOL_GPL(dm_bufio_new);
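
/*
 * A minimal read-modify-write sketch using the helpers above ("c", "block"
 * and patch_block() are hypothetical caller-side names):
 *
 *	struct dm_buffer *buf;
 *	void *data = dm_bufio_read(c, block, &buf);
 *
 *	if (IS_ERR(data))
 *		return PTR_ERR(data);
 *	patch_block(data);
 *	dm_bufio_mark_buffer_dirty(buf);
 *	dm_bufio_release(buf);
 *
 * dm_bufio_new() is used the same way when the caller will overwrite the
 * whole block, so the read from disk can be skipped.
 */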
1158 
1159 void dm_bufio_prefetch(struct dm_bufio_client *c,
1160 		       sector_t block, unsigned int n_blocks)
1161 {
1162 	struct blk_plug plug;
1163 
1164 	LIST_HEAD(write_list);
1165 
1166 	BUG_ON(dm_bufio_in_request());
1167 
1168 	blk_start_plug(&plug);
1169 	dm_bufio_lock(c);
1170 
1171 	for (; n_blocks--; block++) {
1172 		int need_submit;
1173 		struct dm_buffer *b;
1174 		b = __bufio_new(c, block, NF_PREFETCH, &need_submit,
1175 				&write_list);
1176 		if (unlikely(!list_empty(&write_list))) {
1177 			dm_bufio_unlock(c);
1178 			blk_finish_plug(&plug);
1179 			__flush_write_list(&write_list);
1180 			blk_start_plug(&plug);
1181 			dm_bufio_lock(c);
1182 		}
1183 		if (unlikely(b != NULL)) {
1184 			dm_bufio_unlock(c);
1185 
1186 			if (need_submit)
1187 				submit_io(b, REQ_OP_READ, read_endio);
1188 			dm_bufio_release(b);
1189 
1190 			cond_resched();
1191 
1192 			if (!n_blocks)
1193 				goto flush_plug;
1194 			dm_bufio_lock(c);
1195 		}
1196 	}
1197 
1198 	dm_bufio_unlock(c);
1199 
1200 flush_plug:
1201 	blk_finish_plug(&plug);
1202 }
1203 EXPORT_SYMBOL_GPL(dm_bufio_prefetch);
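
/*
 * A minimal prefetch sketch ("c", "first" and process() are hypothetical
 * caller-side names): issue the prefetch for the whole range first, then read
 * the blocks one by one; the reads then mostly find the data already cached
 * or in flight.
 *
 *	struct dm_buffer *buf;
 *	unsigned int i;
 *
 *	dm_bufio_prefetch(c, first, 16);
 *	for (i = 0; i < 16; i++) {
 *		void *data = dm_bufio_read(c, first + i, &buf);
 *
 *		if (IS_ERR(data))
 *			return PTR_ERR(data);
 *		process(data);
 *		dm_bufio_release(buf);
 *	}
 */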
1204 
1205 void dm_bufio_release(struct dm_buffer *b)
1206 {
1207 	struct dm_bufio_client *c = b->c;
1208 
1209 	dm_bufio_lock(c);
1210 
1211 	BUG_ON(!b->hold_count);
1212 
1213 	b->hold_count--;
1214 	if (!b->hold_count) {
1215 		wake_up(&c->free_buffer_wait);
1216 
1217 		/*
1218 		 * If there were errors on the buffer, and the buffer is not
1219 		 * to be written, free the buffer. There is no point in caching
1220 		 * an invalid buffer.
1221 		 */
1222 		if ((b->read_error || b->write_error) &&
1223 		    !test_bit_acquire(B_READING, &b->state) &&
1224 		    !test_bit(B_WRITING, &b->state) &&
1225 		    !test_bit(B_DIRTY, &b->state)) {
1226 			__unlink_buffer(b);
1227 			__free_buffer_wake(b);
1228 		}
1229 	}
1230 
1231 	dm_bufio_unlock(c);
1232 }
1233 EXPORT_SYMBOL_GPL(dm_bufio_release);
1234 
1235 void dm_bufio_mark_partial_buffer_dirty(struct dm_buffer *b,
1236 					unsigned int start, unsigned int end)
1237 {
1238 	struct dm_bufio_client *c = b->c;
1239 
1240 	BUG_ON(start >= end);
1241 	BUG_ON(end > b->c->block_size);
1242 
1243 	dm_bufio_lock(c);
1244 
1245 	BUG_ON(test_bit(B_READING, &b->state));
1246 
1247 	if (!test_and_set_bit(B_DIRTY, &b->state)) {
1248 		b->dirty_start = start;
1249 		b->dirty_end = end;
1250 		__relink_lru(b, LIST_DIRTY);
1251 	} else {
1252 		if (start < b->dirty_start)
1253 			b->dirty_start = start;
1254 		if (end > b->dirty_end)
1255 			b->dirty_end = end;
1256 	}
1257 
1258 	dm_bufio_unlock(c);
1259 }
1260 EXPORT_SYMBOL_GPL(dm_bufio_mark_partial_buffer_dirty);
1261 
1262 void dm_bufio_mark_buffer_dirty(struct dm_buffer *b)
1263 {
1264 	dm_bufio_mark_partial_buffer_dirty(b, 0, b->c->block_size);
1265 }
1266 EXPORT_SYMBOL_GPL(dm_bufio_mark_buffer_dirty);
1267 
1268 void dm_bufio_write_dirty_buffers_async(struct dm_bufio_client *c)
1269 {
1270 	LIST_HEAD(write_list);
1271 
1272 	BUG_ON(dm_bufio_in_request());
1273 
1274 	dm_bufio_lock(c);
1275 	__write_dirty_buffers_async(c, 0, &write_list);
1276 	dm_bufio_unlock(c);
1277 	__flush_write_list(&write_list);
1278 }
1279 EXPORT_SYMBOL_GPL(dm_bufio_write_dirty_buffers_async);
1280 
1281 /*
1282  * For performance, it is essential that the buffers are written asynchronously
1283  * and simultaneously (so that the block layer can merge the writes) and then
1284  * waited upon.
1285  *
1286  * Finally, we flush hardware disk cache.
1287  */
1288 int dm_bufio_write_dirty_buffers(struct dm_bufio_client *c)
1289 {
1290 	int a, f;
1291 	unsigned long buffers_processed = 0;
1292 	struct dm_buffer *b, *tmp;
1293 
1294 	LIST_HEAD(write_list);
1295 
1296 	dm_bufio_lock(c);
1297 	__write_dirty_buffers_async(c, 0, &write_list);
1298 	dm_bufio_unlock(c);
1299 	__flush_write_list(&write_list);
1300 	dm_bufio_lock(c);
1301 
1302 again:
1303 	list_for_each_entry_safe_reverse(b, tmp, &c->lru[LIST_DIRTY], lru_list) {
1304 		int dropped_lock = 0;
1305 
1306 		if (buffers_processed < c->n_buffers[LIST_DIRTY])
1307 			buffers_processed++;
1308 
1309 		BUG_ON(test_bit(B_READING, &b->state));
1310 
1311 		if (test_bit(B_WRITING, &b->state)) {
1312 			if (buffers_processed < c->n_buffers[LIST_DIRTY]) {
1313 				dropped_lock = 1;
1314 				b->hold_count++;
1315 				dm_bufio_unlock(c);
1316 				wait_on_bit_io(&b->state, B_WRITING,
1317 					       TASK_UNINTERRUPTIBLE);
1318 				dm_bufio_lock(c);
1319 				b->hold_count--;
1320 			} else
1321 				wait_on_bit_io(&b->state, B_WRITING,
1322 					       TASK_UNINTERRUPTIBLE);
1323 		}
1324 
1325 		if (!test_bit(B_DIRTY, &b->state) &&
1326 		    !test_bit(B_WRITING, &b->state))
1327 			__relink_lru(b, LIST_CLEAN);
1328 
1329 		cond_resched();
1330 
1331 		/*
1332 		 * If we dropped the lock, the list is no longer consistent,
1333 		 * so we must restart the search.
1334 		 *
1335 		 * In the most common case, the buffer just processed is
1336 		 * relinked to the clean list, so we won't loop scanning the
1337 		 * same buffer again and again.
1338 		 *
1339 		 * This may livelock if there is another thread simultaneously
1340 		 * dirtying buffers, so we count the number of buffers walked
1341 		 * and if it exceeds the total number of buffers, it means that
1342 		 * someone is doing some writes simultaneously with us.  In
1343 		 * this case, stop dropping the lock and wait with the lock held.
1344 		 */
1345 		if (dropped_lock)
1346 			goto again;
1347 	}
1348 	wake_up(&c->free_buffer_wait);
1349 	dm_bufio_unlock(c);
1350 
1351 	a = xchg(&c->async_write_error, 0);
1352 	f = dm_bufio_issue_flush(c);
1353 	if (a)
1354 		return a;
1355 
1356 	return f;
1357 }
1358 EXPORT_SYMBOL_GPL(dm_bufio_write_dirty_buffers);
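
/*
 * A minimal commit sketch ("c", "buf" and "r" are hypothetical caller-side
 * names): buffers are dirtied and released first, then written out in one go
 * so the block layer can merge the writes; dm_bufio_write_dirty_buffers()
 * also flushes the disk cache via dm_bufio_issue_flush() before returning.
 *
 *	dm_bufio_mark_buffer_dirty(buf);
 *	dm_bufio_release(buf);
 *	...
 *	r = dm_bufio_write_dirty_buffers(c);
 *	if (r)
 *		return r;
 */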
1359 
1360 /*
1361  * Use dm-io to send an empty barrier to flush the device.
1362  */
1363 int dm_bufio_issue_flush(struct dm_bufio_client *c)
1364 {
1365 	struct dm_io_request io_req = {
1366 		.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC,
1367 		.mem.type = DM_IO_KMEM,
1368 		.mem.ptr.addr = NULL,
1369 		.client = c->dm_io,
1370 	};
1371 	struct dm_io_region io_reg = {
1372 		.bdev = c->bdev,
1373 		.sector = 0,
1374 		.count = 0,
1375 	};
1376 
1377 	BUG_ON(dm_bufio_in_request());
1378 
1379 	return dm_io(&io_req, 1, &io_reg, NULL);
1380 }
1381 EXPORT_SYMBOL_GPL(dm_bufio_issue_flush);
1382 
1383 /*
1384  * Use dm-io to send a discard request to the device.
1385  */
1386 int dm_bufio_issue_discard(struct dm_bufio_client *c, sector_t block, sector_t count)
1387 {
1388 	struct dm_io_request io_req = {
1389 		.bi_opf = REQ_OP_DISCARD | REQ_SYNC,
1390 		.mem.type = DM_IO_KMEM,
1391 		.mem.ptr.addr = NULL,
1392 		.client = c->dm_io,
1393 	};
1394 	struct dm_io_region io_reg = {
1395 		.bdev = c->bdev,
1396 		.sector = block_to_sector(c, block),
1397 		.count = block_to_sector(c, count),
1398 	};
1399 
1400 	BUG_ON(dm_bufio_in_request());
1401 
1402 	return dm_io(&io_req, 1, &io_reg, NULL);
1403 }
1404 EXPORT_SYMBOL_GPL(dm_bufio_issue_discard);
1405 
1406 /*
1407  * We first delete any other buffer that may be at that new location.
1408  *
1409  * Then, we write the buffer to the original location if it was dirty.
1410  *
1411  * Then, if we are the only one who is holding the buffer, relink the buffer
1412  * in the buffer tree for the new location.
1413  *
1414  * If there was someone else holding the buffer, we write it to the new
1415  * location but not relink it, because that other user needs to have the buffer
1416  * at the same place.
1417  */
1418 void dm_bufio_release_move(struct dm_buffer *b, sector_t new_block)
1419 {
1420 	struct dm_bufio_client *c = b->c;
1421 	struct dm_buffer *new;
1422 
1423 	BUG_ON(dm_bufio_in_request());
1424 
1425 	dm_bufio_lock(c);
1426 
1427 retry:
1428 	new = __find(c, new_block);
1429 	if (new) {
1430 		if (new->hold_count) {
1431 			__wait_for_free_buffer(c);
1432 			goto retry;
1433 		}
1434 
1435 		/*
1436 		 * FIXME: Is there any point waiting for a write that's going
1437 		 * to be overwritten in a bit?
1438 		 */
1439 		__make_buffer_clean(new);
1440 		__unlink_buffer(new);
1441 		__free_buffer_wake(new);
1442 	}
1443 
1444 	BUG_ON(!b->hold_count);
1445 	BUG_ON(test_bit(B_READING, &b->state));
1446 
1447 	__write_dirty_buffer(b, NULL);
1448 	if (b->hold_count == 1) {
1449 		wait_on_bit_io(&b->state, B_WRITING,
1450 			       TASK_UNINTERRUPTIBLE);
1451 		set_bit(B_DIRTY, &b->state);
1452 		b->dirty_start = 0;
1453 		b->dirty_end = c->block_size;
1454 		__unlink_buffer(b);
1455 		__link_buffer(b, new_block, LIST_DIRTY);
1456 	} else {
1457 		sector_t old_block;
1458 		wait_on_bit_lock_io(&b->state, B_WRITING,
1459 				    TASK_UNINTERRUPTIBLE);
1460 		/*
1461 		 * Relink buffer to "new_block" so that write_callback
1462 		 * sees "new_block" as a block number.
1463 		 * After the write, link the buffer back to old_block.
1464 		 * All this must be done under the bufio lock, so that the block number
1465 		 * change isn't visible to other threads.
1466 		 */
1467 		old_block = b->block;
1468 		__unlink_buffer(b);
1469 		__link_buffer(b, new_block, b->list_mode);
1470 		submit_io(b, REQ_OP_WRITE, write_endio);
1471 		wait_on_bit_io(&b->state, B_WRITING,
1472 			       TASK_UNINTERRUPTIBLE);
1473 		__unlink_buffer(b);
1474 		__link_buffer(b, old_block, b->list_mode);
1475 	}
1476 
1477 	dm_bufio_unlock(c);
1478 	dm_bufio_release(b);
1479 }
1480 EXPORT_SYMBOL_GPL(dm_bufio_release_move);
1481 
1482 static void forget_buffer_locked(struct dm_buffer *b)
1483 {
1484 	if (likely(!b->hold_count) && likely(!smp_load_acquire(&b->state))) {
1485 		__unlink_buffer(b);
1486 		__free_buffer_wake(b);
1487 	}
1488 }
1489 
1490 /*
1491  * Free the given buffer.
1492  *
1493  * This is just a hint; if the buffer is in use or dirty, this function
1494  * does nothing.
1495  */
1496 void dm_bufio_forget(struct dm_bufio_client *c, sector_t block)
1497 {
1498 	struct dm_buffer *b;
1499 
1500 	dm_bufio_lock(c);
1501 
1502 	b = __find(c, block);
1503 	if (b)
1504 		forget_buffer_locked(b);
1505 
1506 	dm_bufio_unlock(c);
1507 }
1508 EXPORT_SYMBOL_GPL(dm_bufio_forget);
1509 
1510 void dm_bufio_forget_buffers(struct dm_bufio_client *c, sector_t block, sector_t n_blocks)
1511 {
1512 	struct dm_buffer *b;
1513 	sector_t end_block = block + n_blocks;
1514 
1515 	while (block < end_block) {
1516 		dm_bufio_lock(c);
1517 
1518 		b = __find_next(c, block);
1519 		if (b) {
1520 			block = b->block + 1;
1521 			forget_buffer_locked(b);
1522 		}
1523 
1524 		dm_bufio_unlock(c);
1525 
1526 		if (!b)
1527 			break;
1528 	}
1529 
1530 }
1531 EXPORT_SYMBOL_GPL(dm_bufio_forget_buffers);
1532 
1533 void dm_bufio_set_minimum_buffers(struct dm_bufio_client *c, unsigned int n)
1534 {
1535 	c->minimum_buffers = n;
1536 }
1537 EXPORT_SYMBOL_GPL(dm_bufio_set_minimum_buffers);
1538 
1539 unsigned int dm_bufio_get_block_size(struct dm_bufio_client *c)
1540 {
1541 	return c->block_size;
1542 }
1543 EXPORT_SYMBOL_GPL(dm_bufio_get_block_size);
1544 
1545 sector_t dm_bufio_get_device_size(struct dm_bufio_client *c)
1546 {
1547 	sector_t s = bdev_nr_sectors(c->bdev);
1548 	if (s >= c->start)
1549 		s -= c->start;
1550 	else
1551 		s = 0;
1552 	if (likely(c->sectors_per_block_bits >= 0))
1553 		s >>= c->sectors_per_block_bits;
1554 	else
1555 		sector_div(s, c->block_size >> SECTOR_SHIFT);
1556 	return s;
1557 }
1558 EXPORT_SYMBOL_GPL(dm_bufio_get_device_size);
1559 
1560 struct dm_io_client *dm_bufio_get_dm_io_client(struct dm_bufio_client *c)
1561 {
1562 	return c->dm_io;
1563 }
1564 EXPORT_SYMBOL_GPL(dm_bufio_get_dm_io_client);
1565 
1566 sector_t dm_bufio_get_block_number(struct dm_buffer *b)
1567 {
1568 	return b->block;
1569 }
1570 EXPORT_SYMBOL_GPL(dm_bufio_get_block_number);
1571 
1572 void *dm_bufio_get_block_data(struct dm_buffer *b)
1573 {
1574 	return b->data;
1575 }
1576 EXPORT_SYMBOL_GPL(dm_bufio_get_block_data);
1577 
1578 void *dm_bufio_get_aux_data(struct dm_buffer *b)
1579 {
1580 	return b + 1;
1581 }
1582 EXPORT_SYMBOL_GPL(dm_bufio_get_aux_data);
1583 
1584 struct dm_bufio_client *dm_bufio_get_client(struct dm_buffer *b)
1585 {
1586 	return b->c;
1587 }
1588 EXPORT_SYMBOL_GPL(dm_bufio_get_client);
1589 
1590 static void drop_buffers(struct dm_bufio_client *c)
1591 {
1592 	struct dm_buffer *b;
1593 	int i;
1594 	bool warned = false;
1595 
1596 	BUG_ON(dm_bufio_in_request());
1597 
1598 	/*
1599 	 * An optimization so that the buffers are not written one-by-one.
1600 	 */
1601 	dm_bufio_write_dirty_buffers_async(c);
1602 
1603 	dm_bufio_lock(c);
1604 
1605 	while ((b = __get_unclaimed_buffer(c)))
1606 		__free_buffer_wake(b);
1607 
1608 	for (i = 0; i < LIST_SIZE; i++)
1609 		list_for_each_entry(b, &c->lru[i], lru_list) {
1610 			WARN_ON(!warned);
1611 			warned = true;
1612 			DMERR("leaked buffer %llx, hold count %u, list %d",
1613 			      (unsigned long long)b->block, b->hold_count, i);
1614 #ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
1615 			stack_trace_print(b->stack_entries, b->stack_len, 1);
1616 			/* mark unclaimed to avoid BUG_ON below */
1617 			b->hold_count = 0;
1618 #endif
1619 		}
1620 
1621 #ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
1622 	while ((b = __get_unclaimed_buffer(c)))
1623 		__free_buffer_wake(b);
1624 #endif
1625 
1626 	for (i = 0; i < LIST_SIZE; i++)
1627 		BUG_ON(!list_empty(&c->lru[i]));
1628 
1629 	dm_bufio_unlock(c);
1630 }
1631 
1632 /*
1633  * We may not be able to evict this buffer if I/O is pending or the client
1634  * is still using it.  The caller is expected to know the buffer is too old.
1635  *
1636  * And if GFP_NOFS is used, we must not do any I/O because we hold
1637  * dm_bufio_clients_lock and we would risk deadlock if the I/O gets
1638  * rerouted to a different bufio client.
1639  */
1640 static bool __try_evict_buffer(struct dm_buffer *b, gfp_t gfp)
1641 {
1642 	if (!(gfp & __GFP_FS) ||
1643 	    (static_branch_unlikely(&no_sleep_enabled) && b->c->no_sleep)) {
1644 		if (test_bit_acquire(B_READING, &b->state) ||
1645 		    test_bit(B_WRITING, &b->state) ||
1646 		    test_bit(B_DIRTY, &b->state))
1647 			return false;
1648 	}
1649 
1650 	if (b->hold_count)
1651 		return false;
1652 
1653 	__make_buffer_clean(b);
1654 	__unlink_buffer(b);
1655 	__free_buffer_wake(b);
1656 
1657 	return true;
1658 }
1659 
1660 static unsigned long get_retain_buffers(struct dm_bufio_client *c)
1661 {
1662 	unsigned long retain_bytes = READ_ONCE(dm_bufio_retain_bytes);
1663 	if (likely(c->sectors_per_block_bits >= 0))
1664 		retain_bytes >>= c->sectors_per_block_bits + SECTOR_SHIFT;
1665 	else
1666 		retain_bytes /= c->block_size;
1667 	return retain_bytes;
1668 }
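
/*
 * Worked example with the defaults: dm_bufio_retain_bytes is 256 KiB, so with
 * a 4 KiB block size (sectors_per_block_bits == 3) the retain target is
 * 256 KiB >> (3 + 9) = 64 buffers per client.
 */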
1669 
1670 static void __scan(struct dm_bufio_client *c)
1671 {
1672 	int l;
1673 	struct dm_buffer *b, *tmp;
1674 	unsigned long freed = 0;
1675 	unsigned long count = c->n_buffers[LIST_CLEAN] +
1676 			      c->n_buffers[LIST_DIRTY];
1677 	unsigned long retain_target = get_retain_buffers(c);
1678 
1679 	for (l = 0; l < LIST_SIZE; l++) {
1680 		list_for_each_entry_safe_reverse(b, tmp, &c->lru[l], lru_list) {
1681 			if (count - freed <= retain_target)
1682 				atomic_long_set(&c->need_shrink, 0);
1683 			if (!atomic_long_read(&c->need_shrink))
1684 				return;
1685 			if (__try_evict_buffer(b, GFP_KERNEL)) {
1686 				atomic_long_dec(&c->need_shrink);
1687 				freed++;
1688 			}
1689 			cond_resched();
1690 		}
1691 	}
1692 }
1693 
1694 static void shrink_work(struct work_struct *w)
1695 {
1696 	struct dm_bufio_client *c = container_of(w, struct dm_bufio_client, shrink_work);
1697 
1698 	dm_bufio_lock(c);
1699 	__scan(c);
1700 	dm_bufio_unlock(c);
1701 }
1702 
1703 static unsigned long dm_bufio_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
1704 {
1705 	struct dm_bufio_client *c;
1706 
1707 	c = container_of(shrink, struct dm_bufio_client, shrinker);
1708 	atomic_long_add(sc->nr_to_scan, &c->need_shrink);
1709 	queue_work(dm_bufio_wq, &c->shrink_work);
1710 
1711 	return sc->nr_to_scan;
1712 }
1713 
1714 static unsigned long dm_bufio_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
1715 {
1716 	struct dm_bufio_client *c = container_of(shrink, struct dm_bufio_client, shrinker);
1717 	unsigned long count = READ_ONCE(c->n_buffers[LIST_CLEAN]) +
1718 			      READ_ONCE(c->n_buffers[LIST_DIRTY]);
1719 	unsigned long retain_target = get_retain_buffers(c);
1720 	unsigned long queued_for_cleanup = atomic_long_read(&c->need_shrink);
1721 
1722 	if (unlikely(count < retain_target))
1723 		count = 0;
1724 	else
1725 		count -= retain_target;
1726 
1727 	if (unlikely(count < queued_for_cleanup))
1728 		count = 0;
1729 	else
1730 		count -= queued_for_cleanup;
1731 
1732 	return count;
1733 }
1734 
1735 /*
1736  * Create the buffering interface
1737  */
1738 struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsigned int block_size,
1739 					       unsigned int reserved_buffers, unsigned int aux_size,
1740 					       void (*alloc_callback)(struct dm_buffer *),
1741 					       void (*write_callback)(struct dm_buffer *),
1742 					       unsigned int flags)
1743 {
1744 	int r;
1745 	struct dm_bufio_client *c;
1746 	unsigned int i;
1747 	char slab_name[27];
1748 
1749 	if (!block_size || block_size & ((1 << SECTOR_SHIFT) - 1)) {
1750 		DMERR("%s: block size not specified or is not multiple of 512b", __func__);
1751 		r = -EINVAL;
1752 		goto bad_client;
1753 	}
1754 
1755 	c = kzalloc(sizeof(*c), GFP_KERNEL);
1756 	if (!c) {
1757 		r = -ENOMEM;
1758 		goto bad_client;
1759 	}
1760 	c->buffer_tree = RB_ROOT;
1761 
1762 	c->bdev = bdev;
1763 	c->block_size = block_size;
1764 	if (is_power_of_2(block_size))
1765 		c->sectors_per_block_bits = __ffs(block_size) - SECTOR_SHIFT;
1766 	else
1767 		c->sectors_per_block_bits = -1;
1768 
1769 	c->alloc_callback = alloc_callback;
1770 	c->write_callback = write_callback;
1771 
1772 	if (flags & DM_BUFIO_CLIENT_NO_SLEEP) {
1773 		c->no_sleep = true;
1774 		static_branch_inc(&no_sleep_enabled);
1775 	}
1776 
1777 	for (i = 0; i < LIST_SIZE; i++) {
1778 		INIT_LIST_HEAD(&c->lru[i]);
1779 		c->n_buffers[i] = 0;
1780 	}
1781 
1782 	mutex_init(&c->lock);
1783 	spin_lock_init(&c->spinlock);
1784 	INIT_LIST_HEAD(&c->reserved_buffers);
1785 	c->need_reserved_buffers = reserved_buffers;
1786 
1787 	dm_bufio_set_minimum_buffers(c, DM_BUFIO_MIN_BUFFERS);
1788 
1789 	init_waitqueue_head(&c->free_buffer_wait);
1790 	c->async_write_error = 0;
1791 
1792 	c->dm_io = dm_io_client_create();
1793 	if (IS_ERR(c->dm_io)) {
1794 		r = PTR_ERR(c->dm_io);
1795 		goto bad_dm_io;
1796 	}
1797 
1798 	if (block_size <= KMALLOC_MAX_SIZE &&
1799 	    (block_size < PAGE_SIZE || !is_power_of_2(block_size))) {
1800 		unsigned int align = min(1U << __ffs(block_size), (unsigned int)PAGE_SIZE);
1801 		snprintf(slab_name, sizeof slab_name, "dm_bufio_cache-%u", block_size);
1802 		c->slab_cache = kmem_cache_create(slab_name, block_size, align,
1803 						  SLAB_RECLAIM_ACCOUNT, NULL);
1804 		if (!c->slab_cache) {
1805 			r = -ENOMEM;
1806 			goto bad;
1807 		}
1808 	}
1809 	if (aux_size)
1810 		snprintf(slab_name, sizeof slab_name, "dm_bufio_buffer-%u", aux_size);
1811 	else
1812 		snprintf(slab_name, sizeof slab_name, "dm_bufio_buffer");
1813 	c->slab_buffer = kmem_cache_create(slab_name, sizeof(struct dm_buffer) + aux_size,
1814 					   0, SLAB_RECLAIM_ACCOUNT, NULL);
1815 	if (!c->slab_buffer) {
1816 		r = -ENOMEM;
1817 		goto bad;
1818 	}
1819 
1820 	while (c->need_reserved_buffers) {
1821 		struct dm_buffer *b = alloc_buffer(c, GFP_KERNEL);
1822 
1823 		if (!b) {
1824 			r = -ENOMEM;
1825 			goto bad;
1826 		}
1827 		__free_buffer_wake(b);
1828 	}
1829 
1830 	INIT_WORK(&c->shrink_work, shrink_work);
1831 	atomic_long_set(&c->need_shrink, 0);
1832 
1833 	c->shrinker.count_objects = dm_bufio_shrink_count;
1834 	c->shrinker.scan_objects = dm_bufio_shrink_scan;
1835 	c->shrinker.seeks = 1;
1836 	c->shrinker.batch = 0;
1837 	r = register_shrinker(&c->shrinker, "dm-bufio:(%u:%u)",
1838 			      MAJOR(bdev->bd_dev), MINOR(bdev->bd_dev));
1839 	if (r)
1840 		goto bad;
1841 
1842 	mutex_lock(&dm_bufio_clients_lock);
1843 	dm_bufio_client_count++;
1844 	list_add(&c->client_list, &dm_bufio_all_clients);
1845 	__cache_size_refresh();
1846 	mutex_unlock(&dm_bufio_clients_lock);
1847 
1848 	return c;
1849 
1850 bad:
1851 	while (!list_empty(&c->reserved_buffers)) {
1852 		struct dm_buffer *b = list_entry(c->reserved_buffers.next,
1853 						 struct dm_buffer, lru_list);
1854 		list_del(&b->lru_list);
1855 		free_buffer(b);
1856 	}
1857 	kmem_cache_destroy(c->slab_cache);
1858 	kmem_cache_destroy(c->slab_buffer);
1859 	dm_io_client_destroy(c->dm_io);
1860 bad_dm_io:
1861 	mutex_destroy(&c->lock);
1862 	if (c->no_sleep)
1863 		static_branch_dec(&no_sleep_enabled);
1864 	kfree(c);
1865 bad_client:
1866 	return ERR_PTR(r);
1867 }
1868 EXPORT_SYMBOL_GPL(dm_bufio_client_create);
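
/*
 * A minimal client lifecycle sketch (hypothetical caller; 4 KiB blocks, one
 * reserved buffer, no aux data, no callbacks, default flags):
 *
 *	struct dm_bufio_client *c;
 *
 *	c = dm_bufio_client_create(bdev, 4096, 1, 0, NULL, NULL, 0);
 *	if (IS_ERR(c))
 *		return PTR_ERR(c);
 *	...
 *	dm_bufio_client_destroy(c);
 */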
1869 
1870 /*
1871  * Free the buffering interface.
1872  * It is required that there are no references on any buffers.
1873  */
1874 void dm_bufio_client_destroy(struct dm_bufio_client *c)
1875 {
1876 	unsigned int i;
1877 
1878 	drop_buffers(c);
1879 
1880 	unregister_shrinker(&c->shrinker);
1881 	flush_work(&c->shrink_work);
1882 
1883 	mutex_lock(&dm_bufio_clients_lock);
1884 
1885 	list_del(&c->client_list);
1886 	dm_bufio_client_count--;
1887 	__cache_size_refresh();
1888 
1889 	mutex_unlock(&dm_bufio_clients_lock);
1890 
1891 	BUG_ON(!RB_EMPTY_ROOT(&c->buffer_tree));
1892 	BUG_ON(c->need_reserved_buffers);
1893 
1894 	while (!list_empty(&c->reserved_buffers)) {
1895 		struct dm_buffer *b = list_entry(c->reserved_buffers.next,
1896 						 struct dm_buffer, lru_list);
1897 		list_del(&b->lru_list);
1898 		free_buffer(b);
1899 	}
1900 
1901 	for (i = 0; i < LIST_SIZE; i++)
1902 		if (c->n_buffers[i])
1903 			DMERR("leaked buffer count %d: %ld", i, c->n_buffers[i]);
1904 
1905 	for (i = 0; i < LIST_SIZE; i++)
1906 		BUG_ON(c->n_buffers[i]);
1907 
1908 	kmem_cache_destroy(c->slab_cache);
1909 	kmem_cache_destroy(c->slab_buffer);
1910 	dm_io_client_destroy(c->dm_io);
1911 	mutex_destroy(&c->lock);
1912 	if (c->no_sleep)
1913 		static_branch_dec(&no_sleep_enabled);
1914 	kfree(c);
1915 }
1916 EXPORT_SYMBOL_GPL(dm_bufio_client_destroy);
1917 
1918 void dm_bufio_set_sector_offset(struct dm_bufio_client *c, sector_t start)
1919 {
1920 	c->start = start;
1921 }
1922 EXPORT_SYMBOL_GPL(dm_bufio_set_sector_offset);
1923 
1924 static unsigned int get_max_age_hz(void)
1925 {
1926 	unsigned int max_age = READ_ONCE(dm_bufio_max_age);
1927 
1928 	if (max_age > UINT_MAX / HZ)
1929 		max_age = UINT_MAX / HZ;
1930 
1931 	return max_age * HZ;
1932 }
1933 
1934 static bool older_than(struct dm_buffer *b, unsigned long age_hz)
1935 {
1936 	return time_after_eq(jiffies, b->last_accessed + age_hz);
1937 }
1938 
1939 static void __evict_old_buffers(struct dm_bufio_client *c, unsigned long age_hz)
1940 {
1941 	struct dm_buffer *b, *tmp;
1942 	unsigned long retain_target = get_retain_buffers(c);
1943 	unsigned long count;
1944 	LIST_HEAD(write_list);
1945 
1946 	dm_bufio_lock(c);
1947 
1948 	__check_watermark(c, &write_list);
1949 	if (unlikely(!list_empty(&write_list))) {
1950 		dm_bufio_unlock(c);
1951 		__flush_write_list(&write_list);
1952 		dm_bufio_lock(c);
1953 	}
1954 
1955 	count = c->n_buffers[LIST_CLEAN] + c->n_buffers[LIST_DIRTY];
1956 	list_for_each_entry_safe_reverse(b, tmp, &c->lru[LIST_CLEAN], lru_list) {
1957 		if (count <= retain_target)
1958 			break;
1959 
1960 		if (!older_than(b, age_hz))
1961 			break;
1962 
1963 		if (__try_evict_buffer(b, 0))
1964 			count--;
1965 
1966 		cond_resched();
1967 	}
1968 
1969 	dm_bufio_unlock(c);
1970 }
1971 
1972 static void do_global_cleanup(struct work_struct *w)
1973 {
1974 	struct dm_bufio_client *locked_client = NULL;
1975 	struct dm_bufio_client *current_client;
1976 	struct dm_buffer *b;
1977 	unsigned int spinlock_hold_count;
1978 	unsigned long threshold = dm_bufio_cache_size -
1979 		dm_bufio_cache_size / DM_BUFIO_LOW_WATERMARK_RATIO;
1980 	unsigned long loops = global_num * 2;
1981 
1982 	mutex_lock(&dm_bufio_clients_lock);
1983 
1984 	while (1) {
1985 		cond_resched();
1986 
1987 		spin_lock(&global_spinlock);
1988 		if (unlikely(dm_bufio_current_allocated <= threshold))
1989 			break;
1990 
1991 		spinlock_hold_count = 0;
1992 get_next:
1993 		if (!loops--)
1994 			break;
1995 		if (unlikely(list_empty(&global_queue)))
1996 			break;
1997 		b = list_entry(global_queue.prev, struct dm_buffer, global_list);
1998 
1999 		if (b->accessed) {
2000 			b->accessed = 0;
2001 			list_move(&b->global_list, &global_queue);
2002 			if (likely(++spinlock_hold_count < 16))
2003 				goto get_next;
2004 			spin_unlock(&global_spinlock);
2005 			continue;
2006 		}
2007 
2008 		current_client = b->c;
2009 		if (unlikely(current_client != locked_client)) {
2010 			if (locked_client)
2011 				dm_bufio_unlock(locked_client);
2012 
2013 			if (!dm_bufio_trylock(current_client)) {
2014 				spin_unlock(&global_spinlock);
2015 				dm_bufio_lock(current_client);
2016 				locked_client = current_client;
2017 				continue;
2018 			}
2019 
2020 			locked_client = current_client;
2021 		}
2022 
2023 		spin_unlock(&global_spinlock);
2024 
2025 		if (unlikely(!__try_evict_buffer(b, GFP_KERNEL))) {
2026 			spin_lock(&global_spinlock);
2027 			list_move(&b->global_list, &global_queue);
2028 			spin_unlock(&global_spinlock);
2029 		}
2030 	}
2031 
2032 	spin_unlock(&global_spinlock);
2033 
2034 	if (locked_client)
2035 		dm_bufio_unlock(locked_client);
2036 
2037 	mutex_unlock(&dm_bufio_clients_lock);
2038 }
2039 
2040 static void cleanup_old_buffers(void)
2041 {
2042 	unsigned long max_age_hz = get_max_age_hz();
2043 	struct dm_bufio_client *c;
2044 
2045 	mutex_lock(&dm_bufio_clients_lock);
2046 
2047 	__cache_size_refresh();
2048 
2049 	list_for_each_entry(c, &dm_bufio_all_clients, client_list)
2050 		__evict_old_buffers(c, max_age_hz);
2051 
2052 	mutex_unlock(&dm_bufio_clients_lock);
2053 }
2054 
2055 static void work_fn(struct work_struct *w)
2056 {
2057 	cleanup_old_buffers();
2058 
2059 	queue_delayed_work(dm_bufio_wq, &dm_bufio_cleanup_old_work,
2060 			   DM_BUFIO_WORK_TIMER_SECS * HZ);
2061 }
2062 
2063 /*----------------------------------------------------------------
2064  * Module setup
2065  *--------------------------------------------------------------*/
2066 
2067 /*
2068  * This is called only once for the whole dm_bufio module.
2069  * It initializes the memory limit.
2070  */
2071 static int __init dm_bufio_init(void)
2072 {
2073 	__u64 mem;
2074 
2075 	dm_bufio_allocated_kmem_cache = 0;
2076 	dm_bufio_allocated_get_free_pages = 0;
2077 	dm_bufio_allocated_vmalloc = 0;
2078 	dm_bufio_current_allocated = 0;
2079 
2080 	mem = (__u64)mult_frac(totalram_pages() - totalhigh_pages(),
2081 			       DM_BUFIO_MEMORY_PERCENT, 100) << PAGE_SHIFT;
2082 
2083 	if (mem > ULONG_MAX)
2084 		mem = ULONG_MAX;
2085 
2086 #ifdef CONFIG_MMU
2087 	if (mem > mult_frac(VMALLOC_TOTAL, DM_BUFIO_VMALLOC_PERCENT, 100))
2088 		mem = mult_frac(VMALLOC_TOTAL, DM_BUFIO_VMALLOC_PERCENT, 100);
2089 #endif
2090 
2091 	dm_bufio_default_cache_size = mem;
2092 
2093 	mutex_lock(&dm_bufio_clients_lock);
2094 	__cache_size_refresh();
2095 	mutex_unlock(&dm_bufio_clients_lock);
2096 
2097 	dm_bufio_wq = alloc_workqueue("dm_bufio_cache", WQ_MEM_RECLAIM, 0);
2098 	if (!dm_bufio_wq)
2099 		return -ENOMEM;
2100 
2101 	INIT_DELAYED_WORK(&dm_bufio_cleanup_old_work, work_fn);
2102 	INIT_WORK(&dm_bufio_replacement_work, do_global_cleanup);
2103 	queue_delayed_work(dm_bufio_wq, &dm_bufio_cleanup_old_work,
2104 			   DM_BUFIO_WORK_TIMER_SECS * HZ);
2105 
2106 	return 0;
2107 }
2108 
2109 /*
2110  * This is called once when unloading the dm_bufio module.
2111  */
2112 static void __exit dm_bufio_exit(void)
2113 {
2114 	int bug = 0;
2115 
2116 	cancel_delayed_work_sync(&dm_bufio_cleanup_old_work);
2117 	destroy_workqueue(dm_bufio_wq);
2118 
2119 	if (dm_bufio_client_count) {
2120 		DMCRIT("%s: dm_bufio_client_count leaked: %d",
2121 			__func__, dm_bufio_client_count);
2122 		bug = 1;
2123 	}
2124 
2125 	if (dm_bufio_current_allocated) {
2126 		DMCRIT("%s: dm_bufio_current_allocated leaked: %lu",
2127 			__func__, dm_bufio_current_allocated);
2128 		bug = 1;
2129 	}
2130 
2131 	if (dm_bufio_allocated_get_free_pages) {
2132 		DMCRIT("%s: dm_bufio_allocated_get_free_pages leaked: %lu",
2133 		       __func__, dm_bufio_allocated_get_free_pages);
2134 		bug = 1;
2135 	}
2136 
2137 	if (dm_bufio_allocated_vmalloc) {
2138 		DMCRIT("%s: dm_bufio_vmalloc leaked: %lu",
2139 		       __func__, dm_bufio_allocated_vmalloc);
2140 		bug = 1;
2141 	}
2142 
2143 	BUG_ON(bug);
2144 }
2145 
2146 module_init(dm_bufio_init)
2147 module_exit(dm_bufio_exit)
2148 
2149 module_param_named(max_cache_size_bytes, dm_bufio_cache_size, ulong, S_IRUGO | S_IWUSR);
2150 MODULE_PARM_DESC(max_cache_size_bytes, "Size of metadata cache");
2151 
2152 module_param_named(max_age_seconds, dm_bufio_max_age, uint, S_IRUGO | S_IWUSR);
2153 MODULE_PARM_DESC(max_age_seconds, "Max age of a buffer in seconds");
2154 
2155 module_param_named(retain_bytes, dm_bufio_retain_bytes, ulong, S_IRUGO | S_IWUSR);
2156 MODULE_PARM_DESC(retain_bytes, "Try to keep at least this many bytes cached in memory");
2157 
2158 module_param_named(peak_allocated_bytes, dm_bufio_peak_allocated, ulong, S_IRUGO | S_IWUSR);
2159 MODULE_PARM_DESC(peak_allocated_bytes, "Tracks the maximum allocated memory");
2160 
2161 module_param_named(allocated_kmem_cache_bytes, dm_bufio_allocated_kmem_cache, ulong, S_IRUGO);
2162 MODULE_PARM_DESC(allocated_kmem_cache_bytes, "Memory allocated with kmem_cache_alloc");
2163 
2164 module_param_named(allocated_get_free_pages_bytes, dm_bufio_allocated_get_free_pages, ulong, S_IRUGO);
2165 MODULE_PARM_DESC(allocated_get_free_pages_bytes, "Memory allocated with get_free_pages");
2166 
2167 module_param_named(allocated_vmalloc_bytes, dm_bufio_allocated_vmalloc, ulong, S_IRUGO);
2168 MODULE_PARM_DESC(allocated_vmalloc_bytes, "Memory allocated with vmalloc");
2169 
2170 module_param_named(current_allocated_bytes, dm_bufio_current_allocated, ulong, S_IRUGO);
2171 MODULE_PARM_DESC(current_allocated_bytes, "Memory currently used by the cache");
2172 
2173 MODULE_AUTHOR("Mikulas Patocka <dm-devel@redhat.com>");
2174 MODULE_DESCRIPTION(DM_NAME " buffered I/O library");
2175 MODULE_LICENSE("GPL");
2176