xref: /openbmc/linux/drivers/md/dm-bufio.c (revision 6fbeb0048e6b93f7b7f195864f3ddc876ac4d42e)
1 /*
2  * Copyright (C) 2009-2011 Red Hat, Inc.
3  *
4  * Author: Mikulas Patocka <mpatocka@redhat.com>
5  *
6  * This file is released under the GPL.
7  */
8 
9 #include <linux/dm-bufio.h>
10 
11 #include <linux/device-mapper.h>
12 #include <linux/dm-io.h>
13 #include <linux/slab.h>
14 #include <linux/sched/mm.h>
15 #include <linux/jiffies.h>
16 #include <linux/vmalloc.h>
17 #include <linux/shrinker.h>
18 #include <linux/module.h>
19 #include <linux/rbtree.h>
20 #include <linux/stacktrace.h>
21 
22 #define DM_MSG_PREFIX "bufio"
23 
24 /*
25  * Memory management policy:
26  *	Limit the number of buffers to DM_BUFIO_MEMORY_PERCENT of main memory
27  *	or DM_BUFIO_VMALLOC_PERCENT of vmalloc memory (whichever is lower).
28  *	Always allocate at least DM_BUFIO_MIN_BUFFERS buffers.
29  *	Start background writeback when the number of dirty buffers
30  *	exceeds DM_BUFIO_WRITEBACK_RATIO times the number of clean buffers.
31  */
32 #define DM_BUFIO_MIN_BUFFERS		8
33 
34 #define DM_BUFIO_MEMORY_PERCENT		2
35 #define DM_BUFIO_VMALLOC_PERCENT	25
36 #define DM_BUFIO_WRITEBACK_RATIO	3
37 #define DM_BUFIO_LOW_WATERMARK_RATIO	16
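/*
 * How the two ratios above are used in this file:
 * DM_BUFIO_WRITEBACK_RATIO - __check_watermark() starts asynchronous
 * writeback once dirty buffers outnumber clean buffers by this factor.
 * DM_BUFIO_LOW_WATERMARK_RATIO - do_global_cleanup() shrinks the global LRU
 * until the total allocation drops 1/16 below the cache size limit.
 */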
38 
39 /*
40  * Check buffer ages in this interval (seconds)
41  */
42 #define DM_BUFIO_WORK_TIMER_SECS	30
43 
44 /*
45  * Free buffers when they are older than this (seconds)
46  */
47 #define DM_BUFIO_DEFAULT_AGE_SECS	300
48 
49 /*
50  * The number of bytes of cached data to keep around.
51  */
52 #define DM_BUFIO_DEFAULT_RETAIN_BYTES   (256 * 1024)
53 
54 /*
55  * Align buffer writes to this boundary.
56  * Tests show that SSDs have the highest IOPS when using 4k writes.
57  */
58 #define DM_BUFIO_WRITE_ALIGN		4096
59 
60 /*
61  * dm_buffer->list_mode
62  */
63 #define LIST_CLEAN	0
64 #define LIST_DIRTY	1
65 #define LIST_SIZE	2
66 
67 /*
68  * Linking of buffers:
69  *	All buffers are linked to buffer_tree with their node field.
70  *
71  *	Clean buffers that are not being written (B_WRITING not set)
72  *	are linked to lru[LIST_CLEAN] with their lru_list field.
73  *
74  *	Dirty and clean buffers that are being written are linked to
75  *	lru[LIST_DIRTY] with their lru_list field. When the write
76  *	finishes, the buffer cannot be relinked immediately (because we
77  *	are in an interrupt context and relinking requires process
78  *	context), so some clean-not-writing buffers can remain on
79  *	lru[LIST_DIRTY] too.  They are moved to lru[LIST_CLEAN] later,
80  *	in process context.
81  */
82 struct dm_bufio_client {
83 	struct mutex lock;
84 
85 	struct list_head lru[LIST_SIZE];
86 	unsigned long n_buffers[LIST_SIZE];
87 
88 	struct block_device *bdev;
89 	unsigned block_size;
90 	s8 sectors_per_block_bits;
91 	void (*alloc_callback)(struct dm_buffer *);
92 	void (*write_callback)(struct dm_buffer *);
93 
94 	struct kmem_cache *slab_buffer;
95 	struct kmem_cache *slab_cache;
96 	struct dm_io_client *dm_io;
97 
98 	struct list_head reserved_buffers;
99 	unsigned need_reserved_buffers;
100 
101 	unsigned minimum_buffers;
102 
103 	struct rb_root buffer_tree;
104 	wait_queue_head_t free_buffer_wait;
105 
106 	sector_t start;
107 
108 	int async_write_error;
109 
110 	struct list_head client_list;
111 	struct shrinker shrinker;
112 };
113 
114 /*
115  * Buffer state bits.
116  */
117 #define B_READING	0
118 #define B_WRITING	1
119 #define B_DIRTY		2
120 
121 /*
122  * Describes how the block was allocated:
123  * kmem_cache_alloc(), __get_free_pages() or vmalloc().
124  * See the comment at alloc_buffer_data.
125  */
126 enum data_mode {
127 	DATA_MODE_SLAB = 0,
128 	DATA_MODE_GET_FREE_PAGES = 1,
129 	DATA_MODE_VMALLOC = 2,
130 	DATA_MODE_LIMIT = 3
131 };
132 
133 struct dm_buffer {
134 	struct rb_node node;
135 	struct list_head lru_list;
136 	struct list_head global_list;
137 	sector_t block;
138 	void *data;
139 	unsigned char data_mode;		/* DATA_MODE_* */
140 	unsigned char list_mode;		/* LIST_* */
141 	blk_status_t read_error;
142 	blk_status_t write_error;
143 	unsigned accessed;
144 	unsigned hold_count;
145 	unsigned long state;
146 	unsigned long last_accessed;
147 	unsigned dirty_start;
148 	unsigned dirty_end;
149 	unsigned write_start;
150 	unsigned write_end;
151 	struct dm_bufio_client *c;
152 	struct list_head write_list;
153 	void (*end_io)(struct dm_buffer *, blk_status_t);
154 #ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
155 #define MAX_STACK 10
156 	unsigned int stack_len;
157 	unsigned long stack_entries[MAX_STACK];
158 #endif
159 };
160 
161 /*----------------------------------------------------------------*/
162 
163 #define dm_bufio_in_request()	(!!current->bio_list)
164 
165 static void dm_bufio_lock(struct dm_bufio_client *c)
166 {
167 	mutex_lock_nested(&c->lock, dm_bufio_in_request());
168 }
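/*
 * dm_bufio_in_request() is passed to mutex_lock_nested() above, so a bufio
 * lock taken from a bio submission path (current->bio_list set) uses lockdep
 * subclass 1, which avoids false-positive lockdep reports when bufio clients
 * are stacked on top of each other.
 */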
169 
170 static int dm_bufio_trylock(struct dm_bufio_client *c)
171 {
172 	return mutex_trylock(&c->lock);
173 }
174 
175 static void dm_bufio_unlock(struct dm_bufio_client *c)
176 {
177 	mutex_unlock(&c->lock);
178 }
179 
180 /*----------------------------------------------------------------*/
181 
182 /*
183  * Default cache size: available memory divided by the ratio.
184  */
185 static unsigned long dm_bufio_default_cache_size;
186 
187 /*
188  * Total cache size set by the user.
189  */
190 static unsigned long dm_bufio_cache_size;
191 
192 /*
193  * A copy of dm_bufio_cache_size because dm_bufio_cache_size can change
194  * at any time.  If the two disagree, the user has changed the cache size.
195  */
196 static unsigned long dm_bufio_cache_size_latch;
197 
198 static DEFINE_SPINLOCK(global_spinlock);
199 
200 static LIST_HEAD(global_queue);
201 
202 static unsigned long global_num = 0;
203 
204 /*
205  * Buffers are freed after this timeout
206  */
207 static unsigned dm_bufio_max_age = DM_BUFIO_DEFAULT_AGE_SECS;
208 static unsigned long dm_bufio_retain_bytes = DM_BUFIO_DEFAULT_RETAIN_BYTES;
209 
210 static unsigned long dm_bufio_peak_allocated;
211 static unsigned long dm_bufio_allocated_kmem_cache;
212 static unsigned long dm_bufio_allocated_get_free_pages;
213 static unsigned long dm_bufio_allocated_vmalloc;
214 static unsigned long dm_bufio_current_allocated;
215 
216 /*----------------------------------------------------------------*/
217 
218 /*
219  * The current number of clients.
220  */
221 static int dm_bufio_client_count;
222 
223 /*
224  * The list of all clients.
225  */
226 static LIST_HEAD(dm_bufio_all_clients);
227 
228 /*
229  * This mutex protects dm_bufio_cache_size_latch and dm_bufio_client_count
230  */
231 static DEFINE_MUTEX(dm_bufio_clients_lock);
232 
233 static struct workqueue_struct *dm_bufio_wq;
234 static struct delayed_work dm_bufio_cleanup_old_work;
235 static struct work_struct dm_bufio_replacement_work;
236 
237 
238 #ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
239 static void buffer_record_stack(struct dm_buffer *b)
240 {
241 	b->stack_len = stack_trace_save(b->stack_entries, MAX_STACK, 2);
242 }
243 #endif
244 
245 /*----------------------------------------------------------------
246  * A red/black tree acts as an index for all the buffers.
247  *--------------------------------------------------------------*/
248 static struct dm_buffer *__find(struct dm_bufio_client *c, sector_t block)
249 {
250 	struct rb_node *n = c->buffer_tree.rb_node;
251 	struct dm_buffer *b;
252 
253 	while (n) {
254 		b = container_of(n, struct dm_buffer, node);
255 
256 		if (b->block == block)
257 			return b;
258 
259 		n = (b->block < block) ? n->rb_left : n->rb_right;
260 	}
261 
262 	return NULL;
263 }
264 
265 static void __insert(struct dm_bufio_client *c, struct dm_buffer *b)
266 {
267 	struct rb_node **new = &c->buffer_tree.rb_node, *parent = NULL;
268 	struct dm_buffer *found;
269 
270 	while (*new) {
271 		found = container_of(*new, struct dm_buffer, node);
272 
273 		if (found->block == b->block) {
274 			BUG_ON(found != b);
275 			return;
276 		}
277 
278 		parent = *new;
279 		new = (found->block < b->block) ?
280 			&((*new)->rb_left) : &((*new)->rb_right);
281 	}
282 
283 	rb_link_node(&b->node, parent, new);
284 	rb_insert_color(&b->node, &c->buffer_tree);
285 }
286 
287 static void __remove(struct dm_bufio_client *c, struct dm_buffer *b)
288 {
289 	rb_erase(&b->node, &c->buffer_tree);
290 }
291 
292 /*----------------------------------------------------------------*/
293 
294 static void adjust_total_allocated(struct dm_buffer *b, bool unlink)
295 {
296 	unsigned char data_mode;
297 	long diff;
298 
299 	static unsigned long * const class_ptr[DATA_MODE_LIMIT] = {
300 		&dm_bufio_allocated_kmem_cache,
301 		&dm_bufio_allocated_get_free_pages,
302 		&dm_bufio_allocated_vmalloc,
303 	};
304 
305 	data_mode = b->data_mode;
306 	diff = (long)b->c->block_size;
307 	if (unlink)
308 		diff = -diff;
309 
310 	spin_lock(&global_spinlock);
311 
312 	*class_ptr[data_mode] += diff;
313 
314 	dm_bufio_current_allocated += diff;
315 
316 	if (dm_bufio_current_allocated > dm_bufio_peak_allocated)
317 		dm_bufio_peak_allocated = dm_bufio_current_allocated;
318 
319 	b->accessed = 1;
320 
321 	if (!unlink) {
322 		list_add(&b->global_list, &global_queue);
323 		global_num++;
324 		if (dm_bufio_current_allocated > dm_bufio_cache_size)
325 			queue_work(dm_bufio_wq, &dm_bufio_replacement_work);
326 	} else {
327 		list_del(&b->global_list);
328 		global_num--;
329 	}
330 
331 	spin_unlock(&global_spinlock);
332 }
333 
334 /*
335  * Change the number of clients and recalculate per-client limit.
336  */
337 static void __cache_size_refresh(void)
338 {
339 	BUG_ON(!mutex_is_locked(&dm_bufio_clients_lock));
340 	BUG_ON(dm_bufio_client_count < 0);
341 
342 	dm_bufio_cache_size_latch = READ_ONCE(dm_bufio_cache_size);
343 
344 	/*
345 	 * Use default if set to 0 and report the actual cache size used.
346 	 */
347 	if (!dm_bufio_cache_size_latch) {
348 		(void)cmpxchg(&dm_bufio_cache_size, 0,
349 			      dm_bufio_default_cache_size);
350 		dm_bufio_cache_size_latch = dm_bufio_default_cache_size;
351 	}
352 }
353 
354 /*
355  * Allocating buffer data.
356  *
357  * Small buffers are allocated with kmem_cache, to use space optimally.
358  *
359  * For large buffers, we choose between get_free_pages and vmalloc.
360  * Each has advantages and disadvantages.
361  *
362  * __get_free_pages can randomly fail if the memory is fragmented.
363  * __vmalloc won't randomly fail, but vmalloc space is limited (it may be
364  * as low as 128M) so using it for caching is not appropriate.
365  *
366  * If the allocation may fail we use __get_free_pages. Memory fragmentation
367  * won't have a fatal effect here, but it just causes flushes of some other
368  * buffers and more I/O will be performed. Don't use __get_free_pages if it
369  * always fails (i.e. order >= MAX_ORDER).
370  *
371  * If the allocation shouldn't fail we use __vmalloc. This is only for the
372  * initial reserve allocation, so there's no risk of wasting all vmalloc
373  * space.
374  */
375 static void *alloc_buffer_data(struct dm_bufio_client *c, gfp_t gfp_mask,
376 			       unsigned char *data_mode)
377 {
378 	if (unlikely(c->slab_cache != NULL)) {
379 		*data_mode = DATA_MODE_SLAB;
380 		return kmem_cache_alloc(c->slab_cache, gfp_mask);
381 	}
382 
383 	if (c->block_size <= KMALLOC_MAX_SIZE &&
384 	    gfp_mask & __GFP_NORETRY) {
385 		*data_mode = DATA_MODE_GET_FREE_PAGES;
386 		return (void *)__get_free_pages(gfp_mask,
387 						c->sectors_per_block_bits - (PAGE_SHIFT - SECTOR_SHIFT));
388 	}
389 
390 	*data_mode = DATA_MODE_VMALLOC;
391 
392 	/*
393 	 * __vmalloc allocates the data pages and auxiliary structures with
394 	 * gfp_flags that were specified, but pagetables are always allocated
395 	 * with GFP_KERNEL, no matter what was specified as gfp_mask.
396 	 *
397 	 * Consequently, we must set per-process flag PF_MEMALLOC_NOIO so that
398 	 * all allocations done by this process (including pagetables) are done
399 	 * as if GFP_NOIO was specified.
400 	 */
401 	if (gfp_mask & __GFP_NORETRY) {
402 		unsigned noio_flag = memalloc_noio_save();
403 		void *ptr = __vmalloc(c->block_size, gfp_mask, PAGE_KERNEL);
404 
405 		memalloc_noio_restore(noio_flag);
406 		return ptr;
407 	}
408 
409 	return __vmalloc(c->block_size, gfp_mask, PAGE_KERNEL);
410 }
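/*
 * Worked example for the __get_free_pages() path above (taken only when the
 * client has no slab cache and __GFP_NORETRY is set): for an 8KiB block on a
 * system with 4KiB pages, sectors_per_block_bits is 4, so the allocation
 * order is 4 - (PAGE_SHIFT - SECTOR_SHIFT) = 1, i.e. two contiguous pages.
 */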
411 
412 /*
413  * Free buffer's data.
414  */
415 static void free_buffer_data(struct dm_bufio_client *c,
416 			     void *data, unsigned char data_mode)
417 {
418 	switch (data_mode) {
419 	case DATA_MODE_SLAB:
420 		kmem_cache_free(c->slab_cache, data);
421 		break;
422 
423 	case DATA_MODE_GET_FREE_PAGES:
424 		free_pages((unsigned long)data,
425 			   c->sectors_per_block_bits - (PAGE_SHIFT - SECTOR_SHIFT));
426 		break;
427 
428 	case DATA_MODE_VMALLOC:
429 		vfree(data);
430 		break;
431 
432 	default:
433 		DMCRIT("dm_bufio_free_buffer_data: bad data mode: %d",
434 		       data_mode);
435 		BUG();
436 	}
437 }
438 
439 /*
440  * Allocate buffer and its data.
441  */
442 static struct dm_buffer *alloc_buffer(struct dm_bufio_client *c, gfp_t gfp_mask)
443 {
444 	struct dm_buffer *b = kmem_cache_alloc(c->slab_buffer, gfp_mask);
445 
446 	if (!b)
447 		return NULL;
448 
449 	b->c = c;
450 
451 	b->data = alloc_buffer_data(c, gfp_mask, &b->data_mode);
452 	if (!b->data) {
453 		kmem_cache_free(c->slab_buffer, b);
454 		return NULL;
455 	}
456 
457 #ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
458 	b->stack_len = 0;
459 #endif
460 	return b;
461 }
462 
463 /*
464  * Free buffer and its data.
465  */
466 static void free_buffer(struct dm_buffer *b)
467 {
468 	struct dm_bufio_client *c = b->c;
469 
470 	free_buffer_data(c, b->data, b->data_mode);
471 	kmem_cache_free(c->slab_buffer, b);
472 }
473 
474 /*
475  * Link buffer to the buffer tree and clean or dirty queue.
476  */
477 static void __link_buffer(struct dm_buffer *b, sector_t block, int dirty)
478 {
479 	struct dm_bufio_client *c = b->c;
480 
481 	c->n_buffers[dirty]++;
482 	b->block = block;
483 	b->list_mode = dirty;
484 	list_add(&b->lru_list, &c->lru[dirty]);
485 	__insert(b->c, b);
486 	b->last_accessed = jiffies;
487 
488 	adjust_total_allocated(b, false);
489 }
490 
491 /*
492  * Unlink buffer from the buffer tree and dirty or clean queue.
493  */
494 static void __unlink_buffer(struct dm_buffer *b)
495 {
496 	struct dm_bufio_client *c = b->c;
497 
498 	BUG_ON(!c->n_buffers[b->list_mode]);
499 
500 	c->n_buffers[b->list_mode]--;
501 	__remove(b->c, b);
502 	list_del(&b->lru_list);
503 
504 	adjust_total_allocated(b, true);
505 }
506 
507 /*
508  * Place the buffer at the head of the dirty or clean LRU queue.
509  */
510 static void __relink_lru(struct dm_buffer *b, int dirty)
511 {
512 	struct dm_bufio_client *c = b->c;
513 
514 	b->accessed = 1;
515 
516 	BUG_ON(!c->n_buffers[b->list_mode]);
517 
518 	c->n_buffers[b->list_mode]--;
519 	c->n_buffers[dirty]++;
520 	b->list_mode = dirty;
521 	list_move(&b->lru_list, &c->lru[dirty]);
522 	b->last_accessed = jiffies;
523 }
524 
525 /*----------------------------------------------------------------
526  * Submit I/O on the buffer.
527  *
528  * The bio interface is faster but it has some problems:
529  *	the vector list is limited (increasing this limit increases
530  *	memory-consumption per buffer, so it is not viable);
531  *
532  *	the memory must be direct-mapped, not vmalloced.
533  *
534  * If the buffer is not vmalloced, try using the bio interface.
535  *
536  * If the buffer is vmalloced, or if the bio cannot be allocated or
537  * cannot be filled with enough page vectors, fall back to the dm-io
538  * layer.  The dm-io layer splits the I/O into multiple requests,
539  * avoiding the above shortcomings.
540  *
541  *--------------------------------------------------------------*/
542 
543 /*
544  * dm-io completion routine. It just calls b->end_io, pretending that
545  * the request was handled directly with the bio interface.
546  */
547 static void dmio_complete(unsigned long error, void *context)
548 {
549 	struct dm_buffer *b = context;
550 
551 	b->end_io(b, unlikely(error != 0) ? BLK_STS_IOERR : 0);
552 }
553 
554 static void use_dmio(struct dm_buffer *b, int rw, sector_t sector,
555 		     unsigned n_sectors, unsigned offset)
556 {
557 	int r;
558 	struct dm_io_request io_req = {
559 		.bi_op = rw,
560 		.bi_op_flags = 0,
561 		.notify.fn = dmio_complete,
562 		.notify.context = b,
563 		.client = b->c->dm_io,
564 	};
565 	struct dm_io_region region = {
566 		.bdev = b->c->bdev,
567 		.sector = sector,
568 		.count = n_sectors,
569 	};
570 
571 	if (b->data_mode != DATA_MODE_VMALLOC) {
572 		io_req.mem.type = DM_IO_KMEM;
573 		io_req.mem.ptr.addr = (char *)b->data + offset;
574 	} else {
575 		io_req.mem.type = DM_IO_VMA;
576 		io_req.mem.ptr.vma = (char *)b->data + offset;
577 	}
578 
579 	r = dm_io(&io_req, 1, &region, NULL);
580 	if (unlikely(r))
581 		b->end_io(b, errno_to_blk_status(r));
582 }
583 
584 static void bio_complete(struct bio *bio)
585 {
586 	struct dm_buffer *b = bio->bi_private;
587 	blk_status_t status = bio->bi_status;
588 	bio_put(bio);
589 	b->end_io(b, status);
590 }
591 
592 static void use_bio(struct dm_buffer *b, int rw, sector_t sector,
593 		    unsigned n_sectors, unsigned offset)
594 {
595 	struct bio *bio;
596 	char *ptr;
597 	unsigned vec_size, len;
598 
599 	vec_size = b->c->block_size >> PAGE_SHIFT;
600 	if (unlikely(b->c->sectors_per_block_bits < PAGE_SHIFT - SECTOR_SHIFT))
601 		vec_size += 2;
602 
603 	bio = bio_kmalloc(GFP_NOWAIT | __GFP_NORETRY | __GFP_NOWARN, vec_size);
604 	if (!bio) {
605 dmio:
606 		use_dmio(b, rw, sector, n_sectors, offset);
607 		return;
608 	}
609 
610 	bio->bi_iter.bi_sector = sector;
611 	bio_set_dev(bio, b->c->bdev);
612 	bio_set_op_attrs(bio, rw, 0);
613 	bio->bi_end_io = bio_complete;
614 	bio->bi_private = b;
615 
616 	ptr = (char *)b->data + offset;
617 	len = n_sectors << SECTOR_SHIFT;
618 
619 	do {
620 		unsigned this_step = min((unsigned)(PAGE_SIZE - offset_in_page(ptr)), len);
621 		if (!bio_add_page(bio, virt_to_page(ptr), this_step,
622 				  offset_in_page(ptr))) {
623 			bio_put(bio);
624 			goto dmio;
625 		}
626 
627 		len -= this_step;
628 		ptr += this_step;
629 	} while (len > 0);
630 
631 	submit_bio(bio);
632 }
633 
634 static inline sector_t block_to_sector(struct dm_bufio_client *c, sector_t block)
635 {
636 	sector_t sector;
637 
638 	if (likely(c->sectors_per_block_bits >= 0))
639 		sector = block << c->sectors_per_block_bits;
640 	else
641 		sector = block * (c->block_size >> SECTOR_SHIFT);
642 	sector += c->start;
643 
644 	return sector;
645 }
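/*
 * Worked example: with a 4KiB block size, sectors_per_block_bits is 3, so
 * block 10 maps to sector (10 << 3) + c->start = 80 + c->start.
 * Non-power-of-two block sizes take the multiplication path instead.
 */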
646 
647 static void submit_io(struct dm_buffer *b, int rw, void (*end_io)(struct dm_buffer *, blk_status_t))
648 {
649 	unsigned n_sectors;
650 	sector_t sector;
651 	unsigned offset, end;
652 
653 	b->end_io = end_io;
654 
655 	sector = block_to_sector(b->c, b->block);
656 
657 	if (rw != REQ_OP_WRITE) {
658 		n_sectors = b->c->block_size >> SECTOR_SHIFT;
659 		offset = 0;
660 	} else {
661 		if (b->c->write_callback)
662 			b->c->write_callback(b);
663 		offset = b->write_start;
664 		end = b->write_end;
665 		offset &= -DM_BUFIO_WRITE_ALIGN;
666 		end += DM_BUFIO_WRITE_ALIGN - 1;
667 		end &= -DM_BUFIO_WRITE_ALIGN;
668 		if (unlikely(end > b->c->block_size))
669 			end = b->c->block_size;
670 
671 		sector += offset >> SECTOR_SHIFT;
672 		n_sectors = (end - offset) >> SECTOR_SHIFT;
673 	}
674 
675 	if (b->data_mode != DATA_MODE_VMALLOC)
676 		use_bio(b, rw, sector, n_sectors, offset);
677 	else
678 		use_dmio(b, rw, sector, n_sectors, offset);
679 }
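/*
 * Worked example of the write alignment above: a dirty range of bytes
 * 100..5000 in an 8KiB block is rounded out to 0..8192 (DM_BUFIO_WRITE_ALIGN
 * is 4096), so sixteen 512-byte sectors are written, starting at the block's
 * first sector.
 */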
680 
681 /*----------------------------------------------------------------
682  * Writing dirty buffers
683  *--------------------------------------------------------------*/
684 
685 /*
686  * The endio routine for write.
687  *
688  * Set the error, clear the B_WRITING bit and wake anyone who was waiting on
689  * it.
690  */
691 static void write_endio(struct dm_buffer *b, blk_status_t status)
692 {
693 	b->write_error = status;
694 	if (unlikely(status)) {
695 		struct dm_bufio_client *c = b->c;
696 
697 		(void)cmpxchg(&c->async_write_error, 0,
698 				blk_status_to_errno(status));
699 	}
700 
701 	BUG_ON(!test_bit(B_WRITING, &b->state));
702 
703 	smp_mb__before_atomic();
704 	clear_bit(B_WRITING, &b->state);
705 	smp_mb__after_atomic();
706 
707 	wake_up_bit(&b->state, B_WRITING);
708 }
709 
710 /*
711  * Initiate a write on a dirty buffer, but don't wait for it.
712  *
713  * - If the buffer is not dirty, exit.
714  * - If there is a previous write going on, wait for it to finish (we can't
715  *   have two writes on the same buffer simultaneously).
716  * - Submit our write and don't wait on it. We set B_WRITING indicating
717  *   that there is a write in progress.
718  */
719 static void __write_dirty_buffer(struct dm_buffer *b,
720 				 struct list_head *write_list)
721 {
722 	if (!test_bit(B_DIRTY, &b->state))
723 		return;
724 
725 	clear_bit(B_DIRTY, &b->state);
726 	wait_on_bit_lock_io(&b->state, B_WRITING, TASK_UNINTERRUPTIBLE);
727 
728 	b->write_start = b->dirty_start;
729 	b->write_end = b->dirty_end;
730 
731 	if (!write_list)
732 		submit_io(b, REQ_OP_WRITE, write_endio);
733 	else
734 		list_add_tail(&b->write_list, write_list);
735 }
736 
737 static void __flush_write_list(struct list_head *write_list)
738 {
739 	struct blk_plug plug;
740 	blk_start_plug(&plug);
741 	while (!list_empty(write_list)) {
742 		struct dm_buffer *b =
743 			list_entry(write_list->next, struct dm_buffer, write_list);
744 		list_del(&b->write_list);
745 		submit_io(b, REQ_OP_WRITE, write_endio);
746 		cond_resched();
747 	}
748 	blk_finish_plug(&plug);
749 }
750 
751 /*
752  * Wait until any activity on the buffer finishes.  Possibly write the
753  * buffer if it is dirty.  When this function finishes, there is no I/O
754  * running on the buffer and the buffer is not dirty.
755  */
756 static void __make_buffer_clean(struct dm_buffer *b)
757 {
758 	BUG_ON(b->hold_count);
759 
760 	if (!b->state)	/* fast case */
761 		return;
762 
763 	wait_on_bit_io(&b->state, B_READING, TASK_UNINTERRUPTIBLE);
764 	__write_dirty_buffer(b, NULL);
765 	wait_on_bit_io(&b->state, B_WRITING, TASK_UNINTERRUPTIBLE);
766 }
767 
768 /*
769  * Find some buffer that is not held by anybody, clean it, unlink it and
770  * return it.
771  */
772 static struct dm_buffer *__get_unclaimed_buffer(struct dm_bufio_client *c)
773 {
774 	struct dm_buffer *b;
775 
776 	list_for_each_entry_reverse(b, &c->lru[LIST_CLEAN], lru_list) {
777 		BUG_ON(test_bit(B_WRITING, &b->state));
778 		BUG_ON(test_bit(B_DIRTY, &b->state));
779 
780 		if (!b->hold_count) {
781 			__make_buffer_clean(b);
782 			__unlink_buffer(b);
783 			return b;
784 		}
785 		cond_resched();
786 	}
787 
788 	list_for_each_entry_reverse(b, &c->lru[LIST_DIRTY], lru_list) {
789 		BUG_ON(test_bit(B_READING, &b->state));
790 
791 		if (!b->hold_count) {
792 			__make_buffer_clean(b);
793 			__unlink_buffer(b);
794 			return b;
795 		}
796 		cond_resched();
797 	}
798 
799 	return NULL;
800 }
801 
802 /*
803  * Wait until some other thread frees a buffer or releases its hold count
804  * on one.
805  *
806  * This function is entered with c->lock held, drops it and regains it
807  * before exiting.
808  */
809 static void __wait_for_free_buffer(struct dm_bufio_client *c)
810 {
811 	DECLARE_WAITQUEUE(wait, current);
812 
813 	add_wait_queue(&c->free_buffer_wait, &wait);
814 	set_current_state(TASK_UNINTERRUPTIBLE);
815 	dm_bufio_unlock(c);
816 
817 	io_schedule();
818 
819 	remove_wait_queue(&c->free_buffer_wait, &wait);
820 
821 	dm_bufio_lock(c);
822 }
823 
824 enum new_flag {
825 	NF_FRESH = 0,
826 	NF_READ = 1,
827 	NF_GET = 2,
828 	NF_PREFETCH = 3
829 };
830 
831 /*
832  * Allocate a new buffer. If the allocation is not possible, wait until
833  * some other thread frees a buffer.
834  *
835  * May drop the lock and regain it.
836  */
837 static struct dm_buffer *__alloc_buffer_wait_no_callback(struct dm_bufio_client *c, enum new_flag nf)
838 {
839 	struct dm_buffer *b;
840 	bool tried_noio_alloc = false;
841 
842 	/*
843 	 * dm-bufio is resistant to allocation failures (it just keeps
844 	 * one buffer reserved in case all the allocations fail).
845 	 * So set flags to not try too hard:
846 	 *	GFP_NOWAIT: don't wait; if we need to sleep we'll release our
847 	 *		    mutex and wait ourselves.
848 	 *	__GFP_NORETRY: don't retry and rather return failure
849 	 *	__GFP_NOMEMALLOC: don't use emergency reserves
850 	 *	__GFP_NOWARN: don't print a warning in case of failure
851 	 *
852 	 * For debugging, if we set the cache size to 1, no new buffers will
853 	 * be allocated.
854 	 */
855 	while (1) {
856 		if (dm_bufio_cache_size_latch != 1) {
857 			b = alloc_buffer(c, GFP_NOWAIT | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
858 			if (b)
859 				return b;
860 		}
861 
862 		if (nf == NF_PREFETCH)
863 			return NULL;
864 
865 		if (dm_bufio_cache_size_latch != 1 && !tried_noio_alloc) {
866 			dm_bufio_unlock(c);
867 			b = alloc_buffer(c, GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
868 			dm_bufio_lock(c);
869 			if (b)
870 				return b;
871 			tried_noio_alloc = true;
872 		}
873 
874 		if (!list_empty(&c->reserved_buffers)) {
875 			b = list_entry(c->reserved_buffers.next,
876 				       struct dm_buffer, lru_list);
877 			list_del(&b->lru_list);
878 			c->need_reserved_buffers++;
879 
880 			return b;
881 		}
882 
883 		b = __get_unclaimed_buffer(c);
884 		if (b)
885 			return b;
886 
887 		__wait_for_free_buffer(c);
888 	}
889 }
890 
891 static struct dm_buffer *__alloc_buffer_wait(struct dm_bufio_client *c, enum new_flag nf)
892 {
893 	struct dm_buffer *b = __alloc_buffer_wait_no_callback(c, nf);
894 
895 	if (!b)
896 		return NULL;
897 
898 	if (c->alloc_callback)
899 		c->alloc_callback(b);
900 
901 	return b;
902 }
903 
904 /*
905  * Free a buffer and wake other threads waiting for free buffers.
906  */
907 static void __free_buffer_wake(struct dm_buffer *b)
908 {
909 	struct dm_bufio_client *c = b->c;
910 
911 	if (!c->need_reserved_buffers)
912 		free_buffer(b);
913 	else {
914 		list_add(&b->lru_list, &c->reserved_buffers);
915 		c->need_reserved_buffers--;
916 	}
917 
918 	wake_up(&c->free_buffer_wait);
919 }
920 
921 static void __write_dirty_buffers_async(struct dm_bufio_client *c, int no_wait,
922 					struct list_head *write_list)
923 {
924 	struct dm_buffer *b, *tmp;
925 
926 	list_for_each_entry_safe_reverse(b, tmp, &c->lru[LIST_DIRTY], lru_list) {
927 		BUG_ON(test_bit(B_READING, &b->state));
928 
929 		if (!test_bit(B_DIRTY, &b->state) &&
930 		    !test_bit(B_WRITING, &b->state)) {
931 			__relink_lru(b, LIST_CLEAN);
932 			continue;
933 		}
934 
935 		if (no_wait && test_bit(B_WRITING, &b->state))
936 			return;
937 
938 		__write_dirty_buffer(b, write_list);
939 		cond_resched();
940 	}
941 }
942 
943 /*
944  * Check if we're over the writeback watermark.
945  * If the number of dirty buffers exceeds DM_BUFIO_WRITEBACK_RATIO times
946  * the number of clean buffers, start writing them back asynchronously.
947  */
948 static void __check_watermark(struct dm_bufio_client *c,
949 			      struct list_head *write_list)
950 {
951 	if (c->n_buffers[LIST_DIRTY] > c->n_buffers[LIST_CLEAN] * DM_BUFIO_WRITEBACK_RATIO)
952 		__write_dirty_buffers_async(c, 1, write_list);
953 }
954 
955 /*----------------------------------------------------------------
956  * Getting a buffer
957  *--------------------------------------------------------------*/
958 
959 static struct dm_buffer *__bufio_new(struct dm_bufio_client *c, sector_t block,
960 				     enum new_flag nf, int *need_submit,
961 				     struct list_head *write_list)
962 {
963 	struct dm_buffer *b, *new_b = NULL;
964 
965 	*need_submit = 0;
966 
967 	b = __find(c, block);
968 	if (b)
969 		goto found_buffer;
970 
971 	if (nf == NF_GET)
972 		return NULL;
973 
974 	new_b = __alloc_buffer_wait(c, nf);
975 	if (!new_b)
976 		return NULL;
977 
978 	/*
979 	 * We've had a period where the mutex was unlocked, so need to
980 	 * recheck the buffer tree.
981 	 */
982 	b = __find(c, block);
983 	if (b) {
984 		__free_buffer_wake(new_b);
985 		goto found_buffer;
986 	}
987 
988 	__check_watermark(c, write_list);
989 
990 	b = new_b;
991 	b->hold_count = 1;
992 	b->read_error = 0;
993 	b->write_error = 0;
994 	__link_buffer(b, block, LIST_CLEAN);
995 
996 	if (nf == NF_FRESH) {
997 		b->state = 0;
998 		return b;
999 	}
1000 
1001 	b->state = 1 << B_READING;
1002 	*need_submit = 1;
1003 
1004 	return b;
1005 
1006 found_buffer:
1007 	if (nf == NF_PREFETCH)
1008 		return NULL;
1009 	/*
1010 	 * Note: it is essential that we don't wait for the buffer to be
1011 	 * read if dm_bufio_get function is used. Both dm_bufio_get and
1012 	 * dm_bufio_prefetch can be used in the driver request routine.
1013 	 * If the user called both dm_bufio_prefetch and dm_bufio_get on
1014 	 * the same buffer, it would deadlock if we waited.
1015 	 */
1016 	if (nf == NF_GET && unlikely(test_bit(B_READING, &b->state)))
1017 		return NULL;
1018 
1019 	b->hold_count++;
1020 	__relink_lru(b, test_bit(B_DIRTY, &b->state) ||
1021 		     test_bit(B_WRITING, &b->state));
1022 	return b;
1023 }
1024 
1025 /*
1026  * The endio routine for reading: set the error, clear the bit and wake up
1027  * anyone waiting on the buffer.
1028  */
1029 static void read_endio(struct dm_buffer *b, blk_status_t status)
1030 {
1031 	b->read_error = status;
1032 
1033 	BUG_ON(!test_bit(B_READING, &b->state));
1034 
1035 	smp_mb__before_atomic();
1036 	clear_bit(B_READING, &b->state);
1037 	smp_mb__after_atomic();
1038 
1039 	wake_up_bit(&b->state, B_READING);
1040 }
1041 
1042 /*
1043  * A common routine for dm_bufio_new and dm_bufio_read.  Operation of these
1044  * functions is similar except that dm_bufio_new doesn't read the
1045  * buffer from the disk (assuming that the caller overwrites all the data
1046  * and uses dm_bufio_mark_buffer_dirty to write new data back).
1047  */
1048 static void *new_read(struct dm_bufio_client *c, sector_t block,
1049 		      enum new_flag nf, struct dm_buffer **bp)
1050 {
1051 	int need_submit;
1052 	struct dm_buffer *b;
1053 
1054 	LIST_HEAD(write_list);
1055 
1056 	dm_bufio_lock(c);
1057 	b = __bufio_new(c, block, nf, &need_submit, &write_list);
1058 #ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
1059 	if (b && b->hold_count == 1)
1060 		buffer_record_stack(b);
1061 #endif
1062 	dm_bufio_unlock(c);
1063 
1064 	__flush_write_list(&write_list);
1065 
1066 	if (!b)
1067 		return NULL;
1068 
1069 	if (need_submit)
1070 		submit_io(b, REQ_OP_READ, read_endio);
1071 
1072 	wait_on_bit_io(&b->state, B_READING, TASK_UNINTERRUPTIBLE);
1073 
1074 	if (b->read_error) {
1075 		int error = blk_status_to_errno(b->read_error);
1076 
1077 		dm_bufio_release(b);
1078 
1079 		return ERR_PTR(error);
1080 	}
1081 
1082 	*bp = b;
1083 
1084 	return b->data;
1085 }
1086 
1087 void *dm_bufio_get(struct dm_bufio_client *c, sector_t block,
1088 		   struct dm_buffer **bp)
1089 {
1090 	return new_read(c, block, NF_GET, bp);
1091 }
1092 EXPORT_SYMBOL_GPL(dm_bufio_get);
1093 
1094 void *dm_bufio_read(struct dm_bufio_client *c, sector_t block,
1095 		    struct dm_buffer **bp)
1096 {
1097 	BUG_ON(dm_bufio_in_request());
1098 
1099 	return new_read(c, block, NF_READ, bp);
1100 }
1101 EXPORT_SYMBOL_GPL(dm_bufio_read);
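/*
 * Illustrative read-modify-write sketch (not taken from this file's callers;
 * the client "c" and "block" are hypothetical):
 *
 *	struct dm_buffer *b;
 *	void *data = dm_bufio_read(c, block, &b);
 *	if (IS_ERR(data))
 *		return PTR_ERR(data);
 *	memset(data, 0, dm_bufio_get_block_size(c));
 *	dm_bufio_mark_buffer_dirty(b);
 *	dm_bufio_release(b);
 *	return dm_bufio_write_dirty_buffers(c);
 */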
1102 
1103 void *dm_bufio_new(struct dm_bufio_client *c, sector_t block,
1104 		   struct dm_buffer **bp)
1105 {
1106 	BUG_ON(dm_bufio_in_request());
1107 
1108 	return new_read(c, block, NF_FRESH, bp);
1109 }
1110 EXPORT_SYMBOL_GPL(dm_bufio_new);
1111 
1112 void dm_bufio_prefetch(struct dm_bufio_client *c,
1113 		       sector_t block, unsigned n_blocks)
1114 {
1115 	struct blk_plug plug;
1116 
1117 	LIST_HEAD(write_list);
1118 
1119 	BUG_ON(dm_bufio_in_request());
1120 
1121 	blk_start_plug(&plug);
1122 	dm_bufio_lock(c);
1123 
1124 	for (; n_blocks--; block++) {
1125 		int need_submit;
1126 		struct dm_buffer *b;
1127 		b = __bufio_new(c, block, NF_PREFETCH, &need_submit,
1128 				&write_list);
1129 		if (unlikely(!list_empty(&write_list))) {
1130 			dm_bufio_unlock(c);
1131 			blk_finish_plug(&plug);
1132 			__flush_write_list(&write_list);
1133 			blk_start_plug(&plug);
1134 			dm_bufio_lock(c);
1135 		}
1136 		if (unlikely(b != NULL)) {
1137 			dm_bufio_unlock(c);
1138 
1139 			if (need_submit)
1140 				submit_io(b, REQ_OP_READ, read_endio);
1141 			dm_bufio_release(b);
1142 
1143 			cond_resched();
1144 
1145 			if (!n_blocks)
1146 				goto flush_plug;
1147 			dm_bufio_lock(c);
1148 		}
1149 	}
1150 
1151 	dm_bufio_unlock(c);
1152 
1153 flush_plug:
1154 	blk_finish_plug(&plug);
1155 }
1156 EXPORT_SYMBOL_GPL(dm_bufio_prefetch);
1157 
1158 void dm_bufio_release(struct dm_buffer *b)
1159 {
1160 	struct dm_bufio_client *c = b->c;
1161 
1162 	dm_bufio_lock(c);
1163 
1164 	BUG_ON(!b->hold_count);
1165 
1166 	b->hold_count--;
1167 	if (!b->hold_count) {
1168 		wake_up(&c->free_buffer_wait);
1169 
1170 		/*
1171 		 * If there were errors on the buffer, and the buffer is not
1172 		 * to be written, free the buffer. There is no point in caching
1173 		 * an invalid buffer.
1174 		 */
1175 		if ((b->read_error || b->write_error) &&
1176 		    !test_bit(B_READING, &b->state) &&
1177 		    !test_bit(B_WRITING, &b->state) &&
1178 		    !test_bit(B_DIRTY, &b->state)) {
1179 			__unlink_buffer(b);
1180 			__free_buffer_wake(b);
1181 		}
1182 	}
1183 
1184 	dm_bufio_unlock(c);
1185 }
1186 EXPORT_SYMBOL_GPL(dm_bufio_release);
1187 
1188 void dm_bufio_mark_partial_buffer_dirty(struct dm_buffer *b,
1189 					unsigned start, unsigned end)
1190 {
1191 	struct dm_bufio_client *c = b->c;
1192 
1193 	BUG_ON(start >= end);
1194 	BUG_ON(end > b->c->block_size);
1195 
1196 	dm_bufio_lock(c);
1197 
1198 	BUG_ON(test_bit(B_READING, &b->state));
1199 
1200 	if (!test_and_set_bit(B_DIRTY, &b->state)) {
1201 		b->dirty_start = start;
1202 		b->dirty_end = end;
1203 		__relink_lru(b, LIST_DIRTY);
1204 	} else {
1205 		if (start < b->dirty_start)
1206 			b->dirty_start = start;
1207 		if (end > b->dirty_end)
1208 			b->dirty_end = end;
1209 	}
1210 
1211 	dm_bufio_unlock(c);
1212 }
1213 EXPORT_SYMBOL_GPL(dm_bufio_mark_partial_buffer_dirty);
1214 
1215 void dm_bufio_mark_buffer_dirty(struct dm_buffer *b)
1216 {
1217 	dm_bufio_mark_partial_buffer_dirty(b, 0, b->c->block_size);
1218 }
1219 EXPORT_SYMBOL_GPL(dm_bufio_mark_buffer_dirty);
1220 
1221 void dm_bufio_write_dirty_buffers_async(struct dm_bufio_client *c)
1222 {
1223 	LIST_HEAD(write_list);
1224 
1225 	BUG_ON(dm_bufio_in_request());
1226 
1227 	dm_bufio_lock(c);
1228 	__write_dirty_buffers_async(c, 0, &write_list);
1229 	dm_bufio_unlock(c);
1230 	__flush_write_list(&write_list);
1231 }
1232 EXPORT_SYMBOL_GPL(dm_bufio_write_dirty_buffers_async);
1233 
1234 /*
1235  * For performance, it is essential that the buffers are written asynchronously
1236  * and simultaneously (so that the block layer can merge the writes) and then
1237  * waited upon.
1238  *
1239  * Finally, we flush hardware disk cache.
1240  */
1241 int dm_bufio_write_dirty_buffers(struct dm_bufio_client *c)
1242 {
1243 	int a, f;
1244 	unsigned long buffers_processed = 0;
1245 	struct dm_buffer *b, *tmp;
1246 
1247 	LIST_HEAD(write_list);
1248 
1249 	dm_bufio_lock(c);
1250 	__write_dirty_buffers_async(c, 0, &write_list);
1251 	dm_bufio_unlock(c);
1252 	__flush_write_list(&write_list);
1253 	dm_bufio_lock(c);
1254 
1255 again:
1256 	list_for_each_entry_safe_reverse(b, tmp, &c->lru[LIST_DIRTY], lru_list) {
1257 		int dropped_lock = 0;
1258 
1259 		if (buffers_processed < c->n_buffers[LIST_DIRTY])
1260 			buffers_processed++;
1261 
1262 		BUG_ON(test_bit(B_READING, &b->state));
1263 
1264 		if (test_bit(B_WRITING, &b->state)) {
1265 			if (buffers_processed < c->n_buffers[LIST_DIRTY]) {
1266 				dropped_lock = 1;
1267 				b->hold_count++;
1268 				dm_bufio_unlock(c);
1269 				wait_on_bit_io(&b->state, B_WRITING,
1270 					       TASK_UNINTERRUPTIBLE);
1271 				dm_bufio_lock(c);
1272 				b->hold_count--;
1273 			} else
1274 				wait_on_bit_io(&b->state, B_WRITING,
1275 					       TASK_UNINTERRUPTIBLE);
1276 		}
1277 
1278 		if (!test_bit(B_DIRTY, &b->state) &&
1279 		    !test_bit(B_WRITING, &b->state))
1280 			__relink_lru(b, LIST_CLEAN);
1281 
1282 		cond_resched();
1283 
1284 		/*
1285 		 * If we dropped the lock, the list is no longer consistent,
1286 		 * so we must restart the search.
1287 		 *
1288 		 * In the most common case, the buffer just processed is
1289 		 * relinked to the clean list, so we won't loop scanning the
1290 		 * same buffer again and again.
1291 		 *
1292 		 * This may livelock if there is another thread simultaneously
1293 		 * dirtying buffers, so we count the number of buffers walked
1294 		 * and if it exceeds the total number of buffers, it means that
1295 		 * someone is dirtying buffers concurrently with us.  In that
1296 		 * case, stop dropping the lock and just wait while holding it.
1297 		 */
1298 		if (dropped_lock)
1299 			goto again;
1300 	}
1301 	wake_up(&c->free_buffer_wait);
1302 	dm_bufio_unlock(c);
1303 
1304 	a = xchg(&c->async_write_error, 0);
1305 	f = dm_bufio_issue_flush(c);
1306 	if (a)
1307 		return a;
1308 
1309 	return f;
1310 }
1311 EXPORT_SYMBOL_GPL(dm_bufio_write_dirty_buffers);
1312 
1313 /*
1314  * Use dm-io to send an empty flush request (REQ_PREFLUSH) to the device.
1315  */
1316 int dm_bufio_issue_flush(struct dm_bufio_client *c)
1317 {
1318 	struct dm_io_request io_req = {
1319 		.bi_op = REQ_OP_WRITE,
1320 		.bi_op_flags = REQ_PREFLUSH | REQ_SYNC,
1321 		.mem.type = DM_IO_KMEM,
1322 		.mem.ptr.addr = NULL,
1323 		.client = c->dm_io,
1324 	};
1325 	struct dm_io_region io_reg = {
1326 		.bdev = c->bdev,
1327 		.sector = 0,
1328 		.count = 0,
1329 	};
1330 
1331 	BUG_ON(dm_bufio_in_request());
1332 
1333 	return dm_io(&io_req, 1, &io_reg, NULL);
1334 }
1335 EXPORT_SYMBOL_GPL(dm_bufio_issue_flush);
1336 
1337 /*
1338  * Use dm-io to send a discard request covering the given block range.
1339  */
1340 int dm_bufio_issue_discard(struct dm_bufio_client *c, sector_t block, sector_t count)
1341 {
1342 	struct dm_io_request io_req = {
1343 		.bi_op = REQ_OP_DISCARD,
1344 		.bi_op_flags = REQ_SYNC,
1345 		.mem.type = DM_IO_KMEM,
1346 		.mem.ptr.addr = NULL,
1347 		.client = c->dm_io,
1348 	};
1349 	struct dm_io_region io_reg = {
1350 		.bdev = c->bdev,
1351 		.sector = block_to_sector(c, block),
1352 		.count = block_to_sector(c, count),
1353 	};
1354 
1355 	BUG_ON(dm_bufio_in_request());
1356 
1357 	return dm_io(&io_req, 1, &io_reg, NULL);
1358 }
1359 EXPORT_SYMBOL_GPL(dm_bufio_issue_discard);
1360 
1361 /*
1362  * Free the specified range of buffers. If a buffer is held by another user,
1363  * it is not freed. If a buffer is dirty, it is discarded without writeback.
1364  * Finally, send the discard request to the device.
1365  */
1366 int dm_bufio_discard_buffers(struct dm_bufio_client *c, sector_t block, sector_t count)
1367 {
1368 	sector_t i;
1369 
1370 	for (i = block; i < block + count; i++) {
1371 		struct dm_buffer *b;
1372 		dm_bufio_lock(c);
1373 		b = __find(c, i);
1374 		if (b && likely(!b->hold_count)) {
1375 			wait_on_bit_io(&b->state, B_READING, TASK_UNINTERRUPTIBLE);
1376 			wait_on_bit_io(&b->state, B_WRITING, TASK_UNINTERRUPTIBLE);
1377 			__unlink_buffer(b);
1378 			__free_buffer_wake(b);
1379 		}
1380 		dm_bufio_unlock(c);
1381 	}
1382 
1383 	return dm_bufio_issue_discard(c, block, count);
1384 }
1385 EXPORT_SYMBOL_GPL(dm_bufio_discard_buffers);
1386 
1387 /*
1388  * We first delete any other buffer that may be at that new location.
1389  *
1390  * Then, we write the buffer to the original location if it was dirty.
1391  *
1392  * Then, if we are the only one who is holding the buffer, relink the buffer
1393  * in the buffer tree for the new location.
1394  *
1395  * If there was someone else holding the buffer, we write it to the new
1396  * location but not relink it, because that other user needs to have the buffer
1397  * at the same place.
1398  */
1399 void dm_bufio_release_move(struct dm_buffer *b, sector_t new_block)
1400 {
1401 	struct dm_bufio_client *c = b->c;
1402 	struct dm_buffer *new;
1403 
1404 	BUG_ON(dm_bufio_in_request());
1405 
1406 	dm_bufio_lock(c);
1407 
1408 retry:
1409 	new = __find(c, new_block);
1410 	if (new) {
1411 		if (new->hold_count) {
1412 			__wait_for_free_buffer(c);
1413 			goto retry;
1414 		}
1415 
1416 		/*
1417 		 * FIXME: Is there any point waiting for a write that's going
1418 		 * to be overwritten in a bit?
1419 		 */
1420 		__make_buffer_clean(new);
1421 		__unlink_buffer(new);
1422 		__free_buffer_wake(new);
1423 	}
1424 
1425 	BUG_ON(!b->hold_count);
1426 	BUG_ON(test_bit(B_READING, &b->state));
1427 
1428 	__write_dirty_buffer(b, NULL);
1429 	if (b->hold_count == 1) {
1430 		wait_on_bit_io(&b->state, B_WRITING,
1431 			       TASK_UNINTERRUPTIBLE);
1432 		set_bit(B_DIRTY, &b->state);
1433 		b->dirty_start = 0;
1434 		b->dirty_end = c->block_size;
1435 		__unlink_buffer(b);
1436 		__link_buffer(b, new_block, LIST_DIRTY);
1437 	} else {
1438 		sector_t old_block;
1439 		wait_on_bit_lock_io(&b->state, B_WRITING,
1440 				    TASK_UNINTERRUPTIBLE);
1441 		/*
1442 		 * Relink buffer to "new_block" so that write_callback
1443 		 * sees "new_block" as a block number.
1444 		 * After the write, link the buffer back to old_block.
1445 		 * All this must be done in bufio lock, so that block number
1446 		 * change isn't visible to other threads.
1447 		 */
1448 		old_block = b->block;
1449 		__unlink_buffer(b);
1450 		__link_buffer(b, new_block, b->list_mode);
1451 		submit_io(b, REQ_OP_WRITE, write_endio);
1452 		wait_on_bit_io(&b->state, B_WRITING,
1453 			       TASK_UNINTERRUPTIBLE);
1454 		__unlink_buffer(b);
1455 		__link_buffer(b, old_block, b->list_mode);
1456 	}
1457 
1458 	dm_bufio_unlock(c);
1459 	dm_bufio_release(b);
1460 }
1461 EXPORT_SYMBOL_GPL(dm_bufio_release_move);
1462 
1463 /*
1464  * Free the given buffer.
1465  *
1466  * This is just a hint: if the buffer is in use or dirty, this function
1467  * does nothing.
1468  */
1469 void dm_bufio_forget(struct dm_bufio_client *c, sector_t block)
1470 {
1471 	struct dm_buffer *b;
1472 
1473 	dm_bufio_lock(c);
1474 
1475 	b = __find(c, block);
1476 	if (b && likely(!b->hold_count) && likely(!b->state)) {
1477 		__unlink_buffer(b);
1478 		__free_buffer_wake(b);
1479 	}
1480 
1481 	dm_bufio_unlock(c);
1482 }
1483 EXPORT_SYMBOL_GPL(dm_bufio_forget);
1484 
1485 void dm_bufio_set_minimum_buffers(struct dm_bufio_client *c, unsigned n)
1486 {
1487 	c->minimum_buffers = n;
1488 }
1489 EXPORT_SYMBOL_GPL(dm_bufio_set_minimum_buffers);
1490 
1491 unsigned dm_bufio_get_block_size(struct dm_bufio_client *c)
1492 {
1493 	return c->block_size;
1494 }
1495 EXPORT_SYMBOL_GPL(dm_bufio_get_block_size);
1496 
1497 sector_t dm_bufio_get_device_size(struct dm_bufio_client *c)
1498 {
1499 	sector_t s = i_size_read(c->bdev->bd_inode) >> SECTOR_SHIFT;
1500 	if (likely(c->sectors_per_block_bits >= 0))
1501 		s >>= c->sectors_per_block_bits;
1502 	else
1503 		sector_div(s, c->block_size >> SECTOR_SHIFT);
1504 	return s;
1505 }
1506 EXPORT_SYMBOL_GPL(dm_bufio_get_device_size);
1507 
1508 sector_t dm_bufio_get_block_number(struct dm_buffer *b)
1509 {
1510 	return b->block;
1511 }
1512 EXPORT_SYMBOL_GPL(dm_bufio_get_block_number);
1513 
1514 void *dm_bufio_get_block_data(struct dm_buffer *b)
1515 {
1516 	return b->data;
1517 }
1518 EXPORT_SYMBOL_GPL(dm_bufio_get_block_data);
1519 
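/*
 * The auxiliary data is the "aux_size" bytes that dm_bufio_client_create()
 * allocates immediately after each struct dm_buffer (the slab_buffer cache
 * is created with size sizeof(struct dm_buffer) + aux_size).
 */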
1520 void *dm_bufio_get_aux_data(struct dm_buffer *b)
1521 {
1522 	return b + 1;
1523 }
1524 EXPORT_SYMBOL_GPL(dm_bufio_get_aux_data);
1525 
1526 struct dm_bufio_client *dm_bufio_get_client(struct dm_buffer *b)
1527 {
1528 	return b->c;
1529 }
1530 EXPORT_SYMBOL_GPL(dm_bufio_get_client);
1531 
1532 static void drop_buffers(struct dm_bufio_client *c)
1533 {
1534 	struct dm_buffer *b;
1535 	int i;
1536 	bool warned = false;
1537 
1538 	BUG_ON(dm_bufio_in_request());
1539 
1540 	/*
1541 	 * An optimization so that the buffers are not written one-by-one.
1542 	 */
1543 	dm_bufio_write_dirty_buffers_async(c);
1544 
1545 	dm_bufio_lock(c);
1546 
1547 	while ((b = __get_unclaimed_buffer(c)))
1548 		__free_buffer_wake(b);
1549 
1550 	for (i = 0; i < LIST_SIZE; i++)
1551 		list_for_each_entry(b, &c->lru[i], lru_list) {
1552 			WARN_ON(!warned);
1553 			warned = true;
1554 			DMERR("leaked buffer %llx, hold count %u, list %d",
1555 			      (unsigned long long)b->block, b->hold_count, i);
1556 #ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
1557 			stack_trace_print(b->stack_entries, b->stack_len, 1);
1558 			/* mark unclaimed to avoid BUG_ON below */
1559 			b->hold_count = 0;
1560 #endif
1561 		}
1562 
1563 #ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
1564 	while ((b = __get_unclaimed_buffer(c)))
1565 		__free_buffer_wake(b);
1566 #endif
1567 
1568 	for (i = 0; i < LIST_SIZE; i++)
1569 		BUG_ON(!list_empty(&c->lru[i]));
1570 
1571 	dm_bufio_unlock(c);
1572 }
1573 
1574 /*
1575  * We may not be able to evict this buffer if IO is pending or the client
1576  * is still using it.  The caller is expected to know the buffer is too old.
1577  *
1578  * And if GFP_NOFS is used, we must not do any I/O because we hold
1579  * dm_bufio_clients_lock and we would risk deadlock if the I/O gets
1580  * rerouted to a different bufio client.
1581  */
1582 static bool __try_evict_buffer(struct dm_buffer *b, gfp_t gfp)
1583 {
1584 	if (!(gfp & __GFP_FS)) {
1585 		if (test_bit(B_READING, &b->state) ||
1586 		    test_bit(B_WRITING, &b->state) ||
1587 		    test_bit(B_DIRTY, &b->state))
1588 			return false;
1589 	}
1590 
1591 	if (b->hold_count)
1592 		return false;
1593 
1594 	__make_buffer_clean(b);
1595 	__unlink_buffer(b);
1596 	__free_buffer_wake(b);
1597 
1598 	return true;
1599 }
1600 
1601 static unsigned long get_retain_buffers(struct dm_bufio_client *c)
1602 {
1603 	unsigned long retain_bytes = READ_ONCE(dm_bufio_retain_bytes);
1604 	if (likely(c->sectors_per_block_bits >= 0))
1605 		retain_bytes >>= c->sectors_per_block_bits + SECTOR_SHIFT;
1606 	else
1607 		retain_bytes /= c->block_size;
1608 	return retain_bytes;
1609 }
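/*
 * Worked example: with the default retain_bytes of 256KiB and a 4KiB block
 * size (sectors_per_block_bits == 3), the retain target is
 * 262144 >> (3 + SECTOR_SHIFT) == 64 buffers per client.
 */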
1610 
1611 static unsigned long __scan(struct dm_bufio_client *c, unsigned long nr_to_scan,
1612 			    gfp_t gfp_mask)
1613 {
1614 	int l;
1615 	struct dm_buffer *b, *tmp;
1616 	unsigned long freed = 0;
1617 	unsigned long count = c->n_buffers[LIST_CLEAN] +
1618 			      c->n_buffers[LIST_DIRTY];
1619 	unsigned long retain_target = get_retain_buffers(c);
1620 
1621 	for (l = 0; l < LIST_SIZE; l++) {
1622 		list_for_each_entry_safe_reverse(b, tmp, &c->lru[l], lru_list) {
1623 			if (__try_evict_buffer(b, gfp_mask))
1624 				freed++;
1625 			if (!--nr_to_scan || ((count - freed) <= retain_target))
1626 				return freed;
1627 			cond_resched();
1628 		}
1629 	}
1630 	return freed;
1631 }
1632 
1633 static unsigned long
1634 dm_bufio_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
1635 {
1636 	struct dm_bufio_client *c;
1637 	unsigned long freed;
1638 
1639 	c = container_of(shrink, struct dm_bufio_client, shrinker);
1640 	if (sc->gfp_mask & __GFP_FS)
1641 		dm_bufio_lock(c);
1642 	else if (!dm_bufio_trylock(c))
1643 		return SHRINK_STOP;
1644 
1645 	freed  = __scan(c, sc->nr_to_scan, sc->gfp_mask);
1646 	dm_bufio_unlock(c);
1647 	return freed;
1648 }
1649 
1650 static unsigned long
1651 dm_bufio_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
1652 {
1653 	struct dm_bufio_client *c = container_of(shrink, struct dm_bufio_client, shrinker);
1654 	unsigned long count = READ_ONCE(c->n_buffers[LIST_CLEAN]) +
1655 			      READ_ONCE(c->n_buffers[LIST_DIRTY]);
1656 	unsigned long retain_target = get_retain_buffers(c);
1657 
1658 	return (count < retain_target) ? 0 : (count - retain_target);
1659 }
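/*
 * Shrinker semantics: dm_bufio_shrink_count() reports how many buffers sit
 * above the per-client retain target; dm_bufio_shrink_scan() evicts up to
 * nr_to_scan of them under the client mutex, stopping early at the retain
 * target, or returns SHRINK_STOP if the mutex cannot be taken from a
 * non-__GFP_FS context.
 */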
1660 
1661 /*
1662  * Create the buffering interface
1663  */
1664 struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsigned block_size,
1665 					       unsigned reserved_buffers, unsigned aux_size,
1666 					       void (*alloc_callback)(struct dm_buffer *),
1667 					       void (*write_callback)(struct dm_buffer *))
1668 {
1669 	int r;
1670 	struct dm_bufio_client *c;
1671 	unsigned i;
1672 	char slab_name[27];
1673 
1674 	if (!block_size || block_size & ((1 << SECTOR_SHIFT) - 1)) {
1675 		DMERR("%s: block size not specified or is not multiple of 512b", __func__);
1676 		r = -EINVAL;
1677 		goto bad_client;
1678 	}
1679 
1680 	c = kzalloc(sizeof(*c), GFP_KERNEL);
1681 	if (!c) {
1682 		r = -ENOMEM;
1683 		goto bad_client;
1684 	}
1685 	c->buffer_tree = RB_ROOT;
1686 
1687 	c->bdev = bdev;
1688 	c->block_size = block_size;
1689 	if (is_power_of_2(block_size))
1690 		c->sectors_per_block_bits = __ffs(block_size) - SECTOR_SHIFT;
1691 	else
1692 		c->sectors_per_block_bits = -1;
1693 
1694 	c->alloc_callback = alloc_callback;
1695 	c->write_callback = write_callback;
1696 
1697 	for (i = 0; i < LIST_SIZE; i++) {
1698 		INIT_LIST_HEAD(&c->lru[i]);
1699 		c->n_buffers[i] = 0;
1700 	}
1701 
1702 	mutex_init(&c->lock);
1703 	INIT_LIST_HEAD(&c->reserved_buffers);
1704 	c->need_reserved_buffers = reserved_buffers;
1705 
1706 	dm_bufio_set_minimum_buffers(c, DM_BUFIO_MIN_BUFFERS);
1707 
1708 	init_waitqueue_head(&c->free_buffer_wait);
1709 	c->async_write_error = 0;
1710 
1711 	c->dm_io = dm_io_client_create();
1712 	if (IS_ERR(c->dm_io)) {
1713 		r = PTR_ERR(c->dm_io);
1714 		goto bad_dm_io;
1715 	}
1716 
1717 	if (block_size <= KMALLOC_MAX_SIZE &&
1718 	    (block_size < PAGE_SIZE || !is_power_of_2(block_size))) {
1719 		unsigned align = min(1U << __ffs(block_size), (unsigned)PAGE_SIZE);
1720 		snprintf(slab_name, sizeof slab_name, "dm_bufio_cache-%u", block_size);
1721 		c->slab_cache = kmem_cache_create(slab_name, block_size, align,
1722 						  SLAB_RECLAIM_ACCOUNT, NULL);
1723 		if (!c->slab_cache) {
1724 			r = -ENOMEM;
1725 			goto bad;
1726 		}
1727 	}
1728 	if (aux_size)
1729 		snprintf(slab_name, sizeof slab_name, "dm_bufio_buffer-%u", aux_size);
1730 	else
1731 		snprintf(slab_name, sizeof slab_name, "dm_bufio_buffer");
1732 	c->slab_buffer = kmem_cache_create(slab_name, sizeof(struct dm_buffer) + aux_size,
1733 					   0, SLAB_RECLAIM_ACCOUNT, NULL);
1734 	if (!c->slab_buffer) {
1735 		r = -ENOMEM;
1736 		goto bad;
1737 	}
1738 
1739 	while (c->need_reserved_buffers) {
1740 		struct dm_buffer *b = alloc_buffer(c, GFP_KERNEL);
1741 
1742 		if (!b) {
1743 			r = -ENOMEM;
1744 			goto bad;
1745 		}
1746 		__free_buffer_wake(b);
1747 	}
1748 
1749 	c->shrinker.count_objects = dm_bufio_shrink_count;
1750 	c->shrinker.scan_objects = dm_bufio_shrink_scan;
1751 	c->shrinker.seeks = 1;
1752 	c->shrinker.batch = 0;
1753 	r = register_shrinker(&c->shrinker);
1754 	if (r)
1755 		goto bad;
1756 
1757 	mutex_lock(&dm_bufio_clients_lock);
1758 	dm_bufio_client_count++;
1759 	list_add(&c->client_list, &dm_bufio_all_clients);
1760 	__cache_size_refresh();
1761 	mutex_unlock(&dm_bufio_clients_lock);
1762 
1763 	return c;
1764 
1765 bad:
1766 	while (!list_empty(&c->reserved_buffers)) {
1767 		struct dm_buffer *b = list_entry(c->reserved_buffers.next,
1768 						 struct dm_buffer, lru_list);
1769 		list_del(&b->lru_list);
1770 		free_buffer(b);
1771 	}
1772 	kmem_cache_destroy(c->slab_cache);
1773 	kmem_cache_destroy(c->slab_buffer);
1774 	dm_io_client_destroy(c->dm_io);
1775 bad_dm_io:
1776 	mutex_destroy(&c->lock);
1777 	kfree(c);
1778 bad_client:
1779 	return ERR_PTR(r);
1780 }
1781 EXPORT_SYMBOL_GPL(dm_bufio_client_create);
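/*
 * Illustrative client lifecycle (a sketch only; "bdev" and the parameter
 * values are hypothetical):
 *
 *	struct dm_bufio_client *c;
 *
 *	c = dm_bufio_client_create(bdev, 4096, 1, 0, NULL, NULL);
 *	if (IS_ERR(c))
 *		return PTR_ERR(c);
 *	... use dm_bufio_read(), dm_bufio_new() and friends ...
 *	dm_bufio_write_dirty_buffers(c);
 *	dm_bufio_client_destroy(c);
 */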
1782 
1783 /*
1784  * Free the buffering interface.
1785  * It is required that there are no outstanding references to any buffers.
1786  */
1787 void dm_bufio_client_destroy(struct dm_bufio_client *c)
1788 {
1789 	unsigned i;
1790 
1791 	drop_buffers(c);
1792 
1793 	unregister_shrinker(&c->shrinker);
1794 
1795 	mutex_lock(&dm_bufio_clients_lock);
1796 
1797 	list_del(&c->client_list);
1798 	dm_bufio_client_count--;
1799 	__cache_size_refresh();
1800 
1801 	mutex_unlock(&dm_bufio_clients_lock);
1802 
1803 	BUG_ON(!RB_EMPTY_ROOT(&c->buffer_tree));
1804 	BUG_ON(c->need_reserved_buffers);
1805 
1806 	while (!list_empty(&c->reserved_buffers)) {
1807 		struct dm_buffer *b = list_entry(c->reserved_buffers.next,
1808 						 struct dm_buffer, lru_list);
1809 		list_del(&b->lru_list);
1810 		free_buffer(b);
1811 	}
1812 
1813 	for (i = 0; i < LIST_SIZE; i++)
1814 		if (c->n_buffers[i])
1815 			DMERR("leaked buffer count %d: %ld", i, c->n_buffers[i]);
1816 
1817 	for (i = 0; i < LIST_SIZE; i++)
1818 		BUG_ON(c->n_buffers[i]);
1819 
1820 	kmem_cache_destroy(c->slab_cache);
1821 	kmem_cache_destroy(c->slab_buffer);
1822 	dm_io_client_destroy(c->dm_io);
1823 	mutex_destroy(&c->lock);
1824 	kfree(c);
1825 }
1826 EXPORT_SYMBOL_GPL(dm_bufio_client_destroy);
1827 
1828 void dm_bufio_set_sector_offset(struct dm_bufio_client *c, sector_t start)
1829 {
1830 	c->start = start;
1831 }
1832 EXPORT_SYMBOL_GPL(dm_bufio_set_sector_offset);
1833 
1834 static unsigned get_max_age_hz(void)
1835 {
1836 	unsigned max_age = READ_ONCE(dm_bufio_max_age);
1837 
1838 	if (max_age > UINT_MAX / HZ)
1839 		max_age = UINT_MAX / HZ;
1840 
1841 	return max_age * HZ;
1842 }
1843 
1844 static bool older_than(struct dm_buffer *b, unsigned long age_hz)
1845 {
1846 	return time_after_eq(jiffies, b->last_accessed + age_hz);
1847 }
1848 
1849 static void __evict_old_buffers(struct dm_bufio_client *c, unsigned long age_hz)
1850 {
1851 	struct dm_buffer *b, *tmp;
1852 	unsigned long retain_target = get_retain_buffers(c);
1853 	unsigned long count;
1854 	LIST_HEAD(write_list);
1855 
1856 	dm_bufio_lock(c);
1857 
1858 	__check_watermark(c, &write_list);
1859 	if (unlikely(!list_empty(&write_list))) {
1860 		dm_bufio_unlock(c);
1861 		__flush_write_list(&write_list);
1862 		dm_bufio_lock(c);
1863 	}
1864 
1865 	count = c->n_buffers[LIST_CLEAN] + c->n_buffers[LIST_DIRTY];
1866 	list_for_each_entry_safe_reverse(b, tmp, &c->lru[LIST_CLEAN], lru_list) {
1867 		if (count <= retain_target)
1868 			break;
1869 
1870 		if (!older_than(b, age_hz))
1871 			break;
1872 
1873 		if (__try_evict_buffer(b, 0))
1874 			count--;
1875 
1876 		cond_resched();
1877 	}
1878 
1879 	dm_bufio_unlock(c);
1880 }
1881 
1882 static void do_global_cleanup(struct work_struct *w)
1883 {
1884 	struct dm_bufio_client *locked_client = NULL;
1885 	struct dm_bufio_client *current_client;
1886 	struct dm_buffer *b;
1887 	unsigned spinlock_hold_count;
1888 	unsigned long threshold = dm_bufio_cache_size -
1889 		dm_bufio_cache_size / DM_BUFIO_LOW_WATERMARK_RATIO;
1890 	unsigned long loops = global_num * 2;
1891 
1892 	mutex_lock(&dm_bufio_clients_lock);
1893 
1894 	while (1) {
1895 		cond_resched();
1896 
1897 		spin_lock(&global_spinlock);
1898 		if (unlikely(dm_bufio_current_allocated <= threshold))
1899 			break;
1900 
1901 		spinlock_hold_count = 0;
1902 get_next:
1903 		if (!loops--)
1904 			break;
1905 		if (unlikely(list_empty(&global_queue)))
1906 			break;
1907 		b = list_entry(global_queue.prev, struct dm_buffer, global_list);
1908 
1909 		if (b->accessed) {
1910 			b->accessed = 0;
1911 			list_move(&b->global_list, &global_queue);
1912 			if (likely(++spinlock_hold_count < 16))
1913 				goto get_next;
1914 			spin_unlock(&global_spinlock);
1915 			continue;
1916 		}
1917 
1918 		current_client = b->c;
1919 		if (unlikely(current_client != locked_client)) {
1920 			if (locked_client)
1921 				dm_bufio_unlock(locked_client);
1922 
1923 			if (!dm_bufio_trylock(current_client)) {
1924 				spin_unlock(&global_spinlock);
1925 				dm_bufio_lock(current_client);
1926 				locked_client = current_client;
1927 				continue;
1928 			}
1929 
1930 			locked_client = current_client;
1931 		}
1932 
1933 		spin_unlock(&global_spinlock);
1934 
1935 		if (unlikely(!__try_evict_buffer(b, GFP_KERNEL))) {
1936 			spin_lock(&global_spinlock);
1937 			list_move(&b->global_list, &global_queue);
1938 			spin_unlock(&global_spinlock);
1939 		}
1940 	}
1941 
1942 	spin_unlock(&global_spinlock);
1943 
1944 	if (locked_client)
1945 		dm_bufio_unlock(locked_client);
1946 
1947 	mutex_unlock(&dm_bufio_clients_lock);
1948 }
1949 
1950 static void cleanup_old_buffers(void)
1951 {
1952 	unsigned long max_age_hz = get_max_age_hz();
1953 	struct dm_bufio_client *c;
1954 
1955 	mutex_lock(&dm_bufio_clients_lock);
1956 
1957 	__cache_size_refresh();
1958 
1959 	list_for_each_entry(c, &dm_bufio_all_clients, client_list)
1960 		__evict_old_buffers(c, max_age_hz);
1961 
1962 	mutex_unlock(&dm_bufio_clients_lock);
1963 }
1964 
1965 static void work_fn(struct work_struct *w)
1966 {
1967 	cleanup_old_buffers();
1968 
1969 	queue_delayed_work(dm_bufio_wq, &dm_bufio_cleanup_old_work,
1970 			   DM_BUFIO_WORK_TIMER_SECS * HZ);
1971 }
1972 
1973 /*----------------------------------------------------------------
1974  * Module setup
1975  *--------------------------------------------------------------*/
1976 
1977 /*
1978  * This is called only once for the whole dm_bufio module.
1979  * It initializes memory limit.
1980  */
1981 static int __init dm_bufio_init(void)
1982 {
1983 	__u64 mem;
1984 
1985 	dm_bufio_allocated_kmem_cache = 0;
1986 	dm_bufio_allocated_get_free_pages = 0;
1987 	dm_bufio_allocated_vmalloc = 0;
1988 	dm_bufio_current_allocated = 0;
1989 
1990 	mem = (__u64)mult_frac(totalram_pages() - totalhigh_pages(),
1991 			       DM_BUFIO_MEMORY_PERCENT, 100) << PAGE_SHIFT;
1992 
1993 	if (mem > ULONG_MAX)
1994 		mem = ULONG_MAX;
1995 
1996 #ifdef CONFIG_MMU
1997 	if (mem > mult_frac(VMALLOC_TOTAL, DM_BUFIO_VMALLOC_PERCENT, 100))
1998 		mem = mult_frac(VMALLOC_TOTAL, DM_BUFIO_VMALLOC_PERCENT, 100);
1999 #endif
2000 
2001 	dm_bufio_default_cache_size = mem;
2002 
2003 	mutex_lock(&dm_bufio_clients_lock);
2004 	__cache_size_refresh();
2005 	mutex_unlock(&dm_bufio_clients_lock);
2006 
2007 	dm_bufio_wq = alloc_workqueue("dm_bufio_cache", WQ_MEM_RECLAIM, 0);
2008 	if (!dm_bufio_wq)
2009 		return -ENOMEM;
2010 
2011 	INIT_DELAYED_WORK(&dm_bufio_cleanup_old_work, work_fn);
2012 	INIT_WORK(&dm_bufio_replacement_work, do_global_cleanup);
2013 	queue_delayed_work(dm_bufio_wq, &dm_bufio_cleanup_old_work,
2014 			   DM_BUFIO_WORK_TIMER_SECS * HZ);
2015 
2016 	return 0;
2017 }
2018 
2019 /*
2020  * This is called once when unloading the dm_bufio module.
2021  */
2022 static void __exit dm_bufio_exit(void)
2023 {
2024 	int bug = 0;
2025 
2026 	cancel_delayed_work_sync(&dm_bufio_cleanup_old_work);
2027 	flush_workqueue(dm_bufio_wq);
2028 	destroy_workqueue(dm_bufio_wq);
2029 
2030 	if (dm_bufio_client_count) {
2031 		DMCRIT("%s: dm_bufio_client_count leaked: %d",
2032 			__func__, dm_bufio_client_count);
2033 		bug = 1;
2034 	}
2035 
2036 	if (dm_bufio_current_allocated) {
2037 		DMCRIT("%s: dm_bufio_current_allocated leaked: %lu",
2038 			__func__, dm_bufio_current_allocated);
2039 		bug = 1;
2040 	}
2041 
2042 	if (dm_bufio_allocated_get_free_pages) {
2043 		DMCRIT("%s: dm_bufio_allocated_get_free_pages leaked: %lu",
2044 		       __func__, dm_bufio_allocated_get_free_pages);
2045 		bug = 1;
2046 	}
2047 
2048 	if (dm_bufio_allocated_vmalloc) {
2049 		DMCRIT("%s: dm_bufio_vmalloc leaked: %lu",
2050 		       __func__, dm_bufio_allocated_vmalloc);
2051 		bug = 1;
2052 	}
2053 
2054 	BUG_ON(bug);
2055 }
2056 
2057 module_init(dm_bufio_init)
2058 module_exit(dm_bufio_exit)
2059 
2060 module_param_named(max_cache_size_bytes, dm_bufio_cache_size, ulong, S_IRUGO | S_IWUSR);
2061 MODULE_PARM_DESC(max_cache_size_bytes, "Size of metadata cache");
2062 
2063 module_param_named(max_age_seconds, dm_bufio_max_age, uint, S_IRUGO | S_IWUSR);
2064 MODULE_PARM_DESC(max_age_seconds, "Max age of a buffer in seconds");
2065 
2066 module_param_named(retain_bytes, dm_bufio_retain_bytes, ulong, S_IRUGO | S_IWUSR);
2067 MODULE_PARM_DESC(retain_bytes, "Try to keep at least this many bytes cached in memory");
2068 
2069 module_param_named(peak_allocated_bytes, dm_bufio_peak_allocated, ulong, S_IRUGO | S_IWUSR);
2070 MODULE_PARM_DESC(peak_allocated_bytes, "Tracks the maximum allocated memory");
2071 
2072 module_param_named(allocated_kmem_cache_bytes, dm_bufio_allocated_kmem_cache, ulong, S_IRUGO);
2073 MODULE_PARM_DESC(allocated_kmem_cache_bytes, "Memory allocated with kmem_cache_alloc");
2074 
2075 module_param_named(allocated_get_free_pages_bytes, dm_bufio_allocated_get_free_pages, ulong, S_IRUGO);
2076 MODULE_PARM_DESC(allocated_get_free_pages_bytes, "Memory allocated with get_free_pages");
2077 
2078 module_param_named(allocated_vmalloc_bytes, dm_bufio_allocated_vmalloc, ulong, S_IRUGO);
2079 MODULE_PARM_DESC(allocated_vmalloc_bytes, "Memory allocated with vmalloc");
2080 
2081 module_param_named(current_allocated_bytes, dm_bufio_current_allocated, ulong, S_IRUGO);
2082 MODULE_PARM_DESC(current_allocated_bytes, "Memory currently used by the cache");
2083 
2084 MODULE_AUTHOR("Mikulas Patocka <dm-devel@redhat.com>");
2085 MODULE_DESCRIPTION(DM_NAME " buffered I/O library");
2086 MODULE_LICENSE("GPL");
2087