xref: /openbmc/linux/drivers/md/dm-bufio.c (revision 7ae5c03a)
1 /*
2  * Copyright (C) 2009-2011 Red Hat, Inc.
3  *
4  * Author: Mikulas Patocka <mpatocka@redhat.com>
5  *
6  * This file is released under the GPL.
7  */
8 
9 #include <linux/dm-bufio.h>
10 
11 #include <linux/device-mapper.h>
12 #include <linux/dm-io.h>
13 #include <linux/slab.h>
14 #include <linux/sched/mm.h>
15 #include <linux/jiffies.h>
16 #include <linux/vmalloc.h>
17 #include <linux/shrinker.h>
18 #include <linux/module.h>
19 #include <linux/rbtree.h>
20 #include <linux/stacktrace.h>
21 #include <linux/jump_label.h>
22 
23 #define DM_MSG_PREFIX "bufio"
24 
25 /*
26  * Memory management policy:
27  *	Limit the number of buffers to DM_BUFIO_MEMORY_PERCENT of main memory
28  *	or DM_BUFIO_VMALLOC_PERCENT of vmalloc memory (whichever is lower).
29  *	Always allocate at least DM_BUFIO_MIN_BUFFERS buffers.
30  *	Start background writeback when the number of dirty buffers exceeds
31  *	DM_BUFIO_WRITEBACK_RATIO times the number of clean buffers.
32  */
33 #define DM_BUFIO_MIN_BUFFERS		8
34 
35 #define DM_BUFIO_MEMORY_PERCENT		2
36 #define DM_BUFIO_VMALLOC_PERCENT	25
37 #define DM_BUFIO_WRITEBACK_RATIO	3
38 #define DM_BUFIO_LOW_WATERMARK_RATIO	16
39 
40 /*
41  * Check buffer ages in this interval (seconds)
42  */
43 #define DM_BUFIO_WORK_TIMER_SECS	30
44 
45 /*
46  * Free buffers when they are older than this (seconds)
47  */
48 #define DM_BUFIO_DEFAULT_AGE_SECS	300
49 
50 /*
51  * The nr of bytes of cached data to keep around.
52  */
53 #define DM_BUFIO_DEFAULT_RETAIN_BYTES   (256 * 1024)
54 
55 /*
56  * Align buffer writes to this boundary.
57  * Tests show that SSDs have the highest IOPS when using 4k writes.
58  */
59 #define DM_BUFIO_WRITE_ALIGN		4096
60 
61 /*
62  * dm_buffer->list_mode
63  */
64 #define LIST_CLEAN	0
65 #define LIST_DIRTY	1
66 #define LIST_SIZE	2
67 
68 /*
69  * Linking of buffers:
70  *	All buffers are linked to buffer_tree with their node field.
71  *
72  *	Clean buffers that are not being written (B_WRITING not set)
73  *	are linked to lru[LIST_CLEAN] with their lru_list field.
74  *
75  *	Dirty and clean buffers that are being written are linked to
76  *	lru[LIST_DIRTY] with their lru_list field. When the write
77  *	finishes, the buffer cannot be relinked immediately (because we
78  *	are in an interrupt context and relinking requires process
79  *	context), so some clean-but-not-being-written buffers can remain on
80  *	lru[LIST_DIRTY] too.  They are relinked to lru[LIST_CLEAN] later, in
81  *	process context.
82  */
83 struct dm_bufio_client {
84 	struct mutex lock;
85 	spinlock_t spinlock;
86 	unsigned long spinlock_flags;
87 
88 	struct list_head lru[LIST_SIZE];
89 	unsigned long n_buffers[LIST_SIZE];
90 
91 	struct block_device *bdev;
92 	unsigned block_size;
93 	s8 sectors_per_block_bits;
94 	void (*alloc_callback)(struct dm_buffer *);
95 	void (*write_callback)(struct dm_buffer *);
96 	bool no_sleep;
97 
98 	struct kmem_cache *slab_buffer;
99 	struct kmem_cache *slab_cache;
100 	struct dm_io_client *dm_io;
101 
102 	struct list_head reserved_buffers;
103 	unsigned need_reserved_buffers;
104 
105 	unsigned minimum_buffers;
106 
107 	struct rb_root buffer_tree;
108 	wait_queue_head_t free_buffer_wait;
109 
110 	sector_t start;
111 
112 	int async_write_error;
113 
114 	struct list_head client_list;
115 
116 	struct shrinker shrinker;
117 	struct work_struct shrink_work;
118 	atomic_long_t need_shrink;
119 };
120 
121 /*
122  * Buffer state bits.
123  */
124 #define B_READING	0
125 #define B_WRITING	1
126 #define B_DIRTY		2
127 
128 /*
129  * Describes how the block was allocated:
130  * kmem_cache_alloc(), __get_free_pages() or vmalloc().
131  * See the comment at alloc_buffer_data.
132  */
133 enum data_mode {
134 	DATA_MODE_SLAB = 0,
135 	DATA_MODE_GET_FREE_PAGES = 1,
136 	DATA_MODE_VMALLOC = 2,
137 	DATA_MODE_LIMIT = 3
138 };
139 
140 struct dm_buffer {
141 	struct rb_node node;
142 	struct list_head lru_list;
143 	struct list_head global_list;
144 	sector_t block;
145 	void *data;
146 	unsigned char data_mode;		/* DATA_MODE_* */
147 	unsigned char list_mode;		/* LIST_* */
148 	blk_status_t read_error;
149 	blk_status_t write_error;
150 	unsigned accessed;
151 	unsigned hold_count;
152 	unsigned long state;
153 	unsigned long last_accessed;
154 	unsigned dirty_start;
155 	unsigned dirty_end;
156 	unsigned write_start;
157 	unsigned write_end;
158 	struct dm_bufio_client *c;
159 	struct list_head write_list;
160 	void (*end_io)(struct dm_buffer *, blk_status_t);
161 #ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
162 #define MAX_STACK 10
163 	unsigned int stack_len;
164 	unsigned long stack_entries[MAX_STACK];
165 #endif
166 };
167 
168 static DEFINE_STATIC_KEY_FALSE(no_sleep_enabled);
169 
170 /*----------------------------------------------------------------*/
171 
172 #define dm_bufio_in_request()	(!!current->bio_list)
173 
174 static void dm_bufio_lock(struct dm_bufio_client *c)
175 {
176 	if (static_branch_unlikely(&no_sleep_enabled) && c->no_sleep)
177 		spin_lock_irqsave_nested(&c->spinlock, c->spinlock_flags, dm_bufio_in_request());
178 	else
179 		mutex_lock_nested(&c->lock, dm_bufio_in_request());
180 }
181 
182 static int dm_bufio_trylock(struct dm_bufio_client *c)
183 {
184 	if (static_branch_unlikely(&no_sleep_enabled) && c->no_sleep)
185 		return spin_trylock_irqsave(&c->spinlock, c->spinlock_flags);
186 	else
187 		return mutex_trylock(&c->lock);
188 }
189 
190 static void dm_bufio_unlock(struct dm_bufio_client *c)
191 {
192 	if (static_branch_unlikely(&no_sleep_enabled) && c->no_sleep)
193 		spin_unlock_irqrestore(&c->spinlock, c->spinlock_flags);
194 	else
195 		mutex_unlock(&c->lock);
196 }
197 
198 /*----------------------------------------------------------------*/
199 
200 /*
201  * Default cache size: available memory divided by the ratio.
202  */
203 static unsigned long dm_bufio_default_cache_size;
204 
205 /*
206  * Total cache size set by the user.
207  */
208 static unsigned long dm_bufio_cache_size;
209 
210 /*
211  * A copy of dm_bufio_cache_size because dm_bufio_cache_size can change
212  * at any time.  If it disagrees, the user has changed cache size.
213  */
214 static unsigned long dm_bufio_cache_size_latch;
215 
216 static DEFINE_SPINLOCK(global_spinlock);
217 
218 static LIST_HEAD(global_queue);
219 
220 static unsigned long global_num = 0;
221 
222 /*
223  * Buffers are freed after this timeout
224  */
225 static unsigned dm_bufio_max_age = DM_BUFIO_DEFAULT_AGE_SECS;
226 static unsigned long dm_bufio_retain_bytes = DM_BUFIO_DEFAULT_RETAIN_BYTES;
227 
228 static unsigned long dm_bufio_peak_allocated;
229 static unsigned long dm_bufio_allocated_kmem_cache;
230 static unsigned long dm_bufio_allocated_get_free_pages;
231 static unsigned long dm_bufio_allocated_vmalloc;
232 static unsigned long dm_bufio_current_allocated;
233 
234 /*----------------------------------------------------------------*/
235 
236 /*
237  * The current number of clients.
238  */
239 static int dm_bufio_client_count;
240 
241 /*
242  * The list of all clients.
243  */
244 static LIST_HEAD(dm_bufio_all_clients);
245 
246 /*
247  * This mutex protects dm_bufio_cache_size_latch and dm_bufio_client_count
248  */
249 static DEFINE_MUTEX(dm_bufio_clients_lock);
250 
251 static struct workqueue_struct *dm_bufio_wq;
252 static struct delayed_work dm_bufio_cleanup_old_work;
253 static struct work_struct dm_bufio_replacement_work;
254 
255 
256 #ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
257 static void buffer_record_stack(struct dm_buffer *b)
258 {
259 	b->stack_len = stack_trace_save(b->stack_entries, MAX_STACK, 2);
260 }
261 #endif
262 
263 /*----------------------------------------------------------------
264  * A red/black tree acts as an index for all the buffers.
265  *--------------------------------------------------------------*/
266 static struct dm_buffer *__find(struct dm_bufio_client *c, sector_t block)
267 {
268 	struct rb_node *n = c->buffer_tree.rb_node;
269 	struct dm_buffer *b;
270 
271 	while (n) {
272 		b = container_of(n, struct dm_buffer, node);
273 
274 		if (b->block == block)
275 			return b;
276 
277 		n = block < b->block ? n->rb_left : n->rb_right;
278 	}
279 
280 	return NULL;
281 }
282 
283 static struct dm_buffer *__find_next(struct dm_bufio_client *c, sector_t block)
284 {
285 	struct rb_node *n = c->buffer_tree.rb_node;
286 	struct dm_buffer *b;
287 	struct dm_buffer *best = NULL;
288 
289 	while (n) {
290 		b = container_of(n, struct dm_buffer, node);
291 
292 		if (b->block == block)
293 			return b;
294 
295 		if (block <= b->block) {
296 			n = n->rb_left;
297 			best = b;
298 		} else {
299 			n = n->rb_right;
300 		}
301 	}
302 
303 	return best;
304 }
305 
306 static void __insert(struct dm_bufio_client *c, struct dm_buffer *b)
307 {
308 	struct rb_node **new = &c->buffer_tree.rb_node, *parent = NULL;
309 	struct dm_buffer *found;
310 
311 	while (*new) {
312 		found = container_of(*new, struct dm_buffer, node);
313 
314 		if (found->block == b->block) {
315 			BUG_ON(found != b);
316 			return;
317 		}
318 
319 		parent = *new;
320 		new = b->block < found->block ?
321 			&found->node.rb_left : &found->node.rb_right;
322 	}
323 
324 	rb_link_node(&b->node, parent, new);
325 	rb_insert_color(&b->node, &c->buffer_tree);
326 }
327 
328 static void __remove(struct dm_bufio_client *c, struct dm_buffer *b)
329 {
330 	rb_erase(&b->node, &c->buffer_tree);
331 }
332 
333 /*----------------------------------------------------------------*/
334 
335 static void adjust_total_allocated(struct dm_buffer *b, bool unlink)
336 {
337 	unsigned char data_mode;
338 	long diff;
339 
340 	static unsigned long * const class_ptr[DATA_MODE_LIMIT] = {
341 		&dm_bufio_allocated_kmem_cache,
342 		&dm_bufio_allocated_get_free_pages,
343 		&dm_bufio_allocated_vmalloc,
344 	};
345 
346 	data_mode = b->data_mode;
347 	diff = (long)b->c->block_size;
348 	if (unlink)
349 		diff = -diff;
350 
351 	spin_lock(&global_spinlock);
352 
353 	*class_ptr[data_mode] += diff;
354 
355 	dm_bufio_current_allocated += diff;
356 
357 	if (dm_bufio_current_allocated > dm_bufio_peak_allocated)
358 		dm_bufio_peak_allocated = dm_bufio_current_allocated;
359 
360 	b->accessed = 1;
361 
362 	if (!unlink) {
363 		list_add(&b->global_list, &global_queue);
364 		global_num++;
365 		if (dm_bufio_current_allocated > dm_bufio_cache_size)
366 			queue_work(dm_bufio_wq, &dm_bufio_replacement_work);
367 	} else {
368 		list_del(&b->global_list);
369 		global_num--;
370 	}
371 
372 	spin_unlock(&global_spinlock);
373 }
374 
375 /*
376  * Change the number of clients and recalculate per-client limit.
377  */
378 static void __cache_size_refresh(void)
379 {
380 	BUG_ON(!mutex_is_locked(&dm_bufio_clients_lock));
381 	BUG_ON(dm_bufio_client_count < 0);
382 
383 	dm_bufio_cache_size_latch = READ_ONCE(dm_bufio_cache_size);
384 
385 	/*
386 	 * Use default if set to 0 and report the actual cache size used.
387 	 */
388 	if (!dm_bufio_cache_size_latch) {
389 		(void)cmpxchg(&dm_bufio_cache_size, 0,
390 			      dm_bufio_default_cache_size);
391 		dm_bufio_cache_size_latch = dm_bufio_default_cache_size;
392 	}
393 }
394 
395 /*
396  * Allocating buffer data.
397  *
398  * Small buffers are allocated with kmem_cache, to use space optimally.
399  *
400  * For large buffers, we choose between get_free_pages and vmalloc.
401  * Each has advantages and disadvantages.
402  *
403  * __get_free_pages can randomly fail if the memory is fragmented.
404  * __vmalloc won't randomly fail, but vmalloc space is limited (it may be
405  * as low as 128M) so using it for caching is not appropriate.
406  *
407  * If the allocation may fail we use __get_free_pages. Memory fragmentation
408  * won't have a fatal effect here, but it just causes flushes of some other
409  * buffers and more I/O will be performed. Don't use __get_free_pages if it
410  * always fails (i.e. order >= MAX_ORDER).
411  *
412  * If the allocation shouldn't fail we use __vmalloc. This is only for the
413  * initial reserve allocation, so there's no risk of wasting all vmalloc
414  * space.
415  */
416 static void *alloc_buffer_data(struct dm_bufio_client *c, gfp_t gfp_mask,
417 			       unsigned char *data_mode)
418 {
419 	if (unlikely(c->slab_cache != NULL)) {
420 		*data_mode = DATA_MODE_SLAB;
421 		return kmem_cache_alloc(c->slab_cache, gfp_mask);
422 	}
423 
424 	if (c->block_size <= KMALLOC_MAX_SIZE &&
425 	    gfp_mask & __GFP_NORETRY) {
426 		*data_mode = DATA_MODE_GET_FREE_PAGES;
427 		return (void *)__get_free_pages(gfp_mask,
428 						c->sectors_per_block_bits - (PAGE_SHIFT - SECTOR_SHIFT));
429 	}
430 
431 	*data_mode = DATA_MODE_VMALLOC;
432 
433 	/*
434 	 * __vmalloc allocates the data pages and auxiliary structures with
435 	 * gfp_flags that were specified, but pagetables are always allocated
436 	 * with GFP_KERNEL, no matter what was specified as gfp_mask.
437 	 *
438 	 * Consequently, we must set per-process flag PF_MEMALLOC_NOIO so that
439 	 * all allocations done by this process (including pagetables) are done
440 	 * as if GFP_NOIO was specified.
441 	 */
442 	if (gfp_mask & __GFP_NORETRY) {
443 		unsigned noio_flag = memalloc_noio_save();
444 		void *ptr = __vmalloc(c->block_size, gfp_mask);
445 
446 		memalloc_noio_restore(noio_flag);
447 		return ptr;
448 	}
449 
450 	return __vmalloc(c->block_size, gfp_mask);
451 }
452 
453 /*
454  * Free buffer's data.
455  */
456 static void free_buffer_data(struct dm_bufio_client *c,
457 			     void *data, unsigned char data_mode)
458 {
459 	switch (data_mode) {
460 	case DATA_MODE_SLAB:
461 		kmem_cache_free(c->slab_cache, data);
462 		break;
463 
464 	case DATA_MODE_GET_FREE_PAGES:
465 		free_pages((unsigned long)data,
466 			   c->sectors_per_block_bits - (PAGE_SHIFT - SECTOR_SHIFT));
467 		break;
468 
469 	case DATA_MODE_VMALLOC:
470 		vfree(data);
471 		break;
472 
473 	default:
474 		DMCRIT("dm_bufio_free_buffer_data: bad data mode: %d",
475 		       data_mode);
476 		BUG();
477 	}
478 }
479 
480 /*
481  * Allocate buffer and its data.
482  */
483 static struct dm_buffer *alloc_buffer(struct dm_bufio_client *c, gfp_t gfp_mask)
484 {
485 	struct dm_buffer *b = kmem_cache_alloc(c->slab_buffer, gfp_mask);
486 
487 	if (!b)
488 		return NULL;
489 
490 	b->c = c;
491 
492 	b->data = alloc_buffer_data(c, gfp_mask, &b->data_mode);
493 	if (!b->data) {
494 		kmem_cache_free(c->slab_buffer, b);
495 		return NULL;
496 	}
497 
498 #ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
499 	b->stack_len = 0;
500 #endif
501 	return b;
502 }
503 
504 /*
505  * Free buffer and its data.
506  */
507 static void free_buffer(struct dm_buffer *b)
508 {
509 	struct dm_bufio_client *c = b->c;
510 
511 	free_buffer_data(c, b->data, b->data_mode);
512 	kmem_cache_free(c->slab_buffer, b);
513 }
514 
515 /*
516  * Link buffer to the buffer tree and clean or dirty queue.
517  */
518 static void __link_buffer(struct dm_buffer *b, sector_t block, int dirty)
519 {
520 	struct dm_bufio_client *c = b->c;
521 
522 	c->n_buffers[dirty]++;
523 	b->block = block;
524 	b->list_mode = dirty;
525 	list_add(&b->lru_list, &c->lru[dirty]);
526 	__insert(b->c, b);
527 	b->last_accessed = jiffies;
528 
529 	adjust_total_allocated(b, false);
530 }
531 
532 /*
533  * Unlink buffer from the buffer tree and dirty or clean queue.
534  */
535 static void __unlink_buffer(struct dm_buffer *b)
536 {
537 	struct dm_bufio_client *c = b->c;
538 
539 	BUG_ON(!c->n_buffers[b->list_mode]);
540 
541 	c->n_buffers[b->list_mode]--;
542 	__remove(b->c, b);
543 	list_del(&b->lru_list);
544 
545 	adjust_total_allocated(b, true);
546 }
547 
548 /*
549  * Place the buffer to the head of dirty or clean LRU queue.
550  */
551 static void __relink_lru(struct dm_buffer *b, int dirty)
552 {
553 	struct dm_bufio_client *c = b->c;
554 
555 	b->accessed = 1;
556 
557 	BUG_ON(!c->n_buffers[b->list_mode]);
558 
559 	c->n_buffers[b->list_mode]--;
560 	c->n_buffers[dirty]++;
561 	b->list_mode = dirty;
562 	list_move(&b->lru_list, &c->lru[dirty]);
563 	b->last_accessed = jiffies;
564 }
565 
566 /*----------------------------------------------------------------
567  * Submit I/O on the buffer.
568  *
569  * Bio interface is faster but it has some problems:
570  *	the vector list is limited (increasing this limit increases
571  *	memory-consumption per buffer, so it is not viable);
572  *
573  *	the memory must be direct-mapped, not vmalloced;
574  *
575  * If the buffer is small enough (up to DM_BUFIO_INLINE_VECS pages) and
576  * it is not vmalloced, try using the bio interface.
577  *
578  * If the buffer is big, if it is vmalloced or if the underlying device
579  * rejects the bio because it is too large, use dm-io layer to do the I/O.
580  * The dm-io layer splits the I/O into multiple requests, avoiding the above
581  * shortcomings.
582  *--------------------------------------------------------------*/
583 
584 /*
585  * dm-io completion routine. It just calls b->end_io, pretending that the
586  * request was handled directly with the bio interface.
587  */
588 static void dmio_complete(unsigned long error, void *context)
589 {
590 	struct dm_buffer *b = context;
591 
592 	b->end_io(b, unlikely(error != 0) ? BLK_STS_IOERR : 0);
593 }
594 
595 static void use_dmio(struct dm_buffer *b, enum req_op op, sector_t sector,
596 		     unsigned n_sectors, unsigned offset)
597 {
598 	int r;
599 	struct dm_io_request io_req = {
600 		.bi_opf = op,
601 		.notify.fn = dmio_complete,
602 		.notify.context = b,
603 		.client = b->c->dm_io,
604 	};
605 	struct dm_io_region region = {
606 		.bdev = b->c->bdev,
607 		.sector = sector,
608 		.count = n_sectors,
609 	};
610 
611 	if (b->data_mode != DATA_MODE_VMALLOC) {
612 		io_req.mem.type = DM_IO_KMEM;
613 		io_req.mem.ptr.addr = (char *)b->data + offset;
614 	} else {
615 		io_req.mem.type = DM_IO_VMA;
616 		io_req.mem.ptr.vma = (char *)b->data + offset;
617 	}
618 
619 	r = dm_io(&io_req, 1, &region, NULL);
620 	if (unlikely(r))
621 		b->end_io(b, errno_to_blk_status(r));
622 }
623 
624 static void bio_complete(struct bio *bio)
625 {
626 	struct dm_buffer *b = bio->bi_private;
627 	blk_status_t status = bio->bi_status;
628 	bio_uninit(bio);
629 	kfree(bio);
630 	b->end_io(b, status);
631 }
632 
633 static void use_bio(struct dm_buffer *b, enum req_op op, sector_t sector,
634 		    unsigned n_sectors, unsigned offset)
635 {
636 	struct bio *bio;
637 	char *ptr;
638 	unsigned vec_size, len;
639 
640 	vec_size = b->c->block_size >> PAGE_SHIFT;
641 	if (unlikely(b->c->sectors_per_block_bits < PAGE_SHIFT - SECTOR_SHIFT))
642 		vec_size += 2;
643 
644 	bio = bio_kmalloc(vec_size, GFP_NOWAIT | __GFP_NORETRY | __GFP_NOWARN);
645 	if (!bio) {
646 dmio:
647 		use_dmio(b, op, sector, n_sectors, offset);
648 		return;
649 	}
650 	bio_init(bio, b->c->bdev, bio->bi_inline_vecs, vec_size, op);
651 	bio->bi_iter.bi_sector = sector;
652 	bio->bi_end_io = bio_complete;
653 	bio->bi_private = b;
654 
655 	ptr = (char *)b->data + offset;
656 	len = n_sectors << SECTOR_SHIFT;
657 
658 	do {
659 		unsigned this_step = min((unsigned)(PAGE_SIZE - offset_in_page(ptr)), len);
660 		if (!bio_add_page(bio, virt_to_page(ptr), this_step,
661 				  offset_in_page(ptr))) {
662 			bio_put(bio);
663 			goto dmio;
664 		}
665 
666 		len -= this_step;
667 		ptr += this_step;
668 	} while (len > 0);
669 
670 	submit_bio(bio);
671 }
672 
673 static inline sector_t block_to_sector(struct dm_bufio_client *c, sector_t block)
674 {
675 	sector_t sector;
676 
677 	if (likely(c->sectors_per_block_bits >= 0))
678 		sector = block << c->sectors_per_block_bits;
679 	else
680 		sector = block * (c->block_size >> SECTOR_SHIFT);
681 	sector += c->start;
682 
683 	return sector;
684 }
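
/*
 * Worked example of the mapping above: with a 4 KiB block size,
 * sectors_per_block_bits is 3 (4096 bytes = 8 sectors), so block 5 maps to
 * sector (5 << 3) + c->start = 40 + c->start.  With a non-power-of-2 block
 * size such as 3 KiB, sectors_per_block_bits is -1 and the multiply path is
 * used instead: 5 * (3072 >> SECTOR_SHIFT) = 30, plus c->start.
 */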
685 
686 static void submit_io(struct dm_buffer *b, enum req_op op,
687 		      void (*end_io)(struct dm_buffer *, blk_status_t))
688 {
689 	unsigned n_sectors;
690 	sector_t sector;
691 	unsigned offset, end;
692 
693 	b->end_io = end_io;
694 
695 	sector = block_to_sector(b->c, b->block);
696 
697 	if (op != REQ_OP_WRITE) {
698 		n_sectors = b->c->block_size >> SECTOR_SHIFT;
699 		offset = 0;
700 	} else {
701 		if (b->c->write_callback)
702 			b->c->write_callback(b);
703 		offset = b->write_start;
704 		end = b->write_end;
705 		offset &= -DM_BUFIO_WRITE_ALIGN;
706 		end += DM_BUFIO_WRITE_ALIGN - 1;
707 		end &= -DM_BUFIO_WRITE_ALIGN;
708 		if (unlikely(end > b->c->block_size))
709 			end = b->c->block_size;
710 
711 		sector += offset >> SECTOR_SHIFT;
712 		n_sectors = (end - offset) >> SECTOR_SHIFT;
713 	}
714 
715 	if (b->data_mode != DATA_MODE_VMALLOC)
716 		use_bio(b, op, sector, n_sectors, offset);
717 	else
718 		use_dmio(b, op, sector, n_sectors, offset);
719 }
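
/*
 * Worked example of the write alignment above: with a 4 KiB block and
 * DM_BUFIO_WRITE_ALIGN of 4096, a dirty byte range of [100, 700) is rounded
 * out to [0, 4096): the offset is rounded down to 0 and the end is rounded up
 * to 4096, so the whole block is written (n_sectors = 8).
 */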
720 
721 /*----------------------------------------------------------------
722  * Writing dirty buffers
723  *--------------------------------------------------------------*/
724 
725 /*
726  * The endio routine for write.
727  *
728  * Set the error, clear B_WRITING bit and wake anyone who was waiting on
729  * it.
730  */
731 static void write_endio(struct dm_buffer *b, blk_status_t status)
732 {
733 	b->write_error = status;
734 	if (unlikely(status)) {
735 		struct dm_bufio_client *c = b->c;
736 
737 		(void)cmpxchg(&c->async_write_error, 0,
738 				blk_status_to_errno(status));
739 	}
740 
741 	BUG_ON(!test_bit(B_WRITING, &b->state));
742 
743 	smp_mb__before_atomic();
744 	clear_bit(B_WRITING, &b->state);
745 	smp_mb__after_atomic();
746 
747 	wake_up_bit(&b->state, B_WRITING);
748 }
749 
750 /*
751  * Initiate a write on a dirty buffer, but don't wait for it.
752  *
753  * - If the buffer is not dirty, exit.
754  * - If there is a previous write going on, wait for it to finish (we can't
755  *   have two writes on the same buffer simultaneously).
756  * - Submit our write and don't wait on it. We set B_WRITING indicating
757  *   that there is a write in progress.
758  */
759 static void __write_dirty_buffer(struct dm_buffer *b,
760 				 struct list_head *write_list)
761 {
762 	if (!test_bit(B_DIRTY, &b->state))
763 		return;
764 
765 	clear_bit(B_DIRTY, &b->state);
766 	wait_on_bit_lock_io(&b->state, B_WRITING, TASK_UNINTERRUPTIBLE);
767 
768 	b->write_start = b->dirty_start;
769 	b->write_end = b->dirty_end;
770 
771 	if (!write_list)
772 		submit_io(b, REQ_OP_WRITE, write_endio);
773 	else
774 		list_add_tail(&b->write_list, write_list);
775 }
776 
777 static void __flush_write_list(struct list_head *write_list)
778 {
779 	struct blk_plug plug;
780 	blk_start_plug(&plug);
781 	while (!list_empty(write_list)) {
782 		struct dm_buffer *b =
783 			list_entry(write_list->next, struct dm_buffer, write_list);
784 		list_del(&b->write_list);
785 		submit_io(b, REQ_OP_WRITE, write_endio);
786 		cond_resched();
787 	}
788 	blk_finish_plug(&plug);
789 }
790 
791 /*
792  * Wait until any activity on the buffer finishes.  Possibly write the
793  * buffer if it is dirty.  When this function finishes, there is no I/O
794  * running on the buffer and the buffer is not dirty.
795  */
796 static void __make_buffer_clean(struct dm_buffer *b)
797 {
798 	BUG_ON(b->hold_count);
799 
800 	if (!b->state)	/* fast case */
801 		return;
802 
803 	wait_on_bit_io(&b->state, B_READING, TASK_UNINTERRUPTIBLE);
804 	__write_dirty_buffer(b, NULL);
805 	wait_on_bit_io(&b->state, B_WRITING, TASK_UNINTERRUPTIBLE);
806 }
807 
808 /*
809  * Find some buffer that is not held by anybody, clean it, unlink it and
810  * return it.
811  */
812 static struct dm_buffer *__get_unclaimed_buffer(struct dm_bufio_client *c)
813 {
814 	struct dm_buffer *b;
815 
816 	list_for_each_entry_reverse(b, &c->lru[LIST_CLEAN], lru_list) {
817 		BUG_ON(test_bit(B_WRITING, &b->state));
818 		BUG_ON(test_bit(B_DIRTY, &b->state));
819 
820 		if (!b->hold_count) {
821 			__make_buffer_clean(b);
822 			__unlink_buffer(b);
823 			return b;
824 		}
825 		cond_resched();
826 	}
827 
828 	list_for_each_entry_reverse(b, &c->lru[LIST_DIRTY], lru_list) {
829 		BUG_ON(test_bit(B_READING, &b->state));
830 
831 		if (!b->hold_count) {
832 			__make_buffer_clean(b);
833 			__unlink_buffer(b);
834 			return b;
835 		}
836 		cond_resched();
837 	}
838 
839 	return NULL;
840 }
841 
842 /*
843  * Wait until some other thread frees a buffer or releases its hold count
844  * on some buffer.
845  *
846  * This function is entered with c->lock held, drops it and regains it
847  * before exiting.
848  */
849 static void __wait_for_free_buffer(struct dm_bufio_client *c)
850 {
851 	DECLARE_WAITQUEUE(wait, current);
852 
853 	add_wait_queue(&c->free_buffer_wait, &wait);
854 	set_current_state(TASK_UNINTERRUPTIBLE);
855 	dm_bufio_unlock(c);
856 
857 	io_schedule();
858 
859 	remove_wait_queue(&c->free_buffer_wait, &wait);
860 
861 	dm_bufio_lock(c);
862 }
863 
864 enum new_flag {
865 	NF_FRESH = 0,
866 	NF_READ = 1,
867 	NF_GET = 2,
868 	NF_PREFETCH = 3
869 };
870 
871 /*
872  * Allocate a new buffer. If the allocation is not possible, wait until
873  * some other thread frees a buffer.
874  *
875  * May drop the lock and regain it.
876  */
877 static struct dm_buffer *__alloc_buffer_wait_no_callback(struct dm_bufio_client *c, enum new_flag nf)
878 {
879 	struct dm_buffer *b;
880 	bool tried_noio_alloc = false;
881 
882 	/*
883 	 * dm-bufio is resistant to allocation failures (it just keeps
884 	 * one buffer reserved in cases all the allocations fail).
885 	 * So set flags to not try too hard:
886 	 *	GFP_NOWAIT: don't wait; if we need to sleep we'll release our
887 	 *		    mutex and wait ourselves.
888 	 *	__GFP_NORETRY: don't retry and rather return failure
889 	 *	__GFP_NOMEMALLOC: don't use emergency reserves
890 	 *	__GFP_NOWARN: don't print a warning in case of failure
891 	 *
892 	 * For debugging, if we set the cache size to 1, no new buffers will
893 	 * be allocated.
894 	 */
895 	while (1) {
896 		if (dm_bufio_cache_size_latch != 1) {
897 			b = alloc_buffer(c, GFP_NOWAIT | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
898 			if (b)
899 				return b;
900 		}
901 
902 		if (nf == NF_PREFETCH)
903 			return NULL;
904 
905 		if (dm_bufio_cache_size_latch != 1 && !tried_noio_alloc) {
906 			dm_bufio_unlock(c);
907 			b = alloc_buffer(c, GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
908 			dm_bufio_lock(c);
909 			if (b)
910 				return b;
911 			tried_noio_alloc = true;
912 		}
913 
914 		if (!list_empty(&c->reserved_buffers)) {
915 			b = list_entry(c->reserved_buffers.next,
916 				       struct dm_buffer, lru_list);
917 			list_del(&b->lru_list);
918 			c->need_reserved_buffers++;
919 
920 			return b;
921 		}
922 
923 		b = __get_unclaimed_buffer(c);
924 		if (b)
925 			return b;
926 
927 		__wait_for_free_buffer(c);
928 	}
929 }
930 
931 static struct dm_buffer *__alloc_buffer_wait(struct dm_bufio_client *c, enum new_flag nf)
932 {
933 	struct dm_buffer *b = __alloc_buffer_wait_no_callback(c, nf);
934 
935 	if (!b)
936 		return NULL;
937 
938 	if (c->alloc_callback)
939 		c->alloc_callback(b);
940 
941 	return b;
942 }
943 
944 /*
945  * Free a buffer and wake other threads waiting for free buffers.
946  */
947 static void __free_buffer_wake(struct dm_buffer *b)
948 {
949 	struct dm_bufio_client *c = b->c;
950 
951 	if (!c->need_reserved_buffers)
952 		free_buffer(b);
953 	else {
954 		list_add(&b->lru_list, &c->reserved_buffers);
955 		c->need_reserved_buffers--;
956 	}
957 
958 	wake_up(&c->free_buffer_wait);
959 }
960 
961 static void __write_dirty_buffers_async(struct dm_bufio_client *c, int no_wait,
962 					struct list_head *write_list)
963 {
964 	struct dm_buffer *b, *tmp;
965 
966 	list_for_each_entry_safe_reverse(b, tmp, &c->lru[LIST_DIRTY], lru_list) {
967 		BUG_ON(test_bit(B_READING, &b->state));
968 
969 		if (!test_bit(B_DIRTY, &b->state) &&
970 		    !test_bit(B_WRITING, &b->state)) {
971 			__relink_lru(b, LIST_CLEAN);
972 			continue;
973 		}
974 
975 		if (no_wait && test_bit(B_WRITING, &b->state))
976 			return;
977 
978 		__write_dirty_buffer(b, write_list);
979 		cond_resched();
980 	}
981 }
982 
983 /*
984  * Check if we're over the writeback watermark.
985  * If the number of dirty buffers exceeds DM_BUFIO_WRITEBACK_RATIO times the
986  * number of clean buffers, start writing dirty buffers out asynchronously.
987  */
988 static void __check_watermark(struct dm_bufio_client *c,
989 			      struct list_head *write_list)
990 {
991 	if (c->n_buffers[LIST_DIRTY] > c->n_buffers[LIST_CLEAN] * DM_BUFIO_WRITEBACK_RATIO)
992 		__write_dirty_buffers_async(c, 1, write_list);
993 }
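
/*
 * For example, with DM_BUFIO_WRITEBACK_RATIO of 3, a client holding 100 clean
 * and 301 dirty buffers is over the watermark and its dirty buffers are queued
 * for asynchronous writeback; with 300 dirty buffers it is not.
 */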
994 
995 /*----------------------------------------------------------------
996  * Getting a buffer
997  *--------------------------------------------------------------*/
998 
999 static struct dm_buffer *__bufio_new(struct dm_bufio_client *c, sector_t block,
1000 				     enum new_flag nf, int *need_submit,
1001 				     struct list_head *write_list)
1002 {
1003 	struct dm_buffer *b, *new_b = NULL;
1004 
1005 	*need_submit = 0;
1006 
1007 	b = __find(c, block);
1008 	if (b)
1009 		goto found_buffer;
1010 
1011 	if (nf == NF_GET)
1012 		return NULL;
1013 
1014 	new_b = __alloc_buffer_wait(c, nf);
1015 	if (!new_b)
1016 		return NULL;
1017 
1018 	/*
1019 	 * We've had a period where the mutex was unlocked, so need to
1020 	 * recheck the buffer tree.
1021 	 */
1022 	b = __find(c, block);
1023 	if (b) {
1024 		__free_buffer_wake(new_b);
1025 		goto found_buffer;
1026 	}
1027 
1028 	__check_watermark(c, write_list);
1029 
1030 	b = new_b;
1031 	b->hold_count = 1;
1032 	b->read_error = 0;
1033 	b->write_error = 0;
1034 	__link_buffer(b, block, LIST_CLEAN);
1035 
1036 	if (nf == NF_FRESH) {
1037 		b->state = 0;
1038 		return b;
1039 	}
1040 
1041 	b->state = 1 << B_READING;
1042 	*need_submit = 1;
1043 
1044 	return b;
1045 
1046 found_buffer:
1047 	if (nf == NF_PREFETCH)
1048 		return NULL;
1049 	/*
1050 	 * Note: it is essential that we don't wait for the buffer to be
1051 	 * read if dm_bufio_get function is used. Both dm_bufio_get and
1052 	 * dm_bufio_prefetch can be used in the driver request routine.
1053 	 * If the user called both dm_bufio_prefetch and dm_bufio_get on
1054 	 * the same buffer, it would deadlock if we waited.
1055 	 */
1056 	if (nf == NF_GET && unlikely(test_bit(B_READING, &b->state)))
1057 		return NULL;
1058 
1059 	b->hold_count++;
1060 	__relink_lru(b, test_bit(B_DIRTY, &b->state) ||
1061 		     test_bit(B_WRITING, &b->state));
1062 	return b;
1063 }
1064 
1065 /*
1066  * The endio routine for reading: set the error, clear the bit and wake up
1067  * anyone waiting on the buffer.
1068  */
1069 static void read_endio(struct dm_buffer *b, blk_status_t status)
1070 {
1071 	b->read_error = status;
1072 
1073 	BUG_ON(!test_bit(B_READING, &b->state));
1074 
1075 	smp_mb__before_atomic();
1076 	clear_bit(B_READING, &b->state);
1077 	smp_mb__after_atomic();
1078 
1079 	wake_up_bit(&b->state, B_READING);
1080 }
1081 
1082 /*
1083  * A common routine for dm_bufio_new and dm_bufio_read.  Operation of these
1084  * functions is similar except that dm_bufio_new doesn't read the
1085  * buffer from the disk (assuming that the caller overwrites all the data
1086  * and uses dm_bufio_mark_buffer_dirty to write new data back).
1087  */
1088 static void *new_read(struct dm_bufio_client *c, sector_t block,
1089 		      enum new_flag nf, struct dm_buffer **bp)
1090 {
1091 	int need_submit;
1092 	struct dm_buffer *b;
1093 
1094 	LIST_HEAD(write_list);
1095 
1096 	dm_bufio_lock(c);
1097 	b = __bufio_new(c, block, nf, &need_submit, &write_list);
1098 #ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
1099 	if (b && b->hold_count == 1)
1100 		buffer_record_stack(b);
1101 #endif
1102 	dm_bufio_unlock(c);
1103 
1104 	__flush_write_list(&write_list);
1105 
1106 	if (!b)
1107 		return NULL;
1108 
1109 	if (need_submit)
1110 		submit_io(b, REQ_OP_READ, read_endio);
1111 
1112 	wait_on_bit_io(&b->state, B_READING, TASK_UNINTERRUPTIBLE);
1113 
1114 	if (b->read_error) {
1115 		int error = blk_status_to_errno(b->read_error);
1116 
1117 		dm_bufio_release(b);
1118 
1119 		return ERR_PTR(error);
1120 	}
1121 
1122 	*bp = b;
1123 
1124 	return b->data;
1125 }
1126 
1127 void *dm_bufio_get(struct dm_bufio_client *c, sector_t block,
1128 		   struct dm_buffer **bp)
1129 {
1130 	return new_read(c, block, NF_GET, bp);
1131 }
1132 EXPORT_SYMBOL_GPL(dm_bufio_get);
1133 
1134 void *dm_bufio_read(struct dm_bufio_client *c, sector_t block,
1135 		    struct dm_buffer **bp)
1136 {
1137 	BUG_ON(dm_bufio_in_request());
1138 
1139 	return new_read(c, block, NF_READ, bp);
1140 }
1141 EXPORT_SYMBOL_GPL(dm_bufio_read);
1142 
1143 void *dm_bufio_new(struct dm_bufio_client *c, sector_t block,
1144 		   struct dm_buffer **bp)
1145 {
1146 	BUG_ON(dm_bufio_in_request());
1147 
1148 	return new_read(c, block, NF_FRESH, bp);
1149 }
1150 EXPORT_SYMBOL_GPL(dm_bufio_new);
1151 
1152 void dm_bufio_prefetch(struct dm_bufio_client *c,
1153 		       sector_t block, unsigned n_blocks)
1154 {
1155 	struct blk_plug plug;
1156 
1157 	LIST_HEAD(write_list);
1158 
1159 	BUG_ON(dm_bufio_in_request());
1160 
1161 	blk_start_plug(&plug);
1162 	dm_bufio_lock(c);
1163 
1164 	for (; n_blocks--; block++) {
1165 		int need_submit;
1166 		struct dm_buffer *b;
1167 		b = __bufio_new(c, block, NF_PREFETCH, &need_submit,
1168 				&write_list);
1169 		if (unlikely(!list_empty(&write_list))) {
1170 			dm_bufio_unlock(c);
1171 			blk_finish_plug(&plug);
1172 			__flush_write_list(&write_list);
1173 			blk_start_plug(&plug);
1174 			dm_bufio_lock(c);
1175 		}
1176 		if (unlikely(b != NULL)) {
1177 			dm_bufio_unlock(c);
1178 
1179 			if (need_submit)
1180 				submit_io(b, REQ_OP_READ, read_endio);
1181 			dm_bufio_release(b);
1182 
1183 			cond_resched();
1184 
1185 			if (!n_blocks)
1186 				goto flush_plug;
1187 			dm_bufio_lock(c);
1188 		}
1189 	}
1190 
1191 	dm_bufio_unlock(c);
1192 
1193 flush_plug:
1194 	blk_finish_plug(&plug);
1195 }
1196 EXPORT_SYMBOL_GPL(dm_bufio_prefetch);
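
/*
 * A minimal usage sketch of the prefetch pattern, assuming a caller that
 * already owns a client 'c' and will soon need blocks
 * [first_block, first_block + 8) ('first_block' is a placeholder; this is an
 * illustration, not part of the driver):
 *
 *	// Start reads for the range; dm_bufio_prefetch() does not wait for
 *	// the I/O to complete.
 *	dm_bufio_prefetch(c, first_block, 8);
 *
 *	// Later: dm_bufio_get() only returns data that is already cached and
 *	// fully read; it never issues or waits for I/O.
 *	struct dm_buffer *buf;
 *	void *data = dm_bufio_get(c, first_block, &buf);
 *	if (!data || IS_ERR(data))
 *		data = dm_bufio_read(c, first_block, &buf);	// may sleep
 *	if (IS_ERR(data))
 *		return PTR_ERR(data);
 *	// ... use 'data' ...
 *	dm_bufio_release(buf);
 */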
1197 
1198 void dm_bufio_release(struct dm_buffer *b)
1199 {
1200 	struct dm_bufio_client *c = b->c;
1201 
1202 	dm_bufio_lock(c);
1203 
1204 	BUG_ON(!b->hold_count);
1205 
1206 	b->hold_count--;
1207 	if (!b->hold_count) {
1208 		wake_up(&c->free_buffer_wait);
1209 
1210 		/*
1211 		 * If there were errors on the buffer, and the buffer is not
1212 		 * to be written, free the buffer. There is no point in caching
1213 		 * an invalid buffer.
1214 		 */
1215 		if ((b->read_error || b->write_error) &&
1216 		    !test_bit(B_READING, &b->state) &&
1217 		    !test_bit(B_WRITING, &b->state) &&
1218 		    !test_bit(B_DIRTY, &b->state)) {
1219 			__unlink_buffer(b);
1220 			__free_buffer_wake(b);
1221 		}
1222 	}
1223 
1224 	dm_bufio_unlock(c);
1225 }
1226 EXPORT_SYMBOL_GPL(dm_bufio_release);
1227 
1228 void dm_bufio_mark_partial_buffer_dirty(struct dm_buffer *b,
1229 					unsigned start, unsigned end)
1230 {
1231 	struct dm_bufio_client *c = b->c;
1232 
1233 	BUG_ON(start >= end);
1234 	BUG_ON(end > b->c->block_size);
1235 
1236 	dm_bufio_lock(c);
1237 
1238 	BUG_ON(test_bit(B_READING, &b->state));
1239 
1240 	if (!test_and_set_bit(B_DIRTY, &b->state)) {
1241 		b->dirty_start = start;
1242 		b->dirty_end = end;
1243 		__relink_lru(b, LIST_DIRTY);
1244 	} else {
1245 		if (start < b->dirty_start)
1246 			b->dirty_start = start;
1247 		if (end > b->dirty_end)
1248 			b->dirty_end = end;
1249 	}
1250 
1251 	dm_bufio_unlock(c);
1252 }
1253 EXPORT_SYMBOL_GPL(dm_bufio_mark_partial_buffer_dirty);
1254 
1255 void dm_bufio_mark_buffer_dirty(struct dm_buffer *b)
1256 {
1257 	dm_bufio_mark_partial_buffer_dirty(b, 0, b->c->block_size);
1258 }
1259 EXPORT_SYMBOL_GPL(dm_bufio_mark_buffer_dirty);
1260 
1261 void dm_bufio_write_dirty_buffers_async(struct dm_bufio_client *c)
1262 {
1263 	LIST_HEAD(write_list);
1264 
1265 	BUG_ON(dm_bufio_in_request());
1266 
1267 	dm_bufio_lock(c);
1268 	__write_dirty_buffers_async(c, 0, &write_list);
1269 	dm_bufio_unlock(c);
1270 	__flush_write_list(&write_list);
1271 }
1272 EXPORT_SYMBOL_GPL(dm_bufio_write_dirty_buffers_async);
1273 
1274 /*
1275  * For performance, it is essential that the buffers are written asynchronously
1276  * and simultaneously (so that the block layer can merge the writes) and then
1277  * waited upon.
1278  *
1279  * Finally, we flush the hardware disk cache.
1280  */
1281 int dm_bufio_write_dirty_buffers(struct dm_bufio_client *c)
1282 {
1283 	int a, f;
1284 	unsigned long buffers_processed = 0;
1285 	struct dm_buffer *b, *tmp;
1286 
1287 	LIST_HEAD(write_list);
1288 
1289 	dm_bufio_lock(c);
1290 	__write_dirty_buffers_async(c, 0, &write_list);
1291 	dm_bufio_unlock(c);
1292 	__flush_write_list(&write_list);
1293 	dm_bufio_lock(c);
1294 
1295 again:
1296 	list_for_each_entry_safe_reverse(b, tmp, &c->lru[LIST_DIRTY], lru_list) {
1297 		int dropped_lock = 0;
1298 
1299 		if (buffers_processed < c->n_buffers[LIST_DIRTY])
1300 			buffers_processed++;
1301 
1302 		BUG_ON(test_bit(B_READING, &b->state));
1303 
1304 		if (test_bit(B_WRITING, &b->state)) {
1305 			if (buffers_processed < c->n_buffers[LIST_DIRTY]) {
1306 				dropped_lock = 1;
1307 				b->hold_count++;
1308 				dm_bufio_unlock(c);
1309 				wait_on_bit_io(&b->state, B_WRITING,
1310 					       TASK_UNINTERRUPTIBLE);
1311 				dm_bufio_lock(c);
1312 				b->hold_count--;
1313 			} else
1314 				wait_on_bit_io(&b->state, B_WRITING,
1315 					       TASK_UNINTERRUPTIBLE);
1316 		}
1317 
1318 		if (!test_bit(B_DIRTY, &b->state) &&
1319 		    !test_bit(B_WRITING, &b->state))
1320 			__relink_lru(b, LIST_CLEAN);
1321 
1322 		cond_resched();
1323 
1324 		/*
1325 		 * If we dropped the lock, the list is no longer consistent,
1326 		 * so we must restart the search.
1327 		 *
1328 		 * In the most common case, the buffer just processed is
1329 		 * relinked to the clean list, so we won't loop scanning the
1330 		 * same buffer again and again.
1331 		 *
1332 		 * This may livelock if there is another thread simultaneously
1333 		 * dirtying buffers, so we count the number of buffers walked
1334 		 * and if it exceeds the total number of buffers, it means that
1335 		 * someone is doing some writes simultaneously with us.  In
1336 		 * this case, stop, dropping the lock.
1337 		 */
1338 		if (dropped_lock)
1339 			goto again;
1340 	}
1341 	wake_up(&c->free_buffer_wait);
1342 	dm_bufio_unlock(c);
1343 
1344 	a = xchg(&c->async_write_error, 0);
1345 	f = dm_bufio_issue_flush(c);
1346 	if (a)
1347 		return a;
1348 
1349 	return f;
1350 }
1351 EXPORT_SYMBOL_GPL(dm_bufio_write_dirty_buffers);
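
/*
 * A typical read-modify-write cycle built from the calls above, assuming a
 * client 'c' and a block number 'block' (an illustrative sketch only;
 * 'new_contents' and the 64-byte update are placeholders):
 *
 *	struct dm_buffer *buf;
 *	void *data = dm_bufio_read(c, block, &buf);	// reads block, takes a hold
 *	if (IS_ERR(data))
 *		return PTR_ERR(data);
 *
 *	memcpy(data, new_contents, 64);			// modify the cached data
 *	dm_bufio_mark_partial_buffer_dirty(buf, 0, 64);	// bytes [0, 64) are dirty
 *	dm_bufio_release(buf);				// drop the hold count
 *
 *	// Write out all dirty buffers and flush the disk cache; the first
 *	// asynchronous write error (if any) is returned.
 *	return dm_bufio_write_dirty_buffers(c);
 */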
1352 
1353 /*
1354  * Use dm-io to send an empty barrier to flush the device.
1355  */
1356 int dm_bufio_issue_flush(struct dm_bufio_client *c)
1357 {
1358 	struct dm_io_request io_req = {
1359 		.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC,
1360 		.mem.type = DM_IO_KMEM,
1361 		.mem.ptr.addr = NULL,
1362 		.client = c->dm_io,
1363 	};
1364 	struct dm_io_region io_reg = {
1365 		.bdev = c->bdev,
1366 		.sector = 0,
1367 		.count = 0,
1368 	};
1369 
1370 	BUG_ON(dm_bufio_in_request());
1371 
1372 	return dm_io(&io_req, 1, &io_reg, NULL);
1373 }
1374 EXPORT_SYMBOL_GPL(dm_bufio_issue_flush);
1375 
1376 /*
1377  * Use dm-io to send a discard request to the device.
1378  */
1379 int dm_bufio_issue_discard(struct dm_bufio_client *c, sector_t block, sector_t count)
1380 {
1381 	struct dm_io_request io_req = {
1382 		.bi_opf = REQ_OP_DISCARD | REQ_SYNC,
1383 		.mem.type = DM_IO_KMEM,
1384 		.mem.ptr.addr = NULL,
1385 		.client = c->dm_io,
1386 	};
1387 	struct dm_io_region io_reg = {
1388 		.bdev = c->bdev,
1389 		.sector = block_to_sector(c, block),
1390 		.count = block_to_sector(c, count),
1391 	};
1392 
1393 	BUG_ON(dm_bufio_in_request());
1394 
1395 	return dm_io(&io_req, 1, &io_reg, NULL);
1396 }
1397 EXPORT_SYMBOL_GPL(dm_bufio_issue_discard);
1398 
1399 /*
1400  * We first delete any other buffer that may be at that new location.
1401  *
1402  * Then, we write the buffer to the original location if it was dirty.
1403  *
1404  * Then, if we are the only one who is holding the buffer, relink the buffer
1405  * in the buffer tree for the new location.
1406  *
1407  * If there was someone else holding the buffer, we write it to the new
1408  * location but do not relink it, because that other user needs to have the buffer
1409  * at the same place.
1410  */
1411 void dm_bufio_release_move(struct dm_buffer *b, sector_t new_block)
1412 {
1413 	struct dm_bufio_client *c = b->c;
1414 	struct dm_buffer *new;
1415 
1416 	BUG_ON(dm_bufio_in_request());
1417 
1418 	dm_bufio_lock(c);
1419 
1420 retry:
1421 	new = __find(c, new_block);
1422 	if (new) {
1423 		if (new->hold_count) {
1424 			__wait_for_free_buffer(c);
1425 			goto retry;
1426 		}
1427 
1428 		/*
1429 		 * FIXME: Is there any point waiting for a write that's going
1430 		 * to be overwritten in a bit?
1431 		 */
1432 		__make_buffer_clean(new);
1433 		__unlink_buffer(new);
1434 		__free_buffer_wake(new);
1435 	}
1436 
1437 	BUG_ON(!b->hold_count);
1438 	BUG_ON(test_bit(B_READING, &b->state));
1439 
1440 	__write_dirty_buffer(b, NULL);
1441 	if (b->hold_count == 1) {
1442 		wait_on_bit_io(&b->state, B_WRITING,
1443 			       TASK_UNINTERRUPTIBLE);
1444 		set_bit(B_DIRTY, &b->state);
1445 		b->dirty_start = 0;
1446 		b->dirty_end = c->block_size;
1447 		__unlink_buffer(b);
1448 		__link_buffer(b, new_block, LIST_DIRTY);
1449 	} else {
1450 		sector_t old_block;
1451 		wait_on_bit_lock_io(&b->state, B_WRITING,
1452 				    TASK_UNINTERRUPTIBLE);
1453 		/*
1454 		 * Relink buffer to "new_block" so that write_callback
1455 		 * sees "new_block" as a block number.
1456 		 * After the write, link the buffer back to old_block.
1457 		 * All this must be done in bufio lock, so that block number
1458 		 * change isn't visible to other threads.
1459 		 */
1460 		old_block = b->block;
1461 		__unlink_buffer(b);
1462 		__link_buffer(b, new_block, b->list_mode);
1463 		submit_io(b, REQ_OP_WRITE, write_endio);
1464 		wait_on_bit_io(&b->state, B_WRITING,
1465 			       TASK_UNINTERRUPTIBLE);
1466 		__unlink_buffer(b);
1467 		__link_buffer(b, old_block, b->list_mode);
1468 	}
1469 
1470 	dm_bufio_unlock(c);
1471 	dm_bufio_release(b);
1472 }
1473 EXPORT_SYMBOL_GPL(dm_bufio_release_move);
1474 
1475 static void forget_buffer_locked(struct dm_buffer *b)
1476 {
1477 	if (likely(!b->hold_count) && likely(!b->state)) {
1478 		__unlink_buffer(b);
1479 		__free_buffer_wake(b);
1480 	}
1481 }
1482 
1483 /*
1484  * Free the given buffer.
1485  *
1486  * This is just a hint; if the buffer is in use or dirty, this function
1487  * does nothing.
1488  */
1489 void dm_bufio_forget(struct dm_bufio_client *c, sector_t block)
1490 {
1491 	struct dm_buffer *b;
1492 
1493 	dm_bufio_lock(c);
1494 
1495 	b = __find(c, block);
1496 	if (b)
1497 		forget_buffer_locked(b);
1498 
1499 	dm_bufio_unlock(c);
1500 }
1501 EXPORT_SYMBOL_GPL(dm_bufio_forget);
1502 
1503 void dm_bufio_forget_buffers(struct dm_bufio_client *c, sector_t block, sector_t n_blocks)
1504 {
1505 	struct dm_buffer *b;
1506 	sector_t end_block = block + n_blocks;
1507 
1508 	while (block < end_block) {
1509 		dm_bufio_lock(c);
1510 
1511 		b = __find_next(c, block);
1512 		if (b) {
1513 			block = b->block + 1;
1514 			forget_buffer_locked(b);
1515 		}
1516 
1517 		dm_bufio_unlock(c);
1518 
1519 		if (!b)
1520 			break;
1521 	}
1522 
1523 }
1524 EXPORT_SYMBOL_GPL(dm_bufio_forget_buffers);
1525 
1526 void dm_bufio_set_minimum_buffers(struct dm_bufio_client *c, unsigned n)
1527 {
1528 	c->minimum_buffers = n;
1529 }
1530 EXPORT_SYMBOL_GPL(dm_bufio_set_minimum_buffers);
1531 
1532 unsigned dm_bufio_get_block_size(struct dm_bufio_client *c)
1533 {
1534 	return c->block_size;
1535 }
1536 EXPORT_SYMBOL_GPL(dm_bufio_get_block_size);
1537 
1538 sector_t dm_bufio_get_device_size(struct dm_bufio_client *c)
1539 {
1540 	sector_t s = bdev_nr_sectors(c->bdev);
1541 	if (s >= c->start)
1542 		s -= c->start;
1543 	else
1544 		s = 0;
1545 	if (likely(c->sectors_per_block_bits >= 0))
1546 		s >>= c->sectors_per_block_bits;
1547 	else
1548 		sector_div(s, c->block_size >> SECTOR_SHIFT);
1549 	return s;
1550 }
1551 EXPORT_SYMBOL_GPL(dm_bufio_get_device_size);
1552 
1553 struct dm_io_client *dm_bufio_get_dm_io_client(struct dm_bufio_client *c)
1554 {
1555 	return c->dm_io;
1556 }
1557 EXPORT_SYMBOL_GPL(dm_bufio_get_dm_io_client);
1558 
1559 sector_t dm_bufio_get_block_number(struct dm_buffer *b)
1560 {
1561 	return b->block;
1562 }
1563 EXPORT_SYMBOL_GPL(dm_bufio_get_block_number);
1564 
1565 void *dm_bufio_get_block_data(struct dm_buffer *b)
1566 {
1567 	return b->data;
1568 }
1569 EXPORT_SYMBOL_GPL(dm_bufio_get_block_data);
1570 
1571 void *dm_bufio_get_aux_data(struct dm_buffer *b)
1572 {
1573 	return b + 1;
1574 }
1575 EXPORT_SYMBOL_GPL(dm_bufio_get_aux_data);
1576 
1577 struct dm_bufio_client *dm_bufio_get_client(struct dm_buffer *b)
1578 {
1579 	return b->c;
1580 }
1581 EXPORT_SYMBOL_GPL(dm_bufio_get_client);
1582 
1583 static void drop_buffers(struct dm_bufio_client *c)
1584 {
1585 	struct dm_buffer *b;
1586 	int i;
1587 	bool warned = false;
1588 
1589 	BUG_ON(dm_bufio_in_request());
1590 
1591 	/*
1592 	 * An optimization so that the buffers are not written one-by-one.
1593 	 */
1594 	dm_bufio_write_dirty_buffers_async(c);
1595 
1596 	dm_bufio_lock(c);
1597 
1598 	while ((b = __get_unclaimed_buffer(c)))
1599 		__free_buffer_wake(b);
1600 
1601 	for (i = 0; i < LIST_SIZE; i++)
1602 		list_for_each_entry(b, &c->lru[i], lru_list) {
1603 			WARN_ON(!warned);
1604 			warned = true;
1605 			DMERR("leaked buffer %llx, hold count %u, list %d",
1606 			      (unsigned long long)b->block, b->hold_count, i);
1607 #ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
1608 			stack_trace_print(b->stack_entries, b->stack_len, 1);
1609 			/* mark unclaimed to avoid BUG_ON below */
1610 			b->hold_count = 0;
1611 #endif
1612 		}
1613 
1614 #ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
1615 	while ((b = __get_unclaimed_buffer(c)))
1616 		__free_buffer_wake(b);
1617 #endif
1618 
1619 	for (i = 0; i < LIST_SIZE; i++)
1620 		BUG_ON(!list_empty(&c->lru[i]));
1621 
1622 	dm_bufio_unlock(c);
1623 }
1624 
1625 /*
1626  * We may not be able to evict this buffer if I/O is pending or the client
1627  * is still using it.  The caller is expected to know the buffer is too old.
1628  *
1629  * And if GFP_NOFS is used, we must not do any I/O because we hold
1630  * dm_bufio_clients_lock and we would risk deadlock if the I/O gets
1631  * rerouted to a different bufio client.
1632  */
1633 static bool __try_evict_buffer(struct dm_buffer *b, gfp_t gfp)
1634 {
1635 	if (!(gfp & __GFP_FS)) {
1636 		if (test_bit(B_READING, &b->state) ||
1637 		    test_bit(B_WRITING, &b->state) ||
1638 		    test_bit(B_DIRTY, &b->state))
1639 			return false;
1640 	}
1641 
1642 	if (b->hold_count)
1643 		return false;
1644 
1645 	__make_buffer_clean(b);
1646 	__unlink_buffer(b);
1647 	__free_buffer_wake(b);
1648 
1649 	return true;
1650 }
1651 
1652 static unsigned long get_retain_buffers(struct dm_bufio_client *c)
1653 {
1654 	unsigned long retain_bytes = READ_ONCE(dm_bufio_retain_bytes);
1655 	if (likely(c->sectors_per_block_bits >= 0))
1656 		retain_bytes >>= c->sectors_per_block_bits + SECTOR_SHIFT;
1657 	else
1658 		retain_bytes /= c->block_size;
1659 	return retain_bytes;
1660 }
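
/*
 * For example, with the default retain_bytes of 256 KiB and a 4 KiB block
 * size, sectors_per_block_bits is 3, so 262144 >> (3 + 9) = 64 buffers are
 * retained per client when shrinking.
 */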
1661 
1662 static void __scan(struct dm_bufio_client *c)
1663 {
1664 	int l;
1665 	struct dm_buffer *b, *tmp;
1666 	unsigned long freed = 0;
1667 	unsigned long count = c->n_buffers[LIST_CLEAN] +
1668 			      c->n_buffers[LIST_DIRTY];
1669 	unsigned long retain_target = get_retain_buffers(c);
1670 
1671 	for (l = 0; l < LIST_SIZE; l++) {
1672 		list_for_each_entry_safe_reverse(b, tmp, &c->lru[l], lru_list) {
1673 			if (count - freed <= retain_target)
1674 				atomic_long_set(&c->need_shrink, 0);
1675 			if (!atomic_long_read(&c->need_shrink))
1676 				return;
1677 			if (__try_evict_buffer(b, GFP_KERNEL)) {
1678 				atomic_long_dec(&c->need_shrink);
1679 				freed++;
1680 			}
1681 			cond_resched();
1682 		}
1683 	}
1684 }
1685 
1686 static void shrink_work(struct work_struct *w)
1687 {
1688 	struct dm_bufio_client *c = container_of(w, struct dm_bufio_client, shrink_work);
1689 
1690 	dm_bufio_lock(c);
1691 	__scan(c);
1692 	dm_bufio_unlock(c);
1693 }
1694 
1695 static unsigned long dm_bufio_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
1696 {
1697 	struct dm_bufio_client *c;
1698 
1699 	c = container_of(shrink, struct dm_bufio_client, shrinker);
1700 	atomic_long_add(sc->nr_to_scan, &c->need_shrink);
1701 	queue_work(dm_bufio_wq, &c->shrink_work);
1702 
1703 	return sc->nr_to_scan;
1704 }
1705 
1706 static unsigned long dm_bufio_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
1707 {
1708 	struct dm_bufio_client *c = container_of(shrink, struct dm_bufio_client, shrinker);
1709 	unsigned long count = READ_ONCE(c->n_buffers[LIST_CLEAN]) +
1710 			      READ_ONCE(c->n_buffers[LIST_DIRTY]);
1711 	unsigned long retain_target = get_retain_buffers(c);
1712 	unsigned long queued_for_cleanup = atomic_long_read(&c->need_shrink);
1713 
1714 	if (unlikely(count < retain_target))
1715 		count = 0;
1716 	else
1717 		count -= retain_target;
1718 
1719 	if (unlikely(count < queued_for_cleanup))
1720 		count = 0;
1721 	else
1722 		count -= queued_for_cleanup;
1723 
1724 	return count;
1725 }
1726 
1727 /*
1728  * Create the buffering interface
1729  */
1730 struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsigned block_size,
1731 					       unsigned reserved_buffers, unsigned aux_size,
1732 					       void (*alloc_callback)(struct dm_buffer *),
1733 					       void (*write_callback)(struct dm_buffer *),
1734 					       unsigned int flags)
1735 {
1736 	int r;
1737 	struct dm_bufio_client *c;
1738 	unsigned i;
1739 	char slab_name[27];
1740 
1741 	if (!block_size || block_size & ((1 << SECTOR_SHIFT) - 1)) {
1742 		DMERR("%s: block size not specified or is not multiple of 512b", __func__);
1743 		r = -EINVAL;
1744 		goto bad_client;
1745 	}
1746 
1747 	c = kzalloc(sizeof(*c), GFP_KERNEL);
1748 	if (!c) {
1749 		r = -ENOMEM;
1750 		goto bad_client;
1751 	}
1752 	c->buffer_tree = RB_ROOT;
1753 
1754 	c->bdev = bdev;
1755 	c->block_size = block_size;
1756 	if (is_power_of_2(block_size))
1757 		c->sectors_per_block_bits = __ffs(block_size) - SECTOR_SHIFT;
1758 	else
1759 		c->sectors_per_block_bits = -1;
1760 
1761 	c->alloc_callback = alloc_callback;
1762 	c->write_callback = write_callback;
1763 
1764 	if (flags & DM_BUFIO_CLIENT_NO_SLEEP) {
1765 		c->no_sleep = true;
1766 		static_branch_inc(&no_sleep_enabled);
1767 	}
1768 
1769 	for (i = 0; i < LIST_SIZE; i++) {
1770 		INIT_LIST_HEAD(&c->lru[i]);
1771 		c->n_buffers[i] = 0;
1772 	}
1773 
1774 	mutex_init(&c->lock);
1775 	spin_lock_init(&c->spinlock);
1776 	INIT_LIST_HEAD(&c->reserved_buffers);
1777 	c->need_reserved_buffers = reserved_buffers;
1778 
1779 	dm_bufio_set_minimum_buffers(c, DM_BUFIO_MIN_BUFFERS);
1780 
1781 	init_waitqueue_head(&c->free_buffer_wait);
1782 	c->async_write_error = 0;
1783 
1784 	c->dm_io = dm_io_client_create();
1785 	if (IS_ERR(c->dm_io)) {
1786 		r = PTR_ERR(c->dm_io);
1787 		goto bad_dm_io;
1788 	}
1789 
1790 	if (block_size <= KMALLOC_MAX_SIZE &&
1791 	    (block_size < PAGE_SIZE || !is_power_of_2(block_size))) {
1792 		unsigned align = min(1U << __ffs(block_size), (unsigned)PAGE_SIZE);
1793 		snprintf(slab_name, sizeof slab_name, "dm_bufio_cache-%u", block_size);
1794 		c->slab_cache = kmem_cache_create(slab_name, block_size, align,
1795 						  SLAB_RECLAIM_ACCOUNT, NULL);
1796 		if (!c->slab_cache) {
1797 			r = -ENOMEM;
1798 			goto bad;
1799 		}
1800 	}
1801 	if (aux_size)
1802 		snprintf(slab_name, sizeof slab_name, "dm_bufio_buffer-%u", aux_size);
1803 	else
1804 		snprintf(slab_name, sizeof slab_name, "dm_bufio_buffer");
1805 	c->slab_buffer = kmem_cache_create(slab_name, sizeof(struct dm_buffer) + aux_size,
1806 					   0, SLAB_RECLAIM_ACCOUNT, NULL);
1807 	if (!c->slab_buffer) {
1808 		r = -ENOMEM;
1809 		goto bad;
1810 	}
1811 
1812 	while (c->need_reserved_buffers) {
1813 		struct dm_buffer *b = alloc_buffer(c, GFP_KERNEL);
1814 
1815 		if (!b) {
1816 			r = -ENOMEM;
1817 			goto bad;
1818 		}
1819 		__free_buffer_wake(b);
1820 	}
1821 
1822 	INIT_WORK(&c->shrink_work, shrink_work);
1823 	atomic_long_set(&c->need_shrink, 0);
1824 
1825 	c->shrinker.count_objects = dm_bufio_shrink_count;
1826 	c->shrinker.scan_objects = dm_bufio_shrink_scan;
1827 	c->shrinker.seeks = 1;
1828 	c->shrinker.batch = 0;
1829 	r = register_shrinker(&c->shrinker, "md-%s:(%u:%u)", slab_name,
1830 			      MAJOR(bdev->bd_dev), MINOR(bdev->bd_dev));
1831 	if (r)
1832 		goto bad;
1833 
1834 	mutex_lock(&dm_bufio_clients_lock);
1835 	dm_bufio_client_count++;
1836 	list_add(&c->client_list, &dm_bufio_all_clients);
1837 	__cache_size_refresh();
1838 	mutex_unlock(&dm_bufio_clients_lock);
1839 
1840 	return c;
1841 
1842 bad:
1843 	while (!list_empty(&c->reserved_buffers)) {
1844 		struct dm_buffer *b = list_entry(c->reserved_buffers.next,
1845 						 struct dm_buffer, lru_list);
1846 		list_del(&b->lru_list);
1847 		free_buffer(b);
1848 	}
1849 	kmem_cache_destroy(c->slab_cache);
1850 	kmem_cache_destroy(c->slab_buffer);
1851 	dm_io_client_destroy(c->dm_io);
1852 bad_dm_io:
1853 	mutex_destroy(&c->lock);
1854 	kfree(c);
1855 bad_client:
1856 	return ERR_PTR(r);
1857 }
1858 EXPORT_SYMBOL_GPL(dm_bufio_client_create);
1859 
1860 /*
1861  * Free the buffering interface.
1862  * It is required that there are no references on any buffers.
1863  */
1864 void dm_bufio_client_destroy(struct dm_bufio_client *c)
1865 {
1866 	unsigned i;
1867 
1868 	drop_buffers(c);
1869 
1870 	unregister_shrinker(&c->shrinker);
1871 	flush_work(&c->shrink_work);
1872 
1873 	mutex_lock(&dm_bufio_clients_lock);
1874 
1875 	list_del(&c->client_list);
1876 	dm_bufio_client_count--;
1877 	__cache_size_refresh();
1878 
1879 	mutex_unlock(&dm_bufio_clients_lock);
1880 
1881 	BUG_ON(!RB_EMPTY_ROOT(&c->buffer_tree));
1882 	BUG_ON(c->need_reserved_buffers);
1883 
1884 	while (!list_empty(&c->reserved_buffers)) {
1885 		struct dm_buffer *b = list_entry(c->reserved_buffers.next,
1886 						 struct dm_buffer, lru_list);
1887 		list_del(&b->lru_list);
1888 		free_buffer(b);
1889 	}
1890 
1891 	for (i = 0; i < LIST_SIZE; i++)
1892 		if (c->n_buffers[i])
1893 			DMERR("leaked buffer count %d: %ld", i, c->n_buffers[i]);
1894 
1895 	for (i = 0; i < LIST_SIZE; i++)
1896 		BUG_ON(c->n_buffers[i]);
1897 
1898 	kmem_cache_destroy(c->slab_cache);
1899 	kmem_cache_destroy(c->slab_buffer);
1900 	dm_io_client_destroy(c->dm_io);
1901 	mutex_destroy(&c->lock);
1902 	if (c->no_sleep)
1903 		static_branch_dec(&no_sleep_enabled);
1904 	kfree(c);
1905 }
1906 EXPORT_SYMBOL_GPL(dm_bufio_client_destroy);
1907 
1908 void dm_bufio_set_sector_offset(struct dm_bufio_client *c, sector_t start)
1909 {
1910 	c->start = start;
1911 }
1912 EXPORT_SYMBOL_GPL(dm_bufio_set_sector_offset);
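
/*
 * Sketch of a client's life cycle for a target caching 4 KiB blocks with one
 * reserved buffer, no auxiliary data and no per-buffer callbacks ('bdev' and
 * 'data_start_sector' are placeholders; this is an illustration only):
 *
 *	struct dm_bufio_client *c;
 *
 *	c = dm_bufio_client_create(bdev, 4096, 1, 0, NULL, NULL, 0);
 *	if (IS_ERR(c))
 *		return PTR_ERR(c);
 *	dm_bufio_set_sector_offset(c, data_start_sector);
 *
 *	// ... dm_bufio_read() / dm_bufio_new() / dm_bufio_write_dirty_buffers() ...
 *
 *	// Every buffer must be released before the client goes away;
 *	// dm_bufio_client_destroy() reports any leaked buffers.
 *	dm_bufio_client_destroy(c);
 */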
1913 
1914 static unsigned get_max_age_hz(void)
1915 {
1916 	unsigned max_age = READ_ONCE(dm_bufio_max_age);
1917 
1918 	if (max_age > UINT_MAX / HZ)
1919 		max_age = UINT_MAX / HZ;
1920 
1921 	return max_age * HZ;
1922 }
1923 
1924 static bool older_than(struct dm_buffer *b, unsigned long age_hz)
1925 {
1926 	return time_after_eq(jiffies, b->last_accessed + age_hz);
1927 }
1928 
1929 static void __evict_old_buffers(struct dm_bufio_client *c, unsigned long age_hz)
1930 {
1931 	struct dm_buffer *b, *tmp;
1932 	unsigned long retain_target = get_retain_buffers(c);
1933 	unsigned long count;
1934 	LIST_HEAD(write_list);
1935 
1936 	dm_bufio_lock(c);
1937 
1938 	__check_watermark(c, &write_list);
1939 	if (unlikely(!list_empty(&write_list))) {
1940 		dm_bufio_unlock(c);
1941 		__flush_write_list(&write_list);
1942 		dm_bufio_lock(c);
1943 	}
1944 
1945 	count = c->n_buffers[LIST_CLEAN] + c->n_buffers[LIST_DIRTY];
1946 	list_for_each_entry_safe_reverse(b, tmp, &c->lru[LIST_CLEAN], lru_list) {
1947 		if (count <= retain_target)
1948 			break;
1949 
1950 		if (!older_than(b, age_hz))
1951 			break;
1952 
1953 		if (__try_evict_buffer(b, 0))
1954 			count--;
1955 
1956 		cond_resched();
1957 	}
1958 
1959 	dm_bufio_unlock(c);
1960 }
1961 
1962 static void do_global_cleanup(struct work_struct *w)
1963 {
1964 	struct dm_bufio_client *locked_client = NULL;
1965 	struct dm_bufio_client *current_client;
1966 	struct dm_buffer *b;
1967 	unsigned spinlock_hold_count;
1968 	unsigned long threshold = dm_bufio_cache_size -
1969 		dm_bufio_cache_size / DM_BUFIO_LOW_WATERMARK_RATIO;
1970 	unsigned long loops = global_num * 2;
1971 
1972 	mutex_lock(&dm_bufio_clients_lock);
1973 
1974 	while (1) {
1975 		cond_resched();
1976 
1977 		spin_lock(&global_spinlock);
1978 		if (unlikely(dm_bufio_current_allocated <= threshold))
1979 			break;
1980 
1981 		spinlock_hold_count = 0;
1982 get_next:
1983 		if (!loops--)
1984 			break;
1985 		if (unlikely(list_empty(&global_queue)))
1986 			break;
1987 		b = list_entry(global_queue.prev, struct dm_buffer, global_list);
1988 
1989 		if (b->accessed) {
1990 			b->accessed = 0;
1991 			list_move(&b->global_list, &global_queue);
1992 			if (likely(++spinlock_hold_count < 16))
1993 				goto get_next;
1994 			spin_unlock(&global_spinlock);
1995 			continue;
1996 		}
1997 
1998 		current_client = b->c;
1999 		if (unlikely(current_client != locked_client)) {
2000 			if (locked_client)
2001 				dm_bufio_unlock(locked_client);
2002 
2003 			if (!dm_bufio_trylock(current_client)) {
2004 				spin_unlock(&global_spinlock);
2005 				dm_bufio_lock(current_client);
2006 				locked_client = current_client;
2007 				continue;
2008 			}
2009 
2010 			locked_client = current_client;
2011 		}
2012 
2013 		spin_unlock(&global_spinlock);
2014 
2015 		if (unlikely(!__try_evict_buffer(b, GFP_KERNEL))) {
2016 			spin_lock(&global_spinlock);
2017 			list_move(&b->global_list, &global_queue);
2018 			spin_unlock(&global_spinlock);
2019 		}
2020 	}
2021 
2022 	spin_unlock(&global_spinlock);
2023 
2024 	if (locked_client)
2025 		dm_bufio_unlock(locked_client);
2026 
2027 	mutex_unlock(&dm_bufio_clients_lock);
2028 }
2029 
2030 static void cleanup_old_buffers(void)
2031 {
2032 	unsigned long max_age_hz = get_max_age_hz();
2033 	struct dm_bufio_client *c;
2034 
2035 	mutex_lock(&dm_bufio_clients_lock);
2036 
2037 	__cache_size_refresh();
2038 
2039 	list_for_each_entry(c, &dm_bufio_all_clients, client_list)
2040 		__evict_old_buffers(c, max_age_hz);
2041 
2042 	mutex_unlock(&dm_bufio_clients_lock);
2043 }
2044 
2045 static void work_fn(struct work_struct *w)
2046 {
2047 	cleanup_old_buffers();
2048 
2049 	queue_delayed_work(dm_bufio_wq, &dm_bufio_cleanup_old_work,
2050 			   DM_BUFIO_WORK_TIMER_SECS * HZ);
2051 }
2052 
2053 /*----------------------------------------------------------------
2054  * Module setup
2055  *--------------------------------------------------------------*/
2056 
2057 /*
2058  * This is called only once for the whole dm_bufio module.
2059  * It initializes the memory limit.
2060  */
2061 static int __init dm_bufio_init(void)
2062 {
2063 	__u64 mem;
2064 
2065 	dm_bufio_allocated_kmem_cache = 0;
2066 	dm_bufio_allocated_get_free_pages = 0;
2067 	dm_bufio_allocated_vmalloc = 0;
2068 	dm_bufio_current_allocated = 0;
2069 
2070 	mem = (__u64)mult_frac(totalram_pages() - totalhigh_pages(),
2071 			       DM_BUFIO_MEMORY_PERCENT, 100) << PAGE_SHIFT;
2072 
2073 	if (mem > ULONG_MAX)
2074 		mem = ULONG_MAX;
2075 
2076 #ifdef CONFIG_MMU
2077 	if (mem > mult_frac(VMALLOC_TOTAL, DM_BUFIO_VMALLOC_PERCENT, 100))
2078 		mem = mult_frac(VMALLOC_TOTAL, DM_BUFIO_VMALLOC_PERCENT, 100);
2079 #endif
2080 
2081 	dm_bufio_default_cache_size = mem;
2082 
2083 	mutex_lock(&dm_bufio_clients_lock);
2084 	__cache_size_refresh();
2085 	mutex_unlock(&dm_bufio_clients_lock);
2086 
2087 	dm_bufio_wq = alloc_workqueue("dm_bufio_cache", WQ_MEM_RECLAIM, 0);
2088 	if (!dm_bufio_wq)
2089 		return -ENOMEM;
2090 
2091 	INIT_DELAYED_WORK(&dm_bufio_cleanup_old_work, work_fn);
2092 	INIT_WORK(&dm_bufio_replacement_work, do_global_cleanup);
2093 	queue_delayed_work(dm_bufio_wq, &dm_bufio_cleanup_old_work,
2094 			   DM_BUFIO_WORK_TIMER_SECS * HZ);
2095 
2096 	return 0;
2097 }
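
/*
 * For example, on a machine with 8 GiB of directly-mapped memory the default
 * cache size works out to 2% of that, roughly 164 MiB.  On 32-bit systems the
 * additional cap of 25% of the vmalloc area usually dominates, e.g. 25% of a
 * 128 MiB vmalloc space gives a 32 MiB default.
 */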
2098 
2099 /*
2100  * This is called once when unloading the dm_bufio module.
2101  */
2102 static void __exit dm_bufio_exit(void)
2103 {
2104 	int bug = 0;
2105 
2106 	cancel_delayed_work_sync(&dm_bufio_cleanup_old_work);
2107 	destroy_workqueue(dm_bufio_wq);
2108 
2109 	if (dm_bufio_client_count) {
2110 		DMCRIT("%s: dm_bufio_client_count leaked: %d",
2111 			__func__, dm_bufio_client_count);
2112 		bug = 1;
2113 	}
2114 
2115 	if (dm_bufio_current_allocated) {
2116 		DMCRIT("%s: dm_bufio_current_allocated leaked: %lu",
2117 			__func__, dm_bufio_current_allocated);
2118 		bug = 1;
2119 	}
2120 
2121 	if (dm_bufio_allocated_get_free_pages) {
2122 		DMCRIT("%s: dm_bufio_allocated_get_free_pages leaked: %lu",
2123 		       __func__, dm_bufio_allocated_get_free_pages);
2124 		bug = 1;
2125 	}
2126 
2127 	if (dm_bufio_allocated_vmalloc) {
2128 		DMCRIT("%s: dm_bufio_allocated_vmalloc leaked: %lu",
2129 		       __func__, dm_bufio_allocated_vmalloc);
2130 		bug = 1;
2131 	}
2132 
2133 	BUG_ON(bug);
2134 }
2135 
2136 module_init(dm_bufio_init)
2137 module_exit(dm_bufio_exit)
2138 
2139 module_param_named(max_cache_size_bytes, dm_bufio_cache_size, ulong, S_IRUGO | S_IWUSR);
2140 MODULE_PARM_DESC(max_cache_size_bytes, "Size of metadata cache");
2141 
2142 module_param_named(max_age_seconds, dm_bufio_max_age, uint, S_IRUGO | S_IWUSR);
2143 MODULE_PARM_DESC(max_age_seconds, "Max age of a buffer in seconds");
2144 
2145 module_param_named(retain_bytes, dm_bufio_retain_bytes, ulong, S_IRUGO | S_IWUSR);
2146 MODULE_PARM_DESC(retain_bytes, "Try to keep at least this many bytes cached in memory");
2147 
2148 module_param_named(peak_allocated_bytes, dm_bufio_peak_allocated, ulong, S_IRUGO | S_IWUSR);
2149 MODULE_PARM_DESC(peak_allocated_bytes, "Tracks the maximum allocated memory");
2150 
2151 module_param_named(allocated_kmem_cache_bytes, dm_bufio_allocated_kmem_cache, ulong, S_IRUGO);
2152 MODULE_PARM_DESC(allocated_kmem_cache_bytes, "Memory allocated with kmem_cache_alloc");
2153 
2154 module_param_named(allocated_get_free_pages_bytes, dm_bufio_allocated_get_free_pages, ulong, S_IRUGO);
2155 MODULE_PARM_DESC(allocated_get_free_pages_bytes, "Memory allocated with get_free_pages");
2156 
2157 module_param_named(allocated_vmalloc_bytes, dm_bufio_allocated_vmalloc, ulong, S_IRUGO);
2158 MODULE_PARM_DESC(allocated_vmalloc_bytes, "Memory allocated with vmalloc");
2159 
2160 module_param_named(current_allocated_bytes, dm_bufio_current_allocated, ulong, S_IRUGO);
2161 MODULE_PARM_DESC(current_allocated_bytes, "Memory currently used by the cache");
2162 
2163 MODULE_AUTHOR("Mikulas Patocka <dm-devel@redhat.com>");
2164 MODULE_DESCRIPTION(DM_NAME " buffered I/O library");
2165 MODULE_LICENSE("GPL");
2166