1 /*
2  * Copyright (C) 2015 Red Hat. All rights reserved.
3  *
4  * This file is released under the GPL.
5  */
6 
7 #include "dm-cache-background-tracker.h"
8 #include "dm-cache-policy-internal.h"
9 #include "dm-cache-policy.h"
10 #include "dm.h"
11 
12 #include <linux/hash.h>
13 #include <linux/jiffies.h>
14 #include <linux/module.h>
15 #include <linux/mutex.h>
16 #include <linux/vmalloc.h>
17 #include <linux/math64.h>
18 
19 #define DM_MSG_PREFIX "cache-policy-smq"
20 
21 /*----------------------------------------------------------------*/
22 
23 /*
24  * Safe division functions that return zero on divide by zero.
25  */
26 static unsigned safe_div(unsigned n, unsigned d)
27 {
28 	return d ? n / d : 0u;
29 }
30 
31 static unsigned safe_mod(unsigned n, unsigned d)
32 {
33 	return d ? n % d : 0u;
34 }
35 
36 /*----------------------------------------------------------------*/
37 
38 struct entry {
39 	unsigned hash_next:28;
40 	unsigned prev:28;
41 	unsigned next:28;
42 	unsigned level:6;
43 	bool dirty:1;
44 	bool allocated:1;
45 	bool sentinel:1;
46 	bool pending_work:1;
47 
48 	dm_oblock_t oblock;
49 };
50 
51 /*----------------------------------------------------------------*/
52 
53 #define INDEXER_NULL ((1u << 28u) - 1u)
54 
55 /*
56  * An entry_space manages a set of entries that we use for the queues.
57  * The clean and dirty queues share entries, so this object is separate
58  * from the queue itself.
59  */
60 struct entry_space {
61 	struct entry *begin;
62 	struct entry *end;
63 };
64 
65 static int space_init(struct entry_space *es, unsigned nr_entries)
66 {
67 	if (!nr_entries) {
68 		es->begin = es->end = NULL;
69 		return 0;
70 	}
71 
72 	es->begin = vzalloc(array_size(nr_entries, sizeof(struct entry)));
73 	if (!es->begin)
74 		return -ENOMEM;
75 
76 	es->end = es->begin + nr_entries;
77 	return 0;
78 }
79 
80 static void space_exit(struct entry_space *es)
81 {
82 	vfree(es->begin);
83 }
84 
85 static struct entry *__get_entry(struct entry_space *es, unsigned block)
86 {
87 	struct entry *e;
88 
89 	e = es->begin + block;
90 	BUG_ON(e >= es->end);
91 
92 	return e;
93 }
94 
95 static unsigned to_index(struct entry_space *es, struct entry *e)
96 {
97 	BUG_ON(e < es->begin || e >= es->end);
98 	return e - es->begin;
99 }
100 
101 static struct entry *to_entry(struct entry_space *es, unsigned block)
102 {
103 	if (block == INDEXER_NULL)
104 		return NULL;
105 
106 	return __get_entry(es, block);
107 }
108 
109 /*----------------------------------------------------------------*/
110 
111 struct ilist {
112 	unsigned nr_elts;	/* excluding sentinel entries */
113 	unsigned head, tail;
114 };
115 
116 static void l_init(struct ilist *l)
117 {
118 	l->nr_elts = 0;
119 	l->head = l->tail = INDEXER_NULL;
120 }
121 
122 static struct entry *l_head(struct entry_space *es, struct ilist *l)
123 {
124 	return to_entry(es, l->head);
125 }
126 
127 static struct entry *l_tail(struct entry_space *es, struct ilist *l)
128 {
129 	return to_entry(es, l->tail);
130 }
131 
132 static struct entry *l_next(struct entry_space *es, struct entry *e)
133 {
134 	return to_entry(es, e->next);
135 }
136 
137 static struct entry *l_prev(struct entry_space *es, struct entry *e)
138 {
139 	return to_entry(es, e->prev);
140 }
141 
142 static bool l_empty(struct ilist *l)
143 {
144 	return l->head == INDEXER_NULL;
145 }
146 
147 static void l_add_head(struct entry_space *es, struct ilist *l, struct entry *e)
148 {
149 	struct entry *head = l_head(es, l);
150 
151 	e->next = l->head;
152 	e->prev = INDEXER_NULL;
153 
154 	if (head)
155 		head->prev = l->head = to_index(es, e);
156 	else
157 		l->head = l->tail = to_index(es, e);
158 
159 	if (!e->sentinel)
160 		l->nr_elts++;
161 }
162 
163 static void l_add_tail(struct entry_space *es, struct ilist *l, struct entry *e)
164 {
165 	struct entry *tail = l_tail(es, l);
166 
167 	e->next = INDEXER_NULL;
168 	e->prev = l->tail;
169 
170 	if (tail)
171 		tail->next = l->tail = to_index(es, e);
172 	else
173 		l->head = l->tail = to_index(es, e);
174 
175 	if (!e->sentinel)
176 		l->nr_elts++;
177 }
178 
179 static void l_add_before(struct entry_space *es, struct ilist *l,
180 			 struct entry *old, struct entry *e)
181 {
182 	struct entry *prev = l_prev(es, old);
183 
184 	if (!prev)
185 		l_add_head(es, l, e);
186 
187 	else {
188 		e->prev = old->prev;
189 		e->next = to_index(es, old);
190 		prev->next = old->prev = to_index(es, e);
191 
192 		if (!e->sentinel)
193 			l->nr_elts++;
194 	}
195 }
196 
197 static void l_del(struct entry_space *es, struct ilist *l, struct entry *e)
198 {
199 	struct entry *prev = l_prev(es, e);
200 	struct entry *next = l_next(es, e);
201 
202 	if (prev)
203 		prev->next = e->next;
204 	else
205 		l->head = e->next;
206 
207 	if (next)
208 		next->prev = e->prev;
209 	else
210 		l->tail = e->prev;
211 
212 	if (!e->sentinel)
213 		l->nr_elts--;
214 }
215 
216 static struct entry *l_pop_head(struct entry_space *es, struct ilist *l)
217 {
218 	struct entry *e;
219 
220 	for (e = l_head(es, l); e; e = l_next(es, e))
221 		if (!e->sentinel) {
222 			l_del(es, l, e);
223 			return e;
224 		}
225 
226 	return NULL;
227 }
228 
229 static struct entry *l_pop_tail(struct entry_space *es, struct ilist *l)
230 {
231 	struct entry *e;
232 
233 	for (e = l_tail(es, l); e; e = l_prev(es, e))
234 		if (!e->sentinel) {
235 			l_del(es, l, e);
236 			return e;
237 		}
238 
239 	return NULL;
240 }
241 
242 /*----------------------------------------------------------------*/
243 
244 /*
245  * The stochastic-multi-queue is a set of lru lists stacked into levels.
246  * Entries are moved up levels when they are used, which loosely orders the
247  * most accessed entries in the top levels and least in the bottom.  This
248  * structure is *much* better than a single lru list.
249  */
250 #define MAX_LEVELS 64u
251 
252 struct queue {
253 	struct entry_space *es;
254 
255 	unsigned nr_elts;
256 	unsigned nr_levels;
257 	struct ilist qs[MAX_LEVELS];
258 
259 	/*
260 	 * We maintain a count of the number of entries we would like in each
261 	 * level.
262 	 */
263 	unsigned last_target_nr_elts;
264 	unsigned nr_top_levels;
265 	unsigned nr_in_top_levels;
266 	unsigned target_count[MAX_LEVELS];
267 };
268 
269 static void q_init(struct queue *q, struct entry_space *es, unsigned nr_levels)
270 {
271 	unsigned i;
272 
273 	q->es = es;
274 	q->nr_elts = 0;
275 	q->nr_levels = nr_levels;
276 
277 	for (i = 0; i < q->nr_levels; i++) {
278 		l_init(q->qs + i);
279 		q->target_count[i] = 0u;
280 	}
281 
282 	q->last_target_nr_elts = 0u;
283 	q->nr_top_levels = 0u;
284 	q->nr_in_top_levels = 0u;
285 }
286 
287 static unsigned q_size(struct queue *q)
288 {
289 	return q->nr_elts;
290 }
291 
292 /*
293  * Insert an entry to the back of the given level.
294  */
295 static void q_push(struct queue *q, struct entry *e)
296 {
297 	BUG_ON(e->pending_work);
298 
299 	if (!e->sentinel)
300 		q->nr_elts++;
301 
302 	l_add_tail(q->es, q->qs + e->level, e);
303 }
304 
305 static void q_push_front(struct queue *q, struct entry *e)
306 {
307 	BUG_ON(e->pending_work);
308 
309 	if (!e->sentinel)
310 		q->nr_elts++;
311 
312 	l_add_head(q->es, q->qs + e->level, e);
313 }
314 
315 static void q_push_before(struct queue *q, struct entry *old, struct entry *e)
316 {
317 	BUG_ON(e->pending_work);
318 
319 	if (!e->sentinel)
320 		q->nr_elts++;
321 
322 	l_add_before(q->es, q->qs + e->level, old, e);
323 }
324 
325 static void q_del(struct queue *q, struct entry *e)
326 {
327 	l_del(q->es, q->qs + e->level, e);
328 	if (!e->sentinel)
329 		q->nr_elts--;
330 }
331 
332 /*
333  * Return the oldest entry of the lowest populated level.
334  */
335 static struct entry *q_peek(struct queue *q, unsigned max_level, bool can_cross_sentinel)
336 {
337 	unsigned level;
338 	struct entry *e;
339 
340 	max_level = min(max_level, q->nr_levels);
341 
342 	for (level = 0; level < max_level; level++)
343 		for (e = l_head(q->es, q->qs + level); e; e = l_next(q->es, e)) {
344 			if (e->sentinel) {
345 				if (can_cross_sentinel)
346 					continue;
347 				else
348 					break;
349 			}
350 
351 			return e;
352 		}
353 
354 	return NULL;
355 }
356 
357 static struct entry *q_pop(struct queue *q)
358 {
359 	struct entry *e = q_peek(q, q->nr_levels, true);
360 
361 	if (e)
362 		q_del(q, e);
363 
364 	return e;
365 }
366 
367 /*
368  * This function assumes there is a non-sentinel entry to pop.  It's only
369  * used by redistribute, so we know this is true.  It also doesn't adjust
370  * the q->nr_elts count.
371  */
372 static struct entry *__redist_pop_from(struct queue *q, unsigned level)
373 {
374 	struct entry *e;
375 
376 	for (; level < q->nr_levels; level++)
377 		for (e = l_head(q->es, q->qs + level); e; e = l_next(q->es, e))
378 			if (!e->sentinel) {
379 				l_del(q->es, q->qs + e->level, e);
380 				return e;
381 			}
382 
383 	return NULL;
384 }
385 
386 static void q_set_targets_subrange_(struct queue *q, unsigned nr_elts, unsigned lbegin, unsigned lend)
387 {
388 	unsigned level, nr_levels, entries_per_level, remainder;
389 
390 	BUG_ON(lbegin > lend);
391 	BUG_ON(lend > q->nr_levels);
392 	nr_levels = lend - lbegin;
393 	entries_per_level = safe_div(nr_elts, nr_levels);
394 	remainder = safe_mod(nr_elts, nr_levels);
395 
396 	for (level = lbegin; level < lend; level++)
397 		q->target_count[level] =
398 			(level < (lbegin + remainder)) ? entries_per_level + 1u : entries_per_level;
399 }
400 
401 /*
402  * Typically we have fewer elements in the top few levels which allows us
403  * to adjust the promote threshold nicely.
404  */
405 static void q_set_targets(struct queue *q)
406 {
407 	if (q->last_target_nr_elts == q->nr_elts)
408 		return;
409 
410 	q->last_target_nr_elts = q->nr_elts;
411 
412 	if (q->nr_top_levels > q->nr_levels)
413 		q_set_targets_subrange_(q, q->nr_elts, 0, q->nr_levels);
414 
415 	else {
416 		q_set_targets_subrange_(q, q->nr_in_top_levels,
417 					q->nr_levels - q->nr_top_levels, q->nr_levels);
418 
419 		if (q->nr_in_top_levels < q->nr_elts)
420 			q_set_targets_subrange_(q, q->nr_elts - q->nr_in_top_levels,
421 						0, q->nr_levels - q->nr_top_levels);
422 		else
423 			q_set_targets_subrange_(q, 0, 0, q->nr_levels - q->nr_top_levels);
424 	}
425 }
426 
427 static void q_redistribute(struct queue *q)
428 {
429 	unsigned target, level;
430 	struct ilist *l, *l_above;
431 	struct entry *e;
432 
433 	q_set_targets(q);
434 
435 	for (level = 0u; level < q->nr_levels - 1u; level++) {
436 		l = q->qs + level;
437 		target = q->target_count[level];
438 
439 		/*
440 		 * Pull down some entries from the level above.
441 		 */
442 		while (l->nr_elts < target) {
443 			e = __redist_pop_from(q, level + 1u);
444 			if (!e) {
445 				/* bug in nr_elts */
446 				break;
447 			}
448 
449 			e->level = level;
450 			l_add_tail(q->es, l, e);
451 		}
452 
453 		/*
454 		 * Push some entries up.
455 		 */
456 		l_above = q->qs + level + 1u;
457 		while (l->nr_elts > target) {
458 			e = l_pop_tail(q->es, l);
459 
460 			if (!e)
461 				/* bug in nr_elts */
462 				break;
463 
464 			e->level = level + 1u;
465 			l_add_tail(q->es, l_above, e);
466 		}
467 	}
468 }
469 
470 static void q_requeue(struct queue *q, struct entry *e, unsigned extra_levels,
471 		      struct entry *s1, struct entry *s2)
472 {
473 	struct entry *de;
474 	unsigned sentinels_passed = 0;
475 	unsigned new_level = min(q->nr_levels - 1u, e->level + extra_levels);
476 
477 	/* try and find an entry to swap with */
478 	if (extra_levels && (e->level < q->nr_levels - 1u)) {
479 		for (de = l_head(q->es, q->qs + new_level); de && de->sentinel; de = l_next(q->es, de))
480 			sentinels_passed++;
481 
482 		if (de) {
483 			q_del(q, de);
484 			de->level = e->level;
485 			if (s1) {
486 				switch (sentinels_passed) {
487 				case 0:
488 					q_push_before(q, s1, de);
489 					break;
490 
491 				case 1:
492 					q_push_before(q, s2, de);
493 					break;
494 
495 				default:
496 					q_push(q, de);
497 				}
498 			} else
499 				q_push(q, de);
500 		}
501 	}
502 
503 	q_del(q, e);
504 	e->level = new_level;
505 	q_push(q, e);
506 }
507 
508 /*----------------------------------------------------------------*/
509 
510 #define FP_SHIFT 8
511 #define SIXTEENTH (1u << (FP_SHIFT - 4u))
512 #define EIGHTH (1u << (FP_SHIFT - 3u))
513 
514 struct stats {
515 	unsigned hit_threshold;
516 	unsigned hits;
517 	unsigned misses;
518 };
519 
520 enum performance {
521 	Q_POOR,
522 	Q_FAIR,
523 	Q_WELL
524 };
525 
526 static void stats_init(struct stats *s, unsigned nr_levels)
527 {
528 	s->hit_threshold = (nr_levels * 3u) / 4u;
529 	s->hits = 0u;
530 	s->misses = 0u;
531 }
532 
533 static void stats_reset(struct stats *s)
534 {
535 	s->hits = s->misses = 0u;
536 }
537 
538 static void stats_level_accessed(struct stats *s, unsigned level)
539 {
540 	if (level >= s->hit_threshold)
541 		s->hits++;
542 	else
543 		s->misses++;
544 }
545 
546 static void stats_miss(struct stats *s)
547 {
548 	s->misses++;
549 }
550 
551 /*
552  * There are times when we don't have any confidence in the hotspot queue.
553  * Such as when a fresh cache is created and the blocks have been spread
554  * out across the levels, or if an io load changes.  We detect this by
555  * seeing how often a lookup is in the top levels of the hotspot queue.
556  */
557 static enum performance stats_assess(struct stats *s)
558 {
559 	unsigned confidence = safe_div(s->hits << FP_SHIFT, s->hits + s->misses);
560 
561 	if (confidence < SIXTEENTH)
562 		return Q_POOR;
563 
564 	else if (confidence < EIGHTH)
565 		return Q_FAIR;
566 
567 	else
568 		return Q_WELL;
569 }
570 
571 /*----------------------------------------------------------------*/
572 
573 struct smq_hash_table {
574 	struct entry_space *es;
575 	unsigned long long hash_bits;
576 	unsigned *buckets;
577 };
578 
579 /*
580  * All cache entries are stored in a chained hash table.  To save space we
581  * use indexing again, and only store indexes to the next entry.
582  */
583 static int h_init(struct smq_hash_table *ht, struct entry_space *es, unsigned nr_entries)
584 {
585 	unsigned i, nr_buckets;
586 
587 	ht->es = es;
588 	nr_buckets = roundup_pow_of_two(max(nr_entries / 4u, 16u));
589 	ht->hash_bits = __ffs(nr_buckets);
590 
591 	ht->buckets = vmalloc(array_size(nr_buckets, sizeof(*ht->buckets)));
592 	if (!ht->buckets)
593 		return -ENOMEM;
594 
595 	for (i = 0; i < nr_buckets; i++)
596 		ht->buckets[i] = INDEXER_NULL;
597 
598 	return 0;
599 }
600 
601 static void h_exit(struct smq_hash_table *ht)
602 {
603 	vfree(ht->buckets);
604 }
605 
606 static struct entry *h_head(struct smq_hash_table *ht, unsigned bucket)
607 {
608 	return to_entry(ht->es, ht->buckets[bucket]);
609 }
610 
611 static struct entry *h_next(struct smq_hash_table *ht, struct entry *e)
612 {
613 	return to_entry(ht->es, e->hash_next);
614 }
615 
616 static void __h_insert(struct smq_hash_table *ht, unsigned bucket, struct entry *e)
617 {
618 	e->hash_next = ht->buckets[bucket];
619 	ht->buckets[bucket] = to_index(ht->es, e);
620 }
621 
622 static void h_insert(struct smq_hash_table *ht, struct entry *e)
623 {
624 	unsigned h = hash_64(from_oblock(e->oblock), ht->hash_bits);
625 	__h_insert(ht, h, e);
626 }
627 
628 static struct entry *__h_lookup(struct smq_hash_table *ht, unsigned h, dm_oblock_t oblock,
629 				struct entry **prev)
630 {
631 	struct entry *e;
632 
633 	*prev = NULL;
634 	for (e = h_head(ht, h); e; e = h_next(ht, e)) {
635 		if (e->oblock == oblock)
636 			return e;
637 
638 		*prev = e;
639 	}
640 
641 	return NULL;
642 }
643 
644 static void __h_unlink(struct smq_hash_table *ht, unsigned h,
645 		       struct entry *e, struct entry *prev)
646 {
647 	if (prev)
648 		prev->hash_next = e->hash_next;
649 	else
650 		ht->buckets[h] = e->hash_next;
651 }
652 
653 /*
654  * Also moves each entry to the front of the bucket.
655  */
656 static struct entry *h_lookup(struct smq_hash_table *ht, dm_oblock_t oblock)
657 {
658 	struct entry *e, *prev;
659 	unsigned h = hash_64(from_oblock(oblock), ht->hash_bits);
660 
661 	e = __h_lookup(ht, h, oblock, &prev);
662 	if (e && prev) {
663 		/*
664 		 * Move to the front because this entry is likely
665 		 * to be hit again.
666 		 */
667 		__h_unlink(ht, h, e, prev);
668 		__h_insert(ht, h, e);
669 	}
670 
671 	return e;
672 }
673 
674 static void h_remove(struct smq_hash_table *ht, struct entry *e)
675 {
676 	unsigned h = hash_64(from_oblock(e->oblock), ht->hash_bits);
677 	struct entry *prev;
678 
679 	/*
680 	 * The down side of using a singly linked list is we have to
681 	 * iterate the bucket to remove an item.
682 	 */
683 	e = __h_lookup(ht, h, e->oblock, &prev);
684 	if (e)
685 		__h_unlink(ht, h, e, prev);
686 }
687 
688 /*----------------------------------------------------------------*/
689 
690 struct entry_alloc {
691 	struct entry_space *es;
692 	unsigned begin;
693 
694 	unsigned nr_allocated;
695 	struct ilist free;
696 };
697 
698 static void init_allocator(struct entry_alloc *ea, struct entry_space *es,
699 			   unsigned begin, unsigned end)
700 {
701 	unsigned i;
702 
703 	ea->es = es;
704 	ea->nr_allocated = 0u;
705 	ea->begin = begin;
706 
707 	l_init(&ea->free);
708 	for (i = begin; i != end; i++)
709 		l_add_tail(ea->es, &ea->free, __get_entry(ea->es, i));
710 }
711 
712 static void init_entry(struct entry *e)
713 {
714 	/*
715 	 * We can't memset because that would clear the hotspot and
716 	 * sentinel bits which remain constant.
717 	 */
718 	e->hash_next = INDEXER_NULL;
719 	e->next = INDEXER_NULL;
720 	e->prev = INDEXER_NULL;
721 	e->level = 0u;
722 	e->dirty = true;	/* FIXME: audit */
723 	e->allocated = true;
724 	e->sentinel = false;
725 	e->pending_work = false;
726 }
727 
728 static struct entry *alloc_entry(struct entry_alloc *ea)
729 {
730 	struct entry *e;
731 
732 	if (l_empty(&ea->free))
733 		return NULL;
734 
735 	e = l_pop_head(ea->es, &ea->free);
736 	init_entry(e);
737 	ea->nr_allocated++;
738 
739 	return e;
740 }
741 
742 /*
743  * This assumes the cblock hasn't already been allocated.
744  */
745 static struct entry *alloc_particular_entry(struct entry_alloc *ea, unsigned i)
746 {
747 	struct entry *e = __get_entry(ea->es, ea->begin + i);
748 
749 	BUG_ON(e->allocated);
750 
751 	l_del(ea->es, &ea->free, e);
752 	init_entry(e);
753 	ea->nr_allocated++;
754 
755 	return e;
756 }
757 
758 static void free_entry(struct entry_alloc *ea, struct entry *e)
759 {
760 	BUG_ON(!ea->nr_allocated);
761 	BUG_ON(!e->allocated);
762 
763 	ea->nr_allocated--;
764 	e->allocated = false;
765 	l_add_tail(ea->es, &ea->free, e);
766 }
767 
768 static bool allocator_empty(struct entry_alloc *ea)
769 {
770 	return l_empty(&ea->free);
771 }
772 
773 static unsigned get_index(struct entry_alloc *ea, struct entry *e)
774 {
775 	return to_index(ea->es, e) - ea->begin;
776 }
777 
778 static struct entry *get_entry(struct entry_alloc *ea, unsigned index)
779 {
780 	return __get_entry(ea->es, ea->begin + index);
781 }
782 
783 /*----------------------------------------------------------------*/
784 
785 #define NR_HOTSPOT_LEVELS 64u
786 #define NR_CACHE_LEVELS 64u
787 
788 #define WRITEBACK_PERIOD (10ul * HZ)
789 #define DEMOTE_PERIOD (60ul * HZ)
790 
791 #define HOTSPOT_UPDATE_PERIOD (HZ)
792 #define CACHE_UPDATE_PERIOD (60ul * HZ)
793 
794 struct smq_policy {
795 	struct dm_cache_policy policy;
796 
797 	/* protects everything */
798 	spinlock_t lock;
799 	dm_cblock_t cache_size;
800 	sector_t cache_block_size;
801 
802 	sector_t hotspot_block_size;
803 	unsigned nr_hotspot_blocks;
804 	unsigned cache_blocks_per_hotspot_block;
805 	unsigned hotspot_level_jump;
806 
807 	struct entry_space es;
808 	struct entry_alloc writeback_sentinel_alloc;
809 	struct entry_alloc demote_sentinel_alloc;
810 	struct entry_alloc hotspot_alloc;
811 	struct entry_alloc cache_alloc;
812 
813 	unsigned long *hotspot_hit_bits;
814 	unsigned long *cache_hit_bits;
815 
816 	/*
817 	 * We maintain three queues of entries.  The cache proper,
818 	 * consisting of a clean and dirty queue, containing the currently
819 	 * active mappings.  The hotspot queue uses a larger block size to
820 	 * track blocks that are being hit frequently and potential
821 	 * candidates for promotion to the cache.
822 	 */
823 	struct queue hotspot;
824 	struct queue clean;
825 	struct queue dirty;
826 
827 	struct stats hotspot_stats;
828 	struct stats cache_stats;
829 
830 	/*
831 	 * Keeps track of time, incremented by the core.  We use this to
832 	 * avoid attributing multiple hits within the same tick.
833 	 */
834 	unsigned tick;
835 
836 	/*
837 	 * The hash tables allows us to quickly find an entry by origin
838 	 * block.
839 	 */
840 	struct smq_hash_table table;
841 	struct smq_hash_table hotspot_table;
842 
843 	bool current_writeback_sentinels;
844 	unsigned long next_writeback_period;
845 
846 	bool current_demote_sentinels;
847 	unsigned long next_demote_period;
848 
849 	unsigned write_promote_level;
850 	unsigned read_promote_level;
851 
852 	unsigned long next_hotspot_period;
853 	unsigned long next_cache_period;
854 
855 	struct background_tracker *bg_work;
856 
857 	bool migrations_allowed;
858 };
859 
860 /*----------------------------------------------------------------*/
861 
862 static struct entry *get_sentinel(struct entry_alloc *ea, unsigned level, bool which)
863 {
864 	return get_entry(ea, which ? level : NR_CACHE_LEVELS + level);
865 }
866 
867 static struct entry *writeback_sentinel(struct smq_policy *mq, unsigned level)
868 {
869 	return get_sentinel(&mq->writeback_sentinel_alloc, level, mq->current_writeback_sentinels);
870 }
871 
872 static struct entry *demote_sentinel(struct smq_policy *mq, unsigned level)
873 {
874 	return get_sentinel(&mq->demote_sentinel_alloc, level, mq->current_demote_sentinels);
875 }
876 
877 static void __update_writeback_sentinels(struct smq_policy *mq)
878 {
879 	unsigned level;
880 	struct queue *q = &mq->dirty;
881 	struct entry *sentinel;
882 
883 	for (level = 0; level < q->nr_levels; level++) {
884 		sentinel = writeback_sentinel(mq, level);
885 		q_del(q, sentinel);
886 		q_push(q, sentinel);
887 	}
888 }
889 
890 static void __update_demote_sentinels(struct smq_policy *mq)
891 {
892 	unsigned level;
893 	struct queue *q = &mq->clean;
894 	struct entry *sentinel;
895 
896 	for (level = 0; level < q->nr_levels; level++) {
897 		sentinel = demote_sentinel(mq, level);
898 		q_del(q, sentinel);
899 		q_push(q, sentinel);
900 	}
901 }
902 
903 static void update_sentinels(struct smq_policy *mq)
904 {
905 	if (time_after(jiffies, mq->next_writeback_period)) {
906 		mq->next_writeback_period = jiffies + WRITEBACK_PERIOD;
907 		mq->current_writeback_sentinels = !mq->current_writeback_sentinels;
908 		__update_writeback_sentinels(mq);
909 	}
910 
911 	if (time_after(jiffies, mq->next_demote_period)) {
912 		mq->next_demote_period = jiffies + DEMOTE_PERIOD;
913 		mq->current_demote_sentinels = !mq->current_demote_sentinels;
914 		__update_demote_sentinels(mq);
915 	}
916 }
917 
918 static void __sentinels_init(struct smq_policy *mq)
919 {
920 	unsigned level;
921 	struct entry *sentinel;
922 
923 	for (level = 0; level < NR_CACHE_LEVELS; level++) {
924 		sentinel = writeback_sentinel(mq, level);
925 		sentinel->level = level;
926 		q_push(&mq->dirty, sentinel);
927 
928 		sentinel = demote_sentinel(mq, level);
929 		sentinel->level = level;
930 		q_push(&mq->clean, sentinel);
931 	}
932 }
933 
934 static void sentinels_init(struct smq_policy *mq)
935 {
936 	mq->next_writeback_period = jiffies + WRITEBACK_PERIOD;
937 	mq->next_demote_period = jiffies + DEMOTE_PERIOD;
938 
939 	mq->current_writeback_sentinels = false;
940 	mq->current_demote_sentinels = false;
941 	__sentinels_init(mq);
942 
943 	mq->current_writeback_sentinels = !mq->current_writeback_sentinels;
944 	mq->current_demote_sentinels = !mq->current_demote_sentinels;
945 	__sentinels_init(mq);
946 }
947 
948 /*----------------------------------------------------------------*/
949 
950 static void del_queue(struct smq_policy *mq, struct entry *e)
951 {
952 	q_del(e->dirty ? &mq->dirty : &mq->clean, e);
953 }
954 
955 static void push_queue(struct smq_policy *mq, struct entry *e)
956 {
957 	if (e->dirty)
958 		q_push(&mq->dirty, e);
959 	else
960 		q_push(&mq->clean, e);
961 }
962 
963 // !h, !q, a -> h, q, a
964 static void push(struct smq_policy *mq, struct entry *e)
965 {
966 	h_insert(&mq->table, e);
967 	if (!e->pending_work)
968 		push_queue(mq, e);
969 }
970 
971 static void push_queue_front(struct smq_policy *mq, struct entry *e)
972 {
973 	if (e->dirty)
974 		q_push_front(&mq->dirty, e);
975 	else
976 		q_push_front(&mq->clean, e);
977 }
978 
979 static void push_front(struct smq_policy *mq, struct entry *e)
980 {
981 	h_insert(&mq->table, e);
982 	if (!e->pending_work)
983 		push_queue_front(mq, e);
984 }
985 
986 static dm_cblock_t infer_cblock(struct smq_policy *mq, struct entry *e)
987 {
988 	return to_cblock(get_index(&mq->cache_alloc, e));
989 }
990 
991 static void requeue(struct smq_policy *mq, struct entry *e)
992 {
993 	/*
994 	 * Pending work has temporarily been taken out of the queues.
995 	 */
996 	if (e->pending_work)
997 		return;
998 
999 	if (!test_and_set_bit(from_cblock(infer_cblock(mq, e)), mq->cache_hit_bits)) {
1000 		if (!e->dirty) {
1001 			q_requeue(&mq->clean, e, 1u, NULL, NULL);
1002 			return;
1003 		}
1004 
1005 		q_requeue(&mq->dirty, e, 1u,
1006 			  get_sentinel(&mq->writeback_sentinel_alloc, e->level, !mq->current_writeback_sentinels),
1007 			  get_sentinel(&mq->writeback_sentinel_alloc, e->level, mq->current_writeback_sentinels));
1008 	}
1009 }
1010 
1011 static unsigned default_promote_level(struct smq_policy *mq)
1012 {
1013 	/*
1014 	 * The promote level depends on the current performance of the
1015 	 * cache.
1016 	 *
1017 	 * If the cache is performing badly, then we can't afford
1018 	 * to promote much without causing performance to drop below that
1019 	 * of the origin device.
1020 	 *
1021 	 * If the cache is performing well, then we don't need to promote
1022 	 * much.  If it isn't broken, don't fix it.
1023 	 *
1024 	 * If the cache is middling then we promote more.
1025 	 *
1026 	 * This scheme reminds me of a graph of entropy vs probability of a
1027 	 * binary variable.
1028 	 */
1029 	static const unsigned int table[] = {
1030 		1, 1, 1, 2, 4, 6, 7, 8, 7, 6, 4, 4, 3, 3, 2, 2, 1
1031 	};
1032 
1033 	unsigned hits = mq->cache_stats.hits;
1034 	unsigned misses = mq->cache_stats.misses;
1035 	unsigned index = safe_div(hits << 4u, hits + misses);
1036 	return table[index];
1037 }
1038 
1039 static void update_promote_levels(struct smq_policy *mq)
1040 {
1041 	/*
1042 	 * If there are unused cache entries then we want to be really
1043 	 * eager to promote.
1044 	 */
1045 	unsigned threshold_level = allocator_empty(&mq->cache_alloc) ?
1046 		default_promote_level(mq) : (NR_HOTSPOT_LEVELS / 2u);
1047 
1048 	threshold_level = max(threshold_level, NR_HOTSPOT_LEVELS);
1049 
1050 	/*
1051 	 * If the hotspot queue is performing badly then we have little
1052 	 * confidence that we know which blocks to promote.  So we cut down
1053 	 * the amount of promotions.
1054 	 */
1055 	switch (stats_assess(&mq->hotspot_stats)) {
1056 	case Q_POOR:
1057 		threshold_level /= 4u;
1058 		break;
1059 
1060 	case Q_FAIR:
1061 		threshold_level /= 2u;
1062 		break;
1063 
1064 	case Q_WELL:
1065 		break;
1066 	}
1067 
1068 	mq->read_promote_level = NR_HOTSPOT_LEVELS - threshold_level;
1069 	mq->write_promote_level = (NR_HOTSPOT_LEVELS - threshold_level);
1070 }
1071 
1072 /*
1073  * If the hotspot queue is performing badly, then we try and move entries
1074  * around more quickly.
1075  */
1076 static void update_level_jump(struct smq_policy *mq)
1077 {
1078 	switch (stats_assess(&mq->hotspot_stats)) {
1079 	case Q_POOR:
1080 		mq->hotspot_level_jump = 4u;
1081 		break;
1082 
1083 	case Q_FAIR:
1084 		mq->hotspot_level_jump = 2u;
1085 		break;
1086 
1087 	case Q_WELL:
1088 		mq->hotspot_level_jump = 1u;
1089 		break;
1090 	}
1091 }
1092 
1093 static void end_hotspot_period(struct smq_policy *mq)
1094 {
1095 	clear_bitset(mq->hotspot_hit_bits, mq->nr_hotspot_blocks);
1096 	update_promote_levels(mq);
1097 
1098 	if (time_after(jiffies, mq->next_hotspot_period)) {
1099 		update_level_jump(mq);
1100 		q_redistribute(&mq->hotspot);
1101 		stats_reset(&mq->hotspot_stats);
1102 		mq->next_hotspot_period = jiffies + HOTSPOT_UPDATE_PERIOD;
1103 	}
1104 }
1105 
1106 static void end_cache_period(struct smq_policy *mq)
1107 {
1108 	if (time_after(jiffies, mq->next_cache_period)) {
1109 		clear_bitset(mq->cache_hit_bits, from_cblock(mq->cache_size));
1110 
1111 		q_redistribute(&mq->dirty);
1112 		q_redistribute(&mq->clean);
1113 		stats_reset(&mq->cache_stats);
1114 
1115 		mq->next_cache_period = jiffies + CACHE_UPDATE_PERIOD;
1116 	}
1117 }
1118 
1119 /*----------------------------------------------------------------*/
1120 
1121 /*
1122  * Targets are given as a percentage.
1123  */
1124 #define CLEAN_TARGET 25u
1125 #define FREE_TARGET 25u
1126 
1127 static unsigned percent_to_target(struct smq_policy *mq, unsigned p)
1128 {
1129 	return from_cblock(mq->cache_size) * p / 100u;
1130 }
1131 
1132 static bool clean_target_met(struct smq_policy *mq, bool idle)
1133 {
1134 	/*
1135 	 * Cache entries may not be populated.  So we cannot rely on the
1136 	 * size of the clean queue.
1137 	 */
1138 	if (idle) {
1139 		/*
1140 		 * We'd like to clean everything.
1141 		 */
1142 		return q_size(&mq->dirty) == 0u;
1143 	}
1144 
1145 	/*
1146 	 * If we're busy we don't worry about cleaning at all.
1147 	 */
1148 	return true;
1149 }
1150 
1151 static bool free_target_met(struct smq_policy *mq)
1152 {
1153 	unsigned nr_free;
1154 
1155 	nr_free = from_cblock(mq->cache_size) - mq->cache_alloc.nr_allocated;
1156 	return (nr_free + btracker_nr_demotions_queued(mq->bg_work)) >=
1157 		percent_to_target(mq, FREE_TARGET);
1158 }
1159 
1160 /*----------------------------------------------------------------*/
1161 
1162 static void mark_pending(struct smq_policy *mq, struct entry *e)
1163 {
1164 	BUG_ON(e->sentinel);
1165 	BUG_ON(!e->allocated);
1166 	BUG_ON(e->pending_work);
1167 	e->pending_work = true;
1168 }
1169 
1170 static void clear_pending(struct smq_policy *mq, struct entry *e)
1171 {
1172 	BUG_ON(!e->pending_work);
1173 	e->pending_work = false;
1174 }
1175 
1176 static void queue_writeback(struct smq_policy *mq, bool idle)
1177 {
1178 	int r;
1179 	struct policy_work work;
1180 	struct entry *e;
1181 
1182 	e = q_peek(&mq->dirty, mq->dirty.nr_levels, idle);
1183 	if (e) {
1184 		mark_pending(mq, e);
1185 		q_del(&mq->dirty, e);
1186 
1187 		work.op = POLICY_WRITEBACK;
1188 		work.oblock = e->oblock;
1189 		work.cblock = infer_cblock(mq, e);
1190 
1191 		r = btracker_queue(mq->bg_work, &work, NULL);
1192 		if (r) {
1193 			clear_pending(mq, e);
1194 			q_push_front(&mq->dirty, e);
1195 		}
1196 	}
1197 }
1198 
1199 static void queue_demotion(struct smq_policy *mq)
1200 {
1201 	int r;
1202 	struct policy_work work;
1203 	struct entry *e;
1204 
1205 	if (WARN_ON_ONCE(!mq->migrations_allowed))
1206 		return;
1207 
1208 	e = q_peek(&mq->clean, mq->clean.nr_levels / 2, true);
1209 	if (!e) {
1210 		if (!clean_target_met(mq, true))
1211 			queue_writeback(mq, false);
1212 		return;
1213 	}
1214 
1215 	mark_pending(mq, e);
1216 	q_del(&mq->clean, e);
1217 
1218 	work.op = POLICY_DEMOTE;
1219 	work.oblock = e->oblock;
1220 	work.cblock = infer_cblock(mq, e);
1221 	r = btracker_queue(mq->bg_work, &work, NULL);
1222 	if (r) {
1223 		clear_pending(mq, e);
1224 		q_push_front(&mq->clean, e);
1225 	}
1226 }
1227 
1228 static void queue_promotion(struct smq_policy *mq, dm_oblock_t oblock,
1229 			    struct policy_work **workp)
1230 {
1231 	int r;
1232 	struct entry *e;
1233 	struct policy_work work;
1234 
1235 	if (!mq->migrations_allowed)
1236 		return;
1237 
1238 	if (allocator_empty(&mq->cache_alloc)) {
1239 		/*
1240 		 * We always claim to be 'idle' to ensure some demotions happen
1241 		 * with continuous loads.
1242 		 */
1243 		if (!free_target_met(mq))
1244 			queue_demotion(mq);
1245 		return;
1246 	}
1247 
1248 	if (btracker_promotion_already_present(mq->bg_work, oblock))
1249 		return;
1250 
1251 	/*
1252 	 * We allocate the entry now to reserve the cblock.  If the
1253 	 * background work is aborted we must remember to free it.
1254 	 */
1255 	e = alloc_entry(&mq->cache_alloc);
1256 	BUG_ON(!e);
1257 	e->pending_work = true;
1258 	work.op = POLICY_PROMOTE;
1259 	work.oblock = oblock;
1260 	work.cblock = infer_cblock(mq, e);
1261 	r = btracker_queue(mq->bg_work, &work, workp);
1262 	if (r)
1263 		free_entry(&mq->cache_alloc, e);
1264 }
1265 
1266 /*----------------------------------------------------------------*/
1267 
1268 enum promote_result {
1269 	PROMOTE_NOT,
1270 	PROMOTE_TEMPORARY,
1271 	PROMOTE_PERMANENT
1272 };
1273 
1274 /*
1275  * Converts a boolean into a promote result.
1276  */
1277 static enum promote_result maybe_promote(bool promote)
1278 {
1279 	return promote ? PROMOTE_PERMANENT : PROMOTE_NOT;
1280 }
1281 
1282 static enum promote_result should_promote(struct smq_policy *mq, struct entry *hs_e,
1283 					  int data_dir, bool fast_promote)
1284 {
1285 	if (data_dir == WRITE) {
1286 		if (!allocator_empty(&mq->cache_alloc) && fast_promote)
1287 			return PROMOTE_TEMPORARY;
1288 
1289 		return maybe_promote(hs_e->level >= mq->write_promote_level);
1290 	} else
1291 		return maybe_promote(hs_e->level >= mq->read_promote_level);
1292 }
1293 
1294 static dm_oblock_t to_hblock(struct smq_policy *mq, dm_oblock_t b)
1295 {
1296 	sector_t r = from_oblock(b);
1297 	(void) sector_div(r, mq->cache_blocks_per_hotspot_block);
1298 	return to_oblock(r);
1299 }
1300 
1301 static struct entry *update_hotspot_queue(struct smq_policy *mq, dm_oblock_t b)
1302 {
1303 	unsigned hi;
1304 	dm_oblock_t hb = to_hblock(mq, b);
1305 	struct entry *e = h_lookup(&mq->hotspot_table, hb);
1306 
1307 	if (e) {
1308 		stats_level_accessed(&mq->hotspot_stats, e->level);
1309 
1310 		hi = get_index(&mq->hotspot_alloc, e);
1311 		q_requeue(&mq->hotspot, e,
1312 			  test_and_set_bit(hi, mq->hotspot_hit_bits) ?
1313 			  0u : mq->hotspot_level_jump,
1314 			  NULL, NULL);
1315 
1316 	} else {
1317 		stats_miss(&mq->hotspot_stats);
1318 
1319 		e = alloc_entry(&mq->hotspot_alloc);
1320 		if (!e) {
1321 			e = q_pop(&mq->hotspot);
1322 			if (e) {
1323 				h_remove(&mq->hotspot_table, e);
1324 				hi = get_index(&mq->hotspot_alloc, e);
1325 				clear_bit(hi, mq->hotspot_hit_bits);
1326 			}
1327 
1328 		}
1329 
1330 		if (e) {
1331 			e->oblock = hb;
1332 			q_push(&mq->hotspot, e);
1333 			h_insert(&mq->hotspot_table, e);
1334 		}
1335 	}
1336 
1337 	return e;
1338 }
1339 
1340 /*----------------------------------------------------------------*/
1341 
1342 /*
1343  * Public interface, via the policy struct.  See dm-cache-policy.h for a
1344  * description of these.
1345  */
1346 
1347 static struct smq_policy *to_smq_policy(struct dm_cache_policy *p)
1348 {
1349 	return container_of(p, struct smq_policy, policy);
1350 }
1351 
1352 static void smq_destroy(struct dm_cache_policy *p)
1353 {
1354 	struct smq_policy *mq = to_smq_policy(p);
1355 
1356 	btracker_destroy(mq->bg_work);
1357 	h_exit(&mq->hotspot_table);
1358 	h_exit(&mq->table);
1359 	free_bitset(mq->hotspot_hit_bits);
1360 	free_bitset(mq->cache_hit_bits);
1361 	space_exit(&mq->es);
1362 	kfree(mq);
1363 }
1364 
1365 /*----------------------------------------------------------------*/
1366 
1367 static int __lookup(struct smq_policy *mq, dm_oblock_t oblock, dm_cblock_t *cblock,
1368 		    int data_dir, bool fast_copy,
1369 		    struct policy_work **work, bool *background_work)
1370 {
1371 	struct entry *e, *hs_e;
1372 	enum promote_result pr;
1373 
1374 	*background_work = false;
1375 
1376 	e = h_lookup(&mq->table, oblock);
1377 	if (e) {
1378 		stats_level_accessed(&mq->cache_stats, e->level);
1379 
1380 		requeue(mq, e);
1381 		*cblock = infer_cblock(mq, e);
1382 		return 0;
1383 
1384 	} else {
1385 		stats_miss(&mq->cache_stats);
1386 
1387 		/*
1388 		 * The hotspot queue only gets updated with misses.
1389 		 */
1390 		hs_e = update_hotspot_queue(mq, oblock);
1391 
1392 		pr = should_promote(mq, hs_e, data_dir, fast_copy);
1393 		if (pr != PROMOTE_NOT) {
1394 			queue_promotion(mq, oblock, work);
1395 			*background_work = true;
1396 		}
1397 
1398 		return -ENOENT;
1399 	}
1400 }
1401 
1402 static int smq_lookup(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock,
1403 		      int data_dir, bool fast_copy,
1404 		      bool *background_work)
1405 {
1406 	int r;
1407 	unsigned long flags;
1408 	struct smq_policy *mq = to_smq_policy(p);
1409 
1410 	spin_lock_irqsave(&mq->lock, flags);
1411 	r = __lookup(mq, oblock, cblock,
1412 		     data_dir, fast_copy,
1413 		     NULL, background_work);
1414 	spin_unlock_irqrestore(&mq->lock, flags);
1415 
1416 	return r;
1417 }
1418 
1419 static int smq_lookup_with_work(struct dm_cache_policy *p,
1420 				dm_oblock_t oblock, dm_cblock_t *cblock,
1421 				int data_dir, bool fast_copy,
1422 				struct policy_work **work)
1423 {
1424 	int r;
1425 	bool background_queued;
1426 	unsigned long flags;
1427 	struct smq_policy *mq = to_smq_policy(p);
1428 
1429 	spin_lock_irqsave(&mq->lock, flags);
1430 	r = __lookup(mq, oblock, cblock, data_dir, fast_copy, work, &background_queued);
1431 	spin_unlock_irqrestore(&mq->lock, flags);
1432 
1433 	return r;
1434 }
1435 
1436 static int smq_get_background_work(struct dm_cache_policy *p, bool idle,
1437 				   struct policy_work **result)
1438 {
1439 	int r;
1440 	unsigned long flags;
1441 	struct smq_policy *mq = to_smq_policy(p);
1442 
1443 	spin_lock_irqsave(&mq->lock, flags);
1444 	r = btracker_issue(mq->bg_work, result);
1445 	if (r == -ENODATA) {
1446 		if (!clean_target_met(mq, idle)) {
1447 			queue_writeback(mq, idle);
1448 			r = btracker_issue(mq->bg_work, result);
1449 		}
1450 	}
1451 	spin_unlock_irqrestore(&mq->lock, flags);
1452 
1453 	return r;
1454 }
1455 
1456 /*
1457  * We need to clear any pending work flags that have been set, and in the
1458  * case of promotion free the entry for the destination cblock.
1459  */
1460 static void __complete_background_work(struct smq_policy *mq,
1461 				       struct policy_work *work,
1462 				       bool success)
1463 {
1464 	struct entry *e = get_entry(&mq->cache_alloc,
1465 				    from_cblock(work->cblock));
1466 
1467 	switch (work->op) {
1468 	case POLICY_PROMOTE:
1469 		// !h, !q, a
1470 		clear_pending(mq, e);
1471 		if (success) {
1472 			e->oblock = work->oblock;
1473 			e->level = NR_CACHE_LEVELS - 1;
1474 			push(mq, e);
1475 			// h, q, a
1476 		} else {
1477 			free_entry(&mq->cache_alloc, e);
1478 			// !h, !q, !a
1479 		}
1480 		break;
1481 
1482 	case POLICY_DEMOTE:
1483 		// h, !q, a
1484 		if (success) {
1485 			h_remove(&mq->table, e);
1486 			free_entry(&mq->cache_alloc, e);
1487 			// !h, !q, !a
1488 		} else {
1489 			clear_pending(mq, e);
1490 			push_queue(mq, e);
1491 			// h, q, a
1492 		}
1493 		break;
1494 
1495 	case POLICY_WRITEBACK:
1496 		// h, !q, a
1497 		clear_pending(mq, e);
1498 		push_queue(mq, e);
1499 		// h, q, a
1500 		break;
1501 	}
1502 
1503 	btracker_complete(mq->bg_work, work);
1504 }
1505 
1506 static void smq_complete_background_work(struct dm_cache_policy *p,
1507 					 struct policy_work *work,
1508 					 bool success)
1509 {
1510 	unsigned long flags;
1511 	struct smq_policy *mq = to_smq_policy(p);
1512 
1513 	spin_lock_irqsave(&mq->lock, flags);
1514 	__complete_background_work(mq, work, success);
1515 	spin_unlock_irqrestore(&mq->lock, flags);
1516 }
1517 
1518 // in_hash(oblock) -> in_hash(oblock)
1519 static void __smq_set_clear_dirty(struct smq_policy *mq, dm_cblock_t cblock, bool set)
1520 {
1521 	struct entry *e = get_entry(&mq->cache_alloc, from_cblock(cblock));
1522 
1523 	if (e->pending_work)
1524 		e->dirty = set;
1525 	else {
1526 		del_queue(mq, e);
1527 		e->dirty = set;
1528 		push_queue(mq, e);
1529 	}
1530 }
1531 
1532 static void smq_set_dirty(struct dm_cache_policy *p, dm_cblock_t cblock)
1533 {
1534 	unsigned long flags;
1535 	struct smq_policy *mq = to_smq_policy(p);
1536 
1537 	spin_lock_irqsave(&mq->lock, flags);
1538 	__smq_set_clear_dirty(mq, cblock, true);
1539 	spin_unlock_irqrestore(&mq->lock, flags);
1540 }
1541 
1542 static void smq_clear_dirty(struct dm_cache_policy *p, dm_cblock_t cblock)
1543 {
1544 	struct smq_policy *mq = to_smq_policy(p);
1545 	unsigned long flags;
1546 
1547 	spin_lock_irqsave(&mq->lock, flags);
1548 	__smq_set_clear_dirty(mq, cblock, false);
1549 	spin_unlock_irqrestore(&mq->lock, flags);
1550 }
1551 
1552 static unsigned random_level(dm_cblock_t cblock)
1553 {
1554 	return hash_32(from_cblock(cblock), 9) & (NR_CACHE_LEVELS - 1);
1555 }
1556 
1557 static int smq_load_mapping(struct dm_cache_policy *p,
1558 			    dm_oblock_t oblock, dm_cblock_t cblock,
1559 			    bool dirty, uint32_t hint, bool hint_valid)
1560 {
1561 	struct smq_policy *mq = to_smq_policy(p);
1562 	struct entry *e;
1563 
1564 	e = alloc_particular_entry(&mq->cache_alloc, from_cblock(cblock));
1565 	e->oblock = oblock;
1566 	e->dirty = dirty;
1567 	e->level = hint_valid ? min(hint, NR_CACHE_LEVELS - 1) : random_level(cblock);
1568 	e->pending_work = false;
1569 
1570 	/*
1571 	 * When we load mappings we push ahead of both sentinels in order to
1572 	 * allow demotions and cleaning to occur immediately.
1573 	 */
1574 	push_front(mq, e);
1575 
1576 	return 0;
1577 }
1578 
1579 static int smq_invalidate_mapping(struct dm_cache_policy *p, dm_cblock_t cblock)
1580 {
1581 	struct smq_policy *mq = to_smq_policy(p);
1582 	struct entry *e = get_entry(&mq->cache_alloc, from_cblock(cblock));
1583 
1584 	if (!e->allocated)
1585 		return -ENODATA;
1586 
1587 	// FIXME: what if this block has pending background work?
1588 	del_queue(mq, e);
1589 	h_remove(&mq->table, e);
1590 	free_entry(&mq->cache_alloc, e);
1591 	return 0;
1592 }
1593 
1594 static uint32_t smq_get_hint(struct dm_cache_policy *p, dm_cblock_t cblock)
1595 {
1596 	struct smq_policy *mq = to_smq_policy(p);
1597 	struct entry *e = get_entry(&mq->cache_alloc, from_cblock(cblock));
1598 
1599 	if (!e->allocated)
1600 		return 0;
1601 
1602 	return e->level;
1603 }
1604 
1605 static dm_cblock_t smq_residency(struct dm_cache_policy *p)
1606 {
1607 	dm_cblock_t r;
1608 	unsigned long flags;
1609 	struct smq_policy *mq = to_smq_policy(p);
1610 
1611 	spin_lock_irqsave(&mq->lock, flags);
1612 	r = to_cblock(mq->cache_alloc.nr_allocated);
1613 	spin_unlock_irqrestore(&mq->lock, flags);
1614 
1615 	return r;
1616 }
1617 
1618 static void smq_tick(struct dm_cache_policy *p, bool can_block)
1619 {
1620 	struct smq_policy *mq = to_smq_policy(p);
1621 	unsigned long flags;
1622 
1623 	spin_lock_irqsave(&mq->lock, flags);
1624 	mq->tick++;
1625 	update_sentinels(mq);
1626 	end_hotspot_period(mq);
1627 	end_cache_period(mq);
1628 	spin_unlock_irqrestore(&mq->lock, flags);
1629 }
1630 
1631 static void smq_allow_migrations(struct dm_cache_policy *p, bool allow)
1632 {
1633 	struct smq_policy *mq = to_smq_policy(p);
1634 	mq->migrations_allowed = allow;
1635 }
1636 
1637 /*
1638  * smq has no config values, but the old mq policy did.  To avoid breaking
1639  * software we continue to accept these configurables for the mq policy,
1640  * but they have no effect.
1641  */
1642 static int mq_set_config_value(struct dm_cache_policy *p,
1643 			       const char *key, const char *value)
1644 {
1645 	unsigned long tmp;
1646 
1647 	if (kstrtoul(value, 10, &tmp))
1648 		return -EINVAL;
1649 
1650 	if (!strcasecmp(key, "random_threshold") ||
1651 	    !strcasecmp(key, "sequential_threshold") ||
1652 	    !strcasecmp(key, "discard_promote_adjustment") ||
1653 	    !strcasecmp(key, "read_promote_adjustment") ||
1654 	    !strcasecmp(key, "write_promote_adjustment")) {
1655 		DMWARN("tunable '%s' no longer has any effect, mq policy is now an alias for smq", key);
1656 		return 0;
1657 	}
1658 
1659 	return -EINVAL;
1660 }
1661 
1662 static int mq_emit_config_values(struct dm_cache_policy *p, char *result,
1663 				 unsigned maxlen, ssize_t *sz_ptr)
1664 {
1665 	ssize_t sz = *sz_ptr;
1666 
1667 	DMEMIT("10 random_threshold 0 "
1668 	       "sequential_threshold 0 "
1669 	       "discard_promote_adjustment 0 "
1670 	       "read_promote_adjustment 0 "
1671 	       "write_promote_adjustment 0 ");
1672 
1673 	*sz_ptr = sz;
1674 	return 0;
1675 }
1676 
1677 /* Init the policy plugin interface function pointers. */
1678 static void init_policy_functions(struct smq_policy *mq, bool mimic_mq)
1679 {
1680 	mq->policy.destroy = smq_destroy;
1681 	mq->policy.lookup = smq_lookup;
1682 	mq->policy.lookup_with_work = smq_lookup_with_work;
1683 	mq->policy.get_background_work = smq_get_background_work;
1684 	mq->policy.complete_background_work = smq_complete_background_work;
1685 	mq->policy.set_dirty = smq_set_dirty;
1686 	mq->policy.clear_dirty = smq_clear_dirty;
1687 	mq->policy.load_mapping = smq_load_mapping;
1688 	mq->policy.invalidate_mapping = smq_invalidate_mapping;
1689 	mq->policy.get_hint = smq_get_hint;
1690 	mq->policy.residency = smq_residency;
1691 	mq->policy.tick = smq_tick;
1692 	mq->policy.allow_migrations = smq_allow_migrations;
1693 
1694 	if (mimic_mq) {
1695 		mq->policy.set_config_value = mq_set_config_value;
1696 		mq->policy.emit_config_values = mq_emit_config_values;
1697 	}
1698 }
1699 
1700 static bool too_many_hotspot_blocks(sector_t origin_size,
1701 				    sector_t hotspot_block_size,
1702 				    unsigned nr_hotspot_blocks)
1703 {
1704 	return (hotspot_block_size * nr_hotspot_blocks) > origin_size;
1705 }
1706 
1707 static void calc_hotspot_params(sector_t origin_size,
1708 				sector_t cache_block_size,
1709 				unsigned nr_cache_blocks,
1710 				sector_t *hotspot_block_size,
1711 				unsigned *nr_hotspot_blocks)
1712 {
1713 	*hotspot_block_size = cache_block_size * 16u;
1714 	*nr_hotspot_blocks = max(nr_cache_blocks / 4u, 1024u);
1715 
1716 	while ((*hotspot_block_size > cache_block_size) &&
1717 	       too_many_hotspot_blocks(origin_size, *hotspot_block_size, *nr_hotspot_blocks))
1718 		*hotspot_block_size /= 2u;
1719 }
1720 
1721 static struct dm_cache_policy *__smq_create(dm_cblock_t cache_size,
1722 					    sector_t origin_size,
1723 					    sector_t cache_block_size,
1724 					    bool mimic_mq,
1725 					    bool migrations_allowed)
1726 {
1727 	unsigned i;
1728 	unsigned nr_sentinels_per_queue = 2u * NR_CACHE_LEVELS;
1729 	unsigned total_sentinels = 2u * nr_sentinels_per_queue;
1730 	struct smq_policy *mq = kzalloc(sizeof(*mq), GFP_KERNEL);
1731 
1732 	if (!mq)
1733 		return NULL;
1734 
1735 	init_policy_functions(mq, mimic_mq);
1736 	mq->cache_size = cache_size;
1737 	mq->cache_block_size = cache_block_size;
1738 
1739 	calc_hotspot_params(origin_size, cache_block_size, from_cblock(cache_size),
1740 			    &mq->hotspot_block_size, &mq->nr_hotspot_blocks);
1741 
1742 	mq->cache_blocks_per_hotspot_block = div64_u64(mq->hotspot_block_size, mq->cache_block_size);
1743 	mq->hotspot_level_jump = 1u;
1744 	if (space_init(&mq->es, total_sentinels + mq->nr_hotspot_blocks + from_cblock(cache_size))) {
1745 		DMERR("couldn't initialize entry space");
1746 		goto bad_pool_init;
1747 	}
1748 
1749 	init_allocator(&mq->writeback_sentinel_alloc, &mq->es, 0, nr_sentinels_per_queue);
1750 	for (i = 0; i < nr_sentinels_per_queue; i++)
1751 		get_entry(&mq->writeback_sentinel_alloc, i)->sentinel = true;
1752 
1753 	init_allocator(&mq->demote_sentinel_alloc, &mq->es, nr_sentinels_per_queue, total_sentinels);
1754 	for (i = 0; i < nr_sentinels_per_queue; i++)
1755 		get_entry(&mq->demote_sentinel_alloc, i)->sentinel = true;
1756 
1757 	init_allocator(&mq->hotspot_alloc, &mq->es, total_sentinels,
1758 		       total_sentinels + mq->nr_hotspot_blocks);
1759 
1760 	init_allocator(&mq->cache_alloc, &mq->es,
1761 		       total_sentinels + mq->nr_hotspot_blocks,
1762 		       total_sentinels + mq->nr_hotspot_blocks + from_cblock(cache_size));
1763 
1764 	mq->hotspot_hit_bits = alloc_bitset(mq->nr_hotspot_blocks);
1765 	if (!mq->hotspot_hit_bits) {
1766 		DMERR("couldn't allocate hotspot hit bitset");
1767 		goto bad_hotspot_hit_bits;
1768 	}
1769 	clear_bitset(mq->hotspot_hit_bits, mq->nr_hotspot_blocks);
1770 
1771 	if (from_cblock(cache_size)) {
1772 		mq->cache_hit_bits = alloc_bitset(from_cblock(cache_size));
1773 		if (!mq->cache_hit_bits) {
1774 			DMERR("couldn't allocate cache hit bitset");
1775 			goto bad_cache_hit_bits;
1776 		}
1777 		clear_bitset(mq->cache_hit_bits, from_cblock(mq->cache_size));
1778 	} else
1779 		mq->cache_hit_bits = NULL;
1780 
1781 	mq->tick = 0;
1782 	spin_lock_init(&mq->lock);
1783 
1784 	q_init(&mq->hotspot, &mq->es, NR_HOTSPOT_LEVELS);
1785 	mq->hotspot.nr_top_levels = 8;
1786 	mq->hotspot.nr_in_top_levels = min(mq->nr_hotspot_blocks / NR_HOTSPOT_LEVELS,
1787 					   from_cblock(mq->cache_size) / mq->cache_blocks_per_hotspot_block);
1788 
1789 	q_init(&mq->clean, &mq->es, NR_CACHE_LEVELS);
1790 	q_init(&mq->dirty, &mq->es, NR_CACHE_LEVELS);
1791 
1792 	stats_init(&mq->hotspot_stats, NR_HOTSPOT_LEVELS);
1793 	stats_init(&mq->cache_stats, NR_CACHE_LEVELS);
1794 
1795 	if (h_init(&mq->table, &mq->es, from_cblock(cache_size)))
1796 		goto bad_alloc_table;
1797 
1798 	if (h_init(&mq->hotspot_table, &mq->es, mq->nr_hotspot_blocks))
1799 		goto bad_alloc_hotspot_table;
1800 
1801 	sentinels_init(mq);
1802 	mq->write_promote_level = mq->read_promote_level = NR_HOTSPOT_LEVELS;
1803 
1804 	mq->next_hotspot_period = jiffies;
1805 	mq->next_cache_period = jiffies;
1806 
1807 	mq->bg_work = btracker_create(4096); /* FIXME: hard coded value */
1808 	if (!mq->bg_work)
1809 		goto bad_btracker;
1810 
1811 	mq->migrations_allowed = migrations_allowed;
1812 
1813 	return &mq->policy;
1814 
1815 bad_btracker:
1816 	h_exit(&mq->hotspot_table);
1817 bad_alloc_hotspot_table:
1818 	h_exit(&mq->table);
1819 bad_alloc_table:
1820 	free_bitset(mq->cache_hit_bits);
1821 bad_cache_hit_bits:
1822 	free_bitset(mq->hotspot_hit_bits);
1823 bad_hotspot_hit_bits:
1824 	space_exit(&mq->es);
1825 bad_pool_init:
1826 	kfree(mq);
1827 
1828 	return NULL;
1829 }
1830 
1831 static struct dm_cache_policy *smq_create(dm_cblock_t cache_size,
1832 					  sector_t origin_size,
1833 					  sector_t cache_block_size)
1834 {
1835 	return __smq_create(cache_size, origin_size, cache_block_size, false, true);
1836 }
1837 
1838 static struct dm_cache_policy *mq_create(dm_cblock_t cache_size,
1839 					 sector_t origin_size,
1840 					 sector_t cache_block_size)
1841 {
1842 	return __smq_create(cache_size, origin_size, cache_block_size, true, true);
1843 }
1844 
1845 static struct dm_cache_policy *cleaner_create(dm_cblock_t cache_size,
1846 					      sector_t origin_size,
1847 					      sector_t cache_block_size)
1848 {
1849 	return __smq_create(cache_size, origin_size, cache_block_size, false, false);
1850 }
1851 
1852 /*----------------------------------------------------------------*/
1853 
1854 static struct dm_cache_policy_type smq_policy_type = {
1855 	.name = "smq",
1856 	.version = {2, 0, 0},
1857 	.hint_size = 4,
1858 	.owner = THIS_MODULE,
1859 	.create = smq_create
1860 };
1861 
1862 static struct dm_cache_policy_type mq_policy_type = {
1863 	.name = "mq",
1864 	.version = {2, 0, 0},
1865 	.hint_size = 4,
1866 	.owner = THIS_MODULE,
1867 	.create = mq_create,
1868 };
1869 
1870 static struct dm_cache_policy_type cleaner_policy_type = {
1871 	.name = "cleaner",
1872 	.version = {2, 0, 0},
1873 	.hint_size = 4,
1874 	.owner = THIS_MODULE,
1875 	.create = cleaner_create,
1876 };
1877 
1878 static struct dm_cache_policy_type default_policy_type = {
1879 	.name = "default",
1880 	.version = {2, 0, 0},
1881 	.hint_size = 4,
1882 	.owner = THIS_MODULE,
1883 	.create = smq_create,
1884 	.real = &smq_policy_type
1885 };
1886 
1887 static int __init smq_init(void)
1888 {
1889 	int r;
1890 
1891 	r = dm_cache_policy_register(&smq_policy_type);
1892 	if (r) {
1893 		DMERR("register failed %d", r);
1894 		return -ENOMEM;
1895 	}
1896 
1897 	r = dm_cache_policy_register(&mq_policy_type);
1898 	if (r) {
1899 		DMERR("register failed (as mq) %d", r);
1900 		goto out_mq;
1901 	}
1902 
1903 	r = dm_cache_policy_register(&cleaner_policy_type);
1904 	if (r) {
1905 		DMERR("register failed (as cleaner) %d", r);
1906 		goto out_cleaner;
1907 	}
1908 
1909 	r = dm_cache_policy_register(&default_policy_type);
1910 	if (r) {
1911 		DMERR("register failed (as default) %d", r);
1912 		goto out_default;
1913 	}
1914 
1915 	return 0;
1916 
1917 out_default:
1918 	dm_cache_policy_unregister(&cleaner_policy_type);
1919 out_cleaner:
1920 	dm_cache_policy_unregister(&mq_policy_type);
1921 out_mq:
1922 	dm_cache_policy_unregister(&smq_policy_type);
1923 
1924 	return -ENOMEM;
1925 }
1926 
1927 static void __exit smq_exit(void)
1928 {
1929 	dm_cache_policy_unregister(&cleaner_policy_type);
1930 	dm_cache_policy_unregister(&smq_policy_type);
1931 	dm_cache_policy_unregister(&mq_policy_type);
1932 	dm_cache_policy_unregister(&default_policy_type);
1933 }
1934 
1935 module_init(smq_init);
1936 module_exit(smq_exit);
1937 
1938 MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
1939 MODULE_LICENSE("GPL");
1940 MODULE_DESCRIPTION("smq cache policy");
1941 
1942 MODULE_ALIAS("dm-cache-default");
1943 MODULE_ALIAS("dm-cache-mq");
1944 MODULE_ALIAS("dm-cache-cleaner");
1945