// SPDX-License-Identifier: GPL-2.0
/*
 * Primary bucket allocation code
 *
 * Copyright 2012 Google, Inc.
 *
 * Allocation in bcache is done in terms of buckets:
 *
 * Each bucket has an associated 8 bit gen; this gen corresponds to the gen in
 * btree pointers - they must match for the pointer to be considered valid.
 *
 * Thus (assuming a bucket has no dirty data or metadata in it) we can reuse a
 * bucket simply by incrementing its gen.
 *
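 * (For example: if a bucket is at gen 12 and every btree pointer into it is
 * also at gen 12, bumping the bucket to gen 13 makes all of those pointers
 * stale - they fail the gen comparison and are treated as invalid.)
 *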
 * The gens (along with the priorities; it's really the gens that are
 * important, but the code is named as if it's the priorities) are written in
 * an arbitrary list of buckets on disk, with a pointer to them in the journal
 * header.
 *
 * When we invalidate a bucket, we have to write its new gen to disk and wait
 * for that write to complete before we use it - otherwise after a crash we
 * could have pointers that appeared to be good but pointed to data that had
 * been overwritten.
 *
 * Since the gens and priorities are all stored contiguously on disk, we can
 * batch this up: We fill up the free_inc list with freshly invalidated buckets,
 * call prio_write(), and when prio_write() finishes we pull buckets off the
 * free_inc list and optionally discard them.
 *
 * free_inc isn't the only freelist - if it was, we'd often have to sleep while
 * priorities and gens were being written before we could allocate. c->free is a
 * smaller freelist, and buckets on that list are always ready to be used.
 *
 * If we've got discards enabled, that happens when a bucket moves from the
 * free_inc list to the free list.
 *
 * There is another freelist, because sometimes we have buckets that we know
 * have nothing pointing into them - these we can reuse without waiting for
 * priorities to be rewritten. These come from freed btree nodes and buckets
 * that garbage collection discovered no longer had valid keys pointing into
 * them (because they were overwritten). That's the unused list - buckets on the
 * unused list move to the free list, optionally being discarded in the process.
 *
 * It's also important to ensure that gens don't wrap around - with respect to
 * either the oldest gen in the btree or the gen on disk. This is quite
 * difficult to do in practice, but we explicitly guard against it anyway - if
 * a bucket is in danger of wrapping around we simply skip invalidating it that
 * time around, and we garbage collect or rewrite the priorities sooner than we
 * would have otherwise.
 *
 * bch_bucket_alloc() allocates a single bucket from a specific cache.
 *
 * bch_bucket_alloc_set() allocates one bucket from different caches
 * out of a cache set.
 *
 * free_some_buckets() drives all the processes described above. It's called
 * from bch_bucket_alloc() and a few other places that need to make sure free
 * buckets are ready.
 *
 * invalidate_buckets_(lru|fifo)() find buckets that are available to be
 * invalidated, and then invalidate them and stick them on the free_inc list -
 * in either lru or fifo order.
 */

#include "bcache.h"
#include "btree.h"

#include <linux/blkdev.h>
#include <linux/kthread.h>
#include <linux/random.h>
#include <trace/events/bcache.h>

#define MAX_OPEN_BUCKETS 128

/* Bucket heap / gen */

uint8_t bch_inc_gen(struct cache *ca, struct bucket *b)
{
	uint8_t ret = ++b->gen;

	ca->set->need_gc = max(ca->set->need_gc, bucket_gc_gen(b));
	WARN_ON_ONCE(ca->set->need_gc > BUCKET_GC_GEN_MAX);

	return ret;
}

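/*
 * bch_rescale_priorities() ages bucket priorities as IO happens: once roughly
 * nbuckets * bucket_size / 1024 sectors have been read or written, every
 * unpinned data bucket with a nonzero, non-btree prio has its prio
 * decremented, so recently written buckets stay "hot" relative to ones that
 * haven't been touched in a while.
 */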
void bch_rescale_priorities(struct cache_set *c, int sectors)
{
	struct cache *ca;
	struct bucket *b;
	unsigned long next = c->nbuckets * c->cache->sb.bucket_size / 1024;
	int r;

	atomic_sub(sectors, &c->rescale);

	do {
		r = atomic_read(&c->rescale);

		if (r >= 0)
			return;
	} while (atomic_cmpxchg(&c->rescale, r, r + next) != r);

	mutex_lock(&c->bucket_lock);

	c->min_prio = USHRT_MAX;

	ca = c->cache;
	for_each_bucket(b, ca)
		if (b->prio &&
		    b->prio != BTREE_PRIO &&
		    !atomic_read(&b->pin)) {
			b->prio--;
			c->min_prio = min(c->min_prio, b->prio);
		}

	mutex_unlock(&c->bucket_lock);
}

/*
 * Background allocation thread: scans for buckets to be invalidated,
 * invalidates them, rewrites prios/gens (marking them as invalidated on disk),
 * then optionally issues discard commands to the newly free buckets, then puts
 * them on the various freelists.
 */

static inline bool can_inc_bucket_gen(struct bucket *b)
{
	return bucket_gc_gen(b) < BUCKET_GC_GEN_MAX;
}

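/*
 * A bucket may be invalidated only if garbage collection saw nothing live in
 * it (unmarked or GC_MARK_RECLAIMABLE), nothing currently holds a pin on it,
 * and incrementing its gen wouldn't risk wrapping past a gen that is still
 * referenced.
 */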
bool bch_can_invalidate_bucket(struct cache *ca, struct bucket *b)
{
	BUG_ON(!ca->set->gc_mark_valid);

	return (!GC_MARK(b) ||
		GC_MARK(b) == GC_MARK_RECLAIMABLE) &&
		!atomic_read(&b->pin) &&
		can_inc_bucket_gen(b);
}

void __bch_invalidate_one_bucket(struct cache *ca, struct bucket *b)
{
	lockdep_assert_held(&ca->set->bucket_lock);
	BUG_ON(GC_MARK(b) && GC_MARK(b) != GC_MARK_RECLAIMABLE);

	if (GC_SECTORS_USED(b))
		trace_bcache_invalidate(ca, b - ca->buckets);

	bch_inc_gen(ca, b);
	b->prio = INITIAL_PRIO;
	atomic_inc(&b->pin);
}

static void bch_invalidate_one_bucket(struct cache *ca, struct bucket *b)
{
	__bch_invalidate_one_bucket(ca, b);

	fifo_push(&ca->free_inc, b - ca->buckets);
}

/*
 * Determines what order we're going to reuse buckets, smallest bucket_prio()
 * first: we also take into account the number of sectors of live data in that
 * bucket, and in order for that multiply to make sense we have to scale the
 * bucket priorities.
 *
 * Thus, we scale them so that the bucket with the smallest prio is worth
 * 1/8th of what INITIAL_PRIO is worth.
 */

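/*
 * Worked example (assuming INITIAL_PRIO is 32768 and min_prio is currently 0):
 * the min_prio term below is 32768 / 8 = 4096. Without it, a bucket at the
 * minimum prio would score 0 no matter how much live data it held; with it, a
 * cold bucket holding 512 live sectors scores 4096 * 512 while a warmer bucket
 * (prio 28672) holding only 8 live sectors scores 32768 * 8, so the nearly
 * empty bucket is still reused first.
 */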
#define bucket_prio(b)							\
({									\
	unsigned int min_prio = (INITIAL_PRIO - ca->set->min_prio) / 8;	\
									\
	(b->prio - ca->set->min_prio + min_prio) * GC_SECTORS_USED(b);	\
})

#define bucket_max_cmp(l, r)	(bucket_prio(l) < bucket_prio(r))
#define bucket_min_cmp(l, r)	(bucket_prio(l) > bucket_prio(r))

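/*
 * invalidate_buckets_lru() collects the invalidatable buckets with the
 * smallest bucket_prio() into a fixed-size heap, then invalidates them in
 * ascending prio order until free_inc is full; if it runs out of candidates
 * it asks garbage collection to run so more buckets become reclaimable.
 */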
static void invalidate_buckets_lru(struct cache *ca)
{
	struct bucket *b;
	ssize_t i;

	ca->heap.used = 0;

	for_each_bucket(b, ca) {
		if (!bch_can_invalidate_bucket(ca, b))
			continue;

		if (!heap_full(&ca->heap))
			heap_add(&ca->heap, b, bucket_max_cmp);
		else if (bucket_max_cmp(b, heap_peek(&ca->heap))) {
			ca->heap.data[0] = b;
			heap_sift(&ca->heap, 0, bucket_max_cmp);
		}
	}

	for (i = ca->heap.used / 2 - 1; i >= 0; --i)
		heap_sift(&ca->heap, i, bucket_min_cmp);

	while (!fifo_full(&ca->free_inc)) {
		if (!heap_pop(&ca->heap, b, bucket_min_cmp)) {
			/*
			 * We don't want to be calling invalidate_buckets()
			 * multiple times when it can't do anything
			 */
			ca->invalidate_needs_gc = 1;
			wake_up_gc(ca->set);
			return;
		}

		bch_invalidate_one_bucket(ca, b);
	}
}

static void invalidate_buckets_fifo(struct cache *ca)
{
	struct bucket *b;
	size_t checked = 0;

	while (!fifo_full(&ca->free_inc)) {
		if (ca->fifo_last_bucket < ca->sb.first_bucket ||
		    ca->fifo_last_bucket >= ca->sb.nbuckets)
			ca->fifo_last_bucket = ca->sb.first_bucket;

		b = ca->buckets + ca->fifo_last_bucket++;

		if (bch_can_invalidate_bucket(ca, b))
			bch_invalidate_one_bucket(ca, b);

		if (++checked >= ca->sb.nbuckets) {
			ca->invalidate_needs_gc = 1;
			wake_up_gc(ca->set);
			return;
		}
	}
}

static void invalidate_buckets_random(struct cache *ca)
{
	struct bucket *b;
	size_t checked = 0;

	while (!fifo_full(&ca->free_inc)) {
		size_t n;

		get_random_bytes(&n, sizeof(n));

		n %= (size_t) (ca->sb.nbuckets - ca->sb.first_bucket);
		n += ca->sb.first_bucket;

		b = ca->buckets + n;

		if (bch_can_invalidate_bucket(ca, b))
			bch_invalidate_one_bucket(ca, b);

		if (++checked >= ca->sb.nbuckets / 2) {
			ca->invalidate_needs_gc = 1;
			wake_up_gc(ca->set);
			return;
		}
	}
}

static void invalidate_buckets(struct cache *ca)
{
	BUG_ON(ca->invalidate_needs_gc);

	switch (CACHE_REPLACEMENT(&ca->sb)) {
	case CACHE_REPLACEMENT_LRU:
		invalidate_buckets_lru(ca);
		break;
	case CACHE_REPLACEMENT_FIFO:
		invalidate_buckets_fifo(ca);
		break;
	case CACHE_REPLACEMENT_RANDOM:
		invalidate_buckets_random(ca);
		break;
	}
}

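/*
 * allocator_wait(ca, cond) sleeps with bucket_lock released until @cond is
 * true (re-evaluated with the lock held), jumping to the thread's exit path
 * if the allocator is being stopped or the cache set's IO has been disabled.
 */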
#define allocator_wait(ca, cond)					\
do {									\
	while (1) {							\
		set_current_state(TASK_INTERRUPTIBLE);			\
		if (cond)						\
			break;						\
									\
		mutex_unlock(&(ca)->set->bucket_lock);			\
		if (kthread_should_stop() ||				\
		    test_bit(CACHE_SET_IO_DISABLE, &ca->set->flags)) {	\
			set_current_state(TASK_RUNNING);		\
			goto out;					\
		}							\
									\
		schedule();						\
		mutex_lock(&(ca)->set->bucket_lock);			\
	}								\
	__set_current_state(TASK_RUNNING);				\
} while (0)

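/*
 * Push a newly freed bucket onto the first per-reserve freelist with room,
 * always trying RESERVE_PRIO first - running out of buckets for prios/gens
 * would stall every other allocation.
 */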
static int bch_allocator_push(struct cache *ca, long bucket)
{
	unsigned int i;

	/* Prios/gens are actually the most important reserve */
	if (fifo_push(&ca->free[RESERVE_PRIO], bucket))
		return true;

	for (i = 0; i < RESERVE_NR; i++)
		if (fifo_push(&ca->free[i], bucket))
			return true;

	return false;
}

static int bch_allocator_thread(void *arg)
{
	struct cache *ca = arg;

	mutex_lock(&ca->set->bucket_lock);

	while (1) {
		/*
		 * First, we pull buckets off of the unused and free_inc lists,
		 * possibly issue discards to them, then we add the bucket to
		 * the free list:
		 */
		while (1) {
			long bucket;

			if (!fifo_pop(&ca->free_inc, bucket))
				break;

			if (ca->discard) {
				mutex_unlock(&ca->set->bucket_lock);
				blkdev_issue_discard(ca->bdev,
					bucket_to_sector(ca->set, bucket),
					ca->sb.bucket_size, GFP_KERNEL);
				mutex_lock(&ca->set->bucket_lock);
			}

			allocator_wait(ca, bch_allocator_push(ca, bucket));
			wake_up(&ca->set->btree_cache_wait);
			wake_up(&ca->set->bucket_wait);
		}

		/*
		 * We've run out of free buckets, we need to find some buckets
		 * we can invalidate. First, invalidate them in memory and add
		 * them to the free_inc list:
		 */

retry_invalidate:
		allocator_wait(ca, ca->set->gc_mark_valid &&
			       !ca->invalidate_needs_gc);
		invalidate_buckets(ca);

		/*
		 * Now, we write their new gens to disk so we can start writing
		 * new stuff to them:
		 */
		allocator_wait(ca, !atomic_read(&ca->set->prio_blocked));
		if (CACHE_SYNC(&ca->sb)) {
			/*
			 * This could deadlock if an allocation with a btree
			 * node locked ever blocked - having the btree node
			 * locked would block garbage collection, but here we're
			 * waiting on garbage collection before we invalidate
			 * and free anything.
			 *
			 * But this should be safe since the btree code always
			 * uses btree_check_reserve() before allocating now, and
			 * if it fails it blocks without btree nodes locked.
			 */
			if (!fifo_full(&ca->free_inc))
				goto retry_invalidate;

			if (bch_prio_write(ca, false) < 0) {
				ca->invalidate_needs_gc = 1;
				wake_up_gc(ca->set);
			}
		}
	}
out:
	wait_for_kthread_stop();
	return 0;
}

/* Allocation */

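/*
 * bch_bucket_alloc() tries the RESERVE_NONE and the requested reserve
 * freelists first; if both are empty and @wait is set, it sleeps on
 * bucket_wait (dropping bucket_lock) until the allocator thread refills them,
 * then initializes the bucket's mark and prio for metadata or data use and
 * accounts it as no longer available.
 */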
long bch_bucket_alloc(struct cache *ca, unsigned int reserve, bool wait)
{
	DEFINE_WAIT(w);
	struct bucket *b;
	long r;

	/* No allocation if CACHE_SET_IO_DISABLE bit is set */
	if (unlikely(test_bit(CACHE_SET_IO_DISABLE, &ca->set->flags)))
		return -1;

	/* fastpath */
	if (fifo_pop(&ca->free[RESERVE_NONE], r) ||
	    fifo_pop(&ca->free[reserve], r))
		goto out;

	if (!wait) {
		trace_bcache_alloc_fail(ca, reserve);
		return -1;
	}

	do {
		prepare_to_wait(&ca->set->bucket_wait, &w,
				TASK_UNINTERRUPTIBLE);

		mutex_unlock(&ca->set->bucket_lock);
		schedule();
		mutex_lock(&ca->set->bucket_lock);
	} while (!fifo_pop(&ca->free[RESERVE_NONE], r) &&
		 !fifo_pop(&ca->free[reserve], r));

	finish_wait(&ca->set->bucket_wait, &w);
out:
	if (ca->alloc_thread)
		wake_up_process(ca->alloc_thread);

	trace_bcache_alloc(ca, reserve);

	if (expensive_debug_checks(ca->set)) {
		size_t iter;
		long i;
		unsigned int j;

		for (iter = 0; iter < prio_buckets(ca) * 2; iter++)
			BUG_ON(ca->prio_buckets[iter] == (uint64_t) r);

		for (j = 0; j < RESERVE_NR; j++)
			fifo_for_each(i, &ca->free[j], iter)
				BUG_ON(i == r);
		fifo_for_each(i, &ca->free_inc, iter)
			BUG_ON(i == r);
	}

	b = ca->buckets + r;

	BUG_ON(atomic_read(&b->pin) != 1);

	SET_GC_SECTORS_USED(b, ca->sb.bucket_size);

	if (reserve <= RESERVE_PRIO) {
		SET_GC_MARK(b, GC_MARK_METADATA);
		SET_GC_MOVE(b, 0);
		b->prio = BTREE_PRIO;
	} else {
		SET_GC_MARK(b, GC_MARK_RECLAIMABLE);
		SET_GC_MOVE(b, 0);
		b->prio = INITIAL_PRIO;
	}

	if (ca->set->avail_nbuckets > 0) {
		ca->set->avail_nbuckets--;
		bch_update_bucket_in_use(ca->set, &ca->set->gc_stats);
	}

	return r;
}

void __bch_bucket_free(struct cache *ca, struct bucket *b)
{
	SET_GC_MARK(b, 0);
	SET_GC_SECTORS_USED(b, 0);

	if (ca->set->avail_nbuckets < ca->set->nbuckets) {
		ca->set->avail_nbuckets++;
		bch_update_bucket_in_use(ca->set, &ca->set->gc_stats);
	}
}

void bch_bucket_free(struct cache_set *c, struct bkey *k)
{
	unsigned int i;

	for (i = 0; i < KEY_PTRS(k); i++)
		__bch_bucket_free(c->cache, PTR_BUCKET(c, k, i));
}

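/*
 * __bch_bucket_alloc_set() allocates one bucket from the cache set's cache
 * and initializes @k as a single-pointer key referencing it; the caller must
 * hold bucket_lock. Returns 0 on success, -1 on failure.
 */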
int __bch_bucket_alloc_set(struct cache_set *c, unsigned int reserve,
			   struct bkey *k, bool wait)
{
	struct cache *ca;
	long b;

	/* No allocation if CACHE_SET_IO_DISABLE bit is set */
	if (unlikely(test_bit(CACHE_SET_IO_DISABLE, &c->flags)))
		return -1;

	lockdep_assert_held(&c->bucket_lock);

	bkey_init(k);

	ca = c->cache;
	b = bch_bucket_alloc(ca, reserve, wait);
	if (b == -1)
		goto err;

	k->ptr[0] = MAKE_PTR(ca->buckets[b].gen,
			     bucket_to_sector(c, b),
			     ca->sb.nr_this_dev);

	SET_KEY_PTRS(k, 1);

	return 0;
err:
	bch_bucket_free(c, k);
	bkey_put(c, k);
	return -1;
}

int bch_bucket_alloc_set(struct cache_set *c, unsigned int reserve,
			 struct bkey *k, bool wait)
{
	int ret;

	mutex_lock(&c->bucket_lock);
	ret = __bch_bucket_alloc_set(c, reserve, k, wait);
	mutex_unlock(&c->bucket_lock);
	return ret;
}

/* Sector allocator */

struct open_bucket {
	struct list_head	list;
	unsigned int		last_write_point;
	unsigned int		sectors_free;
	BKEY_PADDED(key);
};

/*
 * We keep multiple buckets open for writes, and try to segregate different
 * write streams for better cache utilization: first we try to segregate flash
 * only volume write streams from cached devices, secondly we look for a bucket
 * where the last write to it was sequential with the current write, and
 * failing that we look for a bucket that was last used by the same task.
 *
 * The idea is that if you've got multiple tasks pulling data into the cache at
 * the same time, you'll get better cache utilization if you try to segregate
 * their data and preserve locality.
 *
 * For example, dirty sectors of a flash only volume are not reclaimable; if
 * they are mixed into a bucket with dirty sectors of a cached device, that
 * bucket stays marked dirty and won't be reclaimed even after the cached
 * device's dirty data has been written back to the backing device.
 *
 * And say you've started Firefox at the same time you're copying a
 * bunch of files. Firefox will likely end up being fairly hot and stay in the
 * cache awhile, but the data you copied might not be; if you wrote all that
 * data to the same buckets it'd get invalidated at the same time.
 *
 * Both of those tasks will be doing fairly random IO so we can't rely on
 * detecting sequential IO to segregate their data, but going off of the task
 * should be a sane heuristic.
 */
static struct open_bucket *pick_data_bucket(struct cache_set *c,
					    const struct bkey *search,
					    unsigned int write_point,
					    struct bkey *alloc)
{
	struct open_bucket *ret, *ret_task = NULL;

	list_for_each_entry_reverse(ret, &c->data_buckets, list)
		if (UUID_FLASH_ONLY(&c->uuids[KEY_INODE(&ret->key)]) !=
		    UUID_FLASH_ONLY(&c->uuids[KEY_INODE(search)]))
			continue;
		else if (!bkey_cmp(&ret->key, search))
			goto found;
		else if (ret->last_write_point == write_point)
			ret_task = ret;

	ret = ret_task ?: list_first_entry(&c->data_buckets,
					   struct open_bucket, list);
found:
	if (!ret->sectors_free && KEY_PTRS(alloc)) {
		ret->sectors_free = c->cache->sb.bucket_size;
		bkey_copy(&ret->key, alloc);
		bkey_init(alloc);
	}

	if (!ret->sectors_free)
		ret = NULL;

	return ret;
}

/*
 * Allocates some space in the cache to write to, and updates k to point to the
 * newly allocated space; KEY_SIZE(k) and KEY_OFFSET(k) are updated to describe
 * the end of the newly allocated space.
 *
 * May allocate fewer sectors than @sectors; KEY_SIZE(k) indicates how many
 * sectors were actually allocated.
 *
 * If @wait is true, this will only fail if the cache set's IO has been
 * disabled.
 */
bool bch_alloc_sectors(struct cache_set *c,
		       struct bkey *k,
		       unsigned int sectors,
		       unsigned int write_point,
		       unsigned int write_prio,
		       bool wait)
{
	struct open_bucket *b;
	BKEY_PADDED(key) alloc;
	unsigned int i;

	/*
	 * We might have to allocate a new bucket, which we can't do with a
	 * spinlock held. So if we have to allocate, we drop the lock, allocate
	 * and then retry. KEY_PTRS() indicates whether alloc points to
	 * allocated bucket(s).
	 */

	bkey_init(&alloc.key);
	spin_lock(&c->data_bucket_lock);

	while (!(b = pick_data_bucket(c, k, write_point, &alloc.key))) {
		unsigned int watermark = write_prio
			? RESERVE_MOVINGGC
			: RESERVE_NONE;

		spin_unlock(&c->data_bucket_lock);

		if (bch_bucket_alloc_set(c, watermark, &alloc.key, wait))
			return false;

		spin_lock(&c->data_bucket_lock);
	}

	/*
	 * If we had to allocate, we might race and not need to allocate the
	 * second time we call pick_data_bucket(). If we allocated a bucket but
	 * didn't use it, drop the refcount bch_bucket_alloc_set() took:
	 */
	if (KEY_PTRS(&alloc.key))
		bkey_put(c, &alloc.key);

	for (i = 0; i < KEY_PTRS(&b->key); i++)
		EBUG_ON(ptr_stale(c, &b->key, i));

	/* Set up the pointer to the space we're allocating: */

	for (i = 0; i < KEY_PTRS(&b->key); i++)
		k->ptr[i] = b->key.ptr[i];

	sectors = min(sectors, b->sectors_free);

	SET_KEY_OFFSET(k, KEY_OFFSET(k) + sectors);
	SET_KEY_SIZE(k, sectors);
	SET_KEY_PTRS(k, KEY_PTRS(&b->key));

	/*
	 * Move b to the end of the lru, and keep track of what this bucket was
	 * last used for:
	 */
	list_move_tail(&b->list, &c->data_buckets);
	bkey_copy_key(&b->key, k);
	b->last_write_point = write_point;

	b->sectors_free	-= sectors;

	for (i = 0; i < KEY_PTRS(&b->key); i++) {
		SET_PTR_OFFSET(&b->key, i, PTR_OFFSET(&b->key, i) + sectors);

		atomic_long_add(sectors,
				&c->cache->sectors_written);
	}

	if (b->sectors_free < c->cache->sb.block_size)
		b->sectors_free = 0;

	/*
	 * k takes refcounts on the buckets it points to until it's inserted
	 * into the btree, but if we're done with this bucket we just transfer
	 * get_data_bucket()'s refcount.
	 */
	if (b->sectors_free)
		for (i = 0; i < KEY_PTRS(&b->key); i++)
			atomic_inc(&PTR_BUCKET(c, &b->key, i)->pin);

	spin_unlock(&c->data_bucket_lock);
	return true;
}

/* Init */

void bch_open_buckets_free(struct cache_set *c)
{
	struct open_bucket *b;

	while (!list_empty(&c->data_buckets)) {
		b = list_first_entry(&c->data_buckets,
				     struct open_bucket, list);
		list_del(&b->list);
		kfree(b);
	}
}

int bch_open_buckets_alloc(struct cache_set *c)
{
	int i;

	spin_lock_init(&c->data_bucket_lock);

	for (i = 0; i < MAX_OPEN_BUCKETS; i++) {
		struct open_bucket *b = kzalloc(sizeof(*b), GFP_KERNEL);

		if (!b)
			return -ENOMEM;

		list_add(&b->list, &c->data_buckets);
	}

	return 0;
}

int bch_cache_allocator_start(struct cache *ca)
{
	struct task_struct *k = kthread_run(bch_allocator_thread,
					    ca, "bcache_allocator");
	if (IS_ERR(k))
		return PTR_ERR(k);

	ca->alloc_thread = k;
	return 0;
}