xref: /openbmc/linux/fs/btrfs/discard.c (revision 6e9b7cd6)
1 // SPDX-License-Identifier: GPL-2.0
2 
3 #include <linux/jiffies.h>
4 #include <linux/kernel.h>
5 #include <linux/ktime.h>
6 #include <linux/list.h>
7 #include <linux/math64.h>
8 #include <linux/sizes.h>
9 #include <linux/workqueue.h>
10 #include "ctree.h"
11 #include "block-group.h"
12 #include "discard.h"
13 #include "free-space-cache.h"
14 #include "fs.h"
15 
16 /*
17  * This contains the logic to handle async discard.
18  *
19  * Async discard manages trimming of free space outside of transaction commit.
20  * Discarding is done by managing the block_groups on a LRU list based on free
21  * space recency.  Two passes are used to first prioritize discarding extents
22  * and then allow for trimming in the bitmap the best opportunity to coalesce.
23  * The block_groups are maintained on multiple lists to allow for multiple
24  * passes with different discard filter requirements.  A delayed work item is
25  * used to manage discarding with timeout determined by a max of the delay
26  * incurred by the iops rate limit, the byte rate limit, and the max delay of
27  * BTRFS_DISCARD_MAX_DELAY.
28  *
29  * Note, this only keeps track of block_groups that are explicitly for data.
30  * Mixed block_groups are not supported.
31  *
32  * The first list is special to manage discarding of fully free block groups.
33  * This is necessary because we issue a final trim for a full free block group
34  * after forgetting it.  When a block group becomes unused, instead of directly
35  * being added to the unused_bgs list, we add it to this first list.  Then
36  * from there, if it becomes fully discarded, we place it onto the unused_bgs
37  * list.
38  *
39  * The in-memory free space cache serves as the backing state for discard.
40  * Consequently this means there is no persistence.  We opt to load all the
41  * block groups in as not discarded, so the mount case degenerates to the
42  * crashing case.
43  *
44  * As the free space cache uses bitmaps, there exists a tradeoff between
45  * ease/efficiency for find_free_extent() and the accuracy of discard state.
46  * Here we opt to let untrimmed regions merge with everything while only letting
47  * trimmed regions merge with other trimmed regions.  This can cause
48  * overtrimming, but the coalescing benefit seems to be worth it.  Additionally,
49  * bitmap state is tracked as a whole.  If we're able to fully trim a bitmap,
50  * the trimmed flag is set on the bitmap.  Otherwise, if an allocation comes in,
51  * this resets the state and we will retry trimming the whole bitmap.  This is a
52  * tradeoff between discard state accuracy and the cost of accounting.
53  */
54 
/*
 * Initial delays, in nanoseconds, before a freshly queued block group becomes
 * eligible for discard, giving its free space some chance of being reused.
 * BTRFS_DISCARD_DELAY is applied when queueing onto the regular discard lists,
 * BTRFS_DISCARD_UNUSED_DELAY when queueing onto the unused list.
 */
#define BTRFS_DISCARD_DELAY		(120ULL * NSEC_PER_SEC)
#define BTRFS_DISCARD_UNUSED_DELAY	(10ULL * NSEC_PER_SEC)

/* Bounds, in milliseconds, for the base delay between two discards */
#define BTRFS_DISCARD_MIN_DELAY_MSEC	1UL
#define BTRFS_DISCARD_MAX_DELAY_MSEC	1000UL

/* iops limit installed by btrfs_discard_init() */
#define BTRFS_DISCARD_MAX_IOPS		1000U
62 
63 /* Monotonically decreasing minimum length filters after index 0 */
64 static int discard_minlen[BTRFS_NR_DISCARD_LISTS] = {
65 	0,
66 	BTRFS_ASYNC_DISCARD_MAX_FILTER,
67 	BTRFS_ASYNC_DISCARD_MIN_FILTER
68 };
69 
70 static struct list_head *get_discard_list(struct btrfs_discard_ctl *discard_ctl,
71 					  struct btrfs_block_group *block_group)
72 {
73 	return &discard_ctl->discard_list[block_group->discard_index];
74 }
75 
76 static void __add_to_discard_list(struct btrfs_discard_ctl *discard_ctl,
77 				  struct btrfs_block_group *block_group)
78 {
79 	lockdep_assert_held(&discard_ctl->lock);
80 	if (!btrfs_run_discard_work(discard_ctl))
81 		return;
82 
83 	if (list_empty(&block_group->discard_list) ||
84 	    block_group->discard_index == BTRFS_DISCARD_INDEX_UNUSED) {
85 		if (block_group->discard_index == BTRFS_DISCARD_INDEX_UNUSED)
86 			block_group->discard_index = BTRFS_DISCARD_INDEX_START;
87 		block_group->discard_eligible_time = (ktime_get_ns() +
88 						      BTRFS_DISCARD_DELAY);
89 		block_group->discard_state = BTRFS_DISCARD_RESET_CURSOR;
90 	}
91 	if (list_empty(&block_group->discard_list))
92 		btrfs_get_block_group(block_group);
93 
94 	list_move_tail(&block_group->discard_list,
95 		       get_discard_list(discard_ctl, block_group));
96 }
97 
98 static void add_to_discard_list(struct btrfs_discard_ctl *discard_ctl,
99 				struct btrfs_block_group *block_group)
100 {
101 	if (!btrfs_is_block_group_data_only(block_group))
102 		return;
103 
104 	spin_lock(&discard_ctl->lock);
105 	__add_to_discard_list(discard_ctl, block_group);
106 	spin_unlock(&discard_ctl->lock);
107 }
108 
109 static void add_to_discard_unused_list(struct btrfs_discard_ctl *discard_ctl,
110 				       struct btrfs_block_group *block_group)
111 {
112 	bool queued;
113 
114 	spin_lock(&discard_ctl->lock);
115 
116 	queued = !list_empty(&block_group->discard_list);
117 
118 	if (!btrfs_run_discard_work(discard_ctl)) {
119 		spin_unlock(&discard_ctl->lock);
120 		return;
121 	}
122 
123 	list_del_init(&block_group->discard_list);
124 
125 	block_group->discard_index = BTRFS_DISCARD_INDEX_UNUSED;
126 	block_group->discard_eligible_time = (ktime_get_ns() +
127 					      BTRFS_DISCARD_UNUSED_DELAY);
128 	block_group->discard_state = BTRFS_DISCARD_RESET_CURSOR;
129 	if (!queued)
130 		btrfs_get_block_group(block_group);
131 	list_add_tail(&block_group->discard_list,
132 		      &discard_ctl->discard_list[BTRFS_DISCARD_INDEX_UNUSED]);
133 
134 	spin_unlock(&discard_ctl->lock);
135 }
136 
137 static bool remove_from_discard_list(struct btrfs_discard_ctl *discard_ctl,
138 				     struct btrfs_block_group *block_group)
139 {
140 	bool running = false;
141 	bool queued = false;
142 
143 	spin_lock(&discard_ctl->lock);
144 
145 	if (block_group == discard_ctl->block_group) {
146 		running = true;
147 		discard_ctl->block_group = NULL;
148 	}
149 
150 	block_group->discard_eligible_time = 0;
151 	queued = !list_empty(&block_group->discard_list);
152 	list_del_init(&block_group->discard_list);
153 	/*
154 	 * If the block group is currently running in the discard workfn, we
155 	 * don't want to deref it, since it's still being used by the workfn.
156 	 * The workfn will notice this case and deref the block group when it is
157 	 * finished.
158 	 */
159 	if (queued && !running)
160 		btrfs_put_block_group(block_group);
161 
162 	spin_unlock(&discard_ctl->lock);
163 
164 	return running;
165 }
166 
167 /*
168  * Find block_group that's up next for discarding.
169  *
170  * @discard_ctl:  discard control
171  * @now:          current time
172  *
173  * Iterate over the discard lists to find the next block_group up for
174  * discarding checking the discard_eligible_time of block_group.
175  */
176 static struct btrfs_block_group *find_next_block_group(
177 					struct btrfs_discard_ctl *discard_ctl,
178 					u64 now)
179 {
180 	struct btrfs_block_group *ret_block_group = NULL, *block_group;
181 	int i;
182 
183 	for (i = 0; i < BTRFS_NR_DISCARD_LISTS; i++) {
184 		struct list_head *discard_list = &discard_ctl->discard_list[i];
185 
186 		if (!list_empty(discard_list)) {
187 			block_group = list_first_entry(discard_list,
188 						       struct btrfs_block_group,
189 						       discard_list);
190 
191 			if (!ret_block_group)
192 				ret_block_group = block_group;
193 
194 			if (ret_block_group->discard_eligible_time < now)
195 				break;
196 
197 			if (ret_block_group->discard_eligible_time >
198 			    block_group->discard_eligible_time)
199 				ret_block_group = block_group;
200 		}
201 	}
202 
203 	return ret_block_group;
204 }
205 
206 /*
207  * Look up next block group and set it for use.
208  *
209  * @discard_ctl:   discard control
210  * @discard_state: the discard_state of the block_group after state management
211  * @discard_index: the discard_index of the block_group after state management
212  * @now:           time when discard was invoked, in ns
213  *
214  * Wrap find_next_block_group() and set the block_group to be in use.
215  * @discard_state's control flow is managed here.  Variables related to
216  * @discard_state are reset here as needed (eg. @discard_cursor).  @discard_state
217  * and @discard_index are remembered as it may change while we're discarding,
218  * but we want the discard to execute in the context determined here.
219  */
220 static struct btrfs_block_group *peek_discard_list(
221 					struct btrfs_discard_ctl *discard_ctl,
222 					enum btrfs_discard_state *discard_state,
223 					int *discard_index, u64 now)
224 {
225 	struct btrfs_block_group *block_group;
226 
227 	spin_lock(&discard_ctl->lock);
228 again:
229 	block_group = find_next_block_group(discard_ctl, now);
230 
231 	if (block_group && now >= block_group->discard_eligible_time) {
232 		if (block_group->discard_index == BTRFS_DISCARD_INDEX_UNUSED &&
233 		    block_group->used != 0) {
234 			if (btrfs_is_block_group_data_only(block_group)) {
235 				__add_to_discard_list(discard_ctl, block_group);
236 			} else {
237 				list_del_init(&block_group->discard_list);
238 				btrfs_put_block_group(block_group);
239 			}
240 			goto again;
241 		}
242 		if (block_group->discard_state == BTRFS_DISCARD_RESET_CURSOR) {
243 			block_group->discard_cursor = block_group->start;
244 			block_group->discard_state = BTRFS_DISCARD_EXTENTS;
245 		}
246 		discard_ctl->block_group = block_group;
247 	}
248 	if (block_group) {
249 		*discard_state = block_group->discard_state;
250 		*discard_index = block_group->discard_index;
251 	}
252 	spin_unlock(&discard_ctl->lock);
253 
254 	return block_group;
255 }
256 
257 /*
258  * Update a block group's filters.
259  *
260  * @block_group:  block group of interest
261  * @bytes:        recently freed region size after coalescing
262  *
263  * Async discard maintains multiple lists with progressively smaller filters
264  * to prioritize discarding based on size.  Should a free space that matches
265  * a larger filter be returned to the free_space_cache, prioritize that discard
266  * by moving @block_group to the proper filter.
267  */
268 void btrfs_discard_check_filter(struct btrfs_block_group *block_group,
269 				u64 bytes)
270 {
271 	struct btrfs_discard_ctl *discard_ctl;
272 
273 	if (!block_group ||
274 	    !btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC))
275 		return;
276 
277 	discard_ctl = &block_group->fs_info->discard_ctl;
278 
279 	if (block_group->discard_index > BTRFS_DISCARD_INDEX_START &&
280 	    bytes >= discard_minlen[block_group->discard_index - 1]) {
281 		int i;
282 
283 		remove_from_discard_list(discard_ctl, block_group);
284 
285 		for (i = BTRFS_DISCARD_INDEX_START; i < BTRFS_NR_DISCARD_LISTS;
286 		     i++) {
287 			if (bytes >= discard_minlen[i]) {
288 				block_group->discard_index = i;
289 				add_to_discard_list(discard_ctl, block_group);
290 				break;
291 			}
292 		}
293 	}
294 }
295 
296 /*
297  * Move a block group along the discard lists.
298  *
299  * @discard_ctl: discard control
300  * @block_group: block_group of interest
301  *
302  * Increment @block_group's discard_index.  If it falls of the list, let it be.
303  * Otherwise add it back to the appropriate list.
304  */
305 static void btrfs_update_discard_index(struct btrfs_discard_ctl *discard_ctl,
306 				       struct btrfs_block_group *block_group)
307 {
308 	block_group->discard_index++;
309 	if (block_group->discard_index == BTRFS_NR_DISCARD_LISTS) {
310 		block_group->discard_index = 1;
311 		return;
312 	}
313 
314 	add_to_discard_list(discard_ctl, block_group);
315 }
316 
317 /*
318  * Remove a block_group from the discard lists.
319  *
320  * @discard_ctl: discard control
321  * @block_group: block_group of interest
322  *
323  * Remove @block_group from the discard lists.  If necessary, wait on the
324  * current work and then reschedule the delayed work.
325  */
326 void btrfs_discard_cancel_work(struct btrfs_discard_ctl *discard_ctl,
327 			       struct btrfs_block_group *block_group)
328 {
329 	if (remove_from_discard_list(discard_ctl, block_group)) {
330 		cancel_delayed_work_sync(&discard_ctl->work);
331 		btrfs_discard_schedule_work(discard_ctl, true);
332 	}
333 }
334 
335 /*
336  * Handles queuing the block_groups.
337  *
338  * @discard_ctl: discard control
339  * @block_group: block_group of interest
340  *
341  * Maintain the LRU order of the discard lists.
342  */
343 void btrfs_discard_queue_work(struct btrfs_discard_ctl *discard_ctl,
344 			      struct btrfs_block_group *block_group)
345 {
346 	if (!block_group || !btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC))
347 		return;
348 
349 	if (block_group->used == 0)
350 		add_to_discard_unused_list(discard_ctl, block_group);
351 	else
352 		add_to_discard_list(discard_ctl, block_group);
353 
354 	if (!delayed_work_pending(&discard_ctl->work))
355 		btrfs_discard_schedule_work(discard_ctl, false);
356 }
357 
358 static void __btrfs_discard_schedule_work(struct btrfs_discard_ctl *discard_ctl,
359 					  u64 now, bool override)
360 {
361 	struct btrfs_block_group *block_group;
362 
363 	if (!btrfs_run_discard_work(discard_ctl))
364 		return;
365 	if (!override && delayed_work_pending(&discard_ctl->work))
366 		return;
367 
368 	block_group = find_next_block_group(discard_ctl, now);
369 	if (block_group) {
370 		u64 delay = discard_ctl->delay_ms * NSEC_PER_MSEC;
371 		u32 kbps_limit = READ_ONCE(discard_ctl->kbps_limit);
372 
373 		/*
374 		 * A single delayed workqueue item is responsible for
375 		 * discarding, so we can manage the bytes rate limit by keeping
376 		 * track of the previous discard.
377 		 */
378 		if (kbps_limit && discard_ctl->prev_discard) {
379 			u64 bps_limit = ((u64)kbps_limit) * SZ_1K;
380 			u64 bps_delay = div64_u64(discard_ctl->prev_discard *
381 						  NSEC_PER_SEC, bps_limit);
382 
383 			delay = max(delay, bps_delay);
384 		}
385 
386 		/*
387 		 * This timeout is to hopefully prevent immediate discarding
388 		 * in a recently allocated block group.
389 		 */
390 		if (now < block_group->discard_eligible_time) {
391 			u64 bg_timeout = block_group->discard_eligible_time - now;
392 
393 			delay = max(delay, bg_timeout);
394 		}
395 
396 		if (override && discard_ctl->prev_discard) {
397 			u64 elapsed = now - discard_ctl->prev_discard_time;
398 
399 			if (delay > elapsed)
400 				delay -= elapsed;
401 			else
402 				delay = 0;
403 		}
404 
405 		mod_delayed_work(discard_ctl->discard_workers,
406 				 &discard_ctl->work, nsecs_to_jiffies(delay));
407 	}
408 }
409 
410 /*
411  * Responsible for scheduling the discard work.
412  *
413  * @discard_ctl:  discard control
414  * @override:     override the current timer
415  *
416  * Discards are issued by a delayed workqueue item.  @override is used to
417  * update the current delay as the baseline delay interval is reevaluated on
418  * transaction commit.  This is also maxed with any other rate limit.
419  */
420 void btrfs_discard_schedule_work(struct btrfs_discard_ctl *discard_ctl,
421 				 bool override)
422 {
423 	const u64 now = ktime_get_ns();
424 
425 	spin_lock(&discard_ctl->lock);
426 	__btrfs_discard_schedule_work(discard_ctl, now, override);
427 	spin_unlock(&discard_ctl->lock);
428 }
429 
430 /*
431  * Determine next step of a block_group.
432  *
433  * @discard_ctl: discard control
434  * @block_group: block_group of interest
435  *
436  * Determine the next step for a block group after it's finished going through
437  * a pass on a discard list.  If it is unused and fully trimmed, we can mark it
438  * unused and send it to the unused_bgs path.  Otherwise, pass it onto the
439  * appropriate filter list or let it fall off.
440  */
441 static void btrfs_finish_discard_pass(struct btrfs_discard_ctl *discard_ctl,
442 				      struct btrfs_block_group *block_group)
443 {
444 	remove_from_discard_list(discard_ctl, block_group);
445 
446 	if (block_group->used == 0) {
447 		if (btrfs_is_free_space_trimmed(block_group))
448 			btrfs_mark_bg_unused(block_group);
449 		else
450 			add_to_discard_unused_list(discard_ctl, block_group);
451 	} else {
452 		btrfs_update_discard_index(discard_ctl, block_group);
453 	}
454 }
455 
456 /*
457  * Discard work queue callback
458  *
459  * @work: work
460  *
461  * Find the next block_group to start discarding and then discard a single
462  * region.  It does this in a two-pass fashion: first extents and second
463  * bitmaps.  Completely discarded block groups are sent to the unused_bgs path.
464  */
465 static void btrfs_discard_workfn(struct work_struct *work)
466 {
467 	struct btrfs_discard_ctl *discard_ctl;
468 	struct btrfs_block_group *block_group;
469 	enum btrfs_discard_state discard_state;
470 	int discard_index = 0;
471 	u64 trimmed = 0;
472 	u64 minlen = 0;
473 	u64 now = ktime_get_ns();
474 
475 	discard_ctl = container_of(work, struct btrfs_discard_ctl, work.work);
476 
477 	block_group = peek_discard_list(discard_ctl, &discard_state,
478 					&discard_index, now);
479 	if (!block_group || !btrfs_run_discard_work(discard_ctl))
480 		return;
481 	if (now < block_group->discard_eligible_time) {
482 		btrfs_discard_schedule_work(discard_ctl, false);
483 		return;
484 	}
485 
486 	/* Perform discarding */
487 	minlen = discard_minlen[discard_index];
488 
489 	if (discard_state == BTRFS_DISCARD_BITMAPS) {
490 		u64 maxlen = 0;
491 
492 		/*
493 		 * Use the previous levels minimum discard length as the max
494 		 * length filter.  In the case something is added to make a
495 		 * region go beyond the max filter, the entire bitmap is set
496 		 * back to BTRFS_TRIM_STATE_UNTRIMMED.
497 		 */
498 		if (discard_index != BTRFS_DISCARD_INDEX_UNUSED)
499 			maxlen = discard_minlen[discard_index - 1];
500 
501 		btrfs_trim_block_group_bitmaps(block_group, &trimmed,
502 				       block_group->discard_cursor,
503 				       btrfs_block_group_end(block_group),
504 				       minlen, maxlen, true);
505 		discard_ctl->discard_bitmap_bytes += trimmed;
506 	} else {
507 		btrfs_trim_block_group_extents(block_group, &trimmed,
508 				       block_group->discard_cursor,
509 				       btrfs_block_group_end(block_group),
510 				       minlen, true);
511 		discard_ctl->discard_extent_bytes += trimmed;
512 	}
513 
514 	/* Determine next steps for a block_group */
515 	if (block_group->discard_cursor >= btrfs_block_group_end(block_group)) {
516 		if (discard_state == BTRFS_DISCARD_BITMAPS) {
517 			btrfs_finish_discard_pass(discard_ctl, block_group);
518 		} else {
519 			block_group->discard_cursor = block_group->start;
520 			spin_lock(&discard_ctl->lock);
521 			if (block_group->discard_state !=
522 			    BTRFS_DISCARD_RESET_CURSOR)
523 				block_group->discard_state =
524 							BTRFS_DISCARD_BITMAPS;
525 			spin_unlock(&discard_ctl->lock);
526 		}
527 	}
528 
529 	now = ktime_get_ns();
530 	spin_lock(&discard_ctl->lock);
531 	discard_ctl->prev_discard = trimmed;
532 	discard_ctl->prev_discard_time = now;
533 	/*
534 	 * If the block group was removed from the discard list while it was
535 	 * running in this workfn, then we didn't deref it, since this function
536 	 * still owned that reference. But we set the discard_ctl->block_group
537 	 * back to NULL, so we can use that condition to know that now we need
538 	 * to deref the block_group.
539 	 */
540 	if (discard_ctl->block_group == NULL)
541 		btrfs_put_block_group(block_group);
542 	discard_ctl->block_group = NULL;
543 	__btrfs_discard_schedule_work(discard_ctl, now, false);
544 	spin_unlock(&discard_ctl->lock);
545 }
546 
547 /*
548  * Determine if async discard should be running.
549  *
550  * @discard_ctl: discard control
551  *
552  * Check if the file system is writeable and BTRFS_FS_DISCARD_RUNNING is set.
553  */
554 bool btrfs_run_discard_work(struct btrfs_discard_ctl *discard_ctl)
555 {
556 	struct btrfs_fs_info *fs_info = container_of(discard_ctl,
557 						     struct btrfs_fs_info,
558 						     discard_ctl);
559 
560 	return (!(fs_info->sb->s_flags & SB_RDONLY) &&
561 		test_bit(BTRFS_FS_DISCARD_RUNNING, &fs_info->flags));
562 }
563 
564 /*
565  * Recalculate the base delay.
566  *
567  * @discard_ctl: discard control
568  *
569  * Recalculate the base delay which is based off the total number of
570  * discardable_extents.  Clamp this between the lower_limit (iops_limit or 1ms)
571  * and the upper_limit (BTRFS_DISCARD_MAX_DELAY_MSEC).
572  */
573 void btrfs_discard_calc_delay(struct btrfs_discard_ctl *discard_ctl)
574 {
575 	s32 discardable_extents;
576 	s64 discardable_bytes;
577 	u32 iops_limit;
578 	unsigned long min_delay = BTRFS_DISCARD_MIN_DELAY_MSEC;
579 	unsigned long delay;
580 
581 	discardable_extents = atomic_read(&discard_ctl->discardable_extents);
582 	if (!discardable_extents)
583 		return;
584 
585 	spin_lock(&discard_ctl->lock);
586 
587 	/*
588 	 * The following is to fix a potential -1 discrepancy that we're not
589 	 * sure how to reproduce. But given that this is the only place that
590 	 * utilizes these numbers and this is only called by from
591 	 * btrfs_finish_extent_commit() which is synchronized, we can correct
592 	 * here.
593 	 */
594 	if (discardable_extents < 0)
595 		atomic_add(-discardable_extents,
596 			   &discard_ctl->discardable_extents);
597 
598 	discardable_bytes = atomic64_read(&discard_ctl->discardable_bytes);
599 	if (discardable_bytes < 0)
600 		atomic64_add(-discardable_bytes,
601 			     &discard_ctl->discardable_bytes);
602 
603 	if (discardable_extents <= 0) {
604 		spin_unlock(&discard_ctl->lock);
605 		return;
606 	}
607 
608 	iops_limit = READ_ONCE(discard_ctl->iops_limit);
609 
610 	if (iops_limit) {
611 		delay = MSEC_PER_SEC / iops_limit;
612 	} else {
613 		/*
614 		 * Unset iops_limit means go as fast as possible, so allow a
615 		 * delay of 0.
616 		 */
617 		delay = 0;
618 		min_delay = 0;
619 	}
620 
621 	delay = clamp(delay, min_delay, BTRFS_DISCARD_MAX_DELAY_MSEC);
622 	discard_ctl->delay_ms = delay;
623 
624 	spin_unlock(&discard_ctl->lock);
625 }
626 
627 /*
628  * Propagate discard counters.
629  *
630  * @block_group: block_group of interest
631  *
632  * Propagate deltas of counters up to the discard_ctl.  It maintains a current
633  * counter and a previous counter passing the delta up to the global stat.
634  * Then the current counter value becomes the previous counter value.
635  */
636 void btrfs_discard_update_discardable(struct btrfs_block_group *block_group)
637 {
638 	struct btrfs_free_space_ctl *ctl;
639 	struct btrfs_discard_ctl *discard_ctl;
640 	s32 extents_delta;
641 	s64 bytes_delta;
642 
643 	if (!block_group ||
644 	    !btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC) ||
645 	    !btrfs_is_block_group_data_only(block_group))
646 		return;
647 
648 	ctl = block_group->free_space_ctl;
649 	discard_ctl = &block_group->fs_info->discard_ctl;
650 
651 	lockdep_assert_held(&ctl->tree_lock);
652 	extents_delta = ctl->discardable_extents[BTRFS_STAT_CURR] -
653 			ctl->discardable_extents[BTRFS_STAT_PREV];
654 	if (extents_delta) {
655 		atomic_add(extents_delta, &discard_ctl->discardable_extents);
656 		ctl->discardable_extents[BTRFS_STAT_PREV] =
657 			ctl->discardable_extents[BTRFS_STAT_CURR];
658 	}
659 
660 	bytes_delta = ctl->discardable_bytes[BTRFS_STAT_CURR] -
661 		      ctl->discardable_bytes[BTRFS_STAT_PREV];
662 	if (bytes_delta) {
663 		atomic64_add(bytes_delta, &discard_ctl->discardable_bytes);
664 		ctl->discardable_bytes[BTRFS_STAT_PREV] =
665 			ctl->discardable_bytes[BTRFS_STAT_CURR];
666 	}
667 }
668 
669 /*
670  * Punt unused_bgs list to discard lists.
671  *
672  * @fs_info: fs_info of interest
673  *
674  * The unused_bgs list needs to be punted to the discard lists because the
675  * order of operations is changed.  In the normal synchronous discard path, the
676  * block groups are trimmed via a single large trim in transaction commit.  This
677  * is ultimately what we are trying to avoid with asynchronous discard.  Thus,
678  * it must be done before going down the unused_bgs path.
679  */
680 void btrfs_discard_punt_unused_bgs_list(struct btrfs_fs_info *fs_info)
681 {
682 	struct btrfs_block_group *block_group, *next;
683 
684 	spin_lock(&fs_info->unused_bgs_lock);
685 	/* We enabled async discard, so punt all to the queue */
686 	list_for_each_entry_safe(block_group, next, &fs_info->unused_bgs,
687 				 bg_list) {
688 		list_del_init(&block_group->bg_list);
689 		btrfs_discard_queue_work(&fs_info->discard_ctl, block_group);
690 		/*
691 		 * This put is for the get done by btrfs_mark_bg_unused.
692 		 * Queueing discard incremented it for discard's reference.
693 		 */
694 		btrfs_put_block_group(block_group);
695 	}
696 	spin_unlock(&fs_info->unused_bgs_lock);
697 }
698 
699 /*
700  * Purge discard lists.
701  *
702  * @discard_ctl: discard control
703  *
704  * If we are disabling async discard, we may have intercepted block groups that
705  * are completely free and ready for the unused_bgs path.  As discarding will
706  * now happen in transaction commit or not at all, we can safely mark the
707  * corresponding block groups as unused and they will be sent on their merry
708  * way to the unused_bgs list.
709  */
710 static void btrfs_discard_purge_list(struct btrfs_discard_ctl *discard_ctl)
711 {
712 	struct btrfs_block_group *block_group, *next;
713 	int i;
714 
715 	spin_lock(&discard_ctl->lock);
716 	for (i = 0; i < BTRFS_NR_DISCARD_LISTS; i++) {
717 		list_for_each_entry_safe(block_group, next,
718 					 &discard_ctl->discard_list[i],
719 					 discard_list) {
720 			list_del_init(&block_group->discard_list);
721 			spin_unlock(&discard_ctl->lock);
722 			if (block_group->used == 0)
723 				btrfs_mark_bg_unused(block_group);
724 			spin_lock(&discard_ctl->lock);
725 			btrfs_put_block_group(block_group);
726 		}
727 	}
728 	spin_unlock(&discard_ctl->lock);
729 }
730 
731 void btrfs_discard_resume(struct btrfs_fs_info *fs_info)
732 {
733 	if (!btrfs_test_opt(fs_info, DISCARD_ASYNC)) {
734 		btrfs_discard_cleanup(fs_info);
735 		return;
736 	}
737 
738 	btrfs_discard_punt_unused_bgs_list(fs_info);
739 
740 	set_bit(BTRFS_FS_DISCARD_RUNNING, &fs_info->flags);
741 }
742 
743 void btrfs_discard_stop(struct btrfs_fs_info *fs_info)
744 {
745 	clear_bit(BTRFS_FS_DISCARD_RUNNING, &fs_info->flags);
746 }
747 
748 void btrfs_discard_init(struct btrfs_fs_info *fs_info)
749 {
750 	struct btrfs_discard_ctl *discard_ctl = &fs_info->discard_ctl;
751 	int i;
752 
753 	spin_lock_init(&discard_ctl->lock);
754 	INIT_DELAYED_WORK(&discard_ctl->work, btrfs_discard_workfn);
755 
756 	for (i = 0; i < BTRFS_NR_DISCARD_LISTS; i++)
757 		INIT_LIST_HEAD(&discard_ctl->discard_list[i]);
758 
759 	discard_ctl->prev_discard = 0;
760 	discard_ctl->prev_discard_time = 0;
761 	atomic_set(&discard_ctl->discardable_extents, 0);
762 	atomic64_set(&discard_ctl->discardable_bytes, 0);
763 	discard_ctl->max_discard_size = BTRFS_ASYNC_DISCARD_DEFAULT_MAX_SIZE;
764 	discard_ctl->delay_ms = BTRFS_DISCARD_MAX_DELAY_MSEC;
765 	discard_ctl->iops_limit = BTRFS_DISCARD_MAX_IOPS;
766 	discard_ctl->kbps_limit = 0;
767 	discard_ctl->discard_extent_bytes = 0;
768 	discard_ctl->discard_bitmap_bytes = 0;
769 	atomic64_set(&discard_ctl->discard_bytes_saved, 0);
770 }
771 
772 void btrfs_discard_cleanup(struct btrfs_fs_info *fs_info)
773 {
774 	btrfs_discard_stop(fs_info);
775 	cancel_delayed_work_sync(&fs_info->discard_ctl.work);
776 	btrfs_discard_purge_list(&fs_info->discard_ctl);
777 }
778