// SPDX-License-Identifier: GPL-2.0

#include "ctree.h"
#include "space-info.h"
#include "sysfs.h"
#include "volumes.h"
#include "free-space-cache.h"
#include "ordered-data.h"
#include "transaction.h"
#include "math.h"

u64 btrfs_space_info_used(struct btrfs_space_info *s_info,
			  bool may_use_included)
{
	ASSERT(s_info);
	return s_info->bytes_used + s_info->bytes_reserved +
		s_info->bytes_pinned + s_info->bytes_readonly +
		(may_use_included ? s_info->bytes_may_use : 0);
}
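
/*
 * A worked example (hypothetical numbers, not from any real filesystem):
 * with bytes_used = 8G, bytes_reserved = 64M, bytes_pinned = 512M,
 * bytes_readonly = 0 and bytes_may_use = 1G, this returns roughly 8.56G
 * when may_use_included is false and 9.56G when it is true.  Callers pass
 * true when every byte that is already spoken for matters, e.g. when
 * checking a new reservation against total_bytes.
 */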

/*
 * After adding space to the filesystem, we need to clear the full flags
 * on all the space infos.
 */
void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
{
	struct list_head *head = &info->space_info;
	struct btrfs_space_info *found;

	rcu_read_lock();
	list_for_each_entry_rcu(found, head, list)
		found->full = 0;
	rcu_read_unlock();
}

static const char *alloc_name(u64 flags)
{
	switch (flags) {
	case BTRFS_BLOCK_GROUP_METADATA|BTRFS_BLOCK_GROUP_DATA:
		return "mixed";
	case BTRFS_BLOCK_GROUP_METADATA:
		return "metadata";
	case BTRFS_BLOCK_GROUP_DATA:
		return "data";
	case BTRFS_BLOCK_GROUP_SYSTEM:
		return "system";
	default:
		WARN_ON(1);
		return "invalid-combination";
	}
}

static int create_space_info(struct btrfs_fs_info *info, u64 flags)
{
	struct btrfs_space_info *space_info;
	int i;
	int ret;

	space_info = kzalloc(sizeof(*space_info), GFP_NOFS);
	if (!space_info)
		return -ENOMEM;

	ret = percpu_counter_init(&space_info->total_bytes_pinned, 0,
				  GFP_KERNEL);
	if (ret) {
		kfree(space_info);
		return ret;
	}

	for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
		INIT_LIST_HEAD(&space_info->block_groups[i]);
	init_rwsem(&space_info->groups_sem);
	spin_lock_init(&space_info->lock);
	space_info->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK;
	space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
	init_waitqueue_head(&space_info->wait);
	INIT_LIST_HEAD(&space_info->ro_bgs);
	INIT_LIST_HEAD(&space_info->tickets);
	INIT_LIST_HEAD(&space_info->priority_tickets);

	ret = kobject_init_and_add(&space_info->kobj, &space_info_ktype,
				    info->space_info_kobj, "%s",
				    alloc_name(space_info->flags));
	if (ret) {
		kobject_put(&space_info->kobj);
		return ret;
	}

	list_add_rcu(&space_info->list, &info->space_info);
	if (flags & BTRFS_BLOCK_GROUP_DATA)
		info->data_sinfo = space_info;

	return ret;
}

int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
{
	struct btrfs_super_block *disk_super;
	u64 features;
	u64 flags;
	int mixed = 0;
	int ret;

	disk_super = fs_info->super_copy;
	if (!btrfs_super_root(disk_super))
		return -EINVAL;

	features = btrfs_super_incompat_flags(disk_super);
	if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
		mixed = 1;

	flags = BTRFS_BLOCK_GROUP_SYSTEM;
	ret = create_space_info(fs_info, flags);
	if (ret)
		goto out;

	if (mixed) {
		flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA;
		ret = create_space_info(fs_info, flags);
	} else {
		flags = BTRFS_BLOCK_GROUP_METADATA;
		ret = create_space_info(fs_info, flags);
		if (ret)
			goto out;

		flags = BTRFS_BLOCK_GROUP_DATA;
		ret = create_space_info(fs_info, flags);
	}
out:
	return ret;
}
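
/*
 * The net effect (a sketch of the common cases): a freshly mounted
 * non-mixed filesystem ends up with three space infos -- "system",
 * "metadata" and "data" in sysfs -- while one with the MIXED_GROUPS
 * incompat bit gets "system" and "mixed".  Block groups discovered later
 * while reading the extent tree attach to these via
 * btrfs_update_space_info() below.
 */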

void btrfs_update_space_info(struct btrfs_fs_info *info, u64 flags,
			     u64 total_bytes, u64 bytes_used,
			     u64 bytes_readonly,
			     struct btrfs_space_info **space_info)
{
	struct btrfs_space_info *found;
	int factor;

	factor = btrfs_bg_type_to_factor(flags);

	found = btrfs_find_space_info(info, flags);
	ASSERT(found);
	spin_lock(&found->lock);
	found->total_bytes += total_bytes;
	found->disk_total += total_bytes * factor;
	found->bytes_used += bytes_used;
	found->disk_used += bytes_used * factor;
	found->bytes_readonly += bytes_readonly;
	if (total_bytes > 0)
		found->full = 0;
	btrfs_space_info_add_new_bytes(info, found,
				       total_bytes - bytes_used -
				       bytes_readonly);
	spin_unlock(&found->lock);
	*space_info = found;
}

struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info,
					       u64 flags)
{
	struct list_head *head = &info->space_info;
	struct btrfs_space_info *found;

	flags &= BTRFS_BLOCK_GROUP_TYPE_MASK;

	rcu_read_lock();
	list_for_each_entry_rcu(found, head, list) {
		if (found->flags & flags) {
			rcu_read_unlock();
			return found;
		}
	}
	rcu_read_unlock();
	return NULL;
}

static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global)
{
	return (global->size << 1);
}
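
/*
 * Note the deliberate pessimism: we demand room for twice the global
 * reserve's current size.  E.g. (hypothetical numbers) with global->size
 * at 512M, can_overcommit() below refuses to overcommit unless more than
 * 1G of the space info is still unused.
 */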

static int can_overcommit(struct btrfs_fs_info *fs_info,
			  struct btrfs_space_info *space_info, u64 bytes,
			  enum btrfs_reserve_flush_enum flush,
			  bool system_chunk)
{
	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
	u64 profile;
	u64 space_size;
	u64 avail;
	u64 used;
	int factor;

	/* Don't overcommit for data, which also covers mixed mode. */
	if (space_info->flags & BTRFS_BLOCK_GROUP_DATA)
		return 0;

	if (system_chunk)
		profile = btrfs_system_alloc_profile(fs_info);
	else
		profile = btrfs_metadata_alloc_profile(fs_info);

	used = btrfs_space_info_used(space_info, false);

	/*
	 * We only want to allow overcommitting if we have lots of actual space
	 * free, but if we don't have enough space to handle the global reserve
	 * space then we could end up having a real enospc problem when trying
	 * to allocate a chunk or some other such important allocation.
	 */
	spin_lock(&global_rsv->lock);
	space_size = calc_global_rsv_need_space(global_rsv);
	spin_unlock(&global_rsv->lock);
	if (used + space_size >= space_info->total_bytes)
		return 0;

	used += space_info->bytes_may_use;

	avail = atomic64_read(&fs_info->free_chunk_space);

	/*
	 * If we have dup, raid1 or raid10 then only half of the free
	 * space is actually usable.  For raid56, the space info used
	 * doesn't include the parity drive, so we don't have to
	 * change the math.
	 */
	factor = btrfs_bg_type_to_factor(profile);
	avail = div_u64(avail, factor);

	/*
	 * If we aren't flushing all things, let us overcommit up to
	 * 1/2 of the space.  If we can flush, don't let us overcommit
	 * too much, let it overcommit up to 1/8 of the space.
	 */
	if (flush == BTRFS_RESERVE_FLUSH_ALL)
		avail >>= 3;
	else
		avail >>= 1;

	if (used + bytes < space_info->total_bytes + avail)
		return 1;
	return 0;
}
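
/*
 * A worked example (hypothetical numbers): metadata on RAID1 with
 * free_chunk_space at 10G gives factor = 2, so avail starts at 5G.  With
 * flush == BTRFS_RESERVE_FLUSH_ALL that is trimmed to 5G >> 3 = 640M,
 * otherwise to 5G >> 1 = 2.5G, and the reservation is allowed only if
 * used + bytes still fits under total_bytes + avail.
 */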

/*
 * This is for space we already have accounted in space_info->bytes_may_use, so
 * basically when we're returning space from block_rsv's.
 */
void btrfs_space_info_add_old_bytes(struct btrfs_fs_info *fs_info,
				    struct btrfs_space_info *space_info,
				    u64 num_bytes)
{
	struct reserve_ticket *ticket;
	struct list_head *head;
	u64 used;
	enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_NO_FLUSH;
	bool check_overcommit = false;

	spin_lock(&space_info->lock);
	head = &space_info->priority_tickets;

	/*
	 * If we are over our limit then we need to check and see if we can
	 * overcommit, and if we can't then we just need to free up our space
	 * and not satisfy any requests.
	 */
	used = btrfs_space_info_used(space_info, true);
	if (used - num_bytes >= space_info->total_bytes)
		check_overcommit = true;
again:
	while (!list_empty(head) && num_bytes) {
		ticket = list_first_entry(head, struct reserve_ticket,
					  list);
		/*
		 * We use 0 bytes because this space is already reserved, so
		 * adding the ticket space would be a double count.
		 */
		if (check_overcommit &&
		    !can_overcommit(fs_info, space_info, 0, flush, false))
			break;
		if (num_bytes >= ticket->bytes) {
			list_del_init(&ticket->list);
			num_bytes -= ticket->bytes;
			ticket->bytes = 0;
			space_info->tickets_id++;
			wake_up(&ticket->wait);
		} else {
			ticket->bytes -= num_bytes;
			num_bytes = 0;
		}
	}

	if (num_bytes && head == &space_info->priority_tickets) {
		head = &space_info->tickets;
		flush = BTRFS_RESERVE_FLUSH_ALL;
		goto again;
	}
	btrfs_space_info_update_bytes_may_use(fs_info, space_info, -num_bytes);
	trace_btrfs_space_reservation(fs_info, "space_info",
				      space_info->flags, num_bytes, 0);
	spin_unlock(&space_info->lock);
}

/*
 * This is for newly allocated space that isn't accounted in
 * space_info->bytes_may_use yet.  So if we allocate a chunk or unpin an extent
 * we use this helper.
 */
void btrfs_space_info_add_new_bytes(struct btrfs_fs_info *fs_info,
				    struct btrfs_space_info *space_info,
				    u64 num_bytes)
{
	struct reserve_ticket *ticket;
	struct list_head *head = &space_info->priority_tickets;

again:
	while (!list_empty(head) && num_bytes) {
		ticket = list_first_entry(head, struct reserve_ticket,
					  list);
		if (num_bytes >= ticket->bytes) {
			trace_btrfs_space_reservation(fs_info, "space_info",
						      space_info->flags,
						      ticket->bytes, 1);
			list_del_init(&ticket->list);
			num_bytes -= ticket->bytes;
			btrfs_space_info_update_bytes_may_use(fs_info,
							      space_info,
							      ticket->bytes);
			ticket->bytes = 0;
			space_info->tickets_id++;
			wake_up(&ticket->wait);
		} else {
			trace_btrfs_space_reservation(fs_info, "space_info",
						      space_info->flags,
						      num_bytes, 1);
			btrfs_space_info_update_bytes_may_use(fs_info,
							      space_info,
							      num_bytes);
			ticket->bytes -= num_bytes;
			num_bytes = 0;
		}
	}

	if (num_bytes && head == &space_info->priority_tickets) {
		head = &space_info->tickets;
		goto again;
	}
}

#define DUMP_BLOCK_RSV(fs_info, rsv_name)				\
do {									\
	struct btrfs_block_rsv *__rsv = &(fs_info)->rsv_name;		\
	spin_lock(&__rsv->lock);					\
	btrfs_info(fs_info, #rsv_name ": size %llu reserved %llu",	\
		   __rsv->size, __rsv->reserved);			\
	spin_unlock(&__rsv->lock);					\
} while (0)
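
/*
 * For example, DUMP_BLOCK_RSV(fs_info, trans_block_rsv) reads
 * &fs_info->trans_block_rsv under its lock and prints a line like
 * (values hypothetical):
 *
 *	BTRFS info (device sda1): trans_block_rsv: size 8388608 reserved 0
 */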

void btrfs_dump_space_info(struct btrfs_fs_info *fs_info,
			   struct btrfs_space_info *info, u64 bytes,
			   int dump_block_groups)
{
	struct btrfs_block_group_cache *cache;
	int index = 0;

	spin_lock(&info->lock);
	btrfs_info(fs_info, "space_info %llu has %llu free, is %sfull",
		   info->flags,
		   info->total_bytes - btrfs_space_info_used(info, true),
		   info->full ? "" : "not ");
	btrfs_info(fs_info,
		"space_info total=%llu, used=%llu, pinned=%llu, reserved=%llu, may_use=%llu, readonly=%llu",
		info->total_bytes, info->bytes_used, info->bytes_pinned,
		info->bytes_reserved, info->bytes_may_use,
		info->bytes_readonly);
	spin_unlock(&info->lock);

	DUMP_BLOCK_RSV(fs_info, global_block_rsv);
	DUMP_BLOCK_RSV(fs_info, trans_block_rsv);
	DUMP_BLOCK_RSV(fs_info, chunk_block_rsv);
	DUMP_BLOCK_RSV(fs_info, delayed_block_rsv);
	DUMP_BLOCK_RSV(fs_info, delayed_refs_rsv);

	if (!dump_block_groups)
		return;

	down_read(&info->groups_sem);
again:
	list_for_each_entry(cache, &info->block_groups[index], list) {
		spin_lock(&cache->lock);
		btrfs_info(fs_info,
			"block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %s",
			cache->key.objectid, cache->key.offset,
			btrfs_block_group_used(&cache->item), cache->pinned,
			cache->reserved, cache->ro ? "[readonly]" : "");
		btrfs_dump_free_space(cache, bytes);
		spin_unlock(&cache->lock);
	}
	if (++index < BTRFS_NR_RAID_TYPES)
		goto again;
	up_read(&info->groups_sem);
}

static void btrfs_writeback_inodes_sb_nr(struct btrfs_fs_info *fs_info,
					 unsigned long nr_pages, int nr_items)
{
	struct super_block *sb = fs_info->sb;

	if (down_read_trylock(&sb->s_umount)) {
		writeback_inodes_sb_nr(sb, nr_pages, WB_REASON_FS_FREE_SPACE);
		up_read(&sb->s_umount);
	} else {
		/*
		 * We needn't worry about the filesystem going from r/w to r/o
		 * even though we don't acquire the ->s_umount mutex, because
		 * the filesystem should guarantee that its delalloc inode
		 * list is empty once it is read-only (all dirty pages have
		 * been written to disk).
		 */
		btrfs_start_delalloc_roots(fs_info, nr_items);
		if (!current->journal_info)
			btrfs_wait_ordered_roots(fs_info, nr_items, 0, (u64)-1);
	}
}

static inline u64 calc_reclaim_items_nr(struct btrfs_fs_info *fs_info,
					u64 to_reclaim)
{
	u64 bytes;
	u64 nr;

	bytes = btrfs_calc_trans_metadata_size(fs_info, 1);
	nr = div64_u64(to_reclaim, bytes);
	if (!nr)
		nr = 1;
	return nr;
}
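
/*
 * Back-of-the-envelope example (assuming the usual definition of
 * btrfs_calc_trans_metadata_size(), nodesize * 2 * BTRFS_MAX_LEVEL per
 * item): with a 16K nodesize each item costs 256K, so a 1M to_reclaim
 * maps to 4 items, and we always flush at least one.
 */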

#define EXTENT_SIZE_PER_ITEM	SZ_256K

/*
 * shrink metadata reservation for delalloc
 */
static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim,
			    u64 orig, bool wait_ordered)
{
	struct btrfs_space_info *space_info;
	struct btrfs_trans_handle *trans;
	u64 delalloc_bytes;
	u64 dio_bytes;
	u64 async_pages;
	u64 items;
	long time_left;
	unsigned long nr_pages;
	int loops;

	/* Calculate the number of items we need to flush for this reservation */
	items = calc_reclaim_items_nr(fs_info, to_reclaim);
	to_reclaim = items * EXTENT_SIZE_PER_ITEM;

	trans = (struct btrfs_trans_handle *)current->journal_info;
	space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);

	delalloc_bytes = percpu_counter_sum_positive(
						&fs_info->delalloc_bytes);
	dio_bytes = percpu_counter_sum_positive(&fs_info->dio_bytes);
	if (delalloc_bytes == 0 && dio_bytes == 0) {
		if (trans)
			return;
		if (wait_ordered)
			btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1);
		return;
	}

	/*
	 * If we are doing more ordered than delalloc we need to just wait on
	 * ordered extents, otherwise we'll waste time trying to flush delalloc
	 * that likely won't give us the space back we need.
	 */
	if (dio_bytes > delalloc_bytes)
		wait_ordered = true;

	loops = 0;
	while ((delalloc_bytes || dio_bytes) && loops < 3) {
		nr_pages = min(delalloc_bytes, to_reclaim) >> PAGE_SHIFT;

		/*
		 * Triggers inode writeback for up to nr_pages. This will invoke
		 * the ->writepages callback and trigger delalloc filling
		 * (btrfs_run_delalloc_range()).
		 */
		btrfs_writeback_inodes_sb_nr(fs_info, nr_pages, items);

		/*
		 * We need to wait for the compressed pages to start being
		 * written before we continue.
		 */
		async_pages = atomic_read(&fs_info->async_delalloc_pages);
		if (!async_pages)
			goto skip_async;

		/*
		 * Calculate how many compressed pages we want to be written
		 * before we continue.  I.e. if there are more async pages
		 * than we require, wait_event will wait until nr_pages are
		 * written.
		 */
		if (async_pages <= nr_pages)
			async_pages = 0;
		else
			async_pages -= nr_pages;

		wait_event(fs_info->async_submit_wait,
			   atomic_read(&fs_info->async_delalloc_pages) <=
			   (int)async_pages);
skip_async:
		spin_lock(&space_info->lock);
		if (list_empty(&space_info->tickets) &&
		    list_empty(&space_info->priority_tickets)) {
			spin_unlock(&space_info->lock);
			break;
		}
		spin_unlock(&space_info->lock);

		loops++;
		if (wait_ordered && !trans) {
			btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1);
		} else {
			time_left = schedule_timeout_killable(1);
			if (time_left)
				break;
		}
		delalloc_bytes = percpu_counter_sum_positive(
						&fs_info->delalloc_bytes);
		dio_bytes = percpu_counter_sum_positive(&fs_info->dio_bytes);
	}
}

/**
 * may_commit_transaction - possibly commit the transaction if it's OK to
 * @fs_info - the fs_info for our filesystem
 * @space_info - the space_info we're allocating for
 *
 * This will check to make sure that committing the transaction will actually
 * get us somewhere and then commit the transaction if it does.  Otherwise it
 * will return -ENOSPC.
 */

static int may_commit_transaction(struct btrfs_fs_info *fs_info,
				  struct btrfs_space_info *space_info)
{
	struct reserve_ticket *ticket = NULL;
	struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_block_rsv;
	struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
	struct btrfs_trans_handle *trans;
	u64 bytes_needed;
	u64 reclaim_bytes = 0;

	trans = (struct btrfs_trans_handle *)current->journal_info;
	if (trans)
		return -EAGAIN;

	spin_lock(&space_info->lock);
	if (!list_empty(&space_info->priority_tickets))
		ticket = list_first_entry(&space_info->priority_tickets,
					  struct reserve_ticket, list);
	else if (!list_empty(&space_info->tickets))
		ticket = list_first_entry(&space_info->tickets,
					  struct reserve_ticket, list);
	bytes_needed = (ticket) ? ticket->bytes : 0;
	spin_unlock(&space_info->lock);

	if (!bytes_needed)
		return 0;

	trans = btrfs_join_transaction(fs_info->extent_root);
	if (IS_ERR(trans))
		return PTR_ERR(trans);

	/*
	 * See if there is enough pinned space to make this reservation, or if
	 * we have block groups that are going to be freed, allowing us to
	 * possibly do a chunk allocation the next loop through.
	 */
	if (test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags) ||
	    __percpu_counter_compare(&space_info->total_bytes_pinned,
				     bytes_needed,
				     BTRFS_TOTAL_BYTES_PINNED_BATCH) >= 0)
		goto commit;

	/*
	 * See if there is some space in the delayed insertion reservation for
	 * this reservation.
	 */
	if (space_info != delayed_rsv->space_info)
		goto enospc;

	spin_lock(&delayed_rsv->lock);
	reclaim_bytes += delayed_rsv->reserved;
	spin_unlock(&delayed_rsv->lock);

	spin_lock(&delayed_refs_rsv->lock);
	reclaim_bytes += delayed_refs_rsv->reserved;
	spin_unlock(&delayed_refs_rsv->lock);
	if (reclaim_bytes >= bytes_needed)
		goto commit;
	bytes_needed -= reclaim_bytes;

	if (__percpu_counter_compare(&space_info->total_bytes_pinned,
				     bytes_needed,
				     BTRFS_TOTAL_BYTES_PINNED_BATCH) < 0)
		goto enospc;

commit:
	return btrfs_commit_transaction(trans);
enospc:
	btrfs_end_transaction(trans);
	return -ENOSPC;
}
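
/*
 * In other words (a sketch of the heuristic): committing is only worth it
 * if the pinned byte counter, padded out with whatever is parked in the
 * delayed-inode and delayed-refs reserves, covers the first ticket in
 * line; otherwise the commit would be pure overhead with no chance of
 * satisfying the waiter.
 */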

/*
 * Try to flush some data based on policy set by @state. This is only advisory
 * and may fail for various reasons. The caller is supposed to examine the
 * state of @space_info to detect the outcome.
 */
static void flush_space(struct btrfs_fs_info *fs_info,
			struct btrfs_space_info *space_info, u64 num_bytes,
			int state)
{
	struct btrfs_root *root = fs_info->extent_root;
	struct btrfs_trans_handle *trans;
	int nr;
	int ret = 0;

	switch (state) {
	case FLUSH_DELAYED_ITEMS_NR:
	case FLUSH_DELAYED_ITEMS:
		if (state == FLUSH_DELAYED_ITEMS_NR)
			nr = calc_reclaim_items_nr(fs_info, num_bytes) * 2;
		else
			nr = -1;

		trans = btrfs_join_transaction(root);
		if (IS_ERR(trans)) {
			ret = PTR_ERR(trans);
			break;
		}
		ret = btrfs_run_delayed_items_nr(trans, nr);
		btrfs_end_transaction(trans);
		break;
	case FLUSH_DELALLOC:
	case FLUSH_DELALLOC_WAIT:
		shrink_delalloc(fs_info, num_bytes * 2, num_bytes,
				state == FLUSH_DELALLOC_WAIT);
		break;
	case FLUSH_DELAYED_REFS_NR:
	case FLUSH_DELAYED_REFS:
		trans = btrfs_join_transaction(root);
		if (IS_ERR(trans)) {
			ret = PTR_ERR(trans);
			break;
		}
		if (state == FLUSH_DELAYED_REFS_NR)
			nr = calc_reclaim_items_nr(fs_info, num_bytes);
		else
			nr = 0;
		btrfs_run_delayed_refs(trans, nr);
		btrfs_end_transaction(trans);
		break;
	case ALLOC_CHUNK:
	case ALLOC_CHUNK_FORCE:
		trans = btrfs_join_transaction(root);
		if (IS_ERR(trans)) {
			ret = PTR_ERR(trans);
			break;
		}
		ret = btrfs_chunk_alloc(trans,
				btrfs_metadata_alloc_profile(fs_info),
				(state == ALLOC_CHUNK) ? CHUNK_ALLOC_NO_FORCE :
					CHUNK_ALLOC_FORCE);
		btrfs_end_transaction(trans);
		if (ret > 0 || ret == -ENOSPC)
			ret = 0;
		break;
	case COMMIT_TRANS:
		/*
		 * If we have pending delayed iputs then we could free up a
		 * bunch of pinned space, so make sure we run the iputs before
		 * we do our pinned bytes check below.
		 */
		btrfs_run_delayed_iputs(fs_info);
		btrfs_wait_on_delayed_iputs(fs_info);

		ret = may_commit_transaction(fs_info, space_info);
		break;
	default:
		ret = -ENOSPC;
		break;
	}

	trace_btrfs_flush_space(fs_info, space_info->flags, num_bytes, state,
				ret);
}

static inline u64
btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info,
				 struct btrfs_space_info *space_info,
				 bool system_chunk)
{
	struct reserve_ticket *ticket;
	u64 used;
	u64 expected;
	u64 to_reclaim = 0;

	list_for_each_entry(ticket, &space_info->tickets, list)
		to_reclaim += ticket->bytes;
	list_for_each_entry(ticket, &space_info->priority_tickets, list)
		to_reclaim += ticket->bytes;
	if (to_reclaim)
		return to_reclaim;

	to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M);
	if (can_overcommit(fs_info, space_info, to_reclaim,
			   BTRFS_RESERVE_FLUSH_ALL, system_chunk))
		return 0;

	used = btrfs_space_info_used(space_info, true);

	if (can_overcommit(fs_info, space_info, SZ_1M,
			   BTRFS_RESERVE_FLUSH_ALL, system_chunk))
		expected = div_factor_fine(space_info->total_bytes, 95);
	else
		expected = div_factor_fine(space_info->total_bytes, 90);

	if (used > expected)
		to_reclaim = used - expected;
	else
		to_reclaim = 0;
	to_reclaim = min(to_reclaim, space_info->bytes_may_use +
				     space_info->bytes_reserved);
	return to_reclaim;
}

static inline int need_do_async_reclaim(struct btrfs_fs_info *fs_info,
					struct btrfs_space_info *space_info,
					u64 used, bool system_chunk)
{
	u64 thresh = div_factor_fine(space_info->total_bytes, 98);

	/* If we're just plain full then async reclaim just slows us down. */
	if ((space_info->bytes_used + space_info->bytes_reserved) >= thresh)
		return 0;

	if (!btrfs_calc_reclaim_metadata_size(fs_info, space_info,
					      system_chunk))
		return 0;

	return (used >= thresh && !btrfs_fs_closing(fs_info) &&
		!test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state));
}

static bool wake_all_tickets(struct list_head *head)
{
	struct reserve_ticket *ticket;

	while (!list_empty(head)) {
		ticket = list_first_entry(head, struct reserve_ticket, list);
		list_del_init(&ticket->list);
		ticket->error = -ENOSPC;
		wake_up(&ticket->wait);
		if (ticket->bytes != ticket->orig_bytes)
			return true;
	}
	return false;
}
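
/*
 * Note the return value: true means some ticket had already received part
 * of its reservation (bytes != orig_bytes), in which case the caller below
 * restarts the flush state machine once more rather than failing
 * everything outright.
 */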

/*
 * This is for normal flushers; we can wait all goddamned day if we want to.
 * We will loop and continuously try to flush as long as we are making
 * progress.  We count progress as clearing off tickets each time we have to
 * loop.
 */
static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
{
	struct btrfs_fs_info *fs_info;
	struct btrfs_space_info *space_info;
	u64 to_reclaim;
	int flush_state;
	int commit_cycles = 0;
	u64 last_tickets_id;

	fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work);
	space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);

	spin_lock(&space_info->lock);
	to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info,
						      false);
	if (!to_reclaim) {
		space_info->flush = 0;
		spin_unlock(&space_info->lock);
		return;
	}
	last_tickets_id = space_info->tickets_id;
	spin_unlock(&space_info->lock);

	flush_state = FLUSH_DELAYED_ITEMS_NR;
	do {
		flush_space(fs_info, space_info, to_reclaim, flush_state);
		spin_lock(&space_info->lock);
		if (list_empty(&space_info->tickets)) {
			space_info->flush = 0;
			spin_unlock(&space_info->lock);
			return;
		}
		to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info,
							      space_info,
							      false);
		if (last_tickets_id == space_info->tickets_id) {
			flush_state++;
		} else {
			last_tickets_id = space_info->tickets_id;
			flush_state = FLUSH_DELAYED_ITEMS_NR;
			if (commit_cycles)
				commit_cycles--;
		}

		/*
		 * We don't want to force a chunk allocation until we've tried
		 * pretty hard to reclaim space.  Think of the case where we
		 * freed up a bunch of space and so have a lot of pinned space
		 * to reclaim.  We would rather use that than possibly create
		 * an underutilized metadata chunk.  So if this is our first
		 * run through the flushing state machine skip ALLOC_CHUNK_FORCE
		 * and commit the transaction.  If nothing has changed the next
		 * go around then we can force a chunk allocation.
		 */
		if (flush_state == ALLOC_CHUNK_FORCE && !commit_cycles)
			flush_state++;

		if (flush_state > COMMIT_TRANS) {
			commit_cycles++;
			if (commit_cycles > 2) {
				if (wake_all_tickets(&space_info->tickets)) {
					flush_state = FLUSH_DELAYED_ITEMS_NR;
					commit_cycles--;
				} else {
					space_info->flush = 0;
				}
			} else {
				flush_state = FLUSH_DELAYED_ITEMS_NR;
			}
		}
		spin_unlock(&space_info->lock);
	} while (flush_state <= COMMIT_TRANS);
}
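
/*
 * A typical walk through the worker, assuming the btrfs_flush_state enum
 * order in space-info.h: FLUSH_DELAYED_ITEMS_NR -> FLUSH_DELAYED_ITEMS ->
 * FLUSH_DELALLOC -> FLUSH_DELALLOC_WAIT -> FLUSH_DELAYED_REFS_NR ->
 * FLUSH_DELAYED_REFS -> ALLOC_CHUNK -> ALLOC_CHUNK_FORCE -> COMMIT_TRANS,
 * restarting from the top whenever a ticket was satisfied (tickets_id
 * moved) and waking all remaining tickets with -ENOSPC after a third
 * fruitless commit cycle.
 */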

void btrfs_init_async_reclaim_work(struct work_struct *work)
{
	INIT_WORK(work, btrfs_async_reclaim_metadata_space);
}

static const enum btrfs_flush_state priority_flush_states[] = {
	FLUSH_DELAYED_ITEMS_NR,
	FLUSH_DELAYED_ITEMS,
	ALLOC_CHUNK,
};
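
/*
 * Priority flushers (BTRFS_RESERVE_FLUSH_LIMIT callers) only ever run this
 * abbreviated list: notably there is no delalloc flushing and no
 * transaction commit here, since such callers cannot afford to block for
 * long.
 */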

static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
					    struct btrfs_space_info *space_info,
					    struct reserve_ticket *ticket)
{
	u64 to_reclaim;
	int flush_state;

	spin_lock(&space_info->lock);
	to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info,
						      false);
	if (!to_reclaim) {
		spin_unlock(&space_info->lock);
		return;
	}
	spin_unlock(&space_info->lock);

	flush_state = 0;
	do {
		flush_space(fs_info, space_info, to_reclaim,
			    priority_flush_states[flush_state]);
		flush_state++;
		spin_lock(&space_info->lock);
		if (ticket->bytes == 0) {
			spin_unlock(&space_info->lock);
			return;
		}
		spin_unlock(&space_info->lock);
	} while (flush_state < ARRAY_SIZE(priority_flush_states));
}

static int wait_reserve_ticket(struct btrfs_fs_info *fs_info,
			       struct btrfs_space_info *space_info,
			       struct reserve_ticket *ticket)
{
	DEFINE_WAIT(wait);
	u64 reclaim_bytes = 0;
	int ret = 0;

	spin_lock(&space_info->lock);
	while (ticket->bytes > 0 && ticket->error == 0) {
		ret = prepare_to_wait_event(&ticket->wait, &wait, TASK_KILLABLE);
		if (ret) {
			ret = -EINTR;
			break;
		}
		spin_unlock(&space_info->lock);

		schedule();

		finish_wait(&ticket->wait, &wait);
		spin_lock(&space_info->lock);
	}
	if (!ret)
		ret = ticket->error;
	if (!list_empty(&ticket->list))
		list_del_init(&ticket->list);
	if (ticket->bytes && ticket->bytes < ticket->orig_bytes)
		reclaim_bytes = ticket->orig_bytes - ticket->bytes;
	spin_unlock(&space_info->lock);

	if (reclaim_bytes)
		btrfs_space_info_add_old_bytes(fs_info, space_info,
					       reclaim_bytes);
	return ret;
}

/**
 * __reserve_metadata_bytes - try to reserve bytes from the space_info
 * @fs_info - the fs_info for our filesystem
 * @space_info - the space info we want to allocate from
 * @orig_bytes - the number of bytes we want
 * @flush - whether or not we can flush to make our reservation
 * @system_chunk - whether we are reserving for the system chunk root
 *
 * This will reserve orig_bytes number of bytes from the space info.  If there
 * is not enough space it will make an attempt to flush out space to make
 * room.  It will do this by flushing delalloc if possible or committing the
 * transaction.  If flush is BTRFS_RESERVE_NO_FLUSH then no attempts to regain
 * reservations will be made and this will fail if there is not enough space
 * already.
 */

static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
				    struct btrfs_space_info *space_info,
				    u64 orig_bytes,
				    enum btrfs_reserve_flush_enum flush,
				    bool system_chunk)
{
	struct reserve_ticket ticket;
	u64 used;
	u64 reclaim_bytes = 0;
	int ret = 0;

	ASSERT(orig_bytes);
	ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_ALL);

	spin_lock(&space_info->lock);
	ret = -ENOSPC;
	used = btrfs_space_info_used(space_info, true);

	/*
	 * If we have enough space then hooray, make our reservation and carry
	 * on.  If not see if we can overcommit, and if we can, hooray carry on.
	 * If not things get more complicated.
	 */
	if (used + orig_bytes <= space_info->total_bytes) {
		btrfs_space_info_update_bytes_may_use(fs_info, space_info,
						      orig_bytes);
		trace_btrfs_space_reservation(fs_info, "space_info",
					      space_info->flags, orig_bytes, 1);
		ret = 0;
	} else if (can_overcommit(fs_info, space_info, orig_bytes, flush,
				  system_chunk)) {
		btrfs_space_info_update_bytes_may_use(fs_info, space_info,
						      orig_bytes);
		trace_btrfs_space_reservation(fs_info, "space_info",
					      space_info->flags, orig_bytes, 1);
		ret = 0;
	}

	/*
	 * If we couldn't make a reservation then setup our reservation ticket
	 * and kick the async worker if it's not already running.
	 *
	 * If we are a priority flusher then we just need to add our ticket to
	 * the list and we will do our own flushing further down.
	 */
	if (ret && flush != BTRFS_RESERVE_NO_FLUSH) {
		ticket.orig_bytes = orig_bytes;
		ticket.bytes = orig_bytes;
		ticket.error = 0;
		init_waitqueue_head(&ticket.wait);
		if (flush == BTRFS_RESERVE_FLUSH_ALL) {
			list_add_tail(&ticket.list, &space_info->tickets);
			if (!space_info->flush) {
				space_info->flush = 1;
				trace_btrfs_trigger_flush(fs_info,
							  space_info->flags,
							  orig_bytes, flush,
							  "enospc");
				queue_work(system_unbound_wq,
					   &fs_info->async_reclaim_work);
			}
		} else {
			list_add_tail(&ticket.list,
				      &space_info->priority_tickets);
		}
	} else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
		used += orig_bytes;
		/*
		 * We will do the space reservation dance during log replay,
		 * which means we won't have fs_info->fs_root set, so don't do
		 * the async reclaim as we will panic.
		 */
		if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags) &&
		    need_do_async_reclaim(fs_info, space_info,
					  used, system_chunk) &&
		    !work_busy(&fs_info->async_reclaim_work)) {
			trace_btrfs_trigger_flush(fs_info, space_info->flags,
						  orig_bytes, flush, "preempt");
			queue_work(system_unbound_wq,
				   &fs_info->async_reclaim_work);
		}
	}
	spin_unlock(&space_info->lock);
	if (!ret || flush == BTRFS_RESERVE_NO_FLUSH)
		return ret;

	if (flush == BTRFS_RESERVE_FLUSH_ALL)
		return wait_reserve_ticket(fs_info, space_info, &ticket);

	ret = 0;
	priority_reclaim_metadata_space(fs_info, space_info, &ticket);
	spin_lock(&space_info->lock);
	if (ticket.bytes) {
		if (ticket.bytes < orig_bytes)
			reclaim_bytes = orig_bytes - ticket.bytes;
		list_del_init(&ticket.list);
		ret = -ENOSPC;
	}
	spin_unlock(&space_info->lock);

	if (reclaim_bytes)
		btrfs_space_info_add_old_bytes(fs_info, space_info,
					       reclaim_bytes);
	ASSERT(list_empty(&ticket.list));
	return ret;
}

/**
 * btrfs_reserve_metadata_bytes - try to reserve bytes from the block_rsv's
 *				  space
 * @root - the root we're allocating for
 * @block_rsv - the block_rsv we're allocating for
 * @orig_bytes - the number of bytes we want
 * @flush - whether or not we can flush to make our reservation
 *
 * This will reserve orig_bytes number of bytes from the space info associated
 * with the block_rsv.  If there is not enough space it will make an attempt
 * to flush out space to make room.  It will do this by flushing delalloc if
 * possible or committing the transaction.  If flush is BTRFS_RESERVE_NO_FLUSH
 * then no attempts to regain reservations will be made and this will fail if
 * there is not enough space already.
 */

int btrfs_reserve_metadata_bytes(struct btrfs_root *root,
				 struct btrfs_block_rsv *block_rsv,
				 u64 orig_bytes,
				 enum btrfs_reserve_flush_enum flush)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
	int ret;
	bool system_chunk = (root == fs_info->chunk_root);

	ret = __reserve_metadata_bytes(fs_info, block_rsv->space_info,
				       orig_bytes, flush, system_chunk);
	if (ret == -ENOSPC &&
	    unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) {
		if (block_rsv != global_rsv &&
		    !btrfs_block_rsv_use_bytes(global_rsv, orig_bytes))
			ret = 0;
	}
	if (ret == -ENOSPC) {
		trace_btrfs_space_reservation(fs_info, "space_info:enospc",
					      block_rsv->space_info->flags,
					      orig_bytes, 1);

		if (btrfs_test_opt(fs_info, ENOSPC_DEBUG))
			btrfs_dump_space_info(fs_info, block_rsv->space_info,
					      orig_bytes, 0);
	}
	return ret;
}
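
/*
 * Typical use, sketched from memory rather than quoted from block-rsv.c
 * (the helper name there may differ): a block_rsv grows by reserving first
 * and only then adding the bytes to the rsv, e.g.
 *
 *	ret = btrfs_reserve_metadata_bytes(root, rsv, num_bytes,
 *					   BTRFS_RESERVE_FLUSH_ALL);
 *	if (!ret)
 *		block_rsv_add_bytes(rsv, num_bytes, true);
 *
 * which is roughly what btrfs_block_rsv_add() does.
 */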
1101