xref: /openbmc/linux/fs/btrfs/space-info.c (revision ef1317a1)
1 // SPDX-License-Identifier: GPL-2.0
2 
3 #include "misc.h"
4 #include "ctree.h"
5 #include "space-info.h"
6 #include "sysfs.h"
7 #include "volumes.h"
8 #include "free-space-cache.h"
9 #include "ordered-data.h"
10 #include "transaction.h"
11 #include "block-group.h"
12 
13 u64 btrfs_space_info_used(struct btrfs_space_info *s_info,
14 			  bool may_use_included)
15 {
16 	ASSERT(s_info);
17 	return s_info->bytes_used + s_info->bytes_reserved +
18 		s_info->bytes_pinned + s_info->bytes_readonly +
19 		(may_use_included ? s_info->bytes_may_use : 0);
20 }
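
/*
 * Purely illustrative example of the accounting above (numbers invented):
 * with bytes_used = 4 GiB, bytes_reserved = 64 MiB, bytes_pinned = 128 MiB,
 * bytes_readonly = 0 and bytes_may_use = 256 MiB, this returns roughly
 * 4.19 GiB when may_use_included is false and roughly 4.44 GiB when it is
 * true.  bytes_may_use covers reservations that have not yet turned into
 * real allocations, which is why callers can choose to exclude it.
 */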
21 
22 /*
23  * after adding space to the filesystem, we need to clear the full flags
24  * on all the space infos.
25  */
26 void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
27 {
28 	struct list_head *head = &info->space_info;
29 	struct btrfs_space_info *found;
30 
31 	rcu_read_lock();
32 	list_for_each_entry_rcu(found, head, list)
33 		found->full = 0;
34 	rcu_read_unlock();
35 }
36 
37 static int create_space_info(struct btrfs_fs_info *info, u64 flags)
38 {
40 	struct btrfs_space_info *space_info;
41 	int i;
42 	int ret;
43 
44 	space_info = kzalloc(sizeof(*space_info), GFP_NOFS);
45 	if (!space_info)
46 		return -ENOMEM;
47 
48 	ret = percpu_counter_init(&space_info->total_bytes_pinned, 0,
49 				 GFP_KERNEL);
50 	if (ret) {
51 		kfree(space_info);
52 		return ret;
53 	}
54 
55 	for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
56 		INIT_LIST_HEAD(&space_info->block_groups[i]);
57 	init_rwsem(&space_info->groups_sem);
58 	spin_lock_init(&space_info->lock);
59 	space_info->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK;
60 	space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
61 	init_waitqueue_head(&space_info->wait);
62 	INIT_LIST_HEAD(&space_info->ro_bgs);
63 	INIT_LIST_HEAD(&space_info->tickets);
64 	INIT_LIST_HEAD(&space_info->priority_tickets);
65 
66 	ret = btrfs_sysfs_add_space_info_type(info, space_info);
67 	if (ret)
68 		return ret;
69 
70 	list_add_rcu(&space_info->list, &info->space_info);
71 	if (flags & BTRFS_BLOCK_GROUP_DATA)
72 		info->data_sinfo = space_info;
73 
74 	return ret;
75 }
76 
77 int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
78 {
79 	struct btrfs_super_block *disk_super;
80 	u64 features;
81 	u64 flags;
82 	int mixed = 0;
83 	int ret;
84 
85 	disk_super = fs_info->super_copy;
86 	if (!btrfs_super_root(disk_super))
87 		return -EINVAL;
88 
89 	features = btrfs_super_incompat_flags(disk_super);
90 	if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
91 		mixed = 1;
92 
93 	flags = BTRFS_BLOCK_GROUP_SYSTEM;
94 	ret = create_space_info(fs_info, flags);
95 	if (ret)
96 		goto out;
97 
98 	if (mixed) {
99 		flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA;
100 		ret = create_space_info(fs_info, flags);
101 	} else {
102 		flags = BTRFS_BLOCK_GROUP_METADATA;
103 		ret = create_space_info(fs_info, flags);
104 		if (ret)
105 			goto out;
106 
107 		flags = BTRFS_BLOCK_GROUP_DATA;
108 		ret = create_space_info(fs_info, flags);
109 	}
110 out:
111 	return ret;
112 }
113 
114 void btrfs_update_space_info(struct btrfs_fs_info *info, u64 flags,
115 			     u64 total_bytes, u64 bytes_used,
116 			     u64 bytes_readonly,
117 			     struct btrfs_space_info **space_info)
118 {
119 	struct btrfs_space_info *found;
120 	int factor;
121 
122 	factor = btrfs_bg_type_to_factor(flags);
123 
124 	found = btrfs_find_space_info(info, flags);
125 	ASSERT(found);
126 	spin_lock(&found->lock);
127 	found->total_bytes += total_bytes;
128 	found->disk_total += total_bytes * factor;
129 	found->bytes_used += bytes_used;
130 	found->disk_used += bytes_used * factor;
131 	found->bytes_readonly += bytes_readonly;
132 	if (total_bytes > 0)
133 		found->full = 0;
134 	btrfs_space_info_add_new_bytes(info, found,
135 				       total_bytes - bytes_used -
136 				       bytes_readonly);
137 	spin_unlock(&found->lock);
138 	*space_info = found;
139 }
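
/*
 * Illustrative example (values invented): adding a freshly allocated 1 GiB
 * RAID1 block group with nothing used yet gives factor = 2, so total_bytes
 * grows by 1 GiB while disk_total grows by 2 GiB, and the whole unused 1 GiB
 * is handed to btrfs_space_info_add_new_bytes() to satisfy waiting tickets.
 */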
140 
141 struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info,
142 					       u64 flags)
143 {
144 	struct list_head *head = &info->space_info;
145 	struct btrfs_space_info *found;
146 
147 	flags &= BTRFS_BLOCK_GROUP_TYPE_MASK;
148 
149 	rcu_read_lock();
150 	list_for_each_entry_rcu(found, head, list) {
151 		if (found->flags & flags) {
152 			rcu_read_unlock();
153 			return found;
154 		}
155 	}
156 	rcu_read_unlock();
157 	return NULL;
158 }
159 
160 static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global)
161 {
162 	return (global->size << 1);
163 }
164 
165 static int can_overcommit(struct btrfs_fs_info *fs_info,
166 			  struct btrfs_space_info *space_info, u64 bytes,
167 			  enum btrfs_reserve_flush_enum flush,
168 			  bool system_chunk)
169 {
170 	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
171 	u64 profile;
172 	u64 space_size;
173 	u64 avail;
174 	u64 used;
175 	int factor;
176 
177 	/* Don't overcommit when in mixed mode. */
178 	if (space_info->flags & BTRFS_BLOCK_GROUP_DATA)
179 		return 0;
180 
181 	if (system_chunk)
182 		profile = btrfs_system_alloc_profile(fs_info);
183 	else
184 		profile = btrfs_metadata_alloc_profile(fs_info);
185 
186 	used = btrfs_space_info_used(space_info, false);
187 
188 	/*
189 	 * We only want to allow overcommitting if we have lots of actual space
190 	 * free, but if we don't have enough space to handle the global reserve
191 	 * space then we could end up having a real enospc problem when trying
192 	 * to allocate a chunk or some other such important allocation.
193 	 */
194 	spin_lock(&global_rsv->lock);
195 	space_size = calc_global_rsv_need_space(global_rsv);
196 	spin_unlock(&global_rsv->lock);
197 	if (used + space_size >= space_info->total_bytes)
198 		return 0;
199 
200 	used += space_info->bytes_may_use;
201 
202 	avail = atomic64_read(&fs_info->free_chunk_space);
203 
204 	/*
205 	 * If we have dup, raid1 or raid10 then only half of the free
206 	 * space is actually usable.  For raid56, the space info used
207 	 * doesn't include the parity drive, so we don't have to
208 	 * change the math.
209 	 */
210 	factor = btrfs_bg_type_to_factor(profile);
211 	avail = div_u64(avail, factor);
212 
213 	/*
214 	 * If we aren't flushing all things, let us overcommit up to
215 	 * 1/2 of the space.  If we can flush, don't let us overcommit
216 	 * too much; cap the overcommit at 1/8 of the space.
217 	 */
218 	if (flush == BTRFS_RESERVE_FLUSH_ALL)
219 		avail >>= 3;
220 	else
221 		avail >>= 1;
222 
223 	if (used + bytes < space_info->total_bytes + avail)
224 		return 1;
225 	return 0;
226 }
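
/*
 * Worked example of the overcommit math above (numbers invented): for a
 * metadata space_info with total_bytes = 8 GiB, used (excluding may_use) =
 * 2 GiB and a 512 MiB global reserve, the 2 * global reserve headroom check
 * passes.  With free_chunk_space = 100 GiB on a RAID1 profile (factor 2),
 * avail becomes 50 GiB, then 6.25 GiB after the >> 3 applied for
 * BTRFS_RESERVE_FLUSH_ALL, so a request is allowed as long as
 * used + bytes_may_use + bytes stays below 8 GiB + 6.25 GiB.
 */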
227 
228 /*
229  * This is for space we already have accounted in space_info->bytes_may_use, so
230  * basically when we're returning space from block_rsv's.
231  */
232 void btrfs_space_info_add_old_bytes(struct btrfs_fs_info *fs_info,
233 				    struct btrfs_space_info *space_info,
234 				    u64 num_bytes)
235 {
236 	struct reserve_ticket *ticket;
237 	struct list_head *head;
238 	u64 used;
239 	enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_NO_FLUSH;
240 	bool check_overcommit = false;
241 
242 	spin_lock(&space_info->lock);
243 	head = &space_info->priority_tickets;
244 
245 	/*
246 	 * If we are over our limit then we need to check and see if we can
247 	 * overcommit, and if we can't then we just need to free up our space
248 	 * and not satisfy any requests.
249 	 */
250 	used = btrfs_space_info_used(space_info, true);
251 	if (used - num_bytes >= space_info->total_bytes)
252 		check_overcommit = true;
253 again:
254 	while (!list_empty(head) && num_bytes) {
255 		ticket = list_first_entry(head, struct reserve_ticket,
256 					  list);
257 		/*
258 		 * We use 0 bytes because this space is already reserved, so
259 		 * adding the ticket space would be a double count.
260 		 */
261 		if (check_overcommit &&
262 		    !can_overcommit(fs_info, space_info, 0, flush, false))
263 			break;
264 		if (num_bytes >= ticket->bytes) {
265 			list_del_init(&ticket->list);
266 			num_bytes -= ticket->bytes;
267 			ticket->bytes = 0;
268 			space_info->tickets_id++;
269 			wake_up(&ticket->wait);
270 		} else {
271 			ticket->bytes -= num_bytes;
272 			num_bytes = 0;
273 		}
274 	}
275 
276 	if (num_bytes && head == &space_info->priority_tickets) {
277 		head = &space_info->tickets;
278 		flush = BTRFS_RESERVE_FLUSH_ALL;
279 		goto again;
280 	}
281 	btrfs_space_info_update_bytes_may_use(fs_info, space_info, -num_bytes);
282 	trace_btrfs_space_reservation(fs_info, "space_info",
283 				      space_info->flags, num_bytes, 0);
284 	spin_unlock(&space_info->lock);
285 }
286 
287 /*
288  * This is for newly allocated space that isn't accounted in
289  * space_info->bytes_may_use yet.  So if we allocate a chunk or unpin an extent
290  * we use this helper.
291  */
292 void btrfs_space_info_add_new_bytes(struct btrfs_fs_info *fs_info,
293 				    struct btrfs_space_info *space_info,
294 				    u64 num_bytes)
295 {
296 	struct reserve_ticket *ticket;
297 	struct list_head *head = &space_info->priority_tickets;
298 
299 again:
300 	while (!list_empty(head) && num_bytes) {
301 		ticket = list_first_entry(head, struct reserve_ticket,
302 					  list);
303 		if (num_bytes >= ticket->bytes) {
304 			trace_btrfs_space_reservation(fs_info, "space_info",
305 						      space_info->flags,
306 						      ticket->bytes, 1);
307 			list_del_init(&ticket->list);
308 			num_bytes -= ticket->bytes;
309 			btrfs_space_info_update_bytes_may_use(fs_info,
310 							      space_info,
311 							      ticket->bytes);
312 			ticket->bytes = 0;
313 			space_info->tickets_id++;
314 			wake_up(&ticket->wait);
315 		} else {
316 			trace_btrfs_space_reservation(fs_info, "space_info",
317 						      space_info->flags,
318 						      num_bytes, 1);
319 			btrfs_space_info_update_bytes_may_use(fs_info,
320 							      space_info,
321 							      num_bytes);
322 			ticket->bytes -= num_bytes;
323 			num_bytes = 0;
324 		}
325 	}
326 
327 	if (num_bytes && head == &space_info->priority_tickets) {
328 		head = &space_info->tickets;
329 		goto again;
330 	}
331 }
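
/*
 * Illustrative walk-through (sizes invented): if 1 MiB of new space arrives
 * and the priority list holds a 256 KiB ticket followed by a 2 MiB ticket,
 * the first ticket is fully granted and woken (256 KiB moved into
 * bytes_may_use), the second is left with 1.25 MiB still outstanding, and
 * nothing remains to hand to the normal ticket list.
 */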
332 
333 #define DUMP_BLOCK_RSV(fs_info, rsv_name)				\
334 do {									\
335 	struct btrfs_block_rsv *__rsv = &(fs_info)->rsv_name;		\
336 	spin_lock(&__rsv->lock);					\
337 	btrfs_info(fs_info, #rsv_name ": size %llu reserved %llu",	\
338 		   __rsv->size, __rsv->reserved);			\
339 	spin_unlock(&__rsv->lock);					\
340 } while (0)
341 
342 void btrfs_dump_space_info(struct btrfs_fs_info *fs_info,
343 			   struct btrfs_space_info *info, u64 bytes,
344 			   int dump_block_groups)
345 {
346 	struct btrfs_block_group_cache *cache;
347 	int index = 0;
348 
349 	spin_lock(&info->lock);
350 	btrfs_info(fs_info, "space_info %llu has %llu free, is %sfull",
351 		   info->flags,
352 		   info->total_bytes - btrfs_space_info_used(info, true),
353 		   info->full ? "" : "not ");
354 	btrfs_info(fs_info,
355 		"space_info total=%llu, used=%llu, pinned=%llu, reserved=%llu, may_use=%llu, readonly=%llu",
356 		info->total_bytes, info->bytes_used, info->bytes_pinned,
357 		info->bytes_reserved, info->bytes_may_use,
358 		info->bytes_readonly);
359 	spin_unlock(&info->lock);
360 
361 	DUMP_BLOCK_RSV(fs_info, global_block_rsv);
362 	DUMP_BLOCK_RSV(fs_info, trans_block_rsv);
363 	DUMP_BLOCK_RSV(fs_info, chunk_block_rsv);
364 	DUMP_BLOCK_RSV(fs_info, delayed_block_rsv);
365 	DUMP_BLOCK_RSV(fs_info, delayed_refs_rsv);
366 
367 	if (!dump_block_groups)
368 		return;
369 
370 	down_read(&info->groups_sem);
371 again:
372 	list_for_each_entry(cache, &info->block_groups[index], list) {
373 		spin_lock(&cache->lock);
374 		btrfs_info(fs_info,
375 			"block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %s",
376 			cache->key.objectid, cache->key.offset,
377 			btrfs_block_group_used(&cache->item), cache->pinned,
378 			cache->reserved, cache->ro ? "[readonly]" : "");
379 		btrfs_dump_free_space(cache, bytes);
380 		spin_unlock(&cache->lock);
381 	}
382 	if (++index < BTRFS_NR_RAID_TYPES)
383 		goto again;
384 	up_read(&info->groups_sem);
385 }
386 
387 static void btrfs_writeback_inodes_sb_nr(struct btrfs_fs_info *fs_info,
388 					 unsigned long nr_pages, int nr_items)
389 {
390 	struct super_block *sb = fs_info->sb;
391 
392 	if (down_read_trylock(&sb->s_umount)) {
393 		writeback_inodes_sb_nr(sb, nr_pages, WB_REASON_FS_FREE_SPACE);
394 		up_read(&sb->s_umount);
395 	} else {
396 		/*
397 		 * We needn't worry about the filesystem going from r/w to r/o
398 		 * even though we don't acquire the ->s_umount mutex, because the
399 		 * filesystem should guarantee that the delalloc inode list is
400 		 * empty once the filesystem is read-only (all dirty pages have
401 		 * been written to disk).
402 		 */
403 		btrfs_start_delalloc_roots(fs_info, nr_items);
404 		if (!current->journal_info)
405 			btrfs_wait_ordered_roots(fs_info, nr_items, 0, (u64)-1);
406 	}
407 }
408 
409 static inline u64 calc_reclaim_items_nr(struct btrfs_fs_info *fs_info,
410 					u64 to_reclaim)
411 {
412 	u64 bytes;
413 	u64 nr;
414 
415 	bytes = btrfs_calc_insert_metadata_size(fs_info, 1);
416 	nr = div64_u64(to_reclaim, bytes);
417 	if (!nr)
418 		nr = 1;
419 	return nr;
420 }
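
/*
 * Rough example (the exact per-item cost depends on the node size): with a
 * 16 KiB nodesize, btrfs_calc_insert_metadata_size() for a single item comes
 * out around 256 KiB (the usual 2 * nodesize * BTRFS_MAX_LEVEL estimate), so
 * a 1 MiB reclaim target maps to roughly 4 items, and anything smaller than
 * one item is rounded up to a single item.
 */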
421 
422 #define EXTENT_SIZE_PER_ITEM	SZ_256K
423 
424 /*
425  * shrink metadata reservation for delalloc
426  */
427 static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim,
428 			    u64 orig, bool wait_ordered)
429 {
430 	struct btrfs_space_info *space_info;
431 	struct btrfs_trans_handle *trans;
432 	u64 delalloc_bytes;
433 	u64 dio_bytes;
434 	u64 async_pages;
435 	u64 items;
436 	long time_left;
437 	unsigned long nr_pages;
438 	int loops;
439 
440 	/* Calc the number of items we need to flush for this space reservation */
441 	items = calc_reclaim_items_nr(fs_info, to_reclaim);
442 	to_reclaim = items * EXTENT_SIZE_PER_ITEM;
443 
444 	trans = (struct btrfs_trans_handle *)current->journal_info;
445 	space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
446 
447 	delalloc_bytes = percpu_counter_sum_positive(
448 						&fs_info->delalloc_bytes);
449 	dio_bytes = percpu_counter_sum_positive(&fs_info->dio_bytes);
450 	if (delalloc_bytes == 0 && dio_bytes == 0) {
451 		if (trans)
452 			return;
453 		if (wait_ordered)
454 			btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1);
455 		return;
456 	}
457 
458 	/*
459 	 * If we are doing more ordered than delalloc we need to just wait on
460 	 * ordered extents, otherwise we'll waste time trying to flush delalloc
461 	 * that likely won't give us the space back we need.
462 	 */
463 	if (dio_bytes > delalloc_bytes)
464 		wait_ordered = true;
465 
466 	loops = 0;
467 	while ((delalloc_bytes || dio_bytes) && loops < 3) {
468 		nr_pages = min(delalloc_bytes, to_reclaim) >> PAGE_SHIFT;
469 
470 		/*
471 		 * Triggers inode writeback for up to nr_pages. This will invoke
472 		 * the ->writepages callback and trigger delalloc filling
473 		 * (btrfs_run_delalloc_range()).
474 		 */
475 		btrfs_writeback_inodes_sb_nr(fs_info, nr_pages, items);
476 
477 		/*
478 		 * We need to wait for the compressed pages to start before
479 		 * we continue.
480 		 */
481 		async_pages = atomic_read(&fs_info->async_delalloc_pages);
482 		if (!async_pages)
483 			goto skip_async;
484 
485 		/*
486 		 * Calculate how many compressed pages we want to be written
487 		 * before we continue, i.e. if there are more async pages than we
488 		 * require, wait_event will wait until nr_pages are written.
489 		 */
490 		if (async_pages <= nr_pages)
491 			async_pages = 0;
492 		else
493 			async_pages -= nr_pages;
494 
495 		wait_event(fs_info->async_submit_wait,
496 			   atomic_read(&fs_info->async_delalloc_pages) <=
497 			   (int)async_pages);
498 skip_async:
499 		spin_lock(&space_info->lock);
500 		if (list_empty(&space_info->tickets) &&
501 		    list_empty(&space_info->priority_tickets)) {
502 			spin_unlock(&space_info->lock);
503 			break;
504 		}
505 		spin_unlock(&space_info->lock);
506 
507 		loops++;
508 		if (wait_ordered && !trans) {
509 			btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1);
510 		} else {
511 			time_left = schedule_timeout_killable(1);
512 			if (time_left)
513 				break;
514 		}
515 		delalloc_bytes = percpu_counter_sum_positive(
516 						&fs_info->delalloc_bytes);
517 		dio_bytes = percpu_counter_sum_positive(&fs_info->dio_bytes);
518 	}
519 }
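
/*
 * Illustrative sizing (assumes 4 KiB pages): a request to reclaim 2 items is
 * rounded up to 2 * EXTENT_SIZE_PER_ITEM = 512 KiB, so each pass of the loop
 * kicks writeback for at most 128 dirty pages and then waits for roughly that
 * many pending compressed pages to be written before checking whether any
 * tickets are still waiting.
 */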
520 
521 /**
522  * may_commit_transaction - possibly commit the transaction if it's ok to
523  * @fs_info - the filesystem we are trying to reserve space for
524  * @space_info - the space_info we are trying to satisfy a reservation from
526  *
527  * This will check to make sure that committing the transaction will actually
528  * get us somewhere and then commit the transaction if it does.  Otherwise it
529  * will return -ENOSPC.
530  */
531 static int may_commit_transaction(struct btrfs_fs_info *fs_info,
532 				  struct btrfs_space_info *space_info)
533 {
534 	struct reserve_ticket *ticket = NULL;
535 	struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_block_rsv;
536 	struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
537 	struct btrfs_trans_handle *trans;
538 	u64 bytes_needed;
539 	u64 reclaim_bytes = 0;
540 
541 	trans = (struct btrfs_trans_handle *)current->journal_info;
542 	if (trans)
543 		return -EAGAIN;
544 
545 	spin_lock(&space_info->lock);
546 	if (!list_empty(&space_info->priority_tickets))
547 		ticket = list_first_entry(&space_info->priority_tickets,
548 					  struct reserve_ticket, list);
549 	else if (!list_empty(&space_info->tickets))
550 		ticket = list_first_entry(&space_info->tickets,
551 					  struct reserve_ticket, list);
552 	bytes_needed = (ticket) ? ticket->bytes : 0;
553 	spin_unlock(&space_info->lock);
554 
555 	if (!bytes_needed)
556 		return 0;
557 
558 	trans = btrfs_join_transaction(fs_info->extent_root);
559 	if (IS_ERR(trans))
560 		return PTR_ERR(trans);
561 
562 	/*
563 	 * See if there is enough pinned space to make this reservation, or if
564 	 * we have block groups that are going to be freed, allowing us to
565 	 * possibly do a chunk allocation the next loop through.
566 	 */
567 	if (test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags) ||
568 	    __percpu_counter_compare(&space_info->total_bytes_pinned,
569 				     bytes_needed,
570 				     BTRFS_TOTAL_BYTES_PINNED_BATCH) >= 0)
571 		goto commit;
572 
573 	/*
574 	 * See if there is some space in the delayed insertion reservation for
575 	 * this reservation.
576 	 */
577 	if (space_info != delayed_rsv->space_info)
578 		goto enospc;
579 
580 	spin_lock(&delayed_rsv->lock);
581 	reclaim_bytes += delayed_rsv->reserved;
582 	spin_unlock(&delayed_rsv->lock);
583 
584 	spin_lock(&delayed_refs_rsv->lock);
585 	reclaim_bytes += delayed_refs_rsv->reserved;
586 	spin_unlock(&delayed_refs_rsv->lock);
587 	if (reclaim_bytes >= bytes_needed)
588 		goto commit;
589 	bytes_needed -= reclaim_bytes;
590 
591 	if (__percpu_counter_compare(&space_info->total_bytes_pinned,
592 				   bytes_needed,
593 				   BTRFS_TOTAL_BYTES_PINNED_BATCH) < 0)
594 		goto enospc;
595 
596 commit:
597 	return btrfs_commit_transaction(trans);
598 enospc:
599 	btrfs_end_transaction(trans);
600 	return -ENOSPC;
601 }
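
/*
 * Illustrative decision (numbers invented): a 4 MiB metadata ticket with only
 * 1 MiB of total_bytes_pinned fails the first pinned-bytes check, but if the
 * delayed_rsv and delayed_refs_rsv together hold 5 MiB of reservations the
 * commit is still attempted, since releasing those during the commit should
 * cover the ticket; failing that, the remaining shortfall is checked against
 * pinned bytes once more before giving up with -ENOSPC.
 */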
602 
603 /*
604  * Try to flush some data based on policy set by @state. This is only advisory
605  * and may fail for various reasons. The caller is supposed to examine the
606  * state of @space_info to detect the outcome.
607  */
608 static void flush_space(struct btrfs_fs_info *fs_info,
609 		       struct btrfs_space_info *space_info, u64 num_bytes,
610 		       int state)
611 {
612 	struct btrfs_root *root = fs_info->extent_root;
613 	struct btrfs_trans_handle *trans;
614 	int nr;
615 	int ret = 0;
616 
617 	switch (state) {
618 	case FLUSH_DELAYED_ITEMS_NR:
619 	case FLUSH_DELAYED_ITEMS:
620 		if (state == FLUSH_DELAYED_ITEMS_NR)
621 			nr = calc_reclaim_items_nr(fs_info, num_bytes) * 2;
622 		else
623 			nr = -1;
624 
625 		trans = btrfs_join_transaction(root);
626 		if (IS_ERR(trans)) {
627 			ret = PTR_ERR(trans);
628 			break;
629 		}
630 		ret = btrfs_run_delayed_items_nr(trans, nr);
631 		btrfs_end_transaction(trans);
632 		break;
633 	case FLUSH_DELALLOC:
634 	case FLUSH_DELALLOC_WAIT:
635 		shrink_delalloc(fs_info, num_bytes * 2, num_bytes,
636 				state == FLUSH_DELALLOC_WAIT);
637 		break;
638 	case FLUSH_DELAYED_REFS_NR:
639 	case FLUSH_DELAYED_REFS:
640 		trans = btrfs_join_transaction(root);
641 		if (IS_ERR(trans)) {
642 			ret = PTR_ERR(trans);
643 			break;
644 		}
645 		if (state == FLUSH_DELAYED_REFS_NR)
646 			nr = calc_reclaim_items_nr(fs_info, num_bytes);
647 		else
648 			nr = 0;
649 		btrfs_run_delayed_refs(trans, nr);
650 		btrfs_end_transaction(trans);
651 		break;
652 	case ALLOC_CHUNK:
653 	case ALLOC_CHUNK_FORCE:
654 		trans = btrfs_join_transaction(root);
655 		if (IS_ERR(trans)) {
656 			ret = PTR_ERR(trans);
657 			break;
658 		}
659 		ret = btrfs_chunk_alloc(trans,
660 				btrfs_metadata_alloc_profile(fs_info),
661 				(state == ALLOC_CHUNK) ? CHUNK_ALLOC_NO_FORCE :
662 					CHUNK_ALLOC_FORCE);
663 		btrfs_end_transaction(trans);
664 		if (ret > 0 || ret == -ENOSPC)
665 			ret = 0;
666 		break;
667 	case RUN_DELAYED_IPUTS:
668 		/*
669 		 * If we have pending delayed iputs then we could free up a
670 		 * bunch of pinned space, so make sure we run the iputs before
671 		 * we do our pinned bytes check below.
672 		 */
673 		btrfs_run_delayed_iputs(fs_info);
674 		btrfs_wait_on_delayed_iputs(fs_info);
675 		break;
676 	case COMMIT_TRANS:
677 		ret = may_commit_transaction(fs_info, space_info);
678 		break;
679 	default:
680 		ret = -ENOSPC;
681 		break;
682 	}
683 
684 	trace_btrfs_flush_space(fs_info, space_info->flags, num_bytes, state,
685 				ret);
686 	return;
687 }
688 
689 static inline u64
690 btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info,
691 				 struct btrfs_space_info *space_info,
692 				 bool system_chunk)
693 {
694 	struct reserve_ticket *ticket;
695 	u64 used;
696 	u64 expected;
697 	u64 to_reclaim = 0;
698 
699 	list_for_each_entry(ticket, &space_info->tickets, list)
700 		to_reclaim += ticket->bytes;
701 	list_for_each_entry(ticket, &space_info->priority_tickets, list)
702 		to_reclaim += ticket->bytes;
703 	if (to_reclaim)
704 		return to_reclaim;
705 
706 	to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M);
707 	if (can_overcommit(fs_info, space_info, to_reclaim,
708 			   BTRFS_RESERVE_FLUSH_ALL, system_chunk))
709 		return 0;
710 
711 	used = btrfs_space_info_used(space_info, true);
712 
713 	if (can_overcommit(fs_info, space_info, SZ_1M,
714 			   BTRFS_RESERVE_FLUSH_ALL, system_chunk))
715 		expected = div_factor_fine(space_info->total_bytes, 95);
716 	else
717 		expected = div_factor_fine(space_info->total_bytes, 90);
718 
719 	if (used > expected)
720 		to_reclaim = used - expected;
721 	else
722 		to_reclaim = 0;
723 	to_reclaim = min(to_reclaim, space_info->bytes_may_use +
724 				     space_info->bytes_reserved);
725 	return to_reclaim;
726 }
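
/*
 * Illustrative example (numbers invented): with no tickets queued, a
 * space_info of total_bytes = 10 GiB that cannot overcommit the default
 * 16 MiB target but can still overcommit 1 MiB, and used = 9.7 GiB, expected
 * works out to 95% of 10 GiB = 9.5 GiB, so to_reclaim starts at 200 MiB and
 * is then clamped to whatever is actually sitting in
 * bytes_may_use + bytes_reserved.
 */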
727 
728 static inline int need_do_async_reclaim(struct btrfs_fs_info *fs_info,
729 					struct btrfs_space_info *space_info,
730 					u64 used, bool system_chunk)
731 {
732 	u64 thresh = div_factor_fine(space_info->total_bytes, 98);
733 
734 	/* If we're just plain full then async reclaim just slows us down. */
735 	if ((space_info->bytes_used + space_info->bytes_reserved) >= thresh)
736 		return 0;
737 
738 	if (!btrfs_calc_reclaim_metadata_size(fs_info, space_info,
739 					      system_chunk))
740 		return 0;
741 
742 	return (used >= thresh && !btrfs_fs_closing(fs_info) &&
743 		!test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state));
744 }
745 
746 static bool wake_all_tickets(struct list_head *head)
747 {
748 	struct reserve_ticket *ticket;
749 
750 	while (!list_empty(head)) {
751 		ticket = list_first_entry(head, struct reserve_ticket, list);
752 		list_del_init(&ticket->list);
753 		ticket->error = -ENOSPC;
754 		wake_up(&ticket->wait);
755 		if (ticket->bytes != ticket->orig_bytes)
756 			return true;
757 	}
758 	return false;
759 }
760 
761 /*
762  * This is for normal flushers; we can wait all goddamned day if we want to.  We
763  * will loop and continuously try to flush as long as we are making progress.
764  * We count progress as clearing off tickets each time we have to loop.
765  */
766 static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
767 {
768 	struct btrfs_fs_info *fs_info;
769 	struct btrfs_space_info *space_info;
770 	u64 to_reclaim;
771 	int flush_state;
772 	int commit_cycles = 0;
773 	u64 last_tickets_id;
774 
775 	fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work);
776 	space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
777 
778 	spin_lock(&space_info->lock);
779 	to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info,
780 						      false);
781 	if (!to_reclaim) {
782 		space_info->flush = 0;
783 		spin_unlock(&space_info->lock);
784 		return;
785 	}
786 	last_tickets_id = space_info->tickets_id;
787 	spin_unlock(&space_info->lock);
788 
789 	flush_state = FLUSH_DELAYED_ITEMS_NR;
790 	do {
791 		flush_space(fs_info, space_info, to_reclaim, flush_state);
792 		spin_lock(&space_info->lock);
793 		if (list_empty(&space_info->tickets)) {
794 			space_info->flush = 0;
795 			spin_unlock(&space_info->lock);
796 			return;
797 		}
798 		to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info,
799 							      space_info,
800 							      false);
801 		if (last_tickets_id == space_info->tickets_id) {
802 			flush_state++;
803 		} else {
804 			last_tickets_id = space_info->tickets_id;
805 			flush_state = FLUSH_DELAYED_ITEMS_NR;
806 			if (commit_cycles)
807 				commit_cycles--;
808 		}
809 
810 		/*
811 		 * We don't want to force a chunk allocation until we've tried
812 		 * pretty hard to reclaim space.  Think of the case where we
813 		 * freed up a bunch of space and so have a lot of pinned space
814 		 * to reclaim.  We would rather use that than possibly create an
815 		 * underutilized metadata chunk.  So if this is our first run
816 		 * through the flushing state machine, skip ALLOC_CHUNK_FORCE and
817 		 * commit the transaction.  If nothing has changed the next go
818 		 * around then we can force a chunk allocation.
819 		 */
820 		if (flush_state == ALLOC_CHUNK_FORCE && !commit_cycles)
821 			flush_state++;
822 
823 		if (flush_state > COMMIT_TRANS) {
824 			commit_cycles++;
825 			if (commit_cycles > 2) {
826 				if (wake_all_tickets(&space_info->tickets)) {
827 					flush_state = FLUSH_DELAYED_ITEMS_NR;
828 					commit_cycles--;
829 				} else {
830 					space_info->flush = 0;
831 				}
832 			} else {
833 				flush_state = FLUSH_DELAYED_ITEMS_NR;
834 			}
835 		}
836 		spin_unlock(&space_info->lock);
837 	} while (flush_state <= COMMIT_TRANS);
838 }
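
/*
 * Illustrative run of the loop above (hypothetical): if flushing delayed
 * items and delalloc frees nothing, flush_state keeps advancing until
 * COMMIT_TRANS; once the machine has wrapped around with no ticket progress
 * for more than two commit cycles, wake_all_tickets() starts failing waiters
 * with -ENOSPC.  If it hits a ticket that had made partial progress it stops
 * there and the state machine restarts for the remaining waiters, otherwise
 * flushing is switched off entirely.
 */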
839 
840 void btrfs_init_async_reclaim_work(struct work_struct *work)
841 {
842 	INIT_WORK(work, btrfs_async_reclaim_metadata_space);
843 }
844 
845 static const enum btrfs_flush_state priority_flush_states[] = {
846 	FLUSH_DELAYED_ITEMS_NR,
847 	FLUSH_DELAYED_ITEMS,
848 	ALLOC_CHUNK,
849 };
850 
851 static const enum btrfs_flush_state evict_flush_states[] = {
852 	FLUSH_DELAYED_ITEMS_NR,
853 	FLUSH_DELAYED_ITEMS,
854 	FLUSH_DELAYED_REFS_NR,
855 	FLUSH_DELAYED_REFS,
856 	FLUSH_DELALLOC,
857 	FLUSH_DELALLOC_WAIT,
858 	ALLOC_CHUNK,
859 	COMMIT_TRANS,
860 };
861 
862 static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
863 				struct btrfs_space_info *space_info,
864 				struct reserve_ticket *ticket,
865 				const enum btrfs_flush_state *states,
866 				int states_nr)
867 {
868 	u64 to_reclaim;
869 	int flush_state;
870 
871 	spin_lock(&space_info->lock);
872 	to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info,
873 						      false);
874 	if (!to_reclaim) {
875 		spin_unlock(&space_info->lock);
876 		return;
877 	}
878 	spin_unlock(&space_info->lock);
879 
880 	flush_state = 0;
881 	do {
882 		flush_space(fs_info, space_info, to_reclaim, states[flush_state]);
883 		flush_state++;
884 		spin_lock(&space_info->lock);
885 		if (ticket->bytes == 0) {
886 			spin_unlock(&space_info->lock);
887 			return;
888 		}
889 		spin_unlock(&space_info->lock);
890 	} while (flush_state < states_nr);
891 }
892 
893 static void wait_reserve_ticket(struct btrfs_fs_info *fs_info,
894 				struct btrfs_space_info *space_info,
895 				struct reserve_ticket *ticket)
896 
897 {
898 	DEFINE_WAIT(wait);
899 	int ret = 0;
900 
901 	spin_lock(&space_info->lock);
902 	while (ticket->bytes > 0 && ticket->error == 0) {
903 		ret = prepare_to_wait_event(&ticket->wait, &wait, TASK_KILLABLE);
904 		if (ret) {
905 			ticket->error = -EINTR;
906 			break;
907 		}
908 		spin_unlock(&space_info->lock);
909 
910 		schedule();
911 
912 		finish_wait(&ticket->wait, &wait);
913 		spin_lock(&space_info->lock);
914 	}
915 	spin_unlock(&space_info->lock);
916 }
917 
918 /**
919  * handle_reserve_ticket - do the appropriate flushing and waiting for a ticket
920  * @fs_info - the fs
921  * @space_info - the space_info for the reservation
922  * @ticket - the ticket for the reservation
923  * @flush - how much we can flush
924  *
925  * This does the work of figuring out how to flush for the ticket, waiting for
926  * the reservation, and returning the appropriate error if there is one.
927  */
928 static int handle_reserve_ticket(struct btrfs_fs_info *fs_info,
929 				 struct btrfs_space_info *space_info,
930 				 struct reserve_ticket *ticket,
931 				 enum btrfs_reserve_flush_enum flush)
932 {
933 	u64 reclaim_bytes = 0;
934 	int ret;
935 
936 	switch (flush) {
937 	case BTRFS_RESERVE_FLUSH_ALL:
938 		wait_reserve_ticket(fs_info, space_info, ticket);
939 		break;
940 	case BTRFS_RESERVE_FLUSH_LIMIT:
941 		priority_reclaim_metadata_space(fs_info, space_info, ticket,
942 						priority_flush_states,
943 						ARRAY_SIZE(priority_flush_states));
944 		break;
945 	case BTRFS_RESERVE_FLUSH_EVICT:
946 		priority_reclaim_metadata_space(fs_info, space_info, ticket,
947 						evict_flush_states,
948 						ARRAY_SIZE(evict_flush_states));
949 		break;
950 	default:
951 		ASSERT(0);
952 		break;
953 	}
954 
955 	spin_lock(&space_info->lock);
956 	ret = ticket->error;
957 	if (ticket->bytes || ticket->error) {
958 		if (ticket->bytes < ticket->orig_bytes)
959 			reclaim_bytes = ticket->orig_bytes - ticket->bytes;
960 		list_del_init(&ticket->list);
961 		if (!ret)
962 			ret = -ENOSPC;
963 	}
964 	spin_unlock(&space_info->lock);
965 
966 	if (reclaim_bytes)
967 		btrfs_space_info_add_old_bytes(fs_info, space_info,
968 					       reclaim_bytes);
969 	ASSERT(list_empty(&ticket->list));
970 	return ret;
971 }
972 
973 /**
974  * __reserve_metadata_bytes - try to reserve bytes from the space_info's space
975  * @fs_info - the filesystem
976  * @space_info - the space info we want to allocate from
977  * @orig_bytes - the number of bytes we want
978  * @flush - whether or not we can flush to make our reservation
979  * @system_chunk - whether we are reserving on behalf of the chunk root
980  *
981  * This will reserve orig_bytes number of bytes from the given space info.  If
982  * there is not enough space it will make an attempt to flush out space to make
983  * room, by flushing delalloc if possible or committing the transaction.  If
984  * flush is BTRFS_RESERVE_NO_FLUSH then no attempts to regain reservations will
985  * be made and this will fail if there is not enough space already.
986  */
987 static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
988 				    struct btrfs_space_info *space_info,
989 				    u64 orig_bytes,
990 				    enum btrfs_reserve_flush_enum flush,
991 				    bool system_chunk)
992 {
993 	struct reserve_ticket ticket;
994 	u64 used;
995 	int ret = 0;
996 	bool pending_tickets;
997 
998 	ASSERT(orig_bytes);
999 	ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_ALL);
1000 
1001 	spin_lock(&space_info->lock);
1002 	ret = -ENOSPC;
1003 	used = btrfs_space_info_used(space_info, true);
1004 	pending_tickets = !list_empty(&space_info->tickets) ||
1005 		!list_empty(&space_info->priority_tickets);
1006 
1007 	/*
1008 	 * Carry on if we have enough space (short-circuit) OR call
1009 	 * can_overcommit() to ensure we can overcommit to continue.
1010 	 */
1011 	if (!pending_tickets &&
1012 	    ((used + orig_bytes <= space_info->total_bytes) ||
1013 	     can_overcommit(fs_info, space_info, orig_bytes, flush,
1014 			   system_chunk))) {
1015 		btrfs_space_info_update_bytes_may_use(fs_info, space_info,
1016 						      orig_bytes);
1017 		trace_btrfs_space_reservation(fs_info, "space_info",
1018 					      space_info->flags, orig_bytes, 1);
1019 		ret = 0;
1020 	}
1021 
1022 	/*
1023 	 * If we couldn't make a reservation then setup our reservation ticket
1024 	 * and kick the async worker if it's not already running.
1025 	 *
1026 	 * If we are a priority flusher then we just need to add our ticket to
1027 	 * the list and we will do our own flushing further down.
1028 	 */
1029 	if (ret && flush != BTRFS_RESERVE_NO_FLUSH) {
1030 		ticket.orig_bytes = orig_bytes;
1031 		ticket.bytes = orig_bytes;
1032 		ticket.error = 0;
1033 		init_waitqueue_head(&ticket.wait);
1034 		if (flush == BTRFS_RESERVE_FLUSH_ALL) {
1035 			list_add_tail(&ticket.list, &space_info->tickets);
1036 			if (!space_info->flush) {
1037 				space_info->flush = 1;
1038 				trace_btrfs_trigger_flush(fs_info,
1039 							  space_info->flags,
1040 							  orig_bytes, flush,
1041 							  "enospc");
1042 				queue_work(system_unbound_wq,
1043 					   &fs_info->async_reclaim_work);
1044 			}
1045 		} else {
1046 			list_add_tail(&ticket.list,
1047 				      &space_info->priority_tickets);
1048 		}
1049 	} else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
1050 		used += orig_bytes;
1051 		/*
1052 		 * We will do the space reservation dance during log replay,
1053 		 * which means we won't have fs_info->fs_root set, so don't do
1054 		 * the async reclaim as we will panic.
1055 		 */
1056 		if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags) &&
1057 		    need_do_async_reclaim(fs_info, space_info,
1058 					  used, system_chunk) &&
1059 		    !work_busy(&fs_info->async_reclaim_work)) {
1060 			trace_btrfs_trigger_flush(fs_info, space_info->flags,
1061 						  orig_bytes, flush, "preempt");
1062 			queue_work(system_unbound_wq,
1063 				   &fs_info->async_reclaim_work);
1064 		}
1065 	}
1066 	spin_unlock(&space_info->lock);
1067 	if (!ret || flush == BTRFS_RESERVE_NO_FLUSH)
1068 		return ret;
1069 
1070 	return handle_reserve_ticket(fs_info, space_info, &ticket, flush);
1071 }
1072 
1073 /**
1074  * btrfs_reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
1075  * @root - the root we're allocating for
1076  * @block_rsv - the block_rsv we're allocating for
1077  * @orig_bytes - the number of bytes we want
1078  * @flush - whether or not we can flush to make our reservation
1079  *
1080  * This will reserve orig_bytes number of bytes from the space info associated
1081  * with the block_rsv.  If there is not enough space it will make an attempt to
1082  * flush out space to make room.  It will do this by flushing delalloc if
1083  * possible or committing the transaction.  If flush is BTRFS_RESERVE_NO_FLUSH
1084  * then no attempts to regain reservations will be made and this will fail if
1085  * there is not enough space already.
1086  */
1087 int btrfs_reserve_metadata_bytes(struct btrfs_root *root,
1088 				 struct btrfs_block_rsv *block_rsv,
1089 				 u64 orig_bytes,
1090 				 enum btrfs_reserve_flush_enum flush)
1091 {
1092 	struct btrfs_fs_info *fs_info = root->fs_info;
1093 	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
1094 	int ret;
1095 	bool system_chunk = (root == fs_info->chunk_root);
1096 
1097 	ret = __reserve_metadata_bytes(fs_info, block_rsv->space_info,
1098 				       orig_bytes, flush, system_chunk);
1099 	if (ret == -ENOSPC &&
1100 	    unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) {
1101 		if (block_rsv != global_rsv &&
1102 		    !btrfs_block_rsv_use_bytes(global_rsv, orig_bytes))
1103 			ret = 0;
1104 	}
1105 	if (ret == -ENOSPC) {
1106 		trace_btrfs_space_reservation(fs_info, "space_info:enospc",
1107 					      block_rsv->space_info->flags,
1108 					      orig_bytes, 1);
1109 
1110 		if (btrfs_test_opt(fs_info, ENOSPC_DEBUG))
1111 			btrfs_dump_space_info(fs_info, block_rsv->space_info,
1112 					      orig_bytes, 0);
1113 	}
1114 	return ret;
1115 }
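
/*
 * Sketch of a typical caller, loosely modelled on btrfs_block_rsv_add() in
 * block-rsv.c (details of the real helper may differ): the reservation is
 * taken against the rsv's space_info first and only credited to the rsv
 * itself once it succeeds, e.g.
 *
 *	ret = btrfs_reserve_metadata_bytes(root, rsv, num_bytes,
 *					   BTRFS_RESERVE_FLUSH_ALL);
 *	if (!ret)
 *		btrfs_block_rsv_add_bytes(rsv, num_bytes, true);
 *	return ret;
 */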
1116