xref: /openbmc/linux/fs/btrfs/space-info.c (revision b882327a)
1 // SPDX-License-Identifier: GPL-2.0
2 
3 #include "ctree.h"
4 #include "space-info.h"
5 #include "sysfs.h"
6 #include "volumes.h"
7 #include "free-space-cache.h"
8 #include "ordered-data.h"
9 #include "transaction.h"
10 #include "math.h"
11 #include "block-group.h"
12 
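/*
 * Return the number of bytes currently charged against this space_info:
 * used + reserved + pinned + readonly, plus bytes_may_use when
 * @may_use_included is true.
 */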
13 u64 btrfs_space_info_used(struct btrfs_space_info *s_info,
14 			  bool may_use_included)
15 {
16 	ASSERT(s_info);
17 	return s_info->bytes_used + s_info->bytes_reserved +
18 		s_info->bytes_pinned + s_info->bytes_readonly +
19 		(may_use_included ? s_info->bytes_may_use : 0);
20 }
21 
22 /*
23  * After adding space to the filesystem, we need to clear the full flags
24  * on all the space infos.
25  */
26 void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
27 {
28 	struct list_head *head = &info->space_info;
29 	struct btrfs_space_info *found;
30 
31 	rcu_read_lock();
32 	list_for_each_entry_rcu(found, head, list)
33 		found->full = 0;
34 	rcu_read_unlock();
35 }
36 
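/*
 * Allocate a new space_info for the block group type bits in @flags,
 * initialize its lists and locks, publish it in sysfs and add it to
 * fs_info->space_info.  Returns 0 on success or a negative errno.
 */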
37 static int create_space_info(struct btrfs_fs_info *info, u64 flags)
38 {
39 
40 	struct btrfs_space_info *space_info;
41 	int i;
42 	int ret;
43 
44 	space_info = kzalloc(sizeof(*space_info), GFP_NOFS);
45 	if (!space_info)
46 		return -ENOMEM;
47 
48 	ret = percpu_counter_init(&space_info->total_bytes_pinned, 0,
49 				 GFP_KERNEL);
50 	if (ret) {
51 		kfree(space_info);
52 		return ret;
53 	}
54 
55 	for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
56 		INIT_LIST_HEAD(&space_info->block_groups[i]);
57 	init_rwsem(&space_info->groups_sem);
58 	spin_lock_init(&space_info->lock);
59 	space_info->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK;
60 	space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
61 	init_waitqueue_head(&space_info->wait);
62 	INIT_LIST_HEAD(&space_info->ro_bgs);
63 	INIT_LIST_HEAD(&space_info->tickets);
64 	INIT_LIST_HEAD(&space_info->priority_tickets);
65 
66 	ret = btrfs_sysfs_add_space_info_type(info, space_info);
67 	if (ret)
68 		return ret;
69 
70 	list_add_rcu(&space_info->list, &info->space_info);
71 	if (flags & BTRFS_BLOCK_GROUP_DATA)
72 		info->data_sinfo = space_info;
73 
74 	return ret;
75 }
76 
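/*
 * Create the space_infos needed at mount time: SYSTEM, plus either a single
 * mixed METADATA|DATA space_info or separate METADATA and DATA ones,
 * depending on the MIXED_GROUPS incompat feature.
 */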
77 int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
78 {
79 	struct btrfs_super_block *disk_super;
80 	u64 features;
81 	u64 flags;
82 	int mixed = 0;
83 	int ret;
84 
85 	disk_super = fs_info->super_copy;
86 	if (!btrfs_super_root(disk_super))
87 		return -EINVAL;
88 
89 	features = btrfs_super_incompat_flags(disk_super);
90 	if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
91 		mixed = 1;
92 
93 	flags = BTRFS_BLOCK_GROUP_SYSTEM;
94 	ret = create_space_info(fs_info, flags);
95 	if (ret)
96 		goto out;
97 
98 	if (mixed) {
99 		flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA;
100 		ret = create_space_info(fs_info, flags);
101 	} else {
102 		flags = BTRFS_BLOCK_GROUP_METADATA;
103 		ret = create_space_info(fs_info, flags);
104 		if (ret)
105 			goto out;
106 
107 		flags = BTRFS_BLOCK_GROUP_DATA;
108 		ret = create_space_info(fs_info, flags);
109 	}
110 out:
111 	return ret;
112 }
113 
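/*
 * Account a block group against its space_info: bump the logical and
 * on-disk totals (scaled by the raid factor), record the used and readonly
 * bytes, clear the full flag and hand the unused portion to any waiting
 * tickets via btrfs_space_info_add_new_bytes().
 */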
114 void btrfs_update_space_info(struct btrfs_fs_info *info, u64 flags,
115 			     u64 total_bytes, u64 bytes_used,
116 			     u64 bytes_readonly,
117 			     struct btrfs_space_info **space_info)
118 {
119 	struct btrfs_space_info *found;
120 	int factor;
121 
122 	factor = btrfs_bg_type_to_factor(flags);
123 
124 	found = btrfs_find_space_info(info, flags);
125 	ASSERT(found);
126 	spin_lock(&found->lock);
127 	found->total_bytes += total_bytes;
128 	found->disk_total += total_bytes * factor;
129 	found->bytes_used += bytes_used;
130 	found->disk_used += bytes_used * factor;
131 	found->bytes_readonly += bytes_readonly;
132 	if (total_bytes > 0)
133 		found->full = 0;
134 	btrfs_space_info_add_new_bytes(info, found,
135 				       total_bytes - bytes_used -
136 				       bytes_readonly);
137 	spin_unlock(&found->lock);
138 	*space_info = found;
139 }
140 
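/*
 * Find the space_info whose type bits overlap @flags (only the
 * BTRFS_BLOCK_GROUP_TYPE_MASK bits are considered), or NULL if none exists.
 */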
141 struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info,
142 					       u64 flags)
143 {
144 	struct list_head *head = &info->space_info;
145 	struct btrfs_space_info *found;
146 
147 	flags &= BTRFS_BLOCK_GROUP_TYPE_MASK;
148 
149 	rcu_read_lock();
150 	list_for_each_entry_rcu(found, head, list) {
151 		if (found->flags & flags) {
152 			rcu_read_unlock();
153 			return found;
154 		}
155 	}
156 	rcu_read_unlock();
157 	return NULL;
158 }
159 
160 static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global)
161 {
162 	return (global->size << 1);
163 }
164 
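/*
 * Decide whether a reservation of @bytes may exceed the space already
 * allocated to chunks for this space_info.  Data is never overcommitted;
 * for metadata/system we allow it based on the remaining unallocated
 * device space, scaled by the raid factor and how hard we can flush.
 */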
165 static int can_overcommit(struct btrfs_fs_info *fs_info,
166 			  struct btrfs_space_info *space_info, u64 bytes,
167 			  enum btrfs_reserve_flush_enum flush,
168 			  bool system_chunk)
169 {
170 	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
171 	u64 profile;
172 	u64 space_size;
173 	u64 avail;
174 	u64 used;
175 	int factor;
176 
177 	/* Don't overcommit when in mixed mode. */
178 	if (space_info->flags & BTRFS_BLOCK_GROUP_DATA)
179 		return 0;
180 
181 	if (system_chunk)
182 		profile = btrfs_system_alloc_profile(fs_info);
183 	else
184 		profile = btrfs_metadata_alloc_profile(fs_info);
185 
186 	used = btrfs_space_info_used(space_info, false);
187 
188 	/*
189 	 * We only want to allow overcommitting if we have lots of actual space
190 	 * free.  If we don't have enough space to handle the global reserve
191 	 * then we could end up with a real ENOSPC problem when trying to
192 	 * allocate a chunk or some other such important allocation.
193 	 */
194 	spin_lock(&global_rsv->lock);
195 	space_size = calc_global_rsv_need_space(global_rsv);
196 	spin_unlock(&global_rsv->lock);
197 	if (used + space_size >= space_info->total_bytes)
198 		return 0;
199 
200 	used += space_info->bytes_may_use;
201 
202 	avail = atomic64_read(&fs_info->free_chunk_space);
203 
204 	/*
205 	 * If we have dup, raid1 or raid10 then only half of the free
206 	 * space is actually usable.  For raid56, the space info used
207 	 * doesn't include the parity drive, so we don't have to
208 	 * change the math.
209 	 */
210 	factor = btrfs_bg_type_to_factor(profile);
211 	avail = div_u64(avail, factor);
212 
213 	/*
214 	 * If we aren't flushing all things, let us overcommit up to
215 	 * half of the space.  If we can flush, don't let us overcommit
216 	 * too much, let it overcommit up to 1/8 of the space.
217 	 */
218 	if (flush == BTRFS_RESERVE_FLUSH_ALL)
219 		avail >>= 3;
220 	else
221 		avail >>= 1;
222 
223 	if (used + bytes < space_info->total_bytes + avail)
224 		return 1;
225 	return 0;
226 }
227 
228 /*
229  * This is for space we already have accounted in space_info->bytes_may_use, so
230  * basically when we're returning space from block_rsvs.
231  */
232 void btrfs_space_info_add_old_bytes(struct btrfs_fs_info *fs_info,
233 				    struct btrfs_space_info *space_info,
234 				    u64 num_bytes)
235 {
236 	struct reserve_ticket *ticket;
237 	struct list_head *head;
238 	u64 used;
239 	enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_NO_FLUSH;
240 	bool check_overcommit = false;
241 
242 	spin_lock(&space_info->lock);
243 	head = &space_info->priority_tickets;
244 
245 	/*
246 	 * If we are over our limit then we need to check and see if we can
247 	 * overcommit, and if we can't then we just need to free up our space
248 	 * and not satisfy any requests.
249 	 */
250 	used = btrfs_space_info_used(space_info, true);
251 	if (used - num_bytes >= space_info->total_bytes)
252 		check_overcommit = true;
253 again:
254 	while (!list_empty(head) && num_bytes) {
255 		ticket = list_first_entry(head, struct reserve_ticket,
256 					  list);
257 		/*
258 		 * We use 0 bytes because this space is already reserved, so
259 		 * adding the ticket space would be a double count.
260 		 */
261 		if (check_overcommit &&
262 		    !can_overcommit(fs_info, space_info, 0, flush, false))
263 			break;
264 		if (num_bytes >= ticket->bytes) {
265 			list_del_init(&ticket->list);
266 			num_bytes -= ticket->bytes;
267 			ticket->bytes = 0;
268 			space_info->tickets_id++;
269 			wake_up(&ticket->wait);
270 		} else {
271 			ticket->bytes -= num_bytes;
272 			num_bytes = 0;
273 		}
274 	}
275 
276 	if (num_bytes && head == &space_info->priority_tickets) {
277 		head = &space_info->tickets;
278 		flush = BTRFS_RESERVE_FLUSH_ALL;
279 		goto again;
280 	}
281 	btrfs_space_info_update_bytes_may_use(fs_info, space_info, -num_bytes);
282 	trace_btrfs_space_reservation(fs_info, "space_info",
283 				      space_info->flags, num_bytes, 0);
284 	spin_unlock(&space_info->lock);
285 }
286 
287 /*
288  * This is for newly allocated space that isn't accounted in
289  * space_info->bytes_may_use yet.  So if we allocate a chunk or unpin an extent
290  * we use this helper.
291  */
292 void btrfs_space_info_add_new_bytes(struct btrfs_fs_info *fs_info,
293 				    struct btrfs_space_info *space_info,
294 				    u64 num_bytes)
295 {
296 	struct reserve_ticket *ticket;
297 	struct list_head *head = &space_info->priority_tickets;
298 
299 again:
300 	while (!list_empty(head) && num_bytes) {
301 		ticket = list_first_entry(head, struct reserve_ticket,
302 					  list);
303 		if (num_bytes >= ticket->bytes) {
304 			trace_btrfs_space_reservation(fs_info, "space_info",
305 						      space_info->flags,
306 						      ticket->bytes, 1);
307 			list_del_init(&ticket->list);
308 			num_bytes -= ticket->bytes;
309 			btrfs_space_info_update_bytes_may_use(fs_info,
310 							      space_info,
311 							      ticket->bytes);
312 			ticket->bytes = 0;
313 			space_info->tickets_id++;
314 			wake_up(&ticket->wait);
315 		} else {
316 			trace_btrfs_space_reservation(fs_info, "space_info",
317 						      space_info->flags,
318 						      num_bytes, 1);
319 			btrfs_space_info_update_bytes_may_use(fs_info,
320 							      space_info,
321 							      num_bytes);
322 			ticket->bytes -= num_bytes;
323 			num_bytes = 0;
324 		}
325 	}
326 
327 	if (num_bytes && head == &space_info->priority_tickets) {
328 		head = &space_info->tickets;
329 		goto again;
330 	}
331 }
332 
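/* Print the size and reserved bytes of one block reserve under its lock. */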
333 #define DUMP_BLOCK_RSV(fs_info, rsv_name)				\
334 do {									\
335 	struct btrfs_block_rsv *__rsv = &(fs_info)->rsv_name;		\
336 	spin_lock(&__rsv->lock);					\
337 	btrfs_info(fs_info, #rsv_name ": size %llu reserved %llu",	\
338 		   __rsv->size, __rsv->reserved);			\
339 	spin_unlock(&__rsv->lock);					\
340 } while (0)
341 
342 void btrfs_dump_space_info(struct btrfs_fs_info *fs_info,
343 			   struct btrfs_space_info *info, u64 bytes,
344 			   int dump_block_groups)
345 {
346 	struct btrfs_block_group_cache *cache;
347 	int index = 0;
348 
349 	spin_lock(&info->lock);
350 	btrfs_info(fs_info, "space_info %llu has %llu free, is %sfull",
351 		   info->flags,
352 		   info->total_bytes - btrfs_space_info_used(info, true),
353 		   info->full ? "" : "not ");
354 	btrfs_info(fs_info,
355 		"space_info total=%llu, used=%llu, pinned=%llu, reserved=%llu, may_use=%llu, readonly=%llu",
356 		info->total_bytes, info->bytes_used, info->bytes_pinned,
357 		info->bytes_reserved, info->bytes_may_use,
358 		info->bytes_readonly);
359 	spin_unlock(&info->lock);
360 
361 	DUMP_BLOCK_RSV(fs_info, global_block_rsv);
362 	DUMP_BLOCK_RSV(fs_info, trans_block_rsv);
363 	DUMP_BLOCK_RSV(fs_info, chunk_block_rsv);
364 	DUMP_BLOCK_RSV(fs_info, delayed_block_rsv);
365 	DUMP_BLOCK_RSV(fs_info, delayed_refs_rsv);
366 
367 	if (!dump_block_groups)
368 		return;
369 
370 	down_read(&info->groups_sem);
371 again:
372 	list_for_each_entry(cache, &info->block_groups[index], list) {
373 		spin_lock(&cache->lock);
374 		btrfs_info(fs_info,
375 			"block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %s",
376 			cache->key.objectid, cache->key.offset,
377 			btrfs_block_group_used(&cache->item), cache->pinned,
378 			cache->reserved, cache->ro ? "[readonly]" : "");
379 		btrfs_dump_free_space(cache, bytes);
380 		spin_unlock(&cache->lock);
381 	}
382 	if (++index < BTRFS_NR_RAID_TYPES)
383 		goto again;
384 	up_read(&info->groups_sem);
385 }
386 
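/*
 * Start writeback of dirty pages: through the VFS when we can take
 * s_umount, otherwise by kicking delalloc on the btrfs roots directly and,
 * if we are not inside a transaction, waiting for ordered extents.
 */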
387 static void btrfs_writeback_inodes_sb_nr(struct btrfs_fs_info *fs_info,
388 					 unsigned long nr_pages, int nr_items)
389 {
390 	struct super_block *sb = fs_info->sb;
391 
392 	if (down_read_trylock(&sb->s_umount)) {
393 		writeback_inodes_sb_nr(sb, nr_pages, WB_REASON_FS_FREE_SPACE);
394 		up_read(&sb->s_umount);
395 	} else {
396 		/*
397 		 * We needn't worry about the filesystem going from r/w to r/o
398 		 * even though we don't acquire the ->s_umount mutex, because
399 		 * the filesystem should guarantee that the delalloc inode list
400 		 * is empty once it is read-only (all dirty pages have been
401 		 * written to disk).
402 		 */
403 		btrfs_start_delalloc_roots(fs_info, nr_items);
404 		if (!current->journal_info)
405 			btrfs_wait_ordered_roots(fs_info, nr_items, 0, (u64)-1);
406 	}
407 }
408 
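/*
 * Convert a byte amount into the number of metadata items to reclaim,
 * based on the worst case size of a one item transaction reservation.
 */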
409 static inline u64 calc_reclaim_items_nr(struct btrfs_fs_info *fs_info,
410 					u64 to_reclaim)
411 {
412 	u64 bytes;
413 	u64 nr;
414 
415 	bytes = btrfs_calc_trans_metadata_size(fs_info, 1);
416 	nr = div64_u64(to_reclaim, bytes);
417 	if (!nr)
418 		nr = 1;
419 	return nr;
420 }
421 
422 #define EXTENT_SIZE_PER_ITEM	SZ_256K
423 
424 /*
425  * shrink metadata reservation for delalloc
426  */
427 static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim,
428 			    u64 orig, bool wait_ordered)
429 {
430 	struct btrfs_space_info *space_info;
431 	struct btrfs_trans_handle *trans;
432 	u64 delalloc_bytes;
433 	u64 dio_bytes;
434 	u64 async_pages;
435 	u64 items;
436 	long time_left;
437 	unsigned long nr_pages;
438 	int loops;
439 
440 	/* Calculate the number of items to flush for this space reservation */
441 	items = calc_reclaim_items_nr(fs_info, to_reclaim);
442 	to_reclaim = items * EXTENT_SIZE_PER_ITEM;
443 
444 	trans = (struct btrfs_trans_handle *)current->journal_info;
445 	space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
446 
447 	delalloc_bytes = percpu_counter_sum_positive(
448 						&fs_info->delalloc_bytes);
449 	dio_bytes = percpu_counter_sum_positive(&fs_info->dio_bytes);
450 	if (delalloc_bytes == 0 && dio_bytes == 0) {
451 		if (trans)
452 			return;
453 		if (wait_ordered)
454 			btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1);
455 		return;
456 	}
457 
458 	/*
459 	 * If we are doing more ordered than delalloc we need to just wait on
460 	 * ordered extents, otherwise we'll waste time trying to flush delalloc
461 	 * that likely won't give us the space back we need.
462 	 */
463 	if (dio_bytes > delalloc_bytes)
464 		wait_ordered = true;
465 
466 	loops = 0;
467 	while ((delalloc_bytes || dio_bytes) && loops < 3) {
468 		nr_pages = min(delalloc_bytes, to_reclaim) >> PAGE_SHIFT;
469 
470 		/*
471 		 * Triggers inode writeback for up to nr_pages. This will invoke
472 		 * the ->writepages callback and trigger delalloc filling
473 		 * (btrfs_run_delalloc_range()).
474 		 */
475 		btrfs_writeback_inodes_sb_nr(fs_info, nr_pages, items);
476 
477 		/*
478 		 * We need to wait for the compressed pages to start before
479 		 * we continue.
480 		 */
481 		async_pages = atomic_read(&fs_info->async_delalloc_pages);
482 		if (!async_pages)
483 			goto skip_async;
484 
485 		/*
486 		 * Calculate how many compressed pages we want to be written
487 		 * before we continue, i.e. if there are more async pages than we
488 		 * require, wait_event will wait until nr_pages have been written.
489 		 */
490 		if (async_pages <= nr_pages)
491 			async_pages = 0;
492 		else
493 			async_pages -= nr_pages;
494 
495 		wait_event(fs_info->async_submit_wait,
496 			   atomic_read(&fs_info->async_delalloc_pages) <=
497 			   (int)async_pages);
498 skip_async:
499 		spin_lock(&space_info->lock);
500 		if (list_empty(&space_info->tickets) &&
501 		    list_empty(&space_info->priority_tickets)) {
502 			spin_unlock(&space_info->lock);
503 			break;
504 		}
505 		spin_unlock(&space_info->lock);
506 
507 		loops++;
508 		if (wait_ordered && !trans) {
509 			btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1);
510 		} else {
511 			time_left = schedule_timeout_killable(1);
512 			if (time_left)
513 				break;
514 		}
515 		delalloc_bytes = percpu_counter_sum_positive(
516 						&fs_info->delalloc_bytes);
517 		dio_bytes = percpu_counter_sum_positive(&fs_info->dio_bytes);
518 	}
519 }
520 
521 /**
522  * may_commit_transaction - possibly commit the transaction if it's ok to
523  * @fs_info - the filesystem
524  * @space_info - the space_info we're trying to satisfy a reservation for;
525  *               the ticket at the head of its queue sets the bytes we need
526  *
527  * This will check to make sure that committing the transaction will actually
528  * get us somewhere and then commit the transaction if it does.  Otherwise it
529  * will return -ENOSPC.
530  */
531 static int may_commit_transaction(struct btrfs_fs_info *fs_info,
532 				  struct btrfs_space_info *space_info)
533 {
534 	struct reserve_ticket *ticket = NULL;
535 	struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_block_rsv;
536 	struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
537 	struct btrfs_trans_handle *trans;
538 	u64 bytes_needed;
539 	u64 reclaim_bytes = 0;
540 
541 	trans = (struct btrfs_trans_handle *)current->journal_info;
542 	if (trans)
543 		return -EAGAIN;
544 
545 	spin_lock(&space_info->lock);
546 	if (!list_empty(&space_info->priority_tickets))
547 		ticket = list_first_entry(&space_info->priority_tickets,
548 					  struct reserve_ticket, list);
549 	else if (!list_empty(&space_info->tickets))
550 		ticket = list_first_entry(&space_info->tickets,
551 					  struct reserve_ticket, list);
552 	bytes_needed = (ticket) ? ticket->bytes : 0;
553 	spin_unlock(&space_info->lock);
554 
555 	if (!bytes_needed)
556 		return 0;
557 
558 	trans = btrfs_join_transaction(fs_info->extent_root);
559 	if (IS_ERR(trans))
560 		return PTR_ERR(trans);
561 
562 	/*
563 	 * See if there is enough pinned space to make this reservation, or if
564 	 * we have block groups that are going to be freed, allowing us to
565 	 * possibly do a chunk allocation the next loop through.
566 	 */
567 	if (test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags) ||
568 	    __percpu_counter_compare(&space_info->total_bytes_pinned,
569 				     bytes_needed,
570 				     BTRFS_TOTAL_BYTES_PINNED_BATCH) >= 0)
571 		goto commit;
572 
573 	/*
574 	 * See if there is some space in the delayed insertion reservation for
575 	 * this reservation.
576 	 */
577 	if (space_info != delayed_rsv->space_info)
578 		goto enospc;
579 
580 	spin_lock(&delayed_rsv->lock);
581 	reclaim_bytes += delayed_rsv->reserved;
582 	spin_unlock(&delayed_rsv->lock);
583 
584 	spin_lock(&delayed_refs_rsv->lock);
585 	reclaim_bytes += delayed_refs_rsv->reserved;
586 	spin_unlock(&delayed_refs_rsv->lock);
587 	if (reclaim_bytes >= bytes_needed)
588 		goto commit;
589 	bytes_needed -= reclaim_bytes;
590 
591 	if (__percpu_counter_compare(&space_info->total_bytes_pinned,
592 				   bytes_needed,
593 				   BTRFS_TOTAL_BYTES_PINNED_BATCH) < 0)
594 		goto enospc;
595 
596 commit:
597 	return btrfs_commit_transaction(trans);
598 enospc:
599 	btrfs_end_transaction(trans);
600 	return -ENOSPC;
601 }
602 
603 /*
604  * Try to flush some space based on the policy set by @state. This is only advisory
605  * and may fail for various reasons. The caller is supposed to examine the
606  * state of @space_info to detect the outcome.
607  */
608 static void flush_space(struct btrfs_fs_info *fs_info,
609 		       struct btrfs_space_info *space_info, u64 num_bytes,
610 		       int state)
611 {
612 	struct btrfs_root *root = fs_info->extent_root;
613 	struct btrfs_trans_handle *trans;
614 	int nr;
615 	int ret = 0;
616 
617 	switch (state) {
618 	case FLUSH_DELAYED_ITEMS_NR:
619 	case FLUSH_DELAYED_ITEMS:
620 		if (state == FLUSH_DELAYED_ITEMS_NR)
621 			nr = calc_reclaim_items_nr(fs_info, num_bytes) * 2;
622 		else
623 			nr = -1;
624 
625 		trans = btrfs_join_transaction(root);
626 		if (IS_ERR(trans)) {
627 			ret = PTR_ERR(trans);
628 			break;
629 		}
630 		ret = btrfs_run_delayed_items_nr(trans, nr);
631 		btrfs_end_transaction(trans);
632 		break;
633 	case FLUSH_DELALLOC:
634 	case FLUSH_DELALLOC_WAIT:
635 		shrink_delalloc(fs_info, num_bytes * 2, num_bytes,
636 				state == FLUSH_DELALLOC_WAIT);
637 		break;
638 	case FLUSH_DELAYED_REFS_NR:
639 	case FLUSH_DELAYED_REFS:
640 		trans = btrfs_join_transaction(root);
641 		if (IS_ERR(trans)) {
642 			ret = PTR_ERR(trans);
643 			break;
644 		}
645 		if (state == FLUSH_DELAYED_REFS_NR)
646 			nr = calc_reclaim_items_nr(fs_info, num_bytes);
647 		else
648 			nr = 0;
649 		btrfs_run_delayed_refs(trans, nr);
650 		btrfs_end_transaction(trans);
651 		break;
652 	case ALLOC_CHUNK:
653 	case ALLOC_CHUNK_FORCE:
654 		trans = btrfs_join_transaction(root);
655 		if (IS_ERR(trans)) {
656 			ret = PTR_ERR(trans);
657 			break;
658 		}
659 		ret = btrfs_chunk_alloc(trans,
660 				btrfs_metadata_alloc_profile(fs_info),
661 				(state == ALLOC_CHUNK) ? CHUNK_ALLOC_NO_FORCE :
662 					CHUNK_ALLOC_FORCE);
663 		btrfs_end_transaction(trans);
664 		if (ret > 0 || ret == -ENOSPC)
665 			ret = 0;
666 		break;
667 	case COMMIT_TRANS:
668 		/*
669 		 * If we have pending delayed iputs then we could free up a
670 		 * bunch of pinned space, so make sure we run the iputs before
671 		 * we do our pinned bytes check below.
672 		 */
673 		btrfs_run_delayed_iputs(fs_info);
674 		btrfs_wait_on_delayed_iputs(fs_info);
675 
676 		ret = may_commit_transaction(fs_info, space_info);
677 		break;
678 	default:
679 		ret = -ENOSPC;
680 		break;
681 	}
682 
683 	trace_btrfs_flush_space(fs_info, space_info->flags, num_bytes, state,
684 				ret);
685 	return;
686 }
687 
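/*
 * Work out how many bytes of metadata space we should try to reclaim: the
 * sum of all outstanding tickets if there are any, otherwise a heuristic
 * based on how far usage is above roughly 90-95% of the space_info, capped
 * by what is actually reclaimable (may_use + reserved).
 */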
688 static inline u64
689 btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info,
690 				 struct btrfs_space_info *space_info,
691 				 bool system_chunk)
692 {
693 	struct reserve_ticket *ticket;
694 	u64 used;
695 	u64 expected;
696 	u64 to_reclaim = 0;
697 
698 	list_for_each_entry(ticket, &space_info->tickets, list)
699 		to_reclaim += ticket->bytes;
700 	list_for_each_entry(ticket, &space_info->priority_tickets, list)
701 		to_reclaim += ticket->bytes;
702 	if (to_reclaim)
703 		return to_reclaim;
704 
705 	to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M);
706 	if (can_overcommit(fs_info, space_info, to_reclaim,
707 			   BTRFS_RESERVE_FLUSH_ALL, system_chunk))
708 		return 0;
709 
710 	used = btrfs_space_info_used(space_info, true);
711 
712 	if (can_overcommit(fs_info, space_info, SZ_1M,
713 			   BTRFS_RESERVE_FLUSH_ALL, system_chunk))
714 		expected = div_factor_fine(space_info->total_bytes, 95);
715 	else
716 		expected = div_factor_fine(space_info->total_bytes, 90);
717 
718 	if (used > expected)
719 		to_reclaim = used - expected;
720 	else
721 		to_reclaim = 0;
722 	to_reclaim = min(to_reclaim, space_info->bytes_may_use +
723 				     space_info->bytes_reserved);
724 	return to_reclaim;
725 }
726 
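/*
 * Decide whether the background flusher should run preemptively: overall
 * usage must be above ~98% of the space_info (but not because it is simply
 * full of used/reserved bytes), there must be something to reclaim, and we
 * must not be unmounting or remounting.
 */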
727 static inline int need_do_async_reclaim(struct btrfs_fs_info *fs_info,
728 					struct btrfs_space_info *space_info,
729 					u64 used, bool system_chunk)
730 {
731 	u64 thresh = div_factor_fine(space_info->total_bytes, 98);
732 
733 	/* If we're just plain full then async reclaim just slows us down. */
734 	if ((space_info->bytes_used + space_info->bytes_reserved) >= thresh)
735 		return 0;
736 
737 	if (!btrfs_calc_reclaim_metadata_size(fs_info, space_info,
738 					      system_chunk))
739 		return 0;
740 
741 	return (used >= thresh && !btrfs_fs_closing(fs_info) &&
742 		!test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state));
743 }
744 
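/*
 * Fail every remaining ticket with -ENOSPC and wake its waiter.  Returns
 * true if any ticket had already been granted part of its reservation,
 * which tells the caller another round of flushing may still help.
 */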
745 static bool wake_all_tickets(struct list_head *head)
746 {
747 	struct reserve_ticket *ticket;
748 
749 	while (!list_empty(head)) {
750 		ticket = list_first_entry(head, struct reserve_ticket, list);
751 		list_del_init(&ticket->list);
752 		ticket->error = -ENOSPC;
753 		wake_up(&ticket->wait);
754 		if (ticket->bytes != ticket->orig_bytes)
755 			return true;
756 	}
757 	return false;
758 }
759 
760 /*
761  * This is for normal flushers; we can wait all goddamned day if we want to.  We
762  * will loop and continuously try to flush as long as we are making progress.
763  * We count progress as clearing off tickets each time we have to loop.
764  */
765 static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
766 {
767 	struct btrfs_fs_info *fs_info;
768 	struct btrfs_space_info *space_info;
769 	u64 to_reclaim;
770 	int flush_state;
771 	int commit_cycles = 0;
772 	u64 last_tickets_id;
773 
774 	fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work);
775 	space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
776 
777 	spin_lock(&space_info->lock);
778 	to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info,
779 						      false);
780 	if (!to_reclaim) {
781 		space_info->flush = 0;
782 		spin_unlock(&space_info->lock);
783 		return;
784 	}
785 	last_tickets_id = space_info->tickets_id;
786 	spin_unlock(&space_info->lock);
787 
788 	flush_state = FLUSH_DELAYED_ITEMS_NR;
789 	do {
790 		flush_space(fs_info, space_info, to_reclaim, flush_state);
791 		spin_lock(&space_info->lock);
792 		if (list_empty(&space_info->tickets)) {
793 			space_info->flush = 0;
794 			spin_unlock(&space_info->lock);
795 			return;
796 		}
797 		to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info,
798 							      space_info,
799 							      false);
800 		if (last_tickets_id == space_info->tickets_id) {
801 			flush_state++;
802 		} else {
803 			last_tickets_id = space_info->tickets_id;
804 			flush_state = FLUSH_DELAYED_ITEMS_NR;
805 			if (commit_cycles)
806 				commit_cycles--;
807 		}
808 
809 		/*
810 		 * We don't want to force a chunk allocation until we've tried
811 		 * pretty hard to reclaim space.  Think of the case where we
812 		 * freed up a bunch of space and so have a lot of pinned space
813 		 * to reclaim.  We would rather use that than possibly create an
814 		 * underutilized metadata chunk.  So if this is our first run
815 		 * through the flushing state machine skip ALLOC_CHUNK_FORCE and
816 		 * commit the transaction.  If nothing has changed the next go
817 		 * around then we can force a chunk allocation.
818 		 */
819 		if (flush_state == ALLOC_CHUNK_FORCE && !commit_cycles)
820 			flush_state++;
821 
822 		if (flush_state > COMMIT_TRANS) {
823 			commit_cycles++;
824 			if (commit_cycles > 2) {
825 				if (wake_all_tickets(&space_info->tickets)) {
826 					flush_state = FLUSH_DELAYED_ITEMS_NR;
827 					commit_cycles--;
828 				} else {
829 					space_info->flush = 0;
830 				}
831 			} else {
832 				flush_state = FLUSH_DELAYED_ITEMS_NR;
833 			}
834 		}
835 		spin_unlock(&space_info->lock);
836 	} while (flush_state <= COMMIT_TRANS);
837 }
838 
839 void btrfs_init_async_reclaim_work(struct work_struct *work)
840 {
841 	INIT_WORK(work, btrfs_async_reclaim_metadata_space);
842 }
843 
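/*
 * Reduced list of flush states used for priority (non-FLUSH_ALL)
 * reservations; it skips the expensive steps such as flushing delalloc and
 * committing the transaction.
 */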
844 static const enum btrfs_flush_state priority_flush_states[] = {
845 	FLUSH_DELAYED_ITEMS_NR,
846 	FLUSH_DELAYED_ITEMS,
847 	ALLOC_CHUNK,
848 };
849 
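/*
 * Flush synchronously in the reserving task's own context for a priority
 * ticket, walking priority_flush_states until the ticket is satisfied or
 * the states run out.
 */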
850 static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
851 					    struct btrfs_space_info *space_info,
852 					    struct reserve_ticket *ticket)
853 {
854 	u64 to_reclaim;
855 	int flush_state;
856 
857 	spin_lock(&space_info->lock);
858 	to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info,
859 						      false);
860 	if (!to_reclaim) {
861 		spin_unlock(&space_info->lock);
862 		return;
863 	}
864 	spin_unlock(&space_info->lock);
865 
866 	flush_state = 0;
867 	do {
868 		flush_space(fs_info, space_info, to_reclaim,
869 			    priority_flush_states[flush_state]);
870 		flush_state++;
871 		spin_lock(&space_info->lock);
872 		if (ticket->bytes == 0) {
873 			spin_unlock(&space_info->lock);
874 			return;
875 		}
876 		spin_unlock(&space_info->lock);
877 	} while (flush_state < ARRAY_SIZE(priority_flush_states));
878 }
879 
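/*
 * Sleep until the async flusher either satisfies this ticket or fails it
 * with an error.  Any partially granted space is returned to the
 * space_info so other waiters can use it.
 */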
880 static int wait_reserve_ticket(struct btrfs_fs_info *fs_info,
881 			       struct btrfs_space_info *space_info,
882 			       struct reserve_ticket *ticket)
883 
884 {
885 	DEFINE_WAIT(wait);
886 	u64 reclaim_bytes = 0;
887 	int ret = 0;
888 
889 	spin_lock(&space_info->lock);
890 	while (ticket->bytes > 0 && ticket->error == 0) {
891 		ret = prepare_to_wait_event(&ticket->wait, &wait, TASK_KILLABLE);
892 		if (ret) {
893 			ret = -EINTR;
894 			break;
895 		}
896 		spin_unlock(&space_info->lock);
897 
898 		schedule();
899 
900 		finish_wait(&ticket->wait, &wait);
901 		spin_lock(&space_info->lock);
902 	}
903 	if (!ret)
904 		ret = ticket->error;
905 	if (!list_empty(&ticket->list))
906 		list_del_init(&ticket->list);
907 	if (ticket->bytes && ticket->bytes < ticket->orig_bytes)
908 		reclaim_bytes = ticket->orig_bytes - ticket->bytes;
909 	spin_unlock(&space_info->lock);
910 
911 	if (reclaim_bytes)
912 		btrfs_space_info_add_old_bytes(fs_info, space_info,
913 					       reclaim_bytes);
914 	return ret;
915 }
916 
917 /**
918  * __reserve_metadata_bytes - try to reserve bytes from a space_info
919  * @fs_info - the filesystem we're allocating in
920  * @space_info - the space info we want to allocate from
921  * @orig_bytes - the number of bytes we want
922  * @flush - whether or not we can flush to make our reservation
923  * @system_chunk - true when this reservation is on behalf of the chunk root
924  *
925  * This will reserve orig_bytes number of bytes from @space_info.  If there is
926  * not enough space it will make an attempt to flush out space to make room,
927  * by flushing delalloc if possible or committing the transaction.  If @flush
928  * is BTRFS_RESERVE_NO_FLUSH then no attempt to regain reservations will be
929  * made and this will fail if there is not enough space already.
930  */
931 static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
932 				    struct btrfs_space_info *space_info,
933 				    u64 orig_bytes,
934 				    enum btrfs_reserve_flush_enum flush,
935 				    bool system_chunk)
936 {
937 	struct reserve_ticket ticket;
938 	u64 used;
939 	u64 reclaim_bytes = 0;
940 	int ret = 0;
941 
942 	ASSERT(orig_bytes);
943 	ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_ALL);
944 
945 	spin_lock(&space_info->lock);
946 	ret = -ENOSPC;
947 	used = btrfs_space_info_used(space_info, true);
948 
949 	/*
950 	 * Carry on if we have enough space (short-circuit) OR call
951 	 * can_overcommit() to ensure we can overcommit to continue.
952 	 */
953 	if ((used + orig_bytes <= space_info->total_bytes) ||
954 	    can_overcommit(fs_info, space_info, orig_bytes, flush,
955 			   system_chunk)) {
956 		btrfs_space_info_update_bytes_may_use(fs_info, space_info,
957 						      orig_bytes);
958 		trace_btrfs_space_reservation(fs_info, "space_info",
959 					      space_info->flags, orig_bytes, 1);
960 		ret = 0;
961 	}
962 
963 	/*
964 	 * If we couldn't make a reservation then set up our reservation ticket
965 	 * and kick the async worker if it's not already running.
966 	 *
967 	 * If we are a priority flusher then we just need to add our ticket to
968 	 * the list and we will do our own flushing further down.
969 	 */
970 	if (ret && flush != BTRFS_RESERVE_NO_FLUSH) {
971 		ticket.orig_bytes = orig_bytes;
972 		ticket.bytes = orig_bytes;
973 		ticket.error = 0;
974 		init_waitqueue_head(&ticket.wait);
975 		if (flush == BTRFS_RESERVE_FLUSH_ALL) {
976 			list_add_tail(&ticket.list, &space_info->tickets);
977 			if (!space_info->flush) {
978 				space_info->flush = 1;
979 				trace_btrfs_trigger_flush(fs_info,
980 							  space_info->flags,
981 							  orig_bytes, flush,
982 							  "enospc");
983 				queue_work(system_unbound_wq,
984 					   &fs_info->async_reclaim_work);
985 			}
986 		} else {
987 			list_add_tail(&ticket.list,
988 				      &space_info->priority_tickets);
989 		}
990 	} else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
991 		used += orig_bytes;
992 		/*
993 		 * We will do the space reservation dance during log replay,
994 		 * which means we won't have fs_info->fs_root set, so don't do
995 		 * the async reclaim as we will panic.
996 		 */
997 		if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags) &&
998 		    need_do_async_reclaim(fs_info, space_info,
999 					  used, system_chunk) &&
1000 		    !work_busy(&fs_info->async_reclaim_work)) {
1001 			trace_btrfs_trigger_flush(fs_info, space_info->flags,
1002 						  orig_bytes, flush, "preempt");
1003 			queue_work(system_unbound_wq,
1004 				   &fs_info->async_reclaim_work);
1005 		}
1006 	}
1007 	spin_unlock(&space_info->lock);
1008 	if (!ret || flush == BTRFS_RESERVE_NO_FLUSH)
1009 		return ret;
1010 
1011 	if (flush == BTRFS_RESERVE_FLUSH_ALL)
1012 		return wait_reserve_ticket(fs_info, space_info, &ticket);
1013 
1014 	ret = 0;
1015 	priority_reclaim_metadata_space(fs_info, space_info, &ticket);
1016 	spin_lock(&space_info->lock);
1017 	if (ticket.bytes) {
1018 		if (ticket.bytes < orig_bytes)
1019 			reclaim_bytes = orig_bytes - ticket.bytes;
1020 		list_del_init(&ticket.list);
1021 		ret = -ENOSPC;
1022 	}
1023 	spin_unlock(&space_info->lock);
1024 
1025 	if (reclaim_bytes)
1026 		btrfs_space_info_add_old_bytes(fs_info, space_info,
1027 					       reclaim_bytes);
1028 	ASSERT(list_empty(&ticket.list));
1029 	return ret;
1030 }
1031 
1032 /**
1033  * btrfs_reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
1034  * @root - the root we're allocating for
1035  * @block_rsv - the block_rsv we're allocating for
1036  * @orig_bytes - the number of bytes we want
1037  * @flush - whether or not we can flush to make our reservation
1038  *
1039  * This will reserve orig_bytes number of bytes from the space info associated
1040  * with the block_rsv.  If there is not enough space it will make an attempt to
1041  * flush out space to make room.  It will do this by flushing delalloc if
1042  * possible or committing the transaction.  If @flush is
1043  * BTRFS_RESERVE_NO_FLUSH then no attempts to regain reservations will be
1044  * made and this will fail if there is not enough space already.
1045  */
1046 int btrfs_reserve_metadata_bytes(struct btrfs_root *root,
1047 				 struct btrfs_block_rsv *block_rsv,
1048 				 u64 orig_bytes,
1049 				 enum btrfs_reserve_flush_enum flush)
1050 {
1051 	struct btrfs_fs_info *fs_info = root->fs_info;
1052 	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
1053 	int ret;
1054 	bool system_chunk = (root == fs_info->chunk_root);
1055 
1056 	ret = __reserve_metadata_bytes(fs_info, block_rsv->space_info,
1057 				       orig_bytes, flush, system_chunk);
1058 	if (ret == -ENOSPC &&
1059 	    unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) {
1060 		if (block_rsv != global_rsv &&
1061 		    !btrfs_block_rsv_use_bytes(global_rsv, orig_bytes))
1062 			ret = 0;
1063 	}
1064 	if (ret == -ENOSPC) {
1065 		trace_btrfs_space_reservation(fs_info, "space_info:enospc",
1066 					      block_rsv->space_info->flags,
1067 					      orig_bytes, 1);
1068 
1069 		if (btrfs_test_opt(fs_info, ENOSPC_DEBUG))
1070 			btrfs_dump_space_info(fs_info, block_rsv->space_info,
1071 					      orig_bytes, 0);
1072 	}
1073 	return ret;
1074 }
1075