xref: /openbmc/linux/fs/btrfs/space-info.c (revision 0d9764f6)
1 // SPDX-License-Identifier: GPL-2.0
2 
3 #include "ctree.h"
4 #include "space-info.h"
5 #include "sysfs.h"
6 #include "volumes.h"
7 #include "free-space-cache.h"
8 #include "ordered-data.h"
9 #include "transaction.h"
10 #include "math.h"
11 
12 u64 btrfs_space_info_used(struct btrfs_space_info *s_info,
13 			  bool may_use_included)
14 {
15 	ASSERT(s_info);
16 	return s_info->bytes_used + s_info->bytes_reserved +
17 		s_info->bytes_pinned + s_info->bytes_readonly +
18 		(may_use_included ? s_info->bytes_may_use : 0);
19 }
20 
/*
 * After adding space to the filesystem (e.g. device add/resize) we need to
 * clear the full flags on all the space infos so the allocator will retry
 * chunk allocation in them.
 */
void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
{
	struct list_head *head = &info->space_info;
	struct btrfs_space_info *found;

	/* RCU is enough here: we only write one flag per entry while walking. */
	rcu_read_lock();
	list_for_each_entry_rcu(found, head, list)
		found->full = 0;
	rcu_read_unlock();
}
35 
36 static const char *alloc_name(u64 flags)
37 {
38 	switch (flags) {
39 	case BTRFS_BLOCK_GROUP_METADATA|BTRFS_BLOCK_GROUP_DATA:
40 		return "mixed";
41 	case BTRFS_BLOCK_GROUP_METADATA:
42 		return "metadata";
43 	case BTRFS_BLOCK_GROUP_DATA:
44 		return "data";
45 	case BTRFS_BLOCK_GROUP_SYSTEM:
46 		return "system";
47 	default:
48 		WARN_ON(1);
49 		return "invalid-combination";
50 	};
51 }
52 
/*
 * Allocate and register a btrfs_space_info for the block group type bits in
 * @flags: initialize its lists and locks, create the sysfs kobject (named by
 * alloc_name()) and add it to info->space_info.
 *
 * Returns 0 on success or a negative errno; on failure nothing is left
 * registered.
 */
static int create_space_info(struct btrfs_fs_info *info, u64 flags)
{

	struct btrfs_space_info *space_info;
	int i;
	int ret;

	space_info = kzalloc(sizeof(*space_info), GFP_NOFS);
	if (!space_info)
		return -ENOMEM;

	ret = percpu_counter_init(&space_info->total_bytes_pinned, 0,
				 GFP_KERNEL);
	if (ret) {
		kfree(space_info);
		return ret;
	}

	/* One block group list per RAID profile. */
	for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
		INIT_LIST_HEAD(&space_info->block_groups[i]);
	init_rwsem(&space_info->groups_sem);
	spin_lock_init(&space_info->lock);
	space_info->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK;
	space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
	init_waitqueue_head(&space_info->wait);
	INIT_LIST_HEAD(&space_info->ro_bgs);
	INIT_LIST_HEAD(&space_info->tickets);
	INIT_LIST_HEAD(&space_info->priority_tickets);

	ret = kobject_init_and_add(&space_info->kobj, &space_info_ktype,
				    info->space_info_kobj, "%s",
				    alloc_name(space_info->flags));
	if (ret) {
		/*
		 * Once kobject_init_and_add() has run, the ref must be dropped
		 * with kobject_put(); presumably the ktype release frees
		 * space_info (standard kobject pattern) - do not kfree() here.
		 */
		kobject_put(&space_info->kobj);
		return ret;
	}

	/* Publish to the RCU-walked list; readers may see it immediately. */
	list_add_rcu(&space_info->list, &info->space_info);
	if (flags & BTRFS_BLOCK_GROUP_DATA)
		info->data_sinfo = space_info;

	return ret;
}
96 
97 int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
98 {
99 	struct btrfs_super_block *disk_super;
100 	u64 features;
101 	u64 flags;
102 	int mixed = 0;
103 	int ret;
104 
105 	disk_super = fs_info->super_copy;
106 	if (!btrfs_super_root(disk_super))
107 		return -EINVAL;
108 
109 	features = btrfs_super_incompat_flags(disk_super);
110 	if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
111 		mixed = 1;
112 
113 	flags = BTRFS_BLOCK_GROUP_SYSTEM;
114 	ret = create_space_info(fs_info, flags);
115 	if (ret)
116 		goto out;
117 
118 	if (mixed) {
119 		flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA;
120 		ret = create_space_info(fs_info, flags);
121 	} else {
122 		flags = BTRFS_BLOCK_GROUP_METADATA;
123 		ret = create_space_info(fs_info, flags);
124 		if (ret)
125 			goto out;
126 
127 		flags = BTRFS_BLOCK_GROUP_DATA;
128 		ret = create_space_info(fs_info, flags);
129 	}
130 out:
131 	return ret;
132 }
133 
/*
 * Fold newly added block group space into the matching space_info's
 * counters.  The space_info for @flags must already exist (ASSERTed);
 * *@space_info is set to it on return.
 *
 * @total_bytes:    logical bytes being added
 * @bytes_used:     portion of @total_bytes already allocated to extents
 * @bytes_readonly: portion accounted as read-only
 */
void btrfs_update_space_info(struct btrfs_fs_info *info, u64 flags,
			     u64 total_bytes, u64 bytes_used,
			     u64 bytes_readonly,
			     struct btrfs_space_info **space_info)
{
	struct btrfs_space_info *found;
	int factor;

	/*
	 * Profile factor scales logical bytes to on-disk bytes (e.g. the
	 * disk_* counters below are factor times the logical ones).
	 */
	factor = btrfs_bg_type_to_factor(flags);

	found = btrfs_find_space_info(info, flags);
	ASSERT(found);
	spin_lock(&found->lock);
	found->total_bytes += total_bytes;
	found->disk_total += total_bytes * factor;
	found->bytes_used += bytes_used;
	found->disk_used += bytes_used * factor;
	found->bytes_readonly += bytes_readonly;
	/* New space means this space_info may no longer be full. */
	if (total_bytes > 0)
		found->full = 0;
	/* Hand the newly usable space to any waiting reservation tickets. */
	btrfs_space_info_add_new_bytes(info, found,
				       total_bytes - bytes_used -
				       bytes_readonly);
	spin_unlock(&found->lock);
	*space_info = found;
}
160 
161 struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info,
162 					       u64 flags)
163 {
164 	struct list_head *head = &info->space_info;
165 	struct btrfs_space_info *found;
166 
167 	flags &= BTRFS_BLOCK_GROUP_TYPE_MASK;
168 
169 	rcu_read_lock();
170 	list_for_each_entry_rcu(found, head, list) {
171 		if (found->flags & flags) {
172 			rcu_read_unlock();
173 			return found;
174 		}
175 	}
176 	rcu_read_unlock();
177 	return NULL;
178 }
179 
180 static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global)
181 {
182 	return (global->size << 1);
183 }
184 
/*
 * Decide whether a reservation of @bytes may exceed the space_info's
 * accounted total on the theory that unallocated device space can back it
 * with a future chunk allocation.  Returns 1 if overcommitting is OK, 0
 * otherwise.  DATA (including mixed) space is never overcommitted.
 *
 * @flush:        how hard the caller is willing to flush; a weaker flush
 *                mode is allowed a larger overcommit margin
 * @system_chunk: use the system rather than the metadata alloc profile
 */
int btrfs_can_overcommit(struct btrfs_fs_info *fs_info,
			 struct btrfs_space_info *space_info, u64 bytes,
			 enum btrfs_reserve_flush_enum flush,
			 bool system_chunk)
{
	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
	u64 profile;
	u64 space_size;
	u64 avail;
	u64 used;
	int factor;

	/* Don't overcommit when in mixed mode. */
	if (space_info->flags & BTRFS_BLOCK_GROUP_DATA)
		return 0;

	if (system_chunk)
		profile = btrfs_system_alloc_profile(fs_info);
	else
		profile = btrfs_metadata_alloc_profile(fs_info);

	used = btrfs_space_info_used(space_info, false);

	/*
	 * We only want to allow over committing if we have lots of actual space
	 * free, but if we don't have enough space to handle the global reserve
	 * space then we could end up having a real enospc problem when trying
	 * to allocate a chunk or some other such important allocation.
	 */
	spin_lock(&global_rsv->lock);
	space_size = calc_global_rsv_need_space(global_rsv);
	spin_unlock(&global_rsv->lock);
	if (used + space_size >= space_info->total_bytes)
		return 0;

	used += space_info->bytes_may_use;

	avail = atomic64_read(&fs_info->free_chunk_space);

	/*
	 * If we have dup, raid1 or raid10 then only half of the free
	 * space is actually usable.  For raid56, the space info used
	 * doesn't include the parity drive, so we don't have to
	 * change the math
	 */
	factor = btrfs_bg_type_to_factor(profile);
	avail = div_u64(avail, factor);

	/*
	 * If the caller can flush everything (BTRFS_RESERVE_FLUSH_ALL), be
	 * conservative and only allow overcommitting up to 1/8 of the free
	 * space; weaker flush modes may overcommit up to 1/2 of it.
	 */
	if (flush == BTRFS_RESERVE_FLUSH_ALL)
		avail >>= 3;
	else
		avail >>= 1;

	if (used + bytes < space_info->total_bytes + avail)
		return 1;
	return 0;
}
247 
/*
 * This is for space we already have accounted in space_info->bytes_may_use,
 * so basically when we're returning space from block_rsv's.  Hand @num_bytes
 * to waiting reservation tickets (priority list first), and subtract
 * whatever is left over from bytes_may_use.
 */
void btrfs_space_info_add_old_bytes(struct btrfs_fs_info *fs_info,
				    struct btrfs_space_info *space_info,
				    u64 num_bytes)
{
	struct reserve_ticket *ticket;
	struct list_head *head;
	u64 used;
	enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_NO_FLUSH;
	bool check_overcommit = false;

	spin_lock(&space_info->lock);
	head = &space_info->priority_tickets;

	/*
	 * If we are over our limit then we need to check and see if we can
	 * overcommit, and if we can't then we just need to free up our space
	 * and not satisfy any requests.
	 */
	used = btrfs_space_info_used(space_info, true);
	if (used - num_bytes >= space_info->total_bytes)
		check_overcommit = true;
again:
	while (!list_empty(head) && num_bytes) {
		ticket = list_first_entry(head, struct reserve_ticket,
					  list);
		/*
		 * We use 0 bytes because this space is already reserved, so
		 * adding the ticket space would be a double count.
		 */
		if (check_overcommit &&
		    !btrfs_can_overcommit(fs_info, space_info, 0, flush,
					  false))
			break;
		if (num_bytes >= ticket->bytes) {
			/* Fully satisfied: remove the ticket, wake the waiter. */
			list_del_init(&ticket->list);
			num_bytes -= ticket->bytes;
			ticket->bytes = 0;
			space_info->tickets_id++;
			wake_up(&ticket->wait);
		} else {
			/* Partial fill; the ticket keeps waiting for the rest. */
			ticket->bytes -= num_bytes;
			num_bytes = 0;
		}
	}

	/* Priority tickets drained; give the normal ticket list a pass too. */
	if (num_bytes && head == &space_info->priority_tickets) {
		head = &space_info->tickets;
		flush = BTRFS_RESERVE_FLUSH_ALL;
		goto again;
	}
	/* Whatever the tickets didn't consume comes off bytes_may_use. */
	btrfs_space_info_update_bytes_may_use(fs_info, space_info, -num_bytes);
	trace_btrfs_space_reservation(fs_info, "space_info",
				      space_info->flags, num_bytes, 0);
	spin_unlock(&space_info->lock);
}
307 
/*
 * This is for newly allocated space that isn't accounted in
 * space_info->bytes_may_use yet.  So if we allocate a chunk or unpin an
 * extent we use this helper: each satisfied portion is moved into
 * bytes_may_use on behalf of the ticket it fills.
 *
 * Caller must hold space_info->lock (all callers in this file do).
 */
void btrfs_space_info_add_new_bytes(struct btrfs_fs_info *fs_info,
				    struct btrfs_space_info *space_info,
				    u64 num_bytes)
{
	struct reserve_ticket *ticket;
	struct list_head *head = &space_info->priority_tickets;

again:
	while (!list_empty(head) && num_bytes) {
		ticket = list_first_entry(head, struct reserve_ticket,
					  list);
		if (num_bytes >= ticket->bytes) {
			/* Fully satisfy this ticket and wake its waiter. */
			trace_btrfs_space_reservation(fs_info, "space_info",
						      space_info->flags,
						      ticket->bytes, 1);
			list_del_init(&ticket->list);
			num_bytes -= ticket->bytes;
			btrfs_space_info_update_bytes_may_use(fs_info,
							      space_info,
							      ticket->bytes);
			ticket->bytes = 0;
			space_info->tickets_id++;
			wake_up(&ticket->wait);
		} else {
			/* Only a partial fill; ticket stays queued. */
			trace_btrfs_space_reservation(fs_info, "space_info",
						      space_info->flags,
						      num_bytes, 1);
			btrfs_space_info_update_bytes_may_use(fs_info,
							      space_info,
							      num_bytes);
			ticket->bytes -= num_bytes;
			num_bytes = 0;
		}
	}

	/* After the priority list, service the normal ticket list. */
	if (num_bytes && head == &space_info->priority_tickets) {
		head = &space_info->tickets;
		goto again;
	}
}
353 
/*
 * Log size/reserved of the named block_rsv member of @fs_info under its
 * lock.  A macro so that #rsv_name can be stringified as the log label.
 */
#define DUMP_BLOCK_RSV(fs_info, rsv_name)				\
do {									\
	struct btrfs_block_rsv *__rsv = &(fs_info)->rsv_name;		\
	spin_lock(&__rsv->lock);					\
	btrfs_info(fs_info, #rsv_name ": size %llu reserved %llu",	\
		   __rsv->size, __rsv->reserved);			\
	spin_unlock(&__rsv->lock);					\
} while (0)
362 
/*
 * Dump @info's usage counters and all the global block reserves to the
 * kernel log; when @dump_block_groups is set, also dump every block group
 * in the space_info (per RAID index) and its free space.
 *
 * @bytes is forwarded to btrfs_dump_free_space() for each block group.
 */
void btrfs_dump_space_info(struct btrfs_fs_info *fs_info,
			   struct btrfs_space_info *info, u64 bytes,
			   int dump_block_groups)
{
	struct btrfs_block_group_cache *cache;
	int index = 0;

	spin_lock(&info->lock);
	btrfs_info(fs_info, "space_info %llu has %llu free, is %sfull",
		   info->flags,
		   info->total_bytes - btrfs_space_info_used(info, true),
		   info->full ? "" : "not ");
	btrfs_info(fs_info,
		"space_info total=%llu, used=%llu, pinned=%llu, reserved=%llu, may_use=%llu, readonly=%llu",
		info->total_bytes, info->bytes_used, info->bytes_pinned,
		info->bytes_reserved, info->bytes_may_use,
		info->bytes_readonly);
	spin_unlock(&info->lock);

	DUMP_BLOCK_RSV(fs_info, global_block_rsv);
	DUMP_BLOCK_RSV(fs_info, trans_block_rsv);
	DUMP_BLOCK_RSV(fs_info, chunk_block_rsv);
	DUMP_BLOCK_RSV(fs_info, delayed_block_rsv);
	DUMP_BLOCK_RSV(fs_info, delayed_refs_rsv);

	if (!dump_block_groups)
		return;

	down_read(&info->groups_sem);
again:
	/* Walk every RAID-type list of block groups in this space_info. */
	list_for_each_entry(cache, &info->block_groups[index], list) {
		spin_lock(&cache->lock);
		btrfs_info(fs_info,
			"block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %s",
			cache->key.objectid, cache->key.offset,
			btrfs_block_group_used(&cache->item), cache->pinned,
			cache->reserved, cache->ro ? "[readonly]" : "");
		btrfs_dump_free_space(cache, bytes);
		spin_unlock(&cache->lock);
	}
	if (++index < BTRFS_NR_RAID_TYPES)
		goto again;
	up_read(&info->groups_sem);
}
407 
/*
 * Kick writeback of up to @nr_pages of delalloc.  Prefer the generic
 * writeback path (needs ->s_umount); if we can't get the lock, fall back
 * to btrfs' own delalloc starting and (when not inside a transaction)
 * waiting on up to @nr_items ordered extents.
 */
static void btrfs_writeback_inodes_sb_nr(struct btrfs_fs_info *fs_info,
					 unsigned long nr_pages, int nr_items)
{
	struct super_block *sb = fs_info->sb;

	if (down_read_trylock(&sb->s_umount)) {
		writeback_inodes_sb_nr(sb, nr_pages, WB_REASON_FS_FREE_SPACE);
		up_read(&sb->s_umount);
	} else {
		/*
		 * We needn't worry the filesystem going from r/w to r/o though
		 * we don't acquire ->s_umount mutex, because the filesystem
		 * should guarantee the delalloc inodes list be empty after
		 * the filesystem is readonly(all dirty pages are written to
		 * the disk).
		 */
		btrfs_start_delalloc_roots(fs_info, nr_items);
		if (!current->journal_info)
			btrfs_wait_ordered_roots(fs_info, nr_items, 0, (u64)-1);
	}
}
429 
430 static inline u64 calc_reclaim_items_nr(struct btrfs_fs_info *fs_info,
431 					u64 to_reclaim)
432 {
433 	u64 bytes;
434 	u64 nr;
435 
436 	bytes = btrfs_calc_trans_metadata_size(fs_info, 1);
437 	nr = div64_u64(to_reclaim, bytes);
438 	if (!nr)
439 		nr = 1;
440 	return nr;
441 }
442 
/* Rough bytes of delalloc assumed to back one reclaimable metadata item. */
#define EXTENT_SIZE_PER_ITEM	SZ_256K

/*
 * shrink metadata reservation for delalloc
 *
 * Write back and wait on dirty/ordered data so the metadata reserved for it
 * is released.  @to_reclaim is rescaled to whole items; @orig is currently
 * unused; @wait_ordered forces waiting on ordered extents instead of another
 * flush pass.  Gives up after 3 loops or once no tickets are waiting on the
 * metadata space_info.
 */
static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim,
			    u64 orig, bool wait_ordered)
{
	struct btrfs_space_info *space_info;
	struct btrfs_trans_handle *trans;
	u64 delalloc_bytes;
	u64 dio_bytes;
	u64 async_pages;
	u64 items;
	long time_left;
	unsigned long nr_pages;
	int loops;

	/* Calc the number of the pages we need flush for space reservation */
	items = calc_reclaim_items_nr(fs_info, to_reclaim);
	to_reclaim = items * EXTENT_SIZE_PER_ITEM;

	/* Non-NULL journal_info means we are inside a transaction. */
	trans = (struct btrfs_trans_handle *)current->journal_info;
	space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);

	delalloc_bytes = percpu_counter_sum_positive(
						&fs_info->delalloc_bytes);
	dio_bytes = percpu_counter_sum_positive(&fs_info->dio_bytes);
	if (delalloc_bytes == 0 && dio_bytes == 0) {
		if (trans)
			return;
		if (wait_ordered)
			btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1);
		return;
	}

	/*
	 * If we are doing more ordered than delalloc we need to just wait on
	 * ordered extents, otherwise we'll waste time trying to flush delalloc
	 * that likely won't give us the space back we need.
	 */
	if (dio_bytes > delalloc_bytes)
		wait_ordered = true;

	loops = 0;
	while ((delalloc_bytes || dio_bytes) && loops < 3) {
		nr_pages = min(delalloc_bytes, to_reclaim) >> PAGE_SHIFT;

		/*
		 * Triggers inode writeback for up to nr_pages. This will invoke
		 * ->writepages callback and trigger delalloc filling
		 *  (btrfs_run_delalloc_range()).
		 */
		btrfs_writeback_inodes_sb_nr(fs_info, nr_pages, items);

		/*
		 * We need to wait for the compressed pages to start before
		 * we continue.
		 */
		async_pages = atomic_read(&fs_info->async_delalloc_pages);
		if (!async_pages)
			goto skip_async;

		/*
		 * Calculate how many compressed pages we want to be written
		 * before we continue. I.e if there are more async pages than we
		 * require wait_event will wait until nr_pages are written.
		 */
		if (async_pages <= nr_pages)
			async_pages = 0;
		else
			async_pages -= nr_pages;

		wait_event(fs_info->async_submit_wait,
			   atomic_read(&fs_info->async_delalloc_pages) <=
			   (int)async_pages);
skip_async:
		/* Stop early if nobody is waiting for space anymore. */
		spin_lock(&space_info->lock);
		if (list_empty(&space_info->tickets) &&
		    list_empty(&space_info->priority_tickets)) {
			spin_unlock(&space_info->lock);
			break;
		}
		spin_unlock(&space_info->lock);

		loops++;
		if (wait_ordered && !trans) {
			btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1);
		} else {
			/* Back off briefly; bail if the task was killed. */
			time_left = schedule_timeout_killable(1);
			if (time_left)
				break;
		}
		delalloc_bytes = percpu_counter_sum_positive(
						&fs_info->delalloc_bytes);
		dio_bytes = percpu_counter_sum_positive(&fs_info->dio_bytes);
	}
}
541 
/*
 * may_commit_transaction - commit the transaction if it would free space
 * @fs_info:    the filesystem
 * @space_info: the space_info we are trying to satisfy a reservation in
 *
 * Check whether committing the transaction will actually free enough pinned
 * or soon-to-be-freed space to satisfy the first waiting ticket, and commit
 * it if so.  Returns the commit result on commit, 0 if no ticket is
 * waiting, -EAGAIN if we are already inside a transaction, or -ENOSPC when
 * a commit would not help.
 *
 * (The previous comment documented a stale signature: @root/@bytes/@force
 * no longer exist.)
 */
static int may_commit_transaction(struct btrfs_fs_info *fs_info,
				  struct btrfs_space_info *space_info)
{
	struct reserve_ticket *ticket = NULL;
	struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_block_rsv;
	struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
	struct btrfs_trans_handle *trans;
	u64 bytes_needed;
	u64 reclaim_bytes = 0;

	/* Can't commit from inside a running transaction. */
	trans = (struct btrfs_trans_handle *)current->journal_info;
	if (trans)
		return -EAGAIN;

	/* The first queued ticket (priority first) sets the target. */
	spin_lock(&space_info->lock);
	if (!list_empty(&space_info->priority_tickets))
		ticket = list_first_entry(&space_info->priority_tickets,
					  struct reserve_ticket, list);
	else if (!list_empty(&space_info->tickets))
		ticket = list_first_entry(&space_info->tickets,
					  struct reserve_ticket, list);
	bytes_needed = (ticket) ? ticket->bytes : 0;
	spin_unlock(&space_info->lock);

	if (!bytes_needed)
		return 0;

	trans = btrfs_join_transaction(fs_info->extent_root);
	if (IS_ERR(trans))
		return PTR_ERR(trans);

	/*
	 * See if there is enough pinned space to make this reservation, or if
	 * we have block groups that are going to be freed, allowing us to
	 * possibly do a chunk allocation the next loop through.
	 */
	if (test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags) ||
	    __percpu_counter_compare(&space_info->total_bytes_pinned,
				     bytes_needed,
				     BTRFS_TOTAL_BYTES_PINNED_BATCH) >= 0)
		goto commit;

	/*
	 * See if there is some space in the delayed insertion reservation for
	 * this reservation.
	 */
	if (space_info != delayed_rsv->space_info)
		goto enospc;

	spin_lock(&delayed_rsv->lock);
	reclaim_bytes += delayed_rsv->reserved;
	spin_unlock(&delayed_rsv->lock);

	spin_lock(&delayed_refs_rsv->lock);
	reclaim_bytes += delayed_refs_rsv->reserved;
	spin_unlock(&delayed_refs_rsv->lock);
	if (reclaim_bytes >= bytes_needed)
		goto commit;
	bytes_needed -= reclaim_bytes;

	/* Pinned space must cover whatever the rsvs can't. */
	if (__percpu_counter_compare(&space_info->total_bytes_pinned,
				   bytes_needed,
				   BTRFS_TOTAL_BYTES_PINNED_BATCH) < 0)
		goto enospc;

commit:
	return btrfs_commit_transaction(trans);
enospc:
	btrfs_end_transaction(trans);
	return -ENOSPC;
}
623 
624 /*
625  * Try to flush some data based on policy set by @state. This is only advisory
626  * and may fail for various reasons. The caller is supposed to examine the
627  * state of @space_info to detect the outcome.
628  */
629 static void flush_space(struct btrfs_fs_info *fs_info,
630 		       struct btrfs_space_info *space_info, u64 num_bytes,
631 		       int state)
632 {
633 	struct btrfs_root *root = fs_info->extent_root;
634 	struct btrfs_trans_handle *trans;
635 	int nr;
636 	int ret = 0;
637 
638 	switch (state) {
639 	case FLUSH_DELAYED_ITEMS_NR:
640 	case FLUSH_DELAYED_ITEMS:
641 		if (state == FLUSH_DELAYED_ITEMS_NR)
642 			nr = calc_reclaim_items_nr(fs_info, num_bytes) * 2;
643 		else
644 			nr = -1;
645 
646 		trans = btrfs_join_transaction(root);
647 		if (IS_ERR(trans)) {
648 			ret = PTR_ERR(trans);
649 			break;
650 		}
651 		ret = btrfs_run_delayed_items_nr(trans, nr);
652 		btrfs_end_transaction(trans);
653 		break;
654 	case FLUSH_DELALLOC:
655 	case FLUSH_DELALLOC_WAIT:
656 		shrink_delalloc(fs_info, num_bytes * 2, num_bytes,
657 				state == FLUSH_DELALLOC_WAIT);
658 		break;
659 	case FLUSH_DELAYED_REFS_NR:
660 	case FLUSH_DELAYED_REFS:
661 		trans = btrfs_join_transaction(root);
662 		if (IS_ERR(trans)) {
663 			ret = PTR_ERR(trans);
664 			break;
665 		}
666 		if (state == FLUSH_DELAYED_REFS_NR)
667 			nr = calc_reclaim_items_nr(fs_info, num_bytes);
668 		else
669 			nr = 0;
670 		btrfs_run_delayed_refs(trans, nr);
671 		btrfs_end_transaction(trans);
672 		break;
673 	case ALLOC_CHUNK:
674 	case ALLOC_CHUNK_FORCE:
675 		trans = btrfs_join_transaction(root);
676 		if (IS_ERR(trans)) {
677 			ret = PTR_ERR(trans);
678 			break;
679 		}
680 		ret = btrfs_chunk_alloc(trans,
681 				btrfs_metadata_alloc_profile(fs_info),
682 				(state == ALLOC_CHUNK) ? CHUNK_ALLOC_NO_FORCE :
683 					CHUNK_ALLOC_FORCE);
684 		btrfs_end_transaction(trans);
685 		if (ret > 0 || ret == -ENOSPC)
686 			ret = 0;
687 		break;
688 	case COMMIT_TRANS:
689 		/*
690 		 * If we have pending delayed iputs then we could free up a
691 		 * bunch of pinned space, so make sure we run the iputs before
692 		 * we do our pinned bytes check below.
693 		 */
694 		btrfs_run_delayed_iputs(fs_info);
695 		btrfs_wait_on_delayed_iputs(fs_info);
696 
697 		ret = may_commit_transaction(fs_info, space_info);
698 		break;
699 	default:
700 		ret = -ENOSPC;
701 		break;
702 	}
703 
704 	trace_btrfs_flush_space(fs_info, space_info->flags, num_bytes, state,
705 				ret);
706 	return;
707 }
708 
/*
 * Work out how many bytes the metadata flushers should try to reclaim.
 * If reservation tickets are queued, their total is the answer.  Otherwise
 * estimate how far usage is above ~90-95% of total (0 if we could still
 * overcommit a small reservation), capped by what is actually reclaimable
 * (bytes_may_use + bytes_reserved).
 */
static inline u64
btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info,
				 struct btrfs_space_info *space_info,
				 bool system_chunk)
{
	struct reserve_ticket *ticket;
	u64 used;
	u64 expected;
	u64 to_reclaim = 0;

	list_for_each_entry(ticket, &space_info->tickets, list)
		to_reclaim += ticket->bytes;
	list_for_each_entry(ticket, &space_info->priority_tickets, list)
		to_reclaim += ticket->bytes;
	if (to_reclaim)
		return to_reclaim;

	/* No tickets: probe with a modest, CPU-count scaled reservation. */
	to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M);
	if (btrfs_can_overcommit(fs_info, space_info, to_reclaim,
				 BTRFS_RESERVE_FLUSH_ALL, system_chunk))
		return 0;

	used = btrfs_space_info_used(space_info, true);

	/* Pick the target fill level based on how tight space already is. */
	if (btrfs_can_overcommit(fs_info, space_info, SZ_1M,
				 BTRFS_RESERVE_FLUSH_ALL, system_chunk))
		expected = div_factor_fine(space_info->total_bytes, 95);
	else
		expected = div_factor_fine(space_info->total_bytes, 90);

	if (used > expected)
		to_reclaim = used - expected;
	else
		to_reclaim = 0;
	/* Can't reclaim more than is actually reserved or may-used. */
	to_reclaim = min(to_reclaim, space_info->bytes_may_use +
				     space_info->bytes_reserved);
	return to_reclaim;
}
747 
/*
 * Decide whether the background reclaim worker should be kicked: usage must
 * be at or above 98% of total, there must be something reclaimable, and the
 * filesystem must not be closing or remounting.  Returns nonzero to kick.
 */
static inline int need_do_async_reclaim(struct btrfs_fs_info *fs_info,
					struct btrfs_space_info *space_info,
					u64 used, bool system_chunk)
{
	u64 thresh = div_factor_fine(space_info->total_bytes, 98);

	/* If we're just plain full then async reclaim just slows us down. */
	if ((space_info->bytes_used + space_info->bytes_reserved) >= thresh)
		return 0;

	if (!btrfs_calc_reclaim_metadata_size(fs_info, space_info,
					      system_chunk))
		return 0;

	return (used >= thresh && !btrfs_fs_closing(fs_info) &&
		!test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state));
}
765 
/*
 * Fail queued tickets with -ENOSPC and wake their waiters.  Stops and
 * returns true at the first ticket that had made partial progress
 * (bytes != orig_bytes) - the caller uses that to restart the flush state
 * machine; returns false once the list is empty.
 */
static bool wake_all_tickets(struct list_head *head)
{
	struct reserve_ticket *ticket;

	while (!list_empty(head)) {
		ticket = list_first_entry(head, struct reserve_ticket, list);
		list_del_init(&ticket->list);
		ticket->error = -ENOSPC;
		wake_up(&ticket->wait);
		if (ticket->bytes != ticket->orig_bytes)
			return true;
	}
	return false;
}
780 
/*
 * This is for normal flushers, we can wait all goddamned day if we want to.  We
 * will loop and continuously try to flush as long as we are making progress.
 * We count progress as clearing off tickets each time we have to loop.
 *
 * Work item body behind fs_info->async_reclaim_work; walks the flush states
 * from FLUSH_DELAYED_ITEMS_NR up through COMMIT_TRANS, restarting whenever a
 * ticket was satisfied (tickets_id changed).
 */
static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
{
	struct btrfs_fs_info *fs_info;
	struct btrfs_space_info *space_info;
	u64 to_reclaim;
	int flush_state;
	int commit_cycles = 0;
	u64 last_tickets_id;

	fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work);
	space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);

	spin_lock(&space_info->lock);
	to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info,
						      false);
	if (!to_reclaim) {
		/* Nothing to do; mark flushing finished. */
		space_info->flush = 0;
		spin_unlock(&space_info->lock);
		return;
	}
	last_tickets_id = space_info->tickets_id;
	spin_unlock(&space_info->lock);

	flush_state = FLUSH_DELAYED_ITEMS_NR;
	do {
		flush_space(fs_info, space_info, to_reclaim, flush_state);
		spin_lock(&space_info->lock);
		if (list_empty(&space_info->tickets)) {
			space_info->flush = 0;
			spin_unlock(&space_info->lock);
			return;
		}
		to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info,
							      space_info,
							      false);
		/* tickets_id unchanged means no ticket was satisfied: escalate. */
		if (last_tickets_id == space_info->tickets_id) {
			flush_state++;
		} else {
			last_tickets_id = space_info->tickets_id;
			flush_state = FLUSH_DELAYED_ITEMS_NR;
			if (commit_cycles)
				commit_cycles--;
		}

		/*
		 * We don't want to force a chunk allocation until we've tried
		 * pretty hard to reclaim space.  Think of the case where we
		 * freed up a bunch of space and so have a lot of pinned space
		 * to reclaim.  We would rather use that than possibly create a
		 * underutilized metadata chunk.  So if this is our first run
		 * through the flushing state machine skip ALLOC_CHUNK_FORCE and
		 * commit the transaction.  If nothing has changed the next go
		 * around then we can force a chunk allocation.
		 */
		if (flush_state == ALLOC_CHUNK_FORCE && !commit_cycles)
			flush_state++;

		if (flush_state > COMMIT_TRANS) {
			commit_cycles++;
			if (commit_cycles > 2) {
				/* Give up, unless some ticket saw partial progress. */
				if (wake_all_tickets(&space_info->tickets)) {
					flush_state = FLUSH_DELAYED_ITEMS_NR;
					commit_cycles--;
				} else {
					space_info->flush = 0;
				}
			} else {
				flush_state = FLUSH_DELAYED_ITEMS_NR;
			}
		}
		spin_unlock(&space_info->lock);
	} while (flush_state <= COMMIT_TRANS);
}
859 
/*
 * Initialize @work to run the async metadata reclaim state machine;
 * queued later from the reservation paths when flushing is needed.
 */
void btrfs_init_async_reclaim_work(struct work_struct *work)
{
	INIT_WORK(work, btrfs_async_reclaim_metadata_space);
}
864 
/*
 * Flush states used for priority (non-FLUSH_ALL) reservations: only the
 * cheaper operations, no transaction commit or full delalloc flush.
 */
static const enum btrfs_flush_state priority_flush_states[] = {
	FLUSH_DELAYED_ITEMS_NR,
	FLUSH_DELAYED_ITEMS,
	ALLOC_CHUNK,
};
870 
/*
 * Synchronous flushing for priority reservations: walk the (short)
 * priority_flush_states list once, stopping early as soon as @ticket is
 * fully satisfied (bytes == 0) or there is nothing to reclaim.
 */
static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
					    struct btrfs_space_info *space_info,
					    struct reserve_ticket *ticket)
{
	u64 to_reclaim;
	int flush_state;

	spin_lock(&space_info->lock);
	to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info,
						      false);
	if (!to_reclaim) {
		spin_unlock(&space_info->lock);
		return;
	}
	spin_unlock(&space_info->lock);

	flush_state = 0;
	do {
		flush_space(fs_info, space_info, to_reclaim,
			    priority_flush_states[flush_state]);
		flush_state++;
		spin_lock(&space_info->lock);
		if (ticket->bytes == 0) {
			/* Fully satisfied. */
			spin_unlock(&space_info->lock);
			return;
		}
		spin_unlock(&space_info->lock);
	} while (flush_state < ARRAY_SIZE(priority_flush_states));
}
900 
/*
 * Sleep (killably) until @ticket is satisfied (bytes == 0) or an error is
 * set on it, then tear it down.  Returns 0 on success, the ticket's error,
 * or -EINTR if the task was killed.  Any partially granted space is handed
 * back to the space_info via btrfs_space_info_add_old_bytes().
 */
static int wait_reserve_ticket(struct btrfs_fs_info *fs_info,
			       struct btrfs_space_info *space_info,
			       struct reserve_ticket *ticket)

{
	DEFINE_WAIT(wait);
	u64 reclaim_bytes = 0;
	int ret = 0;

	spin_lock(&space_info->lock);
	while (ticket->bytes > 0 && ticket->error == 0) {
		ret = prepare_to_wait_event(&ticket->wait, &wait, TASK_KILLABLE);
		if (ret) {
			/* Fatal signal pending: stop waiting. */
			ret = -EINTR;
			break;
		}
		/* Drop the lock over the actual sleep; wakers take it too. */
		spin_unlock(&space_info->lock);

		schedule();

		finish_wait(&ticket->wait, &wait);
		spin_lock(&space_info->lock);
	}
	if (!ret)
		ret = ticket->error;
	if (!list_empty(&ticket->list))
		list_del_init(&ticket->list);
	/* Space granted to a failed ticket must be returned. */
	if (ticket->bytes && ticket->bytes < ticket->orig_bytes)
		reclaim_bytes = ticket->orig_bytes - ticket->bytes;
	spin_unlock(&space_info->lock);

	if (reclaim_bytes)
		btrfs_space_info_add_old_bytes(fs_info, space_info,
					       reclaim_bytes);
	return ret;
}
937 
938 /**
939  * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
940  * @root - the root we're allocating for
941  * @space_info - the space info we want to allocate from
942  * @orig_bytes - the number of bytes we want
943  * @flush - whether or not we can flush to make our reservation
944  *
945  * This will reserve orig_bytes number of bytes from the space info associated
946  * with the block_rsv.  If there is not enough space it will make an attempt to
947  * flush out space to make room.  It will do this by flushing delalloc if
948  * possible or committing the transaction.  If flush is 0 then no attempts to
949  * regain reservations will be made and this will fail if there is not enough
950  * space already.
951  */
static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
				    struct btrfs_space_info *space_info,
				    u64 orig_bytes,
				    enum btrfs_reserve_flush_enum flush,
				    bool system_chunk)
{
	/*
	 * The ticket lives on our stack, so it is only valid for the
	 * duration of this call; it must be off all lists before return
	 * (see the ASSERT at the bottom).
	 */
	struct reserve_ticket ticket;
	u64 used;
	u64 reclaim_bytes = 0;
	int ret = 0;

	ASSERT(orig_bytes);
	/*
	 * FLUSH_ALL may end up committing a transaction; callers already
	 * inside one (current->journal_info set) must not use it —
	 * presumably to avoid deadlocking on the commit.  TODO confirm.
	 */
	ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_ALL);

	spin_lock(&space_info->lock);
	/* Assume failure; the two success paths below reset this to 0. */
	ret = -ENOSPC;
	used = btrfs_space_info_used(space_info, true);

	/*
	 * If we have enough space then hooray, make our reservation and carry
	 * on.  If not see if we can overcommit, and if we can, hooray carry on.
	 * If not things get more complicated.
	 */
	if (used + orig_bytes <= space_info->total_bytes) {
		btrfs_space_info_update_bytes_may_use(fs_info, space_info,
						      orig_bytes);
		trace_btrfs_space_reservation(fs_info, "space_info",
					      space_info->flags, orig_bytes, 1);
		ret = 0;
	} else if (btrfs_can_overcommit(fs_info, space_info, orig_bytes, flush,
					system_chunk)) {
		btrfs_space_info_update_bytes_may_use(fs_info, space_info,
						      orig_bytes);
		trace_btrfs_space_reservation(fs_info, "space_info",
					      space_info->flags, orig_bytes, 1);
		ret = 0;
	}

	/*
	 * If we couldn't make a reservation then setup our reservation ticket
	 * and kick the async worker if it's not already running.
	 *
	 * If we are a priority flusher then we just need to add our ticket to
	 * the list and we will do our own flushing further down.
	 */
	if (ret && flush != BTRFS_RESERVE_NO_FLUSH) {
		ticket.orig_bytes = orig_bytes;
		/* ticket.bytes tracks what is still unsatisfied; the
		 * flushers decrement it as space is found. */
		ticket.bytes = orig_bytes;
		ticket.error = 0;
		init_waitqueue_head(&ticket.wait);
		if (flush == BTRFS_RESERVE_FLUSH_ALL) {
			list_add_tail(&ticket.list, &space_info->tickets);
			/* Only one flusher per space_info; 'flush' flag
			 * gates the queue_work so we don't re-kick it. */
			if (!space_info->flush) {
				space_info->flush = 1;
				trace_btrfs_trigger_flush(fs_info,
							  space_info->flags,
							  orig_bytes, flush,
							  "enospc");
				queue_work(system_unbound_wq,
					   &fs_info->async_reclaim_work);
			}
		} else {
			list_add_tail(&ticket.list,
				      &space_info->priority_tickets);
		}
	} else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
		/*
		 * Reservation succeeded, but if metadata is getting tight,
		 * preemptively kick background reclaim so future callers
		 * don't stall.
		 */
		used += orig_bytes;
		/*
		 * We will do the space reservation dance during log replay,
		 * which means we won't have fs_info->fs_root set, so don't do
		 * the async reclaim as we will panic.
		 */
		if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags) &&
		    need_do_async_reclaim(fs_info, space_info,
					  used, system_chunk) &&
		    !work_busy(&fs_info->async_reclaim_work)) {
			trace_btrfs_trigger_flush(fs_info, space_info->flags,
						  orig_bytes, flush, "preempt");
			queue_work(system_unbound_wq,
				   &fs_info->async_reclaim_work);
		}
	}
	spin_unlock(&space_info->lock);
	/* Success, or caller forbade flushing: we're done either way. */
	if (!ret || flush == BTRFS_RESERVE_NO_FLUSH)
		return ret;

	/* Full flush: sleep until the async flusher satisfies (or fails)
	 * our ticket.  The wait path removes the ticket from the list. */
	if (flush == BTRFS_RESERVE_FLUSH_ALL)
		return wait_reserve_ticket(fs_info, space_info, &ticket);

	/* Priority/limited flush: do our own synchronous reclaim. */
	ret = 0;
	priority_reclaim_metadata_space(fs_info, space_info, &ticket);
	spin_lock(&space_info->lock);
	if (ticket.bytes) {
		/*
		 * Still unsatisfied.  Anything the flushers did hand us
		 * (orig_bytes - ticket.bytes) must be given back to the
		 * space_info below, since we're failing the reservation.
		 */
		if (ticket.bytes < orig_bytes)
			reclaim_bytes = orig_bytes - ticket.bytes;
		list_del_init(&ticket.list);
		ret = -ENOSPC;
	}
	spin_unlock(&space_info->lock);

	if (reclaim_bytes)
		btrfs_space_info_add_old_bytes(fs_info, space_info,
					       reclaim_bytes);
	/* The stack ticket must not be reachable once we return. */
	ASSERT(list_empty(&ticket.list));
	return ret;
}
1058 
1059 /**
1060  * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
1061  * @root - the root we're allocating for
1062  * @block_rsv - the block_rsv we're allocating for
1063  * @orig_bytes - the number of bytes we want
1064  * @flush - whether or not we can flush to make our reservation
1065  *
1066  * This will reserve orig_bytes number of bytes from the space info associated
1067  * with the block_rsv.  If there is not enough space it will make an attempt to
1068  * flush out space to make room.  It will do this by flushing delalloc if
1069  * possible or committing the transaction.  If flush is 0 then no attempts to
1070  * regain reservations will be made and this will fail if there is not enough
1071  * space already.
1072  */
1073 int btrfs_reserve_metadata_bytes(struct btrfs_root *root,
1074 				 struct btrfs_block_rsv *block_rsv,
1075 				 u64 orig_bytes,
1076 				 enum btrfs_reserve_flush_enum flush)
1077 {
1078 	struct btrfs_fs_info *fs_info = root->fs_info;
1079 	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
1080 	int ret;
1081 	bool system_chunk = (root == fs_info->chunk_root);
1082 
1083 	ret = __reserve_metadata_bytes(fs_info, block_rsv->space_info,
1084 				       orig_bytes, flush, system_chunk);
1085 	if (ret == -ENOSPC &&
1086 	    unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) {
1087 		if (block_rsv != global_rsv &&
1088 		    !btrfs_block_rsv_use_bytes(global_rsv, orig_bytes))
1089 			ret = 0;
1090 	}
1091 	if (ret == -ENOSPC) {
1092 		trace_btrfs_space_reservation(fs_info, "space_info:enospc",
1093 					      block_rsv->space_info->flags,
1094 					      orig_bytes, 1);
1095 
1096 		if (btrfs_test_opt(fs_info, ENOSPC_DEBUG))
1097 			btrfs_dump_space_info(fs_info, block_rsv->space_info,
1098 					      orig_bytes, 0);
1099 	}
1100 	return ret;
1101 }
1102