xref: /openbmc/linux/fs/btrfs/space-info.c (revision aac0023c)
1 // SPDX-License-Identifier: GPL-2.0
2 
3 #include "ctree.h"
4 #include "space-info.h"
5 #include "sysfs.h"
6 #include "volumes.h"
7 #include "free-space-cache.h"
8 #include "ordered-data.h"
9 #include "transaction.h"
10 #include "math.h"
11 #include "block-group.h"
12 
13 u64 btrfs_space_info_used(struct btrfs_space_info *s_info,
14 			  bool may_use_included)
15 {
16 	ASSERT(s_info);
17 	return s_info->bytes_used + s_info->bytes_reserved +
18 		s_info->bytes_pinned + s_info->bytes_readonly +
19 		(may_use_included ? s_info->bytes_may_use : 0);
20 }
21 
22 /*
23  * after adding space to the filesystem, we need to clear the full flags
24  * on all the space infos.
25  */
26 void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
27 {
28 	struct list_head *head = &info->space_info;
29 	struct btrfs_space_info *found;
30 
31 	rcu_read_lock();
32 	list_for_each_entry_rcu(found, head, list)
33 		found->full = 0;
34 	rcu_read_unlock();
35 }
36 
37 static const char *alloc_name(u64 flags)
38 {
39 	switch (flags) {
40 	case BTRFS_BLOCK_GROUP_METADATA|BTRFS_BLOCK_GROUP_DATA:
41 		return "mixed";
42 	case BTRFS_BLOCK_GROUP_METADATA:
43 		return "metadata";
44 	case BTRFS_BLOCK_GROUP_DATA:
45 		return "data";
46 	case BTRFS_BLOCK_GROUP_SYSTEM:
47 		return "system";
48 	default:
49 		WARN_ON(1);
50 		return "invalid-combination";
51 	};
52 }
53 
/*
 * Allocate and initialize a space_info for the block group type bits in
 * @flags, publish it on info->space_info and create its sysfs directory.
 *
 * Returns 0 on success or a negative errno on allocation/sysfs failure.
 */
static int create_space_info(struct btrfs_fs_info *info, u64 flags)
{

	struct btrfs_space_info *space_info;
	int i;
	int ret;

	space_info = kzalloc(sizeof(*space_info), GFP_NOFS);
	if (!space_info)
		return -ENOMEM;

	ret = percpu_counter_init(&space_info->total_bytes_pinned, 0,
				 GFP_KERNEL);
	if (ret) {
		kfree(space_info);
		return ret;
	}

	for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
		INIT_LIST_HEAD(&space_info->block_groups[i]);
	init_rwsem(&space_info->groups_sem);
	spin_lock_init(&space_info->lock);
	/* Only the type bits are meaningful; profile bits are masked off. */
	space_info->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK;
	space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
	init_waitqueue_head(&space_info->wait);
	INIT_LIST_HEAD(&space_info->ro_bgs);
	INIT_LIST_HEAD(&space_info->tickets);
	INIT_LIST_HEAD(&space_info->priority_tickets);

	/*
	 * On failure kobject_put() drops the reference; freeing is then the
	 * job of the space_info_ktype release callback, so no explicit
	 * kfree() here.
	 */
	ret = kobject_init_and_add(&space_info->kobj, &space_info_ktype,
				    info->space_info_kobj, "%s",
				    alloc_name(space_info->flags));
	if (ret) {
		kobject_put(&space_info->kobj);
		return ret;
	}

	list_add_rcu(&space_info->list, &info->space_info);
	if (flags & BTRFS_BLOCK_GROUP_DATA)
		info->data_sinfo = space_info;

	return ret;
}
97 
98 int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
99 {
100 	struct btrfs_super_block *disk_super;
101 	u64 features;
102 	u64 flags;
103 	int mixed = 0;
104 	int ret;
105 
106 	disk_super = fs_info->super_copy;
107 	if (!btrfs_super_root(disk_super))
108 		return -EINVAL;
109 
110 	features = btrfs_super_incompat_flags(disk_super);
111 	if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
112 		mixed = 1;
113 
114 	flags = BTRFS_BLOCK_GROUP_SYSTEM;
115 	ret = create_space_info(fs_info, flags);
116 	if (ret)
117 		goto out;
118 
119 	if (mixed) {
120 		flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA;
121 		ret = create_space_info(fs_info, flags);
122 	} else {
123 		flags = BTRFS_BLOCK_GROUP_METADATA;
124 		ret = create_space_info(fs_info, flags);
125 		if (ret)
126 			goto out;
127 
128 		flags = BTRFS_BLOCK_GROUP_DATA;
129 		ret = create_space_info(fs_info, flags);
130 	}
131 out:
132 	return ret;
133 }
134 
/*
 * Account a block group's space in its matching space_info.
 *
 * @info:           the filesystem
 * @flags:          block group type flags, used to look up the space_info
 * @total_bytes:    logical size being added
 * @bytes_used:     portion of @total_bytes that is already used
 * @bytes_readonly: portion of @total_bytes that is read only
 * @space_info:     out parameter, set to the space_info that was updated
 *
 * The space_info must already exist (ASSERTed below).  The newly usable
 * space is offered to any waiting reservation tickets.
 */
void btrfs_update_space_info(struct btrfs_fs_info *info, u64 flags,
			     u64 total_bytes, u64 bytes_used,
			     u64 bytes_readonly,
			     struct btrfs_space_info **space_info)
{
	struct btrfs_space_info *found;
	int factor;

	/* disk_* totals scale with the RAID redundancy factor. */
	factor = btrfs_bg_type_to_factor(flags);

	found = btrfs_find_space_info(info, flags);
	ASSERT(found);
	spin_lock(&found->lock);
	found->total_bytes += total_bytes;
	found->disk_total += total_bytes * factor;
	found->bytes_used += bytes_used;
	found->disk_used += bytes_used * factor;
	found->bytes_readonly += bytes_readonly;
	if (total_bytes > 0)
		found->full = 0;
	btrfs_space_info_add_new_bytes(info, found,
				       total_bytes - bytes_used -
				       bytes_readonly);
	spin_unlock(&found->lock);
	*space_info = found;
}
161 
162 struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info,
163 					       u64 flags)
164 {
165 	struct list_head *head = &info->space_info;
166 	struct btrfs_space_info *found;
167 
168 	flags &= BTRFS_BLOCK_GROUP_TYPE_MASK;
169 
170 	rcu_read_lock();
171 	list_for_each_entry_rcu(found, head, list) {
172 		if (found->flags & flags) {
173 			rcu_read_unlock();
174 			return found;
175 		}
176 	}
177 	rcu_read_unlock();
178 	return NULL;
179 }
180 
181 static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global)
182 {
183 	return (global->size << 1);
184 }
185 
/*
 * Decide whether a metadata reservation of @bytes may exceed the currently
 * allocated space of @space_info, counting on as-yet unallocated chunk space
 * being usable later.
 *
 * Returns 1 if the overcommit is allowed, 0 otherwise.  Data (and mixed)
 * space is never overcommitted.
 */
static int can_overcommit(struct btrfs_fs_info *fs_info,
			  struct btrfs_space_info *space_info, u64 bytes,
			  enum btrfs_reserve_flush_enum flush,
			  bool system_chunk)
{
	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
	u64 profile;
	u64 space_size;
	u64 avail;
	u64 used;
	int factor;

	/* Don't overcommit when in mixed mode. */
	if (space_info->flags & BTRFS_BLOCK_GROUP_DATA)
		return 0;

	if (system_chunk)
		profile = btrfs_system_alloc_profile(fs_info);
	else
		profile = btrfs_metadata_alloc_profile(fs_info);

	used = btrfs_space_info_used(space_info, false);

	/*
	 * We only want to allow over committing if we have lots of actual space
	 * free, but if we don't have enough space to handle the global reserve
	 * space then we could end up having a real enospc problem when trying
	 * to allocate a chunk or some other such important allocation.
	 */
	spin_lock(&global_rsv->lock);
	space_size = calc_global_rsv_need_space(global_rsv);
	spin_unlock(&global_rsv->lock);
	if (used + space_size >= space_info->total_bytes)
		return 0;

	/* From here on count speculative reservations as used too. */
	used += space_info->bytes_may_use;

	avail = atomic64_read(&fs_info->free_chunk_space);

	/*
	 * If we have dup, raid1 or raid10 then only half of the free
	 * space is actually usable.  For raid56, the space info used
	 * doesn't include the parity drive, so we don't have to
	 * change the math
	 */
	factor = btrfs_bg_type_to_factor(profile);
	avail = div_u64(avail, factor);

	/*
	 * If we aren't flushing all things, let us overcommit up to
	 * 1/2th of the space. If we can flush, don't let us overcommit
	 * too much, let it overcommit up to 1/8 of the space.
	 */
	if (flush == BTRFS_RESERVE_FLUSH_ALL)
		avail >>= 3;
	else
		avail >>= 1;

	if (used + bytes < space_info->total_bytes + avail)
		return 1;
	return 0;
}
248 
/*
 * This is for space we already have accounted in space_info->bytes_may_use, so
 * basically when we're returning space from block_rsv's.
 *
 * The freed @num_bytes is handed to queued reservation tickets (priority
 * tickets first); whatever remains is subtracted from bytes_may_use.
 */
void btrfs_space_info_add_old_bytes(struct btrfs_fs_info *fs_info,
				    struct btrfs_space_info *space_info,
				    u64 num_bytes)
{
	struct reserve_ticket *ticket;
	struct list_head *head;
	u64 used;
	enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_NO_FLUSH;
	bool check_overcommit = false;

	spin_lock(&space_info->lock);
	head = &space_info->priority_tickets;

	/*
	 * If we are over our limit then we need to check and see if we can
	 * overcommit, and if we can't then we just need to free up our space
	 * and not satisfy any requests.
	 */
	used = btrfs_space_info_used(space_info, true);
	if (used - num_bytes >= space_info->total_bytes)
		check_overcommit = true;
again:
	while (!list_empty(head) && num_bytes) {
		ticket = list_first_entry(head, struct reserve_ticket,
					  list);
		/*
		 * We use 0 bytes because this space is already reserved, so
		 * adding the ticket space would be a double count.
		 */
		if (check_overcommit &&
		    !can_overcommit(fs_info, space_info, 0, flush, false))
			break;
		if (num_bytes >= ticket->bytes) {
			/* Ticket fully satisfied: wake its waiter. */
			list_del_init(&ticket->list);
			num_bytes -= ticket->bytes;
			ticket->bytes = 0;
			space_info->tickets_id++;
			wake_up(&ticket->wait);
		} else {
			/* Partial fill; ticket stays queued. */
			ticket->bytes -= num_bytes;
			num_bytes = 0;
		}
	}

	/* Second pass over the normal (non-priority) tickets. */
	if (num_bytes && head == &space_info->priority_tickets) {
		head = &space_info->tickets;
		flush = BTRFS_RESERVE_FLUSH_ALL;
		goto again;
	}
	btrfs_space_info_update_bytes_may_use(fs_info, space_info, -num_bytes);
	trace_btrfs_space_reservation(fs_info, "space_info",
				      space_info->flags, num_bytes, 0);
	spin_unlock(&space_info->lock);
}
307 
/*
 * This is for newly allocated space that isn't accounted in
 * space_info->bytes_may_use yet.  So if we allocate a chunk or unpin an extent
 * we use this helper.
 *
 * Space granted to a ticket is moved into bytes_may_use here, unlike in
 * btrfs_space_info_add_old_bytes() where it already was accounted.
 * Called with space_info->lock held by the caller (see
 * btrfs_update_space_info()).
 */
void btrfs_space_info_add_new_bytes(struct btrfs_fs_info *fs_info,
				    struct btrfs_space_info *space_info,
				    u64 num_bytes)
{
	struct reserve_ticket *ticket;
	struct list_head *head = &space_info->priority_tickets;

again:
	while (!list_empty(head) && num_bytes) {
		ticket = list_first_entry(head, struct reserve_ticket,
					  list);
		if (num_bytes >= ticket->bytes) {
			/* Ticket fully satisfied: account and wake waiter. */
			trace_btrfs_space_reservation(fs_info, "space_info",
						      space_info->flags,
						      ticket->bytes, 1);
			list_del_init(&ticket->list);
			num_bytes -= ticket->bytes;
			btrfs_space_info_update_bytes_may_use(fs_info,
							      space_info,
							      ticket->bytes);
			ticket->bytes = 0;
			space_info->tickets_id++;
			wake_up(&ticket->wait);
		} else {
			/* Partial fill; ticket stays queued. */
			trace_btrfs_space_reservation(fs_info, "space_info",
						      space_info->flags,
						      num_bytes, 1);
			btrfs_space_info_update_bytes_may_use(fs_info,
							      space_info,
							      num_bytes);
			ticket->bytes -= num_bytes;
			num_bytes = 0;
		}
	}

	/* Second pass over the normal (non-priority) tickets. */
	if (num_bytes && head == &space_info->priority_tickets) {
		head = &space_info->tickets;
		goto again;
	}
}
353 
/*
 * Log size/reserved of the named block reserve under its own lock.  The
 * second argument is the member name inside struct btrfs_fs_info, pasted
 * both into the expression and the log message by the preprocessor.
 */
#define DUMP_BLOCK_RSV(fs_info, rsv_name)				\
do {									\
	struct btrfs_block_rsv *__rsv = &(fs_info)->rsv_name;		\
	spin_lock(&__rsv->lock);					\
	btrfs_info(fs_info, #rsv_name ": size %llu reserved %llu",	\
		   __rsv->size, __rsv->reserved);			\
	spin_unlock(&__rsv->lock);					\
} while (0)
362 
/*
 * Dump the state of @info and all block reserves to the kernel log; if
 * @dump_block_groups is set, also dump every block group of the space_info
 * (for all RAID types).  @bytes is forwarded to btrfs_dump_free_space().
 */
void btrfs_dump_space_info(struct btrfs_fs_info *fs_info,
			   struct btrfs_space_info *info, u64 bytes,
			   int dump_block_groups)
{
	struct btrfs_block_group_cache *cache;
	int index = 0;

	spin_lock(&info->lock);
	btrfs_info(fs_info, "space_info %llu has %llu free, is %sfull",
		   info->flags,
		   info->total_bytes - btrfs_space_info_used(info, true),
		   info->full ? "" : "not ");
	btrfs_info(fs_info,
		"space_info total=%llu, used=%llu, pinned=%llu, reserved=%llu, may_use=%llu, readonly=%llu",
		info->total_bytes, info->bytes_used, info->bytes_pinned,
		info->bytes_reserved, info->bytes_may_use,
		info->bytes_readonly);
	spin_unlock(&info->lock);

	DUMP_BLOCK_RSV(fs_info, global_block_rsv);
	DUMP_BLOCK_RSV(fs_info, trans_block_rsv);
	DUMP_BLOCK_RSV(fs_info, chunk_block_rsv);
	DUMP_BLOCK_RSV(fs_info, delayed_block_rsv);
	DUMP_BLOCK_RSV(fs_info, delayed_refs_rsv);

	if (!dump_block_groups)
		return;

	down_read(&info->groups_sem);
again:
	/* Walk the block group list of each RAID type index in turn. */
	list_for_each_entry(cache, &info->block_groups[index], list) {
		spin_lock(&cache->lock);
		btrfs_info(fs_info,
			"block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %s",
			cache->key.objectid, cache->key.offset,
			btrfs_block_group_used(&cache->item), cache->pinned,
			cache->reserved, cache->ro ? "[readonly]" : "");
		btrfs_dump_free_space(cache, bytes);
		spin_unlock(&cache->lock);
	}
	if (++index < BTRFS_NR_RAID_TYPES)
		goto again;
	up_read(&info->groups_sem);
}
407 
408 static void btrfs_writeback_inodes_sb_nr(struct btrfs_fs_info *fs_info,
409 					 unsigned long nr_pages, int nr_items)
410 {
411 	struct super_block *sb = fs_info->sb;
412 
413 	if (down_read_trylock(&sb->s_umount)) {
414 		writeback_inodes_sb_nr(sb, nr_pages, WB_REASON_FS_FREE_SPACE);
415 		up_read(&sb->s_umount);
416 	} else {
417 		/*
418 		 * We needn't worry the filesystem going from r/w to r/o though
419 		 * we don't acquire ->s_umount mutex, because the filesystem
420 		 * should guarantee the delalloc inodes list be empty after
421 		 * the filesystem is readonly(all dirty pages are written to
422 		 * the disk).
423 		 */
424 		btrfs_start_delalloc_roots(fs_info, nr_items);
425 		if (!current->journal_info)
426 			btrfs_wait_ordered_roots(fs_info, nr_items, 0, (u64)-1);
427 	}
428 }
429 
430 static inline u64 calc_reclaim_items_nr(struct btrfs_fs_info *fs_info,
431 					u64 to_reclaim)
432 {
433 	u64 bytes;
434 	u64 nr;
435 
436 	bytes = btrfs_calc_trans_metadata_size(fs_info, 1);
437 	nr = div64_u64(to_reclaim, bytes);
438 	if (!nr)
439 		nr = 1;
440 	return nr;
441 }
442 
/* Bytes of delalloc assumed per reclaim item when sizing shrink_delalloc(). */
#define EXTENT_SIZE_PER_ITEM	SZ_256K
444 
/*
 * shrink metadata reservation for delalloc
 *
 * @fs_info:      the filesystem
 * @to_reclaim:   metadata bytes we are trying to free
 * @orig:         originally requested byte count (currently unused here)
 * @wait_ordered: wait on ordered extents instead of sleeping between loops
 *
 * Flushes dirty delalloc pages in up to three passes, waiting on compressed
 * (async) page submission each round, and stops early once no reservation
 * tickets remain on the metadata space_info.
 */
static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim,
			    u64 orig, bool wait_ordered)
{
	struct btrfs_space_info *space_info;
	struct btrfs_trans_handle *trans;
	u64 delalloc_bytes;
	u64 dio_bytes;
	u64 async_pages;
	u64 items;
	long time_left;
	unsigned long nr_pages;
	int loops;

	/* Calc the number of the pages we need flush for space reservation */
	items = calc_reclaim_items_nr(fs_info, to_reclaim);
	to_reclaim = items * EXTENT_SIZE_PER_ITEM;

	trans = (struct btrfs_trans_handle *)current->journal_info;
	space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);

	delalloc_bytes = percpu_counter_sum_positive(
						&fs_info->delalloc_bytes);
	dio_bytes = percpu_counter_sum_positive(&fs_info->dio_bytes);
	if (delalloc_bytes == 0 && dio_bytes == 0) {
		/*
		 * NOTE(review): waiting on ordered extents is skipped while
		 * inside a transaction (trans != NULL) — presumably deadlock
		 * avoidance; confirm before relying on it.
		 */
		if (trans)
			return;
		if (wait_ordered)
			btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1);
		return;
	}

	/*
	 * If we are doing more ordered than delalloc we need to just wait on
	 * ordered extents, otherwise we'll waste time trying to flush delalloc
	 * that likely won't give us the space back we need.
	 */
	if (dio_bytes > delalloc_bytes)
		wait_ordered = true;

	loops = 0;
	while ((delalloc_bytes || dio_bytes) && loops < 3) {
		nr_pages = min(delalloc_bytes, to_reclaim) >> PAGE_SHIFT;

		/*
		 * Triggers inode writeback for up to nr_pages. This will invoke
		 * ->writepages callback and trigger delalloc filling
		 *  (btrfs_run_delalloc_range()).
		 */
		btrfs_writeback_inodes_sb_nr(fs_info, nr_pages, items);

		/*
		 * We need to wait for the compressed pages to start before
		 * we continue.
		 */
		async_pages = atomic_read(&fs_info->async_delalloc_pages);
		if (!async_pages)
			goto skip_async;

		/*
		 * Calculate how many compressed pages we want to be written
		 * before we continue. I.e if there are more async pages than we
		 * require wait_event will wait until nr_pages are written.
		 */
		if (async_pages <= nr_pages)
			async_pages = 0;
		else
			async_pages -= nr_pages;

		wait_event(fs_info->async_submit_wait,
			   atomic_read(&fs_info->async_delalloc_pages) <=
			   (int)async_pages);
skip_async:
		/* Nobody waiting for space anymore: we are done. */
		spin_lock(&space_info->lock);
		if (list_empty(&space_info->tickets) &&
		    list_empty(&space_info->priority_tickets)) {
			spin_unlock(&space_info->lock);
			break;
		}
		spin_unlock(&space_info->lock);

		loops++;
		if (wait_ordered && !trans) {
			btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1);
		} else {
			/* Brief, killable pause before re-sampling counters. */
			time_left = schedule_timeout_killable(1);
			if (time_left)
				break;
		}
		delalloc_bytes = percpu_counter_sum_positive(
						&fs_info->delalloc_bytes);
		dio_bytes = percpu_counter_sum_positive(&fs_info->dio_bytes);
	}
}
541 
/**
 * may_commit_transaction - possibly commit the transaction if its ok to
 * @fs_info:    the filesystem
 * @space_info: the space_info whose first ticket we try to satisfy
 *
 * This will check to make sure that committing the transaction will actually
 * get us somewhere and then commit the transaction if it does.  Otherwise it
 * will return -ENOSPC.  Returns -EAGAIN if the caller is already inside a
 * transaction (committing from there is not possible).
 */
static int may_commit_transaction(struct btrfs_fs_info *fs_info,
				  struct btrfs_space_info *space_info)
{
	struct reserve_ticket *ticket = NULL;
	struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_block_rsv;
	struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
	struct btrfs_trans_handle *trans;
	u64 bytes_needed;
	u64 reclaim_bytes = 0;

	trans = (struct btrfs_trans_handle *)current->journal_info;
	if (trans)
		return -EAGAIN;

	/* Size the commit by the first (oldest) queued ticket, if any. */
	spin_lock(&space_info->lock);
	if (!list_empty(&space_info->priority_tickets))
		ticket = list_first_entry(&space_info->priority_tickets,
					  struct reserve_ticket, list);
	else if (!list_empty(&space_info->tickets))
		ticket = list_first_entry(&space_info->tickets,
					  struct reserve_ticket, list);
	bytes_needed = (ticket) ? ticket->bytes : 0;
	spin_unlock(&space_info->lock);

	if (!bytes_needed)
		return 0;

	trans = btrfs_join_transaction(fs_info->extent_root);
	if (IS_ERR(trans))
		return PTR_ERR(trans);

	/*
	 * See if there is enough pinned space to make this reservation, or if
	 * we have block groups that are going to be freed, allowing us to
	 * possibly do a chunk allocation the next loop through.
	 */
	if (test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags) ||
	    __percpu_counter_compare(&space_info->total_bytes_pinned,
				     bytes_needed,
				     BTRFS_TOTAL_BYTES_PINNED_BATCH) >= 0)
		goto commit;

	/*
	 * See if there is some space in the delayed insertion reservation for
	 * this reservation.
	 */
	if (space_info != delayed_rsv->space_info)
		goto enospc;

	spin_lock(&delayed_rsv->lock);
	reclaim_bytes += delayed_rsv->reserved;
	spin_unlock(&delayed_rsv->lock);

	spin_lock(&delayed_refs_rsv->lock);
	reclaim_bytes += delayed_refs_rsv->reserved;
	spin_unlock(&delayed_refs_rsv->lock);
	if (reclaim_bytes >= bytes_needed)
		goto commit;
	bytes_needed -= reclaim_bytes;

	/* Pinned space plus reclaimable reserves still fall short: give up. */
	if (__percpu_counter_compare(&space_info->total_bytes_pinned,
				   bytes_needed,
				   BTRFS_TOTAL_BYTES_PINNED_BATCH) < 0)
		goto enospc;

commit:
	return btrfs_commit_transaction(trans);
enospc:
	btrfs_end_transaction(trans);
	return -ENOSPC;
}
623 
624 /*
625  * Try to flush some data based on policy set by @state. This is only advisory
626  * and may fail for various reasons. The caller is supposed to examine the
627  * state of @space_info to detect the outcome.
628  */
629 static void flush_space(struct btrfs_fs_info *fs_info,
630 		       struct btrfs_space_info *space_info, u64 num_bytes,
631 		       int state)
632 {
633 	struct btrfs_root *root = fs_info->extent_root;
634 	struct btrfs_trans_handle *trans;
635 	int nr;
636 	int ret = 0;
637 
638 	switch (state) {
639 	case FLUSH_DELAYED_ITEMS_NR:
640 	case FLUSH_DELAYED_ITEMS:
641 		if (state == FLUSH_DELAYED_ITEMS_NR)
642 			nr = calc_reclaim_items_nr(fs_info, num_bytes) * 2;
643 		else
644 			nr = -1;
645 
646 		trans = btrfs_join_transaction(root);
647 		if (IS_ERR(trans)) {
648 			ret = PTR_ERR(trans);
649 			break;
650 		}
651 		ret = btrfs_run_delayed_items_nr(trans, nr);
652 		btrfs_end_transaction(trans);
653 		break;
654 	case FLUSH_DELALLOC:
655 	case FLUSH_DELALLOC_WAIT:
656 		shrink_delalloc(fs_info, num_bytes * 2, num_bytes,
657 				state == FLUSH_DELALLOC_WAIT);
658 		break;
659 	case FLUSH_DELAYED_REFS_NR:
660 	case FLUSH_DELAYED_REFS:
661 		trans = btrfs_join_transaction(root);
662 		if (IS_ERR(trans)) {
663 			ret = PTR_ERR(trans);
664 			break;
665 		}
666 		if (state == FLUSH_DELAYED_REFS_NR)
667 			nr = calc_reclaim_items_nr(fs_info, num_bytes);
668 		else
669 			nr = 0;
670 		btrfs_run_delayed_refs(trans, nr);
671 		btrfs_end_transaction(trans);
672 		break;
673 	case ALLOC_CHUNK:
674 	case ALLOC_CHUNK_FORCE:
675 		trans = btrfs_join_transaction(root);
676 		if (IS_ERR(trans)) {
677 			ret = PTR_ERR(trans);
678 			break;
679 		}
680 		ret = btrfs_chunk_alloc(trans,
681 				btrfs_metadata_alloc_profile(fs_info),
682 				(state == ALLOC_CHUNK) ? CHUNK_ALLOC_NO_FORCE :
683 					CHUNK_ALLOC_FORCE);
684 		btrfs_end_transaction(trans);
685 		if (ret > 0 || ret == -ENOSPC)
686 			ret = 0;
687 		break;
688 	case COMMIT_TRANS:
689 		/*
690 		 * If we have pending delayed iputs then we could free up a
691 		 * bunch of pinned space, so make sure we run the iputs before
692 		 * we do our pinned bytes check below.
693 		 */
694 		btrfs_run_delayed_iputs(fs_info);
695 		btrfs_wait_on_delayed_iputs(fs_info);
696 
697 		ret = may_commit_transaction(fs_info, space_info);
698 		break;
699 	default:
700 		ret = -ENOSPC;
701 		break;
702 	}
703 
704 	trace_btrfs_flush_space(fs_info, space_info->flags, num_bytes, state,
705 				ret);
706 	return;
707 }
708 
/*
 * Estimate how many metadata bytes flushing should try to reclaim for
 * @space_info.  If tickets are queued their total is the target; otherwise
 * fall back to a heuristic based on how close to full the space_info is.
 *
 * Both ticket lists are walked, so callers hold space_info->lock.
 */
static inline u64
btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info,
				 struct btrfs_space_info *space_info,
				 bool system_chunk)
{
	struct reserve_ticket *ticket;
	u64 used;
	u64 expected;
	u64 to_reclaim = 0;

	list_for_each_entry(ticket, &space_info->tickets, list)
		to_reclaim += ticket->bytes;
	list_for_each_entry(ticket, &space_info->priority_tickets, list)
		to_reclaim += ticket->bytes;
	if (to_reclaim)
		return to_reclaim;

	/* No tickets: probe whether a modest reservation would overcommit. */
	to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M);
	if (can_overcommit(fs_info, space_info, to_reclaim,
			   BTRFS_RESERVE_FLUSH_ALL, system_chunk))
		return 0;

	used = btrfs_space_info_used(space_info, true);

	/* Aim for 95% or 90% utilization depending on overcommit headroom. */
	if (can_overcommit(fs_info, space_info, SZ_1M,
			   BTRFS_RESERVE_FLUSH_ALL, system_chunk))
		expected = div_factor_fine(space_info->total_bytes, 95);
	else
		expected = div_factor_fine(space_info->total_bytes, 90);

	if (used > expected)
		to_reclaim = used - expected;
	else
		to_reclaim = 0;
	/* Never try to reclaim more than is actually reclaimable. */
	to_reclaim = min(to_reclaim, space_info->bytes_may_use +
				     space_info->bytes_reserved);
	return to_reclaim;
}
747 
748 static inline int need_do_async_reclaim(struct btrfs_fs_info *fs_info,
749 					struct btrfs_space_info *space_info,
750 					u64 used, bool system_chunk)
751 {
752 	u64 thresh = div_factor_fine(space_info->total_bytes, 98);
753 
754 	/* If we're just plain full then async reclaim just slows us down. */
755 	if ((space_info->bytes_used + space_info->bytes_reserved) >= thresh)
756 		return 0;
757 
758 	if (!btrfs_calc_reclaim_metadata_size(fs_info, space_info,
759 					      system_chunk))
760 		return 0;
761 
762 	return (used >= thresh && !btrfs_fs_closing(fs_info) &&
763 		!test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state));
764 }
765 
766 static bool wake_all_tickets(struct list_head *head)
767 {
768 	struct reserve_ticket *ticket;
769 
770 	while (!list_empty(head)) {
771 		ticket = list_first_entry(head, struct reserve_ticket, list);
772 		list_del_init(&ticket->list);
773 		ticket->error = -ENOSPC;
774 		wake_up(&ticket->wait);
775 		if (ticket->bytes != ticket->orig_bytes)
776 			return true;
777 	}
778 	return false;
779 }
780 
/*
 * This is for normal flushers, we can wait all goddamned day if we want to.  We
 * will loop and continuously try to flush as long as we are making progress.
 * We count progress as clearing off tickets each time we have to loop.
 */
static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
{
	struct btrfs_fs_info *fs_info;
	struct btrfs_space_info *space_info;
	u64 to_reclaim;
	int flush_state;
	int commit_cycles = 0;
	u64 last_tickets_id;

	fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work);
	space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);

	spin_lock(&space_info->lock);
	to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info,
						      false);
	if (!to_reclaim) {
		space_info->flush = 0;
		spin_unlock(&space_info->lock);
		return;
	}
	/* tickets_id advances whenever a ticket is satisfied; use it to
	 * detect progress between flush attempts. */
	last_tickets_id = space_info->tickets_id;
	spin_unlock(&space_info->lock);

	flush_state = FLUSH_DELAYED_ITEMS_NR;
	do {
		flush_space(fs_info, space_info, to_reclaim, flush_state);
		spin_lock(&space_info->lock);
		if (list_empty(&space_info->tickets)) {
			space_info->flush = 0;
			spin_unlock(&space_info->lock);
			return;
		}
		to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info,
							      space_info,
							      false);
		if (last_tickets_id == space_info->tickets_id) {
			/* No progress: escalate to the next flush state. */
			flush_state++;
		} else {
			/* Progress made: restart from the cheapest state. */
			last_tickets_id = space_info->tickets_id;
			flush_state = FLUSH_DELAYED_ITEMS_NR;
			if (commit_cycles)
				commit_cycles--;
		}

		/*
		 * We don't want to force a chunk allocation until we've tried
		 * pretty hard to reclaim space.  Think of the case where we
		 * freed up a bunch of space and so have a lot of pinned space
		 * to reclaim.  We would rather use that than possibly create a
		 * underutilized metadata chunk.  So if this is our first run
		 * through the flushing state machine skip ALLOC_CHUNK_FORCE and
		 * commit the transaction.  If nothing has changed the next go
		 * around then we can force a chunk allocation.
		 */
		if (flush_state == ALLOC_CHUNK_FORCE && !commit_cycles)
			flush_state++;

		if (flush_state > COMMIT_TRANS) {
			commit_cycles++;
			if (commit_cycles > 2) {
				/*
				 * Three full passes without satisfying the
				 * tickets: fail them, unless one was partly
				 * filled, in which case retry from scratch.
				 */
				if (wake_all_tickets(&space_info->tickets)) {
					flush_state = FLUSH_DELAYED_ITEMS_NR;
					commit_cycles--;
				} else {
					space_info->flush = 0;
				}
			} else {
				flush_state = FLUSH_DELAYED_ITEMS_NR;
			}
		}
		spin_unlock(&space_info->lock);
	} while (flush_state <= COMMIT_TRANS);
}
859 
/* Bind the async metadata reclaim worker to @work (fs_info setup helper). */
void btrfs_init_async_reclaim_work(struct work_struct *work)
{
	INIT_WORK(work, btrfs_async_reclaim_metadata_space);
}
864 
/*
 * Flush states used by priority_reclaim_metadata_space().  Note this subset
 * stops short of COMMIT_TRANS and the delalloc/delayed-ref states used by
 * the async worker.
 */
static const enum btrfs_flush_state priority_flush_states[] = {
	FLUSH_DELAYED_ITEMS_NR,
	FLUSH_DELAYED_ITEMS,
	ALLOC_CHUNK,
};
870 
/*
 * Synchronous flushing for priority (BTRFS_RESERVE_FLUSH_LIMIT) reservations:
 * walk priority_flush_states once, returning early as soon as @ticket has
 * been fully satisfied (bytes == 0).
 */
static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
					    struct btrfs_space_info *space_info,
					    struct reserve_ticket *ticket)
{
	u64 to_reclaim;
	int flush_state;

	spin_lock(&space_info->lock);
	to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info,
						      false);
	if (!to_reclaim) {
		spin_unlock(&space_info->lock);
		return;
	}
	spin_unlock(&space_info->lock);

	flush_state = 0;
	do {
		flush_space(fs_info, space_info, to_reclaim,
			    priority_flush_states[flush_state]);
		flush_state++;
		spin_lock(&space_info->lock);
		if (ticket->bytes == 0) {
			spin_unlock(&space_info->lock);
			return;
		}
		spin_unlock(&space_info->lock);
	} while (flush_state < ARRAY_SIZE(priority_flush_states));
}
900 
/*
 * Sleep until @ticket is satisfied (bytes == 0), fails (error set), or the
 * task is killed.  Any partially granted space is returned to @space_info
 * before returning.
 *
 * Returns 0 on success, -EINTR if the wait was interrupted by a fatal
 * signal, or the error recorded on the ticket.
 */
static int wait_reserve_ticket(struct btrfs_fs_info *fs_info,
			       struct btrfs_space_info *space_info,
			       struct reserve_ticket *ticket)

{
	DEFINE_WAIT(wait);
	u64 reclaim_bytes = 0;
	int ret = 0;

	spin_lock(&space_info->lock);
	while (ticket->bytes > 0 && ticket->error == 0) {
		ret = prepare_to_wait_event(&ticket->wait, &wait, TASK_KILLABLE);
		if (ret) {
			ret = -EINTR;
			break;
		}
		/* Drop the lock while sleeping; wakers take it to fill us. */
		spin_unlock(&space_info->lock);

		schedule();

		finish_wait(&ticket->wait, &wait);
		spin_lock(&space_info->lock);
	}
	if (!ret)
		ret = ticket->error;
	if (!list_empty(&ticket->list))
		list_del_init(&ticket->list);
	/* Give back anything that was granted to us only partially. */
	if (ticket->bytes && ticket->bytes < ticket->orig_bytes)
		reclaim_bytes = ticket->orig_bytes - ticket->bytes;
	spin_unlock(&space_info->lock);

	if (reclaim_bytes)
		btrfs_space_info_add_old_bytes(fs_info, space_info,
					       reclaim_bytes);
	return ret;
}
937 
/**
 * __reserve_metadata_bytes - try to reserve bytes from a space_info
 * @fs_info:      the filesystem
 * @space_info:   the space info we want to allocate from
 * @orig_bytes:   the number of bytes we want
 * @flush:        whether or not we can flush to make our reservation
 * @system_chunk: whether this reservation is for the system chunk space
 *
 * This will reserve orig_bytes number of bytes from the space info associated
 * with the block_rsv.  If there is not enough space it will make an attempt to
 * flush out space to make room.  It will do this by flushing delalloc if
 * possible or committing the transaction.  If flush is BTRFS_RESERVE_NO_FLUSH
 * then no attempts to regain reservations will be made and this will fail if
 * there is not enough space already.
 */
static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
				    struct btrfs_space_info *space_info,
				    u64 orig_bytes,
				    enum btrfs_reserve_flush_enum flush,
				    bool system_chunk)
{
	struct reserve_ticket ticket;
	u64 used;
	u64 reclaim_bytes = 0;
	int ret = 0;

	ASSERT(orig_bytes);
	ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_ALL);

	spin_lock(&space_info->lock);
	ret = -ENOSPC;
	used = btrfs_space_info_used(space_info, true);

	/*
	 * Carry on if we have enough space (short-circuit) OR call
	 * can_overcommit() to ensure we can overcommit to continue.
	 */
	if ((used + orig_bytes <= space_info->total_bytes) ||
	    can_overcommit(fs_info, space_info, orig_bytes, flush,
			   system_chunk)) {
		btrfs_space_info_update_bytes_may_use(fs_info, space_info,
						      orig_bytes);
		trace_btrfs_space_reservation(fs_info, "space_info",
					      space_info->flags, orig_bytes, 1);
		ret = 0;
	}

	/*
	 * If we couldn't make a reservation then setup our reservation ticket
	 * and kick the async worker if it's not already running.
	 *
	 * If we are a priority flusher then we just need to add our ticket to
	 * the list and we will do our own flushing further down.
	 */
	if (ret && flush != BTRFS_RESERVE_NO_FLUSH) {
		ticket.orig_bytes = orig_bytes;
		ticket.bytes = orig_bytes;
		ticket.error = 0;
		init_waitqueue_head(&ticket.wait);
		if (flush == BTRFS_RESERVE_FLUSH_ALL) {
			list_add_tail(&ticket.list, &space_info->tickets);
			if (!space_info->flush) {
				space_info->flush = 1;
				trace_btrfs_trigger_flush(fs_info,
							  space_info->flags,
							  orig_bytes, flush,
							  "enospc");
				queue_work(system_unbound_wq,
					   &fs_info->async_reclaim_work);
			}
		} else {
			list_add_tail(&ticket.list,
				      &space_info->priority_tickets);
		}
	} else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
		used += orig_bytes;
		/*
		 * We will do the space reservation dance during log replay,
		 * which means we won't have fs_info->fs_root set, so don't do
		 * the async reclaim as we will panic.
		 */
		if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags) &&
		    need_do_async_reclaim(fs_info, space_info,
					  used, system_chunk) &&
		    !work_busy(&fs_info->async_reclaim_work)) {
			trace_btrfs_trigger_flush(fs_info, space_info->flags,
						  orig_bytes, flush, "preempt");
			queue_work(system_unbound_wq,
				   &fs_info->async_reclaim_work);
		}
	}
	spin_unlock(&space_info->lock);
	if (!ret || flush == BTRFS_RESERVE_NO_FLUSH)
		return ret;

	if (flush == BTRFS_RESERVE_FLUSH_ALL)
		return wait_reserve_ticket(fs_info, space_info, &ticket);

	/* Priority flushers flush synchronously and reap their own ticket. */
	ret = 0;
	priority_reclaim_metadata_space(fs_info, space_info, &ticket);
	spin_lock(&space_info->lock);
	if (ticket.bytes) {
		/* Return any partial fill and report ENOSPC. */
		if (ticket.bytes < orig_bytes)
			reclaim_bytes = orig_bytes - ticket.bytes;
		list_del_init(&ticket.list);
		ret = -ENOSPC;
	}
	spin_unlock(&space_info->lock);

	if (reclaim_bytes)
		btrfs_space_info_add_old_bytes(fs_info, space_info,
					       reclaim_bytes);
	ASSERT(list_empty(&ticket.list));
	return ret;
}
1052 
/**
 * btrfs_reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
 * @root:       the root we're allocating for
 * @block_rsv:  the block_rsv we're allocating for
 * @orig_bytes: the number of bytes we want
 * @flush:      whether or not we can flush to make our reservation
 *
 * This will reserve orig_bytes number of bytes from the space info associated
 * with the block_rsv.  If there is not enough space it will make an attempt to
 * flush out space to make room.  It will do this by flushing delalloc if
 * possible or committing the transaction.  If flush is BTRFS_RESERVE_NO_FLUSH
 * then no attempts to regain reservations will be made and this will fail if
 * there is not enough space already.
 */
int btrfs_reserve_metadata_bytes(struct btrfs_root *root,
				 struct btrfs_block_rsv *block_rsv,
				 u64 orig_bytes,
				 enum btrfs_reserve_flush_enum flush)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
	int ret;
	bool system_chunk = (root == fs_info->chunk_root);

	ret = __reserve_metadata_bytes(fs_info, block_rsv->space_info,
				       orig_bytes, flush, system_chunk);
	/*
	 * During orphan cleanup fall back to stealing from the global
	 * reserve rather than failing outright.
	 */
	if (ret == -ENOSPC &&
	    unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) {
		if (block_rsv != global_rsv &&
		    !btrfs_block_rsv_use_bytes(global_rsv, orig_bytes))
			ret = 0;
	}
	if (ret == -ENOSPC) {
		trace_btrfs_space_reservation(fs_info, "space_info:enospc",
					      block_rsv->space_info->flags,
					      orig_bytes, 1);

		if (btrfs_test_opt(fs_info, ENOSPC_DEBUG))
			btrfs_dump_space_info(fs_info, block_rsv->space_info,
					      orig_bytes, 0);
	}
	return ret;
}
1096