xref: /openbmc/linux/fs/btrfs/delayed-ref.c (revision b737eecd)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (C) 2009 Oracle.  All rights reserved.
4  */
5 
6 #include <linux/sched.h>
7 #include <linux/slab.h>
8 #include <linux/sort.h>
9 #include "ctree.h"
10 #include "delayed-ref.h"
11 #include "transaction.h"
12 #include "qgroup.h"
13 #include "space-info.h"
14 
15 struct kmem_cache *btrfs_delayed_ref_head_cachep;
16 struct kmem_cache *btrfs_delayed_tree_ref_cachep;
17 struct kmem_cache *btrfs_delayed_data_ref_cachep;
18 struct kmem_cache *btrfs_delayed_extent_op_cachep;
19 /*
20  * delayed back reference update tracking.  For subvolume trees
21  * we queue up extent allocations and backref maintenance for
22  * delayed processing.  This avoids deep call chains where we
23  * add extents in the middle of btrfs_search_slot, and it allows
24  * us to buffer up frequently modified backrefs in an rb tree instead
25  * of hammering updates on the extent allocation tree.
26  */
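/*
 * The usual flow: btrfs_add_delayed_tree_ref() and btrfs_add_delayed_data_ref()
 * below queue a ref head plus a ref node, and the queued work is later run by
 * btrfs_run_delayed_refs() in extent-tree.c, typically at transaction commit.
 */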
27 
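/*
 * Return true when the bytes reserved in the delayed refs rsv plus the global
 * rsv no longer cover the size of the delayed refs rsv, i.e. we are short on
 * reserved space and callers should start throttling delayed ref generation.
 */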
28 bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info)
29 {
30 	struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
31 	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
32 	bool ret = false;
33 	u64 reserved;
34 
35 	spin_lock(&global_rsv->lock);
36 	reserved = global_rsv->reserved;
37 	spin_unlock(&global_rsv->lock);
38 
39 	/*
40 	 * Since the global reserve is just kind of magic we don't really want
41 	 * to rely on it to save our bacon, so if the delayed_refs_rsv size is
42 	 * bigger than the reserved bytes of the delayed_refs_rsv and the global
43 	 * rsv combined then it's time to think about bailing.
44 	 */
45 	spin_lock(&delayed_refs_rsv->lock);
46 	reserved += delayed_refs_rsv->reserved;
47 	if (delayed_refs_rsv->size >= reserved)
48 		ret = true;
49 	spin_unlock(&delayed_refs_rsv->lock);
50 	return ret;
51 }
52 
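/*
 * Estimate how long running the currently queued delayed refs would take
 * (number of entries times the average runtime per ref) and tell the caller
 * whether to throttle: 1 for at least a second worth of work, 2 for at least
 * half a second, otherwise fall back to checking whether the delayed refs
 * rsv is running short on space.
 */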
53 int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans)
54 {
55 	u64 num_entries =
56 		atomic_read(&trans->transaction->delayed_refs.num_entries);
57 	u64 avg_runtime;
58 	u64 val;
59 
60 	smp_mb();
61 	avg_runtime = trans->fs_info->avg_delayed_ref_runtime;
62 	val = num_entries * avg_runtime;
63 	if (val >= NSEC_PER_SEC)
64 		return 1;
65 	if (val >= NSEC_PER_SEC / 2)
66 		return 2;
67 
68 	return btrfs_check_space_for_delayed_refs(trans->fs_info);
69 }
70 
71 /**
72  * btrfs_delayed_refs_rsv_release - release a ref head's reservation
73  *
74  * @fs_info:  the filesystem
75  * @nr:       number of items to drop
76  *
77  * This drops the delayed ref head's count from the delayed refs rsv and frees
78  * any excess reservation we had.
79  */
80 void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr)
81 {
82 	struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv;
83 	u64 num_bytes = btrfs_calc_insert_metadata_size(fs_info, nr);
84 	u64 released = 0;
85 
86 	released = btrfs_block_rsv_release(fs_info, block_rsv, num_bytes, NULL);
87 	if (released)
88 		trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv",
89 					      0, released, 0);
90 }
91 
92 /*
93  * btrfs_update_delayed_refs_rsv - adjust the size of the delayed refs rsv
94  * @trans: the trans that may have generated delayed refs
95  *
96  * This is to be called anytime we may have adjusted trans->delayed_ref_updates;
97  * it will calculate the additional size and add it to the delayed_refs_rsv.
98  */
99 void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans)
100 {
101 	struct btrfs_fs_info *fs_info = trans->fs_info;
102 	struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv;
103 	u64 num_bytes;
104 
105 	if (!trans->delayed_ref_updates)
106 		return;
107 
108 	num_bytes = btrfs_calc_insert_metadata_size(fs_info,
109 						    trans->delayed_ref_updates);
110 	spin_lock(&delayed_rsv->lock);
111 	delayed_rsv->size += num_bytes;
112 	delayed_rsv->full = 0;
113 	spin_unlock(&delayed_rsv->lock);
114 	trans->delayed_ref_updates = 0;
115 }
116 
117 /**
118  * btrfs_migrate_to_delayed_refs_rsv - transfer bytes to our delayed refs rsv
119  *
120  * @fs_info:   the filesystem
121  * @src:       source block rsv to transfer from
122  * @num_bytes: number of bytes to transfer
123  *
124  * This transfers up to the num_bytes amount from the src rsv to the
125  * delayed_refs_rsv.  Any extra bytes are returned to the space info.
126  */
127 void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info,
128 				       struct btrfs_block_rsv *src,
129 				       u64 num_bytes)
130 {
131 	struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
132 	u64 to_free = 0;
133 
134 	spin_lock(&src->lock);
135 	src->reserved -= num_bytes;
136 	src->size -= num_bytes;
137 	spin_unlock(&src->lock);
138 
139 	spin_lock(&delayed_refs_rsv->lock);
140 	if (delayed_refs_rsv->size > delayed_refs_rsv->reserved) {
141 		u64 delta = delayed_refs_rsv->size -
142 			delayed_refs_rsv->reserved;
143 		if (num_bytes > delta) {
144 			to_free = num_bytes - delta;
145 			num_bytes = delta;
146 		}
147 	} else {
148 		to_free = num_bytes;
149 		num_bytes = 0;
150 	}
151 
152 	if (num_bytes)
153 		delayed_refs_rsv->reserved += num_bytes;
154 	if (delayed_refs_rsv->reserved >= delayed_refs_rsv->size)
155 		delayed_refs_rsv->full = 1;
156 	spin_unlock(&delayed_refs_rsv->lock);
157 
158 	if (num_bytes)
159 		trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv",
160 					      0, num_bytes, 1);
161 	if (to_free)
162 		btrfs_space_info_free_bytes_may_use(fs_info,
163 				delayed_refs_rsv->space_info, to_free);
164 }
165 
166 /**
167  * btrfs_delayed_refs_rsv_refill - refill based on our delayed refs usage
168  *
169  * @fs_info: the filesystem
170  * @flush:   control how we can flush for this reservation
171  *
172  * This will refill the delayed block_rsv up to one item's worth of space and
173  * will return -ENOSPC if we can't make the reservation.
174  */
175 int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info,
176 				  enum btrfs_reserve_flush_enum flush)
177 {
178 	struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv;
179 	u64 limit = btrfs_calc_insert_metadata_size(fs_info, 1);
180 	u64 num_bytes = 0;
181 	int ret = -ENOSPC;
182 
183 	spin_lock(&block_rsv->lock);
184 	if (block_rsv->reserved < block_rsv->size) {
185 		num_bytes = block_rsv->size - block_rsv->reserved;
186 		num_bytes = min(num_bytes, limit);
187 	}
188 	spin_unlock(&block_rsv->lock);
189 
190 	if (!num_bytes)
191 		return 0;
192 
193 	ret = btrfs_reserve_metadata_bytes(fs_info->extent_root, block_rsv,
194 					   num_bytes, flush);
195 	if (ret)
196 		return ret;
197 	btrfs_block_rsv_add_bytes(block_rsv, num_bytes, 0);
198 	trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv",
199 				      0, num_bytes, 1);
200 	return 0;
201 }
202 
203 /*
204  * compare two delayed tree backrefs with same bytenr and type
205  */
206 static int comp_tree_refs(struct btrfs_delayed_tree_ref *ref1,
207 			  struct btrfs_delayed_tree_ref *ref2)
208 {
209 	if (ref1->node.type == BTRFS_TREE_BLOCK_REF_KEY) {
210 		if (ref1->root < ref2->root)
211 			return -1;
212 		if (ref1->root > ref2->root)
213 			return 1;
214 	} else {
215 		if (ref1->parent < ref2->parent)
216 			return -1;
217 		if (ref1->parent > ref2->parent)
218 			return 1;
219 	}
220 	return 0;
221 }
222 
223 /*
224  * compare two delayed data backrefs with same bytenr and type
225  */
226 static int comp_data_refs(struct btrfs_delayed_data_ref *ref1,
227 			  struct btrfs_delayed_data_ref *ref2)
228 {
229 	if (ref1->node.type == BTRFS_EXTENT_DATA_REF_KEY) {
230 		if (ref1->root < ref2->root)
231 			return -1;
232 		if (ref1->root > ref2->root)
233 			return 1;
234 		if (ref1->objectid < ref2->objectid)
235 			return -1;
236 		if (ref1->objectid > ref2->objectid)
237 			return 1;
238 		if (ref1->offset < ref2->offset)
239 			return -1;
240 		if (ref1->offset > ref2->offset)
241 			return 1;
242 	} else {
243 		if (ref1->parent < ref2->parent)
244 			return -1;
245 		if (ref1->parent > ref2->parent)
246 			return 1;
247 	}
248 	return 0;
249 }
250 
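/*
 * Compare two delayed ref nodes for the same bytenr: order by type first,
 * then by the type specific fields, and finally by sequence number when
 * @check_seq is set.
 */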
251 static int comp_refs(struct btrfs_delayed_ref_node *ref1,
252 		     struct btrfs_delayed_ref_node *ref2,
253 		     bool check_seq)
254 {
255 	int ret = 0;
256 
257 	if (ref1->type < ref2->type)
258 		return -1;
259 	if (ref1->type > ref2->type)
260 		return 1;
261 	if (ref1->type == BTRFS_TREE_BLOCK_REF_KEY ||
262 	    ref1->type == BTRFS_SHARED_BLOCK_REF_KEY)
263 		ret = comp_tree_refs(btrfs_delayed_node_to_tree_ref(ref1),
264 				     btrfs_delayed_node_to_tree_ref(ref2));
265 	else
266 		ret = comp_data_refs(btrfs_delayed_node_to_data_ref(ref1),
267 				     btrfs_delayed_node_to_data_ref(ref2));
268 	if (ret)
269 		return ret;
270 	if (check_seq) {
271 		if (ref1->seq < ref2->seq)
272 			return -1;
273 		if (ref1->seq > ref2->seq)
274 			return 1;
275 	}
276 	return 0;
277 }
278 
279 /* insert a new head ref into the href rbtree, or return the existing one */
280 static struct btrfs_delayed_ref_head *htree_insert(struct rb_root_cached *root,
281 						   struct rb_node *node)
282 {
283 	struct rb_node **p = &root->rb_root.rb_node;
284 	struct rb_node *parent_node = NULL;
285 	struct btrfs_delayed_ref_head *entry;
286 	struct btrfs_delayed_ref_head *ins;
287 	u64 bytenr;
288 	bool leftmost = true;
289 
290 	ins = rb_entry(node, struct btrfs_delayed_ref_head, href_node);
291 	bytenr = ins->bytenr;
292 	while (*p) {
293 		parent_node = *p;
294 		entry = rb_entry(parent_node, struct btrfs_delayed_ref_head,
295 				 href_node);
296 
297 		if (bytenr < entry->bytenr) {
298 			p = &(*p)->rb_left;
299 		} else if (bytenr > entry->bytenr) {
300 			p = &(*p)->rb_right;
301 			leftmost = false;
302 		} else {
303 			return entry;
304 		}
305 	}
306 
307 	rb_link_node(node, parent_node, p);
308 	rb_insert_color_cached(node, root, leftmost);
309 	return NULL;
310 }
311 
312 static struct btrfs_delayed_ref_node *tree_insert(struct rb_root_cached *root,
313 		struct btrfs_delayed_ref_node *ins)
314 {
315 	struct rb_node **p = &root->rb_root.rb_node;
316 	struct rb_node *node = &ins->ref_node;
317 	struct rb_node *parent_node = NULL;
318 	struct btrfs_delayed_ref_node *entry;
319 	bool leftmost = true;
320 
321 	while (*p) {
322 		int comp;
323 
324 		parent_node = *p;
325 		entry = rb_entry(parent_node, struct btrfs_delayed_ref_node,
326 				 ref_node);
327 		comp = comp_refs(ins, entry, true);
328 		if (comp < 0) {
329 			p = &(*p)->rb_left;
330 		} else if (comp > 0) {
331 			p = &(*p)->rb_right;
332 			leftmost = false;
333 		} else {
334 			return entry;
335 		}
336 	}
337 
338 	rb_link_node(node, parent_node, p);
339 	rb_insert_color_cached(node, root, leftmost);
340 	return NULL;
341 }
342 
343 static struct btrfs_delayed_ref_head *find_first_ref_head(
344 		struct btrfs_delayed_ref_root *dr)
345 {
346 	struct rb_node *n;
347 	struct btrfs_delayed_ref_head *entry;
348 
349 	n = rb_first_cached(&dr->href_root);
350 	if (!n)
351 		return NULL;
352 
353 	entry = rb_entry(n, struct btrfs_delayed_ref_head, href_node);
354 
355 	return entry;
356 }
357 
358 /*
359  * Find a head entry based on bytenr. This returns the delayed ref head if it
360  * was able to find one, or NULL if nothing was in that spot.  If return_bigger
361  * is given, the next bigger entry is returned if no exact match is found.
362  */
363 static struct btrfs_delayed_ref_head *find_ref_head(
364 		struct btrfs_delayed_ref_root *dr, u64 bytenr,
365 		bool return_bigger)
366 {
367 	struct rb_root *root = &dr->href_root.rb_root;
368 	struct rb_node *n;
369 	struct btrfs_delayed_ref_head *entry;
370 
371 	n = root->rb_node;
372 	entry = NULL;
373 	while (n) {
374 		entry = rb_entry(n, struct btrfs_delayed_ref_head, href_node);
375 
376 		if (bytenr < entry->bytenr)
377 			n = n->rb_left;
378 		else if (bytenr > entry->bytenr)
379 			n = n->rb_right;
380 		else
381 			return entry;
382 	}
383 	if (entry && return_bigger) {
384 		if (bytenr > entry->bytenr) {
385 			n = rb_next(&entry->href_node);
386 			if (!n)
387 				return NULL;
388 			entry = rb_entry(n, struct btrfs_delayed_ref_head,
389 					 href_node);
390 		}
391 		return entry;
392 	}
393 	return NULL;
394 }
395 
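/*
 * Lock a delayed ref head while holding delayed_refs->lock.  If the mutex is
 * contended, drop the delayed_refs lock while sleeping on it and retake both
 * afterwards; -EAGAIN is returned if the head was removed from the rbtree in
 * the meantime and the caller has to start over.
 */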
396 int btrfs_delayed_ref_lock(struct btrfs_delayed_ref_root *delayed_refs,
397 			   struct btrfs_delayed_ref_head *head)
398 {
399 	lockdep_assert_held(&delayed_refs->lock);
400 	if (mutex_trylock(&head->mutex))
401 		return 0;
402 
403 	refcount_inc(&head->refs);
404 	spin_unlock(&delayed_refs->lock);
405 
406 	mutex_lock(&head->mutex);
407 	spin_lock(&delayed_refs->lock);
408 	if (RB_EMPTY_NODE(&head->href_node)) {
409 		mutex_unlock(&head->mutex);
410 		btrfs_put_delayed_ref_head(head);
411 		return -EAGAIN;
412 	}
413 	btrfs_put_delayed_ref_head(head);
414 	return 0;
415 }
416 
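/*
 * Remove a delayed ref from its head ref: erase it from the ref rbtree and
 * the add_list, drop the reference the tree held on it and decrement the
 * global count of delayed ref entries.
 */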
417 static inline void drop_delayed_ref(struct btrfs_trans_handle *trans,
418 				    struct btrfs_delayed_ref_root *delayed_refs,
419 				    struct btrfs_delayed_ref_head *head,
420 				    struct btrfs_delayed_ref_node *ref)
421 {
422 	lockdep_assert_held(&head->lock);
423 	rb_erase_cached(&ref->ref_node, &head->ref_tree);
424 	RB_CLEAR_NODE(&ref->ref_node);
425 	if (!list_empty(&ref->add_list))
426 		list_del(&ref->add_list);
427 	ref->in_tree = 0;
428 	btrfs_put_delayed_ref(ref);
429 	atomic_dec(&delayed_refs->num_entries);
430 }
431 
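/*
 * Try to fold the refs that follow @ref in the head's rbtree into @ref, as
 * long as they describe the same reference (ignoring seq) and are older than
 * @seq.  Matching actions add their ref_mod, opposite actions subtract it,
 * and a ref whose ref_mod reaches zero is dropped.  For a data extent, an ADD
 * with ref_mod 2 followed by a matching DROP with ref_mod 1 collapses into a
 * single ADD with ref_mod 1.  Returns true if @ref itself was dropped and the
 * caller must restart its scan of the tree.
 */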
432 static bool merge_ref(struct btrfs_trans_handle *trans,
433 		      struct btrfs_delayed_ref_root *delayed_refs,
434 		      struct btrfs_delayed_ref_head *head,
435 		      struct btrfs_delayed_ref_node *ref,
436 		      u64 seq)
437 {
438 	struct btrfs_delayed_ref_node *next;
439 	struct rb_node *node = rb_next(&ref->ref_node);
440 	bool done = false;
441 
442 	while (!done && node) {
443 		int mod;
444 
445 		next = rb_entry(node, struct btrfs_delayed_ref_node, ref_node);
446 		node = rb_next(node);
447 		if (seq && next->seq >= seq)
448 			break;
449 		if (comp_refs(ref, next, false))
450 			break;
451 
452 		if (ref->action == next->action) {
453 			mod = next->ref_mod;
454 		} else {
455 			if (ref->ref_mod < next->ref_mod) {
456 				swap(ref, next);
457 				done = true;
458 			}
459 			mod = -next->ref_mod;
460 		}
461 
462 		drop_delayed_ref(trans, delayed_refs, head, next);
463 		ref->ref_mod += mod;
464 		if (ref->ref_mod == 0) {
465 			drop_delayed_ref(trans, delayed_refs, head, ref);
466 			done = true;
467 		} else {
468 			/*
469 			 * Can't have multiples of the same ref on a tree block.
470 			 */
471 			WARN_ON(ref->type == BTRFS_TREE_BLOCK_REF_KEY ||
472 				ref->type == BTRFS_SHARED_BLOCK_REF_KEY);
473 		}
474 	}
475 
476 	return done;
477 }
478 
479 void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans,
480 			      struct btrfs_delayed_ref_root *delayed_refs,
481 			      struct btrfs_delayed_ref_head *head)
482 {
483 	struct btrfs_fs_info *fs_info = trans->fs_info;
484 	struct btrfs_delayed_ref_node *ref;
485 	struct rb_node *node;
486 	u64 seq = 0;
487 
488 	lockdep_assert_held(&head->lock);
489 
490 	if (RB_EMPTY_ROOT(&head->ref_tree.rb_root))
491 		return;
492 
493 	/* We don't have too many refs to merge for data. */
494 	if (head->is_data)
495 		return;
496 
497 	read_lock(&fs_info->tree_mod_log_lock);
498 	if (!list_empty(&fs_info->tree_mod_seq_list)) {
499 		struct seq_list *elem;
500 
501 		elem = list_first_entry(&fs_info->tree_mod_seq_list,
502 					struct seq_list, list);
503 		seq = elem->seq;
504 	}
505 	read_unlock(&fs_info->tree_mod_log_lock);
506 
507 again:
508 	for (node = rb_first_cached(&head->ref_tree); node;
509 	     node = rb_next(node)) {
510 		ref = rb_entry(node, struct btrfs_delayed_ref_node, ref_node);
511 		if (seq && ref->seq >= seq)
512 			continue;
513 		if (merge_ref(trans, delayed_refs, head, ref, seq))
514 			goto again;
515 	}
516 }
517 
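/*
 * Return 1 if @seq is not older than the lowest tree mod log sequence still
 * in use, in which case delayed refs tagged with @seq have to be held back
 * until that tree mod log user goes away.
 */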
518 int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, u64 seq)
519 {
520 	struct seq_list *elem;
521 	int ret = 0;
522 
523 	read_lock(&fs_info->tree_mod_log_lock);
524 	if (!list_empty(&fs_info->tree_mod_seq_list)) {
525 		elem = list_first_entry(&fs_info->tree_mod_seq_list,
526 					struct seq_list, list);
527 		if (seq >= elem->seq) {
528 			btrfs_debug(fs_info,
529 				"holding back delayed_ref %#x.%x, lowest is %#x.%x",
530 				(u32)(seq >> 32), (u32)seq,
531 				(u32)(elem->seq >> 32), (u32)elem->seq);
532 			ret = 1;
533 		}
534 	}
535 
536 	read_unlock(&fs_info->tree_mod_log_lock);
537 	return ret;
538 }
539 
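/*
 * Pick the next delayed ref head to run, starting at run_delayed_start and
 * wrapping around to the start of the tree once if nothing is found there.
 * Heads that are already being processed are skipped; the selected head is
 * marked as processing and run_delayed_start is advanced past it.
 */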
540 struct btrfs_delayed_ref_head *btrfs_select_ref_head(
541 		struct btrfs_delayed_ref_root *delayed_refs)
542 {
543 	struct btrfs_delayed_ref_head *head;
544 
545 again:
546 	head = find_ref_head(delayed_refs, delayed_refs->run_delayed_start,
547 			     true);
548 	if (!head && delayed_refs->run_delayed_start != 0) {
549 		delayed_refs->run_delayed_start = 0;
550 		head = find_first_ref_head(delayed_refs);
551 	}
552 	if (!head)
553 		return NULL;
554 
555 	while (head->processing) {
556 		struct rb_node *node;
557 
558 		node = rb_next(&head->href_node);
559 		if (!node) {
560 			if (delayed_refs->run_delayed_start == 0)
561 				return NULL;
562 			delayed_refs->run_delayed_start = 0;
563 			goto again;
564 		}
565 		head = rb_entry(node, struct btrfs_delayed_ref_head,
566 				href_node);
567 	}
568 
569 	head->processing = 1;
570 	WARN_ON(delayed_refs->num_heads_ready == 0);
571 	delayed_refs->num_heads_ready--;
572 	delayed_refs->run_delayed_start = head->bytenr +
573 		head->num_bytes;
574 	return head;
575 }
576 
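/*
 * Remove a head from the delayed refs rbtree and update the head and entry
 * counters.  Both delayed_refs->lock and the head's lock must be held.
 */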
577 void btrfs_delete_ref_head(struct btrfs_delayed_ref_root *delayed_refs,
578 			   struct btrfs_delayed_ref_head *head)
579 {
580 	lockdep_assert_held(&delayed_refs->lock);
581 	lockdep_assert_held(&head->lock);
582 
583 	rb_erase_cached(&head->href_node, &delayed_refs->href_root);
584 	RB_CLEAR_NODE(&head->href_node);
585 	atomic_dec(&delayed_refs->num_entries);
586 	delayed_refs->num_heads--;
587 	if (head->processing == 0)
588 		delayed_refs->num_heads_ready--;
589 }
590 
591 /*
592  * Helper to insert the ref_node into the head's rbtree or merge with an existing ref.
593  *
594  * Return 0 for insert.
595  * Return >0 for merge.
596  */
597 static int insert_delayed_ref(struct btrfs_trans_handle *trans,
598 			      struct btrfs_delayed_ref_root *root,
599 			      struct btrfs_delayed_ref_head *href,
600 			      struct btrfs_delayed_ref_node *ref)
601 {
602 	struct btrfs_delayed_ref_node *exist;
603 	int mod;
604 	int ret = 0;
605 
606 	spin_lock(&href->lock);
607 	exist = tree_insert(&href->ref_tree, ref);
608 	if (!exist)
609 		goto inserted;
610 
611 	/* Now we are sure we can merge */
612 	ret = 1;
613 	if (exist->action == ref->action) {
614 		mod = ref->ref_mod;
615 	} else {
616 		/* Need to change action */
617 		if (exist->ref_mod < ref->ref_mod) {
618 			exist->action = ref->action;
619 			mod = -exist->ref_mod;
620 			exist->ref_mod = ref->ref_mod;
621 			if (ref->action == BTRFS_ADD_DELAYED_REF)
622 				list_add_tail(&exist->add_list,
623 					      &href->ref_add_list);
624 			else if (ref->action == BTRFS_DROP_DELAYED_REF) {
625 				ASSERT(!list_empty(&exist->add_list));
626 				list_del(&exist->add_list);
627 			} else {
628 				ASSERT(0);
629 			}
630 		} else
631 			mod = -ref->ref_mod;
632 	}
633 	exist->ref_mod += mod;
634 
635 	/* remove existing tail if its ref_mod is zero */
636 	/* remove the existing ref if its ref_mod hits zero */
637 		drop_delayed_ref(trans, root, href, exist);
638 	spin_unlock(&href->lock);
639 	return ret;
640 inserted:
641 	if (ref->action == BTRFS_ADD_DELAYED_REF)
642 		list_add_tail(&ref->add_list, &href->ref_add_list);
643 	atomic_inc(&root->num_entries);
644 	spin_unlock(&href->lock);
645 	return ret;
646 }
647 
648 /*
649  * helper function to update the accounting in the head ref;
650  * existing and update must have the same bytenr
651  */
652 static noinline void update_existing_head_ref(struct btrfs_trans_handle *trans,
653 			 struct btrfs_delayed_ref_head *existing,
654 			 struct btrfs_delayed_ref_head *update)
655 {
656 	struct btrfs_delayed_ref_root *delayed_refs =
657 		&trans->transaction->delayed_refs;
658 	struct btrfs_fs_info *fs_info = trans->fs_info;
659 	u64 flags = btrfs_ref_head_to_space_flags(existing);
660 	int old_ref_mod;
661 
662 	BUG_ON(existing->is_data != update->is_data);
663 
664 	spin_lock(&existing->lock);
665 	if (update->must_insert_reserved) {
666 		/*
667 		 * If the extent was freed and then reallocated before the
668 		 * delayed ref entries were processed, we can end up with
669 		 * an existing head ref without the must_insert_reserved
670 		 * flag set.
671 		 * Set it again here.
672 		 */
673 		existing->must_insert_reserved = update->must_insert_reserved;
674 
675 		/*
676 		 * update the num_bytes so we make sure the accounting
677 		 * is done correctly
678 		 */
679 		existing->num_bytes = update->num_bytes;
680 
681 	}
682 
683 	if (update->extent_op) {
684 		if (!existing->extent_op) {
685 			existing->extent_op = update->extent_op;
686 		} else {
687 			if (update->extent_op->update_key) {
688 				memcpy(&existing->extent_op->key,
689 				       &update->extent_op->key,
690 				       sizeof(update->extent_op->key));
691 				existing->extent_op->update_key = true;
692 			}
693 			if (update->extent_op->update_flags) {
694 				existing->extent_op->flags_to_set |=
695 					update->extent_op->flags_to_set;
696 				existing->extent_op->update_flags = true;
697 			}
698 			btrfs_free_delayed_extent_op(update->extent_op);
699 		}
700 	}
701 	/*
702 	 * Update the reference mod on the head to reflect this new operation.
703 	 * We only need the lock for this case because the head could currently
704 	 * be getting processed; for refs we just added we know we're a-ok.
705 	 */
706 	old_ref_mod = existing->total_ref_mod;
707 	existing->ref_mod += update->ref_mod;
708 	existing->total_ref_mod += update->ref_mod;
709 
710 	/*
711 	 * If we are going from a positive ref mod to a negative or vice
712 	 * versa we need to make sure to adjust pending_csums accordingly.
713 	 */
714 	if (existing->is_data) {
715 		u64 csum_leaves =
716 			btrfs_csum_bytes_to_leaves(fs_info,
717 						   existing->num_bytes);
718 
719 		if (existing->total_ref_mod >= 0 && old_ref_mod < 0) {
720 			delayed_refs->pending_csums -= existing->num_bytes;
721 			btrfs_delayed_refs_rsv_release(fs_info, csum_leaves);
722 		}
723 		if (existing->total_ref_mod < 0 && old_ref_mod >= 0) {
724 			delayed_refs->pending_csums += existing->num_bytes;
725 			trans->delayed_ref_updates += csum_leaves;
726 		}
727 	}
728 
729 	/*
730 	 * This handles the following conditions:
731 	 *
732 	 * 1. We had a ref mod of 0 or more and went negative, indicating that
733 	 *    we may be freeing space, so add our space to the
734 	 *    total_bytes_pinned counter.
735 	 * 2. We were negative and went to 0 or positive, so no longer can say
736 	 *    that the space would be pinned, decrement our counter from the
737 	 *    total_bytes_pinned counter.
738 	 * 3. We are now at 0 and have ->must_insert_reserved set, which means
739 	 *    this was a new allocation and then we dropped it, and thus must
740 	 *    add our space to the total_bytes_pinned counter.
741 	 */
742 	if (existing->total_ref_mod < 0 && old_ref_mod >= 0)
743 		btrfs_mod_total_bytes_pinned(fs_info, flags, existing->num_bytes);
744 	else if (existing->total_ref_mod >= 0 && old_ref_mod < 0)
745 		btrfs_mod_total_bytes_pinned(fs_info, flags, -existing->num_bytes);
746 	else if (existing->total_ref_mod == 0 && existing->must_insert_reserved)
747 		btrfs_mod_total_bytes_pinned(fs_info, flags, existing->num_bytes);
748 
749 	spin_unlock(&existing->lock);
750 }
751 
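/*
 * Fill in a freshly allocated delayed ref head (and the qgroup extent record
 * that rides along with it, if one was allocated) from the parameters of the
 * modification being queued.
 */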
752 static void init_delayed_ref_head(struct btrfs_delayed_ref_head *head_ref,
753 				  struct btrfs_qgroup_extent_record *qrecord,
754 				  u64 bytenr, u64 num_bytes, u64 ref_root,
755 				  u64 reserved, int action, bool is_data,
756 				  bool is_system)
757 {
758 	int count_mod = 1;
759 	int must_insert_reserved = 0;
760 
761 	/* If reserved is provided, it must be a data extent. */
762 	BUG_ON(!is_data && reserved);
763 
764 	/*
765 	 * The head node stores the sum of all the mods, so dropping a ref
766 	 * should drop the sum in the head node by one.
767 	 */
768 	if (action == BTRFS_UPDATE_DELAYED_HEAD)
769 		count_mod = 0;
770 	else if (action == BTRFS_DROP_DELAYED_REF)
771 		count_mod = -1;
772 
773 	/*
774 	 * BTRFS_ADD_DELAYED_EXTENT means that we need to update the reserved
775 	 * accounting when the extent is finally added, or if a later
776 	 * modification deletes the delayed ref without ever inserting the
777 	 * extent into the extent allocation tree.  ref->must_insert_reserved
778 	 * is the flag used to record that accounting mods are required.
779 	 *
780 	 * Once we record must_insert_reserved, switch the action to
781 	 * BTRFS_ADD_DELAYED_REF because other special casing is not required.
782 	 */
783 	if (action == BTRFS_ADD_DELAYED_EXTENT)
784 		must_insert_reserved = 1;
785 	else
786 		must_insert_reserved = 0;
787 
788 	refcount_set(&head_ref->refs, 1);
789 	head_ref->bytenr = bytenr;
790 	head_ref->num_bytes = num_bytes;
791 	head_ref->ref_mod = count_mod;
792 	head_ref->must_insert_reserved = must_insert_reserved;
793 	head_ref->is_data = is_data;
794 	head_ref->is_system = is_system;
795 	head_ref->ref_tree = RB_ROOT_CACHED;
796 	INIT_LIST_HEAD(&head_ref->ref_add_list);
797 	RB_CLEAR_NODE(&head_ref->href_node);
798 	head_ref->processing = 0;
799 	head_ref->total_ref_mod = count_mod;
800 	spin_lock_init(&head_ref->lock);
801 	mutex_init(&head_ref->mutex);
802 
803 	if (qrecord) {
804 		if (ref_root && reserved) {
805 			qrecord->data_rsv = reserved;
806 			qrecord->data_rsv_refroot = ref_root;
807 		}
808 		qrecord->bytenr = bytenr;
809 		qrecord->num_bytes = num_bytes;
810 		qrecord->old_roots = NULL;
811 	}
812 }
813 
814 /*
815  * helper function to actually insert a head node into the rbtree.
816  * this does all the dirty work in terms of maintaining the correct
817  * overall modification count.
818  */
819 static noinline struct btrfs_delayed_ref_head *
820 add_delayed_ref_head(struct btrfs_trans_handle *trans,
821 		     struct btrfs_delayed_ref_head *head_ref,
822 		     struct btrfs_qgroup_extent_record *qrecord,
823 		     int action, int *qrecord_inserted_ret)
824 {
825 	struct btrfs_delayed_ref_head *existing;
826 	struct btrfs_delayed_ref_root *delayed_refs;
827 	int qrecord_inserted = 0;
828 
829 	delayed_refs = &trans->transaction->delayed_refs;
830 
831 	/* Record qgroup extent info if provided */
832 	if (qrecord) {
833 		if (btrfs_qgroup_trace_extent_nolock(trans->fs_info,
834 					delayed_refs, qrecord))
835 			kfree(qrecord);
836 		else
837 			qrecord_inserted = 1;
838 	}
839 
840 	trace_add_delayed_ref_head(trans->fs_info, head_ref, action);
841 
842 	existing = htree_insert(&delayed_refs->href_root,
843 				&head_ref->href_node);
844 	if (existing) {
845 		update_existing_head_ref(trans, existing, head_ref);
846 		/*
847 		 * we've updated the existing ref, free the newly
848 		 * allocated ref
849 		 */
850 		kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref);
851 		head_ref = existing;
852 	} else {
853 		u64 flags = btrfs_ref_head_to_space_flags(head_ref);
854 
855 		if (head_ref->is_data && head_ref->ref_mod < 0) {
856 			delayed_refs->pending_csums += head_ref->num_bytes;
857 			trans->delayed_ref_updates +=
858 				btrfs_csum_bytes_to_leaves(trans->fs_info,
859 							   head_ref->num_bytes);
860 		}
861 		if (head_ref->ref_mod < 0)
862 			btrfs_mod_total_bytes_pinned(trans->fs_info, flags,
863 						     head_ref->num_bytes);
864 		delayed_refs->num_heads++;
865 		delayed_refs->num_heads_ready++;
866 		atomic_inc(&delayed_refs->num_entries);
867 		trans->delayed_ref_updates++;
868 	}
869 	if (qrecord_inserted_ret)
870 		*qrecord_inserted_ret = qrecord_inserted;
871 
872 	return head_ref;
873 }
874 
875 /*
876  * init_delayed_ref_common - Initialize the structure which represents a
877  *			     modification to an extent.
878  *
879  * @fs_info:    Internal to the mounted filesystem mount structure.
880  *
881  * @ref:	The structure which is going to be initialized.
882  *
883  * @bytenr:	The logical address of the extent for which a modification is
884  *		going to be recorded.
885  *
886  * @num_bytes:  Size of the extent whose modification is being recorded.
887  *
888  * @ref_root:	The id of the root where this modification has originated; this
889  *		can be either one of the well-known metadata trees or the
890  *		subvolume id which references this extent.
891  *
892  * @action:	Can be one of BTRFS_ADD_DELAYED_REF/BTRFS_DROP_DELAYED_REF or
893  *		BTRFS_ADD_DELAYED_EXTENT
894  *
895  * @ref_type:	Holds the type of the extent which is being recorded, can be
896  *		one of BTRFS_SHARED_BLOCK_REF_KEY/BTRFS_TREE_BLOCK_REF_KEY
897  *		when recording a metadata extent or BTRFS_SHARED_DATA_REF_KEY/
898  *		BTRFS_EXTENT_DATA_REF_KEY when recording a data extent
899  */
900 static void init_delayed_ref_common(struct btrfs_fs_info *fs_info,
901 				    struct btrfs_delayed_ref_node *ref,
902 				    u64 bytenr, u64 num_bytes, u64 ref_root,
903 				    int action, u8 ref_type)
904 {
905 	u64 seq = 0;
906 
907 	if (action == BTRFS_ADD_DELAYED_EXTENT)
908 		action = BTRFS_ADD_DELAYED_REF;
909 
910 	if (is_fstree(ref_root))
911 		seq = atomic64_read(&fs_info->tree_mod_seq);
912 
913 	refcount_set(&ref->refs, 1);
914 	ref->bytenr = bytenr;
915 	ref->num_bytes = num_bytes;
916 	ref->ref_mod = 1;
917 	ref->action = action;
918 	ref->is_head = 0;
919 	ref->in_tree = 1;
920 	ref->seq = seq;
921 	ref->type = ref_type;
922 	RB_CLEAR_NODE(&ref->ref_node);
923 	INIT_LIST_HEAD(&ref->add_list);
924 }
925 
926 /*
927  * add a delayed tree ref.  This does all of the accounting required
928  * to make sure the delayed ref is eventually processed before this
929  * transaction commits.
930  */
931 int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
932 			       struct btrfs_ref *generic_ref,
933 			       struct btrfs_delayed_extent_op *extent_op)
934 {
935 	struct btrfs_fs_info *fs_info = trans->fs_info;
936 	struct btrfs_delayed_tree_ref *ref;
937 	struct btrfs_delayed_ref_head *head_ref;
938 	struct btrfs_delayed_ref_root *delayed_refs;
939 	struct btrfs_qgroup_extent_record *record = NULL;
940 	int qrecord_inserted;
941 	bool is_system;
942 	int action = generic_ref->action;
943 	int level = generic_ref->tree_ref.level;
944 	int ret;
945 	u64 bytenr = generic_ref->bytenr;
946 	u64 num_bytes = generic_ref->len;
947 	u64 parent = generic_ref->parent;
948 	u8 ref_type;
949 
950 	is_system = (generic_ref->real_root == BTRFS_CHUNK_TREE_OBJECTID);
951 
952 	ASSERT(generic_ref->type == BTRFS_REF_METADATA && generic_ref->action);
953 	BUG_ON(extent_op && extent_op->is_data);
954 	ref = kmem_cache_alloc(btrfs_delayed_tree_ref_cachep, GFP_NOFS);
955 	if (!ref)
956 		return -ENOMEM;
957 
958 	head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS);
959 	if (!head_ref) {
960 		kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref);
961 		return -ENOMEM;
962 	}
963 
964 	if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) &&
965 	    is_fstree(generic_ref->real_root) &&
966 	    is_fstree(generic_ref->tree_ref.root) &&
967 	    !generic_ref->skip_qgroup) {
968 		record = kzalloc(sizeof(*record), GFP_NOFS);
969 		if (!record) {
970 			kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref);
971 			kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref);
972 			return -ENOMEM;
973 		}
974 	}
975 
976 	if (parent)
977 		ref_type = BTRFS_SHARED_BLOCK_REF_KEY;
978 	else
979 		ref_type = BTRFS_TREE_BLOCK_REF_KEY;
980 
981 	init_delayed_ref_common(fs_info, &ref->node, bytenr, num_bytes,
982 				generic_ref->tree_ref.root, action, ref_type);
983 	ref->root = generic_ref->tree_ref.root;
984 	ref->parent = parent;
985 	ref->level = level;
986 
987 	init_delayed_ref_head(head_ref, record, bytenr, num_bytes,
988 			      generic_ref->tree_ref.root, 0, action, false,
989 			      is_system);
990 	head_ref->extent_op = extent_op;
991 
992 	delayed_refs = &trans->transaction->delayed_refs;
993 	spin_lock(&delayed_refs->lock);
994 
995 	/*
996 	 * insert both the head node and the new ref without dropping
997 	 * the spin lock
998 	 */
999 	head_ref = add_delayed_ref_head(trans, head_ref, record,
1000 					action, &qrecord_inserted);
1001 
1002 	ret = insert_delayed_ref(trans, delayed_refs, head_ref, &ref->node);
1003 	spin_unlock(&delayed_refs->lock);
1004 
1005 	/*
1006 	 * Need to update the delayed_refs_rsv with any changes we may have
1007 	 * made.
1008 	 */
1009 	btrfs_update_delayed_refs_rsv(trans);
1010 
1011 	trace_add_delayed_tree_ref(fs_info, &ref->node, ref,
1012 				   action == BTRFS_ADD_DELAYED_EXTENT ?
1013 				   BTRFS_ADD_DELAYED_REF : action);
1014 	if (ret > 0)
1015 		kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref);
1016 
1017 	if (qrecord_inserted)
1018 		btrfs_qgroup_trace_extent_post(fs_info, record);
1019 
1020 	return 0;
1021 }
1022 
1023 /*
1024  * add a delayed data ref. it's similar to btrfs_add_delayed_tree_ref.
1025  */
1026 int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
1027 			       struct btrfs_ref *generic_ref,
1028 			       u64 reserved)
1029 {
1030 	struct btrfs_fs_info *fs_info = trans->fs_info;
1031 	struct btrfs_delayed_data_ref *ref;
1032 	struct btrfs_delayed_ref_head *head_ref;
1033 	struct btrfs_delayed_ref_root *delayed_refs;
1034 	struct btrfs_qgroup_extent_record *record = NULL;
1035 	int qrecord_inserted;
1036 	int action = generic_ref->action;
1037 	int ret;
1038 	u64 bytenr = generic_ref->bytenr;
1039 	u64 num_bytes = generic_ref->len;
1040 	u64 parent = generic_ref->parent;
1041 	u64 ref_root = generic_ref->data_ref.ref_root;
1042 	u64 owner = generic_ref->data_ref.ino;
1043 	u64 offset = generic_ref->data_ref.offset;
1044 	u8 ref_type;
1045 
1046 	ASSERT(generic_ref->type == BTRFS_REF_DATA && action);
1047 	ref = kmem_cache_alloc(btrfs_delayed_data_ref_cachep, GFP_NOFS);
1048 	if (!ref)
1049 		return -ENOMEM;
1050 
1051 	if (parent)
1052 		ref_type = BTRFS_SHARED_DATA_REF_KEY;
1053 	else
1054 		ref_type = BTRFS_EXTENT_DATA_REF_KEY;
1055 	init_delayed_ref_common(fs_info, &ref->node, bytenr, num_bytes,
1056 				ref_root, action, ref_type);
1057 	ref->root = ref_root;
1058 	ref->parent = parent;
1059 	ref->objectid = owner;
1060 	ref->offset = offset;
1061 
1062 
1064 	if (!head_ref) {
1065 		kmem_cache_free(btrfs_delayed_data_ref_cachep, ref);
1066 		return -ENOMEM;
1067 	}
1068 
1069 	if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) &&
1070 	    is_fstree(ref_root) &&
1071 	    is_fstree(generic_ref->real_root) &&
1072 	    !generic_ref->skip_qgroup) {
1073 		record = kzalloc(sizeof(*record), GFP_NOFS);
1074 		if (!record) {
1075 			kmem_cache_free(btrfs_delayed_data_ref_cachep, ref);
1076 			kmem_cache_free(btrfs_delayed_ref_head_cachep,
1077 					head_ref);
1078 			return -ENOMEM;
1079 		}
1080 	}
1081 
1082 	init_delayed_ref_head(head_ref, record, bytenr, num_bytes, ref_root,
1083 			      reserved, action, true, false);
1084 	head_ref->extent_op = NULL;
1085 
1086 	delayed_refs = &trans->transaction->delayed_refs;
1087 	spin_lock(&delayed_refs->lock);
1088 
1089 	/*
1090 	 * insert both the head node and the new ref without dropping
1091 	 * the spin lock
1092 	 */
1093 	head_ref = add_delayed_ref_head(trans, head_ref, record,
1094 					action, &qrecord_inserted);
1095 
1096 	ret = insert_delayed_ref(trans, delayed_refs, head_ref, &ref->node);
1097 	spin_unlock(&delayed_refs->lock);
1098 
1099 	/*
1100 	 * Need to update the delayed_refs_rsv with any changes we may have
1101 	 * made.
1102 	 */
1103 	btrfs_update_delayed_refs_rsv(trans);
1104 
1105 	trace_add_delayed_data_ref(trans->fs_info, &ref->node, ref,
1106 				   action == BTRFS_ADD_DELAYED_EXTENT ?
1107 				   BTRFS_ADD_DELAYED_REF : action);
1108 	if (ret > 0)
1109 		kmem_cache_free(btrfs_delayed_data_ref_cachep, ref);
1110 
1112 	if (qrecord_inserted)
1113 		return btrfs_qgroup_trace_extent_post(fs_info, record);
1114 	return 0;
1115 }
1116 
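/*
 * Queue a delayed ref head that only carries an extent_op (a key or flags
 * update for the extent item); no reference count change is recorded, so the
 * head starts with a ref_mod of zero.
 */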
1117 int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans,
1118 				u64 bytenr, u64 num_bytes,
1119 				struct btrfs_delayed_extent_op *extent_op)
1120 {
1121 	struct btrfs_delayed_ref_head *head_ref;
1122 	struct btrfs_delayed_ref_root *delayed_refs;
1123 
1124 	head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS);
1125 	if (!head_ref)
1126 		return -ENOMEM;
1127 
1128 	init_delayed_ref_head(head_ref, NULL, bytenr, num_bytes, 0, 0,
1129 			      BTRFS_UPDATE_DELAYED_HEAD, extent_op->is_data,
1130 			      false);
1131 	head_ref->extent_op = extent_op;
1132 
1133 	delayed_refs = &trans->transaction->delayed_refs;
1134 	spin_lock(&delayed_refs->lock);
1135 
1136 	add_delayed_ref_head(trans, head_ref, NULL, BTRFS_UPDATE_DELAYED_HEAD,
1137 			     NULL);
1138 
1139 	spin_unlock(&delayed_refs->lock);
1140 
1141 	/*
1142 	 * Need to update the delayed_refs_rsv with any changes we may have
1143 	 * made.
1144 	 */
1145 	btrfs_update_delayed_refs_rsv(trans);
1146 	return 0;
1147 }
1148 
1149 /*
1150  * This does a simple search for the head node for a given extent.  Returns the
1151  * head node if found, or NULL if not.
1152  */
1153 struct btrfs_delayed_ref_head *
1154 btrfs_find_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs, u64 bytenr)
1155 {
1156 	lockdep_assert_held(&delayed_refs->lock);
1157 
1158 	return find_ref_head(delayed_refs, bytenr, false);
1159 }
1160 
1161 void __cold btrfs_delayed_ref_exit(void)
1162 {
1163 	kmem_cache_destroy(btrfs_delayed_ref_head_cachep);
1164 	kmem_cache_destroy(btrfs_delayed_tree_ref_cachep);
1165 	kmem_cache_destroy(btrfs_delayed_data_ref_cachep);
1166 	kmem_cache_destroy(btrfs_delayed_extent_op_cachep);
1167 }
1168 
1169 int __init btrfs_delayed_ref_init(void)
1170 {
1171 	btrfs_delayed_ref_head_cachep = kmem_cache_create(
1172 				"btrfs_delayed_ref_head",
1173 				sizeof(struct btrfs_delayed_ref_head), 0,
1174 				SLAB_MEM_SPREAD, NULL);
1175 	if (!btrfs_delayed_ref_head_cachep)
1176 		goto fail;
1177 
1178 	btrfs_delayed_tree_ref_cachep = kmem_cache_create(
1179 				"btrfs_delayed_tree_ref",
1180 				sizeof(struct btrfs_delayed_tree_ref), 0,
1181 				SLAB_MEM_SPREAD, NULL);
1182 	if (!btrfs_delayed_tree_ref_cachep)
1183 		goto fail;
1184 
1185 	btrfs_delayed_data_ref_cachep = kmem_cache_create(
1186 				"btrfs_delayed_data_ref",
1187 				sizeof(struct btrfs_delayed_data_ref), 0,
1188 				SLAB_MEM_SPREAD, NULL);
1189 	if (!btrfs_delayed_data_ref_cachep)
1190 		goto fail;
1191 
1192 	btrfs_delayed_extent_op_cachep = kmem_cache_create(
1193 				"btrfs_delayed_extent_op",
1194 				sizeof(struct btrfs_delayed_extent_op), 0,
1195 				SLAB_MEM_SPREAD, NULL);
1196 	if (!btrfs_delayed_extent_op_cachep)
1197 		goto fail;
1198 
1199 	return 0;
1200 fail:
1201 	btrfs_delayed_ref_exit();
1202 	return -ENOMEM;
1203 }
1204