xref: /openbmc/linux/fs/btrfs/delayed-ref.c (revision 5b4cb650)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (C) 2009 Oracle.  All rights reserved.
4  */
5 
6 #include <linux/sched.h>
7 #include <linux/slab.h>
8 #include <linux/sort.h>
9 #include "ctree.h"
10 #include "delayed-ref.h"
11 #include "transaction.h"
12 #include "qgroup.h"
13 
/*
 * Slab caches for the delayed ref objects.  All four are created in
 * btrfs_delayed_ref_init() and destroyed in btrfs_delayed_ref_exit().
 */
/* Objects of sizeof(struct btrfs_delayed_ref_head). */
struct kmem_cache *btrfs_delayed_ref_head_cachep;
/* Objects of sizeof(struct btrfs_delayed_tree_ref). */
struct kmem_cache *btrfs_delayed_tree_ref_cachep;
/* Objects of sizeof(struct btrfs_delayed_data_ref). */
struct kmem_cache *btrfs_delayed_data_ref_cachep;
/* Objects of sizeof(struct btrfs_delayed_extent_op). */
struct kmem_cache *btrfs_delayed_extent_op_cachep;
18 /*
19  * delayed back reference update tracking.  For subvolume trees
20  * we queue up extent allocations and backref maintenance for
21  * delayed processing.   This avoids deep call chains where we
22  * add extents in the middle of btrfs_search_slot, and it allows
23  * us to buffer up frequently modified backrefs in an rb tree instead
24  * of hammering updates on the extent allocation tree.
25  */
26 
27 /*
28  * compare two delayed tree backrefs with same bytenr and type
29  */
30 static int comp_tree_refs(struct btrfs_delayed_tree_ref *ref1,
31 			  struct btrfs_delayed_tree_ref *ref2)
32 {
33 	if (ref1->node.type == BTRFS_TREE_BLOCK_REF_KEY) {
34 		if (ref1->root < ref2->root)
35 			return -1;
36 		if (ref1->root > ref2->root)
37 			return 1;
38 	} else {
39 		if (ref1->parent < ref2->parent)
40 			return -1;
41 		if (ref1->parent > ref2->parent)
42 			return 1;
43 	}
44 	return 0;
45 }
46 
47 /*
48  * compare two delayed data backrefs with same bytenr and type
49  */
50 static int comp_data_refs(struct btrfs_delayed_data_ref *ref1,
51 			  struct btrfs_delayed_data_ref *ref2)
52 {
53 	if (ref1->node.type == BTRFS_EXTENT_DATA_REF_KEY) {
54 		if (ref1->root < ref2->root)
55 			return -1;
56 		if (ref1->root > ref2->root)
57 			return 1;
58 		if (ref1->objectid < ref2->objectid)
59 			return -1;
60 		if (ref1->objectid > ref2->objectid)
61 			return 1;
62 		if (ref1->offset < ref2->offset)
63 			return -1;
64 		if (ref1->offset > ref2->offset)
65 			return 1;
66 	} else {
67 		if (ref1->parent < ref2->parent)
68 			return -1;
69 		if (ref1->parent > ref2->parent)
70 			return 1;
71 	}
72 	return 0;
73 }
74 
75 static int comp_refs(struct btrfs_delayed_ref_node *ref1,
76 		     struct btrfs_delayed_ref_node *ref2,
77 		     bool check_seq)
78 {
79 	int ret = 0;
80 
81 	if (ref1->type < ref2->type)
82 		return -1;
83 	if (ref1->type > ref2->type)
84 		return 1;
85 	if (ref1->type == BTRFS_TREE_BLOCK_REF_KEY ||
86 	    ref1->type == BTRFS_SHARED_BLOCK_REF_KEY)
87 		ret = comp_tree_refs(btrfs_delayed_node_to_tree_ref(ref1),
88 				     btrfs_delayed_node_to_tree_ref(ref2));
89 	else
90 		ret = comp_data_refs(btrfs_delayed_node_to_data_ref(ref1),
91 				     btrfs_delayed_node_to_data_ref(ref2));
92 	if (ret)
93 		return ret;
94 	if (check_seq) {
95 		if (ref1->seq < ref2->seq)
96 			return -1;
97 		if (ref1->seq > ref2->seq)
98 			return 1;
99 	}
100 	return 0;
101 }
102 
103 /* insert a new ref to head ref rbtree */
104 static struct btrfs_delayed_ref_head *htree_insert(struct rb_root_cached *root,
105 						   struct rb_node *node)
106 {
107 	struct rb_node **p = &root->rb_root.rb_node;
108 	struct rb_node *parent_node = NULL;
109 	struct btrfs_delayed_ref_head *entry;
110 	struct btrfs_delayed_ref_head *ins;
111 	u64 bytenr;
112 	bool leftmost = true;
113 
114 	ins = rb_entry(node, struct btrfs_delayed_ref_head, href_node);
115 	bytenr = ins->bytenr;
116 	while (*p) {
117 		parent_node = *p;
118 		entry = rb_entry(parent_node, struct btrfs_delayed_ref_head,
119 				 href_node);
120 
121 		if (bytenr < entry->bytenr) {
122 			p = &(*p)->rb_left;
123 		} else if (bytenr > entry->bytenr) {
124 			p = &(*p)->rb_right;
125 			leftmost = false;
126 		} else {
127 			return entry;
128 		}
129 	}
130 
131 	rb_link_node(node, parent_node, p);
132 	rb_insert_color_cached(node, root, leftmost);
133 	return NULL;
134 }
135 
136 static struct btrfs_delayed_ref_node* tree_insert(struct rb_root_cached *root,
137 		struct btrfs_delayed_ref_node *ins)
138 {
139 	struct rb_node **p = &root->rb_root.rb_node;
140 	struct rb_node *node = &ins->ref_node;
141 	struct rb_node *parent_node = NULL;
142 	struct btrfs_delayed_ref_node *entry;
143 	bool leftmost = true;
144 
145 	while (*p) {
146 		int comp;
147 
148 		parent_node = *p;
149 		entry = rb_entry(parent_node, struct btrfs_delayed_ref_node,
150 				 ref_node);
151 		comp = comp_refs(ins, entry, true);
152 		if (comp < 0) {
153 			p = &(*p)->rb_left;
154 		} else if (comp > 0) {
155 			p = &(*p)->rb_right;
156 			leftmost = false;
157 		} else {
158 			return entry;
159 		}
160 	}
161 
162 	rb_link_node(node, parent_node, p);
163 	rb_insert_color_cached(node, root, leftmost);
164 	return NULL;
165 }
166 
167 static struct btrfs_delayed_ref_head *find_first_ref_head(
168 		struct btrfs_delayed_ref_root *dr)
169 {
170 	struct rb_node *n;
171 	struct btrfs_delayed_ref_head *entry;
172 
173 	n = rb_first_cached(&dr->href_root);
174 	if (!n)
175 		return NULL;
176 
177 	entry = rb_entry(n, struct btrfs_delayed_ref_head, href_node);
178 
179 	return entry;
180 }
181 
182 /*
183  * Find a head entry based on bytenr. This returns the delayed ref head if it
184  * was able to find one, or NULL if nothing was in that spot.  If return_bigger
185  * is given, the next bigger entry is returned if no exact match is found.
186  */
187 static struct btrfs_delayed_ref_head *find_ref_head(
188 		struct btrfs_delayed_ref_root *dr, u64 bytenr,
189 		bool return_bigger)
190 {
191 	struct rb_root *root = &dr->href_root.rb_root;
192 	struct rb_node *n;
193 	struct btrfs_delayed_ref_head *entry;
194 
195 	n = root->rb_node;
196 	entry = NULL;
197 	while (n) {
198 		entry = rb_entry(n, struct btrfs_delayed_ref_head, href_node);
199 
200 		if (bytenr < entry->bytenr)
201 			n = n->rb_left;
202 		else if (bytenr > entry->bytenr)
203 			n = n->rb_right;
204 		else
205 			return entry;
206 	}
207 	if (entry && return_bigger) {
208 		if (bytenr > entry->bytenr) {
209 			n = rb_next(&entry->href_node);
210 			if (!n)
211 				return NULL;
212 			entry = rb_entry(n, struct btrfs_delayed_ref_head,
213 					 href_node);
214 		}
215 		return entry;
216 	}
217 	return NULL;
218 }
219 
220 int btrfs_delayed_ref_lock(struct btrfs_delayed_ref_root *delayed_refs,
221 			   struct btrfs_delayed_ref_head *head)
222 {
223 	lockdep_assert_held(&delayed_refs->lock);
224 	if (mutex_trylock(&head->mutex))
225 		return 0;
226 
227 	refcount_inc(&head->refs);
228 	spin_unlock(&delayed_refs->lock);
229 
230 	mutex_lock(&head->mutex);
231 	spin_lock(&delayed_refs->lock);
232 	if (RB_EMPTY_NODE(&head->href_node)) {
233 		mutex_unlock(&head->mutex);
234 		btrfs_put_delayed_ref_head(head);
235 		return -EAGAIN;
236 	}
237 	btrfs_put_delayed_ref_head(head);
238 	return 0;
239 }
240 
241 static inline void drop_delayed_ref(struct btrfs_trans_handle *trans,
242 				    struct btrfs_delayed_ref_root *delayed_refs,
243 				    struct btrfs_delayed_ref_head *head,
244 				    struct btrfs_delayed_ref_node *ref)
245 {
246 	lockdep_assert_held(&head->lock);
247 	rb_erase_cached(&ref->ref_node, &head->ref_tree);
248 	RB_CLEAR_NODE(&ref->ref_node);
249 	if (!list_empty(&ref->add_list))
250 		list_del(&ref->add_list);
251 	ref->in_tree = 0;
252 	btrfs_put_delayed_ref(ref);
253 	atomic_dec(&delayed_refs->num_entries);
254 }
255 
256 static bool merge_ref(struct btrfs_trans_handle *trans,
257 		      struct btrfs_delayed_ref_root *delayed_refs,
258 		      struct btrfs_delayed_ref_head *head,
259 		      struct btrfs_delayed_ref_node *ref,
260 		      u64 seq)
261 {
262 	struct btrfs_delayed_ref_node *next;
263 	struct rb_node *node = rb_next(&ref->ref_node);
264 	bool done = false;
265 
266 	while (!done && node) {
267 		int mod;
268 
269 		next = rb_entry(node, struct btrfs_delayed_ref_node, ref_node);
270 		node = rb_next(node);
271 		if (seq && next->seq >= seq)
272 			break;
273 		if (comp_refs(ref, next, false))
274 			break;
275 
276 		if (ref->action == next->action) {
277 			mod = next->ref_mod;
278 		} else {
279 			if (ref->ref_mod < next->ref_mod) {
280 				swap(ref, next);
281 				done = true;
282 			}
283 			mod = -next->ref_mod;
284 		}
285 
286 		drop_delayed_ref(trans, delayed_refs, head, next);
287 		ref->ref_mod += mod;
288 		if (ref->ref_mod == 0) {
289 			drop_delayed_ref(trans, delayed_refs, head, ref);
290 			done = true;
291 		} else {
292 			/*
293 			 * Can't have multiples of the same ref on a tree block.
294 			 */
295 			WARN_ON(ref->type == BTRFS_TREE_BLOCK_REF_KEY ||
296 				ref->type == BTRFS_SHARED_BLOCK_REF_KEY);
297 		}
298 	}
299 
300 	return done;
301 }
302 
303 void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans,
304 			      struct btrfs_delayed_ref_root *delayed_refs,
305 			      struct btrfs_delayed_ref_head *head)
306 {
307 	struct btrfs_fs_info *fs_info = trans->fs_info;
308 	struct btrfs_delayed_ref_node *ref;
309 	struct rb_node *node;
310 	u64 seq = 0;
311 
312 	lockdep_assert_held(&head->lock);
313 
314 	if (RB_EMPTY_ROOT(&head->ref_tree.rb_root))
315 		return;
316 
317 	/* We don't have too many refs to merge for data. */
318 	if (head->is_data)
319 		return;
320 
321 	spin_lock(&fs_info->tree_mod_seq_lock);
322 	if (!list_empty(&fs_info->tree_mod_seq_list)) {
323 		struct seq_list *elem;
324 
325 		elem = list_first_entry(&fs_info->tree_mod_seq_list,
326 					struct seq_list, list);
327 		seq = elem->seq;
328 	}
329 	spin_unlock(&fs_info->tree_mod_seq_lock);
330 
331 again:
332 	for (node = rb_first_cached(&head->ref_tree); node;
333 	     node = rb_next(node)) {
334 		ref = rb_entry(node, struct btrfs_delayed_ref_node, ref_node);
335 		if (seq && ref->seq >= seq)
336 			continue;
337 		if (merge_ref(trans, delayed_refs, head, ref, seq))
338 			goto again;
339 	}
340 }
341 
342 int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, u64 seq)
343 {
344 	struct seq_list *elem;
345 	int ret = 0;
346 
347 	spin_lock(&fs_info->tree_mod_seq_lock);
348 	if (!list_empty(&fs_info->tree_mod_seq_list)) {
349 		elem = list_first_entry(&fs_info->tree_mod_seq_list,
350 					struct seq_list, list);
351 		if (seq >= elem->seq) {
352 			btrfs_debug(fs_info,
353 				"holding back delayed_ref %#x.%x, lowest is %#x.%x",
354 				(u32)(seq >> 32), (u32)seq,
355 				(u32)(elem->seq >> 32), (u32)elem->seq);
356 			ret = 1;
357 		}
358 	}
359 
360 	spin_unlock(&fs_info->tree_mod_seq_lock);
361 	return ret;
362 }
363 
364 struct btrfs_delayed_ref_head *btrfs_select_ref_head(
365 		struct btrfs_delayed_ref_root *delayed_refs)
366 {
367 	struct btrfs_delayed_ref_head *head;
368 
369 again:
370 	head = find_ref_head(delayed_refs, delayed_refs->run_delayed_start,
371 			     true);
372 	if (!head && delayed_refs->run_delayed_start != 0) {
373 		delayed_refs->run_delayed_start = 0;
374 		head = find_first_ref_head(delayed_refs);
375 	}
376 	if (!head)
377 		return NULL;
378 
379 	while (head->processing) {
380 		struct rb_node *node;
381 
382 		node = rb_next(&head->href_node);
383 		if (!node) {
384 			if (delayed_refs->run_delayed_start == 0)
385 				return NULL;
386 			delayed_refs->run_delayed_start = 0;
387 			goto again;
388 		}
389 		head = rb_entry(node, struct btrfs_delayed_ref_head,
390 				href_node);
391 	}
392 
393 	head->processing = 1;
394 	WARN_ON(delayed_refs->num_heads_ready == 0);
395 	delayed_refs->num_heads_ready--;
396 	delayed_refs->run_delayed_start = head->bytenr +
397 		head->num_bytes;
398 	return head;
399 }
400 
401 void btrfs_delete_ref_head(struct btrfs_delayed_ref_root *delayed_refs,
402 			   struct btrfs_delayed_ref_head *head)
403 {
404 	lockdep_assert_held(&delayed_refs->lock);
405 	lockdep_assert_held(&head->lock);
406 
407 	rb_erase_cached(&head->href_node, &delayed_refs->href_root);
408 	RB_CLEAR_NODE(&head->href_node);
409 	atomic_dec(&delayed_refs->num_entries);
410 	delayed_refs->num_heads--;
411 	if (head->processing == 0)
412 		delayed_refs->num_heads_ready--;
413 }
414 
415 /*
416  * Helper to insert the ref_node to the tail or merge with tail.
417  *
418  * Return 0 for insert.
419  * Return >0 for merge.
420  */
421 static int insert_delayed_ref(struct btrfs_trans_handle *trans,
422 			      struct btrfs_delayed_ref_root *root,
423 			      struct btrfs_delayed_ref_head *href,
424 			      struct btrfs_delayed_ref_node *ref)
425 {
426 	struct btrfs_delayed_ref_node *exist;
427 	int mod;
428 	int ret = 0;
429 
430 	spin_lock(&href->lock);
431 	exist = tree_insert(&href->ref_tree, ref);
432 	if (!exist)
433 		goto inserted;
434 
435 	/* Now we are sure we can merge */
436 	ret = 1;
437 	if (exist->action == ref->action) {
438 		mod = ref->ref_mod;
439 	} else {
440 		/* Need to change action */
441 		if (exist->ref_mod < ref->ref_mod) {
442 			exist->action = ref->action;
443 			mod = -exist->ref_mod;
444 			exist->ref_mod = ref->ref_mod;
445 			if (ref->action == BTRFS_ADD_DELAYED_REF)
446 				list_add_tail(&exist->add_list,
447 					      &href->ref_add_list);
448 			else if (ref->action == BTRFS_DROP_DELAYED_REF) {
449 				ASSERT(!list_empty(&exist->add_list));
450 				list_del(&exist->add_list);
451 			} else {
452 				ASSERT(0);
453 			}
454 		} else
455 			mod = -ref->ref_mod;
456 	}
457 	exist->ref_mod += mod;
458 
459 	/* remove existing tail if its ref_mod is zero */
460 	if (exist->ref_mod == 0)
461 		drop_delayed_ref(trans, root, href, exist);
462 	spin_unlock(&href->lock);
463 	return ret;
464 inserted:
465 	if (ref->action == BTRFS_ADD_DELAYED_REF)
466 		list_add_tail(&ref->add_list, &href->ref_add_list);
467 	atomic_inc(&root->num_entries);
468 	spin_unlock(&href->lock);
469 	return ret;
470 }
471 
472 /*
473  * helper function to update the accounting in the head ref
474  * existing and update must have the same bytenr
475  */
476 static noinline void update_existing_head_ref(struct btrfs_trans_handle *trans,
477 			 struct btrfs_delayed_ref_head *existing,
478 			 struct btrfs_delayed_ref_head *update,
479 			 int *old_ref_mod_ret)
480 {
481 	struct btrfs_delayed_ref_root *delayed_refs =
482 		&trans->transaction->delayed_refs;
483 	struct btrfs_fs_info *fs_info = trans->fs_info;
484 	int old_ref_mod;
485 
486 	BUG_ON(existing->is_data != update->is_data);
487 
488 	spin_lock(&existing->lock);
489 	if (update->must_insert_reserved) {
490 		/* if the extent was freed and then
491 		 * reallocated before the delayed ref
492 		 * entries were processed, we can end up
493 		 * with an existing head ref without
494 		 * the must_insert_reserved flag set.
495 		 * Set it again here
496 		 */
497 		existing->must_insert_reserved = update->must_insert_reserved;
498 
499 		/*
500 		 * update the num_bytes so we make sure the accounting
501 		 * is done correctly
502 		 */
503 		existing->num_bytes = update->num_bytes;
504 
505 	}
506 
507 	if (update->extent_op) {
508 		if (!existing->extent_op) {
509 			existing->extent_op = update->extent_op;
510 		} else {
511 			if (update->extent_op->update_key) {
512 				memcpy(&existing->extent_op->key,
513 				       &update->extent_op->key,
514 				       sizeof(update->extent_op->key));
515 				existing->extent_op->update_key = true;
516 			}
517 			if (update->extent_op->update_flags) {
518 				existing->extent_op->flags_to_set |=
519 					update->extent_op->flags_to_set;
520 				existing->extent_op->update_flags = true;
521 			}
522 			btrfs_free_delayed_extent_op(update->extent_op);
523 		}
524 	}
525 	/*
526 	 * update the reference mod on the head to reflect this new operation,
527 	 * only need the lock for this case cause we could be processing it
528 	 * currently, for refs we just added we know we're a-ok.
529 	 */
530 	old_ref_mod = existing->total_ref_mod;
531 	if (old_ref_mod_ret)
532 		*old_ref_mod_ret = old_ref_mod;
533 	existing->ref_mod += update->ref_mod;
534 	existing->total_ref_mod += update->ref_mod;
535 
536 	/*
537 	 * If we are going to from a positive ref mod to a negative or vice
538 	 * versa we need to make sure to adjust pending_csums accordingly.
539 	 */
540 	if (existing->is_data) {
541 		u64 csum_leaves =
542 			btrfs_csum_bytes_to_leaves(fs_info,
543 						   existing->num_bytes);
544 
545 		if (existing->total_ref_mod >= 0 && old_ref_mod < 0) {
546 			delayed_refs->pending_csums -= existing->num_bytes;
547 			btrfs_delayed_refs_rsv_release(fs_info, csum_leaves);
548 		}
549 		if (existing->total_ref_mod < 0 && old_ref_mod >= 0) {
550 			delayed_refs->pending_csums += existing->num_bytes;
551 			trans->delayed_ref_updates += csum_leaves;
552 		}
553 	}
554 	spin_unlock(&existing->lock);
555 }
556 
557 static void init_delayed_ref_head(struct btrfs_delayed_ref_head *head_ref,
558 				  struct btrfs_qgroup_extent_record *qrecord,
559 				  u64 bytenr, u64 num_bytes, u64 ref_root,
560 				  u64 reserved, int action, bool is_data,
561 				  bool is_system)
562 {
563 	int count_mod = 1;
564 	int must_insert_reserved = 0;
565 
566 	/* If reserved is provided, it must be a data extent. */
567 	BUG_ON(!is_data && reserved);
568 
569 	/*
570 	 * The head node stores the sum of all the mods, so dropping a ref
571 	 * should drop the sum in the head node by one.
572 	 */
573 	if (action == BTRFS_UPDATE_DELAYED_HEAD)
574 		count_mod = 0;
575 	else if (action == BTRFS_DROP_DELAYED_REF)
576 		count_mod = -1;
577 
578 	/*
579 	 * BTRFS_ADD_DELAYED_EXTENT means that we need to update the reserved
580 	 * accounting when the extent is finally added, or if a later
581 	 * modification deletes the delayed ref without ever inserting the
582 	 * extent into the extent allocation tree.  ref->must_insert_reserved
583 	 * is the flag used to record that accounting mods are required.
584 	 *
585 	 * Once we record must_insert_reserved, switch the action to
586 	 * BTRFS_ADD_DELAYED_REF because other special casing is not required.
587 	 */
588 	if (action == BTRFS_ADD_DELAYED_EXTENT)
589 		must_insert_reserved = 1;
590 	else
591 		must_insert_reserved = 0;
592 
593 	refcount_set(&head_ref->refs, 1);
594 	head_ref->bytenr = bytenr;
595 	head_ref->num_bytes = num_bytes;
596 	head_ref->ref_mod = count_mod;
597 	head_ref->must_insert_reserved = must_insert_reserved;
598 	head_ref->is_data = is_data;
599 	head_ref->is_system = is_system;
600 	head_ref->ref_tree = RB_ROOT_CACHED;
601 	INIT_LIST_HEAD(&head_ref->ref_add_list);
602 	RB_CLEAR_NODE(&head_ref->href_node);
603 	head_ref->processing = 0;
604 	head_ref->total_ref_mod = count_mod;
605 	head_ref->qgroup_reserved = 0;
606 	head_ref->qgroup_ref_root = 0;
607 	spin_lock_init(&head_ref->lock);
608 	mutex_init(&head_ref->mutex);
609 
610 	if (qrecord) {
611 		if (ref_root && reserved) {
612 			head_ref->qgroup_ref_root = ref_root;
613 			head_ref->qgroup_reserved = reserved;
614 		}
615 
616 		qrecord->bytenr = bytenr;
617 		qrecord->num_bytes = num_bytes;
618 		qrecord->old_roots = NULL;
619 	}
620 }
621 
622 /*
623  * helper function to actually insert a head node into the rbtree.
624  * this does all the dirty work in terms of maintaining the correct
625  * overall modification count.
626  */
627 static noinline struct btrfs_delayed_ref_head *
628 add_delayed_ref_head(struct btrfs_trans_handle *trans,
629 		     struct btrfs_delayed_ref_head *head_ref,
630 		     struct btrfs_qgroup_extent_record *qrecord,
631 		     int action, int *qrecord_inserted_ret,
632 		     int *old_ref_mod, int *new_ref_mod)
633 {
634 	struct btrfs_delayed_ref_head *existing;
635 	struct btrfs_delayed_ref_root *delayed_refs;
636 	int qrecord_inserted = 0;
637 
638 	delayed_refs = &trans->transaction->delayed_refs;
639 
640 	/* Record qgroup extent info if provided */
641 	if (qrecord) {
642 		if (btrfs_qgroup_trace_extent_nolock(trans->fs_info,
643 					delayed_refs, qrecord))
644 			kfree(qrecord);
645 		else
646 			qrecord_inserted = 1;
647 	}
648 
649 	trace_add_delayed_ref_head(trans->fs_info, head_ref, action);
650 
651 	existing = htree_insert(&delayed_refs->href_root,
652 				&head_ref->href_node);
653 	if (existing) {
654 		WARN_ON(qrecord && head_ref->qgroup_ref_root
655 			&& head_ref->qgroup_reserved
656 			&& existing->qgroup_ref_root
657 			&& existing->qgroup_reserved);
658 		update_existing_head_ref(trans, existing, head_ref,
659 					 old_ref_mod);
660 		/*
661 		 * we've updated the existing ref, free the newly
662 		 * allocated ref
663 		 */
664 		kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref);
665 		head_ref = existing;
666 	} else {
667 		if (old_ref_mod)
668 			*old_ref_mod = 0;
669 		if (head_ref->is_data && head_ref->ref_mod < 0) {
670 			delayed_refs->pending_csums += head_ref->num_bytes;
671 			trans->delayed_ref_updates +=
672 				btrfs_csum_bytes_to_leaves(trans->fs_info,
673 							   head_ref->num_bytes);
674 		}
675 		delayed_refs->num_heads++;
676 		delayed_refs->num_heads_ready++;
677 		atomic_inc(&delayed_refs->num_entries);
678 		trans->delayed_ref_updates++;
679 	}
680 	if (qrecord_inserted_ret)
681 		*qrecord_inserted_ret = qrecord_inserted;
682 	if (new_ref_mod)
683 		*new_ref_mod = head_ref->total_ref_mod;
684 
685 	return head_ref;
686 }
687 
688 /*
689  * init_delayed_ref_common - Initialize the structure which represents a
690  *			     modification to a an extent.
691  *
692  * @fs_info:    Internal to the mounted filesystem mount structure.
693  *
694  * @ref:	The structure which is going to be initialized.
695  *
696  * @bytenr:	The logical address of the extent for which a modification is
697  *		going to be recorded.
698  *
699  * @num_bytes:  Size of the extent whose modification is being recorded.
700  *
701  * @ref_root:	The id of the root where this modification has originated, this
702  *		can be either one of the well-known metadata trees or the
703  *		subvolume id which references this extent.
704  *
705  * @action:	Can be one of BTRFS_ADD_DELAYED_REF/BTRFS_DROP_DELAYED_REF or
706  *		BTRFS_ADD_DELAYED_EXTENT
707  *
708  * @ref_type:	Holds the type of the extent which is being recorded, can be
709  *		one of BTRFS_SHARED_BLOCK_REF_KEY/BTRFS_TREE_BLOCK_REF_KEY
710  *		when recording a metadata extent or BTRFS_SHARED_DATA_REF_KEY/
711  *		BTRFS_EXTENT_DATA_REF_KEY when recording data extent
712  */
713 static void init_delayed_ref_common(struct btrfs_fs_info *fs_info,
714 				    struct btrfs_delayed_ref_node *ref,
715 				    u64 bytenr, u64 num_bytes, u64 ref_root,
716 				    int action, u8 ref_type)
717 {
718 	u64 seq = 0;
719 
720 	if (action == BTRFS_ADD_DELAYED_EXTENT)
721 		action = BTRFS_ADD_DELAYED_REF;
722 
723 	if (is_fstree(ref_root))
724 		seq = atomic64_read(&fs_info->tree_mod_seq);
725 
726 	refcount_set(&ref->refs, 1);
727 	ref->bytenr = bytenr;
728 	ref->num_bytes = num_bytes;
729 	ref->ref_mod = 1;
730 	ref->action = action;
731 	ref->is_head = 0;
732 	ref->in_tree = 1;
733 	ref->seq = seq;
734 	ref->type = ref_type;
735 	RB_CLEAR_NODE(&ref->ref_node);
736 	INIT_LIST_HEAD(&ref->add_list);
737 }
738 
739 /*
740  * add a delayed tree ref.  This does all of the accounting required
741  * to make sure the delayed ref is eventually processed before this
742  * transaction commits.
743  */
744 int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
745 			       u64 bytenr, u64 num_bytes, u64 parent,
746 			       u64 ref_root,  int level, int action,
747 			       struct btrfs_delayed_extent_op *extent_op,
748 			       int *old_ref_mod, int *new_ref_mod)
749 {
750 	struct btrfs_fs_info *fs_info = trans->fs_info;
751 	struct btrfs_delayed_tree_ref *ref;
752 	struct btrfs_delayed_ref_head *head_ref;
753 	struct btrfs_delayed_ref_root *delayed_refs;
754 	struct btrfs_qgroup_extent_record *record = NULL;
755 	int qrecord_inserted;
756 	bool is_system = (ref_root == BTRFS_CHUNK_TREE_OBJECTID);
757 	int ret;
758 	u8 ref_type;
759 
760 	BUG_ON(extent_op && extent_op->is_data);
761 	ref = kmem_cache_alloc(btrfs_delayed_tree_ref_cachep, GFP_NOFS);
762 	if (!ref)
763 		return -ENOMEM;
764 
765 	head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS);
766 	if (!head_ref) {
767 		kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref);
768 		return -ENOMEM;
769 	}
770 
771 	if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) &&
772 	    is_fstree(ref_root)) {
773 		record = kmalloc(sizeof(*record), GFP_NOFS);
774 		if (!record) {
775 			kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref);
776 			kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref);
777 			return -ENOMEM;
778 		}
779 	}
780 
781 	if (parent)
782 		ref_type = BTRFS_SHARED_BLOCK_REF_KEY;
783 	else
784 		ref_type = BTRFS_TREE_BLOCK_REF_KEY;
785 
786 	init_delayed_ref_common(fs_info, &ref->node, bytenr, num_bytes,
787 				ref_root, action, ref_type);
788 	ref->root = ref_root;
789 	ref->parent = parent;
790 	ref->level = level;
791 
792 	init_delayed_ref_head(head_ref, record, bytenr, num_bytes,
793 			      ref_root, 0, action, false, is_system);
794 	head_ref->extent_op = extent_op;
795 
796 	delayed_refs = &trans->transaction->delayed_refs;
797 	spin_lock(&delayed_refs->lock);
798 
799 	/*
800 	 * insert both the head node and the new ref without dropping
801 	 * the spin lock
802 	 */
803 	head_ref = add_delayed_ref_head(trans, head_ref, record,
804 					action, &qrecord_inserted,
805 					old_ref_mod, new_ref_mod);
806 
807 	ret = insert_delayed_ref(trans, delayed_refs, head_ref, &ref->node);
808 	spin_unlock(&delayed_refs->lock);
809 
810 	/*
811 	 * Need to update the delayed_refs_rsv with any changes we may have
812 	 * made.
813 	 */
814 	btrfs_update_delayed_refs_rsv(trans);
815 
816 	trace_add_delayed_tree_ref(fs_info, &ref->node, ref,
817 				   action == BTRFS_ADD_DELAYED_EXTENT ?
818 				   BTRFS_ADD_DELAYED_REF : action);
819 	if (ret > 0)
820 		kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref);
821 
822 	if (qrecord_inserted)
823 		btrfs_qgroup_trace_extent_post(fs_info, record);
824 
825 	return 0;
826 }
827 
828 /*
829  * add a delayed data ref. it's similar to btrfs_add_delayed_tree_ref.
830  */
831 int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
832 			       u64 bytenr, u64 num_bytes,
833 			       u64 parent, u64 ref_root,
834 			       u64 owner, u64 offset, u64 reserved, int action,
835 			       int *old_ref_mod, int *new_ref_mod)
836 {
837 	struct btrfs_fs_info *fs_info = trans->fs_info;
838 	struct btrfs_delayed_data_ref *ref;
839 	struct btrfs_delayed_ref_head *head_ref;
840 	struct btrfs_delayed_ref_root *delayed_refs;
841 	struct btrfs_qgroup_extent_record *record = NULL;
842 	int qrecord_inserted;
843 	int ret;
844 	u8 ref_type;
845 
846 	ref = kmem_cache_alloc(btrfs_delayed_data_ref_cachep, GFP_NOFS);
847 	if (!ref)
848 		return -ENOMEM;
849 
850 	if (parent)
851 	        ref_type = BTRFS_SHARED_DATA_REF_KEY;
852 	else
853 	        ref_type = BTRFS_EXTENT_DATA_REF_KEY;
854 	init_delayed_ref_common(fs_info, &ref->node, bytenr, num_bytes,
855 				ref_root, action, ref_type);
856 	ref->root = ref_root;
857 	ref->parent = parent;
858 	ref->objectid = owner;
859 	ref->offset = offset;
860 
861 
862 	head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS);
863 	if (!head_ref) {
864 		kmem_cache_free(btrfs_delayed_data_ref_cachep, ref);
865 		return -ENOMEM;
866 	}
867 
868 	if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) &&
869 	    is_fstree(ref_root)) {
870 		record = kmalloc(sizeof(*record), GFP_NOFS);
871 		if (!record) {
872 			kmem_cache_free(btrfs_delayed_data_ref_cachep, ref);
873 			kmem_cache_free(btrfs_delayed_ref_head_cachep,
874 					head_ref);
875 			return -ENOMEM;
876 		}
877 	}
878 
879 	init_delayed_ref_head(head_ref, record, bytenr, num_bytes, ref_root,
880 			      reserved, action, true, false);
881 	head_ref->extent_op = NULL;
882 
883 	delayed_refs = &trans->transaction->delayed_refs;
884 	spin_lock(&delayed_refs->lock);
885 
886 	/*
887 	 * insert both the head node and the new ref without dropping
888 	 * the spin lock
889 	 */
890 	head_ref = add_delayed_ref_head(trans, head_ref, record,
891 					action, &qrecord_inserted,
892 					old_ref_mod, new_ref_mod);
893 
894 	ret = insert_delayed_ref(trans, delayed_refs, head_ref, &ref->node);
895 	spin_unlock(&delayed_refs->lock);
896 
897 	/*
898 	 * Need to update the delayed_refs_rsv with any changes we may have
899 	 * made.
900 	 */
901 	btrfs_update_delayed_refs_rsv(trans);
902 
903 	trace_add_delayed_data_ref(trans->fs_info, &ref->node, ref,
904 				   action == BTRFS_ADD_DELAYED_EXTENT ?
905 				   BTRFS_ADD_DELAYED_REF : action);
906 	if (ret > 0)
907 		kmem_cache_free(btrfs_delayed_data_ref_cachep, ref);
908 
909 
910 	if (qrecord_inserted)
911 		return btrfs_qgroup_trace_extent_post(fs_info, record);
912 	return 0;
913 }
914 
915 int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
916 				struct btrfs_trans_handle *trans,
917 				u64 bytenr, u64 num_bytes,
918 				struct btrfs_delayed_extent_op *extent_op)
919 {
920 	struct btrfs_delayed_ref_head *head_ref;
921 	struct btrfs_delayed_ref_root *delayed_refs;
922 
923 	head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS);
924 	if (!head_ref)
925 		return -ENOMEM;
926 
927 	init_delayed_ref_head(head_ref, NULL, bytenr, num_bytes, 0, 0,
928 			      BTRFS_UPDATE_DELAYED_HEAD, extent_op->is_data,
929 			      false);
930 	head_ref->extent_op = extent_op;
931 
932 	delayed_refs = &trans->transaction->delayed_refs;
933 	spin_lock(&delayed_refs->lock);
934 
935 	add_delayed_ref_head(trans, head_ref, NULL, BTRFS_UPDATE_DELAYED_HEAD,
936 			     NULL, NULL, NULL);
937 
938 	spin_unlock(&delayed_refs->lock);
939 
940 	/*
941 	 * Need to update the delayed_refs_rsv with any changes we may have
942 	 * made.
943 	 */
944 	btrfs_update_delayed_refs_rsv(trans);
945 	return 0;
946 }
947 
948 /*
949  * this does a simple search for the head node for a given extent.
950  * It must be called with the delayed ref spinlock held, and it returns
951  * the head node if any where found, or NULL if not.
952  */
953 struct btrfs_delayed_ref_head *
954 btrfs_find_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs, u64 bytenr)
955 {
956 	return find_ref_head(delayed_refs, bytenr, false);
957 }
958 
959 void __cold btrfs_delayed_ref_exit(void)
960 {
961 	kmem_cache_destroy(btrfs_delayed_ref_head_cachep);
962 	kmem_cache_destroy(btrfs_delayed_tree_ref_cachep);
963 	kmem_cache_destroy(btrfs_delayed_data_ref_cachep);
964 	kmem_cache_destroy(btrfs_delayed_extent_op_cachep);
965 }
966 
967 int __init btrfs_delayed_ref_init(void)
968 {
969 	btrfs_delayed_ref_head_cachep = kmem_cache_create(
970 				"btrfs_delayed_ref_head",
971 				sizeof(struct btrfs_delayed_ref_head), 0,
972 				SLAB_MEM_SPREAD, NULL);
973 	if (!btrfs_delayed_ref_head_cachep)
974 		goto fail;
975 
976 	btrfs_delayed_tree_ref_cachep = kmem_cache_create(
977 				"btrfs_delayed_tree_ref",
978 				sizeof(struct btrfs_delayed_tree_ref), 0,
979 				SLAB_MEM_SPREAD, NULL);
980 	if (!btrfs_delayed_tree_ref_cachep)
981 		goto fail;
982 
983 	btrfs_delayed_data_ref_cachep = kmem_cache_create(
984 				"btrfs_delayed_data_ref",
985 				sizeof(struct btrfs_delayed_data_ref), 0,
986 				SLAB_MEM_SPREAD, NULL);
987 	if (!btrfs_delayed_data_ref_cachep)
988 		goto fail;
989 
990 	btrfs_delayed_extent_op_cachep = kmem_cache_create(
991 				"btrfs_delayed_extent_op",
992 				sizeof(struct btrfs_delayed_extent_op), 0,
993 				SLAB_MEM_SPREAD, NULL);
994 	if (!btrfs_delayed_extent_op_cachep)
995 		goto fail;
996 
997 	return 0;
998 fail:
999 	btrfs_delayed_ref_exit();
1000 	return -ENOMEM;
1001 }
1002