xref: /openbmc/linux/fs/btrfs/delayed-ref.c (revision 002dff36)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (C) 2009 Oracle.  All rights reserved.
4  */
5 
6 #include <linux/sched.h>
7 #include <linux/slab.h>
8 #include <linux/sort.h>
9 #include "ctree.h"
10 #include "delayed-ref.h"
11 #include "transaction.h"
12 #include "qgroup.h"
13 #include "space-info.h"
14 
15 struct kmem_cache *btrfs_delayed_ref_head_cachep;
16 struct kmem_cache *btrfs_delayed_tree_ref_cachep;
17 struct kmem_cache *btrfs_delayed_data_ref_cachep;
18 struct kmem_cache *btrfs_delayed_extent_op_cachep;
19 /*
20  * delayed back reference update tracking.  For subvolume trees
21  * we queue up extent allocations and backref maintenance for
22  * delayed processing.   This avoids deep call chains where we
23  * add extents in the middle of btrfs_search_slot, and it allows
24  * us to buffer up frequently modified backrefs in an rb tree instead
25  * of hammering updates on the extent allocation tree.
26  */
27 
28 bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info)
29 {
30 	struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
31 	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
32 	bool ret = false;
33 	u64 reserved;
34 
35 	spin_lock(&global_rsv->lock);
36 	reserved = global_rsv->reserved;
37 	spin_unlock(&global_rsv->lock);
38 
39 	/*
40 	 * Since the global reserve is just kind of magic we don't really want
41 	 * to rely on it to save our bacon, so if our size is more than the
42 	 * delayed_refs_rsv and the global rsv then it's time to think about
43 	 * bailing.
44 	 */
45 	spin_lock(&delayed_refs_rsv->lock);
46 	reserved += delayed_refs_rsv->reserved;
47 	if (delayed_refs_rsv->size >= reserved)
48 		ret = true;
49 	spin_unlock(&delayed_refs_rsv->lock);
50 	return ret;
51 }
52 
53 int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans)
54 {
55 	u64 num_entries =
56 		atomic_read(&trans->transaction->delayed_refs.num_entries);
57 	u64 avg_runtime;
58 	u64 val;
59 
60 	smp_mb();
61 	avg_runtime = trans->fs_info->avg_delayed_ref_runtime;
62 	val = num_entries * avg_runtime;
63 	if (val >= NSEC_PER_SEC)
64 		return 1;
65 	if (val >= NSEC_PER_SEC / 2)
66 		return 2;
67 
68 	return btrfs_check_space_for_delayed_refs(trans->fs_info);
69 }
70 
71 /**
72  * btrfs_delayed_refs_rsv_release - release a ref head's reservation.
73  * @fs_info - the fs_info for our fs.
74  * @nr - the number of items to drop.
75  *
76  * This drops the delayed ref head's count from the delayed refs rsv and frees
77  * any excess reservation we had.
78  */
79 void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr)
80 {
81 	struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv;
82 	u64 num_bytes = btrfs_calc_insert_metadata_size(fs_info, nr);
83 	u64 released = 0;
84 
85 	released = btrfs_block_rsv_release(fs_info, block_rsv, num_bytes, NULL);
86 	if (released)
87 		trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv",
88 					      0, released, 0);
89 }
90 
91 /*
92  * btrfs_update_delayed_refs_rsv - adjust the size of the delayed refs rsv
93  * @trans - the trans that may have generated delayed refs
94  *
95  * This is to be called anytime we may have adjusted trans->delayed_ref_updates,
96  * it'll calculate the additional size and add it to the delayed_refs_rsv.
97  */
98 void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans)
99 {
100 	struct btrfs_fs_info *fs_info = trans->fs_info;
101 	struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv;
102 	u64 num_bytes;
103 
104 	if (!trans->delayed_ref_updates)
105 		return;
106 
107 	num_bytes = btrfs_calc_insert_metadata_size(fs_info,
108 						    trans->delayed_ref_updates);
109 	spin_lock(&delayed_rsv->lock);
110 	delayed_rsv->size += num_bytes;
111 	delayed_rsv->full = 0;
112 	spin_unlock(&delayed_rsv->lock);
113 	trans->delayed_ref_updates = 0;
114 }
115 
116 /**
117  * btrfs_migrate_to_delayed_refs_rsv - transfer bytes to our delayed refs rsv.
118  * @fs_info - the fs info for our fs.
119  * @src - the source block rsv to transfer from.
120  * @num_bytes - the number of bytes to transfer.
121  *
122  * This transfers up to the num_bytes amount from the src rsv to the
123  * delayed_refs_rsv.  Any extra bytes are returned to the space info.
124  */
125 void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info,
126 				       struct btrfs_block_rsv *src,
127 				       u64 num_bytes)
128 {
129 	struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
130 	u64 to_free = 0;
131 
132 	spin_lock(&src->lock);
133 	src->reserved -= num_bytes;
134 	src->size -= num_bytes;
135 	spin_unlock(&src->lock);
136 
137 	spin_lock(&delayed_refs_rsv->lock);
138 	if (delayed_refs_rsv->size > delayed_refs_rsv->reserved) {
139 		u64 delta = delayed_refs_rsv->size -
140 			delayed_refs_rsv->reserved;
141 		if (num_bytes > delta) {
142 			to_free = num_bytes - delta;
143 			num_bytes = delta;
144 		}
145 	} else {
146 		to_free = num_bytes;
147 		num_bytes = 0;
148 	}
149 
150 	if (num_bytes)
151 		delayed_refs_rsv->reserved += num_bytes;
152 	if (delayed_refs_rsv->reserved >= delayed_refs_rsv->size)
153 		delayed_refs_rsv->full = 1;
154 	spin_unlock(&delayed_refs_rsv->lock);
155 
156 	if (num_bytes)
157 		trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv",
158 					      0, num_bytes, 1);
159 	if (to_free)
160 		btrfs_space_info_free_bytes_may_use(fs_info,
161 				delayed_refs_rsv->space_info, to_free);
162 }
163 
164 /**
165  * btrfs_delayed_refs_rsv_refill - refill based on our delayed refs usage.
166  * @fs_info - the fs_info for our fs.
167  * @flush - control how we can flush for this reservation.
168  *
169  * This will refill the delayed block_rsv up to 1 items size worth of space and
170  * will return -ENOSPC if we can't make the reservation.
171  */
172 int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info,
173 				  enum btrfs_reserve_flush_enum flush)
174 {
175 	struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv;
176 	u64 limit = btrfs_calc_insert_metadata_size(fs_info, 1);
177 	u64 num_bytes = 0;
178 	int ret = -ENOSPC;
179 
180 	spin_lock(&block_rsv->lock);
181 	if (block_rsv->reserved < block_rsv->size) {
182 		num_bytes = block_rsv->size - block_rsv->reserved;
183 		num_bytes = min(num_bytes, limit);
184 	}
185 	spin_unlock(&block_rsv->lock);
186 
187 	if (!num_bytes)
188 		return 0;
189 
190 	ret = btrfs_reserve_metadata_bytes(fs_info->extent_root, block_rsv,
191 					   num_bytes, flush);
192 	if (ret)
193 		return ret;
194 	btrfs_block_rsv_add_bytes(block_rsv, num_bytes, 0);
195 	trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv",
196 				      0, num_bytes, 1);
197 	return 0;
198 }
199 
200 /*
201  * compare two delayed tree backrefs with same bytenr and type
202  */
203 static int comp_tree_refs(struct btrfs_delayed_tree_ref *ref1,
204 			  struct btrfs_delayed_tree_ref *ref2)
205 {
206 	if (ref1->node.type == BTRFS_TREE_BLOCK_REF_KEY) {
207 		if (ref1->root < ref2->root)
208 			return -1;
209 		if (ref1->root > ref2->root)
210 			return 1;
211 	} else {
212 		if (ref1->parent < ref2->parent)
213 			return -1;
214 		if (ref1->parent > ref2->parent)
215 			return 1;
216 	}
217 	return 0;
218 }
219 
220 /*
221  * compare two delayed data backrefs with same bytenr and type
222  */
223 static int comp_data_refs(struct btrfs_delayed_data_ref *ref1,
224 			  struct btrfs_delayed_data_ref *ref2)
225 {
226 	if (ref1->node.type == BTRFS_EXTENT_DATA_REF_KEY) {
227 		if (ref1->root < ref2->root)
228 			return -1;
229 		if (ref1->root > ref2->root)
230 			return 1;
231 		if (ref1->objectid < ref2->objectid)
232 			return -1;
233 		if (ref1->objectid > ref2->objectid)
234 			return 1;
235 		if (ref1->offset < ref2->offset)
236 			return -1;
237 		if (ref1->offset > ref2->offset)
238 			return 1;
239 	} else {
240 		if (ref1->parent < ref2->parent)
241 			return -1;
242 		if (ref1->parent > ref2->parent)
243 			return 1;
244 	}
245 	return 0;
246 }
247 
248 static int comp_refs(struct btrfs_delayed_ref_node *ref1,
249 		     struct btrfs_delayed_ref_node *ref2,
250 		     bool check_seq)
251 {
252 	int ret = 0;
253 
254 	if (ref1->type < ref2->type)
255 		return -1;
256 	if (ref1->type > ref2->type)
257 		return 1;
258 	if (ref1->type == BTRFS_TREE_BLOCK_REF_KEY ||
259 	    ref1->type == BTRFS_SHARED_BLOCK_REF_KEY)
260 		ret = comp_tree_refs(btrfs_delayed_node_to_tree_ref(ref1),
261 				     btrfs_delayed_node_to_tree_ref(ref2));
262 	else
263 		ret = comp_data_refs(btrfs_delayed_node_to_data_ref(ref1),
264 				     btrfs_delayed_node_to_data_ref(ref2));
265 	if (ret)
266 		return ret;
267 	if (check_seq) {
268 		if (ref1->seq < ref2->seq)
269 			return -1;
270 		if (ref1->seq > ref2->seq)
271 			return 1;
272 	}
273 	return 0;
274 }
275 
276 /* insert a new ref to head ref rbtree */
277 static struct btrfs_delayed_ref_head *htree_insert(struct rb_root_cached *root,
278 						   struct rb_node *node)
279 {
280 	struct rb_node **p = &root->rb_root.rb_node;
281 	struct rb_node *parent_node = NULL;
282 	struct btrfs_delayed_ref_head *entry;
283 	struct btrfs_delayed_ref_head *ins;
284 	u64 bytenr;
285 	bool leftmost = true;
286 
287 	ins = rb_entry(node, struct btrfs_delayed_ref_head, href_node);
288 	bytenr = ins->bytenr;
289 	while (*p) {
290 		parent_node = *p;
291 		entry = rb_entry(parent_node, struct btrfs_delayed_ref_head,
292 				 href_node);
293 
294 		if (bytenr < entry->bytenr) {
295 			p = &(*p)->rb_left;
296 		} else if (bytenr > entry->bytenr) {
297 			p = &(*p)->rb_right;
298 			leftmost = false;
299 		} else {
300 			return entry;
301 		}
302 	}
303 
304 	rb_link_node(node, parent_node, p);
305 	rb_insert_color_cached(node, root, leftmost);
306 	return NULL;
307 }
308 
309 static struct btrfs_delayed_ref_node* tree_insert(struct rb_root_cached *root,
310 		struct btrfs_delayed_ref_node *ins)
311 {
312 	struct rb_node **p = &root->rb_root.rb_node;
313 	struct rb_node *node = &ins->ref_node;
314 	struct rb_node *parent_node = NULL;
315 	struct btrfs_delayed_ref_node *entry;
316 	bool leftmost = true;
317 
318 	while (*p) {
319 		int comp;
320 
321 		parent_node = *p;
322 		entry = rb_entry(parent_node, struct btrfs_delayed_ref_node,
323 				 ref_node);
324 		comp = comp_refs(ins, entry, true);
325 		if (comp < 0) {
326 			p = &(*p)->rb_left;
327 		} else if (comp > 0) {
328 			p = &(*p)->rb_right;
329 			leftmost = false;
330 		} else {
331 			return entry;
332 		}
333 	}
334 
335 	rb_link_node(node, parent_node, p);
336 	rb_insert_color_cached(node, root, leftmost);
337 	return NULL;
338 }
339 
340 static struct btrfs_delayed_ref_head *find_first_ref_head(
341 		struct btrfs_delayed_ref_root *dr)
342 {
343 	struct rb_node *n;
344 	struct btrfs_delayed_ref_head *entry;
345 
346 	n = rb_first_cached(&dr->href_root);
347 	if (!n)
348 		return NULL;
349 
350 	entry = rb_entry(n, struct btrfs_delayed_ref_head, href_node);
351 
352 	return entry;
353 }
354 
355 /*
356  * Find a head entry based on bytenr. This returns the delayed ref head if it
357  * was able to find one, or NULL if nothing was in that spot.  If return_bigger
358  * is given, the next bigger entry is returned if no exact match is found.
359  */
360 static struct btrfs_delayed_ref_head *find_ref_head(
361 		struct btrfs_delayed_ref_root *dr, u64 bytenr,
362 		bool return_bigger)
363 {
364 	struct rb_root *root = &dr->href_root.rb_root;
365 	struct rb_node *n;
366 	struct btrfs_delayed_ref_head *entry;
367 
368 	n = root->rb_node;
369 	entry = NULL;
370 	while (n) {
371 		entry = rb_entry(n, struct btrfs_delayed_ref_head, href_node);
372 
373 		if (bytenr < entry->bytenr)
374 			n = n->rb_left;
375 		else if (bytenr > entry->bytenr)
376 			n = n->rb_right;
377 		else
378 			return entry;
379 	}
380 	if (entry && return_bigger) {
381 		if (bytenr > entry->bytenr) {
382 			n = rb_next(&entry->href_node);
383 			if (!n)
384 				return NULL;
385 			entry = rb_entry(n, struct btrfs_delayed_ref_head,
386 					 href_node);
387 		}
388 		return entry;
389 	}
390 	return NULL;
391 }
392 
393 int btrfs_delayed_ref_lock(struct btrfs_delayed_ref_root *delayed_refs,
394 			   struct btrfs_delayed_ref_head *head)
395 {
396 	lockdep_assert_held(&delayed_refs->lock);
397 	if (mutex_trylock(&head->mutex))
398 		return 0;
399 
400 	refcount_inc(&head->refs);
401 	spin_unlock(&delayed_refs->lock);
402 
403 	mutex_lock(&head->mutex);
404 	spin_lock(&delayed_refs->lock);
405 	if (RB_EMPTY_NODE(&head->href_node)) {
406 		mutex_unlock(&head->mutex);
407 		btrfs_put_delayed_ref_head(head);
408 		return -EAGAIN;
409 	}
410 	btrfs_put_delayed_ref_head(head);
411 	return 0;
412 }
413 
414 static inline void drop_delayed_ref(struct btrfs_trans_handle *trans,
415 				    struct btrfs_delayed_ref_root *delayed_refs,
416 				    struct btrfs_delayed_ref_head *head,
417 				    struct btrfs_delayed_ref_node *ref)
418 {
419 	lockdep_assert_held(&head->lock);
420 	rb_erase_cached(&ref->ref_node, &head->ref_tree);
421 	RB_CLEAR_NODE(&ref->ref_node);
422 	if (!list_empty(&ref->add_list))
423 		list_del(&ref->add_list);
424 	ref->in_tree = 0;
425 	btrfs_put_delayed_ref(ref);
426 	atomic_dec(&delayed_refs->num_entries);
427 }
428 
429 static bool merge_ref(struct btrfs_trans_handle *trans,
430 		      struct btrfs_delayed_ref_root *delayed_refs,
431 		      struct btrfs_delayed_ref_head *head,
432 		      struct btrfs_delayed_ref_node *ref,
433 		      u64 seq)
434 {
435 	struct btrfs_delayed_ref_node *next;
436 	struct rb_node *node = rb_next(&ref->ref_node);
437 	bool done = false;
438 
439 	while (!done && node) {
440 		int mod;
441 
442 		next = rb_entry(node, struct btrfs_delayed_ref_node, ref_node);
443 		node = rb_next(node);
444 		if (seq && next->seq >= seq)
445 			break;
446 		if (comp_refs(ref, next, false))
447 			break;
448 
449 		if (ref->action == next->action) {
450 			mod = next->ref_mod;
451 		} else {
452 			if (ref->ref_mod < next->ref_mod) {
453 				swap(ref, next);
454 				done = true;
455 			}
456 			mod = -next->ref_mod;
457 		}
458 
459 		drop_delayed_ref(trans, delayed_refs, head, next);
460 		ref->ref_mod += mod;
461 		if (ref->ref_mod == 0) {
462 			drop_delayed_ref(trans, delayed_refs, head, ref);
463 			done = true;
464 		} else {
465 			/*
466 			 * Can't have multiples of the same ref on a tree block.
467 			 */
468 			WARN_ON(ref->type == BTRFS_TREE_BLOCK_REF_KEY ||
469 				ref->type == BTRFS_SHARED_BLOCK_REF_KEY);
470 		}
471 	}
472 
473 	return done;
474 }
475 
476 void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans,
477 			      struct btrfs_delayed_ref_root *delayed_refs,
478 			      struct btrfs_delayed_ref_head *head)
479 {
480 	struct btrfs_fs_info *fs_info = trans->fs_info;
481 	struct btrfs_delayed_ref_node *ref;
482 	struct rb_node *node;
483 	u64 seq = 0;
484 
485 	lockdep_assert_held(&head->lock);
486 
487 	if (RB_EMPTY_ROOT(&head->ref_tree.rb_root))
488 		return;
489 
490 	/* We don't have too many refs to merge for data. */
491 	if (head->is_data)
492 		return;
493 
494 	read_lock(&fs_info->tree_mod_log_lock);
495 	if (!list_empty(&fs_info->tree_mod_seq_list)) {
496 		struct seq_list *elem;
497 
498 		elem = list_first_entry(&fs_info->tree_mod_seq_list,
499 					struct seq_list, list);
500 		seq = elem->seq;
501 	}
502 	read_unlock(&fs_info->tree_mod_log_lock);
503 
504 again:
505 	for (node = rb_first_cached(&head->ref_tree); node;
506 	     node = rb_next(node)) {
507 		ref = rb_entry(node, struct btrfs_delayed_ref_node, ref_node);
508 		if (seq && ref->seq >= seq)
509 			continue;
510 		if (merge_ref(trans, delayed_refs, head, ref, seq))
511 			goto again;
512 	}
513 }
514 
515 int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, u64 seq)
516 {
517 	struct seq_list *elem;
518 	int ret = 0;
519 
520 	read_lock(&fs_info->tree_mod_log_lock);
521 	if (!list_empty(&fs_info->tree_mod_seq_list)) {
522 		elem = list_first_entry(&fs_info->tree_mod_seq_list,
523 					struct seq_list, list);
524 		if (seq >= elem->seq) {
525 			btrfs_debug(fs_info,
526 				"holding back delayed_ref %#x.%x, lowest is %#x.%x",
527 				(u32)(seq >> 32), (u32)seq,
528 				(u32)(elem->seq >> 32), (u32)elem->seq);
529 			ret = 1;
530 		}
531 	}
532 
533 	read_unlock(&fs_info->tree_mod_log_lock);
534 	return ret;
535 }
536 
537 struct btrfs_delayed_ref_head *btrfs_select_ref_head(
538 		struct btrfs_delayed_ref_root *delayed_refs)
539 {
540 	struct btrfs_delayed_ref_head *head;
541 
542 again:
543 	head = find_ref_head(delayed_refs, delayed_refs->run_delayed_start,
544 			     true);
545 	if (!head && delayed_refs->run_delayed_start != 0) {
546 		delayed_refs->run_delayed_start = 0;
547 		head = find_first_ref_head(delayed_refs);
548 	}
549 	if (!head)
550 		return NULL;
551 
552 	while (head->processing) {
553 		struct rb_node *node;
554 
555 		node = rb_next(&head->href_node);
556 		if (!node) {
557 			if (delayed_refs->run_delayed_start == 0)
558 				return NULL;
559 			delayed_refs->run_delayed_start = 0;
560 			goto again;
561 		}
562 		head = rb_entry(node, struct btrfs_delayed_ref_head,
563 				href_node);
564 	}
565 
566 	head->processing = 1;
567 	WARN_ON(delayed_refs->num_heads_ready == 0);
568 	delayed_refs->num_heads_ready--;
569 	delayed_refs->run_delayed_start = head->bytenr +
570 		head->num_bytes;
571 	return head;
572 }
573 
574 void btrfs_delete_ref_head(struct btrfs_delayed_ref_root *delayed_refs,
575 			   struct btrfs_delayed_ref_head *head)
576 {
577 	lockdep_assert_held(&delayed_refs->lock);
578 	lockdep_assert_held(&head->lock);
579 
580 	rb_erase_cached(&head->href_node, &delayed_refs->href_root);
581 	RB_CLEAR_NODE(&head->href_node);
582 	atomic_dec(&delayed_refs->num_entries);
583 	delayed_refs->num_heads--;
584 	if (head->processing == 0)
585 		delayed_refs->num_heads_ready--;
586 }
587 
588 /*
589  * Helper to insert the ref_node to the tail or merge with tail.
590  *
591  * Return 0 for insert.
592  * Return >0 for merge.
593  */
594 static int insert_delayed_ref(struct btrfs_trans_handle *trans,
595 			      struct btrfs_delayed_ref_root *root,
596 			      struct btrfs_delayed_ref_head *href,
597 			      struct btrfs_delayed_ref_node *ref)
598 {
599 	struct btrfs_delayed_ref_node *exist;
600 	int mod;
601 	int ret = 0;
602 
603 	spin_lock(&href->lock);
604 	exist = tree_insert(&href->ref_tree, ref);
605 	if (!exist)
606 		goto inserted;
607 
608 	/* Now we are sure we can merge */
609 	ret = 1;
610 	if (exist->action == ref->action) {
611 		mod = ref->ref_mod;
612 	} else {
613 		/* Need to change action */
614 		if (exist->ref_mod < ref->ref_mod) {
615 			exist->action = ref->action;
616 			mod = -exist->ref_mod;
617 			exist->ref_mod = ref->ref_mod;
618 			if (ref->action == BTRFS_ADD_DELAYED_REF)
619 				list_add_tail(&exist->add_list,
620 					      &href->ref_add_list);
621 			else if (ref->action == BTRFS_DROP_DELAYED_REF) {
622 				ASSERT(!list_empty(&exist->add_list));
623 				list_del(&exist->add_list);
624 			} else {
625 				ASSERT(0);
626 			}
627 		} else
628 			mod = -ref->ref_mod;
629 	}
630 	exist->ref_mod += mod;
631 
632 	/* remove existing tail if its ref_mod is zero */
633 	if (exist->ref_mod == 0)
634 		drop_delayed_ref(trans, root, href, exist);
635 	spin_unlock(&href->lock);
636 	return ret;
637 inserted:
638 	if (ref->action == BTRFS_ADD_DELAYED_REF)
639 		list_add_tail(&ref->add_list, &href->ref_add_list);
640 	atomic_inc(&root->num_entries);
641 	spin_unlock(&href->lock);
642 	return ret;
643 }
644 
645 /*
646  * helper function to update the accounting in the head ref
647  * existing and update must have the same bytenr
648  */
649 static noinline void update_existing_head_ref(struct btrfs_trans_handle *trans,
650 			 struct btrfs_delayed_ref_head *existing,
651 			 struct btrfs_delayed_ref_head *update,
652 			 int *old_ref_mod_ret)
653 {
654 	struct btrfs_delayed_ref_root *delayed_refs =
655 		&trans->transaction->delayed_refs;
656 	struct btrfs_fs_info *fs_info = trans->fs_info;
657 	int old_ref_mod;
658 
659 	BUG_ON(existing->is_data != update->is_data);
660 
661 	spin_lock(&existing->lock);
662 	if (update->must_insert_reserved) {
663 		/* if the extent was freed and then
664 		 * reallocated before the delayed ref
665 		 * entries were processed, we can end up
666 		 * with an existing head ref without
667 		 * the must_insert_reserved flag set.
668 		 * Set it again here
669 		 */
670 		existing->must_insert_reserved = update->must_insert_reserved;
671 
672 		/*
673 		 * update the num_bytes so we make sure the accounting
674 		 * is done correctly
675 		 */
676 		existing->num_bytes = update->num_bytes;
677 
678 	}
679 
680 	if (update->extent_op) {
681 		if (!existing->extent_op) {
682 			existing->extent_op = update->extent_op;
683 		} else {
684 			if (update->extent_op->update_key) {
685 				memcpy(&existing->extent_op->key,
686 				       &update->extent_op->key,
687 				       sizeof(update->extent_op->key));
688 				existing->extent_op->update_key = true;
689 			}
690 			if (update->extent_op->update_flags) {
691 				existing->extent_op->flags_to_set |=
692 					update->extent_op->flags_to_set;
693 				existing->extent_op->update_flags = true;
694 			}
695 			btrfs_free_delayed_extent_op(update->extent_op);
696 		}
697 	}
698 	/*
699 	 * update the reference mod on the head to reflect this new operation,
700 	 * only need the lock for this case cause we could be processing it
701 	 * currently, for refs we just added we know we're a-ok.
702 	 */
703 	old_ref_mod = existing->total_ref_mod;
704 	if (old_ref_mod_ret)
705 		*old_ref_mod_ret = old_ref_mod;
706 	existing->ref_mod += update->ref_mod;
707 	existing->total_ref_mod += update->ref_mod;
708 
709 	/*
710 	 * If we are going to from a positive ref mod to a negative or vice
711 	 * versa we need to make sure to adjust pending_csums accordingly.
712 	 */
713 	if (existing->is_data) {
714 		u64 csum_leaves =
715 			btrfs_csum_bytes_to_leaves(fs_info,
716 						   existing->num_bytes);
717 
718 		if (existing->total_ref_mod >= 0 && old_ref_mod < 0) {
719 			delayed_refs->pending_csums -= existing->num_bytes;
720 			btrfs_delayed_refs_rsv_release(fs_info, csum_leaves);
721 		}
722 		if (existing->total_ref_mod < 0 && old_ref_mod >= 0) {
723 			delayed_refs->pending_csums += existing->num_bytes;
724 			trans->delayed_ref_updates += csum_leaves;
725 		}
726 	}
727 	spin_unlock(&existing->lock);
728 }
729 
730 static void init_delayed_ref_head(struct btrfs_delayed_ref_head *head_ref,
731 				  struct btrfs_qgroup_extent_record *qrecord,
732 				  u64 bytenr, u64 num_bytes, u64 ref_root,
733 				  u64 reserved, int action, bool is_data,
734 				  bool is_system)
735 {
736 	int count_mod = 1;
737 	int must_insert_reserved = 0;
738 
739 	/* If reserved is provided, it must be a data extent. */
740 	BUG_ON(!is_data && reserved);
741 
742 	/*
743 	 * The head node stores the sum of all the mods, so dropping a ref
744 	 * should drop the sum in the head node by one.
745 	 */
746 	if (action == BTRFS_UPDATE_DELAYED_HEAD)
747 		count_mod = 0;
748 	else if (action == BTRFS_DROP_DELAYED_REF)
749 		count_mod = -1;
750 
751 	/*
752 	 * BTRFS_ADD_DELAYED_EXTENT means that we need to update the reserved
753 	 * accounting when the extent is finally added, or if a later
754 	 * modification deletes the delayed ref without ever inserting the
755 	 * extent into the extent allocation tree.  ref->must_insert_reserved
756 	 * is the flag used to record that accounting mods are required.
757 	 *
758 	 * Once we record must_insert_reserved, switch the action to
759 	 * BTRFS_ADD_DELAYED_REF because other special casing is not required.
760 	 */
761 	if (action == BTRFS_ADD_DELAYED_EXTENT)
762 		must_insert_reserved = 1;
763 	else
764 		must_insert_reserved = 0;
765 
766 	refcount_set(&head_ref->refs, 1);
767 	head_ref->bytenr = bytenr;
768 	head_ref->num_bytes = num_bytes;
769 	head_ref->ref_mod = count_mod;
770 	head_ref->must_insert_reserved = must_insert_reserved;
771 	head_ref->is_data = is_data;
772 	head_ref->is_system = is_system;
773 	head_ref->ref_tree = RB_ROOT_CACHED;
774 	INIT_LIST_HEAD(&head_ref->ref_add_list);
775 	RB_CLEAR_NODE(&head_ref->href_node);
776 	head_ref->processing = 0;
777 	head_ref->total_ref_mod = count_mod;
778 	spin_lock_init(&head_ref->lock);
779 	mutex_init(&head_ref->mutex);
780 
781 	if (qrecord) {
782 		if (ref_root && reserved) {
783 			qrecord->data_rsv = reserved;
784 			qrecord->data_rsv_refroot = ref_root;
785 		}
786 		qrecord->bytenr = bytenr;
787 		qrecord->num_bytes = num_bytes;
788 		qrecord->old_roots = NULL;
789 	}
790 }
791 
792 /*
793  * helper function to actually insert a head node into the rbtree.
794  * this does all the dirty work in terms of maintaining the correct
795  * overall modification count.
796  */
797 static noinline struct btrfs_delayed_ref_head *
798 add_delayed_ref_head(struct btrfs_trans_handle *trans,
799 		     struct btrfs_delayed_ref_head *head_ref,
800 		     struct btrfs_qgroup_extent_record *qrecord,
801 		     int action, int *qrecord_inserted_ret,
802 		     int *old_ref_mod, int *new_ref_mod)
803 {
804 	struct btrfs_delayed_ref_head *existing;
805 	struct btrfs_delayed_ref_root *delayed_refs;
806 	int qrecord_inserted = 0;
807 
808 	delayed_refs = &trans->transaction->delayed_refs;
809 
810 	/* Record qgroup extent info if provided */
811 	if (qrecord) {
812 		if (btrfs_qgroup_trace_extent_nolock(trans->fs_info,
813 					delayed_refs, qrecord))
814 			kfree(qrecord);
815 		else
816 			qrecord_inserted = 1;
817 	}
818 
819 	trace_add_delayed_ref_head(trans->fs_info, head_ref, action);
820 
821 	existing = htree_insert(&delayed_refs->href_root,
822 				&head_ref->href_node);
823 	if (existing) {
824 		update_existing_head_ref(trans, existing, head_ref,
825 					 old_ref_mod);
826 		/*
827 		 * we've updated the existing ref, free the newly
828 		 * allocated ref
829 		 */
830 		kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref);
831 		head_ref = existing;
832 	} else {
833 		if (old_ref_mod)
834 			*old_ref_mod = 0;
835 		if (head_ref->is_data && head_ref->ref_mod < 0) {
836 			delayed_refs->pending_csums += head_ref->num_bytes;
837 			trans->delayed_ref_updates +=
838 				btrfs_csum_bytes_to_leaves(trans->fs_info,
839 							   head_ref->num_bytes);
840 		}
841 		delayed_refs->num_heads++;
842 		delayed_refs->num_heads_ready++;
843 		atomic_inc(&delayed_refs->num_entries);
844 		trans->delayed_ref_updates++;
845 	}
846 	if (qrecord_inserted_ret)
847 		*qrecord_inserted_ret = qrecord_inserted;
848 	if (new_ref_mod)
849 		*new_ref_mod = head_ref->total_ref_mod;
850 
851 	return head_ref;
852 }
853 
854 /*
855  * init_delayed_ref_common - Initialize the structure which represents a
856  *			     modification to a an extent.
857  *
858  * @fs_info:    Internal to the mounted filesystem mount structure.
859  *
860  * @ref:	The structure which is going to be initialized.
861  *
862  * @bytenr:	The logical address of the extent for which a modification is
863  *		going to be recorded.
864  *
865  * @num_bytes:  Size of the extent whose modification is being recorded.
866  *
867  * @ref_root:	The id of the root where this modification has originated, this
868  *		can be either one of the well-known metadata trees or the
869  *		subvolume id which references this extent.
870  *
871  * @action:	Can be one of BTRFS_ADD_DELAYED_REF/BTRFS_DROP_DELAYED_REF or
872  *		BTRFS_ADD_DELAYED_EXTENT
873  *
874  * @ref_type:	Holds the type of the extent which is being recorded, can be
875  *		one of BTRFS_SHARED_BLOCK_REF_KEY/BTRFS_TREE_BLOCK_REF_KEY
876  *		when recording a metadata extent or BTRFS_SHARED_DATA_REF_KEY/
877  *		BTRFS_EXTENT_DATA_REF_KEY when recording data extent
878  */
879 static void init_delayed_ref_common(struct btrfs_fs_info *fs_info,
880 				    struct btrfs_delayed_ref_node *ref,
881 				    u64 bytenr, u64 num_bytes, u64 ref_root,
882 				    int action, u8 ref_type)
883 {
884 	u64 seq = 0;
885 
886 	if (action == BTRFS_ADD_DELAYED_EXTENT)
887 		action = BTRFS_ADD_DELAYED_REF;
888 
889 	if (is_fstree(ref_root))
890 		seq = atomic64_read(&fs_info->tree_mod_seq);
891 
892 	refcount_set(&ref->refs, 1);
893 	ref->bytenr = bytenr;
894 	ref->num_bytes = num_bytes;
895 	ref->ref_mod = 1;
896 	ref->action = action;
897 	ref->is_head = 0;
898 	ref->in_tree = 1;
899 	ref->seq = seq;
900 	ref->type = ref_type;
901 	RB_CLEAR_NODE(&ref->ref_node);
902 	INIT_LIST_HEAD(&ref->add_list);
903 }
904 
905 /*
906  * add a delayed tree ref.  This does all of the accounting required
907  * to make sure the delayed ref is eventually processed before this
908  * transaction commits.
909  */
910 int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
911 			       struct btrfs_ref *generic_ref,
912 			       struct btrfs_delayed_extent_op *extent_op,
913 			       int *old_ref_mod, int *new_ref_mod)
914 {
915 	struct btrfs_fs_info *fs_info = trans->fs_info;
916 	struct btrfs_delayed_tree_ref *ref;
917 	struct btrfs_delayed_ref_head *head_ref;
918 	struct btrfs_delayed_ref_root *delayed_refs;
919 	struct btrfs_qgroup_extent_record *record = NULL;
920 	int qrecord_inserted;
921 	bool is_system;
922 	int action = generic_ref->action;
923 	int level = generic_ref->tree_ref.level;
924 	int ret;
925 	u64 bytenr = generic_ref->bytenr;
926 	u64 num_bytes = generic_ref->len;
927 	u64 parent = generic_ref->parent;
928 	u8 ref_type;
929 
930 	is_system = (generic_ref->real_root == BTRFS_CHUNK_TREE_OBJECTID);
931 
932 	ASSERT(generic_ref->type == BTRFS_REF_METADATA && generic_ref->action);
933 	BUG_ON(extent_op && extent_op->is_data);
934 	ref = kmem_cache_alloc(btrfs_delayed_tree_ref_cachep, GFP_NOFS);
935 	if (!ref)
936 		return -ENOMEM;
937 
938 	head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS);
939 	if (!head_ref) {
940 		kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref);
941 		return -ENOMEM;
942 	}
943 
944 	if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) &&
945 	    is_fstree(generic_ref->real_root) &&
946 	    is_fstree(generic_ref->tree_ref.root) &&
947 	    !generic_ref->skip_qgroup) {
948 		record = kzalloc(sizeof(*record), GFP_NOFS);
949 		if (!record) {
950 			kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref);
951 			kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref);
952 			return -ENOMEM;
953 		}
954 	}
955 
956 	if (parent)
957 		ref_type = BTRFS_SHARED_BLOCK_REF_KEY;
958 	else
959 		ref_type = BTRFS_TREE_BLOCK_REF_KEY;
960 
961 	init_delayed_ref_common(fs_info, &ref->node, bytenr, num_bytes,
962 				generic_ref->tree_ref.root, action, ref_type);
963 	ref->root = generic_ref->tree_ref.root;
964 	ref->parent = parent;
965 	ref->level = level;
966 
967 	init_delayed_ref_head(head_ref, record, bytenr, num_bytes,
968 			      generic_ref->tree_ref.root, 0, action, false,
969 			      is_system);
970 	head_ref->extent_op = extent_op;
971 
972 	delayed_refs = &trans->transaction->delayed_refs;
973 	spin_lock(&delayed_refs->lock);
974 
975 	/*
976 	 * insert both the head node and the new ref without dropping
977 	 * the spin lock
978 	 */
979 	head_ref = add_delayed_ref_head(trans, head_ref, record,
980 					action, &qrecord_inserted,
981 					old_ref_mod, new_ref_mod);
982 
983 	ret = insert_delayed_ref(trans, delayed_refs, head_ref, &ref->node);
984 	spin_unlock(&delayed_refs->lock);
985 
986 	/*
987 	 * Need to update the delayed_refs_rsv with any changes we may have
988 	 * made.
989 	 */
990 	btrfs_update_delayed_refs_rsv(trans);
991 
992 	trace_add_delayed_tree_ref(fs_info, &ref->node, ref,
993 				   action == BTRFS_ADD_DELAYED_EXTENT ?
994 				   BTRFS_ADD_DELAYED_REF : action);
995 	if (ret > 0)
996 		kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref);
997 
998 	if (qrecord_inserted)
999 		btrfs_qgroup_trace_extent_post(fs_info, record);
1000 
1001 	return 0;
1002 }
1003 
1004 /*
1005  * add a delayed data ref. it's similar to btrfs_add_delayed_tree_ref.
1006  */
1007 int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
1008 			       struct btrfs_ref *generic_ref,
1009 			       u64 reserved, int *old_ref_mod,
1010 			       int *new_ref_mod)
1011 {
1012 	struct btrfs_fs_info *fs_info = trans->fs_info;
1013 	struct btrfs_delayed_data_ref *ref;
1014 	struct btrfs_delayed_ref_head *head_ref;
1015 	struct btrfs_delayed_ref_root *delayed_refs;
1016 	struct btrfs_qgroup_extent_record *record = NULL;
1017 	int qrecord_inserted;
1018 	int action = generic_ref->action;
1019 	int ret;
1020 	u64 bytenr = generic_ref->bytenr;
1021 	u64 num_bytes = generic_ref->len;
1022 	u64 parent = generic_ref->parent;
1023 	u64 ref_root = generic_ref->data_ref.ref_root;
1024 	u64 owner = generic_ref->data_ref.ino;
1025 	u64 offset = generic_ref->data_ref.offset;
1026 	u8 ref_type;
1027 
1028 	ASSERT(generic_ref->type == BTRFS_REF_DATA && action);
1029 	ref = kmem_cache_alloc(btrfs_delayed_data_ref_cachep, GFP_NOFS);
1030 	if (!ref)
1031 		return -ENOMEM;
1032 
1033 	if (parent)
1034 	        ref_type = BTRFS_SHARED_DATA_REF_KEY;
1035 	else
1036 	        ref_type = BTRFS_EXTENT_DATA_REF_KEY;
1037 	init_delayed_ref_common(fs_info, &ref->node, bytenr, num_bytes,
1038 				ref_root, action, ref_type);
1039 	ref->root = ref_root;
1040 	ref->parent = parent;
1041 	ref->objectid = owner;
1042 	ref->offset = offset;
1043 
1044 
1045 	head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS);
1046 	if (!head_ref) {
1047 		kmem_cache_free(btrfs_delayed_data_ref_cachep, ref);
1048 		return -ENOMEM;
1049 	}
1050 
1051 	if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) &&
1052 	    is_fstree(ref_root) &&
1053 	    is_fstree(generic_ref->real_root) &&
1054 	    !generic_ref->skip_qgroup) {
1055 		record = kzalloc(sizeof(*record), GFP_NOFS);
1056 		if (!record) {
1057 			kmem_cache_free(btrfs_delayed_data_ref_cachep, ref);
1058 			kmem_cache_free(btrfs_delayed_ref_head_cachep,
1059 					head_ref);
1060 			return -ENOMEM;
1061 		}
1062 	}
1063 
1064 	init_delayed_ref_head(head_ref, record, bytenr, num_bytes, ref_root,
1065 			      reserved, action, true, false);
1066 	head_ref->extent_op = NULL;
1067 
1068 	delayed_refs = &trans->transaction->delayed_refs;
1069 	spin_lock(&delayed_refs->lock);
1070 
1071 	/*
1072 	 * insert both the head node and the new ref without dropping
1073 	 * the spin lock
1074 	 */
1075 	head_ref = add_delayed_ref_head(trans, head_ref, record,
1076 					action, &qrecord_inserted,
1077 					old_ref_mod, new_ref_mod);
1078 
1079 	ret = insert_delayed_ref(trans, delayed_refs, head_ref, &ref->node);
1080 	spin_unlock(&delayed_refs->lock);
1081 
1082 	/*
1083 	 * Need to update the delayed_refs_rsv with any changes we may have
1084 	 * made.
1085 	 */
1086 	btrfs_update_delayed_refs_rsv(trans);
1087 
1088 	trace_add_delayed_data_ref(trans->fs_info, &ref->node, ref,
1089 				   action == BTRFS_ADD_DELAYED_EXTENT ?
1090 				   BTRFS_ADD_DELAYED_REF : action);
1091 	if (ret > 0)
1092 		kmem_cache_free(btrfs_delayed_data_ref_cachep, ref);
1093 
1094 
1095 	if (qrecord_inserted)
1096 		return btrfs_qgroup_trace_extent_post(fs_info, record);
1097 	return 0;
1098 }
1099 
1100 int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans,
1101 				u64 bytenr, u64 num_bytes,
1102 				struct btrfs_delayed_extent_op *extent_op)
1103 {
1104 	struct btrfs_delayed_ref_head *head_ref;
1105 	struct btrfs_delayed_ref_root *delayed_refs;
1106 
1107 	head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS);
1108 	if (!head_ref)
1109 		return -ENOMEM;
1110 
1111 	init_delayed_ref_head(head_ref, NULL, bytenr, num_bytes, 0, 0,
1112 			      BTRFS_UPDATE_DELAYED_HEAD, extent_op->is_data,
1113 			      false);
1114 	head_ref->extent_op = extent_op;
1115 
1116 	delayed_refs = &trans->transaction->delayed_refs;
1117 	spin_lock(&delayed_refs->lock);
1118 
1119 	add_delayed_ref_head(trans, head_ref, NULL, BTRFS_UPDATE_DELAYED_HEAD,
1120 			     NULL, NULL, NULL);
1121 
1122 	spin_unlock(&delayed_refs->lock);
1123 
1124 	/*
1125 	 * Need to update the delayed_refs_rsv with any changes we may have
1126 	 * made.
1127 	 */
1128 	btrfs_update_delayed_refs_rsv(trans);
1129 	return 0;
1130 }
1131 
1132 /*
1133  * This does a simple search for the head node for a given extent.  Returns the
1134  * head node if found, or NULL if not.
1135  */
1136 struct btrfs_delayed_ref_head *
1137 btrfs_find_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs, u64 bytenr)
1138 {
1139 	lockdep_assert_held(&delayed_refs->lock);
1140 
1141 	return find_ref_head(delayed_refs, bytenr, false);
1142 }
1143 
1144 void __cold btrfs_delayed_ref_exit(void)
1145 {
1146 	kmem_cache_destroy(btrfs_delayed_ref_head_cachep);
1147 	kmem_cache_destroy(btrfs_delayed_tree_ref_cachep);
1148 	kmem_cache_destroy(btrfs_delayed_data_ref_cachep);
1149 	kmem_cache_destroy(btrfs_delayed_extent_op_cachep);
1150 }
1151 
1152 int __init btrfs_delayed_ref_init(void)
1153 {
1154 	btrfs_delayed_ref_head_cachep = kmem_cache_create(
1155 				"btrfs_delayed_ref_head",
1156 				sizeof(struct btrfs_delayed_ref_head), 0,
1157 				SLAB_MEM_SPREAD, NULL);
1158 	if (!btrfs_delayed_ref_head_cachep)
1159 		goto fail;
1160 
1161 	btrfs_delayed_tree_ref_cachep = kmem_cache_create(
1162 				"btrfs_delayed_tree_ref",
1163 				sizeof(struct btrfs_delayed_tree_ref), 0,
1164 				SLAB_MEM_SPREAD, NULL);
1165 	if (!btrfs_delayed_tree_ref_cachep)
1166 		goto fail;
1167 
1168 	btrfs_delayed_data_ref_cachep = kmem_cache_create(
1169 				"btrfs_delayed_data_ref",
1170 				sizeof(struct btrfs_delayed_data_ref), 0,
1171 				SLAB_MEM_SPREAD, NULL);
1172 	if (!btrfs_delayed_data_ref_cachep)
1173 		goto fail;
1174 
1175 	btrfs_delayed_extent_op_cachep = kmem_cache_create(
1176 				"btrfs_delayed_extent_op",
1177 				sizeof(struct btrfs_delayed_extent_op), 0,
1178 				SLAB_MEM_SPREAD, NULL);
1179 	if (!btrfs_delayed_extent_op_cachep)
1180 		goto fail;
1181 
1182 	return 0;
1183 fail:
1184 	btrfs_delayed_ref_exit();
1185 	return -ENOMEM;
1186 }
1187