xref: /openbmc/linux/fs/btrfs/delayed-ref.c (revision 82ced6fd)
1 /*
2  * Copyright (C) 2009 Oracle.  All rights reserved.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public
6  * License v2 as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful,
9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public
14  * License along with this program; if not, write to the
15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16  * Boston, MA 021110-1307, USA.
17  */

#include <linux/sched.h>
#include <linux/sort.h>
#include "ctree.h"
#include "delayed-ref.h"
#include "transaction.h"

/*
 * delayed back reference update tracking.  For subvolume trees
 * we queue up extent allocations and backref maintenance for
 * delayed processing.  This avoids deep call chains where we
 * add extents in the middle of btrfs_search_slot, and it allows
 * us to buffer up frequently modified backrefs in an rb tree instead
 * of hammering updates on the extent allocation tree.
 *
 * Right now this code is only used for reference counted trees, but
 * the long term goal is to get rid of the similar code for delayed
 * extent tree modifications.
 */

/*
 * entries in the rb tree are ordered by the byte number of the extent
 * and by the byte number of the parent block.
 */
static int comp_entry(struct btrfs_delayed_ref_node *ref,
		      u64 bytenr, u64 parent)
{
	if (bytenr < ref->bytenr)
		return -1;
	if (bytenr > ref->bytenr)
		return 1;
	if (parent < ref->parent)
		return -1;
	if (parent > ref->parent)
		return 1;
	return 0;
}
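
/*
 * For illustration (hypothetical byte numbers): with the ordering
 * above, (bytenr 4096, parent 0) sorts before (bytenr 4096,
 * parent 8192), which sorts before (bytenr 8192, parent 0).
 * bytenr is compared first; parent only breaks ties.
 */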

/*
 * insert a new ref into the rbtree.  This returns the existing ref
 * for the same (bytenr, parent) tuple if one is already present, or
 * NULL if the new node was inserted.
 */
static struct btrfs_delayed_ref_node *tree_insert(struct rb_root *root,
						  u64 bytenr, u64 parent,
						  struct rb_node *node)
{
	struct rb_node **p = &root->rb_node;
	struct rb_node *parent_node = NULL;
	struct btrfs_delayed_ref_node *entry;
	int cmp;

	while (*p) {
		parent_node = *p;
		entry = rb_entry(parent_node, struct btrfs_delayed_ref_node,
				 rb_node);

		cmp = comp_entry(entry, bytenr, parent);
		if (cmp < 0)
			p = &(*p)->rb_left;
		else if (cmp > 0)
			p = &(*p)->rb_right;
		else
			return entry;
	}

	rb_link_node(node, parent_node, p);
	rb_insert_color(node, root);
	return NULL;
}

/*
 * find an entry based on (bytenr, parent).  This returns the delayed
 * ref if it was able to find one, or NULL if nothing was in that spot.
 * If 'last' is non-NULL it is set to the last entry examined during
 * the walk, which lets callers find the nearest ref when there is no
 * exact match.
 */
static struct btrfs_delayed_ref_node *tree_search(struct rb_root *root,
				  u64 bytenr, u64 parent,
				  struct btrfs_delayed_ref_node **last)
{
	struct rb_node *n = root->rb_node;
	struct btrfs_delayed_ref_node *entry;
	int cmp;

	while (n) {
		entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
		WARN_ON(!entry->in_tree);
		if (last)
			*last = entry;

		cmp = comp_entry(entry, bytenr, parent);
		if (cmp < 0)
			n = n->rb_left;
		else if (cmp > 0)
			n = n->rb_right;
		else
			return entry;
	}
	return NULL;
}
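
/*
 * Try to take the mutex on a delayed ref head.  The delayed ref
 * spinlock must be held on entry.  If the mutex is contended we grab
 * a reference on the head, drop the spinlock, block on the mutex and
 * then retake the spinlock.  The head may have been run and removed
 * from the tree while we slept, in which case -EAGAIN is returned so
 * the caller can restart; on success 0 is returned with the mutex
 * held.  A hypothetical caller (illustrative only):
 *
 *	ret = btrfs_delayed_ref_lock(trans, head);
 *	if (ret == -EAGAIN)
 *		head = NULL;	(pick a different head and retry)
 */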
int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans,
			   struct btrfs_delayed_ref_head *head)
{
	struct btrfs_delayed_ref_root *delayed_refs;

	delayed_refs = &trans->transaction->delayed_refs;
	assert_spin_locked(&delayed_refs->lock);
	if (mutex_trylock(&head->mutex))
		return 0;

	atomic_inc(&head->node.refs);
	spin_unlock(&delayed_refs->lock);

	mutex_lock(&head->mutex);
	spin_lock(&delayed_refs->lock);
	if (!head->node.in_tree) {
		mutex_unlock(&head->mutex);
		btrfs_put_delayed_ref(&head->node);
		return -EAGAIN;
	}
	btrfs_put_delayed_ref(&head->node);
	return 0;
}
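
/*
 * Gather a cluster of delayed ref heads for processing.  Starting at
 * or after 'start' (wrapping back to the beginning of the rbtree if
 * nothing is found past it), up to 32 heads that are not already on
 * another cluster are moved onto the 'cluster' list.  We stop early
 * when we run into a busy head, since the goal is to batch heads that
 * are likely to land in the same extent leaf.  Returns 0 if at least
 * one head was found, and 1 if there was nothing to cluster.  The
 * delayed ref spinlock is assumed to be held by the caller.
 */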
int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans,
			   struct list_head *cluster, u64 start)
{
	int count = 0;
	struct btrfs_delayed_ref_root *delayed_refs;
	struct rb_node *node;
	struct btrfs_delayed_ref_node *ref;
	struct btrfs_delayed_ref_head *head;

	delayed_refs = &trans->transaction->delayed_refs;
	if (start == 0) {
		node = rb_first(&delayed_refs->root);
	} else {
		ref = NULL;
		tree_search(&delayed_refs->root, start, (u64)-1, &ref);
		if (ref) {
			struct btrfs_delayed_ref_node *tmp;

			node = rb_prev(&ref->rb_node);
			while (node) {
				tmp = rb_entry(node,
					       struct btrfs_delayed_ref_node,
					       rb_node);
				if (tmp->bytenr < start)
					break;
				ref = tmp;
				node = rb_prev(&ref->rb_node);
			}
			node = &ref->rb_node;
		} else {
			node = rb_first(&delayed_refs->root);
		}
	}
again:
	while (node && count < 32) {
		ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
		if (btrfs_delayed_ref_is_head(ref)) {
			head = btrfs_delayed_node_to_head(ref);
			if (list_empty(&head->cluster)) {
				list_add_tail(&head->cluster, cluster);
				delayed_refs->run_delayed_start =
					head->node.bytenr;
				count++;

				WARN_ON(delayed_refs->num_heads_ready == 0);
				delayed_refs->num_heads_ready--;
			} else if (count) {
				/* the goal of the clustering is to find extents
				 * that are likely to end up in the same extent
				 * leaf on disk.  So, we don't want them spread
				 * all over the tree.  Stop now if we've hit
				 * a head that was already in use
				 */
				break;
			}
		}
		node = rb_next(node);
	}
	if (count) {
		return 0;
	} else if (start) {
		/*
		 * we've gone to the end of the rbtree without finding any
		 * clusters.  start from the beginning and try again
		 */
		start = 0;
		node = rb_first(&delayed_refs->root);
		goto again;
	}
	return 1;
}
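
/*
 * A hypothetical driver loop for the function above (illustrative
 * only):
 *
 *	LIST_HEAD(cluster);
 *
 *	spin_lock(&delayed_refs->lock);
 *	ret = btrfs_find_ref_cluster(trans, &cluster,
 *				     delayed_refs->run_delayed_start);
 *	if (ret == 0) {
 *		(process every head on the cluster list, then
 *		 come back for the next batch)
 *	}
 */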

/*
 * This checks to see if there are any delayed refs in the
 * btree for a given bytenr.  It returns 1 if it finds any
 * and 0 otherwise.
 *
 * If it only finds a head node, it returns 0.
 *
 * The idea is to use this when deciding if you can safely delete an
 * extent from the extent allocation tree.  There may be a pending
 * ref in the rbtree that adds or removes references, so as long as this
 * returns 1 you need to leave the BTRFS_EXTENT_ITEM in the extent
 * allocation tree.
 */
int btrfs_delayed_ref_pending(struct btrfs_trans_handle *trans, u64 bytenr)
{
	struct btrfs_delayed_ref_node *ref;
	struct btrfs_delayed_ref_root *delayed_refs;
	struct rb_node *prev_node;
	int ret = 0;

	delayed_refs = &trans->transaction->delayed_refs;
	spin_lock(&delayed_refs->lock);

	ref = tree_search(&delayed_refs->root, bytenr, (u64)-1, NULL);
	if (ref) {
		prev_node = rb_prev(&ref->rb_node);
		if (!prev_node)
			goto out;
		ref = rb_entry(prev_node, struct btrfs_delayed_ref_node,
			       rb_node);
		if (ref->bytenr == bytenr)
			ret = 1;
	}
out:
	spin_unlock(&delayed_refs->lock);
	return ret;
}
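
/*
 * Illustrative (hypothetical) use of the check above, before freeing
 * an extent item:
 *
 *	if (btrfs_delayed_ref_pending(trans, bytenr)) {
 *		(a queued ref add/drop still references this extent,
 *		 so the BTRFS_EXTENT_ITEM must stay in place)
 *	}
 */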

/*
 * helper function to look up the reference count of an extent
 *
 * the head node for a delayed ref is used to store the sum of all the
 * reference count modifications queued up in the rbtree.  This way you
 * can check to see what the reference count would be if all of the
 * delayed refs are processed.
 */
int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans,
			    struct btrfs_root *root, u64 bytenr,
			    u64 num_bytes, u32 *refs)
{
	struct btrfs_delayed_ref_node *ref;
	struct btrfs_delayed_ref_head *head;
	struct btrfs_delayed_ref_root *delayed_refs;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_extent_item *ei;
	struct btrfs_key key;
	u32 num_refs;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = bytenr;
	key.type = BTRFS_EXTENT_ITEM_KEY;
	key.offset = num_bytes;
	delayed_refs = &trans->transaction->delayed_refs;
again:
	ret = btrfs_search_slot(trans, root->fs_info->extent_root,
				&key, path, 0, 0);
	if (ret < 0)
		goto out_free;

	if (ret == 0) {
		leaf = path->nodes[0];
		ei = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_extent_item);
		num_refs = btrfs_extent_refs(leaf, ei);
	} else {
		num_refs = 0;
		ret = 0;
	}

	spin_lock(&delayed_refs->lock);
	ref = tree_search(&delayed_refs->root, bytenr, (u64)-1, NULL);
	if (ref) {
		head = btrfs_delayed_node_to_head(ref);
		if (mutex_trylock(&head->mutex)) {
			num_refs += ref->ref_mod;
			mutex_unlock(&head->mutex);
			*refs = num_refs;
			goto out;
		}

		atomic_inc(&ref->refs);
		spin_unlock(&delayed_refs->lock);

		btrfs_release_path(root->fs_info->extent_root, path);

		mutex_lock(&head->mutex);
		mutex_unlock(&head->mutex);
		btrfs_put_delayed_ref(ref);
		goto again;
	} else {
		*refs = num_refs;
	}
out:
	spin_unlock(&delayed_refs->lock);
out_free:
	btrfs_free_path(path);
	return ret;
}
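
/*
 * Worked example of the accounting above (illustrative numbers): if
 * the extent item on disk records 2 references and the delayed ref
 * head has a queued ref_mod of -1, *refs comes back as 1, the count
 * the extent will have once all delayed refs are run.
 */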

/*
 * helper function to update an extent delayed ref in the
 * rbtree.  existing and update must both have the same
 * bytenr and parent
 *
 * This may free existing if the update cancels out whatever
 * operation it was doing.
 */
static noinline void
update_existing_ref(struct btrfs_trans_handle *trans,
		    struct btrfs_delayed_ref_root *delayed_refs,
		    struct btrfs_delayed_ref_node *existing,
		    struct btrfs_delayed_ref_node *update)
{
	struct btrfs_delayed_ref *existing_ref;
	struct btrfs_delayed_ref *ref;

	existing_ref = btrfs_delayed_node_to_ref(existing);
	ref = btrfs_delayed_node_to_ref(update);

	if (ref->pin)
		existing_ref->pin = 1;

	if (ref->action != existing_ref->action) {
		/*
		 * this is effectively undoing either an add or a
		 * drop.  We decrement the ref_mod, and if it goes
		 * down to zero we just delete the entry without
		 * ever changing the extent allocation tree.
		 */
		existing->ref_mod--;
		if (existing->ref_mod == 0) {
			rb_erase(&existing->rb_node,
				 &delayed_refs->root);
			existing->in_tree = 0;
			btrfs_put_delayed_ref(existing);
			delayed_refs->num_entries--;
			if (trans->delayed_ref_updates)
				trans->delayed_ref_updates--;
		}
	} else {
		if (existing_ref->action == BTRFS_ADD_DELAYED_REF) {
			/* if we're adding refs, make sure all the
			 * details match up.  The extent could
			 * have been totally freed and reallocated
			 * by a different owner before the delayed
			 * ref entries were removed.
			 */
			existing_ref->owner_objectid = ref->owner_objectid;
			existing_ref->generation = ref->generation;
			existing_ref->root = ref->root;
			existing->num_bytes = update->num_bytes;
		}
		/*
		 * the action on the existing ref matches
		 * the action on the ref we're trying to add.
		 * Bump the ref_mod by one so the backref that
		 * is eventually added/removed has the correct
		 * reference count
		 */
		existing->ref_mod += update->ref_mod;
	}
}
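
/*
 * Worked example (illustrative): if an ADD for a (bytenr, parent)
 * tuple is queued with ref_mod == 1 and a DROP for the same tuple
 * arrives, the actions differ, ref_mod falls to 0 and the node is
 * erased.  The add and the drop cancel without the extent allocation
 * tree ever being touched.
 */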

/*
 * helper function to update the accounting in the head ref.
 * existing and update must have the same bytenr
 */
static noinline void
update_existing_head_ref(struct btrfs_delayed_ref_node *existing,
			 struct btrfs_delayed_ref_node *update)
{
	struct btrfs_delayed_ref_head *existing_ref;
	struct btrfs_delayed_ref_head *ref;

	existing_ref = btrfs_delayed_node_to_head(existing);
	ref = btrfs_delayed_node_to_head(update);

	if (ref->must_insert_reserved) {
		/* if the extent was freed and then
		 * reallocated before the delayed ref
		 * entries were processed, we can end up
		 * with an existing head ref without
		 * the must_insert_reserved flag set.
		 * Set it again here
		 */
		existing_ref->must_insert_reserved = ref->must_insert_reserved;

		/*
		 * update the num_bytes so we make sure the accounting
		 * is done correctly
		 */
		existing->num_bytes = update->num_bytes;
	}

	/*
	 * update the reference mod on the head to reflect this new operation
	 */
	existing->ref_mod += update->ref_mod;
}
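
/*
 * Worked example (illustrative): two queued adds and one queued drop
 * against the same bytenr leave the head with ref_mod == 1, the net
 * change that will eventually be applied to the on-disk extent item.
 */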

/*
 * helper function to actually insert a delayed ref into the rbtree.
 * this does all the dirty work in terms of maintaining the correct
 * overall modification count in the head node and properly dealing
 * with updating existing nodes as new modifications are queued.
 */
static noinline int __btrfs_add_delayed_ref(struct btrfs_trans_handle *trans,
			  struct btrfs_delayed_ref_node *ref,
			  u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root,
			  u64 ref_generation, u64 owner_objectid, int action,
			  int pin)
{
	struct btrfs_delayed_ref_node *existing;
	struct btrfs_delayed_ref *full_ref;
	struct btrfs_delayed_ref_head *head_ref = NULL;
	struct btrfs_delayed_ref_root *delayed_refs;
	int count_mod = 1;
	int must_insert_reserved = 0;

	/*
	 * the head node stores the sum of all the mods, so dropping a ref
	 * should drop the sum in the head node by one.
	 */
	if (parent == (u64)-1) {
		if (action == BTRFS_DROP_DELAYED_REF)
			count_mod = -1;
		else if (action == BTRFS_UPDATE_DELAYED_HEAD)
			count_mod = 0;
	}

	/*
	 * BTRFS_ADD_DELAYED_EXTENT means that we need to update
	 * the reserved accounting when the extent is finally added, or
	 * if a later modification deletes the delayed ref without ever
	 * inserting the extent into the extent allocation tree.
	 * ref->must_insert_reserved is the flag used to record
	 * that accounting mods are required.
	 *
	 * Once we record must_insert_reserved, switch the action to
	 * BTRFS_ADD_DELAYED_REF because other special casing is not required.
	 */
	if (action == BTRFS_ADD_DELAYED_EXTENT) {
		must_insert_reserved = 1;
		action = BTRFS_ADD_DELAYED_REF;
	} else {
		must_insert_reserved = 0;
	}

	delayed_refs = &trans->transaction->delayed_refs;

	/* first set the basic ref node struct up */
	atomic_set(&ref->refs, 1);
	ref->bytenr = bytenr;
	ref->parent = parent;
	ref->ref_mod = count_mod;
	ref->in_tree = 1;
	ref->num_bytes = num_bytes;

	if (btrfs_delayed_ref_is_head(ref)) {
		head_ref = btrfs_delayed_node_to_head(ref);
		head_ref->must_insert_reserved = must_insert_reserved;
		INIT_LIST_HEAD(&head_ref->cluster);
		mutex_init(&head_ref->mutex);
	} else {
		full_ref = btrfs_delayed_node_to_ref(ref);
		full_ref->root = ref_root;
		full_ref->generation = ref_generation;
		full_ref->owner_objectid = owner_objectid;
		full_ref->pin = pin;
		full_ref->action = action;
	}

	existing = tree_insert(&delayed_refs->root, bytenr,
			       parent, &ref->rb_node);

	if (existing) {
		if (btrfs_delayed_ref_is_head(ref))
			update_existing_head_ref(existing, ref);
		else
			update_existing_ref(trans, delayed_refs, existing, ref);

		/*
		 * we've updated the existing ref, free the newly
		 * allocated ref
		 */
		kfree(ref);
	} else {
		if (btrfs_delayed_ref_is_head(ref)) {
			delayed_refs->num_heads++;
			delayed_refs->num_heads_ready++;
		}
		delayed_refs->num_entries++;
		trans->delayed_ref_updates++;
	}
	return 0;
}

/*
 * add a delayed ref to the tree.  This does all of the accounting required
 * to make sure the delayed ref is eventually processed before this
 * transaction commits.
 */
int btrfs_add_delayed_ref(struct btrfs_trans_handle *trans,
			  u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root,
			  u64 ref_generation, u64 owner_objectid, int action,
			  int pin)
{
	struct btrfs_delayed_ref *ref;
	struct btrfs_delayed_ref_head *head_ref;
	struct btrfs_delayed_ref_root *delayed_refs;
	int ret;

	ref = kmalloc(sizeof(*ref), GFP_NOFS);
	if (!ref)
		return -ENOMEM;

	/*
	 * the parent = 0 case comes from cases where we don't actually
	 * know the parent yet.  It will get updated later via an add/drop
	 * pair.
	 */
	if (parent == 0)
		parent = bytenr;

	head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS);
	if (!head_ref) {
		kfree(ref);
		return -ENOMEM;
	}
	delayed_refs = &trans->transaction->delayed_refs;
	spin_lock(&delayed_refs->lock);

	/*
	 * insert both the head node and the new ref without dropping
	 * the spin lock
	 */
	ret = __btrfs_add_delayed_ref(trans, &head_ref->node, bytenr, num_bytes,
				      (u64)-1, 0, 0, 0, action, pin);
	BUG_ON(ret);

	ret = __btrfs_add_delayed_ref(trans, &ref->node, bytenr, num_bytes,
				      parent, ref_root, ref_generation,
				      owner_objectid, action, pin);
	BUG_ON(ret);
	spin_unlock(&delayed_refs->lock);
	return 0;
}
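
/*
 * Illustrative call (hypothetical values), queueing a reference add
 * for a 4K extent:
 *
 *	ret = btrfs_add_delayed_ref(trans, bytenr, 4096, parent,
 *				    ref_root, ref_generation,
 *				    owner_objectid,
 *				    BTRFS_ADD_DELAYED_REF, 0);
 */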

/*
 * this does a simple search for the head node for a given extent.
 * It must be called with the delayed ref spinlock held, and it returns
 * the head node if one was found, or NULL if not.
 */
struct btrfs_delayed_ref_head *
btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr)
{
	struct btrfs_delayed_ref_node *ref;
	struct btrfs_delayed_ref_root *delayed_refs;

	delayed_refs = &trans->transaction->delayed_refs;
	ref = tree_search(&delayed_refs->root, bytenr, (u64)-1, NULL);
	if (ref)
		return btrfs_delayed_node_to_head(ref);
	return NULL;
}
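
/*
 * Illustrative (hypothetical) lookup; the delayed ref spinlock must
 * be held across the call and for as long as the head is used:
 *
 *	spin_lock(&delayed_refs->lock);
 *	head = btrfs_find_delayed_ref_head(trans, bytenr);
 *	if (head)
 *		(inspect the pending mods queued against bytenr)
 *	spin_unlock(&delayed_refs->lock);
 */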

/*
 * add a delayed ref to the tree.  This does all of the accounting required
 * to make sure the delayed ref is eventually processed before this
 * transaction commits.
 *
 * The main point of this call is to add and remove a backreference in a single
 * shot, taking the lock only once, and only searching for the head node once.
 *
 * It is the same as doing a ref add and delete in two separate calls.
 */
int btrfs_update_delayed_ref(struct btrfs_trans_handle *trans,
			  u64 bytenr, u64 num_bytes, u64 orig_parent,
			  u64 parent, u64 orig_ref_root, u64 ref_root,
			  u64 orig_ref_generation, u64 ref_generation,
			  u64 owner_objectid, int pin)
{
	struct btrfs_delayed_ref *ref;
	struct btrfs_delayed_ref *old_ref;
	struct btrfs_delayed_ref_head *head_ref;
	struct btrfs_delayed_ref_root *delayed_refs;
	int ret;

	ref = kmalloc(sizeof(*ref), GFP_NOFS);
	if (!ref)
		return -ENOMEM;

	old_ref = kmalloc(sizeof(*old_ref), GFP_NOFS);
	if (!old_ref) {
		kfree(ref);
		return -ENOMEM;
	}

	/*
	 * the parent = 0 case comes from cases where we don't actually
	 * know the parent yet.  It will get updated later via an add/drop
	 * pair.
	 */
	if (parent == 0)
		parent = bytenr;
	if (orig_parent == 0)
		orig_parent = bytenr;

	head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS);
	if (!head_ref) {
		kfree(ref);
		kfree(old_ref);
		return -ENOMEM;
	}
	delayed_refs = &trans->transaction->delayed_refs;
	spin_lock(&delayed_refs->lock);

	/*
	 * insert both the head node and the new ref without dropping
	 * the spin lock
	 */
	ret = __btrfs_add_delayed_ref(trans, &head_ref->node, bytenr, num_bytes,
				      (u64)-1, 0, 0, 0,
				      BTRFS_UPDATE_DELAYED_HEAD, 0);
	BUG_ON(ret);

	ret = __btrfs_add_delayed_ref(trans, &ref->node, bytenr, num_bytes,
				      parent, ref_root, ref_generation,
				      owner_objectid, BTRFS_ADD_DELAYED_REF, 0);
	BUG_ON(ret);

	ret = __btrfs_add_delayed_ref(trans, &old_ref->node, bytenr, num_bytes,
				      orig_parent, orig_ref_root,
				      orig_ref_generation, owner_objectid,
				      BTRFS_DROP_DELAYED_REF, pin);
	BUG_ON(ret);
	spin_unlock(&delayed_refs->lock);
	return 0;
}
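
/*
 * Illustrative equivalence (same arguments as above): the call does
 * in one locked pass what these two separate calls would do:
 *
 *	btrfs_add_delayed_ref(trans, bytenr, num_bytes, parent,
 *			      ref_root, ref_generation, owner_objectid,
 *			      BTRFS_ADD_DELAYED_REF, 0);
 *	btrfs_add_delayed_ref(trans, bytenr, num_bytes, orig_parent,
 *			      orig_ref_root, orig_ref_generation,
 *			      owner_objectid, BTRFS_DROP_DELAYED_REF, pin);
 */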
669