// SPDX-License-Identifier: GPL-2.0

#include "messages.h"
#include "tree-mod-log.h"
#include "disk-io.h"
#include "fs.h"
#include "accessors.h"

struct tree_mod_root {
	u64 logical;
	u8 level;
};

struct tree_mod_elem {
	struct rb_node node;
	u64 logical;
	u64 seq;
	enum btrfs_mod_log_op op;

	/*
	 * This is used for BTRFS_MOD_LOG_KEY_* and BTRFS_MOD_LOG_MOVE_KEYS
	 * operations.
	 */
	int slot;

	/* This is used for BTRFS_MOD_LOG_KEY_* and BTRFS_MOD_LOG_ROOT_REPLACE. */
	u64 generation;

	/* These are used for op == BTRFS_MOD_LOG_KEY_{REPLACE,REMOVE}. */
	struct btrfs_disk_key key;
	u64 blockptr;

	/* This is used for op == BTRFS_MOD_LOG_MOVE_KEYS. */
	struct {
		int dst_slot;
		int nr_items;
	} move;

	/* This is used for op == BTRFS_MOD_LOG_ROOT_REPLACE. */
	struct tree_mod_root old_root;
};

/*
 * Pull a new tree mod seq number for our operation.
 */
static inline u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info)
{
	return atomic64_inc_return(&fs_info->tree_mod_seq);
}
/*
 * This adds a new blocker to the tree mod log's blocker list if the @elem
 * passed does not already have a sequence number set. So when a caller expects
 * to record tree modifications, it must set elem->seq to zero before calling
 * btrfs_get_tree_mod_seq.
 * Returns a fresh, unused tree mod log sequence number, even if no new blocker
 * was added.
 */
u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
			   struct btrfs_seq_list *elem)
{
	write_lock(&fs_info->tree_mod_log_lock);
	if (!elem->seq) {
		elem->seq = btrfs_inc_tree_mod_seq(fs_info);
		list_add_tail(&elem->list, &fs_info->tree_mod_seq_list);
		set_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags);
	}
	write_unlock(&fs_info->tree_mod_log_lock);

	return elem->seq;
}
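
/*
 * Illustrative usage sketch (not taken verbatim from any caller); it assumes
 * BTRFS_SEQ_LIST_INIT() initializes elem->seq to zero:
 *
 *	struct btrfs_seq_list elem = BTRFS_SEQ_LIST_INIT(elem);
 *	u64 seq;
 *
 *	seq = btrfs_get_tree_mod_seq(fs_info, &elem);
 *	... search or rewind using 'seq' as the time_seq value ...
 *	btrfs_put_tree_mod_seq(fs_info, &elem);
 */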

void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
			    struct btrfs_seq_list *elem)
{
	struct rb_root *tm_root;
	struct rb_node *node;
	struct rb_node *next;
	struct tree_mod_elem *tm;
	u64 min_seq = BTRFS_SEQ_LAST;
	u64 seq_putting = elem->seq;

	if (!seq_putting)
		return;

	write_lock(&fs_info->tree_mod_log_lock);
	list_del(&elem->list);
	elem->seq = 0;

	if (list_empty(&fs_info->tree_mod_seq_list)) {
		clear_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags);
	} else {
		struct btrfs_seq_list *first;

		first = list_first_entry(&fs_info->tree_mod_seq_list,
					 struct btrfs_seq_list, list);
		if (seq_putting > first->seq) {
			/*
			 * Blocker with lower sequence number exists, we cannot
			 * remove anything from the log.
			 */
			write_unlock(&fs_info->tree_mod_log_lock);
			return;
		}
		min_seq = first->seq;
	}

	/*
	 * Anything that's lower than the lowest existing (read: blocked)
	 * sequence number can be removed from the tree.
	 */
	tm_root = &fs_info->tree_mod_log;
	for (node = rb_first(tm_root); node; node = next) {
		next = rb_next(node);
		tm = rb_entry(node, struct tree_mod_elem, node);
		if (tm->seq >= min_seq)
			continue;
		rb_erase(node, tm_root);
		kfree(tm);
	}
	write_unlock(&fs_info->tree_mod_log_lock);
}

/*
 * Key order of the log:
 *       node/leaf start address -> sequence
 *
 * The 'start address' is the logical address of the *new* root node for root
 * replace operations, or the logical address of the affected block for all
 * other operations.
 */
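
/*
 * Note: the comparisons below link larger (logical, seq) keys to the left,
 * so the rb tree is kept in inverted order, and __tree_mod_log_search() walks
 * it with the same inverted comparisons.
 */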
static noinline int tree_mod_log_insert(struct btrfs_fs_info *fs_info,
					struct tree_mod_elem *tm)
{
	struct rb_root *tm_root;
	struct rb_node **new;
	struct rb_node *parent = NULL;
	struct tree_mod_elem *cur;

	lockdep_assert_held_write(&fs_info->tree_mod_log_lock);

	tm->seq = btrfs_inc_tree_mod_seq(fs_info);

	tm_root = &fs_info->tree_mod_log;
	new = &tm_root->rb_node;
	while (*new) {
		cur = rb_entry(*new, struct tree_mod_elem, node);
		parent = *new;
		if (cur->logical < tm->logical)
			new = &((*new)->rb_left);
		else if (cur->logical > tm->logical)
			new = &((*new)->rb_right);
		else if (cur->seq < tm->seq)
			new = &((*new)->rb_left);
		else if (cur->seq > tm->seq)
			new = &((*new)->rb_right);
		else
			return -EEXIST;
	}

	rb_link_node(&tm->node, parent, new);
	rb_insert_color(&tm->node, tm_root);
	return 0;
}

/*
 * Determines if logging can be omitted. Returns true if it can. Otherwise, it
 * returns false with the tree_mod_log_lock acquired. The caller must hold
 * this until all tree mod log insertions are recorded in the rb tree and then
 * write unlock fs_info::tree_mod_log_lock.
 */
static inline bool tree_mod_dont_log(struct btrfs_fs_info *fs_info,
				    struct extent_buffer *eb)
{
	if (!test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags))
		return true;
	if (eb && btrfs_header_level(eb) == 0)
		return true;

	write_lock(&fs_info->tree_mod_log_lock);
	if (list_empty(&fs_info->tree_mod_seq_list)) {
		write_unlock(&fs_info->tree_mod_log_lock);
		return true;
	}

	return false;
}

/* Similar to tree_mod_dont_log, but doesn't acquire any locks. */
static inline bool tree_mod_need_log(const struct btrfs_fs_info *fs_info,
				    struct extent_buffer *eb)
{
	if (!test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags))
		return false;
	if (eb && btrfs_header_level(eb) == 0)
		return false;

	return true;
}
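
/*
 * The insertion helpers below all follow the same pattern (a sketch, not a
 * verbatim copy of any one of them):
 *
 *	if (!tree_mod_need_log(fs_info, eb))       <- lockless pre-check
 *		return 0;
 *	tm = alloc_tree_mod_elem(...);             <- allocate without the lock
 *	if (tree_mod_dont_log(fs_info, eb)) {      <- re-check under the lock
 *		kfree(tm);
 *		return 0;
 *	}
 *	ret = tree_mod_log_insert(fs_info, tm);    <- write lock is held here
 *	write_unlock(&fs_info->tree_mod_log_lock);
 */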

static struct tree_mod_elem *alloc_tree_mod_elem(struct extent_buffer *eb,
						 int slot,
						 enum btrfs_mod_log_op op)
{
	struct tree_mod_elem *tm;

	tm = kzalloc(sizeof(*tm), GFP_NOFS);
	if (!tm)
		return NULL;

	tm->logical = eb->start;
	if (op != BTRFS_MOD_LOG_KEY_ADD) {
		btrfs_node_key(eb, &tm->key, slot);
		tm->blockptr = btrfs_node_blockptr(eb, slot);
	}
	tm->op = op;
	tm->slot = slot;
	tm->generation = btrfs_node_ptr_generation(eb, slot);
	RB_CLEAR_NODE(&tm->node);

	return tm;
}

int btrfs_tree_mod_log_insert_key(struct extent_buffer *eb, int slot,
				  enum btrfs_mod_log_op op)
{
	struct tree_mod_elem *tm;
	int ret;

	if (!tree_mod_need_log(eb->fs_info, eb))
		return 0;

	tm = alloc_tree_mod_elem(eb, slot, op);
	if (!tm)
		return -ENOMEM;

	if (tree_mod_dont_log(eb->fs_info, eb)) {
		kfree(tm);
		return 0;
	}

	ret = tree_mod_log_insert(eb->fs_info, tm);
	write_unlock(&eb->fs_info->tree_mod_log_lock);
	if (ret)
		kfree(tm);

	return ret;
}

int btrfs_tree_mod_log_insert_move(struct extent_buffer *eb,
				   int dst_slot, int src_slot,
				   int nr_items)
{
	struct tree_mod_elem *tm = NULL;
	struct tree_mod_elem **tm_list = NULL;
	int ret = 0;
	int i;
	bool locked = false;

	if (!tree_mod_need_log(eb->fs_info, eb))
		return 0;

	tm_list = kcalloc(nr_items, sizeof(struct tree_mod_elem *), GFP_NOFS);
	if (!tm_list)
		return -ENOMEM;

	tm = kzalloc(sizeof(*tm), GFP_NOFS);
	if (!tm) {
		ret = -ENOMEM;
		goto free_tms;
	}

	tm->logical = eb->start;
	tm->slot = src_slot;
	tm->move.dst_slot = dst_slot;
	tm->move.nr_items = nr_items;
	tm->op = BTRFS_MOD_LOG_MOVE_KEYS;

	for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
		tm_list[i] = alloc_tree_mod_elem(eb, i + dst_slot,
				BTRFS_MOD_LOG_KEY_REMOVE_WHILE_MOVING);
		if (!tm_list[i]) {
			ret = -ENOMEM;
			goto free_tms;
		}
	}

	if (tree_mod_dont_log(eb->fs_info, eb))
		goto free_tms;
	locked = true;

	/*
	 * When we overwrite something during the move, we log these removals.
	 * This can only happen when we move towards the beginning of the
	 * buffer, i.e. dst_slot < src_slot.
	 */
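	/*
	 * Illustrative example: with dst_slot = 3, src_slot = 5 and
	 * nr_items = 3, slots 3 and 4 get overwritten, so removals are
	 * logged for exactly those two slots (i + dst_slot < src_slot).
	 */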
	for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
		ret = tree_mod_log_insert(eb->fs_info, tm_list[i]);
		if (ret)
			goto free_tms;
	}

	ret = tree_mod_log_insert(eb->fs_info, tm);
	if (ret)
		goto free_tms;
	write_unlock(&eb->fs_info->tree_mod_log_lock);
	kfree(tm_list);

	return 0;

free_tms:
	for (i = 0; i < nr_items; i++) {
		if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
			rb_erase(&tm_list[i]->node, &eb->fs_info->tree_mod_log);
		kfree(tm_list[i]);
	}
	if (locked)
		write_unlock(&eb->fs_info->tree_mod_log_lock);
	kfree(tm_list);
	kfree(tm);

	return ret;
}

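/*
 * Insert the key removal log items for an extent buffer that is going away,
 * walking from the highest slot down to slot 0. If one insertion fails, the
 * items already inserted are erased again so the caller can simply free the
 * whole tm_list.
 */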
static inline int tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
				       struct tree_mod_elem **tm_list,
				       int nritems)
{
	int i, j;
	int ret;

	for (i = nritems - 1; i >= 0; i--) {
		ret = tree_mod_log_insert(fs_info, tm_list[i]);
		if (ret) {
			for (j = nritems - 1; j > i; j--)
				rb_erase(&tm_list[j]->node,
					 &fs_info->tree_mod_log);
			return ret;
		}
	}

	return 0;
}

int btrfs_tree_mod_log_insert_root(struct extent_buffer *old_root,
				   struct extent_buffer *new_root,
				   bool log_removal)
{
	struct btrfs_fs_info *fs_info = old_root->fs_info;
	struct tree_mod_elem *tm = NULL;
	struct tree_mod_elem **tm_list = NULL;
	int nritems = 0;
	int ret = 0;
	int i;

	if (!tree_mod_need_log(fs_info, NULL))
		return 0;

	if (log_removal && btrfs_header_level(old_root) > 0) {
		nritems = btrfs_header_nritems(old_root);
		tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *),
				  GFP_NOFS);
		if (!tm_list) {
			ret = -ENOMEM;
			goto free_tms;
		}
		for (i = 0; i < nritems; i++) {
			tm_list[i] = alloc_tree_mod_elem(old_root, i,
			    BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING);
			if (!tm_list[i]) {
				ret = -ENOMEM;
				goto free_tms;
			}
		}
	}

	tm = kzalloc(sizeof(*tm), GFP_NOFS);
	if (!tm) {
		ret = -ENOMEM;
		goto free_tms;
	}

	tm->logical = new_root->start;
	tm->old_root.logical = old_root->start;
	tm->old_root.level = btrfs_header_level(old_root);
	tm->generation = btrfs_header_generation(old_root);
	tm->op = BTRFS_MOD_LOG_ROOT_REPLACE;

	if (tree_mod_dont_log(fs_info, NULL))
		goto free_tms;

	if (tm_list)
		ret = tree_mod_log_free_eb(fs_info, tm_list, nritems);
	if (!ret)
		ret = tree_mod_log_insert(fs_info, tm);

	write_unlock(&fs_info->tree_mod_log_lock);
	if (ret)
		goto free_tms;
	kfree(tm_list);

	return ret;

free_tms:
	if (tm_list) {
		for (i = 0; i < nritems; i++)
			kfree(tm_list[i]);
		kfree(tm_list);
	}
	kfree(tm);

	return ret;
}

static struct tree_mod_elem *__tree_mod_log_search(struct btrfs_fs_info *fs_info,
						   u64 start, u64 min_seq,
						   bool smallest)
{
	struct rb_root *tm_root;
	struct rb_node *node;
	struct tree_mod_elem *cur = NULL;
	struct tree_mod_elem *found = NULL;

	read_lock(&fs_info->tree_mod_log_lock);
	tm_root = &fs_info->tree_mod_log;
	node = tm_root->rb_node;
	while (node) {
		cur = rb_entry(node, struct tree_mod_elem, node);
		if (cur->logical < start) {
			node = node->rb_left;
		} else if (cur->logical > start) {
			node = node->rb_right;
		} else if (cur->seq < min_seq) {
			node = node->rb_left;
		} else if (!smallest) {
			/* We want the node with the highest seq */
			if (found)
				BUG_ON(found->seq > cur->seq);
			found = cur;
			node = node->rb_left;
		} else if (cur->seq > min_seq) {
			/* We want the node with the smallest seq */
			if (found)
				BUG_ON(found->seq < cur->seq);
			found = cur;
			node = node->rb_right;
		} else {
			found = cur;
			break;
		}
	}
	read_unlock(&fs_info->tree_mod_log_lock);

	return found;
}

/*
 * This returns the element from the log with the smallest time sequence
 * value that's in the log (the oldest log item). Any element with a time
 * sequence lower than min_seq will be ignored.
 */
static struct tree_mod_elem *tree_mod_log_search_oldest(struct btrfs_fs_info *fs_info,
							u64 start, u64 min_seq)
{
	return __tree_mod_log_search(fs_info, start, min_seq, true);
}

/*
 * This returns the element from the log with the largest time sequence
 * value that's in the log (the most recent log item). Any element with
 * a time sequence lower than min_seq will be ignored.
 */
static struct tree_mod_elem *tree_mod_log_search(struct btrfs_fs_info *fs_info,
						 u64 start, u64 min_seq)
{
	return __tree_mod_log_search(fs_info, start, min_seq, false);
}

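/*
 * Log the copy of @nr_items key pointers from @src to @dst as removals from
 * the source slots paired with additions at the destination slots, so that a
 * rewind can undo the copy on either buffer.
 */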
int btrfs_tree_mod_log_eb_copy(struct extent_buffer *dst,
			       struct extent_buffer *src,
			       unsigned long dst_offset,
			       unsigned long src_offset,
			       int nr_items)
{
	struct btrfs_fs_info *fs_info = dst->fs_info;
	int ret = 0;
	struct tree_mod_elem **tm_list = NULL;
	struct tree_mod_elem **tm_list_add, **tm_list_rem;
	int i;
	bool locked = false;

	if (!tree_mod_need_log(fs_info, NULL))
		return 0;

	if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0)
		return 0;

	tm_list = kcalloc(nr_items * 2, sizeof(struct tree_mod_elem *),
			  GFP_NOFS);
	if (!tm_list)
		return -ENOMEM;

	tm_list_add = tm_list;
	tm_list_rem = tm_list + nr_items;
	for (i = 0; i < nr_items; i++) {
		tm_list_rem[i] = alloc_tree_mod_elem(src, i + src_offset,
						     BTRFS_MOD_LOG_KEY_REMOVE);
		if (!tm_list_rem[i]) {
			ret = -ENOMEM;
			goto free_tms;
		}

		tm_list_add[i] = alloc_tree_mod_elem(dst, i + dst_offset,
						     BTRFS_MOD_LOG_KEY_ADD);
		if (!tm_list_add[i]) {
			ret = -ENOMEM;
			goto free_tms;
		}
	}

	if (tree_mod_dont_log(fs_info, NULL))
		goto free_tms;
	locked = true;

	for (i = 0; i < nr_items; i++) {
		ret = tree_mod_log_insert(fs_info, tm_list_rem[i]);
		if (ret)
			goto free_tms;
		ret = tree_mod_log_insert(fs_info, tm_list_add[i]);
		if (ret)
			goto free_tms;
	}

	write_unlock(&fs_info->tree_mod_log_lock);
	kfree(tm_list);

	return 0;

free_tms:
	for (i = 0; i < nr_items * 2; i++) {
		if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
			rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log);
		kfree(tm_list[i]);
	}
	if (locked)
		write_unlock(&fs_info->tree_mod_log_lock);
	kfree(tm_list);

	return ret;
}

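/*
 * Log a removal for every key pointer in @eb before the buffer is freed, so
 * that a rewind can bring the items back.
 */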
int btrfs_tree_mod_log_free_eb(struct extent_buffer *eb)
{
	struct tree_mod_elem **tm_list = NULL;
	int nritems = 0;
	int i;
	int ret = 0;

	if (!tree_mod_need_log(eb->fs_info, eb))
		return 0;

	nritems = btrfs_header_nritems(eb);
	tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *), GFP_NOFS);
	if (!tm_list)
		return -ENOMEM;

	for (i = 0; i < nritems; i++) {
		tm_list[i] = alloc_tree_mod_elem(eb, i,
				    BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING);
		if (!tm_list[i]) {
			ret = -ENOMEM;
			goto free_tms;
		}
	}

	if (tree_mod_dont_log(eb->fs_info, eb))
		goto free_tms;

	ret = tree_mod_log_free_eb(eb->fs_info, tm_list, nritems);
	write_unlock(&eb->fs_info->tree_mod_log_lock);
	if (ret)
		goto free_tms;
	kfree(tm_list);

	return 0;

free_tms:
	for (i = 0; i < nritems; i++)
		kfree(tm_list[i]);
	kfree(tm_list);

	return ret;
}

/*
 * Returns the tree mod log element identifying the oldest predecessor of the
 * given root. Entries older than time_seq are ignored.
 */
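/*
 * Illustrative example: if root A was replaced by B, and B later by C, the
 * log holds a ROOT_REPLACE element keyed at B's logical address (old_root
 * pointing to A) and one keyed at C's logical address (old_root pointing to
 * B). Starting from C, the loop below walks C -> B -> A and returns the
 * element whose old_root describes A.
 */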
static struct tree_mod_elem *tree_mod_log_oldest_root(struct extent_buffer *eb_root,
						      u64 time_seq)
{
	struct tree_mod_elem *tm;
	struct tree_mod_elem *found = NULL;
	u64 root_logical = eb_root->start;
	bool looped = false;

	if (!time_seq)
		return NULL;

	/*
	 * The very last operation that's logged for a root is the replacement
	 * operation (if it is replaced at all). This has the logical address
	 * of the *new* root, making it the very first operation that's logged
	 * for this root.
	 */
	while (1) {
		tm = tree_mod_log_search_oldest(eb_root->fs_info, root_logical,
						time_seq);
		if (!looped && !tm)
			return NULL;
		/*
		 * If there are no tree operations for the oldest root, we
		 * simply return it. This should only happen if that (old)
		 * root is at level 0.
		 */
		if (!tm)
			break;

		/*
		 * If there's an operation that's not a root replacement, we
		 * found the oldest version of our root. Normally, we'll find a
		 * BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING operation here.
		 */
		if (tm->op != BTRFS_MOD_LOG_ROOT_REPLACE)
			break;

		found = tm;
		root_logical = tm->old_root.logical;
		looped = true;
	}

	/* If there's no old root to return, return what we found instead */
	if (!found)
		found = tm;

	return found;
}

/*
 * tm is a pointer to the first operation to rewind within eb. Then, all
 * previous operations will be rewound (until we reach something older than
 * time_seq).
 */
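/*
 * Summary of the inverse operations applied below:
 *
 *	KEY_REMOVE*	re-insert the saved key/blockptr/generation, n++
 *	KEY_REPLACE	restore the saved key/blockptr/generation
 *	KEY_ADD		drop the item again, n--
 *	MOVE_KEYS	move the items back from dst_slot to slot
 *	ROOT_REPLACE	handled before rewinding for roots, ignored here
 */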
static void tree_mod_log_rewind(struct btrfs_fs_info *fs_info,
				struct extent_buffer *eb,
				u64 time_seq,
				struct tree_mod_elem *first_tm)
{
	u32 n;
	struct rb_node *next;
	struct tree_mod_elem *tm = first_tm;
	unsigned long o_dst;
	unsigned long o_src;
	unsigned long p_size = sizeof(struct btrfs_key_ptr);

	n = btrfs_header_nritems(eb);
	read_lock(&fs_info->tree_mod_log_lock);
	while (tm && tm->seq >= time_seq) {
		/*
		 * All the operations are recorded with the operator used for
		 * the modification. As we're going backwards, we do the
		 * opposite of each operation here.
		 */
		switch (tm->op) {
		case BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING:
			BUG_ON(tm->slot < n);
			fallthrough;
		case BTRFS_MOD_LOG_KEY_REMOVE_WHILE_MOVING:
		case BTRFS_MOD_LOG_KEY_REMOVE:
			btrfs_set_node_key(eb, &tm->key, tm->slot);
			btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
			btrfs_set_node_ptr_generation(eb, tm->slot,
						      tm->generation);
			n++;
			break;
		case BTRFS_MOD_LOG_KEY_REPLACE:
			BUG_ON(tm->slot >= n);
			btrfs_set_node_key(eb, &tm->key, tm->slot);
			btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
			btrfs_set_node_ptr_generation(eb, tm->slot,
						      tm->generation);
			break;
		case BTRFS_MOD_LOG_KEY_ADD:
			/* If a move operation is needed, it's in the log. */
			n--;
			break;
		case BTRFS_MOD_LOG_MOVE_KEYS:
			o_dst = btrfs_node_key_ptr_offset(eb, tm->slot);
			o_src = btrfs_node_key_ptr_offset(eb, tm->move.dst_slot);
			memmove_extent_buffer(eb, o_dst, o_src,
					      tm->move.nr_items * p_size);
			break;
		case BTRFS_MOD_LOG_ROOT_REPLACE:
			/*
			 * This operation is special. For roots, this must be
			 * handled explicitly before rewinding.
			 * For non-roots, this operation may exist if the node
			 * was a root: root A -> child B; then A gets empty and
			 * B is promoted to the new root. In the mod log, we'll
			 * have a root-replace operation for B, a tree block
			 * that is not a root. We simply ignore that operation.
			 */
			break;
		}
		next = rb_next(&tm->node);
		if (!next)
			break;
		tm = rb_entry(next, struct tree_mod_elem, node);
		if (tm->logical != first_tm->logical)
			break;
	}
	read_unlock(&fs_info->tree_mod_log_lock);
	btrfs_set_header_nritems(eb, n);
}

/*
 * Called with eb read locked. If the buffer cannot be rewound, the same buffer
 * is returned. If rewind operations happen, a fresh buffer is returned. The
 * returned buffer is always read-locked. If the returned buffer is not the
 * input buffer, the lock on the input buffer is released and the input buffer
 * is freed (its refcount is decremented).
 */
struct extent_buffer *btrfs_tree_mod_log_rewind(struct btrfs_fs_info *fs_info,
						struct btrfs_path *path,
						struct extent_buffer *eb,
						u64 time_seq)
{
	struct extent_buffer *eb_rewin;
	struct tree_mod_elem *tm;

	if (!time_seq)
		return eb;

	if (btrfs_header_level(eb) == 0)
		return eb;

	tm = tree_mod_log_search(fs_info, eb->start, time_seq);
	if (!tm)
		return eb;

	if (tm->op == BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
		BUG_ON(tm->slot != 0);
		eb_rewin = alloc_dummy_extent_buffer(fs_info, eb->start);
		if (!eb_rewin) {
			btrfs_tree_read_unlock(eb);
			free_extent_buffer(eb);
			return NULL;
		}
		btrfs_set_header_bytenr(eb_rewin, eb->start);
		btrfs_set_header_backref_rev(eb_rewin,
					     btrfs_header_backref_rev(eb));
		btrfs_set_header_owner(eb_rewin, btrfs_header_owner(eb));
		btrfs_set_header_level(eb_rewin, btrfs_header_level(eb));
	} else {
		eb_rewin = btrfs_clone_extent_buffer(eb);
		if (!eb_rewin) {
			btrfs_tree_read_unlock(eb);
			free_extent_buffer(eb);
			return NULL;
		}
	}

	btrfs_tree_read_unlock(eb);
	free_extent_buffer(eb);

	btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb_rewin),
				       eb_rewin, btrfs_header_level(eb_rewin));
	btrfs_tree_read_lock(eb_rewin);
	tree_mod_log_rewind(fs_info, eb_rewin, time_seq, tm);
	WARN_ON(btrfs_header_nritems(eb_rewin) >
		BTRFS_NODEPTRS_PER_BLOCK(fs_info));

	return eb_rewin;
}

/*
 * Rewind the state of @root's root node to the given @time_seq value.
 * If there are no changes, the current root->root_node is returned. If anything
 * changed in between, there's a fresh buffer allocated on which the rewind
 * operations are done. In any case, the returned buffer is read locked.
 * Returns NULL on error (with no locks held).
 */
struct extent_buffer *btrfs_get_old_root(struct btrfs_root *root, u64 time_seq)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct tree_mod_elem *tm;
	struct extent_buffer *eb = NULL;
	struct extent_buffer *eb_root;
	u64 eb_root_owner = 0;
	struct extent_buffer *old;
	struct tree_mod_root *old_root = NULL;
	u64 old_generation = 0;
	u64 logical;
	int level;

	eb_root = btrfs_read_lock_root_node(root);
	tm = tree_mod_log_oldest_root(eb_root, time_seq);
	if (!tm)
		return eb_root;

	if (tm->op == BTRFS_MOD_LOG_ROOT_REPLACE) {
		old_root = &tm->old_root;
		old_generation = tm->generation;
		logical = old_root->logical;
		level = old_root->level;
	} else {
		logical = eb_root->start;
		level = btrfs_header_level(eb_root);
	}

	tm = tree_mod_log_search(fs_info, logical, time_seq);
	if (old_root && tm && tm->op != BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
		struct btrfs_tree_parent_check check = { 0 };

		btrfs_tree_read_unlock(eb_root);
		free_extent_buffer(eb_root);

		check.level = level;
		check.owner_root = root->root_key.objectid;

		old = read_tree_block(fs_info, logical, &check);
		if (WARN_ON(IS_ERR(old) || !extent_buffer_uptodate(old))) {
			if (!IS_ERR(old))
				free_extent_buffer(old);
			btrfs_warn(fs_info,
				   "failed to read tree block %llu from get_old_root",
				   logical);
		} else {
			struct tree_mod_elem *tm2;

			btrfs_tree_read_lock(old);
			eb = btrfs_clone_extent_buffer(old);
			/*
			 * After the lookup for the most recent tree mod operation
			 * above, and before we locked and cloned the extent buffer
			 * 'old', a new tree mod log operation may have been added.
			 * So look up a more recent one to make sure the number
			 * of mod log operations we replay is consistent with the
			 * number of items we have in the cloned extent buffer,
			 * otherwise we can hit a BUG_ON when rewinding the extent
			 * buffer.
			 */
			tm2 = tree_mod_log_search(fs_info, logical, time_seq);
			btrfs_tree_read_unlock(old);
			free_extent_buffer(old);
			ASSERT(tm2);
			ASSERT(tm2 == tm || tm2->seq > tm->seq);
			if (!tm2 || tm2->seq < tm->seq) {
				free_extent_buffer(eb);
				return NULL;
			}
			tm = tm2;
		}
	} else if (old_root) {
		eb_root_owner = btrfs_header_owner(eb_root);
		btrfs_tree_read_unlock(eb_root);
		free_extent_buffer(eb_root);
		eb = alloc_dummy_extent_buffer(fs_info, logical);
	} else {
		eb = btrfs_clone_extent_buffer(eb_root);
		btrfs_tree_read_unlock(eb_root);
		free_extent_buffer(eb_root);
	}

	if (!eb)
		return NULL;
	if (old_root) {
		btrfs_set_header_bytenr(eb, eb->start);
		btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV);
		btrfs_set_header_owner(eb, eb_root_owner);
		btrfs_set_header_level(eb, old_root->level);
		btrfs_set_header_generation(eb, old_generation);
	}
	btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb), eb,
				       btrfs_header_level(eb));
	btrfs_tree_read_lock(eb);
	if (tm)
		tree_mod_log_rewind(fs_info, eb, time_seq, tm);
	else
		WARN_ON(btrfs_header_level(eb) != 0);
	WARN_ON(btrfs_header_nritems(eb) > BTRFS_NODEPTRS_PER_BLOCK(fs_info));

	return eb;
}

int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq)
{
	struct tree_mod_elem *tm;
	int level;
	struct extent_buffer *eb_root = btrfs_root_node(root);

	tm = tree_mod_log_oldest_root(eb_root, time_seq);
	if (tm && tm->op == BTRFS_MOD_LOG_ROOT_REPLACE)
		level = tm->old_root.level;
	else
		level = btrfs_header_level(eb_root);

	free_extent_buffer(eb_root);

	return level;
}

/*
 * Return the lowest sequence number in the tree modification log.
 *
 * Return the sequence number of the oldest tree modification log user, which
 * corresponds to the lowest sequence number of all existing users. If there are
 * no users it returns 0.
 */
u64 btrfs_tree_mod_log_lowest_seq(struct btrfs_fs_info *fs_info)
{
	u64 ret = 0;

	read_lock(&fs_info->tree_mod_log_lock);
	if (!list_empty(&fs_info->tree_mod_seq_list)) {
		struct btrfs_seq_list *elem;

		elem = list_first_entry(&fs_info->tree_mod_seq_list,
					struct btrfs_seq_list, list);
		ret = elem->seq;
	}
	read_unlock(&fs_info->tree_mod_log_lock);

	return ret;
}