xref: /openbmc/linux/fs/xfs/libxfs/xfs_iext_tree.c (revision a0ae2562c6c4b2721d9fddba63b7286c13517d9f)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (c) 2017 Christoph Hellwig.
4  */
5 
6 #include <linux/cache.h>
7 #include <linux/kernel.h>
8 #include <linux/slab.h>
9 #include "xfs.h"
10 #include "xfs_format.h"
11 #include "xfs_bit.h"
12 #include "xfs_log_format.h"
13 #include "xfs_inode.h"
14 #include "xfs_inode_fork.h"
15 #include "xfs_trans_resv.h"
16 #include "xfs_mount.h"
17 #include "xfs_trace.h"
18 
19 /*
20  * In-core extent record layout:
21  *
22  * +-------+----------------------------+
23  * | 00:53 | all 54 bits of startoff    |
24  * | 54:63 | low 10 bits of startblock  |
25  * +-------+----------------------------+
26  * | 00:20 | all 21 bits of length      |
27  * |    21 | unwritten extent bit       |
28  * | 22:63 | high 42 bits of startblock |
29  * +-------+----------------------------+
30  */
31 #define XFS_IEXT_STARTOFF_MASK		xfs_mask64lo(BMBT_STARTOFF_BITLEN)
32 #define XFS_IEXT_LENGTH_MASK		xfs_mask64lo(BMBT_BLOCKCOUNT_BITLEN)
33 #define XFS_IEXT_STARTBLOCK_MASK	xfs_mask64lo(BMBT_STARTBLOCK_BITLEN)
34 
35 struct xfs_iext_rec {
36 	uint64_t			lo;
37 	uint64_t			hi;
38 };
39 
40 /*
41  * Given that the length can't be a zero, only an empty hi value indicates an
42  * unused record.
43  */
44 static bool xfs_iext_rec_is_empty(struct xfs_iext_rec *rec)
45 {
46 	return rec->hi == 0;
47 }
48 
49 static inline void xfs_iext_rec_clear(struct xfs_iext_rec *rec)
50 {
51 	rec->lo = 0;
52 	rec->hi = 0;
53 }
54 
55 static void
56 xfs_iext_set(
57 	struct xfs_iext_rec	*rec,
58 	struct xfs_bmbt_irec	*irec)
59 {
60 	ASSERT((irec->br_startoff & ~XFS_IEXT_STARTOFF_MASK) == 0);
61 	ASSERT((irec->br_blockcount & ~XFS_IEXT_LENGTH_MASK) == 0);
62 	ASSERT((irec->br_startblock & ~XFS_IEXT_STARTBLOCK_MASK) == 0);
63 
64 	rec->lo = irec->br_startoff & XFS_IEXT_STARTOFF_MASK;
65 	rec->hi = irec->br_blockcount & XFS_IEXT_LENGTH_MASK;
66 
67 	rec->lo |= (irec->br_startblock << 54);
68 	rec->hi |= ((irec->br_startblock & ~xfs_mask64lo(10)) << (22 - 10));
69 
70 	if (irec->br_state == XFS_EXT_UNWRITTEN)
71 		rec->hi |= (1 << 21);
72 }
73 
74 static void
75 xfs_iext_get(
76 	struct xfs_bmbt_irec	*irec,
77 	struct xfs_iext_rec	*rec)
78 {
79 	irec->br_startoff = rec->lo & XFS_IEXT_STARTOFF_MASK;
80 	irec->br_blockcount = rec->hi & XFS_IEXT_LENGTH_MASK;
81 
82 	irec->br_startblock = rec->lo >> 54;
83 	irec->br_startblock |= (rec->hi & xfs_mask64hi(42)) >> (22 - 10);
84 
85 	if (rec->hi & (1 << 21))
86 		irec->br_state = XFS_EXT_UNWRITTEN;
87 	else
88 		irec->br_state = XFS_EXT_NORM;
89 }
90 
91 enum {
92 	NODE_SIZE	= 256,
93 	KEYS_PER_NODE	= NODE_SIZE / (sizeof(uint64_t) + sizeof(void *)),
94 	RECS_PER_LEAF	= (NODE_SIZE - (2 * sizeof(struct xfs_iext_leaf *))) /
95 				sizeof(struct xfs_iext_rec),
96 };
97 
98 /*
99  * In-core extent btree block layout:
100  *
101  * There are two types of blocks in the btree: leaf and inner (non-leaf) blocks.
102  *
103  * The leaf blocks are made up by %KEYS_PER_NODE extent records, which each
104  * contain the startoffset, blockcount, startblock and unwritten extent flag.
105  * See above for the exact format, followed by pointers to the previous and next
106  * leaf blocks (if there are any).
107  *
108  * The inner (non-leaf) blocks first contain KEYS_PER_NODE lookup keys, followed
109  * by an equal number of pointers to the btree blocks at the next lower level.
110  *
111  *		+-------+-------+-------+-------+-------+----------+----------+
112  * Leaf:	| rec 1 | rec 2 | rec 3 | rec 4 | rec N | prev-ptr | next-ptr |
113  *		+-------+-------+-------+-------+-------+----------+----------+
114  *
115  *		+-------+-------+-------+-------+-------+-------+------+-------+
116  * Inner:	| key 1 | key 2 | key 3 | key N | ptr 1 | ptr 2 | ptr3 | ptr N |
117  *		+-------+-------+-------+-------+-------+-------+------+-------+
118  */
119 struct xfs_iext_node {
120 	uint64_t		keys[KEYS_PER_NODE];
121 #define XFS_IEXT_KEY_INVALID	(1ULL << 63)
122 	void			*ptrs[KEYS_PER_NODE];
123 };
124 
125 struct xfs_iext_leaf {
126 	struct xfs_iext_rec	recs[RECS_PER_LEAF];
127 	struct xfs_iext_leaf	*prev;
128 	struct xfs_iext_leaf	*next;
129 };
130 
131 inline xfs_extnum_t xfs_iext_count(struct xfs_ifork *ifp)
132 {
133 	return ifp->if_bytes / sizeof(struct xfs_iext_rec);
134 }
135 
136 static inline int xfs_iext_max_recs(struct xfs_ifork *ifp)
137 {
138 	if (ifp->if_height == 1)
139 		return xfs_iext_count(ifp);
140 	return RECS_PER_LEAF;
141 }
142 
143 static inline struct xfs_iext_rec *cur_rec(struct xfs_iext_cursor *cur)
144 {
145 	return &cur->leaf->recs[cur->pos];
146 }
147 
148 static inline bool xfs_iext_valid(struct xfs_ifork *ifp,
149 		struct xfs_iext_cursor *cur)
150 {
151 	if (!cur->leaf)
152 		return false;
153 	if (cur->pos < 0 || cur->pos >= xfs_iext_max_recs(ifp))
154 		return false;
155 	if (xfs_iext_rec_is_empty(cur_rec(cur)))
156 		return false;
157 	return true;
158 }
159 
160 static void *
161 xfs_iext_find_first_leaf(
162 	struct xfs_ifork	*ifp)
163 {
164 	struct xfs_iext_node	*node = ifp->if_u1.if_root;
165 	int			height;
166 
167 	if (!ifp->if_height)
168 		return NULL;
169 
170 	for (height = ifp->if_height; height > 1; height--) {
171 		node = node->ptrs[0];
172 		ASSERT(node);
173 	}
174 
175 	return node;
176 }
177 
178 static void *
179 xfs_iext_find_last_leaf(
180 	struct xfs_ifork	*ifp)
181 {
182 	struct xfs_iext_node	*node = ifp->if_u1.if_root;
183 	int			height, i;
184 
185 	if (!ifp->if_height)
186 		return NULL;
187 
188 	for (height = ifp->if_height; height > 1; height--) {
189 		for (i = 1; i < KEYS_PER_NODE; i++)
190 			if (!node->ptrs[i])
191 				break;
192 		node = node->ptrs[i - 1];
193 		ASSERT(node);
194 	}
195 
196 	return node;
197 }
198 
199 void
200 xfs_iext_first(
201 	struct xfs_ifork	*ifp,
202 	struct xfs_iext_cursor	*cur)
203 {
204 	cur->pos = 0;
205 	cur->leaf = xfs_iext_find_first_leaf(ifp);
206 }
207 
208 void
209 xfs_iext_last(
210 	struct xfs_ifork	*ifp,
211 	struct xfs_iext_cursor	*cur)
212 {
213 	int			i;
214 
215 	cur->leaf = xfs_iext_find_last_leaf(ifp);
216 	if (!cur->leaf) {
217 		cur->pos = 0;
218 		return;
219 	}
220 
221 	for (i = 1; i < xfs_iext_max_recs(ifp); i++) {
222 		if (xfs_iext_rec_is_empty(&cur->leaf->recs[i]))
223 			break;
224 	}
225 	cur->pos = i - 1;
226 }
227 
228 void
229 xfs_iext_next(
230 	struct xfs_ifork	*ifp,
231 	struct xfs_iext_cursor	*cur)
232 {
233 	if (!cur->leaf) {
234 		ASSERT(cur->pos <= 0 || cur->pos >= RECS_PER_LEAF);
235 		xfs_iext_first(ifp, cur);
236 		return;
237 	}
238 
239 	ASSERT(cur->pos >= 0);
240 	ASSERT(cur->pos < xfs_iext_max_recs(ifp));
241 
242 	cur->pos++;
243 	if (ifp->if_height > 1 && !xfs_iext_valid(ifp, cur) &&
244 	    cur->leaf->next) {
245 		cur->leaf = cur->leaf->next;
246 		cur->pos = 0;
247 	}
248 }
249 
250 void
251 xfs_iext_prev(
252 	struct xfs_ifork	*ifp,
253 	struct xfs_iext_cursor	*cur)
254 {
255 	if (!cur->leaf) {
256 		ASSERT(cur->pos <= 0 || cur->pos >= RECS_PER_LEAF);
257 		xfs_iext_last(ifp, cur);
258 		return;
259 	}
260 
261 	ASSERT(cur->pos >= 0);
262 	ASSERT(cur->pos <= RECS_PER_LEAF);
263 
264 recurse:
265 	do {
266 		cur->pos--;
267 		if (xfs_iext_valid(ifp, cur))
268 			return;
269 	} while (cur->pos > 0);
270 
271 	if (ifp->if_height > 1 && cur->leaf->prev) {
272 		cur->leaf = cur->leaf->prev;
273 		cur->pos = RECS_PER_LEAF;
274 		goto recurse;
275 	}
276 }
277 
278 static inline int
279 xfs_iext_key_cmp(
280 	struct xfs_iext_node	*node,
281 	int			n,
282 	xfs_fileoff_t		offset)
283 {
284 	if (node->keys[n] > offset)
285 		return 1;
286 	if (node->keys[n] < offset)
287 		return -1;
288 	return 0;
289 }
290 
291 static inline int
292 xfs_iext_rec_cmp(
293 	struct xfs_iext_rec	*rec,
294 	xfs_fileoff_t		offset)
295 {
296 	uint64_t		rec_offset = rec->lo & XFS_IEXT_STARTOFF_MASK;
297 	uint32_t		rec_len = rec->hi & XFS_IEXT_LENGTH_MASK;
298 
299 	if (rec_offset > offset)
300 		return 1;
301 	if (rec_offset + rec_len <= offset)
302 		return -1;
303 	return 0;
304 }
305 
306 static void *
307 xfs_iext_find_level(
308 	struct xfs_ifork	*ifp,
309 	xfs_fileoff_t		offset,
310 	int			level)
311 {
312 	struct xfs_iext_node	*node = ifp->if_u1.if_root;
313 	int			height, i;
314 
315 	if (!ifp->if_height)
316 		return NULL;
317 
318 	for (height = ifp->if_height; height > level; height--) {
319 		for (i = 1; i < KEYS_PER_NODE; i++)
320 			if (xfs_iext_key_cmp(node, i, offset) > 0)
321 				break;
322 
323 		node = node->ptrs[i - 1];
324 		if (!node)
325 			break;
326 	}
327 
328 	return node;
329 }
330 
331 static int
332 xfs_iext_node_pos(
333 	struct xfs_iext_node	*node,
334 	xfs_fileoff_t		offset)
335 {
336 	int			i;
337 
338 	for (i = 1; i < KEYS_PER_NODE; i++) {
339 		if (xfs_iext_key_cmp(node, i, offset) > 0)
340 			break;
341 	}
342 
343 	return i - 1;
344 }
345 
346 static int
347 xfs_iext_node_insert_pos(
348 	struct xfs_iext_node	*node,
349 	xfs_fileoff_t		offset)
350 {
351 	int			i;
352 
353 	for (i = 0; i < KEYS_PER_NODE; i++) {
354 		if (xfs_iext_key_cmp(node, i, offset) > 0)
355 			return i;
356 	}
357 
358 	return KEYS_PER_NODE;
359 }
360 
361 static int
362 xfs_iext_node_nr_entries(
363 	struct xfs_iext_node	*node,
364 	int			start)
365 {
366 	int			i;
367 
368 	for (i = start; i < KEYS_PER_NODE; i++) {
369 		if (node->keys[i] == XFS_IEXT_KEY_INVALID)
370 			break;
371 	}
372 
373 	return i;
374 }
375 
376 static int
377 xfs_iext_leaf_nr_entries(
378 	struct xfs_ifork	*ifp,
379 	struct xfs_iext_leaf	*leaf,
380 	int			start)
381 {
382 	int			i;
383 
384 	for (i = start; i < xfs_iext_max_recs(ifp); i++) {
385 		if (xfs_iext_rec_is_empty(&leaf->recs[i]))
386 			break;
387 	}
388 
389 	return i;
390 }
391 
392 static inline uint64_t
393 xfs_iext_leaf_key(
394 	struct xfs_iext_leaf	*leaf,
395 	int			n)
396 {
397 	return leaf->recs[n].lo & XFS_IEXT_STARTOFF_MASK;
398 }
399 
400 static void
401 xfs_iext_grow(
402 	struct xfs_ifork	*ifp)
403 {
404 	struct xfs_iext_node	*node = kmem_zalloc(NODE_SIZE, KM_NOFS);
405 	int			i;
406 
407 	if (ifp->if_height == 1) {
408 		struct xfs_iext_leaf *prev = ifp->if_u1.if_root;
409 
410 		node->keys[0] = xfs_iext_leaf_key(prev, 0);
411 		node->ptrs[0] = prev;
412 	} else  {
413 		struct xfs_iext_node *prev = ifp->if_u1.if_root;
414 
415 		ASSERT(ifp->if_height > 1);
416 
417 		node->keys[0] = prev->keys[0];
418 		node->ptrs[0] = prev;
419 	}
420 
421 	for (i = 1; i < KEYS_PER_NODE; i++)
422 		node->keys[i] = XFS_IEXT_KEY_INVALID;
423 
424 	ifp->if_u1.if_root = node;
425 	ifp->if_height++;
426 }
427 
428 static void
429 xfs_iext_update_node(
430 	struct xfs_ifork	*ifp,
431 	xfs_fileoff_t		old_offset,
432 	xfs_fileoff_t		new_offset,
433 	int			level,
434 	void			*ptr)
435 {
436 	struct xfs_iext_node	*node = ifp->if_u1.if_root;
437 	int			height, i;
438 
439 	for (height = ifp->if_height; height > level; height--) {
440 		for (i = 0; i < KEYS_PER_NODE; i++) {
441 			if (i > 0 && xfs_iext_key_cmp(node, i, old_offset) > 0)
442 				break;
443 			if (node->keys[i] == old_offset)
444 				node->keys[i] = new_offset;
445 		}
446 		node = node->ptrs[i - 1];
447 		ASSERT(node);
448 	}
449 
450 	ASSERT(node == ptr);
451 }
452 
453 static struct xfs_iext_node *
454 xfs_iext_split_node(
455 	struct xfs_iext_node	**nodep,
456 	int			*pos,
457 	int			*nr_entries)
458 {
459 	struct xfs_iext_node	*node = *nodep;
460 	struct xfs_iext_node	*new = kmem_zalloc(NODE_SIZE, KM_NOFS);
461 	const int		nr_move = KEYS_PER_NODE / 2;
462 	int			nr_keep = nr_move + (KEYS_PER_NODE & 1);
463 	int			i = 0;
464 
465 	/* for sequential append operations just spill over into the new node */
466 	if (*pos == KEYS_PER_NODE) {
467 		*nodep = new;
468 		*pos = 0;
469 		*nr_entries = 0;
470 		goto done;
471 	}
472 
473 
474 	for (i = 0; i < nr_move; i++) {
475 		new->keys[i] = node->keys[nr_keep + i];
476 		new->ptrs[i] = node->ptrs[nr_keep + i];
477 
478 		node->keys[nr_keep + i] = XFS_IEXT_KEY_INVALID;
479 		node->ptrs[nr_keep + i] = NULL;
480 	}
481 
482 	if (*pos >= nr_keep) {
483 		*nodep = new;
484 		*pos -= nr_keep;
485 		*nr_entries = nr_move;
486 	} else {
487 		*nr_entries = nr_keep;
488 	}
489 done:
490 	for (; i < KEYS_PER_NODE; i++)
491 		new->keys[i] = XFS_IEXT_KEY_INVALID;
492 	return new;
493 }
494 
495 static void
496 xfs_iext_insert_node(
497 	struct xfs_ifork	*ifp,
498 	uint64_t		offset,
499 	void			*ptr,
500 	int			level)
501 {
502 	struct xfs_iext_node	*node, *new;
503 	int			i, pos, nr_entries;
504 
505 again:
506 	if (ifp->if_height < level)
507 		xfs_iext_grow(ifp);
508 
509 	new = NULL;
510 	node = xfs_iext_find_level(ifp, offset, level);
511 	pos = xfs_iext_node_insert_pos(node, offset);
512 	nr_entries = xfs_iext_node_nr_entries(node, pos);
513 
514 	ASSERT(pos >= nr_entries || xfs_iext_key_cmp(node, pos, offset) != 0);
515 	ASSERT(nr_entries <= KEYS_PER_NODE);
516 
517 	if (nr_entries == KEYS_PER_NODE)
518 		new = xfs_iext_split_node(&node, &pos, &nr_entries);
519 
520 	/*
521 	 * Update the pointers in higher levels if the first entry changes
522 	 * in an existing node.
523 	 */
524 	if (node != new && pos == 0 && nr_entries > 0)
525 		xfs_iext_update_node(ifp, node->keys[0], offset, level, node);
526 
527 	for (i = nr_entries; i > pos; i--) {
528 		node->keys[i] = node->keys[i - 1];
529 		node->ptrs[i] = node->ptrs[i - 1];
530 	}
531 	node->keys[pos] = offset;
532 	node->ptrs[pos] = ptr;
533 
534 	if (new) {
535 		offset = new->keys[0];
536 		ptr = new;
537 		level++;
538 		goto again;
539 	}
540 }
541 
542 static struct xfs_iext_leaf *
543 xfs_iext_split_leaf(
544 	struct xfs_iext_cursor	*cur,
545 	int			*nr_entries)
546 {
547 	struct xfs_iext_leaf	*leaf = cur->leaf;
548 	struct xfs_iext_leaf	*new = kmem_zalloc(NODE_SIZE, KM_NOFS);
549 	const int		nr_move = RECS_PER_LEAF / 2;
550 	int			nr_keep = nr_move + (RECS_PER_LEAF & 1);
551 	int			i;
552 
553 	/* for sequential append operations just spill over into the new node */
554 	if (cur->pos == RECS_PER_LEAF) {
555 		cur->leaf = new;
556 		cur->pos = 0;
557 		*nr_entries = 0;
558 		goto done;
559 	}
560 
561 	for (i = 0; i < nr_move; i++) {
562 		new->recs[i] = leaf->recs[nr_keep + i];
563 		xfs_iext_rec_clear(&leaf->recs[nr_keep + i]);
564 	}
565 
566 	if (cur->pos >= nr_keep) {
567 		cur->leaf = new;
568 		cur->pos -= nr_keep;
569 		*nr_entries = nr_move;
570 	} else {
571 		*nr_entries = nr_keep;
572 	}
573 done:
574 	if (leaf->next)
575 		leaf->next->prev = new;
576 	new->next = leaf->next;
577 	new->prev = leaf;
578 	leaf->next = new;
579 	return new;
580 }
581 
582 static void
583 xfs_iext_alloc_root(
584 	struct xfs_ifork	*ifp,
585 	struct xfs_iext_cursor	*cur)
586 {
587 	ASSERT(ifp->if_bytes == 0);
588 
589 	ifp->if_u1.if_root = kmem_zalloc(sizeof(struct xfs_iext_rec), KM_NOFS);
590 	ifp->if_height = 1;
591 
592 	/* now that we have a node step into it */
593 	cur->leaf = ifp->if_u1.if_root;
594 	cur->pos = 0;
595 }
596 
597 static void
598 xfs_iext_realloc_root(
599 	struct xfs_ifork	*ifp,
600 	struct xfs_iext_cursor	*cur)
601 {
602 	size_t new_size = ifp->if_bytes + sizeof(struct xfs_iext_rec);
603 	void *new;
604 
605 	/* account for the prev/next pointers */
606 	if (new_size / sizeof(struct xfs_iext_rec) == RECS_PER_LEAF)
607 		new_size = NODE_SIZE;
608 
609 	new = kmem_realloc(ifp->if_u1.if_root, new_size, KM_NOFS);
610 	memset(new + ifp->if_bytes, 0, new_size - ifp->if_bytes);
611 	ifp->if_u1.if_root = new;
612 	cur->leaf = new;
613 }
614 
615 void
616 xfs_iext_insert(
617 	struct xfs_inode	*ip,
618 	struct xfs_iext_cursor	*cur,
619 	struct xfs_bmbt_irec	*irec,
620 	int			state)
621 {
622 	struct xfs_ifork	*ifp = xfs_iext_state_to_fork(ip, state);
623 	xfs_fileoff_t		offset = irec->br_startoff;
624 	struct xfs_iext_leaf	*new = NULL;
625 	int			nr_entries, i;
626 
627 	if (ifp->if_height == 0)
628 		xfs_iext_alloc_root(ifp, cur);
629 	else if (ifp->if_height == 1)
630 		xfs_iext_realloc_root(ifp, cur);
631 
632 	nr_entries = xfs_iext_leaf_nr_entries(ifp, cur->leaf, cur->pos);
633 	ASSERT(nr_entries <= RECS_PER_LEAF);
634 	ASSERT(cur->pos >= nr_entries ||
635 	       xfs_iext_rec_cmp(cur_rec(cur), irec->br_startoff) != 0);
636 
637 	if (nr_entries == RECS_PER_LEAF)
638 		new = xfs_iext_split_leaf(cur, &nr_entries);
639 
640 	/*
641 	 * Update the pointers in higher levels if the first entry changes
642 	 * in an existing node.
643 	 */
644 	if (cur->leaf != new && cur->pos == 0 && nr_entries > 0) {
645 		xfs_iext_update_node(ifp, xfs_iext_leaf_key(cur->leaf, 0),
646 				offset, 1, cur->leaf);
647 	}
648 
649 	for (i = nr_entries; i > cur->pos; i--)
650 		cur->leaf->recs[i] = cur->leaf->recs[i - 1];
651 	xfs_iext_set(cur_rec(cur), irec);
652 	ifp->if_bytes += sizeof(struct xfs_iext_rec);
653 
654 	trace_xfs_iext_insert(ip, cur, state, _RET_IP_);
655 
656 	if (new)
657 		xfs_iext_insert_node(ifp, xfs_iext_leaf_key(new, 0), new, 2);
658 }
659 
660 static struct xfs_iext_node *
661 xfs_iext_rebalance_node(
662 	struct xfs_iext_node	*parent,
663 	int			*pos,
664 	struct xfs_iext_node	*node,
665 	int			nr_entries)
666 {
667 	/*
668 	 * If the neighbouring nodes are completely full, or have different
669 	 * parents, we might never be able to merge our node, and will only
670 	 * delete it once the number of entries hits zero.
671 	 */
672 	if (nr_entries == 0)
673 		return node;
674 
675 	if (*pos > 0) {
676 		struct xfs_iext_node *prev = parent->ptrs[*pos - 1];
677 		int nr_prev = xfs_iext_node_nr_entries(prev, 0), i;
678 
679 		if (nr_prev + nr_entries <= KEYS_PER_NODE) {
680 			for (i = 0; i < nr_entries; i++) {
681 				prev->keys[nr_prev + i] = node->keys[i];
682 				prev->ptrs[nr_prev + i] = node->ptrs[i];
683 			}
684 			return node;
685 		}
686 	}
687 
688 	if (*pos + 1 < xfs_iext_node_nr_entries(parent, *pos)) {
689 		struct xfs_iext_node *next = parent->ptrs[*pos + 1];
690 		int nr_next = xfs_iext_node_nr_entries(next, 0), i;
691 
692 		if (nr_entries + nr_next <= KEYS_PER_NODE) {
693 			/*
694 			 * Merge the next node into this node so that we don't
695 			 * have to do an additional update of the keys in the
696 			 * higher levels.
697 			 */
698 			for (i = 0; i < nr_next; i++) {
699 				node->keys[nr_entries + i] = next->keys[i];
700 				node->ptrs[nr_entries + i] = next->ptrs[i];
701 			}
702 
703 			++*pos;
704 			return next;
705 		}
706 	}
707 
708 	return NULL;
709 }
710 
711 static void
712 xfs_iext_remove_node(
713 	struct xfs_ifork	*ifp,
714 	xfs_fileoff_t		offset,
715 	void			*victim)
716 {
717 	struct xfs_iext_node	*node, *parent;
718 	int			level = 2, pos, nr_entries, i;
719 
720 	ASSERT(level <= ifp->if_height);
721 	node = xfs_iext_find_level(ifp, offset, level);
722 	pos = xfs_iext_node_pos(node, offset);
723 again:
724 	ASSERT(node->ptrs[pos]);
725 	ASSERT(node->ptrs[pos] == victim);
726 	kmem_free(victim);
727 
728 	nr_entries = xfs_iext_node_nr_entries(node, pos) - 1;
729 	offset = node->keys[0];
730 	for (i = pos; i < nr_entries; i++) {
731 		node->keys[i] = node->keys[i + 1];
732 		node->ptrs[i] = node->ptrs[i + 1];
733 	}
734 	node->keys[nr_entries] = XFS_IEXT_KEY_INVALID;
735 	node->ptrs[nr_entries] = NULL;
736 
737 	if (pos == 0 && nr_entries > 0) {
738 		xfs_iext_update_node(ifp, offset, node->keys[0], level, node);
739 		offset = node->keys[0];
740 	}
741 
742 	if (nr_entries >= KEYS_PER_NODE / 2)
743 		return;
744 
745 	if (level < ifp->if_height) {
746 		/*
747 		 * If we aren't at the root yet try to find a neighbour node to
748 		 * merge with (or delete the node if it is empty), and then
749 		 * recurse up to the next level.
750 		 */
751 		level++;
752 		parent = xfs_iext_find_level(ifp, offset, level);
753 		pos = xfs_iext_node_pos(parent, offset);
754 
755 		ASSERT(pos != KEYS_PER_NODE);
756 		ASSERT(parent->ptrs[pos] == node);
757 
758 		node = xfs_iext_rebalance_node(parent, &pos, node, nr_entries);
759 		if (node) {
760 			victim = node;
761 			node = parent;
762 			goto again;
763 		}
764 	} else if (nr_entries == 1) {
765 		/*
766 		 * If we are at the root and only one entry is left we can just
767 		 * free this node and update the root pointer.
768 		 */
769 		ASSERT(node == ifp->if_u1.if_root);
770 		ifp->if_u1.if_root = node->ptrs[0];
771 		ifp->if_height--;
772 		kmem_free(node);
773 	}
774 }
775 
776 static void
777 xfs_iext_rebalance_leaf(
778 	struct xfs_ifork	*ifp,
779 	struct xfs_iext_cursor	*cur,
780 	struct xfs_iext_leaf	*leaf,
781 	xfs_fileoff_t		offset,
782 	int			nr_entries)
783 {
784 	/*
785 	 * If the neighbouring nodes are completely full we might never be able
786 	 * to merge our node, and will only delete it once the number of
787 	 * entries hits zero.
788 	 */
789 	if (nr_entries == 0)
790 		goto remove_node;
791 
792 	if (leaf->prev) {
793 		int nr_prev = xfs_iext_leaf_nr_entries(ifp, leaf->prev, 0), i;
794 
795 		if (nr_prev + nr_entries <= RECS_PER_LEAF) {
796 			for (i = 0; i < nr_entries; i++)
797 				leaf->prev->recs[nr_prev + i] = leaf->recs[i];
798 
799 			if (cur->leaf == leaf) {
800 				cur->leaf = leaf->prev;
801 				cur->pos += nr_prev;
802 			}
803 			goto remove_node;
804 		}
805 	}
806 
807 	if (leaf->next) {
808 		int nr_next = xfs_iext_leaf_nr_entries(ifp, leaf->next, 0), i;
809 
810 		if (nr_entries + nr_next <= RECS_PER_LEAF) {
811 			/*
812 			 * Merge the next node into this node so that we don't
813 			 * have to do an additional update of the keys in the
814 			 * higher levels.
815 			 */
816 			for (i = 0; i < nr_next; i++) {
817 				leaf->recs[nr_entries + i] =
818 					leaf->next->recs[i];
819 			}
820 
821 			if (cur->leaf == leaf->next) {
822 				cur->leaf = leaf;
823 				cur->pos += nr_entries;
824 			}
825 
826 			offset = xfs_iext_leaf_key(leaf->next, 0);
827 			leaf = leaf->next;
828 			goto remove_node;
829 		}
830 	}
831 
832 	return;
833 remove_node:
834 	if (leaf->prev)
835 		leaf->prev->next = leaf->next;
836 	if (leaf->next)
837 		leaf->next->prev = leaf->prev;
838 	xfs_iext_remove_node(ifp, offset, leaf);
839 }
840 
841 static void
842 xfs_iext_free_last_leaf(
843 	struct xfs_ifork	*ifp)
844 {
845 	ifp->if_height--;
846 	kmem_free(ifp->if_u1.if_root);
847 	ifp->if_u1.if_root = NULL;
848 }
849 
850 void
851 xfs_iext_remove(
852 	struct xfs_inode	*ip,
853 	struct xfs_iext_cursor	*cur,
854 	int			state)
855 {
856 	struct xfs_ifork	*ifp = xfs_iext_state_to_fork(ip, state);
857 	struct xfs_iext_leaf	*leaf = cur->leaf;
858 	xfs_fileoff_t		offset = xfs_iext_leaf_key(leaf, 0);
859 	int			i, nr_entries;
860 
861 	trace_xfs_iext_remove(ip, cur, state, _RET_IP_);
862 
863 	ASSERT(ifp->if_height > 0);
864 	ASSERT(ifp->if_u1.if_root != NULL);
865 	ASSERT(xfs_iext_valid(ifp, cur));
866 
867 	nr_entries = xfs_iext_leaf_nr_entries(ifp, leaf, cur->pos) - 1;
868 	for (i = cur->pos; i < nr_entries; i++)
869 		leaf->recs[i] = leaf->recs[i + 1];
870 	xfs_iext_rec_clear(&leaf->recs[nr_entries]);
871 	ifp->if_bytes -= sizeof(struct xfs_iext_rec);
872 
873 	if (cur->pos == 0 && nr_entries > 0) {
874 		xfs_iext_update_node(ifp, offset, xfs_iext_leaf_key(leaf, 0), 1,
875 				leaf);
876 		offset = xfs_iext_leaf_key(leaf, 0);
877 	} else if (cur->pos == nr_entries) {
878 		if (ifp->if_height > 1 && leaf->next)
879 			cur->leaf = leaf->next;
880 		else
881 			cur->leaf = NULL;
882 		cur->pos = 0;
883 	}
884 
885 	if (nr_entries >= RECS_PER_LEAF / 2)
886 		return;
887 
888 	if (ifp->if_height > 1)
889 		xfs_iext_rebalance_leaf(ifp, cur, leaf, offset, nr_entries);
890 	else if (nr_entries == 0)
891 		xfs_iext_free_last_leaf(ifp);
892 }
893 
894 /*
895  * Lookup the extent covering bno.
896  *
897  * If there is an extent covering bno return the extent index, and store the
898  * expanded extent structure in *gotp, and the extent cursor in *cur.
899  * If there is no extent covering bno, but there is an extent after it (e.g.
900  * it lies in a hole) return that extent in *gotp and its cursor in *cur
901  * instead.
902  * If bno is beyond the last extent return false, and return an invalid
903  * cursor value.
904  */
905 bool
906 xfs_iext_lookup_extent(
907 	struct xfs_inode	*ip,
908 	struct xfs_ifork	*ifp,
909 	xfs_fileoff_t		offset,
910 	struct xfs_iext_cursor	*cur,
911 	struct xfs_bmbt_irec	*gotp)
912 {
913 	XFS_STATS_INC(ip->i_mount, xs_look_exlist);
914 
915 	cur->leaf = xfs_iext_find_level(ifp, offset, 1);
916 	if (!cur->leaf) {
917 		cur->pos = 0;
918 		return false;
919 	}
920 
921 	for (cur->pos = 0; cur->pos < xfs_iext_max_recs(ifp); cur->pos++) {
922 		struct xfs_iext_rec *rec = cur_rec(cur);
923 
924 		if (xfs_iext_rec_is_empty(rec))
925 			break;
926 		if (xfs_iext_rec_cmp(rec, offset) >= 0)
927 			goto found;
928 	}
929 
930 	/* Try looking in the next node for an entry > offset */
931 	if (ifp->if_height == 1 || !cur->leaf->next)
932 		return false;
933 	cur->leaf = cur->leaf->next;
934 	cur->pos = 0;
935 	if (!xfs_iext_valid(ifp, cur))
936 		return false;
937 found:
938 	xfs_iext_get(gotp, cur_rec(cur));
939 	return true;
940 }
941 
942 /*
943  * Returns the last extent before end, and if this extent doesn't cover
944  * end, update end to the end of the extent.
945  */
946 bool
947 xfs_iext_lookup_extent_before(
948 	struct xfs_inode	*ip,
949 	struct xfs_ifork	*ifp,
950 	xfs_fileoff_t		*end,
951 	struct xfs_iext_cursor	*cur,
952 	struct xfs_bmbt_irec	*gotp)
953 {
954 	/* could be optimized to not even look up the next on a match.. */
955 	if (xfs_iext_lookup_extent(ip, ifp, *end - 1, cur, gotp) &&
956 	    gotp->br_startoff <= *end - 1)
957 		return true;
958 	if (!xfs_iext_prev_extent(ifp, cur, gotp))
959 		return false;
960 	*end = gotp->br_startoff + gotp->br_blockcount;
961 	return true;
962 }
963 
964 void
965 xfs_iext_update_extent(
966 	struct xfs_inode	*ip,
967 	int			state,
968 	struct xfs_iext_cursor	*cur,
969 	struct xfs_bmbt_irec	*new)
970 {
971 	struct xfs_ifork	*ifp = xfs_iext_state_to_fork(ip, state);
972 
973 	if (cur->pos == 0) {
974 		struct xfs_bmbt_irec	old;
975 
976 		xfs_iext_get(&old, cur_rec(cur));
977 		if (new->br_startoff != old.br_startoff) {
978 			xfs_iext_update_node(ifp, old.br_startoff,
979 					new->br_startoff, 1, cur->leaf);
980 		}
981 	}
982 
983 	trace_xfs_bmap_pre_update(ip, cur, state, _RET_IP_);
984 	xfs_iext_set(cur_rec(cur), new);
985 	trace_xfs_bmap_post_update(ip, cur, state, _RET_IP_);
986 }
987 
988 /*
989  * Return true if the cursor points at an extent and return the extent structure
990  * in gotp.  Else return false.
991  */
992 bool
993 xfs_iext_get_extent(
994 	struct xfs_ifork	*ifp,
995 	struct xfs_iext_cursor	*cur,
996 	struct xfs_bmbt_irec	*gotp)
997 {
998 	if (!xfs_iext_valid(ifp, cur))
999 		return false;
1000 	xfs_iext_get(gotp, cur_rec(cur));
1001 	return true;
1002 }
1003 
1004 /*
1005  * This is a recursive function, because of that we need to be extremely
1006  * careful with stack usage.
1007  */
1008 static void
1009 xfs_iext_destroy_node(
1010 	struct xfs_iext_node	*node,
1011 	int			level)
1012 {
1013 	int			i;
1014 
1015 	if (level > 1) {
1016 		for (i = 0; i < KEYS_PER_NODE; i++) {
1017 			if (node->keys[i] == XFS_IEXT_KEY_INVALID)
1018 				break;
1019 			xfs_iext_destroy_node(node->ptrs[i], level - 1);
1020 		}
1021 	}
1022 
1023 	kmem_free(node);
1024 }
1025 
1026 void
1027 xfs_iext_destroy(
1028 	struct xfs_ifork	*ifp)
1029 {
1030 	xfs_iext_destroy_node(ifp->if_u1.if_root, ifp->if_height);
1031 
1032 	ifp->if_bytes = 0;
1033 	ifp->if_height = 0;
1034 	ifp->if_u1.if_root = NULL;
1035 }
1036