/*
 * Copyright (C) 2001 Momchil Velikov
 * Portions Copyright (C) 2001 Christoph Hellwig
 * Copyright (C) 2005 SGI, Christoph Lameter
 * Copyright (C) 2006 Nick Piggin
 * Copyright (C) 2012 Konstantin Khlebnikov
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2, or (at
 * your option) any later version.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

#include <linux/errno.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/export.h>
#include <linux/radix-tree.h>
#include <linux/percpu.h>
#include <linux/slab.h>
#include <linux/kmemleak.h>
#include <linux/notifier.h>
#include <linux/cpu.h>
#include <linux/string.h>
#include <linux/bitops.h>
#include <linux/rcupdate.h>
#include <linux/preempt.h>		/* in_interrupt() */


/*
 * The height_to_maxindex array needs to be one deeper than the maximum
 * path as height 0 holds only 1 entry.
 */
static unsigned long height_to_maxindex[RADIX_TREE_MAX_PATH + 1] __read_mostly;

/*
 * Radix tree node cache.
 */
static struct kmem_cache *radix_tree_node_cachep;

/*
 * The radix tree is variable-height, so an insert operation not only has
 * to build the branch to its corresponding item, it also has to build the
 * branch to existing items if the size has to be increased (by
 * radix_tree_extend).
 *
 * The worst case is a zero height tree with just a single item at index 0,
 * and then inserting an item at index ULONG_MAX. This requires 2 new branches
 * of RADIX_TREE_MAX_PATH size to be created, with only the root node shared.
 * Hence:
 */
#define RADIX_TREE_PRELOAD_SIZE (RADIX_TREE_MAX_PATH * 2 - 1)
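
/*
 * For example (a worked sketch, assuming 64-bit longs and the default
 * RADIX_TREE_MAP_SHIFT of 6): RADIX_TREE_MAX_PATH is
 * DIV_ROUND_UP(64, 6) = 11, so each CPU preloads 2 * 11 - 1 = 21 nodes.
 */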

/*
 * Per-cpu pool of preloaded nodes
 */
struct radix_tree_preload {
	int nr;
	/* nodes->private_data points to next preallocated node */
	struct radix_tree_node *nodes;
};
static DEFINE_PER_CPU(struct radix_tree_preload, radix_tree_preloads) = { 0, };

static inline void *ptr_to_indirect(void *ptr)
{
	return (void *)((unsigned long)ptr | RADIX_TREE_INDIRECT_PTR);
}

static inline void *indirect_to_ptr(void *ptr)
{
	return (void *)((unsigned long)ptr & ~RADIX_TREE_INDIRECT_PTR);
}

static inline gfp_t root_gfp_mask(struct radix_tree_root *root)
{
	return root->gfp_mask & __GFP_BITS_MASK;
}

static inline void tag_set(struct radix_tree_node *node, unsigned int tag,
		int offset)
{
	__set_bit(offset, node->tags[tag]);
}

static inline void tag_clear(struct radix_tree_node *node, unsigned int tag,
		int offset)
{
	__clear_bit(offset, node->tags[tag]);
}

static inline int tag_get(struct radix_tree_node *node, unsigned int tag,
		int offset)
{
	return test_bit(offset, node->tags[tag]);
}

static inline void root_tag_set(struct radix_tree_root *root, unsigned int tag)
{
	root->gfp_mask |= (__force gfp_t)(1 << (tag + __GFP_BITS_SHIFT));
}

static inline void root_tag_clear(struct radix_tree_root *root, unsigned int tag)
{
	root->gfp_mask &= (__force gfp_t)~(1 << (tag + __GFP_BITS_SHIFT));
}

static inline void root_tag_clear_all(struct radix_tree_root *root)
{
	root->gfp_mask &= __GFP_BITS_MASK;
}

static inline int root_tag_get(struct radix_tree_root *root, unsigned int tag)
{
	return (__force unsigned)root->gfp_mask & (1 << (tag + __GFP_BITS_SHIFT));
}

/*
 * Returns 1 if any slot in the node has this tag set.
 * Otherwise returns 0.
 */
static inline int any_tag_set(struct radix_tree_node *node, unsigned int tag)
{
	int idx;
	for (idx = 0; idx < RADIX_TREE_TAG_LONGS; idx++) {
		if (node->tags[tag][idx])
			return 1;
	}
	return 0;
}

/**
 * radix_tree_find_next_bit - find the next set bit in a memory region
 *
 * @addr: The address to base the search on
 * @size: The bitmap size in bits
 * @offset: The bitnumber to start searching at
 *
 * Unrollable variant of find_next_bit() for constant size arrays.
 * Tail bits starting from size to roundup(size, BITS_PER_LONG) must be zero.
 * Returns next bit offset, or size if nothing found.
 */
static __always_inline unsigned long
radix_tree_find_next_bit(const unsigned long *addr,
			 unsigned long size, unsigned long offset)
{
	if (!__builtin_constant_p(size))
		return find_next_bit(addr, size, offset);

	if (offset < size) {
		unsigned long tmp;

		addr += offset / BITS_PER_LONG;
		tmp = *addr >> (offset % BITS_PER_LONG);
		if (tmp)
			return __ffs(tmp) + offset;
		offset = (offset + BITS_PER_LONG) & ~(BITS_PER_LONG - 1);
		while (offset < size) {
			tmp = *++addr;
			if (tmp)
				return __ffs(tmp) + offset;
			offset += BITS_PER_LONG;
		}
	}
	return size;
}

#if 0
static void dump_node(void *slot, int height, int offset)
{
	struct radix_tree_node *node;
	int i;

	if (!slot)
		return;

	if (height == 0) {
		pr_debug("radix entry %p offset %d\n", slot, offset);
		return;
	}

	node = indirect_to_ptr(slot);
	pr_debug("radix node: %p offset %d tags %lx %lx %lx path %x count %d parent %p\n",
		slot, offset, node->tags[0][0], node->tags[1][0],
		node->tags[2][0], node->path, node->count, node->parent);

	for (i = 0; i < RADIX_TREE_MAP_SIZE; i++)
		dump_node(node->slots[i], height - 1, i);
}

/* For debug */
static void radix_tree_dump(struct radix_tree_root *root)
{
	pr_debug("radix root: %p height %d rnode %p tags %x\n",
			root, root->height, root->rnode,
			root->gfp_mask >> __GFP_BITS_SHIFT);
	if (!radix_tree_is_indirect_ptr(root->rnode))
		return;
	dump_node(root->rnode, root->height, 0);
}
#endif

/*
 * This assumes that the caller has performed appropriate preallocation, and
 * that the caller has pinned this thread of control to the current CPU.
 */
static struct radix_tree_node *
radix_tree_node_alloc(struct radix_tree_root *root)
{
	struct radix_tree_node *ret = NULL;
	gfp_t gfp_mask = root_gfp_mask(root);

	/*
	 * Preload code isn't irq safe and it doesn't make sense to use
	 * preloading in interrupt context anyway, as all the allocations
	 * have to be atomic. So just do a normal allocation when in
	 * interrupt.
	 */
	if (!gfpflags_allow_blocking(gfp_mask) && !in_interrupt()) {
		struct radix_tree_preload *rtp;

		/*
		 * Even if the caller has preloaded, try to allocate from the
		 * cache first for the new node to get accounted.
		 */
		ret = kmem_cache_alloc(radix_tree_node_cachep,
				       gfp_mask | __GFP_ACCOUNT | __GFP_NOWARN);
		if (ret)
			goto out;

		/*
		 * Provided the caller has preloaded here, we will always
		 * succeed in getting a node here (and never reach
		 * kmem_cache_alloc)
		 */
		rtp = this_cpu_ptr(&radix_tree_preloads);
		if (rtp->nr) {
			ret = rtp->nodes;
			rtp->nodes = ret->private_data;
			ret->private_data = NULL;
			rtp->nr--;
		}
		/*
		 * Update the allocation stack trace as this is more useful
		 * for debugging.
		 */
		kmemleak_update_trace(ret);
		goto out;
	}
	ret = kmem_cache_alloc(radix_tree_node_cachep,
			       gfp_mask | __GFP_ACCOUNT);
out:
	BUG_ON(radix_tree_is_indirect_ptr(ret));
	return ret;
}

static void radix_tree_node_rcu_free(struct rcu_head *head)
{
	struct radix_tree_node *node =
			container_of(head, struct radix_tree_node, rcu_head);
	int i;

	/*
	 * must only free zeroed nodes into the slab. radix_tree_shrink
	 * can leave us with a non-NULL entry in the first slot, so clear
	 * that here to make sure.
	 */
	for (i = 0; i < RADIX_TREE_MAX_TAGS; i++)
		tag_clear(node, i, 0);

	node->slots[0] = NULL;
	node->count = 0;

	kmem_cache_free(radix_tree_node_cachep, node);
}

static inline void
radix_tree_node_free(struct radix_tree_node *node)
{
	call_rcu(&node->rcu_head, radix_tree_node_rcu_free);
}

/*
 * Load up this CPU's radix_tree_node buffer with sufficient objects to
 * ensure that the addition of a single element in the tree cannot fail.  On
 * success, return zero, with preemption disabled.  On error, return -ENOMEM
 * with preemption not disabled.
 *
 * To make use of this facility, the radix tree must be initialised without
 * __GFP_DIRECT_RECLAIM being passed to INIT_RADIX_TREE().
 */
static int __radix_tree_preload(gfp_t gfp_mask)
{
	struct radix_tree_preload *rtp;
	struct radix_tree_node *node;
	int ret = -ENOMEM;

	preempt_disable();
	rtp = this_cpu_ptr(&radix_tree_preloads);
	while (rtp->nr < RADIX_TREE_PRELOAD_SIZE) {
		preempt_enable();
		node = kmem_cache_alloc(radix_tree_node_cachep, gfp_mask);
		if (node == NULL)
			goto out;
		preempt_disable();
		rtp = this_cpu_ptr(&radix_tree_preloads);
		if (rtp->nr < RADIX_TREE_PRELOAD_SIZE) {
			node->private_data = rtp->nodes;
			rtp->nodes = node;
			rtp->nr++;
		} else {
			kmem_cache_free(radix_tree_node_cachep, node);
		}
	}
	ret = 0;
out:
	return ret;
}

/*
 * Load up this CPU's radix_tree_node buffer with sufficient objects to
 * ensure that the addition of a single element in the tree cannot fail.  On
 * success, return zero, with preemption disabled.  On error, return -ENOMEM
 * with preemption not disabled.
 *
 * To make use of this facility, the radix tree must be initialised without
 * __GFP_DIRECT_RECLAIM being passed to INIT_RADIX_TREE().
 */
int radix_tree_preload(gfp_t gfp_mask)
{
	/* Warn on non-sensical use... */
	WARN_ON_ONCE(!gfpflags_allow_blocking(gfp_mask));
	return __radix_tree_preload(gfp_mask);
}
EXPORT_SYMBOL(radix_tree_preload);
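
/*
 * Example usage (a hedged sketch; the tree, lock, index and item names
 * are hypothetical).  radix_tree_preload_end() re-enables preemption:
 *
 *	if (radix_tree_preload(GFP_KERNEL))
 *		return -ENOMEM;
 *	spin_lock(&my_lock);
 *	error = radix_tree_insert(&my_tree, index, item);
 *	spin_unlock(&my_lock);
 *	radix_tree_preload_end();
 */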

/*
 * The same as the function above, except we don't guarantee preloading
 * happens.  We do it if we decide it helps.  On success, return zero with
 * preemption disabled.  On error, return -ENOMEM with preemption not
 * disabled.
 */
int radix_tree_maybe_preload(gfp_t gfp_mask)
{
	if (gfpflags_allow_blocking(gfp_mask))
		return __radix_tree_preload(gfp_mask);
	/* Preloading doesn't help anything with this gfp mask, skip it */
	preempt_disable();
	return 0;
}
EXPORT_SYMBOL(radix_tree_maybe_preload);

/*
 *	Return the maximum key which can be stored in a
 *	radix tree with height HEIGHT.
 */
static inline unsigned long radix_tree_maxindex(unsigned int height)
{
	return height_to_maxindex[height];
}

/*
 *	Extend a radix tree so it can store key @index.
 */
static int radix_tree_extend(struct radix_tree_root *root,
				unsigned long index, unsigned order)
{
	struct radix_tree_node *node;
	struct radix_tree_node *slot;
	unsigned int height;
	int tag;

	/* Figure out what the height should be.  */
	height = root->height + 1;
	while (index > radix_tree_maxindex(height))
		height++;

	if ((root->rnode == NULL) && (order == 0)) {
		root->height = height;
		goto out;
	}

	do {
		unsigned int newheight;
		if (!(node = radix_tree_node_alloc(root)))
			return -ENOMEM;

		/* Propagate the aggregated tag info into the new root */
		for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) {
			if (root_tag_get(root, tag))
				tag_set(node, tag, 0);
		}

		/* Increase the height.  */
		newheight = root->height+1;
		BUG_ON(newheight & ~RADIX_TREE_HEIGHT_MASK);
		node->path = newheight;
		node->count = 1;
		node->parent = NULL;
		slot = root->rnode;
		if (radix_tree_is_indirect_ptr(slot) && newheight > 1) {
			slot = indirect_to_ptr(slot);
			slot->parent = node;
			slot = ptr_to_indirect(slot);
		}
		node->slots[0] = slot;
		node = ptr_to_indirect(node);
		rcu_assign_pointer(root->rnode, node);
		root->height = newheight;
	} while (height > root->height);
out:
	return 0;
}

/**
 *	__radix_tree_create	-	create a slot in a radix tree
 *	@root:		radix tree root
 *	@index:		index key
 *	@order:		index occupies 2^order aligned slots
 *	@nodep:		returns node
 *	@slotp:		returns slot
 *
 *	Create, if necessary, and return the node and slot for an item
 *	at position @index in the radix tree @root.
 *
 *	Until there is more than one item in the tree, no nodes are
 *	allocated and @root->rnode is used as a direct slot instead of
 *	pointing to a node, in which case *@nodep will be NULL.
 *
 *	Returns -ENOMEM, or 0 for success.
 */
int __radix_tree_create(struct radix_tree_root *root, unsigned long index,
			unsigned order, struct radix_tree_node **nodep,
			void ***slotp)
{
	struct radix_tree_node *node = NULL, *slot;
	unsigned int height, shift, offset;
	int error;

	BUG_ON((0 < order) && (order < RADIX_TREE_MAP_SHIFT));

	/* Make sure the tree is high enough.  */
	if (index > radix_tree_maxindex(root->height)) {
		error = radix_tree_extend(root, index, order);
		if (error)
			return error;
	}

	slot = root->rnode;

	height = root->height;
	shift = height * RADIX_TREE_MAP_SHIFT;

	offset = 0;			/* uninitialised var warning */
	while (shift > order) {
		if (slot == NULL) {
			/* Have to add a child node.  */
			if (!(slot = radix_tree_node_alloc(root)))
				return -ENOMEM;
			slot->path = height;
			slot->parent = node;
			if (node) {
				rcu_assign_pointer(node->slots[offset],
							ptr_to_indirect(slot));
				node->count++;
				slot->path |= offset << RADIX_TREE_HEIGHT_SHIFT;
			} else
				rcu_assign_pointer(root->rnode,
							ptr_to_indirect(slot));
		} else if (!radix_tree_is_indirect_ptr(slot))
			break;

		/* Go a level down */
		height--;
		shift -= RADIX_TREE_MAP_SHIFT;
		offset = (index >> shift) & RADIX_TREE_MAP_MASK;
		node = indirect_to_ptr(slot);
		slot = node->slots[offset];
	}

	/* Insert pointers to the canonical entry */
	if ((shift - order) > 0) {
		int i, n = 1 << (shift - order);
		offset = offset & ~(n - 1);
		slot = ptr_to_indirect(&node->slots[offset]);
		for (i = 0; i < n; i++) {
			if (node->slots[offset + i])
				return -EEXIST;
		}

		for (i = 1; i < n; i++) {
			rcu_assign_pointer(node->slots[offset + i], slot);
			node->count++;
		}
	}

	if (nodep)
		*nodep = node;
	if (slotp)
		*slotp = node ? node->slots + offset : (void **)&root->rnode;
	return 0;
}

/**
 *	__radix_tree_insert    -    insert into a radix tree
 *	@root:		radix tree root
 *	@index:		index key
 *	@order:		key covers the 2^order indices around index
 *	@item:		item to insert
 *
 *	Insert an item into the radix tree at position @index.
 */
int __radix_tree_insert(struct radix_tree_root *root, unsigned long index,
			unsigned order, void *item)
{
	struct radix_tree_node *node;
	void **slot;
	int error;

	BUG_ON(radix_tree_is_indirect_ptr(item));

	error = __radix_tree_create(root, index, order, &node, &slot);
	if (error)
		return error;
	if (*slot != NULL)
		return -EEXIST;
	rcu_assign_pointer(*slot, item);

	if (node) {
		node->count++;
		BUG_ON(tag_get(node, 0, index & RADIX_TREE_MAP_MASK));
		BUG_ON(tag_get(node, 1, index & RADIX_TREE_MAP_MASK));
	} else {
		BUG_ON(root_tag_get(root, 0));
		BUG_ON(root_tag_get(root, 1));
	}

	return 0;
}
EXPORT_SYMBOL(__radix_tree_insert);
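
/*
 * Example (a hedged sketch; the tree and item names are hypothetical):
 * callers normally use the radix_tree_insert() wrapper from
 * <linux/radix-tree.h>, which passes order 0:
 *
 *	RADIX_TREE(my_tree, GFP_KERNEL);
 *
 *	error = radix_tree_insert(&my_tree, 42UL, my_item);
 *	if (error == -EEXIST)
 *		... index 42 is already occupied ...
 */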

/**
 *	__radix_tree_lookup	-	lookup an item in a radix tree
 *	@root:		radix tree root
 *	@index:		index key
 *	@nodep:		returns node
 *	@slotp:		returns slot
 *
 *	Lookup and return the item at position @index in the radix
 *	tree @root.
 *
 *	Until there is more than one item in the tree, no nodes are
 *	allocated and @root->rnode is used as a direct slot instead of
 *	pointing to a node, in which case *@nodep will be NULL.
 */
void *__radix_tree_lookup(struct radix_tree_root *root, unsigned long index,
			  struct radix_tree_node **nodep, void ***slotp)
{
	struct radix_tree_node *node, *parent;
	unsigned int height, shift;
	void **slot;

	node = rcu_dereference_raw(root->rnode);
	if (node == NULL)
		return NULL;

	if (!radix_tree_is_indirect_ptr(node)) {
		if (index > 0)
			return NULL;

		if (nodep)
			*nodep = NULL;
		if (slotp)
			*slotp = (void **)&root->rnode;
		return node;
	}
	node = indirect_to_ptr(node);

	height = node->path & RADIX_TREE_HEIGHT_MASK;
	if (index > radix_tree_maxindex(height))
		return NULL;

	shift = (height-1) * RADIX_TREE_MAP_SHIFT;

	do {
		parent = node;
		slot = node->slots + ((index >> shift) & RADIX_TREE_MAP_MASK);
		node = rcu_dereference_raw(*slot);
		if (node == NULL)
			return NULL;
		if (!radix_tree_is_indirect_ptr(node))
			break;
		node = indirect_to_ptr(node);

		shift -= RADIX_TREE_MAP_SHIFT;
		height--;
	} while (height > 0);

	if (nodep)
		*nodep = parent;
	if (slotp)
		*slotp = slot;
	return node;
}

/**
 *	radix_tree_lookup_slot    -    lookup a slot in a radix tree
 *	@root:		radix tree root
 *	@index:		index key
 *
 *	Returns:  the slot corresponding to the position @index in the
 *	radix tree @root. This is useful for update-if-exists operations.
 *
 *	This function can be called under rcu_read_lock iff the slot is not
 *	modified by radix_tree_replace_slot, otherwise it must be called
 *	exclusive from other writers. Any dereference of the slot must be done
 *	using radix_tree_deref_slot.
 */
void **radix_tree_lookup_slot(struct radix_tree_root *root, unsigned long index)
{
	void **slot;

	if (!__radix_tree_lookup(root, index, NULL, &slot))
		return NULL;
	return slot;
}
EXPORT_SYMBOL(radix_tree_lookup_slot);
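
/*
 * Example (a hedged sketch; the tree, lock and item names are
 * hypothetical): update-if-exists with other writers excluded, e.g.
 * under the caller's lock:
 *
 *	spin_lock(&my_lock);
 *	slot = radix_tree_lookup_slot(&my_tree, index);
 *	if (slot)
 *		radix_tree_replace_slot(slot, new_item);
 *	spin_unlock(&my_lock);
 */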

/**
 *	radix_tree_lookup    -    perform lookup operation on a radix tree
 *	@root:		radix tree root
 *	@index:		index key
 *
 *	Lookup the item at the position @index in the radix tree @root.
 *
 *	This function can be called under rcu_read_lock, however the caller
 *	must manage lifetimes of leaf nodes (eg. RCU may also be used to free
 *	them safely). No RCU barriers are required to access or modify the
 *	returned item, however.
 */
void *radix_tree_lookup(struct radix_tree_root *root, unsigned long index)
{
	return __radix_tree_lookup(root, index, NULL, NULL);
}
EXPORT_SYMBOL(radix_tree_lookup);
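
/*
 * Example (a hedged sketch; the tree and item names are hypothetical):
 *
 *	rcu_read_lock();
 *	item = radix_tree_lookup(&my_tree, index);
 *	if (item)
 *		... use item; its lifetime must be guaranteed by the
 *	    caller, e.g. via a refcount or an RCU-deferred free ...
 *	rcu_read_unlock();
 */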

/**
 *	radix_tree_tag_set - set a tag on a radix tree node
 *	@root:		radix tree root
 *	@index:		index key
 *	@tag:		tag index
 *
 *	Set the search tag (which must be < RADIX_TREE_MAX_TAGS)
 *	corresponding to @index in the radix tree, from
 *	the root all the way down to the leaf node.
 *
 *	Returns the address of the tagged item.  Setting a tag on a not-present
 *	item is a bug.
 */
void *radix_tree_tag_set(struct radix_tree_root *root,
			unsigned long index, unsigned int tag)
{
	unsigned int height, shift;
	struct radix_tree_node *slot;

	height = root->height;
	BUG_ON(index > radix_tree_maxindex(height));

	slot = indirect_to_ptr(root->rnode);
	shift = (height - 1) * RADIX_TREE_MAP_SHIFT;

	while (height > 0) {
		int offset;

		offset = (index >> shift) & RADIX_TREE_MAP_MASK;
		if (!tag_get(slot, tag, offset))
			tag_set(slot, tag, offset);
		slot = slot->slots[offset];
		BUG_ON(slot == NULL);
		if (!radix_tree_is_indirect_ptr(slot))
			break;
		slot = indirect_to_ptr(slot);
		shift -= RADIX_TREE_MAP_SHIFT;
		height--;
	}

	/* set the root's tag bit */
	if (slot && !root_tag_get(root, tag))
		root_tag_set(root, tag);

	return slot;
}
EXPORT_SYMBOL(radix_tree_tag_set);

/**
 *	radix_tree_tag_clear - clear a tag on a radix tree node
 *	@root:		radix tree root
 *	@index:		index key
 *	@tag:		tag index
 *
 *	Clear the search tag (which must be < RADIX_TREE_MAX_TAGS)
 *	corresponding to @index in the radix tree.  If
 *	this causes the leaf node to have no tags set then clear the tag in the
 *	next-to-leaf node, etc.
 *
 *	Returns the address of the tagged item on success, else NULL.  ie:
 *	has the same return value and semantics as radix_tree_lookup().
 */
void *radix_tree_tag_clear(struct radix_tree_root *root,
			unsigned long index, unsigned int tag)
{
	struct radix_tree_node *node = NULL;
	struct radix_tree_node *slot = NULL;
	unsigned int height, shift;
	int uninitialized_var(offset);

	height = root->height;
	if (index > radix_tree_maxindex(height))
		goto out;

	shift = height * RADIX_TREE_MAP_SHIFT;
	slot = root->rnode;

	while (shift) {
		if (slot == NULL)
			goto out;
		if (!radix_tree_is_indirect_ptr(slot))
			break;
		slot = indirect_to_ptr(slot);

		shift -= RADIX_TREE_MAP_SHIFT;
		offset = (index >> shift) & RADIX_TREE_MAP_MASK;
		node = slot;
		slot = slot->slots[offset];
	}

	if (slot == NULL)
		goto out;

	while (node) {
		if (!tag_get(node, tag, offset))
			goto out;
		tag_clear(node, tag, offset);
		if (any_tag_set(node, tag))
			goto out;

		index >>= RADIX_TREE_MAP_SHIFT;
		offset = index & RADIX_TREE_MAP_MASK;
		node = node->parent;
	}

	/* clear the root's tag bit */
	if (root_tag_get(root, tag))
		root_tag_clear(root, tag);

out:
	return slot;
}
EXPORT_SYMBOL(radix_tree_tag_clear);

/**
 * radix_tree_tag_get - get a tag on a radix tree node
 * @root:		radix tree root
 * @index:		index key
 * @tag:		tag index (< RADIX_TREE_MAX_TAGS)
 *
 * Return values:
 *
 *  0: tag not present or not set
 *  1: tag set
 *
 * Note that the return value of this function may not be relied on, even if
 * the RCU lock is held, unless tag modification and node deletion are excluded
 * from concurrency.
 */
int radix_tree_tag_get(struct radix_tree_root *root,
			unsigned long index, unsigned int tag)
{
	unsigned int height, shift;
	struct radix_tree_node *node;

	/* check the root's tag bit */
	if (!root_tag_get(root, tag))
		return 0;

	node = rcu_dereference_raw(root->rnode);
	if (node == NULL)
		return 0;

	if (!radix_tree_is_indirect_ptr(node))
		return (index == 0);
	node = indirect_to_ptr(node);

	height = node->path & RADIX_TREE_HEIGHT_MASK;
	if (index > radix_tree_maxindex(height))
		return 0;

	shift = (height - 1) * RADIX_TREE_MAP_SHIFT;

	for ( ; ; ) {
		int offset;

		if (node == NULL)
			return 0;
		node = indirect_to_ptr(node);

		offset = (index >> shift) & RADIX_TREE_MAP_MASK;
		if (!tag_get(node, tag, offset))
			return 0;
		if (height == 1)
			return 1;
		node = rcu_dereference_raw(node->slots[offset]);
		if (!radix_tree_is_indirect_ptr(node))
			return 1;
		shift -= RADIX_TREE_MAP_SHIFT;
		height--;
	}
}
EXPORT_SYMBOL(radix_tree_tag_get);
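
/*
 * Example (a hedged sketch; my_tree, my_lock and MY_TAG are
 * hypothetical, with MY_TAG < RADIX_TREE_MAX_TAGS).  Tag updates must
 * be excluded from other tree modifications, e.g. by the caller's lock:
 *
 *	spin_lock(&my_lock);
 *	radix_tree_tag_set(&my_tree, index, MY_TAG);
 *	...
 *	if (radix_tree_tag_get(&my_tree, index, MY_TAG))
 *		radix_tree_tag_clear(&my_tree, index, MY_TAG);
 *	spin_unlock(&my_lock);
 */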

/**
 * radix_tree_next_chunk - find next chunk of slots for iteration
 *
 * @root:	radix tree root
 * @iter:	iterator state
 * @flags:	RADIX_TREE_ITER_* flags and tag index
 * Returns:	pointer to chunk first slot, or NULL if iteration is over
 */
void **radix_tree_next_chunk(struct radix_tree_root *root,
			     struct radix_tree_iter *iter, unsigned flags)
{
	unsigned shift, tag = flags & RADIX_TREE_ITER_TAG_MASK;
	struct radix_tree_node *rnode, *node;
	unsigned long index, offset, height;

	if ((flags & RADIX_TREE_ITER_TAGGED) && !root_tag_get(root, tag))
		return NULL;

	/*
	 * Catch next_index overflow after ~0UL. iter->index never overflows
	 * during iteration; it can be zero only at the beginning.
	 * And we cannot overflow iter->next_index in a single step,
	 * because RADIX_TREE_MAP_SHIFT < BITS_PER_LONG.
	 *
	 * This condition is also used by radix_tree_next_slot() to stop
	 * contiguous iteration, and to forbid switching to the next chunk.
	 */
	index = iter->next_index;
	if (!index && iter->index)
		return NULL;

	rnode = rcu_dereference_raw(root->rnode);
	if (radix_tree_is_indirect_ptr(rnode)) {
		rnode = indirect_to_ptr(rnode);
	} else if (rnode && !index) {
		/* Single-slot tree */
		iter->index = 0;
		iter->next_index = 1;
		iter->tags = 1;
		return (void **)&root->rnode;
	} else
		return NULL;

restart:
	height = rnode->path & RADIX_TREE_HEIGHT_MASK;
	shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
	offset = index >> shift;

	/* Index outside of the tree */
	if (offset >= RADIX_TREE_MAP_SIZE)
		return NULL;

	node = rnode;
	while (1) {
		struct radix_tree_node *slot;
		if ((flags & RADIX_TREE_ITER_TAGGED) ?
				!test_bit(offset, node->tags[tag]) :
				!node->slots[offset]) {
			/* Hole detected */
			if (flags & RADIX_TREE_ITER_CONTIG)
				return NULL;

			if (flags & RADIX_TREE_ITER_TAGGED)
				offset = radix_tree_find_next_bit(
						node->tags[tag],
						RADIX_TREE_MAP_SIZE,
						offset + 1);
			else
				while (++offset < RADIX_TREE_MAP_SIZE) {
					if (node->slots[offset])
						break;
				}
			index &= ~((RADIX_TREE_MAP_SIZE << shift) - 1);
			index += offset << shift;
			/* Overflow after ~0UL */
			if (!index)
				return NULL;
			if (offset == RADIX_TREE_MAP_SIZE)
				goto restart;
		}

		/* This is a leaf node */
		if (!shift)
			break;

		slot = rcu_dereference_raw(node->slots[offset]);
		if (slot == NULL)
			goto restart;
		if (!radix_tree_is_indirect_ptr(slot))
			break;
		node = indirect_to_ptr(slot);
		shift -= RADIX_TREE_MAP_SHIFT;
		offset = (index >> shift) & RADIX_TREE_MAP_MASK;
	}

	/* Update the iterator state */
	iter->index = index;
	iter->next_index = (index | RADIX_TREE_MAP_MASK) + 1;

	/* Construct iter->tags bit-mask from node->tags[tag] array */
	if (flags & RADIX_TREE_ITER_TAGGED) {
		unsigned tag_long, tag_bit;

		tag_long = offset / BITS_PER_LONG;
		tag_bit  = offset % BITS_PER_LONG;
		iter->tags = node->tags[tag][tag_long] >> tag_bit;
		/* This never happens if RADIX_TREE_TAG_LONGS == 1 */
		if (tag_long < RADIX_TREE_TAG_LONGS - 1) {
			/* Pick tags from next element */
			if (tag_bit)
				iter->tags |= node->tags[tag][tag_long + 1] <<
						(BITS_PER_LONG - tag_bit);
			/* Clip chunk size, here only BITS_PER_LONG tags */
			iter->next_index = index + BITS_PER_LONG;
		}
	}

	return node->slots + offset;
}
EXPORT_SYMBOL(radix_tree_next_chunk);
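
/*
 * Example (a hedged sketch; the tree and item names are hypothetical):
 * this function is normally used through the iterator macros from
 * <linux/radix-tree.h> rather than called directly:
 *
 *	struct radix_tree_iter iter;
 *	void **slot;
 *
 *	rcu_read_lock();
 *	radix_tree_for_each_slot(slot, &my_tree, &iter, 0) {
 *		struct my_item *item = radix_tree_deref_slot(slot);
 *		if (radix_tree_deref_retry(item)) {
 *			slot = radix_tree_iter_retry(&iter);
 *			continue;
 *		}
 *		... process item at iter.index ...
 *	}
 *	rcu_read_unlock();
 */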

/**
 * radix_tree_range_tag_if_tagged - for each item in given range set given
 *				   tag if item has another tag set
 * @root:		radix tree root
 * @first_indexp:	pointer to a starting index of a range to scan
 * @last_index:		last index of a range to scan
 * @nr_to_tag:		maximum number of items to tag
 * @iftag:		tag index to test
 * @settag:		tag index to set if tested tag is set
 *
 * This function scans the range of the radix tree from first_index to
 * last_index (inclusive).  For each item in the range, if iftag is set, the
 * function also sets settag.  The function stops either after tagging
 * nr_to_tag items or after reaching last_index.
 *
 * The tags must be set from the leaf level only and propagated back up the
 * path to the root. We must do this so that we resolve the full path before
 * setting any tags on intermediate nodes. If we set tags as we descend, then
 * we can get to the leaf node and find that the index that has the iftag
 * set is outside the range we are scanning. This results in dangling tags and
 * can lead to problems with later tag operations (e.g. livelocks on lookups).
 *
 * The function returns the number of leaves where the tag was set and sets
 * *first_indexp to the first unscanned index.
 * WARNING! *first_indexp can wrap if last_index is ULONG_MAX. Caller must
 * be prepared to handle that.
 */
unsigned long radix_tree_range_tag_if_tagged(struct radix_tree_root *root,
		unsigned long *first_indexp, unsigned long last_index,
		unsigned long nr_to_tag,
		unsigned int iftag, unsigned int settag)
{
	unsigned int height = root->height;
	struct radix_tree_node *node = NULL;
	struct radix_tree_node *slot;
	unsigned int shift;
	unsigned long tagged = 0;
	unsigned long index = *first_indexp;

	last_index = min(last_index, radix_tree_maxindex(height));
	if (index > last_index)
		return 0;
	if (!nr_to_tag)
		return 0;
	if (!root_tag_get(root, iftag)) {
		*first_indexp = last_index + 1;
		return 0;
	}
	if (height == 0) {
		*first_indexp = last_index + 1;
		root_tag_set(root, settag);
		return 1;
	}

	shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
	slot = indirect_to_ptr(root->rnode);

	for (;;) {
		unsigned long upindex;
		int offset;

		offset = (index >> shift) & RADIX_TREE_MAP_MASK;
		if (!slot->slots[offset])
			goto next;
		if (!tag_get(slot, iftag, offset))
			goto next;
		if (shift) {
			node = slot;
			slot = slot->slots[offset];
			if (radix_tree_is_indirect_ptr(slot)) {
				slot = indirect_to_ptr(slot);
				shift -= RADIX_TREE_MAP_SHIFT;
				continue;
			} else {
				slot = node;
				node = node->parent;
			}
		}

		/* tag the leaf */
		tagged += 1 << shift;
		tag_set(slot, settag, offset);

		/* walk back up the path tagging interior nodes */
		upindex = index;
		while (node) {
			upindex >>= RADIX_TREE_MAP_SHIFT;
			offset = upindex & RADIX_TREE_MAP_MASK;

			/* stop if we find a node with the tag already set */
			if (tag_get(node, settag, offset))
				break;
			tag_set(node, settag, offset);
			node = node->parent;
		}

		/*
		 * Small optimization: now clear that node pointer.
		 * Since all of this slot's ancestors now have the tag set
		 * from setting it above, we have no further need to walk
		 * back up the tree setting tags, until we update slot to
		 * point to another radix_tree_node.
		 */
		node = NULL;

next:
		/* Go to next item at level determined by 'shift' */
		index = ((index >> shift) + 1) << shift;
		/* Overflow can happen when last_index is ~0UL... */
		if (index > last_index || !index)
			break;
		if (tagged >= nr_to_tag)
			break;
		while (((index >> shift) & RADIX_TREE_MAP_MASK) == 0) {
			/*
			 * We've fully scanned this node. Go up. Because
			 * last_index is guaranteed to be in the tree, what
			 * we do below cannot wander astray.
			 */
			slot = slot->parent;
			shift += RADIX_TREE_MAP_SHIFT;
		}
	}
	/*
	 * We need not set the root tag if no leaf within the range from
	 * *first_indexp to last_index was tagged with settag.
	 */
	if (tagged > 0)
		root_tag_set(root, settag);
	*first_indexp = index;

	return tagged;
}
EXPORT_SYMBOL(radix_tree_range_tag_if_tagged);
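
/*
 * Example (a hedged sketch modelled on how page writeback batches the
 * tagging of dirty pages for write-out; the tree, lock, tags and BATCH
 * constant here are all hypothetical):
 *
 *	unsigned long start = first_index, tagged;
 *
 *	do {
 *		spin_lock_irq(&my_lock);
 *		tagged = radix_tree_range_tag_if_tagged(&my_tree, &start,
 *				last_index, BATCH, MY_IFTAG, MY_SETTAG);
 *		spin_unlock_irq(&my_lock);
 *		cond_resched();
 *	} while (tagged >= BATCH && start);
 */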

/**
 *	radix_tree_gang_lookup - perform multiple lookup on a radix tree
 *	@root:		radix tree root
 *	@results:	where the results of the lookup are placed
 *	@first_index:	start the lookup from this key
 *	@max_items:	place up to this many items at *results
 *
 *	Performs an index-ascending scan of the tree for present items.  Places
 *	them at *@results and returns the number of items which were placed at
 *	*@results.
 *
 *	The implementation is naive.
 *
 *	Like radix_tree_lookup, radix_tree_gang_lookup may be called under
 *	rcu_read_lock. In this case, rather than the returned results being
 *	an atomic snapshot of the tree at a single point in time, the semantics
 *	of an RCU protected gang lookup are as though multiple radix_tree_lookups
 *	have been issued in individual locks, and results stored in 'results'.
 */
unsigned int
radix_tree_gang_lookup(struct radix_tree_root *root, void **results,
			unsigned long first_index, unsigned int max_items)
{
	struct radix_tree_iter iter;
	void **slot;
	unsigned int ret = 0;

	if (unlikely(!max_items))
		return 0;

	radix_tree_for_each_slot(slot, root, &iter, first_index) {
		results[ret] = rcu_dereference_raw(*slot);
		if (!results[ret])
			continue;
		if (radix_tree_is_indirect_ptr(results[ret])) {
			slot = radix_tree_iter_retry(&iter);
			continue;
		}
		if (++ret == max_items)
			break;
	}

	return ret;
}
EXPORT_SYMBOL(radix_tree_gang_lookup);
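
/*
 * Example (a hedged sketch; the tree and batch names are hypothetical):
 *
 *	void *batch[16];
 *	unsigned int i, nr;
 *
 *	rcu_read_lock();
 *	nr = radix_tree_gang_lookup(&my_tree, batch, first_index,
 *				    ARRAY_SIZE(batch));
 *	for (i = 0; i < nr; i++)
 *		... batch[i] is a present item; managing its lifetime is
 *	    the caller's job, as with radix_tree_lookup() ...
 *	rcu_read_unlock();
 */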

/**
 *	radix_tree_gang_lookup_slot - perform multiple slot lookup on radix tree
 *	@root:		radix tree root
 *	@results:	where the results of the lookup are placed
 *	@indices:	where their indices should be placed (but usually NULL)
 *	@first_index:	start the lookup from this key
 *	@max_items:	place up to this many items at *results
 *
 *	Performs an index-ascending scan of the tree for present items.  Places
 *	their slots at *@results and returns the number of items which were
 *	placed at *@results.
 *
 *	The implementation is naive.
 *
 *	Like radix_tree_gang_lookup as far as RCU and locking goes. Slots must
 *	be dereferenced with radix_tree_deref_slot, and if using only RCU
 *	protection, radix_tree_deref_slot may fail requiring a retry.
 */
unsigned int
radix_tree_gang_lookup_slot(struct radix_tree_root *root,
			void ***results, unsigned long *indices,
			unsigned long first_index, unsigned int max_items)
{
	struct radix_tree_iter iter;
	void **slot;
	unsigned int ret = 0;

	if (unlikely(!max_items))
		return 0;

	radix_tree_for_each_slot(slot, root, &iter, first_index) {
		results[ret] = slot;
		if (indices)
			indices[ret] = iter.index;
		if (++ret == max_items)
			break;
	}

	return ret;
}
EXPORT_SYMBOL(radix_tree_gang_lookup_slot);

/**
 *	radix_tree_gang_lookup_tag - perform multiple lookup on a radix tree
 *	                             based on a tag
 *	@root:		radix tree root
 *	@results:	where the results of the lookup are placed
 *	@first_index:	start the lookup from this key
 *	@max_items:	place up to this many items at *results
 *	@tag:		the tag index (< RADIX_TREE_MAX_TAGS)
 *
 *	Performs an index-ascending scan of the tree for present items which
 *	have the tag indexed by @tag set.  Places the items at *@results and
 *	returns the number of items which were placed at *@results.
 */
unsigned int
radix_tree_gang_lookup_tag(struct radix_tree_root *root, void **results,
		unsigned long first_index, unsigned int max_items,
		unsigned int tag)
{
	struct radix_tree_iter iter;
	void **slot;
	unsigned int ret = 0;

	if (unlikely(!max_items))
		return 0;

	radix_tree_for_each_tagged(slot, root, &iter, first_index, tag) {
		results[ret] = rcu_dereference_raw(*slot);
		if (!results[ret])
			continue;
		if (radix_tree_is_indirect_ptr(results[ret])) {
			slot = radix_tree_iter_retry(&iter);
			continue;
		}
		if (++ret == max_items)
			break;
	}

	return ret;
}
EXPORT_SYMBOL(radix_tree_gang_lookup_tag);

/**
 *	radix_tree_gang_lookup_tag_slot - perform multiple slot lookup on a
 *					  radix tree based on a tag
 *	@root:		radix tree root
 *	@results:	where the results of the lookup are placed
 *	@first_index:	start the lookup from this key
 *	@max_items:	place up to this many items at *results
 *	@tag:		the tag index (< RADIX_TREE_MAX_TAGS)
 *
 *	Performs an index-ascending scan of the tree for present items which
 *	have the tag indexed by @tag set.  Places the slots at *@results and
 *	returns the number of slots which were placed at *@results.
 */
unsigned int
radix_tree_gang_lookup_tag_slot(struct radix_tree_root *root, void ***results,
		unsigned long first_index, unsigned int max_items,
		unsigned int tag)
{
	struct radix_tree_iter iter;
	void **slot;
	unsigned int ret = 0;

	if (unlikely(!max_items))
		return 0;

	radix_tree_for_each_tagged(slot, root, &iter, first_index, tag) {
		results[ret] = slot;
		if (++ret == max_items)
			break;
	}

	return ret;
}
EXPORT_SYMBOL(radix_tree_gang_lookup_tag_slot);

#if defined(CONFIG_SHMEM) && defined(CONFIG_SWAP)
#include <linux/sched.h> /* for cond_resched() */

/*
 * This linear search is at present only useful to shmem_unuse_inode().
 */
static unsigned long __locate(struct radix_tree_node *slot, void *item,
			      unsigned long index, unsigned long *found_index)
{
	unsigned int shift, height;
	unsigned long i;

	height = slot->path & RADIX_TREE_HEIGHT_MASK;
	shift = (height-1) * RADIX_TREE_MAP_SHIFT;

	for ( ; height > 1; height--) {
		i = (index >> shift) & RADIX_TREE_MAP_MASK;
		for (;;) {
			if (slot->slots[i] != NULL)
				break;
			index &= ~((1UL << shift) - 1);
			index += 1UL << shift;
			if (index == 0)
				goto out;	/* 32-bit wraparound */
			i++;
			if (i == RADIX_TREE_MAP_SIZE)
				goto out;
		}

		slot = rcu_dereference_raw(slot->slots[i]);
		if (slot == NULL)
			goto out;
		if (!radix_tree_is_indirect_ptr(slot)) {
			if (slot == item) {
				*found_index = index + i;
				index = 0;
			} else {
				index += shift;
			}
			goto out;
		}
		slot = indirect_to_ptr(slot);
		shift -= RADIX_TREE_MAP_SHIFT;
	}

	/* Bottom level: check items */
	for (i = 0; i < RADIX_TREE_MAP_SIZE; i++) {
		if (slot->slots[i] == item) {
			*found_index = index + i;
			index = 0;
			goto out;
		}
	}
	index += RADIX_TREE_MAP_SIZE;
out:
	return index;
}

/**
 *	radix_tree_locate_item - search through radix tree for item
 *	@root:		radix tree root
 *	@item:		item to be found
 *
 *	Returns index where item was found, or -1 if not found.
 *	Caller must hold no lock (since this time-consuming function needs
 *	to be preemptible), and must check afterwards if item is still there.
 */
unsigned long radix_tree_locate_item(struct radix_tree_root *root, void *item)
{
	struct radix_tree_node *node;
	unsigned long max_index;
	unsigned long cur_index = 0;
	unsigned long found_index = -1;

	do {
		rcu_read_lock();
		node = rcu_dereference_raw(root->rnode);
		if (!radix_tree_is_indirect_ptr(node)) {
			rcu_read_unlock();
			if (node == item)
				found_index = 0;
			break;
		}

		node = indirect_to_ptr(node);
		max_index = radix_tree_maxindex(node->path &
						RADIX_TREE_HEIGHT_MASK);
		if (cur_index > max_index) {
			rcu_read_unlock();
			break;
		}

		cur_index = __locate(node, item, cur_index, &found_index);
		rcu_read_unlock();
		cond_resched();
	} while (cur_index != 0 && cur_index <= max_index);

	return found_index;
}
#else
unsigned long radix_tree_locate_item(struct radix_tree_root *root, void *item)
{
	return -1;
}
#endif /* CONFIG_SHMEM && CONFIG_SWAP */

/**
 *	radix_tree_shrink    -    shrink height of a radix tree to minimum
 *	@root:		radix tree root
 */
static inline void radix_tree_shrink(struct radix_tree_root *root)
{
	/* try to shrink tree height */
	while (root->height > 0) {
		struct radix_tree_node *to_free = root->rnode;
		struct radix_tree_node *slot;

		BUG_ON(!radix_tree_is_indirect_ptr(to_free));
		to_free = indirect_to_ptr(to_free);

		/*
		 * If the candidate node has more than one child, or its child
		 * is not at the leftmost slot, or the child is a multiorder
		 * entry, we cannot shrink.
		 */
		if (to_free->count != 1)
			break;
		slot = to_free->slots[0];
		if (!slot)
			break;

		/*
		 * We don't need rcu_assign_pointer(), since we are simply
		 * moving the node from one part of the tree to another: if it
		 * was safe to dereference the old pointer to it
		 * (to_free->slots[0]), it will be safe to dereference the new
		 * one (root->rnode) as far as dependent read barriers go.
		 */
		if (root->height > 1) {
			if (!radix_tree_is_indirect_ptr(slot))
				break;

			slot = indirect_to_ptr(slot);
			slot->parent = NULL;
			slot = ptr_to_indirect(slot);
		}
		root->rnode = slot;
		root->height--;

		/*
		 * We have a dilemma here. The node's slot[0] must not be
		 * NULLed in case there are concurrent lookups expecting to
		 * find the item. However if this was a bottom-level node,
		 * then it may be subject to the slot pointer being visible
		 * to callers dereferencing it. If the item corresponding to
		 * slot[0] is subsequently deleted, these callers would expect
		 * their slot to become empty sooner or later.
		 *
		 * For example, lockless pagecache will look up a slot, deref
		 * the page pointer, and if the page has a zero refcount it
		 * means it was concurrently deleted from pagecache so try the
		 * deref again. Fortunately there is already a requirement for
		 * logic to retry the entire slot lookup -- the indirect
		 * pointer problem (replacing the direct root node with an
		 * indirect pointer also results in a stale slot). So tag the
		 * slot as indirect to force callers to retry.
		 */
		if (root->height == 0)
			*((unsigned long *)&to_free->slots[0]) |=
						RADIX_TREE_INDIRECT_PTR;

		radix_tree_node_free(to_free);
	}
}

/**
 *	__radix_tree_delete_node    -    try to free node after clearing a slot
 *	@root:		radix tree root
 *	@node:		node containing @index
 *
 *	After clearing the slot at @index in @node from radix tree
 *	rooted at @root, call this function to attempt freeing the
 *	node and shrinking the tree.
 *
 *	Returns %true if @node was freed, %false otherwise.
 */
bool __radix_tree_delete_node(struct radix_tree_root *root,
			      struct radix_tree_node *node)
{
	bool deleted = false;

	do {
		struct radix_tree_node *parent;

		if (node->count) {
			if (node == indirect_to_ptr(root->rnode)) {
				radix_tree_shrink(root);
				if (root->height == 0)
					deleted = true;
			}
			return deleted;
		}

		parent = node->parent;
		if (parent) {
			unsigned int offset;

			offset = node->path >> RADIX_TREE_HEIGHT_SHIFT;
			parent->slots[offset] = NULL;
			parent->count--;
		} else {
			root_tag_clear_all(root);
			root->height = 0;
			root->rnode = NULL;
		}

		radix_tree_node_free(node);
		deleted = true;

		node = parent;
	} while (node);

	return deleted;
}

/**
 *	radix_tree_delete_item    -    delete an item from a radix tree
 *	@root:		radix tree root
 *	@index:		index key
 *	@item:		expected item
 *
 *	Remove @item at @index from the radix tree rooted at @root.
 *
 *	Returns the address of the deleted item, or NULL if it was not present
 *	or the entry at the given @index was not @item.
 */
void *radix_tree_delete_item(struct radix_tree_root *root,
			     unsigned long index, void *item)
{
	struct radix_tree_node *node;
	unsigned int offset, i;
	void **slot;
	void *entry;
	int tag;

	entry = __radix_tree_lookup(root, index, &node, &slot);
	if (!entry)
		return NULL;

	if (item && entry != item)
		return NULL;

	if (!node) {
		root_tag_clear_all(root);
		root->rnode = NULL;
		return entry;
	}

	offset = index & RADIX_TREE_MAP_MASK;

	/*
	 * Clear all tags associated with the item to be deleted.
	 * Doing it this way is inefficient, but seldom is any tag set.
	 */
	for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) {
		if (tag_get(node, tag, offset))
			radix_tree_tag_clear(root, index, tag);
	}

	/* Delete any sibling slots pointing to this slot */
	for (i = 1; offset + i < RADIX_TREE_MAP_SIZE; i++) {
		if (node->slots[offset + i] != ptr_to_indirect(slot))
			break;
		node->slots[offset + i] = NULL;
		node->count--;
	}
	node->slots[offset] = NULL;
	node->count--;

	__radix_tree_delete_node(root, node);

	return entry;
}
EXPORT_SYMBOL(radix_tree_delete_item);

/**
 *	radix_tree_delete    -    delete an item from a radix tree
 *	@root:		radix tree root
 *	@index:		index key
 *
 *	Remove the item at @index from the radix tree rooted at @root.
 *
 *	Returns the address of the deleted item, or NULL if it was not present.
 */
void *radix_tree_delete(struct radix_tree_root *root, unsigned long index)
{
	return radix_tree_delete_item(root, index, NULL);
}
EXPORT_SYMBOL(radix_tree_delete);
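
/*
 * Example (a hedged sketch; the tree, lock and item names are
 * hypothetical).  Deletion must be excluded from other modifications,
 * e.g. by the caller's lock:
 *
 *	spin_lock(&my_lock);
 *	item = radix_tree_delete(&my_tree, index);
 *	spin_unlock(&my_lock);
 *	if (item)
 *		... item was present and is now removed; free it only
 *	    once concurrent RCU readers can no longer see it ...
 */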

/**
 *	radix_tree_tagged - test whether any items in the tree are tagged
 *	@root:		radix tree root
 *	@tag:		tag to test
 */
int radix_tree_tagged(struct radix_tree_root *root, unsigned int tag)
{
	return root_tag_get(root, tag);
}
EXPORT_SYMBOL(radix_tree_tagged);

static void
radix_tree_node_ctor(void *arg)
{
	struct radix_tree_node *node = arg;

	memset(node, 0, sizeof(*node));
	INIT_LIST_HEAD(&node->private_list);
}

static __init unsigned long __maxindex(unsigned int height)
{
	unsigned int width = height * RADIX_TREE_MAP_SHIFT;
	int shift = RADIX_TREE_INDEX_BITS - width;

	if (shift < 0)
		return ~0UL;
	if (shift >= BITS_PER_LONG)
		return 0UL;
	return ~0UL >> shift;
}
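
/*
 * For example (a worked sketch, assuming RADIX_TREE_MAP_SHIFT == 6 and
 * 64-bit longs): __maxindex(0) == 0, __maxindex(1) == 63,
 * __maxindex(2) == 4095, and __maxindex(11) == ~0UL since 11 * 6 > 64.
 */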

static __init void radix_tree_init_maxindex(void)
{
	unsigned int i;

	for (i = 0; i < ARRAY_SIZE(height_to_maxindex); i++)
		height_to_maxindex[i] = __maxindex(i);
}

static int radix_tree_callback(struct notifier_block *nfb,
			       unsigned long action,
			       void *hcpu)
{
	int cpu = (long)hcpu;
	struct radix_tree_preload *rtp;
	struct radix_tree_node *node;

	/* Free per-cpu pool of preloaded nodes */
	if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
		rtp = &per_cpu(radix_tree_preloads, cpu);
		while (rtp->nr) {
			node = rtp->nodes;
			rtp->nodes = node->private_data;
			kmem_cache_free(radix_tree_node_cachep, node);
			rtp->nr--;
		}
	}
	return NOTIFY_OK;
}

void __init radix_tree_init(void)
{
	radix_tree_node_cachep = kmem_cache_create("radix_tree_node",
			sizeof(struct radix_tree_node), 0,
			SLAB_PANIC | SLAB_RECLAIM_ACCOUNT,
			radix_tree_node_ctor);
	radix_tree_init_maxindex();
	hotcpu_notifier(radix_tree_callback, 0);
}
1617