xref: /openbmc/linux/lib/radix-tree.c (revision e868d61272caa648214046a096e5a6bfc068dc8c)
1 /*
2  * Copyright (C) 2001 Momchil Velikov
3  * Portions Copyright (C) 2001 Christoph Hellwig
4  * Copyright (C) 2005 SGI, Christoph Lameter <clameter@sgi.com>
5  * Copyright (C) 2006 Nick Piggin
6  *
7  * This program is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU General Public License as
9  * published by the Free Software Foundation; either version 2, or (at
10  * your option) any later version.
11  *
12  * This program is distributed in the hope that it will be useful, but
13  * WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15  * General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with this program; if not, write to the Free Software
19  * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20  */
21 
22 #include <linux/errno.h>
23 #include <linux/init.h>
24 #include <linux/kernel.h>
25 #include <linux/module.h>
26 #include <linux/radix-tree.h>
27 #include <linux/percpu.h>
28 #include <linux/slab.h>
29 #include <linux/notifier.h>
30 #include <linux/cpu.h>
31 #include <linux/gfp.h>
32 #include <linux/string.h>
33 #include <linux/bitops.h>
34 #include <linux/rcupdate.h>
35 
36 
37 #ifdef __KERNEL__
38 #define RADIX_TREE_MAP_SHIFT	(CONFIG_BASE_SMALL ? 4 : 6)
39 #else
40 #define RADIX_TREE_MAP_SHIFT	3	/* For more stressful testing */
41 #endif
42 
43 #define RADIX_TREE_MAP_SIZE	(1UL << RADIX_TREE_MAP_SHIFT)
44 #define RADIX_TREE_MAP_MASK	(RADIX_TREE_MAP_SIZE-1)
45 
46 #define RADIX_TREE_TAG_LONGS	\
47 	((RADIX_TREE_MAP_SIZE + BITS_PER_LONG - 1) / BITS_PER_LONG)
48 
49 struct radix_tree_node {
50 	unsigned int	height;		/* Height from the bottom */
51 	unsigned int	count;
52 	struct rcu_head	rcu_head;
53 	void		*slots[RADIX_TREE_MAP_SIZE];
54 	unsigned long	tags[RADIX_TREE_MAX_TAGS][RADIX_TREE_TAG_LONGS];
55 };
56 
57 struct radix_tree_path {
58 	struct radix_tree_node *node;
59 	int offset;
60 };
61 
62 #define RADIX_TREE_INDEX_BITS  (8 /* CHAR_BIT */ * sizeof(unsigned long))
63 #define RADIX_TREE_MAX_PATH (RADIX_TREE_INDEX_BITS/RADIX_TREE_MAP_SHIFT + 2)
64 
65 static unsigned long height_to_maxindex[RADIX_TREE_MAX_PATH] __read_mostly;
66 
67 /*
68  * Radix tree node cache.
69  */
70 static struct kmem_cache *radix_tree_node_cachep;
71 
72 /*
73  * Per-cpu pool of preloaded nodes
74  */
75 struct radix_tree_preload {
76 	int nr;
77 	struct radix_tree_node *nodes[RADIX_TREE_MAX_PATH];
78 };
79 DEFINE_PER_CPU(struct radix_tree_preload, radix_tree_preloads) = { 0, };
80 
81 static inline gfp_t root_gfp_mask(struct radix_tree_root *root)
82 {
83 	return root->gfp_mask & __GFP_BITS_MASK;
84 }
85 
86 /*
87  * This assumes that the caller has performed appropriate preallocation, and
88  * that the caller has pinned this thread of control to the current CPU.
89  */
90 static struct radix_tree_node *
91 radix_tree_node_alloc(struct radix_tree_root *root)
92 {
93 	struct radix_tree_node *ret;
94 	gfp_t gfp_mask = root_gfp_mask(root);
95 
96 	ret = kmem_cache_alloc(radix_tree_node_cachep, gfp_mask);
97 	if (ret == NULL && !(gfp_mask & __GFP_WAIT)) {
98 		struct radix_tree_preload *rtp;
99 
100 		rtp = &__get_cpu_var(radix_tree_preloads);
101 		if (rtp->nr) {
102 			ret = rtp->nodes[rtp->nr - 1];
103 			rtp->nodes[rtp->nr - 1] = NULL;
104 			rtp->nr--;
105 		}
106 	}
107 	BUG_ON(radix_tree_is_direct_ptr(ret));
108 	return ret;
109 }
110 
111 static void radix_tree_node_rcu_free(struct rcu_head *head)
112 {
113 	struct radix_tree_node *node =
114 			container_of(head, struct radix_tree_node, rcu_head);
115 	kmem_cache_free(radix_tree_node_cachep, node);
116 }
117 
118 static inline void
119 radix_tree_node_free(struct radix_tree_node *node)
120 {
121 	call_rcu(&node->rcu_head, radix_tree_node_rcu_free);
122 }
123 
124 /*
125  * Load up this CPU's radix_tree_node buffer with sufficient objects to
126  * ensure that the addition of a single element in the tree cannot fail.  On
127  * success, return zero, with preemption disabled.  On error, return -ENOMEM
128  * with preemption not disabled.
129  */
130 int radix_tree_preload(gfp_t gfp_mask)
131 {
132 	struct radix_tree_preload *rtp;
133 	struct radix_tree_node *node;
134 	int ret = -ENOMEM;
135 
136 	preempt_disable();
137 	rtp = &__get_cpu_var(radix_tree_preloads);
138 	while (rtp->nr < ARRAY_SIZE(rtp->nodes)) {
139 		preempt_enable();
140 		node = kmem_cache_alloc(radix_tree_node_cachep, gfp_mask);
141 		if (node == NULL)
142 			goto out;
143 		preempt_disable();
144 		rtp = &__get_cpu_var(radix_tree_preloads);
145 		if (rtp->nr < ARRAY_SIZE(rtp->nodes))
146 			rtp->nodes[rtp->nr++] = node;
147 		else
148 			kmem_cache_free(radix_tree_node_cachep, node);
149 	}
150 	ret = 0;
151 out:
152 	return ret;
153 }
154 
155 static inline void tag_set(struct radix_tree_node *node, unsigned int tag,
156 		int offset)
157 {
158 	__set_bit(offset, node->tags[tag]);
159 }
160 
161 static inline void tag_clear(struct radix_tree_node *node, unsigned int tag,
162 		int offset)
163 {
164 	__clear_bit(offset, node->tags[tag]);
165 }
166 
167 static inline int tag_get(struct radix_tree_node *node, unsigned int tag,
168 		int offset)
169 {
170 	return test_bit(offset, node->tags[tag]);
171 }
172 
173 static inline void root_tag_set(struct radix_tree_root *root, unsigned int tag)
174 {
175 	root->gfp_mask |= (__force gfp_t)(1 << (tag + __GFP_BITS_SHIFT));
176 }
177 
178 
179 static inline void root_tag_clear(struct radix_tree_root *root, unsigned int tag)
180 {
181 	root->gfp_mask &= (__force gfp_t)~(1 << (tag + __GFP_BITS_SHIFT));
182 }
183 
184 static inline void root_tag_clear_all(struct radix_tree_root *root)
185 {
186 	root->gfp_mask &= __GFP_BITS_MASK;
187 }
188 
189 static inline int root_tag_get(struct radix_tree_root *root, unsigned int tag)
190 {
191 	return (__force unsigned)root->gfp_mask & (1 << (tag + __GFP_BITS_SHIFT));
192 }
193 
194 /*
195  * Returns 1 if any slot in the node has this tag set.
196  * Otherwise returns 0.
197  */
198 static inline int any_tag_set(struct radix_tree_node *node, unsigned int tag)
199 {
200 	int idx;
201 	for (idx = 0; idx < RADIX_TREE_TAG_LONGS; idx++) {
202 		if (node->tags[tag][idx])
203 			return 1;
204 	}
205 	return 0;
206 }
207 
208 /*
209  *	Return the maximum key which can be store into a
210  *	radix tree with height HEIGHT.
211  */
212 static inline unsigned long radix_tree_maxindex(unsigned int height)
213 {
214 	return height_to_maxindex[height];
215 }
216 
217 /*
218  *	Extend a radix tree so it can store key @index.
219  */
220 static int radix_tree_extend(struct radix_tree_root *root, unsigned long index)
221 {
222 	struct radix_tree_node *node;
223 	unsigned int height;
224 	int tag;
225 
226 	/* Figure out what the height should be.  */
227 	height = root->height + 1;
228 	while (index > radix_tree_maxindex(height))
229 		height++;
230 
231 	if (root->rnode == NULL) {
232 		root->height = height;
233 		goto out;
234 	}
235 
236 	do {
237 		unsigned int newheight;
238 		if (!(node = radix_tree_node_alloc(root)))
239 			return -ENOMEM;
240 
241 		/* Increase the height.  */
242 		node->slots[0] = radix_tree_direct_to_ptr(root->rnode);
243 
244 		/* Propagate the aggregated tag info into the new root */
245 		for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) {
246 			if (root_tag_get(root, tag))
247 				tag_set(node, tag, 0);
248 		}
249 
250 		newheight = root->height+1;
251 		node->height = newheight;
252 		node->count = 1;
253 		rcu_assign_pointer(root->rnode, node);
254 		root->height = newheight;
255 	} while (height > root->height);
256 out:
257 	return 0;
258 }
259 
260 /**
261  *	radix_tree_insert    -    insert into a radix tree
262  *	@root:		radix tree root
263  *	@index:		index key
264  *	@item:		item to insert
265  *
266  *	Insert an item into the radix tree at position @index.
267  */
268 int radix_tree_insert(struct radix_tree_root *root,
269 			unsigned long index, void *item)
270 {
271 	struct radix_tree_node *node = NULL, *slot;
272 	unsigned int height, shift;
273 	int offset;
274 	int error;
275 
276 	BUG_ON(radix_tree_is_direct_ptr(item));
277 
278 	/* Make sure the tree is high enough.  */
279 	if (index > radix_tree_maxindex(root->height)) {
280 		error = radix_tree_extend(root, index);
281 		if (error)
282 			return error;
283 	}
284 
285 	slot = root->rnode;
286 	height = root->height;
287 	shift = (height-1) * RADIX_TREE_MAP_SHIFT;
288 
289 	offset = 0;			/* uninitialised var warning */
290 	while (height > 0) {
291 		if (slot == NULL) {
292 			/* Have to add a child node.  */
293 			if (!(slot = radix_tree_node_alloc(root)))
294 				return -ENOMEM;
295 			slot->height = height;
296 			if (node) {
297 				rcu_assign_pointer(node->slots[offset], slot);
298 				node->count++;
299 			} else
300 				rcu_assign_pointer(root->rnode, slot);
301 		}
302 
303 		/* Go a level down */
304 		offset = (index >> shift) & RADIX_TREE_MAP_MASK;
305 		node = slot;
306 		slot = node->slots[offset];
307 		shift -= RADIX_TREE_MAP_SHIFT;
308 		height--;
309 	}
310 
311 	if (slot != NULL)
312 		return -EEXIST;
313 
314 	if (node) {
315 		node->count++;
316 		rcu_assign_pointer(node->slots[offset], item);
317 		BUG_ON(tag_get(node, 0, offset));
318 		BUG_ON(tag_get(node, 1, offset));
319 	} else {
320 		rcu_assign_pointer(root->rnode, radix_tree_ptr_to_direct(item));
321 		BUG_ON(root_tag_get(root, 0));
322 		BUG_ON(root_tag_get(root, 1));
323 	}
324 
325 	return 0;
326 }
327 EXPORT_SYMBOL(radix_tree_insert);
328 
329 /**
330  *	radix_tree_lookup_slot    -    lookup a slot in a radix tree
331  *	@root:		radix tree root
332  *	@index:		index key
333  *
334  *	Returns:  the slot corresponding to the position @index in the
335  *	radix tree @root. This is useful for update-if-exists operations.
336  *
337  *	This function cannot be called under rcu_read_lock, it must be
338  *	excluded from writers, as must the returned slot for subsequent
339  *	use by radix_tree_deref_slot() and radix_tree_replace slot.
340  *	Caller must hold tree write locked across slot lookup and
341  *	replace.
342  */
343 void **radix_tree_lookup_slot(struct radix_tree_root *root, unsigned long index)
344 {
345 	unsigned int height, shift;
346 	struct radix_tree_node *node, **slot;
347 
348 	node = root->rnode;
349 	if (node == NULL)
350 		return NULL;
351 
352 	if (radix_tree_is_direct_ptr(node)) {
353 		if (index > 0)
354 			return NULL;
355 		return (void **)&root->rnode;
356 	}
357 
358 	height = node->height;
359 	if (index > radix_tree_maxindex(height))
360 		return NULL;
361 
362 	shift = (height-1) * RADIX_TREE_MAP_SHIFT;
363 
364 	do {
365 		slot = (struct radix_tree_node **)
366 			(node->slots + ((index>>shift) & RADIX_TREE_MAP_MASK));
367 		node = *slot;
368 		if (node == NULL)
369 			return NULL;
370 
371 		shift -= RADIX_TREE_MAP_SHIFT;
372 		height--;
373 	} while (height > 0);
374 
375 	return (void **)slot;
376 }
377 EXPORT_SYMBOL(radix_tree_lookup_slot);
378 
379 /**
380  *	radix_tree_lookup    -    perform lookup operation on a radix tree
381  *	@root:		radix tree root
382  *	@index:		index key
383  *
384  *	Lookup the item at the position @index in the radix tree @root.
385  *
386  *	This function can be called under rcu_read_lock, however the caller
387  *	must manage lifetimes of leaf nodes (eg. RCU may also be used to free
388  *	them safely). No RCU barriers are required to access or modify the
389  *	returned item, however.
390  */
391 void *radix_tree_lookup(struct radix_tree_root *root, unsigned long index)
392 {
393 	unsigned int height, shift;
394 	struct radix_tree_node *node, **slot;
395 
396 	node = rcu_dereference(root->rnode);
397 	if (node == NULL)
398 		return NULL;
399 
400 	if (radix_tree_is_direct_ptr(node)) {
401 		if (index > 0)
402 			return NULL;
403 		return radix_tree_direct_to_ptr(node);
404 	}
405 
406 	height = node->height;
407 	if (index > radix_tree_maxindex(height))
408 		return NULL;
409 
410 	shift = (height-1) * RADIX_TREE_MAP_SHIFT;
411 
412 	do {
413 		slot = (struct radix_tree_node **)
414 			(node->slots + ((index>>shift) & RADIX_TREE_MAP_MASK));
415 		node = rcu_dereference(*slot);
416 		if (node == NULL)
417 			return NULL;
418 
419 		shift -= RADIX_TREE_MAP_SHIFT;
420 		height--;
421 	} while (height > 0);
422 
423 	return node;
424 }
425 EXPORT_SYMBOL(radix_tree_lookup);
426 
427 /**
428  *	radix_tree_tag_set - set a tag on a radix tree node
429  *	@root:		radix tree root
430  *	@index:		index key
431  *	@tag: 		tag index
432  *
433  *	Set the search tag (which must be < RADIX_TREE_MAX_TAGS)
434  *	corresponding to @index in the radix tree.  From
435  *	the root all the way down to the leaf node.
436  *
437  *	Returns the address of the tagged item.   Setting a tag on a not-present
438  *	item is a bug.
439  */
440 void *radix_tree_tag_set(struct radix_tree_root *root,
441 			unsigned long index, unsigned int tag)
442 {
443 	unsigned int height, shift;
444 	struct radix_tree_node *slot;
445 
446 	height = root->height;
447 	BUG_ON(index > radix_tree_maxindex(height));
448 
449 	slot = root->rnode;
450 	shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
451 
452 	while (height > 0) {
453 		int offset;
454 
455 		offset = (index >> shift) & RADIX_TREE_MAP_MASK;
456 		if (!tag_get(slot, tag, offset))
457 			tag_set(slot, tag, offset);
458 		slot = slot->slots[offset];
459 		BUG_ON(slot == NULL);
460 		shift -= RADIX_TREE_MAP_SHIFT;
461 		height--;
462 	}
463 
464 	/* set the root's tag bit */
465 	if (slot && !root_tag_get(root, tag))
466 		root_tag_set(root, tag);
467 
468 	return slot;
469 }
470 EXPORT_SYMBOL(radix_tree_tag_set);
471 
472 /**
473  *	radix_tree_tag_clear - clear a tag on a radix tree node
474  *	@root:		radix tree root
475  *	@index:		index key
476  *	@tag: 		tag index
477  *
478  *	Clear the search tag (which must be < RADIX_TREE_MAX_TAGS)
479  *	corresponding to @index in the radix tree.  If
480  *	this causes the leaf node to have no tags set then clear the tag in the
481  *	next-to-leaf node, etc.
482  *
483  *	Returns the address of the tagged item on success, else NULL.  ie:
484  *	has the same return value and semantics as radix_tree_lookup().
485  */
486 void *radix_tree_tag_clear(struct radix_tree_root *root,
487 			unsigned long index, unsigned int tag)
488 {
489 	struct radix_tree_path path[RADIX_TREE_MAX_PATH], *pathp = path;
490 	struct radix_tree_node *slot = NULL;
491 	unsigned int height, shift;
492 
493 	height = root->height;
494 	if (index > radix_tree_maxindex(height))
495 		goto out;
496 
497 	shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
498 	pathp->node = NULL;
499 	slot = root->rnode;
500 
501 	while (height > 0) {
502 		int offset;
503 
504 		if (slot == NULL)
505 			goto out;
506 
507 		offset = (index >> shift) & RADIX_TREE_MAP_MASK;
508 		pathp[1].offset = offset;
509 		pathp[1].node = slot;
510 		slot = slot->slots[offset];
511 		pathp++;
512 		shift -= RADIX_TREE_MAP_SHIFT;
513 		height--;
514 	}
515 
516 	if (slot == NULL)
517 		goto out;
518 
519 	while (pathp->node) {
520 		if (!tag_get(pathp->node, tag, pathp->offset))
521 			goto out;
522 		tag_clear(pathp->node, tag, pathp->offset);
523 		if (any_tag_set(pathp->node, tag))
524 			goto out;
525 		pathp--;
526 	}
527 
528 	/* clear the root's tag bit */
529 	if (root_tag_get(root, tag))
530 		root_tag_clear(root, tag);
531 
532 out:
533 	return slot;
534 }
535 EXPORT_SYMBOL(radix_tree_tag_clear);
536 
537 #ifndef __KERNEL__	/* Only the test harness uses this at present */
538 /**
539  * radix_tree_tag_get - get a tag on a radix tree node
540  * @root:		radix tree root
541  * @index:		index key
542  * @tag: 		tag index (< RADIX_TREE_MAX_TAGS)
543  *
544  * Return values:
545  *
546  *  0: tag not present or not set
547  *  1: tag set
548  */
549 int radix_tree_tag_get(struct radix_tree_root *root,
550 			unsigned long index, unsigned int tag)
551 {
552 	unsigned int height, shift;
553 	struct radix_tree_node *node;
554 	int saw_unset_tag = 0;
555 
556 	/* check the root's tag bit */
557 	if (!root_tag_get(root, tag))
558 		return 0;
559 
560 	node = rcu_dereference(root->rnode);
561 	if (node == NULL)
562 		return 0;
563 
564 	if (radix_tree_is_direct_ptr(node))
565 		return (index == 0);
566 
567 	height = node->height;
568 	if (index > radix_tree_maxindex(height))
569 		return 0;
570 
571 	shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
572 
573 	for ( ; ; ) {
574 		int offset;
575 
576 		if (node == NULL)
577 			return 0;
578 
579 		offset = (index >> shift) & RADIX_TREE_MAP_MASK;
580 
581 		/*
582 		 * This is just a debug check.  Later, we can bale as soon as
583 		 * we see an unset tag.
584 		 */
585 		if (!tag_get(node, tag, offset))
586 			saw_unset_tag = 1;
587 		if (height == 1) {
588 			int ret = tag_get(node, tag, offset);
589 
590 			BUG_ON(ret && saw_unset_tag);
591 			return !!ret;
592 		}
593 		node = rcu_dereference(node->slots[offset]);
594 		shift -= RADIX_TREE_MAP_SHIFT;
595 		height--;
596 	}
597 }
598 EXPORT_SYMBOL(radix_tree_tag_get);
599 #endif
600 
601 static unsigned int
602 __lookup(struct radix_tree_node *slot, void **results, unsigned long index,
603 	unsigned int max_items, unsigned long *next_index)
604 {
605 	unsigned int nr_found = 0;
606 	unsigned int shift, height;
607 	unsigned long i;
608 
609 	height = slot->height;
610 	if (height == 0)
611 		goto out;
612 	shift = (height-1) * RADIX_TREE_MAP_SHIFT;
613 
614 	for ( ; height > 1; height--) {
615 		i = (index >> shift) & RADIX_TREE_MAP_MASK;
616 		for (;;) {
617 			if (slot->slots[i] != NULL)
618 				break;
619 			index &= ~((1UL << shift) - 1);
620 			index += 1UL << shift;
621 			if (index == 0)
622 				goto out;	/* 32-bit wraparound */
623 			i++;
624 			if (i == RADIX_TREE_MAP_SIZE)
625 				goto out;
626 		}
627 
628 		shift -= RADIX_TREE_MAP_SHIFT;
629 		slot = rcu_dereference(slot->slots[i]);
630 		if (slot == NULL)
631 			goto out;
632 	}
633 
634 	/* Bottom level: grab some items */
635 	for (i = index & RADIX_TREE_MAP_MASK; i < RADIX_TREE_MAP_SIZE; i++) {
636 		struct radix_tree_node *node;
637 		index++;
638 		node = slot->slots[i];
639 		if (node) {
640 			results[nr_found++] = rcu_dereference(node);
641 			if (nr_found == max_items)
642 				goto out;
643 		}
644 	}
645 out:
646 	*next_index = index;
647 	return nr_found;
648 }
649 
650 /**
651  *	radix_tree_gang_lookup - perform multiple lookup on a radix tree
652  *	@root:		radix tree root
653  *	@results:	where the results of the lookup are placed
654  *	@first_index:	start the lookup from this key
655  *	@max_items:	place up to this many items at *results
656  *
657  *	Performs an index-ascending scan of the tree for present items.  Places
658  *	them at *@results and returns the number of items which were placed at
659  *	*@results.
660  *
661  *	The implementation is naive.
662  *
663  *	Like radix_tree_lookup, radix_tree_gang_lookup may be called under
664  *	rcu_read_lock. In this case, rather than the returned results being
665  *	an atomic snapshot of the tree at a single point in time, the semantics
666  *	of an RCU protected gang lookup are as though multiple radix_tree_lookups
667  *	have been issued in individual locks, and results stored in 'results'.
668  */
669 unsigned int
670 radix_tree_gang_lookup(struct radix_tree_root *root, void **results,
671 			unsigned long first_index, unsigned int max_items)
672 {
673 	unsigned long max_index;
674 	struct radix_tree_node *node;
675 	unsigned long cur_index = first_index;
676 	unsigned int ret;
677 
678 	node = rcu_dereference(root->rnode);
679 	if (!node)
680 		return 0;
681 
682 	if (radix_tree_is_direct_ptr(node)) {
683 		if (first_index > 0)
684 			return 0;
685 		node = radix_tree_direct_to_ptr(node);
686 		results[0] = rcu_dereference(node);
687 		return 1;
688 	}
689 
690 	max_index = radix_tree_maxindex(node->height);
691 
692 	ret = 0;
693 	while (ret < max_items) {
694 		unsigned int nr_found;
695 		unsigned long next_index;	/* Index of next search */
696 
697 		if (cur_index > max_index)
698 			break;
699 		nr_found = __lookup(node, results + ret, cur_index,
700 					max_items - ret, &next_index);
701 		ret += nr_found;
702 		if (next_index == 0)
703 			break;
704 		cur_index = next_index;
705 	}
706 
707 	return ret;
708 }
709 EXPORT_SYMBOL(radix_tree_gang_lookup);
710 
711 /*
712  * FIXME: the two tag_get()s here should use find_next_bit() instead of
713  * open-coding the search.
714  */
715 static unsigned int
716 __lookup_tag(struct radix_tree_node *slot, void **results, unsigned long index,
717 	unsigned int max_items, unsigned long *next_index, unsigned int tag)
718 {
719 	unsigned int nr_found = 0;
720 	unsigned int shift, height;
721 
722 	height = slot->height;
723 	if (height == 0)
724 		goto out;
725 	shift = (height-1) * RADIX_TREE_MAP_SHIFT;
726 
727 	while (height > 0) {
728 		unsigned long i = (index >> shift) & RADIX_TREE_MAP_MASK ;
729 
730 		for (;;) {
731 			if (tag_get(slot, tag, i))
732 				break;
733 			index &= ~((1UL << shift) - 1);
734 			index += 1UL << shift;
735 			if (index == 0)
736 				goto out;	/* 32-bit wraparound */
737 			i++;
738 			if (i == RADIX_TREE_MAP_SIZE)
739 				goto out;
740 		}
741 		height--;
742 		if (height == 0) {	/* Bottom level: grab some items */
743 			unsigned long j = index & RADIX_TREE_MAP_MASK;
744 
745 			for ( ; j < RADIX_TREE_MAP_SIZE; j++) {
746 				struct radix_tree_node *node;
747 				index++;
748 				if (!tag_get(slot, tag, j))
749 					continue;
750 				node = slot->slots[j];
751 				/*
752 				 * Even though the tag was found set, we need to
753 				 * recheck that we have a non-NULL node, because
754 				 * if this lookup is lockless, it may have been
755 				 * subsequently deleted.
756 				 *
757 				 * Similar care must be taken in any place that
758 				 * lookup ->slots[x] without a lock (ie. can't
759 				 * rely on its value remaining the same).
760 				 */
761 				if (node) {
762 					node = rcu_dereference(node);
763 					results[nr_found++] = node;
764 					if (nr_found == max_items)
765 						goto out;
766 				}
767 			}
768 		}
769 		shift -= RADIX_TREE_MAP_SHIFT;
770 		slot = rcu_dereference(slot->slots[i]);
771 		if (slot == NULL)
772 			break;
773 	}
774 out:
775 	*next_index = index;
776 	return nr_found;
777 }
778 
779 /**
780  *	radix_tree_gang_lookup_tag - perform multiple lookup on a radix tree
781  *	                             based on a tag
782  *	@root:		radix tree root
783  *	@results:	where the results of the lookup are placed
784  *	@first_index:	start the lookup from this key
785  *	@max_items:	place up to this many items at *results
786  *	@tag:		the tag index (< RADIX_TREE_MAX_TAGS)
787  *
788  *	Performs an index-ascending scan of the tree for present items which
789  *	have the tag indexed by @tag set.  Places the items at *@results and
790  *	returns the number of items which were placed at *@results.
791  */
792 unsigned int
793 radix_tree_gang_lookup_tag(struct radix_tree_root *root, void **results,
794 		unsigned long first_index, unsigned int max_items,
795 		unsigned int tag)
796 {
797 	struct radix_tree_node *node;
798 	unsigned long max_index;
799 	unsigned long cur_index = first_index;
800 	unsigned int ret;
801 
802 	/* check the root's tag bit */
803 	if (!root_tag_get(root, tag))
804 		return 0;
805 
806 	node = rcu_dereference(root->rnode);
807 	if (!node)
808 		return 0;
809 
810 	if (radix_tree_is_direct_ptr(node)) {
811 		if (first_index > 0)
812 			return 0;
813 		node = radix_tree_direct_to_ptr(node);
814 		results[0] = rcu_dereference(node);
815 		return 1;
816 	}
817 
818 	max_index = radix_tree_maxindex(node->height);
819 
820 	ret = 0;
821 	while (ret < max_items) {
822 		unsigned int nr_found;
823 		unsigned long next_index;	/* Index of next search */
824 
825 		if (cur_index > max_index)
826 			break;
827 		nr_found = __lookup_tag(node, results + ret, cur_index,
828 					max_items - ret, &next_index, tag);
829 		ret += nr_found;
830 		if (next_index == 0)
831 			break;
832 		cur_index = next_index;
833 	}
834 
835 	return ret;
836 }
837 EXPORT_SYMBOL(radix_tree_gang_lookup_tag);
838 
839 /**
840  *	radix_tree_shrink    -    shrink height of a radix tree to minimal
841  *	@root		radix tree root
842  */
843 static inline void radix_tree_shrink(struct radix_tree_root *root)
844 {
845 	/* try to shrink tree height */
846 	while (root->height > 0 &&
847 			root->rnode->count == 1 &&
848 			root->rnode->slots[0]) {
849 		struct radix_tree_node *to_free = root->rnode;
850 		void *newptr;
851 
852 		/*
853 		 * We don't need rcu_assign_pointer(), since we are simply
854 		 * moving the node from one part of the tree to another. If
855 		 * it was safe to dereference the old pointer to it
856 		 * (to_free->slots[0]), it will be safe to dereference the new
857 		 * one (root->rnode).
858 		 */
859 		newptr = to_free->slots[0];
860 		if (root->height == 1)
861 			newptr = radix_tree_ptr_to_direct(newptr);
862 		root->rnode = newptr;
863 		root->height--;
864 		/* must only free zeroed nodes into the slab */
865 		tag_clear(to_free, 0, 0);
866 		tag_clear(to_free, 1, 0);
867 		to_free->slots[0] = NULL;
868 		to_free->count = 0;
869 		radix_tree_node_free(to_free);
870 	}
871 }
872 
873 /**
874  *	radix_tree_delete    -    delete an item from a radix tree
875  *	@root:		radix tree root
876  *	@index:		index key
877  *
878  *	Remove the item at @index from the radix tree rooted at @root.
879  *
880  *	Returns the address of the deleted item, or NULL if it was not present.
881  */
882 void *radix_tree_delete(struct radix_tree_root *root, unsigned long index)
883 {
884 	struct radix_tree_path path[RADIX_TREE_MAX_PATH], *pathp = path;
885 	struct radix_tree_node *slot = NULL;
886 	struct radix_tree_node *to_free;
887 	unsigned int height, shift;
888 	int tag;
889 	int offset;
890 
891 	height = root->height;
892 	if (index > radix_tree_maxindex(height))
893 		goto out;
894 
895 	slot = root->rnode;
896 	if (height == 0 && root->rnode) {
897 		slot = radix_tree_direct_to_ptr(slot);
898 		root_tag_clear_all(root);
899 		root->rnode = NULL;
900 		goto out;
901 	}
902 
903 	shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
904 	pathp->node = NULL;
905 
906 	do {
907 		if (slot == NULL)
908 			goto out;
909 
910 		pathp++;
911 		offset = (index >> shift) & RADIX_TREE_MAP_MASK;
912 		pathp->offset = offset;
913 		pathp->node = slot;
914 		slot = slot->slots[offset];
915 		shift -= RADIX_TREE_MAP_SHIFT;
916 		height--;
917 	} while (height > 0);
918 
919 	if (slot == NULL)
920 		goto out;
921 
922 	/*
923 	 * Clear all tags associated with the just-deleted item
924 	 */
925 	for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) {
926 		if (tag_get(pathp->node, tag, pathp->offset))
927 			radix_tree_tag_clear(root, index, tag);
928 	}
929 
930 	to_free = NULL;
931 	/* Now free the nodes we do not need anymore */
932 	while (pathp->node) {
933 		pathp->node->slots[pathp->offset] = NULL;
934 		pathp->node->count--;
935 		/*
936 		 * Queue the node for deferred freeing after the
937 		 * last reference to it disappears (set NULL, above).
938 		 */
939 		if (to_free)
940 			radix_tree_node_free(to_free);
941 
942 		if (pathp->node->count) {
943 			if (pathp->node == root->rnode)
944 				radix_tree_shrink(root);
945 			goto out;
946 		}
947 
948 		/* Node with zero slots in use so free it */
949 		to_free = pathp->node;
950 		pathp--;
951 
952 	}
953 	root_tag_clear_all(root);
954 	root->height = 0;
955 	root->rnode = NULL;
956 	if (to_free)
957 		radix_tree_node_free(to_free);
958 
959 out:
960 	return slot;
961 }
962 EXPORT_SYMBOL(radix_tree_delete);
963 
964 /**
965  *	radix_tree_tagged - test whether any items in the tree are tagged
966  *	@root:		radix tree root
967  *	@tag:		tag to test
968  */
969 int radix_tree_tagged(struct radix_tree_root *root, unsigned int tag)
970 {
971 	return root_tag_get(root, tag);
972 }
973 EXPORT_SYMBOL(radix_tree_tagged);
974 
975 static void
976 radix_tree_node_ctor(void *node, struct kmem_cache *cachep, unsigned long flags)
977 {
978 	memset(node, 0, sizeof(struct radix_tree_node));
979 }
980 
981 static __init unsigned long __maxindex(unsigned int height)
982 {
983 	unsigned int tmp = height * RADIX_TREE_MAP_SHIFT;
984 	unsigned long index = (~0UL >> (RADIX_TREE_INDEX_BITS - tmp - 1)) >> 1;
985 
986 	if (tmp >= RADIX_TREE_INDEX_BITS)
987 		index = ~0UL;
988 	return index;
989 }
990 
991 static __init void radix_tree_init_maxindex(void)
992 {
993 	unsigned int i;
994 
995 	for (i = 0; i < ARRAY_SIZE(height_to_maxindex); i++)
996 		height_to_maxindex[i] = __maxindex(i);
997 }
998 
999 static int radix_tree_callback(struct notifier_block *nfb,
1000                             unsigned long action,
1001                             void *hcpu)
1002 {
1003        int cpu = (long)hcpu;
1004        struct radix_tree_preload *rtp;
1005 
1006        /* Free per-cpu pool of perloaded nodes */
1007        if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
1008                rtp = &per_cpu(radix_tree_preloads, cpu);
1009                while (rtp->nr) {
1010                        kmem_cache_free(radix_tree_node_cachep,
1011                                        rtp->nodes[rtp->nr-1]);
1012                        rtp->nodes[rtp->nr-1] = NULL;
1013                        rtp->nr--;
1014                }
1015        }
1016        return NOTIFY_OK;
1017 }
1018 
1019 void __init radix_tree_init(void)
1020 {
1021 	radix_tree_node_cachep = kmem_cache_create("radix_tree_node",
1022 			sizeof(struct radix_tree_node), 0,
1023 			SLAB_PANIC, radix_tree_node_ctor, NULL);
1024 	radix_tree_init_maxindex();
1025 	hotcpu_notifier(radix_tree_callback, 0);
1026 }
1027