xref: /openbmc/linux/lib/radix-tree.c (revision f42b3800)
1 /*
2  * Copyright (C) 2001 Momchil Velikov
3  * Portions Copyright (C) 2001 Christoph Hellwig
4  * Copyright (C) 2005 SGI, Christoph Lameter <clameter@sgi.com>
5  * Copyright (C) 2006 Nick Piggin
6  *
7  * This program is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU General Public License as
9  * published by the Free Software Foundation; either version 2, or (at
10  * your option) any later version.
11  *
12  * This program is distributed in the hope that it will be useful, but
13  * WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15  * General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with this program; if not, write to the Free Software
19  * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20  */
21 
22 #include <linux/errno.h>
23 #include <linux/init.h>
24 #include <linux/kernel.h>
25 #include <linux/module.h>
26 #include <linux/radix-tree.h>
27 #include <linux/percpu.h>
28 #include <linux/slab.h>
29 #include <linux/notifier.h>
30 #include <linux/cpu.h>
31 #include <linux/gfp.h>
32 #include <linux/string.h>
33 #include <linux/bitops.h>
34 #include <linux/rcupdate.h>
35 
36 
37 #ifdef __KERNEL__
38 #define RADIX_TREE_MAP_SHIFT	(CONFIG_BASE_SMALL ? 4 : 6)
39 #else
40 #define RADIX_TREE_MAP_SHIFT	3	/* For more stressful testing */
41 #endif
42 
43 #define RADIX_TREE_MAP_SIZE	(1UL << RADIX_TREE_MAP_SHIFT)
44 #define RADIX_TREE_MAP_MASK	(RADIX_TREE_MAP_SIZE-1)
45 
46 #define RADIX_TREE_TAG_LONGS	\
47 	((RADIX_TREE_MAP_SIZE + BITS_PER_LONG - 1) / BITS_PER_LONG)
48 
49 struct radix_tree_node {
50 	unsigned int	height;		/* Height from the bottom */
51 	unsigned int	count;
52 	struct rcu_head	rcu_head;
53 	void		*slots[RADIX_TREE_MAP_SIZE];
54 	unsigned long	tags[RADIX_TREE_MAX_TAGS][RADIX_TREE_TAG_LONGS];
55 };
56 
57 struct radix_tree_path {
58 	struct radix_tree_node *node;
59 	int offset;
60 };
61 
62 #define RADIX_TREE_INDEX_BITS  (8 /* CHAR_BIT */ * sizeof(unsigned long))
63 #define RADIX_TREE_MAX_PATH (DIV_ROUND_UP(RADIX_TREE_INDEX_BITS, \
64 					  RADIX_TREE_MAP_SHIFT))
65 
66 /*
67  * The height_to_maxindex array needs to be one deeper than the maximum
68  * path as height 0 holds only 1 entry.
69  */
70 static unsigned long height_to_maxindex[RADIX_TREE_MAX_PATH + 1] __read_mostly;
71 
72 /*
73  * Radix tree node cache.
74  */
75 static struct kmem_cache *radix_tree_node_cachep;
76 
77 /*
78  * Per-cpu pool of preloaded nodes
79  */
80 struct radix_tree_preload {
81 	int nr;
82 	struct radix_tree_node *nodes[RADIX_TREE_MAX_PATH];
83 };
84 DEFINE_PER_CPU(struct radix_tree_preload, radix_tree_preloads) = { 0, };
85 
86 static inline gfp_t root_gfp_mask(struct radix_tree_root *root)
87 {
88 	return root->gfp_mask & __GFP_BITS_MASK;
89 }
90 
91 /*
92  * This assumes that the caller has performed appropriate preallocation, and
93  * that the caller has pinned this thread of control to the current CPU.
94  */
95 static struct radix_tree_node *
96 radix_tree_node_alloc(struct radix_tree_root *root)
97 {
98 	struct radix_tree_node *ret = NULL;
99 	gfp_t gfp_mask = root_gfp_mask(root);
100 
101 	if (!(gfp_mask & __GFP_WAIT)) {
102 		struct radix_tree_preload *rtp;
103 
104 		/*
105 		 * Provided the caller has preloaded here, we will always
106 		 * succeed in getting a node here (and never reach
107 		 * kmem_cache_alloc)
108 		 */
109 		rtp = &__get_cpu_var(radix_tree_preloads);
110 		if (rtp->nr) {
111 			ret = rtp->nodes[rtp->nr - 1];
112 			rtp->nodes[rtp->nr - 1] = NULL;
113 			rtp->nr--;
114 		}
115 	}
116 	if (ret == NULL)
117 		ret = kmem_cache_alloc(radix_tree_node_cachep,
118 				set_migrateflags(gfp_mask, __GFP_RECLAIMABLE));
119 
120 	BUG_ON(radix_tree_is_indirect_ptr(ret));
121 	return ret;
122 }
123 
124 static void radix_tree_node_rcu_free(struct rcu_head *head)
125 {
126 	struct radix_tree_node *node =
127 			container_of(head, struct radix_tree_node, rcu_head);
128 	kmem_cache_free(radix_tree_node_cachep, node);
129 }
130 
131 static inline void
132 radix_tree_node_free(struct radix_tree_node *node)
133 {
134 	call_rcu(&node->rcu_head, radix_tree_node_rcu_free);
135 }
136 
137 /*
138  * Load up this CPU's radix_tree_node buffer with sufficient objects to
139  * ensure that the addition of a single element in the tree cannot fail.  On
140  * success, return zero, with preemption disabled.  On error, return -ENOMEM
141  * with preemption not disabled.
142  */
143 int radix_tree_preload(gfp_t gfp_mask)
144 {
145 	struct radix_tree_preload *rtp;
146 	struct radix_tree_node *node;
147 	int ret = -ENOMEM;
148 
149 	preempt_disable();
150 	rtp = &__get_cpu_var(radix_tree_preloads);
151 	while (rtp->nr < ARRAY_SIZE(rtp->nodes)) {
152 		preempt_enable();
153 		node = kmem_cache_alloc(radix_tree_node_cachep,
154 				set_migrateflags(gfp_mask, __GFP_RECLAIMABLE));
155 		if (node == NULL)
156 			goto out;
157 		preempt_disable();
158 		rtp = &__get_cpu_var(radix_tree_preloads);
159 		if (rtp->nr < ARRAY_SIZE(rtp->nodes))
160 			rtp->nodes[rtp->nr++] = node;
161 		else
162 			kmem_cache_free(radix_tree_node_cachep, node);
163 	}
164 	ret = 0;
165 out:
166 	return ret;
167 }
168 EXPORT_SYMBOL(radix_tree_preload);
169 
170 static inline void tag_set(struct radix_tree_node *node, unsigned int tag,
171 		int offset)
172 {
173 	__set_bit(offset, node->tags[tag]);
174 }
175 
176 static inline void tag_clear(struct radix_tree_node *node, unsigned int tag,
177 		int offset)
178 {
179 	__clear_bit(offset, node->tags[tag]);
180 }
181 
182 static inline int tag_get(struct radix_tree_node *node, unsigned int tag,
183 		int offset)
184 {
185 	return test_bit(offset, node->tags[tag]);
186 }
187 
188 static inline void root_tag_set(struct radix_tree_root *root, unsigned int tag)
189 {
190 	root->gfp_mask |= (__force gfp_t)(1 << (tag + __GFP_BITS_SHIFT));
191 }
192 
193 
194 static inline void root_tag_clear(struct radix_tree_root *root, unsigned int tag)
195 {
196 	root->gfp_mask &= (__force gfp_t)~(1 << (tag + __GFP_BITS_SHIFT));
197 }
198 
199 static inline void root_tag_clear_all(struct radix_tree_root *root)
200 {
201 	root->gfp_mask &= __GFP_BITS_MASK;
202 }
203 
204 static inline int root_tag_get(struct radix_tree_root *root, unsigned int tag)
205 {
206 	return (__force unsigned)root->gfp_mask & (1 << (tag + __GFP_BITS_SHIFT));
207 }
208 
209 /*
210  * Returns 1 if any slot in the node has this tag set.
211  * Otherwise returns 0.
212  */
213 static inline int any_tag_set(struct radix_tree_node *node, unsigned int tag)
214 {
215 	int idx;
216 	for (idx = 0; idx < RADIX_TREE_TAG_LONGS; idx++) {
217 		if (node->tags[tag][idx])
218 			return 1;
219 	}
220 	return 0;
221 }
222 
223 /*
224  *	Return the maximum key which can be store into a
225  *	radix tree with height HEIGHT.
226  */
227 static inline unsigned long radix_tree_maxindex(unsigned int height)
228 {
229 	return height_to_maxindex[height];
230 }
231 
232 /*
233  *	Extend a radix tree so it can store key @index.
234  */
235 static int radix_tree_extend(struct radix_tree_root *root, unsigned long index)
236 {
237 	struct radix_tree_node *node;
238 	unsigned int height;
239 	int tag;
240 
241 	/* Figure out what the height should be.  */
242 	height = root->height + 1;
243 	while (index > radix_tree_maxindex(height))
244 		height++;
245 
246 	if (root->rnode == NULL) {
247 		root->height = height;
248 		goto out;
249 	}
250 
251 	do {
252 		unsigned int newheight;
253 		if (!(node = radix_tree_node_alloc(root)))
254 			return -ENOMEM;
255 
256 		/* Increase the height.  */
257 		node->slots[0] = radix_tree_indirect_to_ptr(root->rnode);
258 
259 		/* Propagate the aggregated tag info into the new root */
260 		for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) {
261 			if (root_tag_get(root, tag))
262 				tag_set(node, tag, 0);
263 		}
264 
265 		newheight = root->height+1;
266 		node->height = newheight;
267 		node->count = 1;
268 		node = radix_tree_ptr_to_indirect(node);
269 		rcu_assign_pointer(root->rnode, node);
270 		root->height = newheight;
271 	} while (height > root->height);
272 out:
273 	return 0;
274 }
275 
276 /**
277  *	radix_tree_insert    -    insert into a radix tree
278  *	@root:		radix tree root
279  *	@index:		index key
280  *	@item:		item to insert
281  *
282  *	Insert an item into the radix tree at position @index.
283  */
284 int radix_tree_insert(struct radix_tree_root *root,
285 			unsigned long index, void *item)
286 {
287 	struct radix_tree_node *node = NULL, *slot;
288 	unsigned int height, shift;
289 	int offset;
290 	int error;
291 
292 	BUG_ON(radix_tree_is_indirect_ptr(item));
293 
294 	/* Make sure the tree is high enough.  */
295 	if (index > radix_tree_maxindex(root->height)) {
296 		error = radix_tree_extend(root, index);
297 		if (error)
298 			return error;
299 	}
300 
301 	slot = radix_tree_indirect_to_ptr(root->rnode);
302 
303 	height = root->height;
304 	shift = (height-1) * RADIX_TREE_MAP_SHIFT;
305 
306 	offset = 0;			/* uninitialised var warning */
307 	while (height > 0) {
308 		if (slot == NULL) {
309 			/* Have to add a child node.  */
310 			if (!(slot = radix_tree_node_alloc(root)))
311 				return -ENOMEM;
312 			slot->height = height;
313 			if (node) {
314 				rcu_assign_pointer(node->slots[offset], slot);
315 				node->count++;
316 			} else
317 				rcu_assign_pointer(root->rnode,
318 					radix_tree_ptr_to_indirect(slot));
319 		}
320 
321 		/* Go a level down */
322 		offset = (index >> shift) & RADIX_TREE_MAP_MASK;
323 		node = slot;
324 		slot = node->slots[offset];
325 		shift -= RADIX_TREE_MAP_SHIFT;
326 		height--;
327 	}
328 
329 	if (slot != NULL)
330 		return -EEXIST;
331 
332 	if (node) {
333 		node->count++;
334 		rcu_assign_pointer(node->slots[offset], item);
335 		BUG_ON(tag_get(node, 0, offset));
336 		BUG_ON(tag_get(node, 1, offset));
337 	} else {
338 		rcu_assign_pointer(root->rnode, item);
339 		BUG_ON(root_tag_get(root, 0));
340 		BUG_ON(root_tag_get(root, 1));
341 	}
342 
343 	return 0;
344 }
345 EXPORT_SYMBOL(radix_tree_insert);
346 
347 /**
348  *	radix_tree_lookup_slot    -    lookup a slot in a radix tree
349  *	@root:		radix tree root
350  *	@index:		index key
351  *
352  *	Returns:  the slot corresponding to the position @index in the
353  *	radix tree @root. This is useful for update-if-exists operations.
354  *
355  *	This function cannot be called under rcu_read_lock, it must be
356  *	excluded from writers, as must the returned slot for subsequent
357  *	use by radix_tree_deref_slot() and radix_tree_replace slot.
358  *	Caller must hold tree write locked across slot lookup and
359  *	replace.
360  */
361 void **radix_tree_lookup_slot(struct radix_tree_root *root, unsigned long index)
362 {
363 	unsigned int height, shift;
364 	struct radix_tree_node *node, **slot;
365 
366 	node = root->rnode;
367 	if (node == NULL)
368 		return NULL;
369 
370 	if (!radix_tree_is_indirect_ptr(node)) {
371 		if (index > 0)
372 			return NULL;
373 		return (void **)&root->rnode;
374 	}
375 	node = radix_tree_indirect_to_ptr(node);
376 
377 	height = node->height;
378 	if (index > radix_tree_maxindex(height))
379 		return NULL;
380 
381 	shift = (height-1) * RADIX_TREE_MAP_SHIFT;
382 
383 	do {
384 		slot = (struct radix_tree_node **)
385 			(node->slots + ((index>>shift) & RADIX_TREE_MAP_MASK));
386 		node = *slot;
387 		if (node == NULL)
388 			return NULL;
389 
390 		shift -= RADIX_TREE_MAP_SHIFT;
391 		height--;
392 	} while (height > 0);
393 
394 	return (void **)slot;
395 }
396 EXPORT_SYMBOL(radix_tree_lookup_slot);
397 
398 /**
399  *	radix_tree_lookup    -    perform lookup operation on a radix tree
400  *	@root:		radix tree root
401  *	@index:		index key
402  *
403  *	Lookup the item at the position @index in the radix tree @root.
404  *
405  *	This function can be called under rcu_read_lock, however the caller
406  *	must manage lifetimes of leaf nodes (eg. RCU may also be used to free
407  *	them safely). No RCU barriers are required to access or modify the
408  *	returned item, however.
409  */
410 void *radix_tree_lookup(struct radix_tree_root *root, unsigned long index)
411 {
412 	unsigned int height, shift;
413 	struct radix_tree_node *node, **slot;
414 
415 	node = rcu_dereference(root->rnode);
416 	if (node == NULL)
417 		return NULL;
418 
419 	if (!radix_tree_is_indirect_ptr(node)) {
420 		if (index > 0)
421 			return NULL;
422 		return node;
423 	}
424 	node = radix_tree_indirect_to_ptr(node);
425 
426 	height = node->height;
427 	if (index > radix_tree_maxindex(height))
428 		return NULL;
429 
430 	shift = (height-1) * RADIX_TREE_MAP_SHIFT;
431 
432 	do {
433 		slot = (struct radix_tree_node **)
434 			(node->slots + ((index>>shift) & RADIX_TREE_MAP_MASK));
435 		node = rcu_dereference(*slot);
436 		if (node == NULL)
437 			return NULL;
438 
439 		shift -= RADIX_TREE_MAP_SHIFT;
440 		height--;
441 	} while (height > 0);
442 
443 	return node;
444 }
445 EXPORT_SYMBOL(radix_tree_lookup);
446 
447 /**
448  *	radix_tree_tag_set - set a tag on a radix tree node
449  *	@root:		radix tree root
450  *	@index:		index key
451  *	@tag: 		tag index
452  *
453  *	Set the search tag (which must be < RADIX_TREE_MAX_TAGS)
454  *	corresponding to @index in the radix tree.  From
455  *	the root all the way down to the leaf node.
456  *
457  *	Returns the address of the tagged item.   Setting a tag on a not-present
458  *	item is a bug.
459  */
460 void *radix_tree_tag_set(struct radix_tree_root *root,
461 			unsigned long index, unsigned int tag)
462 {
463 	unsigned int height, shift;
464 	struct radix_tree_node *slot;
465 
466 	height = root->height;
467 	BUG_ON(index > radix_tree_maxindex(height));
468 
469 	slot = radix_tree_indirect_to_ptr(root->rnode);
470 	shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
471 
472 	while (height > 0) {
473 		int offset;
474 
475 		offset = (index >> shift) & RADIX_TREE_MAP_MASK;
476 		if (!tag_get(slot, tag, offset))
477 			tag_set(slot, tag, offset);
478 		slot = slot->slots[offset];
479 		BUG_ON(slot == NULL);
480 		shift -= RADIX_TREE_MAP_SHIFT;
481 		height--;
482 	}
483 
484 	/* set the root's tag bit */
485 	if (slot && !root_tag_get(root, tag))
486 		root_tag_set(root, tag);
487 
488 	return slot;
489 }
490 EXPORT_SYMBOL(radix_tree_tag_set);
491 
492 /**
493  *	radix_tree_tag_clear - clear a tag on a radix tree node
494  *	@root:		radix tree root
495  *	@index:		index key
496  *	@tag: 		tag index
497  *
498  *	Clear the search tag (which must be < RADIX_TREE_MAX_TAGS)
499  *	corresponding to @index in the radix tree.  If
500  *	this causes the leaf node to have no tags set then clear the tag in the
501  *	next-to-leaf node, etc.
502  *
503  *	Returns the address of the tagged item on success, else NULL.  ie:
504  *	has the same return value and semantics as radix_tree_lookup().
505  */
506 void *radix_tree_tag_clear(struct radix_tree_root *root,
507 			unsigned long index, unsigned int tag)
508 {
509 	/*
510 	 * The radix tree path needs to be one longer than the maximum path
511 	 * since the "list" is null terminated.
512 	 */
513 	struct radix_tree_path path[RADIX_TREE_MAX_PATH + 1], *pathp = path;
514 	struct radix_tree_node *slot = NULL;
515 	unsigned int height, shift;
516 
517 	height = root->height;
518 	if (index > radix_tree_maxindex(height))
519 		goto out;
520 
521 	shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
522 	pathp->node = NULL;
523 	slot = radix_tree_indirect_to_ptr(root->rnode);
524 
525 	while (height > 0) {
526 		int offset;
527 
528 		if (slot == NULL)
529 			goto out;
530 
531 		offset = (index >> shift) & RADIX_TREE_MAP_MASK;
532 		pathp[1].offset = offset;
533 		pathp[1].node = slot;
534 		slot = slot->slots[offset];
535 		pathp++;
536 		shift -= RADIX_TREE_MAP_SHIFT;
537 		height--;
538 	}
539 
540 	if (slot == NULL)
541 		goto out;
542 
543 	while (pathp->node) {
544 		if (!tag_get(pathp->node, tag, pathp->offset))
545 			goto out;
546 		tag_clear(pathp->node, tag, pathp->offset);
547 		if (any_tag_set(pathp->node, tag))
548 			goto out;
549 		pathp--;
550 	}
551 
552 	/* clear the root's tag bit */
553 	if (root_tag_get(root, tag))
554 		root_tag_clear(root, tag);
555 
556 out:
557 	return slot;
558 }
559 EXPORT_SYMBOL(radix_tree_tag_clear);
560 
561 #ifndef __KERNEL__	/* Only the test harness uses this at present */
562 /**
563  * radix_tree_tag_get - get a tag on a radix tree node
564  * @root:		radix tree root
565  * @index:		index key
566  * @tag: 		tag index (< RADIX_TREE_MAX_TAGS)
567  *
568  * Return values:
569  *
570  *  0: tag not present or not set
571  *  1: tag set
572  */
573 int radix_tree_tag_get(struct radix_tree_root *root,
574 			unsigned long index, unsigned int tag)
575 {
576 	unsigned int height, shift;
577 	struct radix_tree_node *node;
578 	int saw_unset_tag = 0;
579 
580 	/* check the root's tag bit */
581 	if (!root_tag_get(root, tag))
582 		return 0;
583 
584 	node = rcu_dereference(root->rnode);
585 	if (node == NULL)
586 		return 0;
587 
588 	if (!radix_tree_is_indirect_ptr(node))
589 		return (index == 0);
590 	node = radix_tree_indirect_to_ptr(node);
591 
592 	height = node->height;
593 	if (index > radix_tree_maxindex(height))
594 		return 0;
595 
596 	shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
597 
598 	for ( ; ; ) {
599 		int offset;
600 
601 		if (node == NULL)
602 			return 0;
603 
604 		offset = (index >> shift) & RADIX_TREE_MAP_MASK;
605 
606 		/*
607 		 * This is just a debug check.  Later, we can bale as soon as
608 		 * we see an unset tag.
609 		 */
610 		if (!tag_get(node, tag, offset))
611 			saw_unset_tag = 1;
612 		if (height == 1) {
613 			int ret = tag_get(node, tag, offset);
614 
615 			BUG_ON(ret && saw_unset_tag);
616 			return !!ret;
617 		}
618 		node = rcu_dereference(node->slots[offset]);
619 		shift -= RADIX_TREE_MAP_SHIFT;
620 		height--;
621 	}
622 }
623 EXPORT_SYMBOL(radix_tree_tag_get);
624 #endif
625 
626 /**
627  *	radix_tree_next_hole    -    find the next hole (not-present entry)
628  *	@root:		tree root
629  *	@index:		index key
630  *	@max_scan:	maximum range to search
631  *
632  *	Search the set [index, min(index+max_scan-1, MAX_INDEX)] for the lowest
633  *	indexed hole.
634  *
635  *	Returns: the index of the hole if found, otherwise returns an index
636  *	outside of the set specified (in which case 'return - index >= max_scan'
637  *	will be true).
638  *
639  *	radix_tree_next_hole may be called under rcu_read_lock. However, like
640  *	radix_tree_gang_lookup, this will not atomically search a snapshot of the
641  *	tree at a single point in time. For example, if a hole is created at index
642  *	5, then subsequently a hole is created at index 10, radix_tree_next_hole
643  *	covering both indexes may return 10 if called under rcu_read_lock.
644  */
645 unsigned long radix_tree_next_hole(struct radix_tree_root *root,
646 				unsigned long index, unsigned long max_scan)
647 {
648 	unsigned long i;
649 
650 	for (i = 0; i < max_scan; i++) {
651 		if (!radix_tree_lookup(root, index))
652 			break;
653 		index++;
654 		if (index == 0)
655 			break;
656 	}
657 
658 	return index;
659 }
660 EXPORT_SYMBOL(radix_tree_next_hole);
661 
662 static unsigned int
663 __lookup(struct radix_tree_node *slot, void **results, unsigned long index,
664 	unsigned int max_items, unsigned long *next_index)
665 {
666 	unsigned int nr_found = 0;
667 	unsigned int shift, height;
668 	unsigned long i;
669 
670 	height = slot->height;
671 	if (height == 0)
672 		goto out;
673 	shift = (height-1) * RADIX_TREE_MAP_SHIFT;
674 
675 	for ( ; height > 1; height--) {
676 		i = (index >> shift) & RADIX_TREE_MAP_MASK;
677 		for (;;) {
678 			if (slot->slots[i] != NULL)
679 				break;
680 			index &= ~((1UL << shift) - 1);
681 			index += 1UL << shift;
682 			if (index == 0)
683 				goto out;	/* 32-bit wraparound */
684 			i++;
685 			if (i == RADIX_TREE_MAP_SIZE)
686 				goto out;
687 		}
688 
689 		shift -= RADIX_TREE_MAP_SHIFT;
690 		slot = rcu_dereference(slot->slots[i]);
691 		if (slot == NULL)
692 			goto out;
693 	}
694 
695 	/* Bottom level: grab some items */
696 	for (i = index & RADIX_TREE_MAP_MASK; i < RADIX_TREE_MAP_SIZE; i++) {
697 		struct radix_tree_node *node;
698 		index++;
699 		node = slot->slots[i];
700 		if (node) {
701 			results[nr_found++] = rcu_dereference(node);
702 			if (nr_found == max_items)
703 				goto out;
704 		}
705 	}
706 out:
707 	*next_index = index;
708 	return nr_found;
709 }
710 
711 /**
712  *	radix_tree_gang_lookup - perform multiple lookup on a radix tree
713  *	@root:		radix tree root
714  *	@results:	where the results of the lookup are placed
715  *	@first_index:	start the lookup from this key
716  *	@max_items:	place up to this many items at *results
717  *
718  *	Performs an index-ascending scan of the tree for present items.  Places
719  *	them at *@results and returns the number of items which were placed at
720  *	*@results.
721  *
722  *	The implementation is naive.
723  *
724  *	Like radix_tree_lookup, radix_tree_gang_lookup may be called under
725  *	rcu_read_lock. In this case, rather than the returned results being
726  *	an atomic snapshot of the tree at a single point in time, the semantics
727  *	of an RCU protected gang lookup are as though multiple radix_tree_lookups
728  *	have been issued in individual locks, and results stored in 'results'.
729  */
730 unsigned int
731 radix_tree_gang_lookup(struct radix_tree_root *root, void **results,
732 			unsigned long first_index, unsigned int max_items)
733 {
734 	unsigned long max_index;
735 	struct radix_tree_node *node;
736 	unsigned long cur_index = first_index;
737 	unsigned int ret;
738 
739 	node = rcu_dereference(root->rnode);
740 	if (!node)
741 		return 0;
742 
743 	if (!radix_tree_is_indirect_ptr(node)) {
744 		if (first_index > 0)
745 			return 0;
746 		results[0] = node;
747 		return 1;
748 	}
749 	node = radix_tree_indirect_to_ptr(node);
750 
751 	max_index = radix_tree_maxindex(node->height);
752 
753 	ret = 0;
754 	while (ret < max_items) {
755 		unsigned int nr_found;
756 		unsigned long next_index;	/* Index of next search */
757 
758 		if (cur_index > max_index)
759 			break;
760 		nr_found = __lookup(node, results + ret, cur_index,
761 					max_items - ret, &next_index);
762 		ret += nr_found;
763 		if (next_index == 0)
764 			break;
765 		cur_index = next_index;
766 	}
767 
768 	return ret;
769 }
770 EXPORT_SYMBOL(radix_tree_gang_lookup);
771 
772 /*
773  * FIXME: the two tag_get()s here should use find_next_bit() instead of
774  * open-coding the search.
775  */
776 static unsigned int
777 __lookup_tag(struct radix_tree_node *slot, void **results, unsigned long index,
778 	unsigned int max_items, unsigned long *next_index, unsigned int tag)
779 {
780 	unsigned int nr_found = 0;
781 	unsigned int shift, height;
782 
783 	height = slot->height;
784 	if (height == 0)
785 		goto out;
786 	shift = (height-1) * RADIX_TREE_MAP_SHIFT;
787 
788 	while (height > 0) {
789 		unsigned long i = (index >> shift) & RADIX_TREE_MAP_MASK ;
790 
791 		for (;;) {
792 			if (tag_get(slot, tag, i))
793 				break;
794 			index &= ~((1UL << shift) - 1);
795 			index += 1UL << shift;
796 			if (index == 0)
797 				goto out;	/* 32-bit wraparound */
798 			i++;
799 			if (i == RADIX_TREE_MAP_SIZE)
800 				goto out;
801 		}
802 		height--;
803 		if (height == 0) {	/* Bottom level: grab some items */
804 			unsigned long j = index & RADIX_TREE_MAP_MASK;
805 
806 			for ( ; j < RADIX_TREE_MAP_SIZE; j++) {
807 				struct radix_tree_node *node;
808 				index++;
809 				if (!tag_get(slot, tag, j))
810 					continue;
811 				node = slot->slots[j];
812 				/*
813 				 * Even though the tag was found set, we need to
814 				 * recheck that we have a non-NULL node, because
815 				 * if this lookup is lockless, it may have been
816 				 * subsequently deleted.
817 				 *
818 				 * Similar care must be taken in any place that
819 				 * lookup ->slots[x] without a lock (ie. can't
820 				 * rely on its value remaining the same).
821 				 */
822 				if (node) {
823 					node = rcu_dereference(node);
824 					results[nr_found++] = node;
825 					if (nr_found == max_items)
826 						goto out;
827 				}
828 			}
829 		}
830 		shift -= RADIX_TREE_MAP_SHIFT;
831 		slot = rcu_dereference(slot->slots[i]);
832 		if (slot == NULL)
833 			break;
834 	}
835 out:
836 	*next_index = index;
837 	return nr_found;
838 }
839 
840 /**
841  *	radix_tree_gang_lookup_tag - perform multiple lookup on a radix tree
842  *	                             based on a tag
843  *	@root:		radix tree root
844  *	@results:	where the results of the lookup are placed
845  *	@first_index:	start the lookup from this key
846  *	@max_items:	place up to this many items at *results
847  *	@tag:		the tag index (< RADIX_TREE_MAX_TAGS)
848  *
849  *	Performs an index-ascending scan of the tree for present items which
850  *	have the tag indexed by @tag set.  Places the items at *@results and
851  *	returns the number of items which were placed at *@results.
852  */
853 unsigned int
854 radix_tree_gang_lookup_tag(struct radix_tree_root *root, void **results,
855 		unsigned long first_index, unsigned int max_items,
856 		unsigned int tag)
857 {
858 	struct radix_tree_node *node;
859 	unsigned long max_index;
860 	unsigned long cur_index = first_index;
861 	unsigned int ret;
862 
863 	/* check the root's tag bit */
864 	if (!root_tag_get(root, tag))
865 		return 0;
866 
867 	node = rcu_dereference(root->rnode);
868 	if (!node)
869 		return 0;
870 
871 	if (!radix_tree_is_indirect_ptr(node)) {
872 		if (first_index > 0)
873 			return 0;
874 		results[0] = node;
875 		return 1;
876 	}
877 	node = radix_tree_indirect_to_ptr(node);
878 
879 	max_index = radix_tree_maxindex(node->height);
880 
881 	ret = 0;
882 	while (ret < max_items) {
883 		unsigned int nr_found;
884 		unsigned long next_index;	/* Index of next search */
885 
886 		if (cur_index > max_index)
887 			break;
888 		nr_found = __lookup_tag(node, results + ret, cur_index,
889 					max_items - ret, &next_index, tag);
890 		ret += nr_found;
891 		if (next_index == 0)
892 			break;
893 		cur_index = next_index;
894 	}
895 
896 	return ret;
897 }
898 EXPORT_SYMBOL(radix_tree_gang_lookup_tag);
899 
900 /**
901  *	radix_tree_shrink    -    shrink height of a radix tree to minimal
902  *	@root		radix tree root
903  */
904 static inline void radix_tree_shrink(struct radix_tree_root *root)
905 {
906 	/* try to shrink tree height */
907 	while (root->height > 0) {
908 		struct radix_tree_node *to_free = root->rnode;
909 		void *newptr;
910 
911 		BUG_ON(!radix_tree_is_indirect_ptr(to_free));
912 		to_free = radix_tree_indirect_to_ptr(to_free);
913 
914 		/*
915 		 * The candidate node has more than one child, or its child
916 		 * is not at the leftmost slot, we cannot shrink.
917 		 */
918 		if (to_free->count != 1)
919 			break;
920 		if (!to_free->slots[0])
921 			break;
922 
923 		/*
924 		 * We don't need rcu_assign_pointer(), since we are simply
925 		 * moving the node from one part of the tree to another. If
926 		 * it was safe to dereference the old pointer to it
927 		 * (to_free->slots[0]), it will be safe to dereference the new
928 		 * one (root->rnode).
929 		 */
930 		newptr = to_free->slots[0];
931 		if (root->height > 1)
932 			newptr = radix_tree_ptr_to_indirect(newptr);
933 		root->rnode = newptr;
934 		root->height--;
935 		/* must only free zeroed nodes into the slab */
936 		tag_clear(to_free, 0, 0);
937 		tag_clear(to_free, 1, 0);
938 		to_free->slots[0] = NULL;
939 		to_free->count = 0;
940 		radix_tree_node_free(to_free);
941 	}
942 }
943 
944 /**
945  *	radix_tree_delete    -    delete an item from a radix tree
946  *	@root:		radix tree root
947  *	@index:		index key
948  *
949  *	Remove the item at @index from the radix tree rooted at @root.
950  *
951  *	Returns the address of the deleted item, or NULL if it was not present.
952  */
953 void *radix_tree_delete(struct radix_tree_root *root, unsigned long index)
954 {
955 	/*
956 	 * The radix tree path needs to be one longer than the maximum path
957 	 * since the "list" is null terminated.
958 	 */
959 	struct radix_tree_path path[RADIX_TREE_MAX_PATH + 1], *pathp = path;
960 	struct radix_tree_node *slot = NULL;
961 	struct radix_tree_node *to_free;
962 	unsigned int height, shift;
963 	int tag;
964 	int offset;
965 
966 	height = root->height;
967 	if (index > radix_tree_maxindex(height))
968 		goto out;
969 
970 	slot = root->rnode;
971 	if (height == 0) {
972 		root_tag_clear_all(root);
973 		root->rnode = NULL;
974 		goto out;
975 	}
976 	slot = radix_tree_indirect_to_ptr(slot);
977 
978 	shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
979 	pathp->node = NULL;
980 
981 	do {
982 		if (slot == NULL)
983 			goto out;
984 
985 		pathp++;
986 		offset = (index >> shift) & RADIX_TREE_MAP_MASK;
987 		pathp->offset = offset;
988 		pathp->node = slot;
989 		slot = slot->slots[offset];
990 		shift -= RADIX_TREE_MAP_SHIFT;
991 		height--;
992 	} while (height > 0);
993 
994 	if (slot == NULL)
995 		goto out;
996 
997 	/*
998 	 * Clear all tags associated with the just-deleted item
999 	 */
1000 	for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) {
1001 		if (tag_get(pathp->node, tag, pathp->offset))
1002 			radix_tree_tag_clear(root, index, tag);
1003 	}
1004 
1005 	to_free = NULL;
1006 	/* Now free the nodes we do not need anymore */
1007 	while (pathp->node) {
1008 		pathp->node->slots[pathp->offset] = NULL;
1009 		pathp->node->count--;
1010 		/*
1011 		 * Queue the node for deferred freeing after the
1012 		 * last reference to it disappears (set NULL, above).
1013 		 */
1014 		if (to_free)
1015 			radix_tree_node_free(to_free);
1016 
1017 		if (pathp->node->count) {
1018 			if (pathp->node ==
1019 					radix_tree_indirect_to_ptr(root->rnode))
1020 				radix_tree_shrink(root);
1021 			goto out;
1022 		}
1023 
1024 		/* Node with zero slots in use so free it */
1025 		to_free = pathp->node;
1026 		pathp--;
1027 
1028 	}
1029 	root_tag_clear_all(root);
1030 	root->height = 0;
1031 	root->rnode = NULL;
1032 	if (to_free)
1033 		radix_tree_node_free(to_free);
1034 
1035 out:
1036 	return slot;
1037 }
1038 EXPORT_SYMBOL(radix_tree_delete);
1039 
1040 /**
1041  *	radix_tree_tagged - test whether any items in the tree are tagged
1042  *	@root:		radix tree root
1043  *	@tag:		tag to test
1044  */
1045 int radix_tree_tagged(struct radix_tree_root *root, unsigned int tag)
1046 {
1047 	return root_tag_get(root, tag);
1048 }
1049 EXPORT_SYMBOL(radix_tree_tagged);
1050 
1051 static void
1052 radix_tree_node_ctor(struct kmem_cache *cachep, void *node)
1053 {
1054 	memset(node, 0, sizeof(struct radix_tree_node));
1055 }
1056 
1057 static __init unsigned long __maxindex(unsigned int height)
1058 {
1059 	unsigned int width = height * RADIX_TREE_MAP_SHIFT;
1060 	int shift = RADIX_TREE_INDEX_BITS - width;
1061 
1062 	if (shift < 0)
1063 		return ~0UL;
1064 	if (shift >= BITS_PER_LONG)
1065 		return 0UL;
1066 	return ~0UL >> shift;
1067 }
1068 
1069 static __init void radix_tree_init_maxindex(void)
1070 {
1071 	unsigned int i;
1072 
1073 	for (i = 0; i < ARRAY_SIZE(height_to_maxindex); i++)
1074 		height_to_maxindex[i] = __maxindex(i);
1075 }
1076 
1077 static int radix_tree_callback(struct notifier_block *nfb,
1078                             unsigned long action,
1079                             void *hcpu)
1080 {
1081        int cpu = (long)hcpu;
1082        struct radix_tree_preload *rtp;
1083 
1084        /* Free per-cpu pool of perloaded nodes */
1085        if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
1086                rtp = &per_cpu(radix_tree_preloads, cpu);
1087                while (rtp->nr) {
1088                        kmem_cache_free(radix_tree_node_cachep,
1089                                        rtp->nodes[rtp->nr-1]);
1090                        rtp->nodes[rtp->nr-1] = NULL;
1091                        rtp->nr--;
1092                }
1093        }
1094        return NOTIFY_OK;
1095 }
1096 
1097 void __init radix_tree_init(void)
1098 {
1099 	radix_tree_node_cachep = kmem_cache_create("radix_tree_node",
1100 			sizeof(struct radix_tree_node), 0,
1101 			SLAB_PANIC, radix_tree_node_ctor);
1102 	radix_tree_init_maxindex();
1103 	hotcpu_notifier(radix_tree_callback, 0);
1104 }
1105