xref: /openbmc/linux/kernel/bpf/hashtab.c (revision 6a613ac6)
/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of version 2 of the GNU General Public
 * License as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 */
#include <linux/bpf.h>
#include <linux/jhash.h>
#include <linux/filter.h>
#include <linux/vmalloc.h>

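/* runtime state of one hash map: the generic bpf_map header, the bucket
 * array, a raw spinlock serializing updates/deletes, and the element count
 */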
struct bpf_htab {
	struct bpf_map map;
	struct hlist_head *buckets;
	raw_spinlock_t lock;
	u32 count;	/* number of elements in this hashtable */
	u32 n_buckets;	/* number of hash buckets */
	u32 elem_size;	/* size of each element in bytes */
};

/* each htab element is struct htab_elem + key + value */
struct htab_elem {
	struct hlist_node hash_node;
	struct rcu_head rcu;
	u32 hash;
	char key[0] __aligned(8);
};

/* Called from syscall */
static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
{
	struct bpf_htab *htab;
	int err, i;

	htab = kzalloc(sizeof(*htab), GFP_USER);
	if (!htab)
		return ERR_PTR(-ENOMEM);

	/* mandatory map attributes */
	htab->map.key_size = attr->key_size;
	htab->map.value_size = attr->value_size;
	htab->map.max_entries = attr->max_entries;

	/* check sanity of attributes.
	 * value_size == 0 may be allowed in the future to use map as a set
	 */
	err = -EINVAL;
	if (htab->map.max_entries == 0 || htab->map.key_size == 0 ||
	    htab->map.value_size == 0)
		goto free_htab;

	/* hash table size must be power of 2 */
	htab->n_buckets = roundup_pow_of_two(htab->map.max_entries);

	err = -E2BIG;
	if (htab->map.key_size > MAX_BPF_STACK)
		/* eBPF programs initialize keys on stack, so they cannot be
		 * larger than max stack size
		 */
		goto free_htab;

	if (htab->map.value_size >= (1 << (KMALLOC_SHIFT_MAX - 1)) -
	    MAX_BPF_STACK - sizeof(struct htab_elem))
		/* if value_size is bigger, the user space won't be able to
		 * access the elements via bpf syscall. This check also makes
		 * sure that the elem_size doesn't overflow and it's
		 * kmalloc-able later in htab_map_update_elem()
		 */
		goto free_htab;

	htab->elem_size = sizeof(struct htab_elem) +
			  round_up(htab->map.key_size, 8) +
			  htab->map.value_size;

	/* prevent zero size kmalloc and check for u32 overflow */
	if (htab->n_buckets == 0 ||
	    htab->n_buckets > U32_MAX / sizeof(struct hlist_head))
		goto free_htab;

	if ((u64) htab->n_buckets * sizeof(struct hlist_head) +
	    (u64) htab->elem_size * htab->map.max_entries >=
	    U32_MAX - PAGE_SIZE)
		/* make sure page count doesn't overflow */
		goto free_htab;

	htab->map.pages = round_up(htab->n_buckets * sizeof(struct hlist_head) +
				   htab->elem_size * htab->map.max_entries,
				   PAGE_SIZE) >> PAGE_SHIFT;

	err = -ENOMEM;
	htab->buckets = kmalloc_array(htab->n_buckets, sizeof(struct hlist_head),
				      GFP_USER | __GFP_NOWARN);

	if (!htab->buckets) {
		htab->buckets = vmalloc(htab->n_buckets * sizeof(struct hlist_head));
		if (!htab->buckets)
			goto free_htab;
	}

	for (i = 0; i < htab->n_buckets; i++)
		INIT_HLIST_HEAD(&htab->buckets[i]);

	raw_spin_lock_init(&htab->lock);
	htab->count = 0;

	return &htab->map;

free_htab:
	kfree(htab);
	return ERR_PTR(err);
}

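/* hash the raw key bytes with jhash; lookup, update and delete all use the
 * same function, so a given key always maps to the same bucket
 */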
static inline u32 htab_map_hash(const void *key, u32 key_len)
{
	return jhash(key, key_len, 0);
}

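/* n_buckets is a power of 2, so masking the hash selects the bucket */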
static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash)
{
	return &htab->buckets[hash & (htab->n_buckets - 1)];
}

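/* walk one RCU-protected bucket list and return the element whose hash and
 * key both match, or NULL if there is none
 */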
static struct htab_elem *lookup_elem_raw(struct hlist_head *head, u32 hash,
					 void *key, u32 key_size)
{
	struct htab_elem *l;

	hlist_for_each_entry_rcu(l, head, hash_node)
		if (l->hash == hash && !memcmp(&l->key, key, key_size))
			return l;

	return NULL;
}

/* Called from syscall or from eBPF program */
static void *htab_map_lookup_elem(struct bpf_map *map, void *key)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct hlist_head *head;
	struct htab_elem *l;
	u32 hash, key_size;

	/* Must be called with rcu_read_lock. */
	WARN_ON_ONCE(!rcu_read_lock_held());

	key_size = map->key_size;

	hash = htab_map_hash(key, key_size);

	head = select_bucket(htab, hash);

	l = lookup_elem_raw(head, hash, key, key_size);

	if (l)
		return l->key + round_up(map->key_size, 8);

	return NULL;
}

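/* return the key that follows 'key': either the next element in the same
 * bucket, or the first element of a later bucket
 */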
/* Called from syscall */
static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct hlist_head *head;
	struct htab_elem *l, *next_l;
	u32 hash, key_size;
	int i;

	WARN_ON_ONCE(!rcu_read_lock_held());

	key_size = map->key_size;

	hash = htab_map_hash(key, key_size);

	head = select_bucket(htab, hash);

	/* lookup the key */
	l = lookup_elem_raw(head, hash, key, key_size);

	if (!l) {
		i = 0;
		goto find_first_elem;
	}

	/* key was found, get next key in the same bucket */
	next_l = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(&l->hash_node)),
				  struct htab_elem, hash_node);

	if (next_l) {
		/* if next elem in this hash list is non-zero, just return it */
		memcpy(next_key, next_l->key, key_size);
		return 0;
	}

	/* no more elements in this hash list, go to the next bucket */
	i = hash & (htab->n_buckets - 1);
	i++;

find_first_elem:
	/* iterate over buckets */
	for (; i < htab->n_buckets; i++) {
		head = select_bucket(htab, i);

		/* pick first element in the bucket */
		next_l = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)),
					  struct htab_elem, hash_node);
		if (next_l) {
			/* if it's not empty, just return it */
			memcpy(next_key, next_l->key, key_size);
			return 0;
		}
	}

	/* iterated over all buckets and all elements */
	return -ENOENT;
}

/* Called from syscall or from eBPF program */
static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
				u64 map_flags)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct htab_elem *l_new, *l_old;
	struct hlist_head *head;
	unsigned long flags;
	u32 key_size;
	int ret;

	if (map_flags > BPF_EXIST)
		/* unknown flags */
		return -EINVAL;

	WARN_ON_ONCE(!rcu_read_lock_held());

	/* allocate new element outside of lock */
	l_new = kmalloc(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN);
	if (!l_new)
		return -ENOMEM;

	key_size = map->key_size;

	memcpy(l_new->key, key, key_size);
	memcpy(l_new->key + round_up(key_size, 8), value, map->value_size);

	l_new->hash = htab_map_hash(l_new->key, key_size);

	/* bpf_map_update_elem() can be called in_irq() */
	raw_spin_lock_irqsave(&htab->lock, flags);

	head = select_bucket(htab, l_new->hash);

	l_old = lookup_elem_raw(head, l_new->hash, key, key_size);

	if (!l_old && unlikely(htab->count >= map->max_entries)) {
		/* if elem with this 'key' doesn't exist and we've reached
		 * max_entries limit, fail insertion of new elem
		 */
		ret = -E2BIG;
		goto err;
	}

	if (l_old && map_flags == BPF_NOEXIST) {
		/* elem already exists */
		ret = -EEXIST;
		goto err;
	}

	if (!l_old && map_flags == BPF_EXIST) {
		/* elem doesn't exist, cannot update it */
		ret = -ENOENT;
		goto err;
	}

	/* add new element to the head of the list, so that concurrent
	 * search will find it before old elem
	 */
	hlist_add_head_rcu(&l_new->hash_node, head);
	if (l_old) {
		hlist_del_rcu(&l_old->hash_node);
		kfree_rcu(l_old, rcu);
	} else {
		htab->count++;
	}
	raw_spin_unlock_irqrestore(&htab->lock, flags);

	return 0;
err:
	raw_spin_unlock_irqrestore(&htab->lock, flags);
	kfree(l_new);
	return ret;
}

/* Called from syscall or from eBPF program */
static int htab_map_delete_elem(struct bpf_map *map, void *key)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct hlist_head *head;
	struct htab_elem *l;
	unsigned long flags;
	u32 hash, key_size;
	int ret = -ENOENT;

	WARN_ON_ONCE(!rcu_read_lock_held());

	key_size = map->key_size;

	hash = htab_map_hash(key, key_size);

	raw_spin_lock_irqsave(&htab->lock, flags);

	head = select_bucket(htab, hash);

	l = lookup_elem_raw(head, hash, key, key_size);

	if (l) {
		hlist_del_rcu(&l->hash_node);
		htab->count--;
		kfree_rcu(l, rcu);
		ret = 0;
	}

	raw_spin_unlock_irqrestore(&htab->lock, flags);
	return ret;
}

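/* free every remaining element; only called from htab_map_free() once no
 * program or syscall user can reach the map, so plain kfree() is enough
 */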
static void delete_all_elements(struct bpf_htab *htab)
{
	int i;

	for (i = 0; i < htab->n_buckets; i++) {
		struct hlist_head *head = select_bucket(htab, i);
		struct hlist_node *n;
		struct htab_elem *l;

		hlist_for_each_entry_safe(l, n, head, hash_node) {
			hlist_del_rcu(&l->hash_node);
			htab->count--;
			kfree(l);
		}
	}
}

/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
static void htab_map_free(struct bpf_map *map)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);

	/* at this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
	 * so the programs (there can be more than one that used this map) were
	 * disconnected from events. Wait for outstanding critical sections in
	 * these programs to complete
	 */
	synchronize_rcu();

	/* some of kfree_rcu() callbacks for elements of this map may not have
	 * executed. It's ok. Proceed to free residual elements and map itself
	 */
	delete_all_elements(htab);
	kvfree(htab->buckets);
	kfree(htab);
}

static const struct bpf_map_ops htab_ops = {
	.map_alloc = htab_map_alloc,
	.map_free = htab_map_free,
	.map_get_next_key = htab_map_get_next_key,
	.map_lookup_elem = htab_map_lookup_elem,
	.map_update_elem = htab_map_update_elem,
	.map_delete_elem = htab_map_delete_elem,
};

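/* tie the ops above to BPF_MAP_TYPE_HASH and register the type at boot */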
static struct bpf_map_type_list htab_type __read_mostly = {
	.ops = &htab_ops,
	.type = BPF_MAP_TYPE_HASH,
};

static int __init register_htab_map(void)
{
	bpf_register_map_type(&htab_type);
	return 0;
}
late_initcall(register_htab_map);