xref: /openbmc/linux/kernel/bpf/hashtab.c (revision 87fcfa7b7fe6bf819033fe827a27f710e38639b5)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
3  * Copyright (c) 2016 Facebook
4  */
5 #include <linux/bpf.h>
6 #include <linux/btf.h>
7 #include <linux/jhash.h>
8 #include <linux/filter.h>
9 #include <linux/rculist_nulls.h>
10 #include <linux/random.h>
11 #include <uapi/linux/btf.h>
12 #include "percpu_freelist.h"
13 #include "bpf_lru_list.h"
14 #include "map_in_map.h"
15 
16 #define HTAB_CREATE_FLAG_MASK						\
17 	(BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU | BPF_F_NUMA_NODE |	\
18 	 BPF_F_ACCESS_MASK | BPF_F_ZERO_SEED)
19 
20 #define BATCH_OPS(_name)			\
21 	.map_lookup_batch =			\
22 	_name##_map_lookup_batch,		\
23 	.map_lookup_and_delete_batch =		\
24 	_name##_map_lookup_and_delete_batch,	\
25 	.map_update_batch =			\
26 	generic_map_update_batch,		\
27 	.map_delete_batch =			\
28 	generic_map_delete_batch
29 
30 struct bucket {
31 	struct hlist_nulls_head head;
32 	raw_spinlock_t lock;
33 };
34 
35 struct bpf_htab {
36 	struct bpf_map map;
37 	struct bucket *buckets;
38 	void *elems;
39 	union {
40 		struct pcpu_freelist freelist;
41 		struct bpf_lru lru;
42 	};
43 	struct htab_elem *__percpu *extra_elems;
44 	atomic_t count;	/* number of elements in this hashtable */
45 	u32 n_buckets;	/* number of hash buckets */
46 	u32 elem_size;	/* size of each element in bytes */
47 	u32 hashrnd;
48 };
49 
50 /* each htab element is struct htab_elem + key + value */
51 struct htab_elem {
52 	union {
53 		struct hlist_nulls_node hash_node;
54 		struct {
55 			void *padding;
56 			union {
57 				struct bpf_htab *htab;
58 				struct pcpu_freelist_node fnode;
59 			};
60 		};
61 	};
62 	union {
63 		struct rcu_head rcu;
64 		struct bpf_lru_node lru_node;
65 	};
66 	u32 hash;
67 	char key[0] __aligned(8);
68 };
69 
70 static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node);
71 
72 static bool htab_is_lru(const struct bpf_htab *htab)
73 {
74 	return htab->map.map_type == BPF_MAP_TYPE_LRU_HASH ||
75 		htab->map.map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH;
76 }
77 
78 static bool htab_is_percpu(const struct bpf_htab *htab)
79 {
80 	return htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH ||
81 		htab->map.map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH;
82 }
83 
84 static bool htab_is_prealloc(const struct bpf_htab *htab)
85 {
86 	return !(htab->map.map_flags & BPF_F_NO_PREALLOC);
87 }
88 
89 static inline void htab_elem_set_ptr(struct htab_elem *l, u32 key_size,
90 				     void __percpu *pptr)
91 {
92 	*(void __percpu **)(l->key + key_size) = pptr;
93 }
94 
95 static inline void __percpu *htab_elem_get_ptr(struct htab_elem *l, u32 key_size)
96 {
97 	return *(void __percpu **)(l->key + key_size);
98 }
99 
100 static void *fd_htab_map_get_ptr(const struct bpf_map *map, struct htab_elem *l)
101 {
102 	return *(void **)(l->key + roundup(map->key_size, 8));
103 }
104 
105 static struct htab_elem *get_htab_elem(struct bpf_htab *htab, int i)
106 {
107 	return (struct htab_elem *) (htab->elems + i * htab->elem_size);
108 }
109 
110 static void htab_free_elems(struct bpf_htab *htab)
111 {
112 	int i;
113 
114 	if (!htab_is_percpu(htab))
115 		goto free_elems;
116 
117 	for (i = 0; i < htab->map.max_entries; i++) {
118 		void __percpu *pptr;
119 
120 		pptr = htab_elem_get_ptr(get_htab_elem(htab, i),
121 					 htab->map.key_size);
122 		free_percpu(pptr);
123 		cond_resched();
124 	}
125 free_elems:
126 	bpf_map_area_free(htab->elems);
127 }
128 
129 static struct htab_elem *prealloc_lru_pop(struct bpf_htab *htab, void *key,
130 					  u32 hash)
131 {
132 	struct bpf_lru_node *node = bpf_lru_pop_free(&htab->lru, hash);
133 	struct htab_elem *l;
134 
135 	if (node) {
136 		l = container_of(node, struct htab_elem, lru_node);
137 		memcpy(l->key, key, htab->map.key_size);
138 		return l;
139 	}
140 
141 	return NULL;
142 }
143 
144 static int prealloc_init(struct bpf_htab *htab)
145 {
146 	u32 num_entries = htab->map.max_entries;
147 	int err = -ENOMEM, i;
148 
149 	if (!htab_is_percpu(htab) && !htab_is_lru(htab))
150 		num_entries += num_possible_cpus();
151 
152 	htab->elems = bpf_map_area_alloc(htab->elem_size * num_entries,
153 					 htab->map.numa_node);
154 	if (!htab->elems)
155 		return -ENOMEM;
156 
157 	if (!htab_is_percpu(htab))
158 		goto skip_percpu_elems;
159 
160 	for (i = 0; i < num_entries; i++) {
161 		u32 size = round_up(htab->map.value_size, 8);
162 		void __percpu *pptr;
163 
164 		pptr = __alloc_percpu_gfp(size, 8, GFP_USER | __GFP_NOWARN);
165 		if (!pptr)
166 			goto free_elems;
167 		htab_elem_set_ptr(get_htab_elem(htab, i), htab->map.key_size,
168 				  pptr);
169 		cond_resched();
170 	}
171 
172 skip_percpu_elems:
173 	if (htab_is_lru(htab))
174 		err = bpf_lru_init(&htab->lru,
175 				   htab->map.map_flags & BPF_F_NO_COMMON_LRU,
176 				   offsetof(struct htab_elem, hash) -
177 				   offsetof(struct htab_elem, lru_node),
178 				   htab_lru_map_delete_node,
179 				   htab);
180 	else
181 		err = pcpu_freelist_init(&htab->freelist);
182 
183 	if (err)
184 		goto free_elems;
185 
186 	if (htab_is_lru(htab))
187 		bpf_lru_populate(&htab->lru, htab->elems,
188 				 offsetof(struct htab_elem, lru_node),
189 				 htab->elem_size, num_entries);
190 	else
191 		pcpu_freelist_populate(&htab->freelist,
192 				       htab->elems + offsetof(struct htab_elem, fnode),
193 				       htab->elem_size, num_entries);
194 
195 	return 0;
196 
197 free_elems:
198 	htab_free_elems(htab);
199 	return err;
200 }
201 
202 static void prealloc_destroy(struct bpf_htab *htab)
203 {
204 	htab_free_elems(htab);
205 
206 	if (htab_is_lru(htab))
207 		bpf_lru_destroy(&htab->lru);
208 	else
209 		pcpu_freelist_destroy(&htab->freelist);
210 }
211 
212 static int alloc_extra_elems(struct bpf_htab *htab)
213 {
214 	struct htab_elem *__percpu *pptr, *l_new;
215 	struct pcpu_freelist_node *l;
216 	int cpu;
217 
218 	pptr = __alloc_percpu_gfp(sizeof(struct htab_elem *), 8,
219 				  GFP_USER | __GFP_NOWARN);
220 	if (!pptr)
221 		return -ENOMEM;
222 
223 	for_each_possible_cpu(cpu) {
224 		l = pcpu_freelist_pop(&htab->freelist);
225 		/* pop will succeed, since prealloc_init()
226 		 * preallocated extra num_possible_cpus elements
227 		 */
228 		l_new = container_of(l, struct htab_elem, fnode);
229 		*per_cpu_ptr(pptr, cpu) = l_new;
230 	}
231 	htab->extra_elems = pptr;
232 	return 0;
233 }
234 
235 /* Called from syscall */
236 static int htab_map_alloc_check(union bpf_attr *attr)
237 {
238 	bool percpu = (attr->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
239 		       attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH);
240 	bool lru = (attr->map_type == BPF_MAP_TYPE_LRU_HASH ||
241 		    attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH);
242 	/* percpu_lru means each cpu has its own LRU list.
243 	 * it is different from BPF_MAP_TYPE_PERCPU_HASH where
244 	 * the map's value itself is percpu.  percpu_lru has
245 	 * nothing to do with the map's value.
246 	 */
247 	bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU);
248 	bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC);
249 	bool zero_seed = (attr->map_flags & BPF_F_ZERO_SEED);
250 	int numa_node = bpf_map_attr_numa_node(attr);
251 
252 	BUILD_BUG_ON(offsetof(struct htab_elem, htab) !=
253 		     offsetof(struct htab_elem, hash_node.pprev));
254 	BUILD_BUG_ON(offsetof(struct htab_elem, fnode.next) !=
255 		     offsetof(struct htab_elem, hash_node.pprev));
256 
257 	if (lru && !capable(CAP_SYS_ADMIN))
258 		/* LRU implementation is much complicated than other
259 		 * maps.  Hence, limit to CAP_SYS_ADMIN for now.
260 		 */
261 		return -EPERM;
262 
263 	if (zero_seed && !capable(CAP_SYS_ADMIN))
264 		/* Guard against local DoS, and discourage production use. */
265 		return -EPERM;
266 
267 	if (attr->map_flags & ~HTAB_CREATE_FLAG_MASK ||
268 	    !bpf_map_flags_access_ok(attr->map_flags))
269 		return -EINVAL;
270 
271 	if (!lru && percpu_lru)
272 		return -EINVAL;
273 
274 	if (lru && !prealloc)
275 		return -ENOTSUPP;
276 
277 	if (numa_node != NUMA_NO_NODE && (percpu || percpu_lru))
278 		return -EINVAL;
279 
280 	/* check sanity of attributes.
281 	 * value_size == 0 may be allowed in the future to use map as a set
282 	 */
283 	if (attr->max_entries == 0 || attr->key_size == 0 ||
284 	    attr->value_size == 0)
285 		return -EINVAL;
286 
287 	if (attr->key_size > MAX_BPF_STACK)
288 		/* eBPF programs initialize keys on stack, so they cannot be
289 		 * larger than max stack size
290 		 */
291 		return -E2BIG;
292 
293 	if (attr->value_size >= KMALLOC_MAX_SIZE -
294 	    MAX_BPF_STACK - sizeof(struct htab_elem))
295 		/* if value_size is bigger, the user space won't be able to
296 		 * access the elements via bpf syscall. This check also makes
297 		 * sure that the elem_size doesn't overflow and it's
298 		 * kmalloc-able later in htab_map_update_elem()
299 		 */
300 		return -E2BIG;
301 
302 	return 0;
303 }
304 
305 static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
306 {
307 	bool percpu = (attr->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
308 		       attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH);
309 	bool lru = (attr->map_type == BPF_MAP_TYPE_LRU_HASH ||
310 		    attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH);
311 	/* percpu_lru means each cpu has its own LRU list.
312 	 * it is different from BPF_MAP_TYPE_PERCPU_HASH where
313 	 * the map's value itself is percpu.  percpu_lru has
314 	 * nothing to do with the map's value.
315 	 */
316 	bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU);
317 	bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC);
318 	struct bpf_htab *htab;
319 	int err, i;
320 	u64 cost;
321 
322 	htab = kzalloc(sizeof(*htab), GFP_USER);
323 	if (!htab)
324 		return ERR_PTR(-ENOMEM);
325 
326 	bpf_map_init_from_attr(&htab->map, attr);
327 
328 	if (percpu_lru) {
329 		/* ensure each CPU's lru list has >=1 elements.
330 		 * since we are at it, make each lru list has the same
331 		 * number of elements.
332 		 */
333 		htab->map.max_entries = roundup(attr->max_entries,
334 						num_possible_cpus());
335 		if (htab->map.max_entries < attr->max_entries)
336 			htab->map.max_entries = rounddown(attr->max_entries,
337 							  num_possible_cpus());
338 	}
339 
340 	/* hash table size must be power of 2 */
341 	htab->n_buckets = roundup_pow_of_two(htab->map.max_entries);
342 
343 	htab->elem_size = sizeof(struct htab_elem) +
344 			  round_up(htab->map.key_size, 8);
345 	if (percpu)
346 		htab->elem_size += sizeof(void *);
347 	else
348 		htab->elem_size += round_up(htab->map.value_size, 8);
349 
350 	err = -E2BIG;
351 	/* prevent zero size kmalloc and check for u32 overflow */
352 	if (htab->n_buckets == 0 ||
353 	    htab->n_buckets > U32_MAX / sizeof(struct bucket))
354 		goto free_htab;
355 
356 	cost = (u64) htab->n_buckets * sizeof(struct bucket) +
357 	       (u64) htab->elem_size * htab->map.max_entries;
358 
359 	if (percpu)
360 		cost += (u64) round_up(htab->map.value_size, 8) *
361 			num_possible_cpus() * htab->map.max_entries;
362 	else
363 	       cost += (u64) htab->elem_size * num_possible_cpus();
364 
365 	/* if map size is larger than memlock limit, reject it */
366 	err = bpf_map_charge_init(&htab->map.memory, cost);
367 	if (err)
368 		goto free_htab;
369 
370 	err = -ENOMEM;
371 	htab->buckets = bpf_map_area_alloc(htab->n_buckets *
372 					   sizeof(struct bucket),
373 					   htab->map.numa_node);
374 	if (!htab->buckets)
375 		goto free_charge;
376 
377 	if (htab->map.map_flags & BPF_F_ZERO_SEED)
378 		htab->hashrnd = 0;
379 	else
380 		htab->hashrnd = get_random_int();
381 
382 	for (i = 0; i < htab->n_buckets; i++) {
383 		INIT_HLIST_NULLS_HEAD(&htab->buckets[i].head, i);
384 		raw_spin_lock_init(&htab->buckets[i].lock);
385 	}
386 
387 	if (prealloc) {
388 		err = prealloc_init(htab);
389 		if (err)
390 			goto free_buckets;
391 
392 		if (!percpu && !lru) {
393 			/* lru itself can remove the least used element, so
394 			 * there is no need for an extra elem during map_update.
395 			 */
396 			err = alloc_extra_elems(htab);
397 			if (err)
398 				goto free_prealloc;
399 		}
400 	}
401 
402 	return &htab->map;
403 
404 free_prealloc:
405 	prealloc_destroy(htab);
406 free_buckets:
407 	bpf_map_area_free(htab->buckets);
408 free_charge:
409 	bpf_map_charge_finish(&htab->map.memory);
410 free_htab:
411 	kfree(htab);
412 	return ERR_PTR(err);
413 }
414 
415 static inline u32 htab_map_hash(const void *key, u32 key_len, u32 hashrnd)
416 {
417 	return jhash(key, key_len, hashrnd);
418 }
419 
420 static inline struct bucket *__select_bucket(struct bpf_htab *htab, u32 hash)
421 {
422 	return &htab->buckets[hash & (htab->n_buckets - 1)];
423 }
424 
425 static inline struct hlist_nulls_head *select_bucket(struct bpf_htab *htab, u32 hash)
426 {
427 	return &__select_bucket(htab, hash)->head;
428 }
429 
430 /* this lookup function can only be called with bucket lock taken */
431 static struct htab_elem *lookup_elem_raw(struct hlist_nulls_head *head, u32 hash,
432 					 void *key, u32 key_size)
433 {
434 	struct hlist_nulls_node *n;
435 	struct htab_elem *l;
436 
437 	hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
438 		if (l->hash == hash && !memcmp(&l->key, key, key_size))
439 			return l;
440 
441 	return NULL;
442 }
443 
444 /* can be called without bucket lock. it will repeat the loop in
445  * the unlikely event when elements moved from one bucket into another
446  * while link list is being walked
447  */
448 static struct htab_elem *lookup_nulls_elem_raw(struct hlist_nulls_head *head,
449 					       u32 hash, void *key,
450 					       u32 key_size, u32 n_buckets)
451 {
452 	struct hlist_nulls_node *n;
453 	struct htab_elem *l;
454 
455 again:
456 	hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
457 		if (l->hash == hash && !memcmp(&l->key, key, key_size))
458 			return l;
459 
460 	if (unlikely(get_nulls_value(n) != (hash & (n_buckets - 1))))
461 		goto again;
462 
463 	return NULL;
464 }
465 
466 /* Called from syscall or from eBPF program directly, so
467  * arguments have to match bpf_map_lookup_elem() exactly.
468  * The return value is adjusted by BPF instructions
469  * in htab_map_gen_lookup().
470  */
471 static void *__htab_map_lookup_elem(struct bpf_map *map, void *key)
472 {
473 	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
474 	struct hlist_nulls_head *head;
475 	struct htab_elem *l;
476 	u32 hash, key_size;
477 
478 	/* Must be called with rcu_read_lock. */
479 	WARN_ON_ONCE(!rcu_read_lock_held());
480 
481 	key_size = map->key_size;
482 
483 	hash = htab_map_hash(key, key_size, htab->hashrnd);
484 
485 	head = select_bucket(htab, hash);
486 
487 	l = lookup_nulls_elem_raw(head, hash, key, key_size, htab->n_buckets);
488 
489 	return l;
490 }
491 
492 static void *htab_map_lookup_elem(struct bpf_map *map, void *key)
493 {
494 	struct htab_elem *l = __htab_map_lookup_elem(map, key);
495 
496 	if (l)
497 		return l->key + round_up(map->key_size, 8);
498 
499 	return NULL;
500 }
501 
502 /* inline bpf_map_lookup_elem() call.
503  * Instead of:
504  * bpf_prog
505  *   bpf_map_lookup_elem
506  *     map->ops->map_lookup_elem
507  *       htab_map_lookup_elem
508  *         __htab_map_lookup_elem
509  * do:
510  * bpf_prog
511  *   __htab_map_lookup_elem
512  */
513 static u32 htab_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
514 {
515 	struct bpf_insn *insn = insn_buf;
516 	const int ret = BPF_REG_0;
517 
518 	BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem,
519 		     (void *(*)(struct bpf_map *map, void *key))NULL));
520 	*insn++ = BPF_EMIT_CALL(BPF_CAST_CALL(__htab_map_lookup_elem));
521 	*insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 1);
522 	*insn++ = BPF_ALU64_IMM(BPF_ADD, ret,
523 				offsetof(struct htab_elem, key) +
524 				round_up(map->key_size, 8));
525 	return insn - insn_buf;
526 }
527 
528 static __always_inline void *__htab_lru_map_lookup_elem(struct bpf_map *map,
529 							void *key, const bool mark)
530 {
531 	struct htab_elem *l = __htab_map_lookup_elem(map, key);
532 
533 	if (l) {
534 		if (mark)
535 			bpf_lru_node_set_ref(&l->lru_node);
536 		return l->key + round_up(map->key_size, 8);
537 	}
538 
539 	return NULL;
540 }
541 
542 static void *htab_lru_map_lookup_elem(struct bpf_map *map, void *key)
543 {
544 	return __htab_lru_map_lookup_elem(map, key, true);
545 }
546 
547 static void *htab_lru_map_lookup_elem_sys(struct bpf_map *map, void *key)
548 {
549 	return __htab_lru_map_lookup_elem(map, key, false);
550 }
551 
552 static u32 htab_lru_map_gen_lookup(struct bpf_map *map,
553 				   struct bpf_insn *insn_buf)
554 {
555 	struct bpf_insn *insn = insn_buf;
556 	const int ret = BPF_REG_0;
557 	const int ref_reg = BPF_REG_1;
558 
559 	BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem,
560 		     (void *(*)(struct bpf_map *map, void *key))NULL));
561 	*insn++ = BPF_EMIT_CALL(BPF_CAST_CALL(__htab_map_lookup_elem));
562 	*insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 4);
563 	*insn++ = BPF_LDX_MEM(BPF_B, ref_reg, ret,
564 			      offsetof(struct htab_elem, lru_node) +
565 			      offsetof(struct bpf_lru_node, ref));
566 	*insn++ = BPF_JMP_IMM(BPF_JNE, ref_reg, 0, 1);
567 	*insn++ = BPF_ST_MEM(BPF_B, ret,
568 			     offsetof(struct htab_elem, lru_node) +
569 			     offsetof(struct bpf_lru_node, ref),
570 			     1);
571 	*insn++ = BPF_ALU64_IMM(BPF_ADD, ret,
572 				offsetof(struct htab_elem, key) +
573 				round_up(map->key_size, 8));
574 	return insn - insn_buf;
575 }
576 
577 /* It is called from the bpf_lru_list when the LRU needs to delete
578  * older elements from the htab.
579  */
580 static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node)
581 {
582 	struct bpf_htab *htab = (struct bpf_htab *)arg;
583 	struct htab_elem *l = NULL, *tgt_l;
584 	struct hlist_nulls_head *head;
585 	struct hlist_nulls_node *n;
586 	unsigned long flags;
587 	struct bucket *b;
588 
589 	tgt_l = container_of(node, struct htab_elem, lru_node);
590 	b = __select_bucket(htab, tgt_l->hash);
591 	head = &b->head;
592 
593 	raw_spin_lock_irqsave(&b->lock, flags);
594 
595 	hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
596 		if (l == tgt_l) {
597 			hlist_nulls_del_rcu(&l->hash_node);
598 			break;
599 		}
600 
601 	raw_spin_unlock_irqrestore(&b->lock, flags);
602 
603 	return l == tgt_l;
604 }
605 
606 /* Called from syscall */
607 static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
608 {
609 	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
610 	struct hlist_nulls_head *head;
611 	struct htab_elem *l, *next_l;
612 	u32 hash, key_size;
613 	int i = 0;
614 
615 	WARN_ON_ONCE(!rcu_read_lock_held());
616 
617 	key_size = map->key_size;
618 
619 	if (!key)
620 		goto find_first_elem;
621 
622 	hash = htab_map_hash(key, key_size, htab->hashrnd);
623 
624 	head = select_bucket(htab, hash);
625 
626 	/* lookup the key */
627 	l = lookup_nulls_elem_raw(head, hash, key, key_size, htab->n_buckets);
628 
629 	if (!l)
630 		goto find_first_elem;
631 
632 	/* key was found, get next key in the same bucket */
633 	next_l = hlist_nulls_entry_safe(rcu_dereference_raw(hlist_nulls_next_rcu(&l->hash_node)),
634 				  struct htab_elem, hash_node);
635 
636 	if (next_l) {
637 		/* if next elem in this hash list is non-zero, just return it */
638 		memcpy(next_key, next_l->key, key_size);
639 		return 0;
640 	}
641 
642 	/* no more elements in this hash list, go to the next bucket */
643 	i = hash & (htab->n_buckets - 1);
644 	i++;
645 
646 find_first_elem:
647 	/* iterate over buckets */
648 	for (; i < htab->n_buckets; i++) {
649 		head = select_bucket(htab, i);
650 
651 		/* pick first element in the bucket */
652 		next_l = hlist_nulls_entry_safe(rcu_dereference_raw(hlist_nulls_first_rcu(head)),
653 					  struct htab_elem, hash_node);
654 		if (next_l) {
655 			/* if it's not empty, just return it */
656 			memcpy(next_key, next_l->key, key_size);
657 			return 0;
658 		}
659 	}
660 
661 	/* iterated over all buckets and all elements */
662 	return -ENOENT;
663 }
664 
665 static void htab_elem_free(struct bpf_htab *htab, struct htab_elem *l)
666 {
667 	if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH)
668 		free_percpu(htab_elem_get_ptr(l, htab->map.key_size));
669 	kfree(l);
670 }
671 
672 static void htab_elem_free_rcu(struct rcu_head *head)
673 {
674 	struct htab_elem *l = container_of(head, struct htab_elem, rcu);
675 	struct bpf_htab *htab = l->htab;
676 
677 	/* must increment bpf_prog_active to avoid kprobe+bpf triggering while
678 	 * we're calling kfree, otherwise deadlock is possible if kprobes
679 	 * are placed somewhere inside of slub
680 	 */
681 	preempt_disable();
682 	__this_cpu_inc(bpf_prog_active);
683 	htab_elem_free(htab, l);
684 	__this_cpu_dec(bpf_prog_active);
685 	preempt_enable();
686 }
687 
688 static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
689 {
690 	struct bpf_map *map = &htab->map;
691 
692 	if (map->ops->map_fd_put_ptr) {
693 		void *ptr = fd_htab_map_get_ptr(map, l);
694 
695 		map->ops->map_fd_put_ptr(ptr);
696 	}
697 
698 	if (htab_is_prealloc(htab)) {
699 		__pcpu_freelist_push(&htab->freelist, &l->fnode);
700 	} else {
701 		atomic_dec(&htab->count);
702 		l->htab = htab;
703 		call_rcu(&l->rcu, htab_elem_free_rcu);
704 	}
705 }
706 
707 static void pcpu_copy_value(struct bpf_htab *htab, void __percpu *pptr,
708 			    void *value, bool onallcpus)
709 {
710 	if (!onallcpus) {
711 		/* copy true value_size bytes */
712 		memcpy(this_cpu_ptr(pptr), value, htab->map.value_size);
713 	} else {
714 		u32 size = round_up(htab->map.value_size, 8);
715 		int off = 0, cpu;
716 
717 		for_each_possible_cpu(cpu) {
718 			bpf_long_memcpy(per_cpu_ptr(pptr, cpu),
719 					value + off, size);
720 			off += size;
721 		}
722 	}
723 }
724 
725 static bool fd_htab_map_needs_adjust(const struct bpf_htab *htab)
726 {
727 	return htab->map.map_type == BPF_MAP_TYPE_HASH_OF_MAPS &&
728 	       BITS_PER_LONG == 64;
729 }
730 
731 static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
732 					 void *value, u32 key_size, u32 hash,
733 					 bool percpu, bool onallcpus,
734 					 struct htab_elem *old_elem)
735 {
736 	u32 size = htab->map.value_size;
737 	bool prealloc = htab_is_prealloc(htab);
738 	struct htab_elem *l_new, **pl_new;
739 	void __percpu *pptr;
740 
741 	if (prealloc) {
742 		if (old_elem) {
743 			/* if we're updating the existing element,
744 			 * use per-cpu extra elems to avoid freelist_pop/push
745 			 */
746 			pl_new = this_cpu_ptr(htab->extra_elems);
747 			l_new = *pl_new;
748 			*pl_new = old_elem;
749 		} else {
750 			struct pcpu_freelist_node *l;
751 
752 			l = __pcpu_freelist_pop(&htab->freelist);
753 			if (!l)
754 				return ERR_PTR(-E2BIG);
755 			l_new = container_of(l, struct htab_elem, fnode);
756 		}
757 	} else {
758 		if (atomic_inc_return(&htab->count) > htab->map.max_entries)
759 			if (!old_elem) {
760 				/* when map is full and update() is replacing
761 				 * old element, it's ok to allocate, since
762 				 * old element will be freed immediately.
763 				 * Otherwise return an error
764 				 */
765 				l_new = ERR_PTR(-E2BIG);
766 				goto dec_count;
767 			}
768 		l_new = kmalloc_node(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN,
769 				     htab->map.numa_node);
770 		if (!l_new) {
771 			l_new = ERR_PTR(-ENOMEM);
772 			goto dec_count;
773 		}
774 		check_and_init_map_lock(&htab->map,
775 					l_new->key + round_up(key_size, 8));
776 	}
777 
778 	memcpy(l_new->key, key, key_size);
779 	if (percpu) {
780 		size = round_up(size, 8);
781 		if (prealloc) {
782 			pptr = htab_elem_get_ptr(l_new, key_size);
783 		} else {
784 			/* alloc_percpu zero-fills */
785 			pptr = __alloc_percpu_gfp(size, 8,
786 						  GFP_ATOMIC | __GFP_NOWARN);
787 			if (!pptr) {
788 				kfree(l_new);
789 				l_new = ERR_PTR(-ENOMEM);
790 				goto dec_count;
791 			}
792 		}
793 
794 		pcpu_copy_value(htab, pptr, value, onallcpus);
795 
796 		if (!prealloc)
797 			htab_elem_set_ptr(l_new, key_size, pptr);
798 	} else if (fd_htab_map_needs_adjust(htab)) {
799 		size = round_up(size, 8);
800 		memcpy(l_new->key + round_up(key_size, 8), value, size);
801 	} else {
802 		copy_map_value(&htab->map,
803 			       l_new->key + round_up(key_size, 8),
804 			       value);
805 	}
806 
807 	l_new->hash = hash;
808 	return l_new;
809 dec_count:
810 	atomic_dec(&htab->count);
811 	return l_new;
812 }
813 
814 static int check_flags(struct bpf_htab *htab, struct htab_elem *l_old,
815 		       u64 map_flags)
816 {
817 	if (l_old && (map_flags & ~BPF_F_LOCK) == BPF_NOEXIST)
818 		/* elem already exists */
819 		return -EEXIST;
820 
821 	if (!l_old && (map_flags & ~BPF_F_LOCK) == BPF_EXIST)
822 		/* elem doesn't exist, cannot update it */
823 		return -ENOENT;
824 
825 	return 0;
826 }
827 
828 /* Called from syscall or from eBPF program */
829 static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
830 				u64 map_flags)
831 {
832 	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
833 	struct htab_elem *l_new = NULL, *l_old;
834 	struct hlist_nulls_head *head;
835 	unsigned long flags;
836 	struct bucket *b;
837 	u32 key_size, hash;
838 	int ret;
839 
840 	if (unlikely((map_flags & ~BPF_F_LOCK) > BPF_EXIST))
841 		/* unknown flags */
842 		return -EINVAL;
843 
844 	WARN_ON_ONCE(!rcu_read_lock_held());
845 
846 	key_size = map->key_size;
847 
848 	hash = htab_map_hash(key, key_size, htab->hashrnd);
849 
850 	b = __select_bucket(htab, hash);
851 	head = &b->head;
852 
853 	if (unlikely(map_flags & BPF_F_LOCK)) {
854 		if (unlikely(!map_value_has_spin_lock(map)))
855 			return -EINVAL;
856 		/* find an element without taking the bucket lock */
857 		l_old = lookup_nulls_elem_raw(head, hash, key, key_size,
858 					      htab->n_buckets);
859 		ret = check_flags(htab, l_old, map_flags);
860 		if (ret)
861 			return ret;
862 		if (l_old) {
863 			/* grab the element lock and update value in place */
864 			copy_map_value_locked(map,
865 					      l_old->key + round_up(key_size, 8),
866 					      value, false);
867 			return 0;
868 		}
869 		/* fall through, grab the bucket lock and lookup again.
870 		 * 99.9% chance that the element won't be found,
871 		 * but second lookup under lock has to be done.
872 		 */
873 	}
874 
875 	/* bpf_map_update_elem() can be called in_irq() */
876 	raw_spin_lock_irqsave(&b->lock, flags);
877 
878 	l_old = lookup_elem_raw(head, hash, key, key_size);
879 
880 	ret = check_flags(htab, l_old, map_flags);
881 	if (ret)
882 		goto err;
883 
884 	if (unlikely(l_old && (map_flags & BPF_F_LOCK))) {
885 		/* first lookup without the bucket lock didn't find the element,
886 		 * but second lookup with the bucket lock found it.
887 		 * This case is highly unlikely, but has to be dealt with:
888 		 * grab the element lock in addition to the bucket lock
889 		 * and update element in place
890 		 */
891 		copy_map_value_locked(map,
892 				      l_old->key + round_up(key_size, 8),
893 				      value, false);
894 		ret = 0;
895 		goto err;
896 	}
897 
898 	l_new = alloc_htab_elem(htab, key, value, key_size, hash, false, false,
899 				l_old);
900 	if (IS_ERR(l_new)) {
901 		/* all pre-allocated elements are in use or memory exhausted */
902 		ret = PTR_ERR(l_new);
903 		goto err;
904 	}
905 
906 	/* add new element to the head of the list, so that
907 	 * concurrent search will find it before old elem
908 	 */
909 	hlist_nulls_add_head_rcu(&l_new->hash_node, head);
910 	if (l_old) {
911 		hlist_nulls_del_rcu(&l_old->hash_node);
912 		if (!htab_is_prealloc(htab))
913 			free_htab_elem(htab, l_old);
914 	}
915 	ret = 0;
916 err:
917 	raw_spin_unlock_irqrestore(&b->lock, flags);
918 	return ret;
919 }
920 
921 static int htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value,
922 				    u64 map_flags)
923 {
924 	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
925 	struct htab_elem *l_new, *l_old = NULL;
926 	struct hlist_nulls_head *head;
927 	unsigned long flags;
928 	struct bucket *b;
929 	u32 key_size, hash;
930 	int ret;
931 
932 	if (unlikely(map_flags > BPF_EXIST))
933 		/* unknown flags */
934 		return -EINVAL;
935 
936 	WARN_ON_ONCE(!rcu_read_lock_held());
937 
938 	key_size = map->key_size;
939 
940 	hash = htab_map_hash(key, key_size, htab->hashrnd);
941 
942 	b = __select_bucket(htab, hash);
943 	head = &b->head;
944 
945 	/* For LRU, we need to alloc before taking bucket's
946 	 * spinlock because getting free nodes from LRU may need
947 	 * to remove older elements from htab and this removal
948 	 * operation will need a bucket lock.
949 	 */
950 	l_new = prealloc_lru_pop(htab, key, hash);
951 	if (!l_new)
952 		return -ENOMEM;
953 	memcpy(l_new->key + round_up(map->key_size, 8), value, map->value_size);
954 
955 	/* bpf_map_update_elem() can be called in_irq() */
956 	raw_spin_lock_irqsave(&b->lock, flags);
957 
958 	l_old = lookup_elem_raw(head, hash, key, key_size);
959 
960 	ret = check_flags(htab, l_old, map_flags);
961 	if (ret)
962 		goto err;
963 
964 	/* add new element to the head of the list, so that
965 	 * concurrent search will find it before old elem
966 	 */
967 	hlist_nulls_add_head_rcu(&l_new->hash_node, head);
968 	if (l_old) {
969 		bpf_lru_node_set_ref(&l_new->lru_node);
970 		hlist_nulls_del_rcu(&l_old->hash_node);
971 	}
972 	ret = 0;
973 
974 err:
975 	raw_spin_unlock_irqrestore(&b->lock, flags);
976 
977 	if (ret)
978 		bpf_lru_push_free(&htab->lru, &l_new->lru_node);
979 	else if (l_old)
980 		bpf_lru_push_free(&htab->lru, &l_old->lru_node);
981 
982 	return ret;
983 }
984 
985 static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key,
986 					 void *value, u64 map_flags,
987 					 bool onallcpus)
988 {
989 	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
990 	struct htab_elem *l_new = NULL, *l_old;
991 	struct hlist_nulls_head *head;
992 	unsigned long flags;
993 	struct bucket *b;
994 	u32 key_size, hash;
995 	int ret;
996 
997 	if (unlikely(map_flags > BPF_EXIST))
998 		/* unknown flags */
999 		return -EINVAL;
1000 
1001 	WARN_ON_ONCE(!rcu_read_lock_held());
1002 
1003 	key_size = map->key_size;
1004 
1005 	hash = htab_map_hash(key, key_size, htab->hashrnd);
1006 
1007 	b = __select_bucket(htab, hash);
1008 	head = &b->head;
1009 
1010 	/* bpf_map_update_elem() can be called in_irq() */
1011 	raw_spin_lock_irqsave(&b->lock, flags);
1012 
1013 	l_old = lookup_elem_raw(head, hash, key, key_size);
1014 
1015 	ret = check_flags(htab, l_old, map_flags);
1016 	if (ret)
1017 		goto err;
1018 
1019 	if (l_old) {
1020 		/* per-cpu hash map can update value in-place */
1021 		pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size),
1022 				value, onallcpus);
1023 	} else {
1024 		l_new = alloc_htab_elem(htab, key, value, key_size,
1025 					hash, true, onallcpus, NULL);
1026 		if (IS_ERR(l_new)) {
1027 			ret = PTR_ERR(l_new);
1028 			goto err;
1029 		}
1030 		hlist_nulls_add_head_rcu(&l_new->hash_node, head);
1031 	}
1032 	ret = 0;
1033 err:
1034 	raw_spin_unlock_irqrestore(&b->lock, flags);
1035 	return ret;
1036 }
1037 
1038 static int __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
1039 					     void *value, u64 map_flags,
1040 					     bool onallcpus)
1041 {
1042 	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
1043 	struct htab_elem *l_new = NULL, *l_old;
1044 	struct hlist_nulls_head *head;
1045 	unsigned long flags;
1046 	struct bucket *b;
1047 	u32 key_size, hash;
1048 	int ret;
1049 
1050 	if (unlikely(map_flags > BPF_EXIST))
1051 		/* unknown flags */
1052 		return -EINVAL;
1053 
1054 	WARN_ON_ONCE(!rcu_read_lock_held());
1055 
1056 	key_size = map->key_size;
1057 
1058 	hash = htab_map_hash(key, key_size, htab->hashrnd);
1059 
1060 	b = __select_bucket(htab, hash);
1061 	head = &b->head;
1062 
1063 	/* For LRU, we need to alloc before taking bucket's
1064 	 * spinlock because LRU's elem alloc may need
1065 	 * to remove older elem from htab and this removal
1066 	 * operation will need a bucket lock.
1067 	 */
1068 	if (map_flags != BPF_EXIST) {
1069 		l_new = prealloc_lru_pop(htab, key, hash);
1070 		if (!l_new)
1071 			return -ENOMEM;
1072 	}
1073 
1074 	/* bpf_map_update_elem() can be called in_irq() */
1075 	raw_spin_lock_irqsave(&b->lock, flags);
1076 
1077 	l_old = lookup_elem_raw(head, hash, key, key_size);
1078 
1079 	ret = check_flags(htab, l_old, map_flags);
1080 	if (ret)
1081 		goto err;
1082 
1083 	if (l_old) {
1084 		bpf_lru_node_set_ref(&l_old->lru_node);
1085 
1086 		/* per-cpu hash map can update value in-place */
1087 		pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size),
1088 				value, onallcpus);
1089 	} else {
1090 		pcpu_copy_value(htab, htab_elem_get_ptr(l_new, key_size),
1091 				value, onallcpus);
1092 		hlist_nulls_add_head_rcu(&l_new->hash_node, head);
1093 		l_new = NULL;
1094 	}
1095 	ret = 0;
1096 err:
1097 	raw_spin_unlock_irqrestore(&b->lock, flags);
1098 	if (l_new)
1099 		bpf_lru_push_free(&htab->lru, &l_new->lru_node);
1100 	return ret;
1101 }
1102 
1103 static int htab_percpu_map_update_elem(struct bpf_map *map, void *key,
1104 				       void *value, u64 map_flags)
1105 {
1106 	return __htab_percpu_map_update_elem(map, key, value, map_flags, false);
1107 }
1108 
1109 static int htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
1110 					   void *value, u64 map_flags)
1111 {
1112 	return __htab_lru_percpu_map_update_elem(map, key, value, map_flags,
1113 						 false);
1114 }
1115 
1116 /* Called from syscall or from eBPF program */
1117 static int htab_map_delete_elem(struct bpf_map *map, void *key)
1118 {
1119 	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
1120 	struct hlist_nulls_head *head;
1121 	struct bucket *b;
1122 	struct htab_elem *l;
1123 	unsigned long flags;
1124 	u32 hash, key_size;
1125 	int ret = -ENOENT;
1126 
1127 	WARN_ON_ONCE(!rcu_read_lock_held());
1128 
1129 	key_size = map->key_size;
1130 
1131 	hash = htab_map_hash(key, key_size, htab->hashrnd);
1132 	b = __select_bucket(htab, hash);
1133 	head = &b->head;
1134 
1135 	raw_spin_lock_irqsave(&b->lock, flags);
1136 
1137 	l = lookup_elem_raw(head, hash, key, key_size);
1138 
1139 	if (l) {
1140 		hlist_nulls_del_rcu(&l->hash_node);
1141 		free_htab_elem(htab, l);
1142 		ret = 0;
1143 	}
1144 
1145 	raw_spin_unlock_irqrestore(&b->lock, flags);
1146 	return ret;
1147 }
1148 
1149 static int htab_lru_map_delete_elem(struct bpf_map *map, void *key)
1150 {
1151 	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
1152 	struct hlist_nulls_head *head;
1153 	struct bucket *b;
1154 	struct htab_elem *l;
1155 	unsigned long flags;
1156 	u32 hash, key_size;
1157 	int ret = -ENOENT;
1158 
1159 	WARN_ON_ONCE(!rcu_read_lock_held());
1160 
1161 	key_size = map->key_size;
1162 
1163 	hash = htab_map_hash(key, key_size, htab->hashrnd);
1164 	b = __select_bucket(htab, hash);
1165 	head = &b->head;
1166 
1167 	raw_spin_lock_irqsave(&b->lock, flags);
1168 
1169 	l = lookup_elem_raw(head, hash, key, key_size);
1170 
1171 	if (l) {
1172 		hlist_nulls_del_rcu(&l->hash_node);
1173 		ret = 0;
1174 	}
1175 
1176 	raw_spin_unlock_irqrestore(&b->lock, flags);
1177 	if (l)
1178 		bpf_lru_push_free(&htab->lru, &l->lru_node);
1179 	return ret;
1180 }
1181 
1182 static void delete_all_elements(struct bpf_htab *htab)
1183 {
1184 	int i;
1185 
1186 	for (i = 0; i < htab->n_buckets; i++) {
1187 		struct hlist_nulls_head *head = select_bucket(htab, i);
1188 		struct hlist_nulls_node *n;
1189 		struct htab_elem *l;
1190 
1191 		hlist_nulls_for_each_entry_safe(l, n, head, hash_node) {
1192 			hlist_nulls_del_rcu(&l->hash_node);
1193 			htab_elem_free(htab, l);
1194 		}
1195 	}
1196 }
1197 
1198 /* Called when map->refcnt goes to zero, either from workqueue or from syscall */
1199 static void htab_map_free(struct bpf_map *map)
1200 {
1201 	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
1202 
1203 	/* at this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
1204 	 * so the programs (can be more than one that used this map) were
1205 	 * disconnected from events. Wait for outstanding critical sections in
1206 	 * these programs to complete
1207 	 */
1208 	synchronize_rcu();
1209 
1210 	/* some of free_htab_elem() callbacks for elements of this map may
1211 	 * not have executed. Wait for them.
1212 	 */
1213 	rcu_barrier();
1214 	if (!htab_is_prealloc(htab))
1215 		delete_all_elements(htab);
1216 	else
1217 		prealloc_destroy(htab);
1218 
1219 	free_percpu(htab->extra_elems);
1220 	bpf_map_area_free(htab->buckets);
1221 	kfree(htab);
1222 }
1223 
1224 static void htab_map_seq_show_elem(struct bpf_map *map, void *key,
1225 				   struct seq_file *m)
1226 {
1227 	void *value;
1228 
1229 	rcu_read_lock();
1230 
1231 	value = htab_map_lookup_elem(map, key);
1232 	if (!value) {
1233 		rcu_read_unlock();
1234 		return;
1235 	}
1236 
1237 	btf_type_seq_show(map->btf, map->btf_key_type_id, key, m);
1238 	seq_puts(m, ": ");
1239 	btf_type_seq_show(map->btf, map->btf_value_type_id, value, m);
1240 	seq_puts(m, "\n");
1241 
1242 	rcu_read_unlock();
1243 }
1244 
1245 static int
1246 __htab_map_lookup_and_delete_batch(struct bpf_map *map,
1247 				   const union bpf_attr *attr,
1248 				   union bpf_attr __user *uattr,
1249 				   bool do_delete, bool is_lru_map,
1250 				   bool is_percpu)
1251 {
1252 	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
1253 	u32 bucket_cnt, total, key_size, value_size, roundup_key_size;
1254 	void *keys = NULL, *values = NULL, *value, *dst_key, *dst_val;
1255 	void __user *uvalues = u64_to_user_ptr(attr->batch.values);
1256 	void __user *ukeys = u64_to_user_ptr(attr->batch.keys);
1257 	void *ubatch = u64_to_user_ptr(attr->batch.in_batch);
1258 	u32 batch, max_count, size, bucket_size;
1259 	u64 elem_map_flags, map_flags;
1260 	struct hlist_nulls_head *head;
1261 	struct hlist_nulls_node *n;
1262 	unsigned long flags;
1263 	struct htab_elem *l;
1264 	struct bucket *b;
1265 	int ret = 0;
1266 
1267 	elem_map_flags = attr->batch.elem_flags;
1268 	if ((elem_map_flags & ~BPF_F_LOCK) ||
1269 	    ((elem_map_flags & BPF_F_LOCK) && !map_value_has_spin_lock(map)))
1270 		return -EINVAL;
1271 
1272 	map_flags = attr->batch.flags;
1273 	if (map_flags)
1274 		return -EINVAL;
1275 
1276 	max_count = attr->batch.count;
1277 	if (!max_count)
1278 		return 0;
1279 
1280 	if (put_user(0, &uattr->batch.count))
1281 		return -EFAULT;
1282 
1283 	batch = 0;
1284 	if (ubatch && copy_from_user(&batch, ubatch, sizeof(batch)))
1285 		return -EFAULT;
1286 
1287 	if (batch >= htab->n_buckets)
1288 		return -ENOENT;
1289 
1290 	key_size = htab->map.key_size;
1291 	roundup_key_size = round_up(htab->map.key_size, 8);
1292 	value_size = htab->map.value_size;
1293 	size = round_up(value_size, 8);
1294 	if (is_percpu)
1295 		value_size = size * num_possible_cpus();
1296 	total = 0;
1297 	/* while experimenting with hash tables with sizes ranging from 10 to
1298 	 * 1000, it was observed that a bucket can have upto 5 entries.
1299 	 */
1300 	bucket_size = 5;
1301 
1302 alloc:
1303 	/* We cannot do copy_from_user or copy_to_user inside
1304 	 * the rcu_read_lock. Allocate enough space here.
1305 	 */
1306 	keys = kvmalloc(key_size * bucket_size, GFP_USER | __GFP_NOWARN);
1307 	values = kvmalloc(value_size * bucket_size, GFP_USER | __GFP_NOWARN);
1308 	if (!keys || !values) {
1309 		ret = -ENOMEM;
1310 		goto after_loop;
1311 	}
1312 
1313 again:
1314 	preempt_disable();
1315 	this_cpu_inc(bpf_prog_active);
1316 	rcu_read_lock();
1317 again_nocopy:
1318 	dst_key = keys;
1319 	dst_val = values;
1320 	b = &htab->buckets[batch];
1321 	head = &b->head;
1322 	raw_spin_lock_irqsave(&b->lock, flags);
1323 
1324 	bucket_cnt = 0;
1325 	hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
1326 		bucket_cnt++;
1327 
1328 	if (bucket_cnt > (max_count - total)) {
1329 		if (total == 0)
1330 			ret = -ENOSPC;
1331 		raw_spin_unlock_irqrestore(&b->lock, flags);
1332 		rcu_read_unlock();
1333 		this_cpu_dec(bpf_prog_active);
1334 		preempt_enable();
1335 		goto after_loop;
1336 	}
1337 
1338 	if (bucket_cnt > bucket_size) {
1339 		bucket_size = bucket_cnt;
1340 		raw_spin_unlock_irqrestore(&b->lock, flags);
1341 		rcu_read_unlock();
1342 		this_cpu_dec(bpf_prog_active);
1343 		preempt_enable();
1344 		kvfree(keys);
1345 		kvfree(values);
1346 		goto alloc;
1347 	}
1348 
1349 	hlist_nulls_for_each_entry_safe(l, n, head, hash_node) {
1350 		memcpy(dst_key, l->key, key_size);
1351 
1352 		if (is_percpu) {
1353 			int off = 0, cpu;
1354 			void __percpu *pptr;
1355 
1356 			pptr = htab_elem_get_ptr(l, map->key_size);
1357 			for_each_possible_cpu(cpu) {
1358 				bpf_long_memcpy(dst_val + off,
1359 						per_cpu_ptr(pptr, cpu), size);
1360 				off += size;
1361 			}
1362 		} else {
1363 			value = l->key + roundup_key_size;
1364 			if (elem_map_flags & BPF_F_LOCK)
1365 				copy_map_value_locked(map, dst_val, value,
1366 						      true);
1367 			else
1368 				copy_map_value(map, dst_val, value);
1369 			check_and_init_map_lock(map, dst_val);
1370 		}
1371 		if (do_delete) {
1372 			hlist_nulls_del_rcu(&l->hash_node);
1373 			if (is_lru_map)
1374 				bpf_lru_push_free(&htab->lru, &l->lru_node);
1375 			else
1376 				free_htab_elem(htab, l);
1377 		}
1378 		dst_key += key_size;
1379 		dst_val += value_size;
1380 	}
1381 
1382 	raw_spin_unlock_irqrestore(&b->lock, flags);
1383 	/* If we are not copying data, we can go to next bucket and avoid
1384 	 * unlocking the rcu.
1385 	 */
1386 	if (!bucket_cnt && (batch + 1 < htab->n_buckets)) {
1387 		batch++;
1388 		goto again_nocopy;
1389 	}
1390 
1391 	rcu_read_unlock();
1392 	this_cpu_dec(bpf_prog_active);
1393 	preempt_enable();
1394 	if (bucket_cnt && (copy_to_user(ukeys + total * key_size, keys,
1395 	    key_size * bucket_cnt) ||
1396 	    copy_to_user(uvalues + total * value_size, values,
1397 	    value_size * bucket_cnt))) {
1398 		ret = -EFAULT;
1399 		goto after_loop;
1400 	}
1401 
1402 	total += bucket_cnt;
1403 	batch++;
1404 	if (batch >= htab->n_buckets) {
1405 		ret = -ENOENT;
1406 		goto after_loop;
1407 	}
1408 	goto again;
1409 
1410 after_loop:
1411 	if (ret == -EFAULT)
1412 		goto out;
1413 
1414 	/* copy # of entries and next batch */
1415 	ubatch = u64_to_user_ptr(attr->batch.out_batch);
1416 	if (copy_to_user(ubatch, &batch, sizeof(batch)) ||
1417 	    put_user(total, &uattr->batch.count))
1418 		ret = -EFAULT;
1419 
1420 out:
1421 	kvfree(keys);
1422 	kvfree(values);
1423 	return ret;
1424 }
1425 
1426 static int
1427 htab_percpu_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr,
1428 			     union bpf_attr __user *uattr)
1429 {
1430 	return __htab_map_lookup_and_delete_batch(map, attr, uattr, false,
1431 						  false, true);
1432 }
1433 
1434 static int
1435 htab_percpu_map_lookup_and_delete_batch(struct bpf_map *map,
1436 					const union bpf_attr *attr,
1437 					union bpf_attr __user *uattr)
1438 {
1439 	return __htab_map_lookup_and_delete_batch(map, attr, uattr, true,
1440 						  false, true);
1441 }
1442 
1443 static int
1444 htab_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr,
1445 		      union bpf_attr __user *uattr)
1446 {
1447 	return __htab_map_lookup_and_delete_batch(map, attr, uattr, false,
1448 						  false, false);
1449 }
1450 
1451 static int
1452 htab_map_lookup_and_delete_batch(struct bpf_map *map,
1453 				 const union bpf_attr *attr,
1454 				 union bpf_attr __user *uattr)
1455 {
1456 	return __htab_map_lookup_and_delete_batch(map, attr, uattr, true,
1457 						  false, false);
1458 }
1459 
1460 static int
1461 htab_lru_percpu_map_lookup_batch(struct bpf_map *map,
1462 				 const union bpf_attr *attr,
1463 				 union bpf_attr __user *uattr)
1464 {
1465 	return __htab_map_lookup_and_delete_batch(map, attr, uattr, false,
1466 						  true, true);
1467 }
1468 
1469 static int
1470 htab_lru_percpu_map_lookup_and_delete_batch(struct bpf_map *map,
1471 					    const union bpf_attr *attr,
1472 					    union bpf_attr __user *uattr)
1473 {
1474 	return __htab_map_lookup_and_delete_batch(map, attr, uattr, true,
1475 						  true, true);
1476 }
1477 
1478 static int
1479 htab_lru_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr,
1480 			  union bpf_attr __user *uattr)
1481 {
1482 	return __htab_map_lookup_and_delete_batch(map, attr, uattr, false,
1483 						  true, false);
1484 }
1485 
1486 static int
1487 htab_lru_map_lookup_and_delete_batch(struct bpf_map *map,
1488 				     const union bpf_attr *attr,
1489 				     union bpf_attr __user *uattr)
1490 {
1491 	return __htab_map_lookup_and_delete_batch(map, attr, uattr, true,
1492 						  true, false);
1493 }
1494 
1495 const struct bpf_map_ops htab_map_ops = {
1496 	.map_alloc_check = htab_map_alloc_check,
1497 	.map_alloc = htab_map_alloc,
1498 	.map_free = htab_map_free,
1499 	.map_get_next_key = htab_map_get_next_key,
1500 	.map_lookup_elem = htab_map_lookup_elem,
1501 	.map_update_elem = htab_map_update_elem,
1502 	.map_delete_elem = htab_map_delete_elem,
1503 	.map_gen_lookup = htab_map_gen_lookup,
1504 	.map_seq_show_elem = htab_map_seq_show_elem,
1505 	BATCH_OPS(htab),
1506 };
1507 
1508 const struct bpf_map_ops htab_lru_map_ops = {
1509 	.map_alloc_check = htab_map_alloc_check,
1510 	.map_alloc = htab_map_alloc,
1511 	.map_free = htab_map_free,
1512 	.map_get_next_key = htab_map_get_next_key,
1513 	.map_lookup_elem = htab_lru_map_lookup_elem,
1514 	.map_lookup_elem_sys_only = htab_lru_map_lookup_elem_sys,
1515 	.map_update_elem = htab_lru_map_update_elem,
1516 	.map_delete_elem = htab_lru_map_delete_elem,
1517 	.map_gen_lookup = htab_lru_map_gen_lookup,
1518 	.map_seq_show_elem = htab_map_seq_show_elem,
1519 	BATCH_OPS(htab_lru),
1520 };
1521 
1522 /* Called from eBPF program */
1523 static void *htab_percpu_map_lookup_elem(struct bpf_map *map, void *key)
1524 {
1525 	struct htab_elem *l = __htab_map_lookup_elem(map, key);
1526 
1527 	if (l)
1528 		return this_cpu_ptr(htab_elem_get_ptr(l, map->key_size));
1529 	else
1530 		return NULL;
1531 }
1532 
1533 static void *htab_lru_percpu_map_lookup_elem(struct bpf_map *map, void *key)
1534 {
1535 	struct htab_elem *l = __htab_map_lookup_elem(map, key);
1536 
1537 	if (l) {
1538 		bpf_lru_node_set_ref(&l->lru_node);
1539 		return this_cpu_ptr(htab_elem_get_ptr(l, map->key_size));
1540 	}
1541 
1542 	return NULL;
1543 }
1544 
1545 int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value)
1546 {
1547 	struct htab_elem *l;
1548 	void __percpu *pptr;
1549 	int ret = -ENOENT;
1550 	int cpu, off = 0;
1551 	u32 size;
1552 
1553 	/* per_cpu areas are zero-filled and bpf programs can only
1554 	 * access 'value_size' of them, so copying rounded areas
1555 	 * will not leak any kernel data
1556 	 */
1557 	size = round_up(map->value_size, 8);
1558 	rcu_read_lock();
1559 	l = __htab_map_lookup_elem(map, key);
1560 	if (!l)
1561 		goto out;
1562 	/* We do not mark LRU map element here in order to not mess up
1563 	 * eviction heuristics when user space does a map walk.
1564 	 */
1565 	pptr = htab_elem_get_ptr(l, map->key_size);
1566 	for_each_possible_cpu(cpu) {
1567 		bpf_long_memcpy(value + off,
1568 				per_cpu_ptr(pptr, cpu), size);
1569 		off += size;
1570 	}
1571 	ret = 0;
1572 out:
1573 	rcu_read_unlock();
1574 	return ret;
1575 }
1576 
1577 int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value,
1578 			   u64 map_flags)
1579 {
1580 	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
1581 	int ret;
1582 
1583 	rcu_read_lock();
1584 	if (htab_is_lru(htab))
1585 		ret = __htab_lru_percpu_map_update_elem(map, key, value,
1586 							map_flags, true);
1587 	else
1588 		ret = __htab_percpu_map_update_elem(map, key, value, map_flags,
1589 						    true);
1590 	rcu_read_unlock();
1591 
1592 	return ret;
1593 }
1594 
1595 static void htab_percpu_map_seq_show_elem(struct bpf_map *map, void *key,
1596 					  struct seq_file *m)
1597 {
1598 	struct htab_elem *l;
1599 	void __percpu *pptr;
1600 	int cpu;
1601 
1602 	rcu_read_lock();
1603 
1604 	l = __htab_map_lookup_elem(map, key);
1605 	if (!l) {
1606 		rcu_read_unlock();
1607 		return;
1608 	}
1609 
1610 	btf_type_seq_show(map->btf, map->btf_key_type_id, key, m);
1611 	seq_puts(m, ": {\n");
1612 	pptr = htab_elem_get_ptr(l, map->key_size);
1613 	for_each_possible_cpu(cpu) {
1614 		seq_printf(m, "\tcpu%d: ", cpu);
1615 		btf_type_seq_show(map->btf, map->btf_value_type_id,
1616 				  per_cpu_ptr(pptr, cpu), m);
1617 		seq_puts(m, "\n");
1618 	}
1619 	seq_puts(m, "}\n");
1620 
1621 	rcu_read_unlock();
1622 }
1623 
1624 const struct bpf_map_ops htab_percpu_map_ops = {
1625 	.map_alloc_check = htab_map_alloc_check,
1626 	.map_alloc = htab_map_alloc,
1627 	.map_free = htab_map_free,
1628 	.map_get_next_key = htab_map_get_next_key,
1629 	.map_lookup_elem = htab_percpu_map_lookup_elem,
1630 	.map_update_elem = htab_percpu_map_update_elem,
1631 	.map_delete_elem = htab_map_delete_elem,
1632 	.map_seq_show_elem = htab_percpu_map_seq_show_elem,
1633 	BATCH_OPS(htab_percpu),
1634 };
1635 
1636 const struct bpf_map_ops htab_lru_percpu_map_ops = {
1637 	.map_alloc_check = htab_map_alloc_check,
1638 	.map_alloc = htab_map_alloc,
1639 	.map_free = htab_map_free,
1640 	.map_get_next_key = htab_map_get_next_key,
1641 	.map_lookup_elem = htab_lru_percpu_map_lookup_elem,
1642 	.map_update_elem = htab_lru_percpu_map_update_elem,
1643 	.map_delete_elem = htab_lru_map_delete_elem,
1644 	.map_seq_show_elem = htab_percpu_map_seq_show_elem,
1645 	BATCH_OPS(htab_lru_percpu),
1646 };
1647 
1648 static int fd_htab_map_alloc_check(union bpf_attr *attr)
1649 {
1650 	if (attr->value_size != sizeof(u32))
1651 		return -EINVAL;
1652 	return htab_map_alloc_check(attr);
1653 }
1654 
1655 static void fd_htab_map_free(struct bpf_map *map)
1656 {
1657 	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
1658 	struct hlist_nulls_node *n;
1659 	struct hlist_nulls_head *head;
1660 	struct htab_elem *l;
1661 	int i;
1662 
1663 	for (i = 0; i < htab->n_buckets; i++) {
1664 		head = select_bucket(htab, i);
1665 
1666 		hlist_nulls_for_each_entry_safe(l, n, head, hash_node) {
1667 			void *ptr = fd_htab_map_get_ptr(map, l);
1668 
1669 			map->ops->map_fd_put_ptr(ptr);
1670 		}
1671 	}
1672 
1673 	htab_map_free(map);
1674 }
1675 
1676 /* only called from syscall */
1677 int bpf_fd_htab_map_lookup_elem(struct bpf_map *map, void *key, u32 *value)
1678 {
1679 	void **ptr;
1680 	int ret = 0;
1681 
1682 	if (!map->ops->map_fd_sys_lookup_elem)
1683 		return -ENOTSUPP;
1684 
1685 	rcu_read_lock();
1686 	ptr = htab_map_lookup_elem(map, key);
1687 	if (ptr)
1688 		*value = map->ops->map_fd_sys_lookup_elem(READ_ONCE(*ptr));
1689 	else
1690 		ret = -ENOENT;
1691 	rcu_read_unlock();
1692 
1693 	return ret;
1694 }
1695 
1696 /* only called from syscall */
1697 int bpf_fd_htab_map_update_elem(struct bpf_map *map, struct file *map_file,
1698 				void *key, void *value, u64 map_flags)
1699 {
1700 	void *ptr;
1701 	int ret;
1702 	u32 ufd = *(u32 *)value;
1703 
1704 	ptr = map->ops->map_fd_get_ptr(map, map_file, ufd);
1705 	if (IS_ERR(ptr))
1706 		return PTR_ERR(ptr);
1707 
1708 	ret = htab_map_update_elem(map, key, &ptr, map_flags);
1709 	if (ret)
1710 		map->ops->map_fd_put_ptr(ptr);
1711 
1712 	return ret;
1713 }
1714 
1715 static struct bpf_map *htab_of_map_alloc(union bpf_attr *attr)
1716 {
1717 	struct bpf_map *map, *inner_map_meta;
1718 
1719 	inner_map_meta = bpf_map_meta_alloc(attr->inner_map_fd);
1720 	if (IS_ERR(inner_map_meta))
1721 		return inner_map_meta;
1722 
1723 	map = htab_map_alloc(attr);
1724 	if (IS_ERR(map)) {
1725 		bpf_map_meta_free(inner_map_meta);
1726 		return map;
1727 	}
1728 
1729 	map->inner_map_meta = inner_map_meta;
1730 
1731 	return map;
1732 }
1733 
1734 static void *htab_of_map_lookup_elem(struct bpf_map *map, void *key)
1735 {
1736 	struct bpf_map **inner_map  = htab_map_lookup_elem(map, key);
1737 
1738 	if (!inner_map)
1739 		return NULL;
1740 
1741 	return READ_ONCE(*inner_map);
1742 }
1743 
1744 static u32 htab_of_map_gen_lookup(struct bpf_map *map,
1745 				  struct bpf_insn *insn_buf)
1746 {
1747 	struct bpf_insn *insn = insn_buf;
1748 	const int ret = BPF_REG_0;
1749 
1750 	BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem,
1751 		     (void *(*)(struct bpf_map *map, void *key))NULL));
1752 	*insn++ = BPF_EMIT_CALL(BPF_CAST_CALL(__htab_map_lookup_elem));
1753 	*insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 2);
1754 	*insn++ = BPF_ALU64_IMM(BPF_ADD, ret,
1755 				offsetof(struct htab_elem, key) +
1756 				round_up(map->key_size, 8));
1757 	*insn++ = BPF_LDX_MEM(BPF_DW, ret, ret, 0);
1758 
1759 	return insn - insn_buf;
1760 }
1761 
1762 static void htab_of_map_free(struct bpf_map *map)
1763 {
1764 	bpf_map_meta_free(map->inner_map_meta);
1765 	fd_htab_map_free(map);
1766 }
1767 
1768 const struct bpf_map_ops htab_of_maps_map_ops = {
1769 	.map_alloc_check = fd_htab_map_alloc_check,
1770 	.map_alloc = htab_of_map_alloc,
1771 	.map_free = htab_of_map_free,
1772 	.map_get_next_key = htab_map_get_next_key,
1773 	.map_lookup_elem = htab_of_map_lookup_elem,
1774 	.map_delete_elem = htab_map_delete_elem,
1775 	.map_fd_get_ptr = bpf_map_fd_get_ptr,
1776 	.map_fd_put_ptr = bpf_map_fd_put_ptr,
1777 	.map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem,
1778 	.map_gen_lookup = htab_of_map_gen_lookup,
1779 	.map_check_btf = map_check_no_btf,
1780 };
1781