xref: /openbmc/linux/kernel/bpf/stackmap.c (revision 1e328ed5)
// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2016 Facebook
 */
#include <linux/bpf.h>
#include <linux/jhash.h>
#include <linux/filter.h>
#include <linux/kernel.h>
#include <linux/stacktrace.h>
#include <linux/perf_event.h>
#include <linux/irq_work.h>
#include <linux/btf_ids.h>
#include <linux/buildid.h>
#include "percpu_freelist.h"

#define STACK_CREATE_FLAG_MASK					\
	(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY |	\
	 BPF_F_STACK_BUILD_ID)

struct stack_map_bucket {
	struct pcpu_freelist_node fnode;
	u32 hash;
	u32 nr;
	u64 data[];
};

struct bpf_stack_map {
	struct bpf_map map;
	void *elems;
	struct pcpu_freelist freelist;
	u32 n_buckets;
	struct stack_map_bucket *buckets[];
};
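
/*
 * Each map value lives in a stack_map_bucket whose flexible data[]
 * array holds either raw instruction pointers (u64) or struct
 * bpf_stack_build_id entries, depending on BPF_F_STACK_BUILD_ID.
 * buckets[] is a power-of-two hash table indexed by
 * jhash2(ips) & (n_buckets - 1); the buckets themselves are
 * preallocated in elems and handed out through the per-cpu freelist,
 * so no memory is allocated on the update path.
 */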

/* irq_work to run up_read() for build_id lookup in nmi context */
struct stack_map_irq_work {
	struct irq_work irq_work;
	struct mm_struct *mm;
};

static void do_up_read(struct irq_work *entry)
{
	struct stack_map_irq_work *work;

	if (WARN_ON_ONCE(IS_ENABLED(CONFIG_PREEMPT_RT)))
		return;

	work = container_of(entry, struct stack_map_irq_work, irq_work);
	mmap_read_unlock_non_owner(work->mm);
}

static DEFINE_PER_CPU(struct stack_map_irq_work, up_read_work);

static inline bool stack_map_use_build_id(struct bpf_map *map)
{
	return (map->map_flags & BPF_F_STACK_BUILD_ID);
}

static inline int stack_map_data_size(struct bpf_map *map)
{
	return stack_map_use_build_id(map) ?
		sizeof(struct bpf_stack_build_id) : sizeof(u64);
}
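
/*
 * From the map user's point of view, a value is an array of
 * map->value_size / stack_map_data_size(map) entries.  A minimal
 * sketch of the two layouts, for an arbitrary value_size chosen at
 * map creation time:
 *
 *	default:		u64 ips[value_size / 8];
 *	BPF_F_STACK_BUILD_ID:	struct bpf_stack_build_id
 *				id_offs[value_size /
 *					sizeof(struct bpf_stack_build_id)];
 */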

static int prealloc_elems_and_freelist(struct bpf_stack_map *smap)
{
	u32 elem_size = sizeof(struct stack_map_bucket) + smap->map.value_size;
	int err;

	smap->elems = bpf_map_area_alloc(elem_size * smap->map.max_entries,
					 smap->map.numa_node);
	if (!smap->elems)
		return -ENOMEM;

	err = pcpu_freelist_init(&smap->freelist);
	if (err)
		goto free_elems;

	pcpu_freelist_populate(&smap->freelist, smap->elems, elem_size,
			       smap->map.max_entries);
	return 0;

free_elems:
	bpf_map_area_free(smap->elems);
	return err;
}
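
/*
 * Note: elem_size above covers the stack_map_bucket header plus
 * value_size bytes of stack data, so pcpu_freelist_populate() carves
 * max_entries such slots out of a single bpf_map_area_alloc() region.
 * Every bucket that is ever installed in buckets[] comes from, and is
 * eventually returned to, this freelist.
 */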

/* Called from syscall */
static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
{
	u32 value_size = attr->value_size;
	struct bpf_stack_map *smap;
	u64 cost, n_buckets;
	int err;

	if (!bpf_capable())
		return ERR_PTR(-EPERM);

	if (attr->map_flags & ~STACK_CREATE_FLAG_MASK)
		return ERR_PTR(-EINVAL);

	/* check sanity of attributes */
	if (attr->max_entries == 0 || attr->key_size != 4 ||
	    value_size < 8 || value_size % 8)
		return ERR_PTR(-EINVAL);

	BUILD_BUG_ON(sizeof(struct bpf_stack_build_id) % sizeof(u64));
	if (attr->map_flags & BPF_F_STACK_BUILD_ID) {
		if (value_size % sizeof(struct bpf_stack_build_id) ||
		    value_size / sizeof(struct bpf_stack_build_id)
		    > sysctl_perf_event_max_stack)
			return ERR_PTR(-EINVAL);
	} else if (value_size / 8 > sysctl_perf_event_max_stack)
		return ERR_PTR(-EINVAL);

	/* hash table size must be power of 2 */
	n_buckets = roundup_pow_of_two(attr->max_entries);

	cost = n_buckets * sizeof(struct stack_map_bucket *) + sizeof(*smap);
	cost += n_buckets * (value_size + sizeof(struct stack_map_bucket));
	smap = bpf_map_area_alloc(cost, bpf_map_attr_numa_node(attr));
	if (!smap)
		return ERR_PTR(-ENOMEM);

	bpf_map_init_from_attr(&smap->map, attr);
	smap->map.value_size = value_size;
	smap->n_buckets = n_buckets;

	err = get_callchain_buffers(sysctl_perf_event_max_stack);
	if (err)
		goto free_smap;

	err = prealloc_elems_and_freelist(smap);
	if (err)
		goto put_buffers;

	return &smap->map;

put_buffers:
	put_callchain_buffers();
free_smap:
	bpf_map_area_free(smap);
	return ERR_PTR(err);
}
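
/*
 * Illustrative sketch (not part of this file): creating such a map
 * from user space, assuming libbpf's bpf_map_create() is available,
 * might look like
 *
 *	// key is the u32 stack id, value holds up to 127 u64 ips
 *	int fd = bpf_map_create(BPF_MAP_TYPE_STACK_TRACE, "stacks",
 *				sizeof(__u32), 127 * sizeof(__u64),
 *				10000, NULL);
 *
 * key_size must be 4 and value_size a multiple of 8, matching the
 * checks in stack_map_alloc() above; max_entries is rounded up to a
 * power of two for the bucket array.
 */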

static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
					  u64 *ips, u32 trace_nr, bool user)
{
	int i;
	struct vm_area_struct *vma;
	bool irq_work_busy = false;
	struct stack_map_irq_work *work = NULL;

	if (irqs_disabled()) {
		if (!IS_ENABLED(CONFIG_PREEMPT_RT)) {
			work = this_cpu_ptr(&up_read_work);
			if (irq_work_is_busy(&work->irq_work)) {
				/* cannot queue more up_read, fallback */
				irq_work_busy = true;
			}
		} else {
			/*
			 * PREEMPT_RT does not allow trylocking the mmap sem
			 * in interrupt-disabled context. Force the fallback
			 * code.
			 */
			irq_work_busy = true;
		}
	}

	/*
	 * We cannot do up_read() while irqs are disabled, because of the
	 * risk of deadlocking on rq_lock. To do the build_id lookup with
	 * irqs disabled, we need to run up_read() from irq_work, using a
	 * percpu variable. If that irq_work is already in use by another
	 * lookup, we fall back to reporting raw ips.
	 *
	 * The same fallback is used for kernel stacks (!user) on a
	 * stackmap with build_id.
	 */
	if (!user || !current || !current->mm || irq_work_busy ||
	    !mmap_read_trylock_non_owner(current->mm)) {
		/* cannot access current->mm, fall back to ips */
		for (i = 0; i < trace_nr; i++) {
			id_offs[i].status = BPF_STACK_BUILD_ID_IP;
			id_offs[i].ip = ips[i];
			memset(id_offs[i].build_id, 0, BUILD_ID_SIZE_MAX);
		}
		return;
	}

	for (i = 0; i < trace_nr; i++) {
		vma = find_vma(current->mm, ips[i]);
		if (!vma || build_id_parse(vma, id_offs[i].build_id, NULL)) {
			/* per-entry fallback to ips */
			id_offs[i].status = BPF_STACK_BUILD_ID_IP;
			id_offs[i].ip = ips[i];
			memset(id_offs[i].build_id, 0, BUILD_ID_SIZE_MAX);
			continue;
		}
		id_offs[i].offset = (vma->vm_pgoff << PAGE_SHIFT) + ips[i]
			- vma->vm_start;
		id_offs[i].status = BPF_STACK_BUILD_ID_VALID;
	}

	if (!work) {
		mmap_read_unlock_non_owner(current->mm);
	} else {
		work->mm = current->mm;
		irq_work_queue(&work->irq_work);
	}
}
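
/*
 * On the fallback paths above, each entry is reported as
 * BPF_STACK_BUILD_ID_IP with a zeroed build_id, so consumers still get
 * the raw instruction pointer.  Only entries whose vma and build_id
 * could be resolved carry BPF_STACK_BUILD_ID_VALID together with a
 * (build_id, file offset) pair, which stays meaningful across ASLR.
 */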

static struct perf_callchain_entry *
get_callchain_entry_for_task(struct task_struct *task, u32 init_nr)
{
#ifdef CONFIG_STACKTRACE
	struct perf_callchain_entry *entry;
	int rctx;

	entry = get_callchain_entry(&rctx);

	if (!entry)
		return NULL;

	entry->nr = init_nr +
		stack_trace_save_tsk(task, (unsigned long *)(entry->ip + init_nr),
				     sysctl_perf_event_max_stack - init_nr, 0);

	/* stack_trace_save_tsk() works on an unsigned long array, while
	 * perf_callchain_entry uses a u64 array. On 32-bit systems it is
	 * necessary to fix up this mismatch.
	 */
	if (__BITS_PER_LONG != 64) {
		unsigned long *from = (unsigned long *) entry->ip;
		u64 *to = entry->ip;
		int i;

		/* copy data from the end to avoid using an extra buffer */
		for (i = entry->nr - 1; i >= (int)init_nr; i--)
			to[i] = (u64)(from[i]);
	}

	put_callchain_entry(rctx);

	return entry;
#else /* CONFIG_STACKTRACE */
	return NULL;
#endif
}
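
/*
 * Note on init_nr: it is the number of unused slots at the front of
 * the perf_callchain_entry.  Both here and in get_perf_callchain(),
 * frames are stored starting at entry->ip + init_nr, so at most
 * (sysctl_perf_event_max_stack - init_nr) frames are collected and
 * the callers below treat entry->ip[init_nr .. nr - 1] as the trace.
 */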

static long __bpf_get_stackid(struct bpf_map *map,
			      struct perf_callchain_entry *trace, u64 flags)
{
	struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
	struct stack_map_bucket *bucket, *new_bucket, *old_bucket;
	u32 max_depth = map->value_size / stack_map_data_size(map);
	/* stack_map_alloc() checks that max_depth <= sysctl_perf_event_max_stack */
	u32 init_nr = sysctl_perf_event_max_stack - max_depth;
	u32 skip = flags & BPF_F_SKIP_FIELD_MASK;
	u32 hash, id, trace_nr, trace_len;
	bool user = flags & BPF_F_USER_STACK;
	u64 *ips;
	bool hash_matches;

	/* get_perf_callchain() guarantees that trace->nr >= init_nr
	 * and trace->nr <= sysctl_perf_event_max_stack, so trace_nr <= max_depth
	 */
	trace_nr = trace->nr - init_nr;

	if (trace_nr <= skip)
		/* skipping more than usable stack trace */
		return -EFAULT;

	trace_nr -= skip;
	trace_len = trace_nr * sizeof(u64);
	ips = trace->ip + skip + init_nr;
	hash = jhash2((u32 *)ips, trace_len / sizeof(u32), 0);
	id = hash & (smap->n_buckets - 1);
	bucket = READ_ONCE(smap->buckets[id]);

	hash_matches = bucket && bucket->hash == hash;
	/* fast cmp */
	if (hash_matches && flags & BPF_F_FAST_STACK_CMP)
		return id;

	if (stack_map_use_build_id(map)) {
		/* for build_id+offset, pop a bucket before slow cmp */
		new_bucket = (struct stack_map_bucket *)
			pcpu_freelist_pop(&smap->freelist);
		if (unlikely(!new_bucket))
			return -ENOMEM;
		new_bucket->nr = trace_nr;
		stack_map_get_build_id_offset(
			(struct bpf_stack_build_id *)new_bucket->data,
			ips, trace_nr, user);
		trace_len = trace_nr * sizeof(struct bpf_stack_build_id);
		if (hash_matches && bucket->nr == trace_nr &&
		    memcmp(bucket->data, new_bucket->data, trace_len) == 0) {
			pcpu_freelist_push(&smap->freelist, &new_bucket->fnode);
			return id;
		}
		if (bucket && !(flags & BPF_F_REUSE_STACKID)) {
			pcpu_freelist_push(&smap->freelist, &new_bucket->fnode);
			return -EEXIST;
		}
	} else {
		if (hash_matches && bucket->nr == trace_nr &&
		    memcmp(bucket->data, ips, trace_len) == 0)
			return id;
		if (bucket && !(flags & BPF_F_REUSE_STACKID))
			return -EEXIST;

		new_bucket = (struct stack_map_bucket *)
			pcpu_freelist_pop(&smap->freelist);
		if (unlikely(!new_bucket))
			return -ENOMEM;
		memcpy(new_bucket->data, ips, trace_len);
	}

	new_bucket->hash = hash;
	new_bucket->nr = trace_nr;

	old_bucket = xchg(&smap->buckets[id], new_bucket);
	if (old_bucket)
		pcpu_freelist_push(&smap->freelist, &old_bucket->fnode);
	return id;
}
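
/*
 * The id returned by __bpf_get_stackid() is simply the bucket index,
 * so two different stacks that collide in the hash table compete for
 * one slot.  Without BPF_F_REUSE_STACKID the existing bucket wins and
 * the caller gets -EEXIST; with it, the new stack evicts the old one
 * (whose bucket goes back to the freelist), so ids stay stable only
 * for stacks that are still resident in the map.
 */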

BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
	   u64, flags)
{
	u32 max_depth = map->value_size / stack_map_data_size(map);
	/* stack_map_alloc() checks that max_depth <= sysctl_perf_event_max_stack */
	u32 init_nr = sysctl_perf_event_max_stack - max_depth;
	bool user = flags & BPF_F_USER_STACK;
	struct perf_callchain_entry *trace;
	bool kernel = !user;

	if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK |
			       BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID)))
		return -EINVAL;

	trace = get_perf_callchain(regs, init_nr, kernel, user,
				   sysctl_perf_event_max_stack, false, false);

	if (unlikely(!trace))
		/* couldn't fetch the stack trace */
		return -EFAULT;

	return __bpf_get_stackid(map, trace, flags);
}

const struct bpf_func_proto bpf_get_stackid_proto = {
	.func		= bpf_get_stackid,
	.gpl_only	= true,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_CONST_MAP_PTR,
	.arg3_type	= ARG_ANYTHING,
};
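
/*
 * Illustrative sketch (not part of this file): typical BPF-side usage,
 * assuming the program has access to a BPF_MAP_TYPE_STACK_TRACE map
 * named "stacks":
 *
 *	long id = bpf_get_stackid(ctx, &stacks,
 *				  BPF_F_USER_STACK | BPF_F_FAST_STACK_CMP);
 *	if (id >= 0)
 *		// aggregate by id, e.g. bump a per-stack counter
 *
 * The id can later be used as the key for a BPF_MAP_LOOKUP_ELEM
 * syscall on the stack map (served by bpf_stackmap_copy() below).
 */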

static __u64 count_kernel_ip(struct perf_callchain_entry *trace)
{
	__u64 nr_kernel = 0;

	while (nr_kernel < trace->nr) {
		if (trace->ip[nr_kernel] == PERF_CONTEXT_USER)
			break;
		nr_kernel++;
	}
	return nr_kernel;
}
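
/*
 * Perf callchains store the kernel frames first and insert the
 * PERF_CONTEXT_USER marker before the user frames, so the loop above
 * counts the leading entries that precede that marker, i.e. the
 * kernel part of the trace (including its context marker).
 */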

BPF_CALL_3(bpf_get_stackid_pe, struct bpf_perf_event_data_kern *, ctx,
	   struct bpf_map *, map, u64, flags)
{
	struct perf_event *event = ctx->event;
	struct perf_callchain_entry *trace;
	bool kernel, user;
	__u64 nr_kernel;
	int ret;

	/* perf_sample_data doesn't have callchain, use bpf_get_stackid */
	if (!(event->attr.sample_type & __PERF_SAMPLE_CALLCHAIN_EARLY))
		return bpf_get_stackid((unsigned long)(ctx->regs),
				       (unsigned long) map, flags, 0, 0);

	if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK |
			       BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID)))
		return -EINVAL;

	user = flags & BPF_F_USER_STACK;
	kernel = !user;

	trace = ctx->data->callchain;
	if (unlikely(!trace))
		return -EFAULT;

	nr_kernel = count_kernel_ip(trace);

	if (kernel) {
		__u64 nr = trace->nr;

		trace->nr = nr_kernel;
		ret = __bpf_get_stackid(map, trace, flags);

		/* restore nr */
		trace->nr = nr;
	} else { /* user */
		u64 skip = flags & BPF_F_SKIP_FIELD_MASK;

		skip += nr_kernel;
		if (skip > BPF_F_SKIP_FIELD_MASK)
			return -EFAULT;

		flags = (flags & ~BPF_F_SKIP_FIELD_MASK) | skip;
		ret = __bpf_get_stackid(map, trace, flags);
	}
	return ret;
}
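
/*
 * When the perf event collected the callchain early
 * (__PERF_SAMPLE_CALLCHAIN_EARLY), the same buffer holds both kernel
 * and user frames.  For a kernel stack the trace is simply truncated
 * to nr_kernel; for a user stack the kernel frames are skipped by
 * folding nr_kernel into the BPF_F_SKIP_FIELD_MASK portion of flags,
 * which is why overflowing that field is reported as -EFAULT.
 */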

const struct bpf_func_proto bpf_get_stackid_proto_pe = {
	.func		= bpf_get_stackid_pe,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_CONST_MAP_PTR,
	.arg3_type	= ARG_ANYTHING,
};

static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task,
			    struct perf_callchain_entry *trace_in,
			    void *buf, u32 size, u64 flags)
{
	u32 init_nr, trace_nr, copy_len, elem_size, num_elem;
	bool user_build_id = flags & BPF_F_USER_BUILD_ID;
	u32 skip = flags & BPF_F_SKIP_FIELD_MASK;
	bool user = flags & BPF_F_USER_STACK;
	struct perf_callchain_entry *trace;
	bool kernel = !user;
	int err = -EINVAL;
	u64 *ips;

	if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK |
			       BPF_F_USER_BUILD_ID)))
		goto clear;
	if (kernel && user_build_id)
		goto clear;

	elem_size = (user && user_build_id) ? sizeof(struct bpf_stack_build_id)
					    : sizeof(u64);
	if (unlikely(size % elem_size))
		goto clear;

	/* cannot get valid user stack for task without user_mode regs */
	if (task && user && !user_mode(regs))
		goto err_fault;

	num_elem = size / elem_size;
	if (sysctl_perf_event_max_stack < num_elem)
		init_nr = 0;
	else
		init_nr = sysctl_perf_event_max_stack - num_elem;

	if (trace_in)
		trace = trace_in;
	else if (kernel && task)
		trace = get_callchain_entry_for_task(task, init_nr);
	else
		trace = get_perf_callchain(regs, init_nr, kernel, user,
					   sysctl_perf_event_max_stack,
					   false, false);
	if (unlikely(!trace))
		goto err_fault;

	trace_nr = trace->nr - init_nr;
	if (trace_nr < skip)
		goto err_fault;

	trace_nr -= skip;
	trace_nr = (trace_nr <= num_elem) ? trace_nr : num_elem;
	copy_len = trace_nr * elem_size;
	ips = trace->ip + skip + init_nr;
	if (user && user_build_id)
		stack_map_get_build_id_offset(buf, ips, trace_nr, user);
	else
		memcpy(buf, ips, copy_len);

	if (size > copy_len)
		memset(buf + copy_len, 0, size - copy_len);
	return copy_len;

err_fault:
	err = -EFAULT;
clear:
	memset(buf, 0, size);
	return err;
}
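
/*
 * On success __bpf_get_stack() returns the number of bytes copied (a
 * multiple of elem_size) and zero-fills the rest of the buffer; on any
 * failure the whole buffer is cleared and a negative errno is
 * returned, so callers never see stale data.
 */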

BPF_CALL_4(bpf_get_stack, struct pt_regs *, regs, void *, buf, u32, size,
	   u64, flags)
{
	return __bpf_get_stack(regs, NULL, NULL, buf, size, flags);
}

const struct bpf_func_proto bpf_get_stack_proto = {
	.func		= bpf_get_stack,
	.gpl_only	= true,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_PTR_TO_UNINIT_MEM,
	.arg3_type	= ARG_CONST_SIZE_OR_ZERO,
	.arg4_type	= ARG_ANYTHING,
};
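
/*
 * Illustrative sketch (not part of this file): unlike bpf_get_stackid(),
 * bpf_get_stack() copies the trace into a buffer supplied by the BPF
 * program, e.g. (assuming a small scratch array that fits the BPF
 * program's stack limit):
 *
 *	__u64 ips[32];
 *	long len = bpf_get_stack(ctx, ips, sizeof(ips), BPF_F_USER_STACK);
 *	if (len > 0)
 *		// len / 8 user-space frames are now in ips[]
 */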

BPF_CALL_4(bpf_get_task_stack, struct task_struct *, task, void *, buf,
	   u32, size, u64, flags)
{
	struct pt_regs *regs = task_pt_regs(task);

	return __bpf_get_stack(regs, task, NULL, buf, size, flags);
}

BTF_ID_LIST_SINGLE(bpf_get_task_stack_btf_ids, struct, task_struct)

const struct bpf_func_proto bpf_get_task_stack_proto = {
	.func		= bpf_get_task_stack,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_BTF_ID,
	.arg1_btf_id	= &bpf_get_task_stack_btf_ids[0],
	.arg2_type	= ARG_PTR_TO_UNINIT_MEM,
	.arg3_type	= ARG_CONST_SIZE_OR_ZERO,
	.arg4_type	= ARG_ANYTHING,
};

BPF_CALL_4(bpf_get_stack_pe, struct bpf_perf_event_data_kern *, ctx,
	   void *, buf, u32, size, u64, flags)
{
	struct pt_regs *regs = (struct pt_regs *)(ctx->regs);
	struct perf_event *event = ctx->event;
	struct perf_callchain_entry *trace;
	bool kernel, user;
	int err = -EINVAL;
	__u64 nr_kernel;

	if (!(event->attr.sample_type & __PERF_SAMPLE_CALLCHAIN_EARLY))
		return __bpf_get_stack(regs, NULL, NULL, buf, size, flags);

	if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK |
			       BPF_F_USER_BUILD_ID)))
		goto clear;

	user = flags & BPF_F_USER_STACK;
	kernel = !user;

	err = -EFAULT;
	trace = ctx->data->callchain;
	if (unlikely(!trace))
		goto clear;

	nr_kernel = count_kernel_ip(trace);

	if (kernel) {
		__u64 nr = trace->nr;

		trace->nr = nr_kernel;
		err = __bpf_get_stack(regs, NULL, trace, buf, size, flags);

		/* restore nr */
		trace->nr = nr;
	} else { /* user */
		u64 skip = flags & BPF_F_SKIP_FIELD_MASK;

		skip += nr_kernel;
		if (skip > BPF_F_SKIP_FIELD_MASK)
			goto clear;

		flags = (flags & ~BPF_F_SKIP_FIELD_MASK) | skip;
		err = __bpf_get_stack(regs, NULL, trace, buf, size, flags);
	}
	return err;

clear:
	memset(buf, 0, size);
	return err;
}

const struct bpf_func_proto bpf_get_stack_proto_pe = {
	.func		= bpf_get_stack_pe,
	.gpl_only	= true,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_PTR_TO_UNINIT_MEM,
	.arg3_type	= ARG_CONST_SIZE_OR_ZERO,
	.arg4_type	= ARG_ANYTHING,
};

/* Called from eBPF program */
static void *stack_map_lookup_elem(struct bpf_map *map, void *key)
{
	return ERR_PTR(-EOPNOTSUPP);
}

/* Called from syscall */
int bpf_stackmap_copy(struct bpf_map *map, void *key, void *value)
{
	struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
	struct stack_map_bucket *bucket, *old_bucket;
	u32 id = *(u32 *)key, trace_len;

	if (unlikely(id >= smap->n_buckets))
		return -ENOENT;

	bucket = xchg(&smap->buckets[id], NULL);
	if (!bucket)
		return -ENOENT;

	trace_len = bucket->nr * stack_map_data_size(map);
	memcpy(value, bucket->data, trace_len);
	memset(value + trace_len, 0, map->value_size - trace_len);

	old_bucket = xchg(&smap->buckets[id], bucket);
	if (old_bucket)
		pcpu_freelist_push(&smap->freelist, &old_bucket->fnode);
	return 0;
}
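
/*
 * bpf_stackmap_copy() temporarily takes the bucket out of the table
 * with xchg() while copying and then puts it back.  If a concurrent
 * update installed a new bucket in the meantime, that newer bucket is
 * the one pushed back to the freelist, so the syscall side gets a
 * consistent snapshot without taking any lock.
 */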

static int stack_map_get_next_key(struct bpf_map *map, void *key,
				  void *next_key)
{
	struct bpf_stack_map *smap = container_of(map,
						  struct bpf_stack_map, map);
	u32 id;

	WARN_ON_ONCE(!rcu_read_lock_held());

	if (!key) {
		id = 0;
	} else {
		id = *(u32 *)key;
		if (id >= smap->n_buckets || !smap->buckets[id])
			id = 0;
		else
			id++;
	}

	while (id < smap->n_buckets && !smap->buckets[id])
		id++;

	if (id >= smap->n_buckets)
		return -ENOENT;

	*(u32 *)next_key = id;
	return 0;
}

static int stack_map_update_elem(struct bpf_map *map, void *key, void *value,
				 u64 map_flags)
{
	return -EINVAL;
}

/* Called from syscall or from eBPF program */
static int stack_map_delete_elem(struct bpf_map *map, void *key)
{
	struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
	struct stack_map_bucket *old_bucket;
	u32 id = *(u32 *)key;

	if (unlikely(id >= smap->n_buckets))
		return -E2BIG;

	old_bucket = xchg(&smap->buckets[id], NULL);
	if (old_bucket) {
		pcpu_freelist_push(&smap->freelist, &old_bucket->fnode);
		return 0;
	} else {
		return -ENOENT;
	}
}

/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
static void stack_map_free(struct bpf_map *map)
{
	struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);

	bpf_map_area_free(smap->elems);
	pcpu_freelist_destroy(&smap->freelist);
	bpf_map_area_free(smap);
	put_callchain_buffers();
}

static int stack_trace_map_btf_id;
const struct bpf_map_ops stack_trace_map_ops = {
	.map_meta_equal = bpf_map_meta_equal,
	.map_alloc = stack_map_alloc,
	.map_free = stack_map_free,
	.map_get_next_key = stack_map_get_next_key,
	.map_lookup_elem = stack_map_lookup_elem,
	.map_update_elem = stack_map_update_elem,
	.map_delete_elem = stack_map_delete_elem,
	.map_check_btf = map_check_no_btf,
	.map_btf_name = "bpf_stack_map",
	.map_btf_id = &stack_trace_map_btf_id,
};

static int __init stack_map_init(void)
{
	int cpu;
	struct stack_map_irq_work *work;

	for_each_possible_cpu(cpu) {
		work = per_cpu_ptr(&up_read_work, cpu);
		init_irq_work(&work->irq_work, do_up_read);
	}
	return 0;
}
subsys_initcall(stack_map_init);