xref: /openbmc/linux/kernel/bpf/helpers.c (revision 1a931707ad4a46e79d4ecfee56d8f6e8cc8d4f28)
1  // SPDX-License-Identifier: GPL-2.0-only
2  /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
3   */
4  #include <linux/bpf.h>
5  #include <linux/btf.h>
6  #include <linux/bpf-cgroup.h>
7  #include <linux/cgroup.h>
8  #include <linux/rcupdate.h>
9  #include <linux/random.h>
10  #include <linux/smp.h>
11  #include <linux/topology.h>
12  #include <linux/ktime.h>
13  #include <linux/sched.h>
14  #include <linux/uidgid.h>
15  #include <linux/filter.h>
16  #include <linux/ctype.h>
17  #include <linux/jiffies.h>
18  #include <linux/pid_namespace.h>
19  #include <linux/poison.h>
20  #include <linux/proc_ns.h>
21  #include <linux/sched/task.h>
22  #include <linux/security.h>
23  #include <linux/btf_ids.h>
24  #include <linux/bpf_mem_alloc.h>
25  
26  #include "../../lib/kstrtox.h"
27  
28  /* If a kernel subsystem allows eBPF programs to call this function, its
29   * verifier_ops->get_func_proto() callback should return
30   * bpf_map_lookup_elem_proto, so that the verifier can properly check the arguments.
31   *
32   * Different map implementations will rely on rcu in map methods
33   * lookup/update/delete, therefore eBPF programs must run under rcu lock
34   * if program is allowed to access maps, so check rcu_read_lock_held() or
35   * rcu_read_lock_trace_held() in all three functions.
36   */
37  BPF_CALL_2(bpf_map_lookup_elem, struct bpf_map *, map, void *, key)
38  {
39  	WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
40  		     !rcu_read_lock_bh_held());
41  	return (unsigned long) map->ops->map_lookup_elem(map, key);
42  }
43  
44  const struct bpf_func_proto bpf_map_lookup_elem_proto = {
45  	.func		= bpf_map_lookup_elem,
46  	.gpl_only	= false,
47  	.pkt_access	= true,
48  	.ret_type	= RET_PTR_TO_MAP_VALUE_OR_NULL,
49  	.arg1_type	= ARG_CONST_MAP_PTR,
50  	.arg2_type	= ARG_PTR_TO_MAP_KEY,
51  };
52  
53  BPF_CALL_4(bpf_map_update_elem, struct bpf_map *, map, void *, key,
54  	   void *, value, u64, flags)
55  {
56  	WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
57  		     !rcu_read_lock_bh_held());
58  	return map->ops->map_update_elem(map, key, value, flags);
59  }
60  
61  const struct bpf_func_proto bpf_map_update_elem_proto = {
62  	.func		= bpf_map_update_elem,
63  	.gpl_only	= false,
64  	.pkt_access	= true,
65  	.ret_type	= RET_INTEGER,
66  	.arg1_type	= ARG_CONST_MAP_PTR,
67  	.arg2_type	= ARG_PTR_TO_MAP_KEY,
68  	.arg3_type	= ARG_PTR_TO_MAP_VALUE,
69  	.arg4_type	= ARG_ANYTHING,
70  };
71  
72  BPF_CALL_2(bpf_map_delete_elem, struct bpf_map *, map, void *, key)
73  {
74  	WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
75  		     !rcu_read_lock_bh_held());
76  	return map->ops->map_delete_elem(map, key);
77  }
78  
79  const struct bpf_func_proto bpf_map_delete_elem_proto = {
80  	.func		= bpf_map_delete_elem,
81  	.gpl_only	= false,
82  	.pkt_access	= true,
83  	.ret_type	= RET_INTEGER,
84  	.arg1_type	= ARG_CONST_MAP_PTR,
85  	.arg2_type	= ARG_PTR_TO_MAP_KEY,
86  };
87  
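An illustrative BPF-program-side sketch (not part of this file) of how the three map helpers above are typically used, following libbpf's bpf_helpers.h conventions; the map name, tracepoint, and types are assumptions:

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(max_entries, 1024);
	__type(key, u32);
	__type(value, u64);
} counts SEC(".maps");				/* hypothetical map */

SEC("tracepoint/syscalls/sys_enter_openat")
int count_openat(void *ctx)
{
	u32 key = bpf_get_current_pid_tgid() >> 32;	/* tgid as key */
	u64 one = 1, *val;

	val = bpf_map_lookup_elem(&counts, &key);	/* may return NULL */
	if (val)
		__sync_fetch_and_add(val, 1);
	else
		bpf_map_update_elem(&counts, &key, &one, BPF_NOEXIST);
	return 0;
}

char LICENSE[] SEC("license") = "GPL";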
88  BPF_CALL_3(bpf_map_push_elem, struct bpf_map *, map, void *, value, u64, flags)
89  {
90  	return map->ops->map_push_elem(map, value, flags);
91  }
92  
93  const struct bpf_func_proto bpf_map_push_elem_proto = {
94  	.func		= bpf_map_push_elem,
95  	.gpl_only	= false,
96  	.pkt_access	= true,
97  	.ret_type	= RET_INTEGER,
98  	.arg1_type	= ARG_CONST_MAP_PTR,
99  	.arg2_type	= ARG_PTR_TO_MAP_VALUE,
100  	.arg3_type	= ARG_ANYTHING,
101  };
102  
103  BPF_CALL_2(bpf_map_pop_elem, struct bpf_map *, map, void *, value)
104  {
105  	return map->ops->map_pop_elem(map, value);
106  }
107  
108  const struct bpf_func_proto bpf_map_pop_elem_proto = {
109  	.func		= bpf_map_pop_elem,
110  	.gpl_only	= false,
111  	.ret_type	= RET_INTEGER,
112  	.arg1_type	= ARG_CONST_MAP_PTR,
113  	.arg2_type	= ARG_PTR_TO_MAP_VALUE | MEM_UNINIT | MEM_WRITE,
114  };
115  
116  BPF_CALL_2(bpf_map_peek_elem, struct bpf_map *, map, void *, value)
117  {
118  	return map->ops->map_peek_elem(map, value);
119  }
120  
121  const struct bpf_func_proto bpf_map_peek_elem_proto = {
122  	.func		= bpf_map_peek_elem,
123  	.gpl_only	= false,
124  	.ret_type	= RET_INTEGER,
125  	.arg1_type	= ARG_CONST_MAP_PTR,
126  	.arg2_type	= ARG_PTR_TO_MAP_VALUE | MEM_UNINIT | MEM_WRITE,
127  };
128  
129  BPF_CALL_3(bpf_map_lookup_percpu_elem, struct bpf_map *, map, void *, key, u32, cpu)
130  {
131  	WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_bh_held());
132  	return (unsigned long) map->ops->map_lookup_percpu_elem(map, key, cpu);
133  }
134  
135  const struct bpf_func_proto bpf_map_lookup_percpu_elem_proto = {
136  	.func		= bpf_map_lookup_percpu_elem,
137  	.gpl_only	= false,
138  	.pkt_access	= true,
139  	.ret_type	= RET_PTR_TO_MAP_VALUE_OR_NULL,
140  	.arg1_type	= ARG_CONST_MAP_PTR,
141  	.arg2_type	= ARG_PTR_TO_MAP_KEY,
142  	.arg3_type	= ARG_ANYTHING,
143  };
144  
145  const struct bpf_func_proto bpf_get_prandom_u32_proto = {
146  	.func		= bpf_user_rnd_u32,
147  	.gpl_only	= false,
148  	.ret_type	= RET_INTEGER,
149  };
150  
151  BPF_CALL_0(bpf_get_smp_processor_id)
152  {
153  	return smp_processor_id();
154  }
155  
156  const struct bpf_func_proto bpf_get_smp_processor_id_proto = {
157  	.func		= bpf_get_smp_processor_id,
158  	.gpl_only	= false,
159  	.ret_type	= RET_INTEGER,
160  };
161  
162  BPF_CALL_0(bpf_get_numa_node_id)
163  {
164  	return numa_node_id();
165  }
166  
167  const struct bpf_func_proto bpf_get_numa_node_id_proto = {
168  	.func		= bpf_get_numa_node_id,
169  	.gpl_only	= false,
170  	.ret_type	= RET_INTEGER,
171  };
172  
173  BPF_CALL_0(bpf_ktime_get_ns)
174  {
175  	/* NMI safe access to clock monotonic */
176  	return ktime_get_mono_fast_ns();
177  }
178  
179  const struct bpf_func_proto bpf_ktime_get_ns_proto = {
180  	.func		= bpf_ktime_get_ns,
181  	.gpl_only	= false,
182  	.ret_type	= RET_INTEGER,
183  };
184  
185  BPF_CALL_0(bpf_ktime_get_boot_ns)
186  {
187  	/* NMI safe access to clock boottime */
188  	return ktime_get_boot_fast_ns();
189  }
190  
191  const struct bpf_func_proto bpf_ktime_get_boot_ns_proto = {
192  	.func		= bpf_ktime_get_boot_ns,
193  	.gpl_only	= false,
194  	.ret_type	= RET_INTEGER,
195  };
196  
197  BPF_CALL_0(bpf_ktime_get_coarse_ns)
198  {
199  	return ktime_get_coarse_ns();
200  }
201  
202  const struct bpf_func_proto bpf_ktime_get_coarse_ns_proto = {
203  	.func		= bpf_ktime_get_coarse_ns,
204  	.gpl_only	= false,
205  	.ret_type	= RET_INTEGER,
206  };
207  
208  BPF_CALL_0(bpf_ktime_get_tai_ns)
209  {
210  	/* NMI safe access to clock tai */
211  	return ktime_get_tai_fast_ns();
212  }
213  
214  const struct bpf_func_proto bpf_ktime_get_tai_ns_proto = {
215  	.func		= bpf_ktime_get_tai_ns,
216  	.gpl_only	= false,
217  	.ret_type	= RET_INTEGER,
218  };
219  
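A hedged BPF-program-side sketch of the clock helpers above (same vmlinux.h/bpf_helpers.h setup as the earlier sketch; the tracepoint is an assumption):

SEC("tracepoint/syscalls/sys_enter_read")
int clocks(void *ctx)
{
	u64 mono = bpf_ktime_get_ns();		/* CLOCK_MONOTONIC, excludes suspend time */
	u64 boot = bpf_ktime_get_boot_ns();	/* CLOCK_BOOTTIME, includes suspend time */
	u64 tai  = bpf_ktime_get_tai_ns();	/* CLOCK_TAI */

	bpf_printk("mono=%llu boot=%llu tai=%llu", mono, boot, tai);
	return 0;
}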
220  BPF_CALL_0(bpf_get_current_pid_tgid)
221  {
222  	struct task_struct *task = current;
223  
224  	if (unlikely(!task))
225  		return -EINVAL;
226  
227  	return (u64) task->tgid << 32 | task->pid;
228  }
229  
230  const struct bpf_func_proto bpf_get_current_pid_tgid_proto = {
231  	.func		= bpf_get_current_pid_tgid,
232  	.gpl_only	= false,
233  	.ret_type	= RET_INTEGER,
234  };
235  
236  BPF_CALL_0(bpf_get_current_uid_gid)
237  {
238  	struct task_struct *task = current;
239  	kuid_t uid;
240  	kgid_t gid;
241  
242  	if (unlikely(!task))
243  		return -EINVAL;
244  
245  	current_uid_gid(&uid, &gid);
246  	return (u64) from_kgid(&init_user_ns, gid) << 32 |
247  		     from_kuid(&init_user_ns, uid);
248  }
249  
250  const struct bpf_func_proto bpf_get_current_uid_gid_proto = {
251  	.func		= bpf_get_current_uid_gid,
252  	.gpl_only	= false,
253  	.ret_type	= RET_INTEGER,
254  };
255  
256  BPF_CALL_2(bpf_get_current_comm, char *, buf, u32, size)
257  {
258  	struct task_struct *task = current;
259  
260  	if (unlikely(!task))
261  		goto err_clear;
262  
263  	/* Verifier guarantees that size > 0 */
264  	strscpy_pad(buf, task->comm, size);
265  	return 0;
266  err_clear:
267  	memset(buf, 0, size);
268  	return -EINVAL;
269  }
270  
271  const struct bpf_func_proto bpf_get_current_comm_proto = {
272  	.func		= bpf_get_current_comm,
273  	.gpl_only	= false,
274  	.ret_type	= RET_INTEGER,
275  	.arg1_type	= ARG_PTR_TO_UNINIT_MEM,
276  	.arg2_type	= ARG_CONST_SIZE,
277  };
278  
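The packing of the two 64-bit return values above is easy to get backwards; an illustrative sketch (assumed tracepoint, same includes as before):

SEC("tracepoint/syscalls/sys_enter_execve")
int whoami(void *ctx)
{
	char comm[16];
	u64 pid_tgid = bpf_get_current_pid_tgid();
	u64 uid_gid  = bpf_get_current_uid_gid();
	u32 tgid = pid_tgid >> 32;	/* userspace "PID" */
	u32 pid  = (u32)pid_tgid;	/* task->pid, i.e. the thread id */
	u32 uid  = (u32)uid_gid;	/* the gid sits in the upper 32 bits */

	bpf_get_current_comm(comm, sizeof(comm));
	bpf_printk("%s tgid=%u uid=%u", comm, tgid, uid);
	return 0;
}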
279  #if defined(CONFIG_QUEUED_SPINLOCKS) || defined(CONFIG_BPF_ARCH_SPINLOCK)
280  
281  static inline void __bpf_spin_lock(struct bpf_spin_lock *lock)
282  {
283  	arch_spinlock_t *l = (void *)lock;
284  	union {
285  		__u32 val;
286  		arch_spinlock_t lock;
287  	} u = { .lock = __ARCH_SPIN_LOCK_UNLOCKED };
288  
289  	compiletime_assert(u.val == 0, "__ARCH_SPIN_LOCK_UNLOCKED not 0");
290  	BUILD_BUG_ON(sizeof(*l) != sizeof(__u32));
291  	BUILD_BUG_ON(sizeof(*lock) != sizeof(__u32));
292  	preempt_disable();
293  	arch_spin_lock(l);
294  }
295  
296  static inline void __bpf_spin_unlock(struct bpf_spin_lock *lock)
297  {
298  	arch_spinlock_t *l = (void *)lock;
299  
300  	arch_spin_unlock(l);
301  	preempt_enable();
302  }
303  
304  #else
305  
306  static inline void __bpf_spin_lock(struct bpf_spin_lock *lock)
307  {
308  	atomic_t *l = (void *)lock;
309  
310  	BUILD_BUG_ON(sizeof(*l) != sizeof(*lock));
311  	do {
312  		atomic_cond_read_relaxed(l, !VAL);
313  	} while (atomic_xchg(l, 1));
314  }
315  
316  static inline void __bpf_spin_unlock(struct bpf_spin_lock *lock)
317  {
318  	atomic_t *l = (void *)lock;
319  
320  	atomic_set_release(l, 0);
321  }
322  
323  #endif
324  
325  static DEFINE_PER_CPU(unsigned long, irqsave_flags);
326  
327  static inline void __bpf_spin_lock_irqsave(struct bpf_spin_lock *lock)
328  {
329  	unsigned long flags;
330  
331  	local_irq_save(flags);
332  	__bpf_spin_lock(lock);
333  	__this_cpu_write(irqsave_flags, flags);
334  }
335  
336  NOTRACE_BPF_CALL_1(bpf_spin_lock, struct bpf_spin_lock *, lock)
337  {
338  	__bpf_spin_lock_irqsave(lock);
339  	return 0;
340  }
341  
342  const struct bpf_func_proto bpf_spin_lock_proto = {
343  	.func		= bpf_spin_lock,
344  	.gpl_only	= false,
345  	.ret_type	= RET_VOID,
346  	.arg1_type	= ARG_PTR_TO_SPIN_LOCK,
347  	.arg1_btf_id    = BPF_PTR_POISON,
348  };
349  
350  static inline void __bpf_spin_unlock_irqrestore(struct bpf_spin_lock *lock)
351  {
352  	unsigned long flags;
353  
354  	flags = __this_cpu_read(irqsave_flags);
355  	__bpf_spin_unlock(lock);
356  	local_irq_restore(flags);
357  }
358  
359  NOTRACE_BPF_CALL_1(bpf_spin_unlock, struct bpf_spin_lock *, lock)
360  {
361  	__bpf_spin_unlock_irqrestore(lock);
362  	return 0;
363  }
364  
365  const struct bpf_func_proto bpf_spin_unlock_proto = {
366  	.func		= bpf_spin_unlock,
367  	.gpl_only	= false,
368  	.ret_type	= RET_VOID,
369  	.arg1_type	= ARG_PTR_TO_SPIN_LOCK,
370  	.arg1_btf_id    = BPF_PTR_POISON,
371  };
372  
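A sketch of how a BPF program uses these two helpers: the lock must be embedded in a map value, and the critical section must be short (the verifier rejects most helper calls inside it). Map layout and hook are assumptions:

struct val_with_lock {
	struct bpf_spin_lock lock;
	u64 counter;
};

struct {
	__uint(type, BPF_MAP_TYPE_ARRAY);
	__uint(max_entries, 1);
	__type(key, u32);
	__type(value, struct val_with_lock);
} locked SEC(".maps");

SEC("tc")
int bump(struct __sk_buff *skb)
{
	u32 key = 0;
	struct val_with_lock *v = bpf_map_lookup_elem(&locked, &key);

	if (!v)
		return 0;
	bpf_spin_lock(&v->lock);	/* IRQs are disabled for the critical section */
	v->counter++;
	bpf_spin_unlock(&v->lock);
	return 0;
}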
373  void copy_map_value_locked(struct bpf_map *map, void *dst, void *src,
374  			   bool lock_src)
375  {
376  	struct bpf_spin_lock *lock;
377  
378  	if (lock_src)
379  		lock = src + map->record->spin_lock_off;
380  	else
381  		lock = dst + map->record->spin_lock_off;
382  	preempt_disable();
383  	__bpf_spin_lock_irqsave(lock);
384  	copy_map_value(map, dst, src);
385  	__bpf_spin_unlock_irqrestore(lock);
386  	preempt_enable();
387  }
388  
389  BPF_CALL_0(bpf_jiffies64)
390  {
391  	return get_jiffies_64();
392  }
393  
394  const struct bpf_func_proto bpf_jiffies64_proto = {
395  	.func		= bpf_jiffies64,
396  	.gpl_only	= false,
397  	.ret_type	= RET_INTEGER,
398  };
399  
400  #ifdef CONFIG_CGROUPS
401  BPF_CALL_0(bpf_get_current_cgroup_id)
402  {
403  	struct cgroup *cgrp;
404  	u64 cgrp_id;
405  
406  	rcu_read_lock();
407  	cgrp = task_dfl_cgroup(current);
408  	cgrp_id = cgroup_id(cgrp);
409  	rcu_read_unlock();
410  
411  	return cgrp_id;
412  }
413  
414  const struct bpf_func_proto bpf_get_current_cgroup_id_proto = {
415  	.func		= bpf_get_current_cgroup_id,
416  	.gpl_only	= false,
417  	.ret_type	= RET_INTEGER,
418  };
419  
420  BPF_CALL_1(bpf_get_current_ancestor_cgroup_id, int, ancestor_level)
421  {
422  	struct cgroup *cgrp;
423  	struct cgroup *ancestor;
424  	u64 cgrp_id;
425  
426  	rcu_read_lock();
427  	cgrp = task_dfl_cgroup(current);
428  	ancestor = cgroup_ancestor(cgrp, ancestor_level);
429  	cgrp_id = ancestor ? cgroup_id(ancestor) : 0;
430  	rcu_read_unlock();
431  
432  	return cgrp_id;
433  }
434  
435  const struct bpf_func_proto bpf_get_current_ancestor_cgroup_id_proto = {
436  	.func		= bpf_get_current_ancestor_cgroup_id,
437  	.gpl_only	= false,
438  	.ret_type	= RET_INTEGER,
439  	.arg1_type	= ARG_ANYTHING,
440  };
441  #endif /* CONFIG_CGROUPS */
442  
443  #define BPF_STRTOX_BASE_MASK 0x1F
444  
445  static int __bpf_strtoull(const char *buf, size_t buf_len, u64 flags,
446  			  unsigned long long *res, bool *is_negative)
447  {
448  	unsigned int base = flags & BPF_STRTOX_BASE_MASK;
449  	const char *cur_buf = buf;
450  	size_t cur_len = buf_len;
451  	unsigned int consumed;
452  	size_t val_len;
453  	char str[64];
454  
455  	if (!buf || !buf_len || !res || !is_negative)
456  		return -EINVAL;
457  
458  	if (base != 0 && base != 8 && base != 10 && base != 16)
459  		return -EINVAL;
460  
461  	if (flags & ~BPF_STRTOX_BASE_MASK)
462  		return -EINVAL;
463  
464  	while (cur_buf < buf + buf_len && isspace(*cur_buf))
465  		++cur_buf;
466  
467  	*is_negative = (cur_buf < buf + buf_len && *cur_buf == '-');
468  	if (*is_negative)
469  		++cur_buf;
470  
471  	consumed = cur_buf - buf;
472  	cur_len -= consumed;
473  	if (!cur_len)
474  		return -EINVAL;
475  
476  	cur_len = min(cur_len, sizeof(str) - 1);
477  	memcpy(str, cur_buf, cur_len);
478  	str[cur_len] = '\0';
479  	cur_buf = str;
480  
481  	cur_buf = _parse_integer_fixup_radix(cur_buf, &base);
482  	val_len = _parse_integer(cur_buf, base, res);
483  
484  	if (val_len & KSTRTOX_OVERFLOW)
485  		return -ERANGE;
486  
487  	if (val_len == 0)
488  		return -EINVAL;
489  
490  	cur_buf += val_len;
491  	consumed += cur_buf - str;
492  
493  	return consumed;
494  }
495  
496  static int __bpf_strtoll(const char *buf, size_t buf_len, u64 flags,
497  			 long long *res)
498  {
499  	unsigned long long _res;
500  	bool is_negative;
501  	int err;
502  
503  	err = __bpf_strtoull(buf, buf_len, flags, &_res, &is_negative);
504  	if (err < 0)
505  		return err;
506  	if (is_negative) {
507  		if ((long long)-_res > 0)
508  			return -ERANGE;
509  		*res = -_res;
510  	} else {
511  		if ((long long)_res < 0)
512  			return -ERANGE;
513  		*res = _res;
514  	}
515  	return err;
516  }
517  
518  BPF_CALL_4(bpf_strtol, const char *, buf, size_t, buf_len, u64, flags,
519  	   s64 *, res)
520  {
521  	long long _res;
522  	int err;
523  
524  	*res = 0;
525  	err = __bpf_strtoll(buf, buf_len, flags, &_res);
526  	if (err < 0)
527  		return err;
528  	if (_res != (long)_res)
529  		return -ERANGE;
530  	*res = _res;
531  	return err;
532  }
533  
534  const struct bpf_func_proto bpf_strtol_proto = {
535  	.func		= bpf_strtol,
536  	.gpl_only	= false,
537  	.ret_type	= RET_INTEGER,
538  	.arg1_type	= ARG_PTR_TO_MEM | MEM_RDONLY,
539  	.arg2_type	= ARG_CONST_SIZE,
540  	.arg3_type	= ARG_ANYTHING,
541  	.arg4_type	= ARG_PTR_TO_FIXED_SIZE_MEM | MEM_UNINIT | MEM_WRITE | MEM_ALIGNED,
542  	.arg4_size	= sizeof(s64),
543  };
544  
545  BPF_CALL_4(bpf_strtoul, const char *, buf, size_t, buf_len, u64, flags,
546  	   u64 *, res)
547  {
548  	unsigned long long _res;
549  	bool is_negative;
550  	int err;
551  
552  	*res = 0;
553  	err = __bpf_strtoull(buf, buf_len, flags, &_res, &is_negative);
554  	if (err < 0)
555  		return err;
556  	if (is_negative)
557  		return -EINVAL;
558  	if (_res != (unsigned long)_res)
559  		return -ERANGE;
560  	*res = _res;
561  	return err;
562  }
563  
564  const struct bpf_func_proto bpf_strtoul_proto = {
565  	.func		= bpf_strtoul,
566  	.gpl_only	= false,
567  	.ret_type	= RET_INTEGER,
568  	.arg1_type	= ARG_PTR_TO_MEM | MEM_RDONLY,
569  	.arg2_type	= ARG_CONST_SIZE,
570  	.arg3_type	= ARG_ANYTHING,
571  	.arg4_type	= ARG_PTR_TO_FIXED_SIZE_MEM | MEM_UNINIT | MEM_WRITE | MEM_ALIGNED,
572  	.arg4_size	= sizeof(u64),
573  };
574  
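Illustrative use of bpf_strtol() from a BPF program (e.g. a cgroup sysctl hook); the wrapper name is an assumption:

/* Parse a decimal integer out of a bounded, not necessarily NUL-terminated buffer. */
static __always_inline int parse_long(const char *buf, size_t len, long *out)
{
	/* The low 5 bits of flags select the base; 0 means auto-detect (0x.., 0.., decimal). */
	long n = bpf_strtol(buf, len, 10, out);

	return n < 0 ? (int)n : 0;	/* >= 0 is the number of characters consumed */
}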
575  BPF_CALL_3(bpf_strncmp, const char *, s1, u32, s1_sz, const char *, s2)
576  {
577  	return strncmp(s1, s2, s1_sz);
578  }
579  
580  static const struct bpf_func_proto bpf_strncmp_proto = {
581  	.func		= bpf_strncmp,
582  	.gpl_only	= false,
583  	.ret_type	= RET_INTEGER,
584  	.arg1_type	= ARG_PTR_TO_MEM | MEM_RDONLY,
585  	.arg2_type	= ARG_CONST_SIZE,
586  	.arg3_type	= ARG_PTR_TO_CONST_STR,
587  };
588  
589  BPF_CALL_4(bpf_get_ns_current_pid_tgid, u64, dev, u64, ino,
590  	   struct bpf_pidns_info *, nsdata, u32, size)
591  {
592  	struct task_struct *task = current;
593  	struct pid_namespace *pidns;
594  	int err = -EINVAL;
595  
596  	if (unlikely(size != sizeof(struct bpf_pidns_info)))
597  		goto clear;
598  
599  	if (unlikely((u64)(dev_t)dev != dev))
600  		goto clear;
601  
602  	if (unlikely(!task))
603  		goto clear;
604  
605  	pidns = task_active_pid_ns(task);
606  	if (unlikely(!pidns)) {
607  		err = -ENOENT;
608  		goto clear;
609  	}
610  
611  	if (!ns_match(&pidns->ns, (dev_t)dev, ino))
612  		goto clear;
613  
614  	nsdata->pid = task_pid_nr_ns(task, pidns);
615  	nsdata->tgid = task_tgid_nr_ns(task, pidns);
616  	return 0;
617  clear:
618  	memset((void *)nsdata, 0, (size_t) size);
619  	return err;
620  }
621  
622  const struct bpf_func_proto bpf_get_ns_current_pid_tgid_proto = {
623  	.func		= bpf_get_ns_current_pid_tgid,
624  	.gpl_only	= false,
625  	.ret_type	= RET_INTEGER,
626  	.arg1_type	= ARG_ANYTHING,
627  	.arg2_type	= ARG_ANYTHING,
628  	.arg3_type      = ARG_PTR_TO_UNINIT_MEM,
629  	.arg4_type      = ARG_CONST_SIZE,
630  };
631  
632  static const struct bpf_func_proto bpf_get_raw_smp_processor_id_proto = {
633  	.func		= bpf_get_raw_cpu_id,
634  	.gpl_only	= false,
635  	.ret_type	= RET_INTEGER,
636  };
637  
638  BPF_CALL_5(bpf_event_output_data, void *, ctx, struct bpf_map *, map,
639  	   u64, flags, void *, data, u64, size)
640  {
641  	if (unlikely(flags & ~(BPF_F_INDEX_MASK)))
642  		return -EINVAL;
643  
644  	return bpf_event_output(map, flags, data, size, NULL, 0, NULL);
645  }
646  
647  const struct bpf_func_proto bpf_event_output_data_proto =  {
648  	.func		= bpf_event_output_data,
649  	.gpl_only       = true,
650  	.ret_type       = RET_INTEGER,
651  	.arg1_type      = ARG_PTR_TO_CTX,
652  	.arg2_type      = ARG_CONST_MAP_PTR,
653  	.arg3_type      = ARG_ANYTHING,
654  	.arg4_type      = ARG_PTR_TO_MEM | MEM_RDONLY,
655  	.arg5_type      = ARG_CONST_SIZE_OR_ZERO,
656  };
657  
658  BPF_CALL_3(bpf_copy_from_user, void *, dst, u32, size,
659  	   const void __user *, user_ptr)
660  {
661  	int ret = copy_from_user(dst, user_ptr, size);
662  
663  	if (unlikely(ret)) {
664  		memset(dst, 0, size);
665  		ret = -EFAULT;
666  	}
667  
668  	return ret;
669  }
670  
671  const struct bpf_func_proto bpf_copy_from_user_proto = {
672  	.func		= bpf_copy_from_user,
673  	.gpl_only	= false,
674  	.might_sleep	= true,
675  	.ret_type	= RET_INTEGER,
676  	.arg1_type	= ARG_PTR_TO_UNINIT_MEM,
677  	.arg2_type	= ARG_CONST_SIZE_OR_ZERO,
678  	.arg3_type	= ARG_ANYTHING,
679  };
680  
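bpf_copy_from_user() is only usable from sleepable programs (note .might_sleep above); a minimal sketch, assuming user_ptr points into the current task's user address space:

static __always_inline int read_user_buf(const void *user_ptr, char *dst, u32 len)
{
	long err = bpf_copy_from_user(dst, len, user_ptr);

	/* On failure dst has been zero-filled and err is -EFAULT. */
	return err ? -1 : 0;
}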
681  BPF_CALL_5(bpf_copy_from_user_task, void *, dst, u32, size,
682  	   const void __user *, user_ptr, struct task_struct *, tsk, u64, flags)
683  {
684  	int ret;
685  
686  	/* flags is not used yet */
687  	if (unlikely(flags))
688  		return -EINVAL;
689  
690  	if (unlikely(!size))
691  		return 0;
692  
693  	ret = access_process_vm(tsk, (unsigned long)user_ptr, dst, size, 0);
694  	if (ret == size)
695  		return 0;
696  
697  	memset(dst, 0, size);
698  	/* Return -EFAULT for partial read */
699  	return ret < 0 ? ret : -EFAULT;
700  }
701  
702  const struct bpf_func_proto bpf_copy_from_user_task_proto = {
703  	.func		= bpf_copy_from_user_task,
704  	.gpl_only	= true,
705  	.might_sleep	= true,
706  	.ret_type	= RET_INTEGER,
707  	.arg1_type	= ARG_PTR_TO_UNINIT_MEM,
708  	.arg2_type	= ARG_CONST_SIZE_OR_ZERO,
709  	.arg3_type	= ARG_ANYTHING,
710  	.arg4_type	= ARG_PTR_TO_BTF_ID,
711  	.arg4_btf_id	= &btf_tracing_ids[BTF_TRACING_TYPE_TASK],
712  	.arg5_type	= ARG_ANYTHING
713  };
714  
715  BPF_CALL_2(bpf_per_cpu_ptr, const void *, ptr, u32, cpu)
716  {
717  	if (cpu >= nr_cpu_ids)
718  		return (unsigned long)NULL;
719  
720  	return (unsigned long)per_cpu_ptr((const void __percpu *)ptr, cpu);
721  }
722  
723  const struct bpf_func_proto bpf_per_cpu_ptr_proto = {
724  	.func		= bpf_per_cpu_ptr,
725  	.gpl_only	= false,
726  	.ret_type	= RET_PTR_TO_MEM_OR_BTF_ID | PTR_MAYBE_NULL | MEM_RDONLY,
727  	.arg1_type	= ARG_PTR_TO_PERCPU_BTF_ID,
728  	.arg2_type	= ARG_ANYTHING,
729  };
730  
731  BPF_CALL_1(bpf_this_cpu_ptr, const void *, percpu_ptr)
732  {
733  	return (unsigned long)this_cpu_ptr((const void __percpu *)percpu_ptr);
734  }
735  
736  const struct bpf_func_proto bpf_this_cpu_ptr_proto = {
737  	.func		= bpf_this_cpu_ptr,
738  	.gpl_only	= false,
739  	.ret_type	= RET_PTR_TO_MEM_OR_BTF_ID | MEM_RDONLY,
740  	.arg1_type	= ARG_PTR_TO_PERCPU_BTF_ID,
741  };
742  
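These two helpers take a BTF percpu variable exposed to the program as a ksym; a sketch using the kernel's runqueues percpu symbol (assumes a vmlinux.h build where that symbol is visible via BTF):

extern const struct rq runqueues __ksym;	/* percpu variable, resolved via BTF */

SEC("raw_tp/sched_switch")
int peek_rq(void *ctx)
{
	u32 cpu = bpf_get_smp_processor_id();
	struct rq *rq;

	rq = (struct rq *)bpf_per_cpu_ptr(&runqueues, cpu);	/* NULL if cpu >= nr_cpu_ids */
	if (!rq)
		return 0;
	/* bpf_this_cpu_ptr() is the same idea for the current CPU and cannot fail. */
	rq = (struct rq *)bpf_this_cpu_ptr(&runqueues);
	return 0;
}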
743  static int bpf_trace_copy_string(char *buf, void *unsafe_ptr, char fmt_ptype,
744  		size_t bufsz)
745  {
746  	void __user *user_ptr = (__force void __user *)unsafe_ptr;
747  
748  	buf[0] = 0;
749  
750  	switch (fmt_ptype) {
751  	case 's':
752  #ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
753  		if ((unsigned long)unsafe_ptr < TASK_SIZE)
754  			return strncpy_from_user_nofault(buf, user_ptr, bufsz);
755  		fallthrough;
756  #endif
757  	case 'k':
758  		return strncpy_from_kernel_nofault(buf, unsafe_ptr, bufsz);
759  	case 'u':
760  		return strncpy_from_user_nofault(buf, user_ptr, bufsz);
761  	}
762  
763  	return -EINVAL;
764  }
765  
766  /* Per-cpu temp buffers used by printf-like helpers to store the bprintf binary
767   * arguments representation.
768   */
769  #define MAX_BPRINTF_BIN_ARGS	512
770  
771  /* Support executing three nested bprintf helper calls on a given CPU */
772  #define MAX_BPRINTF_NEST_LEVEL	3
773  struct bpf_bprintf_buffers {
774  	char bin_args[MAX_BPRINTF_BIN_ARGS];
775  	char buf[MAX_BPRINTF_BUF];
776  };
777  
778  static DEFINE_PER_CPU(struct bpf_bprintf_buffers[MAX_BPRINTF_NEST_LEVEL], bpf_bprintf_bufs);
779  static DEFINE_PER_CPU(int, bpf_bprintf_nest_level);
780  
781  static int try_get_buffers(struct bpf_bprintf_buffers **bufs)
782  {
783  	int nest_level;
784  
785  	preempt_disable();
786  	nest_level = this_cpu_inc_return(bpf_bprintf_nest_level);
787  	if (WARN_ON_ONCE(nest_level > MAX_BPRINTF_NEST_LEVEL)) {
788  		this_cpu_dec(bpf_bprintf_nest_level);
789  		preempt_enable();
790  		return -EBUSY;
791  	}
792  	*bufs = this_cpu_ptr(&bpf_bprintf_bufs[nest_level - 1]);
793  
794  	return 0;
795  }
796  
797  void bpf_bprintf_cleanup(struct bpf_bprintf_data *data)
798  {
799  	if (!data->bin_args && !data->buf)
800  		return;
801  	if (WARN_ON_ONCE(this_cpu_read(bpf_bprintf_nest_level) == 0))
802  		return;
803  	this_cpu_dec(bpf_bprintf_nest_level);
804  	preempt_enable();
805  }
806  
807  /*
808   * bpf_bprintf_prepare - Generic pass on format strings for bprintf-like helpers
809   *
810   * Returns a negative value if fmt is an invalid format string or 0 otherwise.
811   *
812   * This can be used in two ways:
813   * - Format string verification only: when data->get_bin_args is false
814   * - Arguments preparation: in addition to the above verification, it writes in
815   *   data->bin_args a binary representation of arguments usable by bstr_printf
816   *   where pointers from BPF have been sanitized.
817   *
818   * In argument preparation mode, if 0 is returned, safe temporary buffers are
819   * allocated and bpf_bprintf_cleanup should be called to free them after use.
820   */
821  int bpf_bprintf_prepare(char *fmt, u32 fmt_size, const u64 *raw_args,
822  			u32 num_args, struct bpf_bprintf_data *data)
823  {
824  	bool get_buffers = (data->get_bin_args && num_args) || data->get_buf;
825  	char *unsafe_ptr = NULL, *tmp_buf = NULL, *tmp_buf_end, *fmt_end;
826  	struct bpf_bprintf_buffers *buffers = NULL;
827  	size_t sizeof_cur_arg, sizeof_cur_ip;
828  	int err, i, num_spec = 0;
829  	u64 cur_arg;
830  	char fmt_ptype, cur_ip[16], ip_spec[] = "%pXX";
831  
832  	fmt_end = strnchr(fmt, fmt_size, 0);
833  	if (!fmt_end)
834  		return -EINVAL;
835  	fmt_size = fmt_end - fmt;
836  
837  	if (get_buffers && try_get_buffers(&buffers))
838  		return -EBUSY;
839  
840  	if (data->get_bin_args) {
841  		if (num_args)
842  			tmp_buf = buffers->bin_args;
843  		tmp_buf_end = tmp_buf + MAX_BPRINTF_BIN_ARGS;
844  		data->bin_args = (u32 *)tmp_buf;
845  	}
846  
847  	if (data->get_buf)
848  		data->buf = buffers->buf;
849  
850  	for (i = 0; i < fmt_size; i++) {
851  		if ((!isprint(fmt[i]) && !isspace(fmt[i])) || !isascii(fmt[i])) {
852  			err = -EINVAL;
853  			goto out;
854  		}
855  
856  		if (fmt[i] != '%')
857  			continue;
858  
859  		if (fmt[i + 1] == '%') {
860  			i++;
861  			continue;
862  		}
863  
864  		if (num_spec >= num_args) {
865  			err = -EINVAL;
866  			goto out;
867  		}
868  
869  		/* The string is zero-terminated so if fmt[i] != 0, we can
870  		 * always access fmt[i + 1], in the worst case it will be a 0
871  		 */
872  		i++;
873  
874  		/* skip optional "[0 +-][num]" width formatting field */
875  		while (fmt[i] == '0' || fmt[i] == '+'  || fmt[i] == '-' ||
876  		       fmt[i] == ' ')
877  			i++;
878  		if (fmt[i] >= '1' && fmt[i] <= '9') {
879  			i++;
880  			while (fmt[i] >= '0' && fmt[i] <= '9')
881  				i++;
882  		}
883  
884  		if (fmt[i] == 'p') {
885  			sizeof_cur_arg = sizeof(long);
886  
887  			if ((fmt[i + 1] == 'k' || fmt[i + 1] == 'u') &&
888  			    fmt[i + 2] == 's') {
889  				fmt_ptype = fmt[i + 1];
890  				i += 2;
891  				goto fmt_str;
892  			}
893  
894  			if (fmt[i + 1] == 0 || isspace(fmt[i + 1]) ||
895  			    ispunct(fmt[i + 1]) || fmt[i + 1] == 'K' ||
896  			    fmt[i + 1] == 'x' || fmt[i + 1] == 's' ||
897  			    fmt[i + 1] == 'S') {
898  				/* just kernel pointers */
899  				if (tmp_buf)
900  					cur_arg = raw_args[num_spec];
901  				i++;
902  				goto nocopy_fmt;
903  			}
904  
905  			if (fmt[i + 1] == 'B') {
906  				if (tmp_buf)  {
907  					err = snprintf(tmp_buf,
908  						       (tmp_buf_end - tmp_buf),
909  						       "%pB",
910  						       (void *)(long)raw_args[num_spec]);
911  					tmp_buf += (err + 1);
912  				}
913  
914  				i++;
915  				num_spec++;
916  				continue;
917  			}
918  
919  			/* only support "%pI4", "%pi4", "%pI6" and "%pi6". */
920  			if ((fmt[i + 1] != 'i' && fmt[i + 1] != 'I') ||
921  			    (fmt[i + 2] != '4' && fmt[i + 2] != '6')) {
922  				err = -EINVAL;
923  				goto out;
924  			}
925  
926  			i += 2;
927  			if (!tmp_buf)
928  				goto nocopy_fmt;
929  
930  			sizeof_cur_ip = (fmt[i] == '4') ? 4 : 16;
931  			if (tmp_buf_end - tmp_buf < sizeof_cur_ip) {
932  				err = -ENOSPC;
933  				goto out;
934  			}
935  
936  			unsafe_ptr = (char *)(long)raw_args[num_spec];
937  			err = copy_from_kernel_nofault(cur_ip, unsafe_ptr,
938  						       sizeof_cur_ip);
939  			if (err < 0)
940  				memset(cur_ip, 0, sizeof_cur_ip);
941  
942  			/* hack: bstr_printf expects IP addresses to be
943  			 * pre-formatted as strings, ironically, the easiest way
944  			 * to do that is to call snprintf.
945  			 */
946  			ip_spec[2] = fmt[i - 1];
947  			ip_spec[3] = fmt[i];
948  			err = snprintf(tmp_buf, tmp_buf_end - tmp_buf,
949  				       ip_spec, &cur_ip);
950  
951  			tmp_buf += err + 1;
952  			num_spec++;
953  
954  			continue;
955  		} else if (fmt[i] == 's') {
956  			fmt_ptype = fmt[i];
957  fmt_str:
958  			if (fmt[i + 1] != 0 &&
959  			    !isspace(fmt[i + 1]) &&
960  			    !ispunct(fmt[i + 1])) {
961  				err = -EINVAL;
962  				goto out;
963  			}
964  
965  			if (!tmp_buf)
966  				goto nocopy_fmt;
967  
968  			if (tmp_buf_end == tmp_buf) {
969  				err = -ENOSPC;
970  				goto out;
971  			}
972  
973  			unsafe_ptr = (char *)(long)raw_args[num_spec];
974  			err = bpf_trace_copy_string(tmp_buf, unsafe_ptr,
975  						    fmt_ptype,
976  						    tmp_buf_end - tmp_buf);
977  			if (err < 0) {
978  				tmp_buf[0] = '\0';
979  				err = 1;
980  			}
981  
982  			tmp_buf += err;
983  			num_spec++;
984  
985  			continue;
986  		} else if (fmt[i] == 'c') {
987  			if (!tmp_buf)
988  				goto nocopy_fmt;
989  
990  			if (tmp_buf_end == tmp_buf) {
991  				err = -ENOSPC;
992  				goto out;
993  			}
994  
995  			*tmp_buf = raw_args[num_spec];
996  			tmp_buf++;
997  			num_spec++;
998  
999  			continue;
1000  		}
1001  
1002  		sizeof_cur_arg = sizeof(int);
1003  
1004  		if (fmt[i] == 'l') {
1005  			sizeof_cur_arg = sizeof(long);
1006  			i++;
1007  		}
1008  		if (fmt[i] == 'l') {
1009  			sizeof_cur_arg = sizeof(long long);
1010  			i++;
1011  		}
1012  
1013  		if (fmt[i] != 'i' && fmt[i] != 'd' && fmt[i] != 'u' &&
1014  		    fmt[i] != 'x' && fmt[i] != 'X') {
1015  			err = -EINVAL;
1016  			goto out;
1017  		}
1018  
1019  		if (tmp_buf)
1020  			cur_arg = raw_args[num_spec];
1021  nocopy_fmt:
1022  		if (tmp_buf) {
1023  			tmp_buf = PTR_ALIGN(tmp_buf, sizeof(u32));
1024  			if (tmp_buf_end - tmp_buf < sizeof_cur_arg) {
1025  				err = -ENOSPC;
1026  				goto out;
1027  			}
1028  
1029  			if (sizeof_cur_arg == 8) {
1030  				*(u32 *)tmp_buf = *(u32 *)&cur_arg;
1031  				*(u32 *)(tmp_buf + 4) = *((u32 *)&cur_arg + 1);
1032  			} else {
1033  				*(u32 *)tmp_buf = (u32)(long)cur_arg;
1034  			}
1035  			tmp_buf += sizeof_cur_arg;
1036  		}
1037  		num_spec++;
1038  	}
1039  
1040  	err = 0;
1041  out:
1042  	if (err)
1043  		bpf_bprintf_cleanup(data);
1044  	return err;
1045  }
1046  
1047  BPF_CALL_5(bpf_snprintf, char *, str, u32, str_size, char *, fmt,
1048  	   const void *, args, u32, data_len)
1049  {
1050  	struct bpf_bprintf_data data = {
1051  		.get_bin_args	= true,
1052  	};
1053  	int err, num_args;
1054  
1055  	if (data_len % 8 || data_len > MAX_BPRINTF_VARARGS * 8 ||
1056  	    (data_len && !args))
1057  		return -EINVAL;
1058  	num_args = data_len / 8;
1059  
1060  	/* ARG_PTR_TO_CONST_STR guarantees that fmt is zero-terminated so we
1061  	 * can safely give an unbounded size.
1062  	 */
1063  	err = bpf_bprintf_prepare(fmt, UINT_MAX, args, num_args, &data);
1064  	if (err < 0)
1065  		return err;
1066  
1067  	err = bstr_printf(str, str_size, fmt, data.bin_args);
1068  
1069  	bpf_bprintf_cleanup(&data);
1070  
1071  	return err + 1;
1072  }
1073  
1074  const struct bpf_func_proto bpf_snprintf_proto = {
1075  	.func		= bpf_snprintf,
1076  	.gpl_only	= true,
1077  	.ret_type	= RET_INTEGER,
1078  	.arg1_type	= ARG_PTR_TO_MEM_OR_NULL,
1079  	.arg2_type	= ARG_CONST_SIZE_OR_ZERO,
1080  	.arg3_type	= ARG_PTR_TO_CONST_STR,
1081  	.arg4_type	= ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY,
1082  	.arg5_type	= ARG_CONST_SIZE_OR_ZERO,
1083  };
1084  
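BPF-program-side sketch of bpf_snprintf(): the format string must be a constant and the variadic arguments travel in a u64 array (libbpf also offers a BPF_SNPRINTF() convenience macro); the hook is an assumption:

SEC("tracepoint/syscalls/sys_enter_openat")
int fmt(void *ctx)
{
	const char name[] = "world";
	u64 args[2] = { 42, (unsigned long)name };	/* %s consumes a pointer */
	char out[64];
	long n;

	n = bpf_snprintf(out, sizeof(out), "hello %d %s", args, sizeof(args));
	/* n is the length that would have been written, including the trailing NUL. */
	return 0;
}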
1085  struct bpf_async_cb {
1086  	struct bpf_map *map;
1087  	struct bpf_prog *prog;
1088  	void __rcu *callback_fn;
1089  	void *value;
1090  	struct rcu_head rcu;
1091  	u64 flags;
1092  };
1093  
1094  /* BPF map elements can contain 'struct bpf_timer'.
1095   * Such map owns all of its BPF timers.
1096   * 'struct bpf_timer' is allocated as part of map element allocation
1097   * and it's zero initialized.
1098   * That space is used to keep 'struct bpf_async_kern'.
1099   * bpf_timer_init() allocates 'struct bpf_hrtimer', inits hrtimer, and
1100   * remembers 'struct bpf_map *' pointer it's part of.
1101   * bpf_timer_set_callback() increments prog refcnt and assign bpf callback_fn.
1102   * bpf_timer_start() arms the timer.
1103   * If user space reference to a map goes to zero at this point
1104   * ops->map_release_uref callback is responsible for cancelling the timers,
1105   * freeing their memory, and decrementing prog's refcnts.
1106   * bpf_timer_cancel() cancels the timer and decrements prog's refcnt.
1107   * Inner maps can contain bpf timers as well. ops->map_release_uref is
1108   * freeing the timers when inner map is replaced or deleted by user space.
1109   */
1110  struct bpf_hrtimer {
1111  	struct bpf_async_cb cb;
1112  	struct hrtimer timer;
1113  	atomic_t cancelling;
1114  };
1115  
1116  /* the actual struct hidden inside uapi struct bpf_timer */
1117  struct bpf_async_kern {
1118  	union {
1119  		struct bpf_async_cb *cb;
1120  		struct bpf_hrtimer *timer;
1121  	};
1122  	/* bpf_spin_lock is used here instead of spinlock_t to make
1123  	 * sure that it always fits into space reserved by struct bpf_timer
1124  	 * regardless of LOCKDEP and spinlock debug flags.
1125  	 */
1126  	struct bpf_spin_lock lock;
1127  } __attribute__((aligned(8)));
1128  
1129  enum bpf_async_type {
1130  	BPF_ASYNC_TYPE_TIMER = 0,
1131  };
1132  
1133  static DEFINE_PER_CPU(struct bpf_hrtimer *, hrtimer_running);
1134  
1135  static enum hrtimer_restart bpf_timer_cb(struct hrtimer *hrtimer)
1136  {
1137  	struct bpf_hrtimer *t = container_of(hrtimer, struct bpf_hrtimer, timer);
1138  	struct bpf_map *map = t->cb.map;
1139  	void *value = t->cb.value;
1140  	bpf_callback_t callback_fn;
1141  	void *key;
1142  	u32 idx;
1143  
1144  	BTF_TYPE_EMIT(struct bpf_timer);
1145  	callback_fn = rcu_dereference_check(t->cb.callback_fn, rcu_read_lock_bh_held());
1146  	if (!callback_fn)
1147  		goto out;
1148  
1149  	/* bpf_timer_cb() runs in hrtimer_run_softirq. It doesn't migrate and
1150  	 * cannot be preempted by another bpf_timer_cb() on the same cpu.
1151  	 * Remember the timer this callback is servicing to prevent
1152  	 * deadlock if callback_fn() calls bpf_timer_cancel() or
1153  	 * bpf_map_delete_elem() on the same timer.
1154  	 */
1155  	this_cpu_write(hrtimer_running, t);
1156  	if (map->map_type == BPF_MAP_TYPE_ARRAY) {
1157  		struct bpf_array *array = container_of(map, struct bpf_array, map);
1158  
1159  		/* compute the key */
1160  		idx = ((char *)value - array->value) / array->elem_size;
1161  		key = &idx;
1162  	} else { /* hash or lru */
1163  		key = value - round_up(map->key_size, 8);
1164  	}
1165  
1166  	callback_fn((u64)(long)map, (u64)(long)key, (u64)(long)value, 0, 0);
1167  	/* The verifier checked that return value is zero. */
1168  
1169  	this_cpu_write(hrtimer_running, NULL);
1170  out:
1171  	return HRTIMER_NORESTART;
1172  }
1173  
1174  static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u64 flags,
1175  			    enum bpf_async_type type)
1176  {
1177  	struct bpf_async_cb *cb;
1178  	struct bpf_hrtimer *t;
1179  	clockid_t clockid;
1180  	size_t size;
1181  	int ret = 0;
1182  
1183  	if (in_nmi())
1184  		return -EOPNOTSUPP;
1185  
1186  	switch (type) {
1187  	case BPF_ASYNC_TYPE_TIMER:
1188  		size = sizeof(struct bpf_hrtimer);
1189  		break;
1190  	default:
1191  		return -EINVAL;
1192  	}
1193  
1194  	__bpf_spin_lock_irqsave(&async->lock);
1195  	t = async->timer;
1196  	if (t) {
1197  		ret = -EBUSY;
1198  		goto out;
1199  	}
1200  
1201  	/* allocate hrtimer via map_kmalloc to use memcg accounting */
1202  	cb = bpf_map_kmalloc_node(map, size, GFP_ATOMIC, map->numa_node);
1203  	if (!cb) {
1204  		ret = -ENOMEM;
1205  		goto out;
1206  	}
1207  
1208  	if (type == BPF_ASYNC_TYPE_TIMER) {
1209  		clockid = flags & (MAX_CLOCKS - 1);
1210  		t = (struct bpf_hrtimer *)cb;
1211  
1212  		atomic_set(&t->cancelling, 0);
1213  		hrtimer_init(&t->timer, clockid, HRTIMER_MODE_REL_SOFT);
1214  		t->timer.function = bpf_timer_cb;
1215  		cb->value = (void *)async - map->record->timer_off;
1216  	}
1217  	cb->map = map;
1218  	cb->prog = NULL;
1219  	cb->flags = flags;
1220  	rcu_assign_pointer(cb->callback_fn, NULL);
1221  
1222  	WRITE_ONCE(async->cb, cb);
1223  	/* Guarantee the order between async->cb and map->usercnt. So
1224  	 * when there are concurrent uref release and bpf timer init, either
1225   * bpf_timer_cancel_and_free() called by uref release reads a non-NULL
1226  	 * timer or atomic64_read() below returns a zero usercnt.
1227  	 */
1228  	smp_mb();
1229  	if (!atomic64_read(&map->usercnt)) {
1230  		/* maps with timers must be either held by user space
1231  		 * or pinned in bpffs.
1232  		 */
1233  		WRITE_ONCE(async->cb, NULL);
1234  		kfree(cb);
1235  		ret = -EPERM;
1236  	}
1237  out:
1238  	__bpf_spin_unlock_irqrestore(&async->lock);
1239  	return ret;
1240  }
1241  
1242  BPF_CALL_3(bpf_timer_init, struct bpf_async_kern *, timer, struct bpf_map *, map,
1243  	   u64, flags)
1244  {
1245  	clock_t clockid = flags & (MAX_CLOCKS - 1);
1246  
1247  	BUILD_BUG_ON(MAX_CLOCKS != 16);
1248  	BUILD_BUG_ON(sizeof(struct bpf_async_kern) > sizeof(struct bpf_timer));
1249  	BUILD_BUG_ON(__alignof__(struct bpf_async_kern) != __alignof__(struct bpf_timer));
1250  
1251  	if (flags >= MAX_CLOCKS ||
1252  	    /* similar to timerfd except _ALARM variants are not supported */
1253  	    (clockid != CLOCK_MONOTONIC &&
1254  	     clockid != CLOCK_REALTIME &&
1255  	     clockid != CLOCK_BOOTTIME))
1256  		return -EINVAL;
1257  
1258  	return __bpf_async_init(timer, map, flags, BPF_ASYNC_TYPE_TIMER);
1259  }
1260  
1261  static const struct bpf_func_proto bpf_timer_init_proto = {
1262  	.func		= bpf_timer_init,
1263  	.gpl_only	= true,
1264  	.ret_type	= RET_INTEGER,
1265  	.arg1_type	= ARG_PTR_TO_TIMER,
1266  	.arg2_type	= ARG_CONST_MAP_PTR,
1267  	.arg3_type	= ARG_ANYTHING,
1268  };
1269  
1270  BPF_CALL_3(bpf_timer_set_callback, struct bpf_async_kern *, timer, void *, callback_fn,
1271  	   struct bpf_prog_aux *, aux)
1272  {
1273  	struct bpf_prog *prev, *prog = aux->prog;
1274  	struct bpf_hrtimer *t;
1275  	int ret = 0;
1276  
1277  	if (in_nmi())
1278  		return -EOPNOTSUPP;
1279  	__bpf_spin_lock_irqsave(&timer->lock);
1280  	t = timer->timer;
1281  	if (!t) {
1282  		ret = -EINVAL;
1283  		goto out;
1284  	}
1285  	if (!atomic64_read(&t->cb.map->usercnt)) {
1286  		/* maps with timers must be either held by user space
1287  		 * or pinned in bpffs. Otherwise timer might still be
1288  		 * running even when bpf prog is detached and user space
1289  		 * is gone, since map_release_uref won't ever be called.
1290  		 */
1291  		ret = -EPERM;
1292  		goto out;
1293  	}
1294  	prev = t->cb.prog;
1295  	if (prev != prog) {
1296  		/* Bump prog refcnt once. Every bpf_timer_set_callback()
1297  		 * can pick different callback_fn-s within the same prog.
1298  		 */
1299  		prog = bpf_prog_inc_not_zero(prog);
1300  		if (IS_ERR(prog)) {
1301  			ret = PTR_ERR(prog);
1302  			goto out;
1303  		}
1304  		if (prev)
1305  			/* Drop prev prog refcnt when swapping with new prog */
1306  			bpf_prog_put(prev);
1307  		t->cb.prog = prog;
1308  	}
1309  	rcu_assign_pointer(t->cb.callback_fn, callback_fn);
1310  out:
1311  	__bpf_spin_unlock_irqrestore(&timer->lock);
1312  	return ret;
1313  }
1314  
1315  static const struct bpf_func_proto bpf_timer_set_callback_proto = {
1316  	.func		= bpf_timer_set_callback,
1317  	.gpl_only	= true,
1318  	.ret_type	= RET_INTEGER,
1319  	.arg1_type	= ARG_PTR_TO_TIMER,
1320  	.arg2_type	= ARG_PTR_TO_FUNC,
1321  };
1322  
1323  BPF_CALL_3(bpf_timer_start, struct bpf_async_kern *, timer, u64, nsecs, u64, flags)
1324  {
1325  	struct bpf_hrtimer *t;
1326  	int ret = 0;
1327  	enum hrtimer_mode mode;
1328  
1329  	if (in_nmi())
1330  		return -EOPNOTSUPP;
1331  	if (flags > BPF_F_TIMER_ABS)
1332  		return -EINVAL;
1333  	__bpf_spin_lock_irqsave(&timer->lock);
1334  	t = timer->timer;
1335  	if (!t || !t->cb.prog) {
1336  		ret = -EINVAL;
1337  		goto out;
1338  	}
1339  
1340  	if (flags & BPF_F_TIMER_ABS)
1341  		mode = HRTIMER_MODE_ABS_SOFT;
1342  	else
1343  		mode = HRTIMER_MODE_REL_SOFT;
1344  
1345  	hrtimer_start(&t->timer, ns_to_ktime(nsecs), mode);
1346  out:
1347  	__bpf_spin_unlock_irqrestore(&timer->lock);
1348  	return ret;
1349  }
1350  
1351  static const struct bpf_func_proto bpf_timer_start_proto = {
1352  	.func		= bpf_timer_start,
1353  	.gpl_only	= true,
1354  	.ret_type	= RET_INTEGER,
1355  	.arg1_type	= ARG_PTR_TO_TIMER,
1356  	.arg2_type	= ARG_ANYTHING,
1357  	.arg3_type	= ARG_ANYTHING,
1358  };
1359  
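How the three timer helpers above fit together on the BPF side (map layout, hook, and callback name are assumptions; the callback must return 0):

struct elem {
	struct bpf_timer t;
};

struct {
	__uint(type, BPF_MAP_TYPE_ARRAY);
	__uint(max_entries, 1);
	__type(key, u32);
	__type(value, struct elem);
} timers SEC(".maps");

static int timer_cb(void *map, u32 *key, struct elem *val)
{
	return 0;	/* runs later from the hrtimer softirq */
}

SEC("tracepoint/syscalls/sys_enter_openat")
int arm_timer(void *ctx)
{
	u32 key = 0;
	struct elem *val = bpf_map_lookup_elem(&timers, &key);

	if (!val)
		return 0;
	bpf_timer_init(&val->t, &timers, 1 /* CLOCK_MONOTONIC */);
	bpf_timer_set_callback(&val->t, timer_cb);
	bpf_timer_start(&val->t, 1000000 /* 1 ms */, 0);
	return 0;
}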
1360  static void drop_prog_refcnt(struct bpf_async_cb *async)
1361  {
1362  	struct bpf_prog *prog = async->prog;
1363  
1364  	if (prog) {
1365  		bpf_prog_put(prog);
1366  		async->prog = NULL;
1367  		rcu_assign_pointer(async->callback_fn, NULL);
1368  	}
1369  }
1370  
1371  BPF_CALL_1(bpf_timer_cancel, struct bpf_async_kern *, timer)
1372  {
1373  	struct bpf_hrtimer *t, *cur_t;
1374  	bool inc = false;
1375  	int ret = 0;
1376  
1377  	if (in_nmi())
1378  		return -EOPNOTSUPP;
1379  	rcu_read_lock();
1380  	__bpf_spin_lock_irqsave(&timer->lock);
1381  	t = timer->timer;
1382  	if (!t) {
1383  		ret = -EINVAL;
1384  		goto out;
1385  	}
1386  
1387  	cur_t = this_cpu_read(hrtimer_running);
1388  	if (cur_t == t) {
1389  		/* If bpf callback_fn is trying to bpf_timer_cancel()
1390  		 * its own timer the hrtimer_cancel() will deadlock
1391  		 * since it waits for callback_fn to finish.
1392  		 */
1393  		ret = -EDEADLK;
1394  		goto out;
1395  	}
1396  
1397  	/* Only account in-flight cancellations when invoked from a timer
1398  	 * callback, since we want to avoid waiting only if other _callbacks_
1399  	 * are waiting on us, to avoid introducing lockups. Non-callback paths
1400  	 * are ok, since nobody would synchronously wait for their completion.
1401  	 */
1402  	if (!cur_t)
1403  		goto drop;
1404  	atomic_inc(&t->cancelling);
1405  	/* Need full barrier after relaxed atomic_inc */
1406  	smp_mb__after_atomic();
1407  	inc = true;
1408  	if (atomic_read(&cur_t->cancelling)) {
1409  		/* We're cancelling timer t, while some other timer callback is
1410  		 * attempting to cancel us. In such a case, it might be possible
1411  		 * that timer t belongs to the other callback, or some other
1412  		 * callback waiting upon it (creating transitive dependencies
1413  		 * upon us), and we will enter a deadlock if we continue
1414  		 * cancelling and waiting for it synchronously, since it might
1415  		 * do the same. Bail!
1416  		 */
1417  		ret = -EDEADLK;
1418  		goto out;
1419  	}
1420  drop:
1421  	drop_prog_refcnt(&t->cb);
1422  out:
1423  	__bpf_spin_unlock_irqrestore(&timer->lock);
1424  	/* Cancel the timer and wait for associated callback to finish
1425  	 * if it was running.
1426  	 */
1427  	ret = ret ?: hrtimer_cancel(&t->timer);
1428  	if (inc)
1429  		atomic_dec(&t->cancelling);
1430  	rcu_read_unlock();
1431  	return ret;
1432  }
1433  
1434  static const struct bpf_func_proto bpf_timer_cancel_proto = {
1435  	.func		= bpf_timer_cancel,
1436  	.gpl_only	= true,
1437  	.ret_type	= RET_INTEGER,
1438  	.arg1_type	= ARG_PTR_TO_TIMER,
1439  };
1440  
1441  /* This function is called by map_delete/update_elem for individual element and
1442   * by ops->map_release_uref when the user space reference to a map reaches zero.
1443   */
1444  void bpf_timer_cancel_and_free(void *val)
1445  {
1446  	struct bpf_async_kern *timer = val;
1447  	struct bpf_hrtimer *t;
1448  
1449  	/* Performance optimization: read timer->timer without lock first. */
1450  	if (!READ_ONCE(timer->timer))
1451  		return;
1452  
1453  	__bpf_spin_lock_irqsave(&timer->lock);
1454  	/* re-read it under lock */
1455  	t = timer->timer;
1456  	if (!t)
1457  		goto out;
1458  	drop_prog_refcnt(&t->cb);
1459  	/* The subsequent bpf_timer_start/cancel() helpers won't be able to use
1460  	 * this timer, since it won't be initialized.
1461  	 */
1462  	WRITE_ONCE(timer->timer, NULL);
1463  out:
1464  	__bpf_spin_unlock_irqrestore(&timer->lock);
1465  	if (!t)
1466  		return;
1467  	/* Cancel the timer and wait for callback to complete if it was running.
1468  	 * If hrtimer_cancel() can be safely called it's safe to call kfree(t)
1469  	 * right after for both preallocated and non-preallocated maps.
1470  	 * The timer->timer = NULL was already done and no code path can
1471  	 * see address 't' anymore.
1472  	 *
1473  	 * Check that bpf_map_delete/update_elem() wasn't called from timer
1474  	 * callback_fn. In such case don't call hrtimer_cancel() (since it will
1475  	 * deadlock) and don't call hrtimer_try_to_cancel() (since it will just
1476  	 * return -1). Though callback_fn is still running on this cpu it's
1477  	 * safe to do kfree(t) because bpf_timer_cb() read everything it needed
1478  	 * from 't'. The bpf subprog callback_fn won't be able to access 't',
1479  	 * since timer->timer = NULL was already done. The timer will be
1480  	 * effectively cancelled because bpf_timer_cb() will return
1481  	 * HRTIMER_NORESTART.
1482  	 */
1483  	if (this_cpu_read(hrtimer_running) != t)
1484  		hrtimer_cancel(&t->timer);
1485  	kfree_rcu(t, cb.rcu);
1486  }
1487  
1488  BPF_CALL_2(bpf_kptr_xchg, void *, map_value, void *, ptr)
1489  {
1490  	unsigned long *kptr = map_value;
1491  
1492  	return xchg(kptr, (unsigned long)ptr);
1493  }
1494  
1495  /* Unlike other PTR_TO_BTF_ID helpers the btf_id in bpf_kptr_xchg()
1496   * helper is determined dynamically by the verifier. Use BPF_PTR_POISON to
1497   * denote type that verifier will determine.
1498   */
1499  static const struct bpf_func_proto bpf_kptr_xchg_proto = {
1500  	.func         = bpf_kptr_xchg,
1501  	.gpl_only     = false,
1502  	.ret_type     = RET_PTR_TO_BTF_ID_OR_NULL,
1503  	.ret_btf_id   = BPF_PTR_POISON,
1504  	.arg1_type    = ARG_PTR_TO_KPTR,
1505  	.arg2_type    = ARG_PTR_TO_BTF_ID_OR_NULL | OBJ_RELEASE,
1506  	.arg2_btf_id  = BPF_PTR_POISON,
1507  };
1508  
1509  /* Since the upper 8 bits of dynptr->size is reserved, the
1510   * maximum supported size is 2^24 - 1.
1511   */
1512  #define DYNPTR_MAX_SIZE	((1UL << 24) - 1)
1513  #define DYNPTR_TYPE_SHIFT	28
1514  #define DYNPTR_SIZE_MASK	0xFFFFFF
1515  #define DYNPTR_RDONLY_BIT	BIT(31)
1516  
1517  static bool __bpf_dynptr_is_rdonly(const struct bpf_dynptr_kern *ptr)
1518  {
1519  	return ptr->size & DYNPTR_RDONLY_BIT;
1520  }
1521  
1522  void bpf_dynptr_set_rdonly(struct bpf_dynptr_kern *ptr)
1523  {
1524  	ptr->size |= DYNPTR_RDONLY_BIT;
1525  }
1526  
1527  static void bpf_dynptr_set_type(struct bpf_dynptr_kern *ptr, enum bpf_dynptr_type type)
1528  {
1529  	ptr->size |= type << DYNPTR_TYPE_SHIFT;
1530  }
1531  
1532  static enum bpf_dynptr_type bpf_dynptr_get_type(const struct bpf_dynptr_kern *ptr)
1533  {
1534  	return (ptr->size & ~(DYNPTR_RDONLY_BIT)) >> DYNPTR_TYPE_SHIFT;
1535  }
1536  
1537  u32 __bpf_dynptr_size(const struct bpf_dynptr_kern *ptr)
1538  {
1539  	return ptr->size & DYNPTR_SIZE_MASK;
1540  }
1541  
1542  static void bpf_dynptr_set_size(struct bpf_dynptr_kern *ptr, u32 new_size)
1543  {
1544  	u32 metadata = ptr->size & ~DYNPTR_SIZE_MASK;
1545  
1546  	ptr->size = new_size | metadata;
1547  }
1548  
1549  int bpf_dynptr_check_size(u32 size)
1550  {
1551  	return size > DYNPTR_MAX_SIZE ? -E2BIG : 0;
1552  }
1553  
1554  void bpf_dynptr_init(struct bpf_dynptr_kern *ptr, void *data,
1555  		     enum bpf_dynptr_type type, u32 offset, u32 size)
1556  {
1557  	ptr->data = data;
1558  	ptr->offset = offset;
1559  	ptr->size = size;
1560  	bpf_dynptr_set_type(ptr, type);
1561  }
1562  
1563  void bpf_dynptr_set_null(struct bpf_dynptr_kern *ptr)
1564  {
1565  	memset(ptr, 0, sizeof(*ptr));
1566  }
1567  
1568  static int bpf_dynptr_check_off_len(const struct bpf_dynptr_kern *ptr, u32 offset, u32 len)
1569  {
1570  	u32 size = __bpf_dynptr_size(ptr);
1571  
1572  	if (len > size || offset > size - len)
1573  		return -E2BIG;
1574  
1575  	return 0;
1576  }
1577  
1578  BPF_CALL_4(bpf_dynptr_from_mem, void *, data, u32, size, u64, flags, struct bpf_dynptr_kern *, ptr)
1579  {
1580  	int err;
1581  
1582  	BTF_TYPE_EMIT(struct bpf_dynptr);
1583  
1584  	err = bpf_dynptr_check_size(size);
1585  	if (err)
1586  		goto error;
1587  
1588  	/* flags is currently unsupported */
1589  	if (flags) {
1590  		err = -EINVAL;
1591  		goto error;
1592  	}
1593  
1594  	bpf_dynptr_init(ptr, data, BPF_DYNPTR_TYPE_LOCAL, 0, size);
1595  
1596  	return 0;
1597  
1598  error:
1599  	bpf_dynptr_set_null(ptr);
1600  	return err;
1601  }
1602  
1603  static const struct bpf_func_proto bpf_dynptr_from_mem_proto = {
1604  	.func		= bpf_dynptr_from_mem,
1605  	.gpl_only	= false,
1606  	.ret_type	= RET_INTEGER,
1607  	.arg1_type	= ARG_PTR_TO_UNINIT_MEM,
1608  	.arg2_type	= ARG_CONST_SIZE_OR_ZERO,
1609  	.arg3_type	= ARG_ANYTHING,
1610  	.arg4_type	= ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_LOCAL | MEM_UNINIT | MEM_WRITE,
1611  };
1612  
1613  BPF_CALL_5(bpf_dynptr_read, void *, dst, u32, len, const struct bpf_dynptr_kern *, src,
1614  	   u32, offset, u64, flags)
1615  {
1616  	enum bpf_dynptr_type type;
1617  	int err;
1618  
1619  	if (!src->data || flags)
1620  		return -EINVAL;
1621  
1622  	err = bpf_dynptr_check_off_len(src, offset, len);
1623  	if (err)
1624  		return err;
1625  
1626  	type = bpf_dynptr_get_type(src);
1627  
1628  	switch (type) {
1629  	case BPF_DYNPTR_TYPE_LOCAL:
1630  	case BPF_DYNPTR_TYPE_RINGBUF:
1631  		/* Source and destination may possibly overlap, hence use memmove to
1632  		 * copy the data. E.g. bpf_dynptr_from_mem may create two dynptr
1633  		 * pointing to overlapping PTR_TO_MAP_VALUE regions.
1634  		 */
1635  		memmove(dst, src->data + src->offset + offset, len);
1636  		return 0;
1637  	case BPF_DYNPTR_TYPE_SKB:
1638  		return __bpf_skb_load_bytes(src->data, src->offset + offset, dst, len);
1639  	case BPF_DYNPTR_TYPE_XDP:
1640  		return __bpf_xdp_load_bytes(src->data, src->offset + offset, dst, len);
1641  	default:
1642  		WARN_ONCE(true, "bpf_dynptr_read: unknown dynptr type %d\n", type);
1643  		return -EFAULT;
1644  	}
1645  }
1646  
1647  static const struct bpf_func_proto bpf_dynptr_read_proto = {
1648  	.func		= bpf_dynptr_read,
1649  	.gpl_only	= false,
1650  	.ret_type	= RET_INTEGER,
1651  	.arg1_type	= ARG_PTR_TO_UNINIT_MEM,
1652  	.arg2_type	= ARG_CONST_SIZE_OR_ZERO,
1653  	.arg3_type	= ARG_PTR_TO_DYNPTR | MEM_RDONLY,
1654  	.arg4_type	= ARG_ANYTHING,
1655  	.arg5_type	= ARG_ANYTHING,
1656  };
1657  
1658  BPF_CALL_5(bpf_dynptr_write, const struct bpf_dynptr_kern *, dst, u32, offset, void *, src,
1659  	   u32, len, u64, flags)
1660  {
1661  	enum bpf_dynptr_type type;
1662  	int err;
1663  
1664  	if (!dst->data || __bpf_dynptr_is_rdonly(dst))
1665  		return -EINVAL;
1666  
1667  	err = bpf_dynptr_check_off_len(dst, offset, len);
1668  	if (err)
1669  		return err;
1670  
1671  	type = bpf_dynptr_get_type(dst);
1672  
1673  	switch (type) {
1674  	case BPF_DYNPTR_TYPE_LOCAL:
1675  	case BPF_DYNPTR_TYPE_RINGBUF:
1676  		if (flags)
1677  			return -EINVAL;
1678  		/* Source and destination may possibly overlap, hence use memmove to
1679  		 * copy the data. E.g. bpf_dynptr_from_mem may create two dynptr
1680  		 * pointing to overlapping PTR_TO_MAP_VALUE regions.
1681  		 */
1682  		memmove(dst->data + dst->offset + offset, src, len);
1683  		return 0;
1684  	case BPF_DYNPTR_TYPE_SKB:
1685  		return __bpf_skb_store_bytes(dst->data, dst->offset + offset, src, len,
1686  					     flags);
1687  	case BPF_DYNPTR_TYPE_XDP:
1688  		if (flags)
1689  			return -EINVAL;
1690  		return __bpf_xdp_store_bytes(dst->data, dst->offset + offset, src, len);
1691  	default:
1692  		WARN_ONCE(true, "bpf_dynptr_write: unknown dynptr type %d\n", type);
1693  		return -EFAULT;
1694  	}
1695  }
1696  
1697  static const struct bpf_func_proto bpf_dynptr_write_proto = {
1698  	.func		= bpf_dynptr_write,
1699  	.gpl_only	= false,
1700  	.ret_type	= RET_INTEGER,
1701  	.arg1_type	= ARG_PTR_TO_DYNPTR | MEM_RDONLY,
1702  	.arg2_type	= ARG_ANYTHING,
1703  	.arg3_type	= ARG_PTR_TO_MEM | MEM_RDONLY,
1704  	.arg4_type	= ARG_CONST_SIZE_OR_ZERO,
1705  	.arg5_type	= ARG_ANYTHING,
1706  };
1707  
1708  BPF_CALL_3(bpf_dynptr_data, const struct bpf_dynptr_kern *, ptr, u32, offset, u32, len)
1709  {
1710  	enum bpf_dynptr_type type;
1711  	int err;
1712  
1713  	if (!ptr->data)
1714  		return 0;
1715  
1716  	err = bpf_dynptr_check_off_len(ptr, offset, len);
1717  	if (err)
1718  		return 0;
1719  
1720  	if (__bpf_dynptr_is_rdonly(ptr))
1721  		return 0;
1722  
1723  	type = bpf_dynptr_get_type(ptr);
1724  
1725  	switch (type) {
1726  	case BPF_DYNPTR_TYPE_LOCAL:
1727  	case BPF_DYNPTR_TYPE_RINGBUF:
1728  		return (unsigned long)(ptr->data + ptr->offset + offset);
1729  	case BPF_DYNPTR_TYPE_SKB:
1730  	case BPF_DYNPTR_TYPE_XDP:
1731  		/* skb and xdp dynptrs should use bpf_dynptr_slice / bpf_dynptr_slice_rdwr */
1732  		return 0;
1733  	default:
1734  		WARN_ONCE(true, "bpf_dynptr_data: unknown dynptr type %d\n", type);
1735  		return 0;
1736  	}
1737  }
1738  
1739  static const struct bpf_func_proto bpf_dynptr_data_proto = {
1740  	.func		= bpf_dynptr_data,
1741  	.gpl_only	= false,
1742  	.ret_type	= RET_PTR_TO_DYNPTR_MEM_OR_NULL,
1743  	.arg1_type	= ARG_PTR_TO_DYNPTR | MEM_RDONLY,
1744  	.arg2_type	= ARG_ANYTHING,
1745  	.arg3_type	= ARG_CONST_ALLOC_SIZE_OR_ZERO,
1746  };
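
/* Program-side usage sketch (illustrative only and not part of this file;
 * assumes a BPF_MAP_TYPE_RINGBUF map named 'rb' and the bpf_helpers.h
 * wrappers). Note that bpf_dynptr_data() only returns a direct pointer for
 * local and ringbuf dynptrs, as implemented above:
 *
 *	struct bpf_dynptr ptr;
 *	__u32 *slot;
 *
 *	if (bpf_ringbuf_reserve_dynptr(&rb, sizeof(*slot), 0, &ptr)) {
 *		bpf_ringbuf_discard_dynptr(&ptr, 0);	// must still release
 *		return 0;
 *	}
 *	slot = bpf_dynptr_data(&ptr, 0, sizeof(*slot));
 *	if (slot)
 *		*slot = 42;
 *	bpf_ringbuf_submit_dynptr(&ptr, 0);
 */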
1747  
1748  const struct bpf_func_proto bpf_get_current_task_proto __weak;
1749  const struct bpf_func_proto bpf_get_current_task_btf_proto __weak;
1750  const struct bpf_func_proto bpf_probe_read_user_proto __weak;
1751  const struct bpf_func_proto bpf_probe_read_user_str_proto __weak;
1752  const struct bpf_func_proto bpf_probe_read_kernel_proto __weak;
1753  const struct bpf_func_proto bpf_probe_read_kernel_str_proto __weak;
1754  const struct bpf_func_proto bpf_task_pt_regs_proto __weak;
1755  
1756  const struct bpf_func_proto *
1757  bpf_base_func_proto(enum bpf_func_id func_id)
1758  {
1759  	switch (func_id) {
1760  	case BPF_FUNC_map_lookup_elem:
1761  		return &bpf_map_lookup_elem_proto;
1762  	case BPF_FUNC_map_update_elem:
1763  		return &bpf_map_update_elem_proto;
1764  	case BPF_FUNC_map_delete_elem:
1765  		return &bpf_map_delete_elem_proto;
1766  	case BPF_FUNC_map_push_elem:
1767  		return &bpf_map_push_elem_proto;
1768  	case BPF_FUNC_map_pop_elem:
1769  		return &bpf_map_pop_elem_proto;
1770  	case BPF_FUNC_map_peek_elem:
1771  		return &bpf_map_peek_elem_proto;
1772  	case BPF_FUNC_map_lookup_percpu_elem:
1773  		return &bpf_map_lookup_percpu_elem_proto;
1774  	case BPF_FUNC_get_prandom_u32:
1775  		return &bpf_get_prandom_u32_proto;
1776  	case BPF_FUNC_get_smp_processor_id:
1777  		return &bpf_get_raw_smp_processor_id_proto;
1778  	case BPF_FUNC_get_numa_node_id:
1779  		return &bpf_get_numa_node_id_proto;
1780  	case BPF_FUNC_tail_call:
1781  		return &bpf_tail_call_proto;
1782  	case BPF_FUNC_ktime_get_ns:
1783  		return &bpf_ktime_get_ns_proto;
1784  	case BPF_FUNC_ktime_get_boot_ns:
1785  		return &bpf_ktime_get_boot_ns_proto;
1786  	case BPF_FUNC_ktime_get_tai_ns:
1787  		return &bpf_ktime_get_tai_ns_proto;
1788  	case BPF_FUNC_ringbuf_output:
1789  		return &bpf_ringbuf_output_proto;
1790  	case BPF_FUNC_ringbuf_reserve:
1791  		return &bpf_ringbuf_reserve_proto;
1792  	case BPF_FUNC_ringbuf_submit:
1793  		return &bpf_ringbuf_submit_proto;
1794  	case BPF_FUNC_ringbuf_discard:
1795  		return &bpf_ringbuf_discard_proto;
1796  	case BPF_FUNC_ringbuf_query:
1797  		return &bpf_ringbuf_query_proto;
1798  	case BPF_FUNC_strncmp:
1799  		return &bpf_strncmp_proto;
1800  	case BPF_FUNC_strtol:
1801  		return &bpf_strtol_proto;
1802  	case BPF_FUNC_strtoul:
1803  		return &bpf_strtoul_proto;
1804  	default:
1805  		break;
1806  	}
1807  
1808  	if (!bpf_capable())
1809  		return NULL;
1810  
1811  	switch (func_id) {
1812  	case BPF_FUNC_spin_lock:
1813  		return &bpf_spin_lock_proto;
1814  	case BPF_FUNC_spin_unlock:
1815  		return &bpf_spin_unlock_proto;
1816  	case BPF_FUNC_jiffies64:
1817  		return &bpf_jiffies64_proto;
1818  	case BPF_FUNC_per_cpu_ptr:
1819  		return &bpf_per_cpu_ptr_proto;
1820  	case BPF_FUNC_this_cpu_ptr:
1821  		return &bpf_this_cpu_ptr_proto;
1822  	case BPF_FUNC_timer_init:
1823  		return &bpf_timer_init_proto;
1824  	case BPF_FUNC_timer_set_callback:
1825  		return &bpf_timer_set_callback_proto;
1826  	case BPF_FUNC_timer_start:
1827  		return &bpf_timer_start_proto;
1828  	case BPF_FUNC_timer_cancel:
1829  		return &bpf_timer_cancel_proto;
1830  	case BPF_FUNC_kptr_xchg:
1831  		return &bpf_kptr_xchg_proto;
1832  	case BPF_FUNC_for_each_map_elem:
1833  		return &bpf_for_each_map_elem_proto;
1834  	case BPF_FUNC_loop:
1835  		return &bpf_loop_proto;
1836  	case BPF_FUNC_user_ringbuf_drain:
1837  		return &bpf_user_ringbuf_drain_proto;
1838  	case BPF_FUNC_ringbuf_reserve_dynptr:
1839  		return &bpf_ringbuf_reserve_dynptr_proto;
1840  	case BPF_FUNC_ringbuf_submit_dynptr:
1841  		return &bpf_ringbuf_submit_dynptr_proto;
1842  	case BPF_FUNC_ringbuf_discard_dynptr:
1843  		return &bpf_ringbuf_discard_dynptr_proto;
1844  	case BPF_FUNC_dynptr_from_mem:
1845  		return &bpf_dynptr_from_mem_proto;
1846  	case BPF_FUNC_dynptr_read:
1847  		return &bpf_dynptr_read_proto;
1848  	case BPF_FUNC_dynptr_write:
1849  		return &bpf_dynptr_write_proto;
1850  	case BPF_FUNC_dynptr_data:
1851  		return &bpf_dynptr_data_proto;
1852  #ifdef CONFIG_CGROUPS
1853  	case BPF_FUNC_cgrp_storage_get:
1854  		return &bpf_cgrp_storage_get_proto;
1855  	case BPF_FUNC_cgrp_storage_delete:
1856  		return &bpf_cgrp_storage_delete_proto;
1857  	case BPF_FUNC_get_current_cgroup_id:
1858  		return &bpf_get_current_cgroup_id_proto;
1859  	case BPF_FUNC_get_current_ancestor_cgroup_id:
1860  		return &bpf_get_current_ancestor_cgroup_id_proto;
1861  #endif
1862  	default:
1863  		break;
1864  	}
1865  
1866  	if (!perfmon_capable())
1867  		return NULL;
1868  
1869  	switch (func_id) {
1870  	case BPF_FUNC_trace_printk:
1871  		return bpf_get_trace_printk_proto();
1872  	case BPF_FUNC_get_current_task:
1873  		return &bpf_get_current_task_proto;
1874  	case BPF_FUNC_get_current_task_btf:
1875  		return &bpf_get_current_task_btf_proto;
1876  	case BPF_FUNC_probe_read_user:
1877  		return &bpf_probe_read_user_proto;
1878  	case BPF_FUNC_probe_read_kernel:
1879  		return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ?
1880  		       NULL : &bpf_probe_read_kernel_proto;
1881  	case BPF_FUNC_probe_read_user_str:
1882  		return &bpf_probe_read_user_str_proto;
1883  	case BPF_FUNC_probe_read_kernel_str:
1884  		return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ?
1885  		       NULL : &bpf_probe_read_kernel_str_proto;
1886  	case BPF_FUNC_snprintf_btf:
1887  		return &bpf_snprintf_btf_proto;
1888  	case BPF_FUNC_snprintf:
1889  		return &bpf_snprintf_proto;
1890  	case BPF_FUNC_task_pt_regs:
1891  		return &bpf_task_pt_regs_proto;
1892  	case BPF_FUNC_trace_vprintk:
1893  		return bpf_get_trace_vprintk_proto();
1894  	default:
1895  		return NULL;
1896  	}
1897  }
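
/* Sketch of how a subsystem's verifier_ops->get_func_proto() callback
 * typically chains to the fallback above (illustrative only; the names
 * 'example_func_proto', 'BPF_FUNC_example_helper' and
 * 'bpf_example_helper_proto' are hypothetical and not part of this file):
 *
 *	static const struct bpf_func_proto *
 *	example_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 *	{
 *		switch (func_id) {
 *		case BPF_FUNC_example_helper:
 *			return &bpf_example_helper_proto;	// subsystem-specific
 *		default:
 *			return bpf_base_func_proto(func_id);	// common helpers
 *		}
 *	}
 */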
1898  
1899  void __bpf_obj_drop_impl(void *p, const struct btf_record *rec);
1900  
1901  void bpf_list_head_free(const struct btf_field *field, void *list_head,
1902  			struct bpf_spin_lock *spin_lock)
1903  {
1904  	struct list_head *head = list_head, *orig_head = list_head;
1905  
1906  	BUILD_BUG_ON(sizeof(struct list_head) > sizeof(struct bpf_list_head));
1907  	BUILD_BUG_ON(__alignof__(struct list_head) > __alignof__(struct bpf_list_head));
1908  
1909  	/* Do the actual list draining outside the lock to not hold the lock for
1910  	 * too long, and also prevent deadlocks if tracing programs end up
1911  	 * executing on entry/exit of functions called inside the critical
1912  	 * section, and end up doing map ops that call bpf_list_head_free for
1913  	 * the same map value again.
1914  	 */
1915  	__bpf_spin_lock_irqsave(spin_lock);
1916  	if (!head->next || list_empty(head))
1917  		goto unlock;
1918  	head = head->next;
1919  unlock:
1920  	INIT_LIST_HEAD(orig_head);
1921  	__bpf_spin_unlock_irqrestore(spin_lock);
1922  
1923  	while (head != orig_head) {
1924  		void *obj = head;
1925  
1926  		obj -= field->graph_root.node_offset;
1927  		head = head->next;
1928  		/* The contained type can also have resources, including a
1929  		 * bpf_list_head which needs to be freed.
1930  		 */
1931  		migrate_disable();
1932  		__bpf_obj_drop_impl(obj, field->graph_root.value_rec);
1933  		migrate_enable();
1934  	}
1935  }
1936  
1937  /* Like rbtree_postorder_for_each_entry_safe, but 'pos' and 'n' are
1938   * 'rb_node *', so the field name of the rb_node within the containing
1939   * struct is not needed.
1940   *
1941   * Since bpf_rb_tree's node type has a corresponding struct btf_field with
1942   * graph_root.node_offset, it's not necessary to know the field name or the
1943   * type of the node struct.
1944   */
1945  #define bpf_rbtree_postorder_for_each_entry_safe(pos, n, root) \
1946  	for (pos = rb_first_postorder(root); \
1947  	    pos && ({ n = rb_next_postorder(pos); 1; }); \
1948  	    pos = n)
1949  
1950  void bpf_rb_root_free(const struct btf_field *field, void *rb_root,
1951  		      struct bpf_spin_lock *spin_lock)
1952  {
1953  	struct rb_root_cached orig_root, *root = rb_root;
1954  	struct rb_node *pos, *n;
1955  	void *obj;
1956  
1957  	BUILD_BUG_ON(sizeof(struct rb_root_cached) > sizeof(struct bpf_rb_root));
1958  	BUILD_BUG_ON(__alignof__(struct rb_root_cached) > __alignof__(struct bpf_rb_root));
1959  
1960  	__bpf_spin_lock_irqsave(spin_lock);
1961  	orig_root = *root;
1962  	*root = RB_ROOT_CACHED;
1963  	__bpf_spin_unlock_irqrestore(spin_lock);
1964  
1965  	bpf_rbtree_postorder_for_each_entry_safe(pos, n, &orig_root.rb_root) {
1966  		obj = pos;
1967  		obj -= field->graph_root.node_offset;
1968  
1969  
1970  		migrate_disable();
1971  		__bpf_obj_drop_impl(obj, field->graph_root.value_rec);
1972  		migrate_enable();
1973  	}
1974  }
1975  
1976  __diag_push();
1977  __diag_ignore_all("-Wmissing-prototypes",
1978  		  "Global functions as their definitions will be in vmlinux BTF");
1979  
1980  __bpf_kfunc void *bpf_obj_new_impl(u64 local_type_id__k, void *meta__ign)
1981  {
1982  	struct btf_struct_meta *meta = meta__ign;
1983  	u64 size = local_type_id__k;
1984  	void *p;
1985  
1986  	p = bpf_mem_alloc(&bpf_global_ma, size);
1987  	if (!p)
1988  		return NULL;
1989  	if (meta)
1990  		bpf_obj_init(meta->record, p);
1991  	return p;
1992  }
1993  
1994  /* Must be called under migrate_disable(), as required by bpf_mem_free */
1995  void __bpf_obj_drop_impl(void *p, const struct btf_record *rec)
1996  {
1997  	if (rec && rec->refcount_off >= 0 &&
1998  	    !refcount_dec_and_test((refcount_t *)(p + rec->refcount_off))) {
1999  		/* Object is refcounted and refcount_dec didn't result in 0
2000  		 * refcount. Return without freeing the object
2001  		 */
2002  		return;
2003  	}
2004  
2005  	if (rec)
2006  		bpf_obj_free_fields(rec, p);
2007  
2008  	if (rec && rec->refcount_off >= 0)
2009  		bpf_mem_free_rcu(&bpf_global_ma, p);
2010  	else
2011  		bpf_mem_free(&bpf_global_ma, p);
2012  }
2013  
2014  __bpf_kfunc void bpf_obj_drop_impl(void *p__alloc, void *meta__ign)
2015  {
2016  	struct btf_struct_meta *meta = meta__ign;
2017  	void *p = p__alloc;
2018  
2019  	__bpf_obj_drop_impl(p, meta ? meta->record : NULL);
2020  }
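
/* Program-side usage sketch (illustrative only and not part of this file;
 * assumes the bpf_obj_new()/bpf_obj_drop() wrappers from the selftests'
 * bpf_experimental.h, which expand to the *_impl() kfuncs above):
 *
 *	struct foo { long data; struct bpf_list_node node; };
 *
 *	struct foo *f = bpf_obj_new(typeof(*f));
 *	if (!f)
 *		return 0;
 *	f->data = 13;
 *	// if it is never inserted into a collection, drop the owned ref //
 *	bpf_obj_drop(f);
 */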
2021  
2022  __bpf_kfunc void *bpf_refcount_acquire_impl(void *p__refcounted_kptr, void *meta__ign)
2023  {
2024  	struct btf_struct_meta *meta = meta__ign;
2025  	struct bpf_refcount *ref;
2026  
2027  	/* Could just cast directly to refcount_t *, but need some code using
2028  	 * bpf_refcount type so that it is emitted in vmlinux BTF
2029  	 */
2030  	ref = (struct bpf_refcount *)(p__refcounted_kptr + meta->record->refcount_off);
2031  	if (!refcount_inc_not_zero((refcount_t *)ref))
2032  		return NULL;
2033  
2034  	/* Verifier strips KF_RET_NULL if input is owned ref, see is_kfunc_ret_null
2035  	 * in verifier.c
2036  	 */
2037  	return (void *)p__refcounted_kptr;
2038  }
2039  
2040  static int __bpf_list_add(struct bpf_list_node_kern *node,
2041  			  struct bpf_list_head *head,
2042  			  bool tail, struct btf_record *rec, u64 off)
2043  {
2044  	struct list_head *n = &node->list_head, *h = (void *)head;
2045  
2046  	/* If list_head was 0-initialized by map, bpf_obj_init_field wasn't
2047  	 * called on its fields, so init here
2048  	 */
2049  	if (unlikely(!h->next))
2050  		INIT_LIST_HEAD(h);
2051  
2052  	/* node->owner != NULL implies !list_empty(n), no need to separately
2053  	 * check the latter
2054  	 */
2055  	if (cmpxchg(&node->owner, NULL, BPF_PTR_POISON)) {
2056  		/* Only called from BPF prog, no need to migrate_disable */
2057  		__bpf_obj_drop_impl((void *)n - off, rec);
2058  		return -EINVAL;
2059  	}
2060  
2061  	tail ? list_add_tail(n, h) : list_add(n, h);
2062  	WRITE_ONCE(node->owner, head);
2063  
2064  	return 0;
2065  }
2066  
2067  __bpf_kfunc int bpf_list_push_front_impl(struct bpf_list_head *head,
2068  					 struct bpf_list_node *node,
2069  					 void *meta__ign, u64 off)
2070  {
2071  	struct bpf_list_node_kern *n = (void *)node;
2072  	struct btf_struct_meta *meta = meta__ign;
2073  
2074  	return __bpf_list_add(n, head, false, meta ? meta->record : NULL, off);
2075  }
2076  
2077  __bpf_kfunc int bpf_list_push_back_impl(struct bpf_list_head *head,
2078  					struct bpf_list_node *node,
2079  					void *meta__ign, u64 off)
2080  {
2081  	struct bpf_list_node_kern *n = (void *)node;
2082  	struct btf_struct_meta *meta = meta__ign;
2083  
2084  	return __bpf_list_add(n, head, true, meta ? meta->record : NULL, off);
2085  }
2086  
2087  static struct bpf_list_node *__bpf_list_del(struct bpf_list_head *head, bool tail)
2088  {
2089  	struct list_head *n, *h = (void *)head;
2090  	struct bpf_list_node_kern *node;
2091  
2092  	/* If list_head was 0-initialized by map, bpf_obj_init_field wasn't
2093  	 * called on its fields, so init here
2094  	 */
2095  	if (unlikely(!h->next))
2096  		INIT_LIST_HEAD(h);
2097  	if (list_empty(h))
2098  		return NULL;
2099  
2100  	n = tail ? h->prev : h->next;
2101  	node = container_of(n, struct bpf_list_node_kern, list_head);
2102  	if (WARN_ON_ONCE(READ_ONCE(node->owner) != head))
2103  		return NULL;
2104  
2105  	list_del_init(n);
2106  	WRITE_ONCE(node->owner, NULL);
2107  	return (struct bpf_list_node *)n;
2108  }
2109  
2110  __bpf_kfunc struct bpf_list_node *bpf_list_pop_front(struct bpf_list_head *head)
2111  {
2112  	return __bpf_list_del(head, false);
2113  }
2114  
2115  __bpf_kfunc struct bpf_list_node *bpf_list_pop_back(struct bpf_list_head *head)
2116  {
2117  	return __bpf_list_del(head, true);
2118  }
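
/* Program-side usage sketch for the list kfuncs above (illustrative only and
 * not part of this file; assumes the bpf_experimental.h wrappers and a map
 * value containing 'struct bpf_spin_lock lock' and 'struct bpf_list_head head
 * __contains(foo, node)'):
 *
 *	struct foo *f = bpf_obj_new(typeof(*f));
 *	struct bpf_list_node *n;
 *
 *	if (!f)
 *		return 0;
 *	bpf_spin_lock(&lock);
 *	bpf_list_push_front(&head, &f->node);	// list now owns f
 *	n = bpf_list_pop_front(&head);
 *	bpf_spin_unlock(&lock);
 *	if (n)
 *		bpf_obj_drop(container_of(n, struct foo, node));
 */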
2119  
2120  __bpf_kfunc struct bpf_rb_node *bpf_rbtree_remove(struct bpf_rb_root *root,
2121  						  struct bpf_rb_node *node)
2122  {
2123  	struct bpf_rb_node_kern *node_internal = (struct bpf_rb_node_kern *)node;
2124  	struct rb_root_cached *r = (struct rb_root_cached *)root;
2125  	struct rb_node *n = &node_internal->rb_node;
2126  
2127  	/* node_internal->owner != root implies either RB_EMPTY_NODE(n) or
2128  	 * n is owned by some other tree. No need to check RB_EMPTY_NODE(n)
2129  	 */
2130  	if (READ_ONCE(node_internal->owner) != root)
2131  		return NULL;
2132  
2133  	rb_erase_cached(n, r);
2134  	RB_CLEAR_NODE(n);
2135  	WRITE_ONCE(node_internal->owner, NULL);
2136  	return (struct bpf_rb_node *)n;
2137  }
2138  
2139  /* Need to copy rbtree_add_cached's logic here because our 'less' is a BPF
2140   * program
2141   */
2142  static int __bpf_rbtree_add(struct bpf_rb_root *root,
2143  			    struct bpf_rb_node_kern *node,
2144  			    void *less, struct btf_record *rec, u64 off)
2145  {
2146  	struct rb_node **link = &((struct rb_root_cached *)root)->rb_root.rb_node;
2147  	struct rb_node *parent = NULL, *n = &node->rb_node;
2148  	bpf_callback_t cb = (bpf_callback_t)less;
2149  	bool leftmost = true;
2150  
2151  	/* node->owner != NULL implies !RB_EMPTY_NODE(n), no need to separately
2152  	 * check the latter
2153  	 */
2154  	if (cmpxchg(&node->owner, NULL, BPF_PTR_POISON)) {
2155  		/* Only called from BPF prog, no need to migrate_disable */
2156  		__bpf_obj_drop_impl((void *)n - off, rec);
2157  		return -EINVAL;
2158  	}
2159  
2160  	while (*link) {
2161  		parent = *link;
2162  		if (cb((uintptr_t)node, (uintptr_t)parent, 0, 0, 0)) {
2163  			link = &parent->rb_left;
2164  		} else {
2165  			link = &parent->rb_right;
2166  			leftmost = false;
2167  		}
2168  	}
2169  
2170  	rb_link_node(n, parent, link);
2171  	rb_insert_color_cached(n, (struct rb_root_cached *)root, leftmost);
2172  	WRITE_ONCE(node->owner, root);
2173  	return 0;
2174  }
2175  
2176  __bpf_kfunc int bpf_rbtree_add_impl(struct bpf_rb_root *root, struct bpf_rb_node *node,
2177  				    bool (less)(struct bpf_rb_node *a, const struct bpf_rb_node *b),
2178  				    void *meta__ign, u64 off)
2179  {
2180  	struct btf_struct_meta *meta = meta__ign;
2181  	struct bpf_rb_node_kern *n = (void *)node;
2182  
2183  	return __bpf_rbtree_add(root, n, (void *)less, meta ? meta->record : NULL, off);
2184  }
2185  
2186  __bpf_kfunc struct bpf_rb_node *bpf_rbtree_first(struct bpf_rb_root *root)
2187  {
2188  	struct rb_root_cached *r = (struct rb_root_cached *)root;
2189  
2190  	return (struct bpf_rb_node *)rb_first_cached(r);
2191  }
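
/* Program-side usage sketch for the rbtree kfuncs above (illustrative only
 * and not part of this file; same bpf_experimental.h assumptions as the list
 * example, with 'struct bpf_rb_root root __contains(foo, rbn)' in the map
 * value and an 'rbn' member of type struct bpf_rb_node in struct foo):
 *
 *	static bool less(struct bpf_rb_node *a, const struct bpf_rb_node *b)
 *	{
 *		return container_of(a, struct foo, rbn)->data <
 *		       container_of(b, struct foo, rbn)->data;
 *	}
 *
 *	struct foo *f = bpf_obj_new(typeof(*f));
 *	if (!f)
 *		return 0;
 *	f->data = 42;
 *	bpf_spin_lock(&lock);
 *	bpf_rbtree_add(&root, &f->rbn, less);	// tree now owns f
 *	bpf_spin_unlock(&lock);
 */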
2192  
2193  /**
2194   * bpf_task_acquire - Acquire a reference to a task. A task acquired by this
2195   * kfunc which is not stored in a map as a kptr must be released by calling
2196   * bpf_task_release().
2197   * @p: The task on which a reference is being acquired.
2198   */
2199  __bpf_kfunc struct task_struct *bpf_task_acquire(struct task_struct *p)
2200  {
2201  	if (refcount_inc_not_zero(&p->rcu_users))
2202  		return p;
2203  	return NULL;
2204  }
2205  
2206  /**
2207   * bpf_task_release - Release the reference acquired on a task.
2208   * @p: The task on which a reference is being released.
2209   */
2210  __bpf_kfunc void bpf_task_release(struct task_struct *p)
2211  {
2212  	put_task_struct_rcu_user(p);
2213  }
2214  
2215  #ifdef CONFIG_CGROUPS
2216  /**
2217   * bpf_cgroup_acquire - Acquire a reference to a cgroup. A cgroup acquired by
2218   * this kfunc which is not stored in a map as a kptr must be released by
2219   * calling bpf_cgroup_release().
2220   * @cgrp: The cgroup on which a reference is being acquired.
2221   */
2222  __bpf_kfunc struct cgroup *bpf_cgroup_acquire(struct cgroup *cgrp)
2223  {
2224  	return cgroup_tryget(cgrp) ? cgrp : NULL;
2225  }
2226  
2227  /**
2228   * bpf_cgroup_release - Release the reference acquired on a cgroup.
2229   * If this kfunc is invoked in an RCU read region, the cgroup is guaranteed to
2230   * not be freed until the current grace period has ended, even if its refcount
2231   * drops to 0.
2232   * @cgrp: The cgroup on which a reference is being released.
2233   */
2234  __bpf_kfunc void bpf_cgroup_release(struct cgroup *cgrp)
2235  {
2236  	cgroup_put(cgrp);
2237  }
2238  
2239  /**
2240   * bpf_cgroup_ancestor - Perform a lookup on an entry in a cgroup's ancestor
2241   * array. A cgroup returned by this kfunc which is not subsequently stored in a
2242   * map must be released by calling bpf_cgroup_release().
2243   * @cgrp: The cgroup for which we're performing a lookup.
2244   * @level: The level of ancestor to look up.
2245   */
2246  __bpf_kfunc struct cgroup *bpf_cgroup_ancestor(struct cgroup *cgrp, int level)
2247  {
2248  	struct cgroup *ancestor;
2249  
2250  	if (level > cgrp->level || level < 0)
2251  		return NULL;
2252  
2253  	/* cgrp's refcnt could be 0 here, but ancestors can still be accessed */
2254  	ancestor = cgrp->ancestors[level];
2255  	if (!cgroup_tryget(ancestor))
2256  		return NULL;
2257  	return ancestor;
2258  }
2259  
2260  /**
2261   * bpf_cgroup_from_id - Find a cgroup from its ID. A cgroup returned by this
2262   * kfunc which is not subsequently stored in a map must be released by calling
2263   * bpf_cgroup_release().
2264   * @cgid: cgroup id.
2265   */
2266  __bpf_kfunc struct cgroup *bpf_cgroup_from_id(u64 cgid)
2267  {
2268  	struct cgroup *cgrp;
2269  
2270  	cgrp = cgroup_get_from_id(cgid);
2271  	if (IS_ERR(cgrp))
2272  		return NULL;
2273  	return cgrp;
2274  }
2275  
2276  /**
2277   * bpf_task_under_cgroup - Wrap task_under_cgroup_hierarchy() as a kfunc and
2278   * test the task's membership of a cgroup's ancestry.
2279   * @task: the task to be tested
2280   * @ancestor: possible ancestor of @task's cgroup
2281   *
2282   * Tests whether @task's default cgroup hierarchy is a descendant of @ancestor.
2283   * It follows all the same rules as cgroup_is_descendant, and only applies
2284   * to the default hierarchy.
2285   */
2286  __bpf_kfunc long bpf_task_under_cgroup(struct task_struct *task,
2287  				       struct cgroup *ancestor)
2288  {
2289  	long ret;
2290  
2291  	rcu_read_lock();
2292  	ret = task_under_cgroup_hierarchy(task, ancestor);
2293  	rcu_read_unlock();
2294  	return ret;
2295  }
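
/* Program-side usage sketch for the cgroup kfuncs above (illustrative only
 * and not part of this file; 'cgid' would typically be provided by user
 * space):
 *
 *	struct cgroup *cgrp = bpf_cgroup_from_id(cgid);
 *	long in_cg;
 *
 *	if (!cgrp)
 *		return 0;
 *	in_cg = bpf_task_under_cgroup(bpf_get_current_task_btf(), cgrp);
 *	bpf_cgroup_release(cgrp);	// drop the acquired reference
 *	return in_cg;
 */
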
2296  #endif /* CONFIG_CGROUPS */
2297  
2298  /**
2299   * bpf_task_from_pid - Find a struct task_struct from its pid by looking it up
2300   * in the root pid namespace idr. If a task is returned, it must either be
2301   * stored in a map, or released with bpf_task_release().
2302   * @pid: The pid of the task being looked up.
2303   */
2304  __bpf_kfunc struct task_struct *bpf_task_from_pid(s32 pid)
2305  {
2306  	struct task_struct *p;
2307  
2308  	rcu_read_lock();
2309  	p = find_task_by_pid_ns(pid, &init_pid_ns);
2310  	if (p)
2311  		p = bpf_task_acquire(p);
2312  	rcu_read_unlock();
2313  
2314  	return p;
2315  }
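
/* Program-side usage sketch (illustrative only and not part of this file;
 * 'target_pid' is assumed to be supplied by user space):
 *
 *	struct task_struct *t = bpf_task_from_pid(target_pid);
 *
 *	if (!t)
 *		return 0;
 *	bpf_printk("comm=%s", t->comm);
 *	bpf_task_release(t);	// release the acquired reference
 */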
2316  
2317  /**
2318   * bpf_dynptr_slice() - Obtain a read-only pointer to the dynptr data.
2319   * @ptr: The dynptr whose data slice to retrieve
2320   * @offset: Offset into the dynptr
2321   * @buffer__opt: User-provided buffer to copy contents into.  May be NULL
2322   * @buffer__szk: Size (in bytes) of the buffer if present. This is the
2323   *               length of the requested slice. This must be a constant.
2324   *
2325   * For non-skb and non-xdp type dynptrs, there is no difference between
2326   * bpf_dynptr_slice and bpf_dynptr_data.
2327   *
2328   * If buffer__opt is NULL, the call will fail if a copy into the buffer is needed.
2329   *
2330   * If the intention is to write to the data slice, please use
2331   * bpf_dynptr_slice_rdwr.
2332   *
2333   * The user must check that the returned pointer is not null before using it.
2334   *
2335   * Please note that in the case of skb and xdp dynptrs, bpf_dynptr_slice
2336   * does not change the underlying packet data pointers, so a call to
2337   * bpf_dynptr_slice will not invalidate any ctx->data/data_end pointers in
2338   * the bpf program.
2339   *
2340   * Return: NULL if the call failed (e.g. invalid dynptr), otherwise a pointer
2341   * to a read-only data slice (either a direct pointer to the data or a pointer
2342   * to the user-provided buffer, filled with the data, if a direct pointer could
2343   * not be obtained)
2344   */
2345  __bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr_kern *ptr, u32 offset,
2346  				   void *buffer__opt, u32 buffer__szk)
2347  {
2348  	enum bpf_dynptr_type type;
2349  	u32 len = buffer__szk;
2350  	int err;
2351  
2352  	if (!ptr->data)
2353  		return NULL;
2354  
2355  	err = bpf_dynptr_check_off_len(ptr, offset, len);
2356  	if (err)
2357  		return NULL;
2358  
2359  	type = bpf_dynptr_get_type(ptr);
2360  
2361  	switch (type) {
2362  	case BPF_DYNPTR_TYPE_LOCAL:
2363  	case BPF_DYNPTR_TYPE_RINGBUF:
2364  		return ptr->data + ptr->offset + offset;
2365  	case BPF_DYNPTR_TYPE_SKB:
2366  		if (buffer__opt)
2367  			return skb_header_pointer(ptr->data, ptr->offset + offset, len, buffer__opt);
2368  		else
2369  			return skb_pointer_if_linear(ptr->data, ptr->offset + offset, len);
2370  	case BPF_DYNPTR_TYPE_XDP:
2371  	{
2372  		void *xdp_ptr = bpf_xdp_pointer(ptr->data, ptr->offset + offset, len);
2373  		if (!IS_ERR_OR_NULL(xdp_ptr))
2374  			return xdp_ptr;
2375  
2376  		if (!buffer__opt)
2377  			return NULL;
2378  		bpf_xdp_copy_buf(ptr->data, ptr->offset + offset, buffer__opt, len, false);
2379  		return buffer__opt;
2380  	}
2381  	default:
2382  		WARN_ONCE(true, "unknown dynptr type %d\n", type);
2383  		return NULL;
2384  	}
2385  }
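
/* Program-side usage sketch (illustrative only and not part of this file;
 * assumes an skb dynptr created with bpf_dynptr_from_skb() in a tc program,
 * plus <linux/if_ether.h> and bpf_endian.h):
 *
 *	struct ethhdr buf, *eth;
 *
 *	eth = bpf_dynptr_slice(&ptr, 0, &buf, sizeof(buf));
 *	if (!eth)
 *		return TC_ACT_SHOT;
 *	if (eth->h_proto != bpf_htons(ETH_P_IP))
 *		return TC_ACT_OK;	// not IPv4, pass through
 *	// continue parsing the IPv4 header //
 */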
2386  
2387  /**
2388   * bpf_dynptr_slice_rdwr() - Obtain a writable pointer to the dynptr data.
2389   * @ptr: The dynptr whose data slice to retrieve
2390   * @offset: Offset into the dynptr
2391   * @buffer__opt: User-provided buffer to copy contents into. May be NULL
2392   * @buffer__szk: Size (in bytes) of the buffer if present. This is the
2393   *               length of the requested slice. This must be a constant.
2394   *
2395   * For non-skb and non-xdp type dynptrs, there is no difference between
2396   * bpf_dynptr_slice and bpf_dynptr_data.
2397   *
2398   * If buffer__opt is NULL, the call will fail if a copy into the buffer is needed.
2399   *
2400   * The returned pointer is writable and may point either directly to the dynptr
2401   * data at the requested offset or to the buffer if a direct data pointer could
2402   * not be obtained (for example, when the requested slice is in the paged area
2403   * of an skb packet). In the case where the returned pointer is to the buffer,
2404   * the user is responsible for persisting writes by calling bpf_dynptr_write().
2405   * This usually looks something like the following pattern:
2406   *
2407   * struct eth_hdr *eth = bpf_dynptr_slice_rdwr(&dynptr, 0, buffer, sizeof(buffer));
2408   * if (!eth)
2409   *	return TC_ACT_SHOT;
2410   *
2411   * // mutate eth header //
2412   *
2413   * if (eth == buffer)
2414   *	bpf_dynptr_write(&ptr, 0, buffer, sizeof(buffer), 0);
2415   *
2416   * Please note that, as in the example above, the user must check that the
2417   * returned pointer is not null before using it.
2418   *
2419   * Please also note that in the case of skb and xdp dynptrs, bpf_dynptr_slice_rdwr
2420   * does not change the underlying packet data pointers, so a call to
2421   * bpf_dynptr_slice_rdwr will not invalidate any ctx->data/data_end pointers in
2422   * the bpf program.
2423   *
2424   * Return: NULL if the call failed (e.g. invalid dynptr), otherwise a pointer
2425   * to a data slice (either a direct pointer to the data or a pointer to the
2426   * user-provided buffer, filled with the data, if a direct pointer could not
2427   * be obtained)
2428   */
2429  __bpf_kfunc void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr_kern *ptr, u32 offset,
2430  					void *buffer__opt, u32 buffer__szk)
2431  {
2432  	if (!ptr->data || __bpf_dynptr_is_rdonly(ptr))
2433  		return NULL;
2434  
2435  	/* bpf_dynptr_slice_rdwr is the same logic as bpf_dynptr_slice.
2436  	 *
2437  	 * For skb-type dynptrs, it is safe to write into the returned pointer
2438  	 * if the bpf program allows skb data writes. There are two possibilities
2439  	 * that may occur when calling bpf_dynptr_slice_rdwr:
2440  	 *
2441  	 * 1) The requested slice is in the head of the skb. In this case, the
2442  	 * returned pointer is directly to skb data, and if the skb is cloned, the
2443  	 * verifier will have uncloned it (see bpf_unclone_prologue()) already.
2444  	 * The pointer can be directly written into.
2445  	 *
2446  	 * 2) Some portion of the requested slice is in the paged buffer area.
2447  	 * In this case, the requested data will be copied out into the buffer
2448  	 * and the returned pointer will be a pointer to the buffer. The skb
2449  	 * will not be pulled. To persist the write, the user will need to call
2450  	 * bpf_dynptr_write(), which will pull the skb and commit the write.
2451  	 *
2452  	 * Similarly for xdp programs, if the requested slice is not across xdp
2453  	 * fragments, then a direct pointer will be returned, otherwise the data
2454  	 * will be copied out into the buffer and the user will need to call
2455  	 * bpf_dynptr_write() to commit changes.
2456  	 */
2457  	return bpf_dynptr_slice(ptr, offset, buffer__opt, buffer__szk);
2458  }
2459  
2460  __bpf_kfunc int bpf_dynptr_adjust(struct bpf_dynptr_kern *ptr, u32 start, u32 end)
2461  {
2462  	u32 size;
2463  
2464  	if (!ptr->data || start > end)
2465  		return -EINVAL;
2466  
2467  	size = __bpf_dynptr_size(ptr);
2468  
2469  	if (start > size || end > size)
2470  		return -ERANGE;
2471  
2472  	ptr->offset += start;
2473  	bpf_dynptr_set_size(ptr, end - start);
2474  
2475  	return 0;
2476  }
2477  
2478  __bpf_kfunc bool bpf_dynptr_is_null(struct bpf_dynptr_kern *ptr)
2479  {
2480  	return !ptr->data;
2481  }
2482  
2483  __bpf_kfunc bool bpf_dynptr_is_rdonly(struct bpf_dynptr_kern *ptr)
2484  {
2485  	if (!ptr->data)
2486  		return false;
2487  
2488  	return __bpf_dynptr_is_rdonly(ptr);
2489  }
2490  
2491  __bpf_kfunc __u32 bpf_dynptr_size(const struct bpf_dynptr_kern *ptr)
2492  {
2493  	if (!ptr->data)
2494  		return -EINVAL;
2495  
2496  	return __bpf_dynptr_size(ptr);
2497  }
2498  
2499  __bpf_kfunc int bpf_dynptr_clone(struct bpf_dynptr_kern *ptr,
2500  				 struct bpf_dynptr_kern *clone__uninit)
2501  {
2502  	if (!ptr->data) {
2503  		bpf_dynptr_set_null(clone__uninit);
2504  		return -EINVAL;
2505  	}
2506  
2507  	*clone__uninit = *ptr;
2508  
2509  	return 0;
2510  }
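
/* Program-side usage sketch for the adjust/size/clone kfuncs above
 * (illustrative only and not part of this file; 'ptr' is any valid dynptr):
 *
 *	struct bpf_dynptr payload;
 *
 *	if (bpf_dynptr_clone(&ptr, &payload))		// shares ptr's data
 *		return 0;
 *	if (bpf_dynptr_adjust(&payload, 4, bpf_dynptr_size(&payload)))
 *		return 0;
 *	// payload now covers bytes [4, size) of ptr; ptr itself is unchanged //
 */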
2511  
2512  __bpf_kfunc void *bpf_cast_to_kern_ctx(void *obj)
2513  {
2514  	return obj;
2515  }
2516  
2517  __bpf_kfunc void *bpf_rdonly_cast(void *obj__ign, u32 btf_id__k)
2518  {
2519  	return obj__ign;
2520  }
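
/* Program-side usage sketch for the two cast kfuncs above (illustrative only
 * and not part of this file; assumes a tc program whose context is
 * struct __sk_buff, with bpf_core_type_id_kernel() from libbpf's
 * bpf_core_read.h):
 *
 *	struct sk_buff *kskb = bpf_cast_to_kern_ctx(skb);	// trusted sk_buff
 *	struct skb_shared_info *si;
 *
 *	si = bpf_rdonly_cast(kskb->head + kskb->end,
 *			     bpf_core_type_id_kernel(struct skb_shared_info));
 *	bpf_printk("nr_frags=%u", si->nr_frags);
 */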
2521  
2522  __bpf_kfunc void bpf_rcu_read_lock(void)
2523  {
2524  	rcu_read_lock();
2525  }
2526  
2527  __bpf_kfunc void bpf_rcu_read_unlock(void)
2528  {
2529  	rcu_read_unlock();
2530  }
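
/* Program-side usage sketch (illustrative only and not part of this file;
 * 'task' is a trusted pointer, e.g. from bpf_get_current_task_btf()):
 *
 *	struct task_struct *parent;
 *
 *	bpf_rcu_read_lock();
 *	parent = task->real_parent;	// RCU-protected pointer
 *	if (parent)
 *		bpf_printk("ppid=%d", parent->pid);
 *	bpf_rcu_read_unlock();
 */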
2531  
2532  __diag_pop();
2533  
2534  BTF_SET8_START(generic_btf_ids)
2535  #ifdef CONFIG_KEXEC_CORE
2536  BTF_ID_FLAGS(func, crash_kexec, KF_DESTRUCTIVE)
2537  #endif
2538  BTF_ID_FLAGS(func, bpf_obj_new_impl, KF_ACQUIRE | KF_RET_NULL)
2539  BTF_ID_FLAGS(func, bpf_obj_drop_impl, KF_RELEASE)
2540  BTF_ID_FLAGS(func, bpf_refcount_acquire_impl, KF_ACQUIRE | KF_RET_NULL)
2541  BTF_ID_FLAGS(func, bpf_list_push_front_impl)
2542  BTF_ID_FLAGS(func, bpf_list_push_back_impl)
2543  BTF_ID_FLAGS(func, bpf_list_pop_front, KF_ACQUIRE | KF_RET_NULL)
2544  BTF_ID_FLAGS(func, bpf_list_pop_back, KF_ACQUIRE | KF_RET_NULL)
2545  BTF_ID_FLAGS(func, bpf_task_acquire, KF_ACQUIRE | KF_RCU | KF_RET_NULL)
2546  BTF_ID_FLAGS(func, bpf_task_release, KF_RELEASE)
2547  BTF_ID_FLAGS(func, bpf_rbtree_remove, KF_ACQUIRE | KF_RET_NULL)
2548  BTF_ID_FLAGS(func, bpf_rbtree_add_impl)
2549  BTF_ID_FLAGS(func, bpf_rbtree_first, KF_RET_NULL)
2550  
2551  #ifdef CONFIG_CGROUPS
2552  BTF_ID_FLAGS(func, bpf_cgroup_acquire, KF_ACQUIRE | KF_RCU | KF_RET_NULL)
2553  BTF_ID_FLAGS(func, bpf_cgroup_release, KF_RELEASE)
2554  BTF_ID_FLAGS(func, bpf_cgroup_ancestor, KF_ACQUIRE | KF_RCU | KF_RET_NULL)
2555  BTF_ID_FLAGS(func, bpf_cgroup_from_id, KF_ACQUIRE | KF_RET_NULL)
2556  BTF_ID_FLAGS(func, bpf_task_under_cgroup, KF_RCU)
2557  #endif
2558  BTF_ID_FLAGS(func, bpf_task_from_pid, KF_ACQUIRE | KF_RET_NULL)
2559  BTF_SET8_END(generic_btf_ids)
2560  
2561  static const struct btf_kfunc_id_set generic_kfunc_set = {
2562  	.owner = THIS_MODULE,
2563  	.set   = &generic_btf_ids,
2564  };
2565  
2566  
2567  BTF_ID_LIST(generic_dtor_ids)
2568  BTF_ID(struct, task_struct)
2569  BTF_ID(func, bpf_task_release)
2570  #ifdef CONFIG_CGROUPS
2571  BTF_ID(struct, cgroup)
2572  BTF_ID(func, bpf_cgroup_release)
2573  #endif
2574  
2575  BTF_SET8_START(common_btf_ids)
2576  BTF_ID_FLAGS(func, bpf_cast_to_kern_ctx)
2577  BTF_ID_FLAGS(func, bpf_rdonly_cast)
2578  BTF_ID_FLAGS(func, bpf_rcu_read_lock)
2579  BTF_ID_FLAGS(func, bpf_rcu_read_unlock)
2580  BTF_ID_FLAGS(func, bpf_dynptr_slice, KF_RET_NULL)
2581  BTF_ID_FLAGS(func, bpf_dynptr_slice_rdwr, KF_RET_NULL)
2582  BTF_ID_FLAGS(func, bpf_iter_num_new, KF_ITER_NEW)
2583  BTF_ID_FLAGS(func, bpf_iter_num_next, KF_ITER_NEXT | KF_RET_NULL)
2584  BTF_ID_FLAGS(func, bpf_iter_num_destroy, KF_ITER_DESTROY)
2585  BTF_ID_FLAGS(func, bpf_dynptr_adjust)
2586  BTF_ID_FLAGS(func, bpf_dynptr_is_null)
2587  BTF_ID_FLAGS(func, bpf_dynptr_is_rdonly)
2588  BTF_ID_FLAGS(func, bpf_dynptr_size)
2589  BTF_ID_FLAGS(func, bpf_dynptr_clone)
2590  BTF_SET8_END(common_btf_ids)
2591  
2592  static const struct btf_kfunc_id_set common_kfunc_set = {
2593  	.owner = THIS_MODULE,
2594  	.set   = &common_btf_ids,
2595  };
2596  
2597  static int __init kfunc_init(void)
2598  {
2599  	int ret;
2600  	const struct btf_id_dtor_kfunc generic_dtors[] = {
2601  		{
2602  			.btf_id       = generic_dtor_ids[0],
2603  			.kfunc_btf_id = generic_dtor_ids[1]
2604  		},
2605  #ifdef CONFIG_CGROUPS
2606  		{
2607  			.btf_id       = generic_dtor_ids[2],
2608  			.kfunc_btf_id = generic_dtor_ids[3]
2609  		},
2610  #endif
2611  	};
2612  
2613  	ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &generic_kfunc_set);
2614  	ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &generic_kfunc_set);
2615  	ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &generic_kfunc_set);
2616  	ret = ret ?: register_btf_id_dtor_kfuncs(generic_dtors,
2617  						  ARRAY_SIZE(generic_dtors),
2618  						  THIS_MODULE);
2619  	return ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_UNSPEC, &common_kfunc_set);
2620  }
2621  
2622  late_initcall(kfunc_init);
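
/* Sketch of how another subsystem or module registers its own kfuncs through
 * the same interfaces used above (illustrative only; the 'example_*' names
 * are hypothetical and not part of this file):
 *
 *	BTF_SET8_START(example_btf_ids)
 *	BTF_ID_FLAGS(func, example_kfunc, KF_TRUSTED_ARGS)
 *	BTF_SET8_END(example_btf_ids)
 *
 *	static const struct btf_kfunc_id_set example_kfunc_set = {
 *		.owner = THIS_MODULE,
 *		.set   = &example_btf_ids,
 *	};
 *
 *	static int __init example_init(void)
 *	{
 *		return register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING,
 *						 &example_kfunc_set);
 *	}
 */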
2623