1 // SPDX-License-Identifier: GPL-2.0-only
2 /* Unstable Conntrack Helpers for XDP and TC-BPF hook
3  *
4  * These are called from the XDP and SCHED_CLS BPF programs. Note that it is
5  * allowed to break compatibility for these functions since the interface they
6  * are exposed through to BPF programs is explicitly unstable.
7  */
8 
9 #include <linux/bpf_verifier.h>
10 #include <linux/bpf.h>
11 #include <linux/btf.h>
12 #include <linux/filter.h>
13 #include <linux/mutex.h>
14 #include <linux/types.h>
15 #include <linux/btf_ids.h>
16 #include <linux/net_namespace.h>
17 #include <net/netfilter/nf_conntrack_bpf.h>
18 #include <net/netfilter/nf_conntrack_core.h>
19 
20 /* bpf_ct_opts - Options for CT lookup helpers
21  *
22  * Members:
23  * @netns_id   - Specify the network namespace for lookup
24  *		 Values:
25  *		   BPF_F_CURRENT_NETNS (-1)
26  *		     Use namespace associated with ctx (xdp_md, __sk_buff)
27  *		   [0, S32_MAX]
28  *		     Network Namespace ID
29  * @error      - Out parameter, set for any errors encountered
30  *		 Values:
31  *		   -EINVAL - Passed NULL for bpf_tuple pointer
32  *		   -EINVAL - opts->reserved is not 0
33  *		   -EINVAL - netns_id is less than -1
34  *		   -EINVAL - opts__sz isn't NF_BPF_CT_OPTS_SZ (12)
35  *		   -EPROTO - l4proto isn't one of IPPROTO_TCP or IPPROTO_UDP
36  *		   -ENONET - No network namespace found for netns_id
37  *		   -ENOENT - Conntrack lookup could not find entry for tuple
38  *		   -EAFNOSUPPORT - tuple__sz isn't one of sizeof(tuple->ipv4)
39  *				   or sizeof(tuple->ipv6)
40  * @l4proto    - Layer 4 protocol
41  *		 Values:
42  *		   IPPROTO_TCP, IPPROTO_UDP
43  * @dir:       - connection tracking tuple direction.
44  * @reserved   - Reserved member, will be reused for more options in future
45  *		 Values:
46  *		   0
47  */
48 struct bpf_ct_opts {
49 	s32 netns_id;
50 	s32 error;
51 	u8 l4proto;
52 	u8 dir;
53 	u8 reserved[2];
54 };
55 
/* Value the helpers require in opts__sz; any other size yields -EINVAL */
enum { NF_BPF_CT_OPTS_SZ = 12 };
59 
60 static int bpf_nf_ct_tuple_parse(struct bpf_sock_tuple *bpf_tuple,
61 				 u32 tuple_len, u8 protonum, u8 dir,
62 				 struct nf_conntrack_tuple *tuple)
63 {
64 	union nf_inet_addr *src = dir ? &tuple->dst.u3 : &tuple->src.u3;
65 	union nf_inet_addr *dst = dir ? &tuple->src.u3 : &tuple->dst.u3;
66 	union nf_conntrack_man_proto *sport = dir ? (void *)&tuple->dst.u
67 						  : &tuple->src.u;
68 	union nf_conntrack_man_proto *dport = dir ? &tuple->src.u
69 						  : (void *)&tuple->dst.u;
70 
71 	if (unlikely(protonum != IPPROTO_TCP && protonum != IPPROTO_UDP))
72 		return -EPROTO;
73 
74 	memset(tuple, 0, sizeof(*tuple));
75 
76 	switch (tuple_len) {
77 	case sizeof(bpf_tuple->ipv4):
78 		tuple->src.l3num = AF_INET;
79 		src->ip = bpf_tuple->ipv4.saddr;
80 		sport->tcp.port = bpf_tuple->ipv4.sport;
81 		dst->ip = bpf_tuple->ipv4.daddr;
82 		dport->tcp.port = bpf_tuple->ipv4.dport;
83 		break;
84 	case sizeof(bpf_tuple->ipv6):
85 		tuple->src.l3num = AF_INET6;
86 		memcpy(src->ip6, bpf_tuple->ipv6.saddr, sizeof(bpf_tuple->ipv6.saddr));
87 		sport->tcp.port = bpf_tuple->ipv6.sport;
88 		memcpy(dst->ip6, bpf_tuple->ipv6.daddr, sizeof(bpf_tuple->ipv6.daddr));
89 		dport->tcp.port = bpf_tuple->ipv6.dport;
90 		break;
91 	default:
92 		return -EAFNOSUPPORT;
93 	}
94 	tuple->dst.protonum = protonum;
95 	tuple->dst.dir = dir;
96 
97 	return 0;
98 }
99 
100 static struct nf_conn *
101 __bpf_nf_ct_alloc_entry(struct net *net, struct bpf_sock_tuple *bpf_tuple,
102 			u32 tuple_len, struct bpf_ct_opts *opts, u32 opts_len,
103 			u32 timeout)
104 {
105 	struct nf_conntrack_tuple otuple, rtuple;
106 	struct nf_conn *ct;
107 	int err;
108 
109 	if (!opts || !bpf_tuple || opts->reserved[0] || opts->reserved[1] ||
110 	    opts_len != NF_BPF_CT_OPTS_SZ)
111 		return ERR_PTR(-EINVAL);
112 
113 	if (unlikely(opts->netns_id < BPF_F_CURRENT_NETNS))
114 		return ERR_PTR(-EINVAL);
115 
116 	err = bpf_nf_ct_tuple_parse(bpf_tuple, tuple_len, opts->l4proto,
117 				    IP_CT_DIR_ORIGINAL, &otuple);
118 	if (err < 0)
119 		return ERR_PTR(err);
120 
121 	err = bpf_nf_ct_tuple_parse(bpf_tuple, tuple_len, opts->l4proto,
122 				    IP_CT_DIR_REPLY, &rtuple);
123 	if (err < 0)
124 		return ERR_PTR(err);
125 
126 	if (opts->netns_id >= 0) {
127 		net = get_net_ns_by_id(net, opts->netns_id);
128 		if (unlikely(!net))
129 			return ERR_PTR(-ENONET);
130 	}
131 
132 	ct = nf_conntrack_alloc(net, &nf_ct_zone_dflt, &otuple, &rtuple,
133 				GFP_ATOMIC);
134 	if (IS_ERR(ct))
135 		goto out;
136 
137 	memset(&ct->proto, 0, sizeof(ct->proto));
138 	__nf_ct_set_timeout(ct, timeout * HZ);
139 
140 out:
141 	if (opts->netns_id >= 0)
142 		put_net(net);
143 
144 	return ct;
145 }
146 
147 static struct nf_conn *__bpf_nf_ct_lookup(struct net *net,
148 					  struct bpf_sock_tuple *bpf_tuple,
149 					  u32 tuple_len, struct bpf_ct_opts *opts,
150 					  u32 opts_len)
151 {
152 	struct nf_conntrack_tuple_hash *hash;
153 	struct nf_conntrack_tuple tuple;
154 	struct nf_conn *ct;
155 	int err;
156 
157 	if (!opts || !bpf_tuple || opts->reserved[0] || opts->reserved[1] ||
158 	    opts_len != NF_BPF_CT_OPTS_SZ)
159 		return ERR_PTR(-EINVAL);
160 	if (unlikely(opts->l4proto != IPPROTO_TCP && opts->l4proto != IPPROTO_UDP))
161 		return ERR_PTR(-EPROTO);
162 	if (unlikely(opts->netns_id < BPF_F_CURRENT_NETNS))
163 		return ERR_PTR(-EINVAL);
164 
165 	err = bpf_nf_ct_tuple_parse(bpf_tuple, tuple_len, opts->l4proto,
166 				    IP_CT_DIR_ORIGINAL, &tuple);
167 	if (err < 0)
168 		return ERR_PTR(err);
169 
170 	if (opts->netns_id >= 0) {
171 		net = get_net_ns_by_id(net, opts->netns_id);
172 		if (unlikely(!net))
173 			return ERR_PTR(-ENONET);
174 	}
175 
176 	hash = nf_conntrack_find_get(net, &nf_ct_zone_dflt, &tuple);
177 	if (opts->netns_id >= 0)
178 		put_net(net);
179 	if (!hash)
180 		return ERR_PTR(-ENOENT);
181 
182 	ct = nf_ct_tuplehash_to_ctrack(hash);
183 	opts->dir = NF_CT_DIRECTION(hash);
184 
185 	return ct;
186 }
187 
188 BTF_ID_LIST(btf_nf_conn_ids)
189 BTF_ID(struct, nf_conn)
190 BTF_ID(struct, nf_conn___init)
191 
192 /* Check writes into `struct nf_conn` */
193 static int _nf_conntrack_btf_struct_access(struct bpf_verifier_log *log,
194 					   const struct btf *btf,
195 					   const struct btf_type *t, int off,
196 					   int size, enum bpf_access_type atype,
197 					   u32 *next_btf_id,
198 					   enum bpf_type_flag *flag)
199 {
200 	const struct btf_type *ncit;
201 	const struct btf_type *nct;
202 	size_t end;
203 
204 	ncit = btf_type_by_id(btf, btf_nf_conn_ids[1]);
205 	nct = btf_type_by_id(btf, btf_nf_conn_ids[0]);
206 
207 	if (t != nct && t != ncit) {
208 		bpf_log(log, "only read is supported\n");
209 		return -EACCES;
210 	}
211 
212 	/* `struct nf_conn` and `struct nf_conn___init` have the same layout
213 	 * so we are safe to simply merge offset checks here
214 	 */
215 	switch (off) {
216 #if defined(CONFIG_NF_CONNTRACK_MARK)
217 	case offsetof(struct nf_conn, mark):
218 		end = offsetofend(struct nf_conn, mark);
219 		break;
220 #endif
221 	default:
222 		bpf_log(log, "no write support to nf_conn at off %d\n", off);
223 		return -EACCES;
224 	}
225 
226 	if (off + size > end) {
227 		bpf_log(log,
228 			"write access at off %d with size %d beyond the member of nf_conn ended at %zu\n",
229 			off, size, end);
230 		return -EACCES;
231 	}
232 
233 	return 0;
234 }
235 
236 __diag_push();
237 __diag_ignore_all("-Wmissing-prototypes",
238 		  "Global functions as their definitions will be in nf_conntrack BTF");
239 
240 /* bpf_xdp_ct_alloc - Allocate a new CT entry
241  *
242  * Parameters:
243  * @xdp_ctx	- Pointer to ctx (xdp_md) in XDP program
244  *		    Cannot be NULL
245  * @bpf_tuple	- Pointer to memory representing the tuple to look up
246  *		    Cannot be NULL
247  * @tuple__sz	- Length of the tuple structure
248  *		    Must be one of sizeof(bpf_tuple->ipv4) or
249  *		    sizeof(bpf_tuple->ipv6)
250  * @opts	- Additional options for allocation (documented above)
251  *		    Cannot be NULL
252  * @opts__sz	- Length of the bpf_ct_opts structure
253  *		    Must be NF_BPF_CT_OPTS_SZ (12)
254  */
255 struct nf_conn___init *
256 bpf_xdp_ct_alloc(struct xdp_md *xdp_ctx, struct bpf_sock_tuple *bpf_tuple,
257 		 u32 tuple__sz, struct bpf_ct_opts *opts, u32 opts__sz)
258 {
259 	struct xdp_buff *ctx = (struct xdp_buff *)xdp_ctx;
260 	struct nf_conn *nfct;
261 
262 	nfct = __bpf_nf_ct_alloc_entry(dev_net(ctx->rxq->dev), bpf_tuple, tuple__sz,
263 				       opts, opts__sz, 10);
264 	if (IS_ERR(nfct)) {
265 		if (opts)
266 			opts->error = PTR_ERR(nfct);
267 		return NULL;
268 	}
269 
270 	return (struct nf_conn___init *)nfct;
271 }
272 
273 /* bpf_xdp_ct_lookup - Lookup CT entry for the given tuple, and acquire a
274  *		       reference to it
275  *
276  * Parameters:
277  * @xdp_ctx	- Pointer to ctx (xdp_md) in XDP program
278  *		    Cannot be NULL
279  * @bpf_tuple	- Pointer to memory representing the tuple to look up
280  *		    Cannot be NULL
281  * @tuple__sz	- Length of the tuple structure
282  *		    Must be one of sizeof(bpf_tuple->ipv4) or
283  *		    sizeof(bpf_tuple->ipv6)
284  * @opts	- Additional options for lookup (documented above)
285  *		    Cannot be NULL
286  * @opts__sz	- Length of the bpf_ct_opts structure
287  *		    Must be NF_BPF_CT_OPTS_SZ (12)
288  */
289 struct nf_conn *
290 bpf_xdp_ct_lookup(struct xdp_md *xdp_ctx, struct bpf_sock_tuple *bpf_tuple,
291 		  u32 tuple__sz, struct bpf_ct_opts *opts, u32 opts__sz)
292 {
293 	struct xdp_buff *ctx = (struct xdp_buff *)xdp_ctx;
294 	struct net *caller_net;
295 	struct nf_conn *nfct;
296 
297 	caller_net = dev_net(ctx->rxq->dev);
298 	nfct = __bpf_nf_ct_lookup(caller_net, bpf_tuple, tuple__sz, opts, opts__sz);
299 	if (IS_ERR(nfct)) {
300 		if (opts)
301 			opts->error = PTR_ERR(nfct);
302 		return NULL;
303 	}
304 	return nfct;
305 }
306 
307 /* bpf_skb_ct_alloc - Allocate a new CT entry
308  *
309  * Parameters:
310  * @skb_ctx	- Pointer to ctx (__sk_buff) in TC program
311  *		    Cannot be NULL
312  * @bpf_tuple	- Pointer to memory representing the tuple to look up
313  *		    Cannot be NULL
314  * @tuple__sz	- Length of the tuple structure
315  *		    Must be one of sizeof(bpf_tuple->ipv4) or
316  *		    sizeof(bpf_tuple->ipv6)
317  * @opts	- Additional options for allocation (documented above)
318  *		    Cannot be NULL
319  * @opts__sz	- Length of the bpf_ct_opts structure
320  *		    Must be NF_BPF_CT_OPTS_SZ (12)
321  */
322 struct nf_conn___init *
323 bpf_skb_ct_alloc(struct __sk_buff *skb_ctx, struct bpf_sock_tuple *bpf_tuple,
324 		 u32 tuple__sz, struct bpf_ct_opts *opts, u32 opts__sz)
325 {
326 	struct sk_buff *skb = (struct sk_buff *)skb_ctx;
327 	struct nf_conn *nfct;
328 	struct net *net;
329 
330 	net = skb->dev ? dev_net(skb->dev) : sock_net(skb->sk);
331 	nfct = __bpf_nf_ct_alloc_entry(net, bpf_tuple, tuple__sz, opts, opts__sz, 10);
332 	if (IS_ERR(nfct)) {
333 		if (opts)
334 			opts->error = PTR_ERR(nfct);
335 		return NULL;
336 	}
337 
338 	return (struct nf_conn___init *)nfct;
339 }
340 
341 /* bpf_skb_ct_lookup - Lookup CT entry for the given tuple, and acquire a
342  *		       reference to it
343  *
344  * Parameters:
345  * @skb_ctx	- Pointer to ctx (__sk_buff) in TC program
346  *		    Cannot be NULL
347  * @bpf_tuple	- Pointer to memory representing the tuple to look up
348  *		    Cannot be NULL
349  * @tuple__sz	- Length of the tuple structure
350  *		    Must be one of sizeof(bpf_tuple->ipv4) or
351  *		    sizeof(bpf_tuple->ipv6)
352  * @opts	- Additional options for lookup (documented above)
353  *		    Cannot be NULL
354  * @opts__sz	- Length of the bpf_ct_opts structure
355  *		    Must be NF_BPF_CT_OPTS_SZ (12)
356  */
357 struct nf_conn *
358 bpf_skb_ct_lookup(struct __sk_buff *skb_ctx, struct bpf_sock_tuple *bpf_tuple,
359 		  u32 tuple__sz, struct bpf_ct_opts *opts, u32 opts__sz)
360 {
361 	struct sk_buff *skb = (struct sk_buff *)skb_ctx;
362 	struct net *caller_net;
363 	struct nf_conn *nfct;
364 
365 	caller_net = skb->dev ? dev_net(skb->dev) : sock_net(skb->sk);
366 	nfct = __bpf_nf_ct_lookup(caller_net, bpf_tuple, tuple__sz, opts, opts__sz);
367 	if (IS_ERR(nfct)) {
368 		if (opts)
369 			opts->error = PTR_ERR(nfct);
370 		return NULL;
371 	}
372 	return nfct;
373 }
374 
375 /* bpf_ct_insert_entry - Add the provided entry into a CT map
376  *
377  * This must be invoked for referenced PTR_TO_BTF_ID.
378  *
379  * @nfct	 - Pointer to referenced nf_conn___init object, obtained
380  *		   using bpf_xdp_ct_alloc or bpf_skb_ct_alloc.
381  */
382 struct nf_conn *bpf_ct_insert_entry(struct nf_conn___init *nfct_i)
383 {
384 	struct nf_conn *nfct = (struct nf_conn *)nfct_i;
385 	int err;
386 
387 	nfct->status |= IPS_CONFIRMED;
388 	err = nf_conntrack_hash_check_insert(nfct);
389 	if (err < 0) {
390 		nf_conntrack_free(nfct);
391 		return NULL;
392 	}
393 	return nfct;
394 }
395 
/* bpf_ct_release - Release acquired nf_conn object
 *
 * This must be invoked for referenced PTR_TO_BTF_ID, and the verifier rejects
 * the program if any references remain in the program in all of the explored
 * states.
 *
 * Parameters:
 * @nfct	 - Pointer to referenced nf_conn object, obtained using
 *		   bpf_xdp_ct_lookup or bpf_skb_ct_lookup. A NULL pointer is
 *		   accepted and ignored.
 */
void bpf_ct_release(struct nf_conn *nfct)
{
	if (nfct)
		nf_ct_put(nfct);
}
412 
413 /* bpf_ct_set_timeout - Set timeout of allocated nf_conn
414  *
415  * Sets the default timeout of newly allocated nf_conn before insertion.
416  * This helper must be invoked for refcounted pointer to nf_conn___init.
417  *
418  * Parameters:
419  * @nfct	 - Pointer to referenced nf_conn object, obtained using
420  *                 bpf_xdp_ct_alloc or bpf_skb_ct_alloc.
421  * @timeout      - Timeout in msecs.
422  */
423 void bpf_ct_set_timeout(struct nf_conn___init *nfct, u32 timeout)
424 {
425 	__nf_ct_set_timeout((struct nf_conn *)nfct, msecs_to_jiffies(timeout));
426 }
427 
428 /* bpf_ct_change_timeout - Change timeout of inserted nf_conn
429  *
430  * Change timeout associated of the inserted or looked up nf_conn.
431  * This helper must be invoked for refcounted pointer to nf_conn.
432  *
433  * Parameters:
434  * @nfct	 - Pointer to referenced nf_conn object, obtained using
435  *		   bpf_ct_insert_entry, bpf_xdp_ct_lookup, or bpf_skb_ct_lookup.
436  * @timeout      - New timeout in msecs.
437  */
438 int bpf_ct_change_timeout(struct nf_conn *nfct, u32 timeout)
439 {
440 	return __nf_ct_change_timeout(nfct, msecs_to_jiffies(timeout));
441 }
442 
443 /* bpf_ct_set_status - Set status field of allocated nf_conn
444  *
445  * Set the status field of the newly allocated nf_conn before insertion.
446  * This must be invoked for referenced PTR_TO_BTF_ID to nf_conn___init.
447  *
448  * Parameters:
449  * @nfct	 - Pointer to referenced nf_conn object, obtained using
450  *		   bpf_xdp_ct_alloc or bpf_skb_ct_alloc.
451  * @status       - New status value.
452  */
453 int bpf_ct_set_status(const struct nf_conn___init *nfct, u32 status)
454 {
455 	return nf_ct_change_status_common((struct nf_conn *)nfct, status);
456 }
457 
458 /* bpf_ct_change_status - Change status of inserted nf_conn
459  *
460  * Change the status field of the provided connection tracking entry.
461  * This must be invoked for referenced PTR_TO_BTF_ID to nf_conn.
462  *
463  * Parameters:
464  * @nfct	 - Pointer to referenced nf_conn object, obtained using
465  *		   bpf_ct_insert_entry, bpf_xdp_ct_lookup or bpf_skb_ct_lookup.
466  * @status       - New status value.
467  */
468 int bpf_ct_change_status(struct nf_conn *nfct, u32 status)
469 {
470 	return nf_ct_change_status_common(nfct, status);
471 }
472 
473 __diag_pop()
474 
475 BTF_SET8_START(nf_ct_kfunc_set)
476 BTF_ID_FLAGS(func, bpf_xdp_ct_alloc, KF_ACQUIRE | KF_RET_NULL)
477 BTF_ID_FLAGS(func, bpf_xdp_ct_lookup, KF_ACQUIRE | KF_RET_NULL)
478 BTF_ID_FLAGS(func, bpf_skb_ct_alloc, KF_ACQUIRE | KF_RET_NULL)
479 BTF_ID_FLAGS(func, bpf_skb_ct_lookup, KF_ACQUIRE | KF_RET_NULL)
480 BTF_ID_FLAGS(func, bpf_ct_insert_entry, KF_ACQUIRE | KF_RET_NULL | KF_RELEASE)
481 BTF_ID_FLAGS(func, bpf_ct_release, KF_RELEASE)
482 BTF_ID_FLAGS(func, bpf_ct_set_timeout, KF_TRUSTED_ARGS)
483 BTF_ID_FLAGS(func, bpf_ct_change_timeout, KF_TRUSTED_ARGS)
484 BTF_ID_FLAGS(func, bpf_ct_set_status, KF_TRUSTED_ARGS)
485 BTF_ID_FLAGS(func, bpf_ct_change_status, KF_TRUSTED_ARGS)
486 BTF_SET8_END(nf_ct_kfunc_set)
487 
488 static const struct btf_kfunc_id_set nf_conntrack_kfunc_set = {
489 	.owner = THIS_MODULE,
490 	.set   = &nf_ct_kfunc_set,
491 };
492 
493 int register_nf_conntrack_bpf(void)
494 {
495 	int ret;
496 
497 	ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &nf_conntrack_kfunc_set);
498 	ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &nf_conntrack_kfunc_set);
499 	if (!ret) {
500 		mutex_lock(&nf_conn_btf_access_lock);
501 		nfct_btf_struct_access = _nf_conntrack_btf_struct_access;
502 		mutex_unlock(&nf_conn_btf_access_lock);
503 	}
504 
505 	return ret;
506 }
507 
508 void cleanup_nf_conntrack_bpf(void)
509 {
510 	mutex_lock(&nf_conn_btf_access_lock);
511 	nfct_btf_struct_access = NULL;
512 	mutex_unlock(&nf_conn_btf_access_lock);
513 }
514