// SPDX-License-Identifier: GPL-2.0-only
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/netfilter.h>
#include <linux/rhashtable.h>
#include <linux/netdevice.h>
#include <net/ip.h>
#include <net/ip6_route.h>
#include <net/netfilter/nf_tables.h>
#include <net/netfilter/nf_flow_table.h>
#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_conntrack_core.h>
#include <net/netfilter/nf_conntrack_l4proto.h>
#include <net/netfilter/nf_conntrack_tuple.h>

static DEFINE_MUTEX(flowtable_lock);
static LIST_HEAD(flowtables);

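/*
 * Copy addresses, ports and protocol numbers for the given direction
 * from the conntrack tuple into the flow tuple.
 */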
static void
flow_offload_fill_dir(struct flow_offload *flow,
		      enum flow_offload_tuple_dir dir)
{
	struct flow_offload_tuple *ft = &flow->tuplehash[dir].tuple;
	struct nf_conntrack_tuple *ctt = &flow->ct->tuplehash[dir].tuple;

	ft->dir = dir;

	switch (ctt->src.l3num) {
	case NFPROTO_IPV4:
		ft->src_v4 = ctt->src.u3.in;
		ft->dst_v4 = ctt->dst.u3.in;
		break;
	case NFPROTO_IPV6:
		ft->src_v6 = ctt->src.u3.in6;
		ft->dst_v6 = ctt->dst.u3.in6;
		break;
	}

	ft->l3proto = ctt->src.l3num;
	ft->l4proto = ctt->dst.protonum;
	ft->src_port = ctt->src.u.tcp.port;
	ft->dst_port = ctt->dst.u.tcp.port;
}

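/*
 * Allocate a flow entry for @ct. This grabs a reference on the conntrack
 * entry and bails out if it is already dying or fully released; the
 * reference is dropped again from flow_offload_free().
 */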
struct flow_offload *flow_offload_alloc(struct nf_conn *ct)
{
	struct flow_offload *flow;

	if (unlikely(nf_ct_is_dying(ct) ||
	    !atomic_inc_not_zero(&ct->ct_general.use)))
		return NULL;

	flow = kzalloc(sizeof(*flow), GFP_ATOMIC);
	if (!flow)
		goto err_ct_refcnt;

	flow->ct = ct;

	flow_offload_fill_dir(flow, FLOW_OFFLOAD_DIR_ORIGINAL);
	flow_offload_fill_dir(flow, FLOW_OFFLOAD_DIR_REPLY);

	if (ct->status & IPS_SRC_NAT)
		__set_bit(NF_FLOW_SNAT, &flow->flags);
	if (ct->status & IPS_DST_NAT)
		__set_bit(NF_FLOW_DNAT, &flow->flags);

	return flow;

err_ct_refcnt:
	nf_ct_put(ct);

	return NULL;
}
EXPORT_SYMBOL_GPL(flow_offload_alloc);

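/*
 * Record the route information for one direction: cache the dst entry
 * (holding a reference on it), the path MTU and the input interface,
 * which is taken from the dst of the opposite direction.
 */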
static int flow_offload_fill_route(struct flow_offload *flow,
				   const struct nf_flow_route *route,
				   enum flow_offload_tuple_dir dir)
{
	struct flow_offload_tuple *flow_tuple = &flow->tuplehash[dir].tuple;
	struct dst_entry *other_dst = route->tuple[!dir].dst;
	struct dst_entry *dst = route->tuple[dir].dst;

	if (!dst_hold_safe(route->tuple[dir].dst))
		return -1;

	switch (flow_tuple->l3proto) {
	case NFPROTO_IPV4:
		flow_tuple->mtu = ip_dst_mtu_maybe_forward(dst, true);
		break;
	case NFPROTO_IPV6:
		flow_tuple->mtu = ip6_dst_mtu_forward(dst);
		break;
	}

	flow_tuple->iifidx = other_dst->dev->ifindex;
	flow_tuple->dst_cache = dst;

	return 0;
}

int flow_offload_route_init(struct flow_offload *flow,
			    const struct nf_flow_route *route)
{
	int err;

	err = flow_offload_fill_route(flow, route, FLOW_OFFLOAD_DIR_ORIGINAL);
	if (err < 0)
		return err;

	err = flow_offload_fill_route(flow, route, FLOW_OFFLOAD_DIR_REPLY);
	if (err < 0)
		goto err_route_reply;

	flow->type = NF_FLOW_OFFLOAD_ROUTE;

	return 0;

err_route_reply:
	dst_release(route->tuple[FLOW_OFFLOAD_DIR_ORIGINAL].dst);

	return err;
}
EXPORT_SYMBOL_GPL(flow_offload_route_init);

static void flow_offload_fixup_tcp(struct ip_ct_tcp *tcp)
{
	tcp->state = TCP_CONNTRACK_ESTABLISHED;
	tcp->seen[0].td_maxwin = 0;
	tcp->seen[1].td_maxwin = 0;
}

#define NF_FLOWTABLE_TCP_PICKUP_TIMEOUT	(120 * HZ)
#define NF_FLOWTABLE_UDP_PICKUP_TIMEOUT	(30 * HZ)

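/*
 * Once a flow leaves the offload path, give conntrack a sane pickup
 * timeout: clamp the remaining timeout to 120s for TCP and 30s for UDP
 * so entries do not linger with the large offload timeout.
 */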
static void flow_offload_fixup_ct_timeout(struct nf_conn *ct)
{
	const struct nf_conntrack_l4proto *l4proto;
	int l4num = nf_ct_protonum(ct);
	unsigned int timeout;

	l4proto = nf_ct_l4proto_find(l4num);
	if (!l4proto)
		return;

	if (l4num == IPPROTO_TCP)
		timeout = NF_FLOWTABLE_TCP_PICKUP_TIMEOUT;
	else if (l4num == IPPROTO_UDP)
		timeout = NF_FLOWTABLE_UDP_PICKUP_TIMEOUT;
	else
		return;

	if (nf_flow_timeout_delta(ct->timeout) > (__s32)timeout)
		ct->timeout = nfct_time_stamp + timeout;
}

static void flow_offload_fixup_ct_state(struct nf_conn *ct)
{
	if (nf_ct_protonum(ct) == IPPROTO_TCP)
		flow_offload_fixup_tcp(&ct->proto.tcp);
}

static void flow_offload_fixup_ct(struct nf_conn *ct)
{
	flow_offload_fixup_ct_state(ct);
	flow_offload_fixup_ct_timeout(ct);
}

static void flow_offload_route_release(struct flow_offload *flow)
{
	dst_release(flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_cache);
	dst_release(flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_cache);
}

void flow_offload_free(struct flow_offload *flow)
{
	switch (flow->type) {
	case NF_FLOW_OFFLOAD_ROUTE:
		flow_offload_route_release(flow);
		break;
	default:
		break;
	}
	nf_ct_put(flow->ct);
	kfree_rcu(flow, rcu_head);
}
EXPORT_SYMBOL_GPL(flow_offload_free);

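/*
 * The rhashtable hashes and compares the flow tuple up to the __hash
 * marker field; the remaining members are not part of the lookup key.
 */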
static u32 flow_offload_hash(const void *data, u32 len, u32 seed)
{
	const struct flow_offload_tuple *tuple = data;

	return jhash(tuple, offsetof(struct flow_offload_tuple, __hash), seed);
}

static u32 flow_offload_hash_obj(const void *data, u32 len, u32 seed)
{
	const struct flow_offload_tuple_rhash *tuplehash = data;

	return jhash(&tuplehash->tuple, offsetof(struct flow_offload_tuple, __hash), seed);
}

static int flow_offload_hash_cmp(struct rhashtable_compare_arg *arg,
					const void *ptr)
{
	const struct flow_offload_tuple *tuple = arg->key;
	const struct flow_offload_tuple_rhash *x = ptr;

	if (memcmp(&x->tuple, tuple, offsetof(struct flow_offload_tuple, __hash)))
		return 1;

	return 0;
}

static const struct rhashtable_params nf_flow_offload_rhash_params = {
	.head_offset		= offsetof(struct flow_offload_tuple_rhash, node),
	.hashfn			= flow_offload_hash,
	.obj_hashfn		= flow_offload_hash_obj,
	.obj_cmpfn		= flow_offload_hash_cmp,
	.automatic_shrinking	= true,
};

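/*
 * Insert both tuple directions into the flowtable and extend the
 * conntrack timeout while the entry is offloaded. If the table supports
 * hardware offload, also schedule the hardware insertion.
 */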
int flow_offload_add(struct nf_flowtable *flow_table, struct flow_offload *flow)
{
	int err;

	flow->timeout = nf_flowtable_time_stamp + NF_FLOW_TIMEOUT;

	err = rhashtable_insert_fast(&flow_table->rhashtable,
				     &flow->tuplehash[0].node,
				     nf_flow_offload_rhash_params);
	if (err < 0)
		return err;

	err = rhashtable_insert_fast(&flow_table->rhashtable,
				     &flow->tuplehash[1].node,
				     nf_flow_offload_rhash_params);
	if (err < 0) {
		rhashtable_remove_fast(&flow_table->rhashtable,
				       &flow->tuplehash[0].node,
				       nf_flow_offload_rhash_params);
		return err;
	}

	nf_ct_offload_timeout(flow->ct);

	if (nf_flowtable_hw_offload(flow_table)) {
		__set_bit(NF_FLOW_HW, &flow->flags);
		nf_flow_offload_add(flow_table, flow);
	}

	return 0;
}
EXPORT_SYMBOL_GPL(flow_offload_add);

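/*
 * Refresh the software timeout of an active flow; re-add the hardware
 * entry if a refresh of the hardware offload was requested.
 */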
void flow_offload_refresh(struct nf_flowtable *flow_table,
			  struct flow_offload *flow)
{
	flow->timeout = nf_flowtable_time_stamp + NF_FLOW_TIMEOUT;

	if (likely(!nf_flowtable_hw_offload(flow_table) ||
		   !test_and_clear_bit(NF_FLOW_HW_REFRESH, &flow->flags)))
		return;

	nf_flow_offload_add(flow_table, flow);
}
EXPORT_SYMBOL_GPL(flow_offload_refresh);

static inline bool nf_flow_has_expired(const struct flow_offload *flow)
{
	return nf_flow_timeout_delta(flow->timeout) <= 0;
}

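/*
 * Unlink both directions of the flow from the table, hand the connection
 * back to conntrack (clearing IPS_OFFLOAD and fixing up state/timeout)
 * and free the entry.
 */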
static void flow_offload_del(struct nf_flowtable *flow_table,
			     struct flow_offload *flow)
{
	rhashtable_remove_fast(&flow_table->rhashtable,
			       &flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].node,
			       nf_flow_offload_rhash_params);
	rhashtable_remove_fast(&flow_table->rhashtable,
			       &flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].node,
			       nf_flow_offload_rhash_params);

	clear_bit(IPS_OFFLOAD_BIT, &flow->ct->status);

	if (nf_flow_has_expired(flow))
		flow_offload_fixup_ct(flow->ct);
	else
		flow_offload_fixup_ct_timeout(flow->ct);

	flow_offload_free(flow);
}

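/*
 * Mark the flow for removal by the garbage collector and restore the
 * conntrack TCP state so the slow path can take over the connection.
 */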
void flow_offload_teardown(struct flow_offload *flow)
{
	set_bit(NF_FLOW_TEARDOWN, &flow->flags);

	flow_offload_fixup_ct_state(flow->ct);
}
EXPORT_SYMBOL_GPL(flow_offload_teardown);

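/*
 * Look up a flow entry by tuple. Returns NULL for flows that are being
 * torn down or whose conntrack entry is dying, so packets fall back to
 * the classic forwarding path.
 */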
struct flow_offload_tuple_rhash *
flow_offload_lookup(struct nf_flowtable *flow_table,
		    struct flow_offload_tuple *tuple)
{
	struct flow_offload_tuple_rhash *tuplehash;
	struct flow_offload *flow;
	int dir;

	tuplehash = rhashtable_lookup(&flow_table->rhashtable, tuple,
				      nf_flow_offload_rhash_params);
	if (!tuplehash)
		return NULL;

	dir = tuplehash->tuple.dir;
	flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]);
	if (test_bit(NF_FLOW_TEARDOWN, &flow->flags))
		return NULL;

	if (unlikely(nf_ct_is_dying(flow->ct)))
		return NULL;

	return tuplehash;
}
EXPORT_SYMBOL_GPL(flow_offload_lookup);

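/*
 * Walk all flow entries and invoke @iter on each one. Only the original
 * direction is visited so every flow is seen exactly once.
 */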
static int
nf_flow_table_iterate(struct nf_flowtable *flow_table,
		      void (*iter)(struct flow_offload *flow, void *data),
		      void *data)
{
	struct flow_offload_tuple_rhash *tuplehash;
	struct rhashtable_iter hti;
	struct flow_offload *flow;
	int err = 0;

	rhashtable_walk_enter(&flow_table->rhashtable, &hti);
	rhashtable_walk_start(&hti);

	while ((tuplehash = rhashtable_walk_next(&hti))) {
		if (IS_ERR(tuplehash)) {
			if (PTR_ERR(tuplehash) != -EAGAIN) {
				err = PTR_ERR(tuplehash);
				break;
			}
			continue;
		}
		if (tuplehash->tuple.dir)
			continue;

		flow = container_of(tuplehash, struct flow_offload, tuplehash[0]);

		iter(flow, data);
	}
	rhashtable_walk_stop(&hti);
	rhashtable_walk_exit(&hti);

	return err;
}

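/*
 * Garbage collector step: expired or dying flows are marked for
 * teardown. Hardware-offloaded flows are first removed from hardware
 * and freed once the removal has completed; software flows are freed
 * directly. Active hardware flows have their stats refreshed.
 */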
static void nf_flow_offload_gc_step(struct flow_offload *flow, void *data)
{
	struct nf_flowtable *flow_table = data;

	if (nf_flow_has_expired(flow) || nf_ct_is_dying(flow->ct))
		set_bit(NF_FLOW_TEARDOWN, &flow->flags);

	if (test_bit(NF_FLOW_TEARDOWN, &flow->flags)) {
		if (test_bit(NF_FLOW_HW, &flow->flags)) {
			if (!test_bit(NF_FLOW_HW_DYING, &flow->flags))
				nf_flow_offload_del(flow_table, flow);
			else if (test_bit(NF_FLOW_HW_DEAD, &flow->flags))
				flow_offload_del(flow_table, flow);
		} else {
			flow_offload_del(flow_table, flow);
		}
	} else if (test_bit(NF_FLOW_HW, &flow->flags)) {
		nf_flow_offload_stats(flow_table, flow);
	}
}

static void nf_flow_offload_work_gc(struct work_struct *work)
{
	struct nf_flowtable *flow_table;

	flow_table = container_of(work, struct nf_flowtable, gc_work.work);
	nf_flow_table_iterate(flow_table, nf_flow_offload_gc_step, flow_table);
	queue_delayed_work(system_power_efficient_wq, &flow_table->gc_work, HZ);
}

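/*
 * Layer 4 port mangling helpers: fix up the TCP/UDP checksum after a
 * port has been rewritten in the packet.
 */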
static int nf_flow_nat_port_tcp(struct sk_buff *skb, unsigned int thoff,
				__be16 port, __be16 new_port)
{
	struct tcphdr *tcph;

	if (skb_try_make_writable(skb, thoff + sizeof(*tcph)))
		return -1;

	tcph = (void *)(skb_network_header(skb) + thoff);
	inet_proto_csum_replace2(&tcph->check, skb, port, new_port, false);

	return 0;
}

static int nf_flow_nat_port_udp(struct sk_buff *skb, unsigned int thoff,
				__be16 port, __be16 new_port)
{
	struct udphdr *udph;

	if (skb_try_make_writable(skb, thoff + sizeof(*udph)))
		return -1;

	udph = (void *)(skb_network_header(skb) + thoff);
	if (udph->check || skb->ip_summed == CHECKSUM_PARTIAL) {
		inet_proto_csum_replace2(&udph->check, skb, port,
					 new_port, false);
		if (!udph->check)
			udph->check = CSUM_MANGLED_0;
	}

	return 0;
}

static int nf_flow_nat_port(struct sk_buff *skb, unsigned int thoff,
			    u8 protocol, __be16 port, __be16 new_port)
{
	switch (protocol) {
	case IPPROTO_TCP:
		if (nf_flow_nat_port_tcp(skb, thoff, port, new_port) < 0)
			return NF_DROP;
		break;
	case IPPROTO_UDP:
		if (nf_flow_nat_port_udp(skb, thoff, port, new_port) < 0)
			return NF_DROP;
		break;
	}

	return 0;
}

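/*
 * Source NAT for the transport ports: rewrite the source port in the
 * original direction (or the destination port in the reply direction)
 * using the port recorded in the opposite tuple.
 */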
int nf_flow_snat_port(const struct flow_offload *flow,
		      struct sk_buff *skb, unsigned int thoff,
		      u8 protocol, enum flow_offload_tuple_dir dir)
{
	struct flow_ports *hdr;
	__be16 port, new_port;

	if (skb_try_make_writable(skb, thoff + sizeof(*hdr)))
		return -1;

	hdr = (void *)(skb_network_header(skb) + thoff);

	switch (dir) {
	case FLOW_OFFLOAD_DIR_ORIGINAL:
		port = hdr->source;
		new_port = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_port;
		hdr->source = new_port;
		break;
	case FLOW_OFFLOAD_DIR_REPLY:
		port = hdr->dest;
		new_port = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_port;
		hdr->dest = new_port;
		break;
	default:
		return -1;
	}

	return nf_flow_nat_port(skb, thoff, protocol, port, new_port);
}
EXPORT_SYMBOL_GPL(nf_flow_snat_port);

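/*
 * Destination NAT counterpart: rewrite the destination port in the
 * original direction (or the source port in the reply direction).
 */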
int nf_flow_dnat_port(const struct flow_offload *flow,
		      struct sk_buff *skb, unsigned int thoff,
		      u8 protocol, enum flow_offload_tuple_dir dir)
{
	struct flow_ports *hdr;
	__be16 port, new_port;

	if (skb_try_make_writable(skb, thoff + sizeof(*hdr)))
		return -1;

	hdr = (void *)(skb_network_header(skb) + thoff);

	switch (dir) {
	case FLOW_OFFLOAD_DIR_ORIGINAL:
		port = hdr->dest;
		new_port = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_port;
		hdr->dest = new_port;
		break;
	case FLOW_OFFLOAD_DIR_REPLY:
		port = hdr->source;
		new_port = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_port;
		hdr->source = new_port;
		break;
	default:
		return -1;
	}

	return nf_flow_nat_port(skb, thoff, protocol, port, new_port);
}
EXPORT_SYMBOL_GPL(nf_flow_dnat_port);

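/*
 * Initialize a flowtable: set up the rhashtable, start the periodic
 * garbage collection work (once per second) and register the table in
 * the global flowtable list.
 */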
int nf_flow_table_init(struct nf_flowtable *flowtable)
{
	int err;

	INIT_DEFERRABLE_WORK(&flowtable->gc_work, nf_flow_offload_work_gc);
	flow_block_init(&flowtable->flow_block);
	init_rwsem(&flowtable->flow_block_lock);

	err = rhashtable_init(&flowtable->rhashtable,
			      &nf_flow_offload_rhash_params);
	if (err < 0)
		return err;

	queue_delayed_work(system_power_efficient_wq,
			   &flowtable->gc_work, HZ);

	mutex_lock(&flowtable_lock);
	list_add(&flowtable->list, &flowtables);
	mutex_unlock(&flowtable_lock);

	return 0;
}
EXPORT_SYMBOL_GPL(nf_flow_table_init);

static void nf_flow_table_do_cleanup(struct flow_offload *flow, void *data)
{
	struct net_device *dev = data;

	if (!dev) {
		flow_offload_teardown(flow);
		return;
	}

	if (net_eq(nf_ct_net(flow->ct), dev_net(dev)) &&
	    (flow->tuplehash[0].tuple.iifidx == dev->ifindex ||
	     flow->tuplehash[1].tuple.iifidx == dev->ifindex))
		flow_offload_teardown(flow);
}

void nf_flow_table_gc_cleanup(struct nf_flowtable *flowtable,
			      struct net_device *dev)
{
	nf_flow_table_iterate(flowtable, nf_flow_table_do_cleanup, dev);
	flush_delayed_work(&flowtable->gc_work);
	nf_flow_table_offload_flush(flowtable);
}

void nf_flow_table_cleanup(struct net_device *dev)
{
	struct nf_flowtable *flowtable;

	mutex_lock(&flowtable_lock);
	list_for_each_entry(flowtable, &flowtables, list)
		nf_flow_table_gc_cleanup(flowtable, dev);
	mutex_unlock(&flowtable_lock);
}
EXPORT_SYMBOL_GPL(nf_flow_table_cleanup);

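/*
 * Release a flowtable: unregister it, stop garbage collection, tear
 * down and free every remaining entry (flushing pending hardware
 * offload work) and destroy the hash table.
 */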
void nf_flow_table_free(struct nf_flowtable *flow_table)
{
	mutex_lock(&flowtable_lock);
	list_del(&flow_table->list);
	mutex_unlock(&flowtable_lock);

	cancel_delayed_work_sync(&flow_table->gc_work);
	nf_flow_table_iterate(flow_table, nf_flow_table_do_cleanup, NULL);
	nf_flow_table_iterate(flow_table, nf_flow_offload_gc_step, flow_table);
	nf_flow_table_offload_flush(flow_table);
	if (nf_flowtable_hw_offload(flow_table))
		nf_flow_table_iterate(flow_table, nf_flow_offload_gc_step,
				      flow_table);
	rhashtable_destroy(&flow_table->rhashtable);
}
EXPORT_SYMBOL_GPL(nf_flow_table_free);

static int __init nf_flow_table_module_init(void)
{
	return nf_flow_table_offload_init();
}

static void __exit nf_flow_table_module_exit(void)
{
	nf_flow_table_offload_exit();
}

module_init(nf_flow_table_module_init);
module_exit(nf_flow_table_module_exit);

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
MODULE_DESCRIPTION("Netfilter flow table module");