1#!/bin/bash
2# SPDX-License-Identifier: GPL-2.0
3#
4# This tests basic flowtable functionality.
5# Creates following default topology:
6#
7# Originator (MTU 9000) <-Router1-> MTU 1500 <-Router2-> Responder (MTU 2000)
8# Router1 is the one doing flow offloading, Router2 has no special
9# purpose other than having a link that is smaller than either Originator
10# and responder, i.e. TCPMSS announced values are too large and will still
11# result in fragmentation and/or PMTU discovery.
12#
# You can check with different Originator/Link/Responder MTU eg:
14# nft_flowtable.sh -o8000 -l1500 -r2000
15#
16
# Random suffix so the namespace names created below are unique to this run.
sfx=$(mktemp -u "XXXXXXXX")

# ns1/ns2 are the end hosts, nsr1/nsr2 the two routers between them.
ns1="ns1-${sfx}"
ns2="ns2-${sfx}"
nsr1="nsr1-${sfx}"
nsr2="nsr2-${sfx}"

# Exit code the kselftest framework treats as SKIP.
ksft_skip=4
# Overall result of the script; failing checks set it to 1.
ret=0

# Temporary file names.  Empty until the files are created with mktemp.
nsin=
ns1out=
ns2out=

# Value of nf_log_all_netns before the test changes it (read by cleanup()).
log_netns=$(sysctl -n net.netfilter.nf_log_all_netns)
32
# Run a probe command and skip the whole test if it fails.
#   $1 - command line to run; expanded unquoted on purpose so that it is
#        split into command and arguments
#   $2 - text appended to "SKIP: Could not " when the probe fails
# Exits with $ksft_skip on failure, returns 0 otherwise.
checktool (){
	local probe=$1
	local reason=$2

	$probe > /dev/null 2>&1 && return 0

	echo "SKIP: Could not $reason"
	exit $ksft_skip
}
39
# Skip (rather than fail) when a prerequisite is missing.
checktool "nft --version" "run test without nft tool"
checktool "ip -Version" "run test without ip tool"
# 'command -v' is a shell builtin, so this probe does not depend on an
# external 'which' binary being installed.
checktool "command -v nc" "run test without nc (netcat)"
# This probe creates the first router namespace as a side effect.
checktool "ip netns add $nsr1" "create net namespace $nsr1"

# Remaining namespaces: both end hosts and the second router.
ip netns add "$ns1"
ip netns add "$ns2"
ip netns add "$nsr2"
48
# EXIT handler: removes the namespaces and temporary files created by the
# test and puts nf_log_all_netns back to its previous value.
cleanup() {
	local netns

	for netns in "$ns1" "$ns2" "$nsr1" "$nsr2"; do
		ip netns del "$netns"
	done

	rm -f "$nsin" "$ns1out" "$ns2out"

	# The test sets the sysctl to 1, so it only needs to be written back
	# when the saved value was 0.
	[ $log_netns -eq 0 ] && sysctl -q net.netfilter.nf_log_all_netns=$log_netns
}
59
# From here on, namespaces and temp files are removed on any exit path.
trap cleanup EXIT

sysctl -q net.netfilter.nf_log_all_netns=1

# ns1:eth0 <-> nsr1:veth0, nsr1:veth1 <-> nsr2:veth0, nsr2:veth1 <-> ns2:eth0
ip link add veth0 netns "$nsr1" type veth peer name eth0 netns "$ns1"
ip link add veth1 netns "$nsr1" type veth peer name veth0 netns "$nsr2"

ip link add veth1 netns "$nsr2" type veth peer name eth0 netns "$ns2"

for dev in lo veth0 veth1; do
	ip -net "$nsr1" link set "$dev" up
	ip -net "$nsr2" link set "$dev" up
done

# Router addresses facing the end hosts.
ip -net "$nsr1" addr add 10.0.1.1/24 dev veth0
ip -net "$nsr1" addr add dead:1::1/64 dev veth0

ip -net "$nsr2" addr add 10.0.2.1/24 dev veth1
ip -net "$nsr2" addr add dead:2::1/64 dev veth1

# Use different MTUs so that packets coming from ns1 (large MTU) towards
# ns2 (smaller MTU) must be pushed to the stack, either to be fragmented
# (ip_no_pmtu_disc=1) or to trigger PMTU discovery (ICMP error sent back to
# the originator).
# ns2 is reached via nsr2 over a link with a smaller MTU, so the TCPMSS
# announced by both peers is NOT the lowest link MTU.

# Defaults; can be overridden with -o / -l / -r.
omtu=9000
lmtu=1500
rmtu=2000
89
# Print the command line help to stdout and exit with status 1.
usage(){
	printf '%s\n' \
		"nft_flowtable.sh [OPTIONS]" \
		"" \
		"MTU options" \
		"   -o originator" \
		"   -l link" \
		"   -r responder"
	exit 1
}
99
# Optional MTU overrides, see usage().
while getopts "o:l:r:" o
do
	case $o in
		o) omtu=$OPTARG;;
		l) lmtu=$OPTARG;;
		r) rmtu=$OPTARG;;
		*) usage;;
	esac
done

# Originator side: ns1:eth0 <-> nsr1:veth0.
if ! ip -net "$nsr1" link set veth0 mtu "$omtu"; then
	exit 1
fi

ip -net "$ns1" link set eth0 mtu "$omtu"

# Transfer link: nsr1:veth1 <-> nsr2:veth0.  Without these two commands
# the -l option would be parsed but have no effect.
if ! ip -net "$nsr1" link set veth1 mtu "$lmtu"; then
	exit 1
fi

if ! ip -net "$nsr2" link set veth0 mtu "$lmtu"; then
	exit 1
fi

# Responder side: nsr2:veth1 <-> ns2:eth0.
if ! ip -net "$nsr2" link set veth1 mtu "$rmtu"; then
	exit 1
fi

ip -net "$ns2" link set eth0 mtu "$rmtu"
121
# Transfer network between nsr1 and nsr2.
# These addresses are not used as connection endpoints.
ip -net "$nsr1" addr add 192.168.10.1/24 dev veth1
ip -net "$nsr1" addr add fee1:2::1/64 dev veth1

ip -net "$nsr2" addr add 192.168.10.2/24 dev veth0
ip -net "$nsr2" addr add fee1:2::2/64 dev veth0

# Enable IPv4 forwarding on both veth devices of both routers.
for i in 0 1; do
	ip netns exec "$nsr1" sysctl "net.ipv4.conf.veth${i}.forwarding=1" > /dev/null
	ip netns exec "$nsr2" sysctl "net.ipv4.conf.veth${i}.forwarding=1" > /dev/null
done

# Bring up the end hosts and adjust their TCP/IP settings.
for ns in "$ns1" "$ns2"; do
	ip -net "$ns" link set lo up
	ip -net "$ns" link set eth0 up

	if ! ip netns exec "$ns" sysctl net.ipv4.tcp_no_metrics_save=1 > /dev/null; then
		echo "ERROR: Check Originator/Responder values (problem during address addition)"
		exit 1
	fi
	# don't set ip DF bit for first two tests
	ip netns exec "$ns" sysctl net.ipv4.ip_no_pmtu_disc=1 > /dev/null
done

# End host addresses and default routes via the adjacent router.
ip -net "$ns1" addr add 10.0.1.99/24 dev eth0
ip -net "$ns2" addr add 10.0.2.99/24 dev eth0
ip -net "$ns1" route add default via 10.0.1.1
ip -net "$ns2" route add default via 10.0.2.1
ip -net "$ns1" addr add dead:1::99/64 dev eth0
ip -net "$ns2" addr add dead:2::99/64 dev eth0
ip -net "$ns1" route add default via dead:1::1
ip -net "$ns2" route add default via dead:2::1

# The routers reach the remote end host network through each other.
ip -net "$nsr1" route add default via 192.168.10.2
ip -net "$nsr2" route add default via 192.168.10.1
158
# Router1: flowtable on both veth devices plus a forward chain that
# requests offload for the test connection and counts packets that still
# travel through the forward chain, per direction.
if ! ip netns exec "$nsr1" nft -f - <<EOF
table inet filter {
  flowtable f1 {
     hook ingress priority 0
     devices = { veth0, veth1 }
   }

   counter routed_orig { }
   counter routed_repl { }

   chain forward {
      type filter hook forward priority 0; policy drop;

      # flow offloaded? Tag ct with mark 1, so we can detect when it fails.
      meta oif "veth1" tcp dport 12345 ct mark set 1 flow add @f1 counter name routed_orig accept

      # count packets supposedly offloaded as per direction.
      ct mark 1 counter name ct direction map { original : routed_orig, reply : routed_repl } accept

      ct state established,related accept

      meta nfproto ipv4 meta l4proto icmp accept
      meta nfproto ipv6 meta l4proto icmpv6 accept
   }
}
EOF
then
	echo "SKIP: Could not load nft ruleset"
	exit $ksft_skip
fi

# Responder: count incoming TCP packets by their DSCP value (cs3 vs. 0).
if ! ip netns exec "$ns2" nft -f - <<EOF
table inet filter {
   counter ip4dscp0 { }
   counter ip4dscp3 { }

   chain input {
      type filter hook input priority 0; policy accept;
      meta l4proto tcp goto {
	      ip dscp cs3 counter name ip4dscp3 accept
	      ip dscp 0 counter name ip4dscp0 accept
      }
   }
}
EOF
then
	echo "SKIP: Could not load nft ruleset"
	exit $ksft_skip
fi
210
# Test basic connectivity in both directions before running any transfer.
if ! ip netns exec "$ns1" ping -c 1 -q 10.0.2.99 > /dev/null; then
	# Report the real namespace names on both sides of the message.
	echo "ERROR: $ns1 cannot reach $ns2" 1>&2
	exit 1
fi

if ! ip netns exec "$ns2" ping -c 1 -q 10.0.1.99 > /dev/null; then
	echo "ERROR: $ns2 cannot reach $ns1" 1>&2
	exit 1
fi

if [ $ret -eq 0 ];then
	echo "PASS: netns routing/connectivity: $ns1 can reach $ns2"
fi

# Input file sent in both directions, and one output file per receiver.
# cleanup() removes all three.
nsin=$(mktemp)
ns1out=$(mktemp)
ns2out=$(mktemp)
229
# Fill a file with random data of random size.
#   $1 - path of the file to (over)write
# The file consists of a random number of 1024-byte blocks followed by a
# short tail of 128..1151 bytes, so its size is not a multiple of the block
# size.  The globals name, SIZE and TSIZE are set; TSIZE holds the resulting
# total size in bytes.
make_file()
{
	name=$1

	SIZE=$((RANDOM % (1024 * 128)))
	SIZE=$((SIZE + (1024 * 8)))
	TSIZE=$((SIZE * 1024))

	dd if=/dev/urandom of="$name" bs=1024 count=$SIZE 2> /dev/null

	SIZE=$((RANDOM % 1024))
	SIZE=$((SIZE + 128))
	TSIZE=$((TSIZE + SIZE))
	# conv=notrunc keeps the blocks written above and oflag=append places
	# the tail after them instead of overwriting the start of the file.
	dd if=/dev/urandom conv=notrunc oflag=append of="$name" bs=1 count=$SIZE 2> /dev/null
}
245
# Read and reset the routed_orig/routed_repl byte counters in Router1 and
# verify they stay at or below a limit derived from the size of $nsin: the
# file size for the original direction, a quarter of it for the reply
# direction.
#   $1 - description used in the PASS/FAIL message
# Sets the global ret to 1 when a counter exceeds its limit.
check_counters()
{
	local what=$1
	local ok=1
	local orig_line repl_line orig_bytes repl_bytes
	local du_out limit_orig limit_repl

	orig_line=$(ip netns exec $nsr1 nft reset counter inet filter routed_orig | grep packets)
	repl_line=$(ip netns exec $nsr1 nft reset counter inet filter routed_repl | grep packets)

	# Keep what follows the word "bytes" on the counter line.
	orig_bytes=${orig_line#*bytes}
	repl_bytes=${repl_line#*bytes}

	# du prints "<size> <path>"; cut everything from the first "/" on.
	du_out=$(du -sb $nsin)
	limit_orig=${du_out%%/*}
	limit_repl=$((limit_orig/4))

	if [ $orig_bytes -gt $limit_orig ];then
		echo "FAIL: $what: original counter $orig_bytes exceeds expected value $limit_orig" 1>&2
		ok=0
		ret=1
	fi

	if [ $repl_bytes -gt $limit_repl ];then
		echo "FAIL: $what: reply counter $repl_bytes exceeds expected value $limit_repl" 1>&2
		ok=0
		ret=1
	fi

	if [ $ok -eq 1 ]; then
		echo "PASS: $what"
	fi
}
277
# Read and reset the ip4dscp3 / ip4dscp0 packet counters in the responder
# namespace and compare them with what the given scenario expects.
#   $1 - scenario: dscp_none, dscp_fwd, dscp_ingress or dscp_egress
# Sets the global ret to 1 on mismatch or unknown scenario.  The return
# status does not reflect the outcome.
check_dscp()
{
	local what=$1
	local line cs3_pkts zero_pkts
	local failure=""

	# ip4dscp3 is read first, then ip4dscp0; both are reset by the read.
	line=$(ip netns exec $ns2 nft reset counter inet filter ip4dscp3 | grep packets)
	cs3_pkts=${line%*bytes*}
	cs3_pkts=${cs3_pkts#*packets}

	line=$(ip netns exec $ns2 nft reset counter inet filter ip4dscp0 | grep packets)
	zero_pkts=${line%*bytes*}
	zero_pkts=${zero_pkts#*packets}

	case "$what" in
	"dscp_none")
		if [ $cs3_pkts -gt 0 ] || [ $zero_pkts -eq 0 ]; then
			failure="FAIL: dscp counters do not match, expected dscp3 == 0, dscp0 > 0, but got $cs3_pkts,$zero_pkts"
		fi
		;;
	"dscp_fwd")
		if [ $cs3_pkts -eq 0 ] || [ $zero_pkts -eq 0 ]; then
			failure="FAIL: dscp counters do not match, expected dscp3 and dscp0 > 0 but got $cs3_pkts,$zero_pkts"
		fi
		;;
	"dscp_ingress"|"dscp_egress")
		# Both netdev hooks have the same expectation.
		if [ $cs3_pkts -eq 0 ] || [ $zero_pkts -gt 0 ]; then
			failure="FAIL: dscp counters do not match, expected dscp3 > 0, dscp0 == 0 but got $cs3_pkts,$zero_pkts"
		fi
		;;
	*)
		failure="FAIL: Unknown DSCP check"
		;;
	esac

	if [ -n "$failure" ]; then
		echo "$failure" 1>&2
		ret=1
	else
		echo "PASS: $what: dscp packet counters match"
	fi
}
331
# Compare a received file with the file that was sent.
#   $1 - reference (sent) file
#   $2 - received file
#   $3 - description of the transfer, used in the failure message
# Returns 0 if both files are identical, 1 otherwise (after printing a FAIL
# message to stderr and listing both files on stdout).
check_transfer()
{
	# Locals, so that variables of the same name in the callers (notably
	# "what") are not overwritten.
	local src=$1
	local dst=$2
	local what=$3

	if ! cmp "$src" "$dst" > /dev/null 2>&1; then
		echo "FAIL: file mismatch for $what" 1>&2
		ls -l "$src"
		ls -l "$dst"
		return 1
	fi

	return 0
}
347
# Send $nsin over TCP in both directions between two namespaces using nc
# and verify that both sides received an identical copy.
#   $1 - namespace running the client
#   $2 - namespace running the listener
#   $3 - address the client connects to
#   $4 - port the client connects to
# The listener always binds port 12345, whatever $4 is.
# Returns 0 if both received files match $nsin, 1 otherwise.
test_tcp_forwarding_ip()
{
	local nsa=$1
	local nsb=$2
	local dstip=$3
	local dstport=$4
	local lret=0
	# Kept local so the PIDs and the listing do not leak into global scope.
	local lpid cpid prev

	ip netns exec "$nsb" nc -w 5 -l -p 12345 < "$nsin" > "$ns2out" &
	lpid=$!

	# Give the listener time to start before connecting.
	sleep 1
	ip netns exec "$nsa" nc -w 4 "$dstip" "$dstport" < "$nsin" > "$ns1out" &
	cpid=$!

	sleep 1

	prev="$(ls -l "$ns1out" "$ns2out")"
	sleep 1

	# Wait until neither output file changed for one second.
	while [[ "$prev" != "$(ls -l "$ns1out" "$ns2out")" ]]; do
		sleep 1;
		prev="$(ls -l "$ns1out" "$ns2out")"
	done

	# Stop whichever nc process is still running.
	if test -d /proc/"$lpid"/; then
		kill "$lpid"
	fi

	if test -d /proc/"$cpid"/; then
		kill "$cpid"
	fi

	wait "$lpid"
	wait "$cpid"

	if ! check_transfer "$nsin" "$ns2out" "ns1 -> ns2"; then
		lret=1
	fi

	if ! check_transfer "$nsin" "$ns1out" "ns1 <- ns2"; then
		lret=1
	fi

	return $lret
}
394
# Run the transfer test against the responder address 10.0.2.99, port 12345.
#   $1 - client namespace
#   $2 - listener namespace
# Returns the status of test_tcp_forwarding_ip.
test_tcp_forwarding()
{
	local client_ns=$1
	local server_ns=$2

	# Last command, so its status becomes the return value.
	test_tcp_forwarding_ip "$client_ns" "$server_ns" 10.0.2.99 12345
}
401
# Verify DSCP mangling together with the flowtable: set dscp cs3 from a
# netdev ingress hook, from a netdev egress hook and from the forward chain,
# running one transfer per case and checking the responder's DSCP counters.
#   $1 - client namespace
#   $2 - listener namespace
# A netdev hook that cannot be loaded is reported as SKIP.  Results are
# recorded by check_dscp in the global ret; the return status is that of
# the final check_dscp call.
test_tcp_forwarding_set_dscp()
{
	# Counters accumulated so far must show untagged packets only.
	check_dscp "dscp_none"

	ip netns exec $nsr1 nft -f - <<EOF
table netdev dscpmangle {
   chain setdscp0 {
      type filter hook ingress device "veth0" priority 0; policy accept
	ip dscp set cs3
  }
}
EOF
	if [ $? -eq 0 ]; then
		test_tcp_forwarding_ip "$1" "$2"  10.0.2.99 12345
		check_dscp "dscp_ingress"

		ip netns exec $nsr1 nft delete table netdev dscpmangle
	else
		echo "SKIP: Could not load netdev:ingress for veth0"
	fi

	ip netns exec $nsr1 nft -f - <<EOF
table netdev dscpmangle {
   chain setdscp0 {
      type filter hook egress device "veth1" priority 0; policy accept
      ip dscp set cs3
  }
}
EOF
	if [ $? -eq 0 ]; then
		test_tcp_forwarding_ip "$1" "$2"  10.0.2.99 12345
		check_dscp "dscp_egress"

		# Delete the table, as in the ingress case, so that no hooked
		# chain is left behind on veth1.
		ip netns exec $nsr1 nft delete table netdev dscpmangle
	else
		echo "SKIP: Could not load netdev:egress for veth1"
	fi

	# partial.  If flowtable really works, then both dscp-is-0 and dscp-is-cs3
	# counters should have seen packets (before and after ft offload kicks in).
	ip netns exec $nsr1 nft -a insert rule inet filter forward ip dscp set cs3
	test_tcp_forwarding_ip "$1" "$2"  10.0.2.99 12345
	check_dscp "dscp_fwd"
}
446
# Run the transfer test twice with NAT active in Router1: first to the real
# responder address (masquerade only), then to 10.6.6.6:1666 (which the nat
# table redirects to the responder).
#   $1 - client namespace
#   $2 - listener namespace
#   $3 - 1 if PMTU discovery is enabled; the flowtable counters are then
#        checked via check_counters instead of printing a plain PASS
#   $4 - text appended to the PASS/FAIL descriptions (may be empty)
# Returns the status of the last transfer that was run.
test_tcp_forwarding_nat()
{
	local lret
	local pmtu=$3
	# Kept under its own name: helpers called below use a variable named
	# "what" themselves and would otherwise replace the description before
	# the dnat messages are printed.
	local nat_what=$4

	test_tcp_forwarding_ip "$1" "$2" 10.0.2.99 12345
	lret=$?

	if [ $lret -eq 0 ] ; then
		if [ $pmtu -eq 1 ] ;then
			check_counters "flow offload for ns1/ns2 with masquerade and pmtu discovery $nat_what"
		else
			echo "PASS: flow offload for ns1/ns2 with masquerade $nat_what"
		fi

		test_tcp_forwarding_ip "$1" "$2" 10.6.6.6 1666
		lret=$?
		if [ $pmtu -eq 1 ] ;then
			check_counters "flow offload for ns1/ns2 with dnat and pmtu discovery $nat_what"
		elif [ $lret -eq 0 ] ; then
			echo "PASS: flow offload for ns1/ns2 with dnat $nat_what"
		fi
	fi

	return $lret
}
476
make_file "$nsin"

# First test:
# No PMTU discovery, nsr1 is expected to fragment packets from ns1 to ns2 as needed.
# Due to MTU mismatch in both directions, all packets (except small packets like pure
# acks) have to be handled by normal forwarding path.  Therefore, packet counters
# are not checked.
if test_tcp_forwarding $ns1 $ns2; then
	echo "PASS: flow offloaded for ns1/ns2"
else
	echo "FAIL: flow offload for ns1/ns2:" 1>&2
	ip netns exec $nsr1 nft list ruleset
	ret=1
fi

# delete default route, i.e. ns2 won't be able to reach ns1 and
# will depend on ns1 being masqueraded in nsr1.
# expect ns1 has nsr1 address.
ip -net $ns2 route del default via 10.0.2.1
ip -net $ns2 route del default via dead:2::1
ip -net $ns2 route add 192.168.10.1 via 10.0.2.1

# Second test:
# Same, but with NAT enabled.  Same as in first test: we expect normal forward path
# to handle most packets.
ip netns exec $nsr1 nft -f - <<EOF
table ip nat {
   chain prerouting {
      type nat hook prerouting priority 0; policy accept;
      meta iif "veth0" ip daddr 10.6.6.6 tcp dport 1666 counter dnat ip to 10.0.2.99:12345
   }

   chain postrouting {
      type nat hook postrouting priority 0; policy accept;
      meta oifname "veth1" counter masquerade
   }
}
EOF

if ! test_tcp_forwarding_set_dscp $ns1 $ns2 0 ""; then
	echo "FAIL: flow offload for ns1/ns2 with dscp update" 1>&2
	# Record the failure and carry on; exiting with status 0 here would
	# report a failed run as successful.
	ret=1
fi

if ! test_tcp_forwarding_nat $ns1 $ns2 0 ""; then
	echo "FAIL: flow offload for ns1/ns2 with NAT" 1>&2
	ip netns exec $nsr1 nft list ruleset
	ret=1
fi

# Third test:
# Same as second test, but with PMTU discovery enabled. This
# means that we expect the fastpath to handle packets as soon
# as the endpoints adjust the packet size.
ip netns exec $ns1 sysctl net.ipv4.ip_no_pmtu_disc=0 > /dev/null
ip netns exec $ns2 sysctl net.ipv4.ip_no_pmtu_disc=0 > /dev/null

# reset counters.
# With pmtu in-place we'll also check that nft counters
# are lower than file size and packets were forwarded via flowtable layer.
# For earlier tests (large mtus), packets cannot be handled via flowtable
# (except pure acks and other small packets).
ip netns exec $nsr1 nft reset counters table inet filter >/dev/null

if ! test_tcp_forwarding_nat $ns1 $ns2 1 ""; then
	echo "FAIL: flow offload for ns1/ns2 with NAT and pmtu discovery" 1>&2
	ip netns exec $nsr1 nft list ruleset
	# Without this the failure would not be reflected in the exit code.
	ret=1
fi
545
# Bridge test:
# Move veth0 of Router1 below a new bridge br0, which takes over veth0's
# addresses, and repeat the NAT test.
ip -net "$nsr1" link add name br0 type bridge
ip -net "$nsr1" addr flush dev veth0
ip -net "$nsr1" link set up dev veth0
ip -net "$nsr1" link set veth0 master br0
ip -net "$nsr1" addr add 10.0.1.1/24 dev br0
ip -net "$nsr1" addr add dead:1::1/64 dev br0
ip -net "$nsr1" link set up dev br0

ip netns exec "$nsr1" sysctl net.ipv4.conf.br0.forwarding=1 > /dev/null

# Reload the nat table so that the dnat rule matches on br0 instead of veth0.
ip netns exec "$nsr1" nft -f - <<EOF
flush table ip nat
table ip nat {
   chain prerouting {
      type nat hook prerouting priority 0; policy accept;
      meta iif "br0" ip daddr 10.6.6.6 tcp dport 1666 counter dnat ip to 10.0.2.99:12345
   }

   chain postrouting {
      type nat hook postrouting priority 0; policy accept;
      meta oifname "veth1" counter masquerade
   }
}
EOF

if ! test_tcp_forwarding_nat "$ns1" "$ns2" 1 "on bridge"; then
	echo "FAIL: flow offload for ns1/ns2 with bridge NAT" 1>&2
	ip netns exec "$nsr1" nft list ruleset
	ret=1
fi


# Bridge + VLAN test:
# Replace veth0 as bridge port by a VLAN (id 10) device on top of it, and
# move the originator's addresses to a matching VLAN device.
ip -net "$nsr1" link set veth0 nomaster
ip -net "$nsr1" link set down dev veth0
ip -net "$nsr1" link add link veth0 name veth0.10 type vlan id 10
ip -net "$nsr1" link set up dev veth0
ip -net "$nsr1" link set up dev veth0.10
ip -net "$nsr1" link set veth0.10 master br0

ip -net "$ns1" addr flush dev eth0
ip -net "$ns1" link add link eth0 name eth0.10 type vlan id 10
ip -net "$ns1" link set eth0 up
ip -net "$ns1" link set eth0.10 up
ip -net "$ns1" addr add 10.0.1.99/24 dev eth0.10
ip -net "$ns1" route add default via 10.0.1.1
ip -net "$ns1" addr add dead:1::99/64 dev eth0.10

if ! test_tcp_forwarding_nat "$ns1" "$ns2" 1 "bridge and VLAN"; then
	echo "FAIL: flow offload for ns1/ns2 with bridge NAT and VLAN" 1>&2
	ip netns exec "$nsr1" nft list ruleset
	ret=1
fi

# Restore the test topology: remove the bridge and both VLAN devices.
ip -net "$nsr1" link set veth0 nomaster
ip -net "$nsr1" link set veth0 down
ip -net "$nsr1" link set veth0.10 down
ip -net "$nsr1" link delete veth0.10 type vlan
ip -net "$nsr1" link delete br0 type bridge
ip -net "$ns1" addr flush dev eth0.10
ip -net "$ns1" link set eth0.10 down
ip -net "$ns1" link set eth0 down
ip -net "$ns1" link delete eth0.10 type vlan

# Put the addresses and default routes back on eth0 (ns1) and veth0 (nsr1).
ip -net "$ns1" link set eth0 up
ip -net "$ns1" addr add 10.0.1.99/24 dev eth0
ip -net "$ns1" route add default via 10.0.1.1
ip -net "$ns1" addr add dead:1::99/64 dev eth0
ip -net "$ns1" route add default via dead:1::1
ip -net "$nsr1" addr add 10.0.1.1/24 dev veth0
ip -net "$nsr1" addr add dead:1::1/64 dev veth0
ip -net "$nsr1" link set up dev veth0
624
# Throw-away keys for the ESP tunnel: hex digests of the current process
# listing, prefixed with 0x.
KEY_SHA="0x$(ps -af | sha1sum | cut -d ' ' -f 1)"
KEY_AES="0x$(ps -af | md5sum | cut -d ' ' -f 1)"

# One SPI per direction; the two must differ.
SPI1=$RANDOM
SPI2=$RANDOM

if [ "$SPI1" -eq "$SPI2" ]; then
	SPI2=$((SPI2 + 1))
fi
633
# Install ESP tunnel-mode xfrm states and policies in one namespace.
#   $1 - namespace to configure
#   $2 - local tunnel endpoint address
#   $3 - remote tunnel endpoint address
#   $4 - local network (selector)
#   $5 - remote network (selector)
#   $6 - SPI for outgoing traffic
#   $7 - SPI for incoming traffic
# Uses the globals KEY_AES and KEY_SHA as encryption/authentication keys.
do_esp() {
    local netns=$1
    local local_ip=$2
    local peer_ip=$3
    local local_net=$4
    local peer_net=$5
    local out_spi=$6
    local in_spi=$7
    # Parts shared by both states resp. both policies.
    local -a crypto=(enc aes "$KEY_AES" auth sha1 "$KEY_SHA" mode tunnel)
    local -a allow=(proto esp mode tunnel priority 1 action allow)

    # Incoming state first, then outgoing state.
    ip -net "$netns" xfrm state add src "$peer_ip" dst "$local_ip" proto esp spi "$in_spi" \
        "${crypto[@]}" sel src "$peer_net" dst "$local_net"
    ip -net "$netns" xfrm state add src "$local_ip" dst "$peer_ip" proto esp spi "$out_spi" \
        "${crypto[@]}" sel src "$local_net" dst "$peer_net"

    # Encrypt packets as they go out (includes forwarded packets that need
    # encapsulation).
    ip -net "$netns" xfrm policy add src "$local_net" dst "$peer_net" dir out \
        tmpl src "$local_ip" dst "$peer_ip" "${allow[@]}"
    # Forward decrypted packets after esp processing.
    ip -net "$netns" xfrm policy add src "$peer_net" dst "$local_net" dir fwd \
        tmpl src "$peer_ip" dst "$local_ip" "${allow[@]}"
}
652
# Set up the ESP tunnel between both routers; the SPIs are mirrored so that
# one side's outgoing SPI is the other side's incoming SPI.
do_esp $nsr1 192.168.10.1 192.168.10.2 10.0.1.0/24 10.0.2.0/24 $SPI1 $SPI2

do_esp $nsr2 192.168.10.2 192.168.10.1 10.0.2.0/24 10.0.1.0/24 $SPI2 $SPI1

# The ipsec test runs without NAT.
ip netns exec $nsr1 nft delete table ip nat

# restore default routes
ip -net $ns2 route del 192.168.10.1 via 10.0.2.1
ip -net $ns2 route add default via 10.0.2.1
ip -net $ns2 route add default via dead:2::1

if test_tcp_forwarding $ns1 $ns2; then
	check_counters "ipsec tunnel mode for ns1/ns2"
else
	echo "FAIL: ipsec tunnel mode for ns1/ns2" 1>&2
	ip netns exec $nsr1 nft list ruleset 1>&2
	ip netns exec $nsr1 cat /proc/net/xfrm_stat 1>&2
	# Make the failure visible in the exit code.
	ret=1
fi

exit $ret
673