#!/bin/bash
# SPDX-License-Identifier: GPL-2.0
#
# This tests basic flowtable functionality.
# Creates the following topology:
#
# Originator (MTU 9000) <-Router1-> MTU 1500 <-Router2-> Responder (MTU 2000)
# Router1 is the one doing flow offloading.  Router2 has no special
# purpose other than providing a link whose MTU is smaller than that of
# both Originator and Responder, i.e. the TCPMSS values announced by the
# end hosts are too large and will still result in fragmentation and/or
# PMTU discovery.

# Kselftest framework requirement - SKIP code is 4.
ksft_skip=4
ret=0

# Temp-file paths for the transfer tests.  They are filled in by mktemp
# later; starting out empty keeps cleanup's "rm -f" harmless until then.
ns1in=""
ns2in=""
ns1out=""
ns2out=""

# Remember the current value so that cleanup can restore it.
log_netns=$(sysctl -n net.netfilter.nf_log_all_netns)

if ! nft --version > /dev/null 2>&1; then
	echo "SKIP: Could not run test without nft tool"
	exit $ksft_skip
fi

if ! ip -Version > /dev/null 2>&1; then
	echo "SKIP: Could not run test without ip tool"
	exit $ksft_skip
fi

# 'command -v' is a shell builtin; 'which' is an optional external tool.
if ! command -v nc > /dev/null 2>&1; then
	echo "SKIP: Could not run test without nc (netcat)"
	exit $ksft_skip
fi
41
# Create the four namespaces (two routers, two end hosts).
# The EXIT trap is not installed yet at this point, so if any creation
# fails, remove the namespaces created so far before skipping.
created_ns=""
for ns in nsr1 ns1 ns2 nsr2; do
	if ! ip netns add "$ns"; then
		for old_ns in $created_ns; do
			ip netns del "$old_ns"
		done
		echo "SKIP: Could not create net namespace"
		exit $ksft_skip
	fi
	created_ns="$created_ns $ns"
done
52
# Tear down everything the test created: the four namespaces, the four
# temp files, and the nf_log_all_netns sysctl override.
# Reads globals: ns1in ns1out ns2in ns2out log_netns.
cleanup() {
	local i

	for i in 1 2; do
		ip netns del "ns$i"
		ip netns del "nsr$i"
	done

	rm -f "$ns1in" "$ns1out"
	rm -f "$ns2in" "$ns2out"

	# The script forces nf_log_all_netns to 1, so it only needs to be put
	# back when it was 0 before.  ${log_netns:-1} skips the restore when
	# the original value could not be read, instead of making '[' fail
	# with "unary operator expected".
	if [ "${log_netns:-1}" -eq 0 ]; then
		sysctl -q net.netfilter.nf_log_all_netns="$log_netns"
	fi
}
64
trap cleanup EXIT

# Overridden for the duration of the test; cleanup restores it.
sysctl -q net.netfilter.nf_log_all_netns=1

# Links: ns1:eth0 <-> nsr1:veth0, nsr1:veth1 <-> nsr2:veth0, nsr2:veth1 <-> ns2:eth0
ip link add veth0 netns nsr1 type veth peer name eth0 netns ns1
ip link add veth1 netns nsr1 type veth peer name veth0 netns nsr2

ip link add veth1 netns nsr2 type veth peer name eth0 netns ns2

for dev in lo veth0 veth1; do
  for i in 1 2; do
    ip -net "nsr$i" link set "$dev" up
  done
done

ip -net nsr1 addr add 10.0.1.1/24 dev veth0
ip -net nsr1 addr add dead:1::1/64 dev veth0

ip -net nsr2 addr add 10.0.2.1/24 dev veth1
ip -net nsr2 addr add dead:2::1/64 dev veth1

# Use different MTUs on the two edge links so that packets travelling from
# ns1 (large MTU) to ns2 (smaller MTU) have to be pushed to the regular
# stack, either to be fragmented (ip_no_pmtu_disc=1) or to trigger PMTU
# discovery (ICMP error sent back to the originator).
# No MTU is set on the nsr1 <-> nsr2 link, so the TCPMSS announced by both
# peers is NOT the lowest link MTU on the path.

ip -net nsr1 link set veth0 mtu 9000
ip -net ns1 link set eth0 mtu 9000

ip -net nsr2 link set veth1 mtu 2000
ip -net ns2 link set eth0 mtu 2000

# Transfer net between nsr1 and nsr2.
# These addresses are not used as connection endpoints.
ip -net nsr1 addr add 192.168.10.1/24 dev veth1
ip -net nsr1 addr add fee1:2::1/64 dev veth1

ip -net nsr2 addr add 192.168.10.2/24 dev veth0
ip -net nsr2 addr add fee1:2::2/64 dev veth0

for i in 1 2; do
  # Routers: forward IPv4 on both interfaces.
  ip netns exec "nsr$i" sysctl net.ipv4.conf.veth0.forwarding=1 > /dev/null
  ip netns exec "nsr$i" sysctl net.ipv4.conf.veth1.forwarding=1 > /dev/null

  # End hosts: address 10.0.$i.99 / dead:$i::99, default route via the
  # adjacent router.
  ip -net "ns$i" link set lo up
  ip -net "ns$i" link set eth0 up
  ip -net "ns$i" addr add "10.0.$i.99/24" dev eth0
  ip -net "ns$i" route add default via "10.0.$i.1"
  ip -net "ns$i" addr add "dead:$i::99/64" dev eth0
  ip -net "ns$i" route add default via "dead:$i::1"
  ip netns exec "ns$i" sysctl net.ipv4.tcp_no_metrics_save=1 > /dev/null

  # don't set ip DF bit for first two tests
  ip netns exec "ns$i" sysctl net.ipv4.ip_no_pmtu_disc=1 > /dev/null
done

ip -net nsr1 route add default via 192.168.10.2
ip -net nsr2 route add default via 192.168.10.1
124
# Load the flowtable and the forward chain into nsr1.  The chain sets
# ct mark 1 once a flow "should be offloaded by now" and drops marked
# reply-direction packets that still show up in the forward hook.
ip netns exec nsr1 nft -f - <<EOF
table inet filter {
  flowtable f1 {
     hook ingress priority 0
     devices = { veth0, veth1 }
   }

   chain forward {
      type filter hook forward priority 0; policy drop;

      # flow offloaded? Tag ct with mark 1, so we can detect when it fails.
      meta oif "veth1" tcp dport 12345 flow offload @f1 counter

      # use packet size to trigger 'should be offloaded by now'.
      # otherwise, if 'flow offload' expression never offloads, the
      # test will pass.
      tcp dport 12345 meta length gt 200 ct mark set 1 counter

      # this turns off flow offloading internally, so expect packets again
      tcp flags fin,rst ct mark set 0 accept

      # this allows large packets from responder, we need this as long
      # as PMTUd is off.
      # This rule is deleted for the last test, when we expect PMTUd
      # to kick in and ensure all packets meet mtu requirements.
      meta length gt 1500 accept comment something-to-grep-for

      # next line blocks connection w.o. working offload.
      # we only do this for reverse dir, because we expect packets to
      # enter slow path due to MTU mismatch of veth0 and veth1.
      tcp sport 12345 ct mark 1 counter log prefix "mark failure " drop

      ct state established,related accept

      # for packets that we can't offload yet, i.e. SYN (any ct that is not confirmed)
      meta length lt 200 oif "veth1" tcp dport 12345 counter accept

      meta nfproto ipv4 meta l4proto icmp accept
      meta nfproto ipv6 meta l4proto icmpv6 accept
   }
}
EOF

# $? still refers to the nft invocation above.
if [ $? -ne 0 ]; then
	echo "SKIP: Could not load nft ruleset"
	exit $ksft_skip
fi
172
# Test basic connectivity in both directions before running any transfer.
# A failure here is fatal; the EXIT trap takes care of the cleanup.
if ! ip netns exec ns1 ping -c 1 -q 10.0.2.99 > /dev/null; then
  echo "ERROR: ns1 cannot reach ns2" 1>&2
  exit 1
fi

if ! ip netns exec ns2 ping -c 1 -q 10.0.1.99 > /dev/null; then
  echo "ERROR: ns2 cannot reach ns1" 1>&2
  exit 1
fi

if [ "$ret" -eq 0 ]; then
	echo "PASS: netns routing/connectivity: ns1 can reach ns2"
fi

# Payload (in) and received-data (out) files for each end host.
ns1in=$(mktemp)
ns1out=$(mktemp)
ns2in=$(mktemp)
ns2out=$(mktemp)
195
# Fill a file with random data of random size.
#   $1  path of the file to (over)write
#   $2  label of the owner; accepted for compatibility, not used
# The size is 0..8191 KiB plus a 128..1151 byte tail, so the file is never
# empty and its size is usually not a multiple of 1 KiB.
make_file()
{
	local name=$1
	local kib tail_bytes

	# Bulk part; this dd creates/truncates the file.
	kib=$((RANDOM % (1024 * 8)))
	dd if=/dev/urandom of="$name" bs=1024 count="$kib" 2> /dev/null

	# Tail part; appended with '>>' so the bulk data written above is kept.
	tail_bytes=$((RANDOM % 1024 + 128))
	dd if=/dev/urandom bs=1 count="$tail_bytes" 2> /dev/null >> "$name"
}
211
# Compare a sent file with what the peer received.
#   $1  file that was sent
#   $2  file that was received
#   $3  label used in the failure message
# Returns 0 if both files are identical.  Otherwise prints a FAIL line to
# stderr, lists both files and returns 1.
check_transfer()
{
	local in=$1
	local out=$2
	local what=$3

	if ! cmp -s "$in" "$out"; then
		echo "FAIL: file mismatch for $what" 1>&2
		ls -l "$in"
		ls -l "$out"
		return 1
	fi

	return 0
}
228
# Run one bidirectional nc transfer and verify both directions.
#   $1  namespace running the nc client
#   $2  namespace running the nc listener
#   $3  address the client connects to
#   $4  port the client connects to
# The listener always binds port 12345; $4 differs from that only when the
# client targets an address/port that is translated on the way.
# Uses globals ns1in/ns1out (client side) and ns2in/ns2out (listener side).
# Returns 0 if both received files match what was sent, 1 otherwise.
test_tcp_forwarding_ip()
{
	local nsa=$1
	local nsb=$2
	local dstip=$3
	local dstport=$4
	local lret=0
	local lpid cpid

	ip netns exec "$nsb" nc -w 5 -l -p 12345 < "$ns2in" > "$ns2out" &
	lpid=$!

	# give the listener time to come up
	sleep 1
	ip netns exec "$nsa" nc -w 4 "$dstip" "$dstport" < "$ns1in" > "$ns1out" &
	cpid=$!

	sleep 3

	# Either nc may already have exited on its own; in that case kill
	# would only print a "No such process" error, so silence it.
	kill "$lpid" 2> /dev/null
	kill "$cpid" 2> /dev/null
	wait

	check_transfer "$ns1in" "$ns2out" "ns1 -> ns2" || lret=1
	check_transfer "$ns2in" "$ns1out" "ns1 <- ns2" || lret=1

	return $lret
}
262
# Transfer test without address translation: client in namespace $1,
# listener in namespace $2, connecting straight to 10.0.2.99:12345.
# Returns the status of test_tcp_forwarding_ip.
test_tcp_forwarding()
{
	test_tcp_forwarding_ip "$1" "$2" 10.0.2.99 12345
}
269
# Transfer test with NAT in place: client in namespace $1, listener in
# namespace $2.  First connects to the real address 10.0.2.99:12345 and,
# only if that worked, to 10.6.6.6:1666.
# Returns the status of the first failing transfer, or 0 if both pass.
test_tcp_forwarding_nat()
{
	test_tcp_forwarding_ip "$1" "$2" 10.0.2.99 12345 || return $?

	test_tcp_forwarding_ip "$1" "$2" 10.6.6.6 1666
}
284
# Random payloads, one per direction.
make_file "$ns1in" "ns1"
make_file "$ns2in" "ns2"

# First test:
# No PMTU discovery, nsr1 is expected to fragment packets from ns1 to ns2 as needed.
if test_tcp_forwarding ns1 ns2; then
	echo "PASS: flow offloaded for ns1/ns2"
else
	echo "FAIL: flow offload for ns1/ns2:" 1>&2
	ip netns exec nsr1 nft list ruleset
	ret=1
fi
298
# Delete ns2's default routes and leave only a host route to nsr1's
# transfer-net address.  ns2 can then no longer reach ns1's own address,
# so the transfer only works if nsr1 masquerades ns1, i.e. ns2 sees the
# connection as coming from nsr1's address.
ip -net ns2 route del default via 10.0.2.1
ip -net ns2 route del default via dead:2::1
ip -net ns2 route add 192.168.10.1 via 10.0.2.1

# Second test:
# Same, but with NAT enabled.
ip netns exec nsr1 nft -f - <<EOF
table ip nat {
   chain prerouting {
      type nat hook prerouting priority 0; policy accept;
      meta iif "veth0" ip daddr 10.6.6.6 tcp dport 1666 counter dnat ip to 10.0.2.99:12345
   }

   chain postrouting {
      type nat hook postrouting priority 0; policy accept;
      meta oifname "veth1" counter masquerade
   }
}
EOF

if test_tcp_forwarding_nat ns1 ns2; then
	echo "PASS: flow offloaded for ns1/ns2 with NAT"
else
	echo "FAIL: flow offload for ns1/ns2 with NAT" 1>&2
	ip netns exec nsr1 nft list ruleset
	ret=1
fi
331
# Third test:
# Same as second test, but with PMTU discovery enabled.
#
# Find the "accept packets larger than 1500 bytes" rule via its comment and
# delete it.  $handle is everything after the '#' on that line of the
# 'nft -a' listing; it is expanded unquoted on purpose, because the delete
# command below supplies no 'handle' keyword of its own and relies on the
# value splitting into separate words.
handle=$(ip netns exec nsr1 nft -a list table inet filter | grep something-to-grep-for | cut -d \# -f 2)

if ! ip netns exec nsr1 nft delete rule inet filter forward $handle; then
	echo "FAIL: Could not delete large-packet accept rule"
	exit 1
fi

ip netns exec ns1 sysctl net.ipv4.ip_no_pmtu_disc=0 > /dev/null
ip netns exec ns2 sysctl net.ipv4.ip_no_pmtu_disc=0 > /dev/null

if test_tcp_forwarding_nat ns1 ns2; then
	echo "PASS: flow offloaded for ns1/ns2 with NAT and pmtu discovery"
else
	echo "FAIL: flow offload for ns1/ns2 with NAT and pmtu discovery" 1>&2
	ip netns exec nsr1 nft list ruleset
	# Record the failure so the script does not exit 0 after printing FAIL.
	ret=1
fi
352
# Throw-away key material for the ESP test, derived by hashing the current
# process list: the sha1sum digest is used as the auth key and the md5sum
# digest as the AES key, each prefixed with "0x".
KEY_SHA="0x"$(ps -xaf | sha1sum | cut -d " " -f 1)
KEY_AES="0x"$(ps -xaf | md5sum | cut -d " " -f 1)
SPI1=$RANDOM
SPI2=$RANDOM

# Make sure the two SPIs differ.
if [ "$SPI1" -eq "$SPI2" ]; then
	SPI2=$((SPI2+1))
fi
361
# Configure one end of an ESP tunnel-mode IPsec setup.
#   $1  namespace to configure
#   $2  local tunnel endpoint address
#   $3  remote tunnel endpoint address
#   $4  local subnet (selector)
#   $5  remote subnet (selector)
#   $6  SPI for the local -> remote state
#   $7  SPI for the remote -> local state
# Uses the global keys KEY_AES and KEY_SHA for both directions.
do_esp() {
    local ns=$1
    local me=$2
    local remote=$3
    local lnet=$4
    local rnet=$5
    local spi_out=$6
    local spi_in=$7
    # Crypto parameters shared by both states.
    local -a crypto=(enc aes "$KEY_AES" auth sha1 "$KEY_SHA" mode tunnel)

    ip -net "$ns" xfrm state add src "$remote" dst "$me" proto esp spi "$spi_in" \
        "${crypto[@]}" sel src "$rnet" dst "$lnet"
    ip -net "$ns" xfrm state add src "$me" dst "$remote" proto esp spi "$spi_out" \
        "${crypto[@]}" sel src "$lnet" dst "$rnet"

    # to encrypt packets as they go out (includes forwarded packets that need encapsulation)
    ip -net "$ns" xfrm policy add src "$lnet" dst "$rnet" dir out \
        tmpl src "$me" dst "$remote" proto esp mode tunnel priority 1 action allow
    # to fwd decrypted packets after esp processing:
    ip -net "$ns" xfrm policy add src "$rnet" dst "$lnet" dir fwd \
        tmpl src "$remote" dst "$me" proto esp mode tunnel priority 1 action allow
}
380
# Fourth test: ESP tunnel between the two routers.  The SPIs are swapped
# on the second call so each router's outbound SPI is the peer's inbound one.
do_esp nsr1 192.168.10.1 192.168.10.2 10.0.1.0/24 10.0.2.0/24 "$SPI1" "$SPI2"

do_esp nsr2 192.168.10.2 192.168.10.1 10.0.2.0/24 10.0.1.0/24 "$SPI2" "$SPI1"

ip netns exec nsr1 nft delete table ip nat

# restore default routes
ip -net ns2 route del 192.168.10.1 via 10.0.2.1
ip -net ns2 route add default via 10.0.2.1
ip -net ns2 route add default via dead:2::1

if test_tcp_forwarding ns1 ns2; then
	echo "PASS: ipsec tunnel mode for ns1/ns2"
else
	echo "FAIL: ipsec tunnel mode for ns1/ns2" 1>&2
	ip netns exec nsr1 nft list ruleset 1>&2
	ip netns exec nsr1 cat /proc/net/xfrm_stat 1>&2
	# Record the failure so the script does not exit 0 after printing FAIL.
	ret=1
fi

exit $ret
402