1#!/bin/sh
2# SPDX-License-Identifier: GPL-2.0
3#
4# Check that route PMTU values match expectations, and that initial device MTU
5# values are assigned correctly
6#
7# Tests currently implemented:
8#
# - pmtu_ipv4_exception
10#	Set up two namespaces, A and B, with two paths between them over routers
11#	R1 and R2 (also implemented with namespaces), with different MTUs:
12#
13#	  segment a_r1    segment b_r1		a_r1: 2000
#	.--------------R1--------------.	b_r1: 1400
#	A                               B	a_r2: 2000
#	'--------------R2--------------'	b_r2: 1500
17#	  segment a_r2    segment b_r2
18#
19#	Check that PMTU exceptions with the correct PMTU are created. Then
20#	decrease and increase the MTU of the local link for one of the paths,
21#	A to R1, checking that route exception PMTU changes accordingly over
22#	this path. Also check that locked exceptions are created when an ICMP
23#	message advertising a PMTU smaller than net.ipv4.route.min_pmtu is
24#	received
25#
# - pmtu_ipv6_exception
#	Same as pmtu_ipv4_exception, except for locked PMTU tests, using IPv6
28#
29# - pmtu_ipv4_vxlan4_exception
30#	Set up the same network topology as pmtu_ipv4, create a VXLAN tunnel
31#	over IPv4 between A and B, routed via R1. On the link between R1 and B,
32#	set a MTU lower than the VXLAN MTU and the MTU on the link between A and
33#	R1. Send IPv4 packets, exceeding the MTU between R1 and B, over VXLAN
34#	from A to B and check that the PMTU exception is created with the right
35#	value on A
36#
37# - pmtu_ipv6_vxlan4_exception
38#	Same as pmtu_ipv4_vxlan4_exception, but send IPv6 packets from A to B
39#
40# - pmtu_ipv4_vxlan6_exception
41#	Same as pmtu_ipv4_vxlan4_exception, but use IPv6 transport from A to B
42#
43# - pmtu_ipv6_vxlan6_exception
44#	Same as pmtu_ipv4_vxlan6_exception, but send IPv6 packets from A to B
45#
46# - pmtu_ipv4_geneve4_exception
47#	Same as pmtu_ipv4_vxlan4_exception, but using a GENEVE tunnel instead of
48#	VXLAN
49#
50# - pmtu_ipv6_geneve4_exception
51#	Same as pmtu_ipv6_vxlan4_exception, but using a GENEVE tunnel instead of
52#	VXLAN
53#
54# - pmtu_ipv4_geneve6_exception
55#	Same as pmtu_ipv4_vxlan6_exception, but using a GENEVE tunnel instead of
56#	VXLAN
57#
58# - pmtu_ipv6_geneve6_exception
59#	Same as pmtu_ipv6_vxlan6_exception, but using a GENEVE tunnel instead of
60#	VXLAN
61#
# - pmtu_ipv{4,6}_fou{4,6}_exception
#	Same as pmtu_ipv4_vxlan4_exception, but using a direct IPv4/IPv6
#	encapsulation (FoU) over IPv4/IPv6, instead of VXLAN
#
# - pmtu_ipv{4,6}_gue{4,6}_exception
#	Same as pmtu_ipv4_vxlan4_exception, but using a generic UDP IPv4/IPv6
#	encapsulation (GUE) over IPv4/IPv6, instead of VXLAN
69#
70# - pmtu_vti4_exception
71#	Set up vti tunnel on top of veth, with xfrm states and policies, in two
72#	namespaces with matching endpoints. Check that route exception is not
73#	created if link layer MTU is not exceeded, then exceed it and check that
74#	exception is created with the expected PMTU. The approach described
75#	below for IPv6 doesn't apply here, because, on IPv4, administrative MTU
76#	changes alone won't affect PMTU
77#
78# - pmtu_vti6_exception
79#	Set up vti6 tunnel on top of veth, with xfrm states and policies, in two
80#	namespaces with matching endpoints. Check that route exception is
81#	created by exceeding link layer MTU with ping to other endpoint. Then
82#	decrease and increase MTU of tunnel, checking that route exception PMTU
83#	changes accordingly
84#
85# - pmtu_vti4_default_mtu
86#	Set up vti4 tunnel on top of veth, in two namespaces with matching
87#	endpoints. Check that MTU assigned to vti interface is the MTU of the
88#	lower layer (veth) minus additional lower layer headers (zero, for veth)
89#	minus IPv4 header length
90#
91# - pmtu_vti6_default_mtu
92#	Same as above, for IPv6
93#
94# - pmtu_vti4_link_add_mtu
95#	Set up vti4 interface passing MTU value at link creation, check MTU is
96#	configured, and that link is not created with invalid MTU values
97#
98# - pmtu_vti6_link_add_mtu
99#	Same as above, for IPv6
100#
101# - pmtu_vti6_link_change_mtu
102#	Set up two dummy interfaces with different MTUs, create a vti6 tunnel
103#	and check that configured MTU is used on link creation and changes, and
104#	that MTU is properly calculated instead when MTU is not configured from
105#	userspace
106
# Kselftest framework requirement - SKIP code is 4.
ksft_skip=4

# Some systems don't have a ping6 binary anymore: fall back to plain ping.
# The lookup uses the POSIX "command -v" builtin, as "which" is an external
# tool that might not be installed at all.
ping6=$(command -v ping6) || ping6=$(command -v ping)
112
113tests="
114	pmtu_ipv4_exception		ipv4: PMTU exceptions
115	pmtu_ipv6_exception		ipv6: PMTU exceptions
116	pmtu_ipv4_vxlan4_exception	IPv4 over vxlan4: PMTU exceptions
117	pmtu_ipv6_vxlan4_exception	IPv6 over vxlan4: PMTU exceptions
118	pmtu_ipv4_vxlan6_exception	IPv4 over vxlan6: PMTU exceptions
119	pmtu_ipv6_vxlan6_exception	IPv6 over vxlan6: PMTU exceptions
120	pmtu_ipv4_geneve4_exception	IPv4 over geneve4: PMTU exceptions
121	pmtu_ipv6_geneve4_exception	IPv6 over geneve4: PMTU exceptions
122	pmtu_ipv4_geneve6_exception	IPv4 over geneve6: PMTU exceptions
123	pmtu_ipv6_geneve6_exception	IPv6 over geneve6: PMTU exceptions
124	pmtu_ipv4_fou4_exception	IPv4 over fou4: PMTU exceptions
125	pmtu_ipv6_fou4_exception	IPv6 over fou4: PMTU exceptions
126	pmtu_ipv4_fou6_exception	IPv4 over fou6: PMTU exceptions
127	pmtu_ipv6_fou6_exception	IPv6 over fou6: PMTU exceptions
128	pmtu_ipv4_gue4_exception	IPv4 over gue4: PMTU exceptions
129	pmtu_ipv6_gue4_exception	IPv6 over gue4: PMTU exceptions
130	pmtu_ipv4_gue6_exception	IPv4 over gue6: PMTU exceptions
131	pmtu_ipv6_gue6_exception	IPv6 over gue6: PMTU exceptions
132	pmtu_vti6_exception		vti6: PMTU exceptions
133	pmtu_vti4_exception		vti4: PMTU exceptions
134	pmtu_vti4_default_mtu		vti4: default MTU assignment
135	pmtu_vti6_default_mtu		vti6: default MTU assignment
136	pmtu_vti4_link_add_mtu		vti4: MTU setting on link creation
137	pmtu_vti6_link_add_mtu		vti6: MTU setting on link creation
138	pmtu_vti6_link_change_mtu	vti6: MTU changes on link changes"
139
140NS_A="ns-$(mktemp -u XXXXXX)"
141NS_B="ns-$(mktemp -u XXXXXX)"
142NS_R1="ns-$(mktemp -u XXXXXX)"
143NS_R2="ns-$(mktemp -u XXXXXX)"
144ns_a="ip netns exec ${NS_A}"
145ns_b="ip netns exec ${NS_B}"
146ns_r1="ip netns exec ${NS_R1}"
147ns_r2="ip netns exec ${NS_R2}"
148
# Addressing and routing for tests with routers: four network segments, with
# index SEGMENT between 1 and 4, a common prefix (PREFIX4 or PREFIX6) and an
# identifier ID, which is 1 for hosts (A and B), 2 for routers (R1 and R2).
# Addresses are:
# - IPv4: PREFIX4.SEGMENT.ID (/24)
# - IPv6: PREFIX6:SEGMENT::ID (/64)
prefix4="10.0"
prefix6="fc00"
# Segment indices, named after the host and the router they connect
a_r1=1
a_r2=2
b_r1=3
b_r2=4
# Links to create: setup_routing() reads this table three words at a time.
#	ns	peer	segment
routing_addrs="
	A	R1	${a_r1}
	A	R2	${a_r2}
	B	R1	${b_r1}
	B	R2	${b_r2}
"
# Traffic from A to B goes through R1 by default, and through R2, if destined to
# B's address on the b_r2 segment.
# Traffic from B to A goes through R1.
# setup_routing() reads this table three words at a time as well.
#	ns	destination		gateway
routes="
	A	default			${prefix4}.${a_r1}.2
	A	${prefix4}.${b_r2}.1	${prefix4}.${a_r2}.2
	B	default			${prefix4}.${b_r1}.2

	A	default			${prefix6}:${a_r1}::2
	A	${prefix6}:${b_r2}::1	${prefix6}:${a_r2}::2
	B	default			${prefix6}:${b_r1}::2
"
181
182veth4_a_addr="192.168.1.1"
183veth4_b_addr="192.168.1.2"
184veth4_mask="24"
185veth6_a_addr="fd00:1::a"
186veth6_b_addr="fd00:1::b"
187veth6_mask="64"
188
189tunnel4_a_addr="192.168.2.1"
190tunnel4_b_addr="192.168.2.2"
191tunnel4_mask="24"
192tunnel6_a_addr="fd00:2::a"
193tunnel6_b_addr="fd00:2::b"
194tunnel6_mask="64"
195
196dummy6_0_addr="fc00:1000::0"
197dummy6_1_addr="fc00:1001::0"
198dummy6_mask="64"
199
200cleanup_done=1
201err_buf=
202tcpdump_pids=
203
# err() - Queue an error message, newline-terminated, for err_flush()
# $1:	message to append to err_buf
err() {
	# The trailing 'x' guards the final newline, which would otherwise be
	# stripped by command substitution
	err_buf="$(printf '%s%s\nx' "${err_buf}" "${1}")"
	err_buf="${err_buf%x}"
}
208
# err_flush() - Print queued error messages to standard output, empty the queue
err_flush() {
	# printf instead of "echo -n": options and backslash handling of echo
	# are not portable across /bin/sh implementations
	printf '%s' "${err_buf}"
	err_buf=
}
213
# nsname() - Print the auto-generated name of a namespace
# $1:	namespace identifier, suffix of a NS_* variable (e.g. A for NS_A)
nsname() {
	eval "echo \$NS_${1}"
}
218
# setup_fou_or_gue() - Set up a FoU or GUE tunnel between namespaces A and B
# $1:	outer (transport) IP version: "4" for IPv4, IPv6 otherwise
# $2:	inner (payload) IP version: "4" for IPv4, IPv6 otherwise
# $3:	encapsulation, passed as "encap" to ip link: fou or gue
#
# Tunnel endpoints are the addresses of A and B on segments a_r1 and b_r1.
# Return: 2 if modules can't be loaded or the configuration fails on A
setup_fou_or_gue() {
	outer="${1}"
	inner="${2}"
	encap="${3}"

	# Only IPv6 transport needs an explicit tunnel mode: make sure a value
	# left over from a previous call or inherited from the environment
	# doesn't end up in the IPv4 case
	mode=""

	if [ "${outer}" = "4" ]; then
		modprobe fou || return 2
		a_addr="${prefix4}.${a_r1}.1"
		b_addr="${prefix4}.${b_r1}.1"
		if [ "${inner}" = "4" ]; then
			type="ipip"
			ipproto="4"
		else
			type="sit"
			ipproto="41"
		fi
	else
		modprobe fou6 || return 2
		a_addr="${prefix6}:${a_r1}::1"
		b_addr="${prefix6}:${b_r1}::1"
		if [ "${inner}" = "4" ]; then
			type="ip6tnl"
			mode="mode ipip6"
			ipproto="4 -6"
		else
			type="ip6tnl"
			mode="mode ip6ip6"
			ipproto="41 -6"
		fi
	fi

	# ${mode} and ${ipproto} are deliberately unquoted: they expand to
	# multiple (or zero) arguments
	${ns_a} ip fou add port 5555 ipproto ${ipproto} || return 2
	${ns_a} ip link add ${encap}_a type ${type} ${mode} local ${a_addr} remote ${b_addr} encap ${encap} encap-sport auto encap-dport 5556 || return 2

	${ns_b} ip fou add port 5556 ipproto ${ipproto}
	${ns_b} ip link add ${encap}_b type ${type} ${mode} local ${b_addr} remote ${a_addr} encap ${encap} encap-sport auto encap-dport 5555

	if [ "${inner}" = "4" ]; then
		${ns_a} ip addr add ${tunnel4_a_addr}/${tunnel4_mask} dev ${encap}_a
		${ns_b} ip addr add ${tunnel4_b_addr}/${tunnel4_mask} dev ${encap}_b
	else
		${ns_a} ip addr add ${tunnel6_a_addr}/${tunnel6_mask} dev ${encap}_a
		${ns_b} ip addr add ${tunnel6_b_addr}/${tunnel6_mask} dev ${encap}_b
	fi

	${ns_a} ip link set ${encap}_a up
	${ns_b} ip link set ${encap}_b up

	sleep 1
}
269
# Wrappers for setup_fou_or_gue(), named setup_<encap><outer><inner> so that
# setup() can call them by name: arguments are outer IP version, inner IP
# version, encapsulation type
setup_fou44() { setup_fou_or_gue 4 4 fou; }

setup_fou46() { setup_fou_or_gue 4 6 fou; }

setup_fou64() { setup_fou_or_gue 6 4 fou; }

setup_fou66() { setup_fou_or_gue 6 6 fou; }

setup_gue44() { setup_fou_or_gue 4 4 gue; }

setup_gue46() { setup_fou_or_gue 4 6 gue; }

setup_gue64() { setup_fou_or_gue 6 4 gue; }

setup_gue66() { setup_fou_or_gue 6 6 gue; }
301
# setup_namespaces() - Create namespaces A, B, R1 and R2
# Return: 1 as soon as one namespace can't be added
setup_namespaces() {
	for n in ${NS_A} ${NS_B} ${NS_R1} ${NS_R2}; do
		if ! ip netns add ${n}; then
			return 1
		fi
	done
}
307
# setup_veth() - Connect namespaces A and B with a veth pair, veth_a and veth_b
# Return: 1 if the veth pair can't be created
setup_veth() {
	if ! ${ns_a} ip link add veth_a type veth peer name veth_b; then
		return 1
	fi
	${ns_a} ip link set veth_b netns ${NS_B}

	# IPv4 addresses first, then IPv6 ones, on both ends
	${ns_a} ip addr add ${veth4_a_addr}/${veth4_mask} dev veth_a
	${ns_b} ip addr add ${veth4_b_addr}/${veth4_mask} dev veth_b

	${ns_a} ip addr add ${veth6_a_addr}/${veth6_mask} dev veth_a
	${ns_b} ip addr add ${veth6_b_addr}/${veth6_mask} dev veth_b

	${ns_a} ip link set veth_a up
	${ns_b} ip link set veth_b up
}
321
# setup_vti() - Set up vti or vti6 tunnel interfaces in namespaces A and B
# $1:	IP version: 6 selects link type vti6, anything else vti
# $2:	local tunnel endpoint for A, remote for B
# $3:	local tunnel endpoint for B, remote for A
# $4:	address for the tunnel interface in A
# $5:	address for the tunnel interface in B
# $6:	prefix length for tunnel interface addresses
# Return: 1 if the tunnel interface can't be created in A
setup_vti() {
	proto=${1}
	veth_a_addr="${2}"
	veth_b_addr="${3}"
	vti_a_addr="${4}"
	vti_b_addr="${5}"
	vti_mask=${6}

	if [ ${proto} -eq 6 ]; then
		vti_type="vti6"
	else
		vti_type="vti"
	fi

	if ! ${ns_a} ip link add vti${proto}_a type ${vti_type} local ${veth_a_addr} remote ${veth_b_addr} key 10; then
		return 1
	fi
	${ns_b} ip link add vti${proto}_b type ${vti_type} local ${veth_b_addr} remote ${veth_a_addr} key 10

	${ns_a} ip addr add ${vti_a_addr}/${vti_mask} dev vti${proto}_a
	${ns_b} ip addr add ${vti_b_addr}/${vti_mask} dev vti${proto}_b

	${ns_a} ip link set vti${proto}_a up
	${ns_b} ip link set vti${proto}_b up

	# Same one-second pause as after the other tunnel setups
	sleep 1
}
343
# setup_vti4() - Set up vti tunnel between the IPv4 addresses of the veth pair
setup_vti4() {
	setup_vti 4 ${veth4_a_addr} ${veth4_b_addr} \
		  ${tunnel4_a_addr} ${tunnel4_b_addr} ${tunnel4_mask}
}

# setup_vti6() - Set up vti6 tunnel between the IPv6 addresses of the veth pair
setup_vti6() {
	setup_vti 6 ${veth6_a_addr} ${veth6_b_addr} \
		  ${tunnel6_a_addr} ${tunnel6_b_addr} ${tunnel6_mask}
}
351
# setup_vxlan_or_geneve() - Set up VXLAN or GENEVE tunnel between A and B
# $1:	link type: vxlan gets extra options, anything else is used as it is
# $2:	tunnel endpoint address in A
# $3:	tunnel endpoint address in B
# $4:	additional options for ip link add, optional
# Return: 1 if the tunnel interface can't be created in A
setup_vxlan_or_geneve() {
	type="${1}"
	a_addr="${2}"
	b_addr="${3}"
	opts="${4}"

	case "${type}" in
	vxlan)
		# VXLAN links additionally get TTL, destination port and an
		# explicit local address
		opts="${opts} ttl 64 dstport 4789"
		opts_a="local ${a_addr}"
		opts_b="local ${b_addr}"
		;;
	*)
		opts_a=""
		opts_b=""
		;;
	esac

	if ! ${ns_a} ip link add ${type}_a type ${type} id 1 ${opts_a} remote ${b_addr} ${opts}; then
		return 1
	fi
	${ns_b} ip link add ${type}_b type ${type} id 1 ${opts_b} remote ${a_addr} ${opts}

	# Tunnel interfaces get both IPv4 and IPv6 addresses
	${ns_a} ip addr add ${tunnel4_a_addr}/${tunnel4_mask} dev ${type}_a
	${ns_b} ip addr add ${tunnel4_b_addr}/${tunnel4_mask} dev ${type}_b

	${ns_a} ip addr add ${tunnel6_a_addr}/${tunnel6_mask} dev ${type}_a
	${ns_b} ip addr add ${tunnel6_b_addr}/${tunnel6_mask} dev ${type}_b

	${ns_a} ip link set ${type}_a up
	${ns_b} ip link set ${type}_b up

	sleep 1
}
381
# Wrappers for setup_vxlan_or_geneve(), called by name from setup(). Tunnel
# endpoints are the addresses of A and B on segments a_r1 and b_r1; IPv4
# variants additionally pass "df set".
setup_geneve4() {
	setup_vxlan_or_geneve geneve \
		${prefix4}.${a_r1}.1 ${prefix4}.${b_r1}.1 "df set"
}

setup_vxlan4() {
	setup_vxlan_or_geneve vxlan \
		${prefix4}.${a_r1}.1 ${prefix4}.${b_r1}.1 "df set"
}

setup_geneve6() {
	setup_vxlan_or_geneve geneve \
		${prefix6}:${a_r1}::1 ${prefix6}:${b_r1}::1
}

setup_vxlan6() {
	setup_vxlan_or_geneve vxlan \
		${prefix6}:${a_r1}::1 ${prefix6}:${b_r1}::1
}
397
398setup_xfrm() {
399	proto=${1}
400	veth_a_addr="${2}"
401	veth_b_addr="${3}"
402
403	${ns_a} ip -${proto} xfrm state add src ${veth_a_addr} dst ${veth_b_addr} spi 0x1000 proto esp aead "rfc4106(gcm(aes))" 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f 128 mode tunnel || return 1
404	${ns_a} ip -${proto} xfrm state add src ${veth_b_addr} dst ${veth_a_addr} spi 0x1001 proto esp aead "rfc4106(gcm(aes))" 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f 128 mode tunnel
405	${ns_a} ip -${proto} xfrm policy add dir out mark 10 tmpl src ${veth_a_addr} dst ${veth_b_addr} proto esp mode tunnel
406	${ns_a} ip -${proto} xfrm policy add dir in mark 10 tmpl src ${veth_b_addr} dst ${veth_a_addr} proto esp mode tunnel
407
408	${ns_b} ip -${proto} xfrm state add src ${veth_a_addr} dst ${veth_b_addr} spi 0x1000 proto esp aead "rfc4106(gcm(aes))" 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f 128 mode tunnel
409	${ns_b} ip -${proto} xfrm state add src ${veth_b_addr} dst ${veth_a_addr} spi 0x1001 proto esp aead "rfc4106(gcm(aes))" 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f 128 mode tunnel
410	${ns_b} ip -${proto} xfrm policy add dir out mark 10 tmpl src ${veth_b_addr} dst ${veth_a_addr} proto esp mode tunnel
411	${ns_b} ip -${proto} xfrm policy add dir in mark 10 tmpl src ${veth_a_addr} dst ${veth_b_addr} proto esp mode tunnel
412}
413
# Wrappers for setup_xfrm(), called by name from setup(): xfrm endpoints are
# the IPv4 or IPv6 addresses of the veth pair
setup_xfrm4() { setup_xfrm 4 ${veth4_a_addr} ${veth4_b_addr}; }

setup_xfrm6() { setup_xfrm 6 ${veth6_a_addr} ${veth6_b_addr}; }
421
# setup_routing() - Set up links, addresses and routes for tests with routers
#
# Enable forwarding on R1 and R2, then create the veth links listed in
# ${routing_addrs}, assign addresses to them (ID 1 on the namespace side, ID 2
# on the peer side), and finally add the routes listed in ${routes}.
# Return: 1 if a veth link can't be created
setup_routing() {
	for i in ${NS_R1} ${NS_R2}; do
		ip netns exec ${i} sysctl -q net/ipv4/ip_forward=1
		ip netns exec ${i} sysctl -q net/ipv6/conf/all/forwarding=1
	done

	# Tables are consumed one word at a time, three words per entry: each
	# word fills the first field that is still empty. Fields need to start
	# empty, otherwise a value inherited from the environment or left over
	# from a previous call would shift all the entries.
	ns=""; peer=""; segment=""
	for i in ${routing_addrs}; do
		[ "${ns}" = "" ]	&& ns="${i}"		&& continue
		[ "${peer}" = "" ]	&& peer="${i}"		&& continue
		[ "${segment}" = "" ]	&& segment="${i}"

		ns_name="$(nsname ${ns})"
		peer_name="$(nsname ${peer})"
		if="veth_${ns}-${peer}"
		ifpeer="veth_${peer}-${ns}"

		# Create veth links
		ip link add ${if} up netns ${ns_name} type veth peer name ${ifpeer} netns ${peer_name} || return 1
		ip -n ${peer_name} link set dev ${ifpeer} up

		# Add addresses
		ip -n ${ns_name}   addr add ${prefix4}.${segment}.1/24  dev ${if}
		ip -n ${ns_name}   addr add ${prefix6}:${segment}::1/64 dev ${if}

		ip -n ${peer_name} addr add ${prefix4}.${segment}.2/24  dev ${ifpeer}
		ip -n ${peer_name} addr add ${prefix6}:${segment}::2/64 dev ${ifpeer}

		ns=""; peer=""; segment=""
	done

	ns=""; addr=""; gw=""
	for i in ${routes}; do
		[ "${ns}" = "" ]	&& ns="${i}"		&& continue
		[ "${addr}" = "" ]	&& addr="${i}"		&& continue
		[ "${gw}" = "" ]	&& gw="${i}"

		ns_name="$(nsname ${ns})"

		ip -n ${ns_name} route add ${addr} via ${gw}

		ns=""; addr=""; gw=""
	done
}
464
# setup() - Run the setup_<name>() function for each argument, in order
# $@:	names of setup steps, e.g. namespaces, routing, veth
# Return: ${ksft_skip} if not running as root, 1 if a step fails
setup() {
	if [ "$(id -u)" -ne 0 ]; then
		echo "  need to run as root"
		return $ksft_skip
	fi

	# From now on there might be something for cleanup() to remove
	cleanup_done=0
	for arg in "$@"; do
		if ! eval setup_${arg}; then
			echo "  ${arg} not supported"
			return 1
		fi
	done
}
473
# trace() - Start tcpdump captures, if tracing is enabled
# $@:	pairs of arguments: namespace command prefix, then interface name
#
# Each capture is written to <test name>_<interface>.pcap. PIDs of tcpdump
# instances are appended to ${tcpdump_pids}, so that cleanup() can stop them.
trace() {
	[ $tracing -eq 0 ] && return

	# ns_cmd tells whether the next argument is a command prefix or an
	# interface. It is also assigned by helpers such as mtu() and
	# link_get(): reset it, so that pairs are never misaligned by a value
	# left over from an earlier call.
	ns_cmd=
	for arg do
		[ "${ns_cmd}" = "" ] && ns_cmd="${arg}" && continue
		${ns_cmd} tcpdump -s 0 -i "${arg}" -w "${name}_${arg}.pcap" 2> /dev/null &
		tcpdump_pids="${tcpdump_pids} $!"
		ns_cmd=
	done
	# Wait one second before traffic is generated
	sleep 1
}
485
# cleanup() - Stop tcpdump instances, delete namespaces unless already done
cleanup() {
	for pid in ${tcpdump_pids}; do
		kill ${pid}
	done
	tcpdump_pids=

	if [ ${cleanup_done} -eq 1 ]; then
		return
	fi

	# Errors are discarded: not all the namespaces necessarily exist
	for n in ${NS_A} ${NS_B} ${NS_R1} ${NS_R2}; do
		ip netns del ${n} 2> /dev/null
	done
	cleanup_done=1
}
498
# mtu() - Set the MTU of a network interface
# $1:	command prefix used to enter the namespace
# $2:	interface name
# $3:	MTU value
mtu() {
	ns_cmd="${1}"; dev="${2}"; mtu="${3}"

	${ns_cmd} ip link set dev ${dev} mtu ${mtu}
}
506
# mtu_parse() - Extract the MTU value from the output of ip link or ip route
# $1:	text to parse
#
# Print the word following "mtu", or "lock" and the word following it for
# "mtu lock <value>". Nothing is printed if no MTU is found.
mtu_parse() {
	input="${1}"

	# next: 0 while looking for "mtu", 1 right after "mtu", 2 after
	# "mtu lock"
	next=0
	for i in ${input}; do
		case "${next}" in
		1)
			if [ "${i}" = "lock" ]; then
				next=2
				continue
			fi
			echo "${i}"
			return
			;;
		2)
			echo "lock ${i}"
			return
			;;
		esac
		[ "${i}" = "mtu" ] && next=1
	done
}
518
# link_get() - Show a network interface with ip link
# $1:	command prefix used to enter the namespace
# $2:	interface name
link_get() {
	ns_cmd="${1}"; name="${2}"

	${ns_cmd} ip link show dev "${name}"
}
525
# link_get_mtu() - Print the MTU of a network interface
# $1:	command prefix used to enter the namespace
# $2:	interface name
link_get_mtu() {
	ns_cmd="${1}"
	name="${2}"

	# Feed the output of link_get() to mtu_parse() as a single argument
	mtu_parse "$(link_get "${1}" ${2})"
}
532
# route_get_dst_exception() - Print the route used for a given destination
# $1:	command prefix used to enter the namespace
# $2:	destination address
route_get_dst_exception() {
	ns_cmd="${1}"; dst="${2}"

	${ns_cmd} ip route get "${dst}"
}
539
# route_get_dst_pmtu_from_exception() - Print the PMTU for a given destination
# $1:	command prefix used to enter the namespace
# $2:	destination address
#
# Output is the one of mtu_parse(): empty if the route has no MTU attribute.
route_get_dst_pmtu_from_exception() {
	ns_cmd="${1}"
	dst="${2}"

	mtu_parse "$(route_get_dst_exception "${1}" ${2})"
}
546
# check_pmtu_value() - Compare a PMTU value with the expected one
# $1:	expected value: "any" matches any non-empty value, empty means that no
#	exception should exist
# $2:	value found
# $3:	description of what was done, for error messages
# Return: 0 on match, 1 otherwise, with a message queued using err()
check_pmtu_value() {
	expected="${1}"
	value="${2}"
	event="${3}"

	if [ "${expected}" = "any" ] && [ -n "${value}" ]; then
		return 0
	elif [ "${value}" = "${expected}" ]; then
		return 0
	elif [ -z "${value}" ]; then
		err "  PMTU exception wasn't created after ${event}"
	elif [ -z "${expected}" ]; then
		err "  PMTU exception shouldn't exist after ${event}"
	else
		err "  found PMTU exception with incorrect MTU ${value}, expected ${expected}, after ${event}"
	fi
	return 1
}
559
# test_pmtu_ipvX() - Check PMTU exceptions on routed paths between A and B
# $1:	IP version: 4 for IPv4, IPv6 otherwise
#
# Exceptions are checked for two destinations: dst1, B's address on segment
# b_r1, and dst2, B's address on segment b_r2.
# Return: 0 on success, 1 on failure, 2 if setup fails
test_pmtu_ipvX() {
	family=${1}

	setup namespaces routing || return 2
	trace "${ns_a}"  veth_A-R1    "${ns_r1}" veth_R1-A \
	      "${ns_r1}" veth_R1-B    "${ns_b}"  veth_B-R1 \
	      "${ns_a}"  veth_A-R2    "${ns_r2}" veth_R2-A \
	      "${ns_r2}" veth_R2-B    "${ns_b}"  veth_B-R2

	if [ ${family} -eq 4 ]; then
		ping=ping
		dst1="${prefix4}.${b_r1}.1"
		dst2="${prefix4}.${b_r2}.1"
	else
		ping=${ping6}
		dst1="${prefix6}:${b_r1}::1"
		dst2="${prefix6}:${b_r2}::1"
	fi

	# Set up initial MTU values
	mtu "${ns_a}"  veth_A-R1 2000
	mtu "${ns_r1}" veth_R1-A 2000
	mtu "${ns_r1}" veth_R1-B 1400
	mtu "${ns_b}"  veth_B-R1 1400

	mtu "${ns_a}"  veth_A-R2 2000
	mtu "${ns_r2}" veth_R2-A 2000
	mtu "${ns_r2}" veth_R2-B 1500
	mtu "${ns_b}"  veth_B-R2 1500

	# Create route exceptions
	# (payload size, 1800, is bigger than the MTU of links to B, 1400 and
	# 1500, but fits the MTU of links from A, 2000)
	${ns_a} ${ping} -q -M want -i 0.1 -w 2 -s 1800 ${dst1} > /dev/null
	${ns_a} ${ping} -q -M want -i 0.1 -w 2 -s 1800 ${dst2} > /dev/null

	# Check that exceptions have been created with the correct PMTU
	pmtu_1="$(route_get_dst_pmtu_from_exception "${ns_a}" ${dst1})"
	check_pmtu_value "1400" "${pmtu_1}" "exceeding MTU" || return 1
	pmtu_2="$(route_get_dst_pmtu_from_exception "${ns_a}" ${dst2})"
	check_pmtu_value "1500" "${pmtu_2}" "exceeding MTU" || return 1

	# Decrease local MTU below PMTU, check for PMTU decrease in route exception
	mtu "${ns_a}"  veth_A-R1 1300
	mtu "${ns_r1}" veth_R1-A 1300
	pmtu_1="$(route_get_dst_pmtu_from_exception "${ns_a}" ${dst1})"
	check_pmtu_value "1300" "${pmtu_1}" "decreasing local MTU" || return 1
	# Second exception shouldn't be modified
	pmtu_2="$(route_get_dst_pmtu_from_exception "${ns_a}" ${dst2})"
	check_pmtu_value "1500" "${pmtu_2}" "changing local MTU on a link not on this path" || return 1

	# Increase MTU, check for PMTU increase in route exception
	mtu "${ns_a}"  veth_A-R1 1700
	mtu "${ns_r1}" veth_R1-A 1700
	pmtu_1="$(route_get_dst_pmtu_from_exception "${ns_a}" ${dst1})"
	check_pmtu_value "1700" "${pmtu_1}" "increasing local MTU" || return 1
	# Second exception shouldn't be modified
	pmtu_2="$(route_get_dst_pmtu_from_exception "${ns_a}" ${dst2})"
	check_pmtu_value "1500" "${pmtu_2}" "changing local MTU on a link not on this path" || return 1

	# Skip PMTU locking tests for IPv6
	[ $family -eq 6 ] && return 0

	# Decrease remote MTU on path via R2, get new exception
	# (a locked exception with PMTU 552 is expected, as the new MTU, 400,
	# is smaller than min_pmtu)
	mtu "${ns_r2}" veth_R2-B 400
	mtu "${ns_b}"  veth_B-R2 400
	${ns_a} ${ping} -q -M want -i 0.1 -w 2 -s 1400 ${dst2} > /dev/null
	pmtu_2="$(route_get_dst_pmtu_from_exception "${ns_a}" ${dst2})"
	check_pmtu_value "lock 552" "${pmtu_2}" "exceeding MTU, with MTU < min_pmtu" || return 1

	# Decrease local MTU below PMTU
	mtu "${ns_a}"  veth_A-R2 500
	mtu "${ns_r2}" veth_R2-A 500
	pmtu_2="$(route_get_dst_pmtu_from_exception "${ns_a}" ${dst2})"
	check_pmtu_value "500" "${pmtu_2}" "decreasing local MTU" || return 1

	# Increase local MTU
	mtu "${ns_a}"  veth_A-R2 1500
	mtu "${ns_r2}" veth_R2-A 1500
	pmtu_2="$(route_get_dst_pmtu_from_exception "${ns_a}" ${dst2})"
	check_pmtu_value "1500" "${pmtu_2}" "increasing local MTU" || return 1

	# Get new exception
	${ns_a} ${ping} -q -M want -i 0.1 -w 2 -s 1400 ${dst2} > /dev/null
	pmtu_2="$(route_get_dst_pmtu_from_exception "${ns_a}" ${dst2})"
	check_pmtu_value "lock 552" "${pmtu_2}" "exceeding MTU, with MTU < min_pmtu" || return 1
}
645
# Entry points for the tests implemented by test_pmtu_ipvX(), one per IP version
test_pmtu_ipv4_exception() { test_pmtu_ipvX 4; }

test_pmtu_ipv6_exception() { test_pmtu_ipvX 6; }
653
# test_pmtu_ipvX_over_vxlanY_or_geneveY_exception() - Check PMTU exception on
# a VXLAN or GENEVE tunnel from A to B
# $1:	tunnel type: vxlan or geneve
# $2:	IP version of the traffic sent over the tunnel: 4 for IPv4, IPv6
#	otherwise
# $3:	IP version of the tunnel transport: 4 for IPv4, IPv6 otherwise
# Return: 0 on success, 1 on failure, 2 if setup fails
test_pmtu_ipvX_over_vxlanY_or_geneveY_exception() {
	type=${1}
	family=${2}
	outer_family=${3}
	ll_mtu=4000

	# Expected PMTU is the link layer MTU minus the encapsulation overhead:
	# outer IP header (20 bytes for IPv4, 40 for IPv6), UDP header (8),
	# VXLAN/GENEVE header (8), Ethernet header (14)
	if [ ${outer_family} -eq 4 ]; then
		setup namespaces routing ${type}4 || return 2
		exp_mtu=$((ll_mtu - 20 - 8 - 8 - 14))
	else
		setup namespaces routing ${type}6 || return 2
		exp_mtu=$((ll_mtu - 40 - 8 - 8 - 14))
	fi

	trace "${ns_a}" ${type}_a    "${ns_b}"  ${type}_b \
	      "${ns_a}" veth_A-R1    "${ns_r1}" veth_R1-A \
	      "${ns_b}" veth_B-R1    "${ns_r1}" veth_R1-B

	# IPv6 unless IPv4 is requested
	ping=${ping6}
	dst=${tunnel6_b_addr}
	if [ ${family} -eq 4 ]; then
		ping=ping
		dst=${tunnel4_b_addr}
	fi

	# Links from A to R1 and tunnel interfaces get a bigger MTU than links
	# from R1 to B, so that packets exceed the MTU on the second segment
	mtu "${ns_a}"  veth_A-R1 $((ll_mtu + 1000))
	mtu "${ns_r1}" veth_R1-A $((ll_mtu + 1000))
	mtu "${ns_b}"  veth_B-R1 ${ll_mtu}
	mtu "${ns_r1}" veth_R1-B ${ll_mtu}

	mtu "${ns_a}" ${type}_a $((ll_mtu + 1000))
	mtu "${ns_b}" ${type}_b $((ll_mtu + 1000))
	${ns_a} ${ping} -q -M want -i 0.1 -w 2 -s $((ll_mtu + 500)) ${dst} > /dev/null

	# The exception should now be there, with the expected PMTU
	pmtu="$(route_get_dst_pmtu_from_exception "${ns_a}" ${dst})"
	check_pmtu_value ${exp_mtu} "${pmtu}" "exceeding link layer MTU on ${type} interface"
}
696
# Entry points for test_pmtu_ipvX_over_vxlanY_or_geneveY_exception(): arguments
# are tunnel type, IP version of tunneled traffic, IP version of the transport
test_pmtu_ipv4_vxlan4_exception() { test_pmtu_ipvX_over_vxlanY_or_geneveY_exception vxlan 4 4; }

test_pmtu_ipv6_vxlan4_exception() { test_pmtu_ipvX_over_vxlanY_or_geneveY_exception vxlan 6 4; }

test_pmtu_ipv4_geneve4_exception() { test_pmtu_ipvX_over_vxlanY_or_geneveY_exception geneve 4 4; }

test_pmtu_ipv6_geneve4_exception() { test_pmtu_ipvX_over_vxlanY_or_geneveY_exception geneve 6 4; }

test_pmtu_ipv4_vxlan6_exception() { test_pmtu_ipvX_over_vxlanY_or_geneveY_exception vxlan 4 6; }

test_pmtu_ipv6_vxlan6_exception() { test_pmtu_ipvX_over_vxlanY_or_geneveY_exception vxlan 6 6; }

test_pmtu_ipv4_geneve6_exception() { test_pmtu_ipvX_over_vxlanY_or_geneveY_exception geneve 4 6; }

test_pmtu_ipv6_geneve6_exception() { test_pmtu_ipvX_over_vxlanY_or_geneveY_exception geneve 6 6; }
728
# test_pmtu_ipvX_over_fouY_or_gueY() - Check PMTU exception on a FoU or GUE
# tunnel from A to B
# $1:	IP version of the traffic sent over the tunnel: 4 for IPv4, IPv6
#	otherwise
# $2:	IP version of the tunnel transport: 4 for IPv4, IPv6 otherwise
# $3:	encapsulation: fou or gue
# Return: 0 on success, 1 on failure, 2 if setup fails
test_pmtu_ipvX_over_fouY_or_gueY() {
	inner_family=${1}
	outer_family=${2}
	encap=${3}
	ll_mtu=4000

	setup namespaces routing ${encap}${outer_family}${inner_family} || return 2
	trace "${ns_a}" ${encap}_a   "${ns_b}"  ${encap}_b \
	      "${ns_a}" veth_A-R1    "${ns_r1}" veth_R1-A \
	      "${ns_b}" veth_B-R1    "${ns_r1}" veth_R1-B

	# IPv6 unless IPv4 is requested
	ping=${ping6}
	dst=${tunnel6_b_addr}
	if [ ${inner_family} -eq 4 ]; then
		ping=ping
		dst=${tunnel4_b_addr}
	fi

	# GUE takes four bytes more than FoU
	encap_overhead=0
	if [ "${encap}" = "gue" ]; then
		encap_overhead=4
	fi

	# Expected PMTU is the link layer MTU minus the outer IP header (20
	# bytes for IPv4; 40 plus 8 bytes of "Option 4" for IPv6), the UDP
	# header (8) and the encapsulation overhead
	if [ ${outer_family} -eq 4 ]; then
		exp_mtu=$((ll_mtu - 20 - 8 - encap_overhead))
	else
		exp_mtu=$((ll_mtu - 40 - 8 - 8 - encap_overhead))
	fi

	# Links from A to R1 and tunnel interfaces get a bigger MTU than links
	# from R1 to B, so that packets exceed the MTU on the second segment
	mtu "${ns_a}"  veth_A-R1 $((ll_mtu + 1000))
	mtu "${ns_r1}" veth_R1-A $((ll_mtu + 1000))
	mtu "${ns_b}"  veth_B-R1 ${ll_mtu}
	mtu "${ns_r1}" veth_R1-B ${ll_mtu}

	mtu "${ns_a}" ${encap}_a $((ll_mtu + 1000))
	mtu "${ns_b}" ${encap}_b $((ll_mtu + 1000))
	${ns_a} ${ping} -q -M want -i 0.1 -w 2 -s $((ll_mtu + 500)) ${dst} > /dev/null

	# The exception should now be there, with the expected PMTU
	pmtu="$(route_get_dst_pmtu_from_exception "${ns_a}" ${dst})"
	check_pmtu_value ${exp_mtu} "${pmtu}" "exceeding link layer MTU on ${encap} interface"
}
776
# Entry points for test_pmtu_ipvX_over_fouY_or_gueY(): arguments are IP version
# of tunneled traffic, IP version of the transport, encapsulation type
test_pmtu_ipv4_fou4_exception() { test_pmtu_ipvX_over_fouY_or_gueY 4 4 fou; }

test_pmtu_ipv6_fou4_exception() { test_pmtu_ipvX_over_fouY_or_gueY 6 4 fou; }

test_pmtu_ipv4_fou6_exception() { test_pmtu_ipvX_over_fouY_or_gueY 4 6 fou; }

test_pmtu_ipv6_fou6_exception() { test_pmtu_ipvX_over_fouY_or_gueY 6 6 fou; }

test_pmtu_ipv4_gue4_exception() { test_pmtu_ipvX_over_fouY_or_gueY 4 4 gue; }

test_pmtu_ipv6_gue4_exception() { test_pmtu_ipvX_over_fouY_or_gueY 6 4 gue; }

test_pmtu_ipv4_gue6_exception() { test_pmtu_ipvX_over_fouY_or_gueY 4 6 gue; }

test_pmtu_ipv6_gue6_exception() { test_pmtu_ipvX_over_fouY_or_gueY 6 6 gue; }
808
# test_pmtu_vti4_exception() - Check PMTU exception on a vti tunnel with xfrm
# Return: 0 on success, 1 on failure, 2 if setup fails
test_pmtu_vti4_exception() {
	setup namespaces veth vti4 xfrm4 || return 2
	trace "${ns_a}" veth_a    "${ns_b}" veth_b \
	      "${ns_a}" vti4_a    "${ns_b}" vti4_b

	veth_mtu=1500
	vti_mtu=$((veth_mtu - 20))

	# Biggest IP payload length fitting the tunnel: the tunnel MTU minus
	# ESP overhead, that is, SPI (4 bytes), SN (4), IV (8), ICV (16), pad
	# length (1), next header (1)
	esp_payload_rfc4106=$((vti_mtu - 4 - 4 - 8 - 16 - 1 - 1))
	# Payload passed to ping is 28 bytes shorter than that
	ping_payload=$((esp_payload_rfc4106 - 28))

	for end in a b; do
		mtu "$(eval echo \"\${ns_${end}}\")" veth_${end} ${veth_mtu}
	done
	for end in a b; do
		mtu "$(eval echo \"\${ns_${end}}\")" vti4_${end} ${vti_mtu}
	done

	# A packet with DF set that doesn't exceed the link layer MTU shouldn't
	# cause an exception to be created
	${ns_a} ping -q -M want -i 0.1 -w 2 -s ${ping_payload} ${tunnel4_b_addr} > /dev/null
	pmtu="$(route_get_dst_pmtu_from_exception "${ns_a}" ${tunnel4_b_addr})"
	if ! check_pmtu_value "" "${pmtu}" "sending packet smaller than PMTU (IP payload length ${esp_payload_rfc4106})"; then
		return 1
	fi

	# One more byte exceeds the link layer MTU: the exception should be
	# created, with the right PMTU value
	${ns_a} ping -q -M want -i 0.1 -w 2 -s $((ping_payload + 1)) ${tunnel4_b_addr} > /dev/null
	pmtu="$(route_get_dst_pmtu_from_exception "${ns_a}" ${tunnel4_b_addr})"
	check_pmtu_value "${esp_payload_rfc4106}" "${pmtu}" "exceeding PMTU (IP payload length $((esp_payload_rfc4106 + 1)))"
}
838
# test_pmtu_vti6_exception() - Check PMTU exception on a vti6 tunnel with xfrm
# Return: 0 on success, 1 on failure, 2 if setup fails
test_pmtu_vti6_exception() {
	setup namespaces veth vti6 xfrm6 || return 2
	trace "${ns_a}" veth_a    "${ns_b}" veth_b \
	      "${ns_a}" vti6_a    "${ns_b}" vti6_b
	fail=0

	# Tunnel MTU is bigger than the MTU of the underlying link: send
	# packets that need to be fragmented to create the route exception
	mtu "${ns_a}" veth_a 4000
	mtu "${ns_b}" veth_b 4000
	mtu "${ns_a}" vti6_a 5000
	mtu "${ns_b}" vti6_b 5000
	${ns_a} ${ping6} -q -i 0.1 -w 2 -s 60000 ${tunnel6_b_addr} > /dev/null

	# Without an exception, following checks are pointless: stop here
	pmtu="$(route_get_dst_pmtu_from_exception "${ns_a}" ${tunnel6_b_addr})"
	if ! check_pmtu_value any "${pmtu}" "creating tunnel exceeding link layer MTU"; then
		return 1
	fi

	# PMTU in route exception should follow a decrease of the tunnel MTU...
	mtu "${ns_a}" vti6_a 3000
	pmtu="$(route_get_dst_pmtu_from_exception "${ns_a}" ${tunnel6_b_addr})"
	if ! check_pmtu_value "3000" "${pmtu}" "decreasing tunnel MTU"; then
		fail=1
	fi

	# ...and an increase, too
	mtu "${ns_a}" vti6_a 9000
	pmtu="$(route_get_dst_pmtu_from_exception "${ns_a}" ${tunnel6_b_addr})"
	if ! check_pmtu_value "9000" "${pmtu}" "increasing tunnel MTU"; then
		fail=1
	fi

	return ${fail}
}
868
# test_pmtu_vti4_default_mtu() - Check the default MTU of a vti interface
# Return: 0 on success, 1 on failure, 2 if setup fails
test_pmtu_vti4_default_mtu() {
	setup namespaces veth vti4 || return 2

	veth_mtu="$(link_get_mtu "${ns_a}" veth_a)"
	vti4_mtu="$(link_get_mtu "${ns_a}" vti4_a)"

	# The vti device should leave room for exactly one IPv4 header, 20
	# bytes, in the MTU of the veth device
	[ $((veth_mtu - vti4_mtu)) -eq 20 ] && return 0

	err "  vti MTU ${vti4_mtu} is not veth MTU ${veth_mtu} minus IPv4 header length"
	return 1
}
880
# test_pmtu_vti6_default_mtu() - Check the default MTU of a vti6 interface
# Return: 0 on success, 1 on failure, 2 if setup fails
test_pmtu_vti6_default_mtu() {
	setup namespaces veth vti6 || return 2

	veth_mtu="$(link_get_mtu "${ns_a}" veth_a)"
	vti6_mtu="$(link_get_mtu "${ns_a}" vti6_a)"

	# The vti6 device should leave room for exactly one IPv6 header, 40
	# bytes, in the MTU of the veth device
	[ $((veth_mtu - vti6_mtu)) -eq 40 ] && return 0

	err "  vti MTU ${vti6_mtu} is not veth MTU ${veth_mtu} minus IPv6 header length"
	return 1
}
892
# test_pmtu_vti4_link_add_mtu() - Check MTU passed on creation of vti links
# Return: 0 on success, 1 on failure, 2 if setup fails or vti can't be used
test_pmtu_vti4_link_add_mtu() {
	setup namespaces || return 2

	# Probe for vti support first, with a link that is deleted right away
	if ! ${ns_a} ip link add vti4_a type vti local ${veth4_a_addr} remote ${veth4_b_addr} key 10; then
		err "  vti not supported"
		return 2
	fi
	${ns_a} ip link del vti4_a

	fail=0

	min=68
	max=$((65535 - 20))
	# Values right outside the valid range: link creation can fail, or MTU
	# can be adjusted to a proper value
	for v in $((min - 1)) $((max + 1)); do
		${ns_a} ip link add vti4_a mtu ${v} type vti local ${veth4_a_addr} remote ${veth4_b_addr} key 10 2>/dev/null || continue
		mtu="$(link_get_mtu "${ns_a}" vti4_a)"
		if [ ${mtu} -lt ${min} -o ${mtu} -gt ${max} ]; then
			err "  vti tunnel created with invalid MTU ${mtu}"
			fail=1
		fi
		${ns_a} ip link del vti4_a
	done

	# Valid values, boundaries included, should be used as they are
	for v in ${min} 1300 ${max}; do
		${ns_a} ip link add vti4_a mtu ${v} type vti local ${veth4_a_addr} remote ${veth4_b_addr} key 10
		mtu="$(link_get_mtu "${ns_a}" vti4_a)"
		${ns_a} ip link del vti4_a
		[ "${mtu}" = "${v}" ] && continue
		err "  vti MTU ${mtu} doesn't match configured value ${v}"
		fail=1
	done

	return ${fail}
}
930
# test_pmtu_vti6_link_add_mtu() - Check MTU passed on creation of vti6 links
# Return: 0 on success, 1 on failure, 2 if setup fails or vti6 can't be used
test_pmtu_vti6_link_add_mtu() {
	setup namespaces || return 2

	# Probe for vti6 support first, with a link that is deleted right away
	${ns_a} ip link add vti6_a type vti6 local ${veth6_a_addr} remote ${veth6_b_addr} key 10
	[ $? -ne 0 ] && err "  vti6 not supported" && return 2
	${ns_a} ip link del vti6_a

	fail=0

	min=68			# vti6 can carry IPv4 packets too
	max=$((65535 - 40))
	# Check invalid values first
	for v in $((min - 1)) $((max + 1)); do
		${ns_a} ip link add vti6_a mtu ${v} type vti6 local ${veth6_a_addr} remote ${veth6_b_addr} key 10 2>/dev/null
		# This can fail, or MTU can be adjusted to a proper value
		[ $? -ne 0 ] && continue
		mtu="$(link_get_mtu "${ns_a}" vti6_a)"
		if [ ${mtu} -lt ${min} ] || [ ${mtu} -gt ${max} ]; then
			# Report the MTU the link actually got, not the one
			# that was requested, as the vti4 test does
			err "  vti6 tunnel created with invalid MTU ${mtu}"
			fail=1
		fi
		${ns_a} ip link del vti6_a
	done

	# Now check valid values, including the boundaries defined above
	for v in ${min} 1280 1300 ${max}; do
		${ns_a} ip link add vti6_a mtu ${v} type vti6 local ${veth6_a_addr} remote ${veth6_b_addr} key 10
		mtu="$(link_get_mtu "${ns_a}" vti6_a)"
		${ns_a} ip link del vti6_a
		if [ "${mtu}" != "${v}" ]; then
			err "  vti6 MTU ${mtu} doesn't match configured value ${v}"
			fail=1
		fi
	done

	return ${fail}
}
968
# test_pmtu_vti6_link_change_mtu() - Check MTU of vti6 link on link changes
#
# Check that the MTU passed from userspace is used on link creation and
# changes, and that the MTU is derived from the MTU of the underlying dummy
# device, minus IPv6 header length, if no MTU is passed.
# Return: 0 on success, 1 on failure, 2 if setup fails or dummy can't be used
test_pmtu_vti6_link_change_mtu() {
	setup namespaces || return 2

	${ns_a} ip link add dummy0 mtu 1500 type dummy
	[ $? -ne 0 ] && err "  dummy not supported" && return 2
	${ns_a} ip link add dummy1 mtu 3000 type dummy
	${ns_a} ip link set dummy0 up
	${ns_a} ip link set dummy1 up

	${ns_a} ip addr add ${dummy6_0_addr}/${dummy6_mask} dev dummy0
	${ns_a} ip addr add ${dummy6_1_addr}/${dummy6_mask} dev dummy1

	fail=0

	# MTU values are compared as strings: with a numeric comparison, an
	# empty value (MTU couldn't be read, e.g. because the link wasn't
	# created) would make the test command itself fail, and this would be
	# mistaken for a match.

	# Create vti6 interface bound to device, passing MTU, check it
	${ns_a} ip link add vti6_a mtu 1300 type vti6 remote ${dummy6_0_addr} local ${dummy6_0_addr}
	mtu="$(link_get_mtu "${ns_a}" vti6_a)"
	if [ "${mtu}" != "1300" ]; then
		err "  vti6 MTU ${mtu} doesn't match configured value 1300"
		fail=1
	fi

	# Move to another device with different MTU, without passing MTU, check
	# MTU is adjusted
	${ns_a} ip link set vti6_a type vti6 remote ${dummy6_1_addr} local ${dummy6_1_addr}
	mtu="$(link_get_mtu "${ns_a}" vti6_a)"
	if [ "${mtu}" != "$((3000 - 40))" ]; then
		err "  vti MTU ${mtu} is not dummy MTU 3000 minus IPv6 header length"
		fail=1
	fi

	# Move it back, passing MTU, check MTU is not overridden
	${ns_a} ip link set vti6_a mtu 1280 type vti6 remote ${dummy6_0_addr} local ${dummy6_0_addr}
	mtu="$(link_get_mtu "${ns_a}" vti6_a)"
	if [ "${mtu}" != "1280" ]; then
		err "  vti6 MTU ${mtu} doesn't match configured value 1280"
		fail=1
	fi

	return ${fail}
}
1010
# usage() - Print usage message and list of available tests, exit with status 1
usage() {
	printf '%s\n' \
		"" \
		"$0 [OPTIONS] [TEST]..." \
		"If no TEST argument is given, all tests will be run." \
		"" \
		"Options" \
		"  --trace: capture traffic to TEST_INTERFACE.pcap" \
		"" \
		"Available tests${tests}"
	exit 1
}
1022
# Exit code of the script: set to 1 by the main loop if any test fails
exitcode=0
# Tells the main loop if the next word of ${tests} is a description (1) or a
# test name (0)
desc=0
# Split on tabs and newlines only, so that test descriptions in ${tests}, which
# contain spaces, are kept as single words. Build the value with printf instead
# of embedding a literal tab before the newline: a trailing tab is invisible,
# and if an editor strips it, test names can't be told from descriptions
# anymore. The trailing 'x' protects the newline from being stripped by
# command substitution.
IFS="$(printf '\t\nx')"
IFS="${IFS%x}"

tracing=0
for arg do
	# Arguments starting with -- are options, anything else is a test name
	if [ "${arg}" != "${arg#--*}" ]; then
		opt="${arg#--}"
		if [ "${opt}" = "trace" ]; then
			if command -v tcpdump > /dev/null 2>&1; then
				tracing=1
			else
				echo "=== tcpdump not available, tracing disabled"
			fi
		else
			usage
		fi
	else
		# Check first that all requested tests are available before
		# running any
		command -v > /dev/null "test_${arg}" || { echo "=== Test ${arg} not found"; usage; }
	fi
done
1047
# Make sure namespaces and tcpdump instances are removed on exit
trap cleanup EXIT

# Main loop: ${tests} is a list of alternating test names and descriptions
for t in ${tests}; do
	if [ $desc -eq 0 ]; then
		# This word is a test name, the next one is its description
		name="${t}"
		desc=1
		continue
	fi
	desc=0

	# Run all the tests if no test name is passed on the command line,
	# otherwise only those that were requested
	run_this=1
	for arg do
		[ "${arg}" != "${arg#--*}" ] && continue
		[ "${arg}" = "${name}" ] && run_this=1 && break
		run_this=0
	done
	[ $run_this -eq 0 ] && continue

	# Each test runs in a subshell, so that variables set by one test don't
	# affect the following ones
	(
		# Helpers rely on default word splitting
		unset IFS
		eval test_${name}
		ret=$?
		cleanup

		if [ $ret -eq 0 ]; then
			printf "TEST: %-60s  [ OK ]\n" "${t}"
		elif [ $ret -eq 1 ]; then
			printf "TEST: %-60s  [FAIL]\n" "${t}"
			err_flush
			exit 1
		elif [ $ret -eq 2 ]; then
			printf "TEST: %-60s  [SKIP]\n" "${t}"
			err_flush
		else
			# Tests are supposed to return 0, 1 or 2 only: report
			# anything else as a failure, instead of leaving the
			# test out of the results without any notice
			printf "TEST: %-60s  [FAIL]\n" "${t}"
			err "  unexpected return code ${ret}"
			err_flush
			exit 1
		fi
	)
	[ $? -ne 0 ] && exitcode=1
done

exit ${exitcode}
1082