xref: /openbmc/linux/net/ipv6/icmp.c (revision 47aab53331effedd3f5a6136854bd1da011f94b6)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  *	Internet Control Message Protocol (ICMPv6)
4  *	Linux INET6 implementation
5  *
6  *	Authors:
7  *	Pedro Roque		<roque@di.fc.ul.pt>
8  *
9  *	Based on net/ipv4/icmp.c
10  *
11  *	RFC 1885
12  */
13 
14 /*
15  *	Changes:
16  *
17  *	Andi Kleen		:	exception handling
 *	Andi Kleen			add rate limits. never reply to an icmp.
 *					add more length checks and other fixes.
 *	yoshfuji		:	ensure a parameter problem is sent for
 *					fragments.
22  *	YOSHIFUJI Hideaki @USAGI:	added sysctl for icmp rate limit.
23  *	Randy Dunlap and
24  *	YOSHIFUJI Hideaki @USAGI:	Per-interface statistics support
25  *	Kazunori MIYAZAWA @USAGI:       change output process to use ip6_append_data
26  */
27 
28 #define pr_fmt(fmt) "IPv6: " fmt
29 
30 #include <linux/module.h>
31 #include <linux/errno.h>
32 #include <linux/types.h>
33 #include <linux/socket.h>
34 #include <linux/in.h>
35 #include <linux/kernel.h>
36 #include <linux/sockios.h>
37 #include <linux/net.h>
38 #include <linux/skbuff.h>
39 #include <linux/init.h>
40 #include <linux/netfilter.h>
41 #include <linux/slab.h>
42 
43 #ifdef CONFIG_SYSCTL
44 #include <linux/sysctl.h>
45 #endif
46 
47 #include <linux/inet.h>
48 #include <linux/netdevice.h>
49 #include <linux/icmpv6.h>
50 
51 #include <net/ip.h>
52 #include <net/sock.h>
53 
54 #include <net/ipv6.h>
55 #include <net/ip6_checksum.h>
56 #include <net/ping.h>
57 #include <net/protocol.h>
58 #include <net/raw.h>
59 #include <net/rawv6.h>
60 #include <net/seg6.h>
61 #include <net/transp_v6.h>
62 #include <net/ip6_route.h>
63 #include <net/addrconf.h>
64 #include <net/icmp.h>
65 #include <net/xfrm.h>
66 #include <net/inet_common.h>
67 #include <net/dsfield.h>
68 #include <net/l3mdev.h>
69 
70 #include <linux/uaccess.h>
71 
/* Per-CPU ICMPv6 control socket; one is created for every possible CPU by
 * icmpv6_init() and borrowed through icmpv6_xmit_lock()/icmpv6_xmit_unlock().
 */
static DEFINE_PER_CPU(struct sock *, ipv6_icmp_sk);
73 
74 static int icmpv6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
75 		       u8 type, u8 code, int offset, __be32 info)
76 {
77 	/* icmpv6_notify checks 8 bytes can be pulled, icmp6hdr is 8 bytes */
78 	struct icmp6hdr *icmp6 = (struct icmp6hdr *) (skb->data + offset);
79 	struct net *net = dev_net(skb->dev);
80 
81 	if (type == ICMPV6_PKT_TOOBIG)
82 		ip6_update_pmtu(skb, net, info, skb->dev->ifindex, 0, sock_net_uid(net, NULL));
83 	else if (type == NDISC_REDIRECT)
84 		ip6_redirect(skb, net, skb->dev->ifindex, 0,
85 			     sock_net_uid(net, NULL));
86 
87 	if (!(type & ICMPV6_INFOMSG_MASK))
88 		if (icmp6->icmp6_type == ICMPV6_ECHO_REQUEST)
89 			ping_err(skb, offset, ntohl(info));
90 
91 	return 0;
92 }
93 
/* Defined further down; declared here so icmpv6_protocol can reference it. */
static int icmpv6_rcv(struct sk_buff *skb);

/* Protocol descriptor registered for IPPROTO_ICMPV6 by icmpv6_init():
 * icmpv6_rcv() handles received messages, icmpv6_err() handles errors
 * reported against ICMPv6 packets.
 */
static const struct inet6_protocol icmpv6_protocol = {
	.handler	=	icmpv6_rcv,
	.err_handler	=	icmpv6_err,
	.flags		=	INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL,
};
101 
102 /* Called with BH disabled */
103 static struct sock *icmpv6_xmit_lock(struct net *net)
104 {
105 	struct sock *sk;
106 
107 	sk = this_cpu_read(ipv6_icmp_sk);
108 	if (unlikely(!spin_trylock(&sk->sk_lock.slock))) {
109 		/* This can happen if the output path (f.e. SIT or
110 		 * ip6ip6 tunnel) signals dst_link_failure() for an
111 		 * outgoing ICMP6 packet.
112 		 */
113 		return NULL;
114 	}
115 	sock_net_set(sk, net);
116 	return sk;
117 }
118 
/*
 * Counterpart of icmpv6_xmit_lock(): point the control socket back at
 * init_net (the namespace it was created in by icmpv6_init()) and only
 * then drop its spinlock.
 */
static void icmpv6_xmit_unlock(struct sock *sk)
{
	sock_net_set(sk, &init_net);
	spin_unlock(&sk->sk_lock.slock);
}
124 
125 /*
126  * Figure out, may we reply to this packet with icmp error.
127  *
128  * We do not reply, if:
129  *	- it was icmp error message.
130  *	- it is truncated, so that it is known, that protocol is ICMPV6
131  *	  (i.e. in the middle of some exthdr)
132  *
133  *	--ANK (980726)
134  */
135 
136 static bool is_ineligible(const struct sk_buff *skb)
137 {
138 	int ptr = (u8 *)(ipv6_hdr(skb) + 1) - skb->data;
139 	int len = skb->len - ptr;
140 	__u8 nexthdr = ipv6_hdr(skb)->nexthdr;
141 	__be16 frag_off;
142 
143 	if (len < 0)
144 		return true;
145 
146 	ptr = ipv6_skip_exthdr(skb, ptr, &nexthdr, &frag_off);
147 	if (ptr < 0)
148 		return false;
149 	if (nexthdr == IPPROTO_ICMPV6) {
150 		u8 _type, *tp;
151 		tp = skb_header_pointer(skb,
152 			ptr+offsetof(struct icmp6hdr, icmp6_type),
153 			sizeof(_type), &_type);
154 
155 		/* Based on RFC 8200, Section 4.5 Fragment Header, return
156 		 * false if this is a fragment packet with no icmp header info.
157 		 */
158 		if (!tp && frag_off != 0)
159 			return false;
160 		else if (!tp || !(*tp & ICMPV6_INFOMSG_MASK))
161 			return true;
162 	}
163 	return false;
164 }
165 
166 static bool icmpv6_mask_allow(struct net *net, int type)
167 {
168 	if (type > ICMPV6_MSG_MAX)
169 		return true;
170 
171 	/* Limit if icmp type is set in ratemask. */
172 	if (!test_bit(type, net->ipv6.sysctl.icmpv6_ratemask))
173 		return true;
174 
175 	return false;
176 }
177 
178 static bool icmpv6_global_allow(struct net *net, int type)
179 {
180 	if (icmpv6_mask_allow(net, type))
181 		return true;
182 
183 	if (icmp_global_allow())
184 		return true;
185 
186 	__ICMP_INC_STATS(net, ICMP_MIB_RATELIMITGLOBAL);
187 	return false;
188 }
189 
190 /*
191  * Check the ICMP output rate limit
192  */
193 static bool icmpv6_xrlim_allow(struct sock *sk, u8 type,
194 			       struct flowi6 *fl6)
195 {
196 	struct net *net = sock_net(sk);
197 	struct dst_entry *dst;
198 	bool res = false;
199 
200 	if (icmpv6_mask_allow(net, type))
201 		return true;
202 
203 	/*
204 	 * Look up the output route.
205 	 * XXX: perhaps the expire for routing entries cloned by
206 	 * this lookup should be more aggressive (not longer than timeout).
207 	 */
208 	dst = ip6_route_output(net, sk, fl6);
209 	if (dst->error) {
210 		IP6_INC_STATS(net, ip6_dst_idev(dst),
211 			      IPSTATS_MIB_OUTNOROUTES);
212 	} else if (dst->dev && (dst->dev->flags&IFF_LOOPBACK)) {
213 		res = true;
214 	} else {
215 		struct rt6_info *rt = (struct rt6_info *)dst;
216 		int tmo = net->ipv6.sysctl.icmpv6_time;
217 		struct inet_peer *peer;
218 
219 		/* Give more bandwidth to wider prefixes. */
220 		if (rt->rt6i_dst.plen < 128)
221 			tmo >>= ((128 - rt->rt6i_dst.plen)>>5);
222 
223 		peer = inet_getpeer_v6(net->ipv6.peers, &fl6->daddr, 1);
224 		res = inet_peer_xrlim_allow(peer, tmo);
225 		if (peer)
226 			inet_putpeer(peer);
227 	}
228 	if (!res)
229 		__ICMP6_INC_STATS(net, ip6_dst_idev(dst),
230 				  ICMP6_MIB_RATELIMITHOST);
231 	dst_release(dst);
232 	return res;
233 }
234 
235 static bool icmpv6_rt_has_prefsrc(struct sock *sk, u8 type,
236 				  struct flowi6 *fl6)
237 {
238 	struct net *net = sock_net(sk);
239 	struct dst_entry *dst;
240 	bool res = false;
241 
242 	dst = ip6_route_output(net, sk, fl6);
243 	if (!dst->error) {
244 		struct rt6_info *rt = (struct rt6_info *)dst;
245 		struct in6_addr prefsrc;
246 
247 		rt6_get_prefsrc(rt, &prefsrc);
248 		res = !ipv6_addr_any(&prefsrc);
249 	}
250 	dst_release(dst);
251 	return res;
252 }
253 
254 /*
255  *	an inline helper for the "simple" if statement below
256  *	checks if parameter problem report is caused by an
257  *	unrecognized IPv6 option that has the Option Type
258  *	highest-order two bits set to 10
259  */
260 
261 static bool opt_unrec(struct sk_buff *skb, __u32 offset)
262 {
263 	u8 _optval, *op;
264 
265 	offset += skb_network_offset(skb);
266 	op = skb_header_pointer(skb, offset, sizeof(_optval), &_optval);
267 	if (!op)
268 		return true;
269 	return (*op & 0xC0) == 0x80;
270 }
271 
272 void icmpv6_push_pending_frames(struct sock *sk, struct flowi6 *fl6,
273 				struct icmp6hdr *thdr, int len)
274 {
275 	struct sk_buff *skb;
276 	struct icmp6hdr *icmp6h;
277 
278 	skb = skb_peek(&sk->sk_write_queue);
279 	if (!skb)
280 		return;
281 
282 	icmp6h = icmp6_hdr(skb);
283 	memcpy(icmp6h, thdr, sizeof(struct icmp6hdr));
284 	icmp6h->icmp6_cksum = 0;
285 
286 	if (skb_queue_len(&sk->sk_write_queue) == 1) {
287 		skb->csum = csum_partial(icmp6h,
288 					sizeof(struct icmp6hdr), skb->csum);
289 		icmp6h->icmp6_cksum = csum_ipv6_magic(&fl6->saddr,
290 						      &fl6->daddr,
291 						      len, fl6->flowi6_proto,
292 						      skb->csum);
293 	} else {
294 		__wsum tmp_csum = 0;
295 
296 		skb_queue_walk(&sk->sk_write_queue, skb) {
297 			tmp_csum = csum_add(tmp_csum, skb->csum);
298 		}
299 
300 		tmp_csum = csum_partial(icmp6h,
301 					sizeof(struct icmp6hdr), tmp_csum);
302 		icmp6h->icmp6_cksum = csum_ipv6_magic(&fl6->saddr,
303 						      &fl6->daddr,
304 						      len, fl6->flowi6_proto,
305 						      tmp_csum);
306 	}
307 	ip6_push_pending_frames(sk);
308 }
309 
/* Cookie handed to icmpv6_getfrag() through ip6_append_data(). */
struct icmpv6_msg {
	struct sk_buff	*skb;		/* packet whose bytes are copied into the message */
	int		offset;		/* start of the copied data within skb */
	uint8_t		type;		/* ICMPv6 type being sent */
};
315 
316 static int icmpv6_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb)
317 {
318 	struct icmpv6_msg *msg = (struct icmpv6_msg *) from;
319 	struct sk_buff *org_skb = msg->skb;
320 	__wsum csum;
321 
322 	csum = skb_copy_and_csum_bits(org_skb, msg->offset + offset,
323 				      to, len);
324 	skb->csum = csum_block_add(skb->csum, csum, odd);
325 	if (!(msg->type & ICMPV6_INFOMSG_MASK))
326 		nf_ct_attach(skb, org_skb);
327 	return 0;
328 }
329 
330 #if IS_ENABLED(CONFIG_IPV6_MIP6)
331 static void mip6_addr_swap(struct sk_buff *skb, const struct inet6_skb_parm *opt)
332 {
333 	struct ipv6hdr *iph = ipv6_hdr(skb);
334 	struct ipv6_destopt_hao *hao;
335 	int off;
336 
337 	if (opt->dsthao) {
338 		off = ipv6_find_tlv(skb, opt->dsthao, IPV6_TLV_HAO);
339 		if (likely(off >= 0)) {
340 			hao = (struct ipv6_destopt_hao *)
341 					(skb_network_header(skb) + off);
342 			swap(iph->saddr, hao->addr);
343 		}
344 	}
345 }
346 #else
347 static inline void mip6_addr_swap(struct sk_buff *skb, const struct inet6_skb_parm *opt) {}
348 #endif
349 
/*
 * Find the route (after xfrm policy) used to send an ICMPv6 error.
 *
 * @net: namespace to look up in
 * @skb: offending packet; used to decode the reverse flow for the second
 *       xfrm lookup
 * @sk:  ICMPv6 control socket
 * @fl6: flow of the ICMPv6 message about to be sent
 *
 * Returns a dst the caller must release, or an ERR_PTR():
 *	-EINVAL when the destination is a known anycast address and
 *	icmpv6_error_anycast_as_unicast is off, otherwise the error of the
 *	failing lookup.
 */
static struct dst_entry *icmpv6_route_lookup(struct net *net,
					     struct sk_buff *skb,
					     struct sock *sk,
					     struct flowi6 *fl6)
{
	struct dst_entry *dst, *dst2;
	struct flowi6 fl2;
	int err;

	err = ip6_dst_lookup(net, sk, &dst, fl6);
	if (err)
		return ERR_PTR(err);

	/*
	 * We won't send icmp if the destination is known
	 * anycast unless we need to treat anycast as unicast.
	 */
	if (!READ_ONCE(net->ipv6.sysctl.icmpv6_error_anycast_as_unicast) &&
	    ipv6_anycast_destination(dst, &fl6->daddr)) {
		net_dbg_ratelimited("icmp6_send: acast source\n");
		dst_release(dst);
		return ERR_PTR(-EINVAL);
	}

	/* No need to clone since we're just using its address. */
	dst2 = dst;

	/* First attempt: forward flow. A dst different from the plain route
	 * is used as-is; -EPERM falls through to the reverse lookup with no
	 * fallback dst; any other error is returned to the caller.
	 */
	dst = xfrm_lookup(net, dst, flowi6_to_flowi(fl6), sk, 0);
	if (!IS_ERR(dst)) {
		if (dst != dst2)
			return dst;
	} else {
		if (PTR_ERR(dst) == -EPERM)
			dst = NULL;
		else
			return dst;
	}

	/* Second attempt: rebuild the flow from the offending packet in the
	 * reverse direction and look it up with XFRM_LOOKUP_ICMP.
	 */
	err = xfrm_decode_session_reverse(skb, flowi6_to_flowi(&fl2), AF_INET6);
	if (err)
		goto relookup_failed;

	err = ip6_dst_lookup(net, sk, &dst2, &fl2);
	if (err)
		goto relookup_failed;

	dst2 = xfrm_lookup(net, dst2, flowi6_to_flowi(&fl2), sk, XFRM_LOOKUP_ICMP);
	if (!IS_ERR(dst2)) {
		/* Reverse lookup succeeded: it replaces the first result. */
		dst_release(dst);
		dst = dst2;
	} else {
		err = PTR_ERR(dst2);
		if (err == -EPERM) {
			/* Policy forbids sending: drop the fallback too. */
			dst_release(dst);
			return dst2;
		} else
			goto relookup_failed;
	}

	/* Reached both on success (dst is the reverse-lookup result) and on a
	 * failed relookup (dst is the first result, or NULL after -EPERM).
	 */
relookup_failed:
	if (dst)
		return dst;
	return ERR_PTR(err);
}
414 
415 static struct net_device *icmp6_dev(const struct sk_buff *skb)
416 {
417 	struct net_device *dev = skb->dev;
418 
419 	/* for local traffic to local address, skb dev is the loopback
420 	 * device. Check if there is a dst attached to the skb and if so
421 	 * get the real device index. Same is needed for replies to a link
422 	 * local address on a device enslaved to an L3 master device
423 	 */
424 	if (unlikely(dev->ifindex == LOOPBACK_IFINDEX || netif_is_l3_master(skb->dev))) {
425 		const struct rt6_info *rt6 = skb_rt6_info(skb);
426 
427 		if (rt6)
428 			dev = rt6->rt6i_idev->dev;
429 	}
430 
431 	return dev;
432 }
433 
434 static int icmp6_iif(const struct sk_buff *skb)
435 {
436 	return icmp6_dev(skb)->ifindex;
437 }
438 
439 /*
440  *	Send an ICMP message in response to a packet in error
441  */
442 void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info,
443 		const struct in6_addr *force_saddr,
444 		const struct inet6_skb_parm *parm)
445 {
446 	struct inet6_dev *idev = NULL;
447 	struct ipv6hdr *hdr = ipv6_hdr(skb);
448 	struct sock *sk;
449 	struct net *net;
450 	struct ipv6_pinfo *np;
451 	const struct in6_addr *saddr = NULL;
452 	struct dst_entry *dst;
453 	struct icmp6hdr tmp_hdr;
454 	struct flowi6 fl6;
455 	struct icmpv6_msg msg;
456 	struct ipcm6_cookie ipc6;
457 	int iif = 0;
458 	int addr_type = 0;
459 	int len;
460 	u32 mark;
461 
462 	if ((u8 *)hdr < skb->head ||
463 	    (skb_network_header(skb) + sizeof(*hdr)) > skb_tail_pointer(skb))
464 		return;
465 
466 	if (!skb->dev)
467 		return;
468 	net = dev_net(skb->dev);
469 	mark = IP6_REPLY_MARK(net, skb->mark);
470 	/*
471 	 *	Make sure we respect the rules
472 	 *	i.e. RFC 1885 2.4(e)
473 	 *	Rule (e.1) is enforced by not using icmp6_send
474 	 *	in any code that processes icmp errors.
475 	 */
476 	addr_type = ipv6_addr_type(&hdr->daddr);
477 
478 	if (ipv6_chk_addr(net, &hdr->daddr, skb->dev, 0) ||
479 	    ipv6_chk_acast_addr_src(net, skb->dev, &hdr->daddr))
480 		saddr = &hdr->daddr;
481 
482 	/*
483 	 *	Dest addr check
484 	 */
485 
486 	if (addr_type & IPV6_ADDR_MULTICAST || skb->pkt_type != PACKET_HOST) {
487 		if (type != ICMPV6_PKT_TOOBIG &&
488 		    !(type == ICMPV6_PARAMPROB &&
489 		      code == ICMPV6_UNK_OPTION &&
490 		      (opt_unrec(skb, info))))
491 			return;
492 
493 		saddr = NULL;
494 	}
495 
496 	addr_type = ipv6_addr_type(&hdr->saddr);
497 
498 	/*
499 	 *	Source addr check
500 	 */
501 
502 	if (__ipv6_addr_needs_scope_id(addr_type)) {
503 		iif = icmp6_iif(skb);
504 	} else {
505 		/*
506 		 * The source device is used for looking up which routing table
507 		 * to use for sending an ICMP error.
508 		 */
509 		iif = l3mdev_master_ifindex(skb->dev);
510 	}
511 
512 	/*
513 	 *	Must not send error if the source does not uniquely
514 	 *	identify a single node (RFC2463 Section 2.4).
515 	 *	We check unspecified / multicast addresses here,
516 	 *	and anycast addresses will be checked later.
517 	 */
518 	if ((addr_type == IPV6_ADDR_ANY) || (addr_type & IPV6_ADDR_MULTICAST)) {
519 		net_dbg_ratelimited("icmp6_send: addr_any/mcast source [%pI6c > %pI6c]\n",
520 				    &hdr->saddr, &hdr->daddr);
521 		return;
522 	}
523 
524 	/*
525 	 *	Never answer to a ICMP packet.
526 	 */
527 	if (is_ineligible(skb)) {
528 		net_dbg_ratelimited("icmp6_send: no reply to icmp error [%pI6c > %pI6c]\n",
529 				    &hdr->saddr, &hdr->daddr);
530 		return;
531 	}
532 
533 	/* Needed by both icmp_global_allow and icmpv6_xmit_lock */
534 	local_bh_disable();
535 
536 	/* Check global sysctl_icmp_msgs_per_sec ratelimit */
537 	if (!(skb->dev->flags & IFF_LOOPBACK) && !icmpv6_global_allow(net, type))
538 		goto out_bh_enable;
539 
540 	mip6_addr_swap(skb, parm);
541 
542 	sk = icmpv6_xmit_lock(net);
543 	if (!sk)
544 		goto out_bh_enable;
545 
546 	memset(&fl6, 0, sizeof(fl6));
547 	fl6.flowi6_proto = IPPROTO_ICMPV6;
548 	fl6.daddr = hdr->saddr;
549 	if (force_saddr)
550 		saddr = force_saddr;
551 	if (saddr) {
552 		fl6.saddr = *saddr;
553 	} else if (!icmpv6_rt_has_prefsrc(sk, type, &fl6)) {
554 		/* select a more meaningful saddr from input if */
555 		struct net_device *in_netdev;
556 
557 		in_netdev = dev_get_by_index(net, parm->iif);
558 		if (in_netdev) {
559 			ipv6_dev_get_saddr(net, in_netdev, &fl6.daddr,
560 					   inet6_sk(sk)->srcprefs,
561 					   &fl6.saddr);
562 			dev_put(in_netdev);
563 		}
564 	}
565 	fl6.flowi6_mark = mark;
566 	fl6.flowi6_oif = iif;
567 	fl6.fl6_icmp_type = type;
568 	fl6.fl6_icmp_code = code;
569 	fl6.flowi6_uid = sock_net_uid(net, NULL);
570 	fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, NULL);
571 	security_skb_classify_flow(skb, flowi6_to_flowi_common(&fl6));
572 
573 	np = inet6_sk(sk);
574 
575 	if (!icmpv6_xrlim_allow(sk, type, &fl6))
576 		goto out;
577 
578 	tmp_hdr.icmp6_type = type;
579 	tmp_hdr.icmp6_code = code;
580 	tmp_hdr.icmp6_cksum = 0;
581 	tmp_hdr.icmp6_pointer = htonl(info);
582 
583 	if (!fl6.flowi6_oif && ipv6_addr_is_multicast(&fl6.daddr))
584 		fl6.flowi6_oif = np->mcast_oif;
585 	else if (!fl6.flowi6_oif)
586 		fl6.flowi6_oif = np->ucast_oif;
587 
588 	ipcm6_init_sk(&ipc6, np);
589 	ipc6.sockc.mark = mark;
590 	fl6.flowlabel = ip6_make_flowinfo(ipc6.tclass, fl6.flowlabel);
591 
592 	dst = icmpv6_route_lookup(net, skb, sk, &fl6);
593 	if (IS_ERR(dst))
594 		goto out;
595 
596 	ipc6.hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst);
597 
598 	msg.skb = skb;
599 	msg.offset = skb_network_offset(skb);
600 	msg.type = type;
601 
602 	len = skb->len - msg.offset;
603 	len = min_t(unsigned int, len, IPV6_MIN_MTU - sizeof(struct ipv6hdr) - sizeof(struct icmp6hdr));
604 	if (len < 0) {
605 		net_dbg_ratelimited("icmp: len problem [%pI6c > %pI6c]\n",
606 				    &hdr->saddr, &hdr->daddr);
607 		goto out_dst_release;
608 	}
609 
610 	rcu_read_lock();
611 	idev = __in6_dev_get(skb->dev);
612 
613 	if (ip6_append_data(sk, icmpv6_getfrag, &msg,
614 			    len + sizeof(struct icmp6hdr),
615 			    sizeof(struct icmp6hdr),
616 			    &ipc6, &fl6, (struct rt6_info *)dst,
617 			    MSG_DONTWAIT)) {
618 		ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTERRORS);
619 		ip6_flush_pending_frames(sk);
620 	} else {
621 		icmpv6_push_pending_frames(sk, &fl6, &tmp_hdr,
622 					   len + sizeof(struct icmp6hdr));
623 	}
624 	rcu_read_unlock();
625 out_dst_release:
626 	dst_release(dst);
627 out:
628 	icmpv6_xmit_unlock(sk);
629 out_bh_enable:
630 	local_bh_enable();
631 }
632 EXPORT_SYMBOL(icmp6_send);
633 
634 /* Slightly more convenient version of icmp6_send with drop reasons.
635  */
636 void icmpv6_param_prob_reason(struct sk_buff *skb, u8 code, int pos,
637 			      enum skb_drop_reason reason)
638 {
639 	icmp6_send(skb, ICMPV6_PARAMPROB, code, pos, NULL, IP6CB(skb));
640 	kfree_skb_reason(skb, reason);
641 }
642 
643 /* Generate icmpv6 with type/code ICMPV6_DEST_UNREACH/ICMPV6_ADDR_UNREACH
644  * if sufficient data bytes are available
645  * @nhs is the size of the tunnel header(s) :
646  *  Either an IPv4 header for SIT encap
647  *         an IPv4 header + GRE header for GRE encap
648  */
649 int ip6_err_gen_icmpv6_unreach(struct sk_buff *skb, int nhs, int type,
650 			       unsigned int data_len)
651 {
652 	struct in6_addr temp_saddr;
653 	struct rt6_info *rt;
654 	struct sk_buff *skb2;
655 	u32 info = 0;
656 
657 	if (!pskb_may_pull(skb, nhs + sizeof(struct ipv6hdr) + 8))
658 		return 1;
659 
660 	/* RFC 4884 (partial) support for ICMP extensions */
661 	if (data_len < 128 || (data_len & 7) || skb->len < data_len)
662 		data_len = 0;
663 
664 	skb2 = data_len ? skb_copy(skb, GFP_ATOMIC) : skb_clone(skb, GFP_ATOMIC);
665 
666 	if (!skb2)
667 		return 1;
668 
669 	skb_dst_drop(skb2);
670 	skb_pull(skb2, nhs);
671 	skb_reset_network_header(skb2);
672 
673 	rt = rt6_lookup(dev_net(skb->dev), &ipv6_hdr(skb2)->saddr, NULL, 0,
674 			skb, 0);
675 
676 	if (rt && rt->dst.dev)
677 		skb2->dev = rt->dst.dev;
678 
679 	ipv6_addr_set_v4mapped(ip_hdr(skb)->saddr, &temp_saddr);
680 
681 	if (data_len) {
682 		/* RFC 4884 (partial) support :
683 		 * insert 0 padding at the end, before the extensions
684 		 */
685 		__skb_push(skb2, nhs);
686 		skb_reset_network_header(skb2);
687 		memmove(skb2->data, skb2->data + nhs, data_len - nhs);
688 		memset(skb2->data + data_len - nhs, 0, nhs);
689 		/* RFC 4884 4.5 : Length is measured in 64-bit words,
690 		 * and stored in reserved[0]
691 		 */
692 		info = (data_len/8) << 24;
693 	}
694 	if (type == ICMP_TIME_EXCEEDED)
695 		icmp6_send(skb2, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT,
696 			   info, &temp_saddr, IP6CB(skb2));
697 	else
698 		icmp6_send(skb2, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH,
699 			   info, &temp_saddr, IP6CB(skb2));
700 	if (rt)
701 		ip6_rt_put(rt);
702 
703 	kfree_skb(skb2);
704 
705 	return 0;
706 }
707 EXPORT_SYMBOL(ip6_err_gen_icmpv6_unreach);
708 
/*
 * Answer an (extended) echo request held in @skb.
 *
 * Returns SKB_CONSUMED when the reply was queued and pushed; on every
 * other path the value the SKB_DR() declaration gave @reason is returned
 * unchanged. The caller frees @skb.
 */
static enum skb_drop_reason icmpv6_echo_reply(struct sk_buff *skb)
{
	struct net *net = dev_net(skb->dev);
	struct sock *sk;
	struct inet6_dev *idev;
	struct ipv6_pinfo *np;
	const struct in6_addr *saddr = NULL;
	struct icmp6hdr *icmph = icmp6_hdr(skb);
	struct icmp6hdr tmp_hdr;
	struct flowi6 fl6;
	struct icmpv6_msg msg;
	struct dst_entry *dst;
	struct ipcm6_cookie ipc6;
	u32 mark = IP6_REPLY_MARK(net, skb->mark);
	SKB_DR(reason);
	bool acast;
	u8 type;

	/* Honour the echo_ignore_multicast sysctl. */
	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr) &&
	    net->ipv6.sysctl.icmpv6_echo_ignore_multicast)
		return reason;

	/* By default reply from the address the request was sent to. */
	saddr = &ipv6_hdr(skb)->daddr;

	/* Honour the echo_ignore_anycast sysctl. */
	acast = ipv6_anycast_destination(skb_dst(skb), saddr);
	if (acast && net->ipv6.sysctl.icmpv6_echo_ignore_anycast)
		return reason;

	/* Non-unicast destination: leave source selection to routing, unless
	 * it is anycast and anycast_src_echo_reply is enabled.
	 */
	if (!ipv6_unicast_destination(skb) &&
	    !(net->ipv6.sysctl.anycast_src_echo_reply && acast))
		saddr = NULL;

	if (icmph->icmp6_type == ICMPV6_EXT_ECHO_REQUEST)
		type = ICMPV6_EXT_ECHO_REPLY;
	else
		type = ICMPV6_ECHO_REPLY;

	/* The reply header is the request header with only the type changed. */
	memcpy(&tmp_hdr, icmph, sizeof(tmp_hdr));
	tmp_hdr.icmp6_type = type;

	memset(&fl6, 0, sizeof(fl6));
	if (net->ipv6.sysctl.flowlabel_reflect & FLOWLABEL_REFLECT_ICMPV6_ECHO_REPLIES)
		fl6.flowlabel = ip6_flowlabel(ipv6_hdr(skb));

	fl6.flowi6_proto = IPPROTO_ICMPV6;
	fl6.daddr = ipv6_hdr(skb)->saddr;
	if (saddr)
		fl6.saddr = *saddr;
	fl6.flowi6_oif = icmp6_iif(skb);
	fl6.fl6_icmp_type = type;
	fl6.flowi6_mark = mark;
	fl6.flowi6_uid = sock_net_uid(net, NULL);
	security_skb_classify_flow(skb, flowi6_to_flowi_common(&fl6));

	/* icmpv6_xmit_lock() must be called with BH disabled. */
	local_bh_disable();
	sk = icmpv6_xmit_lock(net);
	if (!sk)
		goto out_bh_enable;
	np = inet6_sk(sk);

	/* No output interface chosen yet: use the socket's default. */
	if (!fl6.flowi6_oif && ipv6_addr_is_multicast(&fl6.daddr))
		fl6.flowi6_oif = np->mcast_oif;
	else if (!fl6.flowi6_oif)
		fl6.flowi6_oif = np->ucast_oif;

	if (ip6_dst_lookup(net, sk, &dst, &fl6))
		goto out;
	dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), sk, 0);
	if (IS_ERR(dst))
		goto out;

	/* Check the ratelimit */
	/* Note: both checks are made against ICMPV6_ECHO_REPLY, also when
	 * the reply being sent is ICMPV6_EXT_ECHO_REPLY.
	 */
	if ((!(skb->dev->flags & IFF_LOOPBACK) && !icmpv6_global_allow(net, ICMPV6_ECHO_REPLY)) ||
	    !icmpv6_xrlim_allow(sk, ICMPV6_ECHO_REPLY, &fl6))
		goto out_dst_release;

	idev = __in6_dev_get(skb->dev);

	/* The reply payload is the request's data starting at skb->data. */
	msg.skb = skb;
	msg.offset = 0;
	msg.type = type;

	ipcm6_init_sk(&ipc6, np);
	ipc6.hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst);
	/* Reflect the request's DS field in the reply. */
	ipc6.tclass = ipv6_get_dsfield(ipv6_hdr(skb));
	ipc6.sockc.mark = mark;

	/* Extended echo: let icmp_build_probe() fill in the reply header;
	 * send nothing if it declines.
	 */
	if (icmph->icmp6_type == ICMPV6_EXT_ECHO_REQUEST)
		if (!icmp_build_probe(skb, (struct icmphdr *)&tmp_hdr))
			goto out_dst_release;

	if (ip6_append_data(sk, icmpv6_getfrag, &msg,
			    skb->len + sizeof(struct icmp6hdr),
			    sizeof(struct icmp6hdr), &ipc6, &fl6,
			    (struct rt6_info *)dst, MSG_DONTWAIT)) {
		__ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTERRORS);
		ip6_flush_pending_frames(sk);
	} else {
		icmpv6_push_pending_frames(sk, &fl6, &tmp_hdr,
					   skb->len + sizeof(struct icmp6hdr));
		reason = SKB_CONSUMED;
	}
out_dst_release:
	dst_release(dst);
out:
	icmpv6_xmit_unlock(sk);
out_bh_enable:
	local_bh_enable();
	return reason;
}
819 
820 enum skb_drop_reason icmpv6_notify(struct sk_buff *skb, u8 type,
821 				   u8 code, __be32 info)
822 {
823 	struct inet6_skb_parm *opt = IP6CB(skb);
824 	struct net *net = dev_net(skb->dev);
825 	const struct inet6_protocol *ipprot;
826 	enum skb_drop_reason reason;
827 	int inner_offset;
828 	__be16 frag_off;
829 	u8 nexthdr;
830 
831 	reason = pskb_may_pull_reason(skb, sizeof(struct ipv6hdr));
832 	if (reason != SKB_NOT_DROPPED_YET)
833 		goto out;
834 
835 	seg6_icmp_srh(skb, opt);
836 
837 	nexthdr = ((struct ipv6hdr *)skb->data)->nexthdr;
838 	if (ipv6_ext_hdr(nexthdr)) {
839 		/* now skip over extension headers */
840 		inner_offset = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr),
841 						&nexthdr, &frag_off);
842 		if (inner_offset < 0) {
843 			SKB_DR_SET(reason, IPV6_BAD_EXTHDR);
844 			goto out;
845 		}
846 	} else {
847 		inner_offset = sizeof(struct ipv6hdr);
848 	}
849 
850 	/* Checkin header including 8 bytes of inner protocol header. */
851 	reason = pskb_may_pull_reason(skb, inner_offset + 8);
852 	if (reason != SKB_NOT_DROPPED_YET)
853 		goto out;
854 
855 	/* BUGGG_FUTURE: we should try to parse exthdrs in this packet.
856 	   Without this we will not able f.e. to make source routed
857 	   pmtu discovery.
858 	   Corresponding argument (opt) to notifiers is already added.
859 	   --ANK (980726)
860 	 */
861 
862 	ipprot = rcu_dereference(inet6_protos[nexthdr]);
863 	if (ipprot && ipprot->err_handler)
864 		ipprot->err_handler(skb, opt, type, code, inner_offset, info);
865 
866 	raw6_icmp_error(skb, nexthdr, type, code, inner_offset, info);
867 	return SKB_CONSUMED;
868 
869 out:
870 	__ICMP6_INC_STATS(net, __in6_dev_get(skb->dev), ICMP6_MIB_INERRORS);
871 	return reason;
872 }
873 
/*
 *	Handle icmp messages
 *
 *	Receive handler installed in icmpv6_protocol. Runs the xfrm policy
 *	checks, validates the checksum, updates the MIB counters and
 *	dispatches on the ICMPv6 type. Always returns 0; @skb is freed or
 *	consumed on every path except the two MLD cases, which hand it to
 *	igmp6_event_query()/igmp6_event_report() and return immediately.
 */

static int icmpv6_rcv(struct sk_buff *skb)
{
	enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED;
	struct net *net = dev_net(skb->dev);
	struct net_device *dev = icmp6_dev(skb);
	struct inet6_dev *idev = __in6_dev_get(dev);
	const struct in6_addr *saddr, *daddr;
	struct icmp6hdr *hdr;
	u8 type;

	if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) {
		struct sec_path *sp = skb_sec_path(skb);
		int nh;

		/* The forward policy check failed: only continue when the
		 * last state in the sec path has XFRM_STATE_ICMP set.
		 */
		if (!(sp && sp->xvec[sp->len - 1]->props.flags &
				 XFRM_STATE_ICMP)) {
			reason = SKB_DROP_REASON_XFRM_POLICY;
			goto drop_no_count;
		}

		if (!pskb_may_pull(skb, sizeof(*hdr) + sizeof(struct ipv6hdr)))
			goto drop_no_count;

		/* Temporarily point the network header just past the ICMPv6
		 * header for the reverse policy check, then restore it.
		 */
		nh = skb_network_offset(skb);
		skb_set_network_header(skb, sizeof(*hdr));

		if (!xfrm6_policy_check_reverse(NULL, XFRM_POLICY_IN,
						skb)) {
			reason = SKB_DROP_REASON_XFRM_POLICY;
			goto drop_no_count;
		}

		skb_set_network_header(skb, nh);
	}

	__ICMP6_INC_STATS(dev_net(dev), idev, ICMP6_MIB_INMSGS);

	saddr = &ipv6_hdr(skb)->saddr;
	daddr = &ipv6_hdr(skb)->daddr;

	if (skb_checksum_validate(skb, IPPROTO_ICMPV6, ip6_compute_pseudo)) {
		net_dbg_ratelimited("ICMPv6 checksum failed [%pI6c > %pI6c]\n",
				    saddr, daddr);
		goto csum_error;
	}

	/* Advance skb->data past the ICMPv6 header; hdr still reaches it
	 * through the transport header.
	 */
	if (!pskb_pull(skb, sizeof(*hdr)))
		goto discard_it;

	hdr = icmp6_hdr(skb);

	type = hdr->icmp6_type;

	ICMP6MSGIN_INC_STATS(dev_net(dev), idev, type);

	switch (type) {
	case ICMPV6_ECHO_REQUEST:
		if (!net->ipv6.sysctl.icmpv6_echo_ignore_all)
			reason = icmpv6_echo_reply(skb);
		break;
	case ICMPV6_EXT_ECHO_REQUEST:
		/* Extended echo additionally requires the (IPv4 namespace)
		 * icmp_echo_enable_probe sysctl.
		 */
		if (!net->ipv6.sysctl.icmpv6_echo_ignore_all &&
		    READ_ONCE(net->ipv4.sysctl_icmp_echo_enable_probe))
			reason = icmpv6_echo_reply(skb);
		break;

	case ICMPV6_ECHO_REPLY:
		reason = ping_rcv(skb);
		break;

	case ICMPV6_EXT_ECHO_REPLY:
		reason = ping_rcv(skb);
		break;

	case ICMPV6_PKT_TOOBIG:
		/* BUGGG_FUTURE: if packet contains rthdr, we cannot update
		   standard destination cache. Seems, only "advanced"
		   destination cache will allow to solve this problem
		   --ANK (980726)
		 */
		if (!pskb_may_pull(skb, sizeof(struct ipv6hdr)))
			goto discard_it;
		/* Reload hdr after pskb_may_pull(). */
		hdr = icmp6_hdr(skb);

		/* to notify */
		fallthrough;
	case ICMPV6_DEST_UNREACH:
	case ICMPV6_TIME_EXCEED:
	case ICMPV6_PARAMPROB:
		reason = icmpv6_notify(skb, type, hdr->icmp6_code,
				       hdr->icmp6_mtu);
		break;

	case NDISC_ROUTER_SOLICITATION:
	case NDISC_ROUTER_ADVERTISEMENT:
	case NDISC_NEIGHBOUR_SOLICITATION:
	case NDISC_NEIGHBOUR_ADVERTISEMENT:
	case NDISC_REDIRECT:
		reason = ndisc_rcv(skb);
		break;

	case ICMPV6_MGM_QUERY:
		/* Returns without freeing skb here. */
		igmp6_event_query(skb);
		return 0;

	case ICMPV6_MGM_REPORT:
		/* Returns without freeing skb here. */
		igmp6_event_report(skb);
		return 0;

	/* Known types with no processing in this function. */
	case ICMPV6_MGM_REDUCTION:
	case ICMPV6_NI_QUERY:
	case ICMPV6_NI_REPLY:
	case ICMPV6_MLD2_REPORT:
	case ICMPV6_DHAAD_REQUEST:
	case ICMPV6_DHAAD_REPLY:
	case ICMPV6_MOBILE_PREFIX_SOL:
	case ICMPV6_MOBILE_PREFIX_ADV:
		break;

	default:
		/* informational */
		if (type & ICMPV6_INFOMSG_MASK)
			break;

		net_dbg_ratelimited("icmpv6: msg of unknown type [%pI6c > %pI6c]\n",
				    saddr, daddr);

		/*
		 * error of unknown type.
		 * must pass to upper level
		 */

		reason = icmpv6_notify(skb, type, hdr->icmp6_code,
				       hdr->icmp6_mtu);
	}

	/* until the v6 path can be better sorted assume failure and
	 * preserve the status quo behaviour for the rest of the paths to here
	 */
	if (reason)
		kfree_skb_reason(skb, reason);
	else
		consume_skb(skb);

	return 0;

	/* Error exits fall through one another: a checksum error is also
	 * counted as an input error, and every exit frees the skb.
	 */
csum_error:
	reason = SKB_DROP_REASON_ICMP_CSUM;
	__ICMP6_INC_STATS(dev_net(dev), idev, ICMP6_MIB_CSUMERRORS);
discard_it:
	__ICMP6_INC_STATS(dev_net(dev), idev, ICMP6_MIB_INERRORS);
drop_no_count:
	kfree_skb_reason(skb, reason);
	return 0;
}
1033 
1034 void icmpv6_flow_init(struct sock *sk, struct flowi6 *fl6,
1035 		      u8 type,
1036 		      const struct in6_addr *saddr,
1037 		      const struct in6_addr *daddr,
1038 		      int oif)
1039 {
1040 	memset(fl6, 0, sizeof(*fl6));
1041 	fl6->saddr = *saddr;
1042 	fl6->daddr = *daddr;
1043 	fl6->flowi6_proto	= IPPROTO_ICMPV6;
1044 	fl6->fl6_icmp_type	= type;
1045 	fl6->fl6_icmp_code	= 0;
1046 	fl6->flowi6_oif		= oif;
1047 	security_sk_classify_flow(sk, flowi6_to_flowi_common(fl6));
1048 }
1049 
1050 int __init icmpv6_init(void)
1051 {
1052 	struct sock *sk;
1053 	int err, i;
1054 
1055 	for_each_possible_cpu(i) {
1056 		err = inet_ctl_sock_create(&sk, PF_INET6,
1057 					   SOCK_RAW, IPPROTO_ICMPV6, &init_net);
1058 		if (err < 0) {
1059 			pr_err("Failed to initialize the ICMP6 control socket (err %d)\n",
1060 			       err);
1061 			return err;
1062 		}
1063 
1064 		per_cpu(ipv6_icmp_sk, i) = sk;
1065 
1066 		/* Enough space for 2 64K ICMP packets, including
1067 		 * sk_buff struct overhead.
1068 		 */
1069 		sk->sk_sndbuf = 2 * SKB_TRUESIZE(64 * 1024);
1070 	}
1071 
1072 	err = -EAGAIN;
1073 	if (inet6_add_protocol(&icmpv6_protocol, IPPROTO_ICMPV6) < 0)
1074 		goto fail;
1075 
1076 	err = inet6_register_icmp_sender(icmp6_send);
1077 	if (err)
1078 		goto sender_reg_err;
1079 	return 0;
1080 
1081 sender_reg_err:
1082 	inet6_del_protocol(&icmpv6_protocol, IPPROTO_ICMPV6);
1083 fail:
1084 	pr_err("Failed to register ICMP6 protocol\n");
1085 	return err;
1086 }
1087 
/*
 * Undo the registrations made by icmpv6_init(), in reverse order: first
 * the ICMPv6 sender, then the IPPROTO_ICMPV6 protocol handler.
 */
void icmpv6_cleanup(void)
{
	inet6_unregister_icmp_sender(icmp6_send);
	inet6_del_protocol(&icmpv6_protocol, IPPROTO_ICMPV6);
}
1093 
1094 
/*
 * Translation table used by icmpv6_err_convert() for ICMPV6_DEST_UNREACH,
 * indexed by the ICMPv6 code: @err is the errno value reported and @fatal
 * the value icmpv6_err_convert() returns for that code.
 */
static const struct icmp6_err {
	int err;
	int fatal;
} tab_unreach[] = {
	{	/* NOROUTE */
		.err	= ENETUNREACH,
		.fatal	= 0,
	},
	{	/* ADM_PROHIBITED */
		.err	= EACCES,
		.fatal	= 1,
	},
	{	/* Was NOT_NEIGHBOUR, now reserved */
		.err	= EHOSTUNREACH,
		.fatal	= 0,
	},
	{	/* ADDR_UNREACH	*/
		.err	= EHOSTUNREACH,
		.fatal	= 0,
	},
	{	/* PORT_UNREACH	*/
		.err	= ECONNREFUSED,
		.fatal	= 1,
	},
	{	/* POLICY_FAIL */
		.err	= EACCES,
		.fatal	= 1,
	},
	{	/* REJECT_ROUTE	*/
		.err	= EACCES,
		.fatal	= 1,
	},
};
1128 
1129 int icmpv6_err_convert(u8 type, u8 code, int *err)
1130 {
1131 	int fatal = 0;
1132 
1133 	*err = EPROTO;
1134 
1135 	switch (type) {
1136 	case ICMPV6_DEST_UNREACH:
1137 		fatal = 1;
1138 		if (code < ARRAY_SIZE(tab_unreach)) {
1139 			*err  = tab_unreach[code].err;
1140 			fatal = tab_unreach[code].fatal;
1141 		}
1142 		break;
1143 
1144 	case ICMPV6_PKT_TOOBIG:
1145 		*err = EMSGSIZE;
1146 		break;
1147 
1148 	case ICMPV6_PARAMPROB:
1149 		*err = EPROTO;
1150 		fatal = 1;
1151 		break;
1152 
1153 	case ICMPV6_TIME_EXCEED:
1154 		*err = EHOSTUNREACH;
1155 		break;
1156 	}
1157 
1158 	return fatal;
1159 }
1160 EXPORT_SYMBOL(icmpv6_err_convert);
1161 
1162 #ifdef CONFIG_SYSCTL
/*
 * Template for the per-namespace ICMPv6 sysctl table. The .data pointers
 * refer to init_net here; ipv6_icmp_sysctl_init() duplicates the table and
 * rewrites them by array index, so the order of the entries must stay in
 * sync with that function.
 */
static struct ctl_table ipv6_icmp_table_template[] = {
	{
		/* [0] */
		.procname	= "ratelimit",
		.data		= &init_net.ipv6.sysctl.icmpv6_time,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_ms_jiffies,
	},
	{
		/* [1] */
		.procname	= "echo_ignore_all",
		.data		= &init_net.ipv6.sysctl.icmpv6_echo_ignore_all,
		.maxlen		= sizeof(u8),
		.mode		= 0644,
		.proc_handler = proc_dou8vec_minmax,
	},
	{
		/* [2] */
		.procname	= "echo_ignore_multicast",
		.data		= &init_net.ipv6.sysctl.icmpv6_echo_ignore_multicast,
		.maxlen		= sizeof(u8),
		.mode		= 0644,
		.proc_handler = proc_dou8vec_minmax,
	},
	{
		/* [3] */
		.procname	= "echo_ignore_anycast",
		.data		= &init_net.ipv6.sysctl.icmpv6_echo_ignore_anycast,
		.maxlen		= sizeof(u8),
		.mode		= 0644,
		.proc_handler = proc_dou8vec_minmax,
	},
	{
		/* [4] bitmap with one bit per ICMPv6 type up to ICMPV6_MSG_MAX */
		.procname	= "ratemask",
		.data		= &init_net.ipv6.sysctl.icmpv6_ratemask_ptr,
		.maxlen		= ICMPV6_MSG_MAX + 1,
		.mode		= 0644,
		.proc_handler = proc_do_large_bitmap,
	},
	{
		/* [5] boolean, restricted to 0..1 */
		.procname	= "error_anycast_as_unicast",
		.data		= &init_net.ipv6.sysctl.icmpv6_error_anycast_as_unicast,
		.maxlen		= sizeof(u8),
		.mode		= 0644,
		.proc_handler	= proc_dou8vec_minmax,
		.extra1		= SYSCTL_ZERO,
		.extra2		= SYSCTL_ONE,
	},
	{ },
};
1210 
1211 struct ctl_table * __net_init ipv6_icmp_sysctl_init(struct net *net)
1212 {
1213 	struct ctl_table *table;
1214 
1215 	table = kmemdup(ipv6_icmp_table_template,
1216 			sizeof(ipv6_icmp_table_template),
1217 			GFP_KERNEL);
1218 
1219 	if (table) {
1220 		table[0].data = &net->ipv6.sysctl.icmpv6_time;
1221 		table[1].data = &net->ipv6.sysctl.icmpv6_echo_ignore_all;
1222 		table[2].data = &net->ipv6.sysctl.icmpv6_echo_ignore_multicast;
1223 		table[3].data = &net->ipv6.sysctl.icmpv6_echo_ignore_anycast;
1224 		table[4].data = &net->ipv6.sysctl.icmpv6_ratemask_ptr;
1225 		table[5].data = &net->ipv6.sysctl.icmpv6_error_anycast_as_unicast;
1226 	}
1227 	return table;
1228 }
1229 #endif
1230