xref: /openbmc/linux/net/ipv6/ip6_output.c (revision 97a532c3)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  *	IPv6 output functions
4  *	Linux INET6 implementation
5  *
6  *	Authors:
7  *	Pedro Roque		<roque@di.fc.ul.pt>
8  *
9  *	Based on linux/net/ipv4/ip_output.c
10  *
11  *	Changes:
12  *	A.N.Kuznetsov	:	arithmetic in fragmentation.
13  *				extension headers are implemented.
14  *				route changes now work.
15  *				ip6_forward does not confuse sniffers.
16  *				etc.
17  *
18  *      H. von Brand    :       Added missing #include <linux/string.h>
19  *	Imran Patel	:	frag id should be in NBO
20  *      Kazunori MIYAZAWA @USAGI
21  *			:       add ip6_append_data and related functions
22  *				for datagram xmit
23  */
24 
25 #include <linux/errno.h>
26 #include <linux/kernel.h>
27 #include <linux/string.h>
28 #include <linux/socket.h>
29 #include <linux/net.h>
30 #include <linux/netdevice.h>
31 #include <linux/if_arp.h>
32 #include <linux/in6.h>
33 #include <linux/tcp.h>
34 #include <linux/route.h>
35 #include <linux/module.h>
36 #include <linux/slab.h>
37 
38 #include <linux/bpf-cgroup.h>
39 #include <linux/netfilter.h>
40 #include <linux/netfilter_ipv6.h>
41 
42 #include <net/sock.h>
43 #include <net/snmp.h>
44 
45 #include <net/gso.h>
46 #include <net/ipv6.h>
47 #include <net/ndisc.h>
48 #include <net/protocol.h>
49 #include <net/ip6_route.h>
50 #include <net/addrconf.h>
51 #include <net/rawv6.h>
52 #include <net/icmp.h>
53 #include <net/xfrm.h>
54 #include <net/checksum.h>
55 #include <linux/mroute6.h>
56 #include <net/l3mdev.h>
57 #include <net/lwtunnel.h>
58 #include <net/ip_tunnels.h>
59 
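/* Final step of the IPv6 output path: loop back multicast copies where
 * required, honour lwtunnel redirects, resolve the neighbour entry for the
 * route's nexthop and hand the packet to the device layer via neigh_output().
 */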
60 static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
61 {
62 	struct dst_entry *dst = skb_dst(skb);
63 	struct net_device *dev = dst->dev;
64 	struct inet6_dev *idev = ip6_dst_idev(dst);
65 	unsigned int hh_len = LL_RESERVED_SPACE(dev);
66 	const struct in6_addr *daddr, *nexthop;
67 	struct ipv6hdr *hdr;
68 	struct neighbour *neigh;
69 	int ret;
70 
71 	/* Be paranoid, rather than too clever. */
72 	if (unlikely(hh_len > skb_headroom(skb)) && dev->header_ops) {
73 		skb = skb_expand_head(skb, hh_len);
74 		if (!skb) {
75 			IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
76 			return -ENOMEM;
77 		}
78 	}
79 
80 	hdr = ipv6_hdr(skb);
81 	daddr = &hdr->daddr;
82 	if (ipv6_addr_is_multicast(daddr)) {
83 		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
84 		    ((mroute6_is_socket(net, skb) &&
85 		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
86 		     ipv6_chk_mcast_addr(dev, daddr, &hdr->saddr))) {
87 			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
88 
89 			/* Do not check for IFF_ALLMULTI; multicast routing
90 			   is not supported in any case.
91 			 */
92 			if (newskb)
93 				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
94 					net, sk, newskb, NULL, newskb->dev,
95 					dev_loopback_xmit);
96 
97 			if (hdr->hop_limit == 0) {
98 				IP6_INC_STATS(net, idev,
99 					      IPSTATS_MIB_OUTDISCARDS);
100 				kfree_skb(skb);
101 				return 0;
102 			}
103 		}
104 
105 		IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);
106 		if (IPV6_ADDR_MC_SCOPE(daddr) <= IPV6_ADDR_SCOPE_NODELOCAL &&
107 		    !(dev->flags & IFF_LOOPBACK)) {
108 			kfree_skb(skb);
109 			return 0;
110 		}
111 	}
112 
113 	if (lwtunnel_xmit_redirect(dst->lwtstate)) {
114 		int res = lwtunnel_xmit(skb);
115 
116 		if (res != LWTUNNEL_XMIT_CONTINUE)
117 			return res;
118 	}
119 
120 	IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUT, skb->len);
121 
122 	rcu_read_lock();
123 	nexthop = rt6_nexthop((struct rt6_info *)dst, daddr);
124 	neigh = __ipv6_neigh_lookup_noref(dev, nexthop);
125 
126 	if (unlikely(IS_ERR_OR_NULL(neigh))) {
127 		if (unlikely(!neigh))
128 			neigh = __neigh_create(&nd_tbl, nexthop, dev, false);
129 		if (IS_ERR(neigh)) {
130 			rcu_read_unlock();
131 			IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTNOROUTES);
132 			kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_CREATEFAIL);
133 			return -EINVAL;
134 		}
135 	}
136 	sock_confirm_neigh(skb, neigh);
137 	ret = neigh_output(neigh, skb, false);
138 	rcu_read_unlock();
139 	return ret;
140 }
141 
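/* Software-segment a GSO packet whose segments would exceed the egress MTU,
 * then send each resulting segment, fragmenting any that are still too big.
 */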
142 static int
143 ip6_finish_output_gso_slowpath_drop(struct net *net, struct sock *sk,
144 				    struct sk_buff *skb, unsigned int mtu)
145 {
146 	struct sk_buff *segs, *nskb;
147 	netdev_features_t features;
148 	int ret = 0;
149 
150 	/* Please see corresponding comment in ip_finish_output_gso
151 	 * describing the cases where GSO segment length exceeds the
152 	 * egress MTU.
153 	 */
154 	features = netif_skb_features(skb);
155 	segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
156 	if (IS_ERR_OR_NULL(segs)) {
157 		kfree_skb(skb);
158 		return -ENOMEM;
159 	}
160 
161 	consume_skb(skb);
162 
163 	skb_list_walk_safe(segs, segs, nskb) {
164 		int err;
165 
166 		skb_mark_not_on_list(segs);
167 		/* Last GSO segment can be smaller than gso_size (and MTU).
168 		 * Adding a fragment header would produce an "atomic fragment",
169 		 * which is considered harmful (RFC-8021). Avoid that.
170 		 */
171 		err = segs->len > mtu ?
172 			ip6_fragment(net, sk, segs, ip6_finish_output2) :
173 			ip6_finish_output2(net, sk, segs);
174 		if (err && ret == 0)
175 			ret = err;
176 	}
177 
178 	return ret;
179 }
180 
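/* Post-routing finish handler: re-route packets that gained an XFRM policy
 * after SNAT, segment or fragment anything that does not fit the path MTU,
 * and pass the rest straight to ip6_finish_output2().
 */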
181 static int __ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
182 {
183 	unsigned int mtu;
184 
185 #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
186 	/* Policy lookup after SNAT yielded a new policy */
187 	if (skb_dst(skb)->xfrm) {
188 		IP6CB(skb)->flags |= IP6SKB_REROUTED;
189 		return dst_output(net, sk, skb);
190 	}
191 #endif
192 
193 	mtu = ip6_skb_dst_mtu(skb);
194 	if (skb_is_gso(skb) &&
195 	    !(IP6CB(skb)->flags & IP6SKB_FAKEJUMBO) &&
196 	    !skb_gso_validate_network_len(skb, mtu))
197 		return ip6_finish_output_gso_slowpath_drop(net, sk, skb, mtu);
198 
199 	if ((skb->len > mtu && !skb_is_gso(skb)) ||
200 	    dst_allfrag(skb_dst(skb)) ||
201 	    (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
202 		return ip6_fragment(net, sk, skb, ip6_finish_output2);
203 	else
204 		return ip6_finish_output2(net, sk, skb);
205 }
206 
207 static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
208 {
209 	int ret;
210 
211 	ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
212 	switch (ret) {
213 	case NET_XMIT_SUCCESS:
214 	case NET_XMIT_CN:
215 		return __ip6_finish_output(net, sk, skb) ? : ret;
216 	default:
217 		kfree_skb_reason(skb, SKB_DROP_REASON_BPF_CGROUP_EGRESS);
218 		return ret;
219 	}
220 }
221 
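/* dst_output() entry point for IPv6: drop the packet if IPv6 is disabled on
 * the egress device, otherwise run the NF_INET_POST_ROUTING hook (unless the
 * packet was already rerouted) and finish via ip6_finish_output().
 */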
222 int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
223 {
224 	struct net_device *dev = skb_dst(skb)->dev, *indev = skb->dev;
225 	struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
226 
227 	skb->protocol = htons(ETH_P_IPV6);
228 	skb->dev = dev;
229 
230 	if (unlikely(!idev || READ_ONCE(idev->cnf.disable_ipv6))) {
231 		IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
232 		kfree_skb_reason(skb, SKB_DROP_REASON_IPV6DISABLED);
233 		return 0;
234 	}
235 
236 	return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
237 			    net, sk, skb, indev, dev,
238 			    ip6_finish_output,
239 			    !(IP6CB(skb)->flags & IP6SKB_REROUTED));
240 }
241 EXPORT_SYMBOL(ip6_output);
242 
243 bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np)
244 {
245 	if (!np->autoflowlabel_set)
246 		return ip6_default_np_autolabel(net);
247 	else
248 		return np->autoflowlabel;
249 }
250 
251 /*
252  * xmit an sk_buff (used by TCP, SCTP and DCCP)
253  * Note: the socket lock is not held for SYNACK packets, but the skb might
254  * be modified by calls to skb_set_owner_w() and ipv6_local_error(),
255  * which use proper atomic operations or spinlocks.
256  */
257 int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
258 	     __u32 mark, struct ipv6_txoptions *opt, int tclass, u32 priority)
259 {
260 	struct net *net = sock_net(sk);
261 	const struct ipv6_pinfo *np = inet6_sk(sk);
262 	struct in6_addr *first_hop = &fl6->daddr;
263 	struct dst_entry *dst = skb_dst(skb);
264 	struct net_device *dev = dst->dev;
265 	struct inet6_dev *idev = ip6_dst_idev(dst);
266 	struct hop_jumbo_hdr *hop_jumbo;
267 	int hoplen = sizeof(*hop_jumbo);
268 	unsigned int head_room;
269 	struct ipv6hdr *hdr;
270 	u8  proto = fl6->flowi6_proto;
271 	int seg_len = skb->len;
272 	int hlimit = -1;
273 	u32 mtu;
274 
275 	head_room = sizeof(struct ipv6hdr) + hoplen + LL_RESERVED_SPACE(dev);
276 	if (opt)
277 		head_room += opt->opt_nflen + opt->opt_flen;
278 
279 	if (unlikely(head_room > skb_headroom(skb))) {
280 		skb = skb_expand_head(skb, head_room);
281 		if (!skb) {
282 			IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
283 			return -ENOBUFS;
284 		}
285 	}
286 
287 	if (opt) {
288 		seg_len += opt->opt_nflen + opt->opt_flen;
289 
290 		if (opt->opt_flen)
291 			ipv6_push_frag_opts(skb, opt, &proto);
292 
293 		if (opt->opt_nflen)
294 			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
295 					     &fl6->saddr);
296 	}
297 
298 	if (unlikely(seg_len > IPV6_MAXPLEN)) {
299 		hop_jumbo = skb_push(skb, hoplen);
300 
301 		hop_jumbo->nexthdr = proto;
302 		hop_jumbo->hdrlen = 0;
303 		hop_jumbo->tlv_type = IPV6_TLV_JUMBO;
304 		hop_jumbo->tlv_len = 4;
305 		hop_jumbo->jumbo_payload_len = htonl(seg_len + hoplen);
306 
307 		proto = IPPROTO_HOPOPTS;
308 		seg_len = 0;
309 		IP6CB(skb)->flags |= IP6SKB_FAKEJUMBO;
310 	}
311 
312 	skb_push(skb, sizeof(struct ipv6hdr));
313 	skb_reset_network_header(skb);
314 	hdr = ipv6_hdr(skb);
315 
316 	/*
317 	 *	Fill in the IPv6 header
318 	 */
319 	if (np)
320 		hlimit = np->hop_limit;
321 	if (hlimit < 0)
322 		hlimit = ip6_dst_hoplimit(dst);
323 
324 	ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
325 				ip6_autoflowlabel(net, np), fl6));
326 
327 	hdr->payload_len = htons(seg_len);
328 	hdr->nexthdr = proto;
329 	hdr->hop_limit = hlimit;
330 
331 	hdr->saddr = fl6->saddr;
332 	hdr->daddr = *first_hop;
333 
334 	skb->protocol = htons(ETH_P_IPV6);
335 	skb->priority = priority;
336 	skb->mark = mark;
337 
338 	mtu = dst_mtu(dst);
339 	if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
340 		IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTREQUESTS);
341 
342 		/* if egress device is enslaved to an L3 master device pass the
343 		 * skb to its handler for processing
344 		 */
345 		skb = l3mdev_ip6_out((struct sock *)sk, skb);
346 		if (unlikely(!skb))
347 			return 0;
348 
349 		/* Hooks should never assume the socket lock is held;
350 		 * we promote our socket to non-const.
351 		 */
352 		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
353 			       net, (struct sock *)sk, skb, NULL, dev,
354 			       dst_output);
355 	}
356 
357 	skb->dev = dev;
358 	/* ipv6_local_error() does not require the socket lock;
359 	 * we promote our socket to non-const.
360 	 */
361 	ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);
362 
363 	IP6_INC_STATS(net, idev, IPSTATS_MIB_FRAGFAILS);
364 	kfree_skb(skb);
365 	return -EMSGSIZE;
366 }
367 EXPORT_SYMBOL(ip6_xmit);
368 
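/* Deliver a Router Alert packet to every raw socket that registered for this
 * alert value via IPV6_ROUTER_ALERT; the last matching socket receives the
 * original skb, earlier matches get clones. Returns 1 if anyone took the packet.
 */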
369 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
370 {
371 	struct ip6_ra_chain *ra;
372 	struct sock *last = NULL;
373 
374 	read_lock(&ip6_ra_lock);
375 	for (ra = ip6_ra_chain; ra; ra = ra->next) {
376 		struct sock *sk = ra->sk;
377 		if (sk && ra->sel == sel &&
378 		    (!sk->sk_bound_dev_if ||
379 		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
380 			struct ipv6_pinfo *np = inet6_sk(sk);
381 
382 			if (np && np->rtalert_isolate &&
383 			    !net_eq(sock_net(sk), dev_net(skb->dev))) {
384 				continue;
385 			}
386 			if (last) {
387 				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
388 				if (skb2)
389 					rawv6_rcv(last, skb2);
390 			}
391 			last = sk;
392 		}
393 	}
394 
395 	if (last) {
396 		rawv6_rcv(last, skb);
397 		read_unlock(&ip6_ra_lock);
398 		return 1;
399 	}
400 	read_unlock(&ip6_ra_lock);
401 	return 0;
402 }
403 
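/* Decide what to do with a packet that matched a proxy neighbour entry:
 * return 1 for NDISC messages that must be handled locally, -1 for
 * link-local destinations that cannot be proxied, and 0 to keep forwarding.
 */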
404 static int ip6_forward_proxy_check(struct sk_buff *skb)
405 {
406 	struct ipv6hdr *hdr = ipv6_hdr(skb);
407 	u8 nexthdr = hdr->nexthdr;
408 	__be16 frag_off;
409 	int offset;
410 
411 	if (ipv6_ext_hdr(nexthdr)) {
412 		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
413 		if (offset < 0)
414 			return 0;
415 	} else
416 		offset = sizeof(struct ipv6hdr);
417 
418 	if (nexthdr == IPPROTO_ICMPV6) {
419 		struct icmp6hdr *icmp6;
420 
421 		if (!pskb_may_pull(skb, (skb_network_header(skb) +
422 					 offset + 1 - skb->data)))
423 			return 0;
424 
425 		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
426 
427 		switch (icmp6->icmp6_type) {
428 		case NDISC_ROUTER_SOLICITATION:
429 		case NDISC_ROUTER_ADVERTISEMENT:
430 		case NDISC_NEIGHBOUR_SOLICITATION:
431 		case NDISC_NEIGHBOUR_ADVERTISEMENT:
432 		case NDISC_REDIRECT:
433 			/* For unicast neighbour discovery messages
434 			 * destined to the proxied address, pass them to
435 			 * the input function.
436 			 */
437 			return 1;
438 		default:
439 			break;
440 		}
441 	}
442 
443 	/*
444 	 * The proxying router can't forward traffic sent to a link-local
445 	 * address, so signal the sender and discard the packet. This
446 	 * behavior is clarified by the MIPv6 specification.
447 	 */
448 	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
449 		dst_link_failure(skb);
450 		return -1;
451 	}
452 
453 	return 0;
454 }
455 
456 static inline int ip6_forward_finish(struct net *net, struct sock *sk,
457 				     struct sk_buff *skb)
458 {
459 	struct dst_entry *dst = skb_dst(skb);
460 
461 	__IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
462 
463 #ifdef CONFIG_NET_SWITCHDEV
464 	if (skb->offload_l3_fwd_mark) {
465 		consume_skb(skb);
466 		return 0;
467 	}
468 #endif
469 
470 	skb_clear_tstamp(skb);
471 	return dst_output(net, sk, skb);
472 }
473 
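/* Return true if a forwarded packet exceeds the path MTU, taking the
 * conntrack-defrag frag_max_size, skb->ignore_df and GSO segmentation
 * into account.
 */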
474 static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
475 {
476 	if (skb->len <= mtu)
477 		return false;
478 
479 	/* ipv6 conntrack defrag sets max_frag_size + ignore_df */
480 	if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
481 		return true;
482 
483 	if (skb->ignore_df)
484 		return false;
485 
486 	if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
487 		return false;
488 
489 	return true;
490 }
491 
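/* Main IPv6 forwarding path: validate the packet, handle Router Alert and
 * proxy NDP cases, enforce the hop limit and MTU, possibly emit redirects,
 * then decrement the hop limit and run the NF_INET_FORWARD hook.
 */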
492 int ip6_forward(struct sk_buff *skb)
493 {
494 	struct dst_entry *dst = skb_dst(skb);
495 	struct ipv6hdr *hdr = ipv6_hdr(skb);
496 	struct inet6_skb_parm *opt = IP6CB(skb);
497 	struct net *net = dev_net(dst->dev);
498 	struct inet6_dev *idev;
499 	SKB_DR(reason);
500 	u32 mtu;
501 
502 	idev = __in6_dev_get_safely(dev_get_by_index_rcu(net, IP6CB(skb)->iif));
503 	if (net->ipv6.devconf_all->forwarding == 0)
504 		goto error;
505 
506 	if (skb->pkt_type != PACKET_HOST)
507 		goto drop;
508 
509 	if (unlikely(skb->sk))
510 		goto drop;
511 
512 	if (skb_warn_if_lro(skb))
513 		goto drop;
514 
515 	if (!net->ipv6.devconf_all->disable_policy &&
516 	    (!idev || !idev->cnf.disable_policy) &&
517 	    !xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
518 		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
519 		goto drop;
520 	}
521 
522 	skb_forward_csum(skb);
523 
524 	/*
525 	 *	We DO NOT do any processing on
526 	 *	RA packets, pushing them to user level AS IS
527 	 *	without any warranty that the application will be able
528 	 *	to interpret them. The reason is that we
529 	 *	cannot do anything clever here.
530 	 *
531 	 *	We are not the end node, so if the packet contains
532 	 *	AH/ESP we cannot do anything with it.
533 	 *	Defragmentation would also be a mistake: RA packets
534 	 *	cannot be fragmented, because there is no guarantee
535 	 *	that different fragments will travel along one path. --ANK
536 	 */
537 	if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
538 		if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
539 			return 0;
540 	}
541 
542 	/*
543 	 *	check and decrement the hop limit
544 	 */
545 	if (hdr->hop_limit <= 1) {
546 		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
547 		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
548 
549 		kfree_skb_reason(skb, SKB_DROP_REASON_IP_INHDR);
550 		return -ETIMEDOUT;
551 	}
552 
553 	/* XXX: idev->cnf.proxy_ndp? */
554 	if (net->ipv6.devconf_all->proxy_ndp &&
555 	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
556 		int proxied = ip6_forward_proxy_check(skb);
557 		if (proxied > 0) {
558 			/* It's tempting to decrease the hop limit
559 			 * here by 1, as we do at the end of the
560 			 * function too.
561 			 *
562 			 * But that would be incorrect, as proxying is
563 			 * not forwarding.  The ip6_input function
564 			 * will handle this packet locally, and it
565 			 * depends on the hop limit being unchanged.
566 			 *
567 			 * One example is the NDP hop limit, which
568 			 * always has to stay 255; another would be
569 			 * similar checks around RA packets, where the
570 			 * user can even change the desired limit.
571 			 */
572 			return ip6_input(skb);
573 		} else if (proxied < 0) {
574 			__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
575 			goto drop;
576 		}
577 	}
578 
579 	if (!xfrm6_route_forward(skb)) {
580 		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
581 		SKB_DR_SET(reason, XFRM_POLICY);
582 		goto drop;
583 	}
584 	dst = skb_dst(skb);
585 
586 	/* IPv6 specs say nothing about it, but it is clear that we cannot
587 	   send redirects to source routed frames.
588 	   We don't send redirects to frames decapsulated from IPsec.
589 	 */
590 	if (IP6CB(skb)->iif == dst->dev->ifindex &&
591 	    opt->srcrt == 0 && !skb_sec_path(skb)) {
592 		struct in6_addr *target = NULL;
593 		struct inet_peer *peer;
594 		struct rt6_info *rt;
595 
596 		/*
597 		 *	incoming and outgoing devices are the same,
598 		 *	so send a redirect.
599 		 */
600 
601 		rt = (struct rt6_info *) dst;
602 		if (rt->rt6i_flags & RTF_GATEWAY)
603 			target = &rt->rt6i_gateway;
604 		else
605 			target = &hdr->daddr;
606 
607 		peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);
608 
609 		/* Limit redirects both by destination (here)
610 		   and by source (inside ndisc_send_redirect)
611 		 */
612 		if (inet_peer_xrlim_allow(peer, 1*HZ))
613 			ndisc_send_redirect(skb, target);
614 		if (peer)
615 			inet_putpeer(peer);
616 	} else {
617 		int addrtype = ipv6_addr_type(&hdr->saddr);
618 
619 		/* This check is security critical. */
620 		if (addrtype == IPV6_ADDR_ANY ||
621 		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
622 			goto error;
623 		if (addrtype & IPV6_ADDR_LINKLOCAL) {
624 			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
625 				    ICMPV6_NOT_NEIGHBOUR, 0);
626 			goto error;
627 		}
628 	}
629 
630 	mtu = ip6_dst_mtu_maybe_forward(dst, true);
631 	if (mtu < IPV6_MIN_MTU)
632 		mtu = IPV6_MIN_MTU;
633 
634 	if (ip6_pkt_too_big(skb, mtu)) {
635 		/* Again, force the OUTPUT device to be used for the source address */
636 		skb->dev = dst->dev;
637 		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
638 		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INTOOBIGERRORS);
639 		__IP6_INC_STATS(net, ip6_dst_idev(dst),
640 				IPSTATS_MIB_FRAGFAILS);
641 		kfree_skb_reason(skb, SKB_DROP_REASON_PKT_TOO_BIG);
642 		return -EMSGSIZE;
643 	}
644 
645 	if (skb_cow(skb, dst->dev->hard_header_len)) {
646 		__IP6_INC_STATS(net, ip6_dst_idev(dst),
647 				IPSTATS_MIB_OUTDISCARDS);
648 		goto drop;
649 	}
650 
651 	hdr = ipv6_hdr(skb);
652 
653 	/* Decrementing the hop limit is delayed until after the skb COW */
654 
655 	hdr->hop_limit--;
656 
657 	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
658 		       net, NULL, skb, skb->dev, dst->dev,
659 		       ip6_forward_finish);
660 
661 error:
662 	__IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
663 	SKB_DR_SET(reason, IP_INADDRERRORS);
664 drop:
665 	kfree_skb_reason(skb, reason);
666 	return -EINVAL;
667 }
668 
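/* Copy per-packet metadata (dst, device, mark, priority, netfilter and
 * security state, ...) from the original skb to a freshly built fragment.
 */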
669 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
670 {
671 	to->pkt_type = from->pkt_type;
672 	to->priority = from->priority;
673 	to->protocol = from->protocol;
674 	skb_dst_drop(to);
675 	skb_dst_set(to, dst_clone(skb_dst(from)));
676 	to->dev = from->dev;
677 	to->mark = from->mark;
678 
679 	skb_copy_hash(to, from);
680 
681 #ifdef CONFIG_NET_SCHED
682 	to->tc_index = from->tc_index;
683 #endif
684 	nf_copy(to, from);
685 	skb_ext_copy(to, from);
686 	skb_copy_secmark(to, from);
687 }
688 
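/* Fast-path fragmentation: reuse the skbs already hanging off the frag_list
 * as the individual fragments. This initialiser detaches the frag_list,
 * saves a copy of the network headers and turns @skb into the first fragment.
 */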
689 int ip6_fraglist_init(struct sk_buff *skb, unsigned int hlen, u8 *prevhdr,
690 		      u8 nexthdr, __be32 frag_id,
691 		      struct ip6_fraglist_iter *iter)
692 {
693 	unsigned int first_len;
694 	struct frag_hdr *fh;
695 
696 	/* BUILD HEADER */
697 	*prevhdr = NEXTHDR_FRAGMENT;
698 	iter->tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
699 	if (!iter->tmp_hdr)
700 		return -ENOMEM;
701 
702 	iter->frag = skb_shinfo(skb)->frag_list;
703 	skb_frag_list_init(skb);
704 
705 	iter->offset = 0;
706 	iter->hlen = hlen;
707 	iter->frag_id = frag_id;
708 	iter->nexthdr = nexthdr;
709 
710 	__skb_pull(skb, hlen);
711 	fh = __skb_push(skb, sizeof(struct frag_hdr));
712 	__skb_push(skb, hlen);
713 	skb_reset_network_header(skb);
714 	memcpy(skb_network_header(skb), iter->tmp_hdr, hlen);
715 
716 	fh->nexthdr = nexthdr;
717 	fh->reserved = 0;
718 	fh->frag_off = htons(IP6_MF);
719 	fh->identification = frag_id;
720 
721 	first_len = skb_pagelen(skb);
722 	skb->data_len = first_len - skb_headlen(skb);
723 	skb->len = first_len;
724 	ipv6_hdr(skb)->payload_len = htons(first_len - sizeof(struct ipv6hdr));
725 
726 	return 0;
727 }
728 EXPORT_SYMBOL(ip6_fraglist_init);
729 
730 void ip6_fraglist_prepare(struct sk_buff *skb,
731 			  struct ip6_fraglist_iter *iter)
732 {
733 	struct sk_buff *frag = iter->frag;
734 	unsigned int hlen = iter->hlen;
735 	struct frag_hdr *fh;
736 
737 	frag->ip_summed = CHECKSUM_NONE;
738 	skb_reset_transport_header(frag);
739 	fh = __skb_push(frag, sizeof(struct frag_hdr));
740 	__skb_push(frag, hlen);
741 	skb_reset_network_header(frag);
742 	memcpy(skb_network_header(frag), iter->tmp_hdr, hlen);
743 	iter->offset += skb->len - hlen - sizeof(struct frag_hdr);
744 	fh->nexthdr = iter->nexthdr;
745 	fh->reserved = 0;
746 	fh->frag_off = htons(iter->offset);
747 	if (frag->next)
748 		fh->frag_off |= htons(IP6_MF);
749 	fh->identification = iter->frag_id;
750 	ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
751 	ip6_copy_metadata(frag, skb);
752 }
753 EXPORT_SYMBOL(ip6_fraglist_prepare);
754 
755 void ip6_frag_init(struct sk_buff *skb, unsigned int hlen, unsigned int mtu,
756 		   unsigned short needed_tailroom, int hdr_room, u8 *prevhdr,
757 		   u8 nexthdr, __be32 frag_id, struct ip6_frag_state *state)
758 {
759 	state->prevhdr = prevhdr;
760 	state->nexthdr = nexthdr;
761 	state->frag_id = frag_id;
762 
763 	state->hlen = hlen;
764 	state->mtu = mtu;
765 
766 	state->left = skb->len - hlen;	/* Space per frame */
767 	state->ptr = hlen;		/* Where to start from */
768 
769 	state->hroom = hdr_room;
770 	state->troom = needed_tailroom;
771 
772 	state->offset = 0;
773 }
774 EXPORT_SYMBOL(ip6_frag_init);
775 
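/* Slow-path fragmentation: allocate the next fragment, copy the saved
 * headers plus up to an 8-byte-aligned chunk of payload into it, and fill
 * in the fragment header. Returns the new skb or an ERR_PTR() on failure.
 */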
776 struct sk_buff *ip6_frag_next(struct sk_buff *skb, struct ip6_frag_state *state)
777 {
778 	u8 *prevhdr = state->prevhdr, *fragnexthdr_offset;
779 	struct sk_buff *frag;
780 	struct frag_hdr *fh;
781 	unsigned int len;
782 
783 	len = state->left;
784 	/* IF: it doesn't fit, use 'mtu' - the data space left */
785 	if (len > state->mtu)
786 		len = state->mtu;
787 	/* IF: we are not sending up to and including the packet end
788 	   then align the next start on an eight byte boundary */
789 	if (len < state->left)
790 		len &= ~7;
791 
792 	/* Allocate buffer */
793 	frag = alloc_skb(len + state->hlen + sizeof(struct frag_hdr) +
794 			 state->hroom + state->troom, GFP_ATOMIC);
795 	if (!frag)
796 		return ERR_PTR(-ENOMEM);
797 
798 	/*
799 	 *	Set up data on packet
800 	 */
801 
802 	ip6_copy_metadata(frag, skb);
803 	skb_reserve(frag, state->hroom);
804 	skb_put(frag, len + state->hlen + sizeof(struct frag_hdr));
805 	skb_reset_network_header(frag);
806 	fh = (struct frag_hdr *)(skb_network_header(frag) + state->hlen);
807 	frag->transport_header = (frag->network_header + state->hlen +
808 				  sizeof(struct frag_hdr));
809 
810 	/*
811 	 *	Charge the memory for the fragment to any owner
812 	 *	it might possess
813 	 */
814 	if (skb->sk)
815 		skb_set_owner_w(frag, skb->sk);
816 
817 	/*
818 	 *	Copy the packet header into the new buffer.
819 	 */
820 	skb_copy_from_linear_data(skb, skb_network_header(frag), state->hlen);
821 
822 	fragnexthdr_offset = skb_network_header(frag);
823 	fragnexthdr_offset += prevhdr - skb_network_header(skb);
824 	*fragnexthdr_offset = NEXTHDR_FRAGMENT;
825 
826 	/*
827 	 *	Build fragment header.
828 	 */
829 	fh->nexthdr = state->nexthdr;
830 	fh->reserved = 0;
831 	fh->identification = state->frag_id;
832 
833 	/*
834 	 *	Copy a block of the IP datagram.
835 	 */
836 	BUG_ON(skb_copy_bits(skb, state->ptr, skb_transport_header(frag),
837 			     len));
838 	state->left -= len;
839 
840 	fh->frag_off = htons(state->offset);
841 	if (state->left > 0)
842 		fh->frag_off |= htons(IP6_MF);
843 	ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
844 
845 	state->ptr += len;
846 	state->offset += len;
847 
848 	return frag;
849 }
850 EXPORT_SYMBOL(ip6_frag_next);
851 
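/* Fragment an IPv6 packet and feed every fragment to @output. The fast path
 * reuses an existing frag_list when its geometry already matches the MTU;
 * otherwise each fragment is allocated and copied on the slow path.
 */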
852 int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
853 		 int (*output)(struct net *, struct sock *, struct sk_buff *))
854 {
855 	struct sk_buff *frag;
856 	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
857 	struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
858 				inet6_sk(skb->sk) : NULL;
859 	bool mono_delivery_time = skb->mono_delivery_time;
860 	struct ip6_frag_state state;
861 	unsigned int mtu, hlen, nexthdr_offset;
862 	ktime_t tstamp = skb->tstamp;
863 	int hroom, err = 0;
864 	__be32 frag_id;
865 	u8 *prevhdr, nexthdr = 0;
866 
867 	err = ip6_find_1stfragopt(skb, &prevhdr);
868 	if (err < 0)
869 		goto fail;
870 	hlen = err;
871 	nexthdr = *prevhdr;
872 	nexthdr_offset = prevhdr - skb_network_header(skb);
873 
874 	mtu = ip6_skb_dst_mtu(skb);
875 
876 	/* We must not fragment if the socket is set to force MTU discovery
877 	 * or if the skb is not generated by a local socket.
878 	 */
879 	if (unlikely(!skb->ignore_df && skb->len > mtu))
880 		goto fail_toobig;
881 
882 	if (IP6CB(skb)->frag_max_size) {
883 		if (IP6CB(skb)->frag_max_size > mtu)
884 			goto fail_toobig;
885 
886 		/* don't send fragments larger than what we received */
887 		mtu = IP6CB(skb)->frag_max_size;
888 		if (mtu < IPV6_MIN_MTU)
889 			mtu = IPV6_MIN_MTU;
890 	}
891 
892 	if (np && np->frag_size < mtu) {
893 		if (np->frag_size)
894 			mtu = np->frag_size;
895 	}
896 	if (mtu < hlen + sizeof(struct frag_hdr) + 8)
897 		goto fail_toobig;
898 	mtu -= hlen + sizeof(struct frag_hdr);
899 
900 	frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
901 				    &ipv6_hdr(skb)->saddr);
902 
903 	if (skb->ip_summed == CHECKSUM_PARTIAL &&
904 	    (err = skb_checksum_help(skb)))
905 		goto fail;
906 
907 	prevhdr = skb_network_header(skb) + nexthdr_offset;
908 	hroom = LL_RESERVED_SPACE(rt->dst.dev);
909 	if (skb_has_frag_list(skb)) {
910 		unsigned int first_len = skb_pagelen(skb);
911 		struct ip6_fraglist_iter iter;
912 		struct sk_buff *frag2;
913 
914 		if (first_len - hlen > mtu ||
915 		    ((first_len - hlen) & 7) ||
916 		    skb_cloned(skb) ||
917 		    skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
918 			goto slow_path;
919 
920 		skb_walk_frags(skb, frag) {
921 			/* Correct geometry. */
922 			if (frag->len > mtu ||
923 			    ((frag->len & 7) && frag->next) ||
924 			    skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
925 				goto slow_path_clean;
926 
927 			/* Partially cloned skb? */
928 			if (skb_shared(frag))
929 				goto slow_path_clean;
930 
931 			BUG_ON(frag->sk);
932 			if (skb->sk) {
933 				frag->sk = skb->sk;
934 				frag->destructor = sock_wfree;
935 			}
936 			skb->truesize -= frag->truesize;
937 		}
938 
939 		err = ip6_fraglist_init(skb, hlen, prevhdr, nexthdr, frag_id,
940 					&iter);
941 		if (err < 0)
942 			goto fail;
943 
944 		/* We prevent @rt from being freed. */
945 		rcu_read_lock();
946 
947 		for (;;) {
948 			/* Prepare the header of the next frame
949 			 * before the previous one goes down. */
950 			if (iter.frag)
951 				ip6_fraglist_prepare(skb, &iter);
952 
953 			skb_set_delivery_time(skb, tstamp, mono_delivery_time);
954 			err = output(net, sk, skb);
955 			if (!err)
956 				IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
957 					      IPSTATS_MIB_FRAGCREATES);
958 
959 			if (err || !iter.frag)
960 				break;
961 
962 			skb = ip6_fraglist_next(&iter);
963 		}
964 
965 		kfree(iter.tmp_hdr);
966 
967 		if (err == 0) {
968 			IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
969 				      IPSTATS_MIB_FRAGOKS);
970 			rcu_read_unlock();
971 			return 0;
972 		}
973 
974 		kfree_skb_list(iter.frag);
975 
976 		IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
977 			      IPSTATS_MIB_FRAGFAILS);
978 		rcu_read_unlock();
979 		return err;
980 
981 slow_path_clean:
982 		skb_walk_frags(skb, frag2) {
983 			if (frag2 == frag)
984 				break;
985 			frag2->sk = NULL;
986 			frag2->destructor = NULL;
987 			skb->truesize += frag2->truesize;
988 		}
989 	}
990 
991 slow_path:
992 	/*
993 	 *	Fragment the datagram.
994 	 */
995 
996 	ip6_frag_init(skb, hlen, mtu, rt->dst.dev->needed_tailroom,
997 		      LL_RESERVED_SPACE(rt->dst.dev), prevhdr, nexthdr, frag_id,
998 		      &state);
999 
1000 	/*
1001 	 *	Keep copying data until we run out.
1002 	 */
1003 
1004 	while (state.left > 0) {
1005 		frag = ip6_frag_next(skb, &state);
1006 		if (IS_ERR(frag)) {
1007 			err = PTR_ERR(frag);
1008 			goto fail;
1009 		}
1010 
1011 		/*
1012 		 *	Put this fragment into the sending queue.
1013 		 */
1014 		skb_set_delivery_time(frag, tstamp, mono_delivery_time);
1015 		err = output(net, sk, frag);
1016 		if (err)
1017 			goto fail;
1018 
1019 		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
1020 			      IPSTATS_MIB_FRAGCREATES);
1021 	}
1022 	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
1023 		      IPSTATS_MIB_FRAGOKS);
1024 	consume_skb(skb);
1025 	return err;
1026 
1027 fail_toobig:
1028 	if (skb->sk && dst_allfrag(skb_dst(skb)))
1029 		sk_gso_disable(skb->sk);
1030 
1031 	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
1032 	err = -EMSGSIZE;
1033 
1034 fail:
1035 	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
1036 		      IPSTATS_MIB_FRAGFAILS);
1037 	kfree_skb(skb);
1038 	return err;
1039 }
1040 
1041 static inline int ip6_rt_check(const struct rt6key *rt_key,
1042 			       const struct in6_addr *fl_addr,
1043 			       const struct in6_addr *addr_cache)
1044 {
1045 	return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
1046 		(!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
1047 }
1048 
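/* Validate a dst cached on the socket against the current flow; release it
 * and return NULL if the destination, source (with subtrees) or outgoing
 * interface no longer match.
 */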
1049 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
1050 					  struct dst_entry *dst,
1051 					  const struct flowi6 *fl6)
1052 {
1053 	struct ipv6_pinfo *np = inet6_sk(sk);
1054 	struct rt6_info *rt;
1055 
1056 	if (!dst)
1057 		goto out;
1058 
1059 	if (dst->ops->family != AF_INET6) {
1060 		dst_release(dst);
1061 		return NULL;
1062 	}
1063 
1064 	rt = (struct rt6_info *)dst;
1065 	/* Yes, checking route validity in the not-connected
1066 	 * case is not very simple. Take into account
1067 	 * that we do not support routing by source, TOS,
1068 	 * or MSG_DONTROUTE		--ANK (980726)
1069 	 *
1070 	 * 1. ip6_rt_check(): If the route was a host route,
1071 	 *    check that the cached destination is current.
1072 	 *    If it is a network route, we may still
1073 	 *    check its validity using a saved pointer
1074 	 *    to the last used address: daddr_cache.
1075 	 *    We do not want to save the whole address now
1076 	 *    (because the main consumer of this service
1077 	 *    is TCP, which does not have this problem),
1078 	 *    so this last trick works only on connected
1079 	 *    sockets.
1080 	 * 2. oif should also be the same.
1081 	 */
1082 	if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
1083 #ifdef CONFIG_IPV6_SUBTREES
1084 	    ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
1085 #endif
1086 	   (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) {
1087 		dst_release(dst);
1088 		dst = NULL;
1089 	}
1090 
1091 out:
1092 	return dst;
1093 }
1094 
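/* Core of the IPv6 route lookup: resolve the route, pick a source address
 * when the flow left it unspecified, and (with optimistic DAD) fall back to
 * the default router's dst if the nexthop neighbour is still unresolved.
 */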
1095 static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
1096 			       struct dst_entry **dst, struct flowi6 *fl6)
1097 {
1098 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
1099 	struct neighbour *n;
1100 	struct rt6_info *rt;
1101 #endif
1102 	int err;
1103 	int flags = 0;
1104 
1105 	/* The correct way to handle this would be to do
1106 	 * ip6_route_get_saddr, and then ip6_route_output; however,
1107 	 * the route-specific preferred source forces the
1108 	 * ip6_route_output call _before_ ip6_route_get_saddr.
1109 	 *
1110 	 * In source specific routing (no src=any default route),
1111 	 * ip6_route_output will fail given src=any saddr, though, so
1112 	 * that's why we try it again later.
1113 	 */
1114 	if (ipv6_addr_any(&fl6->saddr)) {
1115 		struct fib6_info *from;
1116 		struct rt6_info *rt;
1117 
1118 		*dst = ip6_route_output(net, sk, fl6);
1119 		rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;
1120 
1121 		rcu_read_lock();
1122 		from = rt ? rcu_dereference(rt->from) : NULL;
1123 		err = ip6_route_get_saddr(net, from, &fl6->daddr,
1124 					  sk ? inet6_sk(sk)->srcprefs : 0,
1125 					  fl6->flowi6_l3mdev,
1126 					  &fl6->saddr);
1127 		rcu_read_unlock();
1128 
1129 		if (err)
1130 			goto out_err_release;
1131 
1132 		/* If we had an erroneous initial result, pretend it
1133 		 * never existed and let the SA-enabled version take
1134 		 * over.
1135 		 */
1136 		if ((*dst)->error) {
1137 			dst_release(*dst);
1138 			*dst = NULL;
1139 		}
1140 
1141 		if (fl6->flowi6_oif)
1142 			flags |= RT6_LOOKUP_F_IFACE;
1143 	}
1144 
1145 	if (!*dst)
1146 		*dst = ip6_route_output_flags(net, sk, fl6, flags);
1147 
1148 	err = (*dst)->error;
1149 	if (err)
1150 		goto out_err_release;
1151 
1152 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
1153 	/*
1154 	 * If the dst entry we've looked up
1155 	 * has a neighbour entry that is in the INCOMPLETE
1156 	 * state and the src address from the flow is
1157 	 * marked as OPTIMISTIC, we release the found
1158 	 * dst entry and replace it with the
1159 	 * dst entry of the nexthop router.
1160 	 */
1161 	rt = (struct rt6_info *) *dst;
1162 	rcu_read_lock();
1163 	n = __ipv6_neigh_lookup_noref(rt->dst.dev,
1164 				      rt6_nexthop(rt, &fl6->daddr));
1165 	err = n && !(READ_ONCE(n->nud_state) & NUD_VALID) ? -EINVAL : 0;
1166 	rcu_read_unlock();
1167 
1168 	if (err) {
1169 		struct inet6_ifaddr *ifp;
1170 		struct flowi6 fl_gw6;
1171 		int redirect;
1172 
1173 		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
1174 				      (*dst)->dev, 1);
1175 
1176 		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
1177 		if (ifp)
1178 			in6_ifa_put(ifp);
1179 
1180 		if (redirect) {
1181 			/*
1182 			 * We need to get the dst entry for the
1183 			 * default router instead
1184 			 */
1185 			dst_release(*dst);
1186 			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
1187 			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
1188 			*dst = ip6_route_output(net, sk, &fl_gw6);
1189 			err = (*dst)->error;
1190 			if (err)
1191 				goto out_err_release;
1192 		}
1193 	}
1194 #endif
1195 	if (ipv6_addr_v4mapped(&fl6->saddr) &&
1196 	    !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
1197 		err = -EAFNOSUPPORT;
1198 		goto out_err_release;
1199 	}
1200 
1201 	return 0;
1202 
1203 out_err_release:
1204 	dst_release(*dst);
1205 	*dst = NULL;
1206 
1207 	if (err == -ENETUNREACH)
1208 		IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
1209 	return err;
1210 }
1211 
1212 /**
1213  *	ip6_dst_lookup - perform route lookup on flow
1214  *	@net: Network namespace to perform lookup in
1215  *	@sk: socket which provides route info
1216  *	@dst: pointer to dst_entry * for result
1217  *	@fl6: flow to lookup
1218  *
1219  *	This function performs a route lookup on the given flow.
1220  *
1221  *	It returns zero on success, or a standard errno code on error.
1222  */
1223 int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
1224 		   struct flowi6 *fl6)
1225 {
1226 	*dst = NULL;
1227 	return ip6_dst_lookup_tail(net, sk, dst, fl6);
1228 }
1229 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1230 
1231 /**
1232  *	ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1233  *	@net: Network namespace to perform lookup in
1234  *	@sk: socket which provides route info
1235  *	@fl6: flow to lookup
1236  *	@final_dst: final destination address for ipsec lookup
1237  *
1238  *	This function performs a route lookup on the given flow.
1239  *
1240  *	It returns a valid dst pointer on success, or a pointer encoded
1241  *	error code.
1242  */
1243 struct dst_entry *ip6_dst_lookup_flow(struct net *net, const struct sock *sk, struct flowi6 *fl6,
1244 				      const struct in6_addr *final_dst)
1245 {
1246 	struct dst_entry *dst = NULL;
1247 	int err;
1248 
1249 	err = ip6_dst_lookup_tail(net, sk, &dst, fl6);
1250 	if (err)
1251 		return ERR_PTR(err);
1252 	if (final_dst)
1253 		fl6->daddr = *final_dst;
1254 
1255 	return xfrm_lookup_route(net, dst, flowi6_to_flowi(fl6), sk, 0);
1256 }
1257 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
1258 
1259 /**
1260  *	ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1261  *	@sk: socket which provides the dst cache and route info
1262  *	@fl6: flow to lookup
1263  *	@final_dst: final destination address for ipsec lookup
1264  *	@connected: whether @sk is connected or not
1265  *
1266  *	This function performs a route lookup on the given flow with the
1267  *	possibility of using the cached route in the socket if it is valid.
1268  *	It will take the socket dst lock when operating on the dst cache.
1269  *	As a result, this function can only be used in process context.
1270  *
1271  *	In addition, for a connected socket, cache the dst in the socket
1272  *	if the current cache is not valid.
1273  *
1274  *	It returns a valid dst pointer on success, or a pointer encoded
1275  *	error code.
1276  */
1277 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1278 					 const struct in6_addr *final_dst,
1279 					 bool connected)
1280 {
1281 	struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1282 
1283 	dst = ip6_sk_dst_check(sk, dst, fl6);
1284 	if (dst)
1285 		return dst;
1286 
1287 	dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_dst);
1288 	if (connected && !IS_ERR(dst))
1289 		ip6_sk_dst_store_flow(sk, dst_clone(dst), fl6);
1290 
1291 	return dst;
1292 }
1293 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1294 
1295 /**
1296  *      ip6_dst_lookup_tunnel - perform route lookup on tunnel
1297  *      @skb: Packet for which lookup is done
1298  *      @dev: Tunnel device
1299  *      @net: Network namespace of tunnel device
1300  *      @sock: Socket which provides route info
1301  *      @saddr: Memory to store the src ip address
1302  *      @info: Tunnel information
1303  *      @protocol: IP protocol
1304  *      @use_cache: Flag to enable cache usage
1305  *      This function performs a route lookup on a tunnel
1306  *
1307  *      It returns a valid dst pointer and stores src address to be used in
1308  *      tunnel in param saddr on success, else a pointer encoded error code.
1309  */
1310 
1311 struct dst_entry *ip6_dst_lookup_tunnel(struct sk_buff *skb,
1312 					struct net_device *dev,
1313 					struct net *net,
1314 					struct socket *sock,
1315 					struct in6_addr *saddr,
1316 					const struct ip_tunnel_info *info,
1317 					u8 protocol,
1318 					bool use_cache)
1319 {
1320 	struct dst_entry *dst = NULL;
1321 #ifdef CONFIG_DST_CACHE
1322 	struct dst_cache *dst_cache;
1323 #endif
1324 	struct flowi6 fl6;
1325 	__u8 prio;
1326 
1327 #ifdef CONFIG_DST_CACHE
1328 	dst_cache = (struct dst_cache *)&info->dst_cache;
1329 	if (use_cache) {
1330 		dst = dst_cache_get_ip6(dst_cache, saddr);
1331 		if (dst)
1332 			return dst;
1333 	}
1334 #endif
1335 	memset(&fl6, 0, sizeof(fl6));
1336 	fl6.flowi6_mark = skb->mark;
1337 	fl6.flowi6_proto = protocol;
1338 	fl6.daddr = info->key.u.ipv6.dst;
1339 	fl6.saddr = info->key.u.ipv6.src;
1340 	prio = info->key.tos;
1341 	fl6.flowlabel = ip6_make_flowinfo(prio, info->key.label);
1342 
1343 	dst = ipv6_stub->ipv6_dst_lookup_flow(net, sock->sk, &fl6,
1344 					      NULL);
1345 	if (IS_ERR(dst)) {
1346 		netdev_dbg(dev, "no route to %pI6\n", &fl6.daddr);
1347 		return ERR_PTR(-ENETUNREACH);
1348 	}
1349 	if (dst->dev == dev) { /* is this necessary? */
1350 		netdev_dbg(dev, "circular route to %pI6\n", &fl6.daddr);
1351 		dst_release(dst);
1352 		return ERR_PTR(-ELOOP);
1353 	}
1354 #ifdef CONFIG_DST_CACHE
1355 	if (use_cache)
1356 		dst_cache_set_ip6(dst_cache, dst, &fl6.saddr);
1357 #endif
1358 	*saddr = fl6.saddr;
1359 	return dst;
1360 }
1361 EXPORT_SYMBOL_GPL(ip6_dst_lookup_tunnel);
1362 
1363 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1364 					       gfp_t gfp)
1365 {
1366 	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1367 }
1368 
1369 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1370 						gfp_t gfp)
1371 {
1372 	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1373 }
1374 
1375 static void ip6_append_data_mtu(unsigned int *mtu,
1376 				int *maxfraglen,
1377 				unsigned int fragheaderlen,
1378 				struct sk_buff *skb,
1379 				struct rt6_info *rt,
1380 				unsigned int orig_mtu)
1381 {
1382 	if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1383 		if (!skb) {
1384 			/* first fragment, reserve header_len */
1385 			*mtu = orig_mtu - rt->dst.header_len;
1386 
1387 		} else {
1388 			/*
1389 			 * this fragment is not the first; the header
1390 			 * space is regarded as data space.
1391 			 */
1392 			*mtu = orig_mtu;
1393 		}
1394 		*maxfraglen = ((*mtu - fragheaderlen) & ~7)
1395 			      + fragheaderlen - sizeof(struct frag_hdr);
1396 	}
1397 }
1398 
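/* Set up the cork for a corked send: take over the dst reference, duplicate
 * the tx options so they outlive the caller, and record the MTU, hop limit,
 * traffic class and other per-send parameters.
 */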
1399 static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
1400 			  struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
1401 			  struct rt6_info *rt)
1402 {
1403 	struct ipv6_pinfo *np = inet6_sk(sk);
1404 	unsigned int mtu;
1405 	struct ipv6_txoptions *nopt, *opt = ipc6->opt;
1406 
1407 	/* callers pass dst together with a reference, set it first so
1408 	 * ip6_cork_release() can put it down even in case of an error.
1409 	 */
1410 	cork->base.dst = &rt->dst;
1411 
1412 	/*
1413 	 * setup for corking
1414 	 */
1415 	if (opt) {
1416 		if (WARN_ON(v6_cork->opt))
1417 			return -EINVAL;
1418 
1419 		nopt = v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
1420 		if (unlikely(!nopt))
1421 			return -ENOBUFS;
1422 
1423 		nopt->tot_len = sizeof(*opt);
1424 		nopt->opt_flen = opt->opt_flen;
1425 		nopt->opt_nflen = opt->opt_nflen;
1426 
1427 		nopt->dst0opt = ip6_opt_dup(opt->dst0opt, sk->sk_allocation);
1428 		if (opt->dst0opt && !nopt->dst0opt)
1429 			return -ENOBUFS;
1430 
1431 		nopt->dst1opt = ip6_opt_dup(opt->dst1opt, sk->sk_allocation);
1432 		if (opt->dst1opt && !nopt->dst1opt)
1433 			return -ENOBUFS;
1434 
1435 		nopt->hopopt = ip6_opt_dup(opt->hopopt, sk->sk_allocation);
1436 		if (opt->hopopt && !nopt->hopopt)
1437 			return -ENOBUFS;
1438 
1439 		nopt->srcrt = ip6_rthdr_dup(opt->srcrt, sk->sk_allocation);
1440 		if (opt->srcrt && !nopt->srcrt)
1441 			return -ENOBUFS;
1442 
1443 		/* need source address above miyazawa*/
1444 	}
1445 	v6_cork->hop_limit = ipc6->hlimit;
1446 	v6_cork->tclass = ipc6->tclass;
1447 	if (rt->dst.flags & DST_XFRM_TUNNEL)
1448 		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1449 		      READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst);
1450 	else
1451 		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1452 			READ_ONCE(rt->dst.dev->mtu) : dst_mtu(xfrm_dst_path(&rt->dst));
1453 	if (np->frag_size < mtu) {
1454 		if (np->frag_size)
1455 			mtu = np->frag_size;
1456 	}
1457 	cork->base.fragsize = mtu;
1458 	cork->base.gso_size = ipc6->gso_size;
1459 	cork->base.tx_flags = 0;
1460 	cork->base.mark = ipc6->sockc.mark;
1461 	sock_tx_timestamp(sk, ipc6->sockc.tsflags, &cork->base.tx_flags);
1462 
1463 	if (dst_allfrag(xfrm_dst_path(&rt->dst)))
1464 		cork->base.flags |= IPCORK_ALLFRAG;
1465 	cork->base.length = 0;
1466 
1467 	cork->base.transmit_time = ipc6->sockc.transmit_time;
1468 
1469 	return 0;
1470 }
1471 
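/* Workhorse behind ip6_append_data()/ip6_make_skb(): append @length bytes
 * from @from to the queue, growing the tail skb or allocating new ones so
 * that each pending packet still fits the MTU (or GSO size), with optional
 * zerocopy and MSG_SPLICE_PAGES handling.
 */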
1472 static int __ip6_append_data(struct sock *sk,
1473 			     struct sk_buff_head *queue,
1474 			     struct inet_cork_full *cork_full,
1475 			     struct inet6_cork *v6_cork,
1476 			     struct page_frag *pfrag,
1477 			     int getfrag(void *from, char *to, int offset,
1478 					 int len, int odd, struct sk_buff *skb),
1479 			     void *from, size_t length, int transhdrlen,
1480 			     unsigned int flags, struct ipcm6_cookie *ipc6)
1481 {
1482 	struct sk_buff *skb, *skb_prev = NULL;
1483 	struct inet_cork *cork = &cork_full->base;
1484 	struct flowi6 *fl6 = &cork_full->fl.u.ip6;
1485 	unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
1486 	struct ubuf_info *uarg = NULL;
1487 	int exthdrlen = 0;
1488 	int dst_exthdrlen = 0;
1489 	int hh_len;
1490 	int copy;
1491 	int err;
1492 	int offset = 0;
1493 	bool zc = false;
1494 	u32 tskey = 0;
1495 	struct rt6_info *rt = (struct rt6_info *)cork->dst;
1496 	struct ipv6_txoptions *opt = v6_cork->opt;
1497 	int csummode = CHECKSUM_NONE;
1498 	unsigned int maxnonfragsize, headersize;
1499 	unsigned int wmem_alloc_delta = 0;
1500 	bool paged, extra_uref = false;
1501 
1502 	skb = skb_peek_tail(queue);
1503 	if (!skb) {
1504 		exthdrlen = opt ? opt->opt_flen : 0;
1505 		dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
1506 	}
1507 
1508 	paged = !!cork->gso_size;
1509 	mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize;
1510 	orig_mtu = mtu;
1511 
1512 	if (cork->tx_flags & SKBTX_ANY_TSTAMP &&
1513 	    READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_OPT_ID)
1514 		tskey = atomic_inc_return(&sk->sk_tskey) - 1;
1515 
1516 	hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1517 
1518 	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1519 			(opt ? opt->opt_nflen : 0);
1520 
1521 	headersize = sizeof(struct ipv6hdr) +
1522 		     (opt ? opt->opt_flen + opt->opt_nflen : 0) +
1523 		     (dst_allfrag(&rt->dst) ?
1524 		      sizeof(struct frag_hdr) : 0) +
1525 		     rt->rt6i_nfheader_len;
1526 
1527 	if (mtu <= fragheaderlen ||
1528 	    ((mtu - fragheaderlen) & ~7) + fragheaderlen <= sizeof(struct frag_hdr))
1529 		goto emsgsize;
1530 
1531 	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
1532 		     sizeof(struct frag_hdr);
1533 
1534 	/* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit
1535 	 * in the first fragment
1536 	 */
1537 	if (headersize + transhdrlen > mtu)
1538 		goto emsgsize;
1539 
1540 	if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
1541 	    (sk->sk_protocol == IPPROTO_UDP ||
1542 	     sk->sk_protocol == IPPROTO_ICMPV6 ||
1543 	     sk->sk_protocol == IPPROTO_RAW)) {
1544 		ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
1545 				sizeof(struct ipv6hdr));
1546 		goto emsgsize;
1547 	}
1548 
1549 	if (ip6_sk_ignore_df(sk))
1550 		maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
1551 	else
1552 		maxnonfragsize = mtu;
1553 
1554 	if (cork->length + length > maxnonfragsize - headersize) {
1555 emsgsize:
1556 		pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0);
1557 		ipv6_local_error(sk, EMSGSIZE, fl6, pmtu);
1558 		return -EMSGSIZE;
1559 	}
1560 
1561 	/* CHECKSUM_PARTIAL only with no extension headers and when
1562 	 * we are not going to fragment
1563 	 */
1564 	if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
1565 	    headersize == sizeof(struct ipv6hdr) &&
1566 	    length <= mtu - headersize &&
1567 	    (!(flags & MSG_MORE) || cork->gso_size) &&
1568 	    rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
1569 		csummode = CHECKSUM_PARTIAL;
1570 
1571 	if ((flags & MSG_ZEROCOPY) && length) {
1572 		struct msghdr *msg = from;
1573 
1574 		if (getfrag == ip_generic_getfrag && msg->msg_ubuf) {
1575 			if (skb_zcopy(skb) && msg->msg_ubuf != skb_zcopy(skb))
1576 				return -EINVAL;
1577 
1578 			/* Leave uarg NULL if we can't do zerocopy; callers should
1579 			 * be able to handle it.
1580 			 */
1581 			if ((rt->dst.dev->features & NETIF_F_SG) &&
1582 			    csummode == CHECKSUM_PARTIAL) {
1583 				paged = true;
1584 				zc = true;
1585 				uarg = msg->msg_ubuf;
1586 			}
1587 		} else if (sock_flag(sk, SOCK_ZEROCOPY)) {
1588 			uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb));
1589 			if (!uarg)
1590 				return -ENOBUFS;
1591 			extra_uref = !skb_zcopy(skb);	/* only ref on new uarg */
1592 			if (rt->dst.dev->features & NETIF_F_SG &&
1593 			    csummode == CHECKSUM_PARTIAL) {
1594 				paged = true;
1595 				zc = true;
1596 			} else {
1597 				uarg_to_msgzc(uarg)->zerocopy = 0;
1598 				skb_zcopy_set(skb, uarg, &extra_uref);
1599 			}
1600 		}
1601 	} else if ((flags & MSG_SPLICE_PAGES) && length) {
1602 		if (inet_test_bit(HDRINCL, sk))
1603 			return -EPERM;
1604 		if (rt->dst.dev->features & NETIF_F_SG &&
1605 		    getfrag == ip_generic_getfrag)
1606 			/* We need an empty buffer to attach stuff to */
1607 			paged = true;
1608 		else
1609 			flags &= ~MSG_SPLICE_PAGES;
1610 	}
1611 
1612 	/*
1613 	 * Let's try using as much space as possible.
1614 	 * Use MTU if total length of the message fits into the MTU.
1615 	 * Otherwise, we need to reserve fragment header and
1616 	 * fragment alignment (= 8-15 octets, in total).
1617 	 *
1618 	 * Note that we may need to "move" the data from the tail
1619 	 * of the buffer to the new fragment when we split
1620 	 * the message.
1621 	 *
1622 	 * FIXME: It may be fragmented into multiple chunks
1623 	 *        at once if non-fragmentable extension headers
1624 	 *        are too large.
1625 	 * --yoshfuji
1626 	 */
1627 
1628 	cork->length += length;
1629 	if (!skb)
1630 		goto alloc_new_skb;
1631 
1632 	while (length > 0) {
1633 		/* Check if the remaining data fits into current packet. */
1634 		copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1635 		if (copy < length)
1636 			copy = maxfraglen - skb->len;
1637 
1638 		if (copy <= 0) {
1639 			char *data;
1640 			unsigned int datalen;
1641 			unsigned int fraglen;
1642 			unsigned int fraggap;
1643 			unsigned int alloclen, alloc_extra;
1644 			unsigned int pagedlen;
1645 alloc_new_skb:
1646 			/* There's no room in the current skb */
1647 			if (skb)
1648 				fraggap = skb->len - maxfraglen;
1649 			else
1650 				fraggap = 0;
1651 			/* update mtu and maxfraglen if necessary */
1652 			if (!skb || !skb_prev)
1653 				ip6_append_data_mtu(&mtu, &maxfraglen,
1654 						    fragheaderlen, skb, rt,
1655 						    orig_mtu);
1656 
1657 			skb_prev = skb;
1658 
1659 			/*
1660 			 * If remaining data exceeds the mtu,
1661 			 * we know we need more fragment(s).
1662 			 */
1663 			datalen = length + fraggap;
1664 
1665 			if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1666 				datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
1667 			fraglen = datalen + fragheaderlen;
1668 			pagedlen = 0;
1669 
1670 			alloc_extra = hh_len;
1671 			alloc_extra += dst_exthdrlen;
1672 			alloc_extra += rt->dst.trailer_len;
1673 
1674 			/* We just reserve space for fragment header.
1675 			 * Note: this may be overallocation if the message
1676 			 * (without MSG_MORE) fits into the MTU.
1677 			 */
1678 			alloc_extra += sizeof(struct frag_hdr);
1679 
1680 			if ((flags & MSG_MORE) &&
1681 			    !(rt->dst.dev->features&NETIF_F_SG))
1682 				alloclen = mtu;
1683 			else if (!paged &&
1684 				 (fraglen + alloc_extra < SKB_MAX_ALLOC ||
1685 				  !(rt->dst.dev->features & NETIF_F_SG)))
1686 				alloclen = fraglen;
1687 			else {
1688 				alloclen = fragheaderlen + transhdrlen;
1689 				pagedlen = datalen - transhdrlen;
1690 			}
1691 			alloclen += alloc_extra;
1692 
1693 			if (datalen != length + fraggap) {
1694 				/*
1695 				 * this is not the last fragment; the trailer
1696 				 * space is regarded as data space.
1697 				 */
1698 				datalen += rt->dst.trailer_len;
1699 			}
1700 
1701 			fraglen = datalen + fragheaderlen;
1702 
1703 			copy = datalen - transhdrlen - fraggap - pagedlen;
1704 			/* [!] NOTE: copy may be negative if pagedlen>0
1705 			 * because then the equation may reduce to -fraggap.
1706 			 */
1707 			if (copy < 0 && !(flags & MSG_SPLICE_PAGES)) {
1708 				err = -EINVAL;
1709 				goto error;
1710 			}
1711 			if (transhdrlen) {
1712 				skb = sock_alloc_send_skb(sk, alloclen,
1713 						(flags & MSG_DONTWAIT), &err);
1714 			} else {
1715 				skb = NULL;
1716 				if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <=
1717 				    2 * sk->sk_sndbuf)
1718 					skb = alloc_skb(alloclen,
1719 							sk->sk_allocation);
1720 				if (unlikely(!skb))
1721 					err = -ENOBUFS;
1722 			}
1723 			if (!skb)
1724 				goto error;
1725 			/*
1726 			 *	Fill in the control structures
1727 			 */
1728 			skb->protocol = htons(ETH_P_IPV6);
1729 			skb->ip_summed = csummode;
1730 			skb->csum = 0;
1731 			/* reserve for fragmentation and ipsec header */
1732 			skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1733 				    dst_exthdrlen);
1734 
1735 			/*
1736 			 *	Find where to start putting bytes
1737 			 */
1738 			data = skb_put(skb, fraglen - pagedlen);
1739 			skb_set_network_header(skb, exthdrlen);
1740 			data += fragheaderlen;
1741 			skb->transport_header = (skb->network_header +
1742 						 fragheaderlen);
1743 			if (fraggap) {
1744 				skb->csum = skb_copy_and_csum_bits(
1745 					skb_prev, maxfraglen,
1746 					data + transhdrlen, fraggap);
1747 				skb_prev->csum = csum_sub(skb_prev->csum,
1748 							  skb->csum);
1749 				data += fraggap;
1750 				pskb_trim_unique(skb_prev, maxfraglen);
1751 			}
1752 			if (copy > 0 &&
1753 			    getfrag(from, data + transhdrlen, offset,
1754 				    copy, fraggap, skb) < 0) {
1755 				err = -EFAULT;
1756 				kfree_skb(skb);
1757 				goto error;
1758 			} else if (flags & MSG_SPLICE_PAGES) {
1759 				copy = 0;
1760 			}
1761 
1762 			offset += copy;
1763 			length -= copy + transhdrlen;
1764 			transhdrlen = 0;
1765 			exthdrlen = 0;
1766 			dst_exthdrlen = 0;
1767 
1768 			/* Only the initial fragment is time stamped */
1769 			skb_shinfo(skb)->tx_flags = cork->tx_flags;
1770 			cork->tx_flags = 0;
1771 			skb_shinfo(skb)->tskey = tskey;
1772 			tskey = 0;
1773 			skb_zcopy_set(skb, uarg, &extra_uref);
1774 
1775 			if ((flags & MSG_CONFIRM) && !skb_prev)
1776 				skb_set_dst_pending_confirm(skb, 1);
1777 
1778 			/*
1779 			 * Put the packet on the pending queue
1780 			 */
1781 			if (!skb->destructor) {
1782 				skb->destructor = sock_wfree;
1783 				skb->sk = sk;
1784 				wmem_alloc_delta += skb->truesize;
1785 			}
1786 			__skb_queue_tail(queue, skb);
1787 			continue;
1788 		}
1789 
1790 		if (copy > length)
1791 			copy = length;
1792 
1793 		if (!(rt->dst.dev->features&NETIF_F_SG) &&
1794 		    skb_tailroom(skb) >= copy) {
1795 			unsigned int off;
1796 
1797 			off = skb->len;
1798 			if (getfrag(from, skb_put(skb, copy),
1799 						offset, copy, off, skb) < 0) {
1800 				__skb_trim(skb, off);
1801 				err = -EFAULT;
1802 				goto error;
1803 			}
1804 		} else if (flags & MSG_SPLICE_PAGES) {
1805 			struct msghdr *msg = from;
1806 
1807 			err = -EIO;
1808 			if (WARN_ON_ONCE(copy > msg->msg_iter.count))
1809 				goto error;
1810 
1811 			err = skb_splice_from_iter(skb, &msg->msg_iter, copy,
1812 						   sk->sk_allocation);
1813 			if (err < 0)
1814 				goto error;
1815 			copy = err;
1816 			wmem_alloc_delta += copy;
1817 		} else if (!zc) {
1818 			int i = skb_shinfo(skb)->nr_frags;
1819 
1820 			err = -ENOMEM;
1821 			if (!sk_page_frag_refill(sk, pfrag))
1822 				goto error;
1823 
1824 			skb_zcopy_downgrade_managed(skb);
1825 			if (!skb_can_coalesce(skb, i, pfrag->page,
1826 					      pfrag->offset)) {
1827 				err = -EMSGSIZE;
1828 				if (i == MAX_SKB_FRAGS)
1829 					goto error;
1830 
1831 				__skb_fill_page_desc(skb, i, pfrag->page,
1832 						     pfrag->offset, 0);
1833 				skb_shinfo(skb)->nr_frags = ++i;
1834 				get_page(pfrag->page);
1835 			}
1836 			copy = min_t(int, copy, pfrag->size - pfrag->offset);
1837 			if (getfrag(from,
1838 				    page_address(pfrag->page) + pfrag->offset,
1839 				    offset, copy, skb->len, skb) < 0)
1840 				goto error_efault;
1841 
1842 			pfrag->offset += copy;
1843 			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1844 			skb->len += copy;
1845 			skb->data_len += copy;
1846 			skb->truesize += copy;
1847 			wmem_alloc_delta += copy;
1848 		} else {
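			/* MSG_ZEROCOPY: reference the user pages directly from
			 * the skb frags via skb_zerocopy_iter_dgram().
			 */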
1849 			err = skb_zerocopy_iter_dgram(skb, from, copy);
1850 			if (err < 0)
1851 				goto error;
1852 		}
1853 		offset += copy;
1854 		length -= copy;
1855 	}
1856 
1857 	if (wmem_alloc_delta)
1858 		refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1859 	return 0;
1860 
1861 error_efault:
1862 	err = -EFAULT;
1863 error:
1864 	net_zcopy_put_abort(uarg, extra_uref);
1865 	cork->length -= length;
1866 	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1867 	refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1868 	return err;
1869 }
1870 
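/*
 *	ip6_append_data() - queue data for a corked IPv6 packet.
 *
 *	Data is appended to the socket's pending queue (sk->sk_write_queue);
 *	the packet itself is only built and sent by a later call to
 *	ip6_push_pending_frames().  A typical datagram caller looks roughly
 *	like this (sketch only; details vary by protocol):
 *
 *		err = ip6_append_data(sk, getfrag, msg, len, transhdrlen,
 *				      &ipc6, &fl6, rt, msg->msg_flags);
 *		if (err)
 *			ip6_flush_pending_frames(sk);
 *		else if (!(msg->msg_flags & MSG_MORE))
 *			err = ip6_push_pending_frames(sk);
 */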
1871 int ip6_append_data(struct sock *sk,
1872 		    int getfrag(void *from, char *to, int offset, int len,
1873 				int odd, struct sk_buff *skb),
1874 		    void *from, size_t length, int transhdrlen,
1875 		    struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1876 		    struct rt6_info *rt, unsigned int flags)
1877 {
1878 	struct inet_sock *inet = inet_sk(sk);
1879 	struct ipv6_pinfo *np = inet6_sk(sk);
1880 	int exthdrlen;
1881 	int err;
1882 
1883 	if (flags & MSG_PROBE)
1884 		return 0;
1885 	if (skb_queue_empty(&sk->sk_write_queue)) {
1886 		/*
1887 		 * Set up the cork for this first write.
1888 		 */
1889 		dst_hold(&rt->dst);
1890 		err = ip6_setup_cork(sk, &inet->cork, &np->cork,
1891 				     ipc6, rt);
1892 		if (err)
1893 			return err;
1894 
1895 		inet->cork.fl.u.ip6 = *fl6;
1896 		exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1897 		length += exthdrlen;
1898 		transhdrlen += exthdrlen;
1899 	} else {
1900 		transhdrlen = 0;
1901 	}
1902 
1903 	return __ip6_append_data(sk, &sk->sk_write_queue, &inet->cork,
1904 				 &np->cork, sk_page_frag(sk), getfrag,
1905 				 from, length, transhdrlen, flags, ipc6);
1906 }
1907 EXPORT_SYMBOL_GPL(ip6_append_data);
1908 
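/* Hand the cork's cached dst to the skb without taking an extra reference
 * and clear the per-cork ALLFRAG state.
 */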
1909 static void ip6_cork_steal_dst(struct sk_buff *skb, struct inet_cork_full *cork)
1910 {
1911 	struct dst_entry *dst = cork->base.dst;
1912 
1913 	cork->base.dst = NULL;
1914 	cork->base.flags &= ~IPCORK_ALLFRAG;
1915 	skb_dst_set(skb, dst);
1916 }
1917 
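/* Free the tx options copied into the cork and drop its route reference. */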
1918 static void ip6_cork_release(struct inet_cork_full *cork,
1919 			     struct inet6_cork *v6_cork)
1920 {
1921 	if (v6_cork->opt) {
1922 		struct ipv6_txoptions *opt = v6_cork->opt;
1923 
1924 		kfree(opt->dst0opt);
1925 		kfree(opt->dst1opt);
1926 		kfree(opt->hopopt);
1927 		kfree(opt->srcrt);
1928 		kfree(opt);
1929 		v6_cork->opt = NULL;
1930 	}
1931 
1932 	if (cork->base.dst) {
1933 		dst_release(cork->base.dst);
1934 		cork->base.dst = NULL;
1935 		cork->base.flags &= ~IPCORK_ALLFRAG;
1936 	}
1937 }
1938 
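/*
 *	__ip6_make_skb() - turn the queued fragments into one packet.
 *
 *	Dequeues the pending skbs, chains the tail skbs onto the first one's
 *	frag_list, pushes the extension headers and the IPv6 header, fills in
 *	the flow label, hop limit and addresses, and returns an skb ready for
 *	ip6_send_skb().  The cork is released before returning.
 */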
1939 struct sk_buff *__ip6_make_skb(struct sock *sk,
1940 			       struct sk_buff_head *queue,
1941 			       struct inet_cork_full *cork,
1942 			       struct inet6_cork *v6_cork)
1943 {
1944 	struct sk_buff *skb, *tmp_skb;
1945 	struct sk_buff **tail_skb;
1946 	struct in6_addr *final_dst;
1947 	struct ipv6_pinfo *np = inet6_sk(sk);
1948 	struct net *net = sock_net(sk);
1949 	struct ipv6hdr *hdr;
1950 	struct ipv6_txoptions *opt = v6_cork->opt;
1951 	struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
1952 	struct flowi6 *fl6 = &cork->fl.u.ip6;
1953 	unsigned char proto = fl6->flowi6_proto;
1954 
1955 	skb = __skb_dequeue(queue);
1956 	if (!skb)
1957 		goto out;
1958 	tail_skb = &(skb_shinfo(skb)->frag_list);
1959 
1960 	/* Move skb->data up to the network header, past any reserved extension header space. */
1961 	if (skb->data < skb_network_header(skb))
1962 		__skb_pull(skb, skb_network_offset(skb));
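	/* Chain the remaining queued skbs onto the first skb's frag_list,
	 * stripping their network headers and adding their sizes to the
	 * head skb.
	 */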
1963 	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
1964 		__skb_pull(tmp_skb, skb_network_header_len(skb));
1965 		*tail_skb = tmp_skb;
1966 		tail_skb = &(tmp_skb->next);
1967 		skb->len += tmp_skb->len;
1968 		skb->data_len += tmp_skb->len;
1969 		skb->truesize += tmp_skb->truesize;
1970 		tmp_skb->destructor = NULL;
1971 		tmp_skb->sk = NULL;
1972 	}
1973 
1974 	/* Allow local fragmentation. */
1975 	skb->ignore_df = ip6_sk_ignore_df(sk);
1976 	__skb_pull(skb, skb_network_header_len(skb));
1977 
1978 	final_dst = &fl6->daddr;
1979 	if (opt && opt->opt_flen)
1980 		ipv6_push_frag_opts(skb, opt, &proto);
1981 	if (opt && opt->opt_nflen)
1982 		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);
1983 
1984 	skb_push(skb, sizeof(struct ipv6hdr));
1985 	skb_reset_network_header(skb);
1986 	hdr = ipv6_hdr(skb);
1987 
1988 	ip6_flow_hdr(hdr, v6_cork->tclass,
1989 		     ip6_make_flowlabel(net, skb, fl6->flowlabel,
1990 					ip6_autoflowlabel(net, np), fl6));
1991 	hdr->hop_limit = v6_cork->hop_limit;
1992 	hdr->nexthdr = proto;
1993 	hdr->saddr = fl6->saddr;
1994 	hdr->daddr = *final_dst;
1995 
1996 	skb->priority = sk->sk_priority;
1997 	skb->mark = cork->base.mark;
1998 	skb->tstamp = cork->base.transmit_time;
1999 
2000 	ip6_cork_steal_dst(skb, cork);
2001 	IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTREQUESTS);
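	/* Account outgoing ICMPv6 messages per type.  Raw sockets without a
	 * known nexthop carry the type in the flow info; otherwise it is read
	 * from the ICMPv6 header in the packet.
	 */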
2002 	if (proto == IPPROTO_ICMPV6) {
2003 		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
2004 		u8 icmp6_type;
2005 
2006 		if (sk->sk_socket->type == SOCK_RAW &&
2007 		   !(fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH))
2008 			icmp6_type = fl6->fl6_icmp_type;
2009 		else
2010 			icmp6_type = icmp6_hdr(skb)->icmp6_type;
2011 		ICMP6MSGOUT_INC_STATS(net, idev, icmp6_type);
2012 		ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
2013 	}
2014 
2015 	ip6_cork_release(cork, v6_cork);
2016 out:
2017 	return skb;
2018 }
2019 
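/*
 *	Transmit a packet built by __ip6_make_skb() via ip6_local_out(),
 *	bumping OUTDISCARDS if the transmit fails.
 */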
2020 int ip6_send_skb(struct sk_buff *skb)
2021 {
2022 	struct net *net = sock_net(skb->sk);
2023 	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
2024 	int err;
2025 
2026 	err = ip6_local_out(net, skb->sk, skb);
2027 	if (err) {
2028 		if (err > 0)
2029 			err = net_xmit_errno(err);
2030 		if (err)
2031 			IP6_INC_STATS(net, rt->rt6i_idev,
2032 				      IPSTATS_MIB_OUTDISCARDS);
2033 	}
2034 
2035 	return err;
2036 }
2037 
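/*
 *	Build one packet from everything queued on sk->sk_write_queue and
 *	send it.
 */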
2038 int ip6_push_pending_frames(struct sock *sk)
2039 {
2040 	struct sk_buff *skb;
2041 
2042 	skb = ip6_finish_skb(sk);
2043 	if (!skb)
2044 		return 0;
2045 
2046 	return ip6_send_skb(skb);
2047 }
2048 EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
2049 
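/* Drop every pending skb on @queue and release the cork state. */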
2050 static void __ip6_flush_pending_frames(struct sock *sk,
2051 				       struct sk_buff_head *queue,
2052 				       struct inet_cork_full *cork,
2053 				       struct inet6_cork *v6_cork)
2054 {
2055 	struct sk_buff *skb;
2056 
2057 	while ((skb = __skb_dequeue_tail(queue)) != NULL) {
2058 		if (skb_dst(skb))
2059 			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
2060 				      IPSTATS_MIB_OUTDISCARDS);
2061 		kfree_skb(skb);
2062 	}
2063 
2064 	ip6_cork_release(cork, v6_cork);
2065 }
2066 
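/* Discard anything queued on sk->sk_write_queue without sending it. */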
2067 void ip6_flush_pending_frames(struct sock *sk)
2068 {
2069 	__ip6_flush_pending_frames(sk, &sk->sk_write_queue,
2070 				   &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
2071 }
2072 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
2073 
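/*
 *	ip6_make_skb() - uncorked fast path.
 *
 *	Append the data and build the finished packet in a single call, using
 *	a caller-provided cork and a private queue instead of the socket's
 *	pending queue.
 */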
2074 struct sk_buff *ip6_make_skb(struct sock *sk,
2075 			     int getfrag(void *from, char *to, int offset,
2076 					 int len, int odd, struct sk_buff *skb),
2077 			     void *from, size_t length, int transhdrlen,
2078 			     struct ipcm6_cookie *ipc6, struct rt6_info *rt,
2079 			     unsigned int flags, struct inet_cork_full *cork)
2080 {
2081 	struct inet6_cork v6_cork;
2082 	struct sk_buff_head queue;
2083 	int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
2084 	int err;
2085 
2086 	if (flags & MSG_PROBE) {
2087 		dst_release(&rt->dst);
2088 		return NULL;
2089 	}
2090 
2091 	__skb_queue_head_init(&queue);
2092 
2093 	cork->base.flags = 0;
2094 	cork->base.addr = 0;
2095 	cork->base.opt = NULL;
2096 	v6_cork.opt = NULL;
2097 	err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt);
2098 	if (err) {
2099 		ip6_cork_release(cork, &v6_cork);
2100 		return ERR_PTR(err);
2101 	}
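	/* Fall back to the socket's IPV6_DONTFRAG setting when the caller
	 * left it unspecified.
	 */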
2102 	if (ipc6->dontfrag < 0)
2103 		ipc6->dontfrag = inet6_sk(sk)->dontfrag;
2104 
2105 	err = __ip6_append_data(sk, &queue, cork, &v6_cork,
2106 				&current->task_frag, getfrag, from,
2107 				length + exthdrlen, transhdrlen + exthdrlen,
2108 				flags, ipc6);
2109 	if (err) {
2110 		__ip6_flush_pending_frames(sk, &queue, cork, &v6_cork);
2111 		return ERR_PTR(err);
2112 	}
2113 
2114 	return __ip6_make_skb(sk, &queue, cork, &v6_cork);
2115 }
2116