xref: /openbmc/linux/net/ipv6/ip6_output.c (revision 5ffd8c73)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  *	IPv6 output functions
4  *	Linux INET6 implementation
5  *
6  *	Authors:
7  *	Pedro Roque		<roque@di.fc.ul.pt>
8  *
9  *	Based on linux/net/ipv4/ip_output.c
10  *
11  *	Changes:
12  *	A.N.Kuznetsov	:	arithmetic in fragmentation.
13  *				extension headers are implemented.
14  *				route changes now work.
15  *				ip6_forward does not confuse sniffers.
16  *				etc.
17  *
18  *      H. von Brand    :       Added missing #include <linux/string.h>
19  *	Imran Patel	:	frag id should be in NBO
20  *      Kazunori MIYAZAWA @USAGI
21  *			:       add ip6_append_data and related functions
22  *				for datagram xmit
23  */
24 
25 #include <linux/errno.h>
26 #include <linux/kernel.h>
27 #include <linux/string.h>
28 #include <linux/socket.h>
29 #include <linux/net.h>
30 #include <linux/netdevice.h>
31 #include <linux/if_arp.h>
32 #include <linux/in6.h>
33 #include <linux/tcp.h>
34 #include <linux/route.h>
35 #include <linux/module.h>
36 #include <linux/slab.h>
37 
38 #include <linux/bpf-cgroup.h>
39 #include <linux/netfilter.h>
40 #include <linux/netfilter_ipv6.h>
41 
42 #include <net/sock.h>
43 #include <net/snmp.h>
44 
45 #include <net/gso.h>
46 #include <net/ipv6.h>
47 #include <net/ndisc.h>
48 #include <net/protocol.h>
49 #include <net/ip6_route.h>
50 #include <net/addrconf.h>
51 #include <net/rawv6.h>
52 #include <net/icmp.h>
53 #include <net/xfrm.h>
54 #include <net/checksum.h>
55 #include <linux/mroute6.h>
56 #include <net/l3mdev.h>
57 #include <net/lwtunnel.h>
58 #include <net/ip_tunnels.h>
59 
60 static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
61 {
62 	struct dst_entry *dst = skb_dst(skb);
63 	struct net_device *dev = dst->dev;
64 	struct inet6_dev *idev = ip6_dst_idev(dst);
65 	unsigned int hh_len = LL_RESERVED_SPACE(dev);
66 	const struct in6_addr *daddr, *nexthop;
67 	struct ipv6hdr *hdr;
68 	struct neighbour *neigh;
69 	int ret;
70 
71 	/* Be paranoid, rather than too clever. */
72 	if (unlikely(hh_len > skb_headroom(skb)) && dev->header_ops) {
73 		skb = skb_expand_head(skb, hh_len);
74 		if (!skb) {
75 			IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
76 			return -ENOMEM;
77 		}
78 	}
79 
80 	hdr = ipv6_hdr(skb);
81 	daddr = &hdr->daddr;
82 	if (ipv6_addr_is_multicast(daddr)) {
83 		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
84 		    ((mroute6_is_socket(net, skb) &&
85 		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
86 		     ipv6_chk_mcast_addr(dev, daddr, &hdr->saddr))) {
87 			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
88 
89 			/* Do not check for IFF_ALLMULTI; multicast routing
90 			   is not supported in any case.
91 			 */
92 			if (newskb)
93 				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
94 					net, sk, newskb, NULL, newskb->dev,
95 					dev_loopback_xmit);
96 
97 			if (hdr->hop_limit == 0) {
98 				IP6_INC_STATS(net, idev,
99 					      IPSTATS_MIB_OUTDISCARDS);
100 				kfree_skb(skb);
101 				return 0;
102 			}
103 		}
104 
105 		IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);
106 		if (IPV6_ADDR_MC_SCOPE(daddr) <= IPV6_ADDR_SCOPE_NODELOCAL &&
107 		    !(dev->flags & IFF_LOOPBACK)) {
108 			kfree_skb(skb);
109 			return 0;
110 		}
111 	}
112 
113 	if (lwtunnel_xmit_redirect(dst->lwtstate)) {
114 		int res = lwtunnel_xmit(skb);
115 
116 		if (res != LWTUNNEL_XMIT_CONTINUE)
117 			return res;
118 	}
119 
120 	rcu_read_lock();
121 	nexthop = rt6_nexthop((struct rt6_info *)dst, daddr);
122 	neigh = __ipv6_neigh_lookup_noref(dev, nexthop);
123 
124 	if (unlikely(IS_ERR_OR_NULL(neigh))) {
125 		if (unlikely(!neigh))
126 			neigh = __neigh_create(&nd_tbl, nexthop, dev, false);
127 		if (IS_ERR(neigh)) {
128 			rcu_read_unlock();
129 			IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTNOROUTES);
130 			kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_CREATEFAIL);
131 			return -EINVAL;
132 		}
133 	}
134 	sock_confirm_neigh(skb, neigh);
135 	ret = neigh_output(neigh, skb, false);
136 	rcu_read_unlock();
137 	return ret;
138 }
139 
140 static int
141 ip6_finish_output_gso_slowpath_drop(struct net *net, struct sock *sk,
142 				    struct sk_buff *skb, unsigned int mtu)
143 {
144 	struct sk_buff *segs, *nskb;
145 	netdev_features_t features;
146 	int ret = 0;
147 
148 	/* Please see corresponding comment in ip_finish_output_gso
149 	 * describing the cases where GSO segment length exceeds the
150 	 * egress MTU.
151 	 */
152 	features = netif_skb_features(skb);
153 	segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
154 	if (IS_ERR_OR_NULL(segs)) {
155 		kfree_skb(skb);
156 		return -ENOMEM;
157 	}
158 
159 	consume_skb(skb);
160 
161 	skb_list_walk_safe(segs, segs, nskb) {
162 		int err;
163 
164 		skb_mark_not_on_list(segs);
165 		/* Last GSO segment can be smaller than gso_size (and MTU).
166 		 * Adding a fragment header would produce an "atomic fragment",
167 		 * which is considered harmful (RFC-8021). Avoid that.
168 		 */
169 		err = segs->len > mtu ?
170 			ip6_fragment(net, sk, segs, ip6_finish_output2) :
171 			ip6_finish_output2(net, sk, segs);
172 		if (err && ret == 0)
173 			ret = err;
174 	}
175 
176 	return ret;
177 }
178 
179 static int __ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
180 {
181 	unsigned int mtu;
182 
183 #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
184 	/* Policy lookup after SNAT yielded a new policy */
185 	if (skb_dst(skb)->xfrm) {
186 		IP6CB(skb)->flags |= IP6SKB_REROUTED;
187 		return dst_output(net, sk, skb);
188 	}
189 #endif
190 
191 	mtu = ip6_skb_dst_mtu(skb);
192 	if (skb_is_gso(skb) &&
193 	    !(IP6CB(skb)->flags & IP6SKB_FAKEJUMBO) &&
194 	    !skb_gso_validate_network_len(skb, mtu))
195 		return ip6_finish_output_gso_slowpath_drop(net, sk, skb, mtu);
196 
197 	if ((skb->len > mtu && !skb_is_gso(skb)) ||
198 	    dst_allfrag(skb_dst(skb)) ||
199 	    (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
200 		return ip6_fragment(net, sk, skb, ip6_finish_output2);
201 	else
202 		return ip6_finish_output2(net, sk, skb);
203 }
204 
205 static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
206 {
207 	int ret;
208 
209 	ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
210 	switch (ret) {
211 	case NET_XMIT_SUCCESS:
212 	case NET_XMIT_CN:
213 		return __ip6_finish_output(net, sk, skb) ? : ret;
214 	default:
215 		kfree_skb_reason(skb, SKB_DROP_REASON_BPF_CGROUP_EGRESS);
216 		return ret;
217 	}
218 }
219 
220 int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
221 {
222 	struct net_device *dev = skb_dst(skb)->dev, *indev = skb->dev;
223 	struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
224 
225 	skb->protocol = htons(ETH_P_IPV6);
226 	skb->dev = dev;
227 
228 	if (unlikely(idev->cnf.disable_ipv6)) {
229 		IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
230 		kfree_skb_reason(skb, SKB_DROP_REASON_IPV6DISABLED);
231 		return 0;
232 	}
233 
234 	return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
235 			    net, sk, skb, indev, dev,
236 			    ip6_finish_output,
237 			    !(IP6CB(skb)->flags & IP6SKB_REROUTED));
238 }
239 EXPORT_SYMBOL(ip6_output);
240 
241 bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np)
242 {
243 	if (!np->autoflowlabel_set)
244 		return ip6_default_np_autolabel(net);
245 	else
246 		return np->autoflowlabel;
247 }
248 
249 /*
250  * xmit an sk_buff (used by TCP, SCTP and DCCP)
251  * Note : the socket lock is not held for SYNACK packets, but the socket
252  * might still be modified by calls to skb_set_owner_w() and
253  * ipv6_local_error(), which use proper atomic operations or spinlocks.
254  */
255 int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
256 	     __u32 mark, struct ipv6_txoptions *opt, int tclass, u32 priority)
257 {
258 	struct net *net = sock_net(sk);
259 	const struct ipv6_pinfo *np = inet6_sk(sk);
260 	struct in6_addr *first_hop = &fl6->daddr;
261 	struct dst_entry *dst = skb_dst(skb);
262 	struct net_device *dev = dst->dev;
263 	struct inet6_dev *idev = ip6_dst_idev(dst);
264 	struct hop_jumbo_hdr *hop_jumbo;
265 	int hoplen = sizeof(*hop_jumbo);
266 	unsigned int head_room;
267 	struct ipv6hdr *hdr;
268 	u8  proto = fl6->flowi6_proto;
269 	int seg_len = skb->len;
270 	int hlimit = -1;
271 	u32 mtu;
272 
273 	head_room = sizeof(struct ipv6hdr) + hoplen + LL_RESERVED_SPACE(dev);
274 	if (opt)
275 		head_room += opt->opt_nflen + opt->opt_flen;
276 
277 	if (unlikely(head_room > skb_headroom(skb))) {
278 		skb = skb_expand_head(skb, head_room);
279 		if (!skb) {
280 			IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
281 			return -ENOBUFS;
282 		}
283 	}
284 
285 	if (opt) {
286 		seg_len += opt->opt_nflen + opt->opt_flen;
287 
288 		if (opt->opt_flen)
289 			ipv6_push_frag_opts(skb, opt, &proto);
290 
291 		if (opt->opt_nflen)
292 			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
293 					     &fl6->saddr);
294 	}
295 
296 	if (unlikely(seg_len > IPV6_MAXPLEN)) {
297 		hop_jumbo = skb_push(skb, hoplen);
298 
299 		hop_jumbo->nexthdr = proto;
300 		hop_jumbo->hdrlen = 0;
301 		hop_jumbo->tlv_type = IPV6_TLV_JUMBO;
302 		hop_jumbo->tlv_len = 4;
303 		hop_jumbo->jumbo_payload_len = htonl(seg_len + hoplen);
304 
305 		proto = IPPROTO_HOPOPTS;
306 		seg_len = 0;
307 		IP6CB(skb)->flags |= IP6SKB_FAKEJUMBO;
308 	}
309 
310 	skb_push(skb, sizeof(struct ipv6hdr));
311 	skb_reset_network_header(skb);
312 	hdr = ipv6_hdr(skb);
313 
314 	/*
315 	 *	Fill in the IPv6 header
316 	 */
317 	if (np)
318 		hlimit = np->hop_limit;
319 	if (hlimit < 0)
320 		hlimit = ip6_dst_hoplimit(dst);
321 
322 	ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
323 				ip6_autoflowlabel(net, np), fl6));
324 
325 	hdr->payload_len = htons(seg_len);
326 	hdr->nexthdr = proto;
327 	hdr->hop_limit = hlimit;
328 
329 	hdr->saddr = fl6->saddr;
330 	hdr->daddr = *first_hop;
331 
332 	skb->protocol = htons(ETH_P_IPV6);
333 	skb->priority = priority;
334 	skb->mark = mark;
335 
336 	mtu = dst_mtu(dst);
337 	if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
338 		IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUT, skb->len);
339 
340 		/* if egress device is enslaved to an L3 master device, pass the
341 		 * skb to its handler for processing
342 		 */
343 		skb = l3mdev_ip6_out((struct sock *)sk, skb);
344 		if (unlikely(!skb))
345 			return 0;
346 
347 		/* Hooks should never assume the socket lock is held;
348 		 * we promote our socket to non-const.
349 		 */
350 		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
351 			       net, (struct sock *)sk, skb, NULL, dev,
352 			       dst_output);
353 	}
354 
355 	skb->dev = dev;
356 	/* ipv6_local_error() does not require the socket lock;
357 	 * we promote our socket to non-const.
358 	 */
359 	ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);
360 
361 	IP6_INC_STATS(net, idev, IPSTATS_MIB_FRAGFAILS);
362 	kfree_skb(skb);
363 	return -EMSGSIZE;
364 }
365 EXPORT_SYMBOL(ip6_xmit);
366 
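/*
 * Illustrative usage sketch (not part of the original file): callers such as
 * the TCP/DCCP response paths are expected to have routed the flow and
 * attached the dst to the skb before calling ip6_xmit().  The identifiers
 * "dst", "fl6", "opt", "np" and "priority" below are assumptions made only
 * for this example.
 *
 *	skb_dst_set(skb, dst_clone(dst));
 *	err = ip6_xmit(sk, skb, &fl6, sk->sk_mark, opt, np->tclass, priority);
 *	if (err < 0)
 *		(handle the error, e.g. -EMSGSIZE when the packet exceeds
 *		 the path MTU and cannot be sent)
 */
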
367 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
368 {
369 	struct ip6_ra_chain *ra;
370 	struct sock *last = NULL;
371 
372 	read_lock(&ip6_ra_lock);
373 	for (ra = ip6_ra_chain; ra; ra = ra->next) {
374 		struct sock *sk = ra->sk;
375 		if (sk && ra->sel == sel &&
376 		    (!sk->sk_bound_dev_if ||
377 		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
378 			struct ipv6_pinfo *np = inet6_sk(sk);
379 
380 			if (np && np->rtalert_isolate &&
381 			    !net_eq(sock_net(sk), dev_net(skb->dev))) {
382 				continue;
383 			}
384 			if (last) {
385 				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
386 				if (skb2)
387 					rawv6_rcv(last, skb2);
388 			}
389 			last = sk;
390 		}
391 	}
392 
393 	if (last) {
394 		rawv6_rcv(last, skb);
395 		read_unlock(&ip6_ra_lock);
396 		return 1;
397 	}
398 	read_unlock(&ip6_ra_lock);
399 	return 0;
400 }
401 
402 static int ip6_forward_proxy_check(struct sk_buff *skb)
403 {
404 	struct ipv6hdr *hdr = ipv6_hdr(skb);
405 	u8 nexthdr = hdr->nexthdr;
406 	__be16 frag_off;
407 	int offset;
408 
409 	if (ipv6_ext_hdr(nexthdr)) {
410 		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
411 		if (offset < 0)
412 			return 0;
413 	} else
414 		offset = sizeof(struct ipv6hdr);
415 
416 	if (nexthdr == IPPROTO_ICMPV6) {
417 		struct icmp6hdr *icmp6;
418 
419 		if (!pskb_may_pull(skb, (skb_network_header(skb) +
420 					 offset + 1 - skb->data)))
421 			return 0;
422 
423 		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
424 
425 		switch (icmp6->icmp6_type) {
426 		case NDISC_ROUTER_SOLICITATION:
427 		case NDISC_ROUTER_ADVERTISEMENT:
428 		case NDISC_NEIGHBOUR_SOLICITATION:
429 		case NDISC_NEIGHBOUR_ADVERTISEMENT:
430 		case NDISC_REDIRECT:
431 			/* For unicast neighbour discovery messages destined
432 			 * to the proxied address, pass them to the input
433 			 * function.
434 			 */
435 			return 1;
436 		default:
437 			break;
438 		}
439 	}
440 
441 	/*
442 	 * The proxying router can't forward traffic sent to a link-local
443 	 * address, so signal the sender and discard the packet. This
444 	 * behavior is clarified by the MIPv6 specification.
445 	 */
446 	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
447 		dst_link_failure(skb);
448 		return -1;
449 	}
450 
451 	return 0;
452 }
453 
454 static inline int ip6_forward_finish(struct net *net, struct sock *sk,
455 				     struct sk_buff *skb)
456 {
457 	struct dst_entry *dst = skb_dst(skb);
458 
459 	__IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
460 
461 #ifdef CONFIG_NET_SWITCHDEV
462 	if (skb->offload_l3_fwd_mark) {
463 		consume_skb(skb);
464 		return 0;
465 	}
466 #endif
467 
468 	skb_clear_tstamp(skb);
469 	return dst_output(net, sk, skb);
470 }
471 
472 static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
473 {
474 	if (skb->len <= mtu)
475 		return false;
476 
477 	/* ipv6 conntrack defrag sets max_frag_size + ignore_df */
478 	if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
479 		return true;
480 
481 	if (skb->ignore_df)
482 		return false;
483 
484 	if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
485 		return false;
486 
487 	return true;
488 }
489 
490 int ip6_forward(struct sk_buff *skb)
491 {
492 	struct dst_entry *dst = skb_dst(skb);
493 	struct ipv6hdr *hdr = ipv6_hdr(skb);
494 	struct inet6_skb_parm *opt = IP6CB(skb);
495 	struct net *net = dev_net(dst->dev);
496 	struct inet6_dev *idev;
497 	SKB_DR(reason);
498 	u32 mtu;
499 
500 	idev = __in6_dev_get_safely(dev_get_by_index_rcu(net, IP6CB(skb)->iif));
501 	if (net->ipv6.devconf_all->forwarding == 0)
502 		goto error;
503 
504 	if (skb->pkt_type != PACKET_HOST)
505 		goto drop;
506 
507 	if (unlikely(skb->sk))
508 		goto drop;
509 
510 	if (skb_warn_if_lro(skb))
511 		goto drop;
512 
513 	if (!net->ipv6.devconf_all->disable_policy &&
514 	    (!idev || !idev->cnf.disable_policy) &&
515 	    !xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
516 		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
517 		goto drop;
518 	}
519 
520 	skb_forward_csum(skb);
521 
522 	/*
523 	 *	We DO NOT do any processing on
524 	 *	RA packets, pushing them to user level AS IS
525 	 *	without any WARRANTY that the application will be able
526 	 *	to interpret them. The reason is that we
527 	 *	cannot do anything clever here.
528 	 *
529 	 *	We are not the end node, so if the packet contains
530 	 *	AH/ESP, we cannot do anything.
531 	 *	Defragmentation would also be a mistake; RA packets
532 	 *	cannot be fragmented, because there is no guarantee
533 	 *	that different fragments will go along one path. --ANK
534 	 */
535 	if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
536 		if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
537 			return 0;
538 	}
539 
540 	/*
541 	 *	check and decrement ttl
542 	 */
543 	if (hdr->hop_limit <= 1) {
544 		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
545 		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
546 
547 		kfree_skb_reason(skb, SKB_DROP_REASON_IP_INHDR);
548 		return -ETIMEDOUT;
549 	}
550 
551 	/* XXX: idev->cnf.proxy_ndp? */
552 	if (net->ipv6.devconf_all->proxy_ndp &&
553 	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
554 		int proxied = ip6_forward_proxy_check(skb);
555 		if (proxied > 0) {
556 			/* It's tempting to decrease the hop limit
557 			 * here by 1, as we do at the end of the
558 			 * function too.
559 			 *
560 			 * But that would be incorrect, as proxying is
561 			 * not forwarding.  The ip6_input function
562 			 * will handle this packet locally, and it
563 			 * depends on the hop limit being unchanged.
564 			 *
565 			 * One example is the NDP hop limit, which
566 			 * always has to stay 255; others would be
567 			 * similar checks around RA packets, where the
568 			 * user can even change the desired limit.
569 			 */
570 			return ip6_input(skb);
571 		} else if (proxied < 0) {
572 			__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
573 			goto drop;
574 		}
575 	}
576 
577 	if (!xfrm6_route_forward(skb)) {
578 		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
579 		SKB_DR_SET(reason, XFRM_POLICY);
580 		goto drop;
581 	}
582 	dst = skb_dst(skb);
583 
584 	/* IPv6 specs say nothing about it, but it is clear that we cannot
585 	   send redirects to source routed frames.
586 	   We don't send redirects to frames decapsulated from IPsec.
587 	 */
588 	if (IP6CB(skb)->iif == dst->dev->ifindex &&
589 	    opt->srcrt == 0 && !skb_sec_path(skb)) {
590 		struct in6_addr *target = NULL;
591 		struct inet_peer *peer;
592 		struct rt6_info *rt;
593 
594 		/*
595 		 *	incoming and outgoing devices are the same:
596 		 *	send a redirect.
597 		 */
598 
599 		rt = (struct rt6_info *) dst;
600 		if (rt->rt6i_flags & RTF_GATEWAY)
601 			target = &rt->rt6i_gateway;
602 		else
603 			target = &hdr->daddr;
604 
605 		peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);
606 
607 		/* Limit redirects both by destination (here)
608 		   and by source (inside ndisc_send_redirect)
609 		 */
610 		if (inet_peer_xrlim_allow(peer, 1*HZ))
611 			ndisc_send_redirect(skb, target);
612 		if (peer)
613 			inet_putpeer(peer);
614 	} else {
615 		int addrtype = ipv6_addr_type(&hdr->saddr);
616 
617 		/* This check is security critical. */
618 		if (addrtype == IPV6_ADDR_ANY ||
619 		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
620 			goto error;
621 		if (addrtype & IPV6_ADDR_LINKLOCAL) {
622 			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
623 				    ICMPV6_NOT_NEIGHBOUR, 0);
624 			goto error;
625 		}
626 	}
627 
628 	mtu = ip6_dst_mtu_maybe_forward(dst, true);
629 	if (mtu < IPV6_MIN_MTU)
630 		mtu = IPV6_MIN_MTU;
631 
632 	if (ip6_pkt_too_big(skb, mtu)) {
633 		/* Again, force OUTPUT device used as source address */
634 		skb->dev = dst->dev;
635 		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
636 		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INTOOBIGERRORS);
637 		__IP6_INC_STATS(net, ip6_dst_idev(dst),
638 				IPSTATS_MIB_FRAGFAILS);
639 		kfree_skb_reason(skb, SKB_DROP_REASON_PKT_TOO_BIG);
640 		return -EMSGSIZE;
641 	}
642 
643 	if (skb_cow(skb, dst->dev->hard_header_len)) {
644 		__IP6_INC_STATS(net, ip6_dst_idev(dst),
645 				IPSTATS_MIB_OUTDISCARDS);
646 		goto drop;
647 	}
648 
649 	hdr = ipv6_hdr(skb);
650 
651 	/* Decrementing the hop limit is delayed until after the skb COW */
652 
653 	hdr->hop_limit--;
654 
655 	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
656 		       net, NULL, skb, skb->dev, dst->dev,
657 		       ip6_forward_finish);
658 
659 error:
660 	__IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
661 	SKB_DR_SET(reason, IP_INADDRERRORS);
662 drop:
663 	kfree_skb_reason(skb, reason);
664 	return -EINVAL;
665 }
666 
667 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
668 {
669 	to->pkt_type = from->pkt_type;
670 	to->priority = from->priority;
671 	to->protocol = from->protocol;
672 	skb_dst_drop(to);
673 	skb_dst_set(to, dst_clone(skb_dst(from)));
674 	to->dev = from->dev;
675 	to->mark = from->mark;
676 
677 	skb_copy_hash(to, from);
678 
679 #ifdef CONFIG_NET_SCHED
680 	to->tc_index = from->tc_index;
681 #endif
682 	nf_copy(to, from);
683 	skb_ext_copy(to, from);
684 	skb_copy_secmark(to, from);
685 }
686 
687 int ip6_fraglist_init(struct sk_buff *skb, unsigned int hlen, u8 *prevhdr,
688 		      u8 nexthdr, __be32 frag_id,
689 		      struct ip6_fraglist_iter *iter)
690 {
691 	unsigned int first_len;
692 	struct frag_hdr *fh;
693 
694 	/* BUILD HEADER */
695 	*prevhdr = NEXTHDR_FRAGMENT;
696 	iter->tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
697 	if (!iter->tmp_hdr)
698 		return -ENOMEM;
699 
700 	iter->frag = skb_shinfo(skb)->frag_list;
701 	skb_frag_list_init(skb);
702 
703 	iter->offset = 0;
704 	iter->hlen = hlen;
705 	iter->frag_id = frag_id;
706 	iter->nexthdr = nexthdr;
707 
708 	__skb_pull(skb, hlen);
709 	fh = __skb_push(skb, sizeof(struct frag_hdr));
710 	__skb_push(skb, hlen);
711 	skb_reset_network_header(skb);
712 	memcpy(skb_network_header(skb), iter->tmp_hdr, hlen);
713 
714 	fh->nexthdr = nexthdr;
715 	fh->reserved = 0;
716 	fh->frag_off = htons(IP6_MF);
717 	fh->identification = frag_id;
718 
719 	first_len = skb_pagelen(skb);
720 	skb->data_len = first_len - skb_headlen(skb);
721 	skb->len = first_len;
722 	ipv6_hdr(skb)->payload_len = htons(first_len - sizeof(struct ipv6hdr));
723 
724 	return 0;
725 }
726 EXPORT_SYMBOL(ip6_fraglist_init);
727 
728 void ip6_fraglist_prepare(struct sk_buff *skb,
729 			  struct ip6_fraglist_iter *iter)
730 {
731 	struct sk_buff *frag = iter->frag;
732 	unsigned int hlen = iter->hlen;
733 	struct frag_hdr *fh;
734 
735 	frag->ip_summed = CHECKSUM_NONE;
736 	skb_reset_transport_header(frag);
737 	fh = __skb_push(frag, sizeof(struct frag_hdr));
738 	__skb_push(frag, hlen);
739 	skb_reset_network_header(frag);
740 	memcpy(skb_network_header(frag), iter->tmp_hdr, hlen);
741 	iter->offset += skb->len - hlen - sizeof(struct frag_hdr);
742 	fh->nexthdr = iter->nexthdr;
743 	fh->reserved = 0;
744 	fh->frag_off = htons(iter->offset);
745 	if (frag->next)
746 		fh->frag_off |= htons(IP6_MF);
747 	fh->identification = iter->frag_id;
748 	ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
749 	ip6_copy_metadata(frag, skb);
750 }
751 EXPORT_SYMBOL(ip6_fraglist_prepare);
752 
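/*
 * Illustrative sketch (not part of the original file) of how the exported
 * ip6_fraglist_*() helpers are meant to be driven, mirroring the fast path
 * of ip6_fragment() below; "output" stands for the caller's transmit
 * callback and error handling is trimmed for brevity.
 *
 *	struct ip6_fraglist_iter iter;
 *
 *	err = ip6_fraglist_init(skb, hlen, prevhdr, nexthdr, frag_id, &iter);
 *	for (;;) {
 *		if (iter.frag)
 *			ip6_fraglist_prepare(skb, &iter);
 *		err = output(net, sk, skb);
 *		if (err || !iter.frag)
 *			break;
 *		skb = ip6_fraglist_next(&iter);
 *	}
 *	kfree(iter.tmp_hdr);
 */
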
753 void ip6_frag_init(struct sk_buff *skb, unsigned int hlen, unsigned int mtu,
754 		   unsigned short needed_tailroom, int hdr_room, u8 *prevhdr,
755 		   u8 nexthdr, __be32 frag_id, struct ip6_frag_state *state)
756 {
757 	state->prevhdr = prevhdr;
758 	state->nexthdr = nexthdr;
759 	state->frag_id = frag_id;
760 
761 	state->hlen = hlen;
762 	state->mtu = mtu;
763 
764 	state->left = skb->len - hlen;	/* Space per frame */
765 	state->ptr = hlen;		/* Where to start from */
766 
767 	state->hroom = hdr_room;
768 	state->troom = needed_tailroom;
769 
770 	state->offset = 0;
771 }
772 EXPORT_SYMBOL(ip6_frag_init);
773 
774 struct sk_buff *ip6_frag_next(struct sk_buff *skb, struct ip6_frag_state *state)
775 {
776 	u8 *prevhdr = state->prevhdr, *fragnexthdr_offset;
777 	struct sk_buff *frag;
778 	struct frag_hdr *fh;
779 	unsigned int len;
780 
781 	len = state->left;
782 	/* IF: it doesn't fit, use 'mtu' - the data space left */
783 	if (len > state->mtu)
784 		len = state->mtu;
785 	/* IF: we are not sending up to and including the packet end
786 	   then align the next start on an eight byte boundary */
787 	if (len < state->left)
788 		len &= ~7;
789 
790 	/* Allocate buffer */
791 	frag = alloc_skb(len + state->hlen + sizeof(struct frag_hdr) +
792 			 state->hroom + state->troom, GFP_ATOMIC);
793 	if (!frag)
794 		return ERR_PTR(-ENOMEM);
795 
796 	/*
797 	 *	Set up data on packet
798 	 */
799 
800 	ip6_copy_metadata(frag, skb);
801 	skb_reserve(frag, state->hroom);
802 	skb_put(frag, len + state->hlen + sizeof(struct frag_hdr));
803 	skb_reset_network_header(frag);
804 	fh = (struct frag_hdr *)(skb_network_header(frag) + state->hlen);
805 	frag->transport_header = (frag->network_header + state->hlen +
806 				  sizeof(struct frag_hdr));
807 
808 	/*
809 	 *	Charge the memory for the fragment to any owner
810 	 *	it might possess
811 	 */
812 	if (skb->sk)
813 		skb_set_owner_w(frag, skb->sk);
814 
815 	/*
816 	 *	Copy the packet header into the new buffer.
817 	 */
818 	skb_copy_from_linear_data(skb, skb_network_header(frag), state->hlen);
819 
820 	fragnexthdr_offset = skb_network_header(frag);
821 	fragnexthdr_offset += prevhdr - skb_network_header(skb);
822 	*fragnexthdr_offset = NEXTHDR_FRAGMENT;
823 
824 	/*
825 	 *	Build fragment header.
826 	 */
827 	fh->nexthdr = state->nexthdr;
828 	fh->reserved = 0;
829 	fh->identification = state->frag_id;
830 
831 	/*
832 	 *	Copy a block of the IP datagram.
833 	 */
834 	BUG_ON(skb_copy_bits(skb, state->ptr, skb_transport_header(frag),
835 			     len));
836 	state->left -= len;
837 
838 	fh->frag_off = htons(state->offset);
839 	if (state->left > 0)
840 		fh->frag_off |= htons(IP6_MF);
841 	ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
842 
843 	state->ptr += len;
844 	state->offset += len;
845 
846 	return frag;
847 }
848 EXPORT_SYMBOL(ip6_frag_next);
849 
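/*
 * Illustrative sketch (not part of the original file) of the slow-path
 * fragmentation state machine built from ip6_frag_init()/ip6_frag_next(),
 * mirroring the slow path of ip6_fragment() below; "output" is again the
 * caller's transmit callback.
 *
 *	struct ip6_frag_state state;
 *
 *	ip6_frag_init(skb, hlen, mtu, needed_tailroom, hdr_room,
 *		      prevhdr, nexthdr, frag_id, &state);
 *	while (state.left > 0) {
 *		struct sk_buff *frag = ip6_frag_next(skb, &state);
 *
 *		if (IS_ERR(frag))
 *			return PTR_ERR(frag);
 *		err = output(net, sk, frag);
 *	}
 *	consume_skb(skb);
 */
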
850 int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
851 		 int (*output)(struct net *, struct sock *, struct sk_buff *))
852 {
853 	struct sk_buff *frag;
854 	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
855 	struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
856 				inet6_sk(skb->sk) : NULL;
857 	bool mono_delivery_time = skb->mono_delivery_time;
858 	struct ip6_frag_state state;
859 	unsigned int mtu, hlen, nexthdr_offset;
860 	ktime_t tstamp = skb->tstamp;
861 	int hroom, err = 0;
862 	__be32 frag_id;
863 	u8 *prevhdr, nexthdr = 0;
864 
865 	err = ip6_find_1stfragopt(skb, &prevhdr);
866 	if (err < 0)
867 		goto fail;
868 	hlen = err;
869 	nexthdr = *prevhdr;
870 	nexthdr_offset = prevhdr - skb_network_header(skb);
871 
872 	mtu = ip6_skb_dst_mtu(skb);
873 
874 	/* We must not fragment if the socket is set to force MTU discovery
875 	 * or if the skb is not generated by a local socket.
876 	 */
877 	if (unlikely(!skb->ignore_df && skb->len > mtu))
878 		goto fail_toobig;
879 
880 	if (IP6CB(skb)->frag_max_size) {
881 		if (IP6CB(skb)->frag_max_size > mtu)
882 			goto fail_toobig;
883 
884 		/* don't send fragments larger than what we received */
885 		mtu = IP6CB(skb)->frag_max_size;
886 		if (mtu < IPV6_MIN_MTU)
887 			mtu = IPV6_MIN_MTU;
888 	}
889 
890 	if (np && np->frag_size < mtu) {
891 		if (np->frag_size)
892 			mtu = np->frag_size;
893 	}
894 	if (mtu < hlen + sizeof(struct frag_hdr) + 8)
895 		goto fail_toobig;
896 	mtu -= hlen + sizeof(struct frag_hdr);
897 
898 	frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
899 				    &ipv6_hdr(skb)->saddr);
900 
901 	if (skb->ip_summed == CHECKSUM_PARTIAL &&
902 	    (err = skb_checksum_help(skb)))
903 		goto fail;
904 
905 	prevhdr = skb_network_header(skb) + nexthdr_offset;
906 	hroom = LL_RESERVED_SPACE(rt->dst.dev);
907 	if (skb_has_frag_list(skb)) {
908 		unsigned int first_len = skb_pagelen(skb);
909 		struct ip6_fraglist_iter iter;
910 		struct sk_buff *frag2;
911 
912 		if (first_len - hlen > mtu ||
913 		    ((first_len - hlen) & 7) ||
914 		    skb_cloned(skb) ||
915 		    skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
916 			goto slow_path;
917 
918 		skb_walk_frags(skb, frag) {
919 			/* Correct geometry. */
920 			if (frag->len > mtu ||
921 			    ((frag->len & 7) && frag->next) ||
922 			    skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
923 				goto slow_path_clean;
924 
925 			/* Partially cloned skb? */
926 			if (skb_shared(frag))
927 				goto slow_path_clean;
928 
929 			BUG_ON(frag->sk);
930 			if (skb->sk) {
931 				frag->sk = skb->sk;
932 				frag->destructor = sock_wfree;
933 			}
934 			skb->truesize -= frag->truesize;
935 		}
936 
937 		err = ip6_fraglist_init(skb, hlen, prevhdr, nexthdr, frag_id,
938 					&iter);
939 		if (err < 0)
940 			goto fail;
941 
942 		/* We prevent @rt from being freed. */
943 		rcu_read_lock();
944 
945 		for (;;) {
946 			/* Prepare the header of the next frame
947 			 * before the previous one goes down the stack. */
948 			if (iter.frag)
949 				ip6_fraglist_prepare(skb, &iter);
950 
951 			skb_set_delivery_time(skb, tstamp, mono_delivery_time);
952 			err = output(net, sk, skb);
953 			if (!err)
954 				IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
955 					      IPSTATS_MIB_FRAGCREATES);
956 
957 			if (err || !iter.frag)
958 				break;
959 
960 			skb = ip6_fraglist_next(&iter);
961 		}
962 
963 		kfree(iter.tmp_hdr);
964 
965 		if (err == 0) {
966 			IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
967 				      IPSTATS_MIB_FRAGOKS);
968 			rcu_read_unlock();
969 			return 0;
970 		}
971 
972 		kfree_skb_list(iter.frag);
973 
974 		IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
975 			      IPSTATS_MIB_FRAGFAILS);
976 		rcu_read_unlock();
977 		return err;
978 
979 slow_path_clean:
980 		skb_walk_frags(skb, frag2) {
981 			if (frag2 == frag)
982 				break;
983 			frag2->sk = NULL;
984 			frag2->destructor = NULL;
985 			skb->truesize += frag2->truesize;
986 		}
987 	}
988 
989 slow_path:
990 	/*
991 	 *	Fragment the datagram.
992 	 */
993 
994 	ip6_frag_init(skb, hlen, mtu, rt->dst.dev->needed_tailroom,
995 		      LL_RESERVED_SPACE(rt->dst.dev), prevhdr, nexthdr, frag_id,
996 		      &state);
997 
998 	/*
999 	 *	Keep copying data until we run out.
1000 	 */
1001 
1002 	while (state.left > 0) {
1003 		frag = ip6_frag_next(skb, &state);
1004 		if (IS_ERR(frag)) {
1005 			err = PTR_ERR(frag);
1006 			goto fail;
1007 		}
1008 
1009 		/*
1010 		 *	Put this fragment into the sending queue.
1011 		 */
1012 		skb_set_delivery_time(frag, tstamp, mono_delivery_time);
1013 		err = output(net, sk, frag);
1014 		if (err)
1015 			goto fail;
1016 
1017 		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
1018 			      IPSTATS_MIB_FRAGCREATES);
1019 	}
1020 	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
1021 		      IPSTATS_MIB_FRAGOKS);
1022 	consume_skb(skb);
1023 	return err;
1024 
1025 fail_toobig:
1026 	if (skb->sk && dst_allfrag(skb_dst(skb)))
1027 		sk_gso_disable(skb->sk);
1028 
1029 	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
1030 	err = -EMSGSIZE;
1031 
1032 fail:
1033 	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
1034 		      IPSTATS_MIB_FRAGFAILS);
1035 	kfree_skb(skb);
1036 	return err;
1037 }
1038 
1039 static inline int ip6_rt_check(const struct rt6key *rt_key,
1040 			       const struct in6_addr *fl_addr,
1041 			       const struct in6_addr *addr_cache)
1042 {
1043 	return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
1044 		(!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
1045 }
1046 
1047 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
1048 					  struct dst_entry *dst,
1049 					  const struct flowi6 *fl6)
1050 {
1051 	struct ipv6_pinfo *np = inet6_sk(sk);
1052 	struct rt6_info *rt;
1053 
1054 	if (!dst)
1055 		goto out;
1056 
1057 	if (dst->ops->family != AF_INET6) {
1058 		dst_release(dst);
1059 		return NULL;
1060 	}
1061 
1062 	rt = (struct rt6_info *)dst;
1063 	/* Yes, checking route validity in the unconnected
1064 	 * case is not very simple. Take into account
1065 	 * that we do not support routing by source, TOS,
1066 	 * and MSG_DONTROUTE		--ANK (980726)
1067 	 *
1068 	 * 1. ip6_rt_check(): If the route was a host route,
1069 	 *    check that the cached destination is current.
1070 	 *    If it is a network route, we still may
1071 	 *    check its validity using a saved pointer
1072 	 *    to the last used address: daddr_cache.
1073 	 *    We do not want to save the whole address now
1074 	 *    (because the main consumer of this service
1075 	 *    is TCP, which does not have this problem),
1076 	 *    so the last trick works only on connected
1077 	 *    sockets.
1078 	 * 2. oif also should be the same.
1079 	 */
1080 	if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
1081 #ifdef CONFIG_IPV6_SUBTREES
1082 	    ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
1083 #endif
1084 	   (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) {
1085 		dst_release(dst);
1086 		dst = NULL;
1087 	}
1088 
1089 out:
1090 	return dst;
1091 }
1092 
1093 static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
1094 			       struct dst_entry **dst, struct flowi6 *fl6)
1095 {
1096 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
1097 	struct neighbour *n;
1098 	struct rt6_info *rt;
1099 #endif
1100 	int err;
1101 	int flags = 0;
1102 
1103 	/* The correct way to handle this would be to do
1104 	 * ip6_route_get_saddr, and then ip6_route_output; however,
1105 	 * the route-specific preferred source forces the
1106 	 * ip6_route_output call _before_ ip6_route_get_saddr.
1107 	 *
1108 	 * In source specific routing (no src=any default route),
1109 	 * ip6_route_output will fail given src=any saddr, though, so
1110 	 * that's why we try it again later.
1111 	 */
1112 	if (ipv6_addr_any(&fl6->saddr)) {
1113 		struct fib6_info *from;
1114 		struct rt6_info *rt;
1115 
1116 		*dst = ip6_route_output(net, sk, fl6);
1117 		rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;
1118 
1119 		rcu_read_lock();
1120 		from = rt ? rcu_dereference(rt->from) : NULL;
1121 		err = ip6_route_get_saddr(net, from, &fl6->daddr,
1122 					  sk ? inet6_sk(sk)->srcprefs : 0,
1123 					  &fl6->saddr);
1124 		rcu_read_unlock();
1125 
1126 		if (err)
1127 			goto out_err_release;
1128 
1129 		/* If we had an erroneous initial result, pretend it
1130 		 * never existed and let the SA-enabled version take
1131 		 * over.
1132 		 */
1133 		if ((*dst)->error) {
1134 			dst_release(*dst);
1135 			*dst = NULL;
1136 		}
1137 
1138 		if (fl6->flowi6_oif)
1139 			flags |= RT6_LOOKUP_F_IFACE;
1140 	}
1141 
1142 	if (!*dst)
1143 		*dst = ip6_route_output_flags(net, sk, fl6, flags);
1144 
1145 	err = (*dst)->error;
1146 	if (err)
1147 		goto out_err_release;
1148 
1149 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
1150 	/*
1151 	 * Here, if the dst entry we've looked up
1152 	 * has a neighbour entry that is in the INCOMPLETE
1153 	 * state and the src address from the flow is
1154 	 * marked as OPTIMISTIC, we release the found
1155 	 * dst entry and replace it with the
1156 	 * dst entry of the nexthop router.
1157 	 */
1158 	rt = (struct rt6_info *) *dst;
1159 	rcu_read_lock();
1160 	n = __ipv6_neigh_lookup_noref(rt->dst.dev,
1161 				      rt6_nexthop(rt, &fl6->daddr));
1162 	err = n && !(READ_ONCE(n->nud_state) & NUD_VALID) ? -EINVAL : 0;
1163 	rcu_read_unlock();
1164 
1165 	if (err) {
1166 		struct inet6_ifaddr *ifp;
1167 		struct flowi6 fl_gw6;
1168 		int redirect;
1169 
1170 		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
1171 				      (*dst)->dev, 1);
1172 
1173 		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
1174 		if (ifp)
1175 			in6_ifa_put(ifp);
1176 
1177 		if (redirect) {
1178 			/*
1179 			 * We need to get the dst entry for the
1180 			 * default router instead
1181 			 */
1182 			dst_release(*dst);
1183 			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
1184 			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
1185 			*dst = ip6_route_output(net, sk, &fl_gw6);
1186 			err = (*dst)->error;
1187 			if (err)
1188 				goto out_err_release;
1189 		}
1190 	}
1191 #endif
1192 	if (ipv6_addr_v4mapped(&fl6->saddr) &&
1193 	    !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
1194 		err = -EAFNOSUPPORT;
1195 		goto out_err_release;
1196 	}
1197 
1198 	return 0;
1199 
1200 out_err_release:
1201 	dst_release(*dst);
1202 	*dst = NULL;
1203 
1204 	if (err == -ENETUNREACH)
1205 		IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
1206 	return err;
1207 }
1208 
1209 /**
1210  *	ip6_dst_lookup - perform route lookup on flow
1211  *	@net: Network namespace to perform lookup in
1212  *	@sk: socket which provides route info
1213  *	@dst: pointer to dst_entry * for result
1214  *	@fl6: flow to lookup
1215  *
1216  *	This function performs a route lookup on the given flow.
1217  *
1218  *	It returns zero on success, or a standard errno code on error.
1219  */
1220 int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
1221 		   struct flowi6 *fl6)
1222 {
1223 	*dst = NULL;
1224 	return ip6_dst_lookup_tail(net, sk, dst, fl6);
1225 }
1226 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1227 
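/*
 * Illustrative sketch (not part of the original file): ip6_dst_lookup()
 * reports errors through its return value and leaves the result in *dst,
 * so a typical caller looks like the following ("fl6" is assumed to be a
 * fully populated flow).
 *
 *	struct dst_entry *dst;
 *	int err;
 *
 *	err = ip6_dst_lookup(net, sk, &dst, &fl6);
 *	if (err)
 *		return err;
 *	(use dst, then dst_release(dst) when finished)
 */
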
1228 /**
1229  *	ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1230  *	@net: Network namespace to perform lookup in
1231  *	@sk: socket which provides route info
1232  *	@fl6: flow to lookup
1233  *	@final_dst: final destination address for ipsec lookup
1234  *
1235  *	This function performs a route lookup on the given flow.
1236  *
1237  *	It returns a valid dst pointer on success, or a pointer encoded
1238  *	error code.
1239  */
1240 struct dst_entry *ip6_dst_lookup_flow(struct net *net, const struct sock *sk, struct flowi6 *fl6,
1241 				      const struct in6_addr *final_dst)
1242 {
1243 	struct dst_entry *dst = NULL;
1244 	int err;
1245 
1246 	err = ip6_dst_lookup_tail(net, sk, &dst, fl6);
1247 	if (err)
1248 		return ERR_PTR(err);
1249 	if (final_dst)
1250 		fl6->daddr = *final_dst;
1251 
1252 	return xfrm_lookup_route(net, dst, flowi6_to_flowi(fl6), sk, 0);
1253 }
1254 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
1255 
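/*
 * Illustrative sketch (not part of the original file): unlike
 * ip6_dst_lookup(), ip6_dst_lookup_flow() returns the dst itself with any
 * error encoded in the pointer, so callers check it with IS_ERR();
 * "final_p" below stands for an optional final destination and may be NULL.
 *
 *	dst = ip6_dst_lookup_flow(net, sk, &fl6, final_p);
 *	if (IS_ERR(dst))
 *		return PTR_ERR(dst);
 */
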
1256 /**
1257  *	ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1258  *	@sk: socket which provides the dst cache and route info
1259  *	@fl6: flow to lookup
1260  *	@final_dst: final destination address for ipsec lookup
1261  *	@connected: whether @sk is connected or not
1262  *
1263  *	This function performs a route lookup on the given flow with the
1264  *	possibility of using the cached route in the socket if it is valid.
1265  *	It will take the socket dst lock when operating on the dst cache.
1266  *	As a result, this function can only be used in process context.
1267  *
1268  *	In addition, for a connected socket, cache the dst in the socket
1269  *	if the current cache is not valid.
1270  *
1271  *	It returns a valid dst pointer on success, or a pointer encoded
1272  *	error code.
1273  */
1274 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1275 					 const struct in6_addr *final_dst,
1276 					 bool connected)
1277 {
1278 	struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1279 
1280 	dst = ip6_sk_dst_check(sk, dst, fl6);
1281 	if (dst)
1282 		return dst;
1283 
1284 	dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_dst);
1285 	if (connected && !IS_ERR(dst))
1286 		ip6_sk_dst_store_flow(sk, dst_clone(dst), fl6);
1287 
1288 	return dst;
1289 }
1290 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1291 
1292 /**
1293  *      ip6_dst_lookup_tunnel - perform route lookup on tunnel
1294  *      @skb: Packet for which lookup is done
1295  *      @dev: Tunnel device
1296  *      @net: Network namespace of tunnel device
1297  *      @sock: Socket which provides route info
1298  *      @saddr: Memory to store the src ip address
1299  *      @info: Tunnel information
1300  *      @protocol: IP protocol
1301  *      @use_cache: Flag to enable cache usage
1302  *      This function performs a route lookup on a tunnel.
1303  *
1304  *      On success it returns a valid dst pointer and stores the src address to
1305  *      be used in the tunnel in @saddr, else a pointer encoded error code.
1306  */
1307 
1308 struct dst_entry *ip6_dst_lookup_tunnel(struct sk_buff *skb,
1309 					struct net_device *dev,
1310 					struct net *net,
1311 					struct socket *sock,
1312 					struct in6_addr *saddr,
1313 					const struct ip_tunnel_info *info,
1314 					u8 protocol,
1315 					bool use_cache)
1316 {
1317 	struct dst_entry *dst = NULL;
1318 #ifdef CONFIG_DST_CACHE
1319 	struct dst_cache *dst_cache;
1320 #endif
1321 	struct flowi6 fl6;
1322 	__u8 prio;
1323 
1324 #ifdef CONFIG_DST_CACHE
1325 	dst_cache = (struct dst_cache *)&info->dst_cache;
1326 	if (use_cache) {
1327 		dst = dst_cache_get_ip6(dst_cache, saddr);
1328 		if (dst)
1329 			return dst;
1330 	}
1331 #endif
1332 	memset(&fl6, 0, sizeof(fl6));
1333 	fl6.flowi6_mark = skb->mark;
1334 	fl6.flowi6_proto = protocol;
1335 	fl6.daddr = info->key.u.ipv6.dst;
1336 	fl6.saddr = info->key.u.ipv6.src;
1337 	prio = info->key.tos;
1338 	fl6.flowlabel = ip6_make_flowinfo(prio, info->key.label);
1339 
1340 	dst = ipv6_stub->ipv6_dst_lookup_flow(net, sock->sk, &fl6,
1341 					      NULL);
1342 	if (IS_ERR(dst)) {
1343 		netdev_dbg(dev, "no route to %pI6\n", &fl6.daddr);
1344 		return ERR_PTR(-ENETUNREACH);
1345 	}
1346 	if (dst->dev == dev) { /* is this necessary? */
1347 		netdev_dbg(dev, "circular route to %pI6\n", &fl6.daddr);
1348 		dst_release(dst);
1349 		return ERR_PTR(-ELOOP);
1350 	}
1351 #ifdef CONFIG_DST_CACHE
1352 	if (use_cache)
1353 		dst_cache_set_ip6(dst_cache, dst, &fl6.saddr);
1354 #endif
1355 	*saddr = fl6.saddr;
1356 	return dst;
1357 }
1358 EXPORT_SYMBOL_GPL(ip6_dst_lookup_tunnel);
1359 
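/*
 * Illustrative sketch (not part of the original file): a UDP tunnel driver
 * in the style of vxlan/geneve would typically call this as below; the
 * surrounding variables are assumptions made only for the example.
 *
 *	dst = ip6_dst_lookup_tunnel(skb, dev, net, sock, &saddr, info,
 *				    IPPROTO_UDP, use_cache);
 *	if (IS_ERR(dst))
 *		return PTR_ERR(dst);
 *	(transmit using dst, with saddr as the tunnel source address)
 */
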
1360 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1361 					       gfp_t gfp)
1362 {
1363 	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1364 }
1365 
1366 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1367 						gfp_t gfp)
1368 {
1369 	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1370 }
1371 
1372 static void ip6_append_data_mtu(unsigned int *mtu,
1373 				int *maxfraglen,
1374 				unsigned int fragheaderlen,
1375 				struct sk_buff *skb,
1376 				struct rt6_info *rt,
1377 				unsigned int orig_mtu)
1378 {
1379 	if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1380 		if (!skb) {
1381 			/* first fragment, reserve header_len */
1382 			*mtu = orig_mtu - rt->dst.header_len;
1383 
1384 		} else {
1385 			/*
1386 			 * this fragment is not the first; the header
1387 			 * space is regarded as data space.
1388 			 */
1389 			*mtu = orig_mtu;
1390 		}
1391 		*maxfraglen = ((*mtu - fragheaderlen) & ~7)
1392 			      + fragheaderlen - sizeof(struct frag_hdr);
1393 	}
1394 }
1395 
1396 static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
1397 			  struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
1398 			  struct rt6_info *rt)
1399 {
1400 	struct ipv6_pinfo *np = inet6_sk(sk);
1401 	unsigned int mtu;
1402 	struct ipv6_txoptions *nopt, *opt = ipc6->opt;
1403 
1404 	/* callers pass dst together with a reference, set it first so
1405 	 * ip6_cork_release() can put it down even in case of an error.
1406 	 */
1407 	cork->base.dst = &rt->dst;
1408 
1409 	/*
1410 	 * setup for corking
1411 	 */
1412 	if (opt) {
1413 		if (WARN_ON(v6_cork->opt))
1414 			return -EINVAL;
1415 
1416 		nopt = v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
1417 		if (unlikely(!nopt))
1418 			return -ENOBUFS;
1419 
1420 		nopt->tot_len = sizeof(*opt);
1421 		nopt->opt_flen = opt->opt_flen;
1422 		nopt->opt_nflen = opt->opt_nflen;
1423 
1424 		nopt->dst0opt = ip6_opt_dup(opt->dst0opt, sk->sk_allocation);
1425 		if (opt->dst0opt && !nopt->dst0opt)
1426 			return -ENOBUFS;
1427 
1428 		nopt->dst1opt = ip6_opt_dup(opt->dst1opt, sk->sk_allocation);
1429 		if (opt->dst1opt && !nopt->dst1opt)
1430 			return -ENOBUFS;
1431 
1432 		nopt->hopopt = ip6_opt_dup(opt->hopopt, sk->sk_allocation);
1433 		if (opt->hopopt && !nopt->hopopt)
1434 			return -ENOBUFS;
1435 
1436 		nopt->srcrt = ip6_rthdr_dup(opt->srcrt, sk->sk_allocation);
1437 		if (opt->srcrt && !nopt->srcrt)
1438 			return -ENOBUFS;
1439 
1440 		/* need source address above miyazawa*/
1441 	}
1442 	v6_cork->hop_limit = ipc6->hlimit;
1443 	v6_cork->tclass = ipc6->tclass;
1444 	if (rt->dst.flags & DST_XFRM_TUNNEL)
1445 		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1446 		      READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst);
1447 	else
1448 		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1449 			READ_ONCE(rt->dst.dev->mtu) : dst_mtu(xfrm_dst_path(&rt->dst));
1450 	if (np->frag_size < mtu) {
1451 		if (np->frag_size)
1452 			mtu = np->frag_size;
1453 	}
1454 	cork->base.fragsize = mtu;
1455 	cork->base.gso_size = ipc6->gso_size;
1456 	cork->base.tx_flags = 0;
1457 	cork->base.mark = ipc6->sockc.mark;
1458 	sock_tx_timestamp(sk, ipc6->sockc.tsflags, &cork->base.tx_flags);
1459 
1460 	if (dst_allfrag(xfrm_dst_path(&rt->dst)))
1461 		cork->base.flags |= IPCORK_ALLFRAG;
1462 	cork->base.length = 0;
1463 
1464 	cork->base.transmit_time = ipc6->sockc.transmit_time;
1465 
1466 	return 0;
1467 }
1468 
1469 static int __ip6_append_data(struct sock *sk,
1470 			     struct sk_buff_head *queue,
1471 			     struct inet_cork_full *cork_full,
1472 			     struct inet6_cork *v6_cork,
1473 			     struct page_frag *pfrag,
1474 			     int getfrag(void *from, char *to, int offset,
1475 					 int len, int odd, struct sk_buff *skb),
1476 			     void *from, size_t length, int transhdrlen,
1477 			     unsigned int flags, struct ipcm6_cookie *ipc6)
1478 {
1479 	struct sk_buff *skb, *skb_prev = NULL;
1480 	struct inet_cork *cork = &cork_full->base;
1481 	struct flowi6 *fl6 = &cork_full->fl.u.ip6;
1482 	unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
1483 	struct ubuf_info *uarg = NULL;
1484 	int exthdrlen = 0;
1485 	int dst_exthdrlen = 0;
1486 	int hh_len;
1487 	int copy;
1488 	int err;
1489 	int offset = 0;
1490 	bool zc = false;
1491 	u32 tskey = 0;
1492 	struct rt6_info *rt = (struct rt6_info *)cork->dst;
1493 	struct ipv6_txoptions *opt = v6_cork->opt;
1494 	int csummode = CHECKSUM_NONE;
1495 	unsigned int maxnonfragsize, headersize;
1496 	unsigned int wmem_alloc_delta = 0;
1497 	bool paged, extra_uref = false;
1498 
1499 	skb = skb_peek_tail(queue);
1500 	if (!skb) {
1501 		exthdrlen = opt ? opt->opt_flen : 0;
1502 		dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
1503 	}
1504 
1505 	paged = !!cork->gso_size;
1506 	mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize;
1507 	orig_mtu = mtu;
1508 
1509 	if (cork->tx_flags & SKBTX_ANY_TSTAMP &&
1510 	    READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_OPT_ID)
1511 		tskey = atomic_inc_return(&sk->sk_tskey) - 1;
1512 
1513 	hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1514 
1515 	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1516 			(opt ? opt->opt_nflen : 0);
1517 
1518 	headersize = sizeof(struct ipv6hdr) +
1519 		     (opt ? opt->opt_flen + opt->opt_nflen : 0) +
1520 		     (dst_allfrag(&rt->dst) ?
1521 		      sizeof(struct frag_hdr) : 0) +
1522 		     rt->rt6i_nfheader_len;
1523 
1524 	if (mtu <= fragheaderlen ||
1525 	    ((mtu - fragheaderlen) & ~7) + fragheaderlen <= sizeof(struct frag_hdr))
1526 		goto emsgsize;
1527 
1528 	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
1529 		     sizeof(struct frag_hdr);
1530 
1531 	/* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit within
1532 	 * the first fragment
1533 	 */
1534 	if (headersize + transhdrlen > mtu)
1535 		goto emsgsize;
1536 
1537 	if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
1538 	    (sk->sk_protocol == IPPROTO_UDP ||
1539 	     sk->sk_protocol == IPPROTO_ICMPV6 ||
1540 	     sk->sk_protocol == IPPROTO_RAW)) {
1541 		ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
1542 				sizeof(struct ipv6hdr));
1543 		goto emsgsize;
1544 	}
1545 
1546 	if (ip6_sk_ignore_df(sk))
1547 		maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
1548 	else
1549 		maxnonfragsize = mtu;
1550 
1551 	if (cork->length + length > maxnonfragsize - headersize) {
1552 emsgsize:
1553 		pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0);
1554 		ipv6_local_error(sk, EMSGSIZE, fl6, pmtu);
1555 		return -EMSGSIZE;
1556 	}
1557 
1558 	/* CHECKSUM_PARTIAL only with no extension headers and when
1559 	 * we are not going to fragment
1560 	 */
1561 	if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
1562 	    headersize == sizeof(struct ipv6hdr) &&
1563 	    length <= mtu - headersize &&
1564 	    (!(flags & MSG_MORE) || cork->gso_size) &&
1565 	    rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
1566 		csummode = CHECKSUM_PARTIAL;
1567 
1568 	if ((flags & MSG_ZEROCOPY) && length) {
1569 		struct msghdr *msg = from;
1570 
1571 		if (getfrag == ip_generic_getfrag && msg->msg_ubuf) {
1572 			if (skb_zcopy(skb) && msg->msg_ubuf != skb_zcopy(skb))
1573 				return -EINVAL;
1574 
1575 			/* Leave uarg NULL if we can't zerocopy; callers should
1576 			 * be able to handle it.
1577 			 */
1578 			if ((rt->dst.dev->features & NETIF_F_SG) &&
1579 			    csummode == CHECKSUM_PARTIAL) {
1580 				paged = true;
1581 				zc = true;
1582 				uarg = msg->msg_ubuf;
1583 			}
1584 		} else if (sock_flag(sk, SOCK_ZEROCOPY)) {
1585 			uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb));
1586 			if (!uarg)
1587 				return -ENOBUFS;
1588 			extra_uref = !skb_zcopy(skb);	/* only ref on new uarg */
1589 			if (rt->dst.dev->features & NETIF_F_SG &&
1590 			    csummode == CHECKSUM_PARTIAL) {
1591 				paged = true;
1592 				zc = true;
1593 			} else {
1594 				uarg_to_msgzc(uarg)->zerocopy = 0;
1595 				skb_zcopy_set(skb, uarg, &extra_uref);
1596 			}
1597 		}
1598 	} else if ((flags & MSG_SPLICE_PAGES) && length) {
1599 		if (inet_test_bit(HDRINCL, sk))
1600 			return -EPERM;
1601 		if (rt->dst.dev->features & NETIF_F_SG &&
1602 		    getfrag == ip_generic_getfrag)
1603 			/* We need an empty buffer to attach stuff to */
1604 			paged = true;
1605 		else
1606 			flags &= ~MSG_SPLICE_PAGES;
1607 	}
1608 
1609 	/*
1610 	 * Let's try using as much space as possible.
1611 	 * Use MTU if total length of the message fits into the MTU.
1612 	 * Otherwise, we need to reserve fragment header and
1613 	 * fragment alignment (= 8-15 octets, in total).
1614 	 *
1615 	 * Note that we may need to "move" the data from the tail
1616 	 * of the buffer to the new fragment when we split
1617 	 * the message.
1618 	 *
1619 	 * FIXME: It may be fragmented into multiple chunks
1620 	 *        at once if non-fragmentable extension headers
1621 	 *        are too large.
1622 	 * --yoshfuji
1623 	 */
1624 
1625 	cork->length += length;
1626 	if (!skb)
1627 		goto alloc_new_skb;
1628 
1629 	while (length > 0) {
1630 		/* Check if the remaining data fits into current packet. */
1631 		copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1632 		if (copy < length)
1633 			copy = maxfraglen - skb->len;
1634 
1635 		if (copy <= 0) {
1636 			char *data;
1637 			unsigned int datalen;
1638 			unsigned int fraglen;
1639 			unsigned int fraggap;
1640 			unsigned int alloclen, alloc_extra;
1641 			unsigned int pagedlen;
1642 alloc_new_skb:
1643 			/* There's no room in the current skb */
1644 			if (skb)
1645 				fraggap = skb->len - maxfraglen;
1646 			else
1647 				fraggap = 0;
1648 			/* update mtu and maxfraglen if necessary */
1649 			if (!skb || !skb_prev)
1650 				ip6_append_data_mtu(&mtu, &maxfraglen,
1651 						    fragheaderlen, skb, rt,
1652 						    orig_mtu);
1653 
1654 			skb_prev = skb;
1655 
1656 			/*
1657 			 * If remaining data exceeds the mtu,
1658 			 * we know we need more fragment(s).
1659 			 */
1660 			datalen = length + fraggap;
1661 
1662 			if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1663 				datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
1664 			fraglen = datalen + fragheaderlen;
1665 			pagedlen = 0;
1666 
1667 			alloc_extra = hh_len;
1668 			alloc_extra += dst_exthdrlen;
1669 			alloc_extra += rt->dst.trailer_len;
1670 
1671 			/* We just reserve space for the fragment header.
1672 			 * Note: this may be an overallocation if the message
1673 			 * (without MSG_MORE) fits into the MTU.
1674 			 */
1675 			alloc_extra += sizeof(struct frag_hdr);
1676 
1677 			if ((flags & MSG_MORE) &&
1678 			    !(rt->dst.dev->features&NETIF_F_SG))
1679 				alloclen = mtu;
1680 			else if (!paged &&
1681 				 (fraglen + alloc_extra < SKB_MAX_ALLOC ||
1682 				  !(rt->dst.dev->features & NETIF_F_SG)))
1683 				alloclen = fraglen;
1684 			else {
1685 				alloclen = fragheaderlen + transhdrlen;
1686 				pagedlen = datalen - transhdrlen;
1687 			}
1688 			alloclen += alloc_extra;
1689 
1690 			if (datalen != length + fraggap) {
1691 				/*
1692 				 * this is not the last fragment; the trailer
1693 				 * space is regarded as data space.
1694 				 */
1695 				datalen += rt->dst.trailer_len;
1696 			}
1697 
1698 			fraglen = datalen + fragheaderlen;
1699 
1700 			copy = datalen - transhdrlen - fraggap - pagedlen;
1701 			/* [!] NOTE: copy may be negative if pagedlen>0
1702 			 * because then the equation may reduce to -fraggap.
1703 			 */
1704 			if (copy < 0 && !(flags & MSG_SPLICE_PAGES)) {
1705 				err = -EINVAL;
1706 				goto error;
1707 			}
1708 			if (transhdrlen) {
1709 				skb = sock_alloc_send_skb(sk, alloclen,
1710 						(flags & MSG_DONTWAIT), &err);
1711 			} else {
1712 				skb = NULL;
1713 				if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <=
1714 				    2 * sk->sk_sndbuf)
1715 					skb = alloc_skb(alloclen,
1716 							sk->sk_allocation);
1717 				if (unlikely(!skb))
1718 					err = -ENOBUFS;
1719 			}
1720 			if (!skb)
1721 				goto error;
1722 			/*
1723 			 *	Fill in the control structures
1724 			 */
1725 			skb->protocol = htons(ETH_P_IPV6);
1726 			skb->ip_summed = csummode;
1727 			skb->csum = 0;
1728 			/* reserve for fragmentation and ipsec header */
1729 			skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1730 				    dst_exthdrlen);
1731 
1732 			/*
1733 			 *	Find where to start putting bytes
1734 			 */
1735 			data = skb_put(skb, fraglen - pagedlen);
1736 			skb_set_network_header(skb, exthdrlen);
1737 			data += fragheaderlen;
1738 			skb->transport_header = (skb->network_header +
1739 						 fragheaderlen);
1740 			if (fraggap) {
1741 				skb->csum = skb_copy_and_csum_bits(
1742 					skb_prev, maxfraglen,
1743 					data + transhdrlen, fraggap);
1744 				skb_prev->csum = csum_sub(skb_prev->csum,
1745 							  skb->csum);
1746 				data += fraggap;
1747 				pskb_trim_unique(skb_prev, maxfraglen);
1748 			}
1749 			if (copy > 0 &&
1750 			    getfrag(from, data + transhdrlen, offset,
1751 				    copy, fraggap, skb) < 0) {
1752 				err = -EFAULT;
1753 				kfree_skb(skb);
1754 				goto error;
1755 			} else if (flags & MSG_SPLICE_PAGES) {
1756 				copy = 0;
1757 			}
1758 
1759 			offset += copy;
1760 			length -= copy + transhdrlen;
1761 			transhdrlen = 0;
1762 			exthdrlen = 0;
1763 			dst_exthdrlen = 0;
1764 
1765 			/* Only the initial fragment is time stamped */
1766 			skb_shinfo(skb)->tx_flags = cork->tx_flags;
1767 			cork->tx_flags = 0;
1768 			skb_shinfo(skb)->tskey = tskey;
1769 			tskey = 0;
1770 			skb_zcopy_set(skb, uarg, &extra_uref);
1771 
1772 			if ((flags & MSG_CONFIRM) && !skb_prev)
1773 				skb_set_dst_pending_confirm(skb, 1);
1774 
1775 			/*
1776 			 * Put the packet on the pending queue
1777 			 */
1778 			if (!skb->destructor) {
1779 				skb->destructor = sock_wfree;
1780 				skb->sk = sk;
1781 				wmem_alloc_delta += skb->truesize;
1782 			}
1783 			__skb_queue_tail(queue, skb);
1784 			continue;
1785 		}
1786 
1787 		if (copy > length)
1788 			copy = length;
1789 
1790 		if (!(rt->dst.dev->features&NETIF_F_SG) &&
1791 		    skb_tailroom(skb) >= copy) {
1792 			unsigned int off;
1793 
1794 			off = skb->len;
1795 			if (getfrag(from, skb_put(skb, copy),
1796 						offset, copy, off, skb) < 0) {
1797 				__skb_trim(skb, off);
1798 				err = -EFAULT;
1799 				goto error;
1800 			}
1801 		} else if (flags & MSG_SPLICE_PAGES) {
1802 			struct msghdr *msg = from;
1803 
1804 			err = -EIO;
1805 			if (WARN_ON_ONCE(copy > msg->msg_iter.count))
1806 				goto error;
1807 
1808 			err = skb_splice_from_iter(skb, &msg->msg_iter, copy,
1809 						   sk->sk_allocation);
1810 			if (err < 0)
1811 				goto error;
1812 			copy = err;
1813 			wmem_alloc_delta += copy;
1814 		} else if (!zc) {
1815 			int i = skb_shinfo(skb)->nr_frags;
1816 
1817 			err = -ENOMEM;
1818 			if (!sk_page_frag_refill(sk, pfrag))
1819 				goto error;
1820 
1821 			skb_zcopy_downgrade_managed(skb);
1822 			if (!skb_can_coalesce(skb, i, pfrag->page,
1823 					      pfrag->offset)) {
1824 				err = -EMSGSIZE;
1825 				if (i == MAX_SKB_FRAGS)
1826 					goto error;
1827 
1828 				__skb_fill_page_desc(skb, i, pfrag->page,
1829 						     pfrag->offset, 0);
1830 				skb_shinfo(skb)->nr_frags = ++i;
1831 				get_page(pfrag->page);
1832 			}
1833 			copy = min_t(int, copy, pfrag->size - pfrag->offset);
1834 			if (getfrag(from,
1835 				    page_address(pfrag->page) + pfrag->offset,
1836 				    offset, copy, skb->len, skb) < 0)
1837 				goto error_efault;
1838 
1839 			pfrag->offset += copy;
1840 			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1841 			skb->len += copy;
1842 			skb->data_len += copy;
1843 			skb->truesize += copy;
1844 			wmem_alloc_delta += copy;
1845 		} else {
1846 			err = skb_zerocopy_iter_dgram(skb, from, copy);
1847 			if (err < 0)
1848 				goto error;
1849 		}
1850 		offset += copy;
1851 		length -= copy;
1852 	}
1853 
1854 	if (wmem_alloc_delta)
1855 		refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1856 	return 0;
1857 
1858 error_efault:
1859 	err = -EFAULT;
1860 error:
1861 	net_zcopy_put_abort(uarg, extra_uref);
1862 	cork->length -= length;
1863 	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1864 	refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1865 	return err;
1866 }
1867 
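     /* Queue data on sk_write_queue for later transmission.  The first call
      * while the queue is empty sets up the cork from @ipc6/@fl6 and grabs a
      * reference on @rt; subsequent calls only append more data.  The packet
      * itself is built and sent by ip6_push_pending_frames() / ip6_finish_skb().
      */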
1868 int ip6_append_data(struct sock *sk,
1869 		    int getfrag(void *from, char *to, int offset, int len,
1870 				int odd, struct sk_buff *skb),
1871 		    void *from, size_t length, int transhdrlen,
1872 		    struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1873 		    struct rt6_info *rt, unsigned int flags)
1874 {
1875 	struct inet_sock *inet = inet_sk(sk);
1876 	struct ipv6_pinfo *np = inet6_sk(sk);
1877 	int exthdrlen;
1878 	int err;
1879 
1880 	if (flags & MSG_PROBE)
1881 		return 0;
1882 	if (skb_queue_empty(&sk->sk_write_queue)) {
1883 		/*
1884 		 * Set up for corking.
1885 		 */
1886 		dst_hold(&rt->dst);
1887 		err = ip6_setup_cork(sk, &inet->cork, &np->cork,
1888 				     ipc6, rt);
1889 		if (err)
1890 			return err;
1891 
1892 		inet->cork.fl.u.ip6 = *fl6;
1893 		exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1894 		length += exthdrlen;
1895 		transhdrlen += exthdrlen;
1896 	} else {
1897 		transhdrlen = 0;
1898 	}
1899 
1900 	return __ip6_append_data(sk, &sk->sk_write_queue, &inet->cork,
1901 				 &np->cork, sk_page_frag(sk), getfrag,
1902 				 from, length, transhdrlen, flags, ipc6);
1903 }
1904 EXPORT_SYMBOL_GPL(ip6_append_data);
1905 
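     /* Move the cork's dst reference onto @skb; the cork no longer owns it,
      * so clear IPCORK_ALLFRAG as well.
      */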
1906 static void ip6_cork_steal_dst(struct sk_buff *skb, struct inet_cork_full *cork)
1907 {
1908 	struct dst_entry *dst = cork->base.dst;
1909 
1910 	cork->base.dst = NULL;
1911 	cork->base.flags &= ~IPCORK_ALLFRAG;
1912 	skb_dst_set(skb, dst);
1913 }
1914 
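     /* Free the tx options copied into the cork and drop its dst reference. */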
1915 static void ip6_cork_release(struct inet_cork_full *cork,
1916 			     struct inet6_cork *v6_cork)
1917 {
1918 	if (v6_cork->opt) {
1919 		struct ipv6_txoptions *opt = v6_cork->opt;
1920 
1921 		kfree(opt->dst0opt);
1922 		kfree(opt->dst1opt);
1923 		kfree(opt->hopopt);
1924 		kfree(opt->srcrt);
1925 		kfree(opt);
1926 		v6_cork->opt = NULL;
1927 	}
1928 
1929 	if (cork->base.dst) {
1930 		dst_release(cork->base.dst);
1931 		cork->base.dst = NULL;
1932 		cork->base.flags &= ~IPCORK_ALLFRAG;
1933 	}
1934 }
1935 
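     /* Turn the queue of pending fragments into one packet: chain the extra
      * skbs on the first skb's frag_list, push the extension headers and the
      * IPv6 header filled from the cork and flow, steal the cork's dst and
      * update the output counters.  Returns NULL if the queue was empty.
      */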
1936 struct sk_buff *__ip6_make_skb(struct sock *sk,
1937 			       struct sk_buff_head *queue,
1938 			       struct inet_cork_full *cork,
1939 			       struct inet6_cork *v6_cork)
1940 {
1941 	struct sk_buff *skb, *tmp_skb;
1942 	struct sk_buff **tail_skb;
1943 	struct in6_addr *final_dst;
1944 	struct ipv6_pinfo *np = inet6_sk(sk);
1945 	struct net *net = sock_net(sk);
1946 	struct ipv6hdr *hdr;
1947 	struct ipv6_txoptions *opt = v6_cork->opt;
1948 	struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
1949 	struct flowi6 *fl6 = &cork->fl.u.ip6;
1950 	unsigned char proto = fl6->flowi6_proto;
1951 
1952 	skb = __skb_dequeue(queue);
1953 	if (!skb)
1954 		goto out;
1955 	tail_skb = &(skb_shinfo(skb)->frag_list);
1956 
1957 	/* Move skb->data from the extension header to the IPv6 header. */
1958 	if (skb->data < skb_network_header(skb))
1959 		__skb_pull(skb, skb_network_offset(skb));
1960 	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
1961 		__skb_pull(tmp_skb, skb_network_header_len(skb));
1962 		*tail_skb = tmp_skb;
1963 		tail_skb = &(tmp_skb->next);
1964 		skb->len += tmp_skb->len;
1965 		skb->data_len += tmp_skb->len;
1966 		skb->truesize += tmp_skb->truesize;
1967 		tmp_skb->destructor = NULL;
1968 		tmp_skb->sk = NULL;
1969 	}
1970 
1971 	/* Allow local fragmentation. */
1972 	skb->ignore_df = ip6_sk_ignore_df(sk);
1973 	__skb_pull(skb, skb_network_header_len(skb));
1974 
1975 	final_dst = &fl6->daddr;
1976 	if (opt && opt->opt_flen)
1977 		ipv6_push_frag_opts(skb, opt, &proto);
1978 	if (opt && opt->opt_nflen)
1979 		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);
1980 
1981 	skb_push(skb, sizeof(struct ipv6hdr));
1982 	skb_reset_network_header(skb);
1983 	hdr = ipv6_hdr(skb);
1984 
1985 	ip6_flow_hdr(hdr, v6_cork->tclass,
1986 		     ip6_make_flowlabel(net, skb, fl6->flowlabel,
1987 					ip6_autoflowlabel(net, np), fl6));
1988 	hdr->hop_limit = v6_cork->hop_limit;
1989 	hdr->nexthdr = proto;
1990 	hdr->saddr = fl6->saddr;
1991 	hdr->daddr = *final_dst;
1992 
1993 	skb->priority = sk->sk_priority;
1994 	skb->mark = cork->base.mark;
1995 	skb->tstamp = cork->base.transmit_time;
1996 
1997 	ip6_cork_steal_dst(skb, cork);
1998 	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1999 	if (proto == IPPROTO_ICMPV6) {
2000 		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
2001 		u8 icmp6_type;
2002 
2003 		if (sk->sk_socket->type == SOCK_RAW &&
2004 		    !inet_test_bit(HDRINCL, sk))
2005 			icmp6_type = fl6->fl6_icmp_type;
2006 		else
2007 			icmp6_type = icmp6_hdr(skb)->icmp6_type;
2008 		ICMP6MSGOUT_INC_STATS(net, idev, icmp6_type);
2009 		ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
2010 	}
2011 
2012 	ip6_cork_release(cork, v6_cork);
2013 out:
2014 	return skb;
2015 }
2016 
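     /* Hand a packet built by __ip6_make_skb() to the output path; positive
      * congestion-notification codes from ip6_local_out() are converted with
      * net_xmit_errno(), and real failures are counted as output discards.
      */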
2017 int ip6_send_skb(struct sk_buff *skb)
2018 {
2019 	struct net *net = sock_net(skb->sk);
2020 	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
2021 	int err;
2022 
2023 	err = ip6_local_out(net, skb->sk, skb);
2024 	if (err) {
2025 		if (err > 0)
2026 			err = net_xmit_errno(err);
2027 		if (err)
2028 			IP6_INC_STATS(net, rt->rt6i_idev,
2029 				      IPSTATS_MIB_OUTDISCARDS);
2030 	}
2031 
2032 	return err;
2033 }
2034 
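     /* Build one packet from everything corked on sk_write_queue and send it. */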
2035 int ip6_push_pending_frames(struct sock *sk)
2036 {
2037 	struct sk_buff *skb;
2038 
2039 	skb = ip6_finish_skb(sk);
2040 	if (!skb)
2041 		return 0;
2042 
2043 	return ip6_send_skb(skb);
2044 }
2045 EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
2046 
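     /* Throw away anything still queued for this cork, counting each skb that
      * already carries a dst as an output discard, then release the cork itself.
      */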
2047 static void __ip6_flush_pending_frames(struct sock *sk,
2048 				       struct sk_buff_head *queue,
2049 				       struct inet_cork_full *cork,
2050 				       struct inet6_cork *v6_cork)
2051 {
2052 	struct sk_buff *skb;
2053 
2054 	while ((skb = __skb_dequeue_tail(queue)) != NULL) {
2055 		if (skb_dst(skb))
2056 			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
2057 				      IPSTATS_MIB_OUTDISCARDS);
2058 		kfree_skb(skb);
2059 	}
2060 
2061 	ip6_cork_release(cork, v6_cork);
2062 }
2063 
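     /* Discard any data corked on sk_write_queue. */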
2064 void ip6_flush_pending_frames(struct sock *sk)
2065 {
2066 	__ip6_flush_pending_frames(sk, &sk->sk_write_queue,
2067 				   &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
2068 }
2069 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
2070 
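     /* Single-shot variant of the append/push sequence: build the whole packet
      * on a private queue using the caller-supplied @cork and return the
      * finished skb (or an ERR_PTR), without touching sk_write_queue.
      */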
2071 struct sk_buff *ip6_make_skb(struct sock *sk,
2072 			     int getfrag(void *from, char *to, int offset,
2073 					 int len, int odd, struct sk_buff *skb),
2074 			     void *from, size_t length, int transhdrlen,
2075 			     struct ipcm6_cookie *ipc6, struct rt6_info *rt,
2076 			     unsigned int flags, struct inet_cork_full *cork)
2077 {
2078 	struct inet6_cork v6_cork;
2079 	struct sk_buff_head queue;
2080 	int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
2081 	int err;
2082 
2083 	if (flags & MSG_PROBE) {
2084 		dst_release(&rt->dst);
2085 		return NULL;
2086 	}
2087 
2088 	__skb_queue_head_init(&queue);
2089 
2090 	cork->base.flags = 0;
2091 	cork->base.addr = 0;
2092 	cork->base.opt = NULL;
2093 	v6_cork.opt = NULL;
2094 	err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt);
2095 	if (err) {
2096 		ip6_cork_release(cork, &v6_cork);
2097 		return ERR_PTR(err);
2098 	}
2099 	if (ipc6->dontfrag < 0)
2100 		ipc6->dontfrag = inet6_sk(sk)->dontfrag;
2101 
2102 	err = __ip6_append_data(sk, &queue, cork, &v6_cork,
2103 				&current->task_frag, getfrag, from,
2104 				length + exthdrlen, transhdrlen + exthdrlen,
2105 				flags, ipc6);
2106 	if (err) {
2107 		__ip6_flush_pending_frames(sk, &queue, cork, &v6_cork);
2108 		return ERR_PTR(err);
2109 	}
2110 
2111 	return __ip6_make_skb(sk, &queue, cork, &v6_cork);
2112 }
2113