xref: /openbmc/linux/net/ipv6/ip6_output.c (revision 7288dd2f)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  *	IPv6 output functions
4  *	Linux INET6 implementation
5  *
6  *	Authors:
7  *	Pedro Roque		<roque@di.fc.ul.pt>
8  *
9  *	Based on linux/net/ipv4/ip_output.c
10  *
11  *	Changes:
12  *	A.N.Kuznetsov	:	arithmetic in fragmentation.
13  *				extension headers are implemented.
14  *				route changes now work.
15  *				ip6_forward does not confuse sniffers.
16  *				etc.
17  *
18  *      H. von Brand    :       Added missing #include <linux/string.h>
19  *	Imran Patel	:	frag id should be in NBO
20  *      Kazunori MIYAZAWA @USAGI
21  *			:       add ip6_append_data and related functions
22  *				for datagram xmit
23  */
24 
25 #include <linux/errno.h>
26 #include <linux/kernel.h>
27 #include <linux/string.h>
28 #include <linux/socket.h>
29 #include <linux/net.h>
30 #include <linux/netdevice.h>
31 #include <linux/if_arp.h>
32 #include <linux/in6.h>
33 #include <linux/tcp.h>
34 #include <linux/route.h>
35 #include <linux/module.h>
36 #include <linux/slab.h>
37 
38 #include <linux/bpf-cgroup.h>
39 #include <linux/netfilter.h>
40 #include <linux/netfilter_ipv6.h>
41 
42 #include <net/sock.h>
43 #include <net/snmp.h>
44 
45 #include <net/gso.h>
46 #include <net/ipv6.h>
47 #include <net/ndisc.h>
48 #include <net/protocol.h>
49 #include <net/ip6_route.h>
50 #include <net/addrconf.h>
51 #include <net/rawv6.h>
52 #include <net/icmp.h>
53 #include <net/xfrm.h>
54 #include <net/checksum.h>
55 #include <linux/mroute6.h>
56 #include <net/l3mdev.h>
57 #include <net/lwtunnel.h>
58 #include <net/ip_tunnels.h>
59 
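/* Last step of the IPv6 output path: make sure there is enough headroom for
 * the link-layer header, handle multicast looping and scope checks, give
 * lightweight tunnels a chance to transmit, then resolve the neighbour for
 * the route's nexthop and hand the skb to neigh_output().
 */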
60 static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
61 {
62 	struct dst_entry *dst = skb_dst(skb);
63 	struct net_device *dev = dst->dev;
64 	struct inet6_dev *idev = ip6_dst_idev(dst);
65 	unsigned int hh_len = LL_RESERVED_SPACE(dev);
66 	const struct in6_addr *daddr, *nexthop;
67 	struct ipv6hdr *hdr;
68 	struct neighbour *neigh;
69 	int ret;
70 
71 	/* Be paranoid, rather than too clever. */
72 	if (unlikely(hh_len > skb_headroom(skb)) && dev->header_ops) {
73 		skb = skb_expand_head(skb, hh_len);
74 		if (!skb) {
75 			IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
76 			return -ENOMEM;
77 		}
78 	}
79 
80 	hdr = ipv6_hdr(skb);
81 	daddr = &hdr->daddr;
82 	if (ipv6_addr_is_multicast(daddr)) {
83 		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
84 		    ((mroute6_is_socket(net, skb) &&
85 		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
86 		     ipv6_chk_mcast_addr(dev, daddr, &hdr->saddr))) {
87 			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
88 
89 			/* Do not check for IFF_ALLMULTI; multicast routing
90 			 * is not supported in any case.
91 			 */
92 			if (newskb)
93 				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
94 					net, sk, newskb, NULL, newskb->dev,
95 					dev_loopback_xmit);
96 
97 			if (hdr->hop_limit == 0) {
98 				IP6_INC_STATS(net, idev,
99 					      IPSTATS_MIB_OUTDISCARDS);
100 				kfree_skb(skb);
101 				return 0;
102 			}
103 		}
104 
105 		IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);
106 		if (IPV6_ADDR_MC_SCOPE(daddr) <= IPV6_ADDR_SCOPE_NODELOCAL &&
107 		    !(dev->flags & IFF_LOOPBACK)) {
108 			kfree_skb(skb);
109 			return 0;
110 		}
111 	}
112 
113 	if (lwtunnel_xmit_redirect(dst->lwtstate)) {
114 		int res = lwtunnel_xmit(skb);
115 
116 		if (res < 0 || res == LWTUNNEL_XMIT_DONE)
117 			return res;
118 	}
119 
120 	rcu_read_lock();
121 	nexthop = rt6_nexthop((struct rt6_info *)dst, daddr);
122 	neigh = __ipv6_neigh_lookup_noref(dev, nexthop);
123 
124 	if (unlikely(IS_ERR_OR_NULL(neigh))) {
125 		if (unlikely(!neigh))
126 			neigh = __neigh_create(&nd_tbl, nexthop, dev, false);
127 		if (IS_ERR(neigh)) {
128 			rcu_read_unlock();
129 			IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTNOROUTES);
130 			kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_CREATEFAIL);
131 			return -EINVAL;
132 		}
133 	}
134 	sock_confirm_neigh(skb, neigh);
135 	ret = neigh_output(neigh, skb, false);
136 	rcu_read_unlock();
137 	return ret;
138 }
139 
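/* Software-segment a GSO packet whose segments exceed the egress MTU and
 * fragment each resulting segment individually via ip6_fragment().
 */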
140 static int
141 ip6_finish_output_gso_slowpath_drop(struct net *net, struct sock *sk,
142 				    struct sk_buff *skb, unsigned int mtu)
143 {
144 	struct sk_buff *segs, *nskb;
145 	netdev_features_t features;
146 	int ret = 0;
147 
148 	/* Please see corresponding comment in ip_finish_output_gso
149 	 * describing the cases where GSO segment length exceeds the
150 	 * egress MTU.
151 	 */
152 	features = netif_skb_features(skb);
153 	segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
154 	if (IS_ERR_OR_NULL(segs)) {
155 		kfree_skb(skb);
156 		return -ENOMEM;
157 	}
158 
159 	consume_skb(skb);
160 
161 	skb_list_walk_safe(segs, segs, nskb) {
162 		int err;
163 
164 		skb_mark_not_on_list(segs);
165 		err = ip6_fragment(net, sk, segs, ip6_finish_output2);
166 		if (err && ret == 0)
167 			ret = err;
168 	}
169 
170 	return ret;
171 }
172 
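/* Choose the transmit path: restart dst_output() when a post-SNAT xfrm
 * policy rerouted the packet, use the GSO slow path or ip6_fragment() when
 * the packet exceeds the MTU constraints, otherwise hand it straight to
 * ip6_finish_output2().
 */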
173 static int __ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
174 {
175 	unsigned int mtu;
176 
177 #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
178 	/* Policy lookup after SNAT yielded a new policy */
179 	if (skb_dst(skb)->xfrm) {
180 		IP6CB(skb)->flags |= IP6SKB_REROUTED;
181 		return dst_output(net, sk, skb);
182 	}
183 #endif
184 
185 	mtu = ip6_skb_dst_mtu(skb);
186 	if (skb_is_gso(skb) &&
187 	    !(IP6CB(skb)->flags & IP6SKB_FAKEJUMBO) &&
188 	    !skb_gso_validate_network_len(skb, mtu))
189 		return ip6_finish_output_gso_slowpath_drop(net, sk, skb, mtu);
190 
191 	if ((skb->len > mtu && !skb_is_gso(skb)) ||
192 	    dst_allfrag(skb_dst(skb)) ||
193 	    (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
194 		return ip6_fragment(net, sk, skb, ip6_finish_output2);
195 	else
196 		return ip6_finish_output2(net, sk, skb);
197 }
198 
199 static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
200 {
201 	int ret;
202 
203 	ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
204 	switch (ret) {
205 	case NET_XMIT_SUCCESS:
206 	case NET_XMIT_CN:
207 		return __ip6_finish_output(net, sk, skb) ? : ret;
208 	default:
209 		kfree_skb_reason(skb, SKB_DROP_REASON_BPF_CGROUP_EGRESS);
210 		return ret;
211 	}
212 }
213 
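/* dst_output() entry point for IPv6 packets leaving this host: set the
 * outgoing device and protocol, drop the packet if IPv6 is disabled on the
 * egress device, then run the NF_INET_POST_ROUTING hook (skipped for
 * packets already rerouted by netfilter) before ip6_finish_output().
 */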
214 int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
215 {
216 	struct net_device *dev = skb_dst(skb)->dev, *indev = skb->dev;
217 	struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
218 
219 	skb->protocol = htons(ETH_P_IPV6);
220 	skb->dev = dev;
221 
222 	if (unlikely(idev->cnf.disable_ipv6)) {
223 		IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
224 		kfree_skb_reason(skb, SKB_DROP_REASON_IPV6DISABLED);
225 		return 0;
226 	}
227 
228 	return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
229 			    net, sk, skb, indev, dev,
230 			    ip6_finish_output,
231 			    !(IP6CB(skb)->flags & IP6SKB_REROUTED));
232 }
233 EXPORT_SYMBOL(ip6_output);
234 
235 bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np)
236 {
237 	if (!np->autoflowlabel_set)
238 		return ip6_default_np_autolabel(net);
239 	else
240 		return np->autoflowlabel;
241 }
242 
243 /*
244  * xmit an sk_buff (used by TCP, SCTP and DCCP)
245  * Note: the socket lock is not held for SYNACK packets, but the socket may
246  * still be modified by calls to skb_set_owner_w() and ipv6_local_error(),
247  * which use proper atomic operations or spinlocks.
248  */
249 int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
250 	     __u32 mark, struct ipv6_txoptions *opt, int tclass, u32 priority)
251 {
252 	struct net *net = sock_net(sk);
253 	const struct ipv6_pinfo *np = inet6_sk(sk);
254 	struct in6_addr *first_hop = &fl6->daddr;
255 	struct dst_entry *dst = skb_dst(skb);
256 	struct net_device *dev = dst->dev;
257 	struct inet6_dev *idev = ip6_dst_idev(dst);
258 	struct hop_jumbo_hdr *hop_jumbo;
259 	int hoplen = sizeof(*hop_jumbo);
260 	unsigned int head_room;
261 	struct ipv6hdr *hdr;
262 	u8  proto = fl6->flowi6_proto;
263 	int seg_len = skb->len;
264 	int hlimit = -1;
265 	u32 mtu;
266 
267 	head_room = sizeof(struct ipv6hdr) + hoplen + LL_RESERVED_SPACE(dev);
268 	if (opt)
269 		head_room += opt->opt_nflen + opt->opt_flen;
270 
271 	if (unlikely(head_room > skb_headroom(skb))) {
272 		skb = skb_expand_head(skb, head_room);
273 		if (!skb) {
274 			IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
275 			return -ENOBUFS;
276 		}
277 	}
278 
279 	if (opt) {
280 		seg_len += opt->opt_nflen + opt->opt_flen;
281 
282 		if (opt->opt_flen)
283 			ipv6_push_frag_opts(skb, opt, &proto);
284 
285 		if (opt->opt_nflen)
286 			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
287 					     &fl6->saddr);
288 	}
289 
290 	if (unlikely(seg_len > IPV6_MAXPLEN)) {
291 		hop_jumbo = skb_push(skb, hoplen);
292 
293 		hop_jumbo->nexthdr = proto;
294 		hop_jumbo->hdrlen = 0;
295 		hop_jumbo->tlv_type = IPV6_TLV_JUMBO;
296 		hop_jumbo->tlv_len = 4;
297 		hop_jumbo->jumbo_payload_len = htonl(seg_len + hoplen);
298 
299 		proto = IPPROTO_HOPOPTS;
300 		seg_len = 0;
301 		IP6CB(skb)->flags |= IP6SKB_FAKEJUMBO;
302 	}
303 
304 	skb_push(skb, sizeof(struct ipv6hdr));
305 	skb_reset_network_header(skb);
306 	hdr = ipv6_hdr(skb);
307 
308 	/*
309 	 *	Fill in the IPv6 header
310 	 */
311 	if (np)
312 		hlimit = np->hop_limit;
313 	if (hlimit < 0)
314 		hlimit = ip6_dst_hoplimit(dst);
315 
316 	ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
317 				ip6_autoflowlabel(net, np), fl6));
318 
319 	hdr->payload_len = htons(seg_len);
320 	hdr->nexthdr = proto;
321 	hdr->hop_limit = hlimit;
322 
323 	hdr->saddr = fl6->saddr;
324 	hdr->daddr = *first_hop;
325 
326 	skb->protocol = htons(ETH_P_IPV6);
327 	skb->priority = priority;
328 	skb->mark = mark;
329 
330 	mtu = dst_mtu(dst);
331 	if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
332 		IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUT, skb->len);
333 
334 		/* if egress device is enslaved to an L3 master device pass the
335 		 * skb to its handler for processing
336 		 */
337 		skb = l3mdev_ip6_out((struct sock *)sk, skb);
338 		if (unlikely(!skb))
339 			return 0;
340 
341 		/* hooks should never assume socket lock is held.
342 		 * we promote our socket to non const
343 		 */
344 		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
345 			       net, (struct sock *)sk, skb, NULL, dev,
346 			       dst_output);
347 	}
348 
349 	skb->dev = dev;
350 	/* ipv6_local_error() does not require socket lock,
351 	 * we promote our socket to non const
352 	 */
353 	ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);
354 
355 	IP6_INC_STATS(net, idev, IPSTATS_MIB_FRAGFAILS);
356 	kfree_skb(skb);
357 	return -EMSGSIZE;
358 }
359 EXPORT_SYMBOL(ip6_xmit);
360 
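/* Deliver a Router Alert packet to every raw socket on the ip6_ra_chain
 * whose alert value matches.  Earlier matches receive a clone; the last
 * match consumes the original skb.  Returns 1 if the packet was delivered
 * to at least one socket, 0 otherwise.
 */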
361 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
362 {
363 	struct ip6_ra_chain *ra;
364 	struct sock *last = NULL;
365 
366 	read_lock(&ip6_ra_lock);
367 	for (ra = ip6_ra_chain; ra; ra = ra->next) {
368 		struct sock *sk = ra->sk;
369 		if (sk && ra->sel == sel &&
370 		    (!sk->sk_bound_dev_if ||
371 		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
372 			struct ipv6_pinfo *np = inet6_sk(sk);
373 
374 			if (np && np->rtalert_isolate &&
375 			    !net_eq(sock_net(sk), dev_net(skb->dev))) {
376 				continue;
377 			}
378 			if (last) {
379 				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
380 				if (skb2)
381 					rawv6_rcv(last, skb2);
382 			}
383 			last = sk;
384 		}
385 	}
386 
387 	if (last) {
388 		rawv6_rcv(last, skb);
389 		read_unlock(&ip6_ra_lock);
390 		return 1;
391 	}
392 	read_unlock(&ip6_ra_lock);
393 	return 0;
394 }
395 
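/* Decide how to treat a packet that matched a proxy-NDP entry: return 1
 * when it is a neighbour discovery ICMPv6 message that must be handled
 * locally, -1 when the destination is link-local (signal link failure and
 * drop), and 0 to continue forwarding.
 */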
396 static int ip6_forward_proxy_check(struct sk_buff *skb)
397 {
398 	struct ipv6hdr *hdr = ipv6_hdr(skb);
399 	u8 nexthdr = hdr->nexthdr;
400 	__be16 frag_off;
401 	int offset;
402 
403 	if (ipv6_ext_hdr(nexthdr)) {
404 		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
405 		if (offset < 0)
406 			return 0;
407 	} else
408 		offset = sizeof(struct ipv6hdr);
409 
410 	if (nexthdr == IPPROTO_ICMPV6) {
411 		struct icmp6hdr *icmp6;
412 
413 		if (!pskb_may_pull(skb, (skb_network_header(skb) +
414 					 offset + 1 - skb->data)))
415 			return 0;
416 
417 		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
418 
419 		switch (icmp6->icmp6_type) {
420 		case NDISC_ROUTER_SOLICITATION:
421 		case NDISC_ROUTER_ADVERTISEMENT:
422 		case NDISC_NEIGHBOUR_SOLICITATION:
423 		case NDISC_NEIGHBOUR_ADVERTISEMENT:
424 		case NDISC_REDIRECT:
425 			/* Unicast neighbour discovery messages destined to
426 			 * the proxied address are passed to the input
427 			 * function.
428 			 */
429 			return 1;
430 		default:
431 			break;
432 		}
433 	}
434 
435 	/*
436 	 * The proxying router can't forward traffic sent to a link-local
437 	 * address, so signal the sender and discard the packet. This
438 	 * behavior is clarified by the MIPv6 specification.
439 	 */
440 	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
441 		dst_link_failure(skb);
442 		return -1;
443 	}
444 
445 	return 0;
446 }
447 
448 static inline int ip6_forward_finish(struct net *net, struct sock *sk,
449 				     struct sk_buff *skb)
450 {
451 	struct dst_entry *dst = skb_dst(skb);
452 
453 	__IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
454 	__IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
455 
456 #ifdef CONFIG_NET_SWITCHDEV
457 	if (skb->offload_l3_fwd_mark) {
458 		consume_skb(skb);
459 		return 0;
460 	}
461 #endif
462 
463 	skb_clear_tstamp(skb);
464 	return dst_output(net, sk, skb);
465 }
466 
467 static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
468 {
469 	if (skb->len <= mtu)
470 		return false;
471 
472 	/* ipv6 conntrack defrag sets max_frag_size + ignore_df */
473 	if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
474 		return true;
475 
476 	if (skb->ignore_df)
477 		return false;
478 
479 	if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
480 		return false;
481 
482 	return true;
483 }
484 
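/* Forward a packet that is not destined to this host: check that forwarding
 * is enabled, hand Router Alert packets to ip6_call_ra_chain(), generate
 * ICMPv6 errors for expired hop limits and oversized packets, optionally
 * send redirects, then decrement the hop limit and pass the skb through the
 * NF_INET_FORWARD hook to ip6_forward_finish().
 */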
485 int ip6_forward(struct sk_buff *skb)
486 {
487 	struct dst_entry *dst = skb_dst(skb);
488 	struct ipv6hdr *hdr = ipv6_hdr(skb);
489 	struct inet6_skb_parm *opt = IP6CB(skb);
490 	struct net *net = dev_net(dst->dev);
491 	struct inet6_dev *idev;
492 	SKB_DR(reason);
493 	u32 mtu;
494 
495 	idev = __in6_dev_get_safely(dev_get_by_index_rcu(net, IP6CB(skb)->iif));
496 	if (net->ipv6.devconf_all->forwarding == 0)
497 		goto error;
498 
499 	if (skb->pkt_type != PACKET_HOST)
500 		goto drop;
501 
502 	if (unlikely(skb->sk))
503 		goto drop;
504 
505 	if (skb_warn_if_lro(skb))
506 		goto drop;
507 
508 	if (!net->ipv6.devconf_all->disable_policy &&
509 	    (!idev || !idev->cnf.disable_policy) &&
510 	    !xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
511 		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
512 		goto drop;
513 	}
514 
515 	skb_forward_csum(skb);
516 
517 	/*
518 	 *	We DO NOT do any processing on RA packets;
519 	 *	we push them to user level AS IS,
520 	 *	without any warranty that the application will be able
521 	 *	to interpret them. The reason is that we
522 	 *	cannot do anything clever here.
523 	 *
524 	 *	We are not the end node, so if the packet contains
525 	 *	AH/ESP we cannot do anything with it.
526 	 *	Defragmentation would also be a mistake: RA packets
527 	 *	cannot be fragmented, because there is no guarantee
528 	 *	that different fragments will follow the same path. --ANK
529 	 */
530 	if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
531 		if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
532 			return 0;
533 	}
534 
535 	/*
536 	 *	check and decrement ttl
537 	 */
538 	if (hdr->hop_limit <= 1) {
539 		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
540 		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
541 
542 		kfree_skb_reason(skb, SKB_DROP_REASON_IP_INHDR);
543 		return -ETIMEDOUT;
544 	}
545 
546 	/* XXX: idev->cnf.proxy_ndp? */
547 	if (net->ipv6.devconf_all->proxy_ndp &&
548 	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
549 		int proxied = ip6_forward_proxy_check(skb);
550 		if (proxied > 0) {
551 			/* It's tempting to decrease the hop limit
552 			 * here by 1, as we do at the end of the
553 			 * function too.
554 			 *
555 			 * But that would be incorrect, as proxying is
556 			 * not forwarding.  The ip6_input function
557 			 * will handle this packet locally, and it
558 			 * depends on the hop limit being unchanged.
559 			 *
560 			 * One example is the NDP hop limit, which
561 			 * always has to stay 255; another would be the
562 			 * similar checks around RA packets, where the
563 			 * user can even change the desired limit.
564 			 */
565 			return ip6_input(skb);
566 		} else if (proxied < 0) {
567 			__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
568 			goto drop;
569 		}
570 	}
571 
572 	if (!xfrm6_route_forward(skb)) {
573 		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
574 		SKB_DR_SET(reason, XFRM_POLICY);
575 		goto drop;
576 	}
577 	dst = skb_dst(skb);
578 
579 	/* IPv6 specs say nothing about it, but it is clear that we cannot
580 	 * send redirects to source routed frames.
581 	 * We don't send redirects to frames decapsulated from IPsec.
582 	 */
583 	if (IP6CB(skb)->iif == dst->dev->ifindex &&
584 	    opt->srcrt == 0 && !skb_sec_path(skb)) {
585 		struct in6_addr *target = NULL;
586 		struct inet_peer *peer;
587 		struct rt6_info *rt;
588 
589 		/*
590 		 *	incoming and outgoing devices are the same;
591 		 *	send a redirect.
592 		 */
593 
594 		rt = (struct rt6_info *) dst;
595 		if (rt->rt6i_flags & RTF_GATEWAY)
596 			target = &rt->rt6i_gateway;
597 		else
598 			target = &hdr->daddr;
599 
600 		peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);
601 
602 		/* Limit redirects both by destination (here)
603 		   and by source (inside ndisc_send_redirect)
604 		 */
605 		if (inet_peer_xrlim_allow(peer, 1*HZ))
606 			ndisc_send_redirect(skb, target);
607 		if (peer)
608 			inet_putpeer(peer);
609 	} else {
610 		int addrtype = ipv6_addr_type(&hdr->saddr);
611 
612 		/* This check is security critical. */
613 		if (addrtype == IPV6_ADDR_ANY ||
614 		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
615 			goto error;
616 		if (addrtype & IPV6_ADDR_LINKLOCAL) {
617 			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
618 				    ICMPV6_NOT_NEIGHBOUR, 0);
619 			goto error;
620 		}
621 	}
622 
623 	mtu = ip6_dst_mtu_maybe_forward(dst, true);
624 	if (mtu < IPV6_MIN_MTU)
625 		mtu = IPV6_MIN_MTU;
626 
627 	if (ip6_pkt_too_big(skb, mtu)) {
628 		/* Again, force OUTPUT device used as source address */
629 		skb->dev = dst->dev;
630 		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
631 		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INTOOBIGERRORS);
632 		__IP6_INC_STATS(net, ip6_dst_idev(dst),
633 				IPSTATS_MIB_FRAGFAILS);
634 		kfree_skb_reason(skb, SKB_DROP_REASON_PKT_TOO_BIG);
635 		return -EMSGSIZE;
636 	}
637 
638 	if (skb_cow(skb, dst->dev->hard_header_len)) {
639 		__IP6_INC_STATS(net, ip6_dst_idev(dst),
640 				IPSTATS_MIB_OUTDISCARDS);
641 		goto drop;
642 	}
643 
644 	hdr = ipv6_hdr(skb);
645 
646 	/* Decrementing the hop limit is delayed until after the skb COW */
647 
648 	hdr->hop_limit--;
649 
650 	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
651 		       net, NULL, skb, skb->dev, dst->dev,
652 		       ip6_forward_finish);
653 
654 error:
655 	__IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
656 	SKB_DR_SET(reason, IP_INADDRERRORS);
657 drop:
658 	kfree_skb_reason(skb, reason);
659 	return -EINVAL;
660 }
661 
662 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
663 {
664 	to->pkt_type = from->pkt_type;
665 	to->priority = from->priority;
666 	to->protocol = from->protocol;
667 	skb_dst_drop(to);
668 	skb_dst_set(to, dst_clone(skb_dst(from)));
669 	to->dev = from->dev;
670 	to->mark = from->mark;
671 
672 	skb_copy_hash(to, from);
673 
674 #ifdef CONFIG_NET_SCHED
675 	to->tc_index = from->tc_index;
676 #endif
677 	nf_copy(to, from);
678 	skb_ext_copy(to, from);
679 	skb_copy_secmark(to, from);
680 }
681 
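/* Fast-path fragmentation setup for an skb that already carries a
 * frag_list: duplicate the network headers, insert a fragment header into
 * the first skb and initialise the iterator consumed by
 * ip6_fraglist_prepare()/ip6_fraglist_next().
 */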
682 int ip6_fraglist_init(struct sk_buff *skb, unsigned int hlen, u8 *prevhdr,
683 		      u8 nexthdr, __be32 frag_id,
684 		      struct ip6_fraglist_iter *iter)
685 {
686 	unsigned int first_len;
687 	struct frag_hdr *fh;
688 
689 	/* BUILD HEADER */
690 	*prevhdr = NEXTHDR_FRAGMENT;
691 	iter->tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
692 	if (!iter->tmp_hdr)
693 		return -ENOMEM;
694 
695 	iter->frag = skb_shinfo(skb)->frag_list;
696 	skb_frag_list_init(skb);
697 
698 	iter->offset = 0;
699 	iter->hlen = hlen;
700 	iter->frag_id = frag_id;
701 	iter->nexthdr = nexthdr;
702 
703 	__skb_pull(skb, hlen);
704 	fh = __skb_push(skb, sizeof(struct frag_hdr));
705 	__skb_push(skb, hlen);
706 	skb_reset_network_header(skb);
707 	memcpy(skb_network_header(skb), iter->tmp_hdr, hlen);
708 
709 	fh->nexthdr = nexthdr;
710 	fh->reserved = 0;
711 	fh->frag_off = htons(IP6_MF);
712 	fh->identification = frag_id;
713 
714 	first_len = skb_pagelen(skb);
715 	skb->data_len = first_len - skb_headlen(skb);
716 	skb->len = first_len;
717 	ipv6_hdr(skb)->payload_len = htons(first_len - sizeof(struct ipv6hdr));
718 
719 	return 0;
720 }
721 EXPORT_SYMBOL(ip6_fraglist_init);
722 
723 void ip6_fraglist_prepare(struct sk_buff *skb,
724 			  struct ip6_fraglist_iter *iter)
725 {
726 	struct sk_buff *frag = iter->frag;
727 	unsigned int hlen = iter->hlen;
728 	struct frag_hdr *fh;
729 
730 	frag->ip_summed = CHECKSUM_NONE;
731 	skb_reset_transport_header(frag);
732 	fh = __skb_push(frag, sizeof(struct frag_hdr));
733 	__skb_push(frag, hlen);
734 	skb_reset_network_header(frag);
735 	memcpy(skb_network_header(frag), iter->tmp_hdr, hlen);
736 	iter->offset += skb->len - hlen - sizeof(struct frag_hdr);
737 	fh->nexthdr = iter->nexthdr;
738 	fh->reserved = 0;
739 	fh->frag_off = htons(iter->offset);
740 	if (frag->next)
741 		fh->frag_off |= htons(IP6_MF);
742 	fh->identification = iter->frag_id;
743 	ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
744 	ip6_copy_metadata(frag, skb);
745 }
746 EXPORT_SYMBOL(ip6_fraglist_prepare);
747 
748 void ip6_frag_init(struct sk_buff *skb, unsigned int hlen, unsigned int mtu,
749 		   unsigned short needed_tailroom, int hdr_room, u8 *prevhdr,
750 		   u8 nexthdr, __be32 frag_id, struct ip6_frag_state *state)
751 {
752 	state->prevhdr = prevhdr;
753 	state->nexthdr = nexthdr;
754 	state->frag_id = frag_id;
755 
756 	state->hlen = hlen;
757 	state->mtu = mtu;
758 
759 	state->left = skb->len - hlen;	/* Space per frame */
760 	state->ptr = hlen;		/* Where to start from */
761 
762 	state->hroom = hdr_room;
763 	state->troom = needed_tailroom;
764 
765 	state->offset = 0;
766 }
767 EXPORT_SYMBOL(ip6_frag_init);
768 
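/* Slow-path fragmentation: allocate the next fragment, copy the original
 * network headers and up to state->mtu bytes of payload into it, and fill
 * in the fragment header (offset, MF flag, identification).
 */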
769 struct sk_buff *ip6_frag_next(struct sk_buff *skb, struct ip6_frag_state *state)
770 {
771 	u8 *prevhdr = state->prevhdr, *fragnexthdr_offset;
772 	struct sk_buff *frag;
773 	struct frag_hdr *fh;
774 	unsigned int len;
775 
776 	len = state->left;
777 	/* IF: it doesn't fit, use 'mtu' - the data space left */
778 	if (len > state->mtu)
779 		len = state->mtu;
780 	/* IF: we are not sending up to and including the packet end
781 	   then align the next start on an eight byte boundary */
782 	if (len < state->left)
783 		len &= ~7;
784 
785 	/* Allocate buffer */
786 	frag = alloc_skb(len + state->hlen + sizeof(struct frag_hdr) +
787 			 state->hroom + state->troom, GFP_ATOMIC);
788 	if (!frag)
789 		return ERR_PTR(-ENOMEM);
790 
791 	/*
792 	 *	Set up data on packet
793 	 */
794 
795 	ip6_copy_metadata(frag, skb);
796 	skb_reserve(frag, state->hroom);
797 	skb_put(frag, len + state->hlen + sizeof(struct frag_hdr));
798 	skb_reset_network_header(frag);
799 	fh = (struct frag_hdr *)(skb_network_header(frag) + state->hlen);
800 	frag->transport_header = (frag->network_header + state->hlen +
801 				  sizeof(struct frag_hdr));
802 
803 	/*
804 	 *	Charge the memory for the fragment to any owner
805 	 *	it might possess
806 	 */
807 	if (skb->sk)
808 		skb_set_owner_w(frag, skb->sk);
809 
810 	/*
811 	 *	Copy the packet header into the new buffer.
812 	 */
813 	skb_copy_from_linear_data(skb, skb_network_header(frag), state->hlen);
814 
815 	fragnexthdr_offset = skb_network_header(frag);
816 	fragnexthdr_offset += prevhdr - skb_network_header(skb);
817 	*fragnexthdr_offset = NEXTHDR_FRAGMENT;
818 
819 	/*
820 	 *	Build fragment header.
821 	 */
822 	fh->nexthdr = state->nexthdr;
823 	fh->reserved = 0;
824 	fh->identification = state->frag_id;
825 
826 	/*
827 	 *	Copy a block of the IP datagram.
828 	 */
829 	BUG_ON(skb_copy_bits(skb, state->ptr, skb_transport_header(frag),
830 			     len));
831 	state->left -= len;
832 
833 	fh->frag_off = htons(state->offset);
834 	if (state->left > 0)
835 		fh->frag_off |= htons(IP6_MF);
836 	ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
837 
838 	state->ptr += len;
839 	state->offset += len;
840 
841 	return frag;
842 }
843 EXPORT_SYMBOL(ip6_frag_next);
844 
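/* Fragment an IPv6 packet that exceeds the path MTU.  Use the fast path
 * (reusing the existing frag_list) when its geometry allows, otherwise fall
 * back to the slow path that copies the payload into freshly allocated
 * fragments; every fragment is passed to @output.
 */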
845 int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
846 		 int (*output)(struct net *, struct sock *, struct sk_buff *))
847 {
848 	struct sk_buff *frag;
849 	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
850 	struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
851 				inet6_sk(skb->sk) : NULL;
852 	bool mono_delivery_time = skb->mono_delivery_time;
853 	struct ip6_frag_state state;
854 	unsigned int mtu, hlen, nexthdr_offset;
855 	ktime_t tstamp = skb->tstamp;
856 	int hroom, err = 0;
857 	__be32 frag_id;
858 	u8 *prevhdr, nexthdr = 0;
859 
860 	err = ip6_find_1stfragopt(skb, &prevhdr);
861 	if (err < 0)
862 		goto fail;
863 	hlen = err;
864 	nexthdr = *prevhdr;
865 	nexthdr_offset = prevhdr - skb_network_header(skb);
866 
867 	mtu = ip6_skb_dst_mtu(skb);
868 
869 	/* We must not fragment if the socket is set to force MTU discovery
870 	 * or if the skb is not generated by a local socket.
871 	 */
872 	if (unlikely(!skb->ignore_df && skb->len > mtu))
873 		goto fail_toobig;
874 
875 	if (IP6CB(skb)->frag_max_size) {
876 		if (IP6CB(skb)->frag_max_size > mtu)
877 			goto fail_toobig;
878 
879 		/* don't send fragments larger than what we received */
880 		mtu = IP6CB(skb)->frag_max_size;
881 		if (mtu < IPV6_MIN_MTU)
882 			mtu = IPV6_MIN_MTU;
883 	}
884 
885 	if (np && np->frag_size < mtu) {
886 		if (np->frag_size)
887 			mtu = np->frag_size;
888 	}
889 	if (mtu < hlen + sizeof(struct frag_hdr) + 8)
890 		goto fail_toobig;
891 	mtu -= hlen + sizeof(struct frag_hdr);
892 
893 	frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
894 				    &ipv6_hdr(skb)->saddr);
895 
896 	if (skb->ip_summed == CHECKSUM_PARTIAL &&
897 	    (err = skb_checksum_help(skb)))
898 		goto fail;
899 
900 	prevhdr = skb_network_header(skb) + nexthdr_offset;
901 	hroom = LL_RESERVED_SPACE(rt->dst.dev);
902 	if (skb_has_frag_list(skb)) {
903 		unsigned int first_len = skb_pagelen(skb);
904 		struct ip6_fraglist_iter iter;
905 		struct sk_buff *frag2;
906 
907 		if (first_len - hlen > mtu ||
908 		    ((first_len - hlen) & 7) ||
909 		    skb_cloned(skb) ||
910 		    skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
911 			goto slow_path;
912 
913 		skb_walk_frags(skb, frag) {
914 			/* Correct geometry. */
915 			if (frag->len > mtu ||
916 			    ((frag->len & 7) && frag->next) ||
917 			    skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
918 				goto slow_path_clean;
919 
920 			/* Partially cloned skb? */
921 			if (skb_shared(frag))
922 				goto slow_path_clean;
923 
924 			BUG_ON(frag->sk);
925 			if (skb->sk) {
926 				frag->sk = skb->sk;
927 				frag->destructor = sock_wfree;
928 			}
929 			skb->truesize -= frag->truesize;
930 		}
931 
932 		err = ip6_fraglist_init(skb, hlen, prevhdr, nexthdr, frag_id,
933 					&iter);
934 		if (err < 0)
935 			goto fail;
936 
937 		/* We prevent @rt from being freed. */
938 		rcu_read_lock();
939 
940 		for (;;) {
941 			/* Prepare the header of the next frame
942 			 * before the previous one is sent. */
943 			if (iter.frag)
944 				ip6_fraglist_prepare(skb, &iter);
945 
946 			skb_set_delivery_time(skb, tstamp, mono_delivery_time);
947 			err = output(net, sk, skb);
948 			if (!err)
949 				IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
950 					      IPSTATS_MIB_FRAGCREATES);
951 
952 			if (err || !iter.frag)
953 				break;
954 
955 			skb = ip6_fraglist_next(&iter);
956 		}
957 
958 		kfree(iter.tmp_hdr);
959 
960 		if (err == 0) {
961 			IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
962 				      IPSTATS_MIB_FRAGOKS);
963 			rcu_read_unlock();
964 			return 0;
965 		}
966 
967 		kfree_skb_list(iter.frag);
968 
969 		IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
970 			      IPSTATS_MIB_FRAGFAILS);
971 		rcu_read_unlock();
972 		return err;
973 
974 slow_path_clean:
975 		skb_walk_frags(skb, frag2) {
976 			if (frag2 == frag)
977 				break;
978 			frag2->sk = NULL;
979 			frag2->destructor = NULL;
980 			skb->truesize += frag2->truesize;
981 		}
982 	}
983 
984 slow_path:
985 	/*
986 	 *	Fragment the datagram.
987 	 */
988 
989 	ip6_frag_init(skb, hlen, mtu, rt->dst.dev->needed_tailroom,
990 		      LL_RESERVED_SPACE(rt->dst.dev), prevhdr, nexthdr, frag_id,
991 		      &state);
992 
993 	/*
994 	 *	Keep copying data until we run out.
995 	 */
996 
997 	while (state.left > 0) {
998 		frag = ip6_frag_next(skb, &state);
999 		if (IS_ERR(frag)) {
1000 			err = PTR_ERR(frag);
1001 			goto fail;
1002 		}
1003 
1004 		/*
1005 		 *	Put this fragment into the sending queue.
1006 		 */
1007 		skb_set_delivery_time(frag, tstamp, mono_delivery_time);
1008 		err = output(net, sk, frag);
1009 		if (err)
1010 			goto fail;
1011 
1012 		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
1013 			      IPSTATS_MIB_FRAGCREATES);
1014 	}
1015 	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
1016 		      IPSTATS_MIB_FRAGOKS);
1017 	consume_skb(skb);
1018 	return err;
1019 
1020 fail_toobig:
1021 	if (skb->sk && dst_allfrag(skb_dst(skb)))
1022 		sk_gso_disable(skb->sk);
1023 
1024 	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
1025 	err = -EMSGSIZE;
1026 
1027 fail:
1028 	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
1029 		      IPSTATS_MIB_FRAGFAILS);
1030 	kfree_skb(skb);
1031 	return err;
1032 }
1033 
1034 static inline int ip6_rt_check(const struct rt6key *rt_key,
1035 			       const struct in6_addr *fl_addr,
1036 			       const struct in6_addr *addr_cache)
1037 {
1038 	return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
1039 		(!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
1040 }
1041 
1042 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
1043 					  struct dst_entry *dst,
1044 					  const struct flowi6 *fl6)
1045 {
1046 	struct ipv6_pinfo *np = inet6_sk(sk);
1047 	struct rt6_info *rt;
1048 
1049 	if (!dst)
1050 		goto out;
1051 
1052 	if (dst->ops->family != AF_INET6) {
1053 		dst_release(dst);
1054 		return NULL;
1055 	}
1056 
1057 	rt = (struct rt6_info *)dst;
1058 	/* Yes, checking route validity in the not-connected
1059 	 * case is not very simple. Take into account
1060 	 * that we do not support routing by source, TOS,
1061 	 * and MSG_DONTROUTE		--ANK (980726)
1062 	 *
1063 	 * 1. ip6_rt_check(): If the route was a host route,
1064 	 *    check that the cached destination is current.
1065 	 *    If it is a network route, we may still
1066 	 *    check its validity using a saved pointer
1067 	 *    to the last used address: daddr_cache.
1068 	 *    We do not want to save the whole address now,
1069 	 *    (because the main consumer of this service
1070 	 *    is TCP, which does not have this problem),
1071 	 *    so the last trick works only on connected
1072 	 *    sockets.
1073 	 * 2. oif should also be the same.
1074 	 */
1075 	if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
1076 #ifdef CONFIG_IPV6_SUBTREES
1077 	    ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
1078 #endif
1079 	   (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) {
1080 		dst_release(dst);
1081 		dst = NULL;
1082 	}
1083 
1084 out:
1085 	return dst;
1086 }
1087 
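/* Resolve a route for @fl6, selecting a source address first when the
 * caller left it unspecified, and (with CONFIG_IPV6_OPTIMISTIC_DAD) falling
 * back to the default router's dst when the chosen source is optimistic and
 * the looked-up nexthop neighbour is not yet valid.
 */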
1088 static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
1089 			       struct dst_entry **dst, struct flowi6 *fl6)
1090 {
1091 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
1092 	struct neighbour *n;
1093 	struct rt6_info *rt;
1094 #endif
1095 	int err;
1096 	int flags = 0;
1097 
1098 	/* The correct way to handle this would be to do
1099 	 * ip6_route_get_saddr, and then ip6_route_output; however,
1100 	 * the route-specific preferred source forces the
1101 	 * ip6_route_output call _before_ ip6_route_get_saddr.
1102 	 *
1103 	 * In source specific routing (no src=any default route),
1104 	 * ip6_route_output will fail given src=any saddr, though, so
1105 	 * that's why we try it again later.
1106 	 */
1107 	if (ipv6_addr_any(&fl6->saddr)) {
1108 		struct fib6_info *from;
1109 		struct rt6_info *rt;
1110 
1111 		*dst = ip6_route_output(net, sk, fl6);
1112 		rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;
1113 
1114 		rcu_read_lock();
1115 		from = rt ? rcu_dereference(rt->from) : NULL;
1116 		err = ip6_route_get_saddr(net, from, &fl6->daddr,
1117 					  sk ? inet6_sk(sk)->srcprefs : 0,
1118 					  &fl6->saddr);
1119 		rcu_read_unlock();
1120 
1121 		if (err)
1122 			goto out_err_release;
1123 
1124 		/* If we had an erroneous initial result, pretend it
1125 		 * never existed and let the SA-enabled version take
1126 		 * over.
1127 		 */
1128 		if ((*dst)->error) {
1129 			dst_release(*dst);
1130 			*dst = NULL;
1131 		}
1132 
1133 		if (fl6->flowi6_oif)
1134 			flags |= RT6_LOOKUP_F_IFACE;
1135 	}
1136 
1137 	if (!*dst)
1138 		*dst = ip6_route_output_flags(net, sk, fl6, flags);
1139 
1140 	err = (*dst)->error;
1141 	if (err)
1142 		goto out_err_release;
1143 
1144 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
1145 	/*
1146 	 * Here if the dst entry we've looked up
1147 	 * has a neighbour entry that is in the INCOMPLETE
1148 	 * state and the src address from the flow is
1149 	 * marked as OPTIMISTIC, we release the found
1150 	 * dst entry and replace it with the
1151 	 * dst entry of the nexthop router.
1152 	 */
1153 	rt = (struct rt6_info *) *dst;
1154 	rcu_read_lock();
1155 	n = __ipv6_neigh_lookup_noref(rt->dst.dev,
1156 				      rt6_nexthop(rt, &fl6->daddr));
1157 	err = n && !(READ_ONCE(n->nud_state) & NUD_VALID) ? -EINVAL : 0;
1158 	rcu_read_unlock();
1159 
1160 	if (err) {
1161 		struct inet6_ifaddr *ifp;
1162 		struct flowi6 fl_gw6;
1163 		int redirect;
1164 
1165 		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
1166 				      (*dst)->dev, 1);
1167 
1168 		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
1169 		if (ifp)
1170 			in6_ifa_put(ifp);
1171 
1172 		if (redirect) {
1173 			/*
1174 			 * We need to get the dst entry for the
1175 			 * default router instead
1176 			 */
1177 			dst_release(*dst);
1178 			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
1179 			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
1180 			*dst = ip6_route_output(net, sk, &fl_gw6);
1181 			err = (*dst)->error;
1182 			if (err)
1183 				goto out_err_release;
1184 		}
1185 	}
1186 #endif
1187 	if (ipv6_addr_v4mapped(&fl6->saddr) &&
1188 	    !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
1189 		err = -EAFNOSUPPORT;
1190 		goto out_err_release;
1191 	}
1192 
1193 	return 0;
1194 
1195 out_err_release:
1196 	dst_release(*dst);
1197 	*dst = NULL;
1198 
1199 	if (err == -ENETUNREACH)
1200 		IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
1201 	return err;
1202 }
1203 
1204 /**
1205  *	ip6_dst_lookup - perform route lookup on flow
1206  *	@net: Network namespace to perform lookup in
1207  *	@sk: socket which provides route info
1208  *	@dst: pointer to dst_entry * for result
1209  *	@fl6: flow to lookup
1210  *
1211  *	This function performs a route lookup on the given flow.
1212  *
1213  *	It returns zero on success, or a standard errno code on error.
1214  */
1215 int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
1216 		   struct flowi6 *fl6)
1217 {
1218 	*dst = NULL;
1219 	return ip6_dst_lookup_tail(net, sk, dst, fl6);
1220 }
1221 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1222 
1223 /**
1224  *	ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1225  *	@net: Network namespace to perform lookup in
1226  *	@sk: socket which provides route info
1227  *	@fl6: flow to lookup
1228  *	@final_dst: final destination address for ipsec lookup
1229  *
1230  *	This function performs a route lookup on the given flow.
1231  *
1232  *	It returns a valid dst pointer on success, or a pointer encoded
1233  *	error code.
1234  */
1235 struct dst_entry *ip6_dst_lookup_flow(struct net *net, const struct sock *sk, struct flowi6 *fl6,
1236 				      const struct in6_addr *final_dst)
1237 {
1238 	struct dst_entry *dst = NULL;
1239 	int err;
1240 
1241 	err = ip6_dst_lookup_tail(net, sk, &dst, fl6);
1242 	if (err)
1243 		return ERR_PTR(err);
1244 	if (final_dst)
1245 		fl6->daddr = *final_dst;
1246 
1247 	return xfrm_lookup_route(net, dst, flowi6_to_flowi(fl6), sk, 0);
1248 }
1249 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
1250 
1251 /**
1252  *	ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1253  *	@sk: socket which provides the dst cache and route info
1254  *	@fl6: flow to lookup
1255  *	@final_dst: final destination address for ipsec lookup
1256  *	@connected: whether @sk is connected or not
1257  *
1258  *	This function performs a route lookup on the given flow with the
1259  *	possibility of using the cached route in the socket if it is valid.
1260  *	It will take the socket dst lock when operating on the dst cache.
1261  *	As a result, this function can only be used in process context.
1262  *
1263  *	In addition, for a connected socket, cache the dst in the socket
1264  *	if the current cache is not valid.
1265  *
1266  *	It returns a valid dst pointer on success, or a pointer encoded
1267  *	error code.
1268  */
1269 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1270 					 const struct in6_addr *final_dst,
1271 					 bool connected)
1272 {
1273 	struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1274 
1275 	dst = ip6_sk_dst_check(sk, dst, fl6);
1276 	if (dst)
1277 		return dst;
1278 
1279 	dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_dst);
1280 	if (connected && !IS_ERR(dst))
1281 		ip6_sk_dst_store_flow(sk, dst_clone(dst), fl6);
1282 
1283 	return dst;
1284 }
1285 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1286 
1287 /**
1288  *      ip6_dst_lookup_tunnel - perform route lookup on tunnel
1289  *      @skb: Packet for which lookup is done
1290  *      @dev: Tunnel device
1291  *      @net: Network namespace of tunnel device
1292  *      @sock: Socket which provides route info
1293  *      @saddr: Memory to store the src ip address
1294  *      @info: Tunnel information
1295  *      @protocol: IP protocol
1296  *      @use_cache: Flag to enable cache usage
1297  *      This function performs a route lookup on a tunnel
1298  *
1299  *      This function performs a route lookup for a tunnel.
1300  *      It returns a valid dst pointer on success and stores the source address
1301  *      to be used by the tunnel in @saddr, else a pointer-encoded error code.
1302 
1303 struct dst_entry *ip6_dst_lookup_tunnel(struct sk_buff *skb,
1304 					struct net_device *dev,
1305 					struct net *net,
1306 					struct socket *sock,
1307 					struct in6_addr *saddr,
1308 					const struct ip_tunnel_info *info,
1309 					u8 protocol,
1310 					bool use_cache)
1311 {
1312 	struct dst_entry *dst = NULL;
1313 #ifdef CONFIG_DST_CACHE
1314 	struct dst_cache *dst_cache;
1315 #endif
1316 	struct flowi6 fl6;
1317 	__u8 prio;
1318 
1319 #ifdef CONFIG_DST_CACHE
1320 	dst_cache = (struct dst_cache *)&info->dst_cache;
1321 	if (use_cache) {
1322 		dst = dst_cache_get_ip6(dst_cache, saddr);
1323 		if (dst)
1324 			return dst;
1325 	}
1326 #endif
1327 	memset(&fl6, 0, sizeof(fl6));
1328 	fl6.flowi6_mark = skb->mark;
1329 	fl6.flowi6_proto = protocol;
1330 	fl6.daddr = info->key.u.ipv6.dst;
1331 	fl6.saddr = info->key.u.ipv6.src;
1332 	prio = info->key.tos;
1333 	fl6.flowlabel = ip6_make_flowinfo(prio, info->key.label);
1334 
1335 	dst = ipv6_stub->ipv6_dst_lookup_flow(net, sock->sk, &fl6,
1336 					      NULL);
1337 	if (IS_ERR(dst)) {
1338 		netdev_dbg(dev, "no route to %pI6\n", &fl6.daddr);
1339 		return ERR_PTR(-ENETUNREACH);
1340 	}
1341 	if (dst->dev == dev) { /* is this necessary? */
1342 		netdev_dbg(dev, "circular route to %pI6\n", &fl6.daddr);
1343 		dst_release(dst);
1344 		return ERR_PTR(-ELOOP);
1345 	}
1346 #ifdef CONFIG_DST_CACHE
1347 	if (use_cache)
1348 		dst_cache_set_ip6(dst_cache, dst, &fl6.saddr);
1349 #endif
1350 	*saddr = fl6.saddr;
1351 	return dst;
1352 }
1353 EXPORT_SYMBOL_GPL(ip6_dst_lookup_tunnel);
1354 
1355 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1356 					       gfp_t gfp)
1357 {
1358 	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1359 }
1360 
1361 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1362 						gfp_t gfp)
1363 {
1364 	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1365 }
1366 
1367 static void ip6_append_data_mtu(unsigned int *mtu,
1368 				int *maxfraglen,
1369 				unsigned int fragheaderlen,
1370 				struct sk_buff *skb,
1371 				struct rt6_info *rt,
1372 				unsigned int orig_mtu)
1373 {
1374 	if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1375 		if (!skb) {
1376 			/* first fragment, reserve header_len */
1377 			*mtu = orig_mtu - rt->dst.header_len;
1378 
1379 		} else {
1380 			/*
1381 			 * this fragment is not the first one, the header
1382 			 * space is regarded as data space.
1383 			 */
1384 			*mtu = orig_mtu;
1385 		}
1386 		*maxfraglen = ((*mtu - fragheaderlen) & ~7)
1387 			      + fragheaderlen - sizeof(struct frag_hdr);
1388 	}
1389 }
1390 
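/* Record the transmit options, MTU and other per-corking parameters in the
 * cork so that later ip6_append_data() calls and __ip6_make_skb() can run
 * without the original ipcm6 cookie.
 */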
1391 static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
1392 			  struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
1393 			  struct rt6_info *rt)
1394 {
1395 	struct ipv6_pinfo *np = inet6_sk(sk);
1396 	unsigned int mtu;
1397 	struct ipv6_txoptions *nopt, *opt = ipc6->opt;
1398 
1399 	/* callers pass dst together with a reference, set it first so
1400 	 * ip6_cork_release() can put it down even in case of an error.
1401 	 */
1402 	cork->base.dst = &rt->dst;
1403 
1404 	/*
1405 	 * setup for corking
1406 	 */
1407 	if (opt) {
1408 		if (WARN_ON(v6_cork->opt))
1409 			return -EINVAL;
1410 
1411 		nopt = v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
1412 		if (unlikely(!nopt))
1413 			return -ENOBUFS;
1414 
1415 		nopt->tot_len = sizeof(*opt);
1416 		nopt->opt_flen = opt->opt_flen;
1417 		nopt->opt_nflen = opt->opt_nflen;
1418 
1419 		nopt->dst0opt = ip6_opt_dup(opt->dst0opt, sk->sk_allocation);
1420 		if (opt->dst0opt && !nopt->dst0opt)
1421 			return -ENOBUFS;
1422 
1423 		nopt->dst1opt = ip6_opt_dup(opt->dst1opt, sk->sk_allocation);
1424 		if (opt->dst1opt && !nopt->dst1opt)
1425 			return -ENOBUFS;
1426 
1427 		nopt->hopopt = ip6_opt_dup(opt->hopopt, sk->sk_allocation);
1428 		if (opt->hopopt && !nopt->hopopt)
1429 			return -ENOBUFS;
1430 
1431 		nopt->srcrt = ip6_rthdr_dup(opt->srcrt, sk->sk_allocation);
1432 		if (opt->srcrt && !nopt->srcrt)
1433 			return -ENOBUFS;
1434 
1435 		/* need source address above. --miyazawa */
1436 	}
1437 	v6_cork->hop_limit = ipc6->hlimit;
1438 	v6_cork->tclass = ipc6->tclass;
1439 	if (rt->dst.flags & DST_XFRM_TUNNEL)
1440 		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1441 		      READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst);
1442 	else
1443 		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1444 			READ_ONCE(rt->dst.dev->mtu) : dst_mtu(xfrm_dst_path(&rt->dst));
1445 	if (np->frag_size < mtu) {
1446 		if (np->frag_size)
1447 			mtu = np->frag_size;
1448 	}
1449 	cork->base.fragsize = mtu;
1450 	cork->base.gso_size = ipc6->gso_size;
1451 	cork->base.tx_flags = 0;
1452 	cork->base.mark = ipc6->sockc.mark;
1453 	sock_tx_timestamp(sk, ipc6->sockc.tsflags, &cork->base.tx_flags);
1454 
1455 	if (dst_allfrag(xfrm_dst_path(&rt->dst)))
1456 		cork->base.flags |= IPCORK_ALLFRAG;
1457 	cork->base.length = 0;
1458 
1459 	cork->base.transmit_time = ipc6->sockc.transmit_time;
1460 
1461 	return 0;
1462 }
1463 
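/* Core of ip6_append_data(): append user data to the pending queue, either
 * by filling the tail skb or by allocating new buffers sized to the
 * (possibly GSO) MTU, while handling zerocopy, MSG_SPLICE_PAGES and the
 * choice of checksum offload.
 */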
1464 static int __ip6_append_data(struct sock *sk,
1465 			     struct sk_buff_head *queue,
1466 			     struct inet_cork_full *cork_full,
1467 			     struct inet6_cork *v6_cork,
1468 			     struct page_frag *pfrag,
1469 			     int getfrag(void *from, char *to, int offset,
1470 					 int len, int odd, struct sk_buff *skb),
1471 			     void *from, size_t length, int transhdrlen,
1472 			     unsigned int flags, struct ipcm6_cookie *ipc6)
1473 {
1474 	struct sk_buff *skb, *skb_prev = NULL;
1475 	struct inet_cork *cork = &cork_full->base;
1476 	struct flowi6 *fl6 = &cork_full->fl.u.ip6;
1477 	unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
1478 	struct ubuf_info *uarg = NULL;
1479 	int exthdrlen = 0;
1480 	int dst_exthdrlen = 0;
1481 	int hh_len;
1482 	int copy;
1483 	int err;
1484 	int offset = 0;
1485 	bool zc = false;
1486 	u32 tskey = 0;
1487 	struct rt6_info *rt = (struct rt6_info *)cork->dst;
1488 	struct ipv6_txoptions *opt = v6_cork->opt;
1489 	int csummode = CHECKSUM_NONE;
1490 	unsigned int maxnonfragsize, headersize;
1491 	unsigned int wmem_alloc_delta = 0;
1492 	bool paged, extra_uref = false;
1493 
1494 	skb = skb_peek_tail(queue);
1495 	if (!skb) {
1496 		exthdrlen = opt ? opt->opt_flen : 0;
1497 		dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
1498 	}
1499 
1500 	paged = !!cork->gso_size;
1501 	mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize;
1502 	orig_mtu = mtu;
1503 
1504 	if (cork->tx_flags & SKBTX_ANY_TSTAMP &&
1505 	    sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
1506 		tskey = atomic_inc_return(&sk->sk_tskey) - 1;
1507 
1508 	hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1509 
1510 	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1511 			(opt ? opt->opt_nflen : 0);
1512 
1513 	headersize = sizeof(struct ipv6hdr) +
1514 		     (opt ? opt->opt_flen + opt->opt_nflen : 0) +
1515 		     (dst_allfrag(&rt->dst) ?
1516 		      sizeof(struct frag_hdr) : 0) +
1517 		     rt->rt6i_nfheader_len;
1518 
1519 	if (mtu <= fragheaderlen ||
1520 	    ((mtu - fragheaderlen) & ~7) + fragheaderlen <= sizeof(struct frag_hdr))
1521 		goto emsgsize;
1522 
1523 	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
1524 		     sizeof(struct frag_hdr);
1525 
1526 	/* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit
1527 	 * in the first fragment
1528 	 */
1529 	if (headersize + transhdrlen > mtu)
1530 		goto emsgsize;
1531 
1532 	if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
1533 	    (sk->sk_protocol == IPPROTO_UDP ||
1534 	     sk->sk_protocol == IPPROTO_ICMPV6 ||
1535 	     sk->sk_protocol == IPPROTO_RAW)) {
1536 		ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
1537 				sizeof(struct ipv6hdr));
1538 		goto emsgsize;
1539 	}
1540 
1541 	if (ip6_sk_ignore_df(sk))
1542 		maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
1543 	else
1544 		maxnonfragsize = mtu;
1545 
1546 	if (cork->length + length > maxnonfragsize - headersize) {
1547 emsgsize:
1548 		pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0);
1549 		ipv6_local_error(sk, EMSGSIZE, fl6, pmtu);
1550 		return -EMSGSIZE;
1551 	}
1552 
1553 	/* CHECKSUM_PARTIAL only with no extension headers and when
1554 	 * we are not going to fragment
1555 	 */
1556 	if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
1557 	    headersize == sizeof(struct ipv6hdr) &&
1558 	    length <= mtu - headersize &&
1559 	    (!(flags & MSG_MORE) || cork->gso_size) &&
1560 	    rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
1561 		csummode = CHECKSUM_PARTIAL;
1562 
1563 	if ((flags & MSG_ZEROCOPY) && length) {
1564 		struct msghdr *msg = from;
1565 
1566 		if (getfrag == ip_generic_getfrag && msg->msg_ubuf) {
1567 			if (skb_zcopy(skb) && msg->msg_ubuf != skb_zcopy(skb))
1568 				return -EINVAL;
1569 
1570 			/* Leave uarg NULL if we can't do zerocopy, callers should
1571 			 * be able to handle it.
1572 			 */
1573 			if ((rt->dst.dev->features & NETIF_F_SG) &&
1574 			    csummode == CHECKSUM_PARTIAL) {
1575 				paged = true;
1576 				zc = true;
1577 				uarg = msg->msg_ubuf;
1578 			}
1579 		} else if (sock_flag(sk, SOCK_ZEROCOPY)) {
1580 			uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb));
1581 			if (!uarg)
1582 				return -ENOBUFS;
1583 			extra_uref = !skb_zcopy(skb);	/* only ref on new uarg */
1584 			if (rt->dst.dev->features & NETIF_F_SG &&
1585 			    csummode == CHECKSUM_PARTIAL) {
1586 				paged = true;
1587 				zc = true;
1588 			} else {
1589 				uarg_to_msgzc(uarg)->zerocopy = 0;
1590 				skb_zcopy_set(skb, uarg, &extra_uref);
1591 			}
1592 		}
1593 	} else if ((flags & MSG_SPLICE_PAGES) && length) {
1594 		if (inet_sk(sk)->hdrincl)
1595 			return -EPERM;
1596 		if (rt->dst.dev->features & NETIF_F_SG &&
1597 		    getfrag == ip_generic_getfrag)
1598 			/* We need an empty buffer to attach stuff to */
1599 			paged = true;
1600 		else
1601 			flags &= ~MSG_SPLICE_PAGES;
1602 	}
1603 
1604 	/*
1605 	 * Let's try using as much space as possible.
1606 	 * Use MTU if total length of the message fits into the MTU.
1607 	 * Otherwise, we need to reserve fragment header and
1608 	 * fragment alignment (= 8-15 octets, in total).
1609 	 *
1610 	 * Note that we may need to "move" the data from the tail
1611 	 * of the buffer to the new fragment when we split
1612 	 * the message.
1613 	 *
1614 	 * FIXME: It may be fragmented into multiple chunks
1615 	 *        at once if non-fragmentable extension headers
1616 	 *        are too large.
1617 	 * --yoshfuji
1618 	 */
1619 
1620 	cork->length += length;
1621 	if (!skb)
1622 		goto alloc_new_skb;
1623 
1624 	while (length > 0) {
1625 		/* Check if the remaining data fits into current packet. */
1626 		copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1627 		if (copy < length)
1628 			copy = maxfraglen - skb->len;
1629 
1630 		if (copy <= 0) {
1631 			char *data;
1632 			unsigned int datalen;
1633 			unsigned int fraglen;
1634 			unsigned int fraggap;
1635 			unsigned int alloclen, alloc_extra;
1636 			unsigned int pagedlen;
1637 alloc_new_skb:
1638 			/* There's no room in the current skb */
1639 			if (skb)
1640 				fraggap = skb->len - maxfraglen;
1641 			else
1642 				fraggap = 0;
1643 			/* update mtu and maxfraglen if necessary */
1644 			if (!skb || !skb_prev)
1645 				ip6_append_data_mtu(&mtu, &maxfraglen,
1646 						    fragheaderlen, skb, rt,
1647 						    orig_mtu);
1648 
1649 			skb_prev = skb;
1650 
1651 			/*
1652 			 * If remaining data exceeds the mtu,
1653 			 * we know we need more fragment(s).
1654 			 */
1655 			datalen = length + fraggap;
1656 
1657 			if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1658 				datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
1659 			fraglen = datalen + fragheaderlen;
1660 			pagedlen = 0;
1661 
1662 			alloc_extra = hh_len;
1663 			alloc_extra += dst_exthdrlen;
1664 			alloc_extra += rt->dst.trailer_len;
1665 
1666 			/* We just reserve space for fragment header.
1667 			 * Note: this may be overallocation if the message
1668 			 * (without MSG_MORE) fits into the MTU.
1669 			 */
1670 			alloc_extra += sizeof(struct frag_hdr);
1671 
1672 			if ((flags & MSG_MORE) &&
1673 			    !(rt->dst.dev->features&NETIF_F_SG))
1674 				alloclen = mtu;
1675 			else if (!paged &&
1676 				 (fraglen + alloc_extra < SKB_MAX_ALLOC ||
1677 				  !(rt->dst.dev->features & NETIF_F_SG)))
1678 				alloclen = fraglen;
1679 			else {
1680 				alloclen = fragheaderlen + transhdrlen;
1681 				pagedlen = datalen - transhdrlen;
1682 			}
1683 			alloclen += alloc_extra;
1684 
1685 			if (datalen != length + fraggap) {
1686 				/*
1687 				 * this is not the last fragment, the trailer
1688 				 * space is regarded as data space.
1689 				 */
1690 				datalen += rt->dst.trailer_len;
1691 			}
1692 
1693 			fraglen = datalen + fragheaderlen;
1694 
1695 			copy = datalen - transhdrlen - fraggap - pagedlen;
1696 			/* [!] NOTE: copy may be negative if pagedlen>0
1697 			 * because then the equation may reduce to -fraggap.
1698 			 */
1699 			if (copy < 0 && !(flags & MSG_SPLICE_PAGES)) {
1700 				err = -EINVAL;
1701 				goto error;
1702 			}
1703 			if (transhdrlen) {
1704 				skb = sock_alloc_send_skb(sk, alloclen,
1705 						(flags & MSG_DONTWAIT), &err);
1706 			} else {
1707 				skb = NULL;
1708 				if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <=
1709 				    2 * sk->sk_sndbuf)
1710 					skb = alloc_skb(alloclen,
1711 							sk->sk_allocation);
1712 				if (unlikely(!skb))
1713 					err = -ENOBUFS;
1714 			}
1715 			if (!skb)
1716 				goto error;
1717 			/*
1718 			 *	Fill in the control structures
1719 			 */
1720 			skb->protocol = htons(ETH_P_IPV6);
1721 			skb->ip_summed = csummode;
1722 			skb->csum = 0;
1723 			/* reserve for fragmentation and ipsec header */
1724 			skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1725 				    dst_exthdrlen);
1726 
1727 			/*
1728 			 *	Find where to start putting bytes
1729 			 */
1730 			data = skb_put(skb, fraglen - pagedlen);
1731 			skb_set_network_header(skb, exthdrlen);
1732 			data += fragheaderlen;
1733 			skb->transport_header = (skb->network_header +
1734 						 fragheaderlen);
1735 			if (fraggap) {
1736 				skb->csum = skb_copy_and_csum_bits(
1737 					skb_prev, maxfraglen,
1738 					data + transhdrlen, fraggap);
1739 				skb_prev->csum = csum_sub(skb_prev->csum,
1740 							  skb->csum);
1741 				data += fraggap;
1742 				pskb_trim_unique(skb_prev, maxfraglen);
1743 			}
1744 			if (copy > 0 &&
1745 			    getfrag(from, data + transhdrlen, offset,
1746 				    copy, fraggap, skb) < 0) {
1747 				err = -EFAULT;
1748 				kfree_skb(skb);
1749 				goto error;
1750 			} else if (flags & MSG_SPLICE_PAGES) {
1751 				copy = 0;
1752 			}
1753 
1754 			offset += copy;
1755 			length -= copy + transhdrlen;
1756 			transhdrlen = 0;
1757 			exthdrlen = 0;
1758 			dst_exthdrlen = 0;
1759 
1760 			/* Only the initial fragment is time stamped */
1761 			skb_shinfo(skb)->tx_flags = cork->tx_flags;
1762 			cork->tx_flags = 0;
1763 			skb_shinfo(skb)->tskey = tskey;
1764 			tskey = 0;
1765 			skb_zcopy_set(skb, uarg, &extra_uref);
1766 
1767 			if ((flags & MSG_CONFIRM) && !skb_prev)
1768 				skb_set_dst_pending_confirm(skb, 1);
1769 
1770 			/*
1771 			 * Put the packet on the pending queue
1772 			 */
1773 			if (!skb->destructor) {
1774 				skb->destructor = sock_wfree;
1775 				skb->sk = sk;
1776 				wmem_alloc_delta += skb->truesize;
1777 			}
1778 			__skb_queue_tail(queue, skb);
1779 			continue;
1780 		}
1781 
1782 		if (copy > length)
1783 			copy = length;
1784 
1785 		if (!(rt->dst.dev->features&NETIF_F_SG) &&
1786 		    skb_tailroom(skb) >= copy) {
1787 			unsigned int off;
1788 
1789 			off = skb->len;
1790 			if (getfrag(from, skb_put(skb, copy),
1791 						offset, copy, off, skb) < 0) {
1792 				__skb_trim(skb, off);
1793 				err = -EFAULT;
1794 				goto error;
1795 			}
1796 		} else if (flags & MSG_SPLICE_PAGES) {
1797 			struct msghdr *msg = from;
1798 
1799 			err = -EIO;
1800 			if (WARN_ON_ONCE(copy > msg->msg_iter.count))
1801 				goto error;
1802 
1803 			err = skb_splice_from_iter(skb, &msg->msg_iter, copy,
1804 						   sk->sk_allocation);
1805 			if (err < 0)
1806 				goto error;
1807 			copy = err;
1808 			wmem_alloc_delta += copy;
1809 		} else if (!zc) {
1810 			int i = skb_shinfo(skb)->nr_frags;
1811 
1812 			err = -ENOMEM;
1813 			if (!sk_page_frag_refill(sk, pfrag))
1814 				goto error;
1815 
1816 			skb_zcopy_downgrade_managed(skb);
1817 			if (!skb_can_coalesce(skb, i, pfrag->page,
1818 					      pfrag->offset)) {
1819 				err = -EMSGSIZE;
1820 				if (i == MAX_SKB_FRAGS)
1821 					goto error;
1822 
1823 				__skb_fill_page_desc(skb, i, pfrag->page,
1824 						     pfrag->offset, 0);
1825 				skb_shinfo(skb)->nr_frags = ++i;
1826 				get_page(pfrag->page);
1827 			}
1828 			copy = min_t(int, copy, pfrag->size - pfrag->offset);
1829 			if (getfrag(from,
1830 				    page_address(pfrag->page) + pfrag->offset,
1831 				    offset, copy, skb->len, skb) < 0)
1832 				goto error_efault;
1833 
1834 			pfrag->offset += copy;
1835 			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1836 			skb->len += copy;
1837 			skb->data_len += copy;
1838 			skb->truesize += copy;
1839 			wmem_alloc_delta += copy;
1840 		} else {
1841 			err = skb_zerocopy_iter_dgram(skb, from, copy);
1842 			if (err < 0)
1843 				goto error;
1844 		}
1845 		offset += copy;
1846 		length -= copy;
1847 	}
1848 
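	/*
	 * Charge everything queued above to the socket's write allocation
	 * in one refcount update rather than once per copied chunk.
	 */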
1849 	if (wmem_alloc_delta)
1850 		refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1851 	return 0;
1852 
1853 error_efault:
1854 	err = -EFAULT;
1855 error:
1856 	net_zcopy_put_abort(uarg, extra_uref);
1857 	cork->length -= length;
1858 	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1859 	refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1860 	return err;
1861 }
1862 
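/*
 * Append data to the socket's pending output queue.  On the first call with
 * an empty queue a reference is taken on the route and the cork (options,
 * MTU, flow information) is set up; nothing is transmitted until
 * ip6_push_pending_frames() or ip6_finish_skb() builds the packet.
 */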
1863 int ip6_append_data(struct sock *sk,
1864 		    int getfrag(void *from, char *to, int offset, int len,
1865 				int odd, struct sk_buff *skb),
1866 		    void *from, size_t length, int transhdrlen,
1867 		    struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1868 		    struct rt6_info *rt, unsigned int flags)
1869 {
1870 	struct inet_sock *inet = inet_sk(sk);
1871 	struct ipv6_pinfo *np = inet6_sk(sk);
1872 	int exthdrlen;
1873 	int err;
1874 
1875 	if (flags & MSG_PROBE)
1876 		return 0;
1877 	if (skb_queue_empty(&sk->sk_write_queue)) {
1878 		/*
1879 		 * setup for corking
1880 		 */
1881 		dst_hold(&rt->dst);
1882 		err = ip6_setup_cork(sk, &inet->cork, &np->cork,
1883 				     ipc6, rt);
1884 		if (err)
1885 			return err;
1886 
1887 		inet->cork.fl.u.ip6 = *fl6;
1888 		exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1889 		length += exthdrlen;
1890 		transhdrlen += exthdrlen;
1891 	} else {
1892 		transhdrlen = 0;
1893 	}
1894 
1895 	return __ip6_append_data(sk, &sk->sk_write_queue, &inet->cork,
1896 				 &np->cork, sk_page_frag(sk), getfrag,
1897 				 from, length, transhdrlen, flags, ipc6);
1898 }
1899 EXPORT_SYMBOL_GPL(ip6_append_data);
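/*
 * Typical usage of the corked path (illustrative sketch only; the getfrag
 * callback shown, ip_generic_getfrag(), and the UDP header length belong to
 * datagram code outside this file, and real callers add their own cork
 * handling around this pattern):
 *
 *	lock_sock(sk);
 *	err = ip6_append_data(sk, ip_generic_getfrag, msg, len,
 *			      sizeof(struct udphdr), &ipc6, &fl6,
 *			      (struct rt6_info *)dst, msg->msg_flags);
 *	if (err)
 *		ip6_flush_pending_frames(sk);
 *	else
 *		err = ip6_push_pending_frames(sk);
 *	release_sock(sk);
 */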
1900 
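/*
 * Move the route reference held by the cork onto the finished skb so that
 * no extra dst_hold()/dst_release() pair is needed.
 */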
1901 static void ip6_cork_steal_dst(struct sk_buff *skb, struct inet_cork_full *cork)
1902 {
1903 	struct dst_entry *dst = cork->base.dst;
1904 
1905 	cork->base.dst = NULL;
1906 	cork->base.flags &= ~IPCORK_ALLFRAG;
1907 	skb_dst_set(skb, dst);
1908 }
1909 
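/*
 * Free the IPv6 tx options duplicated for this cork and drop the route
 * reference, if either is still held.
 */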
1910 static void ip6_cork_release(struct inet_cork_full *cork,
1911 			     struct inet6_cork *v6_cork)
1912 {
1913 	if (v6_cork->opt) {
1914 		struct ipv6_txoptions *opt = v6_cork->opt;
1915 
1916 		kfree(opt->dst0opt);
1917 		kfree(opt->dst1opt);
1918 		kfree(opt->hopopt);
1919 		kfree(opt->srcrt);
1920 		kfree(opt);
1921 		v6_cork->opt = NULL;
1922 	}
1923 
1924 	if (cork->base.dst) {
1925 		dst_release(cork->base.dst);
1926 		cork->base.dst = NULL;
1927 		cork->base.flags &= ~IPCORK_ALLFRAG;
1928 	}
1929 }
1930 
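/*
 * Collapse the queue of pending fragments into a single skb chained via
 * frag_list, push the extension headers and the IPv6 header described by
 * the cork, transfer the cork's route to the skb and update the output
 * statistics.  The cork is released before returning.
 */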
1931 struct sk_buff *__ip6_make_skb(struct sock *sk,
1932 			       struct sk_buff_head *queue,
1933 			       struct inet_cork_full *cork,
1934 			       struct inet6_cork *v6_cork)
1935 {
1936 	struct sk_buff *skb, *tmp_skb;
1937 	struct sk_buff **tail_skb;
1938 	struct in6_addr *final_dst;
1939 	struct ipv6_pinfo *np = inet6_sk(sk);
1940 	struct net *net = sock_net(sk);
1941 	struct ipv6hdr *hdr;
1942 	struct ipv6_txoptions *opt = v6_cork->opt;
1943 	struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
1944 	struct flowi6 *fl6 = &cork->fl.u.ip6;
1945 	unsigned char proto = fl6->flowi6_proto;
1946 
1947 	skb = __skb_dequeue(queue);
1948 	if (!skb)
1949 		goto out;
1950 	tail_skb = &(skb_shinfo(skb)->frag_list);
1951 
1952 	/* Move skb->data from the reserved extension-header space up to the IPv6 header. */
1953 	if (skb->data < skb_network_header(skb))
1954 		__skb_pull(skb, skb_network_offset(skb));
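	/*
	 * Chain every remaining queued skb onto the head skb's frag_list,
	 * folding their length and truesize into the head and dropping
	 * their individual socket ownership.
	 */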
1955 	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
1956 		__skb_pull(tmp_skb, skb_network_header_len(skb));
1957 		*tail_skb = tmp_skb;
1958 		tail_skb = &(tmp_skb->next);
1959 		skb->len += tmp_skb->len;
1960 		skb->data_len += tmp_skb->len;
1961 		skb->truesize += tmp_skb->truesize;
1962 		tmp_skb->destructor = NULL;
1963 		tmp_skb->sk = NULL;
1964 	}
1965 
1966 	/* Allow local fragmentation. */
1967 	skb->ignore_df = ip6_sk_ignore_df(sk);
1968 	__skb_pull(skb, skb_network_header_len(skb));
1969 
1970 	final_dst = &fl6->daddr;
1971 	if (opt && opt->opt_flen)
1972 		ipv6_push_frag_opts(skb, opt, &proto);
1973 	if (opt && opt->opt_nflen)
1974 		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);
1975 
1976 	skb_push(skb, sizeof(struct ipv6hdr));
1977 	skb_reset_network_header(skb);
1978 	hdr = ipv6_hdr(skb);
1979 
1980 	ip6_flow_hdr(hdr, v6_cork->tclass,
1981 		     ip6_make_flowlabel(net, skb, fl6->flowlabel,
1982 					ip6_autoflowlabel(net, np), fl6));
1983 	hdr->hop_limit = v6_cork->hop_limit;
1984 	hdr->nexthdr = proto;
1985 	hdr->saddr = fl6->saddr;
1986 	hdr->daddr = *final_dst;
1987 
1988 	skb->priority = sk->sk_priority;
1989 	skb->mark = cork->base.mark;
1990 	skb->tstamp = cork->base.transmit_time;
1991 
1992 	ip6_cork_steal_dst(skb, cork);
1993 	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1994 	if (proto == IPPROTO_ICMPV6) {
1995 		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1996 		u8 icmp6_type;
1997 
1998 		if (sk->sk_socket->type == SOCK_RAW && !inet_sk(sk)->hdrincl)
1999 			icmp6_type = fl6->fl6_icmp_type;
2000 		else
2001 			icmp6_type = icmp6_hdr(skb)->icmp6_type;
2002 		ICMP6MSGOUT_INC_STATS(net, idev, icmp6_type);
2003 		ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
2004 	}
2005 
2006 	ip6_cork_release(cork, v6_cork);
2007 out:
2008 	return skb;
2009 }
2010 
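/*
 * Hand a finished packet to ip6_local_out().  Positive return values from
 * the queueing layer are congestion notifications and are remapped by
 * net_xmit_errno(); anything still non-zero is counted as an output discard.
 */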
2011 int ip6_send_skb(struct sk_buff *skb)
2012 {
2013 	struct net *net = sock_net(skb->sk);
2014 	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
2015 	int err;
2016 
2017 	err = ip6_local_out(net, skb->sk, skb);
2018 	if (err) {
2019 		if (err > 0)
2020 			err = net_xmit_errno(err);
2021 		if (err)
2022 			IP6_INC_STATS(net, rt->rt6i_idev,
2023 				      IPSTATS_MIB_OUTDISCARDS);
2024 	}
2025 
2026 	return err;
2027 }
2028 
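/*
 * Build one packet from everything pending on sk_write_queue and send it.
 * An empty queue is not an error: ip6_finish_skb() returns NULL and the
 * function simply reports success.
 */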
2029 int ip6_push_pending_frames(struct sock *sk)
2030 {
2031 	struct sk_buff *skb;
2032 
2033 	skb = ip6_finish_skb(sk);
2034 	if (!skb)
2035 		return 0;
2036 
2037 	return ip6_send_skb(skb);
2038 }
2039 EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
2040 
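/*
 * Drop every skb still pending on the queue, counting each one that already
 * carries a route as an output discard, then release the cork state.
 */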
2041 static void __ip6_flush_pending_frames(struct sock *sk,
2042 				       struct sk_buff_head *queue,
2043 				       struct inet_cork_full *cork,
2044 				       struct inet6_cork *v6_cork)
2045 {
2046 	struct sk_buff *skb;
2047 
2048 	while ((skb = __skb_dequeue_tail(queue)) != NULL) {
2049 		if (skb_dst(skb))
2050 			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
2051 				      IPSTATS_MIB_OUTDISCARDS);
2052 		kfree_skb(skb);
2053 	}
2054 
2055 	ip6_cork_release(cork, v6_cork);
2056 }
2057 
2058 void ip6_flush_pending_frames(struct sock *sk)
2059 {
2060 	__ip6_flush_pending_frames(sk, &sk->sk_write_queue,
2061 				   &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
2062 }
2063 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
2064 
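/*
 * Single-shot variant of the append/push pair: the data is appended onto a
 * private queue and the finished packet is handed back to the caller instead
 * of being left on sk_write_queue.  (Non-corked datagram fast paths are the
 * expected users; that is an observation about callers elsewhere in the
 * tree, not something enforced here.)
 */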
2065 struct sk_buff *ip6_make_skb(struct sock *sk,
2066 			     int getfrag(void *from, char *to, int offset,
2067 					 int len, int odd, struct sk_buff *skb),
2068 			     void *from, size_t length, int transhdrlen,
2069 			     struct ipcm6_cookie *ipc6, struct rt6_info *rt,
2070 			     unsigned int flags, struct inet_cork_full *cork)
2071 {
2072 	struct inet6_cork v6_cork;
2073 	struct sk_buff_head queue;
2074 	int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
2075 	int err;
2076 
2077 	if (flags & MSG_PROBE) {
2078 		dst_release(&rt->dst);
2079 		return NULL;
2080 	}
2081 
2082 	__skb_queue_head_init(&queue);
2083 
2084 	cork->base.flags = 0;
2085 	cork->base.addr = 0;
2086 	cork->base.opt = NULL;
2087 	v6_cork.opt = NULL;
2088 	err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt);
2089 	if (err) {
2090 		ip6_cork_release(cork, &v6_cork);
2091 		return ERR_PTR(err);
2092 	}
2093 	if (ipc6->dontfrag < 0)
2094 		ipc6->dontfrag = inet6_sk(sk)->dontfrag;
2095 
2096 	err = __ip6_append_data(sk, &queue, cork, &v6_cork,
2097 				&current->task_frag, getfrag, from,
2098 				length + exthdrlen, transhdrlen + exthdrlen,
2099 				flags, ipc6);
2100 	if (err) {
2101 		__ip6_flush_pending_frames(sk, &queue, cork, &v6_cork);
2102 		return ERR_PTR(err);
2103 	}
2104 
2105 	return __ip6_make_skb(sk, &queue, cork, &v6_cork);
2106 }
2107