xref: /openbmc/linux/net/ipv6/ip6_output.c (revision 18da174d)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  *	IPv6 output functions
4  *	Linux INET6 implementation
5  *
6  *	Authors:
7  *	Pedro Roque		<roque@di.fc.ul.pt>
8  *
9  *	Based on linux/net/ipv4/ip_output.c
10  *
11  *	Changes:
12  *	A.N.Kuznetsov	:	arithmetics in fragmentation.
13  *				extension headers are implemented.
14  *				route changes now work.
15  *				ip6_forward does not confuse sniffers.
16  *				etc.
17  *
18  *      H. von Brand    :       Added missing #include <linux/string.h>
19  *	Imran Patel	:	frag id should be in NBO
20  *      Kazunori MIYAZAWA @USAGI
21  *			:       add ip6_append_data and related functions
22  *				for datagram xmit
23  */
24 
25 #include <linux/errno.h>
26 #include <linux/kernel.h>
27 #include <linux/string.h>
28 #include <linux/socket.h>
29 #include <linux/net.h>
30 #include <linux/netdevice.h>
31 #include <linux/if_arp.h>
32 #include <linux/in6.h>
33 #include <linux/tcp.h>
34 #include <linux/route.h>
35 #include <linux/module.h>
36 #include <linux/slab.h>
37 
38 #include <linux/bpf-cgroup.h>
39 #include <linux/netfilter.h>
40 #include <linux/netfilter_ipv6.h>
41 
42 #include <net/sock.h>
43 #include <net/snmp.h>
44 
45 #include <net/gso.h>
46 #include <net/ipv6.h>
47 #include <net/ndisc.h>
48 #include <net/protocol.h>
49 #include <net/ip6_route.h>
50 #include <net/addrconf.h>
51 #include <net/rawv6.h>
52 #include <net/icmp.h>
53 #include <net/xfrm.h>
54 #include <net/checksum.h>
55 #include <linux/mroute6.h>
56 #include <net/l3mdev.h>
57 #include <net/lwtunnel.h>
58 #include <net/ip_tunnels.h>
59 
60 static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
61 {
62 	struct dst_entry *dst = skb_dst(skb);
63 	struct net_device *dev = dst->dev;
64 	struct inet6_dev *idev = ip6_dst_idev(dst);
65 	unsigned int hh_len = LL_RESERVED_SPACE(dev);
66 	const struct in6_addr *daddr, *nexthop;
67 	struct ipv6hdr *hdr;
68 	struct neighbour *neigh;
69 	int ret;
70 
71 	/* Be paranoid, rather than too clever. */
72 	if (unlikely(hh_len > skb_headroom(skb)) && dev->header_ops) {
73 		skb = skb_expand_head(skb, hh_len);
74 		if (!skb) {
75 			IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
76 			return -ENOMEM;
77 		}
78 	}
79 
80 	hdr = ipv6_hdr(skb);
81 	daddr = &hdr->daddr;
82 	if (ipv6_addr_is_multicast(daddr)) {
83 		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
84 		    ((mroute6_is_socket(net, skb) &&
85 		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
86 		     ipv6_chk_mcast_addr(dev, daddr, &hdr->saddr))) {
87 			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
88 
89 			/* Do not check for IFF_ALLMULTI; multicast routing
90 			   is not supported in any case.
91 			 */
92 			if (newskb)
93 				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
94 					net, sk, newskb, NULL, newskb->dev,
95 					dev_loopback_xmit);
96 
97 			if (hdr->hop_limit == 0) {
98 				IP6_INC_STATS(net, idev,
99 					      IPSTATS_MIB_OUTDISCARDS);
100 				kfree_skb(skb);
101 				return 0;
102 			}
103 		}
104 
105 		IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);
106 		if (IPV6_ADDR_MC_SCOPE(daddr) <= IPV6_ADDR_SCOPE_NODELOCAL &&
107 		    !(dev->flags & IFF_LOOPBACK)) {
108 			kfree_skb(skb);
109 			return 0;
110 		}
111 	}
112 
113 	if (lwtunnel_xmit_redirect(dst->lwtstate)) {
114 		int res = lwtunnel_xmit(skb);
115 
116 		if (res < 0 || res == LWTUNNEL_XMIT_DONE)
117 			return res;
118 	}
119 
120 	rcu_read_lock();
121 	nexthop = rt6_nexthop((struct rt6_info *)dst, daddr);
122 	neigh = __ipv6_neigh_lookup_noref(dev, nexthop);
123 
124 	if (unlikely(IS_ERR_OR_NULL(neigh))) {
125 		if (unlikely(!neigh))
126 			neigh = __neigh_create(&nd_tbl, nexthop, dev, false);
127 		if (IS_ERR(neigh)) {
128 			rcu_read_unlock();
129 			IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTNOROUTES);
130 			kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_CREATEFAIL);
131 			return -EINVAL;
132 		}
133 	}
134 	sock_confirm_neigh(skb, neigh);
135 	ret = neigh_output(neigh, skb, false);
136 	rcu_read_unlock();
137 	return ret;
138 }
139 
140 static int
141 ip6_finish_output_gso_slowpath_drop(struct net *net, struct sock *sk,
142 				    struct sk_buff *skb, unsigned int mtu)
143 {
144 	struct sk_buff *segs, *nskb;
145 	netdev_features_t features;
146 	int ret = 0;
147 
148 	/* Please see corresponding comment in ip_finish_output_gso
149 	 * describing the cases where GSO segment length exceeds the
150 	 * egress MTU.
151 	 */
152 	features = netif_skb_features(skb);
153 	segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
154 	if (IS_ERR_OR_NULL(segs)) {
155 		kfree_skb(skb);
156 		return -ENOMEM;
157 	}
158 
159 	consume_skb(skb);
160 
161 	skb_list_walk_safe(segs, segs, nskb) {
162 		int err;
163 
164 		skb_mark_not_on_list(segs);
165 		err = ip6_fragment(net, sk, segs, ip6_finish_output2);
166 		if (err && ret == 0)
167 			ret = err;
168 	}
169 
170 	return ret;
171 }
172 
173 static int __ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
174 {
175 	unsigned int mtu;
176 
177 #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
178 	/* Policy lookup after SNAT yielded a new policy */
179 	if (skb_dst(skb)->xfrm) {
180 		IP6CB(skb)->flags |= IP6SKB_REROUTED;
181 		return dst_output(net, sk, skb);
182 	}
183 #endif
184 
185 	mtu = ip6_skb_dst_mtu(skb);
186 	if (skb_is_gso(skb) &&
187 	    !(IP6CB(skb)->flags & IP6SKB_FAKEJUMBO) &&
188 	    !skb_gso_validate_network_len(skb, mtu))
189 		return ip6_finish_output_gso_slowpath_drop(net, sk, skb, mtu);
190 
191 	if ((skb->len > mtu && !skb_is_gso(skb)) ||
192 	    dst_allfrag(skb_dst(skb)) ||
193 	    (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
194 		return ip6_fragment(net, sk, skb, ip6_finish_output2);
195 	else
196 		return ip6_finish_output2(net, sk, skb);
197 }
198 
199 static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
200 {
201 	int ret;
202 
203 	ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
204 	switch (ret) {
205 	case NET_XMIT_SUCCESS:
206 	case NET_XMIT_CN:
207 		return __ip6_finish_output(net, sk, skb) ? : ret;
208 	default:
209 		kfree_skb_reason(skb, SKB_DROP_REASON_BPF_CGROUP_EGRESS);
210 		return ret;
211 	}
212 }
213 
214 int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
215 {
216 	struct net_device *dev = skb_dst(skb)->dev, *indev = skb->dev;
217 	struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
218 
219 	skb->protocol = htons(ETH_P_IPV6);
220 	skb->dev = dev;
221 
222 	if (unlikely(idev->cnf.disable_ipv6)) {
223 		IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
224 		kfree_skb_reason(skb, SKB_DROP_REASON_IPV6DISABLED);
225 		return 0;
226 	}
227 
228 	return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
229 			    net, sk, skb, indev, dev,
230 			    ip6_finish_output,
231 			    !(IP6CB(skb)->flags & IP6SKB_REROUTED));
232 }
233 EXPORT_SYMBOL(ip6_output);
234 
235 bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np)
236 {
237 	if (!np->autoflowlabel_set)
238 		return ip6_default_np_autolabel(net);
239 	else
240 		return np->autoflowlabel;
241 }
242 
243 /*
244  * xmit an sk_buff (used by TCP, SCTP and DCCP)
245  * Note : socket lock is not held for SYNACK packets, but might be modified
246  * by calls to skb_set_owner_w() and ipv6_local_error(),
247  * which are using proper atomic operations or spinlocks.
248  */
249 int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
250 	     __u32 mark, struct ipv6_txoptions *opt, int tclass, u32 priority)
251 {
252 	struct net *net = sock_net(sk);
253 	const struct ipv6_pinfo *np = inet6_sk(sk);
254 	struct in6_addr *first_hop = &fl6->daddr;
255 	struct dst_entry *dst = skb_dst(skb);
256 	struct net_device *dev = dst->dev;
257 	struct inet6_dev *idev = ip6_dst_idev(dst);
258 	struct hop_jumbo_hdr *hop_jumbo;
259 	int hoplen = sizeof(*hop_jumbo);
260 	unsigned int head_room;
261 	struct ipv6hdr *hdr;
262 	u8  proto = fl6->flowi6_proto;
263 	int seg_len = skb->len;
264 	int hlimit = -1;
265 	u32 mtu;
266 
267 	head_room = sizeof(struct ipv6hdr) + hoplen + LL_RESERVED_SPACE(dev);
268 	if (opt)
269 		head_room += opt->opt_nflen + opt->opt_flen;
270 
271 	if (unlikely(head_room > skb_headroom(skb))) {
272 		skb = skb_expand_head(skb, head_room);
273 		if (!skb) {
274 			IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
275 			return -ENOBUFS;
276 		}
277 	}
278 
279 	if (opt) {
280 		seg_len += opt->opt_nflen + opt->opt_flen;
281 
282 		if (opt->opt_flen)
283 			ipv6_push_frag_opts(skb, opt, &proto);
284 
285 		if (opt->opt_nflen)
286 			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
287 					     &fl6->saddr);
288 	}
289 
290 	if (unlikely(seg_len > IPV6_MAXPLEN)) {
291 		hop_jumbo = skb_push(skb, hoplen);
292 
293 		hop_jumbo->nexthdr = proto;
294 		hop_jumbo->hdrlen = 0;
295 		hop_jumbo->tlv_type = IPV6_TLV_JUMBO;
296 		hop_jumbo->tlv_len = 4;
297 		hop_jumbo->jumbo_payload_len = htonl(seg_len + hoplen);
298 
299 		proto = IPPROTO_HOPOPTS;
300 		seg_len = 0;
301 		IP6CB(skb)->flags |= IP6SKB_FAKEJUMBO;
302 	}
303 
304 	skb_push(skb, sizeof(struct ipv6hdr));
305 	skb_reset_network_header(skb);
306 	hdr = ipv6_hdr(skb);
307 
308 	/*
309 	 *	Fill in the IPv6 header
310 	 */
311 	if (np)
312 		hlimit = np->hop_limit;
313 	if (hlimit < 0)
314 		hlimit = ip6_dst_hoplimit(dst);
315 
316 	ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
317 				ip6_autoflowlabel(net, np), fl6));
318 
319 	hdr->payload_len = htons(seg_len);
320 	hdr->nexthdr = proto;
321 	hdr->hop_limit = hlimit;
322 
323 	hdr->saddr = fl6->saddr;
324 	hdr->daddr = *first_hop;
325 
326 	skb->protocol = htons(ETH_P_IPV6);
327 	skb->priority = priority;
328 	skb->mark = mark;
329 
330 	mtu = dst_mtu(dst);
331 	if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
332 		IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUT, skb->len);
333 
334 		/* if egress device is enslaved to an L3 master device pass the
335 		 * skb to its handler for processing
336 		 */
337 		skb = l3mdev_ip6_out((struct sock *)sk, skb);
338 		if (unlikely(!skb))
339 			return 0;
340 
341 		/* hooks should never assume socket lock is held.
342 		 * we promote our socket to non const
343 		 */
344 		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
345 			       net, (struct sock *)sk, skb, NULL, dev,
346 			       dst_output);
347 	}
348 
349 	skb->dev = dev;
350 	/* ipv6_local_error() does not require socket lock,
351 	 * we promote our socket to non const
352 	 */
353 	ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);
354 
355 	IP6_INC_STATS(net, idev, IPSTATS_MIB_FRAGFAILS);
356 	kfree_skb(skb);
357 	return -EMSGSIZE;
358 }
359 EXPORT_SYMBOL(ip6_xmit);
360 
361 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
362 {
363 	struct ip6_ra_chain *ra;
364 	struct sock *last = NULL;
365 
366 	read_lock(&ip6_ra_lock);
367 	for (ra = ip6_ra_chain; ra; ra = ra->next) {
368 		struct sock *sk = ra->sk;
369 		if (sk && ra->sel == sel &&
370 		    (!sk->sk_bound_dev_if ||
371 		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
372 			struct ipv6_pinfo *np = inet6_sk(sk);
373 
374 			if (np && np->rtalert_isolate &&
375 			    !net_eq(sock_net(sk), dev_net(skb->dev))) {
376 				continue;
377 			}
378 			if (last) {
379 				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
380 				if (skb2)
381 					rawv6_rcv(last, skb2);
382 			}
383 			last = sk;
384 		}
385 	}
386 
387 	if (last) {
388 		rawv6_rcv(last, skb);
389 		read_unlock(&ip6_ra_lock);
390 		return 1;
391 	}
392 	read_unlock(&ip6_ra_lock);
393 	return 0;
394 }
395 
396 static int ip6_forward_proxy_check(struct sk_buff *skb)
397 {
398 	struct ipv6hdr *hdr = ipv6_hdr(skb);
399 	u8 nexthdr = hdr->nexthdr;
400 	__be16 frag_off;
401 	int offset;
402 
403 	if (ipv6_ext_hdr(nexthdr)) {
404 		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
405 		if (offset < 0)
406 			return 0;
407 	} else
408 		offset = sizeof(struct ipv6hdr);
409 
410 	if (nexthdr == IPPROTO_ICMPV6) {
411 		struct icmp6hdr *icmp6;
412 
413 		if (!pskb_may_pull(skb, (skb_network_header(skb) +
414 					 offset + 1 - skb->data)))
415 			return 0;
416 
417 		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
418 
419 		switch (icmp6->icmp6_type) {
420 		case NDISC_ROUTER_SOLICITATION:
421 		case NDISC_ROUTER_ADVERTISEMENT:
422 		case NDISC_NEIGHBOUR_SOLICITATION:
423 		case NDISC_NEIGHBOUR_ADVERTISEMENT:
424 		case NDISC_REDIRECT:
425 			/* For reaction involving unicast neighbor discovery
426 			 * message destined to the proxied address, pass it to
427 			 * input function.
428 			 */
429 			return 1;
430 		default:
431 			break;
432 		}
433 	}
434 
435 	/*
436 	 * The proxying router can't forward traffic sent to a link-local
437 	 * address, so signal the sender and discard the packet. This
438 	 * behavior is clarified by the MIPv6 specification.
439 	 */
440 	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
441 		dst_link_failure(skb);
442 		return -1;
443 	}
444 
445 	return 0;
446 }
447 
448 static inline int ip6_forward_finish(struct net *net, struct sock *sk,
449 				     struct sk_buff *skb)
450 {
451 	struct dst_entry *dst = skb_dst(skb);
452 
453 	__IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
454 	__IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
455 
456 #ifdef CONFIG_NET_SWITCHDEV
457 	if (skb->offload_l3_fwd_mark) {
458 		consume_skb(skb);
459 		return 0;
460 	}
461 #endif
462 
463 	skb_clear_tstamp(skb);
464 	return dst_output(net, sk, skb);
465 }
466 
467 static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
468 {
469 	if (skb->len <= mtu)
470 		return false;
471 
472 	/* ipv6 conntrack defrag sets max_frag_size + ignore_df */
473 	if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
474 		return true;
475 
476 	if (skb->ignore_df)
477 		return false;
478 
479 	if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
480 		return false;
481 
482 	return true;
483 }
484 
/*
 * ip6_forward - forward a received IPv6 packet towards its next hop.
 * @skb: packet to forward; its dst entry supplies the outgoing device.
 *
 * Runs the eligibility checks in order (forwarding enabled, PACKET_HOST,
 * no owning socket, no LRO, xfrm forward policy, router alert, hop limit,
 * NDP proxy, xfrm route, redirect / source-address sanity, MTU), then
 * decrements the hop limit and passes the packet to the NF_INET_FORWARD
 * hook with ip6_forward_finish() as continuation.
 *
 * Returns 0 when a router-alert listener took the packet, the result of
 * ip6_input() for proxied neighbour-discovery traffic, -ETIMEDOUT when
 * the hop limit is exhausted, -EMSGSIZE when the packet is too big for
 * the outgoing path, -EINVAL for every other drop, and otherwise the
 * NF_HOOK() result. Every drop path visible here frees the skb.
 */
485 int ip6_forward(struct sk_buff *skb)
486 {
487 	struct dst_entry *dst = skb_dst(skb);
488 	struct ipv6hdr *hdr = ipv6_hdr(skb);
489 	struct inet6_skb_parm *opt = IP6CB(skb);
490 	struct net *net = dev_net(dst->dev);
491 	struct inet6_dev *idev;
492 	SKB_DR(reason);
493 	u32 mtu;
494 
	/* idev is the inet6 device of the interface the packet arrived on;
	 * it is only used for statistics and the disable_policy check, and
	 * is tested for NULL before being dereferenced below.
	 */
495 	idev = __in6_dev_get_safely(dev_get_by_index_rcu(net, IP6CB(skb)->iif));
496 	if (net->ipv6.devconf_all->forwarding == 0)
497 		goto error;
498 
499 	if (skb->pkt_type != PACKET_HOST)
500 		goto drop;
501 
502 	if (unlikely(skb->sk))
503 		goto drop;
504 
505 	if (skb_warn_if_lro(skb))
506 		goto drop;
507 
508 	if (!net->ipv6.devconf_all->disable_policy &&
509 	    (!idev || !idev->cnf.disable_policy) &&
510 	    !xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
511 		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
512 		goto drop;
513 	}
514 
515 	skb_forward_csum(skb);
516 
517 	/*
518 	 *	We DO NOT make any processing on
519 	 *	RA packets, pushing them to user level AS IS
520 	 *	without any WARRANTY that application will be able
521 	 *	to interpret them. The reason is that we
522 	 *	cannot make anything clever here.
523 	 *
524 	 *	We are not end-node, so that if packet contains
525 	 *	AH/ESP, we cannot make anything.
526 	 *	Defragmentation also would be mistake, RA packets
527 	 *	cannot be fragmented, because there is no warranty
528 	 *	that different fragments will go along one path. --ANK
529 	 */
530 	if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
531 		if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
532 			return 0;
533 	}
534 
535 	/*
536 	 *	check and decrement ttl
537 	 */
538 	if (hdr->hop_limit <= 1) {
539 		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
540 		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
541 
542 		kfree_skb_reason(skb, SKB_DROP_REASON_IP_INHDR);
543 		return -ETIMEDOUT;
544 	}
545 
546 	/* XXX: idev->cnf.proxy_ndp? */
547 	if (net->ipv6.devconf_all->proxy_ndp &&
548 	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
549 		int proxied = ip6_forward_proxy_check(skb);
550 		if (proxied > 0) {
551 			/* It's tempting to decrease the hop limit
552 			 * here by 1, as we do at the end of the
553 			 * function too.
554 			 *
555 			 * But that would be incorrect, as proxying is
556 			 * not forwarding.  The ip6_input function
557 			 * will handle this packet locally, and it
558 			 * depends on the hop limit being unchanged.
559 			 *
560 			 * One example is the NDP hop limit, that
561 			 * always has to stay 255, but other would be
562 			 * similar checks around RA packets, where the
563 			 * user can even change the desired limit.
564 			 */
565 			return ip6_input(skb);
566 		} else if (proxied < 0) {
567 			__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
568 			goto drop;
569 		}
570 	}
571 
572 	if (!xfrm6_route_forward(skb)) {
573 		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
574 		SKB_DR_SET(reason, XFRM_POLICY);
575 		goto drop;
576 	}
	/* Re-read the dst after xfrm6_route_forward(); presumably it may
	 * have replaced the one attached to the skb.
	 */
577 	dst = skb_dst(skb);
578 
579 	/* IPv6 specs say nothing about it, but it is clear that we cannot
580 	   send redirects to source routed frames.
581 	   We don't send redirects to frames decapsulated from IPsec.
582 	 */
583 	if (IP6CB(skb)->iif == dst->dev->ifindex &&
584 	    opt->srcrt == 0 && !skb_sec_path(skb)) {
585 		struct in6_addr *target = NULL;
586 		struct inet_peer *peer;
587 		struct rt6_info *rt;
588 
589 		/*
590 		 *	incoming and outgoing devices are the same
591 		 *	send a redirect.
592 		 */
593 
594 		rt = (struct rt6_info *) dst;
		/* Redirect target: the gateway for gatewayed routes,
		 * otherwise the destination itself.
		 */
595 		if (rt->rt6i_flags & RTF_GATEWAY)
596 			target = &rt->rt6i_gateway;
597 		else
598 			target = &hdr->daddr;
599 
600 		peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);
601 
602 		/* Limit redirects both by destination (here)
603 		   and by source (inside ndisc_send_redirect)
604 		 */
605 		if (inet_peer_xrlim_allow(peer, 1*HZ))
606 			ndisc_send_redirect(skb, target);
607 		if (peer)
608 			inet_putpeer(peer);
609 	} else {
610 		int addrtype = ipv6_addr_type(&hdr->saddr);
611 
612 		/* This check is security critical. */
613 		if (addrtype == IPV6_ADDR_ANY ||
614 		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
615 			goto error;
616 		if (addrtype & IPV6_ADDR_LINKLOCAL) {
617 			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
618 				    ICMPV6_NOT_NEIGHBOUR, 0);
619 			goto error;
620 		}
621 	}
622 
	/* Never work with an MTU below the IPv6 minimum. */
623 	mtu = ip6_dst_mtu_maybe_forward(dst, true);
624 	if (mtu < IPV6_MIN_MTU)
625 		mtu = IPV6_MIN_MTU;
626 
627 	if (ip6_pkt_too_big(skb, mtu)) {
628 		/* Again, force OUTPUT device used as source address */
629 		skb->dev = dst->dev;
630 		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
631 		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INTOOBIGERRORS);
632 		__IP6_INC_STATS(net, ip6_dst_idev(dst),
633 				IPSTATS_MIB_FRAGFAILS);
634 		kfree_skb_reason(skb, SKB_DROP_REASON_PKT_TOO_BIG);
635 		return -EMSGSIZE;
636 	}
637 
638 	if (skb_cow(skb, dst->dev->hard_header_len)) {
639 		__IP6_INC_STATS(net, ip6_dst_idev(dst),
640 				IPSTATS_MIB_OUTDISCARDS);
641 		goto drop;
642 	}
643 
	/* Reload the header pointer after skb_cow() before writing to it. */
644 	hdr = ipv6_hdr(skb);
645 
646 	/* Mangling hops number delayed to point after skb COW */
647 
648 	hdr->hop_limit--;
649 
650 	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
651 		       net, NULL, skb, skb->dev, dst->dev,
652 		       ip6_forward_finish);
653 
	/* "error" counts an address error and then falls through to the
	 * common "drop" exit, which frees the skb with the recorded reason.
	 */
654 error:
655 	__IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
656 	SKB_DR_SET(reason, IP_INADDRERRORS);
657 drop:
658 	kfree_skb_reason(skb, reason);
659 	return -EINVAL;
660 }
661 
662 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
663 {
664 	to->pkt_type = from->pkt_type;
665 	to->priority = from->priority;
666 	to->protocol = from->protocol;
667 	skb_dst_drop(to);
668 	skb_dst_set(to, dst_clone(skb_dst(from)));
669 	to->dev = from->dev;
670 	to->mark = from->mark;
671 
672 	skb_copy_hash(to, from);
673 
674 #ifdef CONFIG_NET_SCHED
675 	to->tc_index = from->tc_index;
676 #endif
677 	nf_copy(to, from);
678 	skb_ext_copy(to, from);
679 	skb_copy_secmark(to, from);
680 }
681 
682 int ip6_fraglist_init(struct sk_buff *skb, unsigned int hlen, u8 *prevhdr,
683 		      u8 nexthdr, __be32 frag_id,
684 		      struct ip6_fraglist_iter *iter)
685 {
686 	unsigned int first_len;
687 	struct frag_hdr *fh;
688 
689 	/* BUILD HEADER */
690 	*prevhdr = NEXTHDR_FRAGMENT;
691 	iter->tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
692 	if (!iter->tmp_hdr)
693 		return -ENOMEM;
694 
695 	iter->frag = skb_shinfo(skb)->frag_list;
696 	skb_frag_list_init(skb);
697 
698 	iter->offset = 0;
699 	iter->hlen = hlen;
700 	iter->frag_id = frag_id;
701 	iter->nexthdr = nexthdr;
702 
703 	__skb_pull(skb, hlen);
704 	fh = __skb_push(skb, sizeof(struct frag_hdr));
705 	__skb_push(skb, hlen);
706 	skb_reset_network_header(skb);
707 	memcpy(skb_network_header(skb), iter->tmp_hdr, hlen);
708 
709 	fh->nexthdr = nexthdr;
710 	fh->reserved = 0;
711 	fh->frag_off = htons(IP6_MF);
712 	fh->identification = frag_id;
713 
714 	first_len = skb_pagelen(skb);
715 	skb->data_len = first_len - skb_headlen(skb);
716 	skb->len = first_len;
717 	ipv6_hdr(skb)->payload_len = htons(first_len - sizeof(struct ipv6hdr));
718 
719 	return 0;
720 }
721 EXPORT_SYMBOL(ip6_fraglist_init);
722 
723 void ip6_fraglist_prepare(struct sk_buff *skb,
724 			  struct ip6_fraglist_iter *iter)
725 {
726 	struct sk_buff *frag = iter->frag;
727 	unsigned int hlen = iter->hlen;
728 	struct frag_hdr *fh;
729 
730 	frag->ip_summed = CHECKSUM_NONE;
731 	skb_reset_transport_header(frag);
732 	fh = __skb_push(frag, sizeof(struct frag_hdr));
733 	__skb_push(frag, hlen);
734 	skb_reset_network_header(frag);
735 	memcpy(skb_network_header(frag), iter->tmp_hdr, hlen);
736 	iter->offset += skb->len - hlen - sizeof(struct frag_hdr);
737 	fh->nexthdr = iter->nexthdr;
738 	fh->reserved = 0;
739 	fh->frag_off = htons(iter->offset);
740 	if (frag->next)
741 		fh->frag_off |= htons(IP6_MF);
742 	fh->identification = iter->frag_id;
743 	ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
744 	ip6_copy_metadata(frag, skb);
745 }
746 EXPORT_SYMBOL(ip6_fraglist_prepare);
747 
748 void ip6_frag_init(struct sk_buff *skb, unsigned int hlen, unsigned int mtu,
749 		   unsigned short needed_tailroom, int hdr_room, u8 *prevhdr,
750 		   u8 nexthdr, __be32 frag_id, struct ip6_frag_state *state)
751 {
752 	state->prevhdr = prevhdr;
753 	state->nexthdr = nexthdr;
754 	state->frag_id = frag_id;
755 
756 	state->hlen = hlen;
757 	state->mtu = mtu;
758 
759 	state->left = skb->len - hlen;	/* Space per frame */
760 	state->ptr = hlen;		/* Where to start from */
761 
762 	state->hroom = hdr_room;
763 	state->troom = needed_tailroom;
764 
765 	state->offset = 0;
766 }
767 EXPORT_SYMBOL(ip6_frag_init);
768 
769 struct sk_buff *ip6_frag_next(struct sk_buff *skb, struct ip6_frag_state *state)
770 {
771 	u8 *prevhdr = state->prevhdr, *fragnexthdr_offset;
772 	struct sk_buff *frag;
773 	struct frag_hdr *fh;
774 	unsigned int len;
775 
776 	len = state->left;
777 	/* IF: it doesn't fit, use 'mtu' - the data space left */
778 	if (len > state->mtu)
779 		len = state->mtu;
780 	/* IF: we are not sending up to and including the packet end
781 	   then align the next start on an eight byte boundary */
782 	if (len < state->left)
783 		len &= ~7;
784 
785 	/* Allocate buffer */
786 	frag = alloc_skb(len + state->hlen + sizeof(struct frag_hdr) +
787 			 state->hroom + state->troom, GFP_ATOMIC);
788 	if (!frag)
789 		return ERR_PTR(-ENOMEM);
790 
791 	/*
792 	 *	Set up data on packet
793 	 */
794 
795 	ip6_copy_metadata(frag, skb);
796 	skb_reserve(frag, state->hroom);
797 	skb_put(frag, len + state->hlen + sizeof(struct frag_hdr));
798 	skb_reset_network_header(frag);
799 	fh = (struct frag_hdr *)(skb_network_header(frag) + state->hlen);
800 	frag->transport_header = (frag->network_header + state->hlen +
801 				  sizeof(struct frag_hdr));
802 
803 	/*
804 	 *	Charge the memory for the fragment to any owner
805 	 *	it might possess
806 	 */
807 	if (skb->sk)
808 		skb_set_owner_w(frag, skb->sk);
809 
810 	/*
811 	 *	Copy the packet header into the new buffer.
812 	 */
813 	skb_copy_from_linear_data(skb, skb_network_header(frag), state->hlen);
814 
815 	fragnexthdr_offset = skb_network_header(frag);
816 	fragnexthdr_offset += prevhdr - skb_network_header(skb);
817 	*fragnexthdr_offset = NEXTHDR_FRAGMENT;
818 
819 	/*
820 	 *	Build fragment header.
821 	 */
822 	fh->nexthdr = state->nexthdr;
823 	fh->reserved = 0;
824 	fh->identification = state->frag_id;
825 
826 	/*
827 	 *	Copy a block of the IP datagram.
828 	 */
829 	BUG_ON(skb_copy_bits(skb, state->ptr, skb_transport_header(frag),
830 			     len));
831 	state->left -= len;
832 
833 	fh->frag_off = htons(state->offset);
834 	if (state->left > 0)
835 		fh->frag_off |= htons(IP6_MF);
836 	ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
837 
838 	state->ptr += len;
839 	state->offset += len;
840 
841 	return frag;
842 }
843 EXPORT_SYMBOL(ip6_frag_next);
844 
/*
 * ip6_fragment - split an IPv6 packet into fragments and send each one.
 * @net:    network namespace (statistics, fragment id selection)
 * @sk:     socket passed through to @output
 * @skb:    packet to fragment
 * @output: function called for every fragment produced
 *
 * Two strategies are used. If the skb has a frag_list whose members
 * already have suitable geometry, headers are pushed in front of each
 * member and they are sent in place (fast path). Otherwise new skbs are
 * allocated and the data is copied into them (slow path).
 *
 * Returns 0 on success, -EMSGSIZE when fragmentation is not permitted or
 * the usable MTU is too small (an ICMPv6 "packet too big" is generated),
 * or the error from ip6_find_1stfragopt(), skb_checksum_help(),
 * ip6_fraglist_init(), ip6_frag_next() or @output.
 */
845 int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
846 		 int (*output)(struct net *, struct sock *, struct sk_buff *))
847 {
848 	struct sk_buff *frag;
849 	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
850 	struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
851 				inet6_sk(skb->sk) : NULL;
852 	bool mono_delivery_time = skb->mono_delivery_time;
853 	struct ip6_frag_state state;
854 	unsigned int mtu, hlen, nexthdr_offset;
855 	ktime_t tstamp = skb->tstamp;
856 	int hroom, err = 0;
857 	__be32 frag_id;
858 	u8 *prevhdr, nexthdr = 0;
859 
	/* On success the return value is the length of the headers that
	 * precede the fragment header, and prevhdr points at the
	 * next-header byte that has to be rewritten.
	 */
860 	err = ip6_find_1stfragopt(skb, &prevhdr);
861 	if (err < 0)
862 		goto fail;
863 	hlen = err;
864 	nexthdr = *prevhdr;
	/* Remember prevhdr as an offset so it can be recomputed below. */
865 	nexthdr_offset = prevhdr - skb_network_header(skb);
866 
867 	mtu = ip6_skb_dst_mtu(skb);
868 
869 	/* We must not fragment if the socket is set to force MTU discovery
870 	 * or if the skb is not generated by a local socket.
871 	 */
872 	if (unlikely(!skb->ignore_df && skb->len > mtu))
873 		goto fail_toobig;
874 
875 	if (IP6CB(skb)->frag_max_size) {
876 		if (IP6CB(skb)->frag_max_size > mtu)
877 			goto fail_toobig;
878 
879 		/* don't send fragments larger than what we received */
880 		mtu = IP6CB(skb)->frag_max_size;
881 		if (mtu < IPV6_MIN_MTU)
882 			mtu = IPV6_MIN_MTU;
883 	}
884 
	/* A non-zero per-socket frag_size may only lower the MTU. */
885 	if (np && np->frag_size < mtu) {
886 		if (np->frag_size)
887 			mtu = np->frag_size;
888 	}
	/* There must be room for the headers plus at least 8 payload bytes. */
889 	if (mtu < hlen + sizeof(struct frag_hdr) + 8)
890 		goto fail_toobig;
	/* From here on mtu is the payload room per fragment. */
891 	mtu -= hlen + sizeof(struct frag_hdr);
892 
893 	frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
894 				    &ipv6_hdr(skb)->saddr);
895 
896 	if (skb->ip_summed == CHECKSUM_PARTIAL &&
897 	    (err = skb_checksum_help(skb)))
898 		goto fail;
899 
	/* Recompute prevhdr from the saved offset; presumably the header
	 * may have moved in skb_checksum_help().
	 */
900 	prevhdr = skb_network_header(skb) + nexthdr_offset;
901 	hroom = LL_RESERVED_SPACE(rt->dst.dev);
902 	if (skb_has_frag_list(skb)) {
903 		unsigned int first_len = skb_pagelen(skb);
904 		struct ip6_fraglist_iter iter;
905 		struct sk_buff *frag2;
906 
		/* The head skb must itself be a valid first fragment:
		 * small enough, 8-byte aligned payload, not cloned, and
		 * with head room for the fragment header.
		 */
907 		if (first_len - hlen > mtu ||
908 		    ((first_len - hlen) & 7) ||
909 		    skb_cloned(skb) ||
910 		    skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
911 			goto slow_path;
912 
913 		skb_walk_frags(skb, frag) {
914 			/* Correct geometry. */
915 			if (frag->len > mtu ||
916 			    ((frag->len & 7) && frag->next) ||
917 			    skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
918 				goto slow_path_clean;
919 
920 			/* Partially cloned skb? */
921 			if (skb_shared(frag))
922 				goto slow_path_clean;
923 
924 			BUG_ON(frag->sk);
			/* Give the fragment the head's socket and move its
			 * truesize out of the head's accounting.
			 */
925 			if (skb->sk) {
926 				frag->sk = skb->sk;
927 				frag->destructor = sock_wfree;
928 			}
929 			skb->truesize -= frag->truesize;
930 		}
931 
932 		err = ip6_fraglist_init(skb, hlen, prevhdr, nexthdr, frag_id,
933 					&iter);
934 		if (err < 0)
935 			goto fail;
936 
937 		/* We prevent @rt from being freed. */
938 		rcu_read_lock();
939 
940 		for (;;) {
941 			/* Prepare header of the next frame,
942 			 * before previous one went down. */
943 			if (iter.frag)
944 				ip6_fraglist_prepare(skb, &iter);
945 
946 			skb_set_delivery_time(skb, tstamp, mono_delivery_time);
947 			err = output(net, sk, skb);
948 			if (!err)
949 				IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
950 					      IPSTATS_MIB_FRAGCREATES);
951 
952 			if (err || !iter.frag)
953 				break;
954 
955 			skb = ip6_fraglist_next(&iter);
956 		}
957 
958 		kfree(iter.tmp_hdr);
959 
960 		if (err == 0) {
961 			IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
962 				      IPSTATS_MIB_FRAGOKS);
963 			rcu_read_unlock();
964 			return 0;
965 		}
966 
		/* Free the fragments that were not handed to output(). */
967 		kfree_skb_list(iter.frag);
968 
969 		IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
970 			      IPSTATS_MIB_FRAGFAILS);
971 		rcu_read_unlock();
972 		return err;
973 
		/* Undo the socket/truesize changes made to the fragments
		 * visited before the one that failed the checks, then fall
		 * through to the copying path.
		 */
974 slow_path_clean:
975 		skb_walk_frags(skb, frag2) {
976 			if (frag2 == frag)
977 				break;
978 			frag2->sk = NULL;
979 			frag2->destructor = NULL;
980 			skb->truesize += frag2->truesize;
981 		}
982 	}
983 
984 slow_path:
985 	/*
986 	 *	Fragment the datagram.
987 	 */
988 
989 	ip6_frag_init(skb, hlen, mtu, rt->dst.dev->needed_tailroom,
990 		      LL_RESERVED_SPACE(rt->dst.dev), prevhdr, nexthdr, frag_id,
991 		      &state);
992 
993 	/*
994 	 *	Keep copying data until we run out.
995 	 */
996 
997 	while (state.left > 0) {
998 		frag = ip6_frag_next(skb, &state);
999 		if (IS_ERR(frag)) {
1000 			err = PTR_ERR(frag);
1001 			goto fail;
1002 		}
1003 
1004 		/*
1005 		 *	Put this fragment into the sending queue.
1006 		 */
1007 		skb_set_delivery_time(frag, tstamp, mono_delivery_time);
1008 		err = output(net, sk, frag);
1009 		if (err)
1010 			goto fail;
1011 
1012 		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
1013 			      IPSTATS_MIB_FRAGCREATES);
1014 	}
1015 	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
1016 		      IPSTATS_MIB_FRAGOKS);
	/* Every byte has been copied into a fragment; drop the original. */
1017 	consume_skb(skb);
1018 	return err;
1019 
1020 fail_toobig:
1021 	if (skb->sk && dst_allfrag(skb_dst(skb)))
1022 		sk_gso_disable(skb->sk);
1023 
1024 	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
1025 	err = -EMSGSIZE;
1026 
	/* Common failure exit: count the failure and free the original. */
1027 fail:
1028 	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
1029 		      IPSTATS_MIB_FRAGFAILS);
1030 	kfree_skb(skb);
1031 	return err;
1032 }
1033 
1034 static inline int ip6_rt_check(const struct rt6key *rt_key,
1035 			       const struct in6_addr *fl_addr,
1036 			       const struct in6_addr *addr_cache)
1037 {
1038 	return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
1039 		(!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
1040 }
1041 
1042 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
1043 					  struct dst_entry *dst,
1044 					  const struct flowi6 *fl6)
1045 {
1046 	struct ipv6_pinfo *np = inet6_sk(sk);
1047 	struct rt6_info *rt;
1048 
1049 	if (!dst)
1050 		goto out;
1051 
1052 	if (dst->ops->family != AF_INET6) {
1053 		dst_release(dst);
1054 		return NULL;
1055 	}
1056 
1057 	rt = (struct rt6_info *)dst;
1058 	/* Yes, checking route validity in not connected
1059 	 * case is not very simple. Take into account,
1060 	 * that we do not support routing by source, TOS,
1061 	 * and MSG_DONTROUTE		--ANK (980726)
1062 	 *
1063 	 * 1. ip6_rt_check(): If route was host route,
1064 	 *    check that cached destination is current.
1065 	 *    If it is network route, we still may
1066 	 *    check its validity using saved pointer
1067 	 *    to the last used address: daddr_cache.
1068 	 *    We do not want to save whole address now,
1069 	 *    (because main consumer of this service
1070 	 *    is tcp, which has not this problem),
1071 	 *    so that the last trick works only on connected
1072 	 *    sockets.
1073 	 * 2. oif also should be the same.
1074 	 */
1075 	if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
1076 #ifdef CONFIG_IPV6_SUBTREES
1077 	    ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
1078 #endif
1079 	   (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) {
1080 		dst_release(dst);
1081 		dst = NULL;
1082 	}
1083 
1084 out:
1085 	return dst;
1086 }
1087 
1088 static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
1089 			       struct dst_entry **dst, struct flowi6 *fl6)
1090 {
1091 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
1092 	struct neighbour *n;
1093 	struct rt6_info *rt;
1094 #endif
1095 	int err;
1096 	int flags = 0;
1097 
1098 	/* The correct way to handle this would be to do
1099 	 * ip6_route_get_saddr, and then ip6_route_output; however,
1100 	 * the route-specific preferred source forces the
1101 	 * ip6_route_output call _before_ ip6_route_get_saddr.
1102 	 *
1103 	 * In source specific routing (no src=any default route),
1104 	 * ip6_route_output will fail given src=any saddr, though, so
1105 	 * that's why we try it again later.
1106 	 */
1107 	if (ipv6_addr_any(&fl6->saddr)) {
1108 		struct fib6_info *from;
1109 		struct rt6_info *rt;
1110 
1111 		*dst = ip6_route_output(net, sk, fl6);
1112 		rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;
1113 
1114 		rcu_read_lock();
1115 		from = rt ? rcu_dereference(rt->from) : NULL;
1116 		err = ip6_route_get_saddr(net, from, &fl6->daddr,
1117 					  sk ? inet6_sk(sk)->srcprefs : 0,
1118 					  &fl6->saddr);
1119 		rcu_read_unlock();
1120 
1121 		if (err)
1122 			goto out_err_release;
1123 
1124 		/* If we had an erroneous initial result, pretend it
1125 		 * never existed and let the SA-enabled version take
1126 		 * over.
1127 		 */
1128 		if ((*dst)->error) {
1129 			dst_release(*dst);
1130 			*dst = NULL;
1131 		}
1132 
1133 		if (fl6->flowi6_oif)
1134 			flags |= RT6_LOOKUP_F_IFACE;
1135 	}
1136 
1137 	if (!*dst)
1138 		*dst = ip6_route_output_flags(net, sk, fl6, flags);
1139 
1140 	err = (*dst)->error;
1141 	if (err)
1142 		goto out_err_release;
1143 
1144 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
1145 	/*
1146 	 * Here if the dst entry we've looked up
1147 	 * has a neighbour entry that is in the INCOMPLETE
1148 	 * state and the src address from the flow is
1149 	 * marked as OPTIMISTIC, we release the found
1150 	 * dst entry and replace it instead with the
1151 	 * dst entry of the nexthop router
1152 	 */
1153 	rt = (struct rt6_info *) *dst;
1154 	rcu_read_lock();
1155 	n = __ipv6_neigh_lookup_noref(rt->dst.dev,
1156 				      rt6_nexthop(rt, &fl6->daddr));
1157 	err = n && !(READ_ONCE(n->nud_state) & NUD_VALID) ? -EINVAL : 0;
1158 	rcu_read_unlock();
1159 
1160 	if (err) {
1161 		struct inet6_ifaddr *ifp;
1162 		struct flowi6 fl_gw6;
1163 		int redirect;
1164 
1165 		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
1166 				      (*dst)->dev, 1);
1167 
1168 		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
1169 		if (ifp)
1170 			in6_ifa_put(ifp);
1171 
1172 		if (redirect) {
1173 			/*
1174 			 * We need to get the dst entry for the
1175 			 * default router instead
1176 			 */
1177 			dst_release(*dst);
1178 			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
1179 			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
1180 			*dst = ip6_route_output(net, sk, &fl_gw6);
1181 			err = (*dst)->error;
1182 			if (err)
1183 				goto out_err_release;
1184 		}
1185 	}
1186 #endif
1187 	if (ipv6_addr_v4mapped(&fl6->saddr) &&
1188 	    !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
1189 		err = -EAFNOSUPPORT;
1190 		goto out_err_release;
1191 	}
1192 
1193 	return 0;
1194 
1195 out_err_release:
1196 	dst_release(*dst);
1197 	*dst = NULL;
1198 
1199 	if (err == -ENETUNREACH)
1200 		IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
1201 	return err;
1202 }
1203 
1204 /**
1205  *	ip6_dst_lookup - perform route lookup on flow
1206  *	@net: Network namespace to perform lookup in
1207  *	@sk: socket which provides route info
1208  *	@dst: pointer to dst_entry * for result
1209  *	@fl6: flow to lookup
1210  *
1211  *	This function performs a route lookup on the given flow.
1212  *
1213  *	It returns zero on success, or a standard errno code on error.
1214  */
1215 int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
1216 		   struct flowi6 *fl6)
1217 {
1218 	*dst = NULL;
1219 	return ip6_dst_lookup_tail(net, sk, dst, fl6);
1220 }
1221 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1222 
1223 /**
1224  *	ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1225  *	@net: Network namespace to perform lookup in
1226  *	@sk: socket which provides route info
1227  *	@fl6: flow to lookup
1228  *	@final_dst: final destination address for ipsec lookup
1229  *
1230  *	This function performs a route lookup on the given flow.
1231  *
1232  *	It returns a valid dst pointer on success, or a pointer encoded
1233  *	error code.
1234  */
1235 struct dst_entry *ip6_dst_lookup_flow(struct net *net, const struct sock *sk, struct flowi6 *fl6,
1236 				      const struct in6_addr *final_dst)
1237 {
1238 	struct dst_entry *dst = NULL;
1239 	int err;
1240 
1241 	err = ip6_dst_lookup_tail(net, sk, &dst, fl6);
1242 	if (err)
1243 		return ERR_PTR(err);
1244 	if (final_dst)
1245 		fl6->daddr = *final_dst;
1246 
1247 	return xfrm_lookup_route(net, dst, flowi6_to_flowi(fl6), sk, 0);
1248 }
1249 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
1250 
1251 /**
1252  *	ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1253  *	@sk: socket which provides the dst cache and route info
1254  *	@fl6: flow to lookup
1255  *	@final_dst: final destination address for ipsec lookup
1256  *	@connected: whether @sk is connected or not
1257  *
1258  *	This function performs a route lookup on the given flow with the
1259  *	possibility of using the cached route in the socket if it is valid.
1260  *	It will take the socket dst lock when operating on the dst cache.
1261  *	As a result, this function can only be used in process context.
1262  *
1263  *	In addition, for a connected socket, cache the dst in the socket
1264  *	if the current cache is not valid.
1265  *
1266  *	It returns a valid dst pointer on success, or a pointer encoded
1267  *	error code.
1268  */
1269 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1270 					 const struct in6_addr *final_dst,
1271 					 bool connected)
1272 {
1273 	struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1274 
1275 	dst = ip6_sk_dst_check(sk, dst, fl6);
1276 	if (dst)
1277 		return dst;
1278 
1279 	dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_dst);
1280 	if (connected && !IS_ERR(dst))
1281 		ip6_sk_dst_store_flow(sk, dst_clone(dst), fl6);
1282 
1283 	return dst;
1284 }
1285 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1286 
1287 /**
1288  *      ip6_dst_lookup_tunnel - perform route lookup on tunnel
1289  *      @skb: Packet for which lookup is done
1290  *      @dev: Tunnel device
1291  *      @net: Network namespace of tunnel device
1292  *      @sock: Socket which provides route info
1293  *      @saddr: Memory to store the src ip address
1294  *      @info: Tunnel information
1295  *      @protocol: IP protocol
1296  *      @use_cache: Flag to enable cache usage
1297  *      This function performs a route lookup on a tunnel
1298  *
1299  *      It returns a valid dst pointer and stores src address to be used in
1300  *      tunnel in param saddr on success, else a pointer encoded error code.
1301  */
1302 
1303 struct dst_entry *ip6_dst_lookup_tunnel(struct sk_buff *skb,
1304 					struct net_device *dev,
1305 					struct net *net,
1306 					struct socket *sock,
1307 					struct in6_addr *saddr,
1308 					const struct ip_tunnel_info *info,
1309 					u8 protocol,
1310 					bool use_cache)
1311 {
1312 	struct dst_entry *dst = NULL;
1313 #ifdef CONFIG_DST_CACHE
1314 	struct dst_cache *dst_cache;
1315 #endif
1316 	struct flowi6 fl6;
1317 	__u8 prio;
1318 
1319 #ifdef CONFIG_DST_CACHE
1320 	dst_cache = (struct dst_cache *)&info->dst_cache;
1321 	if (use_cache) {
1322 		dst = dst_cache_get_ip6(dst_cache, saddr);
1323 		if (dst)
1324 			return dst;
1325 	}
1326 #endif
1327 	memset(&fl6, 0, sizeof(fl6));
1328 	fl6.flowi6_mark = skb->mark;
1329 	fl6.flowi6_proto = protocol;
1330 	fl6.daddr = info->key.u.ipv6.dst;
1331 	fl6.saddr = info->key.u.ipv6.src;
1332 	prio = info->key.tos;
1333 	fl6.flowlabel = ip6_make_flowinfo(prio, info->key.label);
1334 
1335 	dst = ipv6_stub->ipv6_dst_lookup_flow(net, sock->sk, &fl6,
1336 					      NULL);
1337 	if (IS_ERR(dst)) {
1338 		netdev_dbg(dev, "no route to %pI6\n", &fl6.daddr);
1339 		return ERR_PTR(-ENETUNREACH);
1340 	}
1341 	if (dst->dev == dev) { /* is this necessary? */
1342 		netdev_dbg(dev, "circular route to %pI6\n", &fl6.daddr);
1343 		dst_release(dst);
1344 		return ERR_PTR(-ELOOP);
1345 	}
1346 #ifdef CONFIG_DST_CACHE
1347 	if (use_cache)
1348 		dst_cache_set_ip6(dst_cache, dst, &fl6.saddr);
1349 #endif
1350 	*saddr = fl6.saddr;
1351 	return dst;
1352 }
1353 EXPORT_SYMBOL_GPL(ip6_dst_lookup_tunnel);
1354 
1355 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1356 					       gfp_t gfp)
1357 {
1358 	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1359 }
1360 
1361 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1362 						gfp_t gfp)
1363 {
1364 	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1365 }
1366 
1367 static void ip6_append_data_mtu(unsigned int *mtu,
1368 				int *maxfraglen,
1369 				unsigned int fragheaderlen,
1370 				struct sk_buff *skb,
1371 				struct rt6_info *rt,
1372 				unsigned int orig_mtu)
1373 {
1374 	if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1375 		if (!skb) {
1376 			/* first fragment, reserve header_len */
1377 			*mtu = orig_mtu - rt->dst.header_len;
1378 
1379 		} else {
1380 			/*
1381 			 * this fragment is not first, the headers
1382 			 * space is regarded as data space.
1383 			 */
1384 			*mtu = orig_mtu;
1385 		}
1386 		*maxfraglen = ((*mtu - fragheaderlen) & ~7)
1387 			      + fragheaderlen - sizeof(struct frag_hdr);
1388 	}
1389 }
1390 
1391 static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
1392 			  struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
1393 			  struct rt6_info *rt)
1394 {
1395 	struct ipv6_pinfo *np = inet6_sk(sk);
1396 	unsigned int mtu;
1397 	struct ipv6_txoptions *nopt, *opt = ipc6->opt;
1398 
1399 	/* callers pass dst together with a reference, set it first so
1400 	 * ip6_cork_release() can put it down even in case of an error.
1401 	 */
1402 	cork->base.dst = &rt->dst;
1403 
1404 	/*
1405 	 * setup for corking
1406 	 */
1407 	if (opt) {
1408 		if (WARN_ON(v6_cork->opt))
1409 			return -EINVAL;
1410 
1411 		nopt = v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
1412 		if (unlikely(!nopt))
1413 			return -ENOBUFS;
1414 
1415 		nopt->tot_len = sizeof(*opt);
1416 		nopt->opt_flen = opt->opt_flen;
1417 		nopt->opt_nflen = opt->opt_nflen;
1418 
1419 		nopt->dst0opt = ip6_opt_dup(opt->dst0opt, sk->sk_allocation);
1420 		if (opt->dst0opt && !nopt->dst0opt)
1421 			return -ENOBUFS;
1422 
1423 		nopt->dst1opt = ip6_opt_dup(opt->dst1opt, sk->sk_allocation);
1424 		if (opt->dst1opt && !nopt->dst1opt)
1425 			return -ENOBUFS;
1426 
1427 		nopt->hopopt = ip6_opt_dup(opt->hopopt, sk->sk_allocation);
1428 		if (opt->hopopt && !nopt->hopopt)
1429 			return -ENOBUFS;
1430 
1431 		nopt->srcrt = ip6_rthdr_dup(opt->srcrt, sk->sk_allocation);
1432 		if (opt->srcrt && !nopt->srcrt)
1433 			return -ENOBUFS;
1434 
1435 		/* need source address above miyazawa*/
1436 	}
1437 	v6_cork->hop_limit = ipc6->hlimit;
1438 	v6_cork->tclass = ipc6->tclass;
1439 	if (rt->dst.flags & DST_XFRM_TUNNEL)
1440 		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1441 		      READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst);
1442 	else
1443 		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1444 			READ_ONCE(rt->dst.dev->mtu) : dst_mtu(xfrm_dst_path(&rt->dst));
1445 	if (np->frag_size < mtu) {
1446 		if (np->frag_size)
1447 			mtu = np->frag_size;
1448 	}
1449 	cork->base.fragsize = mtu;
1450 	cork->base.gso_size = ipc6->gso_size;
1451 	cork->base.tx_flags = 0;
1452 	cork->base.mark = ipc6->sockc.mark;
1453 	sock_tx_timestamp(sk, ipc6->sockc.tsflags, &cork->base.tx_flags);
1454 
1455 	if (dst_allfrag(xfrm_dst_path(&rt->dst)))
1456 		cork->base.flags |= IPCORK_ALLFRAG;
1457 	cork->base.length = 0;
1458 
1459 	cork->base.transmit_time = ipc6->sockc.transmit_time;
1460 
1461 	return 0;
1462 }
1463 
1464 static int __ip6_append_data(struct sock *sk,
1465 			     struct sk_buff_head *queue,
1466 			     struct inet_cork_full *cork_full,
1467 			     struct inet6_cork *v6_cork,
1468 			     struct page_frag *pfrag,
1469 			     int getfrag(void *from, char *to, int offset,
1470 					 int len, int odd, struct sk_buff *skb),
1471 			     void *from, size_t length, int transhdrlen,
1472 			     unsigned int flags, struct ipcm6_cookie *ipc6)
1473 {
1474 	struct sk_buff *skb, *skb_prev = NULL;
1475 	struct inet_cork *cork = &cork_full->base;
1476 	struct flowi6 *fl6 = &cork_full->fl.u.ip6;
1477 	unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
1478 	struct ubuf_info *uarg = NULL;
1479 	int exthdrlen = 0;
1480 	int dst_exthdrlen = 0;
1481 	int hh_len;
1482 	int copy;
1483 	int err;
1484 	int offset = 0;
1485 	bool zc = false;
1486 	u32 tskey = 0;
1487 	struct rt6_info *rt = (struct rt6_info *)cork->dst;
1488 	struct ipv6_txoptions *opt = v6_cork->opt;
1489 	int csummode = CHECKSUM_NONE;
1490 	unsigned int maxnonfragsize, headersize;
1491 	unsigned int wmem_alloc_delta = 0;
1492 	bool paged, extra_uref = false;
1493 
1494 	skb = skb_peek_tail(queue);
1495 	if (!skb) {
1496 		exthdrlen = opt ? opt->opt_flen : 0;
1497 		dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
1498 	}
1499 
1500 	paged = !!cork->gso_size;
1501 	mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize;
1502 	orig_mtu = mtu;
1503 
1504 	if (cork->tx_flags & SKBTX_ANY_TSTAMP &&
1505 	    sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
1506 		tskey = atomic_inc_return(&sk->sk_tskey) - 1;
1507 
1508 	hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1509 
1510 	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1511 			(opt ? opt->opt_nflen : 0);
1512 
1513 	headersize = sizeof(struct ipv6hdr) +
1514 		     (opt ? opt->opt_flen + opt->opt_nflen : 0) +
1515 		     (dst_allfrag(&rt->dst) ?
1516 		      sizeof(struct frag_hdr) : 0) +
1517 		     rt->rt6i_nfheader_len;
1518 
1519 	if (mtu <= fragheaderlen ||
1520 	    ((mtu - fragheaderlen) & ~7) + fragheaderlen <= sizeof(struct frag_hdr))
1521 		goto emsgsize;
1522 
1523 	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
1524 		     sizeof(struct frag_hdr);
1525 
1526 	/* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit
1527 	 * the first fragment
1528 	 */
1529 	if (headersize + transhdrlen > mtu)
1530 		goto emsgsize;
1531 
1532 	if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
1533 	    (sk->sk_protocol == IPPROTO_UDP ||
1534 	     sk->sk_protocol == IPPROTO_ICMPV6 ||
1535 	     sk->sk_protocol == IPPROTO_RAW)) {
1536 		ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
1537 				sizeof(struct ipv6hdr));
1538 		goto emsgsize;
1539 	}
1540 
1541 	if (ip6_sk_ignore_df(sk))
1542 		maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
1543 	else
1544 		maxnonfragsize = mtu;
1545 
1546 	if (cork->length + length > maxnonfragsize - headersize) {
1547 emsgsize:
1548 		pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0);
1549 		ipv6_local_error(sk, EMSGSIZE, fl6, pmtu);
1550 		return -EMSGSIZE;
1551 	}
1552 
1553 	/* CHECKSUM_PARTIAL only with no extension headers and when
1554 	 * we are not going to fragment
1555 	 */
1556 	if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
1557 	    headersize == sizeof(struct ipv6hdr) &&
1558 	    length <= mtu - headersize &&
1559 	    (!(flags & MSG_MORE) || cork->gso_size) &&
1560 	    rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
1561 		csummode = CHECKSUM_PARTIAL;
1562 
1563 	if ((flags & MSG_ZEROCOPY) && length) {
1564 		struct msghdr *msg = from;
1565 
1566 		if (getfrag == ip_generic_getfrag && msg->msg_ubuf) {
1567 			if (skb_zcopy(skb) && msg->msg_ubuf != skb_zcopy(skb))
1568 				return -EINVAL;
1569 
1570 			/* Leave uarg NULL if can't zerocopy, callers should
1571 			 * be able to handle it.
1572 			 */
1573 			if ((rt->dst.dev->features & NETIF_F_SG) &&
1574 			    csummode == CHECKSUM_PARTIAL) {
1575 				paged = true;
1576 				zc = true;
1577 				uarg = msg->msg_ubuf;
1578 			}
1579 		} else if (sock_flag(sk, SOCK_ZEROCOPY)) {
1580 			uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb));
1581 			if (!uarg)
1582 				return -ENOBUFS;
1583 			extra_uref = !skb_zcopy(skb);	/* only ref on new uarg */
1584 			if (rt->dst.dev->features & NETIF_F_SG &&
1585 			    csummode == CHECKSUM_PARTIAL) {
1586 				paged = true;
1587 				zc = true;
1588 			} else {
1589 				uarg_to_msgzc(uarg)->zerocopy = 0;
1590 				skb_zcopy_set(skb, uarg, &extra_uref);
1591 			}
1592 		}
1593 	} else if ((flags & MSG_SPLICE_PAGES) && length) {
1594 		if (inet_sk(sk)->hdrincl)
1595 			return -EPERM;
1596 		if (rt->dst.dev->features & NETIF_F_SG)
1597 			/* We need an empty buffer to attach stuff to */
1598 			paged = true;
1599 		else
1600 			flags &= ~MSG_SPLICE_PAGES;
1601 	}
1602 
1603 	/*
1604 	 * Let's try using as much space as possible.
1605 	 * Use MTU if total length of the message fits into the MTU.
1606 	 * Otherwise, we need to reserve fragment header and
1607 	 * fragment alignment (= 8-15 octects, in total).
1608 	 *
1609 	 * Note that we may need to "move" the data from the tail
1610 	 * of the buffer to the new fragment when we split
1611 	 * the message.
1612 	 *
1613 	 * FIXME: It may be fragmented into multiple chunks
1614 	 *        at once if non-fragmentable extension headers
1615 	 *        are too large.
1616 	 * --yoshfuji
1617 	 */
1618 
1619 	cork->length += length;
1620 	if (!skb)
1621 		goto alloc_new_skb;
1622 
1623 	while (length > 0) {
1624 		/* Check if the remaining data fits into current packet. */
1625 		copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1626 		if (copy < length)
1627 			copy = maxfraglen - skb->len;
1628 
1629 		if (copy <= 0) {
1630 			char *data;
1631 			unsigned int datalen;
1632 			unsigned int fraglen;
1633 			unsigned int fraggap;
1634 			unsigned int alloclen, alloc_extra;
1635 			unsigned int pagedlen;
1636 alloc_new_skb:
1637 			/* There's no room in the current skb */
1638 			if (skb)
1639 				fraggap = skb->len - maxfraglen;
1640 			else
1641 				fraggap = 0;
1642 			/* update mtu and maxfraglen if necessary */
1643 			if (!skb || !skb_prev)
1644 				ip6_append_data_mtu(&mtu, &maxfraglen,
1645 						    fragheaderlen, skb, rt,
1646 						    orig_mtu);
1647 
1648 			skb_prev = skb;
1649 
1650 			/*
1651 			 * If remaining data exceeds the mtu,
1652 			 * we know we need more fragment(s).
1653 			 */
1654 			datalen = length + fraggap;
1655 
1656 			if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1657 				datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
1658 			fraglen = datalen + fragheaderlen;
1659 			pagedlen = 0;
1660 
1661 			alloc_extra = hh_len;
1662 			alloc_extra += dst_exthdrlen;
1663 			alloc_extra += rt->dst.trailer_len;
1664 
1665 			/* We just reserve space for fragment header.
1666 			 * Note: this may be overallocation if the message
1667 			 * (without MSG_MORE) fits into the MTU.
1668 			 */
1669 			alloc_extra += sizeof(struct frag_hdr);
1670 
1671 			if ((flags & MSG_MORE) &&
1672 			    !(rt->dst.dev->features&NETIF_F_SG))
1673 				alloclen = mtu;
1674 			else if (!paged &&
1675 				 (fraglen + alloc_extra < SKB_MAX_ALLOC ||
1676 				  !(rt->dst.dev->features & NETIF_F_SG)))
1677 				alloclen = fraglen;
1678 			else {
1679 				alloclen = fragheaderlen + transhdrlen;
1680 				pagedlen = datalen - transhdrlen;
1681 			}
1682 			alloclen += alloc_extra;
1683 
1684 			if (datalen != length + fraggap) {
1685 				/*
1686 				 * this is not the last fragment, the trailer
1687 				 * space is regarded as data space.
1688 				 */
1689 				datalen += rt->dst.trailer_len;
1690 			}
1691 
1692 			fraglen = datalen + fragheaderlen;
1693 
1694 			copy = datalen - transhdrlen - fraggap - pagedlen;
1695 			if (copy < 0) {
1696 				err = -EINVAL;
1697 				goto error;
1698 			}
1699 			if (transhdrlen) {
1700 				skb = sock_alloc_send_skb(sk, alloclen,
1701 						(flags & MSG_DONTWAIT), &err);
1702 			} else {
1703 				skb = NULL;
1704 				if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <=
1705 				    2 * sk->sk_sndbuf)
1706 					skb = alloc_skb(alloclen,
1707 							sk->sk_allocation);
1708 				if (unlikely(!skb))
1709 					err = -ENOBUFS;
1710 			}
1711 			if (!skb)
1712 				goto error;
1713 			/*
1714 			 *	Fill in the control structures
1715 			 */
1716 			skb->protocol = htons(ETH_P_IPV6);
1717 			skb->ip_summed = csummode;
1718 			skb->csum = 0;
1719 			/* reserve for fragmentation and ipsec header */
1720 			skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1721 				    dst_exthdrlen);
1722 
1723 			/*
1724 			 *	Find where to start putting bytes
1725 			 */
1726 			data = skb_put(skb, fraglen - pagedlen);
1727 			skb_set_network_header(skb, exthdrlen);
1728 			data += fragheaderlen;
1729 			skb->transport_header = (skb->network_header +
1730 						 fragheaderlen);
1731 			if (fraggap) {
1732 				skb->csum = skb_copy_and_csum_bits(
1733 					skb_prev, maxfraglen,
1734 					data + transhdrlen, fraggap);
1735 				skb_prev->csum = csum_sub(skb_prev->csum,
1736 							  skb->csum);
1737 				data += fraggap;
1738 				pskb_trim_unique(skb_prev, maxfraglen);
1739 			}
1740 			if (copy > 0 &&
1741 			    getfrag(from, data + transhdrlen, offset,
1742 				    copy, fraggap, skb) < 0) {
1743 				err = -EFAULT;
1744 				kfree_skb(skb);
1745 				goto error;
1746 			}
1747 
1748 			offset += copy;
1749 			length -= copy + transhdrlen;
1750 			transhdrlen = 0;
1751 			exthdrlen = 0;
1752 			dst_exthdrlen = 0;
1753 
1754 			/* Only the initial fragment is time stamped */
1755 			skb_shinfo(skb)->tx_flags = cork->tx_flags;
1756 			cork->tx_flags = 0;
1757 			skb_shinfo(skb)->tskey = tskey;
1758 			tskey = 0;
1759 			skb_zcopy_set(skb, uarg, &extra_uref);
1760 
1761 			if ((flags & MSG_CONFIRM) && !skb_prev)
1762 				skb_set_dst_pending_confirm(skb, 1);
1763 
1764 			/*
1765 			 * Put the packet on the pending queue
1766 			 */
1767 			if (!skb->destructor) {
1768 				skb->destructor = sock_wfree;
1769 				skb->sk = sk;
1770 				wmem_alloc_delta += skb->truesize;
1771 			}
1772 			__skb_queue_tail(queue, skb);
1773 			continue;
1774 		}
1775 
1776 		if (copy > length)
1777 			copy = length;
1778 
1779 		if (!(rt->dst.dev->features&NETIF_F_SG) &&
1780 		    skb_tailroom(skb) >= copy) {
1781 			unsigned int off;
1782 
1783 			off = skb->len;
1784 			if (getfrag(from, skb_put(skb, copy),
1785 						offset, copy, off, skb) < 0) {
1786 				__skb_trim(skb, off);
1787 				err = -EFAULT;
1788 				goto error;
1789 			}
1790 		} else if (flags & MSG_SPLICE_PAGES) {
1791 			struct msghdr *msg = from;
1792 
1793 			err = skb_splice_from_iter(skb, &msg->msg_iter, copy,
1794 						   sk->sk_allocation);
1795 			if (err < 0)
1796 				goto error;
1797 			copy = err;
1798 			wmem_alloc_delta += copy;
1799 		} else if (!zc) {
1800 			int i = skb_shinfo(skb)->nr_frags;
1801 
1802 			err = -ENOMEM;
1803 			if (!sk_page_frag_refill(sk, pfrag))
1804 				goto error;
1805 
1806 			skb_zcopy_downgrade_managed(skb);
1807 			if (!skb_can_coalesce(skb, i, pfrag->page,
1808 					      pfrag->offset)) {
1809 				err = -EMSGSIZE;
1810 				if (i == MAX_SKB_FRAGS)
1811 					goto error;
1812 
1813 				__skb_fill_page_desc(skb, i, pfrag->page,
1814 						     pfrag->offset, 0);
1815 				skb_shinfo(skb)->nr_frags = ++i;
1816 				get_page(pfrag->page);
1817 			}
1818 			copy = min_t(int, copy, pfrag->size - pfrag->offset);
1819 			if (getfrag(from,
1820 				    page_address(pfrag->page) + pfrag->offset,
1821 				    offset, copy, skb->len, skb) < 0)
1822 				goto error_efault;
1823 
1824 			pfrag->offset += copy;
1825 			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1826 			skb->len += copy;
1827 			skb->data_len += copy;
1828 			skb->truesize += copy;
1829 			wmem_alloc_delta += copy;
1830 		} else {
1831 			err = skb_zerocopy_iter_dgram(skb, from, copy);
1832 			if (err < 0)
1833 				goto error;
1834 		}
1835 		offset += copy;
1836 		length -= copy;
1837 	}
1838 
1839 	if (wmem_alloc_delta)
1840 		refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1841 	return 0;
1842 
1843 error_efault:
1844 	err = -EFAULT;
1845 error:
1846 	net_zcopy_put_abort(uarg, extra_uref);
1847 	cork->length -= length;
1848 	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1849 	refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1850 	return err;
1851 }
1852 
1853 int ip6_append_data(struct sock *sk,
1854 		    int getfrag(void *from, char *to, int offset, int len,
1855 				int odd, struct sk_buff *skb),
1856 		    void *from, size_t length, int transhdrlen,
1857 		    struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1858 		    struct rt6_info *rt, unsigned int flags)
1859 {
1860 	struct inet_sock *inet = inet_sk(sk);
1861 	struct ipv6_pinfo *np = inet6_sk(sk);
1862 	int exthdrlen;
1863 	int err;
1864 
1865 	if (flags&MSG_PROBE)
1866 		return 0;
1867 	if (skb_queue_empty(&sk->sk_write_queue)) {
1868 		/*
1869 		 * setup for corking
1870 		 */
1871 		dst_hold(&rt->dst);
1872 		err = ip6_setup_cork(sk, &inet->cork, &np->cork,
1873 				     ipc6, rt);
1874 		if (err)
1875 			return err;
1876 
1877 		inet->cork.fl.u.ip6 = *fl6;
1878 		exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1879 		length += exthdrlen;
1880 		transhdrlen += exthdrlen;
1881 	} else {
1882 		transhdrlen = 0;
1883 	}
1884 
1885 	return __ip6_append_data(sk, &sk->sk_write_queue, &inet->cork,
1886 				 &np->cork, sk_page_frag(sk), getfrag,
1887 				 from, length, transhdrlen, flags, ipc6);
1888 }
1889 EXPORT_SYMBOL_GPL(ip6_append_data);
1890 
1891 static void ip6_cork_steal_dst(struct sk_buff *skb, struct inet_cork_full *cork)
1892 {
1893 	struct dst_entry *dst = cork->base.dst;
1894 
1895 	cork->base.dst = NULL;
1896 	cork->base.flags &= ~IPCORK_ALLFRAG;
1897 	skb_dst_set(skb, dst);
1898 }
1899 
1900 static void ip6_cork_release(struct inet_cork_full *cork,
1901 			     struct inet6_cork *v6_cork)
1902 {
1903 	if (v6_cork->opt) {
1904 		struct ipv6_txoptions *opt = v6_cork->opt;
1905 
1906 		kfree(opt->dst0opt);
1907 		kfree(opt->dst1opt);
1908 		kfree(opt->hopopt);
1909 		kfree(opt->srcrt);
1910 		kfree(opt);
1911 		v6_cork->opt = NULL;
1912 	}
1913 
1914 	if (cork->base.dst) {
1915 		dst_release(cork->base.dst);
1916 		cork->base.dst = NULL;
1917 		cork->base.flags &= ~IPCORK_ALLFRAG;
1918 	}
1919 }
1920 
1921 struct sk_buff *__ip6_make_skb(struct sock *sk,
1922 			       struct sk_buff_head *queue,
1923 			       struct inet_cork_full *cork,
1924 			       struct inet6_cork *v6_cork)
1925 {
1926 	struct sk_buff *skb, *tmp_skb;
1927 	struct sk_buff **tail_skb;
1928 	struct in6_addr *final_dst;
1929 	struct ipv6_pinfo *np = inet6_sk(sk);
1930 	struct net *net = sock_net(sk);
1931 	struct ipv6hdr *hdr;
1932 	struct ipv6_txoptions *opt = v6_cork->opt;
1933 	struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
1934 	struct flowi6 *fl6 = &cork->fl.u.ip6;
1935 	unsigned char proto = fl6->flowi6_proto;
1936 
1937 	skb = __skb_dequeue(queue);
1938 	if (!skb)
1939 		goto out;
1940 	tail_skb = &(skb_shinfo(skb)->frag_list);
1941 
1942 	/* move skb->data to ip header from ext header */
1943 	if (skb->data < skb_network_header(skb))
1944 		__skb_pull(skb, skb_network_offset(skb));
1945 	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
1946 		__skb_pull(tmp_skb, skb_network_header_len(skb));
1947 		*tail_skb = tmp_skb;
1948 		tail_skb = &(tmp_skb->next);
1949 		skb->len += tmp_skb->len;
1950 		skb->data_len += tmp_skb->len;
1951 		skb->truesize += tmp_skb->truesize;
1952 		tmp_skb->destructor = NULL;
1953 		tmp_skb->sk = NULL;
1954 	}
1955 
1956 	/* Allow local fragmentation. */
1957 	skb->ignore_df = ip6_sk_ignore_df(sk);
1958 	__skb_pull(skb, skb_network_header_len(skb));
1959 
1960 	final_dst = &fl6->daddr;
1961 	if (opt && opt->opt_flen)
1962 		ipv6_push_frag_opts(skb, opt, &proto);
1963 	if (opt && opt->opt_nflen)
1964 		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);
1965 
1966 	skb_push(skb, sizeof(struct ipv6hdr));
1967 	skb_reset_network_header(skb);
1968 	hdr = ipv6_hdr(skb);
1969 
1970 	ip6_flow_hdr(hdr, v6_cork->tclass,
1971 		     ip6_make_flowlabel(net, skb, fl6->flowlabel,
1972 					ip6_autoflowlabel(net, np), fl6));
1973 	hdr->hop_limit = v6_cork->hop_limit;
1974 	hdr->nexthdr = proto;
1975 	hdr->saddr = fl6->saddr;
1976 	hdr->daddr = *final_dst;
1977 
1978 	skb->priority = sk->sk_priority;
1979 	skb->mark = cork->base.mark;
1980 	skb->tstamp = cork->base.transmit_time;
1981 
1982 	ip6_cork_steal_dst(skb, cork);
1983 	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1984 	if (proto == IPPROTO_ICMPV6) {
1985 		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1986 		u8 icmp6_type;
1987 
1988 		if (sk->sk_socket->type == SOCK_RAW && !inet_sk(sk)->hdrincl)
1989 			icmp6_type = fl6->fl6_icmp_type;
1990 		else
1991 			icmp6_type = icmp6_hdr(skb)->icmp6_type;
1992 		ICMP6MSGOUT_INC_STATS(net, idev, icmp6_type);
1993 		ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
1994 	}
1995 
1996 	ip6_cork_release(cork, v6_cork);
1997 out:
1998 	return skb;
1999 }
2000 
2001 int ip6_send_skb(struct sk_buff *skb)
2002 {
2003 	struct net *net = sock_net(skb->sk);
2004 	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
2005 	int err;
2006 
2007 	err = ip6_local_out(net, skb->sk, skb);
2008 	if (err) {
2009 		if (err > 0)
2010 			err = net_xmit_errno(err);
2011 		if (err)
2012 			IP6_INC_STATS(net, rt->rt6i_idev,
2013 				      IPSTATS_MIB_OUTDISCARDS);
2014 	}
2015 
2016 	return err;
2017 }
2018 
/*
 * Build one packet from the data pending on @sk and send it.
 *
 * Returns 0 when ip6_finish_skb() produced no packet, otherwise the
 * result of ip6_send_skb().
 */
int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb = ip6_finish_skb(sk);

	return skb ? ip6_send_skb(skb) : 0;
}
EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
2030 
2031 static void __ip6_flush_pending_frames(struct sock *sk,
2032 				       struct sk_buff_head *queue,
2033 				       struct inet_cork_full *cork,
2034 				       struct inet6_cork *v6_cork)
2035 {
2036 	struct sk_buff *skb;
2037 
2038 	while ((skb = __skb_dequeue_tail(queue)) != NULL) {
2039 		if (skb_dst(skb))
2040 			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
2041 				      IPSTATS_MIB_OUTDISCARDS);
2042 		kfree_skb(skb);
2043 	}
2044 
2045 	ip6_cork_release(cork, v6_cork);
2046 }
2047 
2048 void ip6_flush_pending_frames(struct sock *sk)
2049 {
2050 	__ip6_flush_pending_frames(sk, &sk->sk_write_queue,
2051 				   &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
2052 }
2053 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
2054 
2055 struct sk_buff *ip6_make_skb(struct sock *sk,
2056 			     int getfrag(void *from, char *to, int offset,
2057 					 int len, int odd, struct sk_buff *skb),
2058 			     void *from, size_t length, int transhdrlen,
2059 			     struct ipcm6_cookie *ipc6, struct rt6_info *rt,
2060 			     unsigned int flags, struct inet_cork_full *cork)
2061 {
2062 	struct inet6_cork v6_cork;
2063 	struct sk_buff_head queue;
2064 	int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
2065 	int err;
2066 
2067 	if (flags & MSG_PROBE) {
2068 		dst_release(&rt->dst);
2069 		return NULL;
2070 	}
2071 
2072 	__skb_queue_head_init(&queue);
2073 
2074 	cork->base.flags = 0;
2075 	cork->base.addr = 0;
2076 	cork->base.opt = NULL;
2077 	v6_cork.opt = NULL;
2078 	err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt);
2079 	if (err) {
2080 		ip6_cork_release(cork, &v6_cork);
2081 		return ERR_PTR(err);
2082 	}
2083 	if (ipc6->dontfrag < 0)
2084 		ipc6->dontfrag = inet6_sk(sk)->dontfrag;
2085 
2086 	err = __ip6_append_data(sk, &queue, cork, &v6_cork,
2087 				&current->task_frag, getfrag, from,
2088 				length + exthdrlen, transhdrlen + exthdrlen,
2089 				flags, ipc6);
2090 	if (err) {
2091 		__ip6_flush_pending_frames(sk, &queue, cork, &v6_cork);
2092 		return ERR_PTR(err);
2093 	}
2094 
2095 	return __ip6_make_skb(sk, &queue, cork, &v6_cork);
2096 }
2097