xref: /openbmc/linux/net/ipv6/ip6_output.c (revision 8957261c)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  *	IPv6 output functions
4  *	Linux INET6 implementation
5  *
6  *	Authors:
7  *	Pedro Roque		<roque@di.fc.ul.pt>
8  *
9  *	Based on linux/net/ipv4/ip_output.c
10  *
11  *	Changes:
12  *	A.N.Kuznetsov	:	arithmetic in fragmentation.
13  *				extension headers are implemented.
14  *				route changes now work.
15  *				ip6_forward does not confuse sniffers.
16  *				etc.
17  *
18  *      H. von Brand    :       Added missing #include <linux/string.h>
19  *	Imran Patel	:	frag id should be in NBO
20  *      Kazunori MIYAZAWA @USAGI
21  *			:       add ip6_append_data and related functions
22  *				for datagram xmit
23  */
24 
25 #include <linux/errno.h>
26 #include <linux/kernel.h>
27 #include <linux/string.h>
28 #include <linux/socket.h>
29 #include <linux/net.h>
30 #include <linux/netdevice.h>
31 #include <linux/if_arp.h>
32 #include <linux/in6.h>
33 #include <linux/tcp.h>
34 #include <linux/route.h>
35 #include <linux/module.h>
36 #include <linux/slab.h>
37 
38 #include <linux/bpf-cgroup.h>
39 #include <linux/netfilter.h>
40 #include <linux/netfilter_ipv6.h>
41 
42 #include <net/sock.h>
43 #include <net/snmp.h>
44 
45 #include <net/gso.h>
46 #include <net/ipv6.h>
47 #include <net/ndisc.h>
48 #include <net/protocol.h>
49 #include <net/ip6_route.h>
50 #include <net/addrconf.h>
51 #include <net/rawv6.h>
52 #include <net/icmp.h>
53 #include <net/xfrm.h>
54 #include <net/checksum.h>
55 #include <linux/mroute6.h>
56 #include <net/l3mdev.h>
57 #include <net/lwtunnel.h>
58 #include <net/ip_tunnels.h>
59 
60 static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
61 {
62 	struct dst_entry *dst = skb_dst(skb);
63 	struct net_device *dev = dst->dev;
64 	struct inet6_dev *idev = ip6_dst_idev(dst);
65 	unsigned int hh_len = LL_RESERVED_SPACE(dev);
66 	const struct in6_addr *daddr, *nexthop;
67 	struct ipv6hdr *hdr;
68 	struct neighbour *neigh;
69 	int ret;
70 
71 	/* Be paranoid, rather than too clever. */
72 	if (unlikely(hh_len > skb_headroom(skb)) && dev->header_ops) {
73 		skb = skb_expand_head(skb, hh_len);
74 		if (!skb) {
75 			IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
76 			return -ENOMEM;
77 		}
78 	}
79 
80 	hdr = ipv6_hdr(skb);
81 	daddr = &hdr->daddr;
82 	if (ipv6_addr_is_multicast(daddr)) {
83 		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
84 		    ((mroute6_is_socket(net, skb) &&
85 		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
86 		     ipv6_chk_mcast_addr(dev, daddr, &hdr->saddr))) {
87 			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
88 
89 			/* Do not check for IFF_ALLMULTI; multicast routing
90 			   is not supported in any case.
91 			 */
92 			if (newskb)
93 				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
94 					net, sk, newskb, NULL, newskb->dev,
95 					dev_loopback_xmit);
96 
97 			if (hdr->hop_limit == 0) {
98 				IP6_INC_STATS(net, idev,
99 					      IPSTATS_MIB_OUTDISCARDS);
100 				kfree_skb(skb);
101 				return 0;
102 			}
103 		}
104 
105 		IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);
106 		if (IPV6_ADDR_MC_SCOPE(daddr) <= IPV6_ADDR_SCOPE_NODELOCAL &&
107 		    !(dev->flags & IFF_LOOPBACK)) {
108 			kfree_skb(skb);
109 			return 0;
110 		}
111 	}
112 
113 	if (lwtunnel_xmit_redirect(dst->lwtstate)) {
114 		int res = lwtunnel_xmit(skb);
115 
116 		if (res != LWTUNNEL_XMIT_CONTINUE)
117 			return res;
118 	}
119 
120 	rcu_read_lock();
121 	nexthop = rt6_nexthop((struct rt6_info *)dst, daddr);
122 	neigh = __ipv6_neigh_lookup_noref(dev, nexthop);
123 
124 	if (unlikely(IS_ERR_OR_NULL(neigh))) {
125 		if (unlikely(!neigh))
126 			neigh = __neigh_create(&nd_tbl, nexthop, dev, false);
127 		if (IS_ERR(neigh)) {
128 			rcu_read_unlock();
129 			IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTNOROUTES);
130 			kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_CREATEFAIL);
131 			return -EINVAL;
132 		}
133 	}
134 	sock_confirm_neigh(skb, neigh);
135 	ret = neigh_output(neigh, skb, false);
136 	rcu_read_unlock();
137 	return ret;
138 }
139 
140 static int
141 ip6_finish_output_gso_slowpath_drop(struct net *net, struct sock *sk,
142 				    struct sk_buff *skb, unsigned int mtu)
143 {
144 	struct sk_buff *segs, *nskb;
145 	netdev_features_t features;
146 	int ret = 0;
147 
148 	/* Please see corresponding comment in ip_finish_output_gso
149 	 * describing the cases where GSO segment length exceeds the
150 	 * egress MTU.
151 	 */
152 	features = netif_skb_features(skb);
153 	segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
154 	if (IS_ERR_OR_NULL(segs)) {
155 		kfree_skb(skb);
156 		return -ENOMEM;
157 	}
158 
159 	consume_skb(skb);
160 
161 	skb_list_walk_safe(segs, segs, nskb) {
162 		int err;
163 
164 		skb_mark_not_on_list(segs);
165 		err = ip6_fragment(net, sk, segs, ip6_finish_output2);
166 		if (err && ret == 0)
167 			ret = err;
168 	}
169 
170 	return ret;
171 }
172 
173 static int __ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
174 {
175 	unsigned int mtu;
176 
177 #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
178 	/* Policy lookup after SNAT yielded a new policy */
179 	if (skb_dst(skb)->xfrm) {
180 		IP6CB(skb)->flags |= IP6SKB_REROUTED;
181 		return dst_output(net, sk, skb);
182 	}
183 #endif
184 
185 	mtu = ip6_skb_dst_mtu(skb);
186 	if (skb_is_gso(skb) &&
187 	    !(IP6CB(skb)->flags & IP6SKB_FAKEJUMBO) &&
188 	    !skb_gso_validate_network_len(skb, mtu))
189 		return ip6_finish_output_gso_slowpath_drop(net, sk, skb, mtu);
190 
191 	if ((skb->len > mtu && !skb_is_gso(skb)) ||
192 	    dst_allfrag(skb_dst(skb)) ||
193 	    (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
194 		return ip6_fragment(net, sk, skb, ip6_finish_output2);
195 	else
196 		return ip6_finish_output2(net, sk, skb);
197 }
198 
199 static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
200 {
201 	int ret;
202 
203 	ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
204 	switch (ret) {
205 	case NET_XMIT_SUCCESS:
206 	case NET_XMIT_CN:
207 		return __ip6_finish_output(net, sk, skb) ? : ret;
208 	default:
209 		kfree_skb_reason(skb, SKB_DROP_REASON_BPF_CGROUP_EGRESS);
210 		return ret;
211 	}
212 }
213 
214 int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
215 {
216 	struct net_device *dev = skb_dst(skb)->dev, *indev = skb->dev;
217 	struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
218 
219 	skb->protocol = htons(ETH_P_IPV6);
220 	skb->dev = dev;
221 
222 	if (unlikely(idev->cnf.disable_ipv6)) {
223 		IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
224 		kfree_skb_reason(skb, SKB_DROP_REASON_IPV6DISABLED);
225 		return 0;
226 	}
227 
228 	return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
229 			    net, sk, skb, indev, dev,
230 			    ip6_finish_output,
231 			    !(IP6CB(skb)->flags & IP6SKB_REROUTED));
232 }
233 EXPORT_SYMBOL(ip6_output);
234 
235 bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np)
236 {
237 	if (!np->autoflowlabel_set)
238 		return ip6_default_np_autolabel(net);
239 	else
240 		return np->autoflowlabel;
241 }
242 
243 /*
244  * xmit an sk_buff (used by TCP, SCTP and DCCP)
245  * Note : socket lock is not held for SYNACK packets, but might be modified
246  * by calls to skb_set_owner_w() and ipv6_local_error(),
247  * which are using proper atomic operations or spinlocks.
248  */
249 int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
250 	     __u32 mark, struct ipv6_txoptions *opt, int tclass, u32 priority)
251 {
252 	struct net *net = sock_net(sk);
253 	const struct ipv6_pinfo *np = inet6_sk(sk);
254 	struct in6_addr *first_hop = &fl6->daddr;
255 	struct dst_entry *dst = skb_dst(skb);
256 	struct net_device *dev = dst->dev;
257 	struct inet6_dev *idev = ip6_dst_idev(dst);
258 	struct hop_jumbo_hdr *hop_jumbo;
259 	int hoplen = sizeof(*hop_jumbo);
260 	unsigned int head_room;
261 	struct ipv6hdr *hdr;
262 	u8  proto = fl6->flowi6_proto;
263 	int seg_len = skb->len;
264 	int hlimit = -1;
265 	u32 mtu;
266 
267 	head_room = sizeof(struct ipv6hdr) + hoplen + LL_RESERVED_SPACE(dev);
268 	if (opt)
269 		head_room += opt->opt_nflen + opt->opt_flen;
270 
271 	if (unlikely(head_room > skb_headroom(skb))) {
272 		skb = skb_expand_head(skb, head_room);
273 		if (!skb) {
274 			IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
275 			return -ENOBUFS;
276 		}
277 	}
278 
279 	if (opt) {
280 		seg_len += opt->opt_nflen + opt->opt_flen;
281 
282 		if (opt->opt_flen)
283 			ipv6_push_frag_opts(skb, opt, &proto);
284 
285 		if (opt->opt_nflen)
286 			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
287 					     &fl6->saddr);
288 	}
289 
290 	if (unlikely(seg_len > IPV6_MAXPLEN)) {
291 		hop_jumbo = skb_push(skb, hoplen);
292 
293 		hop_jumbo->nexthdr = proto;
294 		hop_jumbo->hdrlen = 0;
295 		hop_jumbo->tlv_type = IPV6_TLV_JUMBO;
296 		hop_jumbo->tlv_len = 4;
297 		hop_jumbo->jumbo_payload_len = htonl(seg_len + hoplen);
298 
299 		proto = IPPROTO_HOPOPTS;
300 		seg_len = 0;
301 		IP6CB(skb)->flags |= IP6SKB_FAKEJUMBO;
302 	}
303 
304 	skb_push(skb, sizeof(struct ipv6hdr));
305 	skb_reset_network_header(skb);
306 	hdr = ipv6_hdr(skb);
307 
308 	/*
309 	 *	Fill in the IPv6 header
310 	 */
311 	if (np)
312 		hlimit = np->hop_limit;
313 	if (hlimit < 0)
314 		hlimit = ip6_dst_hoplimit(dst);
315 
316 	ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
317 				ip6_autoflowlabel(net, np), fl6));
318 
319 	hdr->payload_len = htons(seg_len);
320 	hdr->nexthdr = proto;
321 	hdr->hop_limit = hlimit;
322 
323 	hdr->saddr = fl6->saddr;
324 	hdr->daddr = *first_hop;
325 
326 	skb->protocol = htons(ETH_P_IPV6);
327 	skb->priority = priority;
328 	skb->mark = mark;
329 
330 	mtu = dst_mtu(dst);
331 	if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
332 		IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUT, skb->len);
333 
334 		/* if egress device is enslaved to an L3 master device pass the
335 		 * skb to its handler for processing
336 		 */
337 		skb = l3mdev_ip6_out((struct sock *)sk, skb);
338 		if (unlikely(!skb))
339 			return 0;
340 
341 		/* hooks should never assume socket lock is held.
342 		 * we promote our socket to non const
343 		 */
344 		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
345 			       net, (struct sock *)sk, skb, NULL, dev,
346 			       dst_output);
347 	}
348 
349 	skb->dev = dev;
350 	/* ipv6_local_error() does not require socket lock,
351 	 * we promote our socket to non const
352 	 */
353 	ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);
354 
355 	IP6_INC_STATS(net, idev, IPSTATS_MIB_FRAGFAILS);
356 	kfree_skb(skb);
357 	return -EMSGSIZE;
358 }
359 EXPORT_SYMBOL(ip6_xmit);
360 
361 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
362 {
363 	struct ip6_ra_chain *ra;
364 	struct sock *last = NULL;
365 
366 	read_lock(&ip6_ra_lock);
367 	for (ra = ip6_ra_chain; ra; ra = ra->next) {
368 		struct sock *sk = ra->sk;
369 		if (sk && ra->sel == sel &&
370 		    (!sk->sk_bound_dev_if ||
371 		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
372 			struct ipv6_pinfo *np = inet6_sk(sk);
373 
374 			if (np && np->rtalert_isolate &&
375 			    !net_eq(sock_net(sk), dev_net(skb->dev))) {
376 				continue;
377 			}
378 			if (last) {
379 				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
380 				if (skb2)
381 					rawv6_rcv(last, skb2);
382 			}
383 			last = sk;
384 		}
385 	}
386 
387 	if (last) {
388 		rawv6_rcv(last, skb);
389 		read_unlock(&ip6_ra_lock);
390 		return 1;
391 	}
392 	read_unlock(&ip6_ra_lock);
393 	return 0;
394 }
395 
396 static int ip6_forward_proxy_check(struct sk_buff *skb)
397 {
398 	struct ipv6hdr *hdr = ipv6_hdr(skb);
399 	u8 nexthdr = hdr->nexthdr;
400 	__be16 frag_off;
401 	int offset;
402 
403 	if (ipv6_ext_hdr(nexthdr)) {
404 		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
405 		if (offset < 0)
406 			return 0;
407 	} else
408 		offset = sizeof(struct ipv6hdr);
409 
410 	if (nexthdr == IPPROTO_ICMPV6) {
411 		struct icmp6hdr *icmp6;
412 
413 		if (!pskb_may_pull(skb, (skb_network_header(skb) +
414 					 offset + 1 - skb->data)))
415 			return 0;
416 
417 		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
418 
419 		switch (icmp6->icmp6_type) {
420 		case NDISC_ROUTER_SOLICITATION:
421 		case NDISC_ROUTER_ADVERTISEMENT:
422 		case NDISC_NEIGHBOUR_SOLICITATION:
423 		case NDISC_NEIGHBOUR_ADVERTISEMENT:
424 		case NDISC_REDIRECT:
425 			/* For reaction involving unicast neighbor discovery
426 			 * message destined to the proxied address, pass it to
427 			 * input function.
428 			 */
429 			return 1;
430 		default:
431 			break;
432 		}
433 	}
434 
435 	/*
436 	 * The proxying router can't forward traffic sent to a link-local
437 	 * address, so signal the sender and discard the packet. This
438 	 * behavior is clarified by the MIPv6 specification.
439 	 */
440 	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
441 		dst_link_failure(skb);
442 		return -1;
443 	}
444 
445 	return 0;
446 }
447 
448 static inline int ip6_forward_finish(struct net *net, struct sock *sk,
449 				     struct sk_buff *skb)
450 {
451 	struct dst_entry *dst = skb_dst(skb);
452 
453 	__IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
454 
455 #ifdef CONFIG_NET_SWITCHDEV
456 	if (skb->offload_l3_fwd_mark) {
457 		consume_skb(skb);
458 		return 0;
459 	}
460 #endif
461 
462 	skb_clear_tstamp(skb);
463 	return dst_output(net, sk, skb);
464 }
465 
466 static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
467 {
468 	if (skb->len <= mtu)
469 		return false;
470 
471 	/* ipv6 conntrack defrag sets max_frag_size + ignore_df */
472 	if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
473 		return true;
474 
475 	if (skb->ignore_df)
476 		return false;
477 
478 	if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
479 		return false;
480 
481 	return true;
482 }
483 
484 int ip6_forward(struct sk_buff *skb)
485 {
486 	struct dst_entry *dst = skb_dst(skb);
487 	struct ipv6hdr *hdr = ipv6_hdr(skb);
488 	struct inet6_skb_parm *opt = IP6CB(skb);
489 	struct net *net = dev_net(dst->dev);
490 	struct inet6_dev *idev;
491 	SKB_DR(reason);
492 	u32 mtu;
493 
494 	idev = __in6_dev_get_safely(dev_get_by_index_rcu(net, IP6CB(skb)->iif));
495 	if (net->ipv6.devconf_all->forwarding == 0)
496 		goto error;
497 
498 	if (skb->pkt_type != PACKET_HOST)
499 		goto drop;
500 
501 	if (unlikely(skb->sk))
502 		goto drop;
503 
504 	if (skb_warn_if_lro(skb))
505 		goto drop;
506 
507 	if (!net->ipv6.devconf_all->disable_policy &&
508 	    (!idev || !idev->cnf.disable_policy) &&
509 	    !xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
510 		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
511 		goto drop;
512 	}
513 
514 	skb_forward_csum(skb);
515 
516 	/*
517 	 *	We DO NOT make any processing on
518 	 *	RA packets, pushing them to user level AS IS
519 	 *	without ane WARRANTY that application will be able
520 	 *	to interpret them. The reason is that we
521 	 *	cannot make anything clever here.
522 	 *
523 	 *	We are not end-node, so that if packet contains
524 	 *	AH/ESP, we cannot make anything.
525 	 *	Defragmentation also would be mistake, RA packets
526 	 *	cannot be fragmented, because there is no warranty
527 	 *	that different fragments will go along one path. --ANK
528 	 */
529 	if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
530 		if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
531 			return 0;
532 	}
533 
534 	/*
535 	 *	check and decrement ttl
536 	 */
537 	if (hdr->hop_limit <= 1) {
538 		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
539 		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
540 
541 		kfree_skb_reason(skb, SKB_DROP_REASON_IP_INHDR);
542 		return -ETIMEDOUT;
543 	}
544 
545 	/* XXX: idev->cnf.proxy_ndp? */
546 	if (net->ipv6.devconf_all->proxy_ndp &&
547 	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
548 		int proxied = ip6_forward_proxy_check(skb);
549 		if (proxied > 0) {
550 			/* It's tempting to decrease the hop limit
551 			 * here by 1, as we do at the end of the
552 			 * function too.
553 			 *
554 			 * But that would be incorrect, as proxying is
555 			 * not forwarding.  The ip6_input function
556 			 * will handle this packet locally, and it
557 			 * depends on the hop limit being unchanged.
558 			 *
559 			 * One example is the NDP hop limit, that
560 			 * always has to stay 255, but other would be
561 			 * similar checks around RA packets, where the
562 			 * user can even change the desired limit.
563 			 */
564 			return ip6_input(skb);
565 		} else if (proxied < 0) {
566 			__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
567 			goto drop;
568 		}
569 	}
570 
571 	if (!xfrm6_route_forward(skb)) {
572 		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
573 		SKB_DR_SET(reason, XFRM_POLICY);
574 		goto drop;
575 	}
576 	dst = skb_dst(skb);
577 
578 	/* IPv6 specs say nothing about it, but it is clear that we cannot
579 	   send redirects to source routed frames.
580 	   We don't send redirects to frames decapsulated from IPsec.
581 	 */
582 	if (IP6CB(skb)->iif == dst->dev->ifindex &&
583 	    opt->srcrt == 0 && !skb_sec_path(skb)) {
584 		struct in6_addr *target = NULL;
585 		struct inet_peer *peer;
586 		struct rt6_info *rt;
587 
588 		/*
589 		 *	incoming and outgoing devices are the same
590 		 *	send a redirect.
591 		 */
592 
593 		rt = (struct rt6_info *) dst;
594 		if (rt->rt6i_flags & RTF_GATEWAY)
595 			target = &rt->rt6i_gateway;
596 		else
597 			target = &hdr->daddr;
598 
599 		peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);
600 
601 		/* Limit redirects both by destination (here)
602 		   and by source (inside ndisc_send_redirect)
603 		 */
604 		if (inet_peer_xrlim_allow(peer, 1*HZ))
605 			ndisc_send_redirect(skb, target);
606 		if (peer)
607 			inet_putpeer(peer);
608 	} else {
609 		int addrtype = ipv6_addr_type(&hdr->saddr);
610 
611 		/* This check is security critical. */
612 		if (addrtype == IPV6_ADDR_ANY ||
613 		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
614 			goto error;
615 		if (addrtype & IPV6_ADDR_LINKLOCAL) {
616 			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
617 				    ICMPV6_NOT_NEIGHBOUR, 0);
618 			goto error;
619 		}
620 	}
621 
622 	mtu = ip6_dst_mtu_maybe_forward(dst, true);
623 	if (mtu < IPV6_MIN_MTU)
624 		mtu = IPV6_MIN_MTU;
625 
626 	if (ip6_pkt_too_big(skb, mtu)) {
627 		/* Again, force OUTPUT device used as source address */
628 		skb->dev = dst->dev;
629 		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
630 		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INTOOBIGERRORS);
631 		__IP6_INC_STATS(net, ip6_dst_idev(dst),
632 				IPSTATS_MIB_FRAGFAILS);
633 		kfree_skb_reason(skb, SKB_DROP_REASON_PKT_TOO_BIG);
634 		return -EMSGSIZE;
635 	}
636 
637 	if (skb_cow(skb, dst->dev->hard_header_len)) {
638 		__IP6_INC_STATS(net, ip6_dst_idev(dst),
639 				IPSTATS_MIB_OUTDISCARDS);
640 		goto drop;
641 	}
642 
643 	hdr = ipv6_hdr(skb);
644 
645 	/* Mangling hops number delayed to point after skb COW */
646 
647 	hdr->hop_limit--;
648 
649 	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
650 		       net, NULL, skb, skb->dev, dst->dev,
651 		       ip6_forward_finish);
652 
653 error:
654 	__IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
655 	SKB_DR_SET(reason, IP_INADDRERRORS);
656 drop:
657 	kfree_skb_reason(skb, reason);
658 	return -EINVAL;
659 }
660 
661 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
662 {
663 	to->pkt_type = from->pkt_type;
664 	to->priority = from->priority;
665 	to->protocol = from->protocol;
666 	skb_dst_drop(to);
667 	skb_dst_set(to, dst_clone(skb_dst(from)));
668 	to->dev = from->dev;
669 	to->mark = from->mark;
670 
671 	skb_copy_hash(to, from);
672 
673 #ifdef CONFIG_NET_SCHED
674 	to->tc_index = from->tc_index;
675 #endif
676 	nf_copy(to, from);
677 	skb_ext_copy(to, from);
678 	skb_copy_secmark(to, from);
679 }
680 
681 int ip6_fraglist_init(struct sk_buff *skb, unsigned int hlen, u8 *prevhdr,
682 		      u8 nexthdr, __be32 frag_id,
683 		      struct ip6_fraglist_iter *iter)
684 {
685 	unsigned int first_len;
686 	struct frag_hdr *fh;
687 
688 	/* BUILD HEADER */
689 	*prevhdr = NEXTHDR_FRAGMENT;
690 	iter->tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
691 	if (!iter->tmp_hdr)
692 		return -ENOMEM;
693 
694 	iter->frag = skb_shinfo(skb)->frag_list;
695 	skb_frag_list_init(skb);
696 
697 	iter->offset = 0;
698 	iter->hlen = hlen;
699 	iter->frag_id = frag_id;
700 	iter->nexthdr = nexthdr;
701 
702 	__skb_pull(skb, hlen);
703 	fh = __skb_push(skb, sizeof(struct frag_hdr));
704 	__skb_push(skb, hlen);
705 	skb_reset_network_header(skb);
706 	memcpy(skb_network_header(skb), iter->tmp_hdr, hlen);
707 
708 	fh->nexthdr = nexthdr;
709 	fh->reserved = 0;
710 	fh->frag_off = htons(IP6_MF);
711 	fh->identification = frag_id;
712 
713 	first_len = skb_pagelen(skb);
714 	skb->data_len = first_len - skb_headlen(skb);
715 	skb->len = first_len;
716 	ipv6_hdr(skb)->payload_len = htons(first_len - sizeof(struct ipv6hdr));
717 
718 	return 0;
719 }
720 EXPORT_SYMBOL(ip6_fraglist_init);
721 
722 void ip6_fraglist_prepare(struct sk_buff *skb,
723 			  struct ip6_fraglist_iter *iter)
724 {
725 	struct sk_buff *frag = iter->frag;
726 	unsigned int hlen = iter->hlen;
727 	struct frag_hdr *fh;
728 
729 	frag->ip_summed = CHECKSUM_NONE;
730 	skb_reset_transport_header(frag);
731 	fh = __skb_push(frag, sizeof(struct frag_hdr));
732 	__skb_push(frag, hlen);
733 	skb_reset_network_header(frag);
734 	memcpy(skb_network_header(frag), iter->tmp_hdr, hlen);
735 	iter->offset += skb->len - hlen - sizeof(struct frag_hdr);
736 	fh->nexthdr = iter->nexthdr;
737 	fh->reserved = 0;
738 	fh->frag_off = htons(iter->offset);
739 	if (frag->next)
740 		fh->frag_off |= htons(IP6_MF);
741 	fh->identification = iter->frag_id;
742 	ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
743 	ip6_copy_metadata(frag, skb);
744 }
745 EXPORT_SYMBOL(ip6_fraglist_prepare);
746 
747 void ip6_frag_init(struct sk_buff *skb, unsigned int hlen, unsigned int mtu,
748 		   unsigned short needed_tailroom, int hdr_room, u8 *prevhdr,
749 		   u8 nexthdr, __be32 frag_id, struct ip6_frag_state *state)
750 {
751 	state->prevhdr = prevhdr;
752 	state->nexthdr = nexthdr;
753 	state->frag_id = frag_id;
754 
755 	state->hlen = hlen;
756 	state->mtu = mtu;
757 
758 	state->left = skb->len - hlen;	/* Space per frame */
759 	state->ptr = hlen;		/* Where to start from */
760 
761 	state->hroom = hdr_room;
762 	state->troom = needed_tailroom;
763 
764 	state->offset = 0;
765 }
766 EXPORT_SYMBOL(ip6_frag_init);
767 
768 struct sk_buff *ip6_frag_next(struct sk_buff *skb, struct ip6_frag_state *state)
769 {
770 	u8 *prevhdr = state->prevhdr, *fragnexthdr_offset;
771 	struct sk_buff *frag;
772 	struct frag_hdr *fh;
773 	unsigned int len;
774 
775 	len = state->left;
776 	/* IF: it doesn't fit, use 'mtu' - the data space left */
777 	if (len > state->mtu)
778 		len = state->mtu;
779 	/* IF: we are not sending up to and including the packet end
780 	   then align the next start on an eight byte boundary */
781 	if (len < state->left)
782 		len &= ~7;
783 
784 	/* Allocate buffer */
785 	frag = alloc_skb(len + state->hlen + sizeof(struct frag_hdr) +
786 			 state->hroom + state->troom, GFP_ATOMIC);
787 	if (!frag)
788 		return ERR_PTR(-ENOMEM);
789 
790 	/*
791 	 *	Set up data on packet
792 	 */
793 
794 	ip6_copy_metadata(frag, skb);
795 	skb_reserve(frag, state->hroom);
796 	skb_put(frag, len + state->hlen + sizeof(struct frag_hdr));
797 	skb_reset_network_header(frag);
798 	fh = (struct frag_hdr *)(skb_network_header(frag) + state->hlen);
799 	frag->transport_header = (frag->network_header + state->hlen +
800 				  sizeof(struct frag_hdr));
801 
802 	/*
803 	 *	Charge the memory for the fragment to any owner
804 	 *	it might possess
805 	 */
806 	if (skb->sk)
807 		skb_set_owner_w(frag, skb->sk);
808 
809 	/*
810 	 *	Copy the packet header into the new buffer.
811 	 */
812 	skb_copy_from_linear_data(skb, skb_network_header(frag), state->hlen);
813 
814 	fragnexthdr_offset = skb_network_header(frag);
815 	fragnexthdr_offset += prevhdr - skb_network_header(skb);
816 	*fragnexthdr_offset = NEXTHDR_FRAGMENT;
817 
818 	/*
819 	 *	Build fragment header.
820 	 */
821 	fh->nexthdr = state->nexthdr;
822 	fh->reserved = 0;
823 	fh->identification = state->frag_id;
824 
825 	/*
826 	 *	Copy a block of the IP datagram.
827 	 */
828 	BUG_ON(skb_copy_bits(skb, state->ptr, skb_transport_header(frag),
829 			     len));
830 	state->left -= len;
831 
832 	fh->frag_off = htons(state->offset);
833 	if (state->left > 0)
834 		fh->frag_off |= htons(IP6_MF);
835 	ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
836 
837 	state->ptr += len;
838 	state->offset += len;
839 
840 	return frag;
841 }
842 EXPORT_SYMBOL(ip6_frag_next);
843 
/*
 * Fragment an IPv6 packet and pass every fragment to @output.
 *
 * The length of the header part that is repeated in each fragment comes
 * from ip6_find_1stfragopt().  The fragment size is derived from
 * ip6_skb_dst_mtu(), further limited by IP6CB(skb)->frag_max_size and by
 * the socket's frag_size when those are set.
 *
 * Two strategies are used:
 *  - fast path: when @skb already carries a frag list whose geometry is
 *    suitable, the listed skbs are converted into fragments in place;
 *  - slow path: otherwise new skbs are allocated and the data is copied
 *    fragment by fragment via ip6_frag_next().
 *
 * Returns 0 on success.  On failure a negative errno (or the non-zero
 * value returned by @output) is returned and FRAGFAILS is incremented;
 * -EMSGSIZE is accompanied by an ICMPv6 Packet Too Big message.
 */
844 int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
845 		 int (*output)(struct net *, struct sock *, struct sk_buff *))
846 {
847 	struct sk_buff *frag;
848 	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
849 	struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
850 				inet6_sk(skb->sk) : NULL;
851 	bool mono_delivery_time = skb->mono_delivery_time;
852 	struct ip6_frag_state state;
853 	unsigned int mtu, hlen, nexthdr_offset;
854 	ktime_t tstamp = skb->tstamp;
855 	int hroom, err = 0;
856 	__be32 frag_id;
857 	u8 *prevhdr, nexthdr = 0;
858 
859 	err = ip6_find_1stfragopt(skb, &prevhdr);
860 	if (err < 0)
861 		goto fail;
862 	hlen = err;
863 	nexthdr = *prevhdr;
	/* Remember prevhdr as an offset so it can be recomputed below. */
864 	nexthdr_offset = prevhdr - skb_network_header(skb);
865 
866 	mtu = ip6_skb_dst_mtu(skb);
867 
868 	/* We must not fragment if the socket is set to force MTU discovery
869 	 * or if the skb is not generated by a local socket.
870 	 */
871 	if (unlikely(!skb->ignore_df && skb->len > mtu))
872 		goto fail_toobig;
873 
874 	if (IP6CB(skb)->frag_max_size) {
875 		if (IP6CB(skb)->frag_max_size > mtu)
876 			goto fail_toobig;
877 
878 		/* don't send fragments larger than what we received */
879 		mtu = IP6CB(skb)->frag_max_size;
880 		if (mtu < IPV6_MIN_MTU)
881 			mtu = IPV6_MIN_MTU;
882 	}
883 
	/* A non-zero socket frag_size below the MTU lowers the limit. */
884 	if (np && np->frag_size < mtu) {
885 		if (np->frag_size)
886 			mtu = np->frag_size;
887 	}
	/* Each fragment needs room for the headers plus 8 payload bytes. */
888 	if (mtu < hlen + sizeof(struct frag_hdr) + 8)
889 		goto fail_toobig;
	/* From here on, mtu is the payload space available per fragment. */
890 	mtu -= hlen + sizeof(struct frag_hdr);
891 
892 	frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
893 				    &ipv6_hdr(skb)->saddr);
894 
895 	if (skb->ip_summed == CHECKSUM_PARTIAL &&
896 	    (err = skb_checksum_help(skb)))
897 		goto fail;
898 
	/* Recompute prevhdr from the current network header position. */
899 	prevhdr = skb_network_header(skb) + nexthdr_offset;
900 	hroom = LL_RESERVED_SPACE(rt->dst.dev);
901 	if (skb_has_frag_list(skb)) {
902 		unsigned int first_len = skb_pagelen(skb);
903 		struct ip6_fraglist_iter iter;
904 		struct sk_buff *frag2;
905 
		/* The head must fit, end on an 8-byte boundary, be unshared
		 * and have headroom for the fragment header.
		 */
906 		if (first_len - hlen > mtu ||
907 		    ((first_len - hlen) & 7) ||
908 		    skb_cloned(skb) ||
909 		    skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
910 			goto slow_path;
911 
912 		skb_walk_frags(skb, frag) {
913 			/* Correct geometry. */
914 			if (frag->len > mtu ||
915 			    ((frag->len & 7) && frag->next) ||
916 			    skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
917 				goto slow_path_clean;
918 
919 			/* Partially cloned skb? */
920 			if (skb_shared(frag))
921 				goto slow_path_clean;
922 
923 			BUG_ON(frag->sk);
			/* Give the fragment its own socket ownership and move
			 * its truesize out of the head's accounting.
			 */
924 			if (skb->sk) {
925 				frag->sk = skb->sk;
926 				frag->destructor = sock_wfree;
927 			}
928 			skb->truesize -= frag->truesize;
929 		}
930 
931 		err = ip6_fraglist_init(skb, hlen, prevhdr, nexthdr, frag_id,
932 					&iter);
933 		if (err < 0)
934 			goto fail;
935 
936 		/* We prevent @rt from being freed. */
937 		rcu_read_lock();
938 
939 		for (;;) {
940 			/* Prepare header of the next frame,
941 			 * before previous one went down. */
942 			if (iter.frag)
943 				ip6_fraglist_prepare(skb, &iter);
944 
945 			skb_set_delivery_time(skb, tstamp, mono_delivery_time);
946 			err = output(net, sk, skb);
947 			if (!err)
948 				IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
949 					      IPSTATS_MIB_FRAGCREATES);
950 
951 			if (err || !iter.frag)
952 				break;
953 
954 			skb = ip6_fraglist_next(&iter);
955 		}
956 
957 		kfree(iter.tmp_hdr);
958 
959 		if (err == 0) {
960 			IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
961 				      IPSTATS_MIB_FRAGOKS);
962 			rcu_read_unlock();
963 			return 0;
964 		}
965 
		/* Output failed: free the fragments that were not sent. */
966 		kfree_skb_list(iter.frag);
967 
968 		IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
969 			      IPSTATS_MIB_FRAGFAILS);
970 		rcu_read_unlock();
971 		return err;
972 
973 slow_path_clean:
		/* Undo the ownership and truesize changes made to the
		 * fragments visited before the unsuitable one.
		 */
974 		skb_walk_frags(skb, frag2) {
975 			if (frag2 == frag)
976 				break;
977 			frag2->sk = NULL;
978 			frag2->destructor = NULL;
979 			skb->truesize += frag2->truesize;
980 		}
981 	}
982 
983 slow_path:
984 	/*
985 	 *	Fragment the datagram.
986 	 */
987 
988 	ip6_frag_init(skb, hlen, mtu, rt->dst.dev->needed_tailroom,
989 		      LL_RESERVED_SPACE(rt->dst.dev), prevhdr, nexthdr, frag_id,
990 		      &state);
991 
992 	/*
993 	 *	Keep copying data until we run out.
994 	 */
995 
996 	while (state.left > 0) {
997 		frag = ip6_frag_next(skb, &state);
998 		if (IS_ERR(frag)) {
999 			err = PTR_ERR(frag);
1000 			goto fail;
1001 		}
1002 
1003 		/*
1004 		 *	Put this fragment into the sending queue.
1005 		 */
1006 		skb_set_delivery_time(frag, tstamp, mono_delivery_time);
1007 		err = output(net, sk, frag);
1008 		if (err)
1009 			goto fail;
1010 
1011 		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
1012 			      IPSTATS_MIB_FRAGCREATES);
1013 	}
1014 	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
1015 		      IPSTATS_MIB_FRAGOKS);
	/* All data was copied into fragments; release the original. */
1016 	consume_skb(skb);
1017 	return err;
1018 
1019 fail_toobig:
1020 	if (skb->sk && dst_allfrag(skb_dst(skb)))
1021 		sk_gso_disable(skb->sk);
1022 
1023 	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
1024 	err = -EMSGSIZE;
1025 
1026 fail:
1027 	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
1028 		      IPSTATS_MIB_FRAGFAILS);
1029 	kfree_skb(skb);
1030 	return err;
1031 }
1032 
1033 static inline int ip6_rt_check(const struct rt6key *rt_key,
1034 			       const struct in6_addr *fl_addr,
1035 			       const struct in6_addr *addr_cache)
1036 {
1037 	return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
1038 		(!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
1039 }
1040 
1041 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
1042 					  struct dst_entry *dst,
1043 					  const struct flowi6 *fl6)
1044 {
1045 	struct ipv6_pinfo *np = inet6_sk(sk);
1046 	struct rt6_info *rt;
1047 
1048 	if (!dst)
1049 		goto out;
1050 
1051 	if (dst->ops->family != AF_INET6) {
1052 		dst_release(dst);
1053 		return NULL;
1054 	}
1055 
1056 	rt = (struct rt6_info *)dst;
1057 	/* Yes, checking route validity in not connected
1058 	 * case is not very simple. Take into account,
1059 	 * that we do not support routing by source, TOS,
1060 	 * and MSG_DONTROUTE		--ANK (980726)
1061 	 *
1062 	 * 1. ip6_rt_check(): If route was host route,
1063 	 *    check that cached destination is current.
1064 	 *    If it is network route, we still may
1065 	 *    check its validity using saved pointer
1066 	 *    to the last used address: daddr_cache.
1067 	 *    We do not want to save whole address now,
1068 	 *    (because main consumer of this service
1069 	 *    is tcp, which has not this problem),
1070 	 *    so that the last trick works only on connected
1071 	 *    sockets.
1072 	 * 2. oif also should be the same.
1073 	 */
1074 	if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
1075 #ifdef CONFIG_IPV6_SUBTREES
1076 	    ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
1077 #endif
1078 	   (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) {
1079 		dst_release(dst);
1080 		dst = NULL;
1081 	}
1082 
1083 out:
1084 	return dst;
1085 }
1086 
/*
 * ip6_dst_lookup_tail - common worker behind the ip6_dst_lookup*() helpers.
 *
 * @net: namespace used for the route lookups and the MIB counter
 * @sk:  socket, may be NULL; when set its srcprefs feed source selection
 * @dst: in/out route pointer. When fl6->saddr is unspecified *dst is
 *       overwritten by a fresh lookup (the callers visible in this file
 *       pass *dst == NULL); otherwise a non-NULL *dst is used as is and
 *       a NULL *dst is filled by a lookup.
 * @fl6: flow; fl6->saddr is filled in here when it is unspecified
 *
 * Returns 0 with *dst set on success. On failure *dst is released and set
 * to NULL, a negative errno is returned, and IPSTATS_MIB_OUTNOROUTES is
 * incremented when that errno is -ENETUNREACH.
 */
1087 static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
1088 			       struct dst_entry **dst, struct flowi6 *fl6)
1089 {
1090 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
1091 	struct neighbour *n;
1092 	struct rt6_info *rt;
1093 #endif
1094 	int err;
1095 	int flags = 0;
1096 
1097 	/* The correct way to handle this would be to do
1098 	 * ip6_route_get_saddr, and then ip6_route_output; however,
1099 	 * the route-specific preferred source forces the
1100 	 * ip6_route_output call _before_ ip6_route_get_saddr.
1101 	 *
1102 	 * In source specific routing (no src=any default route),
1103 	 * ip6_route_output will fail given src=any saddr, though, so
1104 	 * that's why we try it again later.
1105 	 */
1106 	if (ipv6_addr_any(&fl6->saddr)) {
1107 		struct fib6_info *from;
1108 		struct rt6_info *rt;
1109 
1110 		*dst = ip6_route_output(net, sk, fl6);
		/* an error dst gives no route hint for source selection */
1111 		rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;
1112 
1113 		rcu_read_lock();
1114 		from = rt ? rcu_dereference(rt->from) : NULL;
1115 		err = ip6_route_get_saddr(net, from, &fl6->daddr,
1116 					  sk ? inet6_sk(sk)->srcprefs : 0,
1117 					  &fl6->saddr);
1118 		rcu_read_unlock();
1119 
1120 		if (err)
1121 			goto out_err_release;
1122 
1123 		/* If we had an erroneous initial result, pretend it
1124 		 * never existed and let the SA-enabled version take
1125 		 * over.
1126 		 */
1127 		if ((*dst)->error) {
1128 			dst_release(*dst);
1129 			*dst = NULL;
1130 		}
1131 
1132 		if (fl6->flowi6_oif)
1133 			flags |= RT6_LOOKUP_F_IFACE;
1134 	}
1135 
	/* lookup (or retry) now that fl6->saddr is known */
1136 	if (!*dst)
1137 		*dst = ip6_route_output_flags(net, sk, fl6, flags);
1138 
1139 	err = (*dst)->error;
1140 	if (err)
1141 		goto out_err_release;
1142 
1143 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
1144 	/*
1145 	 * Here if the dst entry we've looked up
1146 	 * has a neighbour entry that is in the INCOMPLETE
1147 	 * state and the src address from the flow is
1148 	 * marked as OPTIMISTIC, we release the found
1149 	 * dst entry and replace it instead with the
1150 	 * dst entry of the nexthop router
1151 	 */
1152 	rt = (struct rt6_info *) *dst;
1153 	rcu_read_lock();
1154 	n = __ipv6_neigh_lookup_noref(rt->dst.dev,
1155 				      rt6_nexthop(rt, &fl6->daddr));
	/* err is only a local marker here: neighbour exists but is not NUD_VALID */
1156 	err = n && !(READ_ONCE(n->nud_state) & NUD_VALID) ? -EINVAL : 0;
1157 	rcu_read_unlock();
1158 
1159 	if (err) {
1160 		struct inet6_ifaddr *ifp;
1161 		struct flowi6 fl_gw6;
1162 		int redirect;
1163 
1164 		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
1165 				      (*dst)->dev, 1);
1166 
1167 		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
1168 		if (ifp)
1169 			in6_ifa_put(ifp);
1170 
1171 		if (redirect) {
1172 			/*
1173 			 * We need to get the dst entry for the
1174 			 * default router instead
1175 			 */
1176 			dst_release(*dst);
1177 			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
1178 			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
1179 			*dst = ip6_route_output(net, sk, &fl_gw6);
1180 			err = (*dst)->error;
1181 			if (err)
1182 				goto out_err_release;
1183 		}
1184 	}
1185 #endif
	/* a v4-mapped source is only accepted together with a v4-mapped
	 * or unspecified destination
	 */
1186 	if (ipv6_addr_v4mapped(&fl6->saddr) &&
1187 	    !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
1188 		err = -EAFNOSUPPORT;
1189 		goto out_err_release;
1190 	}
1191 
1192 	return 0;
1193 
1194 out_err_release:
1195 	dst_release(*dst);
1196 	*dst = NULL;
1197 
1198 	if (err == -ENETUNREACH)
1199 		IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
1200 	return err;
1201 }
1202 
1203 /**
1204  *	ip6_dst_lookup - perform route lookup on flow
1205  *	@net: Network namespace to perform lookup in
1206  *	@sk: socket which provides route info
1207  *	@dst: pointer to dst_entry * for result
1208  *	@fl6: flow to lookup
1209  *
1210  *	This function performs a route lookup on the given flow.
1211  *
1212  *	It returns zero on success, or a standard errno code on error.
1213  */
1214 int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
1215 		   struct flowi6 *fl6)
1216 {
1217 	*dst = NULL;
1218 	return ip6_dst_lookup_tail(net, sk, dst, fl6);
1219 }
1220 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1221 
1222 /**
1223  *	ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1224  *	@net: Network namespace to perform lookup in
1225  *	@sk: socket which provides route info
1226  *	@fl6: flow to lookup
1227  *	@final_dst: final destination address for ipsec lookup
1228  *
1229  *	This function performs a route lookup on the given flow.
1230  *
1231  *	It returns a valid dst pointer on success, or a pointer encoded
1232  *	error code.
1233  */
1234 struct dst_entry *ip6_dst_lookup_flow(struct net *net, const struct sock *sk, struct flowi6 *fl6,
1235 				      const struct in6_addr *final_dst)
1236 {
1237 	struct dst_entry *dst = NULL;
1238 	int err;
1239 
1240 	err = ip6_dst_lookup_tail(net, sk, &dst, fl6);
1241 	if (err)
1242 		return ERR_PTR(err);
1243 	if (final_dst)
1244 		fl6->daddr = *final_dst;
1245 
1246 	return xfrm_lookup_route(net, dst, flowi6_to_flowi(fl6), sk, 0);
1247 }
1248 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
1249 
1250 /**
1251  *	ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1252  *	@sk: socket which provides the dst cache and route info
1253  *	@fl6: flow to lookup
1254  *	@final_dst: final destination address for ipsec lookup
1255  *	@connected: whether @sk is connected or not
1256  *
1257  *	This function performs a route lookup on the given flow with the
1258  *	possibility of using the cached route in the socket if it is valid.
1259  *	It will take the socket dst lock when operating on the dst cache.
1260  *	As a result, this function can only be used in process context.
1261  *
1262  *	In addition, for a connected socket, cache the dst in the socket
1263  *	if the current cache is not valid.
1264  *
1265  *	It returns a valid dst pointer on success, or a pointer encoded
1266  *	error code.
1267  */
1268 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1269 					 const struct in6_addr *final_dst,
1270 					 bool connected)
1271 {
1272 	struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1273 
1274 	dst = ip6_sk_dst_check(sk, dst, fl6);
1275 	if (dst)
1276 		return dst;
1277 
1278 	dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_dst);
1279 	if (connected && !IS_ERR(dst))
1280 		ip6_sk_dst_store_flow(sk, dst_clone(dst), fl6);
1281 
1282 	return dst;
1283 }
1284 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1285 
1286 /**
1287  *      ip6_dst_lookup_tunnel - perform route lookup on tunnel
1288  *      @skb: Packet for which lookup is done
1289  *      @dev: Tunnel device
1290  *      @net: Network namespace of tunnel device
1291  *      @sock: Socket which provides route info
1292  *      @saddr: Memory to store the src ip address
1293  *      @info: Tunnel information
1294  *      @protocol: IP protocol
1295  *      @use_cache: Flag to enable cache usage
1296  *      This function performs a route lookup on a tunnel
1297  *
1298  *      It returns a valid dst pointer and stores src address to be used in
1299  *      tunnel in param saddr on success, else a pointer encoded error code.
1300  */
1301 
1302 struct dst_entry *ip6_dst_lookup_tunnel(struct sk_buff *skb,
1303 					struct net_device *dev,
1304 					struct net *net,
1305 					struct socket *sock,
1306 					struct in6_addr *saddr,
1307 					const struct ip_tunnel_info *info,
1308 					u8 protocol,
1309 					bool use_cache)
1310 {
1311 	struct dst_entry *dst = NULL;
1312 #ifdef CONFIG_DST_CACHE
1313 	struct dst_cache *dst_cache;
1314 #endif
1315 	struct flowi6 fl6;
1316 	__u8 prio;
1317 
1318 #ifdef CONFIG_DST_CACHE
1319 	dst_cache = (struct dst_cache *)&info->dst_cache;
1320 	if (use_cache) {
1321 		dst = dst_cache_get_ip6(dst_cache, saddr);
1322 		if (dst)
1323 			return dst;
1324 	}
1325 #endif
1326 	memset(&fl6, 0, sizeof(fl6));
1327 	fl6.flowi6_mark = skb->mark;
1328 	fl6.flowi6_proto = protocol;
1329 	fl6.daddr = info->key.u.ipv6.dst;
1330 	fl6.saddr = info->key.u.ipv6.src;
1331 	prio = info->key.tos;
1332 	fl6.flowlabel = ip6_make_flowinfo(prio, info->key.label);
1333 
1334 	dst = ipv6_stub->ipv6_dst_lookup_flow(net, sock->sk, &fl6,
1335 					      NULL);
1336 	if (IS_ERR(dst)) {
1337 		netdev_dbg(dev, "no route to %pI6\n", &fl6.daddr);
1338 		return ERR_PTR(-ENETUNREACH);
1339 	}
1340 	if (dst->dev == dev) { /* is this necessary? */
1341 		netdev_dbg(dev, "circular route to %pI6\n", &fl6.daddr);
1342 		dst_release(dst);
1343 		return ERR_PTR(-ELOOP);
1344 	}
1345 #ifdef CONFIG_DST_CACHE
1346 	if (use_cache)
1347 		dst_cache_set_ip6(dst_cache, dst, &fl6.saddr);
1348 #endif
1349 	*saddr = fl6.saddr;
1350 	return dst;
1351 }
1352 EXPORT_SYMBOL_GPL(ip6_dst_lookup_tunnel);
1353 
1354 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1355 					       gfp_t gfp)
1356 {
1357 	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1358 }
1359 
1360 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1361 						gfp_t gfp)
1362 {
1363 	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1364 }
1365 
1366 static void ip6_append_data_mtu(unsigned int *mtu,
1367 				int *maxfraglen,
1368 				unsigned int fragheaderlen,
1369 				struct sk_buff *skb,
1370 				struct rt6_info *rt,
1371 				unsigned int orig_mtu)
1372 {
1373 	if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1374 		if (!skb) {
1375 			/* first fragment, reserve header_len */
1376 			*mtu = orig_mtu - rt->dst.header_len;
1377 
1378 		} else {
1379 			/*
1380 			 * this fragment is not first, the headers
1381 			 * space is regarded as data space.
1382 			 */
1383 			*mtu = orig_mtu;
1384 		}
1385 		*maxfraglen = ((*mtu - fragheaderlen) & ~7)
1386 			      + fragheaderlen - sizeof(struct frag_hdr);
1387 	}
1388 }
1389 
1390 static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
1391 			  struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
1392 			  struct rt6_info *rt)
1393 {
1394 	struct ipv6_pinfo *np = inet6_sk(sk);
1395 	unsigned int mtu;
1396 	struct ipv6_txoptions *nopt, *opt = ipc6->opt;
1397 
1398 	/* callers pass dst together with a reference, set it first so
1399 	 * ip6_cork_release() can put it down even in case of an error.
1400 	 */
1401 	cork->base.dst = &rt->dst;
1402 
1403 	/*
1404 	 * setup for corking
1405 	 */
1406 	if (opt) {
1407 		if (WARN_ON(v6_cork->opt))
1408 			return -EINVAL;
1409 
1410 		nopt = v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
1411 		if (unlikely(!nopt))
1412 			return -ENOBUFS;
1413 
1414 		nopt->tot_len = sizeof(*opt);
1415 		nopt->opt_flen = opt->opt_flen;
1416 		nopt->opt_nflen = opt->opt_nflen;
1417 
1418 		nopt->dst0opt = ip6_opt_dup(opt->dst0opt, sk->sk_allocation);
1419 		if (opt->dst0opt && !nopt->dst0opt)
1420 			return -ENOBUFS;
1421 
1422 		nopt->dst1opt = ip6_opt_dup(opt->dst1opt, sk->sk_allocation);
1423 		if (opt->dst1opt && !nopt->dst1opt)
1424 			return -ENOBUFS;
1425 
1426 		nopt->hopopt = ip6_opt_dup(opt->hopopt, sk->sk_allocation);
1427 		if (opt->hopopt && !nopt->hopopt)
1428 			return -ENOBUFS;
1429 
1430 		nopt->srcrt = ip6_rthdr_dup(opt->srcrt, sk->sk_allocation);
1431 		if (opt->srcrt && !nopt->srcrt)
1432 			return -ENOBUFS;
1433 
1434 		/* need source address above miyazawa*/
1435 	}
1436 	v6_cork->hop_limit = ipc6->hlimit;
1437 	v6_cork->tclass = ipc6->tclass;
1438 	if (rt->dst.flags & DST_XFRM_TUNNEL)
1439 		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1440 		      READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst);
1441 	else
1442 		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1443 			READ_ONCE(rt->dst.dev->mtu) : dst_mtu(xfrm_dst_path(&rt->dst));
1444 	if (np->frag_size < mtu) {
1445 		if (np->frag_size)
1446 			mtu = np->frag_size;
1447 	}
1448 	cork->base.fragsize = mtu;
1449 	cork->base.gso_size = ipc6->gso_size;
1450 	cork->base.tx_flags = 0;
1451 	cork->base.mark = ipc6->sockc.mark;
1452 	sock_tx_timestamp(sk, ipc6->sockc.tsflags, &cork->base.tx_flags);
1453 
1454 	if (dst_allfrag(xfrm_dst_path(&rt->dst)))
1455 		cork->base.flags |= IPCORK_ALLFRAG;
1456 	cork->base.length = 0;
1457 
1458 	cork->base.transmit_time = ipc6->sockc.transmit_time;
1459 
1460 	return 0;
1461 }
1462 
/*
 * __ip6_append_data - append payload to a queue of pending IPv6 skbs.
 *
 * @sk:          sending socket (protocol checks, send buffer accounting,
 *               timestamp key)
 * @queue:       pending queue; its tail skb is extended and/or new skbs
 *               are appended to it
 * @cork_full:   cork state: route, fragsize/gso_size, flags, running
 *               length and the flow
 * @v6_cork:     IPv6 cork state; only ->opt is read here
 * @pfrag:       page fragment used for the paged copy path
 * @getfrag:     callback copying @len bytes found at @offset of @from to
 *               @to; a negative return is treated as a copy fault
 * @from:        opaque source handed to @getfrag; it is used as a
 *               struct msghdr on the zerocopy and splice paths
 * @length:      number of bytes to append, including @transhdrlen
 * @transhdrlen: bytes left free at the start of the first new skb's
 *               payload area; reset to zero after the first allocation
 * @flags:       MSG_* flags
 * @ipc6:        per-call cookie; only ->dontfrag is read here
 *
 * Returns 0 on success or a negative errno. The size checks return
 * -EMSGSIZE before any state is changed. On the later error paths the
 * unconsumed part of @length is subtracted from cork->length again and
 * IPSTATS_MIB_OUTDISCARDS is incremented.
 */
1463 static int __ip6_append_data(struct sock *sk,
1464 			     struct sk_buff_head *queue,
1465 			     struct inet_cork_full *cork_full,
1466 			     struct inet6_cork *v6_cork,
1467 			     struct page_frag *pfrag,
1468 			     int getfrag(void *from, char *to, int offset,
1469 					 int len, int odd, struct sk_buff *skb),
1470 			     void *from, size_t length, int transhdrlen,
1471 			     unsigned int flags, struct ipcm6_cookie *ipc6)
1472 {
1473 	struct sk_buff *skb, *skb_prev = NULL;
1474 	struct inet_cork *cork = &cork_full->base;
1475 	struct flowi6 *fl6 = &cork_full->fl.u.ip6;
1476 	unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
1477 	struct ubuf_info *uarg = NULL;
1478 	int exthdrlen = 0;
1479 	int dst_exthdrlen = 0;
1480 	int hh_len;
1481 	int copy;
1482 	int err;
1483 	int offset = 0;
1484 	bool zc = false;
1485 	u32 tskey = 0;
1486 	struct rt6_info *rt = (struct rt6_info *)cork->dst;
1487 	struct ipv6_txoptions *opt = v6_cork->opt;
1488 	int csummode = CHECKSUM_NONE;
1489 	unsigned int maxnonfragsize, headersize;
1490 	unsigned int wmem_alloc_delta = 0;
1491 	bool paged, extra_uref = false;
1492 
	/* extension header room is only accounted for on an empty queue */
1493 	skb = skb_peek_tail(queue);
1494 	if (!skb) {
1495 		exthdrlen = opt ? opt->opt_flen : 0;
1496 		dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
1497 	}
1498 
	/* with a GSO size set, IP6_MAX_MTU replaces the cork fragment size
	 * and payload is kept out of the linear area (paged)
	 */
1499 	paged = !!cork->gso_size;
1500 	mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize;
1501 	orig_mtu = mtu;
1502 
1503 	if (cork->tx_flags & SKBTX_ANY_TSTAMP &&
1504 	    READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_OPT_ID)
1505 		tskey = atomic_inc_return(&sk->sk_tskey) - 1;
1506 
1507 	hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1508 
1509 	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1510 			(opt ? opt->opt_nflen : 0);
1511 
1512 	headersize = sizeof(struct ipv6hdr) +
1513 		     (opt ? opt->opt_flen + opt->opt_nflen : 0) +
1514 		     (dst_allfrag(&rt->dst) ?
1515 		      sizeof(struct frag_hdr) : 0) +
1516 		     rt->rt6i_nfheader_len;
1517 
	/* guard the unsigned maxfraglen computation below against wrapping */
1518 	if (mtu <= fragheaderlen ||
1519 	    ((mtu - fragheaderlen) & ~7) + fragheaderlen <= sizeof(struct frag_hdr))
1520 		goto emsgsize;
1521 
1522 	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
1523 		     sizeof(struct frag_hdr);
1524 
1525 	/* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit
1526 	 * the first fragment
1527 	 */
1528 	if (headersize + transhdrlen > mtu)
1529 		goto emsgsize;
1530 
1531 	if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
1532 	    (sk->sk_protocol == IPPROTO_UDP ||
1533 	     sk->sk_protocol == IPPROTO_ICMPV6 ||
1534 	     sk->sk_protocol == IPPROTO_RAW)) {
1535 		ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
1536 				sizeof(struct ipv6hdr));
1537 		goto emsgsize;
1538 	}
1539 
1540 	if (ip6_sk_ignore_df(sk))
1541 		maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
1542 	else
1543 		maxnonfragsize = mtu;
1544 
1545 	if (cork->length + length > maxnonfragsize - headersize) {
1546 emsgsize:
1547 		pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0);
1548 		ipv6_local_error(sk, EMSGSIZE, fl6, pmtu);
1549 		return -EMSGSIZE;
1550 	}
1551 
1552 	/* CHECKSUM_PARTIAL only with no extension headers and when
1553 	 * we are not going to fragment
1554 	 */
1555 	if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
1556 	    headersize == sizeof(struct ipv6hdr) &&
1557 	    length <= mtu - headersize &&
1558 	    (!(flags & MSG_MORE) || cork->gso_size) &&
1559 	    rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
1560 		csummode = CHECKSUM_PARTIAL;
1561 
1562 	if ((flags & MSG_ZEROCOPY) && length) {
1563 		struct msghdr *msg = from;
1564 
1565 		if (getfrag == ip_generic_getfrag && msg->msg_ubuf) {
1566 			if (skb_zcopy(skb) && msg->msg_ubuf != skb_zcopy(skb))
1567 				return -EINVAL;
1568 
1569 			/* Leave uarg NULL if can't zerocopy, callers should
1570 			 * be able to handle it.
1571 			 */
1572 			if ((rt->dst.dev->features & NETIF_F_SG) &&
1573 			    csummode == CHECKSUM_PARTIAL) {
1574 				paged = true;
1575 				zc = true;
1576 				uarg = msg->msg_ubuf;
1577 			}
1578 		} else if (sock_flag(sk, SOCK_ZEROCOPY)) {
1579 			uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb));
1580 			if (!uarg)
1581 				return -ENOBUFS;
1582 			extra_uref = !skb_zcopy(skb);	/* only ref on new uarg */
1583 			if (rt->dst.dev->features & NETIF_F_SG &&
1584 			    csummode == CHECKSUM_PARTIAL) {
1585 				paged = true;
1586 				zc = true;
1587 			} else {
1588 				uarg_to_msgzc(uarg)->zerocopy = 0;
1589 				skb_zcopy_set(skb, uarg, &extra_uref);
1590 			}
1591 		}
1592 	} else if ((flags & MSG_SPLICE_PAGES) && length) {
1593 		if (inet_test_bit(HDRINCL, sk))
1594 			return -EPERM;
1595 		if (rt->dst.dev->features & NETIF_F_SG &&
1596 		    getfrag == ip_generic_getfrag)
1597 			/* We need an empty buffer to attach stuff to */
1598 			paged = true;
1599 		else
1600 			flags &= ~MSG_SPLICE_PAGES;
1601 	}
1602 
1603 	/*
1604 	 * Let's try using as much space as possible.
1605 	 * Use MTU if total length of the message fits into the MTU.
1606 	 * Otherwise, we need to reserve fragment header and
1607 	 * fragment alignment (= 8-15 octets, in total).
1608 	 *
1609 	 * Note that we may need to "move" the data from the tail
1610 	 * of the buffer to the new fragment when we split
1611 	 * the message.
1612 	 *
1613 	 * FIXME: It may be fragmented into multiple chunks
1614 	 *        at once if non-fragmentable extension headers
1615 	 *        are too large.
1616 	 * --yoshfuji
1617 	 */
1618 
1619 	cork->length += length;
1620 	if (!skb)
1621 		goto alloc_new_skb;
1622 
1623 	while (length > 0) {
1624 		/* Check if the remaining data fits into current packet. */
1625 		copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1626 		if (copy < length)
1627 			copy = maxfraglen - skb->len;
1628 
1629 		if (copy <= 0) {
1630 			char *data;
1631 			unsigned int datalen;
1632 			unsigned int fraglen;
1633 			unsigned int fraggap;
1634 			unsigned int alloclen, alloc_extra;
1635 			unsigned int pagedlen;
1636 alloc_new_skb:
1637 			/* There's no room in the current skb */
			/* fraggap: bytes of the current skb past maxfraglen,
			 * moved into the new skb further down
			 */
1638 			if (skb)
1639 				fraggap = skb->len - maxfraglen;
1640 			else
1641 				fraggap = 0;
1642 			/* update mtu and maxfraglen if necessary */
1643 			if (!skb || !skb_prev)
1644 				ip6_append_data_mtu(&mtu, &maxfraglen,
1645 						    fragheaderlen, skb, rt,
1646 						    orig_mtu);
1647 
1648 			skb_prev = skb;
1649 
1650 			/*
1651 			 * If remaining data exceeds the mtu,
1652 			 * we know we need more fragment(s).
1653 			 */
1654 			datalen = length + fraggap;
1655 
1656 			if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1657 				datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
1658 			fraglen = datalen + fragheaderlen;
1659 			pagedlen = 0;
1660 
1661 			alloc_extra = hh_len;
1662 			alloc_extra += dst_exthdrlen;
1663 			alloc_extra += rt->dst.trailer_len;
1664 
1665 			/* We just reserve space for fragment header.
1666 			 * Note: this may be overallocation if the message
1667 			 * (without MSG_MORE) fits into the MTU.
1668 			 */
1669 			alloc_extra += sizeof(struct frag_hdr);
1670 
1671 			if ((flags & MSG_MORE) &&
1672 			    !(rt->dst.dev->features&NETIF_F_SG))
1673 				alloclen = mtu;
1674 			else if (!paged &&
1675 				 (fraglen + alloc_extra < SKB_MAX_ALLOC ||
1676 				  !(rt->dst.dev->features & NETIF_F_SG)))
1677 				alloclen = fraglen;
1678 			else {
1679 				alloclen = fragheaderlen + transhdrlen;
1680 				pagedlen = datalen - transhdrlen;
1681 			}
1682 			alloclen += alloc_extra;
1683 
1684 			if (datalen != length + fraggap) {
1685 				/*
1686 				 * this is not the last fragment, the trailer
1687 				 * space is regarded as data space.
1688 				 */
1689 				datalen += rt->dst.trailer_len;
1690 			}
1691 
1692 			fraglen = datalen + fragheaderlen;
1693 
1694 			copy = datalen - transhdrlen - fraggap - pagedlen;
1695 			/* [!] NOTE: copy may be negative if pagedlen>0
1696 			 * because then the equation may reduce to -fraggap.
1697 			 */
1698 			if (copy < 0 && !(flags & MSG_SPLICE_PAGES)) {
1699 				err = -EINVAL;
1700 				goto error;
1701 			}
			/* only the skb carrying the transport header goes
			 * through sock_alloc_send_skb(); later ones are
			 * bounded by twice sk_sndbuf instead
			 */
1702 			if (transhdrlen) {
1703 				skb = sock_alloc_send_skb(sk, alloclen,
1704 						(flags & MSG_DONTWAIT), &err);
1705 			} else {
1706 				skb = NULL;
1707 				if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <=
1708 				    2 * sk->sk_sndbuf)
1709 					skb = alloc_skb(alloclen,
1710 							sk->sk_allocation);
1711 				if (unlikely(!skb))
1712 					err = -ENOBUFS;
1713 			}
1714 			if (!skb)
1715 				goto error;
1716 			/*
1717 			 *	Fill in the control structures
1718 			 */
1719 			skb->protocol = htons(ETH_P_IPV6);
1720 			skb->ip_summed = csummode;
1721 			skb->csum = 0;
1722 			/* reserve for fragmentation and ipsec header */
1723 			skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1724 				    dst_exthdrlen);
1725 
1726 			/*
1727 			 *	Find where to start putting bytes
1728 			 */
1729 			data = skb_put(skb, fraglen - pagedlen);
1730 			skb_set_network_header(skb, exthdrlen);
1731 			data += fragheaderlen;
1732 			skb->transport_header = (skb->network_header +
1733 						 fragheaderlen);
1734 			if (fraggap) {
				/* move the overhang of the previous skb
				 * here, fix up both checksums, then trim
				 * the previous skb to maxfraglen
				 */
1735 				skb->csum = skb_copy_and_csum_bits(
1736 					skb_prev, maxfraglen,
1737 					data + transhdrlen, fraggap);
1738 				skb_prev->csum = csum_sub(skb_prev->csum,
1739 							  skb->csum);
1740 				data += fraggap;
1741 				pskb_trim_unique(skb_prev, maxfraglen);
1742 			}
1743 			if (copy > 0 &&
1744 			    getfrag(from, data + transhdrlen, offset,
1745 				    copy, fraggap, skb) < 0) {
1746 				err = -EFAULT;
1747 				kfree_skb(skb);
1748 				goto error;
1749 			} else if (flags & MSG_SPLICE_PAGES) {
1750 				copy = 0;
1751 			}
1752 
1753 			offset += copy;
1754 			length -= copy + transhdrlen;
			/* header room applies to the first new skb only */
1755 			transhdrlen = 0;
1756 			exthdrlen = 0;
1757 			dst_exthdrlen = 0;
1758 
1759 			/* Only the initial fragment is time stamped */
1760 			skb_shinfo(skb)->tx_flags = cork->tx_flags;
1761 			cork->tx_flags = 0;
1762 			skb_shinfo(skb)->tskey = tskey;
1763 			tskey = 0;
1764 			skb_zcopy_set(skb, uarg, &extra_uref);
1765 
1766 			if ((flags & MSG_CONFIRM) && !skb_prev)
1767 				skb_set_dst_pending_confirm(skb, 1);
1768 
1769 			/*
1770 			 * Put the packet on the pending queue
1771 			 */
1772 			if (!skb->destructor) {
1773 				skb->destructor = sock_wfree;
1774 				skb->sk = sk;
1775 				wmem_alloc_delta += skb->truesize;
1776 			}
1777 			__skb_queue_tail(queue, skb);
1778 			continue;
1779 		}
1780 
1781 		if (copy > length)
1782 			copy = length;
1783 
		/* four ways to add data to the current skb: linear tailroom
		 * (no SG), page splicing, a copy into the page frag, or
		 * zerocopy
		 */
1784 		if (!(rt->dst.dev->features&NETIF_F_SG) &&
1785 		    skb_tailroom(skb) >= copy) {
1786 			unsigned int off;
1787 
1788 			off = skb->len;
1789 			if (getfrag(from, skb_put(skb, copy),
1790 						offset, copy, off, skb) < 0) {
1791 				__skb_trim(skb, off);
1792 				err = -EFAULT;
1793 				goto error;
1794 			}
1795 		} else if (flags & MSG_SPLICE_PAGES) {
1796 			struct msghdr *msg = from;
1797 
1798 			err = -EIO;
1799 			if (WARN_ON_ONCE(copy > msg->msg_iter.count))
1800 				goto error;
1801 
1802 			err = skb_splice_from_iter(skb, &msg->msg_iter, copy,
1803 						   sk->sk_allocation);
1804 			if (err < 0)
1805 				goto error;
1806 			copy = err;
1807 			wmem_alloc_delta += copy;
1808 		} else if (!zc) {
1809 			int i = skb_shinfo(skb)->nr_frags;
1810 
1811 			err = -ENOMEM;
1812 			if (!sk_page_frag_refill(sk, pfrag))
1813 				goto error;
1814 
1815 			skb_zcopy_downgrade_managed(skb);
1816 			if (!skb_can_coalesce(skb, i, pfrag->page,
1817 					      pfrag->offset)) {
1818 				err = -EMSGSIZE;
1819 				if (i == MAX_SKB_FRAGS)
1820 					goto error;
1821 
1822 				__skb_fill_page_desc(skb, i, pfrag->page,
1823 						     pfrag->offset, 0);
1824 				skb_shinfo(skb)->nr_frags = ++i;
1825 				get_page(pfrag->page);
1826 			}
1827 			copy = min_t(int, copy, pfrag->size - pfrag->offset);
1828 			if (getfrag(from,
1829 				    page_address(pfrag->page) + pfrag->offset,
1830 				    offset, copy, skb->len, skb) < 0)
1831 				goto error_efault;
1832 
1833 			pfrag->offset += copy;
1834 			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1835 			skb->len += copy;
1836 			skb->data_len += copy;
1837 			skb->truesize += copy;
1838 			wmem_alloc_delta += copy;
1839 		} else {
1840 			err = skb_zerocopy_iter_dgram(skb, from, copy);
1841 			if (err < 0)
1842 				goto error;
1843 		}
1844 		offset += copy;
1845 		length -= copy;
1846 	}
1847 
	/* charge the socket once for everything queued by this call */
1848 	if (wmem_alloc_delta)
1849 		refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1850 	return 0;
1851 
1852 error_efault:
1853 	err = -EFAULT;
1854 error:
1855 	net_zcopy_put_abort(uarg, extra_uref);
	/* take back the part of @length that was never consumed */
1856 	cork->length -= length;
1857 	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1858 	refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1859 	return err;
1860 }
1861 
1862 int ip6_append_data(struct sock *sk,
1863 		    int getfrag(void *from, char *to, int offset, int len,
1864 				int odd, struct sk_buff *skb),
1865 		    void *from, size_t length, int transhdrlen,
1866 		    struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1867 		    struct rt6_info *rt, unsigned int flags)
1868 {
1869 	struct inet_sock *inet = inet_sk(sk);
1870 	struct ipv6_pinfo *np = inet6_sk(sk);
1871 	int exthdrlen;
1872 	int err;
1873 
1874 	if (flags&MSG_PROBE)
1875 		return 0;
1876 	if (skb_queue_empty(&sk->sk_write_queue)) {
1877 		/*
1878 		 * setup for corking
1879 		 */
1880 		dst_hold(&rt->dst);
1881 		err = ip6_setup_cork(sk, &inet->cork, &np->cork,
1882 				     ipc6, rt);
1883 		if (err)
1884 			return err;
1885 
1886 		inet->cork.fl.u.ip6 = *fl6;
1887 		exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1888 		length += exthdrlen;
1889 		transhdrlen += exthdrlen;
1890 	} else {
1891 		transhdrlen = 0;
1892 	}
1893 
1894 	return __ip6_append_data(sk, &sk->sk_write_queue, &inet->cork,
1895 				 &np->cork, sk_page_frag(sk), getfrag,
1896 				 from, length, transhdrlen, flags, ipc6);
1897 }
1898 EXPORT_SYMBOL_GPL(ip6_append_data);
1899 
1900 static void ip6_cork_steal_dst(struct sk_buff *skb, struct inet_cork_full *cork)
1901 {
1902 	struct dst_entry *dst = cork->base.dst;
1903 
1904 	cork->base.dst = NULL;
1905 	cork->base.flags &= ~IPCORK_ALLFRAG;
1906 	skb_dst_set(skb, dst);
1907 }
1908 
1909 static void ip6_cork_release(struct inet_cork_full *cork,
1910 			     struct inet6_cork *v6_cork)
1911 {
1912 	if (v6_cork->opt) {
1913 		struct ipv6_txoptions *opt = v6_cork->opt;
1914 
1915 		kfree(opt->dst0opt);
1916 		kfree(opt->dst1opt);
1917 		kfree(opt->hopopt);
1918 		kfree(opt->srcrt);
1919 		kfree(opt);
1920 		v6_cork->opt = NULL;
1921 	}
1922 
1923 	if (cork->base.dst) {
1924 		dst_release(cork->base.dst);
1925 		cork->base.dst = NULL;
1926 		cork->base.flags &= ~IPCORK_ALLFRAG;
1927 	}
1928 }
1929 
/*
 * __ip6_make_skb - collapse a queue of pending skbs into a single packet.
 *
 * Dequeues the head skb and chains every remaining queued skb onto its
 * frag_list, adding their len and truesize to the head and clearing their
 * destructor and sk. It then pushes the extension headers from the cork
 * options and an IPv6 header built from the cork flow, copies priority,
 * mark and transmit time, hands the cork's dst over to the skb, updates
 * the output MIB counters and releases the cork.
 *
 * Returns the assembled skb, or NULL when @queue is empty; in that case
 * the cork is not released here.
 */
1930 struct sk_buff *__ip6_make_skb(struct sock *sk,
1931 			       struct sk_buff_head *queue,
1932 			       struct inet_cork_full *cork,
1933 			       struct inet6_cork *v6_cork)
1934 {
1935 	struct sk_buff *skb, *tmp_skb;
1936 	struct sk_buff **tail_skb;
1937 	struct in6_addr *final_dst;
1938 	struct ipv6_pinfo *np = inet6_sk(sk);
1939 	struct net *net = sock_net(sk);
1940 	struct ipv6hdr *hdr;
1941 	struct ipv6_txoptions *opt = v6_cork->opt;
1942 	struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
1943 	struct flowi6 *fl6 = &cork->fl.u.ip6;
1944 	unsigned char proto = fl6->flowi6_proto;
1945 
1946 	skb = __skb_dequeue(queue);
1947 	if (!skb)
1948 		goto out;
1949 	tail_skb = &(skb_shinfo(skb)->frag_list);
1950 
1951 	/* move skb->data to ip header from ext header */
1952 	if (skb->data < skb_network_header(skb))
1953 		__skb_pull(skb, skb_network_offset(skb));
	/* append the remaining skbs to the head's frag_list, dropping the
	 * per-fragment header area and the socket ownership of each
	 */
1954 	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
1955 		__skb_pull(tmp_skb, skb_network_header_len(skb));
1956 		*tail_skb = tmp_skb;
1957 		tail_skb = &(tmp_skb->next);
1958 		skb->len += tmp_skb->len;
1959 		skb->data_len += tmp_skb->len;
1960 		skb->truesize += tmp_skb->truesize;
1961 		tmp_skb->destructor = NULL;
1962 		tmp_skb->sk = NULL;
1963 	}
1964 
1965 	/* Allow local fragmentation. */
1966 	skb->ignore_df = ip6_sk_ignore_df(sk);
1967 	__skb_pull(skb, skb_network_header_len(skb));
1968 
	/* the push helpers are given &proto and &final_dst so they can
	 * update the next-header value and the destination used below
	 */
1969 	final_dst = &fl6->daddr;
1970 	if (opt && opt->opt_flen)
1971 		ipv6_push_frag_opts(skb, opt, &proto);
1972 	if (opt && opt->opt_nflen)
1973 		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);
1974 
1975 	skb_push(skb, sizeof(struct ipv6hdr));
1976 	skb_reset_network_header(skb);
1977 	hdr = ipv6_hdr(skb);
1978 
1979 	ip6_flow_hdr(hdr, v6_cork->tclass,
1980 		     ip6_make_flowlabel(net, skb, fl6->flowlabel,
1981 					ip6_autoflowlabel(net, np), fl6));
1982 	hdr->hop_limit = v6_cork->hop_limit;
1983 	hdr->nexthdr = proto;
1984 	hdr->saddr = fl6->saddr;
1985 	hdr->daddr = *final_dst;
1986 
1987 	skb->priority = sk->sk_priority;
1988 	skb->mark = cork->base.mark;
1989 	skb->tstamp = cork->base.transmit_time;
1990 
	/* rt was read from the cork above; from here on the dst (and its
	 * reference) belongs to the skb
	 */
1991 	ip6_cork_steal_dst(skb, cork);
1992 	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1993 	if (proto == IPPROTO_ICMPV6) {
1994 		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1995 		u8 icmp6_type;
1996 
		/* raw sockets without HDRINCL take the type from the flow,
		 * everything else reads it from the ICMPv6 header
		 */
1997 		if (sk->sk_socket->type == SOCK_RAW &&
1998 		   !inet_test_bit(HDRINCL, sk))
1999 			icmp6_type = fl6->fl6_icmp_type;
2000 		else
2001 			icmp6_type = icmp6_hdr(skb)->icmp6_type;
2002 		ICMP6MSGOUT_INC_STATS(net, idev, icmp6_type);
2003 		ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
2004 	}
2005 
2006 	ip6_cork_release(cork, v6_cork);
2007 out:
2008 	return skb;
2009 }
2010 
2011 int ip6_send_skb(struct sk_buff *skb)
2012 {
2013 	struct net *net = sock_net(skb->sk);
2014 	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
2015 	int err;
2016 
2017 	err = ip6_local_out(net, skb->sk, skb);
2018 	if (err) {
2019 		if (err > 0)
2020 			err = net_xmit_errno(err);
2021 		if (err)
2022 			IP6_INC_STATS(net, rt->rt6i_idev,
2023 				      IPSTATS_MIB_OUTDISCARDS);
2024 	}
2025 
2026 	return err;
2027 }
2028 
/*
 * ip6_push_pending_frames - build and send whatever is pending on @sk.
 *
 * Returns 0 when ip6_finish_skb() yields no skb, otherwise the result of
 * ip6_send_skb().
 */
int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb = ip6_finish_skb(sk);

	return skb ? ip6_send_skb(skb) : 0;
}
EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
2040 
2041 static void __ip6_flush_pending_frames(struct sock *sk,
2042 				       struct sk_buff_head *queue,
2043 				       struct inet_cork_full *cork,
2044 				       struct inet6_cork *v6_cork)
2045 {
2046 	struct sk_buff *skb;
2047 
2048 	while ((skb = __skb_dequeue_tail(queue)) != NULL) {
2049 		if (skb_dst(skb))
2050 			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
2051 				      IPSTATS_MIB_OUTDISCARDS);
2052 		kfree_skb(skb);
2053 	}
2054 
2055 	ip6_cork_release(cork, v6_cork);
2056 }
2057 
2058 void ip6_flush_pending_frames(struct sock *sk)
2059 {
2060 	__ip6_flush_pending_frames(sk, &sk->sk_write_queue,
2061 				   &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
2062 }
2063 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
2064 
2065 struct sk_buff *ip6_make_skb(struct sock *sk,
2066 			     int getfrag(void *from, char *to, int offset,
2067 					 int len, int odd, struct sk_buff *skb),
2068 			     void *from, size_t length, int transhdrlen,
2069 			     struct ipcm6_cookie *ipc6, struct rt6_info *rt,
2070 			     unsigned int flags, struct inet_cork_full *cork)
2071 {
2072 	struct inet6_cork v6_cork;
2073 	struct sk_buff_head queue;
2074 	int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
2075 	int err;
2076 
2077 	if (flags & MSG_PROBE) {
2078 		dst_release(&rt->dst);
2079 		return NULL;
2080 	}
2081 
2082 	__skb_queue_head_init(&queue);
2083 
2084 	cork->base.flags = 0;
2085 	cork->base.addr = 0;
2086 	cork->base.opt = NULL;
2087 	v6_cork.opt = NULL;
2088 	err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt);
2089 	if (err) {
2090 		ip6_cork_release(cork, &v6_cork);
2091 		return ERR_PTR(err);
2092 	}
2093 	if (ipc6->dontfrag < 0)
2094 		ipc6->dontfrag = inet6_sk(sk)->dontfrag;
2095 
2096 	err = __ip6_append_data(sk, &queue, cork, &v6_cork,
2097 				&current->task_frag, getfrag, from,
2098 				length + exthdrlen, transhdrlen + exthdrlen,
2099 				flags, ipc6);
2100 	if (err) {
2101 		__ip6_flush_pending_frames(sk, &queue, cork, &v6_cork);
2102 		return ERR_PTR(err);
2103 	}
2104 
2105 	return __ip6_make_skb(sk, &queue, cork, &v6_cork);
2106 }
2107