/*
 *	IPv6 output functions
 *	Linux INET6 implementation
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	Based on linux/net/ipv4/ip_output.c
 *
 *	This program is free software; you can redistribute it and/or
 *      modify it under the terms of the GNU General Public License
 *      as published by the Free Software Foundation; either version
 *      2 of the License, or (at your option) any later version.
 *
 *	Changes:
 *	A.N.Kuznetsov	:	arithmetic in fragmentation.
 *				extension headers are implemented.
 *				route changes now work.
 *				ip6_forward does not confuse sniffers.
 *				etc.
 *
 *      H. von Brand    :       Added missing #include <linux/string.h>
 *	Imran Patel	:	frag id should be in NBO
 *      Kazunori MIYAZAWA @USAGI
 *			:       add ip6_append_data and related functions
 *				for datagram xmit
 */

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/in6.h>
#include <linux/tcp.h>
#include <linux/route.h>
#include <linux/module.h>
#include <linux/slab.h>

#include <linux/bpf-cgroup.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>

#include <net/sock.h>
#include <net/snmp.h>

#include <net/ipv6.h>
#include <net/ndisc.h>
#include <net/protocol.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
#include <net/rawv6.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/checksum.h>
#include <linux/mroute6.h>
#include <net/l3mdev.h>
#include <net/lwtunnel.h>

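/*
 * Final transmit step: hand the packet to the neighbour layer.  For
 * multicast destinations this may first loop a clone back to local
 * listeners (and the multicast router socket) via dev_loopback_xmit(),
 * and it drops node-local-scoped packets on non-loopback devices.
 */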
static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct net_device *dev = dst->dev;
	struct neighbour *neigh;
	struct in6_addr *nexthop;
	int ret;

	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
		    ((mroute6_is_socket(net, skb) &&
		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
		     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
					 &ipv6_hdr(skb)->saddr))) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

			/* Do not check for IFF_ALLMULTI; multicast routing
			   is not supported in any case.
			 */
			if (newskb)
				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
					net, sk, newskb, NULL, newskb->dev,
					dev_loopback_xmit);

			if (ipv6_hdr(skb)->hop_limit == 0) {
				IP6_INC_STATS(net, idev,
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return 0;
			}
		}

		IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);

		if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
		    IPV6_ADDR_SCOPE_NODELOCAL &&
		    !(dev->flags & IFF_LOOPBACK)) {
			kfree_skb(skb);
			return 0;
		}
	}

	if (lwtunnel_xmit_redirect(dst->lwtstate)) {
		int res = lwtunnel_xmit(skb);

		if (res < 0 || res == LWTUNNEL_XMIT_DONE)
			return res;
	}

	rcu_read_lock_bh();
	nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
	neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
	if (unlikely(!neigh))
		neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
	if (!IS_ERR(neigh)) {
		sock_confirm_neigh(skb, neigh);
		ret = neigh_output(neigh, skb);
		rcu_read_unlock_bh();
		return ret;
	}
	rcu_read_unlock_bh();

	IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
	kfree_skb(skb);
	return -EINVAL;
}

static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	int ret;

	ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
	if (ret) {
		kfree_skb(skb);
		return ret;
	}

#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
	/* Policy lookup after SNAT yielded a new policy */
	if (skb_dst(skb)->xfrm) {
		IPCB(skb)->flags |= IPSKB_REROUTED;
		return dst_output(net, sk, skb);
	}
#endif

	if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
	    dst_allfrag(skb_dst(skb)) ||
	    (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
		return ip6_fragment(net, sk, skb, ip6_finish_output2);
	else
		return ip6_finish_output2(net, sk, skb);
}

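/*
 * The IPv6 half of dst_output(): drop the packet if IPv6 is disabled on the
 * egress device, otherwise run the NF_INET_POST_ROUTING hook (skipped for
 * rerouted packets) and finish via ip6_finish_output(), which fragments
 * oversized packets before handing them to ip6_finish_output2().
 */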
int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct net_device *dev = skb_dst(skb)->dev;
	struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	if (unlikely(idev->cnf.disable_ipv6)) {
		IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
		return 0;
	}

	return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
			    net, sk, skb, NULL, dev,
			    ip6_finish_output,
			    !(IP6CB(skb)->flags & IP6SKB_REROUTED));
}

bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np)
{
	if (!np->autoflowlabel_set)
		return ip6_default_np_autolabel(net);
	else
		return np->autoflowlabel;
}

/*
 * xmit an sk_buff (used by TCP, SCTP and DCCP)
 * Note: the socket lock is not held for SYNACK packets, but the socket might
 * still be modified by calls to skb_set_owner_w() and ipv6_local_error(),
 * which use proper atomic operations or spinlocks.
 */
int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
	     __u32 mark, struct ipv6_txoptions *opt, int tclass)
{
	struct net *net = sock_net(sk);
	const struct ipv6_pinfo *np = inet6_sk(sk);
	struct in6_addr *first_hop = &fl6->daddr;
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr;
	u8  proto = fl6->flowi6_proto;
	int seg_len = skb->len;
	int hlimit = -1;
	u32 mtu;

	if (opt) {
		unsigned int head_room;

		/* First: exthdrs may take lots of space (~8K for now);
		   MAX_HEADER is not enough.
		 */
		head_room = opt->opt_nflen + opt->opt_flen;
		seg_len += head_room;
		head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);

		if (skb_headroom(skb) < head_room) {
			struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
			if (!skb2) {
				IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return -ENOBUFS;
			}
			if (skb->sk)
				skb_set_owner_w(skb2, skb->sk);
			consume_skb(skb);
			skb = skb2;
		}
		if (opt->opt_flen)
			ipv6_push_frag_opts(skb, opt, &proto);
		if (opt->opt_nflen)
			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
					     &fl6->saddr);
	}

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	/*
	 *	Fill in the IPv6 header
	 */
	if (np)
		hlimit = np->hop_limit;
	if (hlimit < 0)
		hlimit = ip6_dst_hoplimit(dst);

	ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
				ip6_autoflowlabel(net, np), fl6));

	hdr->payload_len = htons(seg_len);
	hdr->nexthdr = proto;
	hdr->hop_limit = hlimit;

	hdr->saddr = fl6->saddr;
	hdr->daddr = *first_hop;

	skb->protocol = htons(ETH_P_IPV6);
	skb->priority = sk->sk_priority;
	skb->mark = mark;

	mtu = dst_mtu(dst);
	if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
		IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_OUT, skb->len);

		/* if egress device is enslaved to an L3 master device pass the
		 * skb to its handler for processing
		 */
		skb = l3mdev_ip6_out((struct sock *)sk, skb);
		if (unlikely(!skb))
			return 0;

		/* Hooks should never assume the socket lock is held;
		 * we promote our socket to non-const.
		 */
		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
			       net, (struct sock *)sk, skb, NULL, dst->dev,
			       dst_output);
	}

	skb->dev = dst->dev;
	/* ipv6_local_error() does not require the socket lock,
	 * so we promote our socket to non-const.
	 */
	ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);

	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return -EMSGSIZE;
}
EXPORT_SYMBOL(ip6_xmit);
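
/*
 * Sketch of a typical ip6_xmit() caller, loosely modelled on TCP's
 * inet6_csk_xmit(); names and details are illustrative, not a verbatim
 * copy of any particular kernel version:
 *
 *	struct flowi6 fl6;
 *	struct dst_entry *dst = inet6_csk_route_socket(sk, &fl6);
 *
 *	if (IS_ERR(dst)) {
 *		kfree_skb(skb);
 *		return PTR_ERR(dst);
 *	}
 *	skb_dst_set_noref(skb, dst);
 *	fl6.daddr = sk->sk_v6_daddr;
 *	return ip6_xmit(sk, skb, &fl6, sk->sk_mark,
 *			rcu_dereference(inet6_sk(sk)->opt),
 *			inet6_sk(sk)->tclass);
 */
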
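/*
 * Deliver a Router Alert packet to every raw socket that registered a
 * matching IPV6_ROUTER_ALERT selector (honouring any device binding);
 * every listener but the last receives a clone.  Returns 1 if the skb
 * was consumed by a socket, 0 otherwise.
 */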
static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
	struct ip6_ra_chain *ra;
	struct sock *last = NULL;

	read_lock(&ip6_ra_lock);
	for (ra = ip6_ra_chain; ra; ra = ra->next) {
		struct sock *sk = ra->sk;
		if (sk && ra->sel == sel &&
		    (!sk->sk_bound_dev_if ||
		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
			if (last) {
				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
				if (skb2)
					rawv6_rcv(last, skb2);
			}
			last = sk;
		}
	}

	if (last) {
		rawv6_rcv(last, skb);
		read_unlock(&ip6_ra_lock);
		return 1;
	}
	read_unlock(&ip6_ra_lock);
	return 0;
}

static int ip6_forward_proxy_check(struct sk_buff *skb)
{
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	u8 nexthdr = hdr->nexthdr;
	__be16 frag_off;
	int offset;

	if (ipv6_ext_hdr(nexthdr)) {
		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
		if (offset < 0)
			return 0;
	} else
		offset = sizeof(struct ipv6hdr);

	if (nexthdr == IPPROTO_ICMPV6) {
		struct icmp6hdr *icmp6;

		if (!pskb_may_pull(skb, (skb_network_header(skb) +
					 offset + 1 - skb->data)))
			return 0;

		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);

		switch (icmp6->icmp6_type) {
		case NDISC_ROUTER_SOLICITATION:
		case NDISC_ROUTER_ADVERTISEMENT:
		case NDISC_NEIGHBOUR_SOLICITATION:
		case NDISC_NEIGHBOUR_ADVERTISEMENT:
		case NDISC_REDIRECT:
			/* Pass a unicast neighbour discovery message
			 * destined to the proxied address to the input
			 * function.
			 */
			return 1;
		default:
			break;
		}
	}

	/*
	 * The proxying router can't forward traffic sent to a link-local
	 * address, so signal the sender and discard the packet. This
	 * behavior is clarified by the MIPv6 specification.
	 */
	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
		dst_link_failure(skb);
		return -1;
	}

	return 0;
}

static inline int ip6_forward_finish(struct net *net, struct sock *sk,
				     struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);

	__IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
	__IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);

#ifdef CONFIG_NET_SWITCHDEV
	if (skb->offload_l3_fwd_mark) {
		consume_skb(skb);
		return 0;
	}
#endif

	return dst_output(net, sk, skb);
}

static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
{
	if (skb->len <= mtu)
		return false;

	/* ipv6 conntrack defrag sets max_frag_size + ignore_df */
	if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
		return true;

	if (skb->ignore_df)
		return false;

	if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
		return false;

	return true;
}

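/*
 * Forward-path entry point.  Validates that forwarding is enabled and that
 * the packet is forwardable (PACKET_HOST, not LRO-merged, passes xfrm
 * policy), diverts Router Alert packets to ip6_call_ra_chain(), answers hop
 * limit expiry with ICMPV6_TIME_EXCEED, honours proxy NDP, emits redirects
 * or ICMPV6_PKT_TOOBIG where required, then decrements the hop limit and
 * runs the NF_INET_FORWARD hook on the way to ip6_forward_finish().
 */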
int ip6_forward(struct sk_buff *skb)
{
	struct inet6_dev *idev = __in6_dev_get_safely(skb->dev);
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	struct inet6_skb_parm *opt = IP6CB(skb);
	struct net *net = dev_net(dst->dev);
	u32 mtu;

	if (net->ipv6.devconf_all->forwarding == 0)
		goto error;

	if (skb->pkt_type != PACKET_HOST)
		goto drop;

	if (unlikely(skb->sk))
		goto drop;

	if (skb_warn_if_lro(skb))
		goto drop;

	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
		goto drop;
	}

	skb_forward_csum(skb);

	/*
	 *	We do NOT do any processing on Router Alert packets;
	 *	we push them to user level AS IS, with no guarantee that
	 *	the application will be able to interpret them. The reason
	 *	is that we cannot do anything clever here:
	 *
	 *	We are not the end node, so if the packet contains AH/ESP
	 *	we cannot do anything with it. Defragmentation would also
	 *	be a mistake; RA packets must not be fragmented, because
	 *	there is no guarantee that different fragments will travel
	 *	along the same path. --ANK
	 */
	if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
		if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
			return 0;
	}

	/*
	 *	check and decrement hop limit
	 */
	if (hdr->hop_limit <= 1) {
		/* Force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);

		kfree_skb(skb);
		return -ETIMEDOUT;
	}

	/* XXX: idev->cnf.proxy_ndp? */
	if (net->ipv6.devconf_all->proxy_ndp &&
	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
		int proxied = ip6_forward_proxy_check(skb);
		if (proxied > 0)
			return ip6_input(skb);
		else if (proxied < 0) {
			__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
			goto drop;
		}
	}

	if (!xfrm6_route_forward(skb)) {
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
		goto drop;
	}
	dst = skb_dst(skb);

	/* The IPv6 specs say nothing about it, but it is clear that we cannot
	   send redirects to source-routed frames.
	   We don't send redirects to frames decapsulated from IPsec.
	 */
	if (IP6CB(skb)->iif == dst->dev->ifindex &&
	    opt->srcrt == 0 && !skb_sec_path(skb)) {
		struct in6_addr *target = NULL;
		struct inet_peer *peer;
		struct rt6_info *rt;

		/*
		 *	incoming and outgoing devices are the same;
		 *	send a redirect.
		 */

		rt = (struct rt6_info *) dst;
		if (rt->rt6i_flags & RTF_GATEWAY)
			target = &rt->rt6i_gateway;
		else
			target = &hdr->daddr;

		peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);

		/* Limit redirects both by destination (here)
		   and by source (inside ndisc_send_redirect)
		 */
		if (inet_peer_xrlim_allow(peer, 1*HZ))
			ndisc_send_redirect(skb, target);
		if (peer)
			inet_putpeer(peer);
	} else {
		int addrtype = ipv6_addr_type(&hdr->saddr);

		/* This check is security critical. */
		if (addrtype == IPV6_ADDR_ANY ||
		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
			goto error;
		if (addrtype & IPV6_ADDR_LINKLOCAL) {
			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
				    ICMPV6_NOT_NEIGHBOUR, 0);
			goto error;
		}
	}

	mtu = ip6_dst_mtu_forward(dst);
	if (mtu < IPV6_MIN_MTU)
		mtu = IPV6_MIN_MTU;

	if (ip6_pkt_too_big(skb, mtu)) {
		/* Again, force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INTOOBIGERRORS);
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	if (skb_cow(skb, dst->dev->hard_header_len)) {
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_OUTDISCARDS);
		goto drop;
	}

	hdr = ipv6_hdr(skb);

	/* Mangling the hop limit is delayed until after the skb COW */

	hdr->hop_limit--;

	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
		       net, NULL, skb, skb->dev, dst->dev,
		       ip6_forward_finish);

error:
	__IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
drop:
	kfree_skb(skb);
	return -EINVAL;
}

static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	skb_dst_drop(to);
	skb_dst_set(to, dst_clone(skb_dst(from)));
	to->dev = from->dev;
	to->mark = from->mark;

	skb_copy_hash(to, from);

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
	nf_copy(to, from);
	skb_copy_secmark(to, from);
}

int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
		 int (*output)(struct net *, struct sock *, struct sk_buff *))
{
	struct sk_buff *frag;
	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
	struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
				inet6_sk(skb->sk) : NULL;
	struct ipv6hdr *tmp_hdr;
	struct frag_hdr *fh;
	unsigned int mtu, hlen, left, len;
	int hroom, troom;
	__be32 frag_id;
	int ptr, offset = 0, err = 0;
	u8 *prevhdr, nexthdr = 0;

	err = ip6_find_1stfragopt(skb, &prevhdr);
	if (err < 0)
		goto fail;
	hlen = err;
	nexthdr = *prevhdr;

	mtu = ip6_skb_dst_mtu(skb);

	/* We must not fragment if the socket is set to force MTU discovery
	 * or if the skb is not generated by a local socket.
	 */
	if (unlikely(!skb->ignore_df && skb->len > mtu))
		goto fail_toobig;

	if (IP6CB(skb)->frag_max_size) {
		if (IP6CB(skb)->frag_max_size > mtu)
			goto fail_toobig;

		/* don't send fragments larger than what we received */
		mtu = IP6CB(skb)->frag_max_size;
		if (mtu < IPV6_MIN_MTU)
			mtu = IPV6_MIN_MTU;
	}

	if (np && np->frag_size < mtu) {
		if (np->frag_size)
			mtu = np->frag_size;
	}
	if (mtu < hlen + sizeof(struct frag_hdr) + 8)
		goto fail_toobig;
	mtu -= hlen + sizeof(struct frag_hdr);

	frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
				    &ipv6_hdr(skb)->saddr);

	if (skb->ip_summed == CHECKSUM_PARTIAL &&
	    (err = skb_checksum_help(skb)))
		goto fail;

	hroom = LL_RESERVED_SPACE(rt->dst.dev);
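	/* Fast path: the skb already carries a frag list whose geometry fits
	 * the MTU (each piece a multiple of 8 bytes except the last, enough
	 * headroom everywhere, nothing shared or cloned), so fragment in
	 * place by prepending a fragment header to each piece instead of
	 * copying the payload.
	 */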
	if (skb_has_frag_list(skb)) {
		unsigned int first_len = skb_pagelen(skb);
		struct sk_buff *frag2;

		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    skb_cloned(skb) ||
		    skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
			goto slow_path;

		skb_walk_frags(skb, frag) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
				goto slow_path_clean;

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path_clean;

			BUG_ON(frag->sk);
			if (skb->sk) {
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
			}
			skb->truesize -= frag->truesize;
		}

		err = 0;
		offset = 0;
		/* BUILD HEADER */

		*prevhdr = NEXTHDR_FRAGMENT;
		tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
		if (!tmp_hdr) {
			err = -ENOMEM;
			goto fail;
		}
		frag = skb_shinfo(skb)->frag_list;
		skb_frag_list_init(skb);

		__skb_pull(skb, hlen);
		fh = __skb_push(skb, sizeof(struct frag_hdr));
		__skb_push(skb, hlen);
		skb_reset_network_header(skb);
		memcpy(skb_network_header(skb), tmp_hdr, hlen);

		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		fh->frag_off = htons(IP6_MF);
		fh->identification = frag_id;

		first_len = skb_pagelen(skb);
		skb->data_len = first_len - skb_headlen(skb);
		skb->len = first_len;
		ipv6_hdr(skb)->payload_len = htons(first_len -
						   sizeof(struct ipv6hdr));

		for (;;) {
			/* Prepare the header of the next frame,
			 * before the previous one goes down. */
			if (frag) {
				frag->ip_summed = CHECKSUM_NONE;
				skb_reset_transport_header(frag);
				fh = __skb_push(frag, sizeof(struct frag_hdr));
				__skb_push(frag, hlen);
				skb_reset_network_header(frag);
				memcpy(skb_network_header(frag), tmp_hdr,
				       hlen);
				offset += skb->len - hlen - sizeof(struct frag_hdr);
				fh->nexthdr = nexthdr;
				fh->reserved = 0;
				fh->frag_off = htons(offset);
				if (frag->next)
					fh->frag_off |= htons(IP6_MF);
				fh->identification = frag_id;
				ipv6_hdr(frag)->payload_len =
						htons(frag->len -
						      sizeof(struct ipv6hdr));
				ip6_copy_metadata(frag, skb);
			}

			err = output(net, sk, skb);
			if (!err)
				IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
					      IPSTATS_MIB_FRAGCREATES);

			if (err || !frag)
				break;

			skb = frag;
			frag = skb->next;
			skb_mark_not_on_list(skb);
		}

		kfree(tmp_hdr);

		if (err == 0) {
			IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
				      IPSTATS_MIB_FRAGOKS);
			return 0;
		}

		kfree_skb_list(frag);

		IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
			      IPSTATS_MIB_FRAGFAILS);
		return err;

slow_path_clean:
		skb_walk_frags(skb, frag2) {
			if (frag2 == frag)
				break;
			frag2->sk = NULL;
			frag2->destructor = NULL;
			skb->truesize += frag2->truesize;
		}
	}

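/*
 * Slow path: allocate a fresh skb for every fragment and copy the payload
 * into it; every fragment except the last carries a multiple of 8 bytes.
 */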
slow_path:
	left = skb->len - hlen;		/* Space per frame */
	ptr = hlen;			/* Where to start from */

	/*
	 *	Fragment the datagram.
	 */

	troom = rt->dst.dev->needed_tailroom;

	/*
	 *	Keep copying data until we run out.
	 */
	while (left > 0)	{
		u8 *fragnexthdr_offset;

		len = left;
		/* IF: it doesn't fit, use 'mtu' - the data space left */
		if (len > mtu)
			len = mtu;
		/* IF: we are not sending up to and including the packet end,
		   then align the next start on an eight byte boundary */
		if (len < left)	{
			len &= ~7;
		}

		/* Allocate buffer */
		frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
				 hroom + troom, GFP_ATOMIC);
		if (!frag) {
			err = -ENOMEM;
			goto fail;
		}

		/*
		 *	Set up data on packet
		 */

		ip6_copy_metadata(frag, skb);
		skb_reserve(frag, hroom);
		skb_put(frag, len + hlen + sizeof(struct frag_hdr));
		skb_reset_network_header(frag);
		fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
		frag->transport_header = (frag->network_header + hlen +
					  sizeof(struct frag_hdr));

		/*
		 *	Charge the memory for the fragment to any owner
		 *	it might possess
		 */
		if (skb->sk)
			skb_set_owner_w(frag, skb->sk);

		/*
		 *	Copy the packet header into the new buffer.
		 */
		skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);

		fragnexthdr_offset = skb_network_header(frag);
		fragnexthdr_offset += prevhdr - skb_network_header(skb);
		*fragnexthdr_offset = NEXTHDR_FRAGMENT;

		/*
		 *	Build fragment header.
		 */
		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		fh->identification = frag_id;

		/*
		 *	Copy a block of the IP datagram.
		 */
		BUG_ON(skb_copy_bits(skb, ptr, skb_transport_header(frag),
				     len));
		left -= len;

		fh->frag_off = htons(offset);
		if (left > 0)
			fh->frag_off |= htons(IP6_MF);
		ipv6_hdr(frag)->payload_len = htons(frag->len -
						    sizeof(struct ipv6hdr));

		ptr += len;
		offset += len;

		/*
		 *	Put this fragment into the sending queue.
		 */
		err = output(net, sk, frag);
		if (err)
			goto fail;

		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_FRAGCREATES);
	}
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGOKS);
	consume_skb(skb);
	return err;

fail_toobig:
	if (skb->sk && dst_allfrag(skb_dst(skb)))
		sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);

	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
	err = -EMSGSIZE;

fail:
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return err;
}

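/*
 * Helper for ip6_sk_dst_check(): returns nonzero when the cached route can
 * no longer be trusted for @fl_addr, i.e. it is not a host route for that
 * exact address and the socket's cached peer address is unset or differs.
 */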
static inline int ip6_rt_check(const struct rt6key *rt_key,
			       const struct in6_addr *fl_addr,
			       const struct in6_addr *addr_cache)
{
	return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
		(!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
}

static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
					  struct dst_entry *dst,
					  const struct flowi6 *fl6)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct rt6_info *rt;

	if (!dst)
		goto out;

	if (dst->ops->family != AF_INET6) {
		dst_release(dst);
		return NULL;
	}

	rt = (struct rt6_info *)dst;
	/* Yes, checking route validity in the unconnected
	 * case is not very simple. Take into account
	 * that we do not support routing by source, TOS,
	 * and MSG_DONTROUTE		--ANK (980726)
	 *
	 * 1. ip6_rt_check(): If the route was a host route,
	 *    check that the cached destination is current.
	 *    If it is a network route, we still may
	 *    check its validity using the saved pointer
	 *    to the last used address: daddr_cache.
	 *    We do not want to save the whole address now,
	 *    (because the main consumer of this service
	 *    is tcp, which does not have this problem),
	 *    so the last trick works only on connected
	 *    sockets.
	 * 2. oif also should be the same.
	 */
	if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
#ifdef CONFIG_IPV6_SUBTREES
	    ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
#endif
	   (!(fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) &&
	      (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex))) {
		dst_release(dst);
		dst = NULL;
	}

out:
	return dst;
}

static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
			       struct dst_entry **dst, struct flowi6 *fl6)
{
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	struct neighbour *n;
	struct rt6_info *rt;
#endif
	int err;
	int flags = 0;

	/* The correct way to handle this would be to do
	 * ip6_route_get_saddr, and then ip6_route_output; however,
	 * the route-specific preferred source forces the
	 * ip6_route_output call _before_ ip6_route_get_saddr.
	 *
	 * In source specific routing (no src=any default route),
	 * ip6_route_output will fail given src=any saddr, though, so
	 * that's why we try it again later.
	 */
	if (ipv6_addr_any(&fl6->saddr) && (!*dst || !(*dst)->error)) {
		struct fib6_info *from;
		struct rt6_info *rt;
		bool had_dst = *dst != NULL;

		if (!had_dst)
			*dst = ip6_route_output(net, sk, fl6);
		rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;

		rcu_read_lock();
		from = rt ? rcu_dereference(rt->from) : NULL;
		err = ip6_route_get_saddr(net, from, &fl6->daddr,
					  sk ? inet6_sk(sk)->srcprefs : 0,
					  &fl6->saddr);
		rcu_read_unlock();

		if (err)
			goto out_err_release;

		/* If we had an erroneous initial result, pretend it
		 * never existed and let the SA-enabled version take
		 * over.
		 */
		if (!had_dst && (*dst)->error) {
			dst_release(*dst);
			*dst = NULL;
		}

		if (fl6->flowi6_oif)
			flags |= RT6_LOOKUP_F_IFACE;
	}

	if (!*dst)
		*dst = ip6_route_output_flags(net, sk, fl6, flags);

	err = (*dst)->error;
	if (err)
		goto out_err_release;

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	/*
	 * If the dst entry we've looked up has a neighbour entry
	 * that is in the INCOMPLETE state and the src address from
	 * the flow is marked as OPTIMISTIC, we release the found
	 * dst entry and replace it instead with the dst entry of
	 * the nexthop router.
	 */
	rt = (struct rt6_info *) *dst;
	rcu_read_lock_bh();
	n = __ipv6_neigh_lookup_noref(rt->dst.dev,
				      rt6_nexthop(rt, &fl6->daddr));
	err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
	rcu_read_unlock_bh();

	if (err) {
		struct inet6_ifaddr *ifp;
		struct flowi6 fl_gw6;
		int redirect;

		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
				      (*dst)->dev, 1);

		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
		if (ifp)
			in6_ifa_put(ifp);

		if (redirect) {
			/*
			 * We need to get the dst entry for the
			 * default router instead
			 */
			dst_release(*dst);
			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
			*dst = ip6_route_output(net, sk, &fl_gw6);
			err = (*dst)->error;
			if (err)
				goto out_err_release;
		}
	}
#endif
	if (ipv6_addr_v4mapped(&fl6->saddr) &&
	    !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
		err = -EAFNOSUPPORT;
		goto out_err_release;
	}

	return 0;

out_err_release:
	dst_release(*dst);
	*dst = NULL;

	if (err == -ENETUNREACH)
		IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
	return err;
}

/**
 *	ip6_dst_lookup - perform route lookup on flow
 *	@net: network namespace to perform the lookup in
 *	@sk: socket which provides route info
 *	@dst: pointer to dst_entry * for result
 *	@fl6: flow to lookup
 *
 *	This function performs a route lookup on the given flow.
 *
 *	It returns zero on success, or a standard errno code on error.
 */
int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
		   struct flowi6 *fl6)
{
	*dst = NULL;
	return ip6_dst_lookup_tail(net, sk, dst, fl6);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup);

/**
 *	ip6_dst_lookup_flow - perform route lookup on flow with ipsec
 *	@sk: socket which provides route info
 *	@fl6: flow to lookup
 *	@final_dst: final destination address for ipsec lookup
 *
 *	This function performs a route lookup on the given flow.
 *
 *	It returns a valid dst pointer on success, or a pointer encoded
 *	error code.
 */
struct dst_entry *ip6_dst_lookup_flow(const struct sock *sk, struct flowi6 *fl6,
				      const struct in6_addr *final_dst)
{
	struct dst_entry *dst = NULL;
	int err;

	err = ip6_dst_lookup_tail(sock_net(sk), sk, &dst, fl6);
	if (err)
		return ERR_PTR(err);
	if (final_dst)
		fl6->daddr = *final_dst;

	return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
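
/*
 * Typical ip6_dst_lookup_flow() caller pattern (cf. rawv6_sendmsg();
 * illustrative sketch only, details vary between callers):
 *
 *	dst = ip6_dst_lookup_flow(sk, &fl6, final_p);
 *	if (IS_ERR(dst)) {
 *		err = PTR_ERR(dst);
 *		goto out;
 *	}
 */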

/**
 *	ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
 *	@sk: socket which provides the dst cache and route info
 *	@fl6: flow to lookup
 *	@final_dst: final destination address for ipsec lookup
 *	@connected: whether @sk is connected or not
 *
 *	This function performs a route lookup on the given flow with the
 *	possibility of using the cached route in the socket if it is valid.
 *	It will take the socket dst lock when operating on the dst cache.
 *	As a result, this function can only be used in process context.
 *
 *	In addition, for a connected socket, cache the dst in the socket
 *	if the current cache is not valid.
 *
 *	It returns a valid dst pointer on success, or a pointer encoded
 *	error code.
 */
struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
					 const struct in6_addr *final_dst,
					 bool connected)
{
	struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);

	dst = ip6_sk_dst_check(sk, dst, fl6);
	if (dst)
		return dst;

	dst = ip6_dst_lookup_flow(sk, fl6, final_dst);
	if (connected && !IS_ERR(dst))
		ip6_sk_dst_store_flow(sk, dst_clone(dst), fl6);

	return dst;
}
EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);

static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
					       gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
						gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

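/*
 * While appending data, recompute the usable mtu/maxfraglen pair: the first
 * fragment must reserve rt->dst.header_len, while later fragments may use
 * that space for payload.  Left untouched for DST_XFRM_TUNNEL routes.
 */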
static void ip6_append_data_mtu(unsigned int *mtu,
				int *maxfraglen,
				unsigned int fragheaderlen,
				struct sk_buff *skb,
				struct rt6_info *rt,
				unsigned int orig_mtu)
{
	if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
		if (!skb) {
			/* first fragment, reserve header_len */
			*mtu = orig_mtu - rt->dst.header_len;

		} else {
			/*
			 * this fragment is not the first; the header
			 * space is regarded as data space.
			 */
			*mtu = orig_mtu;
		}
		*maxfraglen = ((*mtu - fragheaderlen) & ~7)
			      + fragheaderlen - sizeof(struct frag_hdr);
	}
}

static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
			  struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
			  struct rt6_info *rt, struct flowi6 *fl6)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	unsigned int mtu;
	struct ipv6_txoptions *opt = ipc6->opt;

	/*
	 * setup for corking
	 */
	if (opt) {
		if (WARN_ON(v6_cork->opt))
			return -EINVAL;

		v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
		if (unlikely(!v6_cork->opt))
			return -ENOBUFS;

		v6_cork->opt->tot_len = sizeof(*opt);
		v6_cork->opt->opt_flen = opt->opt_flen;
		v6_cork->opt->opt_nflen = opt->opt_nflen;

		v6_cork->opt->dst0opt = ip6_opt_dup(opt->dst0opt,
						    sk->sk_allocation);
		if (opt->dst0opt && !v6_cork->opt->dst0opt)
			return -ENOBUFS;

		v6_cork->opt->dst1opt = ip6_opt_dup(opt->dst1opt,
						    sk->sk_allocation);
		if (opt->dst1opt && !v6_cork->opt->dst1opt)
			return -ENOBUFS;

		v6_cork->opt->hopopt = ip6_opt_dup(opt->hopopt,
						   sk->sk_allocation);
		if (opt->hopopt && !v6_cork->opt->hopopt)
			return -ENOBUFS;

		v6_cork->opt->srcrt = ip6_rthdr_dup(opt->srcrt,
						    sk->sk_allocation);
		if (opt->srcrt && !v6_cork->opt->srcrt)
			return -ENOBUFS;

		/* need source address above miyazawa */
	}
	dst_hold(&rt->dst);
	cork->base.dst = &rt->dst;
	cork->fl.u.ip6 = *fl6;
	v6_cork->hop_limit = ipc6->hlimit;
	v6_cork->tclass = ipc6->tclass;
	if (rt->dst.flags & DST_XFRM_TUNNEL)
		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
		      READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst);
	else
		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
			READ_ONCE(rt->dst.dev->mtu) : dst_mtu(xfrm_dst_path(&rt->dst));
	if (np->frag_size < mtu) {
		if (np->frag_size)
			mtu = np->frag_size;
	}
	if (mtu < IPV6_MIN_MTU)
		return -EINVAL;
	cork->base.fragsize = mtu;
	cork->base.gso_size = ipc6->gso_size;
	cork->base.tx_flags = 0;
	sock_tx_timestamp(sk, ipc6->sockc.tsflags, &cork->base.tx_flags);

	if (dst_allfrag(xfrm_dst_path(&rt->dst)))
		cork->base.flags |= IPCORK_ALLFRAG;
	cork->base.length = 0;

	cork->base.transmit_time = ipc6->sockc.transmit_time;

	return 0;
}

static int __ip6_append_data(struct sock *sk,
			     struct flowi6 *fl6,
			     struct sk_buff_head *queue,
			     struct inet_cork *cork,
			     struct inet6_cork *v6_cork,
			     struct page_frag *pfrag,
			     int getfrag(void *from, char *to, int offset,
					 int len, int odd, struct sk_buff *skb),
			     void *from, int length, int transhdrlen,
			     unsigned int flags, struct ipcm6_cookie *ipc6)
{
	struct sk_buff *skb, *skb_prev = NULL;
	unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
	struct ubuf_info *uarg = NULL;
	int exthdrlen = 0;
	int dst_exthdrlen = 0;
	int hh_len;
	int copy;
	int err;
	int offset = 0;
	u32 tskey = 0;
	struct rt6_info *rt = (struct rt6_info *)cork->dst;
	struct ipv6_txoptions *opt = v6_cork->opt;
	int csummode = CHECKSUM_NONE;
	unsigned int maxnonfragsize, headersize;
	unsigned int wmem_alloc_delta = 0;
	bool paged, extra_uref;

	skb = skb_peek_tail(queue);
	if (!skb) {
		exthdrlen = opt ? opt->opt_flen : 0;
		dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
	}

	paged = !!cork->gso_size;
	mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize;
	orig_mtu = mtu;

	if (cork->tx_flags & SKBTX_ANY_SW_TSTAMP &&
	    sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
		tskey = sk->sk_tskey++;

	hh_len = LL_RESERVED_SPACE(rt->dst.dev);

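	/* Per-fragment geometry: fragheaderlen is the part repeated in every
	 * fragment (IPv6 header plus per-fragment extension headers), and
	 * maxfraglen rounds the payload of non-final fragments down to a
	 * multiple of 8 bytes while reserving room for the fragment header.
	 */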
	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
			(opt ? opt->opt_nflen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
		     sizeof(struct frag_hdr);

	headersize = sizeof(struct ipv6hdr) +
		     (opt ? opt->opt_flen + opt->opt_nflen : 0) +
		     (dst_allfrag(&rt->dst) ?
		      sizeof(struct frag_hdr) : 0) +
		     rt->rt6i_nfheader_len;

	/* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit
	 * the first fragment
	 */
	if (headersize + transhdrlen > mtu)
		goto emsgsize;

	if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
	    (sk->sk_protocol == IPPROTO_UDP ||
	     sk->sk_protocol == IPPROTO_RAW)) {
		ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
				sizeof(struct ipv6hdr));
		goto emsgsize;
	}

	if (ip6_sk_ignore_df(sk))
		maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
	else
		maxnonfragsize = mtu;

	if (cork->length + length > maxnonfragsize - headersize) {
emsgsize:
		pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0);
		ipv6_local_error(sk, EMSGSIZE, fl6, pmtu);
		return -EMSGSIZE;
	}

	/* CHECKSUM_PARTIAL only with no extension headers and when
	 * we are not going to fragment
	 */
	if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
	    headersize == sizeof(struct ipv6hdr) &&
	    length <= mtu - headersize &&
	    (!(flags & MSG_MORE) || cork->gso_size) &&
	    rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
		csummode = CHECKSUM_PARTIAL;

	if (flags & MSG_ZEROCOPY && length && sock_flag(sk, SOCK_ZEROCOPY)) {
		uarg = sock_zerocopy_realloc(sk, length, skb_zcopy(skb));
		if (!uarg)
			return -ENOBUFS;
		extra_uref = true;
		if (rt->dst.dev->features & NETIF_F_SG &&
		    csummode == CHECKSUM_PARTIAL) {
			paged = true;
		} else {
			uarg->zerocopy = 0;
			skb_zcopy_set(skb, uarg, &extra_uref);
		}
	}

	/*
	 * Let's try using as much space as possible.
	 * Use the MTU if the total length of the message fits into it.
	 * Otherwise, we need to reserve fragment header and
	 * fragment alignment (= 8-15 octets, in total).
	 *
	 * Note that we may need to "move" the data from the tail
	 * of the buffer to the new fragment when we split
	 * the message.
	 *
	 * FIXME: It may be fragmented into multiple chunks
	 *        at once if non-fragmentable extension headers
	 *        are too large.
	 * --yoshfuji
	 */

	cork->length += length;
	if (!skb)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into current packet. */
		copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;

		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen;
			unsigned int pagedlen;
alloc_new_skb:
			/* There's no room in the current skb */
			if (skb)
				fraggap = skb->len - maxfraglen;
			else
				fraggap = 0;
			/* update mtu and maxfraglen if necessary */
			if (!skb || !skb_prev)
				ip6_append_data_mtu(&mtu, &maxfraglen,
						    fragheaderlen, skb, rt,
						    orig_mtu);

			skb_prev = skb;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;

			if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
				datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
			fraglen = datalen + fragheaderlen;
			pagedlen = 0;

			if ((flags & MSG_MORE) &&
			    !(rt->dst.dev->features&NETIF_F_SG))
				alloclen = mtu;
			else if (!paged)
				alloclen = fraglen;
			else {
				alloclen = min_t(int, fraglen, MAX_HEADER);
				pagedlen = fraglen - alloclen;
			}

			alloclen += dst_exthdrlen;

			if (datalen != length + fraggap) {
				/*
				 * this is not the last fragment; the trailer
				 * space is regarded as data space.
				 */
				datalen += rt->dst.trailer_len;
			}

			alloclen += rt->dst.trailer_len;
			fraglen = datalen + fragheaderlen;

			/*
			 * We just reserve space for fragment header.
			 * Note: this may be overallocation if the message
			 * (without MSG_MORE) fits into the MTU.
			 */
			alloclen += sizeof(struct frag_hdr);

			copy = datalen - transhdrlen - fraggap - pagedlen;
			if (copy < 0) {
				err = -EINVAL;
				goto error;
			}
			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk,
						alloclen + hh_len,
						(flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <=
				    2 * sk->sk_sndbuf)
					skb = alloc_skb(alloclen + hh_len,
							sk->sk_allocation);
				if (unlikely(!skb))
					err = -ENOBUFS;
			}
			if (!skb)
				goto error;
			/*
			 *	Fill in the control structures
			 */
			skb->protocol = htons(ETH_P_IPV6);
			skb->ip_summed = csummode;
			skb->csum = 0;
			/* reserve for fragmentation and ipsec header */
			skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
				    dst_exthdrlen);

			/*
			 *	Find where to start putting bytes
			 */
			data = skb_put(skb, fraglen - pagedlen);
			skb_set_network_header(skb, exthdrlen);
			data += fragheaderlen;
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}
			if (copy > 0 &&
			    getfrag(from, data + transhdrlen, offset,
				    copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= copy + transhdrlen;
			transhdrlen = 0;
			exthdrlen = 0;
			dst_exthdrlen = 0;

			/* Only the initial fragment is time stamped */
			skb_shinfo(skb)->tx_flags = cork->tx_flags;
			cork->tx_flags = 0;
			skb_shinfo(skb)->tskey = tskey;
			tskey = 0;
			skb_zcopy_set(skb, uarg, &extra_uref);

			if ((flags & MSG_CONFIRM) && !skb_prev)
				skb_set_dst_pending_confirm(skb, 1);

			/*
			 * Put the packet on the pending queue
			 */
			if (!skb->destructor) {
				skb->destructor = sock_wfree;
				skb->sk = sk;
				wmem_alloc_delta += skb->truesize;
			}
			__skb_queue_tail(queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		if (!(rt->dst.dev->features&NETIF_F_SG) &&
		    skb_tailroom(skb) >= copy) {
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
						offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else if (!uarg || !uarg->zerocopy) {
			int i = skb_shinfo(skb)->nr_frags;

			err = -ENOMEM;
			if (!sk_page_frag_refill(sk, pfrag))
				goto error;

			if (!skb_can_coalesce(skb, i, pfrag->page,
					      pfrag->offset)) {
				err = -EMSGSIZE;
				if (i == MAX_SKB_FRAGS)
					goto error;

				__skb_fill_page_desc(skb, i, pfrag->page,
						     pfrag->offset, 0);
				skb_shinfo(skb)->nr_frags = ++i;
				get_page(pfrag->page);
			}
			copy = min_t(int, copy, pfrag->size - pfrag->offset);
			if (getfrag(from,
				    page_address(pfrag->page) + pfrag->offset,
				    offset, copy, skb->len, skb) < 0)
				goto error_efault;

			pfrag->offset += copy;
			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			wmem_alloc_delta += copy;
		} else {
			err = skb_zerocopy_iter_dgram(skb, from, copy);
			if (err < 0)
				goto error;
		}
		offset += copy;
		length -= copy;
	}

	if (wmem_alloc_delta)
		refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
	return 0;

error_efault:
	err = -EFAULT;
error:
	sock_zerocopy_put_abort(uarg, extra_uref);
	cork->length -= length;
	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
	return err;
}

int ip6_append_data(struct sock *sk,
		    int getfrag(void *from, char *to, int offset, int len,
				int odd, struct sk_buff *skb),
		    void *from, int length, int transhdrlen,
		    struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
		    struct rt6_info *rt, unsigned int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	int exthdrlen;
	int err;

	if (flags&MSG_PROBE)
		return 0;
	if (skb_queue_empty(&sk->sk_write_queue)) {
		/*
		 * setup for corking
		 */
		err = ip6_setup_cork(sk, &inet->cork, &np->cork,
				     ipc6, rt, fl6);
		if (err)
			return err;

		exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
		length += exthdrlen;
		transhdrlen += exthdrlen;
	} else {
		fl6 = &inet->cork.fl.u.ip6;
		transhdrlen = 0;
	}

	return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base,
				 &np->cork, sk_page_frag(sk), getfrag,
				 from, length, transhdrlen, flags, ipc6);
}
EXPORT_SYMBOL_GPL(ip6_append_data);
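
/*
 * Typical corked-send pattern built on ip6_append_data(), roughly as in
 * rawv6_sendmsg(); an illustrative sketch only, not verbatim:
 *
 *	err = ip6_append_data(sk, ip_generic_getfrag, msg, len, 0,
 *			      &ipc6, &fl6, (struct rt6_info *)dst,
 *			      msg->msg_flags);
 *	if (err)
 *		ip6_flush_pending_frames(sk);
 *	else if (!(msg->msg_flags & MSG_MORE))
 *		err = ip6_push_pending_frames(sk);
 */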

static void ip6_cork_release(struct inet_cork_full *cork,
			     struct inet6_cork *v6_cork)
{
	if (v6_cork->opt) {
		kfree(v6_cork->opt->dst0opt);
		kfree(v6_cork->opt->dst1opt);
		kfree(v6_cork->opt->hopopt);
		kfree(v6_cork->opt->srcrt);
		kfree(v6_cork->opt);
		v6_cork->opt = NULL;
	}

	if (cork->base.dst) {
		dst_release(cork->base.dst);
		cork->base.dst = NULL;
		cork->base.flags &= ~IPCORK_ALLFRAG;
	}
	memset(&cork->fl, 0, sizeof(cork->fl));
}

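/*
 * Collapse all buffers queued by __ip6_append_data() into a single skb
 * (trailing buffers become its frag_list), push the corked extension
 * headers and the IPv6 header, account the packet, and release the cork.
 */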
struct sk_buff *__ip6_make_skb(struct sock *sk,
			       struct sk_buff_head *queue,
			       struct inet_cork_full *cork,
			       struct inet6_cork *v6_cork)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct net *net = sock_net(sk);
	struct ipv6hdr *hdr;
	struct ipv6_txoptions *opt = v6_cork->opt;
	struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
	struct flowi6 *fl6 = &cork->fl.u.ip6;
	unsigned char proto = fl6->flowi6_proto;

	skb = __skb_dequeue(queue);
	if (!skb)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Allow local fragmentation. */
	skb->ignore_df = ip6_sk_ignore_df(sk);

	*final_dst = fl6->daddr;
	__skb_pull(skb, skb_network_header_len(skb));
	if (opt && opt->opt_flen)
		ipv6_push_frag_opts(skb, opt, &proto);
	if (opt && opt->opt_nflen)
		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	ip6_flow_hdr(hdr, v6_cork->tclass,
		     ip6_make_flowlabel(net, skb, fl6->flowlabel,
					ip6_autoflowlabel(net, np), fl6));
	hdr->hop_limit = v6_cork->hop_limit;
	hdr->nexthdr = proto;
	hdr->saddr = fl6->saddr;
	hdr->daddr = *final_dst;

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	skb->tstamp = cork->base.transmit_time;

	skb_dst_set(skb, dst_clone(&rt->dst));
	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
	if (proto == IPPROTO_ICMPV6) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
		ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
	}

	ip6_cork_release(cork, v6_cork);
out:
	return skb;
}

int ip6_send_skb(struct sk_buff *skb)
{
	struct net *net = sock_net(skb->sk);
	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
	int err;

	err = ip6_local_out(net, skb->sk, skb);
	if (err) {
		if (err > 0)
			err = net_xmit_errno(err);
		if (err)
			IP6_INC_STATS(net, rt->rt6i_idev,
				      IPSTATS_MIB_OUTDISCARDS);
	}

	return err;
}

int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb;

	skb = ip6_finish_skb(sk);
	if (!skb)
		return 0;

	return ip6_send_skb(skb);
}
EXPORT_SYMBOL_GPL(ip6_push_pending_frames);

static void __ip6_flush_pending_frames(struct sock *sk,
				       struct sk_buff_head *queue,
				       struct inet_cork_full *cork,
				       struct inet6_cork *v6_cork)
{
	struct sk_buff *skb;

	while ((skb = __skb_dequeue_tail(queue)) != NULL) {
		if (skb_dst(skb))
			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
	}

	ip6_cork_release(cork, v6_cork);
}

void ip6_flush_pending_frames(struct sock *sk)
{
	__ip6_flush_pending_frames(sk, &sk->sk_write_queue,
				   &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
}
EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);

struct sk_buff *ip6_make_skb(struct sock *sk,
			     int getfrag(void *from, char *to, int offset,
					 int len, int odd, struct sk_buff *skb),
			     void *from, int length, int transhdrlen,
			     struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
			     struct rt6_info *rt, unsigned int flags,
			     struct inet_cork_full *cork)
{
	struct inet6_cork v6_cork;
	struct sk_buff_head queue;
	int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
	int err;

	if (flags & MSG_PROBE)
		return NULL;

	__skb_queue_head_init(&queue);

	cork->base.flags = 0;
	cork->base.addr = 0;
	cork->base.opt = NULL;
	cork->base.dst = NULL;
	v6_cork.opt = NULL;
	err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt, fl6);
	if (err) {
		ip6_cork_release(cork, &v6_cork);
		return ERR_PTR(err);
	}
	if (ipc6->dontfrag < 0)
		ipc6->dontfrag = inet6_sk(sk)->dontfrag;

	err = __ip6_append_data(sk, fl6, &queue, &cork->base, &v6_cork,
				&current->task_frag, getfrag, from,
				length + exthdrlen, transhdrlen + exthdrlen,
				flags, ipc6);
	if (err) {
		__ip6_flush_pending_frames(sk, &queue, cork, &v6_cork);
		return ERR_PTR(err);
	}

	return __ip6_make_skb(sk, &queue, cork, &v6_cork);
}
1814