xref: /openbmc/linux/net/ipv6/ip6_output.c (revision d2ba09c1)
1 /*
2  *	IPv6 output functions
3  *	Linux INET6 implementation
4  *
5  *	Authors:
6  *	Pedro Roque		<roque@di.fc.ul.pt>
7  *
8  *	Based on linux/net/ipv4/ip_output.c
9  *
10  *	This program is free software; you can redistribute it and/or
11  *      modify it under the terms of the GNU General Public License
12  *      as published by the Free Software Foundation; either version
13  *      2 of the License, or (at your option) any later version.
14  *
15  *	Changes:
16  *	A.N.Kuznetsov	:	arithmetic in fragmentation.
17  *				extension headers are implemented.
18  *				route changes now work.
19  *				ip6_forward does not confuse sniffers.
20  *				etc.
21  *
22  *      H. von Brand    :       Added missing #include <linux/string.h>
23  *	Imran Patel	:	frag id should be in NBO
24  *      Kazunori MIYAZAWA @USAGI
25  *			:       add ip6_append_data and related functions
26  *				for datagram xmit
27  */
28 
29 #include <linux/errno.h>
30 #include <linux/kernel.h>
31 #include <linux/string.h>
32 #include <linux/socket.h>
33 #include <linux/net.h>
34 #include <linux/netdevice.h>
35 #include <linux/if_arp.h>
36 #include <linux/in6.h>
37 #include <linux/tcp.h>
38 #include <linux/route.h>
39 #include <linux/module.h>
40 #include <linux/slab.h>
41 
42 #include <linux/bpf-cgroup.h>
43 #include <linux/netfilter.h>
44 #include <linux/netfilter_ipv6.h>
45 
46 #include <net/sock.h>
47 #include <net/snmp.h>
48 
49 #include <net/ipv6.h>
50 #include <net/ndisc.h>
51 #include <net/protocol.h>
52 #include <net/ip6_route.h>
53 #include <net/addrconf.h>
54 #include <net/rawv6.h>
55 #include <net/icmp.h>
56 #include <net/xfrm.h>
57 #include <net/checksum.h>
58 #include <linux/mroute6.h>
59 #include <net/l3mdev.h>
60 #include <net/lwtunnel.h>
61 
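/*
 * Last step of the output path: loop multicast packets back to local
 * listeners when needed, hand the skb to an attached lightweight
 * tunnel, then resolve the next-hop neighbour and transmit.
 */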
62 static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
63 {
64 	struct dst_entry *dst = skb_dst(skb);
65 	struct net_device *dev = dst->dev;
66 	struct neighbour *neigh;
67 	struct in6_addr *nexthop;
68 	int ret;
69 
70 	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
71 		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
72 
73 		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
74 		    ((mroute6_is_socket(net, skb) &&
75 		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
76 		     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
77 					 &ipv6_hdr(skb)->saddr))) {
78 			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
79 
80 			/* Do not check for IFF_ALLMULTI; multicast routing
81 			   is not supported in any case.
82 			 */
83 			if (newskb)
84 				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
85 					net, sk, newskb, NULL, newskb->dev,
86 					dev_loopback_xmit);
87 
88 			if (ipv6_hdr(skb)->hop_limit == 0) {
89 				IP6_INC_STATS(net, idev,
90 					      IPSTATS_MIB_OUTDISCARDS);
91 				kfree_skb(skb);
92 				return 0;
93 			}
94 		}
95 
96 		IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);
97 
98 		if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
99 		    IPV6_ADDR_SCOPE_NODELOCAL &&
100 		    !(dev->flags & IFF_LOOPBACK)) {
101 			kfree_skb(skb);
102 			return 0;
103 		}
104 	}
105 
106 	if (lwtunnel_xmit_redirect(dst->lwtstate)) {
107 		int res = lwtunnel_xmit(skb);
108 
109 		if (res < 0 || res == LWTUNNEL_XMIT_DONE)
110 			return res;
111 	}
112 
113 	rcu_read_lock_bh();
114 	nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
115 	neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
116 	if (unlikely(!neigh))
117 		neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
118 	if (!IS_ERR(neigh)) {
119 		sock_confirm_neigh(skb, neigh);
120 		ret = neigh_output(neigh, skb);
121 		rcu_read_unlock_bh();
122 		return ret;
123 	}
124 	rcu_read_unlock_bh();
125 
126 	IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
127 	kfree_skb(skb);
128 	return -EINVAL;
129 }
130 
131 static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
132 {
133 	int ret;
134 
135 	ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
136 	if (ret) {
137 		kfree_skb(skb);
138 		return ret;
139 	}
140 
141 #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
142 	/* Policy lookup after SNAT yielded a new policy */
143 	if (skb_dst(skb)->xfrm) {
144 		IPCB(skb)->flags |= IPSKB_REROUTED;
145 		return dst_output(net, sk, skb);
146 	}
147 #endif
148 
149 	if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
150 	    dst_allfrag(skb_dst(skb)) ||
151 	    (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
152 		return ip6_fragment(net, sk, skb, ip6_finish_output2);
153 	else
154 		return ip6_finish_output2(net, sk, skb);
155 }
156 
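/*
 * dst_output() entry point for IPv6.  Drops the packet if IPv6 is
 * administratively disabled on the egress device; otherwise runs the
 * NF_INET_POST_ROUTING hook (skipped for rerouted skbs) on the way to
 * ip6_finish_output().
 */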
157 int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
158 {
159 	struct net_device *dev = skb_dst(skb)->dev;
160 	struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
161 
162 	skb->protocol = htons(ETH_P_IPV6);
163 	skb->dev = dev;
164 
165 	if (unlikely(idev->cnf.disable_ipv6)) {
166 		IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
167 		kfree_skb(skb);
168 		return 0;
169 	}
170 
171 	return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
172 			    net, sk, skb, NULL, dev,
173 			    ip6_finish_output,
174 			    !(IP6CB(skb)->flags & IP6SKB_REROUTED));
175 }
176 
177 bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np)
178 {
179 	if (!np->autoflowlabel_set)
180 		return ip6_default_np_autolabel(net);
181 	else
182 		return np->autoflowlabel;
183 }
184 
185 /*
186  * xmit an sk_buff (used by TCP, SCTP and DCCP)
187  * Note: the socket lock is not held for SYNACK packets, but the skb
188  * might be modified by calls to skb_set_owner_w() and ipv6_local_error(),
189  * which use proper atomic operations or spinlocks.
190  */
191 int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
192 	     __u32 mark, struct ipv6_txoptions *opt, int tclass)
193 {
194 	struct net *net = sock_net(sk);
195 	const struct ipv6_pinfo *np = inet6_sk(sk);
196 	struct in6_addr *first_hop = &fl6->daddr;
197 	struct dst_entry *dst = skb_dst(skb);
198 	struct ipv6hdr *hdr;
199 	u8  proto = fl6->flowi6_proto;
200 	int seg_len = skb->len;
201 	int hlimit = -1;
202 	u32 mtu;
203 
204 	if (opt) {
205 		unsigned int head_room;
206 
207 		/* First: extension headers may take lots of space (~8K for now);
208 		   MAX_HEADER is not enough.
209 		 */
210 		head_room = opt->opt_nflen + opt->opt_flen;
211 		seg_len += head_room;
212 		head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
213 
214 		if (skb_headroom(skb) < head_room) {
215 			struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
216 			if (!skb2) {
217 				IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
218 					      IPSTATS_MIB_OUTDISCARDS);
219 				kfree_skb(skb);
220 				return -ENOBUFS;
221 			}
222 			consume_skb(skb);
223 			skb = skb2;
224 			/* skb_set_owner_w() changes sk->sk_wmem_alloc atomically,
225 			 * so it is safe to call in our context (socket lock not held).
226 			 */
227 			skb_set_owner_w(skb, (struct sock *)sk);
228 		}
229 		if (opt->opt_flen)
230 			ipv6_push_frag_opts(skb, opt, &proto);
231 		if (opt->opt_nflen)
232 			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
233 					     &fl6->saddr);
234 	}
235 
236 	skb_push(skb, sizeof(struct ipv6hdr));
237 	skb_reset_network_header(skb);
238 	hdr = ipv6_hdr(skb);
239 
240 	/*
241 	 *	Fill in the IPv6 header
242 	 */
243 	if (np)
244 		hlimit = np->hop_limit;
245 	if (hlimit < 0)
246 		hlimit = ip6_dst_hoplimit(dst);
247 
248 	ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
249 				ip6_autoflowlabel(net, np), fl6));
250 
251 	hdr->payload_len = htons(seg_len);
252 	hdr->nexthdr = proto;
253 	hdr->hop_limit = hlimit;
254 
255 	hdr->saddr = fl6->saddr;
256 	hdr->daddr = *first_hop;
257 
258 	skb->protocol = htons(ETH_P_IPV6);
259 	skb->priority = sk->sk_priority;
260 	skb->mark = mark;
261 
262 	mtu = dst_mtu(dst);
263 	if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
264 		IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
265 			      IPSTATS_MIB_OUT, skb->len);
266 
267 		/* if egress device is enslaved to an L3 master device, pass the
268 		 * skb to its handler for processing
269 		 */
270 		skb = l3mdev_ip6_out((struct sock *)sk, skb);
271 		if (unlikely(!skb))
272 			return 0;
273 
274 		/* hooks should never assume the socket lock is held;
275 		 * we promote our socket to non-const.
276 		 */
277 		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
278 			       net, (struct sock *)sk, skb, NULL, dst->dev,
279 			       dst_output);
280 	}
281 
282 	skb->dev = dst->dev;
283 	/* ipv6_local_error() does not require the socket lock;
284 	 * we promote our socket to non-const.
285 	 */
286 	ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);
287 
288 	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
289 	kfree_skb(skb);
290 	return -EMSGSIZE;
291 }
292 EXPORT_SYMBOL(ip6_xmit);
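
/*
 * Example: a minimal sketch (not part of this file) of how a stream
 * protocol with an established route might call ip6_xmit().  "np" and
 * "dst" are assumed to come from the caller, and real callers fetch
 * np->opt under rcu_read_lock():
 *
 *	struct flowi6 fl6;
 *
 *	memset(&fl6, 0, sizeof(fl6));
 *	fl6.flowi6_proto = sk->sk_protocol;
 *	fl6.daddr = sk->sk_v6_daddr;
 *	fl6.saddr = np->saddr;
 *	skb_dst_set_noref(skb, dst);
 *	ip6_xmit(sk, skb, &fl6, sk->sk_mark, np->opt, np->tclass);
 */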
293 
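/*
 * Deliver a Router Alert packet to every raw socket registered for
 * this RA selector (respecting any device binding).  Returns 1 if at
 * least one socket consumed the packet.
 */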
294 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
295 {
296 	struct ip6_ra_chain *ra;
297 	struct sock *last = NULL;
298 
299 	read_lock(&ip6_ra_lock);
300 	for (ra = ip6_ra_chain; ra; ra = ra->next) {
301 		struct sock *sk = ra->sk;
302 		if (sk && ra->sel == sel &&
303 		    (!sk->sk_bound_dev_if ||
304 		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
305 			if (last) {
306 				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
307 				if (skb2)
308 					rawv6_rcv(last, skb2);
309 			}
310 			last = sk;
311 		}
312 	}
313 
314 	if (last) {
315 		rawv6_rcv(last, skb);
316 		read_unlock(&ip6_ra_lock);
317 		return 1;
318 	}
319 	read_unlock(&ip6_ra_lock);
320 	return 0;
321 }
322 
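/*
 * Decide the fate of a packet whose destination matched a proxy NDP
 * entry: 1 means hand unicast NDISC messages to local input, 0 means
 * keep forwarding, -1 means drop (we cannot legitimately forward to a
 * link-local destination).
 */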
323 static int ip6_forward_proxy_check(struct sk_buff *skb)
324 {
325 	struct ipv6hdr *hdr = ipv6_hdr(skb);
326 	u8 nexthdr = hdr->nexthdr;
327 	__be16 frag_off;
328 	int offset;
329 
330 	if (ipv6_ext_hdr(nexthdr)) {
331 		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
332 		if (offset < 0)
333 			return 0;
334 	} else
335 		offset = sizeof(struct ipv6hdr);
336 
337 	if (nexthdr == IPPROTO_ICMPV6) {
338 		struct icmp6hdr *icmp6;
339 
340 		if (!pskb_may_pull(skb, (skb_network_header(skb) +
341 					 offset + 1 - skb->data)))
342 			return 0;
343 
344 		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
345 
346 		switch (icmp6->icmp6_type) {
347 		case NDISC_ROUTER_SOLICITATION:
348 		case NDISC_ROUTER_ADVERTISEMENT:
349 		case NDISC_NEIGHBOUR_SOLICITATION:
350 		case NDISC_NEIGHBOUR_ADVERTISEMENT:
351 		case NDISC_REDIRECT:
352 			/* For unicast neighbour discovery messages destined
353 			 * to the proxied address, pass the packet to the
354 			 * input function.
355 			 */
356 			return 1;
357 		default:
358 			break;
359 		}
360 	}
361 
362 	/*
363 	 * The proxying router can't forward traffic sent to a link-local
364 	 * address, so signal the sender and discard the packet. This
365 	 * behavior is clarified by the MIPv6 specification.
366 	 */
367 	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
368 		dst_link_failure(skb);
369 		return -1;
370 	}
371 
372 	return 0;
373 }
374 
375 static inline int ip6_forward_finish(struct net *net, struct sock *sk,
376 				     struct sk_buff *skb)
377 {
378 	struct dst_entry *dst = skb_dst(skb);
379 
380 	__IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
381 	__IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
382 
383 	return dst_output(net, sk, skb);
384 }
385 
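/*
 * True if the packet exceeds @mtu and may not go out as-is.  Packets
 * reassembled by conntrack are judged by their largest original
 * fragment; ignore_df skbs and GSO skbs that still fit once segmented
 * are allowed through.
 */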
386 static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
387 {
388 	if (skb->len <= mtu)
389 		return false;
390 
391 	/* ipv6 conntrack defrag sets max_frag_size + ignore_df */
392 	if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
393 		return true;
394 
395 	if (skb->ignore_df)
396 		return false;
397 
398 	if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
399 		return false;
400 
401 	return true;
402 }
403 
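/*
 * The forwarding path: check that forwarding is enabled and the packet
 * is eligible, honour Router Alert and proxy NDP, enforce hop limit
 * and path MTU, send a redirect when in and out devices match, then
 * decrement hop_limit and run the NF_INET_FORWARD hook.
 */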
404 int ip6_forward(struct sk_buff *skb)
405 {
406 	struct inet6_dev *idev = __in6_dev_get_safely(skb->dev);
407 	struct dst_entry *dst = skb_dst(skb);
408 	struct ipv6hdr *hdr = ipv6_hdr(skb);
409 	struct inet6_skb_parm *opt = IP6CB(skb);
410 	struct net *net = dev_net(dst->dev);
411 	u32 mtu;
412 
413 	if (net->ipv6.devconf_all->forwarding == 0)
414 		goto error;
415 
416 	if (skb->pkt_type != PACKET_HOST)
417 		goto drop;
418 
419 	if (unlikely(skb->sk))
420 		goto drop;
421 
422 	if (skb_warn_if_lro(skb))
423 		goto drop;
424 
425 	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
426 		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
427 		goto drop;
428 	}
429 
430 	skb_forward_csum(skb);
431 
432 	/*
433 	 *	We do NOT do any processing on RA packets:
434 	 *	we push them to user level AS IS,
435 	 *	without any warranty that the application will be
436 	 *	able to interpret them. The reason is that we
437 	 *	cannot do anything clever here.
438 	 *
439 	 *	We are not an end node, so if the packet contains
440 	 *	AH/ESP we cannot do anything with it.
441 	 *	Defragmentation would also be a mistake: RA packets
442 	 *	cannot be fragmented, because there is no guarantee
443 	 *	that different fragments will follow one path. --ANK
444 	 */
445 	if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
446 		if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
447 			return 0;
448 	}
449 
450 	/*
451 	 *	check and decrement ttl
452 	 */
453 	if (hdr->hop_limit <= 1) {
454 		/* Force the output device so its address is used as the ICMP source */
455 		skb->dev = dst->dev;
456 		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
457 		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
458 
459 		kfree_skb(skb);
460 		return -ETIMEDOUT;
461 	}
462 
463 	/* XXX: idev->cnf.proxy_ndp? */
464 	if (net->ipv6.devconf_all->proxy_ndp &&
465 	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
466 		int proxied = ip6_forward_proxy_check(skb);
467 		if (proxied > 0)
468 			return ip6_input(skb);
469 		else if (proxied < 0) {
470 			__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
471 			goto drop;
472 		}
473 	}
474 
475 	if (!xfrm6_route_forward(skb)) {
476 		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
477 		goto drop;
478 	}
479 	dst = skb_dst(skb);
480 
481 	/* IPv6 specs say nothing about it, but it is clear that we cannot
482 	   send redirects to source routed frames.
483 	   We don't send redirects to frames decapsulated from IPsec.
484 	 */
485 	if (skb->dev == dst->dev && opt->srcrt == 0 && !skb_sec_path(skb)) {
486 		struct in6_addr *target = NULL;
487 		struct inet_peer *peer;
488 		struct rt6_info *rt;
489 
490 		/*
491 		 *	incoming and outgoing devices are the same:
492 		 *	send a redirect.
493 		 */
494 
495 		rt = (struct rt6_info *) dst;
496 		if (rt->rt6i_flags & RTF_GATEWAY)
497 			target = &rt->rt6i_gateway;
498 		else
499 			target = &hdr->daddr;
500 
501 		peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);
502 
503 		/* Limit redirects both by destination (here)
504 		   and by source (inside ndisc_send_redirect)
505 		 */
506 		if (inet_peer_xrlim_allow(peer, 1*HZ))
507 			ndisc_send_redirect(skb, target);
508 		if (peer)
509 			inet_putpeer(peer);
510 	} else {
511 		int addrtype = ipv6_addr_type(&hdr->saddr);
512 
513 		/* This check is security critical. */
514 		if (addrtype == IPV6_ADDR_ANY ||
515 		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
516 			goto error;
517 		if (addrtype & IPV6_ADDR_LINKLOCAL) {
518 			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
519 				    ICMPV6_NOT_NEIGHBOUR, 0);
520 			goto error;
521 		}
522 	}
523 
524 	mtu = ip6_dst_mtu_forward(dst);
525 	if (mtu < IPV6_MIN_MTU)
526 		mtu = IPV6_MIN_MTU;
527 
528 	if (ip6_pkt_too_big(skb, mtu)) {
529 		/* Again, force the output device so its address is used as the ICMP source */
530 		skb->dev = dst->dev;
531 		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
532 		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INTOOBIGERRORS);
533 		__IP6_INC_STATS(net, ip6_dst_idev(dst),
534 				IPSTATS_MIB_FRAGFAILS);
535 		kfree_skb(skb);
536 		return -EMSGSIZE;
537 	}
538 
539 	if (skb_cow(skb, dst->dev->hard_header_len)) {
540 		__IP6_INC_STATS(net, ip6_dst_idev(dst),
541 				IPSTATS_MIB_OUTDISCARDS);
542 		goto drop;
543 	}
544 
545 	hdr = ipv6_hdr(skb);
546 
547 	/* Mangling hops number delayed to point after skb COW */
548 
549 	hdr->hop_limit--;
550 
551 	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
552 		       net, NULL, skb, skb->dev, dst->dev,
553 		       ip6_forward_finish);
554 
555 error:
556 	__IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
557 drop:
558 	kfree_skb(skb);
559 	return -EINVAL;
560 }
561 
562 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
563 {
564 	to->pkt_type = from->pkt_type;
565 	to->priority = from->priority;
566 	to->protocol = from->protocol;
567 	skb_dst_drop(to);
568 	skb_dst_set(to, dst_clone(skb_dst(from)));
569 	to->dev = from->dev;
570 	to->mark = from->mark;
571 
572 #ifdef CONFIG_NET_SCHED
573 	to->tc_index = from->tc_index;
574 #endif
575 	nf_copy(to, from);
576 	skb_copy_secmark(to, from);
577 }
578 
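/*
 * Fragment an oversized datagram.  The fast path below reuses an
 * existing frag_list when every piece already has suitable geometry;
 * otherwise the slow path copies the payload into newly allocated
 * fragments.
 *
 * Worked example (illustrative numbers only): with a 1500 byte MTU and
 * a bare 40 byte IPv6 header (hlen == 40), the payload budget is
 * mtu - hlen - sizeof(struct frag_hdr) == 1500 - 40 - 8 == 1452 bytes,
 * which the slow path rounds down to a multiple of 8 (1448) for every
 * fragment but the last.
 */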
579 int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
580 		 int (*output)(struct net *, struct sock *, struct sk_buff *))
581 {
582 	struct sk_buff *frag;
583 	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
584 	struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
585 				inet6_sk(skb->sk) : NULL;
586 	struct ipv6hdr *tmp_hdr;
587 	struct frag_hdr *fh;
588 	unsigned int mtu, hlen, left, len;
589 	int hroom, troom;
590 	__be32 frag_id;
591 	int ptr, offset = 0, err = 0;
592 	u8 *prevhdr, nexthdr = 0;
593 
594 	err = ip6_find_1stfragopt(skb, &prevhdr);
595 	if (err < 0)
596 		goto fail;
597 	hlen = err;
598 	nexthdr = *prevhdr;
599 
600 	mtu = ip6_skb_dst_mtu(skb);
601 
602 	/* We must not fragment if the socket is set to force MTU discovery
603 	 * or if the skb was not generated by a local socket.
604 	 */
605 	if (unlikely(!skb->ignore_df && skb->len > mtu))
606 		goto fail_toobig;
607 
608 	if (IP6CB(skb)->frag_max_size) {
609 		if (IP6CB(skb)->frag_max_size > mtu)
610 			goto fail_toobig;
611 
612 		/* don't send fragments larger than what we received */
613 		mtu = IP6CB(skb)->frag_max_size;
614 		if (mtu < IPV6_MIN_MTU)
615 			mtu = IPV6_MIN_MTU;
616 	}
617 
618 	if (np && np->frag_size < mtu) {
619 		if (np->frag_size)
620 			mtu = np->frag_size;
621 	}
622 	if (mtu < hlen + sizeof(struct frag_hdr) + 8)
623 		goto fail_toobig;
624 	mtu -= hlen + sizeof(struct frag_hdr);
625 
626 	frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
627 				    &ipv6_hdr(skb)->saddr);
628 
629 	if (skb->ip_summed == CHECKSUM_PARTIAL &&
630 	    (err = skb_checksum_help(skb)))
631 		goto fail;
632 
633 	hroom = LL_RESERVED_SPACE(rt->dst.dev);
634 	if (skb_has_frag_list(skb)) {
635 		unsigned int first_len = skb_pagelen(skb);
636 		struct sk_buff *frag2;
637 
638 		if (first_len - hlen > mtu ||
639 		    ((first_len - hlen) & 7) ||
640 		    skb_cloned(skb) ||
641 		    skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
642 			goto slow_path;
643 
644 		skb_walk_frags(skb, frag) {
645 			/* Correct geometry. */
646 			if (frag->len > mtu ||
647 			    ((frag->len & 7) && frag->next) ||
648 			    skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
649 				goto slow_path_clean;
650 
651 			/* Partially cloned skb? */
652 			if (skb_shared(frag))
653 				goto slow_path_clean;
654 
655 			BUG_ON(frag->sk);
656 			if (skb->sk) {
657 				frag->sk = skb->sk;
658 				frag->destructor = sock_wfree;
659 			}
660 			skb->truesize -= frag->truesize;
661 		}
662 
663 		err = 0;
664 		offset = 0;
665 		/* BUILD HEADER */
666 
667 		*prevhdr = NEXTHDR_FRAGMENT;
668 		tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
669 		if (!tmp_hdr) {
670 			err = -ENOMEM;
671 			goto fail;
672 		}
673 		frag = skb_shinfo(skb)->frag_list;
674 		skb_frag_list_init(skb);
675 
676 		__skb_pull(skb, hlen);
677 		fh = __skb_push(skb, sizeof(struct frag_hdr));
678 		__skb_push(skb, hlen);
679 		skb_reset_network_header(skb);
680 		memcpy(skb_network_header(skb), tmp_hdr, hlen);
681 
682 		fh->nexthdr = nexthdr;
683 		fh->reserved = 0;
684 		fh->frag_off = htons(IP6_MF);
685 		fh->identification = frag_id;
686 
687 		first_len = skb_pagelen(skb);
688 		skb->data_len = first_len - skb_headlen(skb);
689 		skb->len = first_len;
690 		ipv6_hdr(skb)->payload_len = htons(first_len -
691 						   sizeof(struct ipv6hdr));
692 
693 		for (;;) {
694 			/* Prepare the header of the next fragment
695 			 * before the previous one goes out. */
696 			if (frag) {
697 				frag->ip_summed = CHECKSUM_NONE;
698 				skb_reset_transport_header(frag);
699 				fh = __skb_push(frag, sizeof(struct frag_hdr));
700 				__skb_push(frag, hlen);
701 				skb_reset_network_header(frag);
702 				memcpy(skb_network_header(frag), tmp_hdr,
703 				       hlen);
704 				offset += skb->len - hlen - sizeof(struct frag_hdr);
705 				fh->nexthdr = nexthdr;
706 				fh->reserved = 0;
707 				fh->frag_off = htons(offset);
708 				if (frag->next)
709 					fh->frag_off |= htons(IP6_MF);
710 				fh->identification = frag_id;
711 				ipv6_hdr(frag)->payload_len =
712 						htons(frag->len -
713 						      sizeof(struct ipv6hdr));
714 				ip6_copy_metadata(frag, skb);
715 			}
716 
717 			err = output(net, sk, skb);
718 			if (!err)
719 				IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
720 					      IPSTATS_MIB_FRAGCREATES);
721 
722 			if (err || !frag)
723 				break;
724 
725 			skb = frag;
726 			frag = skb->next;
727 			skb->next = NULL;
728 		}
729 
730 		kfree(tmp_hdr);
731 
732 		if (err == 0) {
733 			IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
734 				      IPSTATS_MIB_FRAGOKS);
735 			return 0;
736 		}
737 
738 		kfree_skb_list(frag);
739 
740 		IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
741 			      IPSTATS_MIB_FRAGFAILS);
742 		return err;
743 
744 slow_path_clean:
745 		skb_walk_frags(skb, frag2) {
746 			if (frag2 == frag)
747 				break;
748 			frag2->sk = NULL;
749 			frag2->destructor = NULL;
750 			skb->truesize += frag2->truesize;
751 		}
752 	}
753 
754 slow_path:
755 	left = skb->len - hlen;		/* Space per frame */
756 	ptr = hlen;			/* Where to start from */
757 
758 	/*
759 	 *	Fragment the datagram.
760 	 */
761 
762 	troom = rt->dst.dev->needed_tailroom;
763 
764 	/*
765 	 *	Keep copying data until we run out.
766 	 */
767 	while (left > 0)	{
768 		u8 *fragnexthdr_offset;
769 
770 		len = left;
771 		/* IF: it doesn't fit, use 'mtu' - the data space left */
772 		if (len > mtu)
773 			len = mtu;
774 		/* IF: we are not sending up to and including the packet end
775 		   then align the next start on an eight byte boundary */
776 		if (len < left)	{
777 			len &= ~7;
778 		}
779 
780 		/* Allocate buffer */
781 		frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
782 				 hroom + troom, GFP_ATOMIC);
783 		if (!frag) {
784 			err = -ENOMEM;
785 			goto fail;
786 		}
787 
788 		/*
789 		 *	Set up data on packet
790 		 */
791 
792 		ip6_copy_metadata(frag, skb);
793 		skb_reserve(frag, hroom);
794 		skb_put(frag, len + hlen + sizeof(struct frag_hdr));
795 		skb_reset_network_header(frag);
796 		fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
797 		frag->transport_header = (frag->network_header + hlen +
798 					  sizeof(struct frag_hdr));
799 
800 		/*
801 		 *	Charge the memory for the fragment to any owner
802 		 *	it might possess
803 		 */
804 		if (skb->sk)
805 			skb_set_owner_w(frag, skb->sk);
806 
807 		/*
808 		 *	Copy the packet header into the new buffer.
809 		 */
810 		skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
811 
812 		fragnexthdr_offset = skb_network_header(frag);
813 		fragnexthdr_offset += prevhdr - skb_network_header(skb);
814 		*fragnexthdr_offset = NEXTHDR_FRAGMENT;
815 
816 		/*
817 		 *	Build fragment header.
818 		 */
819 		fh->nexthdr = nexthdr;
820 		fh->reserved = 0;
821 		fh->identification = frag_id;
822 
823 		/*
824 		 *	Copy a block of the IP datagram.
825 		 */
826 		BUG_ON(skb_copy_bits(skb, ptr, skb_transport_header(frag),
827 				     len));
828 		left -= len;
829 
830 		fh->frag_off = htons(offset);
831 		if (left > 0)
832 			fh->frag_off |= htons(IP6_MF);
833 		ipv6_hdr(frag)->payload_len = htons(frag->len -
834 						    sizeof(struct ipv6hdr));
835 
836 		ptr += len;
837 		offset += len;
838 
839 		/*
840 		 *	Put this fragment into the sending queue.
841 		 */
842 		err = output(net, sk, frag);
843 		if (err)
844 			goto fail;
845 
846 		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
847 			      IPSTATS_MIB_FRAGCREATES);
848 	}
849 	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
850 		      IPSTATS_MIB_FRAGOKS);
851 	consume_skb(skb);
852 	return err;
853 
854 fail_toobig:
855 	if (skb->sk && dst_allfrag(skb_dst(skb)))
856 		sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);
857 
858 	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
859 	err = -EMSGSIZE;
860 
861 fail:
862 	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
863 		      IPSTATS_MIB_FRAGFAILS);
864 	kfree_skb(skb);
865 	return err;
866 }
867 
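/*
 * Nonzero means the cached route can no longer be trusted for this
 * flow: it is neither a host route for exactly this address nor
 * confirmed by the socket's cached peer address.
 */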
868 static inline int ip6_rt_check(const struct rt6key *rt_key,
869 			       const struct in6_addr *fl_addr,
870 			       const struct in6_addr *addr_cache)
871 {
872 	return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
873 		(!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
874 }
875 
876 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
877 					  struct dst_entry *dst,
878 					  const struct flowi6 *fl6)
879 {
880 	struct ipv6_pinfo *np = inet6_sk(sk);
881 	struct rt6_info *rt;
882 
883 	if (!dst)
884 		goto out;
885 
886 	if (dst->ops->family != AF_INET6) {
887 		dst_release(dst);
888 		return NULL;
889 	}
890 
891 	rt = (struct rt6_info *)dst;
892 	/* Yes, checking route validity in the not-connected
893 	 * case is not very simple. Take into account
894 	 * that we do not support routing by source, TOS,
895 	 * or MSG_DONTROUTE		--ANK (980726)
896 	 *
897 	 * 1. ip6_rt_check(): if the route is a host route,
898 	 *    check that the cached destination is current.
899 	 *    If it is a network route, we may still
900 	 *    check its validity using a saved pointer
901 	 *    to the last used address: daddr_cache.
902 	 *    We do not want to save the whole address now
903 	 *    (because the main consumer of this service
904 	 *    is tcp, which does not have this problem),
905 	 *    so the last trick works only on connected
906 	 *    sockets.
907 	 * 2. oif should also be the same.
908 	 */
909 	if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
910 #ifdef CONFIG_IPV6_SUBTREES
911 	    ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
912 #endif
913 	   (!(fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) &&
914 	      (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex))) {
915 		dst_release(dst);
916 		dst = NULL;
917 	}
918 
919 out:
920 	return dst;
921 }
922 
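/*
 * Common back end of the dst lookup helpers below: pick a source
 * address when the caller left it unspecified, do the route lookup,
 * and optionally detour via the default router while our preferred
 * source address is still optimistic (DAD not yet completed).
 */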
923 static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
924 			       struct dst_entry **dst, struct flowi6 *fl6)
925 {
926 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
927 	struct neighbour *n;
928 	struct rt6_info *rt;
929 #endif
930 	int err;
931 	int flags = 0;
932 
933 	/* The correct way to handle this would be to do
934 	 * ip6_route_get_saddr, and then ip6_route_output; however,
935 	 * the route-specific preferred source forces the
936 	 * ip6_route_output call _before_ ip6_route_get_saddr.
937 	 *
938 	 * In source specific routing (no src=any default route),
939 	 * ip6_route_output will fail given src=any saddr, though, so
940 	 * that's why we try it again later.
941 	 */
942 	if (ipv6_addr_any(&fl6->saddr) && (!*dst || !(*dst)->error)) {
943 		struct fib6_info *from;
944 		struct rt6_info *rt;
945 		bool had_dst = *dst != NULL;
946 
947 		if (!had_dst)
948 			*dst = ip6_route_output(net, sk, fl6);
949 		rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;
950 
951 		rcu_read_lock();
952 		from = rt ? rcu_dereference(rt->from) : NULL;
953 		err = ip6_route_get_saddr(net, from, &fl6->daddr,
954 					  sk ? inet6_sk(sk)->srcprefs : 0,
955 					  &fl6->saddr);
956 		rcu_read_unlock();
957 
958 		if (err)
959 			goto out_err_release;
960 
961 		/* If we had an erroneous initial result, pretend it
962 		 * never existed and let the SA-enabled version take
963 		 * over.
964 		 */
965 		if (!had_dst && (*dst)->error) {
966 			dst_release(*dst);
967 			*dst = NULL;
968 		}
969 
970 		if (fl6->flowi6_oif)
971 			flags |= RT6_LOOKUP_F_IFACE;
972 	}
973 
974 	if (!*dst)
975 		*dst = ip6_route_output_flags(net, sk, fl6, flags);
976 
977 	err = (*dst)->error;
978 	if (err)
979 		goto out_err_release;
980 
981 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
982 	/*
983 	 * If the dst entry we have looked up has a
984 	 * neighbour entry in the INCOMPLETE state and the
985 	 * src address from the flow is marked as
986 	 * OPTIMISTIC, we release the found dst entry and
987 	 * replace it with the dst entry of the nexthop
988 	 * router instead.
989 	 */
990 	rt = (struct rt6_info *) *dst;
991 	rcu_read_lock_bh();
992 	n = __ipv6_neigh_lookup_noref(rt->dst.dev,
993 				      rt6_nexthop(rt, &fl6->daddr));
994 	err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
995 	rcu_read_unlock_bh();
996 
997 	if (err) {
998 		struct inet6_ifaddr *ifp;
999 		struct flowi6 fl_gw6;
1000 		int redirect;
1001 
1002 		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
1003 				      (*dst)->dev, 1);
1004 
1005 		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
1006 		if (ifp)
1007 			in6_ifa_put(ifp);
1008 
1009 		if (redirect) {
1010 			/*
1011 			 * We need to get the dst entry for the
1012 			 * default router instead
1013 			 */
1014 			dst_release(*dst);
1015 			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
1016 			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
1017 			*dst = ip6_route_output(net, sk, &fl_gw6);
1018 			err = (*dst)->error;
1019 			if (err)
1020 				goto out_err_release;
1021 		}
1022 	}
1023 #endif
1024 	if (ipv6_addr_v4mapped(&fl6->saddr) &&
1025 	    !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
1026 		err = -EAFNOSUPPORT;
1027 		goto out_err_release;
1028 	}
1029 
1030 	return 0;
1031 
1032 out_err_release:
1033 	dst_release(*dst);
1034 	*dst = NULL;
1035 
1036 	if (err == -ENETUNREACH)
1037 		IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
1038 	return err;
1039 }
1040 
1041 /**
1042  *	ip6_dst_lookup - perform route lookup on flow
 *	@net: network namespace
1043  *	@sk: socket which provides route info
1044  *	@dst: pointer to dst_entry * for result
1045  *	@fl6: flow to lookup
1046  *
1047  *	This function performs a route lookup on the given flow.
1048  *
1049  *	It returns zero on success, or a standard errno code on error.
1050  */
1051 int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
1052 		   struct flowi6 *fl6)
1053 {
1054 	*dst = NULL;
1055 	return ip6_dst_lookup_tail(net, sk, dst, fl6);
1056 }
1057 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1058 
1059 /**
1060  *	ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1061  *	@sk: socket which provides route info
1062  *	@fl6: flow to lookup
1063  *	@final_dst: final destination address for ipsec lookup
1064  *
1065  *	This function performs a route lookup on the given flow.
1066  *
1067  *	It returns a valid dst pointer on success, or a pointer encoded
1068  *	error code.
1069  */
1070 struct dst_entry *ip6_dst_lookup_flow(const struct sock *sk, struct flowi6 *fl6,
1071 				      const struct in6_addr *final_dst)
1072 {
1073 	struct dst_entry *dst = NULL;
1074 	int err;
1075 
1076 	err = ip6_dst_lookup_tail(sock_net(sk), sk, &dst, fl6);
1077 	if (err)
1078 		return ERR_PTR(err);
1079 	if (final_dst)
1080 		fl6->daddr = *final_dst;
1081 
1082 	return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1083 }
1084 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
1085 
1086 /**
1087  *	ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1088  *	@sk: socket which provides the dst cache and route info
1089  *	@fl6: flow to lookup
1090  *	@final_dst: final destination address for ipsec lookup
1091  *	@connected: whether @sk is connected or not
1092  *
1093  *	This function performs a route lookup on the given flow with the
1094  *	possibility of using the cached route in the socket if it is valid.
1095  *	It will take the socket dst lock when operating on the dst cache.
1096  *	As a result, this function can only be used in process context.
1097  *
1098  *	In addition, for a connected socket, cache the dst in the socket
1099  *	if the current cache is not valid.
1100  *
1101  *	It returns a valid dst pointer on success, or a pointer encoded
1102  *	error code.
1103  */
1104 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1105 					 const struct in6_addr *final_dst,
1106 					 bool connected)
1107 {
1108 	struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1109 
1110 	dst = ip6_sk_dst_check(sk, dst, fl6);
1111 	if (dst)
1112 		return dst;
1113 
1114 	dst = ip6_dst_lookup_flow(sk, fl6, final_dst);
1115 	if (connected && !IS_ERR(dst))
1116 		ip6_sk_dst_store_flow(sk, dst_clone(dst), fl6);
1117 
1118 	return dst;
1119 }
1120 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1121 
1122 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1123 					       gfp_t gfp)
1124 {
1125 	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1126 }
1127 
1128 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1129 						gfp_t gfp)
1130 {
1131 	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1132 }
1133 
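/*
 * Refresh mtu/maxfraglen while appending: only the first fragment must
 * reserve rt->dst.header_len, so later fragments may reuse that room
 * for data (except on XFRM tunnel routes, which keep the corked
 * values).
 */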
1134 static void ip6_append_data_mtu(unsigned int *mtu,
1135 				int *maxfraglen,
1136 				unsigned int fragheaderlen,
1137 				struct sk_buff *skb,
1138 				struct rt6_info *rt,
1139 				unsigned int orig_mtu)
1140 {
1141 	if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1142 		if (!skb) {
1143 			/* first fragment, reserve header_len */
1144 			*mtu = orig_mtu - rt->dst.header_len;
1145 
1146 		} else {
1147 			/*
1148 			 * this fragment is not the first; the header
1149 			 * space is regarded as data space.
1150 			 */
1151 			*mtu = orig_mtu;
1152 		}
1153 		*maxfraglen = ((*mtu - fragheaderlen) & ~7)
1154 			      + fragheaderlen - sizeof(struct frag_hdr);
1155 	}
1156 }
1157 
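/*
 * Initialise cork state for a corked send: duplicate the tx options so
 * the caller can free its copy, pin the route, and derive the fragment
 * size from the PMTU discovery mode, IPV6_MTU (np->frag_size) and the
 * route itself.
 */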
1158 static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
1159 			  struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
1160 			  struct rt6_info *rt, struct flowi6 *fl6)
1161 {
1162 	struct ipv6_pinfo *np = inet6_sk(sk);
1163 	unsigned int mtu;
1164 	struct ipv6_txoptions *opt = ipc6->opt;
1165 
1166 	/*
1167 	 * setup for corking
1168 	 */
1169 	if (opt) {
1170 		if (WARN_ON(v6_cork->opt))
1171 			return -EINVAL;
1172 
1173 		v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
1174 		if (unlikely(!v6_cork->opt))
1175 			return -ENOBUFS;
1176 
1177 		v6_cork->opt->tot_len = sizeof(*opt);
1178 		v6_cork->opt->opt_flen = opt->opt_flen;
1179 		v6_cork->opt->opt_nflen = opt->opt_nflen;
1180 
1181 		v6_cork->opt->dst0opt = ip6_opt_dup(opt->dst0opt,
1182 						    sk->sk_allocation);
1183 		if (opt->dst0opt && !v6_cork->opt->dst0opt)
1184 			return -ENOBUFS;
1185 
1186 		v6_cork->opt->dst1opt = ip6_opt_dup(opt->dst1opt,
1187 						    sk->sk_allocation);
1188 		if (opt->dst1opt && !v6_cork->opt->dst1opt)
1189 			return -ENOBUFS;
1190 
1191 		v6_cork->opt->hopopt = ip6_opt_dup(opt->hopopt,
1192 						   sk->sk_allocation);
1193 		if (opt->hopopt && !v6_cork->opt->hopopt)
1194 			return -ENOBUFS;
1195 
1196 		v6_cork->opt->srcrt = ip6_rthdr_dup(opt->srcrt,
1197 						    sk->sk_allocation);
1198 		if (opt->srcrt && !v6_cork->opt->srcrt)
1199 			return -ENOBUFS;
1200 
1201 		/* need source address above. --miyazawa */
1202 	}
1203 	dst_hold(&rt->dst);
1204 	cork->base.dst = &rt->dst;
1205 	cork->fl.u.ip6 = *fl6;
1206 	v6_cork->hop_limit = ipc6->hlimit;
1207 	v6_cork->tclass = ipc6->tclass;
1208 	if (rt->dst.flags & DST_XFRM_TUNNEL)
1209 		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1210 		      READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst);
1211 	else
1212 		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1213 			READ_ONCE(rt->dst.dev->mtu) : dst_mtu(xfrm_dst_path(&rt->dst));
1214 	if (np->frag_size < mtu) {
1215 		if (np->frag_size)
1216 			mtu = np->frag_size;
1217 	}
1218 	if (mtu < IPV6_MIN_MTU)
1219 		return -EINVAL;
1220 	cork->base.fragsize = mtu;
1221 	cork->base.gso_size = sk->sk_type == SOCK_DGRAM ? ipc6->gso_size : 0;
1222 
1223 	if (dst_allfrag(xfrm_dst_path(&rt->dst)))
1224 		cork->base.flags |= IPCORK_ALLFRAG;
1225 	cork->base.length = 0;
1226 
1227 	return 0;
1228 }
1229 
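/*
 * Work horse of ip6_append_data(): append @length bytes of payload,
 * first into the tailroom/page frags of the last queued skb, then into
 * freshly allocated skbs capped at maxfraglen.
 *
 * Worked example (illustrative numbers only): with mtu == 1500 and
 * fragheaderlen == 40 (bare IPv6 header, no extension headers),
 * maxfraglen == ((1500 - 40) & ~7) + 40 - sizeof(struct frag_hdr)
 *            == 1456 + 40 - 8 == 1488,
 * leaving 1448 payload bytes per non-final fragment once
 * ip6_fragment() inserts the 8 byte fragment header.
 */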
1230 static int __ip6_append_data(struct sock *sk,
1231 			     struct flowi6 *fl6,
1232 			     struct sk_buff_head *queue,
1233 			     struct inet_cork *cork,
1234 			     struct inet6_cork *v6_cork,
1235 			     struct page_frag *pfrag,
1236 			     int getfrag(void *from, char *to, int offset,
1237 					 int len, int odd, struct sk_buff *skb),
1238 			     void *from, int length, int transhdrlen,
1239 			     unsigned int flags, struct ipcm6_cookie *ipc6,
1240 			     const struct sockcm_cookie *sockc)
1241 {
1242 	struct sk_buff *skb, *skb_prev = NULL;
1243 	unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
1244 	int exthdrlen = 0;
1245 	int dst_exthdrlen = 0;
1246 	int hh_len;
1247 	int copy;
1248 	int err;
1249 	int offset = 0;
1250 	__u8 tx_flags = 0;
1251 	u32 tskey = 0;
1252 	struct rt6_info *rt = (struct rt6_info *)cork->dst;
1253 	struct ipv6_txoptions *opt = v6_cork->opt;
1254 	int csummode = CHECKSUM_NONE;
1255 	unsigned int maxnonfragsize, headersize;
1256 	unsigned int wmem_alloc_delta = 0;
1257 	bool paged;
1258 
1259 	skb = skb_peek_tail(queue);
1260 	if (!skb) {
1261 		exthdrlen = opt ? opt->opt_flen : 0;
1262 		dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
1263 	}
1264 
1265 	paged = !!cork->gso_size;
1266 	mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize;
1267 	orig_mtu = mtu;
1268 
1269 	hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1270 
1271 	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1272 			(opt ? opt->opt_nflen : 0);
1273 	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
1274 		     sizeof(struct frag_hdr);
1275 
1276 	headersize = sizeof(struct ipv6hdr) +
1277 		     (opt ? opt->opt_flen + opt->opt_nflen : 0) +
1278 		     (dst_allfrag(&rt->dst) ?
1279 		      sizeof(struct frag_hdr) : 0) +
1280 		     rt->rt6i_nfheader_len;
1281 
1282 	/* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit
1283 	 * within the first fragment
1284 	 */
1285 	if (headersize + transhdrlen > mtu)
1286 		goto emsgsize;
1287 
1288 	if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
1289 	    (sk->sk_protocol == IPPROTO_UDP ||
1290 	     sk->sk_protocol == IPPROTO_RAW)) {
1291 		ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
1292 				sizeof(struct ipv6hdr));
1293 		goto emsgsize;
1294 	}
1295 
1296 	if (ip6_sk_ignore_df(sk))
1297 		maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
1298 	else
1299 		maxnonfragsize = mtu;
1300 
1301 	if (cork->length + length > maxnonfragsize - headersize) {
1302 emsgsize:
1303 		pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0);
1304 		ipv6_local_error(sk, EMSGSIZE, fl6, pmtu);
1305 		return -EMSGSIZE;
1306 	}
1307 
1308 	/* CHECKSUM_PARTIAL only with no extension headers and when
1309 	 * we are not going to fragment
1310 	 */
1311 	if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
1312 	    headersize == sizeof(struct ipv6hdr) &&
1313 	    length <= mtu - headersize &&
1314 	    (!(flags & MSG_MORE) || cork->gso_size) &&
1315 	    rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
1316 		csummode = CHECKSUM_PARTIAL;
1317 
1318 	if (sk->sk_type == SOCK_DGRAM || sk->sk_type == SOCK_RAW) {
1319 		sock_tx_timestamp(sk, sockc->tsflags, &tx_flags);
1320 		if (tx_flags & SKBTX_ANY_SW_TSTAMP &&
1321 		    sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
1322 			tskey = sk->sk_tskey++;
1323 	}
1324 
1325 	/*
1326 	 * Let's try using as much space as possible.
1327 	 * Use MTU if total length of the message fits into the MTU.
1328 	 * Otherwise, we need to reserve fragment header and
1329 	 * fragment alignment (= 8-15 octets, in total).
1330 	 *
1331 	 * Note that we may need to "move" the data from the tail
1332 	 * of the buffer to the new fragment when we split
1333 	 * the message.
1334 	 *
1335 	 * FIXME: It may be fragmented into multiple chunks
1336 	 *        at once if non-fragmentable extension headers
1337 	 *        are too large.
1338 	 * --yoshfuji
1339 	 */
1340 
1341 	cork->length += length;
1342 	if (!skb)
1343 		goto alloc_new_skb;
1344 
1345 	while (length > 0) {
1346 		/* Check if the remaining data fits into current packet. */
1347 		copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1348 		if (copy < length)
1349 			copy = maxfraglen - skb->len;
1350 
1351 		if (copy <= 0) {
1352 			char *data;
1353 			unsigned int datalen;
1354 			unsigned int fraglen;
1355 			unsigned int fraggap;
1356 			unsigned int alloclen;
1357 			unsigned int pagedlen = 0;
1358 alloc_new_skb:
1359 			/* There's no room in the current skb */
1360 			if (skb)
1361 				fraggap = skb->len - maxfraglen;
1362 			else
1363 				fraggap = 0;
1364 			/* update mtu and maxfraglen if necessary */
1365 			if (!skb || !skb_prev)
1366 				ip6_append_data_mtu(&mtu, &maxfraglen,
1367 						    fragheaderlen, skb, rt,
1368 						    orig_mtu);
1369 
1370 			skb_prev = skb;
1371 
1372 			/*
1373 			 * If remaining data exceeds the mtu,
1374 			 * we know we need more fragment(s).
1375 			 */
1376 			datalen = length + fraggap;
1377 
1378 			if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1379 				datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
1380 			fraglen = datalen + fragheaderlen;
1381 
1382 			if ((flags & MSG_MORE) &&
1383 			    !(rt->dst.dev->features&NETIF_F_SG))
1384 				alloclen = mtu;
1385 			else if (!paged)
1386 				alloclen = fraglen;
1387 			else {
1388 				alloclen = min_t(int, fraglen, MAX_HEADER);
1389 				pagedlen = fraglen - alloclen;
1390 			}
1391 
1392 			alloclen += dst_exthdrlen;
1393 
1394 			if (datalen != length + fraggap) {
1395 				/*
1396 				 * this is not the last fragment; the trailer
1397 				 * space is regarded as data space.
1398 				 */
1399 				datalen += rt->dst.trailer_len;
1400 			}
1401 
1402 			alloclen += rt->dst.trailer_len;
1403 			fraglen = datalen + fragheaderlen;
1404 
1405 			/*
1406 			 * We just reserve space for fragment header.
1407 			 * Note: this may be overallocation if the message
1408 			 * (without MSG_MORE) fits into the MTU.
1409 			 */
1410 			alloclen += sizeof(struct frag_hdr);
1411 
1412 			copy = datalen - transhdrlen - fraggap - pagedlen;
1413 			if (copy < 0) {
1414 				err = -EINVAL;
1415 				goto error;
1416 			}
1417 			if (transhdrlen) {
1418 				skb = sock_alloc_send_skb(sk,
1419 						alloclen + hh_len,
1420 						(flags & MSG_DONTWAIT), &err);
1421 			} else {
1422 				skb = NULL;
1423 				if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <=
1424 				    2 * sk->sk_sndbuf)
1425 					skb = alloc_skb(alloclen + hh_len,
1426 							sk->sk_allocation);
1427 				if (unlikely(!skb))
1428 					err = -ENOBUFS;
1429 			}
1430 			if (!skb)
1431 				goto error;
1432 			/*
1433 			 *	Fill in the control structures
1434 			 */
1435 			skb->protocol = htons(ETH_P_IPV6);
1436 			skb->ip_summed = csummode;
1437 			skb->csum = 0;
1438 			/* reserve for fragmentation and ipsec header */
1439 			skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1440 				    dst_exthdrlen);
1441 
1442 			/* Only the initial fragment is time stamped */
1443 			skb_shinfo(skb)->tx_flags = tx_flags;
1444 			tx_flags = 0;
1445 			skb_shinfo(skb)->tskey = tskey;
1446 			tskey = 0;
1447 
1448 			/*
1449 			 *	Find where to start putting bytes
1450 			 */
1451 			data = skb_put(skb, fraglen - pagedlen);
1452 			skb_set_network_header(skb, exthdrlen);
1453 			data += fragheaderlen;
1454 			skb->transport_header = (skb->network_header +
1455 						 fragheaderlen);
1456 			if (fraggap) {
1457 				skb->csum = skb_copy_and_csum_bits(
1458 					skb_prev, maxfraglen,
1459 					data + transhdrlen, fraggap, 0);
1460 				skb_prev->csum = csum_sub(skb_prev->csum,
1461 							  skb->csum);
1462 				data += fraggap;
1463 				pskb_trim_unique(skb_prev, maxfraglen);
1464 			}
1465 			if (copy > 0 &&
1466 			    getfrag(from, data + transhdrlen, offset,
1467 				    copy, fraggap, skb) < 0) {
1468 				err = -EFAULT;
1469 				kfree_skb(skb);
1470 				goto error;
1471 			}
1472 
1473 			offset += copy;
1474 			length -= copy + transhdrlen;
1475 			transhdrlen = 0;
1476 			exthdrlen = 0;
1477 			dst_exthdrlen = 0;
1478 
1479 			if ((flags & MSG_CONFIRM) && !skb_prev)
1480 				skb_set_dst_pending_confirm(skb, 1);
1481 
1482 			/*
1483 			 * Put the packet on the pending queue
1484 			 */
1485 			if (!skb->destructor) {
1486 				skb->destructor = sock_wfree;
1487 				skb->sk = sk;
1488 				wmem_alloc_delta += skb->truesize;
1489 			}
1490 			__skb_queue_tail(queue, skb);
1491 			continue;
1492 		}
1493 
1494 		if (copy > length)
1495 			copy = length;
1496 
1497 		if (!(rt->dst.dev->features&NETIF_F_SG) &&
1498 		    skb_tailroom(skb) >= copy) {
1499 			unsigned int off;
1500 
1501 			off = skb->len;
1502 			if (getfrag(from, skb_put(skb, copy),
1503 						offset, copy, off, skb) < 0) {
1504 				__skb_trim(skb, off);
1505 				err = -EFAULT;
1506 				goto error;
1507 			}
1508 		} else {
1509 			int i = skb_shinfo(skb)->nr_frags;
1510 
1511 			err = -ENOMEM;
1512 			if (!sk_page_frag_refill(sk, pfrag))
1513 				goto error;
1514 
1515 			if (!skb_can_coalesce(skb, i, pfrag->page,
1516 					      pfrag->offset)) {
1517 				err = -EMSGSIZE;
1518 				if (i == MAX_SKB_FRAGS)
1519 					goto error;
1520 
1521 				__skb_fill_page_desc(skb, i, pfrag->page,
1522 						     pfrag->offset, 0);
1523 				skb_shinfo(skb)->nr_frags = ++i;
1524 				get_page(pfrag->page);
1525 			}
1526 			copy = min_t(int, copy, pfrag->size - pfrag->offset);
1527 			if (getfrag(from,
1528 				    page_address(pfrag->page) + pfrag->offset,
1529 				    offset, copy, skb->len, skb) < 0)
1530 				goto error_efault;
1531 
1532 			pfrag->offset += copy;
1533 			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1534 			skb->len += copy;
1535 			skb->data_len += copy;
1536 			skb->truesize += copy;
1537 			wmem_alloc_delta += copy;
1538 		}
1539 		offset += copy;
1540 		length -= copy;
1541 	}
1542 
1543 	if (wmem_alloc_delta)
1544 		refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1545 	return 0;
1546 
1547 error_efault:
1548 	err = -EFAULT;
1549 error:
1550 	cork->length -= length;
1551 	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1552 	refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1553 	return err;
1554 }
1555 
1556 int ip6_append_data(struct sock *sk,
1557 		    int getfrag(void *from, char *to, int offset, int len,
1558 				int odd, struct sk_buff *skb),
1559 		    void *from, int length, int transhdrlen,
1560 		    struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1561 		    struct rt6_info *rt, unsigned int flags,
1562 		    const struct sockcm_cookie *sockc)
1563 {
1564 	struct inet_sock *inet = inet_sk(sk);
1565 	struct ipv6_pinfo *np = inet6_sk(sk);
1566 	int exthdrlen;
1567 	int err;
1568 
1569 	if (flags&MSG_PROBE)
1570 		return 0;
1571 	if (skb_queue_empty(&sk->sk_write_queue)) {
1572 		/*
1573 		 * setup for corking
1574 		 */
1575 		err = ip6_setup_cork(sk, &inet->cork, &np->cork,
1576 				     ipc6, rt, fl6);
1577 		if (err)
1578 			return err;
1579 
1580 		exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1581 		length += exthdrlen;
1582 		transhdrlen += exthdrlen;
1583 	} else {
1584 		fl6 = &inet->cork.fl.u.ip6;
1585 		transhdrlen = 0;
1586 	}
1587 
1588 	return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base,
1589 				 &np->cork, sk_page_frag(sk), getfrag,
1590 				 from, length, transhdrlen, flags, ipc6, sockc);
1591 }
1592 EXPORT_SYMBOL_GPL(ip6_append_data);
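
/*
 * Example: a minimal sketch (not part of this file) of the usual
 * pairing for datagram sends; the "corked" flag and error handling are
 * the caller's business:
 *
 *	lock_sock(sk);
 *	err = ip6_append_data(sk, getfrag, msg, len,
 *			      sizeof(struct udphdr), &ipc6, &fl6, rt,
 *			      flags, &sockc);
 *	if (err)
 *		ip6_flush_pending_frames(sk);
 *	else if (!corked)
 *		err = ip6_push_pending_frames(sk);
 *	release_sock(sk);
 */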
1593 
1594 static void ip6_cork_release(struct inet_cork_full *cork,
1595 			     struct inet6_cork *v6_cork)
1596 {
1597 	if (v6_cork->opt) {
1598 		kfree(v6_cork->opt->dst0opt);
1599 		kfree(v6_cork->opt->dst1opt);
1600 		kfree(v6_cork->opt->hopopt);
1601 		kfree(v6_cork->opt->srcrt);
1602 		kfree(v6_cork->opt);
1603 		v6_cork->opt = NULL;
1604 	}
1605 
1606 	if (cork->base.dst) {
1607 		dst_release(cork->base.dst);
1608 		cork->base.dst = NULL;
1609 		cork->base.flags &= ~IPCORK_ALLFRAG;
1610 	}
1611 	memset(&cork->fl, 0, sizeof(cork->fl));
1612 }
1613 
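/*
 * Collapse the queued skbs into one packet (extra skbs become the
 * frag_list), push any tx options and the IPv6 header from the cork
 * state, then release the cork.
 */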
1614 struct sk_buff *__ip6_make_skb(struct sock *sk,
1615 			       struct sk_buff_head *queue,
1616 			       struct inet_cork_full *cork,
1617 			       struct inet6_cork *v6_cork)
1618 {
1619 	struct sk_buff *skb, *tmp_skb;
1620 	struct sk_buff **tail_skb;
1621 	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1622 	struct ipv6_pinfo *np = inet6_sk(sk);
1623 	struct net *net = sock_net(sk);
1624 	struct ipv6hdr *hdr;
1625 	struct ipv6_txoptions *opt = v6_cork->opt;
1626 	struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
1627 	struct flowi6 *fl6 = &cork->fl.u.ip6;
1628 	unsigned char proto = fl6->flowi6_proto;
1629 
1630 	skb = __skb_dequeue(queue);
1631 	if (!skb)
1632 		goto out;
1633 	tail_skb = &(skb_shinfo(skb)->frag_list);
1634 
1635 	/* move skb->data to ip header from ext header */
1636 	if (skb->data < skb_network_header(skb))
1637 		__skb_pull(skb, skb_network_offset(skb));
1638 	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
1639 		__skb_pull(tmp_skb, skb_network_header_len(skb));
1640 		*tail_skb = tmp_skb;
1641 		tail_skb = &(tmp_skb->next);
1642 		skb->len += tmp_skb->len;
1643 		skb->data_len += tmp_skb->len;
1644 		skb->truesize += tmp_skb->truesize;
1645 		tmp_skb->destructor = NULL;
1646 		tmp_skb->sk = NULL;
1647 	}
1648 
1649 	/* Allow local fragmentation. */
1650 	skb->ignore_df = ip6_sk_ignore_df(sk);
1651 
1652 	*final_dst = fl6->daddr;
1653 	__skb_pull(skb, skb_network_header_len(skb));
1654 	if (opt && opt->opt_flen)
1655 		ipv6_push_frag_opts(skb, opt, &proto);
1656 	if (opt && opt->opt_nflen)
1657 		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);
1658 
1659 	skb_push(skb, sizeof(struct ipv6hdr));
1660 	skb_reset_network_header(skb);
1661 	hdr = ipv6_hdr(skb);
1662 
1663 	ip6_flow_hdr(hdr, v6_cork->tclass,
1664 		     ip6_make_flowlabel(net, skb, fl6->flowlabel,
1665 					ip6_autoflowlabel(net, np), fl6));
1666 	hdr->hop_limit = v6_cork->hop_limit;
1667 	hdr->nexthdr = proto;
1668 	hdr->saddr = fl6->saddr;
1669 	hdr->daddr = *final_dst;
1670 
1671 	skb->priority = sk->sk_priority;
1672 	skb->mark = sk->sk_mark;
1673 
1674 	skb_dst_set(skb, dst_clone(&rt->dst));
1675 	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1676 	if (proto == IPPROTO_ICMPV6) {
1677 		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1678 
1679 		ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
1680 		ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
1681 	}
1682 
1683 	ip6_cork_release(cork, v6_cork);
1684 out:
1685 	return skb;
1686 }
1687 
1688 int ip6_send_skb(struct sk_buff *skb)
1689 {
1690 	struct net *net = sock_net(skb->sk);
1691 	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
1692 	int err;
1693 
1694 	err = ip6_local_out(net, skb->sk, skb);
1695 	if (err) {
1696 		if (err > 0)
1697 			err = net_xmit_errno(err);
1698 		if (err)
1699 			IP6_INC_STATS(net, rt->rt6i_idev,
1700 				      IPSTATS_MIB_OUTDISCARDS);
1701 	}
1702 
1703 	return err;
1704 }
1705 
1706 int ip6_push_pending_frames(struct sock *sk)
1707 {
1708 	struct sk_buff *skb;
1709 
1710 	skb = ip6_finish_skb(sk);
1711 	if (!skb)
1712 		return 0;
1713 
1714 	return ip6_send_skb(skb);
1715 }
1716 EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
1717 
1718 static void __ip6_flush_pending_frames(struct sock *sk,
1719 				       struct sk_buff_head *queue,
1720 				       struct inet_cork_full *cork,
1721 				       struct inet6_cork *v6_cork)
1722 {
1723 	struct sk_buff *skb;
1724 
1725 	while ((skb = __skb_dequeue_tail(queue)) != NULL) {
1726 		if (skb_dst(skb))
1727 			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1728 				      IPSTATS_MIB_OUTDISCARDS);
1729 		kfree_skb(skb);
1730 	}
1731 
1732 	ip6_cork_release(cork, v6_cork);
1733 }
1734 
1735 void ip6_flush_pending_frames(struct sock *sk)
1736 {
1737 	__ip6_flush_pending_frames(sk, &sk->sk_write_queue,
1738 				   &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
1739 }
1740 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
1741 
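/*
 * One-shot variant of append + make: build the datagram on a private
 * queue with caller-supplied cork state, so nothing is left corked on
 * the socket.
 */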
1742 struct sk_buff *ip6_make_skb(struct sock *sk,
1743 			     int getfrag(void *from, char *to, int offset,
1744 					 int len, int odd, struct sk_buff *skb),
1745 			     void *from, int length, int transhdrlen,
1746 			     struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1747 			     struct rt6_info *rt, unsigned int flags,
1748 			     struct inet_cork_full *cork,
1749 			     const struct sockcm_cookie *sockc)
1750 {
1751 	struct inet6_cork v6_cork;
1752 	struct sk_buff_head queue;
1753 	int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1754 	int err;
1755 
1756 	if (flags & MSG_PROBE)
1757 		return NULL;
1758 
1759 	__skb_queue_head_init(&queue);
1760 
1761 	cork->base.flags = 0;
1762 	cork->base.addr = 0;
1763 	cork->base.opt = NULL;
1764 	cork->base.dst = NULL;
1765 	v6_cork.opt = NULL;
1766 	err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt, fl6);
1767 	if (err) {
1768 		ip6_cork_release(cork, &v6_cork);
1769 		return ERR_PTR(err);
1770 	}
1771 	if (ipc6->dontfrag < 0)
1772 		ipc6->dontfrag = inet6_sk(sk)->dontfrag;
1773 
1774 	err = __ip6_append_data(sk, fl6, &queue, &cork->base, &v6_cork,
1775 				&current->task_frag, getfrag, from,
1776 				length + exthdrlen, transhdrlen + exthdrlen,
1777 				flags, ipc6, sockc);
1778 	if (err) {
1779 		__ip6_flush_pending_frames(sk, &queue, cork, &v6_cork);
1780 		return ERR_PTR(err);
1781 	}
1782 
1783 	return __ip6_make_skb(sk, &queue, cork, &v6_cork);
1784 }
1785