xref: /openbmc/linux/net/ipv6/ip6_output.c (revision ca90578000afb0d8f177ea36f7259a9c3640cf49)
1 /*
2  *	IPv6 output functions
3  *	Linux INET6 implementation
4  *
5  *	Authors:
6  *	Pedro Roque		<roque@di.fc.ul.pt>
7  *
8  *	Based on linux/net/ipv4/ip_output.c
9  *
10  *	This program is free software; you can redistribute it and/or
11  *      modify it under the terms of the GNU General Public License
12  *      as published by the Free Software Foundation; either version
13  *      2 of the License, or (at your option) any later version.
14  *
15  *	Changes:
16  *	A.N.Kuznetsov	:	arithmetics in fragmentation.
17  *				extension headers are implemented.
18  *				route changes now work.
19  *				ip6_forward does not confuse sniffers.
20  *				etc.
21  *
22  *      H. von Brand    :       Added missing #include <linux/string.h>
23  *	Imran Patel	:	frag id should be in NBO
24  *      Kazunori MIYAZAWA @USAGI
25  *			:       add ip6_append_data and related functions
26  *				for datagram xmit
27  */
28 
29 #include <linux/errno.h>
30 #include <linux/kernel.h>
31 #include <linux/string.h>
32 #include <linux/socket.h>
33 #include <linux/net.h>
34 #include <linux/netdevice.h>
35 #include <linux/if_arp.h>
36 #include <linux/in6.h>
37 #include <linux/tcp.h>
38 #include <linux/route.h>
39 #include <linux/module.h>
40 #include <linux/slab.h>
41 
42 #include <linux/bpf-cgroup.h>
43 #include <linux/netfilter.h>
44 #include <linux/netfilter_ipv6.h>
45 
46 #include <net/sock.h>
47 #include <net/snmp.h>
48 
49 #include <net/ipv6.h>
50 #include <net/ndisc.h>
51 #include <net/protocol.h>
52 #include <net/ip6_route.h>
53 #include <net/addrconf.h>
54 #include <net/rawv6.h>
55 #include <net/icmp.h>
56 #include <net/xfrm.h>
57 #include <net/checksum.h>
58 #include <linux/mroute6.h>
59 #include <net/l3mdev.h>
60 #include <net/lwtunnel.h>
61 
62 static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
63 {
64 	struct dst_entry *dst = skb_dst(skb);
65 	struct net_device *dev = dst->dev;
66 	struct neighbour *neigh;
67 	struct in6_addr *nexthop;
68 	int ret;
69 
70 	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
71 		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
72 
73 		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
74 		    ((mroute6_is_socket(net, skb) &&
75 		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
76 		     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
77 					 &ipv6_hdr(skb)->saddr))) {
78 			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
79 
80 			/* Do not check for IFF_ALLMULTI; multicast routing
81 			   is not supported in any case.
82 			 */
83 			if (newskb)
84 				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
85 					net, sk, newskb, NULL, newskb->dev,
86 					dev_loopback_xmit);
87 
88 			if (ipv6_hdr(skb)->hop_limit == 0) {
89 				IP6_INC_STATS(net, idev,
90 					      IPSTATS_MIB_OUTDISCARDS);
91 				kfree_skb(skb);
92 				return 0;
93 			}
94 		}
95 
96 		IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);
97 
98 		if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
99 		    IPV6_ADDR_SCOPE_NODELOCAL &&
100 		    !(dev->flags & IFF_LOOPBACK)) {
101 			kfree_skb(skb);
102 			return 0;
103 		}
104 	}
105 
106 	if (lwtunnel_xmit_redirect(dst->lwtstate)) {
107 		int res = lwtunnel_xmit(skb);
108 
109 		if (res < 0 || res == LWTUNNEL_XMIT_DONE)
110 			return res;
111 	}
112 
113 	rcu_read_lock_bh();
114 	nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
115 	neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
116 	if (unlikely(!neigh))
117 		neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
118 	if (!IS_ERR(neigh)) {
119 		sock_confirm_neigh(skb, neigh);
120 		ret = neigh_output(neigh, skb);
121 		rcu_read_unlock_bh();
122 		return ret;
123 	}
124 	rcu_read_unlock_bh();
125 
126 	IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
127 	kfree_skb(skb);
128 	return -EINVAL;
129 }
130 
131 static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
132 {
133 	int ret;
134 
135 	ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
136 	if (ret) {
137 		kfree_skb(skb);
138 		return ret;
139 	}
140 
141 #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
142 	/* Policy lookup after SNAT yielded a new policy */
143 	if (skb_dst(skb)->xfrm) {
144 		IPCB(skb)->flags |= IPSKB_REROUTED;
145 		return dst_output(net, sk, skb);
146 	}
147 #endif
148 
149 	if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
150 	    dst_allfrag(skb_dst(skb)) ||
151 	    (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
152 		return ip6_fragment(net, sk, skb, ip6_finish_output2);
153 	else
154 		return ip6_finish_output2(net, sk, skb);
155 }
156 
157 int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
158 {
159 	struct net_device *dev = skb_dst(skb)->dev;
160 	struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
161 
162 	skb->protocol = htons(ETH_P_IPV6);
163 	skb->dev = dev;
164 
165 	if (unlikely(idev->cnf.disable_ipv6)) {
166 		IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
167 		kfree_skb(skb);
168 		return 0;
169 	}
170 
171 	return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
172 			    net, sk, skb, NULL, dev,
173 			    ip6_finish_output,
174 			    !(IP6CB(skb)->flags & IP6SKB_REROUTED));
175 }
176 
177 bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np)
178 {
179 	if (!np->autoflowlabel_set)
180 		return ip6_default_np_autolabel(net);
181 	else
182 		return np->autoflowlabel;
183 }
184 
185 /*
186  * xmit an sk_buff (used by TCP, SCTP and DCCP)
187  * Note : socket lock is not held for SYNACK packets, but might be modified
188  * by calls to skb_set_owner_w() and ipv6_local_error(),
189  * which are using proper atomic operations or spinlocks.
190  */
191 int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
192 	     __u32 mark, struct ipv6_txoptions *opt, int tclass)
193 {
194 	struct net *net = sock_net(sk);
195 	const struct ipv6_pinfo *np = inet6_sk(sk);
196 	struct in6_addr *first_hop = &fl6->daddr;
197 	struct dst_entry *dst = skb_dst(skb);
198 	struct ipv6hdr *hdr;
199 	u8  proto = fl6->flowi6_proto;
200 	int seg_len = skb->len;
201 	int hlimit = -1;
202 	u32 mtu;
203 
204 	if (opt) {
205 		unsigned int head_room;
206 
207 		/* First: exthdrs may take lots of space (~8K for now)
208 		   MAX_HEADER is not enough.
209 		 */
210 		head_room = opt->opt_nflen + opt->opt_flen;
211 		seg_len += head_room;
212 		head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
213 
214 		if (skb_headroom(skb) < head_room) {
215 			struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
216 			if (!skb2) {
217 				IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
218 					      IPSTATS_MIB_OUTDISCARDS);
219 				kfree_skb(skb);
220 				return -ENOBUFS;
221 			}
222 			consume_skb(skb);
223 			skb = skb2;
224 			/* skb_set_owner_w() changes sk->sk_wmem_alloc atomically,
225 			 * it is safe to call in our context (socket lock not held)
226 			 */
227 			skb_set_owner_w(skb, (struct sock *)sk);
228 		}
229 		if (opt->opt_flen)
230 			ipv6_push_frag_opts(skb, opt, &proto);
231 		if (opt->opt_nflen)
232 			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
233 					     &fl6->saddr);
234 	}
235 
236 	skb_push(skb, sizeof(struct ipv6hdr));
237 	skb_reset_network_header(skb);
238 	hdr = ipv6_hdr(skb);
239 
240 	/*
241 	 *	Fill in the IPv6 header
242 	 */
243 	if (np)
244 		hlimit = np->hop_limit;
245 	if (hlimit < 0)
246 		hlimit = ip6_dst_hoplimit(dst);
247 
248 	ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
249 				ip6_autoflowlabel(net, np), fl6));
250 
251 	hdr->payload_len = htons(seg_len);
252 	hdr->nexthdr = proto;
253 	hdr->hop_limit = hlimit;
254 
255 	hdr->saddr = fl6->saddr;
256 	hdr->daddr = *first_hop;
257 
258 	skb->protocol = htons(ETH_P_IPV6);
259 	skb->priority = sk->sk_priority;
260 	skb->mark = mark;
261 
262 	mtu = dst_mtu(dst);
263 	if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
264 		IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
265 			      IPSTATS_MIB_OUT, skb->len);
266 
267 		/* if egress device is enslaved to an L3 master device pass the
268 		 * skb to its handler for processing
269 		 */
270 		skb = l3mdev_ip6_out((struct sock *)sk, skb);
271 		if (unlikely(!skb))
272 			return 0;
273 
274 		/* hooks should never assume socket lock is held.
275 		 * we promote our socket to non const
276 		 */
277 		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
278 			       net, (struct sock *)sk, skb, NULL, dst->dev,
279 			       dst_output);
280 	}
281 
282 	skb->dev = dst->dev;
283 	/* ipv6_local_error() does not require socket lock,
284 	 * we promote our socket to non const
285 	 */
286 	ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);
287 
288 	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
289 	kfree_skb(skb);
290 	return -EMSGSIZE;
291 }
292 EXPORT_SYMBOL(ip6_xmit);
293 
294 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
295 {
296 	struct ip6_ra_chain *ra;
297 	struct sock *last = NULL;
298 
299 	read_lock(&ip6_ra_lock);
300 	for (ra = ip6_ra_chain; ra; ra = ra->next) {
301 		struct sock *sk = ra->sk;
302 		if (sk && ra->sel == sel &&
303 		    (!sk->sk_bound_dev_if ||
304 		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
305 			if (last) {
306 				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
307 				if (skb2)
308 					rawv6_rcv(last, skb2);
309 			}
310 			last = sk;
311 		}
312 	}
313 
314 	if (last) {
315 		rawv6_rcv(last, skb);
316 		read_unlock(&ip6_ra_lock);
317 		return 1;
318 	}
319 	read_unlock(&ip6_ra_lock);
320 	return 0;
321 }
322 
323 static int ip6_forward_proxy_check(struct sk_buff *skb)
324 {
325 	struct ipv6hdr *hdr = ipv6_hdr(skb);
326 	u8 nexthdr = hdr->nexthdr;
327 	__be16 frag_off;
328 	int offset;
329 
330 	if (ipv6_ext_hdr(nexthdr)) {
331 		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
332 		if (offset < 0)
333 			return 0;
334 	} else
335 		offset = sizeof(struct ipv6hdr);
336 
337 	if (nexthdr == IPPROTO_ICMPV6) {
338 		struct icmp6hdr *icmp6;
339 
340 		if (!pskb_may_pull(skb, (skb_network_header(skb) +
341 					 offset + 1 - skb->data)))
342 			return 0;
343 
344 		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
345 
346 		switch (icmp6->icmp6_type) {
347 		case NDISC_ROUTER_SOLICITATION:
348 		case NDISC_ROUTER_ADVERTISEMENT:
349 		case NDISC_NEIGHBOUR_SOLICITATION:
350 		case NDISC_NEIGHBOUR_ADVERTISEMENT:
351 		case NDISC_REDIRECT:
352 			/* For reaction involving unicast neighbor discovery
353 			 * message destined to the proxied address, pass it to
354 			 * input function.
355 			 */
356 			return 1;
357 		default:
358 			break;
359 		}
360 	}
361 
362 	/*
363 	 * The proxying router can't forward traffic sent to a link-local
364 	 * address, so signal the sender and discard the packet. This
365 	 * behavior is clarified by the MIPv6 specification.
366 	 */
367 	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
368 		dst_link_failure(skb);
369 		return -1;
370 	}
371 
372 	return 0;
373 }
374 
375 static inline int ip6_forward_finish(struct net *net, struct sock *sk,
376 				     struct sk_buff *skb)
377 {
378 	struct dst_entry *dst = skb_dst(skb);
379 
380 	__IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
381 	__IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
382 
383 	return dst_output(net, sk, skb);
384 }
385 
386 unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst)
387 {
388 	unsigned int mtu;
389 	struct inet6_dev *idev;
390 
391 	if (dst_metric_locked(dst, RTAX_MTU)) {
392 		mtu = dst_metric_raw(dst, RTAX_MTU);
393 		if (mtu)
394 			return mtu;
395 	}
396 
397 	mtu = IPV6_MIN_MTU;
398 	rcu_read_lock();
399 	idev = __in6_dev_get(dst->dev);
400 	if (idev)
401 		mtu = idev->cnf.mtu6;
402 	rcu_read_unlock();
403 
404 	return mtu;
405 }
406 EXPORT_SYMBOL_GPL(ip6_dst_mtu_forward);
407 
408 static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
409 {
410 	if (skb->len <= mtu)
411 		return false;
412 
413 	/* ipv6 conntrack defrag sets max_frag_size + ignore_df */
414 	if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
415 		return true;
416 
417 	if (skb->ignore_df)
418 		return false;
419 
420 	if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
421 		return false;
422 
423 	return true;
424 }
425 
426 int ip6_forward(struct sk_buff *skb)
427 {
428 	struct dst_entry *dst = skb_dst(skb);
429 	struct ipv6hdr *hdr = ipv6_hdr(skb);
430 	struct inet6_skb_parm *opt = IP6CB(skb);
431 	struct net *net = dev_net(dst->dev);
432 	u32 mtu;
433 
434 	if (net->ipv6.devconf_all->forwarding == 0)
435 		goto error;
436 
437 	if (skb->pkt_type != PACKET_HOST)
438 		goto drop;
439 
440 	if (unlikely(skb->sk))
441 		goto drop;
442 
443 	if (skb_warn_if_lro(skb))
444 		goto drop;
445 
446 	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
447 		__IP6_INC_STATS(net, ip6_dst_idev(dst),
448 				IPSTATS_MIB_INDISCARDS);
449 		goto drop;
450 	}
451 
452 	skb_forward_csum(skb);
453 
454 	/*
455 	 *	We DO NOT make any processing on
456 	 *	RA packets, pushing them to user level AS IS
457 	 *	without ane WARRANTY that application will be able
458 	 *	to interpret them. The reason is that we
459 	 *	cannot make anything clever here.
460 	 *
461 	 *	We are not end-node, so that if packet contains
462 	 *	AH/ESP, we cannot make anything.
463 	 *	Defragmentation also would be mistake, RA packets
464 	 *	cannot be fragmented, because there is no warranty
465 	 *	that different fragments will go along one path. --ANK
466 	 */
467 	if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
468 		if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
469 			return 0;
470 	}
471 
472 	/*
473 	 *	check and decrement ttl
474 	 */
475 	if (hdr->hop_limit <= 1) {
476 		/* Force OUTPUT device used as source address */
477 		skb->dev = dst->dev;
478 		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
479 		__IP6_INC_STATS(net, ip6_dst_idev(dst),
480 				IPSTATS_MIB_INHDRERRORS);
481 
482 		kfree_skb(skb);
483 		return -ETIMEDOUT;
484 	}
485 
486 	/* XXX: idev->cnf.proxy_ndp? */
487 	if (net->ipv6.devconf_all->proxy_ndp &&
488 	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
489 		int proxied = ip6_forward_proxy_check(skb);
490 		if (proxied > 0)
491 			return ip6_input(skb);
492 		else if (proxied < 0) {
493 			__IP6_INC_STATS(net, ip6_dst_idev(dst),
494 					IPSTATS_MIB_INDISCARDS);
495 			goto drop;
496 		}
497 	}
498 
499 	if (!xfrm6_route_forward(skb)) {
500 		__IP6_INC_STATS(net, ip6_dst_idev(dst),
501 				IPSTATS_MIB_INDISCARDS);
502 		goto drop;
503 	}
504 	dst = skb_dst(skb);
505 
506 	/* IPv6 specs say nothing about it, but it is clear that we cannot
507 	   send redirects to source routed frames.
508 	   We don't send redirects to frames decapsulated from IPsec.
509 	 */
510 	if (skb->dev == dst->dev && opt->srcrt == 0 && !skb_sec_path(skb)) {
511 		struct in6_addr *target = NULL;
512 		struct inet_peer *peer;
513 		struct rt6_info *rt;
514 
515 		/*
516 		 *	incoming and outgoing devices are the same
517 		 *	send a redirect.
518 		 */
519 
520 		rt = (struct rt6_info *) dst;
521 		if (rt->rt6i_flags & RTF_GATEWAY)
522 			target = &rt->rt6i_gateway;
523 		else
524 			target = &hdr->daddr;
525 
526 		peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);
527 
528 		/* Limit redirects both by destination (here)
529 		   and by source (inside ndisc_send_redirect)
530 		 */
531 		if (inet_peer_xrlim_allow(peer, 1*HZ))
532 			ndisc_send_redirect(skb, target);
533 		if (peer)
534 			inet_putpeer(peer);
535 	} else {
536 		int addrtype = ipv6_addr_type(&hdr->saddr);
537 
538 		/* This check is security critical. */
539 		if (addrtype == IPV6_ADDR_ANY ||
540 		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
541 			goto error;
542 		if (addrtype & IPV6_ADDR_LINKLOCAL) {
543 			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
544 				    ICMPV6_NOT_NEIGHBOUR, 0);
545 			goto error;
546 		}
547 	}
548 
549 	mtu = ip6_dst_mtu_forward(dst);
550 	if (mtu < IPV6_MIN_MTU)
551 		mtu = IPV6_MIN_MTU;
552 
553 	if (ip6_pkt_too_big(skb, mtu)) {
554 		/* Again, force OUTPUT device used as source address */
555 		skb->dev = dst->dev;
556 		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
557 		__IP6_INC_STATS(net, ip6_dst_idev(dst),
558 				IPSTATS_MIB_INTOOBIGERRORS);
559 		__IP6_INC_STATS(net, ip6_dst_idev(dst),
560 				IPSTATS_MIB_FRAGFAILS);
561 		kfree_skb(skb);
562 		return -EMSGSIZE;
563 	}
564 
565 	if (skb_cow(skb, dst->dev->hard_header_len)) {
566 		__IP6_INC_STATS(net, ip6_dst_idev(dst),
567 				IPSTATS_MIB_OUTDISCARDS);
568 		goto drop;
569 	}
570 
571 	hdr = ipv6_hdr(skb);
572 
573 	/* Mangling hops number delayed to point after skb COW */
574 
575 	hdr->hop_limit--;
576 
577 	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
578 		       net, NULL, skb, skb->dev, dst->dev,
579 		       ip6_forward_finish);
580 
581 error:
582 	__IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
583 drop:
584 	kfree_skb(skb);
585 	return -EINVAL;
586 }
587 
588 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
589 {
590 	to->pkt_type = from->pkt_type;
591 	to->priority = from->priority;
592 	to->protocol = from->protocol;
593 	skb_dst_drop(to);
594 	skb_dst_set(to, dst_clone(skb_dst(from)));
595 	to->dev = from->dev;
596 	to->mark = from->mark;
597 
598 #ifdef CONFIG_NET_SCHED
599 	to->tc_index = from->tc_index;
600 #endif
601 	nf_copy(to, from);
602 	skb_copy_secmark(to, from);
603 }
604 
605 int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
606 		 int (*output)(struct net *, struct sock *, struct sk_buff *))
607 {
608 	struct sk_buff *frag;
609 	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
610 	struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
611 				inet6_sk(skb->sk) : NULL;
612 	struct ipv6hdr *tmp_hdr;
613 	struct frag_hdr *fh;
614 	unsigned int mtu, hlen, left, len;
615 	int hroom, troom;
616 	__be32 frag_id;
617 	int ptr, offset = 0, err = 0;
618 	u8 *prevhdr, nexthdr = 0;
619 
620 	err = ip6_find_1stfragopt(skb, &prevhdr);
621 	if (err < 0)
622 		goto fail;
623 	hlen = err;
624 	nexthdr = *prevhdr;
625 
626 	mtu = ip6_skb_dst_mtu(skb);
627 
628 	/* We must not fragment if the socket is set to force MTU discovery
629 	 * or if the skb it not generated by a local socket.
630 	 */
631 	if (unlikely(!skb->ignore_df && skb->len > mtu))
632 		goto fail_toobig;
633 
634 	if (IP6CB(skb)->frag_max_size) {
635 		if (IP6CB(skb)->frag_max_size > mtu)
636 			goto fail_toobig;
637 
638 		/* don't send fragments larger than what we received */
639 		mtu = IP6CB(skb)->frag_max_size;
640 		if (mtu < IPV6_MIN_MTU)
641 			mtu = IPV6_MIN_MTU;
642 	}
643 
644 	if (np && np->frag_size < mtu) {
645 		if (np->frag_size)
646 			mtu = np->frag_size;
647 	}
648 	if (mtu < hlen + sizeof(struct frag_hdr) + 8)
649 		goto fail_toobig;
650 	mtu -= hlen + sizeof(struct frag_hdr);
651 
652 	frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
653 				    &ipv6_hdr(skb)->saddr);
654 
655 	if (skb->ip_summed == CHECKSUM_PARTIAL &&
656 	    (err = skb_checksum_help(skb)))
657 		goto fail;
658 
659 	hroom = LL_RESERVED_SPACE(rt->dst.dev);
660 	if (skb_has_frag_list(skb)) {
661 		unsigned int first_len = skb_pagelen(skb);
662 		struct sk_buff *frag2;
663 
664 		if (first_len - hlen > mtu ||
665 		    ((first_len - hlen) & 7) ||
666 		    skb_cloned(skb) ||
667 		    skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
668 			goto slow_path;
669 
670 		skb_walk_frags(skb, frag) {
671 			/* Correct geometry. */
672 			if (frag->len > mtu ||
673 			    ((frag->len & 7) && frag->next) ||
674 			    skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
675 				goto slow_path_clean;
676 
677 			/* Partially cloned skb? */
678 			if (skb_shared(frag))
679 				goto slow_path_clean;
680 
681 			BUG_ON(frag->sk);
682 			if (skb->sk) {
683 				frag->sk = skb->sk;
684 				frag->destructor = sock_wfree;
685 			}
686 			skb->truesize -= frag->truesize;
687 		}
688 
689 		err = 0;
690 		offset = 0;
691 		/* BUILD HEADER */
692 
693 		*prevhdr = NEXTHDR_FRAGMENT;
694 		tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
695 		if (!tmp_hdr) {
696 			err = -ENOMEM;
697 			goto fail;
698 		}
699 		frag = skb_shinfo(skb)->frag_list;
700 		skb_frag_list_init(skb);
701 
702 		__skb_pull(skb, hlen);
703 		fh = __skb_push(skb, sizeof(struct frag_hdr));
704 		__skb_push(skb, hlen);
705 		skb_reset_network_header(skb);
706 		memcpy(skb_network_header(skb), tmp_hdr, hlen);
707 
708 		fh->nexthdr = nexthdr;
709 		fh->reserved = 0;
710 		fh->frag_off = htons(IP6_MF);
711 		fh->identification = frag_id;
712 
713 		first_len = skb_pagelen(skb);
714 		skb->data_len = first_len - skb_headlen(skb);
715 		skb->len = first_len;
716 		ipv6_hdr(skb)->payload_len = htons(first_len -
717 						   sizeof(struct ipv6hdr));
718 
719 		for (;;) {
720 			/* Prepare header of the next frame,
721 			 * before previous one went down. */
722 			if (frag) {
723 				frag->ip_summed = CHECKSUM_NONE;
724 				skb_reset_transport_header(frag);
725 				fh = __skb_push(frag, sizeof(struct frag_hdr));
726 				__skb_push(frag, hlen);
727 				skb_reset_network_header(frag);
728 				memcpy(skb_network_header(frag), tmp_hdr,
729 				       hlen);
730 				offset += skb->len - hlen - sizeof(struct frag_hdr);
731 				fh->nexthdr = nexthdr;
732 				fh->reserved = 0;
733 				fh->frag_off = htons(offset);
734 				if (frag->next)
735 					fh->frag_off |= htons(IP6_MF);
736 				fh->identification = frag_id;
737 				ipv6_hdr(frag)->payload_len =
738 						htons(frag->len -
739 						      sizeof(struct ipv6hdr));
740 				ip6_copy_metadata(frag, skb);
741 			}
742 
743 			err = output(net, sk, skb);
744 			if (!err)
745 				IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
746 					      IPSTATS_MIB_FRAGCREATES);
747 
748 			if (err || !frag)
749 				break;
750 
751 			skb = frag;
752 			frag = skb->next;
753 			skb->next = NULL;
754 		}
755 
756 		kfree(tmp_hdr);
757 
758 		if (err == 0) {
759 			IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
760 				      IPSTATS_MIB_FRAGOKS);
761 			return 0;
762 		}
763 
764 		kfree_skb_list(frag);
765 
766 		IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
767 			      IPSTATS_MIB_FRAGFAILS);
768 		return err;
769 
770 slow_path_clean:
771 		skb_walk_frags(skb, frag2) {
772 			if (frag2 == frag)
773 				break;
774 			frag2->sk = NULL;
775 			frag2->destructor = NULL;
776 			skb->truesize += frag2->truesize;
777 		}
778 	}
779 
780 slow_path:
781 	left = skb->len - hlen;		/* Space per frame */
782 	ptr = hlen;			/* Where to start from */
783 
784 	/*
785 	 *	Fragment the datagram.
786 	 */
787 
788 	troom = rt->dst.dev->needed_tailroom;
789 
790 	/*
791 	 *	Keep copying data until we run out.
792 	 */
793 	while (left > 0)	{
794 		u8 *fragnexthdr_offset;
795 
796 		len = left;
797 		/* IF: it doesn't fit, use 'mtu' - the data space left */
798 		if (len > mtu)
799 			len = mtu;
800 		/* IF: we are not sending up to and including the packet end
801 		   then align the next start on an eight byte boundary */
802 		if (len < left)	{
803 			len &= ~7;
804 		}
805 
806 		/* Allocate buffer */
807 		frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
808 				 hroom + troom, GFP_ATOMIC);
809 		if (!frag) {
810 			err = -ENOMEM;
811 			goto fail;
812 		}
813 
814 		/*
815 		 *	Set up data on packet
816 		 */
817 
818 		ip6_copy_metadata(frag, skb);
819 		skb_reserve(frag, hroom);
820 		skb_put(frag, len + hlen + sizeof(struct frag_hdr));
821 		skb_reset_network_header(frag);
822 		fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
823 		frag->transport_header = (frag->network_header + hlen +
824 					  sizeof(struct frag_hdr));
825 
826 		/*
827 		 *	Charge the memory for the fragment to any owner
828 		 *	it might possess
829 		 */
830 		if (skb->sk)
831 			skb_set_owner_w(frag, skb->sk);
832 
833 		/*
834 		 *	Copy the packet header into the new buffer.
835 		 */
836 		skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
837 
838 		fragnexthdr_offset = skb_network_header(frag);
839 		fragnexthdr_offset += prevhdr - skb_network_header(skb);
840 		*fragnexthdr_offset = NEXTHDR_FRAGMENT;
841 
842 		/*
843 		 *	Build fragment header.
844 		 */
845 		fh->nexthdr = nexthdr;
846 		fh->reserved = 0;
847 		fh->identification = frag_id;
848 
849 		/*
850 		 *	Copy a block of the IP datagram.
851 		 */
852 		BUG_ON(skb_copy_bits(skb, ptr, skb_transport_header(frag),
853 				     len));
854 		left -= len;
855 
856 		fh->frag_off = htons(offset);
857 		if (left > 0)
858 			fh->frag_off |= htons(IP6_MF);
859 		ipv6_hdr(frag)->payload_len = htons(frag->len -
860 						    sizeof(struct ipv6hdr));
861 
862 		ptr += len;
863 		offset += len;
864 
865 		/*
866 		 *	Put this fragment into the sending queue.
867 		 */
868 		err = output(net, sk, frag);
869 		if (err)
870 			goto fail;
871 
872 		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
873 			      IPSTATS_MIB_FRAGCREATES);
874 	}
875 	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
876 		      IPSTATS_MIB_FRAGOKS);
877 	consume_skb(skb);
878 	return err;
879 
880 fail_toobig:
881 	if (skb->sk && dst_allfrag(skb_dst(skb)))
882 		sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);
883 
884 	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
885 	err = -EMSGSIZE;
886 
887 fail:
888 	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
889 		      IPSTATS_MIB_FRAGFAILS);
890 	kfree_skb(skb);
891 	return err;
892 }
893 
894 static inline int ip6_rt_check(const struct rt6key *rt_key,
895 			       const struct in6_addr *fl_addr,
896 			       const struct in6_addr *addr_cache)
897 {
898 	return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
899 		(!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
900 }
901 
902 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
903 					  struct dst_entry *dst,
904 					  const struct flowi6 *fl6)
905 {
906 	struct ipv6_pinfo *np = inet6_sk(sk);
907 	struct rt6_info *rt;
908 
909 	if (!dst)
910 		goto out;
911 
912 	if (dst->ops->family != AF_INET6) {
913 		dst_release(dst);
914 		return NULL;
915 	}
916 
917 	rt = (struct rt6_info *)dst;
918 	/* Yes, checking route validity in not connected
919 	 * case is not very simple. Take into account,
920 	 * that we do not support routing by source, TOS,
921 	 * and MSG_DONTROUTE		--ANK (980726)
922 	 *
923 	 * 1. ip6_rt_check(): If route was host route,
924 	 *    check that cached destination is current.
925 	 *    If it is network route, we still may
926 	 *    check its validity using saved pointer
927 	 *    to the last used address: daddr_cache.
928 	 *    We do not want to save whole address now,
929 	 *    (because main consumer of this service
930 	 *    is tcp, which has not this problem),
931 	 *    so that the last trick works only on connected
932 	 *    sockets.
933 	 * 2. oif also should be the same.
934 	 */
935 	if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
936 #ifdef CONFIG_IPV6_SUBTREES
937 	    ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
938 #endif
939 	   (!(fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) &&
940 	      (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex))) {
941 		dst_release(dst);
942 		dst = NULL;
943 	}
944 
945 out:
946 	return dst;
947 }
948 
949 static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
950 			       struct dst_entry **dst, struct flowi6 *fl6)
951 {
952 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
953 	struct neighbour *n;
954 	struct rt6_info *rt;
955 #endif
956 	int err;
957 	int flags = 0;
958 
959 	/* The correct way to handle this would be to do
960 	 * ip6_route_get_saddr, and then ip6_route_output; however,
961 	 * the route-specific preferred source forces the
962 	 * ip6_route_output call _before_ ip6_route_get_saddr.
963 	 *
964 	 * In source specific routing (no src=any default route),
965 	 * ip6_route_output will fail given src=any saddr, though, so
966 	 * that's why we try it again later.
967 	 */
968 	if (ipv6_addr_any(&fl6->saddr) && (!*dst || !(*dst)->error)) {
969 		struct rt6_info *rt;
970 		bool had_dst = *dst != NULL;
971 
972 		if (!had_dst)
973 			*dst = ip6_route_output(net, sk, fl6);
974 		rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;
975 		err = ip6_route_get_saddr(net, rt, &fl6->daddr,
976 					  sk ? inet6_sk(sk)->srcprefs : 0,
977 					  &fl6->saddr);
978 		if (err)
979 			goto out_err_release;
980 
981 		/* If we had an erroneous initial result, pretend it
982 		 * never existed and let the SA-enabled version take
983 		 * over.
984 		 */
985 		if (!had_dst && (*dst)->error) {
986 			dst_release(*dst);
987 			*dst = NULL;
988 		}
989 
990 		if (fl6->flowi6_oif)
991 			flags |= RT6_LOOKUP_F_IFACE;
992 	}
993 
994 	if (!*dst)
995 		*dst = ip6_route_output_flags(net, sk, fl6, flags);
996 
997 	err = (*dst)->error;
998 	if (err)
999 		goto out_err_release;
1000 
1001 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
1002 	/*
1003 	 * Here if the dst entry we've looked up
1004 	 * has a neighbour entry that is in the INCOMPLETE
1005 	 * state and the src address from the flow is
1006 	 * marked as OPTIMISTIC, we release the found
1007 	 * dst entry and replace it instead with the
1008 	 * dst entry of the nexthop router
1009 	 */
1010 	rt = (struct rt6_info *) *dst;
1011 	rcu_read_lock_bh();
1012 	n = __ipv6_neigh_lookup_noref(rt->dst.dev,
1013 				      rt6_nexthop(rt, &fl6->daddr));
1014 	err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
1015 	rcu_read_unlock_bh();
1016 
1017 	if (err) {
1018 		struct inet6_ifaddr *ifp;
1019 		struct flowi6 fl_gw6;
1020 		int redirect;
1021 
1022 		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
1023 				      (*dst)->dev, 1);
1024 
1025 		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
1026 		if (ifp)
1027 			in6_ifa_put(ifp);
1028 
1029 		if (redirect) {
1030 			/*
1031 			 * We need to get the dst entry for the
1032 			 * default router instead
1033 			 */
1034 			dst_release(*dst);
1035 			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
1036 			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
1037 			*dst = ip6_route_output(net, sk, &fl_gw6);
1038 			err = (*dst)->error;
1039 			if (err)
1040 				goto out_err_release;
1041 		}
1042 	}
1043 #endif
1044 	if (ipv6_addr_v4mapped(&fl6->saddr) &&
1045 	    !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
1046 		err = -EAFNOSUPPORT;
1047 		goto out_err_release;
1048 	}
1049 
1050 	return 0;
1051 
1052 out_err_release:
1053 	dst_release(*dst);
1054 	*dst = NULL;
1055 
1056 	if (err == -ENETUNREACH)
1057 		IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
1058 	return err;
1059 }
1060 
1061 /**
1062  *	ip6_dst_lookup - perform route lookup on flow
1063  *	@sk: socket which provides route info
1064  *	@dst: pointer to dst_entry * for result
1065  *	@fl6: flow to lookup
1066  *
1067  *	This function performs a route lookup on the given flow.
1068  *
1069  *	It returns zero on success, or a standard errno code on error.
1070  */
1071 int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
1072 		   struct flowi6 *fl6)
1073 {
1074 	*dst = NULL;
1075 	return ip6_dst_lookup_tail(net, sk, dst, fl6);
1076 }
1077 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1078 
1079 /**
1080  *	ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1081  *	@sk: socket which provides route info
1082  *	@fl6: flow to lookup
1083  *	@final_dst: final destination address for ipsec lookup
1084  *
1085  *	This function performs a route lookup on the given flow.
1086  *
1087  *	It returns a valid dst pointer on success, or a pointer encoded
1088  *	error code.
1089  */
1090 struct dst_entry *ip6_dst_lookup_flow(const struct sock *sk, struct flowi6 *fl6,
1091 				      const struct in6_addr *final_dst)
1092 {
1093 	struct dst_entry *dst = NULL;
1094 	int err;
1095 
1096 	err = ip6_dst_lookup_tail(sock_net(sk), sk, &dst, fl6);
1097 	if (err)
1098 		return ERR_PTR(err);
1099 	if (final_dst)
1100 		fl6->daddr = *final_dst;
1101 
1102 	return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1103 }
1104 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
1105 
1106 /**
1107  *	ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1108  *	@sk: socket which provides the dst cache and route info
1109  *	@fl6: flow to lookup
1110  *	@final_dst: final destination address for ipsec lookup
1111  *	@connected: whether @sk is connected or not
1112  *
1113  *	This function performs a route lookup on the given flow with the
1114  *	possibility of using the cached route in the socket if it is valid.
1115  *	It will take the socket dst lock when operating on the dst cache.
1116  *	As a result, this function can only be used in process context.
1117  *
1118  *	In addition, for a connected socket, cache the dst in the socket
1119  *	if the current cache is not valid.
1120  *
1121  *	It returns a valid dst pointer on success, or a pointer encoded
1122  *	error code.
1123  */
1124 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1125 					 const struct in6_addr *final_dst,
1126 					 bool connected)
1127 {
1128 	struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1129 
1130 	dst = ip6_sk_dst_check(sk, dst, fl6);
1131 	if (dst)
1132 		return dst;
1133 
1134 	dst = ip6_dst_lookup_flow(sk, fl6, final_dst);
1135 	if (connected && !IS_ERR(dst))
1136 		ip6_sk_dst_store_flow(sk, dst_clone(dst), fl6);
1137 
1138 	return dst;
1139 }
1140 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1141 
1142 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1143 					       gfp_t gfp)
1144 {
1145 	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1146 }
1147 
1148 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1149 						gfp_t gfp)
1150 {
1151 	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1152 }
1153 
1154 static void ip6_append_data_mtu(unsigned int *mtu,
1155 				int *maxfraglen,
1156 				unsigned int fragheaderlen,
1157 				struct sk_buff *skb,
1158 				struct rt6_info *rt,
1159 				unsigned int orig_mtu)
1160 {
1161 	if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1162 		if (!skb) {
1163 			/* first fragment, reserve header_len */
1164 			*mtu = orig_mtu - rt->dst.header_len;
1165 
1166 		} else {
1167 			/*
1168 			 * this fragment is not first, the headers
1169 			 * space is regarded as data space.
1170 			 */
1171 			*mtu = orig_mtu;
1172 		}
1173 		*maxfraglen = ((*mtu - fragheaderlen) & ~7)
1174 			      + fragheaderlen - sizeof(struct frag_hdr);
1175 	}
1176 }
1177 
1178 static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
1179 			  struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
1180 			  struct rt6_info *rt, struct flowi6 *fl6)
1181 {
1182 	struct ipv6_pinfo *np = inet6_sk(sk);
1183 	unsigned int mtu;
1184 	struct ipv6_txoptions *opt = ipc6->opt;
1185 
1186 	/*
1187 	 * setup for corking
1188 	 */
1189 	if (opt) {
1190 		if (WARN_ON(v6_cork->opt))
1191 			return -EINVAL;
1192 
1193 		v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
1194 		if (unlikely(!v6_cork->opt))
1195 			return -ENOBUFS;
1196 
1197 		v6_cork->opt->tot_len = sizeof(*opt);
1198 		v6_cork->opt->opt_flen = opt->opt_flen;
1199 		v6_cork->opt->opt_nflen = opt->opt_nflen;
1200 
1201 		v6_cork->opt->dst0opt = ip6_opt_dup(opt->dst0opt,
1202 						    sk->sk_allocation);
1203 		if (opt->dst0opt && !v6_cork->opt->dst0opt)
1204 			return -ENOBUFS;
1205 
1206 		v6_cork->opt->dst1opt = ip6_opt_dup(opt->dst1opt,
1207 						    sk->sk_allocation);
1208 		if (opt->dst1opt && !v6_cork->opt->dst1opt)
1209 			return -ENOBUFS;
1210 
1211 		v6_cork->opt->hopopt = ip6_opt_dup(opt->hopopt,
1212 						   sk->sk_allocation);
1213 		if (opt->hopopt && !v6_cork->opt->hopopt)
1214 			return -ENOBUFS;
1215 
1216 		v6_cork->opt->srcrt = ip6_rthdr_dup(opt->srcrt,
1217 						    sk->sk_allocation);
1218 		if (opt->srcrt && !v6_cork->opt->srcrt)
1219 			return -ENOBUFS;
1220 
1221 		/* need source address above miyazawa*/
1222 	}
1223 	dst_hold(&rt->dst);
1224 	cork->base.dst = &rt->dst;
1225 	cork->fl.u.ip6 = *fl6;
1226 	v6_cork->hop_limit = ipc6->hlimit;
1227 	v6_cork->tclass = ipc6->tclass;
1228 	if (rt->dst.flags & DST_XFRM_TUNNEL)
1229 		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1230 		      READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst);
1231 	else
1232 		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1233 			READ_ONCE(rt->dst.dev->mtu) : dst_mtu(xfrm_dst_path(&rt->dst));
1234 	if (np->frag_size < mtu) {
1235 		if (np->frag_size)
1236 			mtu = np->frag_size;
1237 	}
1238 	if (mtu < IPV6_MIN_MTU)
1239 		return -EINVAL;
1240 	cork->base.fragsize = mtu;
1241 	if (dst_allfrag(xfrm_dst_path(&rt->dst)))
1242 		cork->base.flags |= IPCORK_ALLFRAG;
1243 	cork->base.length = 0;
1244 
1245 	return 0;
1246 }
1247 
/*
 *	__ip6_append_data - add payload to a queue of pending IPv6 skbs
 *	@sk: socket the data is sent on
 *	@fl6: flow, handed to the local error reporting helpers
 *	@queue: list of pending skbs that is extended
 *	@cork: base cork state; supplies dst, fragsize and flags, and its
 *	       ->length is advanced by @length
 *	@v6_cork: IPv6 cork state; supplies the tx options
 *	@pfrag: page fragment that receives the data when the device has
 *		NETIF_F_SG
 *	@getfrag: callback used to fetch the payload; invoked as
 *		  getfrag(from, to, offset, len, odd, skb) and treated as
 *		  failed when it returns a negative value
 *	@from: opaque source handed to @getfrag
 *	@length: number of bytes to append
 *	@transhdrlen: bytes left unfilled in front of the fetched data in
 *		      the first skb allocated by this call
 *	@flags: MSG_* flags; MSG_MORE, MSG_DONTWAIT and MSG_CONFIRM are
 *		examined here
 *	@ipc6: per-call cookie; only ->dontfrag is read here
 *	@sockc: socket control cookie; ->tsflags feeds sock_tx_timestamp()
 *
 *	Data is first added to the skb at the tail of @queue; whenever that
 *	skb has no room left a new one is allocated, sized from
 *	cork->fragsize and the header lengths of the route and options.
 *
 *	Memory charged to the socket is accumulated in wmem_alloc_delta and
 *	added to sk->sk_wmem_alloc once, on the way out.
 *
 *	Returns 0 on success.  -EMSGSIZE is returned, after reporting through
 *	ipv6_local_error() (and ipv6_local_rxpmtu() for the dontfrag case),
 *	when the data does not fit; this happens before cork->length is
 *	changed.  Any later failure goes through the error path, which takes
 *	the unconsumed @length off cork->length again, counts an
 *	IPSTATS_MIB_OUTDISCARDS and returns the pending error code.
 */
1248 static int __ip6_append_data(struct sock *sk,
1249 			     struct flowi6 *fl6,
1250 			     struct sk_buff_head *queue,
1251 			     struct inet_cork *cork,
1252 			     struct inet6_cork *v6_cork,
1253 			     struct page_frag *pfrag,
1254 			     int getfrag(void *from, char *to, int offset,
1255 					 int len, int odd, struct sk_buff *skb),
1256 			     void *from, int length, int transhdrlen,
1257 			     unsigned int flags, struct ipcm6_cookie *ipc6,
1258 			     const struct sockcm_cookie *sockc)
1259 {
1260 	struct sk_buff *skb, *skb_prev = NULL;
1261 	unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
1262 	int exthdrlen = 0;
1263 	int dst_exthdrlen = 0;
1264 	int hh_len;
1265 	int copy;
1266 	int err;
1267 	int offset = 0;
1268 	__u8 tx_flags = 0;
1269 	u32 tskey = 0;
1270 	struct rt6_info *rt = (struct rt6_info *)cork->dst;
1271 	struct ipv6_txoptions *opt = v6_cork->opt;
1272 	int csummode = CHECKSUM_NONE;
1273 	unsigned int maxnonfragsize, headersize;
1274 	unsigned int wmem_alloc_delta = 0;
1275 
	/*
	 * Nothing queued yet: pick up opt_flen and the part of
	 * dst.header_len that is not counted in rt6i_nfheader_len.
	 * Both values stay 0 when the queue already holds an skb.
	 */
1276 	skb = skb_peek_tail(queue);
1277 	if (!skb) {
1278 		exthdrlen = opt ? opt->opt_flen : 0;
1279 		dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
1280 	}
1281 
1282 	mtu = cork->fragsize;
1283 	orig_mtu = mtu;
1284 
1285 	hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1286 
	/*
	 * fragheaderlen: IPv6 header plus rt6i_nfheader_len plus opt_nflen.
	 * maxfraglen: the part of the mtu after fragheaderlen is rounded
	 * down to a multiple of 8, and room for a fragment header is taken
	 * off the total.
	 */
1287 	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1288 			(opt ? opt->opt_nflen : 0);
1289 	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
1290 		     sizeof(struct frag_hdr);
1291 
1292 	headersize = sizeof(struct ipv6hdr) +
1293 		     (opt ? opt->opt_flen + opt->opt_nflen : 0) +
1294 		     (dst_allfrag(&rt->dst) ?
1295 		      sizeof(struct frag_hdr) : 0) +
1296 		     rt->rt6i_nfheader_len;
1297 
1298 	/* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit
1299 	 * the first fragment
1300 	 */
1301 	if (headersize + transhdrlen > mtu)
1302 		goto emsgsize;
1303 
	/*
	 * dontfrag on a UDP or RAW socket: data that does not fit the mtu
	 * is refused, and the usable size is reported via
	 * ipv6_local_rxpmtu() before the common EMSGSIZE handling.
	 */
1304 	if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
1305 	    (sk->sk_protocol == IPPROTO_UDP ||
1306 	     sk->sk_protocol == IPPROTO_RAW)) {
1307 		ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
1308 				sizeof(struct ipv6hdr));
1309 		goto emsgsize;
1310 	}
1311 
1312 	if (ip6_sk_ignore_df(sk))
1313 		maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
1314 	else
1315 		maxnonfragsize = mtu;
1316 
1317 	if (cork->length + length > maxnonfragsize - headersize) {
1318 emsgsize:
1319 		pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0);
1320 		ipv6_local_error(sk, EMSGSIZE, fl6, pmtu);
1321 		return -EMSGSIZE;
1322 	}
1323 
1324 	/* CHECKSUM_PARTIAL only with no extension headers and when
1325 	 * we are not going to fragment
1326 	 */
1327 	if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
1328 	    headersize == sizeof(struct ipv6hdr) &&
1329 	    length <= mtu - headersize &&
1330 	    !(flags & MSG_MORE) &&
1331 	    rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
1332 		csummode = CHECKSUM_PARTIAL;
1333 
1334 	if (sk->sk_type == SOCK_DGRAM || sk->sk_type == SOCK_RAW) {
1335 		sock_tx_timestamp(sk, sockc->tsflags, &tx_flags);
1336 		if (tx_flags & SKBTX_ANY_SW_TSTAMP &&
1337 		    sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
1338 			tskey = sk->sk_tskey++;
1339 	}
1340 
1341 	/*
1342 	 * Let's try using as much space as possible.
1343 	 * Use MTU if total length of the message fits into the MTU.
1344 	 * Otherwise, we need to reserve fragment header and
1345 	 * fragment alignment (= 8-15 octets, in total).
1346 	 *
1347 	 * Note that we may need to "move" the data from the tail of
1348 	 * the buffer to the new fragment when we split
1349 	 * the message.
1350 	 *
1351 	 * FIXME: It may be fragmented into multiple chunks
1352 	 *        at once if non-fragmentable extension headers
1353 	 *        are too large.
1354 	 * --yoshfuji
1355 	 */
1356 
	/*
	 * The new data is accounted up front; the error path subtracts
	 * whatever part of it was not consumed.
	 */
1357 	cork->length += length;
1358 	if (!skb)
1359 		goto alloc_new_skb;
1360 
1361 	while (length > 0) {
1362 		/* Check if the remaining data fits into current packet. */
		/*
		 * The tail skb may grow up to mtu while everything corked
		 * so far fits into the mtu and IPCORK_ALLFRAG is clear,
		 * and up to maxfraglen otherwise (also when the remaining
		 * data would not fit anyway).
		 */
1363 		copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1364 		if (copy < length)
1365 			copy = maxfraglen - skb->len;
1366 
1367 		if (copy <= 0) {
1368 			char *data;
1369 			unsigned int datalen;
1370 			unsigned int fraglen;
1371 			unsigned int fraggap;
1372 			unsigned int alloclen;
1373 alloc_new_skb:
1374 			/* There's no room in the current skb */
			/*
			 * fraggap: bytes the current skb holds beyond
			 * maxfraglen; they are moved into the new skb below.
			 */
1375 			if (skb)
1376 				fraggap = skb->len - maxfraglen;
1377 			else
1378 				fraggap = 0;
1379 			/* update mtu and maxfraglen if necessary */
1380 			if (!skb || !skb_prev)
1381 				ip6_append_data_mtu(&mtu, &maxfraglen,
1382 						    fragheaderlen, skb, rt,
1383 						    orig_mtu);
1384 
1385 			skb_prev = skb;
1386 
1387 			/*
1388 			 * If remaining data exceeds the mtu,
1389 			 * we know we need more fragment(s).
1390 			 */
1391 			datalen = length + fraggap;
1392 
1393 			if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1394 				datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
			/*
			 * Without NETIF_F_SG later data can only go into the
			 * linear area, so with MSG_MORE a full mtu is
			 * allocated right away.
			 */
1395 			if ((flags & MSG_MORE) &&
1396 			    !(rt->dst.dev->features&NETIF_F_SG))
1397 				alloclen = mtu;
1398 			else
1399 				alloclen = datalen + fragheaderlen;
1400 
1401 			alloclen += dst_exthdrlen;
1402 
1403 			if (datalen != length + fraggap) {
1404 				/*
1405 				 * this is not the last fragment, the trailer
1406 				 * space is regarded as data space.
1407 				 */
1408 				datalen += rt->dst.trailer_len;
1409 			}
1410 
1411 			alloclen += rt->dst.trailer_len;
1412 			fraglen = datalen + fragheaderlen;
1413 
1414 			/*
1415 			 * We just reserve space for fragment header.
1416 			 * Note: this may be overallocation if the message
1417 			 * (without MSG_MORE) fits into the MTU.
1418 			 */
1419 			alloclen += sizeof(struct frag_hdr);
1420 
			/* bytes to fetch through getfrag() for this skb */
1421 			copy = datalen - transhdrlen - fraggap;
1422 			if (copy < 0) {
1423 				err = -EINVAL;
1424 				goto error;
1425 			}
			/*
			 * With a non-zero transhdrlen the skb comes from
			 * sock_alloc_send_skb(), which also sets err on
			 * failure.  Otherwise plain alloc_skb() is used, and
			 * only while sk_wmem_alloc plus the not yet applied
			 * delta stays within twice sk_sndbuf.
			 */
1426 			if (transhdrlen) {
1427 				skb = sock_alloc_send_skb(sk,
1428 						alloclen + hh_len,
1429 						(flags & MSG_DONTWAIT), &err);
1430 			} else {
1431 				skb = NULL;
1432 				if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <=
1433 				    2 * sk->sk_sndbuf)
1434 					skb = alloc_skb(alloclen + hh_len,
1435 							sk->sk_allocation);
1436 				if (unlikely(!skb))
1437 					err = -ENOBUFS;
1438 			}
1439 			if (!skb)
1440 				goto error;
1441 			/*
1442 			 *	Fill in the control structures
1443 			 */
1444 			skb->protocol = htons(ETH_P_IPV6);
1445 			skb->ip_summed = csummode;
1446 			skb->csum = 0;
1447 			/* reserve for fragmentation and ipsec header */
1448 			skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1449 				    dst_exthdrlen);
1450 
1451 			/* Only the initial fragment is time stamped */
1452 			skb_shinfo(skb)->tx_flags = tx_flags;
1453 			tx_flags = 0;
1454 			skb_shinfo(skb)->tskey = tskey;
1455 			tskey = 0;
1456 
1457 			/*
1458 			 *	Find where to start putting bytes
1459 			 */
1460 			data = skb_put(skb, fraglen);
1461 			skb_set_network_header(skb, exthdrlen);
1462 			data += fragheaderlen;
1463 			skb->transport_header = (skb->network_header +
1464 						 fragheaderlen);
			/*
			 * Move the overhanging bytes of skb_prev into this
			 * skb, take their checksum out of skb_prev->csum and
			 * trim skb_prev back to maxfraglen.
			 */
1465 			if (fraggap) {
1466 				skb->csum = skb_copy_and_csum_bits(
1467 					skb_prev, maxfraglen,
1468 					data + transhdrlen, fraggap, 0);
1469 				skb_prev->csum = csum_sub(skb_prev->csum,
1470 							  skb->csum);
1471 				data += fraggap;
1472 				pskb_trim_unique(skb_prev, maxfraglen);
1473 			}
			/* the skb is not queued yet, so it is freed here on failure */
1474 			if (copy > 0 &&
1475 			    getfrag(from, data + transhdrlen, offset,
1476 				    copy, fraggap, skb) < 0) {
1477 				err = -EFAULT;
1478 				kfree_skb(skb);
1479 				goto error;
1480 			}
1481 
			/*
			 * The header related lengths apply to the first skb
			 * only and are cleared for any further one.
			 */
1482 			offset += copy;
1483 			length -= datalen - fraggap;
1484 			transhdrlen = 0;
1485 			exthdrlen = 0;
1486 			dst_exthdrlen = 0;
1487 
1488 			if ((flags & MSG_CONFIRM) && !skb_prev)
1489 				skb_set_dst_pending_confirm(skb, 1);
1490 
1491 			/*
1492 			 * Put the packet on the pending queue
1493 			 */
			/*
			 * An skb without destructor (presumably the
			 * alloc_skb() case) is tied to sk here; its truesize
			 * is remembered for the deferred sk_wmem_alloc update.
			 */
1494 			if (!skb->destructor) {
1495 				skb->destructor = sock_wfree;
1496 				skb->sk = sk;
1497 				wmem_alloc_delta += skb->truesize;
1498 			}
1499 			__skb_queue_tail(queue, skb);
1500 			continue;
1501 		}
1502 
1503 		if (copy > length)
1504 			copy = length;
1505 
		/*
		 * Room is left in the tail skb: without NETIF_F_SG the data
		 * goes into its linear area, otherwise into a page fragment.
		 */
1506 		if (!(rt->dst.dev->features&NETIF_F_SG)) {
1507 			unsigned int off;
1508 
1509 			off = skb->len;
1510 			if (getfrag(from, skb_put(skb, copy),
1511 						offset, copy, off, skb) < 0) {
				/* undo the skb_put() above */
1512 				__skb_trim(skb, off);
1513 				err = -EFAULT;
1514 				goto error;
1515 			}
1516 		} else {
1517 			int i = skb_shinfo(skb)->nr_frags;
1518 
1519 			err = -ENOMEM;
1520 			if (!sk_page_frag_refill(sk, pfrag))
1521 				goto error;
1522 
			/*
			 * If the data cannot be merged into the last frag, a
			 * new, initially empty, frag slot is set up on
			 * pfrag->page; fail once all slots are used.
			 */
1523 			if (!skb_can_coalesce(skb, i, pfrag->page,
1524 					      pfrag->offset)) {
1525 				err = -EMSGSIZE;
1526 				if (i == MAX_SKB_FRAGS)
1527 					goto error;
1528 
1529 				__skb_fill_page_desc(skb, i, pfrag->page,
1530 						     pfrag->offset, 0);
1531 				skb_shinfo(skb)->nr_frags = ++i;
1532 				get_page(pfrag->page);
1533 			}
			/* no more than what is left in the page fragment */
1534 			copy = min_t(int, copy, pfrag->size - pfrag->offset);
1535 			if (getfrag(from,
1536 				    page_address(pfrag->page) + pfrag->offset,
1537 				    offset, copy, skb->len, skb) < 0)
1538 				goto error_efault;
1539 
1540 			pfrag->offset += copy;
1541 			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1542 			skb->len += copy;
1543 			skb->data_len += copy;
1544 			skb->truesize += copy;
1545 			wmem_alloc_delta += copy;
1546 		}
1547 		offset += copy;
1548 		length -= copy;
1549 	}
1550 
	/* apply the accumulated charge to the socket in one go */
1551 	if (wmem_alloc_delta)
1552 		refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1553 	return 0;
1554 
1555 error_efault:
1556 	err = -EFAULT;
1557 error:
	/*
	 * length holds what was not consumed; skbs already queued stay on
	 * the queue, so their charge is still applied.
	 */
1558 	cork->length -= length;
1559 	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1560 	refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1561 	return err;
1562 }
1563 
1564 int ip6_append_data(struct sock *sk,
1565 		    int getfrag(void *from, char *to, int offset, int len,
1566 				int odd, struct sk_buff *skb),
1567 		    void *from, int length, int transhdrlen,
1568 		    struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1569 		    struct rt6_info *rt, unsigned int flags,
1570 		    const struct sockcm_cookie *sockc)
1571 {
1572 	struct inet_sock *inet = inet_sk(sk);
1573 	struct ipv6_pinfo *np = inet6_sk(sk);
1574 	int exthdrlen;
1575 	int err;
1576 
1577 	if (flags&MSG_PROBE)
1578 		return 0;
1579 	if (skb_queue_empty(&sk->sk_write_queue)) {
1580 		/*
1581 		 * setup for corking
1582 		 */
1583 		err = ip6_setup_cork(sk, &inet->cork, &np->cork,
1584 				     ipc6, rt, fl6);
1585 		if (err)
1586 			return err;
1587 
1588 		exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1589 		length += exthdrlen;
1590 		transhdrlen += exthdrlen;
1591 	} else {
1592 		fl6 = &inet->cork.fl.u.ip6;
1593 		transhdrlen = 0;
1594 	}
1595 
1596 	return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base,
1597 				 &np->cork, sk_page_frag(sk), getfrag,
1598 				 from, length, transhdrlen, flags, ipc6, sockc);
1599 }
1600 EXPORT_SYMBOL_GPL(ip6_append_data);
1601 
1602 static void ip6_cork_release(struct inet_cork_full *cork,
1603 			     struct inet6_cork *v6_cork)
1604 {
1605 	if (v6_cork->opt) {
1606 		kfree(v6_cork->opt->dst0opt);
1607 		kfree(v6_cork->opt->dst1opt);
1608 		kfree(v6_cork->opt->hopopt);
1609 		kfree(v6_cork->opt->srcrt);
1610 		kfree(v6_cork->opt);
1611 		v6_cork->opt = NULL;
1612 	}
1613 
1614 	if (cork->base.dst) {
1615 		dst_release(cork->base.dst);
1616 		cork->base.dst = NULL;
1617 		cork->base.flags &= ~IPCORK_ALLFRAG;
1618 	}
1619 	memset(&cork->fl, 0, sizeof(cork->fl));
1620 }
1621 
1622 struct sk_buff *__ip6_make_skb(struct sock *sk,
1623 			       struct sk_buff_head *queue,
1624 			       struct inet_cork_full *cork,
1625 			       struct inet6_cork *v6_cork)
1626 {
1627 	struct sk_buff *skb, *tmp_skb;
1628 	struct sk_buff **tail_skb;
1629 	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1630 	struct ipv6_pinfo *np = inet6_sk(sk);
1631 	struct net *net = sock_net(sk);
1632 	struct ipv6hdr *hdr;
1633 	struct ipv6_txoptions *opt = v6_cork->opt;
1634 	struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
1635 	struct flowi6 *fl6 = &cork->fl.u.ip6;
1636 	unsigned char proto = fl6->flowi6_proto;
1637 
1638 	skb = __skb_dequeue(queue);
1639 	if (!skb)
1640 		goto out;
1641 	tail_skb = &(skb_shinfo(skb)->frag_list);
1642 
1643 	/* move skb->data to ip header from ext header */
1644 	if (skb->data < skb_network_header(skb))
1645 		__skb_pull(skb, skb_network_offset(skb));
1646 	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
1647 		__skb_pull(tmp_skb, skb_network_header_len(skb));
1648 		*tail_skb = tmp_skb;
1649 		tail_skb = &(tmp_skb->next);
1650 		skb->len += tmp_skb->len;
1651 		skb->data_len += tmp_skb->len;
1652 		skb->truesize += tmp_skb->truesize;
1653 		tmp_skb->destructor = NULL;
1654 		tmp_skb->sk = NULL;
1655 	}
1656 
1657 	/* Allow local fragmentation. */
1658 	skb->ignore_df = ip6_sk_ignore_df(sk);
1659 
1660 	*final_dst = fl6->daddr;
1661 	__skb_pull(skb, skb_network_header_len(skb));
1662 	if (opt && opt->opt_flen)
1663 		ipv6_push_frag_opts(skb, opt, &proto);
1664 	if (opt && opt->opt_nflen)
1665 		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);
1666 
1667 	skb_push(skb, sizeof(struct ipv6hdr));
1668 	skb_reset_network_header(skb);
1669 	hdr = ipv6_hdr(skb);
1670 
1671 	ip6_flow_hdr(hdr, v6_cork->tclass,
1672 		     ip6_make_flowlabel(net, skb, fl6->flowlabel,
1673 					ip6_autoflowlabel(net, np), fl6));
1674 	hdr->hop_limit = v6_cork->hop_limit;
1675 	hdr->nexthdr = proto;
1676 	hdr->saddr = fl6->saddr;
1677 	hdr->daddr = *final_dst;
1678 
1679 	skb->priority = sk->sk_priority;
1680 	skb->mark = sk->sk_mark;
1681 
1682 	skb_dst_set(skb, dst_clone(&rt->dst));
1683 	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1684 	if (proto == IPPROTO_ICMPV6) {
1685 		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1686 
1687 		ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
1688 		ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
1689 	}
1690 
1691 	ip6_cork_release(cork, v6_cork);
1692 out:
1693 	return skb;
1694 }
1695 
1696 int ip6_send_skb(struct sk_buff *skb)
1697 {
1698 	struct net *net = sock_net(skb->sk);
1699 	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
1700 	int err;
1701 
1702 	err = ip6_local_out(net, skb->sk, skb);
1703 	if (err) {
1704 		if (err > 0)
1705 			err = net_xmit_errno(err);
1706 		if (err)
1707 			IP6_INC_STATS(net, rt->rt6i_idev,
1708 				      IPSTATS_MIB_OUTDISCARDS);
1709 	}
1710 
1711 	return err;
1712 }
1713 
/*
 * Build the packet pending on @sk with ip6_finish_skb() and send it
 * with ip6_send_skb().  Returns 0 when there is nothing to send,
 * otherwise the result of ip6_send_skb().
 */
int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *pending = ip6_finish_skb(sk);

	return pending ? ip6_send_skb(pending) : 0;
}
EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
1725 
1726 static void __ip6_flush_pending_frames(struct sock *sk,
1727 				       struct sk_buff_head *queue,
1728 				       struct inet_cork_full *cork,
1729 				       struct inet6_cork *v6_cork)
1730 {
1731 	struct sk_buff *skb;
1732 
1733 	while ((skb = __skb_dequeue_tail(queue)) != NULL) {
1734 		if (skb_dst(skb))
1735 			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1736 				      IPSTATS_MIB_OUTDISCARDS);
1737 		kfree_skb(skb);
1738 	}
1739 
1740 	ip6_cork_release(cork, v6_cork);
1741 }
1742 
1743 void ip6_flush_pending_frames(struct sock *sk)
1744 {
1745 	__ip6_flush_pending_frames(sk, &sk->sk_write_queue,
1746 				   &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
1747 }
1748 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
1749 
1750 struct sk_buff *ip6_make_skb(struct sock *sk,
1751 			     int getfrag(void *from, char *to, int offset,
1752 					 int len, int odd, struct sk_buff *skb),
1753 			     void *from, int length, int transhdrlen,
1754 			     struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1755 			     struct rt6_info *rt, unsigned int flags,
1756 			     const struct sockcm_cookie *sockc)
1757 {
1758 	struct inet_cork_full cork;
1759 	struct inet6_cork v6_cork;
1760 	struct sk_buff_head queue;
1761 	int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1762 	int err;
1763 
1764 	if (flags & MSG_PROBE)
1765 		return NULL;
1766 
1767 	__skb_queue_head_init(&queue);
1768 
1769 	cork.base.flags = 0;
1770 	cork.base.addr = 0;
1771 	cork.base.opt = NULL;
1772 	cork.base.dst = NULL;
1773 	v6_cork.opt = NULL;
1774 	err = ip6_setup_cork(sk, &cork, &v6_cork, ipc6, rt, fl6);
1775 	if (err) {
1776 		ip6_cork_release(&cork, &v6_cork);
1777 		return ERR_PTR(err);
1778 	}
1779 	if (ipc6->dontfrag < 0)
1780 		ipc6->dontfrag = inet6_sk(sk)->dontfrag;
1781 
1782 	err = __ip6_append_data(sk, fl6, &queue, &cork.base, &v6_cork,
1783 				&current->task_frag, getfrag, from,
1784 				length + exthdrlen, transhdrlen + exthdrlen,
1785 				flags, ipc6, sockc);
1786 	if (err) {
1787 		__ip6_flush_pending_frames(sk, &queue, &cork, &v6_cork);
1788 		return ERR_PTR(err);
1789 	}
1790 
1791 	return __ip6_make_skb(sk, &queue, &cork, &v6_cork);
1792 }
1793