/*
 *	IPv6 output functions
 *	Linux INET6 implementation
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	Based on linux/net/ipv4/ip_output.c
 *
 *	This program is free software; you can redistribute it and/or
 *      modify it under the terms of the GNU General Public License
 *      as published by the Free Software Foundation; either version
 *      2 of the License, or (at your option) any later version.
 *
 *	Changes:
 *	A.N.Kuznetsov	:	arithmetic in fragmentation.
 *				extension headers are implemented.
 *				route changes now work.
 *				ip6_forward does not confuse sniffers.
 *				etc.
 *
 *      H. von Brand    :       Added missing #include <linux/string.h>
 *	Imran Patel	:	frag id should be in NBO
 *      Kazunori MIYAZAWA @USAGI
 *			:       add ip6_append_data and related functions
 *				for datagram xmit
 */

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/in6.h>
#include <linux/tcp.h>
#include <linux/route.h>
#include <linux/module.h>
#include <linux/slab.h>

#include <linux/bpf-cgroup.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>

#include <net/sock.h>
#include <net/snmp.h>

#include <net/ipv6.h>
#include <net/ndisc.h>
#include <net/protocol.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
#include <net/rawv6.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/checksum.h>
#include <linux/mroute6.h>
#include <net/l3mdev.h>
#include <net/lwtunnel.h>

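/* Last step of the IPv6 output path: loop multicast packets back to
 * local listeners when required, let an attached lightweight tunnel
 * transmit the skb if it wants to, then resolve the next-hop neighbour
 * and hand the packet to the device via neigh_output().
 */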
static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct net_device *dev = dst->dev;
	struct neighbour *neigh;
	struct in6_addr *nexthop;
	int ret;

	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
		    ((mroute6_is_socket(net, skb) &&
		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
		     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
					 &ipv6_hdr(skb)->saddr))) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

			/* Do not check for IFF_ALLMULTI; multicast routing
			   is not supported in any case.
			 */
			if (newskb)
				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
					net, sk, newskb, NULL, newskb->dev,
					dev_loopback_xmit);

			if (ipv6_hdr(skb)->hop_limit == 0) {
				IP6_INC_STATS(net, idev,
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return 0;
			}
		}

		IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);

		if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
		    IPV6_ADDR_SCOPE_NODELOCAL &&
		    !(dev->flags & IFF_LOOPBACK)) {
			kfree_skb(skb);
			return 0;
		}
	}

	if (lwtunnel_xmit_redirect(dst->lwtstate)) {
		int res = lwtunnel_xmit(skb);

		if (res < 0 || res == LWTUNNEL_XMIT_DONE)
			return res;
	}

	rcu_read_lock_bh();
	nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
	neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
	if (unlikely(!neigh))
		neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
	if (!IS_ERR(neigh)) {
		sock_confirm_neigh(skb, neigh);
		ret = neigh_output(neigh, skb);
		rcu_read_unlock_bh();
		return ret;
	}
	rcu_read_unlock_bh();

	IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
	kfree_skb(skb);
	return -EINVAL;
}

static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	int ret;

	ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
	if (ret) {
		kfree_skb(skb);
		return ret;
	}

#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
	/* Policy lookup after SNAT yielded a new policy */
	if (skb_dst(skb)->xfrm) {
		IPCB(skb)->flags |= IPSKB_REROUTED;
		return dst_output(net, sk, skb);
	}
#endif

	if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
	    dst_allfrag(skb_dst(skb)) ||
	    (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
		return ip6_fragment(net, sk, skb, ip6_finish_output2);
	else
		return ip6_finish_output2(net, sk, skb);
}

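/* ip6_output() is installed as the dst_entry output handler for locally
 * generated IPv6 traffic: dst_output() lands here, which runs the
 * NF_INET_POST_ROUTING hook (unless the packet was already rerouted)
 * before ip6_finish_output() fragments and transmits the skb.
 */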
int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct net_device *dev = skb_dst(skb)->dev;
	struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	if (unlikely(idev->cnf.disable_ipv6)) {
		IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
		return 0;
	}

	return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
			    net, sk, skb, NULL, dev,
			    ip6_finish_output,
			    !(IP6CB(skb)->flags & IP6SKB_REROUTED));
}

bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np)
{
	if (!np->autoflowlabel_set)
		return ip6_default_np_autolabel(net);
	else
		return np->autoflowlabel;
}

/*
 * xmit an sk_buff (used by TCP, SCTP and DCCP)
 * Note: the socket lock is not held for SYNACK packets, but the socket
 * might still be modified by calls to skb_set_owner_w() and
 * ipv6_local_error(), which use proper atomic operations or spinlocks.
 */
int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
	     __u32 mark, struct ipv6_txoptions *opt, int tclass)
{
	struct net *net = sock_net(sk);
	const struct ipv6_pinfo *np = inet6_sk(sk);
	struct in6_addr *first_hop = &fl6->daddr;
	struct dst_entry *dst = skb_dst(skb);
	unsigned int head_room;
	struct ipv6hdr *hdr;
	u8  proto = fl6->flowi6_proto;
	int seg_len = skb->len;
	int hlimit = -1;
	u32 mtu;

	head_room = sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
	if (opt)
		head_room += opt->opt_nflen + opt->opt_flen;

	if (unlikely(skb_headroom(skb) < head_room)) {
		struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
		if (!skb2) {
			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_OUTDISCARDS);
			kfree_skb(skb);
			return -ENOBUFS;
		}
		if (skb->sk)
			skb_set_owner_w(skb2, skb->sk);
		consume_skb(skb);
		skb = skb2;
	}

	if (opt) {
		seg_len += opt->opt_nflen + opt->opt_flen;

		if (opt->opt_flen)
			ipv6_push_frag_opts(skb, opt, &proto);

		if (opt->opt_nflen)
			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
					     &fl6->saddr);
	}

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	/*
	 *	Fill in the IPv6 header
	 */
	if (np)
		hlimit = np->hop_limit;
	if (hlimit < 0)
		hlimit = ip6_dst_hoplimit(dst);

	ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
				ip6_autoflowlabel(net, np), fl6));

	hdr->payload_len = htons(seg_len);
	hdr->nexthdr = proto;
	hdr->hop_limit = hlimit;

	hdr->saddr = fl6->saddr;
	hdr->daddr = *first_hop;

	skb->protocol = htons(ETH_P_IPV6);
	skb->priority = sk->sk_priority;
	skb->mark = mark;

	mtu = dst_mtu(dst);
	if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
		IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_OUT, skb->len);

		/* if egress device is enslaved to an L3 master device pass the
		 * skb to its handler for processing
		 */
		skb = l3mdev_ip6_out((struct sock *)sk, skb);
		if (unlikely(!skb))
			return 0;

		/* hooks should never assume socket lock is held.
		 * we promote our socket to non const
		 */
		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
			       net, (struct sock *)sk, skb, NULL, dst->dev,
			       dst_output);
	}

	skb->dev = dst->dev;
	/* ipv6_local_error() does not require socket lock,
	 * we promote our socket to non const
	 */
	ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);

	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return -EMSGSIZE;
}
EXPORT_SYMBOL(ip6_xmit);
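
/* A minimal usage sketch (not part of this file): a transport protocol
 * that already holds a routed skb transmits it roughly the way TCP
 * does, assuming fl6, opt and tclass were prepared by the caller:
 *
 *	err = ip6_xmit(sk, skb, &fl6, sk->sk_mark, opt, tclass);
 *	err = net_xmit_eval(err);
 */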

static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
	struct ip6_ra_chain *ra;
	struct sock *last = NULL;

	read_lock(&ip6_ra_lock);
	for (ra = ip6_ra_chain; ra; ra = ra->next) {
		struct sock *sk = ra->sk;
		if (sk && ra->sel == sel &&
		    (!sk->sk_bound_dev_if ||
		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
			struct ipv6_pinfo *np = inet6_sk(sk);

			if (np && np->rtalert_isolate &&
			    !net_eq(sock_net(sk), dev_net(skb->dev))) {
				continue;
			}
			if (last) {
				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
				if (skb2)
					rawv6_rcv(last, skb2);
			}
			last = sk;
		}
	}

	if (last) {
		rawv6_rcv(last, skb);
		read_unlock(&ip6_ra_lock);
		return 1;
	}
	read_unlock(&ip6_ra_lock);
	return 0;
}

static int ip6_forward_proxy_check(struct sk_buff *skb)
{
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	u8 nexthdr = hdr->nexthdr;
	__be16 frag_off;
	int offset;

	if (ipv6_ext_hdr(nexthdr)) {
		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
		if (offset < 0)
			return 0;
	} else
		offset = sizeof(struct ipv6hdr);

	if (nexthdr == IPPROTO_ICMPV6) {
		struct icmp6hdr *icmp6;

		if (!pskb_may_pull(skb, (skb_network_header(skb) +
					 offset + 1 - skb->data)))
			return 0;

		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);

		switch (icmp6->icmp6_type) {
		case NDISC_ROUTER_SOLICITATION:
		case NDISC_ROUTER_ADVERTISEMENT:
		case NDISC_NEIGHBOUR_SOLICITATION:
		case NDISC_NEIGHBOUR_ADVERTISEMENT:
		case NDISC_REDIRECT:
			/* Pass unicast neighbour discovery messages
			 * destined to the proxied address to the input
			 * function.
			 */
			return 1;
		default:
			break;
		}
	}

	/*
	 * The proxying router can't forward traffic sent to a link-local
	 * address, so signal the sender and discard the packet. This
	 * behavior is clarified by the MIPv6 specification.
	 */
	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
		dst_link_failure(skb);
		return -1;
	}

	return 0;
}

static inline int ip6_forward_finish(struct net *net, struct sock *sk,
				     struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);

	__IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
	__IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);

#ifdef CONFIG_NET_SWITCHDEV
	if (skb->offload_l3_fwd_mark) {
		consume_skb(skb);
		return 0;
	}
#endif

	skb->tstamp = 0;
	return dst_output(net, sk, skb);
}

static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
{
	if (skb->len <= mtu)
		return false;

	/* ipv6 conntrack defrag sets max_frag_size + ignore_df */
	if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
		return true;

	if (skb->ignore_df)
		return false;

	if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
		return false;

	return true;
}

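/* Forward one IPv6 packet: verify that forwarding is enabled, handle the
 * Router Alert and proxy NDP special cases, enforce the hop limit and the
 * path MTU, emit redirects where appropriate, and finally decrement
 * hop_limit and pass the packet through the NF_INET_FORWARD hook.
 */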
int ip6_forward(struct sk_buff *skb)
{
	struct inet6_dev *idev = __in6_dev_get_safely(skb->dev);
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	struct inet6_skb_parm *opt = IP6CB(skb);
	struct net *net = dev_net(dst->dev);
	u32 mtu;

	if (net->ipv6.devconf_all->forwarding == 0)
		goto error;

	if (skb->pkt_type != PACKET_HOST)
		goto drop;

	if (unlikely(skb->sk))
		goto drop;

	if (skb_warn_if_lro(skb))
		goto drop;

	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
		goto drop;
	}

	skb_forward_csum(skb);

	/*
	 *	We DO NOT do any processing on RA packets; we push them to
	 *	user level AS IS, without any warranty that the application
	 *	will be able to interpret them. The reason is that we
	 *	cannot make anything clever here.
	 *
	 *	We are not an end node, so if the packet contains
	 *	AH/ESP we cannot do anything with it.
	 *	Defragmentation would also be a mistake: RA packets
	 *	cannot be fragmented, because there is no warranty
	 *	that different fragments will go along one path. --ANK
	 */
	if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
		if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
			return 0;
	}

	/*
	 *	check and decrement ttl
	 */
	if (hdr->hop_limit <= 1) {
		/* Force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);

		kfree_skb(skb);
		return -ETIMEDOUT;
	}

	/* XXX: idev->cnf.proxy_ndp? */
	if (net->ipv6.devconf_all->proxy_ndp &&
	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
		int proxied = ip6_forward_proxy_check(skb);
		if (proxied > 0)
			return ip6_input(skb);
		else if (proxied < 0) {
			__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
			goto drop;
		}
	}

	if (!xfrm6_route_forward(skb)) {
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
		goto drop;
	}
	dst = skb_dst(skb);

	/* IPv6 specs say nothing about it, but it is clear that we cannot
	   send redirects to source routed frames.
	   We don't send redirects to frames decapsulated from IPsec.
	 */
	if (IP6CB(skb)->iif == dst->dev->ifindex &&
	    opt->srcrt == 0 && !skb_sec_path(skb)) {
		struct in6_addr *target = NULL;
		struct inet_peer *peer;
		struct rt6_info *rt;

		/*
		 *	incoming and outgoing devices are the same,
		 *	so send a redirect.
		 */

		rt = (struct rt6_info *) dst;
		if (rt->rt6i_flags & RTF_GATEWAY)
			target = &rt->rt6i_gateway;
		else
			target = &hdr->daddr;

		peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);

		/* Limit redirects both by destination (here)
		   and by source (inside ndisc_send_redirect)
		 */
		if (inet_peer_xrlim_allow(peer, 1*HZ))
			ndisc_send_redirect(skb, target);
		if (peer)
			inet_putpeer(peer);
	} else {
		int addrtype = ipv6_addr_type(&hdr->saddr);

		/* This check is security critical. */
		if (addrtype == IPV6_ADDR_ANY ||
		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
			goto error;
		if (addrtype & IPV6_ADDR_LINKLOCAL) {
			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
				    ICMPV6_NOT_NEIGHBOUR, 0);
			goto error;
		}
	}

	mtu = ip6_dst_mtu_forward(dst);
	if (mtu < IPV6_MIN_MTU)
		mtu = IPV6_MIN_MTU;

	if (ip6_pkt_too_big(skb, mtu)) {
		/* Again, force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INTOOBIGERRORS);
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	if (skb_cow(skb, dst->dev->hard_header_len)) {
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_OUTDISCARDS);
		goto drop;
	}

	hdr = ipv6_hdr(skb);

	/* Mangling the hop limit is delayed until after the skb COW */

	hdr->hop_limit--;

	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
		       net, NULL, skb, skb->dev, dst->dev,
		       ip6_forward_finish);

error:
	__IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
drop:
	kfree_skb(skb);
	return -EINVAL;
}

static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	skb_dst_drop(to);
	skb_dst_set(to, dst_clone(skb_dst(from)));
	to->dev = from->dev;
	to->mark = from->mark;

	skb_copy_hash(to, from);

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
	nf_copy(to, from);
	skb_ext_copy(to, from);
	skb_copy_secmark(to, from);
}

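/* Fragment an IPv6 packet that exceeds the path MTU. When the skb already
 * carries a well-formed frag_list, the fast path reuses those buffers and
 * only prepends a per-fragment header; otherwise the slow path allocates a
 * fresh skb for each fragment and copies the payload into it. Every
 * fragment is handed to @output.
 */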
int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
		 int (*output)(struct net *, struct sock *, struct sk_buff *))
{
	struct sk_buff *frag;
	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
	struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
				inet6_sk(skb->sk) : NULL;
	struct ipv6hdr *tmp_hdr;
	struct frag_hdr *fh;
	unsigned int mtu, hlen, left, len;
	int hroom, troom;
	__be32 frag_id;
	int ptr, offset = 0, err = 0;
	u8 *prevhdr, nexthdr = 0;

	err = ip6_find_1stfragopt(skb, &prevhdr);
	if (err < 0)
		goto fail;
	hlen = err;
	nexthdr = *prevhdr;

	mtu = ip6_skb_dst_mtu(skb);

	/* We must not fragment if the socket is set to force MTU discovery
	 * or if the skb was not generated by a local socket.
	 */
	if (unlikely(!skb->ignore_df && skb->len > mtu))
		goto fail_toobig;

	if (IP6CB(skb)->frag_max_size) {
		if (IP6CB(skb)->frag_max_size > mtu)
			goto fail_toobig;

		/* don't send fragments larger than what we received */
		mtu = IP6CB(skb)->frag_max_size;
		if (mtu < IPV6_MIN_MTU)
			mtu = IPV6_MIN_MTU;
	}

	if (np && np->frag_size < mtu) {
		if (np->frag_size)
			mtu = np->frag_size;
	}
	if (mtu < hlen + sizeof(struct frag_hdr) + 8)
		goto fail_toobig;
	mtu -= hlen + sizeof(struct frag_hdr);

	frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
				    &ipv6_hdr(skb)->saddr);

	if (skb->ip_summed == CHECKSUM_PARTIAL &&
	    (err = skb_checksum_help(skb)))
		goto fail;

	hroom = LL_RESERVED_SPACE(rt->dst.dev);
	if (skb_has_frag_list(skb)) {
		unsigned int first_len = skb_pagelen(skb);
		struct sk_buff *frag2;

		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    skb_cloned(skb) ||
		    skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
			goto slow_path;

		skb_walk_frags(skb, frag) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
				goto slow_path_clean;

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path_clean;

			BUG_ON(frag->sk);
			if (skb->sk) {
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
			}
			skb->truesize -= frag->truesize;
		}

		err = 0;
		offset = 0;
		/* BUILD HEADER */

		*prevhdr = NEXTHDR_FRAGMENT;
		tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
		if (!tmp_hdr) {
			err = -ENOMEM;
			goto fail;
		}
		frag = skb_shinfo(skb)->frag_list;
		skb_frag_list_init(skb);

		__skb_pull(skb, hlen);
		fh = __skb_push(skb, sizeof(struct frag_hdr));
		__skb_push(skb, hlen);
		skb_reset_network_header(skb);
		memcpy(skb_network_header(skb), tmp_hdr, hlen);

		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		fh->frag_off = htons(IP6_MF);
		fh->identification = frag_id;

		first_len = skb_pagelen(skb);
		skb->data_len = first_len - skb_headlen(skb);
		skb->len = first_len;
		ipv6_hdr(skb)->payload_len = htons(first_len -
						   sizeof(struct ipv6hdr));

		for (;;) {
			/* Prepare the header of the next frame,
			 * before the previous one goes down. */
			if (frag) {
				frag->ip_summed = CHECKSUM_NONE;
				skb_reset_transport_header(frag);
				fh = __skb_push(frag, sizeof(struct frag_hdr));
				__skb_push(frag, hlen);
				skb_reset_network_header(frag);
				memcpy(skb_network_header(frag), tmp_hdr,
				       hlen);
				offset += skb->len - hlen - sizeof(struct frag_hdr);
				fh->nexthdr = nexthdr;
				fh->reserved = 0;
				fh->frag_off = htons(offset);
				if (frag->next)
					fh->frag_off |= htons(IP6_MF);
				fh->identification = frag_id;
				ipv6_hdr(frag)->payload_len =
						htons(frag->len -
						      sizeof(struct ipv6hdr));
				ip6_copy_metadata(frag, skb);
			}

			err = output(net, sk, skb);
			if (!err)
				IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
					      IPSTATS_MIB_FRAGCREATES);

			if (err || !frag)
				break;

			skb = frag;
			frag = skb->next;
			skb_mark_not_on_list(skb);
		}

		kfree(tmp_hdr);

		if (err == 0) {
			IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
				      IPSTATS_MIB_FRAGOKS);
			return 0;
		}

		kfree_skb_list(frag);

		IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
			      IPSTATS_MIB_FRAGFAILS);
		return err;

slow_path_clean:
		skb_walk_frags(skb, frag2) {
			if (frag2 == frag)
				break;
			frag2->sk = NULL;
			frag2->destructor = NULL;
			skb->truesize += frag2->truesize;
		}
	}

slow_path:
	left = skb->len - hlen;		/* Space per frame */
	ptr = hlen;			/* Where to start from */

	/*
	 *	Fragment the datagram.
	 */

	troom = rt->dst.dev->needed_tailroom;

	/*
	 *	Keep copying data until we run out.
	 */
	while (left > 0)	{
		u8 *fragnexthdr_offset;

		len = left;
		/* IF: it doesn't fit, use 'mtu' - the data space left */
		if (len > mtu)
			len = mtu;
		/* IF: we are not sending up to and including the packet end
		   then align the next start on an eight byte boundary */
		if (len < left)	{
			len &= ~7;
		}

		/* Allocate buffer */
		frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
				 hroom + troom, GFP_ATOMIC);
		if (!frag) {
			err = -ENOMEM;
			goto fail;
		}

		/*
		 *	Set up data on packet
		 */

		ip6_copy_metadata(frag, skb);
		skb_reserve(frag, hroom);
		skb_put(frag, len + hlen + sizeof(struct frag_hdr));
		skb_reset_network_header(frag);
		fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
		frag->transport_header = (frag->network_header + hlen +
					  sizeof(struct frag_hdr));

		/*
		 *	Charge the memory for the fragment to any owner
		 *	it might possess
		 */
		if (skb->sk)
			skb_set_owner_w(frag, skb->sk);

		/*
		 *	Copy the packet header into the new buffer.
		 */
		skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);

		fragnexthdr_offset = skb_network_header(frag);
		fragnexthdr_offset += prevhdr - skb_network_header(skb);
		*fragnexthdr_offset = NEXTHDR_FRAGMENT;

		/*
		 *	Build fragment header.
		 */
		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		fh->identification = frag_id;

		/*
		 *	Copy a block of the IP datagram.
		 */
		BUG_ON(skb_copy_bits(skb, ptr, skb_transport_header(frag),
				     len));
		left -= len;

		fh->frag_off = htons(offset);
		if (left > 0)
			fh->frag_off |= htons(IP6_MF);
		ipv6_hdr(frag)->payload_len = htons(frag->len -
						    sizeof(struct ipv6hdr));

		ptr += len;
		offset += len;

		/*
		 *	Put this fragment into the sending queue.
		 */
		err = output(net, sk, frag);
		if (err)
			goto fail;

		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_FRAGCREATES);
	}
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGOKS);
	consume_skb(skb);
	return err;

fail_toobig:
	if (skb->sk && dst_allfrag(skb_dst(skb)))
		sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);

	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
	err = -EMSGSIZE;

fail:
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return err;
}

static inline int ip6_rt_check(const struct rt6key *rt_key,
			       const struct in6_addr *fl_addr,
			       const struct in6_addr *addr_cache)
{
	return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
		(!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
}

static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
					  struct dst_entry *dst,
					  const struct flowi6 *fl6)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct rt6_info *rt;

	if (!dst)
		goto out;

	if (dst->ops->family != AF_INET6) {
		dst_release(dst);
		return NULL;
	}

	rt = (struct rt6_info *)dst;
	/* Yes, checking route validity in the not connected
	 * case is not very simple. Take into account that we
	 * do not support routing by source, TOS, and
	 * MSG_DONTROUTE		--ANK (980726)
	 *
	 * 1. ip6_rt_check(): If the route was a host route,
	 *    check that the cached destination is current.
	 *    If it is a network route, we still may
	 *    check its validity using a saved pointer
	 *    to the last used address: daddr_cache.
	 *    We do not want to save the whole address now,
	 *    (because the main consumer of this service
	 *    is tcp, which does not have this problem),
	 *    so the last trick works only on connected
	 *    sockets.
	 * 2. oif also should be the same.
	 */
	if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
#ifdef CONFIG_IPV6_SUBTREES
	    ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
#endif
	   (!(fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) &&
	      (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex))) {
		dst_release(dst);
		dst = NULL;
	}

out:
	return dst;
}

static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
			       struct dst_entry **dst, struct flowi6 *fl6)
{
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	struct neighbour *n;
	struct rt6_info *rt;
#endif
	int err;
	int flags = 0;

	/* The correct way to handle this would be to do
	 * ip6_route_get_saddr, and then ip6_route_output; however,
	 * the route-specific preferred source forces the
	 * ip6_route_output call _before_ ip6_route_get_saddr.
	 *
	 * In source specific routing (no src=any default route),
	 * ip6_route_output will fail given src=any saddr, though, so
	 * that's why we try it again later.
	 */
	if (ipv6_addr_any(&fl6->saddr) && (!*dst || !(*dst)->error)) {
		struct fib6_info *from;
		struct rt6_info *rt;
		bool had_dst = *dst != NULL;

		if (!had_dst)
			*dst = ip6_route_output(net, sk, fl6);
		rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;

		rcu_read_lock();
		from = rt ? rcu_dereference(rt->from) : NULL;
		err = ip6_route_get_saddr(net, from, &fl6->daddr,
					  sk ? inet6_sk(sk)->srcprefs : 0,
					  &fl6->saddr);
		rcu_read_unlock();

		if (err)
			goto out_err_release;

		/* If we had an erroneous initial result, pretend it
		 * never existed and let the SA-enabled version take
		 * over.
		 */
		if (!had_dst && (*dst)->error) {
			dst_release(*dst);
			*dst = NULL;
		}

		if (fl6->flowi6_oif)
			flags |= RT6_LOOKUP_F_IFACE;
	}

	if (!*dst)
		*dst = ip6_route_output_flags(net, sk, fl6, flags);

	err = (*dst)->error;
	if (err)
		goto out_err_release;

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	/*
	 * Here if the dst entry we've looked up
	 * has a neighbour entry that is in the INCOMPLETE
	 * state and the src address from the flow is
	 * marked as OPTIMISTIC, we release the found
	 * dst entry and replace it instead with the
	 * dst entry of the nexthop router
	 */
	rt = (struct rt6_info *) *dst;
	rcu_read_lock_bh();
	n = __ipv6_neigh_lookup_noref(rt->dst.dev,
				      rt6_nexthop(rt, &fl6->daddr));
	err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
	rcu_read_unlock_bh();

	if (err) {
		struct inet6_ifaddr *ifp;
		struct flowi6 fl_gw6;
		int redirect;

		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
				      (*dst)->dev, 1);

		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
		if (ifp)
			in6_ifa_put(ifp);

		if (redirect) {
			/*
			 * We need to get the dst entry for the
			 * default router instead
			 */
			dst_release(*dst);
			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
			*dst = ip6_route_output(net, sk, &fl_gw6);
			err = (*dst)->error;
			if (err)
				goto out_err_release;
		}
	}
#endif
	if (ipv6_addr_v4mapped(&fl6->saddr) &&
	    !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
		err = -EAFNOSUPPORT;
		goto out_err_release;
	}

	return 0;

out_err_release:
	dst_release(*dst);
	*dst = NULL;

	if (err == -ENETUNREACH)
		IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
	return err;
}

/**
 *	ip6_dst_lookup - perform route lookup on flow
 *	@net: Network namespace to perform lookup in
 *	@sk: socket which provides route info
 *	@dst: pointer to dst_entry * for result
 *	@fl6: flow to lookup
 *
 *	This function performs a route lookup on the given flow.
 *
 *	It returns zero on success, or a standard errno code on error.
 */
int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
		   struct flowi6 *fl6)
{
	*dst = NULL;
	return ip6_dst_lookup_tail(net, sk, dst, fl6);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup);
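
/* A minimal usage sketch (not part of this file), assuming the caller has
 * already filled in fl6; the caller owns the resulting dst reference:
 *
 *	struct dst_entry *dst;
 *	int err = ip6_dst_lookup(net, sk, &dst, &fl6);
 *
 *	if (err)
 *		return err;
 *	...
 *	dst_release(dst);
 */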

/**
 *	ip6_dst_lookup_flow - perform route lookup on flow with ipsec
 *	@sk: socket which provides route info
 *	@fl6: flow to lookup
 *	@final_dst: final destination address for ipsec lookup
 *
 *	This function performs a route lookup on the given flow.
 *
 *	It returns a valid dst pointer on success, or a pointer encoded
 *	error code.
 */
struct dst_entry *ip6_dst_lookup_flow(const struct sock *sk, struct flowi6 *fl6,
				      const struct in6_addr *final_dst)
{
	struct dst_entry *dst = NULL;
	int err;

	err = ip6_dst_lookup_tail(sock_net(sk), sk, &dst, fl6);
	if (err)
		return ERR_PTR(err);
	if (final_dst)
		fl6->daddr = *final_dst;

	return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
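
/* A minimal usage sketch (not part of this file); the error is pointer
 * encoded, so it must be unwrapped with IS_ERR()/PTR_ERR():
 *
 *	struct dst_entry *dst = ip6_dst_lookup_flow(sk, &fl6, NULL);
 *
 *	if (IS_ERR(dst))
 *		return PTR_ERR(dst);
 */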

/**
 *	ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
 *	@sk: socket which provides the dst cache and route info
 *	@fl6: flow to lookup
 *	@final_dst: final destination address for ipsec lookup
 *	@connected: whether @sk is connected or not
 *
 *	This function performs a route lookup on the given flow with the
 *	possibility of using the cached route in the socket if it is valid.
 *	It will take the socket dst lock when operating on the dst cache.
 *	As a result, this function can only be used in process context.
 *
 *	In addition, for a connected socket, cache the dst in the socket
 *	if the current cache is not valid.
 *
 *	It returns a valid dst pointer on success, or a pointer encoded
 *	error code.
 */
struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
					 const struct in6_addr *final_dst,
					 bool connected)
{
	struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);

	dst = ip6_sk_dst_check(sk, dst, fl6);
	if (dst)
		return dst;

	dst = ip6_dst_lookup_flow(sk, fl6, final_dst);
	if (connected && !IS_ERR(dst))
		ip6_sk_dst_store_flow(sk, dst_clone(dst), fl6);

	return dst;
}
EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
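
/* A minimal usage sketch (not part of this file), roughly the pattern a
 * datagram sendmsg path uses, where final_p and connected were computed
 * earlier by the caller:
 *
 *	dst = ip6_sk_dst_lookup_flow(sk, &fl6, final_p, connected);
 *	if (IS_ERR(dst)) {
 *		err = PTR_ERR(dst);
 *		goto out;
 *	}
 */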

static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
					       gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
						gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

static void ip6_append_data_mtu(unsigned int *mtu,
				int *maxfraglen,
				unsigned int fragheaderlen,
				struct sk_buff *skb,
				struct rt6_info *rt,
				unsigned int orig_mtu)
{
	if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
		if (!skb) {
			/* first fragment, reserve header_len */
			*mtu = orig_mtu - rt->dst.header_len;

		} else {
			/*
			 * this fragment is not the first; the header
			 * space is regarded as data space.
			 */
			*mtu = orig_mtu;
		}
		*maxfraglen = ((*mtu - fragheaderlen) & ~7)
			      + fragheaderlen - sizeof(struct frag_hdr);
	}
}

static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
			  struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
			  struct rt6_info *rt, struct flowi6 *fl6)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	unsigned int mtu;
	struct ipv6_txoptions *opt = ipc6->opt;

	/*
	 * setup for corking
	 */
	if (opt) {
		if (WARN_ON(v6_cork->opt))
			return -EINVAL;

		v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
		if (unlikely(!v6_cork->opt))
			return -ENOBUFS;

		v6_cork->opt->tot_len = sizeof(*opt);
		v6_cork->opt->opt_flen = opt->opt_flen;
		v6_cork->opt->opt_nflen = opt->opt_nflen;

		v6_cork->opt->dst0opt = ip6_opt_dup(opt->dst0opt,
						    sk->sk_allocation);
		if (opt->dst0opt && !v6_cork->opt->dst0opt)
			return -ENOBUFS;

		v6_cork->opt->dst1opt = ip6_opt_dup(opt->dst1opt,
						    sk->sk_allocation);
		if (opt->dst1opt && !v6_cork->opt->dst1opt)
			return -ENOBUFS;

		v6_cork->opt->hopopt = ip6_opt_dup(opt->hopopt,
						   sk->sk_allocation);
		if (opt->hopopt && !v6_cork->opt->hopopt)
			return -ENOBUFS;

		v6_cork->opt->srcrt = ip6_rthdr_dup(opt->srcrt,
						    sk->sk_allocation);
		if (opt->srcrt && !v6_cork->opt->srcrt)
			return -ENOBUFS;

		/* need source address above miyazawa */
	}
	dst_hold(&rt->dst);
	cork->base.dst = &rt->dst;
	cork->fl.u.ip6 = *fl6;
	v6_cork->hop_limit = ipc6->hlimit;
	v6_cork->tclass = ipc6->tclass;
	if (rt->dst.flags & DST_XFRM_TUNNEL)
		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
		      READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst);
	else
		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
			READ_ONCE(rt->dst.dev->mtu) : dst_mtu(xfrm_dst_path(&rt->dst));
	if (np->frag_size < mtu) {
		if (np->frag_size)
			mtu = np->frag_size;
	}
	if (mtu < IPV6_MIN_MTU)
		return -EINVAL;
	cork->base.fragsize = mtu;
	cork->base.gso_size = ipc6->gso_size;
	cork->base.tx_flags = 0;
	sock_tx_timestamp(sk, ipc6->sockc.tsflags, &cork->base.tx_flags);

	if (dst_allfrag(xfrm_dst_path(&rt->dst)))
		cork->base.flags |= IPCORK_ALLFRAG;
	cork->base.length = 0;

	cork->base.transmit_time = ipc6->sockc.transmit_time;

	return 0;
}


static int __ip6_append_data(struct sock *sk,
			     struct flowi6 *fl6,
			     struct sk_buff_head *queue,
			     struct inet_cork *cork,
			     struct inet6_cork *v6_cork,
			     struct page_frag *pfrag,
			     int getfrag(void *from, char *to, int offset,
					 int len, int odd, struct sk_buff *skb),
			     void *from, int length, int transhdrlen,
			     unsigned int flags, struct ipcm6_cookie *ipc6)
{
	struct sk_buff *skb, *skb_prev = NULL;
	unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
	struct ubuf_info *uarg = NULL;
	int exthdrlen = 0;
	int dst_exthdrlen = 0;
	int hh_len;
	int copy;
	int err;
	int offset = 0;
	u32 tskey = 0;
	struct rt6_info *rt = (struct rt6_info *)cork->dst;
	struct ipv6_txoptions *opt = v6_cork->opt;
	int csummode = CHECKSUM_NONE;
	unsigned int maxnonfragsize, headersize;
	unsigned int wmem_alloc_delta = 0;
	bool paged, extra_uref;

	skb = skb_peek_tail(queue);
	if (!skb) {
		exthdrlen = opt ? opt->opt_flen : 0;
		dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
	}

	paged = !!cork->gso_size;
	mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize;
	orig_mtu = mtu;

	if (cork->tx_flags & SKBTX_ANY_SW_TSTAMP &&
	    sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
		tskey = sk->sk_tskey++;

	hh_len = LL_RESERVED_SPACE(rt->dst.dev);

	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
			(opt ? opt->opt_nflen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
		     sizeof(struct frag_hdr);

	headersize = sizeof(struct ipv6hdr) +
		     (opt ? opt->opt_flen + opt->opt_nflen : 0) +
		     (dst_allfrag(&rt->dst) ?
		      sizeof(struct frag_hdr) : 0) +
		     rt->rt6i_nfheader_len;

	/* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit
	 * the first fragment
	 */
	if (headersize + transhdrlen > mtu)
		goto emsgsize;

	if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
	    (sk->sk_protocol == IPPROTO_UDP ||
	     sk->sk_protocol == IPPROTO_RAW)) {
		ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
				sizeof(struct ipv6hdr));
		goto emsgsize;
	}

	if (ip6_sk_ignore_df(sk))
		maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
	else
		maxnonfragsize = mtu;

	if (cork->length + length > maxnonfragsize - headersize) {
emsgsize:
		pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0);
		ipv6_local_error(sk, EMSGSIZE, fl6, pmtu);
		return -EMSGSIZE;
	}

	/* CHECKSUM_PARTIAL only with no extension headers and when
	 * we are not going to fragment
	 */
	if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
	    headersize == sizeof(struct ipv6hdr) &&
	    length <= mtu - headersize &&
	    (!(flags & MSG_MORE) || cork->gso_size) &&
	    rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
		csummode = CHECKSUM_PARTIAL;

	if (flags & MSG_ZEROCOPY && length && sock_flag(sk, SOCK_ZEROCOPY)) {
		uarg = sock_zerocopy_realloc(sk, length, skb_zcopy(skb));
		if (!uarg)
			return -ENOBUFS;
		extra_uref = true;
		if (rt->dst.dev->features & NETIF_F_SG &&
		    csummode == CHECKSUM_PARTIAL) {
			paged = true;
		} else {
			uarg->zerocopy = 0;
			skb_zcopy_set(skb, uarg, &extra_uref);
		}
	}

	/*
	 * Let's try using as much space as possible.
	 * Use the MTU if the total length of the message fits into it.
	 * Otherwise, we need to reserve the fragment header and
	 * fragment alignment (= 8-15 octets, in total).
	 *
	 * Note that we may need to "move" the data from the tail
	 * of the buffer to the new fragment when we split
	 * the message.
	 *
	 * FIXME: It may be fragmented into multiple chunks
	 *        at once if non-fragmentable extension headers
	 *        are too large.
	 * --yoshfuji
	 */

	cork->length += length;
	if (!skb)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into current packet. */
		copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;

		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen;
			unsigned int pagedlen;
alloc_new_skb:
			/* There's no room in the current skb */
			if (skb)
				fraggap = skb->len - maxfraglen;
			else
				fraggap = 0;
			/* update mtu and maxfraglen if necessary */
			if (!skb || !skb_prev)
				ip6_append_data_mtu(&mtu, &maxfraglen,
						    fragheaderlen, skb, rt,
						    orig_mtu);

			skb_prev = skb;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;

			if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
				datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
			fraglen = datalen + fragheaderlen;
			pagedlen = 0;

			if ((flags & MSG_MORE) &&
			    !(rt->dst.dev->features&NETIF_F_SG))
				alloclen = mtu;
			else if (!paged)
				alloclen = fraglen;
			else {
				alloclen = min_t(int, fraglen, MAX_HEADER);
				pagedlen = fraglen - alloclen;
			}

			alloclen += dst_exthdrlen;

			if (datalen != length + fraggap) {
				/*
				 * this is not the last fragment, the trailer
				 * space is regarded as data space.
				 */
				datalen += rt->dst.trailer_len;
			}

			alloclen += rt->dst.trailer_len;
			fraglen = datalen + fragheaderlen;

			/*
			 * We just reserve space for the fragment header.
			 * Note: this may be an overallocation if the message
			 * (without MSG_MORE) fits into the MTU.
			 */
			alloclen += sizeof(struct frag_hdr);

			copy = datalen - transhdrlen - fraggap - pagedlen;
			if (copy < 0) {
				err = -EINVAL;
				goto error;
			}
			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk,
						alloclen + hh_len,
						(flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <=
				    2 * sk->sk_sndbuf)
					skb = alloc_skb(alloclen + hh_len,
							sk->sk_allocation);
				if (unlikely(!skb))
					err = -ENOBUFS;
			}
			if (!skb)
				goto error;
			/*
			 *	Fill in the control structures
			 */
			skb->protocol = htons(ETH_P_IPV6);
			skb->ip_summed = csummode;
			skb->csum = 0;
			/* reserve for fragmentation and ipsec header */
			skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
				    dst_exthdrlen);

			/*
			 *	Find where to start putting bytes
			 */
			data = skb_put(skb, fraglen - pagedlen);
			skb_set_network_header(skb, exthdrlen);
			data += fragheaderlen;
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}
			if (copy > 0 &&
			    getfrag(from, data + transhdrlen, offset,
				    copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= copy + transhdrlen;
			transhdrlen = 0;
			exthdrlen = 0;
			dst_exthdrlen = 0;

			/* Only the initial fragment is time stamped */
			skb_shinfo(skb)->tx_flags = cork->tx_flags;
			cork->tx_flags = 0;
			skb_shinfo(skb)->tskey = tskey;
			tskey = 0;
			skb_zcopy_set(skb, uarg, &extra_uref);

			if ((flags & MSG_CONFIRM) && !skb_prev)
				skb_set_dst_pending_confirm(skb, 1);

			/*
			 * Put the packet on the pending queue
			 */
			if (!skb->destructor) {
				skb->destructor = sock_wfree;
				skb->sk = sk;
				wmem_alloc_delta += skb->truesize;
			}
			__skb_queue_tail(queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		if (!(rt->dst.dev->features&NETIF_F_SG) &&
		    skb_tailroom(skb) >= copy) {
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
						offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else if (!uarg || !uarg->zerocopy) {
			int i = skb_shinfo(skb)->nr_frags;

			err = -ENOMEM;
			if (!sk_page_frag_refill(sk, pfrag))
				goto error;

			if (!skb_can_coalesce(skb, i, pfrag->page,
					      pfrag->offset)) {
				err = -EMSGSIZE;
				if (i == MAX_SKB_FRAGS)
					goto error;

				__skb_fill_page_desc(skb, i, pfrag->page,
						     pfrag->offset, 0);
				skb_shinfo(skb)->nr_frags = ++i;
				get_page(pfrag->page);
			}
			copy = min_t(int, copy, pfrag->size - pfrag->offset);
			if (getfrag(from,
				    page_address(pfrag->page) + pfrag->offset,
				    offset, copy, skb->len, skb) < 0)
				goto error_efault;

			pfrag->offset += copy;
			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			wmem_alloc_delta += copy;
		} else {
			err = skb_zerocopy_iter_dgram(skb, from, copy);
			if (err < 0)
				goto error;
		}
		offset += copy;
		length -= copy;
	}

	if (wmem_alloc_delta)
		refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
	return 0;

error_efault:
	err = -EFAULT;
error:
	if (uarg)
		sock_zerocopy_put_abort(uarg, extra_uref);
	cork->length -= length;
	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
	return err;
}

int ip6_append_data(struct sock *sk,
		    int getfrag(void *from, char *to, int offset, int len,
				int odd, struct sk_buff *skb),
		    void *from, int length, int transhdrlen,
		    struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
		    struct rt6_info *rt, unsigned int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	int exthdrlen;
	int err;

	if (flags&MSG_PROBE)
		return 0;
	if (skb_queue_empty(&sk->sk_write_queue)) {
		/*
		 * setup for corking
		 */
		err = ip6_setup_cork(sk, &inet->cork, &np->cork,
				     ipc6, rt, fl6);
		if (err)
			return err;

		exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
		length += exthdrlen;
		transhdrlen += exthdrlen;
	} else {
		fl6 = &inet->cork.fl.u.ip6;
		transhdrlen = 0;
	}

	return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base,
				 &np->cork, sk_page_frag(sk), getfrag,
				 from, length, transhdrlen, flags, ipc6);
}
EXPORT_SYMBOL_GPL(ip6_append_data);
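
/* A minimal corking lifecycle sketch (not part of this file), roughly the
 * pattern datagram senders follow, assuming dst, fl6 and ipc6 were set up
 * earlier by the caller:
 *
 *	lock_sock(sk);
 *	err = ip6_append_data(sk, ip_generic_getfrag, msg, len,
 *			      transhdrlen, &ipc6, &fl6,
 *			      (struct rt6_info *)dst, msg->msg_flags);
 *	if (err)
 *		ip6_flush_pending_frames(sk);
 *	else if (!(msg->msg_flags & MSG_MORE))
 *		err = ip6_push_pending_frames(sk);
 *	release_sock(sk);
 */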

static void ip6_cork_release(struct inet_cork_full *cork,
			     struct inet6_cork *v6_cork)
{
	if (v6_cork->opt) {
		kfree(v6_cork->opt->dst0opt);
		kfree(v6_cork->opt->dst1opt);
		kfree(v6_cork->opt->hopopt);
		kfree(v6_cork->opt->srcrt);
		kfree(v6_cork->opt);
		v6_cork->opt = NULL;
	}

	if (cork->base.dst) {
		dst_release(cork->base.dst);
		cork->base.dst = NULL;
		cork->base.flags &= ~IPCORK_ALLFRAG;
	}
	memset(&cork->fl, 0, sizeof(cork->fl));
}

struct sk_buff *__ip6_make_skb(struct sock *sk,
			       struct sk_buff_head *queue,
			       struct inet_cork_full *cork,
			       struct inet6_cork *v6_cork)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct net *net = sock_net(sk);
	struct ipv6hdr *hdr;
	struct ipv6_txoptions *opt = v6_cork->opt;
	struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
	struct flowi6 *fl6 = &cork->fl.u.ip6;
	unsigned char proto = fl6->flowi6_proto;

	skb = __skb_dequeue(queue);
	if (!skb)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Allow local fragmentation. */
	skb->ignore_df = ip6_sk_ignore_df(sk);

	*final_dst = fl6->daddr;
	__skb_pull(skb, skb_network_header_len(skb));
	if (opt && opt->opt_flen)
		ipv6_push_frag_opts(skb, opt, &proto);
	if (opt && opt->opt_nflen)
		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	ip6_flow_hdr(hdr, v6_cork->tclass,
		     ip6_make_flowlabel(net, skb, fl6->flowlabel,
					ip6_autoflowlabel(net, np), fl6));
	hdr->hop_limit = v6_cork->hop_limit;
	hdr->nexthdr = proto;
	hdr->saddr = fl6->saddr;
	hdr->daddr = *final_dst;

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	skb->tstamp = cork->base.transmit_time;

	skb_dst_set(skb, dst_clone(&rt->dst));
	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
	if (proto == IPPROTO_ICMPV6) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
		ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
	}

	ip6_cork_release(cork, v6_cork);
out:
	return skb;
}

int ip6_send_skb(struct sk_buff *skb)
{
	struct net *net = sock_net(skb->sk);
	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
	int err;

	err = ip6_local_out(net, skb->sk, skb);
	if (err) {
		if (err > 0)
			err = net_xmit_errno(err);
		if (err)
			IP6_INC_STATS(net, rt->rt6i_idev,
				      IPSTATS_MIB_OUTDISCARDS);
	}

	return err;
}

int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb;

	skb = ip6_finish_skb(sk);
	if (!skb)
		return 0;

	return ip6_send_skb(skb);
}
EXPORT_SYMBOL_GPL(ip6_push_pending_frames);

static void __ip6_flush_pending_frames(struct sock *sk,
				       struct sk_buff_head *queue,
				       struct inet_cork_full *cork,
				       struct inet6_cork *v6_cork)
{
	struct sk_buff *skb;

	while ((skb = __skb_dequeue_tail(queue)) != NULL) {
		if (skb_dst(skb))
			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
	}

	ip6_cork_release(cork, v6_cork);
}

void ip6_flush_pending_frames(struct sock *sk)
{
	__ip6_flush_pending_frames(sk, &sk->sk_write_queue,
				   &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
}
EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);

struct sk_buff *ip6_make_skb(struct sock *sk,
			     int getfrag(void *from, char *to, int offset,
					 int len, int odd, struct sk_buff *skb),
			     void *from, int length, int transhdrlen,
			     struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
			     struct rt6_info *rt, unsigned int flags,
			     struct inet_cork_full *cork)
{
	struct inet6_cork v6_cork;
	struct sk_buff_head queue;
	int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
	int err;

	if (flags & MSG_PROBE)
		return NULL;

	__skb_queue_head_init(&queue);

	cork->base.flags = 0;
	cork->base.addr = 0;
	cork->base.opt = NULL;
	cork->base.dst = NULL;
	v6_cork.opt = NULL;
	err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt, fl6);
	if (err) {
		ip6_cork_release(cork, &v6_cork);
		return ERR_PTR(err);
	}
	if (ipc6->dontfrag < 0)
		ipc6->dontfrag = inet6_sk(sk)->dontfrag;

	err = __ip6_append_data(sk, fl6, &queue, &cork->base, &v6_cork,
				&current->task_frag, getfrag, from,
				length + exthdrlen, transhdrlen + exthdrlen,
				flags, ipc6);
	if (err) {
		__ip6_flush_pending_frames(sk, &queue, cork, &v6_cork);
		return ERR_PTR(err);
	}

	return __ip6_make_skb(sk, &queue, cork, &v6_cork);
}
1823