xref: /openbmc/linux/net/ipv6/ip6_output.c (revision 22fc4c4c9fd60427bcda00878cee94e7622cfa7a)
1 /*
2  *	IPv6 output functions
3  *	Linux INET6 implementation
4  *
5  *	Authors:
6  *	Pedro Roque		<roque@di.fc.ul.pt>
7  *
8  *	Based on linux/net/ipv4/ip_output.c
9  *
10  *	This program is free software; you can redistribute it and/or
11  *      modify it under the terms of the GNU General Public License
12  *      as published by the Free Software Foundation; either version
13  *      2 of the License, or (at your option) any later version.
14  *
15  *	Changes:
16  *	A.N.Kuznetsov	:	arithmetic in fragmentation.
17  *				extension headers are implemented.
18  *				route changes now work.
19  *				ip6_forward does not confuse sniffers.
20  *				etc.
21  *
22  *      H. von Brand    :       Added missing #include <linux/string.h>
23  *	Imran Patel	:	frag id should be in NBO
24  *      Kazunori MIYAZAWA @USAGI
25  *			:       add ip6_append_data and related functions
26  *				for datagram xmit
27  */
28 
29 #include <linux/errno.h>
30 #include <linux/kernel.h>
31 #include <linux/string.h>
32 #include <linux/socket.h>
33 #include <linux/net.h>
34 #include <linux/netdevice.h>
35 #include <linux/if_arp.h>
36 #include <linux/in6.h>
37 #include <linux/tcp.h>
38 #include <linux/route.h>
39 #include <linux/module.h>
40 #include <linux/slab.h>
41 
42 #include <linux/bpf-cgroup.h>
43 #include <linux/netfilter.h>
44 #include <linux/netfilter_ipv6.h>
45 
46 #include <net/sock.h>
47 #include <net/snmp.h>
48 
49 #include <net/ipv6.h>
50 #include <net/ndisc.h>
51 #include <net/protocol.h>
52 #include <net/ip6_route.h>
53 #include <net/addrconf.h>
54 #include <net/rawv6.h>
55 #include <net/icmp.h>
56 #include <net/xfrm.h>
57 #include <net/checksum.h>
58 #include <linux/mroute6.h>
59 #include <net/l3mdev.h>
60 #include <net/lwtunnel.h>
61 
62 static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
63 {
64 	struct dst_entry *dst = skb_dst(skb);
65 	struct net_device *dev = dst->dev;
66 	struct neighbour *neigh;
67 	struct in6_addr *nexthop;
68 	int ret;
69 
70 	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
71 		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
72 
73 		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
74 		    ((mroute6_is_socket(net, skb) &&
75 		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
76 		     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
77 					 &ipv6_hdr(skb)->saddr))) {
78 			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
79 
80 			/* Do not check for IFF_ALLMULTI; multicast routing
81 			   is not supported in any case.
82 			 */
83 			if (newskb)
84 				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
85 					net, sk, newskb, NULL, newskb->dev,
86 					dev_loopback_xmit);
87 
88 			if (ipv6_hdr(skb)->hop_limit == 0) {
89 				IP6_INC_STATS(net, idev,
90 					      IPSTATS_MIB_OUTDISCARDS);
91 				kfree_skb(skb);
92 				return 0;
93 			}
94 		}
95 
96 		IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);
97 
98 		if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
99 		    IPV6_ADDR_SCOPE_NODELOCAL &&
100 		    !(dev->flags & IFF_LOOPBACK)) {
101 			kfree_skb(skb);
102 			return 0;
103 		}
104 	}
105 
106 	if (lwtunnel_xmit_redirect(dst->lwtstate)) {
107 		int res = lwtunnel_xmit(skb);
108 
109 		if (res < 0 || res == LWTUNNEL_XMIT_DONE)
110 			return res;
111 	}
112 
113 	rcu_read_lock_bh();
114 	nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
115 	neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
116 	if (unlikely(!neigh))
117 		neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
118 	if (!IS_ERR(neigh)) {
119 		sock_confirm_neigh(skb, neigh);
120 		ret = neigh_output(neigh, skb);
121 		rcu_read_unlock_bh();
122 		return ret;
123 	}
124 	rcu_read_unlock_bh();
125 
126 	IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
127 	kfree_skb(skb);
128 	return -EINVAL;
129 }
130 
131 static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
132 {
133 	int ret;
134 
135 	ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
136 	if (ret) {
137 		kfree_skb(skb);
138 		return ret;
139 	}
140 
141 #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
142 	/* Policy lookup after SNAT yielded a new policy */
143 	if (skb_dst(skb)->xfrm) {
144 		IPCB(skb)->flags |= IPSKB_REROUTED;
145 		return dst_output(net, sk, skb);
146 	}
147 #endif
148 
149 	if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
150 	    dst_allfrag(skb_dst(skb)) ||
151 	    (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
152 		return ip6_fragment(net, sk, skb, ip6_finish_output2);
153 	else
154 		return ip6_finish_output2(net, sk, skb);
155 }
156 
157 int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
158 {
159 	struct net_device *dev = skb_dst(skb)->dev;
160 	struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
161 
162 	skb->protocol = htons(ETH_P_IPV6);
163 	skb->dev = dev;
164 
165 	if (unlikely(idev->cnf.disable_ipv6)) {
166 		IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
167 		kfree_skb(skb);
168 		return 0;
169 	}
170 
171 	return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
172 			    net, sk, skb, NULL, dev,
173 			    ip6_finish_output,
174 			    !(IP6CB(skb)->flags & IP6SKB_REROUTED));
175 }
176 
177 bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np)
178 {
179 	if (!np->autoflowlabel_set)
180 		return ip6_default_np_autolabel(net);
181 	else
182 		return np->autoflowlabel;
183 }
184 
185 /*
186  * xmit an sk_buff (used by TCP, SCTP and DCCP)
187  * Note : socket lock is not held for SYNACK packets, but might be modified
188  * by calls to skb_set_owner_w() and ipv6_local_error(),
189  * which are using proper atomic operations or spinlocks.
190  */
191 int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
192 	     __u32 mark, struct ipv6_txoptions *opt, int tclass)
193 {
194 	struct net *net = sock_net(sk);
195 	const struct ipv6_pinfo *np = inet6_sk(sk);
196 	struct in6_addr *first_hop = &fl6->daddr;
197 	struct dst_entry *dst = skb_dst(skb);
198 	unsigned int head_room;
199 	struct ipv6hdr *hdr;
200 	u8  proto = fl6->flowi6_proto;
201 	int seg_len = skb->len;
202 	int hlimit = -1;
203 	u32 mtu;
204 
205 	head_room = sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
206 	if (opt)
207 		head_room += opt->opt_nflen + opt->opt_flen;
208 
209 	if (unlikely(skb_headroom(skb) < head_room)) {
210 		struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
211 		if (!skb2) {
212 			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
213 				      IPSTATS_MIB_OUTDISCARDS);
214 			kfree_skb(skb);
215 			return -ENOBUFS;
216 		}
217 		if (skb->sk)
218 			skb_set_owner_w(skb2, skb->sk);
219 		consume_skb(skb);
220 		skb = skb2;
221 	}
222 
223 	if (opt) {
224 		seg_len += opt->opt_nflen + opt->opt_flen;
225 
226 		if (opt->opt_flen)
227 			ipv6_push_frag_opts(skb, opt, &proto);
228 
229 		if (opt->opt_nflen)
230 			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
231 					     &fl6->saddr);
232 	}
233 
234 	skb_push(skb, sizeof(struct ipv6hdr));
235 	skb_reset_network_header(skb);
236 	hdr = ipv6_hdr(skb);
237 
238 	/*
239 	 *	Fill in the IPv6 header
240 	 */
241 	if (np)
242 		hlimit = np->hop_limit;
243 	if (hlimit < 0)
244 		hlimit = ip6_dst_hoplimit(dst);
245 
246 	ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
247 				ip6_autoflowlabel(net, np), fl6));
248 
249 	hdr->payload_len = htons(seg_len);
250 	hdr->nexthdr = proto;
251 	hdr->hop_limit = hlimit;
252 
253 	hdr->saddr = fl6->saddr;
254 	hdr->daddr = *first_hop;
255 
256 	skb->protocol = htons(ETH_P_IPV6);
257 	skb->priority = sk->sk_priority;
258 	skb->mark = mark;
259 
260 	mtu = dst_mtu(dst);
261 	if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
262 		IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
263 			      IPSTATS_MIB_OUT, skb->len);
264 
265 		/* if egress device is enslaved to an L3 master device pass the
266 		 * skb to its handler for processing
267 		 */
268 		skb = l3mdev_ip6_out((struct sock *)sk, skb);
269 		if (unlikely(!skb))
270 			return 0;
271 
272 		/* hooks should never assume socket lock is held.
273 		 * we promote our socket to non const
274 		 */
275 		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
276 			       net, (struct sock *)sk, skb, NULL, dst->dev,
277 			       dst_output);
278 	}
279 
280 	skb->dev = dst->dev;
281 	/* ipv6_local_error() does not require socket lock,
282 	 * we promote our socket to non const
283 	 */
284 	ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);
285 
286 	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
287 	kfree_skb(skb);
288 	return -EMSGSIZE;
289 }
290 EXPORT_SYMBOL(ip6_xmit);
291 
292 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
293 {
294 	struct ip6_ra_chain *ra;
295 	struct sock *last = NULL;
296 
297 	read_lock(&ip6_ra_lock);
298 	for (ra = ip6_ra_chain; ra; ra = ra->next) {
299 		struct sock *sk = ra->sk;
300 		if (sk && ra->sel == sel &&
301 		    (!sk->sk_bound_dev_if ||
302 		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
303 			if (last) {
304 				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
305 				if (skb2)
306 					rawv6_rcv(last, skb2);
307 			}
308 			last = sk;
309 		}
310 	}
311 
312 	if (last) {
313 		rawv6_rcv(last, skb);
314 		read_unlock(&ip6_ra_lock);
315 		return 1;
316 	}
317 	read_unlock(&ip6_ra_lock);
318 	return 0;
319 }
320 
321 static int ip6_forward_proxy_check(struct sk_buff *skb)
322 {
323 	struct ipv6hdr *hdr = ipv6_hdr(skb);
324 	u8 nexthdr = hdr->nexthdr;
325 	__be16 frag_off;
326 	int offset;
327 
328 	if (ipv6_ext_hdr(nexthdr)) {
329 		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
330 		if (offset < 0)
331 			return 0;
332 	} else
333 		offset = sizeof(struct ipv6hdr);
334 
335 	if (nexthdr == IPPROTO_ICMPV6) {
336 		struct icmp6hdr *icmp6;
337 
338 		if (!pskb_may_pull(skb, (skb_network_header(skb) +
339 					 offset + 1 - skb->data)))
340 			return 0;
341 
342 		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
343 
344 		switch (icmp6->icmp6_type) {
345 		case NDISC_ROUTER_SOLICITATION:
346 		case NDISC_ROUTER_ADVERTISEMENT:
347 		case NDISC_NEIGHBOUR_SOLICITATION:
348 		case NDISC_NEIGHBOUR_ADVERTISEMENT:
349 		case NDISC_REDIRECT:
350 			/* For reaction involving unicast neighbor discovery
351 			 * message destined to the proxied address, pass it to
352 			 * input function.
353 			 */
354 			return 1;
355 		default:
356 			break;
357 		}
358 	}
359 
360 	/*
361 	 * The proxying router can't forward traffic sent to a link-local
362 	 * address, so signal the sender and discard the packet. This
363 	 * behavior is clarified by the MIPv6 specification.
364 	 */
365 	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
366 		dst_link_failure(skb);
367 		return -1;
368 	}
369 
370 	return 0;
371 }
372 
373 static inline int ip6_forward_finish(struct net *net, struct sock *sk,
374 				     struct sk_buff *skb)
375 {
376 	struct dst_entry *dst = skb_dst(skb);
377 
378 	__IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
379 	__IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
380 
381 #ifdef CONFIG_NET_SWITCHDEV
382 	if (skb->offload_l3_fwd_mark) {
383 		consume_skb(skb);
384 		return 0;
385 	}
386 #endif
387 
388 	skb->tstamp = 0;
389 	return dst_output(net, sk, skb);
390 }
391 
392 static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
393 {
394 	if (skb->len <= mtu)
395 		return false;
396 
397 	/* ipv6 conntrack defrag sets max_frag_size + ignore_df */
398 	if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
399 		return true;
400 
401 	if (skb->ignore_df)
402 		return false;
403 
404 	if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
405 		return false;
406 
407 	return true;
408 }
409 
410 int ip6_forward(struct sk_buff *skb)
411 {
412 	struct inet6_dev *idev = __in6_dev_get_safely(skb->dev);
413 	struct dst_entry *dst = skb_dst(skb);
414 	struct ipv6hdr *hdr = ipv6_hdr(skb);
415 	struct inet6_skb_parm *opt = IP6CB(skb);
416 	struct net *net = dev_net(dst->dev);
417 	u32 mtu;
418 
419 	if (net->ipv6.devconf_all->forwarding == 0)
420 		goto error;
421 
422 	if (skb->pkt_type != PACKET_HOST)
423 		goto drop;
424 
425 	if (unlikely(skb->sk))
426 		goto drop;
427 
428 	if (skb_warn_if_lro(skb))
429 		goto drop;
430 
431 	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
432 		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
433 		goto drop;
434 	}
435 
436 	skb_forward_csum(skb);
437 
438 	/*
439 	 *	We DO NOT make any processing on
440 	 *	RA packets, pushing them to user level AS IS
441 	 *	without ane WARRANTY that application will be able
442 	 *	to interpret them. The reason is that we
443 	 *	cannot make anything clever here.
444 	 *
445 	 *	We are not end-node, so that if packet contains
446 	 *	AH/ESP, we cannot make anything.
447 	 *	Defragmentation also would be mistake, RA packets
448 	 *	cannot be fragmented, because there is no warranty
449 	 *	that different fragments will go along one path. --ANK
450 	 */
451 	if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
452 		if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
453 			return 0;
454 	}
455 
456 	/*
457 	 *	check and decrement ttl
458 	 */
459 	if (hdr->hop_limit <= 1) {
460 		/* Force OUTPUT device used as source address */
461 		skb->dev = dst->dev;
462 		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
463 		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
464 
465 		kfree_skb(skb);
466 		return -ETIMEDOUT;
467 	}
468 
469 	/* XXX: idev->cnf.proxy_ndp? */
470 	if (net->ipv6.devconf_all->proxy_ndp &&
471 	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
472 		int proxied = ip6_forward_proxy_check(skb);
473 		if (proxied > 0)
474 			return ip6_input(skb);
475 		else if (proxied < 0) {
476 			__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
477 			goto drop;
478 		}
479 	}
480 
481 	if (!xfrm6_route_forward(skb)) {
482 		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
483 		goto drop;
484 	}
485 	dst = skb_dst(skb);
486 
487 	/* IPv6 specs say nothing about it, but it is clear that we cannot
488 	   send redirects to source routed frames.
489 	   We don't send redirects to frames decapsulated from IPsec.
490 	 */
491 	if (IP6CB(skb)->iif == dst->dev->ifindex &&
492 	    opt->srcrt == 0 && !skb_sec_path(skb)) {
493 		struct in6_addr *target = NULL;
494 		struct inet_peer *peer;
495 		struct rt6_info *rt;
496 
497 		/*
498 		 *	incoming and outgoing devices are the same
499 		 *	send a redirect.
500 		 */
501 
502 		rt = (struct rt6_info *) dst;
503 		if (rt->rt6i_flags & RTF_GATEWAY)
504 			target = &rt->rt6i_gateway;
505 		else
506 			target = &hdr->daddr;
507 
508 		peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);
509 
510 		/* Limit redirects both by destination (here)
511 		   and by source (inside ndisc_send_redirect)
512 		 */
513 		if (inet_peer_xrlim_allow(peer, 1*HZ))
514 			ndisc_send_redirect(skb, target);
515 		if (peer)
516 			inet_putpeer(peer);
517 	} else {
518 		int addrtype = ipv6_addr_type(&hdr->saddr);
519 
520 		/* This check is security critical. */
521 		if (addrtype == IPV6_ADDR_ANY ||
522 		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
523 			goto error;
524 		if (addrtype & IPV6_ADDR_LINKLOCAL) {
525 			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
526 				    ICMPV6_NOT_NEIGHBOUR, 0);
527 			goto error;
528 		}
529 	}
530 
531 	mtu = ip6_dst_mtu_forward(dst);
532 	if (mtu < IPV6_MIN_MTU)
533 		mtu = IPV6_MIN_MTU;
534 
535 	if (ip6_pkt_too_big(skb, mtu)) {
536 		/* Again, force OUTPUT device used as source address */
537 		skb->dev = dst->dev;
538 		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
539 		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INTOOBIGERRORS);
540 		__IP6_INC_STATS(net, ip6_dst_idev(dst),
541 				IPSTATS_MIB_FRAGFAILS);
542 		kfree_skb(skb);
543 		return -EMSGSIZE;
544 	}
545 
546 	if (skb_cow(skb, dst->dev->hard_header_len)) {
547 		__IP6_INC_STATS(net, ip6_dst_idev(dst),
548 				IPSTATS_MIB_OUTDISCARDS);
549 		goto drop;
550 	}
551 
552 	hdr = ipv6_hdr(skb);
553 
554 	/* Mangling hops number delayed to point after skb COW */
555 
556 	hdr->hop_limit--;
557 
558 	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
559 		       net, NULL, skb, skb->dev, dst->dev,
560 		       ip6_forward_finish);
561 
562 error:
563 	__IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
564 drop:
565 	kfree_skb(skb);
566 	return -EINVAL;
567 }
568 
569 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
570 {
571 	to->pkt_type = from->pkt_type;
572 	to->priority = from->priority;
573 	to->protocol = from->protocol;
574 	skb_dst_drop(to);
575 	skb_dst_set(to, dst_clone(skb_dst(from)));
576 	to->dev = from->dev;
577 	to->mark = from->mark;
578 
579 	skb_copy_hash(to, from);
580 
581 #ifdef CONFIG_NET_SCHED
582 	to->tc_index = from->tc_index;
583 #endif
584 	nf_copy(to, from);
585 	skb_ext_copy(to, from);
586 	skb_copy_secmark(to, from);
587 }
588 
589 int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
590 		 int (*output)(struct net *, struct sock *, struct sk_buff *))
591 {
592 	struct sk_buff *frag;
593 	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
594 	struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
595 				inet6_sk(skb->sk) : NULL;
596 	struct ipv6hdr *tmp_hdr;
597 	struct frag_hdr *fh;
598 	unsigned int mtu, hlen, left, len;
599 	int hroom, troom;
600 	__be32 frag_id;
601 	int ptr, offset = 0, err = 0;
602 	u8 *prevhdr, nexthdr = 0;
603 
604 	err = ip6_find_1stfragopt(skb, &prevhdr);
605 	if (err < 0)
606 		goto fail;
607 	hlen = err;
608 	nexthdr = *prevhdr;
609 
610 	mtu = ip6_skb_dst_mtu(skb);
611 
612 	/* We must not fragment if the socket is set to force MTU discovery
613 	 * or if the skb it not generated by a local socket.
614 	 */
615 	if (unlikely(!skb->ignore_df && skb->len > mtu))
616 		goto fail_toobig;
617 
618 	if (IP6CB(skb)->frag_max_size) {
619 		if (IP6CB(skb)->frag_max_size > mtu)
620 			goto fail_toobig;
621 
622 		/* don't send fragments larger than what we received */
623 		mtu = IP6CB(skb)->frag_max_size;
624 		if (mtu < IPV6_MIN_MTU)
625 			mtu = IPV6_MIN_MTU;
626 	}
627 
628 	if (np && np->frag_size < mtu) {
629 		if (np->frag_size)
630 			mtu = np->frag_size;
631 	}
632 	if (mtu < hlen + sizeof(struct frag_hdr) + 8)
633 		goto fail_toobig;
634 	mtu -= hlen + sizeof(struct frag_hdr);
635 
636 	frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
637 				    &ipv6_hdr(skb)->saddr);
638 
639 	if (skb->ip_summed == CHECKSUM_PARTIAL &&
640 	    (err = skb_checksum_help(skb)))
641 		goto fail;
642 
643 	hroom = LL_RESERVED_SPACE(rt->dst.dev);
644 	if (skb_has_frag_list(skb)) {
645 		unsigned int first_len = skb_pagelen(skb);
646 		struct sk_buff *frag2;
647 
648 		if (first_len - hlen > mtu ||
649 		    ((first_len - hlen) & 7) ||
650 		    skb_cloned(skb) ||
651 		    skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
652 			goto slow_path;
653 
654 		skb_walk_frags(skb, frag) {
655 			/* Correct geometry. */
656 			if (frag->len > mtu ||
657 			    ((frag->len & 7) && frag->next) ||
658 			    skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
659 				goto slow_path_clean;
660 
661 			/* Partially cloned skb? */
662 			if (skb_shared(frag))
663 				goto slow_path_clean;
664 
665 			BUG_ON(frag->sk);
666 			if (skb->sk) {
667 				frag->sk = skb->sk;
668 				frag->destructor = sock_wfree;
669 			}
670 			skb->truesize -= frag->truesize;
671 		}
672 
673 		err = 0;
674 		offset = 0;
675 		/* BUILD HEADER */
676 
677 		*prevhdr = NEXTHDR_FRAGMENT;
678 		tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
679 		if (!tmp_hdr) {
680 			err = -ENOMEM;
681 			goto fail;
682 		}
683 		frag = skb_shinfo(skb)->frag_list;
684 		skb_frag_list_init(skb);
685 
686 		__skb_pull(skb, hlen);
687 		fh = __skb_push(skb, sizeof(struct frag_hdr));
688 		__skb_push(skb, hlen);
689 		skb_reset_network_header(skb);
690 		memcpy(skb_network_header(skb), tmp_hdr, hlen);
691 
692 		fh->nexthdr = nexthdr;
693 		fh->reserved = 0;
694 		fh->frag_off = htons(IP6_MF);
695 		fh->identification = frag_id;
696 
697 		first_len = skb_pagelen(skb);
698 		skb->data_len = first_len - skb_headlen(skb);
699 		skb->len = first_len;
700 		ipv6_hdr(skb)->payload_len = htons(first_len -
701 						   sizeof(struct ipv6hdr));
702 
703 		for (;;) {
704 			/* Prepare header of the next frame,
705 			 * before previous one went down. */
706 			if (frag) {
707 				frag->ip_summed = CHECKSUM_NONE;
708 				skb_reset_transport_header(frag);
709 				fh = __skb_push(frag, sizeof(struct frag_hdr));
710 				__skb_push(frag, hlen);
711 				skb_reset_network_header(frag);
712 				memcpy(skb_network_header(frag), tmp_hdr,
713 				       hlen);
714 				offset += skb->len - hlen - sizeof(struct frag_hdr);
715 				fh->nexthdr = nexthdr;
716 				fh->reserved = 0;
717 				fh->frag_off = htons(offset);
718 				if (frag->next)
719 					fh->frag_off |= htons(IP6_MF);
720 				fh->identification = frag_id;
721 				ipv6_hdr(frag)->payload_len =
722 						htons(frag->len -
723 						      sizeof(struct ipv6hdr));
724 				ip6_copy_metadata(frag, skb);
725 			}
726 
727 			err = output(net, sk, skb);
728 			if (!err)
729 				IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
730 					      IPSTATS_MIB_FRAGCREATES);
731 
732 			if (err || !frag)
733 				break;
734 
735 			skb = frag;
736 			frag = skb->next;
737 			skb_mark_not_on_list(skb);
738 		}
739 
740 		kfree(tmp_hdr);
741 
742 		if (err == 0) {
743 			IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
744 				      IPSTATS_MIB_FRAGOKS);
745 			return 0;
746 		}
747 
748 		kfree_skb_list(frag);
749 
750 		IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
751 			      IPSTATS_MIB_FRAGFAILS);
752 		return err;
753 
754 slow_path_clean:
755 		skb_walk_frags(skb, frag2) {
756 			if (frag2 == frag)
757 				break;
758 			frag2->sk = NULL;
759 			frag2->destructor = NULL;
760 			skb->truesize += frag2->truesize;
761 		}
762 	}
763 
764 slow_path:
765 	left = skb->len - hlen;		/* Space per frame */
766 	ptr = hlen;			/* Where to start from */
767 
768 	/*
769 	 *	Fragment the datagram.
770 	 */
771 
772 	troom = rt->dst.dev->needed_tailroom;
773 
774 	/*
775 	 *	Keep copying data until we run out.
776 	 */
777 	while (left > 0)	{
778 		u8 *fragnexthdr_offset;
779 
780 		len = left;
781 		/* IF: it doesn't fit, use 'mtu' - the data space left */
782 		if (len > mtu)
783 			len = mtu;
784 		/* IF: we are not sending up to and including the packet end
785 		   then align the next start on an eight byte boundary */
786 		if (len < left)	{
787 			len &= ~7;
788 		}
789 
790 		/* Allocate buffer */
791 		frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
792 				 hroom + troom, GFP_ATOMIC);
793 		if (!frag) {
794 			err = -ENOMEM;
795 			goto fail;
796 		}
797 
798 		/*
799 		 *	Set up data on packet
800 		 */
801 
802 		ip6_copy_metadata(frag, skb);
803 		skb_reserve(frag, hroom);
804 		skb_put(frag, len + hlen + sizeof(struct frag_hdr));
805 		skb_reset_network_header(frag);
806 		fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
807 		frag->transport_header = (frag->network_header + hlen +
808 					  sizeof(struct frag_hdr));
809 
810 		/*
811 		 *	Charge the memory for the fragment to any owner
812 		 *	it might possess
813 		 */
814 		if (skb->sk)
815 			skb_set_owner_w(frag, skb->sk);
816 
817 		/*
818 		 *	Copy the packet header into the new buffer.
819 		 */
820 		skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
821 
822 		fragnexthdr_offset = skb_network_header(frag);
823 		fragnexthdr_offset += prevhdr - skb_network_header(skb);
824 		*fragnexthdr_offset = NEXTHDR_FRAGMENT;
825 
826 		/*
827 		 *	Build fragment header.
828 		 */
829 		fh->nexthdr = nexthdr;
830 		fh->reserved = 0;
831 		fh->identification = frag_id;
832 
833 		/*
834 		 *	Copy a block of the IP datagram.
835 		 */
836 		BUG_ON(skb_copy_bits(skb, ptr, skb_transport_header(frag),
837 				     len));
838 		left -= len;
839 
840 		fh->frag_off = htons(offset);
841 		if (left > 0)
842 			fh->frag_off |= htons(IP6_MF);
843 		ipv6_hdr(frag)->payload_len = htons(frag->len -
844 						    sizeof(struct ipv6hdr));
845 
846 		ptr += len;
847 		offset += len;
848 
849 		/*
850 		 *	Put this fragment into the sending queue.
851 		 */
852 		err = output(net, sk, frag);
853 		if (err)
854 			goto fail;
855 
856 		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
857 			      IPSTATS_MIB_FRAGCREATES);
858 	}
859 	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
860 		      IPSTATS_MIB_FRAGOKS);
861 	consume_skb(skb);
862 	return err;
863 
864 fail_toobig:
865 	if (skb->sk && dst_allfrag(skb_dst(skb)))
866 		sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);
867 
868 	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
869 	err = -EMSGSIZE;
870 
871 fail:
872 	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
873 		      IPSTATS_MIB_FRAGFAILS);
874 	kfree_skb(skb);
875 	return err;
876 }
877 
878 static inline int ip6_rt_check(const struct rt6key *rt_key,
879 			       const struct in6_addr *fl_addr,
880 			       const struct in6_addr *addr_cache)
881 {
882 	return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
883 		(!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
884 }
885 
886 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
887 					  struct dst_entry *dst,
888 					  const struct flowi6 *fl6)
889 {
890 	struct ipv6_pinfo *np = inet6_sk(sk);
891 	struct rt6_info *rt;
892 
893 	if (!dst)
894 		goto out;
895 
896 	if (dst->ops->family != AF_INET6) {
897 		dst_release(dst);
898 		return NULL;
899 	}
900 
901 	rt = (struct rt6_info *)dst;
902 	/* Yes, checking route validity in not connected
903 	 * case is not very simple. Take into account,
904 	 * that we do not support routing by source, TOS,
905 	 * and MSG_DONTROUTE		--ANK (980726)
906 	 *
907 	 * 1. ip6_rt_check(): If route was host route,
908 	 *    check that cached destination is current.
909 	 *    If it is network route, we still may
910 	 *    check its validity using saved pointer
911 	 *    to the last used address: daddr_cache.
912 	 *    We do not want to save whole address now,
913 	 *    (because main consumer of this service
914 	 *    is tcp, which has not this problem),
915 	 *    so that the last trick works only on connected
916 	 *    sockets.
917 	 * 2. oif also should be the same.
918 	 */
919 	if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
920 #ifdef CONFIG_IPV6_SUBTREES
921 	    ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
922 #endif
923 	   (!(fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) &&
924 	      (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex))) {
925 		dst_release(dst);
926 		dst = NULL;
927 	}
928 
929 out:
930 	return dst;
931 }
932 
933 static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
934 			       struct dst_entry **dst, struct flowi6 *fl6)
935 {
936 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
937 	struct neighbour *n;
938 	struct rt6_info *rt;
939 #endif
940 	int err;
941 	int flags = 0;
942 
943 	/* The correct way to handle this would be to do
944 	 * ip6_route_get_saddr, and then ip6_route_output; however,
945 	 * the route-specific preferred source forces the
946 	 * ip6_route_output call _before_ ip6_route_get_saddr.
947 	 *
948 	 * In source specific routing (no src=any default route),
949 	 * ip6_route_output will fail given src=any saddr, though, so
950 	 * that's why we try it again later.
951 	 */
952 	if (ipv6_addr_any(&fl6->saddr) && (!*dst || !(*dst)->error)) {
953 		struct fib6_info *from;
954 		struct rt6_info *rt;
955 		bool had_dst = *dst != NULL;
956 
957 		if (!had_dst)
958 			*dst = ip6_route_output(net, sk, fl6);
959 		rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;
960 
961 		rcu_read_lock();
962 		from = rt ? rcu_dereference(rt->from) : NULL;
963 		err = ip6_route_get_saddr(net, from, &fl6->daddr,
964 					  sk ? inet6_sk(sk)->srcprefs : 0,
965 					  &fl6->saddr);
966 		rcu_read_unlock();
967 
968 		if (err)
969 			goto out_err_release;
970 
971 		/* If we had an erroneous initial result, pretend it
972 		 * never existed and let the SA-enabled version take
973 		 * over.
974 		 */
975 		if (!had_dst && (*dst)->error) {
976 			dst_release(*dst);
977 			*dst = NULL;
978 		}
979 
980 		if (fl6->flowi6_oif)
981 			flags |= RT6_LOOKUP_F_IFACE;
982 	}
983 
984 	if (!*dst)
985 		*dst = ip6_route_output_flags(net, sk, fl6, flags);
986 
987 	err = (*dst)->error;
988 	if (err)
989 		goto out_err_release;
990 
991 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
992 	/*
993 	 * Here if the dst entry we've looked up
994 	 * has a neighbour entry that is in the INCOMPLETE
995 	 * state and the src address from the flow is
996 	 * marked as OPTIMISTIC, we release the found
997 	 * dst entry and replace it instead with the
998 	 * dst entry of the nexthop router
999 	 */
1000 	rt = (struct rt6_info *) *dst;
1001 	rcu_read_lock_bh();
1002 	n = __ipv6_neigh_lookup_noref(rt->dst.dev,
1003 				      rt6_nexthop(rt, &fl6->daddr));
1004 	err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
1005 	rcu_read_unlock_bh();
1006 
1007 	if (err) {
1008 		struct inet6_ifaddr *ifp;
1009 		struct flowi6 fl_gw6;
1010 		int redirect;
1011 
1012 		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
1013 				      (*dst)->dev, 1);
1014 
1015 		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
1016 		if (ifp)
1017 			in6_ifa_put(ifp);
1018 
1019 		if (redirect) {
1020 			/*
1021 			 * We need to get the dst entry for the
1022 			 * default router instead
1023 			 */
1024 			dst_release(*dst);
1025 			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
1026 			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
1027 			*dst = ip6_route_output(net, sk, &fl_gw6);
1028 			err = (*dst)->error;
1029 			if (err)
1030 				goto out_err_release;
1031 		}
1032 	}
1033 #endif
1034 	if (ipv6_addr_v4mapped(&fl6->saddr) &&
1035 	    !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
1036 		err = -EAFNOSUPPORT;
1037 		goto out_err_release;
1038 	}
1039 
1040 	return 0;
1041 
1042 out_err_release:
1043 	dst_release(*dst);
1044 	*dst = NULL;
1045 
1046 	if (err == -ENETUNREACH)
1047 		IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
1048 	return err;
1049 }
1050 
1051 /**
1052  *	ip6_dst_lookup - perform route lookup on flow
1053  *	@sk: socket which provides route info
1054  *	@dst: pointer to dst_entry * for result
1055  *	@fl6: flow to lookup
1056  *
1057  *	This function performs a route lookup on the given flow.
1058  *
1059  *	It returns zero on success, or a standard errno code on error.
1060  */
1061 int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
1062 		   struct flowi6 *fl6)
1063 {
1064 	*dst = NULL;
1065 	return ip6_dst_lookup_tail(net, sk, dst, fl6);
1066 }
1067 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1068 
1069 /**
1070  *	ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1071  *	@sk: socket which provides route info
1072  *	@fl6: flow to lookup
1073  *	@final_dst: final destination address for ipsec lookup
1074  *
1075  *	This function performs a route lookup on the given flow.
1076  *
1077  *	It returns a valid dst pointer on success, or a pointer encoded
1078  *	error code.
1079  */
1080 struct dst_entry *ip6_dst_lookup_flow(const struct sock *sk, struct flowi6 *fl6,
1081 				      const struct in6_addr *final_dst)
1082 {
1083 	struct dst_entry *dst = NULL;
1084 	int err;
1085 
1086 	err = ip6_dst_lookup_tail(sock_net(sk), sk, &dst, fl6);
1087 	if (err)
1088 		return ERR_PTR(err);
1089 	if (final_dst)
1090 		fl6->daddr = *final_dst;
1091 
1092 	return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1093 }
1094 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
1095 
1096 /**
1097  *	ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1098  *	@sk: socket which provides the dst cache and route info
1099  *	@fl6: flow to lookup
1100  *	@final_dst: final destination address for ipsec lookup
1101  *	@connected: whether @sk is connected or not
1102  *
1103  *	This function performs a route lookup on the given flow with the
1104  *	possibility of using the cached route in the socket if it is valid.
1105  *	It will take the socket dst lock when operating on the dst cache.
1106  *	As a result, this function can only be used in process context.
1107  *
1108  *	In addition, for a connected socket, cache the dst in the socket
1109  *	if the current cache is not valid.
1110  *
1111  *	It returns a valid dst pointer on success, or a pointer encoded
1112  *	error code.
1113  */
1114 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1115 					 const struct in6_addr *final_dst,
1116 					 bool connected)
1117 {
1118 	struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1119 
1120 	dst = ip6_sk_dst_check(sk, dst, fl6);
1121 	if (dst)
1122 		return dst;
1123 
1124 	dst = ip6_dst_lookup_flow(sk, fl6, final_dst);
1125 	if (connected && !IS_ERR(dst))
1126 		ip6_sk_dst_store_flow(sk, dst_clone(dst), fl6);
1127 
1128 	return dst;
1129 }
1130 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1131 
1132 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1133 					       gfp_t gfp)
1134 {
1135 	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1136 }
1137 
1138 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1139 						gfp_t gfp)
1140 {
1141 	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1142 }
1143 
1144 static void ip6_append_data_mtu(unsigned int *mtu,
1145 				int *maxfraglen,
1146 				unsigned int fragheaderlen,
1147 				struct sk_buff *skb,
1148 				struct rt6_info *rt,
1149 				unsigned int orig_mtu)
1150 {
1151 	if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1152 		if (!skb) {
1153 			/* first fragment, reserve header_len */
1154 			*mtu = orig_mtu - rt->dst.header_len;
1155 
1156 		} else {
1157 			/*
1158 			 * this fragment is not first, the headers
1159 			 * space is regarded as data space.
1160 			 */
1161 			*mtu = orig_mtu;
1162 		}
1163 		*maxfraglen = ((*mtu - fragheaderlen) & ~7)
1164 			      + fragheaderlen - sizeof(struct frag_hdr);
1165 	}
1166 }
1167 
1168 static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
1169 			  struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
1170 			  struct rt6_info *rt, struct flowi6 *fl6)
1171 {
1172 	struct ipv6_pinfo *np = inet6_sk(sk);
1173 	unsigned int mtu;
1174 	struct ipv6_txoptions *opt = ipc6->opt;
1175 
1176 	/*
1177 	 * setup for corking
1178 	 */
1179 	if (opt) {
1180 		if (WARN_ON(v6_cork->opt))
1181 			return -EINVAL;
1182 
1183 		v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
1184 		if (unlikely(!v6_cork->opt))
1185 			return -ENOBUFS;
1186 
1187 		v6_cork->opt->tot_len = sizeof(*opt);
1188 		v6_cork->opt->opt_flen = opt->opt_flen;
1189 		v6_cork->opt->opt_nflen = opt->opt_nflen;
1190 
1191 		v6_cork->opt->dst0opt = ip6_opt_dup(opt->dst0opt,
1192 						    sk->sk_allocation);
1193 		if (opt->dst0opt && !v6_cork->opt->dst0opt)
1194 			return -ENOBUFS;
1195 
1196 		v6_cork->opt->dst1opt = ip6_opt_dup(opt->dst1opt,
1197 						    sk->sk_allocation);
1198 		if (opt->dst1opt && !v6_cork->opt->dst1opt)
1199 			return -ENOBUFS;
1200 
1201 		v6_cork->opt->hopopt = ip6_opt_dup(opt->hopopt,
1202 						   sk->sk_allocation);
1203 		if (opt->hopopt && !v6_cork->opt->hopopt)
1204 			return -ENOBUFS;
1205 
1206 		v6_cork->opt->srcrt = ip6_rthdr_dup(opt->srcrt,
1207 						    sk->sk_allocation);
1208 		if (opt->srcrt && !v6_cork->opt->srcrt)
1209 			return -ENOBUFS;
1210 
1211 		/* need source address above miyazawa*/
1212 	}
1213 	dst_hold(&rt->dst);
1214 	cork->base.dst = &rt->dst;
1215 	cork->fl.u.ip6 = *fl6;
1216 	v6_cork->hop_limit = ipc6->hlimit;
1217 	v6_cork->tclass = ipc6->tclass;
1218 	if (rt->dst.flags & DST_XFRM_TUNNEL)
1219 		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1220 		      READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst);
1221 	else
1222 		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1223 			READ_ONCE(rt->dst.dev->mtu) : dst_mtu(xfrm_dst_path(&rt->dst));
1224 	if (np->frag_size < mtu) {
1225 		if (np->frag_size)
1226 			mtu = np->frag_size;
1227 	}
1228 	if (mtu < IPV6_MIN_MTU)
1229 		return -EINVAL;
1230 	cork->base.fragsize = mtu;
1231 	cork->base.gso_size = ipc6->gso_size;
1232 	cork->base.tx_flags = 0;
1233 	sock_tx_timestamp(sk, ipc6->sockc.tsflags, &cork->base.tx_flags);
1234 
1235 	if (dst_allfrag(xfrm_dst_path(&rt->dst)))
1236 		cork->base.flags |= IPCORK_ALLFRAG;
1237 	cork->base.length = 0;
1238 
1239 	cork->base.transmit_time = ipc6->sockc.transmit_time;
1240 
1241 	return 0;
1242 }
1243 
/*
 * __ip6_append_data - append @length bytes to the skbs pending on @queue
 * @sk: sending socket (protocol, timestamp flags, send buffer accounting)
 * @fl6: flow; only used here when reporting a size error to the socket
 * @queue: queue of pending skbs; data goes into its tail skb or into
 *	newly allocated skbs that are added to the tail
 * @cork: corking state (dst, fragsize, gso_size, length, flags, tx_flags)
 * @v6_cork: IPv6 corking state; only its tx options are read
 * @pfrag: page fragment used when data is appended as page frags
 * @getfrag: callback that copies @len bytes found at @offset of @from
 *	into @to; a negative return value is reported as -EFAULT
 * @from: opaque source cookie passed to @getfrag
 * @length: number of bytes to append; for the first skb this includes
 *	@transhdrlen
 * @transhdrlen: bytes at the start of the first skb's payload that are
 *	reserved and not filled by @getfrag
 * @flags: MSG_* flags (MSG_MORE, MSG_ZEROCOPY, MSG_DONTWAIT and
 *	MSG_CONFIRM are looked at)
 * @ipc6: per-call cookie; only @dontfrag is read
 *
 * Returns 0 on success.  -EMSGSIZE is returned (after notifying the
 * socket) when the headers or the data exceed the size limits computed
 * below; other failures are -ENOBUFS, -EINVAL, -EFAULT, -ENOMEM or the
 * error produced by sock_alloc_send_skb() / skb_zerocopy_iter_dgram().
 * On those later failures the bytes not yet appended are subtracted
 * from cork->length again and IPSTATS_MIB_OUTDISCARDS is incremented.
 */
1244 static int __ip6_append_data(struct sock *sk,
1245 			     struct flowi6 *fl6,
1246 			     struct sk_buff_head *queue,
1247 			     struct inet_cork *cork,
1248 			     struct inet6_cork *v6_cork,
1249 			     struct page_frag *pfrag,
1250 			     int getfrag(void *from, char *to, int offset,
1251 					 int len, int odd, struct sk_buff *skb),
1252 			     void *from, int length, int transhdrlen,
1253 			     unsigned int flags, struct ipcm6_cookie *ipc6)
1254 {
1255 	struct sk_buff *skb, *skb_prev = NULL;
1256 	unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
1257 	struct ubuf_info *uarg = NULL;
1258 	int exthdrlen = 0;
1259 	int dst_exthdrlen = 0;
1260 	int hh_len;
1261 	int copy;
1262 	int err;
1263 	int offset = 0;
1264 	u32 tskey = 0;
1265 	struct rt6_info *rt = (struct rt6_info *)cork->dst;
1266 	struct ipv6_txoptions *opt = v6_cork->opt;
1267 	int csummode = CHECKSUM_NONE;
1268 	unsigned int maxnonfragsize, headersize;
1269 	unsigned int wmem_alloc_delta = 0;
1270 	bool paged, extra_uref;
1271 
	/* extension header lengths only matter while the queue is still empty */
1272 	skb = skb_peek_tail(queue);
1273 	if (!skb) {
1274 		exthdrlen = opt ? opt->opt_flen : 0;
1275 		dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
1276 	}
1277 
	/* a GSO size selects paged skbs and IP6_MAX_MTU as the working mtu */
1278 	paged = !!cork->gso_size;
1279 	mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize;
1280 	orig_mtu = mtu;
1281 
1282 	if (cork->tx_flags & SKBTX_ANY_SW_TSTAMP &&
1283 	    sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
1284 		tskey = sk->sk_tskey++;
1285 
1286 	hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1287 
1288 	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1289 			(opt ? opt->opt_nflen : 0);
1290 	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
1291 		     sizeof(struct frag_hdr);
1292 
1293 	headersize = sizeof(struct ipv6hdr) +
1294 		     (opt ? opt->opt_flen + opt->opt_nflen : 0) +
1295 		     (dst_allfrag(&rt->dst) ?
1296 		      sizeof(struct frag_hdr) : 0) +
1297 		     rt->rt6i_nfheader_len;
1298 
1299 	/* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit
1300 	 * the first fragment
1301 	 */
1302 	if (headersize + transhdrlen > mtu)
1303 		goto emsgsize;
1304 
1305 	if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
1306 	    (sk->sk_protocol == IPPROTO_UDP ||
1307 	     sk->sk_protocol == IPPROTO_RAW)) {
1308 		ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
1309 				sizeof(struct ipv6hdr));
1310 		goto emsgsize;
1311 	}
1312 
1313 	if (ip6_sk_ignore_df(sk))
1314 		maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
1315 	else
1316 		maxnonfragsize = mtu;
1317 
	/* the label sits inside the block so the gotos above share the report */
1318 	if (cork->length + length > maxnonfragsize - headersize) {
1319 emsgsize:
1320 		pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0);
1321 		ipv6_local_error(sk, EMSGSIZE, fl6, pmtu);
1322 		return -EMSGSIZE;
1323 	}
1324 
1325 	/* CHECKSUM_PARTIAL only with no extension headers and when
1326 	 * we are not going to fragment
1327 	 */
1328 	if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
1329 	    headersize == sizeof(struct ipv6hdr) &&
1330 	    length <= mtu - headersize &&
1331 	    (!(flags & MSG_MORE) || cork->gso_size) &&
1332 	    rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
1333 		csummode = CHECKSUM_PARTIAL;
1334 
1335 	if (flags & MSG_ZEROCOPY && length && sock_flag(sk, SOCK_ZEROCOPY)) {
1336 		uarg = sock_zerocopy_realloc(sk, length, skb_zcopy(skb));
1337 		if (!uarg)
1338 			return -ENOBUFS;
1339 		extra_uref = true;
1340 		if (rt->dst.dev->features & NETIF_F_SG &&
1341 		    csummode == CHECKSUM_PARTIAL) {
1342 			paged = true;
1343 		} else {
1344 			uarg->zerocopy = 0;
1345 			skb_zcopy_set(skb, uarg, &extra_uref);
1346 		}
1347 	}
1348 
1349 	/*
1350 	 * Let's try using as much space as possible.
1351 	 * Use MTU if total length of the message fits into the MTU.
1352 	 * Otherwise, we need to reserve fragment header and
1353 	 * fragment alignment (= 8-15 octets, in total).
1354 	 *
1355 	 * Note that we may need to "move" the data from the tail
1356 	 * of the buffer to the new fragment when we split
1357 	 * the message.
1358 	 *
1359 	 * FIXME: It may be fragmented into multiple chunks
1360 	 *        at once if non-fragmentable extension headers
1361 	 *        are too large.
1362 	 * --yoshfuji
1363 	 */
1364 
1365 	cork->length += length;
1366 	if (!skb)
1367 		goto alloc_new_skb;
1368 
1369 	while (length > 0) {
1370 		/* Check if the remaining data fits into current packet. */
1371 		copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1372 		if (copy < length)
1373 			copy = maxfraglen - skb->len;
1374 
1375 		if (copy <= 0) {
1376 			char *data;
1377 			unsigned int datalen;
1378 			unsigned int fraglen;
1379 			unsigned int fraggap;
1380 			unsigned int alloclen;
1381 			unsigned int pagedlen;
1382 alloc_new_skb:
1383 			/* There's no room in the current skb */
			/* fraggap: bytes of the current skb lying beyond maxfraglen */
1384 			if (skb)
1385 				fraggap = skb->len - maxfraglen;
1386 			else
1387 				fraggap = 0;
1388 			/* update mtu and maxfraglen if necessary */
1389 			if (!skb || !skb_prev)
1390 				ip6_append_data_mtu(&mtu, &maxfraglen,
1391 						    fragheaderlen, skb, rt,
1392 						    orig_mtu);
1393 
1394 			skb_prev = skb;
1395 
1396 			/*
1397 			 * If remaining data exceeds the mtu,
1398 			 * we know we need more fragment(s).
1399 			 */
1400 			datalen = length + fraggap;
1401 
1402 			if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1403 				datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
1404 			fraglen = datalen + fragheaderlen;
1405 			pagedlen = 0;
1406 
1407 			if ((flags & MSG_MORE) &&
1408 			    !(rt->dst.dev->features&NETIF_F_SG))
1409 				alloclen = mtu;
1410 			else if (!paged)
1411 				alloclen = fraglen;
1412 			else {
1413 				alloclen = min_t(int, fraglen, MAX_HEADER);
1414 				pagedlen = fraglen - alloclen;
1415 			}
1416 
1417 			alloclen += dst_exthdrlen;
1418 
1419 			if (datalen != length + fraggap) {
1420 				/*
1421 				 * this is not the last fragment, the trailer
1422 				 * space is regarded as data space.
1423 				 */
1424 				datalen += rt->dst.trailer_len;
1425 			}
1426 
1427 			alloclen += rt->dst.trailer_len;
1428 			fraglen = datalen + fragheaderlen;
1429 
1430 			/*
1431 			 * We just reserve space for fragment header.
1432 			 * Note: this may be overallocation if the message
1433 			 * (without MSG_MORE) fits into the MTU.
1434 			 */
1435 			alloclen += sizeof(struct frag_hdr);
1436 
1437 			copy = datalen - transhdrlen - fraggap - pagedlen;
1438 			if (copy < 0) {
1439 				err = -EINVAL;
1440 				goto error;
1441 			}
			/*
			 * Only the skb that carries the transport header goes
			 * through sock_alloc_send_skb(); later skbs use
			 * alloc_skb() while the socket stays within twice
			 * its send buffer.
			 */
1442 			if (transhdrlen) {
1443 				skb = sock_alloc_send_skb(sk,
1444 						alloclen + hh_len,
1445 						(flags & MSG_DONTWAIT), &err);
1446 			} else {
1447 				skb = NULL;
1448 				if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <=
1449 				    2 * sk->sk_sndbuf)
1450 					skb = alloc_skb(alloclen + hh_len,
1451 							sk->sk_allocation);
1452 				if (unlikely(!skb))
1453 					err = -ENOBUFS;
1454 			}
1455 			if (!skb)
1456 				goto error;
1457 			/*
1458 			 *	Fill in the control structures
1459 			 */
1460 			skb->protocol = htons(ETH_P_IPV6);
1461 			skb->ip_summed = csummode;
1462 			skb->csum = 0;
1463 			/* reserve for fragmentation and ipsec header */
1464 			skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1465 				    dst_exthdrlen);
1466 
1467 			/*
1468 			 *	Find where to start putting bytes
1469 			 */
1470 			data = skb_put(skb, fraglen - pagedlen);
1471 			skb_set_network_header(skb, exthdrlen);
1472 			data += fragheaderlen;
1473 			skb->transport_header = (skb->network_header +
1474 						 fragheaderlen);
			/* move the excess bytes of the previous skb into the new one */
1475 			if (fraggap) {
1476 				skb->csum = skb_copy_and_csum_bits(
1477 					skb_prev, maxfraglen,
1478 					data + transhdrlen, fraggap, 0);
1479 				skb_prev->csum = csum_sub(skb_prev->csum,
1480 							  skb->csum);
1481 				data += fraggap;
1482 				pskb_trim_unique(skb_prev, maxfraglen);
1483 			}
1484 			if (copy > 0 &&
1485 			    getfrag(from, data + transhdrlen, offset,
1486 				    copy, fraggap, skb) < 0) {
1487 				err = -EFAULT;
1488 				kfree_skb(skb);
1489 				goto error;
1490 			}
1491 
1492 			offset += copy;
1493 			length -= copy + transhdrlen;
1494 			transhdrlen = 0;
1495 			exthdrlen = 0;
1496 			dst_exthdrlen = 0;
1497 
1498 			/* Only the initial fragment is time stamped */
1499 			skb_shinfo(skb)->tx_flags = cork->tx_flags;
1500 			cork->tx_flags = 0;
1501 			skb_shinfo(skb)->tskey = tskey;
1502 			tskey = 0;
1503 			skb_zcopy_set(skb, uarg, &extra_uref);
1504 
1505 			if ((flags & MSG_CONFIRM) && !skb_prev)
1506 				skb_set_dst_pending_confirm(skb, 1);
1507 
1508 			/*
1509 			 * Put the packet on the pending queue
1510 			 */
			/*
			 * skbs without a destructor are tied to the socket here;
			 * their truesize is added to sk_wmem_alloc on exit.
			 */
1511 			if (!skb->destructor) {
1512 				skb->destructor = sock_wfree;
1513 				skb->sk = sk;
1514 				wmem_alloc_delta += skb->truesize;
1515 			}
1516 			__skb_queue_tail(queue, skb);
1517 			continue;
1518 		}
1519 
1520 		if (copy > length)
1521 			copy = length;
1522 
		/*
		 * Three ways to add data to the tail skb: into its linear
		 * tailroom, into page frags filled from @pfrag, or through
		 * skb_zerocopy_iter_dgram().
		 */
1523 		if (!(rt->dst.dev->features&NETIF_F_SG) &&
1524 		    skb_tailroom(skb) >= copy) {
1525 			unsigned int off;
1526 
1527 			off = skb->len;
1528 			if (getfrag(from, skb_put(skb, copy),
1529 						offset, copy, off, skb) < 0) {
1530 				__skb_trim(skb, off);
1531 				err = -EFAULT;
1532 				goto error;
1533 			}
1534 		} else if (!uarg || !uarg->zerocopy) {
1535 			int i = skb_shinfo(skb)->nr_frags;
1536 
1537 			err = -ENOMEM;
1538 			if (!sk_page_frag_refill(sk, pfrag))
1539 				goto error;
1540 
1541 			if (!skb_can_coalesce(skb, i, pfrag->page,
1542 					      pfrag->offset)) {
1543 				err = -EMSGSIZE;
1544 				if (i == MAX_SKB_FRAGS)
1545 					goto error;
1546 
1547 				__skb_fill_page_desc(skb, i, pfrag->page,
1548 						     pfrag->offset, 0);
1549 				skb_shinfo(skb)->nr_frags = ++i;
1550 				get_page(pfrag->page);
1551 			}
1552 			copy = min_t(int, copy, pfrag->size - pfrag->offset);
1553 			if (getfrag(from,
1554 				    page_address(pfrag->page) + pfrag->offset,
1555 				    offset, copy, skb->len, skb) < 0)
1556 				goto error_efault;
1557 
1558 			pfrag->offset += copy;
1559 			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1560 			skb->len += copy;
1561 			skb->data_len += copy;
1562 			skb->truesize += copy;
1563 			wmem_alloc_delta += copy;
1564 		} else {
1565 			err = skb_zerocopy_iter_dgram(skb, from, copy);
1566 			if (err < 0)
1567 				goto error;
1568 		}
1569 		offset += copy;
1570 		length -= copy;
1571 	}
1572 
1573 	if (wmem_alloc_delta)
1574 		refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1575 	return 0;
1576 
1577 error_efault:
1578 	err = -EFAULT;
1579 error:
1580 	if (uarg)
1581 		sock_zerocopy_put_abort(uarg, extra_uref);
	/* length now holds the bytes that were never appended */
1582 	cork->length -= length;
1583 	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1584 	refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1585 	return err;
1586 }
1587 
1588 int ip6_append_data(struct sock *sk,
1589 		    int getfrag(void *from, char *to, int offset, int len,
1590 				int odd, struct sk_buff *skb),
1591 		    void *from, int length, int transhdrlen,
1592 		    struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1593 		    struct rt6_info *rt, unsigned int flags)
1594 {
1595 	struct inet_sock *inet = inet_sk(sk);
1596 	struct ipv6_pinfo *np = inet6_sk(sk);
1597 	int exthdrlen;
1598 	int err;
1599 
1600 	if (flags&MSG_PROBE)
1601 		return 0;
1602 	if (skb_queue_empty(&sk->sk_write_queue)) {
1603 		/*
1604 		 * setup for corking
1605 		 */
1606 		err = ip6_setup_cork(sk, &inet->cork, &np->cork,
1607 				     ipc6, rt, fl6);
1608 		if (err)
1609 			return err;
1610 
1611 		exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1612 		length += exthdrlen;
1613 		transhdrlen += exthdrlen;
1614 	} else {
1615 		fl6 = &inet->cork.fl.u.ip6;
1616 		transhdrlen = 0;
1617 	}
1618 
1619 	return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base,
1620 				 &np->cork, sk_page_frag(sk), getfrag,
1621 				 from, length, transhdrlen, flags, ipc6);
1622 }
1623 EXPORT_SYMBOL_GPL(ip6_append_data);
1624 
1625 static void ip6_cork_release(struct inet_cork_full *cork,
1626 			     struct inet6_cork *v6_cork)
1627 {
1628 	if (v6_cork->opt) {
1629 		kfree(v6_cork->opt->dst0opt);
1630 		kfree(v6_cork->opt->dst1opt);
1631 		kfree(v6_cork->opt->hopopt);
1632 		kfree(v6_cork->opt->srcrt);
1633 		kfree(v6_cork->opt);
1634 		v6_cork->opt = NULL;
1635 	}
1636 
1637 	if (cork->base.dst) {
1638 		dst_release(cork->base.dst);
1639 		cork->base.dst = NULL;
1640 		cork->base.flags &= ~IPCORK_ALLFRAG;
1641 	}
1642 	memset(&cork->fl, 0, sizeof(cork->fl));
1643 }
1644 
1645 struct sk_buff *__ip6_make_skb(struct sock *sk,
1646 			       struct sk_buff_head *queue,
1647 			       struct inet_cork_full *cork,
1648 			       struct inet6_cork *v6_cork)
1649 {
1650 	struct sk_buff *skb, *tmp_skb;
1651 	struct sk_buff **tail_skb;
1652 	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1653 	struct ipv6_pinfo *np = inet6_sk(sk);
1654 	struct net *net = sock_net(sk);
1655 	struct ipv6hdr *hdr;
1656 	struct ipv6_txoptions *opt = v6_cork->opt;
1657 	struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
1658 	struct flowi6 *fl6 = &cork->fl.u.ip6;
1659 	unsigned char proto = fl6->flowi6_proto;
1660 
1661 	skb = __skb_dequeue(queue);
1662 	if (!skb)
1663 		goto out;
1664 	tail_skb = &(skb_shinfo(skb)->frag_list);
1665 
1666 	/* move skb->data to ip header from ext header */
1667 	if (skb->data < skb_network_header(skb))
1668 		__skb_pull(skb, skb_network_offset(skb));
1669 	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
1670 		__skb_pull(tmp_skb, skb_network_header_len(skb));
1671 		*tail_skb = tmp_skb;
1672 		tail_skb = &(tmp_skb->next);
1673 		skb->len += tmp_skb->len;
1674 		skb->data_len += tmp_skb->len;
1675 		skb->truesize += tmp_skb->truesize;
1676 		tmp_skb->destructor = NULL;
1677 		tmp_skb->sk = NULL;
1678 	}
1679 
1680 	/* Allow local fragmentation. */
1681 	skb->ignore_df = ip6_sk_ignore_df(sk);
1682 
1683 	*final_dst = fl6->daddr;
1684 	__skb_pull(skb, skb_network_header_len(skb));
1685 	if (opt && opt->opt_flen)
1686 		ipv6_push_frag_opts(skb, opt, &proto);
1687 	if (opt && opt->opt_nflen)
1688 		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);
1689 
1690 	skb_push(skb, sizeof(struct ipv6hdr));
1691 	skb_reset_network_header(skb);
1692 	hdr = ipv6_hdr(skb);
1693 
1694 	ip6_flow_hdr(hdr, v6_cork->tclass,
1695 		     ip6_make_flowlabel(net, skb, fl6->flowlabel,
1696 					ip6_autoflowlabel(net, np), fl6));
1697 	hdr->hop_limit = v6_cork->hop_limit;
1698 	hdr->nexthdr = proto;
1699 	hdr->saddr = fl6->saddr;
1700 	hdr->daddr = *final_dst;
1701 
1702 	skb->priority = sk->sk_priority;
1703 	skb->mark = sk->sk_mark;
1704 
1705 	skb->tstamp = cork->base.transmit_time;
1706 
1707 	skb_dst_set(skb, dst_clone(&rt->dst));
1708 	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1709 	if (proto == IPPROTO_ICMPV6) {
1710 		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1711 
1712 		ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
1713 		ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
1714 	}
1715 
1716 	ip6_cork_release(cork, v6_cork);
1717 out:
1718 	return skb;
1719 }
1720 
1721 int ip6_send_skb(struct sk_buff *skb)
1722 {
1723 	struct net *net = sock_net(skb->sk);
1724 	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
1725 	int err;
1726 
1727 	err = ip6_local_out(net, skb->sk, skb);
1728 	if (err) {
1729 		if (err > 0)
1730 			err = net_xmit_errno(err);
1731 		if (err)
1732 			IP6_INC_STATS(net, rt->rt6i_idev,
1733 				      IPSTATS_MIB_OUTDISCARDS);
1734 	}
1735 
1736 	return err;
1737 }
1738 
/*
 * ip6_push_pending_frames - build and send what is pending on @sk
 *
 * Returns 0 when ip6_finish_skb() produced nothing to send, otherwise
 * the result of ip6_send_skb().
 */
int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb = ip6_finish_skb(sk);

	return skb ? ip6_send_skb(skb) : 0;
}
EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
1750 
1751 static void __ip6_flush_pending_frames(struct sock *sk,
1752 				       struct sk_buff_head *queue,
1753 				       struct inet_cork_full *cork,
1754 				       struct inet6_cork *v6_cork)
1755 {
1756 	struct sk_buff *skb;
1757 
1758 	while ((skb = __skb_dequeue_tail(queue)) != NULL) {
1759 		if (skb_dst(skb))
1760 			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1761 				      IPSTATS_MIB_OUTDISCARDS);
1762 		kfree_skb(skb);
1763 	}
1764 
1765 	ip6_cork_release(cork, v6_cork);
1766 }
1767 
1768 void ip6_flush_pending_frames(struct sock *sk)
1769 {
1770 	__ip6_flush_pending_frames(sk, &sk->sk_write_queue,
1771 				   &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
1772 }
1773 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
1774 
1775 struct sk_buff *ip6_make_skb(struct sock *sk,
1776 			     int getfrag(void *from, char *to, int offset,
1777 					 int len, int odd, struct sk_buff *skb),
1778 			     void *from, int length, int transhdrlen,
1779 			     struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1780 			     struct rt6_info *rt, unsigned int flags,
1781 			     struct inet_cork_full *cork)
1782 {
1783 	struct inet6_cork v6_cork;
1784 	struct sk_buff_head queue;
1785 	int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1786 	int err;
1787 
1788 	if (flags & MSG_PROBE)
1789 		return NULL;
1790 
1791 	__skb_queue_head_init(&queue);
1792 
1793 	cork->base.flags = 0;
1794 	cork->base.addr = 0;
1795 	cork->base.opt = NULL;
1796 	cork->base.dst = NULL;
1797 	v6_cork.opt = NULL;
1798 	err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt, fl6);
1799 	if (err) {
1800 		ip6_cork_release(cork, &v6_cork);
1801 		return ERR_PTR(err);
1802 	}
1803 	if (ipc6->dontfrag < 0)
1804 		ipc6->dontfrag = inet6_sk(sk)->dontfrag;
1805 
1806 	err = __ip6_append_data(sk, fl6, &queue, &cork->base, &v6_cork,
1807 				&current->task_frag, getfrag, from,
1808 				length + exthdrlen, transhdrlen + exthdrlen,
1809 				flags, ipc6);
1810 	if (err) {
1811 		__ip6_flush_pending_frames(sk, &queue, cork, &v6_cork);
1812 		return ERR_PTR(err);
1813 	}
1814 
1815 	return __ip6_make_skb(sk, &queue, cork, &v6_cork);
1816 }
1817