xref: /openbmc/linux/net/ipv6/ip6_output.c (revision 6e360f7331132822f096399a54f40bc61887aa3c)
1 /*
2  *	IPv6 output functions
3  *	Linux INET6 implementation
4  *
5  *	Authors:
6  *	Pedro Roque		<roque@di.fc.ul.pt>
7  *
8  *	Based on linux/net/ipv4/ip_output.c
9  *
10  *	This program is free software; you can redistribute it and/or
11  *      modify it under the terms of the GNU General Public License
12  *      as published by the Free Software Foundation; either version
13  *      2 of the License, or (at your option) any later version.
14  *
15  *	Changes:
16  *	A.N.Kuznetsov	:	arithmetic in fragmentation.
17  *				extension headers are implemented.
18  *				route changes now work.
19  *				ip6_forward does not confuse sniffers.
20  *				etc.
21  *
22  *      H. von Brand    :       Added missing #include <linux/string.h>
23  *	Imran Patel	:	frag id should be in NBO
24  *      Kazunori MIYAZAWA @USAGI
25  *			:       add ip6_append_data and related functions
26  *				for datagram xmit
27  */
28 
29 #include <linux/errno.h>
30 #include <linux/kernel.h>
31 #include <linux/string.h>
32 #include <linux/socket.h>
33 #include <linux/net.h>
34 #include <linux/netdevice.h>
35 #include <linux/if_arp.h>
36 #include <linux/in6.h>
37 #include <linux/tcp.h>
38 #include <linux/route.h>
39 #include <linux/module.h>
40 #include <linux/slab.h>
41 
42 #include <linux/bpf-cgroup.h>
43 #include <linux/netfilter.h>
44 #include <linux/netfilter_ipv6.h>
45 
46 #include <net/sock.h>
47 #include <net/snmp.h>
48 
49 #include <net/ipv6.h>
50 #include <net/ndisc.h>
51 #include <net/protocol.h>
52 #include <net/ip6_route.h>
53 #include <net/addrconf.h>
54 #include <net/rawv6.h>
55 #include <net/icmp.h>
56 #include <net/xfrm.h>
57 #include <net/checksum.h>
58 #include <linux/mroute6.h>
59 #include <net/l3mdev.h>
60 #include <net/lwtunnel.h>
61 
62 static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
63 {
64 	struct dst_entry *dst = skb_dst(skb);
65 	struct net_device *dev = dst->dev;
66 	struct neighbour *neigh;
67 	struct in6_addr *nexthop;
68 	int ret;
69 
70 	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
71 		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
72 
73 		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
74 		    ((mroute6_is_socket(net, skb) &&
75 		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
76 		     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
77 					 &ipv6_hdr(skb)->saddr))) {
78 			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
79 
80 			/* Do not check for IFF_ALLMULTI; multicast routing
81 			   is not supported in any case.
82 			 */
83 			if (newskb)
84 				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
85 					net, sk, newskb, NULL, newskb->dev,
86 					dev_loopback_xmit);
87 
88 			if (ipv6_hdr(skb)->hop_limit == 0) {
89 				IP6_INC_STATS(net, idev,
90 					      IPSTATS_MIB_OUTDISCARDS);
91 				kfree_skb(skb);
92 				return 0;
93 			}
94 		}
95 
96 		IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);
97 
98 		if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
99 		    IPV6_ADDR_SCOPE_NODELOCAL &&
100 		    !(dev->flags & IFF_LOOPBACK)) {
101 			kfree_skb(skb);
102 			return 0;
103 		}
104 	}
105 
106 	if (lwtunnel_xmit_redirect(dst->lwtstate)) {
107 		int res = lwtunnel_xmit(skb);
108 
109 		if (res < 0 || res == LWTUNNEL_XMIT_DONE)
110 			return res;
111 	}
112 
113 	rcu_read_lock_bh();
114 	nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
115 	neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
116 	if (unlikely(!neigh))
117 		neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
118 	if (!IS_ERR(neigh)) {
119 		sock_confirm_neigh(skb, neigh);
120 		ret = neigh_output(neigh, skb);
121 		rcu_read_unlock_bh();
122 		return ret;
123 	}
124 	rcu_read_unlock_bh();
125 
126 	IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
127 	kfree_skb(skb);
128 	return -EINVAL;
129 }
130 
131 static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
132 {
133 	int ret;
134 
135 	ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
136 	if (ret) {
137 		kfree_skb(skb);
138 		return ret;
139 	}
140 
141 #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
142 	/* Policy lookup after SNAT yielded a new policy */
143 	if (skb_dst(skb)->xfrm) {
144 		IPCB(skb)->flags |= IPSKB_REROUTED;
145 		return dst_output(net, sk, skb);
146 	}
147 #endif
148 
149 	if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
150 	    dst_allfrag(skb_dst(skb)) ||
151 	    (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
152 		return ip6_fragment(net, sk, skb, ip6_finish_output2);
153 	else
154 		return ip6_finish_output2(net, sk, skb);
155 }
156 
157 int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
158 {
159 	struct net_device *dev = skb_dst(skb)->dev;
160 	struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
161 
162 	skb->protocol = htons(ETH_P_IPV6);
163 	skb->dev = dev;
164 
165 	if (unlikely(idev->cnf.disable_ipv6)) {
166 		IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
167 		kfree_skb(skb);
168 		return 0;
169 	}
170 
171 	return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
172 			    net, sk, skb, NULL, dev,
173 			    ip6_finish_output,
174 			    !(IP6CB(skb)->flags & IP6SKB_REROUTED));
175 }
176 
177 bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np)
178 {
179 	if (!np->autoflowlabel_set)
180 		return ip6_default_np_autolabel(net);
181 	else
182 		return np->autoflowlabel;
183 }
184 
185 /*
186  * xmit an sk_buff (used by TCP, SCTP and DCCP)
187  * Note : socket lock is not held for SYNACK packets, but might be modified
188  * by calls to skb_set_owner_w() and ipv6_local_error(),
189  * which are using proper atomic operations or spinlocks.
190  */
191 int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
192 	     __u32 mark, struct ipv6_txoptions *opt, int tclass)
193 {
194 	struct net *net = sock_net(sk);
195 	const struct ipv6_pinfo *np = inet6_sk(sk);
196 	struct in6_addr *first_hop = &fl6->daddr;
197 	struct dst_entry *dst = skb_dst(skb);
198 	struct ipv6hdr *hdr;
199 	u8  proto = fl6->flowi6_proto;
200 	int seg_len = skb->len;
201 	int hlimit = -1;
202 	u32 mtu;
203 
204 	if (opt) {
205 		unsigned int head_room;
206 
207 		/* First: exthdrs may take lots of space (~8K for now)
208 		   MAX_HEADER is not enough.
209 		 */
210 		head_room = opt->opt_nflen + opt->opt_flen;
211 		seg_len += head_room;
212 		head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
213 
214 		if (skb_headroom(skb) < head_room) {
215 			struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
216 			if (!skb2) {
217 				IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
218 					      IPSTATS_MIB_OUTDISCARDS);
219 				kfree_skb(skb);
220 				return -ENOBUFS;
221 			}
222 			if (skb->sk)
223 				skb_set_owner_w(skb2, skb->sk);
224 			consume_skb(skb);
225 			skb = skb2;
226 		}
227 		if (opt->opt_flen)
228 			ipv6_push_frag_opts(skb, opt, &proto);
229 		if (opt->opt_nflen)
230 			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
231 					     &fl6->saddr);
232 	}
233 
234 	skb_push(skb, sizeof(struct ipv6hdr));
235 	skb_reset_network_header(skb);
236 	hdr = ipv6_hdr(skb);
237 
238 	/*
239 	 *	Fill in the IPv6 header
240 	 */
241 	if (np)
242 		hlimit = np->hop_limit;
243 	if (hlimit < 0)
244 		hlimit = ip6_dst_hoplimit(dst);
245 
246 	ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
247 				ip6_autoflowlabel(net, np), fl6));
248 
249 	hdr->payload_len = htons(seg_len);
250 	hdr->nexthdr = proto;
251 	hdr->hop_limit = hlimit;
252 
253 	hdr->saddr = fl6->saddr;
254 	hdr->daddr = *first_hop;
255 
256 	skb->protocol = htons(ETH_P_IPV6);
257 	skb->priority = sk->sk_priority;
258 	skb->mark = mark;
259 
260 	mtu = dst_mtu(dst);
261 	if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
262 		IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
263 			      IPSTATS_MIB_OUT, skb->len);
264 
265 		/* if egress device is enslaved to an L3 master device pass the
266 		 * skb to its handler for processing
267 		 */
268 		skb = l3mdev_ip6_out((struct sock *)sk, skb);
269 		if (unlikely(!skb))
270 			return 0;
271 
272 		/* hooks should never assume socket lock is held.
273 		 * we promote our socket to non const
274 		 */
275 		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
276 			       net, (struct sock *)sk, skb, NULL, dst->dev,
277 			       dst_output);
278 	}
279 
280 	skb->dev = dst->dev;
281 	/* ipv6_local_error() does not require socket lock,
282 	 * we promote our socket to non const
283 	 */
284 	ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);
285 
286 	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
287 	kfree_skb(skb);
288 	return -EMSGSIZE;
289 }
290 EXPORT_SYMBOL(ip6_xmit);
291 
292 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
293 {
294 	struct ip6_ra_chain *ra;
295 	struct sock *last = NULL;
296 
297 	read_lock(&ip6_ra_lock);
298 	for (ra = ip6_ra_chain; ra; ra = ra->next) {
299 		struct sock *sk = ra->sk;
300 		if (sk && ra->sel == sel &&
301 		    (!sk->sk_bound_dev_if ||
302 		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
303 			if (last) {
304 				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
305 				if (skb2)
306 					rawv6_rcv(last, skb2);
307 			}
308 			last = sk;
309 		}
310 	}
311 
312 	if (last) {
313 		rawv6_rcv(last, skb);
314 		read_unlock(&ip6_ra_lock);
315 		return 1;
316 	}
317 	read_unlock(&ip6_ra_lock);
318 	return 0;
319 }
320 
321 static int ip6_forward_proxy_check(struct sk_buff *skb)
322 {
323 	struct ipv6hdr *hdr = ipv6_hdr(skb);
324 	u8 nexthdr = hdr->nexthdr;
325 	__be16 frag_off;
326 	int offset;
327 
328 	if (ipv6_ext_hdr(nexthdr)) {
329 		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
330 		if (offset < 0)
331 			return 0;
332 	} else
333 		offset = sizeof(struct ipv6hdr);
334 
335 	if (nexthdr == IPPROTO_ICMPV6) {
336 		struct icmp6hdr *icmp6;
337 
338 		if (!pskb_may_pull(skb, (skb_network_header(skb) +
339 					 offset + 1 - skb->data)))
340 			return 0;
341 
342 		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
343 
344 		switch (icmp6->icmp6_type) {
345 		case NDISC_ROUTER_SOLICITATION:
346 		case NDISC_ROUTER_ADVERTISEMENT:
347 		case NDISC_NEIGHBOUR_SOLICITATION:
348 		case NDISC_NEIGHBOUR_ADVERTISEMENT:
349 		case NDISC_REDIRECT:
350 			/* For reaction involving unicast neighbor discovery
351 			 * message destined to the proxied address, pass it to
352 			 * input function.
353 			 */
354 			return 1;
355 		default:
356 			break;
357 		}
358 	}
359 
360 	/*
361 	 * The proxying router can't forward traffic sent to a link-local
362 	 * address, so signal the sender and discard the packet. This
363 	 * behavior is clarified by the MIPv6 specification.
364 	 */
365 	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
366 		dst_link_failure(skb);
367 		return -1;
368 	}
369 
370 	return 0;
371 }
372 
373 static inline int ip6_forward_finish(struct net *net, struct sock *sk,
374 				     struct sk_buff *skb)
375 {
376 	struct dst_entry *dst = skb_dst(skb);
377 
378 	__IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
379 	__IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
380 
381 	return dst_output(net, sk, skb);
382 }
383 
384 static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
385 {
386 	if (skb->len <= mtu)
387 		return false;
388 
389 	/* ipv6 conntrack defrag sets max_frag_size + ignore_df */
390 	if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
391 		return true;
392 
393 	if (skb->ignore_df)
394 		return false;
395 
396 	if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
397 		return false;
398 
399 	return true;
400 }
401 
402 int ip6_forward(struct sk_buff *skb)
403 {
404 	struct inet6_dev *idev = __in6_dev_get_safely(skb->dev);
405 	struct dst_entry *dst = skb_dst(skb);
406 	struct ipv6hdr *hdr = ipv6_hdr(skb);
407 	struct inet6_skb_parm *opt = IP6CB(skb);
408 	struct net *net = dev_net(dst->dev);
409 	u32 mtu;
410 
411 	if (net->ipv6.devconf_all->forwarding == 0)
412 		goto error;
413 
414 	if (skb->pkt_type != PACKET_HOST)
415 		goto drop;
416 
417 	if (unlikely(skb->sk))
418 		goto drop;
419 
420 	if (skb_warn_if_lro(skb))
421 		goto drop;
422 
423 	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
424 		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
425 		goto drop;
426 	}
427 
428 	skb_forward_csum(skb);
429 
430 	/*
431 	 *	We DO NOT make any processing on
432 	 *	RA packets, pushing them to user level AS IS
433 	 *	without ane WARRANTY that application will be able
434 	 *	to interpret them. The reason is that we
435 	 *	cannot make anything clever here.
436 	 *
437 	 *	We are not end-node, so that if packet contains
438 	 *	AH/ESP, we cannot make anything.
439 	 *	Defragmentation also would be mistake, RA packets
440 	 *	cannot be fragmented, because there is no warranty
441 	 *	that different fragments will go along one path. --ANK
442 	 */
443 	if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
444 		if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
445 			return 0;
446 	}
447 
448 	/*
449 	 *	check and decrement ttl
450 	 */
451 	if (hdr->hop_limit <= 1) {
452 		/* Force OUTPUT device used as source address */
453 		skb->dev = dst->dev;
454 		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
455 		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
456 
457 		kfree_skb(skb);
458 		return -ETIMEDOUT;
459 	}
460 
461 	/* XXX: idev->cnf.proxy_ndp? */
462 	if (net->ipv6.devconf_all->proxy_ndp &&
463 	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
464 		int proxied = ip6_forward_proxy_check(skb);
465 		if (proxied > 0)
466 			return ip6_input(skb);
467 		else if (proxied < 0) {
468 			__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
469 			goto drop;
470 		}
471 	}
472 
473 	if (!xfrm6_route_forward(skb)) {
474 		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
475 		goto drop;
476 	}
477 	dst = skb_dst(skb);
478 
479 	/* IPv6 specs say nothing about it, but it is clear that we cannot
480 	   send redirects to source routed frames.
481 	   We don't send redirects to frames decapsulated from IPsec.
482 	 */
483 	if (IP6CB(skb)->iif == dst->dev->ifindex &&
484 	    opt->srcrt == 0 && !skb_sec_path(skb)) {
485 		struct in6_addr *target = NULL;
486 		struct inet_peer *peer;
487 		struct rt6_info *rt;
488 
489 		/*
490 		 *	incoming and outgoing devices are the same
491 		 *	send a redirect.
492 		 */
493 
494 		rt = (struct rt6_info *) dst;
495 		if (rt->rt6i_flags & RTF_GATEWAY)
496 			target = &rt->rt6i_gateway;
497 		else
498 			target = &hdr->daddr;
499 
500 		peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);
501 
502 		/* Limit redirects both by destination (here)
503 		   and by source (inside ndisc_send_redirect)
504 		 */
505 		if (inet_peer_xrlim_allow(peer, 1*HZ))
506 			ndisc_send_redirect(skb, target);
507 		if (peer)
508 			inet_putpeer(peer);
509 	} else {
510 		int addrtype = ipv6_addr_type(&hdr->saddr);
511 
512 		/* This check is security critical. */
513 		if (addrtype == IPV6_ADDR_ANY ||
514 		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
515 			goto error;
516 		if (addrtype & IPV6_ADDR_LINKLOCAL) {
517 			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
518 				    ICMPV6_NOT_NEIGHBOUR, 0);
519 			goto error;
520 		}
521 	}
522 
523 	mtu = ip6_dst_mtu_forward(dst);
524 	if (mtu < IPV6_MIN_MTU)
525 		mtu = IPV6_MIN_MTU;
526 
527 	if (ip6_pkt_too_big(skb, mtu)) {
528 		/* Again, force OUTPUT device used as source address */
529 		skb->dev = dst->dev;
530 		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
531 		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INTOOBIGERRORS);
532 		__IP6_INC_STATS(net, ip6_dst_idev(dst),
533 				IPSTATS_MIB_FRAGFAILS);
534 		kfree_skb(skb);
535 		return -EMSGSIZE;
536 	}
537 
538 	if (skb_cow(skb, dst->dev->hard_header_len)) {
539 		__IP6_INC_STATS(net, ip6_dst_idev(dst),
540 				IPSTATS_MIB_OUTDISCARDS);
541 		goto drop;
542 	}
543 
544 	hdr = ipv6_hdr(skb);
545 
546 	/* Mangling hops number delayed to point after skb COW */
547 
548 	hdr->hop_limit--;
549 
550 	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
551 		       net, NULL, skb, skb->dev, dst->dev,
552 		       ip6_forward_finish);
553 
554 error:
555 	__IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
556 drop:
557 	kfree_skb(skb);
558 	return -EINVAL;
559 }
560 
561 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
562 {
563 	to->pkt_type = from->pkt_type;
564 	to->priority = from->priority;
565 	to->protocol = from->protocol;
566 	skb_dst_drop(to);
567 	skb_dst_set(to, dst_clone(skb_dst(from)));
568 	to->dev = from->dev;
569 	to->mark = from->mark;
570 
571 	skb_copy_hash(to, from);
572 
573 #ifdef CONFIG_NET_SCHED
574 	to->tc_index = from->tc_index;
575 #endif
576 	nf_copy(to, from);
577 	skb_copy_secmark(to, from);
578 }
579 
580 int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
581 		 int (*output)(struct net *, struct sock *, struct sk_buff *))
582 {
583 	struct sk_buff *frag;
584 	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
585 	struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
586 				inet6_sk(skb->sk) : NULL;
587 	struct ipv6hdr *tmp_hdr;
588 	struct frag_hdr *fh;
589 	unsigned int mtu, hlen, left, len;
590 	int hroom, troom;
591 	__be32 frag_id;
592 	int ptr, offset = 0, err = 0;
593 	u8 *prevhdr, nexthdr = 0;
594 
595 	err = ip6_find_1stfragopt(skb, &prevhdr);
596 	if (err < 0)
597 		goto fail;
598 	hlen = err;
599 	nexthdr = *prevhdr;
600 
601 	mtu = ip6_skb_dst_mtu(skb);
602 
603 	/* We must not fragment if the socket is set to force MTU discovery
604 	 * or if the skb it not generated by a local socket.
605 	 */
606 	if (unlikely(!skb->ignore_df && skb->len > mtu))
607 		goto fail_toobig;
608 
609 	if (IP6CB(skb)->frag_max_size) {
610 		if (IP6CB(skb)->frag_max_size > mtu)
611 			goto fail_toobig;
612 
613 		/* don't send fragments larger than what we received */
614 		mtu = IP6CB(skb)->frag_max_size;
615 		if (mtu < IPV6_MIN_MTU)
616 			mtu = IPV6_MIN_MTU;
617 	}
618 
619 	if (np && np->frag_size < mtu) {
620 		if (np->frag_size)
621 			mtu = np->frag_size;
622 	}
623 	if (mtu < hlen + sizeof(struct frag_hdr) + 8)
624 		goto fail_toobig;
625 	mtu -= hlen + sizeof(struct frag_hdr);
626 
627 	frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
628 				    &ipv6_hdr(skb)->saddr);
629 
630 	if (skb->ip_summed == CHECKSUM_PARTIAL &&
631 	    (err = skb_checksum_help(skb)))
632 		goto fail;
633 
634 	hroom = LL_RESERVED_SPACE(rt->dst.dev);
635 	if (skb_has_frag_list(skb)) {
636 		unsigned int first_len = skb_pagelen(skb);
637 		struct sk_buff *frag2;
638 
639 		if (first_len - hlen > mtu ||
640 		    ((first_len - hlen) & 7) ||
641 		    skb_cloned(skb) ||
642 		    skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
643 			goto slow_path;
644 
645 		skb_walk_frags(skb, frag) {
646 			/* Correct geometry. */
647 			if (frag->len > mtu ||
648 			    ((frag->len & 7) && frag->next) ||
649 			    skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
650 				goto slow_path_clean;
651 
652 			/* Partially cloned skb? */
653 			if (skb_shared(frag))
654 				goto slow_path_clean;
655 
656 			BUG_ON(frag->sk);
657 			if (skb->sk) {
658 				frag->sk = skb->sk;
659 				frag->destructor = sock_wfree;
660 			}
661 			skb->truesize -= frag->truesize;
662 		}
663 
664 		err = 0;
665 		offset = 0;
666 		/* BUILD HEADER */
667 
668 		*prevhdr = NEXTHDR_FRAGMENT;
669 		tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
670 		if (!tmp_hdr) {
671 			err = -ENOMEM;
672 			goto fail;
673 		}
674 		frag = skb_shinfo(skb)->frag_list;
675 		skb_frag_list_init(skb);
676 
677 		__skb_pull(skb, hlen);
678 		fh = __skb_push(skb, sizeof(struct frag_hdr));
679 		__skb_push(skb, hlen);
680 		skb_reset_network_header(skb);
681 		memcpy(skb_network_header(skb), tmp_hdr, hlen);
682 
683 		fh->nexthdr = nexthdr;
684 		fh->reserved = 0;
685 		fh->frag_off = htons(IP6_MF);
686 		fh->identification = frag_id;
687 
688 		first_len = skb_pagelen(skb);
689 		skb->data_len = first_len - skb_headlen(skb);
690 		skb->len = first_len;
691 		ipv6_hdr(skb)->payload_len = htons(first_len -
692 						   sizeof(struct ipv6hdr));
693 
694 		for (;;) {
695 			/* Prepare header of the next frame,
696 			 * before previous one went down. */
697 			if (frag) {
698 				frag->ip_summed = CHECKSUM_NONE;
699 				skb_reset_transport_header(frag);
700 				fh = __skb_push(frag, sizeof(struct frag_hdr));
701 				__skb_push(frag, hlen);
702 				skb_reset_network_header(frag);
703 				memcpy(skb_network_header(frag), tmp_hdr,
704 				       hlen);
705 				offset += skb->len - hlen - sizeof(struct frag_hdr);
706 				fh->nexthdr = nexthdr;
707 				fh->reserved = 0;
708 				fh->frag_off = htons(offset);
709 				if (frag->next)
710 					fh->frag_off |= htons(IP6_MF);
711 				fh->identification = frag_id;
712 				ipv6_hdr(frag)->payload_len =
713 						htons(frag->len -
714 						      sizeof(struct ipv6hdr));
715 				ip6_copy_metadata(frag, skb);
716 			}
717 
718 			err = output(net, sk, skb);
719 			if (!err)
720 				IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
721 					      IPSTATS_MIB_FRAGCREATES);
722 
723 			if (err || !frag)
724 				break;
725 
726 			skb = frag;
727 			frag = skb->next;
728 			skb_mark_not_on_list(skb);
729 		}
730 
731 		kfree(tmp_hdr);
732 
733 		if (err == 0) {
734 			IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
735 				      IPSTATS_MIB_FRAGOKS);
736 			return 0;
737 		}
738 
739 		kfree_skb_list(frag);
740 
741 		IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
742 			      IPSTATS_MIB_FRAGFAILS);
743 		return err;
744 
745 slow_path_clean:
746 		skb_walk_frags(skb, frag2) {
747 			if (frag2 == frag)
748 				break;
749 			frag2->sk = NULL;
750 			frag2->destructor = NULL;
751 			skb->truesize += frag2->truesize;
752 		}
753 	}
754 
755 slow_path:
756 	left = skb->len - hlen;		/* Space per frame */
757 	ptr = hlen;			/* Where to start from */
758 
759 	/*
760 	 *	Fragment the datagram.
761 	 */
762 
763 	troom = rt->dst.dev->needed_tailroom;
764 
765 	/*
766 	 *	Keep copying data until we run out.
767 	 */
768 	while (left > 0)	{
769 		u8 *fragnexthdr_offset;
770 
771 		len = left;
772 		/* IF: it doesn't fit, use 'mtu' - the data space left */
773 		if (len > mtu)
774 			len = mtu;
775 		/* IF: we are not sending up to and including the packet end
776 		   then align the next start on an eight byte boundary */
777 		if (len < left)	{
778 			len &= ~7;
779 		}
780 
781 		/* Allocate buffer */
782 		frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
783 				 hroom + troom, GFP_ATOMIC);
784 		if (!frag) {
785 			err = -ENOMEM;
786 			goto fail;
787 		}
788 
789 		/*
790 		 *	Set up data on packet
791 		 */
792 
793 		ip6_copy_metadata(frag, skb);
794 		skb_reserve(frag, hroom);
795 		skb_put(frag, len + hlen + sizeof(struct frag_hdr));
796 		skb_reset_network_header(frag);
797 		fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
798 		frag->transport_header = (frag->network_header + hlen +
799 					  sizeof(struct frag_hdr));
800 
801 		/*
802 		 *	Charge the memory for the fragment to any owner
803 		 *	it might possess
804 		 */
805 		if (skb->sk)
806 			skb_set_owner_w(frag, skb->sk);
807 
808 		/*
809 		 *	Copy the packet header into the new buffer.
810 		 */
811 		skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
812 
813 		fragnexthdr_offset = skb_network_header(frag);
814 		fragnexthdr_offset += prevhdr - skb_network_header(skb);
815 		*fragnexthdr_offset = NEXTHDR_FRAGMENT;
816 
817 		/*
818 		 *	Build fragment header.
819 		 */
820 		fh->nexthdr = nexthdr;
821 		fh->reserved = 0;
822 		fh->identification = frag_id;
823 
824 		/*
825 		 *	Copy a block of the IP datagram.
826 		 */
827 		BUG_ON(skb_copy_bits(skb, ptr, skb_transport_header(frag),
828 				     len));
829 		left -= len;
830 
831 		fh->frag_off = htons(offset);
832 		if (left > 0)
833 			fh->frag_off |= htons(IP6_MF);
834 		ipv6_hdr(frag)->payload_len = htons(frag->len -
835 						    sizeof(struct ipv6hdr));
836 
837 		ptr += len;
838 		offset += len;
839 
840 		/*
841 		 *	Put this fragment into the sending queue.
842 		 */
843 		err = output(net, sk, frag);
844 		if (err)
845 			goto fail;
846 
847 		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
848 			      IPSTATS_MIB_FRAGCREATES);
849 	}
850 	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
851 		      IPSTATS_MIB_FRAGOKS);
852 	consume_skb(skb);
853 	return err;
854 
855 fail_toobig:
856 	if (skb->sk && dst_allfrag(skb_dst(skb)))
857 		sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);
858 
859 	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
860 	err = -EMSGSIZE;
861 
862 fail:
863 	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
864 		      IPSTATS_MIB_FRAGFAILS);
865 	kfree_skb(skb);
866 	return err;
867 }
868 
869 static inline int ip6_rt_check(const struct rt6key *rt_key,
870 			       const struct in6_addr *fl_addr,
871 			       const struct in6_addr *addr_cache)
872 {
873 	return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
874 		(!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
875 }
876 
877 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
878 					  struct dst_entry *dst,
879 					  const struct flowi6 *fl6)
880 {
881 	struct ipv6_pinfo *np = inet6_sk(sk);
882 	struct rt6_info *rt;
883 
884 	if (!dst)
885 		goto out;
886 
887 	if (dst->ops->family != AF_INET6) {
888 		dst_release(dst);
889 		return NULL;
890 	}
891 
892 	rt = (struct rt6_info *)dst;
893 	/* Yes, checking route validity in not connected
894 	 * case is not very simple. Take into account,
895 	 * that we do not support routing by source, TOS,
896 	 * and MSG_DONTROUTE		--ANK (980726)
897 	 *
898 	 * 1. ip6_rt_check(): If route was host route,
899 	 *    check that cached destination is current.
900 	 *    If it is network route, we still may
901 	 *    check its validity using saved pointer
902 	 *    to the last used address: daddr_cache.
903 	 *    We do not want to save whole address now,
904 	 *    (because main consumer of this service
905 	 *    is tcp, which has not this problem),
906 	 *    so that the last trick works only on connected
907 	 *    sockets.
908 	 * 2. oif also should be the same.
909 	 */
910 	if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
911 #ifdef CONFIG_IPV6_SUBTREES
912 	    ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
913 #endif
914 	   (!(fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) &&
915 	      (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex))) {
916 		dst_release(dst);
917 		dst = NULL;
918 	}
919 
920 out:
921 	return dst;
922 }
923 
924 static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
925 			       struct dst_entry **dst, struct flowi6 *fl6)
926 {
927 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
928 	struct neighbour *n;
929 	struct rt6_info *rt;
930 #endif
931 	int err;
932 	int flags = 0;
933 
934 	/* The correct way to handle this would be to do
935 	 * ip6_route_get_saddr, and then ip6_route_output; however,
936 	 * the route-specific preferred source forces the
937 	 * ip6_route_output call _before_ ip6_route_get_saddr.
938 	 *
939 	 * In source specific routing (no src=any default route),
940 	 * ip6_route_output will fail given src=any saddr, though, so
941 	 * that's why we try it again later.
942 	 */
943 	if (ipv6_addr_any(&fl6->saddr) && (!*dst || !(*dst)->error)) {
944 		struct fib6_info *from;
945 		struct rt6_info *rt;
946 		bool had_dst = *dst != NULL;
947 
948 		if (!had_dst)
949 			*dst = ip6_route_output(net, sk, fl6);
950 		rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;
951 
952 		rcu_read_lock();
953 		from = rt ? rcu_dereference(rt->from) : NULL;
954 		err = ip6_route_get_saddr(net, from, &fl6->daddr,
955 					  sk ? inet6_sk(sk)->srcprefs : 0,
956 					  &fl6->saddr);
957 		rcu_read_unlock();
958 
959 		if (err)
960 			goto out_err_release;
961 
962 		/* If we had an erroneous initial result, pretend it
963 		 * never existed and let the SA-enabled version take
964 		 * over.
965 		 */
966 		if (!had_dst && (*dst)->error) {
967 			dst_release(*dst);
968 			*dst = NULL;
969 		}
970 
971 		if (fl6->flowi6_oif)
972 			flags |= RT6_LOOKUP_F_IFACE;
973 	}
974 
975 	if (!*dst)
976 		*dst = ip6_route_output_flags(net, sk, fl6, flags);
977 
978 	err = (*dst)->error;
979 	if (err)
980 		goto out_err_release;
981 
982 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
983 	/*
984 	 * Here if the dst entry we've looked up
985 	 * has a neighbour entry that is in the INCOMPLETE
986 	 * state and the src address from the flow is
987 	 * marked as OPTIMISTIC, we release the found
988 	 * dst entry and replace it instead with the
989 	 * dst entry of the nexthop router
990 	 */
991 	rt = (struct rt6_info *) *dst;
992 	rcu_read_lock_bh();
993 	n = __ipv6_neigh_lookup_noref(rt->dst.dev,
994 				      rt6_nexthop(rt, &fl6->daddr));
995 	err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
996 	rcu_read_unlock_bh();
997 
998 	if (err) {
999 		struct inet6_ifaddr *ifp;
1000 		struct flowi6 fl_gw6;
1001 		int redirect;
1002 
1003 		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
1004 				      (*dst)->dev, 1);
1005 
1006 		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
1007 		if (ifp)
1008 			in6_ifa_put(ifp);
1009 
1010 		if (redirect) {
1011 			/*
1012 			 * We need to get the dst entry for the
1013 			 * default router instead
1014 			 */
1015 			dst_release(*dst);
1016 			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
1017 			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
1018 			*dst = ip6_route_output(net, sk, &fl_gw6);
1019 			err = (*dst)->error;
1020 			if (err)
1021 				goto out_err_release;
1022 		}
1023 	}
1024 #endif
1025 	if (ipv6_addr_v4mapped(&fl6->saddr) &&
1026 	    !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
1027 		err = -EAFNOSUPPORT;
1028 		goto out_err_release;
1029 	}
1030 
1031 	return 0;
1032 
1033 out_err_release:
1034 	dst_release(*dst);
1035 	*dst = NULL;
1036 
1037 	if (err == -ENETUNREACH)
1038 		IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
1039 	return err;
1040 }
1041 
1042 /**
1043  *	ip6_dst_lookup - perform route lookup on flow
1044  *	@sk: socket which provides route info
1045  *	@dst: pointer to dst_entry * for result
1046  *	@fl6: flow to lookup
1047  *
1048  *	This function performs a route lookup on the given flow.
1049  *
1050  *	It returns zero on success, or a standard errno code on error.
1051  */
1052 int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
1053 		   struct flowi6 *fl6)
1054 {
1055 	*dst = NULL;
1056 	return ip6_dst_lookup_tail(net, sk, dst, fl6);
1057 }
1058 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1059 
1060 /**
1061  *	ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1062  *	@sk: socket which provides route info
1063  *	@fl6: flow to lookup
1064  *	@final_dst: final destination address for ipsec lookup
1065  *
1066  *	This function performs a route lookup on the given flow.
1067  *
1068  *	It returns a valid dst pointer on success, or a pointer encoded
1069  *	error code.
1070  */
1071 struct dst_entry *ip6_dst_lookup_flow(const struct sock *sk, struct flowi6 *fl6,
1072 				      const struct in6_addr *final_dst)
1073 {
1074 	struct dst_entry *dst = NULL;
1075 	int err;
1076 
1077 	err = ip6_dst_lookup_tail(sock_net(sk), sk, &dst, fl6);
1078 	if (err)
1079 		return ERR_PTR(err);
1080 	if (final_dst)
1081 		fl6->daddr = *final_dst;
1082 
1083 	return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1084 }
1085 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
1086 
1087 /**
1088  *	ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1089  *	@sk: socket which provides the dst cache and route info
1090  *	@fl6: flow to lookup
1091  *	@final_dst: final destination address for ipsec lookup
1092  *	@connected: whether @sk is connected or not
1093  *
1094  *	This function performs a route lookup on the given flow with the
1095  *	possibility of using the cached route in the socket if it is valid.
1096  *	It will take the socket dst lock when operating on the dst cache.
1097  *	As a result, this function can only be used in process context.
1098  *
1099  *	In addition, for a connected socket, cache the dst in the socket
1100  *	if the current cache is not valid.
1101  *
1102  *	It returns a valid dst pointer on success, or a pointer encoded
1103  *	error code.
1104  */
1105 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1106 					 const struct in6_addr *final_dst,
1107 					 bool connected)
1108 {
1109 	struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1110 
1111 	dst = ip6_sk_dst_check(sk, dst, fl6);
1112 	if (dst)
1113 		return dst;
1114 
1115 	dst = ip6_dst_lookup_flow(sk, fl6, final_dst);
1116 	if (connected && !IS_ERR(dst))
1117 		ip6_sk_dst_store_flow(sk, dst_clone(dst), fl6);
1118 
1119 	return dst;
1120 }
1121 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1122 
1123 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1124 					       gfp_t gfp)
1125 {
1126 	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1127 }
1128 
1129 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1130 						gfp_t gfp)
1131 {
1132 	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1133 }
1134 
1135 static void ip6_append_data_mtu(unsigned int *mtu,
1136 				int *maxfraglen,
1137 				unsigned int fragheaderlen,
1138 				struct sk_buff *skb,
1139 				struct rt6_info *rt,
1140 				unsigned int orig_mtu)
1141 {
1142 	if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1143 		if (!skb) {
1144 			/* first fragment, reserve header_len */
1145 			*mtu = orig_mtu - rt->dst.header_len;
1146 
1147 		} else {
1148 			/*
1149 			 * this fragment is not first, the headers
1150 			 * space is regarded as data space.
1151 			 */
1152 			*mtu = orig_mtu;
1153 		}
1154 		*maxfraglen = ((*mtu - fragheaderlen) & ~7)
1155 			      + fragheaderlen - sizeof(struct frag_hdr);
1156 	}
1157 }
1158 
1159 static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
1160 			  struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
1161 			  struct rt6_info *rt, struct flowi6 *fl6)
1162 {
1163 	struct ipv6_pinfo *np = inet6_sk(sk);
1164 	unsigned int mtu;
1165 	struct ipv6_txoptions *opt = ipc6->opt;
1166 
1167 	/*
1168 	 * setup for corking
1169 	 */
1170 	if (opt) {
1171 		if (WARN_ON(v6_cork->opt))
1172 			return -EINVAL;
1173 
1174 		v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
1175 		if (unlikely(!v6_cork->opt))
1176 			return -ENOBUFS;
1177 
1178 		v6_cork->opt->tot_len = sizeof(*opt);
1179 		v6_cork->opt->opt_flen = opt->opt_flen;
1180 		v6_cork->opt->opt_nflen = opt->opt_nflen;
1181 
1182 		v6_cork->opt->dst0opt = ip6_opt_dup(opt->dst0opt,
1183 						    sk->sk_allocation);
1184 		if (opt->dst0opt && !v6_cork->opt->dst0opt)
1185 			return -ENOBUFS;
1186 
1187 		v6_cork->opt->dst1opt = ip6_opt_dup(opt->dst1opt,
1188 						    sk->sk_allocation);
1189 		if (opt->dst1opt && !v6_cork->opt->dst1opt)
1190 			return -ENOBUFS;
1191 
1192 		v6_cork->opt->hopopt = ip6_opt_dup(opt->hopopt,
1193 						   sk->sk_allocation);
1194 		if (opt->hopopt && !v6_cork->opt->hopopt)
1195 			return -ENOBUFS;
1196 
1197 		v6_cork->opt->srcrt = ip6_rthdr_dup(opt->srcrt,
1198 						    sk->sk_allocation);
1199 		if (opt->srcrt && !v6_cork->opt->srcrt)
1200 			return -ENOBUFS;
1201 
1202 		/* need source address above miyazawa*/
1203 	}
1204 	dst_hold(&rt->dst);
1205 	cork->base.dst = &rt->dst;
1206 	cork->fl.u.ip6 = *fl6;
1207 	v6_cork->hop_limit = ipc6->hlimit;
1208 	v6_cork->tclass = ipc6->tclass;
1209 	if (rt->dst.flags & DST_XFRM_TUNNEL)
1210 		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1211 		      READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst);
1212 	else
1213 		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1214 			READ_ONCE(rt->dst.dev->mtu) : dst_mtu(xfrm_dst_path(&rt->dst));
1215 	if (np->frag_size < mtu) {
1216 		if (np->frag_size)
1217 			mtu = np->frag_size;
1218 	}
1219 	if (mtu < IPV6_MIN_MTU)
1220 		return -EINVAL;
1221 	cork->base.fragsize = mtu;
1222 	cork->base.gso_size = ipc6->gso_size;
1223 	cork->base.tx_flags = 0;
1224 	sock_tx_timestamp(sk, ipc6->sockc.tsflags, &cork->base.tx_flags);
1225 
1226 	if (dst_allfrag(xfrm_dst_path(&rt->dst)))
1227 		cork->base.flags |= IPCORK_ALLFRAG;
1228 	cork->base.length = 0;
1229 
1230 	cork->base.transmit_time = ipc6->sockc.transmit_time;
1231 
1232 	return 0;
1233 }
1234 
1235 static int __ip6_append_data(struct sock *sk,
1236 			     struct flowi6 *fl6,
1237 			     struct sk_buff_head *queue,
1238 			     struct inet_cork *cork,
1239 			     struct inet6_cork *v6_cork,
1240 			     struct page_frag *pfrag,
1241 			     int getfrag(void *from, char *to, int offset,
1242 					 int len, int odd, struct sk_buff *skb),
1243 			     void *from, int length, int transhdrlen,
1244 			     unsigned int flags, struct ipcm6_cookie *ipc6)
1245 {
1246 	struct sk_buff *skb, *skb_prev = NULL;
1247 	unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
1248 	struct ubuf_info *uarg = NULL;
1249 	int exthdrlen = 0;
1250 	int dst_exthdrlen = 0;
1251 	int hh_len;
1252 	int copy;
1253 	int err;
1254 	int offset = 0;
1255 	u32 tskey = 0;
1256 	struct rt6_info *rt = (struct rt6_info *)cork->dst;
1257 	struct ipv6_txoptions *opt = v6_cork->opt;
1258 	int csummode = CHECKSUM_NONE;
1259 	unsigned int maxnonfragsize, headersize;
1260 	unsigned int wmem_alloc_delta = 0;
1261 	bool paged, extra_uref;
1262 
1263 	skb = skb_peek_tail(queue);
1264 	if (!skb) {
1265 		exthdrlen = opt ? opt->opt_flen : 0;
1266 		dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
1267 	}
1268 
1269 	paged = !!cork->gso_size;
1270 	mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize;
1271 	orig_mtu = mtu;
1272 
1273 	if (cork->tx_flags & SKBTX_ANY_SW_TSTAMP &&
1274 	    sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
1275 		tskey = sk->sk_tskey++;
1276 
1277 	hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1278 
1279 	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1280 			(opt ? opt->opt_nflen : 0);
1281 	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
1282 		     sizeof(struct frag_hdr);
1283 
1284 	headersize = sizeof(struct ipv6hdr) +
1285 		     (opt ? opt->opt_flen + opt->opt_nflen : 0) +
1286 		     (dst_allfrag(&rt->dst) ?
1287 		      sizeof(struct frag_hdr) : 0) +
1288 		     rt->rt6i_nfheader_len;
1289 
1290 	/* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit
1291 	 * the first fragment
1292 	 */
1293 	if (headersize + transhdrlen > mtu)
1294 		goto emsgsize;
1295 
1296 	if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
1297 	    (sk->sk_protocol == IPPROTO_UDP ||
1298 	     sk->sk_protocol == IPPROTO_RAW)) {
1299 		ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
1300 				sizeof(struct ipv6hdr));
1301 		goto emsgsize;
1302 	}
1303 
1304 	if (ip6_sk_ignore_df(sk))
1305 		maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
1306 	else
1307 		maxnonfragsize = mtu;
1308 
1309 	if (cork->length + length > maxnonfragsize - headersize) {
1310 emsgsize:
1311 		pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0);
1312 		ipv6_local_error(sk, EMSGSIZE, fl6, pmtu);
1313 		return -EMSGSIZE;
1314 	}
1315 
1316 	/* CHECKSUM_PARTIAL only with no extension headers and when
1317 	 * we are not going to fragment
1318 	 */
1319 	if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
1320 	    headersize == sizeof(struct ipv6hdr) &&
1321 	    length <= mtu - headersize &&
1322 	    (!(flags & MSG_MORE) || cork->gso_size) &&
1323 	    rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
1324 		csummode = CHECKSUM_PARTIAL;
1325 
1326 	if (flags & MSG_ZEROCOPY && length && sock_flag(sk, SOCK_ZEROCOPY)) {
1327 		uarg = sock_zerocopy_realloc(sk, length, skb_zcopy(skb));
1328 		if (!uarg)
1329 			return -ENOBUFS;
1330 		extra_uref = true;
1331 		if (rt->dst.dev->features & NETIF_F_SG &&
1332 		    csummode == CHECKSUM_PARTIAL) {
1333 			paged = true;
1334 		} else {
1335 			uarg->zerocopy = 0;
1336 			skb_zcopy_set(skb, uarg, &extra_uref);
1337 		}
1338 	}
1339 
1340 	/*
1341 	 * Let's try using as much space as possible.
1342 	 * Use MTU if total length of the message fits into the MTU.
1343 	 * Otherwise, we need to reserve fragment header and
1344 	 * fragment alignment (= 8-15 octects, in total).
1345 	 *
1346 	 * Note that we may need to "move" the data from the tail of
1347 	 * of the buffer to the new fragment when we split
1348 	 * the message.
1349 	 *
1350 	 * FIXME: It may be fragmented into multiple chunks
1351 	 *        at once if non-fragmentable extension headers
1352 	 *        are too large.
1353 	 * --yoshfuji
1354 	 */
1355 
1356 	cork->length += length;
1357 	if (!skb)
1358 		goto alloc_new_skb;
1359 
1360 	while (length > 0) {
1361 		/* Check if the remaining data fits into current packet. */
1362 		copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1363 		if (copy < length)
1364 			copy = maxfraglen - skb->len;
1365 
1366 		if (copy <= 0) {
1367 			char *data;
1368 			unsigned int datalen;
1369 			unsigned int fraglen;
1370 			unsigned int fraggap;
1371 			unsigned int alloclen;
1372 			unsigned int pagedlen;
1373 alloc_new_skb:
1374 			/* There's no room in the current skb */
1375 			if (skb)
1376 				fraggap = skb->len - maxfraglen;
1377 			else
1378 				fraggap = 0;
1379 			/* update mtu and maxfraglen if necessary */
1380 			if (!skb || !skb_prev)
1381 				ip6_append_data_mtu(&mtu, &maxfraglen,
1382 						    fragheaderlen, skb, rt,
1383 						    orig_mtu);
1384 
1385 			skb_prev = skb;
1386 
1387 			/*
1388 			 * If remaining data exceeds the mtu,
1389 			 * we know we need more fragment(s).
1390 			 */
1391 			datalen = length + fraggap;
1392 
1393 			if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1394 				datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
1395 			fraglen = datalen + fragheaderlen;
1396 			pagedlen = 0;
1397 
1398 			if ((flags & MSG_MORE) &&
1399 			    !(rt->dst.dev->features&NETIF_F_SG))
1400 				alloclen = mtu;
1401 			else if (!paged)
1402 				alloclen = fraglen;
1403 			else {
1404 				alloclen = min_t(int, fraglen, MAX_HEADER);
1405 				pagedlen = fraglen - alloclen;
1406 			}
1407 
1408 			alloclen += dst_exthdrlen;
1409 
1410 			if (datalen != length + fraggap) {
1411 				/*
1412 				 * this is not the last fragment, the trailer
1413 				 * space is regarded as data space.
1414 				 */
1415 				datalen += rt->dst.trailer_len;
1416 			}
1417 
1418 			alloclen += rt->dst.trailer_len;
1419 			fraglen = datalen + fragheaderlen;
1420 
1421 			/*
1422 			 * We just reserve space for fragment header.
1423 			 * Note: this may be overallocation if the message
1424 			 * (without MSG_MORE) fits into the MTU.
1425 			 */
1426 			alloclen += sizeof(struct frag_hdr);
1427 
1428 			copy = datalen - transhdrlen - fraggap - pagedlen;
1429 			if (copy < 0) {
1430 				err = -EINVAL;
1431 				goto error;
1432 			}
1433 			if (transhdrlen) {
1434 				skb = sock_alloc_send_skb(sk,
1435 						alloclen + hh_len,
1436 						(flags & MSG_DONTWAIT), &err);
1437 			} else {
1438 				skb = NULL;
1439 				if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <=
1440 				    2 * sk->sk_sndbuf)
1441 					skb = alloc_skb(alloclen + hh_len,
1442 							sk->sk_allocation);
1443 				if (unlikely(!skb))
1444 					err = -ENOBUFS;
1445 			}
1446 			if (!skb)
1447 				goto error;
1448 			/*
1449 			 *	Fill in the control structures
1450 			 */
1451 			skb->protocol = htons(ETH_P_IPV6);
1452 			skb->ip_summed = csummode;
1453 			skb->csum = 0;
1454 			/* reserve for fragmentation and ipsec header */
1455 			skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1456 				    dst_exthdrlen);
1457 
1458 			/*
1459 			 *	Find where to start putting bytes
1460 			 */
1461 			data = skb_put(skb, fraglen - pagedlen);
1462 			skb_set_network_header(skb, exthdrlen);
1463 			data += fragheaderlen;
1464 			skb->transport_header = (skb->network_header +
1465 						 fragheaderlen);
1466 			if (fraggap) {
1467 				skb->csum = skb_copy_and_csum_bits(
1468 					skb_prev, maxfraglen,
1469 					data + transhdrlen, fraggap, 0);
1470 				skb_prev->csum = csum_sub(skb_prev->csum,
1471 							  skb->csum);
1472 				data += fraggap;
1473 				pskb_trim_unique(skb_prev, maxfraglen);
1474 			}
1475 			if (copy > 0 &&
1476 			    getfrag(from, data + transhdrlen, offset,
1477 				    copy, fraggap, skb) < 0) {
1478 				err = -EFAULT;
1479 				kfree_skb(skb);
1480 				goto error;
1481 			}
1482 
1483 			offset += copy;
1484 			length -= copy + transhdrlen;
1485 			transhdrlen = 0;
1486 			exthdrlen = 0;
1487 			dst_exthdrlen = 0;
1488 
1489 			/* Only the initial fragment is time stamped */
1490 			skb_shinfo(skb)->tx_flags = cork->tx_flags;
1491 			cork->tx_flags = 0;
1492 			skb_shinfo(skb)->tskey = tskey;
1493 			tskey = 0;
1494 			skb_zcopy_set(skb, uarg, &extra_uref);
1495 
1496 			if ((flags & MSG_CONFIRM) && !skb_prev)
1497 				skb_set_dst_pending_confirm(skb, 1);
1498 
1499 			/*
1500 			 * Put the packet on the pending queue
1501 			 */
1502 			if (!skb->destructor) {
1503 				skb->destructor = sock_wfree;
1504 				skb->sk = sk;
1505 				wmem_alloc_delta += skb->truesize;
1506 			}
1507 			__skb_queue_tail(queue, skb);
1508 			continue;
1509 		}
1510 
1511 		if (copy > length)
1512 			copy = length;
1513 
1514 		if (!(rt->dst.dev->features&NETIF_F_SG) &&
1515 		    skb_tailroom(skb) >= copy) {
1516 			unsigned int off;
1517 
1518 			off = skb->len;
1519 			if (getfrag(from, skb_put(skb, copy),
1520 						offset, copy, off, skb) < 0) {
1521 				__skb_trim(skb, off);
1522 				err = -EFAULT;
1523 				goto error;
1524 			}
1525 		} else if (!uarg || !uarg->zerocopy) {
1526 			int i = skb_shinfo(skb)->nr_frags;
1527 
1528 			err = -ENOMEM;
1529 			if (!sk_page_frag_refill(sk, pfrag))
1530 				goto error;
1531 
1532 			if (!skb_can_coalesce(skb, i, pfrag->page,
1533 					      pfrag->offset)) {
1534 				err = -EMSGSIZE;
1535 				if (i == MAX_SKB_FRAGS)
1536 					goto error;
1537 
1538 				__skb_fill_page_desc(skb, i, pfrag->page,
1539 						     pfrag->offset, 0);
1540 				skb_shinfo(skb)->nr_frags = ++i;
1541 				get_page(pfrag->page);
1542 			}
1543 			copy = min_t(int, copy, pfrag->size - pfrag->offset);
1544 			if (getfrag(from,
1545 				    page_address(pfrag->page) + pfrag->offset,
1546 				    offset, copy, skb->len, skb) < 0)
1547 				goto error_efault;
1548 
1549 			pfrag->offset += copy;
1550 			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1551 			skb->len += copy;
1552 			skb->data_len += copy;
1553 			skb->truesize += copy;
1554 			wmem_alloc_delta += copy;
1555 		} else {
1556 			err = skb_zerocopy_iter_dgram(skb, from, copy);
1557 			if (err < 0)
1558 				goto error;
1559 		}
1560 		offset += copy;
1561 		length -= copy;
1562 	}
1563 
1564 	if (wmem_alloc_delta)
1565 		refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1566 	return 0;
1567 
1568 error_efault:
1569 	err = -EFAULT;
1570 error:
1571 	sock_zerocopy_put_abort(uarg, extra_uref);
1572 	cork->length -= length;
1573 	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1574 	refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1575 	return err;
1576 }
1577 
1578 int ip6_append_data(struct sock *sk,
1579 		    int getfrag(void *from, char *to, int offset, int len,
1580 				int odd, struct sk_buff *skb),
1581 		    void *from, int length, int transhdrlen,
1582 		    struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1583 		    struct rt6_info *rt, unsigned int flags)
1584 {
1585 	struct inet_sock *inet = inet_sk(sk);
1586 	struct ipv6_pinfo *np = inet6_sk(sk);
1587 	int exthdrlen;
1588 	int err;
1589 
1590 	if (flags&MSG_PROBE)
1591 		return 0;
1592 	if (skb_queue_empty(&sk->sk_write_queue)) {
1593 		/*
1594 		 * setup for corking
1595 		 */
1596 		err = ip6_setup_cork(sk, &inet->cork, &np->cork,
1597 				     ipc6, rt, fl6);
1598 		if (err)
1599 			return err;
1600 
1601 		exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1602 		length += exthdrlen;
1603 		transhdrlen += exthdrlen;
1604 	} else {
1605 		fl6 = &inet->cork.fl.u.ip6;
1606 		transhdrlen = 0;
1607 	}
1608 
1609 	return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base,
1610 				 &np->cork, sk_page_frag(sk), getfrag,
1611 				 from, length, transhdrlen, flags, ipc6);
1612 }
1613 EXPORT_SYMBOL_GPL(ip6_append_data);
1614 
1615 static void ip6_cork_release(struct inet_cork_full *cork,
1616 			     struct inet6_cork *v6_cork)
1617 {
1618 	if (v6_cork->opt) {
1619 		kfree(v6_cork->opt->dst0opt);
1620 		kfree(v6_cork->opt->dst1opt);
1621 		kfree(v6_cork->opt->hopopt);
1622 		kfree(v6_cork->opt->srcrt);
1623 		kfree(v6_cork->opt);
1624 		v6_cork->opt = NULL;
1625 	}
1626 
1627 	if (cork->base.dst) {
1628 		dst_release(cork->base.dst);
1629 		cork->base.dst = NULL;
1630 		cork->base.flags &= ~IPCORK_ALLFRAG;
1631 	}
1632 	memset(&cork->fl, 0, sizeof(cork->fl));
1633 }
1634 
1635 struct sk_buff *__ip6_make_skb(struct sock *sk,
1636 			       struct sk_buff_head *queue,
1637 			       struct inet_cork_full *cork,
1638 			       struct inet6_cork *v6_cork)
1639 {
1640 	struct sk_buff *skb, *tmp_skb;
1641 	struct sk_buff **tail_skb;
1642 	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1643 	struct ipv6_pinfo *np = inet6_sk(sk);
1644 	struct net *net = sock_net(sk);
1645 	struct ipv6hdr *hdr;
1646 	struct ipv6_txoptions *opt = v6_cork->opt;
1647 	struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
1648 	struct flowi6 *fl6 = &cork->fl.u.ip6;
1649 	unsigned char proto = fl6->flowi6_proto;
1650 
1651 	skb = __skb_dequeue(queue);
1652 	if (!skb)
1653 		goto out;
1654 	tail_skb = &(skb_shinfo(skb)->frag_list);
1655 
1656 	/* move skb->data to ip header from ext header */
1657 	if (skb->data < skb_network_header(skb))
1658 		__skb_pull(skb, skb_network_offset(skb));
1659 	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
1660 		__skb_pull(tmp_skb, skb_network_header_len(skb));
1661 		*tail_skb = tmp_skb;
1662 		tail_skb = &(tmp_skb->next);
1663 		skb->len += tmp_skb->len;
1664 		skb->data_len += tmp_skb->len;
1665 		skb->truesize += tmp_skb->truesize;
1666 		tmp_skb->destructor = NULL;
1667 		tmp_skb->sk = NULL;
1668 	}
1669 
1670 	/* Allow local fragmentation. */
1671 	skb->ignore_df = ip6_sk_ignore_df(sk);
1672 
1673 	*final_dst = fl6->daddr;
1674 	__skb_pull(skb, skb_network_header_len(skb));
1675 	if (opt && opt->opt_flen)
1676 		ipv6_push_frag_opts(skb, opt, &proto);
1677 	if (opt && opt->opt_nflen)
1678 		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);
1679 
1680 	skb_push(skb, sizeof(struct ipv6hdr));
1681 	skb_reset_network_header(skb);
1682 	hdr = ipv6_hdr(skb);
1683 
1684 	ip6_flow_hdr(hdr, v6_cork->tclass,
1685 		     ip6_make_flowlabel(net, skb, fl6->flowlabel,
1686 					ip6_autoflowlabel(net, np), fl6));
1687 	hdr->hop_limit = v6_cork->hop_limit;
1688 	hdr->nexthdr = proto;
1689 	hdr->saddr = fl6->saddr;
1690 	hdr->daddr = *final_dst;
1691 
1692 	skb->priority = sk->sk_priority;
1693 	skb->mark = sk->sk_mark;
1694 
1695 	skb->tstamp = cork->base.transmit_time;
1696 
1697 	skb_dst_set(skb, dst_clone(&rt->dst));
1698 	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1699 	if (proto == IPPROTO_ICMPV6) {
1700 		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1701 
1702 		ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
1703 		ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
1704 	}
1705 
1706 	ip6_cork_release(cork, v6_cork);
1707 out:
1708 	return skb;
1709 }
1710 
1711 int ip6_send_skb(struct sk_buff *skb)
1712 {
1713 	struct net *net = sock_net(skb->sk);
1714 	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
1715 	int err;
1716 
1717 	err = ip6_local_out(net, skb->sk, skb);
1718 	if (err) {
1719 		if (err > 0)
1720 			err = net_xmit_errno(err);
1721 		if (err)
1722 			IP6_INC_STATS(net, rt->rt6i_idev,
1723 				      IPSTATS_MIB_OUTDISCARDS);
1724 	}
1725 
1726 	return err;
1727 }
1728 
/* Build the pending datagram with ip6_finish_skb() and send it.
 * Returns 0 when nothing was built, otherwise the ip6_send_skb() result.
 */
int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb = ip6_finish_skb(sk);

	return skb ? ip6_send_skb(skb) : 0;
}
EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
1740 
1741 static void __ip6_flush_pending_frames(struct sock *sk,
1742 				       struct sk_buff_head *queue,
1743 				       struct inet_cork_full *cork,
1744 				       struct inet6_cork *v6_cork)
1745 {
1746 	struct sk_buff *skb;
1747 
1748 	while ((skb = __skb_dequeue_tail(queue)) != NULL) {
1749 		if (skb_dst(skb))
1750 			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1751 				      IPSTATS_MIB_OUTDISCARDS);
1752 		kfree_skb(skb);
1753 	}
1754 
1755 	ip6_cork_release(cork, v6_cork);
1756 }
1757 
1758 void ip6_flush_pending_frames(struct sock *sk)
1759 {
1760 	__ip6_flush_pending_frames(sk, &sk->sk_write_queue,
1761 				   &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
1762 }
1763 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
1764 
1765 struct sk_buff *ip6_make_skb(struct sock *sk,
1766 			     int getfrag(void *from, char *to, int offset,
1767 					 int len, int odd, struct sk_buff *skb),
1768 			     void *from, int length, int transhdrlen,
1769 			     struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1770 			     struct rt6_info *rt, unsigned int flags,
1771 			     struct inet_cork_full *cork)
1772 {
1773 	struct inet6_cork v6_cork;
1774 	struct sk_buff_head queue;
1775 	int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1776 	int err;
1777 
1778 	if (flags & MSG_PROBE)
1779 		return NULL;
1780 
1781 	__skb_queue_head_init(&queue);
1782 
1783 	cork->base.flags = 0;
1784 	cork->base.addr = 0;
1785 	cork->base.opt = NULL;
1786 	cork->base.dst = NULL;
1787 	v6_cork.opt = NULL;
1788 	err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt, fl6);
1789 	if (err) {
1790 		ip6_cork_release(cork, &v6_cork);
1791 		return ERR_PTR(err);
1792 	}
1793 	if (ipc6->dontfrag < 0)
1794 		ipc6->dontfrag = inet6_sk(sk)->dontfrag;
1795 
1796 	err = __ip6_append_data(sk, fl6, &queue, &cork->base, &v6_cork,
1797 				&current->task_frag, getfrag, from,
1798 				length + exthdrlen, transhdrlen + exthdrlen,
1799 				flags, ipc6);
1800 	if (err) {
1801 		__ip6_flush_pending_frames(sk, &queue, cork, &v6_cork);
1802 		return ERR_PTR(err);
1803 	}
1804 
1805 	return __ip6_make_skb(sk, &queue, cork, &v6_cork);
1806 }
1807