xref: /openbmc/linux/net/ipv6/ip6_output.c (revision 036b9e7c)
1 /*
2  *	IPv6 output functions
3  *	Linux INET6 implementation
4  *
5  *	Authors:
6  *	Pedro Roque		<roque@di.fc.ul.pt>
7  *
8  *	Based on linux/net/ipv4/ip_output.c
9  *
10  *	This program is free software; you can redistribute it and/or
11  *      modify it under the terms of the GNU General Public License
12  *      as published by the Free Software Foundation; either version
13  *      2 of the License, or (at your option) any later version.
14  *
15  *	Changes:
16  *	A.N.Kuznetsov	:	arithmetics in fragmentation.
17  *				extension headers are implemented.
18  *				route changes now work.
19  *				ip6_forward does not confuse sniffers.
20  *				etc.
21  *
22  *      H. von Brand    :       Added missing #include <linux/string.h>
23  *	Imran Patel	:	frag id should be in NBO
24  *      Kazunori MIYAZAWA @USAGI
25  *			:       add ip6_append_data and related functions
26  *				for datagram xmit
27  */
28 
29 #include <linux/errno.h>
30 #include <linux/kernel.h>
31 #include <linux/string.h>
32 #include <linux/socket.h>
33 #include <linux/net.h>
34 #include <linux/netdevice.h>
35 #include <linux/if_arp.h>
36 #include <linux/in6.h>
37 #include <linux/tcp.h>
38 #include <linux/route.h>
39 #include <linux/module.h>
40 #include <linux/slab.h>
41 
42 #include <linux/bpf-cgroup.h>
43 #include <linux/netfilter.h>
44 #include <linux/netfilter_ipv6.h>
45 
46 #include <net/sock.h>
47 #include <net/snmp.h>
48 
49 #include <net/ipv6.h>
50 #include <net/ndisc.h>
51 #include <net/protocol.h>
52 #include <net/ip6_route.h>
53 #include <net/addrconf.h>
54 #include <net/rawv6.h>
55 #include <net/icmp.h>
56 #include <net/xfrm.h>
57 #include <net/checksum.h>
58 #include <linux/mroute6.h>
59 #include <net/l3mdev.h>
60 #include <net/lwtunnel.h>
61 
/* Final IPv6 transmit step: resolve the neighbour entry for the route's
 * nexthop and hand the skb to the neighbour output path.  Multicast
 * packets may additionally be looped back to local listeners and are
 * dropped if their scope forbids leaving the host.
 *
 * Returns 0 on success or deliberate drop, a negative errno on failure,
 * or the value propagated from a lightweight-tunnel xmit hook.
 */
static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct net_device *dev = dst->dev;
	struct neighbour *neigh;
	struct in6_addr *nexthop;
	int ret;

	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		/* Loop a copy back to local group members when the socket
		 * requested multicast loopback: either an mrouted-style
		 * socket owns the packet (and it was not already forwarded)
		 * or the egress device has joined the destination group.
		 */
		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
		    ((mroute6_is_socket(net, skb) &&
		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
		     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
					 &ipv6_hdr(skb)->saddr))) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

			/* Do not check for IFF_ALLMULTI; multicast routing
			   is not supported in any case.
			 */
			if (newskb)
				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
					net, sk, newskb, NULL, newskb->dev,
					dev_loopback_xmit);

			/* hop_limit 0: the sender only wanted local
			 * delivery, so the loopback clone above suffices
			 * and the original is discarded.
			 */
			if (ipv6_hdr(skb)->hop_limit == 0) {
				IP6_INC_STATS(net, idev,
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return 0;
			}
		}

		IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);

		/* Node-local scope multicast must never leave the host. */
		if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
		    IPV6_ADDR_SCOPE_NODELOCAL &&
		    !(dev->flags & IFF_LOOPBACK)) {
			kfree_skb(skb);
			return 0;
		}
	}

	/* A lightweight tunnel may take over transmission entirely. */
	if (lwtunnel_xmit_redirect(dst->lwtstate)) {
		int res = lwtunnel_xmit(skb);

		if (res < 0 || res == LWTUNNEL_XMIT_DONE)
			return res;
	}

	/* Neighbour lookup (and possible creation) under RCU-bh. */
	rcu_read_lock_bh();
	nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
	neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
	if (unlikely(!neigh))
		neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
	if (!IS_ERR(neigh)) {
		sock_confirm_neigh(skb, neigh);
		ret = neigh_output(neigh, skb);
		rcu_read_unlock_bh();
		return ret;
	}
	rcu_read_unlock_bh();

	IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
	kfree_skb(skb);
	return -EINVAL;
}
130 
/* Post-routing finish hook: run cgroup eBPF egress filtering, re-route
 * packets that gained an xfrm policy after SNAT, then either fragment or
 * transmit via ip6_finish_output2().
 */
static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	int ret;

	/* Give cgroup eBPF egress programs a chance to veto the packet. */
	ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
	if (ret) {
		kfree_skb(skb);
		return ret;
	}

#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
	/* Policy lookup after SNAT yielded a new policy */
	/* NOTE(review): IPCB/IPSKB_REROUTED are the IPv4 control-block
	 * macros; IPv6 has IP6CB/IP6SKB_REROUTED.  This matches upstream
	 * at this revision, but verify against current mainline before
	 * assuming it is intentional.
	 */
	if (skb_dst(skb)->xfrm) {
		IPCB(skb)->flags |= IPSKB_REROUTED;
		return dst_output(net, sk, skb);
	}
#endif

	/* Fragment when: the packet exceeds the path MTU and is not GSO
	 * (GSO segments are sized later), the route demands fragmentation
	 * on every packet, or conntrack defrag recorded a smaller maximum
	 * fragment size than the current length.
	 */
	if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
	    dst_allfrag(skb_dst(skb)) ||
	    (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
		return ip6_fragment(net, sk, skb, ip6_finish_output2);
	else
		return ip6_finish_output2(net, sk, skb);
}
156 
/* IPv6 output entry point for locally generated and forwarded packets:
 * stamp protocol/device, drop if IPv6 is administratively disabled on
 * the egress interface, then traverse NF_INET_POST_ROUTING (skipped for
 * packets already re-routed through it) into ip6_finish_output().
 */
int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct net_device *dev = skb_dst(skb)->dev;
	struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	/* disable_ipv6 sysctl: silently discard, counting the drop. */
	if (unlikely(idev->cnf.disable_ipv6)) {
		IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
		return 0;
	}

	return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
			    net, sk, skb, NULL, dev,
			    ip6_finish_output,
			    !(IP6CB(skb)->flags & IP6SKB_REROUTED));
}
176 
177 bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np)
178 {
179 	if (!np->autoflowlabel_set)
180 		return ip6_default_np_autolabel(net);
181 	else
182 		return np->autoflowlabel;
183 }
184 
/*
 * xmit an sk_buff (used by TCP, SCTP and DCCP)
 * Note : socket lock is not held for SYNACK packets, but might be modified
 * by calls to skb_set_owner_w() and ipv6_local_error(),
 * which are using proper atomic operations or spinlocks.
 *
 * Builds the IPv6 header (plus any extension headers from @opt) in front
 * of the payload already in @skb and sends it through NF_INET_LOCAL_OUT.
 * Returns 0 on success, -ENOBUFS if headroom reallocation fails, or
 * -EMSGSIZE if the packet exceeds the path MTU and cannot be sent.
 */
int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
	     __u32 mark, struct ipv6_txoptions *opt, int tclass)
{
	struct net *net = sock_net(sk);
	const struct ipv6_pinfo *np = inet6_sk(sk);
	struct in6_addr *first_hop = &fl6->daddr;
	struct dst_entry *dst = skb_dst(skb);
	unsigned int head_room;
	struct ipv6hdr *hdr;
	u8  proto = fl6->flowi6_proto;
	int seg_len = skb->len;
	int hlimit = -1;
	u32 mtu;

	/* Headroom needed in front of the payload: IPv6 header,
	 * link-layer header, and any extension headers from @opt.
	 */
	head_room = sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
	if (opt)
		head_room += opt->opt_nflen + opt->opt_flen;

	if (unlikely(skb_headroom(skb) < head_room)) {
		struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
		if (!skb2) {
			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_OUTDISCARDS);
			kfree_skb(skb);
			return -ENOBUFS;
		}
		/* Keep socket memory accounting on the reallocated copy. */
		if (skb->sk)
			skb_set_owner_w(skb2, skb->sk);
		consume_skb(skb);
		skb = skb2;
	}

	if (opt) {
		seg_len += opt->opt_nflen + opt->opt_flen;

		if (opt->opt_flen)
			ipv6_push_frag_opts(skb, opt, &proto);

		/* May rewrite first_hop when a routing header is present. */
		if (opt->opt_nflen)
			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
					     &fl6->saddr);
	}

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	/*
	 *	Fill in the IPv6 header
	 */
	if (np)
		hlimit = np->hop_limit;
	/* hop_limit < 0 means "not set": fall back to the route's value. */
	if (hlimit < 0)
		hlimit = ip6_dst_hoplimit(dst);

	ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
				ip6_autoflowlabel(net, np), fl6));

	hdr->payload_len = htons(seg_len);
	hdr->nexthdr = proto;
	hdr->hop_limit = hlimit;

	hdr->saddr = fl6->saddr;
	hdr->daddr = *first_hop;

	skb->protocol = htons(ETH_P_IPV6);
	skb->priority = sk->sk_priority;
	skb->mark = mark;

	mtu = dst_mtu(dst);
	if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
		IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_OUT, skb->len);

		/* if egress device is enslaved to an L3 master device pass the
		 * skb to its handler for processing
		 */
		skb = l3mdev_ip6_out((struct sock *)sk, skb);
		if (unlikely(!skb))
			return 0;

		/* hooks should never assume socket lock is held.
		 * we promote our socket to non const
		 */
		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
			       net, (struct sock *)sk, skb, NULL, dst->dev,
			       dst_output);
	}

	/* Packet exceeds the MTU and may not be sent: report EMSGSIZE to
	 * the socket's error queue and drop.
	 */
	skb->dev = dst->dev;
	/* ipv6_local_error() does not require socket lock,
	 * we promote our socket to non const
	 */
	ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);

	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return -EMSGSIZE;
}
EXPORT_SYMBOL(ip6_xmit);
291 
/* Deliver a Router Alert packet to every registered RA socket whose
 * selector matches @sel (and whose binding allows the ingress device).
 * All but the last matching socket receive a clone; the last consumes
 * the original skb.
 *
 * Returns 1 if the skb was delivered (and thus consumed), 0 otherwise.
 */
static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
	struct ip6_ra_chain *ra;
	struct sock *last = NULL;

	read_lock(&ip6_ra_lock);
	for (ra = ip6_ra_chain; ra; ra = ra->next) {
		struct sock *sk = ra->sk;
		if (sk && ra->sel == sel &&
		    (!sk->sk_bound_dev_if ||
		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
			/* Previous match gets a clone; defer the original
			 * so the final match can take it without copying.
			 */
			if (last) {
				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
				if (skb2)
					rawv6_rcv(last, skb2);
			}
			last = sk;
		}
	}

	if (last) {
		rawv6_rcv(last, skb);
		read_unlock(&ip6_ra_lock);
		return 1;
	}
	read_unlock(&ip6_ra_lock);
	return 0;
}
320 
/* Decide how to treat a packet whose destination is a proxied (NDP
 * proxy) address.
 *
 * Returns:
 *   1  - packet is a unicast neighbour-discovery message and should be
 *        delivered to local input instead of forwarded;
 *   0  - packet may be forwarded normally;
 *  -1  - destination is link-local, which a proxying router must not
 *        forward: link failure has been signalled, caller should drop.
 */
static int ip6_forward_proxy_check(struct sk_buff *skb)
{
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	u8 nexthdr = hdr->nexthdr;
	__be16 frag_off;
	int offset;

	/* Skip any extension headers to find the upper-layer protocol. */
	if (ipv6_ext_hdr(nexthdr)) {
		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
		if (offset < 0)
			return 0;
	} else
		offset = sizeof(struct ipv6hdr);

	if (nexthdr == IPPROTO_ICMPV6) {
		struct icmp6hdr *icmp6;

		/* Need at least the ICMPv6 type byte in linear data. */
		if (!pskb_may_pull(skb, (skb_network_header(skb) +
					 offset + 1 - skb->data)))
			return 0;

		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);

		switch (icmp6->icmp6_type) {
		case NDISC_ROUTER_SOLICITATION:
		case NDISC_ROUTER_ADVERTISEMENT:
		case NDISC_NEIGHBOUR_SOLICITATION:
		case NDISC_NEIGHBOUR_ADVERTISEMENT:
		case NDISC_REDIRECT:
			/* For reaction involving unicast neighbor discovery
			 * message destined to the proxied address, pass it to
			 * input function.
			 */
			return 1;
		default:
			break;
		}
	}

	/*
	 * The proxying router can't forward traffic sent to a link-local
	 * address, so signal the sender and discard the packet. This
	 * behavior is clarified by the MIPv6 specification.
	 */
	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
		dst_link_failure(skb);
		return -1;
	}

	return 0;
}
372 
/* Forwarding finish hook: account the forwarded datagram, consume
 * packets already forwarded in hardware (switchdev L3 offload), and
 * push everything else to the output path.
 */
static inline int ip6_forward_finish(struct net *net, struct sock *sk,
				     struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);

	__IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
	__IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);

#ifdef CONFIG_NET_SWITCHDEV
	/* Hardware already forwarded this packet; nothing left to do. */
	if (skb->offload_l3_fwd_mark) {
		consume_skb(skb);
		return 0;
	}
#endif

	return dst_output(net, sk, skb);
}
390 
391 static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
392 {
393 	if (skb->len <= mtu)
394 		return false;
395 
396 	/* ipv6 conntrack defrag sets max_frag_size + ignore_df */
397 	if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
398 		return true;
399 
400 	if (skb->ignore_df)
401 		return false;
402 
403 	if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
404 		return false;
405 
406 	return true;
407 }
408 
/* Forward an IPv6 packet received on one interface out through the
 * route already attached to @skb.  Performs all RFC-mandated checks
 * (hop limit, scope, xfrm policy, MTU) and may emit ICMPv6 errors or
 * redirects on the way.  Consumes @skb in every outcome.
 *
 * Returns 0 on success/queued, a negative errno when the packet was
 * dropped with an error condition.
 */
int ip6_forward(struct sk_buff *skb)
{
	struct inet6_dev *idev = __in6_dev_get_safely(skb->dev);
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	struct inet6_skb_parm *opt = IP6CB(skb);
	struct net *net = dev_net(dst->dev);
	u32 mtu;

	/* Forwarding must be enabled netns-wide. */
	if (net->ipv6.devconf_all->forwarding == 0)
		goto error;

	/* Only packets addressed to us at L2 are forwarded. */
	if (skb->pkt_type != PACKET_HOST)
		goto drop;

	/* A packet owned by a local socket must not be forwarded. */
	if (unlikely(skb->sk))
		goto drop;

	if (skb_warn_if_lro(skb))
		goto drop;

	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
		goto drop;
	}

	skb_forward_csum(skb);

	/*
	 *	We DO NOT make any processing on
	 *	RA packets, pushing them to user level AS IS
	 *	without any WARRANTY that application will be able
	 *	to interpret them. The reason is that we
	 *	cannot make anything clever here.
	 *
	 *	We are not end-node, so that if packet contains
	 *	AH/ESP, we cannot make anything.
	 *	Defragmentation also would be mistake, RA packets
	 *	cannot be fragmented, because there is no warranty
	 *	that different fragments will go along one path. --ANK
	 */
	if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
		if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
			return 0;
	}

	/*
	 *	check and decrement ttl
	 */
	if (hdr->hop_limit <= 1) {
		/* Force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);

		kfree_skb(skb);
		return -ETIMEDOUT;
	}

	/* XXX: idev->cnf.proxy_ndp? */
	if (net->ipv6.devconf_all->proxy_ndp &&
	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
		int proxied = ip6_forward_proxy_check(skb);
		if (proxied > 0)
			return ip6_input(skb);
		else if (proxied < 0) {
			__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
			goto drop;
		}
	}

	if (!xfrm6_route_forward(skb)) {
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
		goto drop;
	}
	/* xfrm may have replaced the route; reload it. */
	dst = skb_dst(skb);

	/* IPv6 specs say nothing about it, but it is clear that we cannot
	   send redirects to source routed frames.
	   We don't send redirects to frames decapsulated from IPsec.
	 */
	if (IP6CB(skb)->iif == dst->dev->ifindex &&
	    opt->srcrt == 0 && !skb_sec_path(skb)) {
		struct in6_addr *target = NULL;
		struct inet_peer *peer;
		struct rt6_info *rt;

		/*
		 *	incoming and outgoing devices are the same
		 *	send a redirect.
		 */

		rt = (struct rt6_info *) dst;
		if (rt->rt6i_flags & RTF_GATEWAY)
			target = &rt->rt6i_gateway;
		else
			target = &hdr->daddr;

		peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);

		/* Limit redirects both by destination (here)
		   and by source (inside ndisc_send_redirect)
		 */
		if (inet_peer_xrlim_allow(peer, 1*HZ))
			ndisc_send_redirect(skb, target);
		if (peer)
			inet_putpeer(peer);
	} else {
		int addrtype = ipv6_addr_type(&hdr->saddr);

		/* This check is security critical. */
		if (addrtype == IPV6_ADDR_ANY ||
		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
			goto error;
		if (addrtype & IPV6_ADDR_LINKLOCAL) {
			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
				    ICMPV6_NOT_NEIGHBOUR, 0);
			goto error;
		}
	}

	mtu = ip6_dst_mtu_forward(dst);
	if (mtu < IPV6_MIN_MTU)
		mtu = IPV6_MIN_MTU;

	if (ip6_pkt_too_big(skb, mtu)) {
		/* Again, force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INTOOBIGERRORS);
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	/* Ensure a private, writable header before mangling hop_limit. */
	if (skb_cow(skb, dst->dev->hard_header_len)) {
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_OUTDISCARDS);
		goto drop;
	}

	/* skb_cow() may have moved the header; reload the pointer. */
	hdr = ipv6_hdr(skb);

	/* Mangling hops number delayed to point after skb COW */

	hdr->hop_limit--;

	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
		       net, NULL, skb, skb->dev, dst->dev,
		       ip6_forward_finish);

error:
	__IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
drop:
	kfree_skb(skb);
	return -EINVAL;
}
567 
568 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
569 {
570 	to->pkt_type = from->pkt_type;
571 	to->priority = from->priority;
572 	to->protocol = from->protocol;
573 	skb_dst_drop(to);
574 	skb_dst_set(to, dst_clone(skb_dst(from)));
575 	to->dev = from->dev;
576 	to->mark = from->mark;
577 
578 	skb_copy_hash(to, from);
579 
580 #ifdef CONFIG_NET_SCHED
581 	to->tc_index = from->tc_index;
582 #endif
583 	nf_copy(to, from);
584 	skb_copy_secmark(to, from);
585 }
586 
/* Split @skb into fragments no larger than the path MTU and send each
 * through @output.  Two strategies:
 *
 *  - fast path: if the skb carries a well-shaped frag_list, the list
 *    members become the fragments in place (no data copying);
 *  - slow path: otherwise each fragment is a freshly allocated skb and
 *    the payload is copied out piecewise.
 *
 * Consumes @skb.  Returns 0 on success or a negative errno; on failure
 * an ICMPv6 Packet Too Big may already have been sent (fail_toobig).
 */
int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
		 int (*output)(struct net *, struct sock *, struct sk_buff *))
{
	struct sk_buff *frag;
	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
	/* Only honour per-socket frag_size when not re-entered through a
	 * tunnel device (dev_recursion_level() != 0).
	 */
	struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
				inet6_sk(skb->sk) : NULL;
	struct ipv6hdr *tmp_hdr;
	struct frag_hdr *fh;
	unsigned int mtu, hlen, left, len;
	int hroom, troom;
	__be32 frag_id;
	int ptr, offset = 0, err = 0;
	u8 *prevhdr, nexthdr = 0;

	/* Locate the per-fragment header part (hlen) and the byte that
	 * must be rewritten to NEXTHDR_FRAGMENT (prevhdr).
	 */
	err = ip6_find_1stfragopt(skb, &prevhdr);
	if (err < 0)
		goto fail;
	hlen = err;
	nexthdr = *prevhdr;

	mtu = ip6_skb_dst_mtu(skb);

	/* We must not fragment if the socket is set to force MTU discovery
	 * or if the skb it not generated by a local socket.
	 */
	if (unlikely(!skb->ignore_df && skb->len > mtu))
		goto fail_toobig;

	if (IP6CB(skb)->frag_max_size) {
		if (IP6CB(skb)->frag_max_size > mtu)
			goto fail_toobig;

		/* don't send fragments larger than what we received */
		mtu = IP6CB(skb)->frag_max_size;
		if (mtu < IPV6_MIN_MTU)
			mtu = IPV6_MIN_MTU;
	}

	/* Per-socket IPV6_MTU option may shrink the fragment size. */
	if (np && np->frag_size < mtu) {
		if (np->frag_size)
			mtu = np->frag_size;
	}
	/* Need room for at least the fragment header plus 8 payload bytes. */
	if (mtu < hlen + sizeof(struct frag_hdr) + 8)
		goto fail_toobig;
	/* From here on, mtu is the per-fragment payload budget. */
	mtu -= hlen + sizeof(struct frag_hdr);

	frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
				    &ipv6_hdr(skb)->saddr);

	/* Checksum must be finalized before the payload is split. */
	if (skb->ip_summed == CHECKSUM_PARTIAL &&
	    (err = skb_checksum_help(skb)))
		goto fail;

	hroom = LL_RESERVED_SPACE(rt->dst.dev);
	if (skb_has_frag_list(skb)) {
		unsigned int first_len = skb_pagelen(skb);
		struct sk_buff *frag2;

		/* Fast path requires: head fits the MTU, is 8-byte aligned,
		 * unshared, and has headroom for the fragment header.
		 */
		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    skb_cloned(skb) ||
		    skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
			goto slow_path;

		skb_walk_frags(skb, frag) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
				goto slow_path_clean;

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path_clean;

			BUG_ON(frag->sk);
			/* Transfer socket write-space accounting to the
			 * list members, which now leave independently.
			 */
			if (skb->sk) {
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
			}
			skb->truesize -= frag->truesize;
		}

		err = 0;
		offset = 0;
		/* BUILD HEADER */

		*prevhdr = NEXTHDR_FRAGMENT;
		tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
		if (!tmp_hdr) {
			err = -ENOMEM;
			goto fail;
		}
		/* Detach the frag list: its members become the fragments. */
		frag = skb_shinfo(skb)->frag_list;
		skb_frag_list_init(skb);

		/* Insert the fragment header between the per-fragment
		 * headers (hlen bytes) and the payload.
		 */
		__skb_pull(skb, hlen);
		fh = __skb_push(skb, sizeof(struct frag_hdr));
		__skb_push(skb, hlen);
		skb_reset_network_header(skb);
		memcpy(skb_network_header(skb), tmp_hdr, hlen);

		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		fh->frag_off = htons(IP6_MF);
		fh->identification = frag_id;

		first_len = skb_pagelen(skb);
		skb->data_len = first_len - skb_headlen(skb);
		skb->len = first_len;
		ipv6_hdr(skb)->payload_len = htons(first_len -
						   sizeof(struct ipv6hdr));

		for (;;) {
			/* Prepare header of the next frame,
			 * before previous one went down. */
			if (frag) {
				frag->ip_summed = CHECKSUM_NONE;
				skb_reset_transport_header(frag);
				fh = __skb_push(frag, sizeof(struct frag_hdr));
				__skb_push(frag, hlen);
				skb_reset_network_header(frag);
				memcpy(skb_network_header(frag), tmp_hdr,
				       hlen);
				offset += skb->len - hlen - sizeof(struct frag_hdr);
				fh->nexthdr = nexthdr;
				fh->reserved = 0;
				fh->frag_off = htons(offset);
				if (frag->next)
					fh->frag_off |= htons(IP6_MF);
				fh->identification = frag_id;
				ipv6_hdr(frag)->payload_len =
						htons(frag->len -
						      sizeof(struct ipv6hdr));
				ip6_copy_metadata(frag, skb);
			}

			err = output(net, sk, skb);
			if (!err)
				IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
					      IPSTATS_MIB_FRAGCREATES);

			if (err || !frag)
				break;

			skb = frag;
			frag = skb->next;
			skb_mark_not_on_list(skb);
		}

		kfree(tmp_hdr);

		if (err == 0) {
			IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
				      IPSTATS_MIB_FRAGOKS);
			return 0;
		}

		/* Output failed mid-list: discard the unsent remainder. */
		kfree_skb_list(frag);

		IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
			      IPSTATS_MIB_FRAGFAILS);
		return err;

slow_path_clean:
		/* Undo the socket-ownership transfer done so far before
		 * falling back to the slow path.
		 */
		skb_walk_frags(skb, frag2) {
			if (frag2 == frag)
				break;
			frag2->sk = NULL;
			frag2->destructor = NULL;
			skb->truesize += frag2->truesize;
		}
	}

slow_path:
	left = skb->len - hlen;		/* Space per frame */
	ptr = hlen;			/* Where to start from */

	/*
	 *	Fragment the datagram.
	 */

	troom = rt->dst.dev->needed_tailroom;

	/*
	 *	Keep copying data until we run out.
	 */
	while (left > 0)	{
		u8 *fragnexthdr_offset;

		len = left;
		/* IF: it doesn't fit, use 'mtu' - the data space left */
		if (len > mtu)
			len = mtu;
		/* IF: we are not sending up to and including the packet end
		   then align the next start on an eight byte boundary */
		if (len < left)	{
			len &= ~7;
		}

		/* Allocate buffer */
		frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
				 hroom + troom, GFP_ATOMIC);
		if (!frag) {
			err = -ENOMEM;
			goto fail;
		}

		/*
		 *	Set up data on packet
		 */

		ip6_copy_metadata(frag, skb);
		skb_reserve(frag, hroom);
		skb_put(frag, len + hlen + sizeof(struct frag_hdr));
		skb_reset_network_header(frag);
		fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
		frag->transport_header = (frag->network_header + hlen +
					  sizeof(struct frag_hdr));

		/*
		 *	Charge the memory for the fragment to any owner
		 *	it might possess
		 */
		if (skb->sk)
			skb_set_owner_w(frag, skb->sk);

		/*
		 *	Copy the packet header into the new buffer.
		 */
		skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);

		/* Rewrite the previous next-header byte at the same relative
		 * position within the copied headers.
		 */
		fragnexthdr_offset = skb_network_header(frag);
		fragnexthdr_offset += prevhdr - skb_network_header(skb);
		*fragnexthdr_offset = NEXTHDR_FRAGMENT;

		/*
		 *	Build fragment header.
		 */
		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		fh->identification = frag_id;

		/*
		 *	Copy a block of the IP datagram.
		 */
		BUG_ON(skb_copy_bits(skb, ptr, skb_transport_header(frag),
				     len));
		left -= len;

		fh->frag_off = htons(offset);
		if (left > 0)
			fh->frag_off |= htons(IP6_MF);
		ipv6_hdr(frag)->payload_len = htons(frag->len -
						    sizeof(struct ipv6hdr));

		ptr += len;
		offset += len;

		/*
		 *	Put this fragment into the sending queue.
		 */
		err = output(net, sk, frag);
		if (err)
			goto fail;

		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_FRAGCREATES);
	}
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGOKS);
	consume_skb(skb);
	return err;

fail_toobig:
	if (skb->sk && dst_allfrag(skb_dst(skb)))
		sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);

	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
	err = -EMSGSIZE;

fail:
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return err;
}
875 
876 static inline int ip6_rt_check(const struct rt6key *rt_key,
877 			       const struct in6_addr *fl_addr,
878 			       const struct in6_addr *addr_cache)
879 {
880 	return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
881 		(!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
882 }
883 
/* Validate a socket-cached @dst against the flow @fl6.  Returns the dst
 * if it is still usable for this flow, or NULL (releasing the reference)
 * when the caller must perform a fresh route lookup.
 */
static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
					  struct dst_entry *dst,
					  const struct flowi6 *fl6)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct rt6_info *rt;

	if (!dst)
		goto out;

	/* A non-IPv6 dst (e.g. cached for a v4-mapped peer) is useless. */
	if (dst->ops->family != AF_INET6) {
		dst_release(dst);
		return NULL;
	}

	rt = (struct rt6_info *)dst;
	/* Yes, checking route validity in not connected
	 * case is not very simple. Take into account,
	 * that we do not support routing by source, TOS,
	 * and MSG_DONTROUTE		--ANK (980726)
	 *
	 * 1. ip6_rt_check(): If route was host route,
	 *    check that cached destination is current.
	 *    If it is network route, we still may
	 *    check its validity using saved pointer
	 *    to the last used address: daddr_cache.
	 *    We do not want to save whole address now,
	 *    (because main consumer of this service
	 *    is tcp, which has not this problem),
	 *    so that the last trick works only on connected
	 *    sockets.
	 * 2. oif also should be the same.
	 */
	if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
#ifdef CONFIG_IPV6_SUBTREES
	    ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
#endif
	   (!(fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) &&
	      (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex))) {
		dst_release(dst);
		dst = NULL;
	}

out:
	return dst;
}
930 
/* Core of the IPv6 route lookup: fill *@dst for @fl6, choosing a source
 * address when the flow has none, and (with optimistic DAD) substituting
 * the default-router dst when the looked-up nexthop neighbour is not yet
 * usable with an OPTIMISTIC source.  On failure *@dst is NULL and a
 * negative errno is returned; on success 0.
 */
static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
			       struct dst_entry **dst, struct flowi6 *fl6)
{
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	struct neighbour *n;
	struct rt6_info *rt;
#endif
	int err;
	int flags = 0;

	/* The correct way to handle this would be to do
	 * ip6_route_get_saddr, and then ip6_route_output; however,
	 * the route-specific preferred source forces the
	 * ip6_route_output call _before_ ip6_route_get_saddr.
	 *
	 * In source specific routing (no src=any default route),
	 * ip6_route_output will fail given src=any saddr, though, so
	 * that's why we try it again later.
	 */
	if (ipv6_addr_any(&fl6->saddr) && (!*dst || !(*dst)->error)) {
		struct fib6_info *from;
		struct rt6_info *rt;
		bool had_dst = *dst != NULL;

		if (!had_dst)
			*dst = ip6_route_output(net, sk, fl6);
		rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;

		/* Derive a source address from the route (or globally). */
		rcu_read_lock();
		from = rt ? rcu_dereference(rt->from) : NULL;
		err = ip6_route_get_saddr(net, from, &fl6->daddr,
					  sk ? inet6_sk(sk)->srcprefs : 0,
					  &fl6->saddr);
		rcu_read_unlock();

		if (err)
			goto out_err_release;

		/* If we had an erroneous initial result, pretend it
		 * never existed and let the SA-enabled version take
		 * over.
		 */
		if (!had_dst && (*dst)->error) {
			dst_release(*dst);
			*dst = NULL;
		}

		if (fl6->flowi6_oif)
			flags |= RT6_LOOKUP_F_IFACE;
	}

	/* Retry the lookup now that saddr is known. */
	if (!*dst)
		*dst = ip6_route_output_flags(net, sk, fl6, flags);

	err = (*dst)->error;
	if (err)
		goto out_err_release;

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	/*
	 * Here if the dst entry we've looked up
	 * has a neighbour entry that is in the INCOMPLETE
	 * state and the src address from the flow is
	 * marked as OPTIMISTIC, we release the found
	 * dst entry and replace it instead with the
	 * dst entry of the nexthop router
	 */
	rt = (struct rt6_info *) *dst;
	rcu_read_lock_bh();
	n = __ipv6_neigh_lookup_noref(rt->dst.dev,
				      rt6_nexthop(rt, &fl6->daddr));
	/* err != 0 here only flags "neighbour exists but is not valid". */
	err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
	rcu_read_unlock_bh();

	if (err) {
		struct inet6_ifaddr *ifp;
		struct flowi6 fl_gw6;
		int redirect;

		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
				      (*dst)->dev, 1);

		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
		if (ifp)
			in6_ifa_put(ifp);

		if (redirect) {
			/*
			 * We need to get the dst entry for the
			 * default router instead
			 */
			dst_release(*dst);
			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
			*dst = ip6_route_output(net, sk, &fl_gw6);
			err = (*dst)->error;
			if (err)
				goto out_err_release;
		}
	}
#endif
	/* A v4-mapped source with a non-v4-mapped destination cannot be
	 * routed coherently; reject it.
	 */
	if (ipv6_addr_v4mapped(&fl6->saddr) &&
	    !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
		err = -EAFNOSUPPORT;
		goto out_err_release;
	}

	return 0;

out_err_release:
	dst_release(*dst);
	*dst = NULL;

	if (err == -ENETUNREACH)
		IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
	return err;
}
1048 
/**
 *	ip6_dst_lookup - perform route lookup on flow
 *	@net: network namespace to perform the lookup in
 *	@sk: socket which provides route info
 *	@dst: pointer to dst_entry * for result
 *	@fl6: flow to lookup
 *
 *	This function performs a route lookup on the given flow.
 *
 *	It returns zero on success, or a standard errno code on error.
 */
int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
		   struct flowi6 *fl6)
{
	*dst = NULL;
	return ip6_dst_lookup_tail(net, sk, dst, fl6);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1066 
1067 /**
1068  *	ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1069  *	@sk: socket which provides route info
1070  *	@fl6: flow to lookup
1071  *	@final_dst: final destination address for ipsec lookup
1072  *
1073  *	This function performs a route lookup on the given flow.
1074  *
1075  *	It returns a valid dst pointer on success, or a pointer encoded
1076  *	error code.
1077  */
1078 struct dst_entry *ip6_dst_lookup_flow(const struct sock *sk, struct flowi6 *fl6,
1079 				      const struct in6_addr *final_dst)
1080 {
1081 	struct dst_entry *dst = NULL;
1082 	int err;
1083 
1084 	err = ip6_dst_lookup_tail(sock_net(sk), sk, &dst, fl6);
1085 	if (err)
1086 		return ERR_PTR(err);
1087 	if (final_dst)
1088 		fl6->daddr = *final_dst;
1089 
1090 	return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1091 }
1092 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
1093 
/**
 *	ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
 *	@sk: socket which provides the dst cache and route info
 *	@fl6: flow to lookup
 *	@final_dst: final destination address for ipsec lookup
 *	@connected: whether @sk is connected or not
 *
 *	This function performs a route lookup on the given flow with the
 *	possibility of using the cached route in the socket if it is valid.
 *	It will take the socket dst lock when operating on the dst cache.
 *	As a result, this function can only be used in process context.
 *
 *	In addition, for a connected socket, cache the dst in the socket
 *	if the current cache is not valid.
 *
 *	It returns a valid dst pointer on success, or a pointer encoded
 *	error code.
 */
struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
					 const struct in6_addr *final_dst,
					 bool connected)
{
	struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);

	/* Fast path: the socket-cached route is still valid for this flow. */
	dst = ip6_sk_dst_check(sk, dst, fl6);
	if (dst)
		return dst;

	dst = ip6_dst_lookup_flow(sk, fl6, final_dst);
	/* Cache the fresh route on connected sockets for future sends. */
	if (connected && !IS_ERR(dst))
		ip6_sk_dst_store_flow(sk, dst_clone(dst), fl6);

	return dst;
}
EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1129 
1130 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1131 					       gfp_t gfp)
1132 {
1133 	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1134 }
1135 
1136 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1137 						gfp_t gfp)
1138 {
1139 	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1140 }
1141 
1142 static void ip6_append_data_mtu(unsigned int *mtu,
1143 				int *maxfraglen,
1144 				unsigned int fragheaderlen,
1145 				struct sk_buff *skb,
1146 				struct rt6_info *rt,
1147 				unsigned int orig_mtu)
1148 {
1149 	if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1150 		if (!skb) {
1151 			/* first fragment, reserve header_len */
1152 			*mtu = orig_mtu - rt->dst.header_len;
1153 
1154 		} else {
1155 			/*
1156 			 * this fragment is not first, the headers
1157 			 * space is regarded as data space.
1158 			 */
1159 			*mtu = orig_mtu;
1160 		}
1161 		*maxfraglen = ((*mtu - fragheaderlen) & ~7)
1162 			      + fragheaderlen - sizeof(struct frag_hdr);
1163 	}
1164 }
1165 
/* Initialise cork state for a new corked transmission: deep-copy the
 * caller's tx options into @v6_cork (so they outlive this sendmsg
 * call), take a reference on the route, and compute the fragment size.
 *
 * Returns 0 or a negative errno.  On partial failure the options
 * already attached to @v6_cork (and the dst reference, if taken) are
 * left in place for the caller to free via ip6_cork_release().
 */
static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
			  struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
			  struct rt6_info *rt, struct flowi6 *fl6)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	unsigned int mtu;
	struct ipv6_txoptions *opt = ipc6->opt;

	/*
	 * setup for corking
	 */
	if (opt) {
		/* A previous cork must have been released first. */
		if (WARN_ON(v6_cork->opt))
			return -EINVAL;

		v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
		if (unlikely(!v6_cork->opt))
			return -ENOBUFS;

		v6_cork->opt->tot_len = sizeof(*opt);
		v6_cork->opt->opt_flen = opt->opt_flen;
		v6_cork->opt->opt_nflen = opt->opt_nflen;

		/* Duplicate each extension header individually so the
		 * caller's buffers need not stay valid after sendmsg.
		 */
		v6_cork->opt->dst0opt = ip6_opt_dup(opt->dst0opt,
						    sk->sk_allocation);
		if (opt->dst0opt && !v6_cork->opt->dst0opt)
			return -ENOBUFS;

		v6_cork->opt->dst1opt = ip6_opt_dup(opt->dst1opt,
						    sk->sk_allocation);
		if (opt->dst1opt && !v6_cork->opt->dst1opt)
			return -ENOBUFS;

		v6_cork->opt->hopopt = ip6_opt_dup(opt->hopopt,
						   sk->sk_allocation);
		if (opt->hopopt && !v6_cork->opt->hopopt)
			return -ENOBUFS;

		v6_cork->opt->srcrt = ip6_rthdr_dup(opt->srcrt,
						    sk->sk_allocation);
		if (opt->srcrt && !v6_cork->opt->srcrt)
			return -ENOBUFS;

		/* need source address above miyazawa*/
	}
	dst_hold(&rt->dst);
	cork->base.dst = &rt->dst;
	cork->fl.u.ip6 = *fl6;
	v6_cork->hop_limit = ipc6->hlimit;
	v6_cork->tclass = ipc6->tclass;
	/* Choose the MTU used for fragment sizing: in PMTU-probe mode use
	 * the raw device MTU, otherwise the (xfrm path) dst MTU.
	 */
	if (rt->dst.flags & DST_XFRM_TUNNEL)
		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
		      READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst);
	else
		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
			READ_ONCE(rt->dst.dev->mtu) : dst_mtu(xfrm_dst_path(&rt->dst));
	/* A smaller per-socket IPV6_MTU setting overrides the path MTU. */
	if (np->frag_size < mtu) {
		if (np->frag_size)
			mtu = np->frag_size;
	}
	if (mtu < IPV6_MIN_MTU)
		return -EINVAL;
	cork->base.fragsize = mtu;
	cork->base.gso_size = ipc6->gso_size;
	cork->base.tx_flags = 0;
	sock_tx_timestamp(sk, ipc6->sockc.tsflags, &cork->base.tx_flags);

	if (dst_allfrag(xfrm_dst_path(&rt->dst)))
		cork->base.flags |= IPCORK_ALLFRAG;
	cork->base.length = 0;

	cork->base.transmit_time = ipc6->sockc.transmit_time;

	return 0;
}
1241 
1242 static int __ip6_append_data(struct sock *sk,
1243 			     struct flowi6 *fl6,
1244 			     struct sk_buff_head *queue,
1245 			     struct inet_cork *cork,
1246 			     struct inet6_cork *v6_cork,
1247 			     struct page_frag *pfrag,
1248 			     int getfrag(void *from, char *to, int offset,
1249 					 int len, int odd, struct sk_buff *skb),
1250 			     void *from, int length, int transhdrlen,
1251 			     unsigned int flags, struct ipcm6_cookie *ipc6)
1252 {
1253 	struct sk_buff *skb, *skb_prev = NULL;
1254 	unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
1255 	struct ubuf_info *uarg = NULL;
1256 	int exthdrlen = 0;
1257 	int dst_exthdrlen = 0;
1258 	int hh_len;
1259 	int copy;
1260 	int err;
1261 	int offset = 0;
1262 	u32 tskey = 0;
1263 	struct rt6_info *rt = (struct rt6_info *)cork->dst;
1264 	struct ipv6_txoptions *opt = v6_cork->opt;
1265 	int csummode = CHECKSUM_NONE;
1266 	unsigned int maxnonfragsize, headersize;
1267 	unsigned int wmem_alloc_delta = 0;
1268 	bool paged, extra_uref;
1269 
1270 	skb = skb_peek_tail(queue);
1271 	if (!skb) {
1272 		exthdrlen = opt ? opt->opt_flen : 0;
1273 		dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
1274 	}
1275 
1276 	paged = !!cork->gso_size;
1277 	mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize;
1278 	orig_mtu = mtu;
1279 
1280 	if (cork->tx_flags & SKBTX_ANY_SW_TSTAMP &&
1281 	    sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
1282 		tskey = sk->sk_tskey++;
1283 
1284 	hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1285 
1286 	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1287 			(opt ? opt->opt_nflen : 0);
1288 	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
1289 		     sizeof(struct frag_hdr);
1290 
1291 	headersize = sizeof(struct ipv6hdr) +
1292 		     (opt ? opt->opt_flen + opt->opt_nflen : 0) +
1293 		     (dst_allfrag(&rt->dst) ?
1294 		      sizeof(struct frag_hdr) : 0) +
1295 		     rt->rt6i_nfheader_len;
1296 
1297 	/* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit
1298 	 * the first fragment
1299 	 */
1300 	if (headersize + transhdrlen > mtu)
1301 		goto emsgsize;
1302 
1303 	if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
1304 	    (sk->sk_protocol == IPPROTO_UDP ||
1305 	     sk->sk_protocol == IPPROTO_RAW)) {
1306 		ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
1307 				sizeof(struct ipv6hdr));
1308 		goto emsgsize;
1309 	}
1310 
1311 	if (ip6_sk_ignore_df(sk))
1312 		maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
1313 	else
1314 		maxnonfragsize = mtu;
1315 
1316 	if (cork->length + length > maxnonfragsize - headersize) {
1317 emsgsize:
1318 		pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0);
1319 		ipv6_local_error(sk, EMSGSIZE, fl6, pmtu);
1320 		return -EMSGSIZE;
1321 	}
1322 
1323 	/* CHECKSUM_PARTIAL only with no extension headers and when
1324 	 * we are not going to fragment
1325 	 */
1326 	if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
1327 	    headersize == sizeof(struct ipv6hdr) &&
1328 	    length <= mtu - headersize &&
1329 	    (!(flags & MSG_MORE) || cork->gso_size) &&
1330 	    rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
1331 		csummode = CHECKSUM_PARTIAL;
1332 
1333 	if (flags & MSG_ZEROCOPY && length && sock_flag(sk, SOCK_ZEROCOPY)) {
1334 		uarg = sock_zerocopy_realloc(sk, length, skb_zcopy(skb));
1335 		if (!uarg)
1336 			return -ENOBUFS;
1337 		extra_uref = true;
1338 		if (rt->dst.dev->features & NETIF_F_SG &&
1339 		    csummode == CHECKSUM_PARTIAL) {
1340 			paged = true;
1341 		} else {
1342 			uarg->zerocopy = 0;
1343 			skb_zcopy_set(skb, uarg, &extra_uref);
1344 		}
1345 	}
1346 
1347 	/*
1348 	 * Let's try using as much space as possible.
1349 	 * Use MTU if total length of the message fits into the MTU.
1350 	 * Otherwise, we need to reserve fragment header and
1351 	 * fragment alignment (= 8-15 octects, in total).
1352 	 *
1353 	 * Note that we may need to "move" the data from the tail of
1354 	 * of the buffer to the new fragment when we split
1355 	 * the message.
1356 	 *
1357 	 * FIXME: It may be fragmented into multiple chunks
1358 	 *        at once if non-fragmentable extension headers
1359 	 *        are too large.
1360 	 * --yoshfuji
1361 	 */
1362 
1363 	cork->length += length;
1364 	if (!skb)
1365 		goto alloc_new_skb;
1366 
1367 	while (length > 0) {
1368 		/* Check if the remaining data fits into current packet. */
1369 		copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1370 		if (copy < length)
1371 			copy = maxfraglen - skb->len;
1372 
1373 		if (copy <= 0) {
1374 			char *data;
1375 			unsigned int datalen;
1376 			unsigned int fraglen;
1377 			unsigned int fraggap;
1378 			unsigned int alloclen;
1379 			unsigned int pagedlen;
1380 alloc_new_skb:
1381 			/* There's no room in the current skb */
1382 			if (skb)
1383 				fraggap = skb->len - maxfraglen;
1384 			else
1385 				fraggap = 0;
1386 			/* update mtu and maxfraglen if necessary */
1387 			if (!skb || !skb_prev)
1388 				ip6_append_data_mtu(&mtu, &maxfraglen,
1389 						    fragheaderlen, skb, rt,
1390 						    orig_mtu);
1391 
1392 			skb_prev = skb;
1393 
1394 			/*
1395 			 * If remaining data exceeds the mtu,
1396 			 * we know we need more fragment(s).
1397 			 */
1398 			datalen = length + fraggap;
1399 
1400 			if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1401 				datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
1402 			fraglen = datalen + fragheaderlen;
1403 			pagedlen = 0;
1404 
1405 			if ((flags & MSG_MORE) &&
1406 			    !(rt->dst.dev->features&NETIF_F_SG))
1407 				alloclen = mtu;
1408 			else if (!paged)
1409 				alloclen = fraglen;
1410 			else {
1411 				alloclen = min_t(int, fraglen, MAX_HEADER);
1412 				pagedlen = fraglen - alloclen;
1413 			}
1414 
1415 			alloclen += dst_exthdrlen;
1416 
1417 			if (datalen != length + fraggap) {
1418 				/*
1419 				 * this is not the last fragment, the trailer
1420 				 * space is regarded as data space.
1421 				 */
1422 				datalen += rt->dst.trailer_len;
1423 			}
1424 
1425 			alloclen += rt->dst.trailer_len;
1426 			fraglen = datalen + fragheaderlen;
1427 
1428 			/*
1429 			 * We just reserve space for fragment header.
1430 			 * Note: this may be overallocation if the message
1431 			 * (without MSG_MORE) fits into the MTU.
1432 			 */
1433 			alloclen += sizeof(struct frag_hdr);
1434 
1435 			copy = datalen - transhdrlen - fraggap - pagedlen;
1436 			if (copy < 0) {
1437 				err = -EINVAL;
1438 				goto error;
1439 			}
1440 			if (transhdrlen) {
1441 				skb = sock_alloc_send_skb(sk,
1442 						alloclen + hh_len,
1443 						(flags & MSG_DONTWAIT), &err);
1444 			} else {
1445 				skb = NULL;
1446 				if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <=
1447 				    2 * sk->sk_sndbuf)
1448 					skb = alloc_skb(alloclen + hh_len,
1449 							sk->sk_allocation);
1450 				if (unlikely(!skb))
1451 					err = -ENOBUFS;
1452 			}
1453 			if (!skb)
1454 				goto error;
1455 			/*
1456 			 *	Fill in the control structures
1457 			 */
1458 			skb->protocol = htons(ETH_P_IPV6);
1459 			skb->ip_summed = csummode;
1460 			skb->csum = 0;
1461 			/* reserve for fragmentation and ipsec header */
1462 			skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1463 				    dst_exthdrlen);
1464 
1465 			/*
1466 			 *	Find where to start putting bytes
1467 			 */
1468 			data = skb_put(skb, fraglen - pagedlen);
1469 			skb_set_network_header(skb, exthdrlen);
1470 			data += fragheaderlen;
1471 			skb->transport_header = (skb->network_header +
1472 						 fragheaderlen);
1473 			if (fraggap) {
1474 				skb->csum = skb_copy_and_csum_bits(
1475 					skb_prev, maxfraglen,
1476 					data + transhdrlen, fraggap, 0);
1477 				skb_prev->csum = csum_sub(skb_prev->csum,
1478 							  skb->csum);
1479 				data += fraggap;
1480 				pskb_trim_unique(skb_prev, maxfraglen);
1481 			}
1482 			if (copy > 0 &&
1483 			    getfrag(from, data + transhdrlen, offset,
1484 				    copy, fraggap, skb) < 0) {
1485 				err = -EFAULT;
1486 				kfree_skb(skb);
1487 				goto error;
1488 			}
1489 
1490 			offset += copy;
1491 			length -= copy + transhdrlen;
1492 			transhdrlen = 0;
1493 			exthdrlen = 0;
1494 			dst_exthdrlen = 0;
1495 
1496 			/* Only the initial fragment is time stamped */
1497 			skb_shinfo(skb)->tx_flags = cork->tx_flags;
1498 			cork->tx_flags = 0;
1499 			skb_shinfo(skb)->tskey = tskey;
1500 			tskey = 0;
1501 			skb_zcopy_set(skb, uarg, &extra_uref);
1502 
1503 			if ((flags & MSG_CONFIRM) && !skb_prev)
1504 				skb_set_dst_pending_confirm(skb, 1);
1505 
1506 			/*
1507 			 * Put the packet on the pending queue
1508 			 */
1509 			if (!skb->destructor) {
1510 				skb->destructor = sock_wfree;
1511 				skb->sk = sk;
1512 				wmem_alloc_delta += skb->truesize;
1513 			}
1514 			__skb_queue_tail(queue, skb);
1515 			continue;
1516 		}
1517 
1518 		if (copy > length)
1519 			copy = length;
1520 
1521 		if (!(rt->dst.dev->features&NETIF_F_SG) &&
1522 		    skb_tailroom(skb) >= copy) {
1523 			unsigned int off;
1524 
1525 			off = skb->len;
1526 			if (getfrag(from, skb_put(skb, copy),
1527 						offset, copy, off, skb) < 0) {
1528 				__skb_trim(skb, off);
1529 				err = -EFAULT;
1530 				goto error;
1531 			}
1532 		} else if (!uarg || !uarg->zerocopy) {
1533 			int i = skb_shinfo(skb)->nr_frags;
1534 
1535 			err = -ENOMEM;
1536 			if (!sk_page_frag_refill(sk, pfrag))
1537 				goto error;
1538 
1539 			if (!skb_can_coalesce(skb, i, pfrag->page,
1540 					      pfrag->offset)) {
1541 				err = -EMSGSIZE;
1542 				if (i == MAX_SKB_FRAGS)
1543 					goto error;
1544 
1545 				__skb_fill_page_desc(skb, i, pfrag->page,
1546 						     pfrag->offset, 0);
1547 				skb_shinfo(skb)->nr_frags = ++i;
1548 				get_page(pfrag->page);
1549 			}
1550 			copy = min_t(int, copy, pfrag->size - pfrag->offset);
1551 			if (getfrag(from,
1552 				    page_address(pfrag->page) + pfrag->offset,
1553 				    offset, copy, skb->len, skb) < 0)
1554 				goto error_efault;
1555 
1556 			pfrag->offset += copy;
1557 			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1558 			skb->len += copy;
1559 			skb->data_len += copy;
1560 			skb->truesize += copy;
1561 			wmem_alloc_delta += copy;
1562 		} else {
1563 			err = skb_zerocopy_iter_dgram(skb, from, copy);
1564 			if (err < 0)
1565 				goto error;
1566 		}
1567 		offset += copy;
1568 		length -= copy;
1569 	}
1570 
1571 	if (wmem_alloc_delta)
1572 		refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1573 	return 0;
1574 
1575 error_efault:
1576 	err = -EFAULT;
1577 error:
1578 	if (uarg)
1579 		sock_zerocopy_put_abort(uarg, extra_uref);
1580 	cork->length -= length;
1581 	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1582 	refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1583 	return err;
1584 }
1585 
1586 int ip6_append_data(struct sock *sk,
1587 		    int getfrag(void *from, char *to, int offset, int len,
1588 				int odd, struct sk_buff *skb),
1589 		    void *from, int length, int transhdrlen,
1590 		    struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1591 		    struct rt6_info *rt, unsigned int flags)
1592 {
1593 	struct inet_sock *inet = inet_sk(sk);
1594 	struct ipv6_pinfo *np = inet6_sk(sk);
1595 	int exthdrlen;
1596 	int err;
1597 
1598 	if (flags&MSG_PROBE)
1599 		return 0;
1600 	if (skb_queue_empty(&sk->sk_write_queue)) {
1601 		/*
1602 		 * setup for corking
1603 		 */
1604 		err = ip6_setup_cork(sk, &inet->cork, &np->cork,
1605 				     ipc6, rt, fl6);
1606 		if (err)
1607 			return err;
1608 
1609 		exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1610 		length += exthdrlen;
1611 		transhdrlen += exthdrlen;
1612 	} else {
1613 		fl6 = &inet->cork.fl.u.ip6;
1614 		transhdrlen = 0;
1615 	}
1616 
1617 	return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base,
1618 				 &np->cork, sk_page_frag(sk), getfrag,
1619 				 from, length, transhdrlen, flags, ipc6);
1620 }
1621 EXPORT_SYMBOL_GPL(ip6_append_data);
1622 
1623 static void ip6_cork_release(struct inet_cork_full *cork,
1624 			     struct inet6_cork *v6_cork)
1625 {
1626 	if (v6_cork->opt) {
1627 		kfree(v6_cork->opt->dst0opt);
1628 		kfree(v6_cork->opt->dst1opt);
1629 		kfree(v6_cork->opt->hopopt);
1630 		kfree(v6_cork->opt->srcrt);
1631 		kfree(v6_cork->opt);
1632 		v6_cork->opt = NULL;
1633 	}
1634 
1635 	if (cork->base.dst) {
1636 		dst_release(cork->base.dst);
1637 		cork->base.dst = NULL;
1638 		cork->base.flags &= ~IPCORK_ALLFRAG;
1639 	}
1640 	memset(&cork->fl, 0, sizeof(cork->fl));
1641 }
1642 
/* Collapse the pending @queue into one skb: the first skb absorbs the
 * rest as its frag_list, corked extension headers and the IPv6 header
 * are pushed on the front, and the cork state is released.  Returns
 * the finished skb ready for ip6_send_skb(), or NULL if the queue was
 * empty.
 */
struct sk_buff *__ip6_make_skb(struct sock *sk,
			       struct sk_buff_head *queue,
			       struct inet_cork_full *cork,
			       struct inet6_cork *v6_cork)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct net *net = sock_net(sk);
	struct ipv6hdr *hdr;
	struct ipv6_txoptions *opt = v6_cork->opt;
	struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
	struct flowi6 *fl6 = &cork->fl.u.ip6;
	unsigned char proto = fl6->flowi6_proto;

	skb = __skb_dequeue(queue);
	if (!skb)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	/* Chain the remaining skbs onto the first one's frag_list,
	 * transferring length/truesize accounting and dropping their
	 * destructors so only the head skb owns the socket memory.
	 */
	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Allow local fragmentation. */
	skb->ignore_df = ip6_sk_ignore_df(sk);

	*final_dst = fl6->daddr;
	__skb_pull(skb, skb_network_header_len(skb));
	/* Push any corked extension headers; ipv6_push_nfrag_opts() takes
	 * &final_dst and so may rewrite the header's destination (e.g.
	 * for a routing header).
	 */
	if (opt && opt->opt_flen)
		ipv6_push_frag_opts(skb, opt, &proto);
	if (opt && opt->opt_nflen)
		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	ip6_flow_hdr(hdr, v6_cork->tclass,
		     ip6_make_flowlabel(net, skb, fl6->flowlabel,
					ip6_autoflowlabel(net, np), fl6));
	hdr->hop_limit = v6_cork->hop_limit;
	hdr->nexthdr = proto;
	hdr->saddr = fl6->saddr;
	hdr->daddr = *final_dst;

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	skb->tstamp = cork->base.transmit_time;

	skb_dst_set(skb, dst_clone(&rt->dst));
	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
	if (proto == IPPROTO_ICMPV6) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
		ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
	}

	ip6_cork_release(cork, v6_cork);
out:
	return skb;
}
1718 
1719 int ip6_send_skb(struct sk_buff *skb)
1720 {
1721 	struct net *net = sock_net(skb->sk);
1722 	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
1723 	int err;
1724 
1725 	err = ip6_local_out(net, skb->sk, skb);
1726 	if (err) {
1727 		if (err > 0)
1728 			err = net_xmit_errno(err);
1729 		if (err)
1730 			IP6_INC_STATS(net, rt->rt6i_idev,
1731 				      IPSTATS_MIB_OUTDISCARDS);
1732 	}
1733 
1734 	return err;
1735 }
1736 
/* Finalise and transmit whatever is queued on the socket's write
 * queue.  An empty queue is not an error.
 */
int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb = ip6_finish_skb(sk);

	return skb ? ip6_send_skb(skb) : 0;
}
EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
1748 
1749 static void __ip6_flush_pending_frames(struct sock *sk,
1750 				       struct sk_buff_head *queue,
1751 				       struct inet_cork_full *cork,
1752 				       struct inet6_cork *v6_cork)
1753 {
1754 	struct sk_buff *skb;
1755 
1756 	while ((skb = __skb_dequeue_tail(queue)) != NULL) {
1757 		if (skb_dst(skb))
1758 			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1759 				      IPSTATS_MIB_OUTDISCARDS);
1760 		kfree_skb(skb);
1761 	}
1762 
1763 	ip6_cork_release(cork, v6_cork);
1764 }
1765 
1766 void ip6_flush_pending_frames(struct sock *sk)
1767 {
1768 	__ip6_flush_pending_frames(sk, &sk->sk_write_queue,
1769 				   &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
1770 }
1771 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
1772 
1773 struct sk_buff *ip6_make_skb(struct sock *sk,
1774 			     int getfrag(void *from, char *to, int offset,
1775 					 int len, int odd, struct sk_buff *skb),
1776 			     void *from, int length, int transhdrlen,
1777 			     struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1778 			     struct rt6_info *rt, unsigned int flags,
1779 			     struct inet_cork_full *cork)
1780 {
1781 	struct inet6_cork v6_cork;
1782 	struct sk_buff_head queue;
1783 	int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1784 	int err;
1785 
1786 	if (flags & MSG_PROBE)
1787 		return NULL;
1788 
1789 	__skb_queue_head_init(&queue);
1790 
1791 	cork->base.flags = 0;
1792 	cork->base.addr = 0;
1793 	cork->base.opt = NULL;
1794 	cork->base.dst = NULL;
1795 	v6_cork.opt = NULL;
1796 	err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt, fl6);
1797 	if (err) {
1798 		ip6_cork_release(cork, &v6_cork);
1799 		return ERR_PTR(err);
1800 	}
1801 	if (ipc6->dontfrag < 0)
1802 		ipc6->dontfrag = inet6_sk(sk)->dontfrag;
1803 
1804 	err = __ip6_append_data(sk, fl6, &queue, &cork->base, &v6_cork,
1805 				&current->task_frag, getfrag, from,
1806 				length + exthdrlen, transhdrlen + exthdrlen,
1807 				flags, ipc6);
1808 	if (err) {
1809 		__ip6_flush_pending_frames(sk, &queue, cork, &v6_cork);
1810 		return ERR_PTR(err);
1811 	}
1812 
1813 	return __ip6_make_skb(sk, &queue, cork, &v6_cork);
1814 }
1815