xref: /openbmc/linux/net/ipv6/ip6_output.c (revision 4ed91d48259d9ddd378424d008f2e6559f7e78f8)
1 /*
2  *	IPv6 output functions
3  *	Linux INET6 implementation
4  *
5  *	Authors:
6  *	Pedro Roque		<roque@di.fc.ul.pt>
7  *
8  *	Based on linux/net/ipv4/ip_output.c
9  *
10  *	This program is free software; you can redistribute it and/or
11  *      modify it under the terms of the GNU General Public License
12  *      as published by the Free Software Foundation; either version
13  *      2 of the License, or (at your option) any later version.
14  *
15  *	Changes:
16  *	A.N.Kuznetsov	:	arithmetic in fragmentation.
17  *				extension headers are implemented.
18  *				route changes now work.
19  *				ip6_forward does not confuse sniffers.
20  *				etc.
21  *
22  *      H. von Brand    :       Added missing #include <linux/string.h>
23  *	Imran Patel	:	frag id should be in NBO
24  *      Kazunori MIYAZAWA @USAGI
25  *			:       add ip6_append_data and related functions
26  *				for datagram xmit
27  */
28 
29 #include <linux/errno.h>
30 #include <linux/kernel.h>
31 #include <linux/string.h>
32 #include <linux/socket.h>
33 #include <linux/net.h>
34 #include <linux/netdevice.h>
35 #include <linux/if_arp.h>
36 #include <linux/in6.h>
37 #include <linux/tcp.h>
38 #include <linux/route.h>
39 #include <linux/module.h>
40 #include <linux/slab.h>
41 
42 #include <linux/bpf-cgroup.h>
43 #include <linux/netfilter.h>
44 #include <linux/netfilter_ipv6.h>
45 
46 #include <net/sock.h>
47 #include <net/snmp.h>
48 
49 #include <net/ipv6.h>
50 #include <net/ndisc.h>
51 #include <net/protocol.h>
52 #include <net/ip6_route.h>
53 #include <net/addrconf.h>
54 #include <net/rawv6.h>
55 #include <net/icmp.h>
56 #include <net/xfrm.h>
57 #include <net/checksum.h>
58 #include <linux/mroute6.h>
59 #include <net/l3mdev.h>
60 #include <net/lwtunnel.h>
61 
62 static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
63 {
64 	struct dst_entry *dst = skb_dst(skb);
65 	struct net_device *dev = dst->dev;
66 	struct neighbour *neigh;
67 	struct in6_addr *nexthop;
68 	int ret;
69 
70 	skb->protocol = htons(ETH_P_IPV6);
71 	skb->dev = dev;
72 
73 	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
74 		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
75 
76 		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
77 		    ((mroute6_socket(net, skb) &&
78 		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
79 		     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
80 					 &ipv6_hdr(skb)->saddr))) {
81 			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
82 
83 			/* Do not check for IFF_ALLMULTI; multicast routing
84 			   is not supported in any case.
85 			 */
86 			if (newskb)
87 				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
88 					net, sk, newskb, NULL, newskb->dev,
89 					dev_loopback_xmit);
90 
91 			if (ipv6_hdr(skb)->hop_limit == 0) {
92 				IP6_INC_STATS(net, idev,
93 					      IPSTATS_MIB_OUTDISCARDS);
94 				kfree_skb(skb);
95 				return 0;
96 			}
97 		}
98 
99 		IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);
100 
101 		if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
102 		    IPV6_ADDR_SCOPE_NODELOCAL &&
103 		    !(dev->flags & IFF_LOOPBACK)) {
104 			kfree_skb(skb);
105 			return 0;
106 		}
107 	}
108 
109 	if (lwtunnel_xmit_redirect(dst->lwtstate)) {
110 		int res = lwtunnel_xmit(skb);
111 
112 		if (res < 0 || res == LWTUNNEL_XMIT_DONE)
113 			return res;
114 	}
115 
116 	rcu_read_lock_bh();
117 	nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
118 	neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
119 	if (unlikely(!neigh))
120 		neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
121 	if (!IS_ERR(neigh)) {
122 		sock_confirm_neigh(skb, neigh);
123 		ret = neigh_output(neigh, skb);
124 		rcu_read_unlock_bh();
125 		return ret;
126 	}
127 	rcu_read_unlock_bh();
128 
129 	IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
130 	kfree_skb(skb);
131 	return -EINVAL;
132 }
133 
134 static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
135 {
136 	int ret;
137 
138 	ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
139 	if (ret) {
140 		kfree_skb(skb);
141 		return ret;
142 	}
143 
144 	if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
145 	    dst_allfrag(skb_dst(skb)) ||
146 	    (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
147 		return ip6_fragment(net, sk, skb, ip6_finish_output2);
148 	else
149 		return ip6_finish_output2(net, sk, skb);
150 }
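
/* Fragmentation decision above, by example (illustrative numbers, not
 * taken from a real trace): with a 1500-byte path MTU, a 3000-byte
 * non-GSO skb takes the ip6_fragment() path, while a 3000-byte GSO skb
 * goes straight to ip6_finish_output2(), since the device (or software
 * GSO) will segment it below the MTU. If conntrack defragmented the
 * packet, IP6CB(skb)->frag_max_size caps what we may emit, so we never
 * send fragments larger than the ones we received.
 */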
151 
152 int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
153 {
154 	struct net_device *dev = skb_dst(skb)->dev;
155 	struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
156 
157 	if (unlikely(idev->cnf.disable_ipv6)) {
158 		IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
159 		kfree_skb(skb);
160 		return 0;
161 	}
162 
163 	return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
164 			    net, sk, skb, NULL, dev,
165 			    ip6_finish_output,
166 			    !(IP6CB(skb)->flags & IP6SKB_REROUTED));
167 }
168 
169 /*
170  * xmit an sk_buff (used by TCP, SCTP and DCCP)
171  * Note: the socket lock is not held for SYNACK packets, but the socket
172  * might be modified by calls to skb_set_owner_w() and ipv6_local_error(),
173  * which use proper atomic operations or spinlocks.
174  */
175 int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
176 	     __u32 mark, struct ipv6_txoptions *opt, int tclass)
177 {
178 	struct net *net = sock_net(sk);
179 	const struct ipv6_pinfo *np = inet6_sk(sk);
180 	struct in6_addr *first_hop = &fl6->daddr;
181 	struct dst_entry *dst = skb_dst(skb);
182 	struct ipv6hdr *hdr;
183 	u8  proto = fl6->flowi6_proto;
184 	int seg_len = skb->len;
185 	int hlimit = -1;
186 	u32 mtu;
187 
188 	if (opt) {
189 		unsigned int head_room;
190 
191 		/* First: exthdrs may take lots of space (~8K for now);
192 		   MAX_HEADER is not enough.
193 		 */
194 		head_room = opt->opt_nflen + opt->opt_flen;
195 		seg_len += head_room;
196 		head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
197 
198 		if (skb_headroom(skb) < head_room) {
199 			struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
200 			if (!skb2) {
201 				IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
202 					      IPSTATS_MIB_OUTDISCARDS);
203 				kfree_skb(skb);
204 				return -ENOBUFS;
205 			}
206 			consume_skb(skb);
207 			skb = skb2;
208 			/* skb_set_owner_w() changes sk->sk_wmem_alloc atomically,
209 			 * so it is safe to call in our context (socket lock not held)
210 			 */
211 			skb_set_owner_w(skb, (struct sock *)sk);
212 		}
213 		if (opt->opt_flen)
214 			ipv6_push_frag_opts(skb, opt, &proto);
215 		if (opt->opt_nflen)
216 			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
217 					     &fl6->saddr);
218 	}
219 
220 	skb_push(skb, sizeof(struct ipv6hdr));
221 	skb_reset_network_header(skb);
222 	hdr = ipv6_hdr(skb);
223 
224 	/*
225 	 *	Fill in the IPv6 header
226 	 */
227 	if (np)
228 		hlimit = np->hop_limit;
229 	if (hlimit < 0)
230 		hlimit = ip6_dst_hoplimit(dst);
231 
232 	ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
233 						     np->autoflowlabel, fl6));
234 
235 	hdr->payload_len = htons(seg_len);
236 	hdr->nexthdr = proto;
237 	hdr->hop_limit = hlimit;
238 
239 	hdr->saddr = fl6->saddr;
240 	hdr->daddr = *first_hop;
241 
242 	skb->protocol = htons(ETH_P_IPV6);
243 	skb->priority = sk->sk_priority;
244 	skb->mark = mark;
245 
246 	mtu = dst_mtu(dst);
247 	if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
248 		IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
249 			      IPSTATS_MIB_OUT, skb->len);
250 
251 		/* if egress device is enslaved to an L3 master device pass the
252 		 * skb to its handler for processing
253 		 */
254 		skb = l3mdev_ip6_out((struct sock *)sk, skb);
255 		if (unlikely(!skb))
256 			return 0;
257 
258 		/* Hooks should never assume the socket lock is held;
259 		 * we promote our socket to non-const.
260 		 */
261 		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
262 			       net, (struct sock *)sk, skb, NULL, dst->dev,
263 			       dst_output);
264 	}
265 
266 	skb->dev = dst->dev;
267 	/* ipv6_local_error() does not require the socket lock,
268 	 * so we promote our socket to non-const.
269 	 */
270 	ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);
271 
272 	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
273 	kfree_skb(skb);
274 	return -EMSGSIZE;
275 }
276 EXPORT_SYMBOL(ip6_xmit);
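
/* Usage sketch for ip6_xmit() (a hypothetical caller, for illustration
 * only; real users such as TCP set up the flow, options and dst through
 * their own helpers):
 *
 *	struct flowi6 fl6 = { .flowi6_proto = IPPROTO_TCP };
 *	fl6.daddr = ...; fl6.saddr = ...;
 *	skb_dst_set(skb, dst);
 *	err = ip6_xmit(sk, skb, &fl6, sk->sk_mark, opt, np->tclass);
 *
 * The skb must already carry a dst: ip6_xmit() only prepends extension
 * headers and the IPv6 header, then hands the packet to NF_INET_LOCAL_OUT.
 */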
277 
278 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
279 {
280 	struct ip6_ra_chain *ra;
281 	struct sock *last = NULL;
282 
283 	read_lock(&ip6_ra_lock);
284 	for (ra = ip6_ra_chain; ra; ra = ra->next) {
285 		struct sock *sk = ra->sk;
286 		if (sk && ra->sel == sel &&
287 		    (!sk->sk_bound_dev_if ||
288 		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
289 			if (last) {
290 				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
291 				if (skb2)
292 					rawv6_rcv(last, skb2);
293 			}
294 			last = sk;
295 		}
296 	}
297 
298 	if (last) {
299 		rawv6_rcv(last, skb);
300 		read_unlock(&ip6_ra_lock);
301 		return 1;
302 	}
303 	read_unlock(&ip6_ra_lock);
304 	return 0;
305 }
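
/* Note the delivery pattern above: every matching router-alert socket
 * except the last gets a clone of the skb; the final match consumes the
 * original, saving one skb_clone() in the common single-listener case.
 */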
306 
307 static int ip6_forward_proxy_check(struct sk_buff *skb)
308 {
309 	struct ipv6hdr *hdr = ipv6_hdr(skb);
310 	u8 nexthdr = hdr->nexthdr;
311 	__be16 frag_off;
312 	int offset;
313 
314 	if (ipv6_ext_hdr(nexthdr)) {
315 		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
316 		if (offset < 0)
317 			return 0;
318 	} else
319 		offset = sizeof(struct ipv6hdr);
320 
321 	if (nexthdr == IPPROTO_ICMPV6) {
322 		struct icmp6hdr *icmp6;
323 
324 		if (!pskb_may_pull(skb, (skb_network_header(skb) +
325 					 offset + 1 - skb->data)))
326 			return 0;
327 
328 		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
329 
330 		switch (icmp6->icmp6_type) {
331 		case NDISC_ROUTER_SOLICITATION:
332 		case NDISC_ROUTER_ADVERTISEMENT:
333 		case NDISC_NEIGHBOUR_SOLICITATION:
334 		case NDISC_NEIGHBOUR_ADVERTISEMENT:
335 		case NDISC_REDIRECT:
336 			/* Pass unicast neighbour discovery messages that are
337 			 * destined to the proxied address up to the input
338 			 * function.
339 			 */
340 			return 1;
341 		default:
342 			break;
343 		}
344 	}
345 
346 	/*
347 	 * The proxying router can't forward traffic sent to a link-local
348 	 * address, so signal the sender and discard the packet. This
349 	 * behavior is clarified by the MIPv6 specification.
350 	 */
351 	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
352 		dst_link_failure(skb);
353 		return -1;
354 	}
355 
356 	return 0;
357 }
358 
359 static inline int ip6_forward_finish(struct net *net, struct sock *sk,
360 				     struct sk_buff *skb)
361 {
362 	return dst_output(net, sk, skb);
363 }
364 
365 static unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst)
366 {
367 	unsigned int mtu;
368 	struct inet6_dev *idev;
369 
370 	if (dst_metric_locked(dst, RTAX_MTU)) {
371 		mtu = dst_metric_raw(dst, RTAX_MTU);
372 		if (mtu)
373 			return mtu;
374 	}
375 
376 	mtu = IPV6_MIN_MTU;
377 	rcu_read_lock();
378 	idev = __in6_dev_get(dst->dev);
379 	if (idev)
380 		mtu = idev->cnf.mtu6;
381 	rcu_read_unlock();
382 
383 	return mtu;
384 }
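
/* Worked example for ip6_dst_mtu_forward() (illustrative values): if the
 * route has RTAX_MTU locked at 1400, that value wins. Otherwise we report
 * the interface's IPv6 MTU (idev->cnf.mtu6, e.g. 1500), falling back to
 * IPV6_MIN_MTU (1280) when no inet6_dev is attached.
 */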
385 
386 static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
387 {
388 	if (skb->len <= mtu)
389 		return false;
390 
391 	/* ipv6 conntrack defrag sets max_frag_size + ignore_df */
392 	if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
393 		return true;
394 
395 	if (skb->ignore_df)
396 		return false;
397 
398 	if (skb_is_gso(skb) && skb_gso_validate_mtu(skb, mtu))
399 		return false;
400 
401 	return true;
402 }
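
/* Example decisions for ip6_pkt_too_big(), assuming mtu = 1500 and
 * hypothetical packets: a 1400-byte skb fits (false). A 2000-byte skb
 * with frag_max_size = 1800 is too big (true), since conntrack
 * reassembled it from fragments larger than we may re-emit. A 2000-byte
 * skb with ignore_df set may be fragmented locally (false), and a
 * 2000-byte GSO skb whose segments validate against the MTU also passes
 * (false).
 */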
403 
404 int ip6_forward(struct sk_buff *skb)
405 {
406 	struct dst_entry *dst = skb_dst(skb);
407 	struct ipv6hdr *hdr = ipv6_hdr(skb);
408 	struct inet6_skb_parm *opt = IP6CB(skb);
409 	struct net *net = dev_net(dst->dev);
410 	u32 mtu;
411 
412 	if (net->ipv6.devconf_all->forwarding == 0)
413 		goto error;
414 
415 	if (skb->pkt_type != PACKET_HOST)
416 		goto drop;
417 
418 	if (unlikely(skb->sk))
419 		goto drop;
420 
421 	if (skb_warn_if_lro(skb))
422 		goto drop;
423 
424 	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
425 		__IP6_INC_STATS(net, ip6_dst_idev(dst),
426 				IPSTATS_MIB_INDISCARDS);
427 		goto drop;
428 	}
429 
430 	skb_forward_csum(skb);
431 
432 	/*
433 	 *	We do NOT do any processing on
434 	 *	RA packets; we push them to user level AS IS
435 	 *	without any warranty that the application will be
436 	 *	able to interpret them. The reason is that we
437 	 *	cannot do anything clever here.
438 	 *
439 	 *	We are not an end node, so if the packet contains
440 	 *	AH/ESP we cannot do anything with it.
441 	 *	Defragmentation would also be a mistake; RA packets
442 	 *	cannot be fragmented, because there is no warranty
443 	 *	that different fragments will go along one path. --ANK
444 	 */
445 	if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
446 		if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
447 			return 0;
448 	}
449 
450 	/*
451 	 *	check and decrement ttl
452 	 */
453 	if (hdr->hop_limit <= 1) {
454 		/* Force the OUTPUT device to be used for the source address */
455 		skb->dev = dst->dev;
456 		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
457 		__IP6_INC_STATS(net, ip6_dst_idev(dst),
458 				IPSTATS_MIB_INHDRERRORS);
459 
460 		kfree_skb(skb);
461 		return -ETIMEDOUT;
462 	}
463 
464 	/* XXX: idev->cnf.proxy_ndp? */
465 	if (net->ipv6.devconf_all->proxy_ndp &&
466 	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
467 		int proxied = ip6_forward_proxy_check(skb);
468 		if (proxied > 0)
469 			return ip6_input(skb);
470 		else if (proxied < 0) {
471 			__IP6_INC_STATS(net, ip6_dst_idev(dst),
472 					IPSTATS_MIB_INDISCARDS);
473 			goto drop;
474 		}
475 	}
476 
477 	if (!xfrm6_route_forward(skb)) {
478 		__IP6_INC_STATS(net, ip6_dst_idev(dst),
479 				IPSTATS_MIB_INDISCARDS);
480 		goto drop;
481 	}
482 	dst = skb_dst(skb);
483 
484 	/* IPv6 specs say nothing about it, but it is clear that we cannot
485 	   send redirects to source-routed frames.
486 	   We don't send redirects to frames decapsulated from IPsec.
487 	 */
488 	if (skb->dev == dst->dev && opt->srcrt == 0 && !skb_sec_path(skb)) {
489 		struct in6_addr *target = NULL;
490 		struct inet_peer *peer;
491 		struct rt6_info *rt;
492 
493 		/*
494 		 *	incoming and outgoing devices are the same;
495 		 *	send a redirect.
496 		 */
497 
498 		rt = (struct rt6_info *) dst;
499 		if (rt->rt6i_flags & RTF_GATEWAY)
500 			target = &rt->rt6i_gateway;
501 		else
502 			target = &hdr->daddr;
503 
504 		peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);
505 
506 		/* Limit redirects both by destination (here)
507 		   and by source (inside ndisc_send_redirect)
508 		 */
509 		if (inet_peer_xrlim_allow(peer, 1*HZ))
510 			ndisc_send_redirect(skb, target);
511 		if (peer)
512 			inet_putpeer(peer);
513 	} else {
514 		int addrtype = ipv6_addr_type(&hdr->saddr);
515 
516 		/* This check is security critical. */
517 		if (addrtype == IPV6_ADDR_ANY ||
518 		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
519 			goto error;
520 		if (addrtype & IPV6_ADDR_LINKLOCAL) {
521 			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
522 				    ICMPV6_NOT_NEIGHBOUR, 0);
523 			goto error;
524 		}
525 	}
526 
527 	mtu = ip6_dst_mtu_forward(dst);
528 	if (mtu < IPV6_MIN_MTU)
529 		mtu = IPV6_MIN_MTU;
530 
531 	if (ip6_pkt_too_big(skb, mtu)) {
532 		/* Again, force the OUTPUT device to be used for the source address */
533 		skb->dev = dst->dev;
534 		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
535 		__IP6_INC_STATS(net, ip6_dst_idev(dst),
536 				IPSTATS_MIB_INTOOBIGERRORS);
537 		__IP6_INC_STATS(net, ip6_dst_idev(dst),
538 				IPSTATS_MIB_FRAGFAILS);
539 		kfree_skb(skb);
540 		return -EMSGSIZE;
541 	}
542 
543 	if (skb_cow(skb, dst->dev->hard_header_len)) {
544 		__IP6_INC_STATS(net, ip6_dst_idev(dst),
545 				IPSTATS_MIB_OUTDISCARDS);
546 		goto drop;
547 	}
548 
549 	hdr = ipv6_hdr(skb);
550 
551 	/* Mangling the hop count is delayed until after the skb COW */
552 
553 	hdr->hop_limit--;
554 
555 	__IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
556 	__IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
557 	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
558 		       net, NULL, skb, skb->dev, dst->dev,
559 		       ip6_forward_finish);
560 
561 error:
562 	__IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
563 drop:
564 	kfree_skb(skb);
565 	return -EINVAL;
566 }
567 
568 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
569 {
570 	to->pkt_type = from->pkt_type;
571 	to->priority = from->priority;
572 	to->protocol = from->protocol;
573 	skb_dst_drop(to);
574 	skb_dst_set(to, dst_clone(skb_dst(from)));
575 	to->dev = from->dev;
576 	to->mark = from->mark;
577 
578 #ifdef CONFIG_NET_SCHED
579 	to->tc_index = from->tc_index;
580 #endif
581 	nf_copy(to, from);
582 	skb_copy_secmark(to, from);
583 }
584 
585 int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
586 		 int (*output)(struct net *, struct sock *, struct sk_buff *))
587 {
588 	struct sk_buff *frag;
589 	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
590 	struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
591 				inet6_sk(skb->sk) : NULL;
592 	struct ipv6hdr *tmp_hdr;
593 	struct frag_hdr *fh;
594 	unsigned int mtu, hlen, left, len;
595 	int hroom, troom;
596 	__be32 frag_id;
597 	int ptr, offset = 0, err = 0;
598 	u8 *prevhdr, nexthdr = 0;
599 
600 	hlen = ip6_find_1stfragopt(skb, &prevhdr);
601 	nexthdr = *prevhdr;
602 
603 	mtu = ip6_skb_dst_mtu(skb);
604 
605 	/* We must not fragment if the socket is set to force MTU discovery
606 	 * or if the skb was not generated by a local socket.
607 	 */
608 	if (unlikely(!skb->ignore_df && skb->len > mtu))
609 		goto fail_toobig;
610 
611 	if (IP6CB(skb)->frag_max_size) {
612 		if (IP6CB(skb)->frag_max_size > mtu)
613 			goto fail_toobig;
614 
615 		/* don't send fragments larger than what we received */
616 		mtu = IP6CB(skb)->frag_max_size;
617 		if (mtu < IPV6_MIN_MTU)
618 			mtu = IPV6_MIN_MTU;
619 	}
620 
621 	if (np && np->frag_size < mtu) {
622 		if (np->frag_size)
623 			mtu = np->frag_size;
624 	}
625 	if (mtu < hlen + sizeof(struct frag_hdr) + 8)
626 		goto fail_toobig;
627 	mtu -= hlen + sizeof(struct frag_hdr);
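	/* Worked example (assuming no extension headers): hlen = 40, so
	 * with a 1500-byte MTU each fragment may carry
	 * 1500 - 40 - 8 = 1452 bytes of payload; the guard above rejects
	 * MTUs that cannot fit the headers plus 8 bytes of data.
	 */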
628 
629 	frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
630 				    &ipv6_hdr(skb)->saddr);
631 
632 	if (skb->ip_summed == CHECKSUM_PARTIAL &&
633 	    (err = skb_checksum_help(skb)))
634 		goto fail;
635 
636 	hroom = LL_RESERVED_SPACE(rt->dst.dev);
637 	if (skb_has_frag_list(skb)) {
638 		unsigned int first_len = skb_pagelen(skb);
639 		struct sk_buff *frag2;
640 
641 		if (first_len - hlen > mtu ||
642 		    ((first_len - hlen) & 7) ||
643 		    skb_cloned(skb) ||
644 		    skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
645 			goto slow_path;
646 
647 		skb_walk_frags(skb, frag) {
648 			/* Correct geometry. */
649 			if (frag->len > mtu ||
650 			    ((frag->len & 7) && frag->next) ||
651 			    skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
652 				goto slow_path_clean;
653 
654 			/* Partially cloned skb? */
655 			if (skb_shared(frag))
656 				goto slow_path_clean;
657 
658 			BUG_ON(frag->sk);
659 			if (skb->sk) {
660 				frag->sk = skb->sk;
661 				frag->destructor = sock_wfree;
662 			}
663 			skb->truesize -= frag->truesize;
664 		}
665 
666 		err = 0;
667 		offset = 0;
668 		/* BUILD HEADER */
669 
670 		*prevhdr = NEXTHDR_FRAGMENT;
671 		tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
672 		if (!tmp_hdr) {
673 			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
674 				      IPSTATS_MIB_FRAGFAILS);
675 			err = -ENOMEM;
676 			goto fail;
677 		}
678 		frag = skb_shinfo(skb)->frag_list;
679 		skb_frag_list_init(skb);
680 
681 		__skb_pull(skb, hlen);
682 		fh = (struct frag_hdr *)__skb_push(skb, sizeof(struct frag_hdr));
683 		__skb_push(skb, hlen);
684 		skb_reset_network_header(skb);
685 		memcpy(skb_network_header(skb), tmp_hdr, hlen);
686 
687 		fh->nexthdr = nexthdr;
688 		fh->reserved = 0;
689 		fh->frag_off = htons(IP6_MF);
690 		fh->identification = frag_id;
691 
692 		first_len = skb_pagelen(skb);
693 		skb->data_len = first_len - skb_headlen(skb);
694 		skb->len = first_len;
695 		ipv6_hdr(skb)->payload_len = htons(first_len -
696 						   sizeof(struct ipv6hdr));
697 
698 		dst_hold(&rt->dst);
699 
700 		for (;;) {
701 			/* Prepare the header of the next frame
702 			 * before the previous one goes down. */
703 			if (frag) {
704 				frag->ip_summed = CHECKSUM_NONE;
705 				skb_reset_transport_header(frag);
706 				fh = (struct frag_hdr *)__skb_push(frag, sizeof(struct frag_hdr));
707 				__skb_push(frag, hlen);
708 				skb_reset_network_header(frag);
709 				memcpy(skb_network_header(frag), tmp_hdr,
710 				       hlen);
711 				offset += skb->len - hlen - sizeof(struct frag_hdr);
712 				fh->nexthdr = nexthdr;
713 				fh->reserved = 0;
714 				fh->frag_off = htons(offset);
715 				if (frag->next)
716 					fh->frag_off |= htons(IP6_MF);
717 				fh->identification = frag_id;
718 				ipv6_hdr(frag)->payload_len =
719 						htons(frag->len -
720 						      sizeof(struct ipv6hdr));
721 				ip6_copy_metadata(frag, skb);
722 			}
723 
724 			err = output(net, sk, skb);
725 			if (!err)
726 				IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
727 					      IPSTATS_MIB_FRAGCREATES);
728 
729 			if (err || !frag)
730 				break;
731 
732 			skb = frag;
733 			frag = skb->next;
734 			skb->next = NULL;
735 		}
736 
737 		kfree(tmp_hdr);
738 
739 		if (err == 0) {
740 			IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
741 				      IPSTATS_MIB_FRAGOKS);
742 			ip6_rt_put(rt);
743 			return 0;
744 		}
745 
746 		kfree_skb_list(frag);
747 
748 		IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
749 			      IPSTATS_MIB_FRAGFAILS);
750 		ip6_rt_put(rt);
751 		return err;
752 
753 slow_path_clean:
754 		skb_walk_frags(skb, frag2) {
755 			if (frag2 == frag)
756 				break;
757 			frag2->sk = NULL;
758 			frag2->destructor = NULL;
759 			skb->truesize += frag2->truesize;
760 		}
761 	}
762 
763 slow_path:
764 	left = skb->len - hlen;		/* Space per frame */
765 	ptr = hlen;			/* Where to start from */
766 
767 	/*
768 	 *	Fragment the datagram.
769 	 */
770 
771 	*prevhdr = NEXTHDR_FRAGMENT;
772 	troom = rt->dst.dev->needed_tailroom;
773 
774 	/*
775 	 *	Keep copying data until we run out.
776 	 */
777 	while (left > 0)	{
778 		len = left;
779 		/* IF: it doesn't fit, use 'mtu' - the data space left */
780 		if (len > mtu)
781 			len = mtu;
782 		/* IF: we are not sending up to and including the packet end,
783 		   then align the next start on an eight-byte boundary */
784 		if (len < left)	{
785 			len &= ~7;
786 		}
787 
788 		/* Allocate buffer */
789 		frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
790 				 hroom + troom, GFP_ATOMIC);
791 		if (!frag) {
792 			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
793 				      IPSTATS_MIB_FRAGFAILS);
794 			err = -ENOMEM;
795 			goto fail;
796 		}
797 
798 		/*
799 		 *	Set up data on packet
800 		 */
801 
802 		ip6_copy_metadata(frag, skb);
803 		skb_reserve(frag, hroom);
804 		skb_put(frag, len + hlen + sizeof(struct frag_hdr));
805 		skb_reset_network_header(frag);
806 		fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
807 		frag->transport_header = (frag->network_header + hlen +
808 					  sizeof(struct frag_hdr));
809 
810 		/*
811 		 *	Charge the memory for the fragment to any owner
812 		 *	it might possess
813 		 */
814 		if (skb->sk)
815 			skb_set_owner_w(frag, skb->sk);
816 
817 		/*
818 		 *	Copy the packet header into the new buffer.
819 		 */
820 		skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
821 
822 		/*
823 		 *	Build fragment header.
824 		 */
825 		fh->nexthdr = nexthdr;
826 		fh->reserved = 0;
827 		fh->identification = frag_id;
828 
829 		/*
830 		 *	Copy a block of the IP datagram.
831 		 */
832 		BUG_ON(skb_copy_bits(skb, ptr, skb_transport_header(frag),
833 				     len));
834 		left -= len;
835 
836 		fh->frag_off = htons(offset);
837 		if (left > 0)
838 			fh->frag_off |= htons(IP6_MF);
839 		ipv6_hdr(frag)->payload_len = htons(frag->len -
840 						    sizeof(struct ipv6hdr));
841 
842 		ptr += len;
843 		offset += len;
844 
845 		/*
846 		 *	Put this fragment into the sending queue.
847 		 */
848 		err = output(net, sk, frag);
849 		if (err)
850 			goto fail;
851 
852 		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
853 			      IPSTATS_MIB_FRAGCREATES);
854 	}
855 	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
856 		      IPSTATS_MIB_FRAGOKS);
857 	consume_skb(skb);
858 	return err;
859 
860 fail_toobig:
861 	if (skb->sk && dst_allfrag(skb_dst(skb)))
862 		sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);
863 
864 	skb->dev = skb_dst(skb)->dev;
865 	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
866 	err = -EMSGSIZE;
867 
868 fail:
869 	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
870 		      IPSTATS_MIB_FRAGFAILS);
871 	kfree_skb(skb);
872 	return err;
873 }
874 
875 static inline int ip6_rt_check(const struct rt6key *rt_key,
876 			       const struct in6_addr *fl_addr,
877 			       const struct in6_addr *addr_cache)
878 {
879 	return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
880 		(!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
881 }
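
/* ip6_rt_check() returns true when the cached route can no longer be
 * trusted for this flow. Example (hypothetical addresses): a /128 host
 * route to 2001:db8::1 is fine for fl_addr 2001:db8::1 (false), but for
 * fl_addr 2001:db8::2 it is stale (true) unless addr_cache still matches
 * the new destination.
 */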
882 
883 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
884 					  struct dst_entry *dst,
885 					  const struct flowi6 *fl6)
886 {
887 	struct ipv6_pinfo *np = inet6_sk(sk);
888 	struct rt6_info *rt;
889 
890 	if (!dst)
891 		goto out;
892 
893 	if (dst->ops->family != AF_INET6) {
894 		dst_release(dst);
895 		return NULL;
896 	}
897 
898 	rt = (struct rt6_info *)dst;
899 	/* Yes, checking route validity in the not-connected
900 	 * case is not very simple. Take into account
901 	 * that we do not support routing by source, TOS,
902 	 * and MSG_DONTROUTE		--ANK (980726)
903 	 *
904 	 * 1. ip6_rt_check(): If route was host route,
905 	 *    check that cached destination is current.
906 	 *    If it is network route, we still may
907 	 *    check its validity using saved pointer
908 	 *    to the last used address: daddr_cache.
909 	 *    We do not want to save the whole address now
910 	 *    (because the main consumer of this service
911 	 *    is TCP, which does not have this problem),
912 	 *    so the last trick works only on connected
913 	 *    sockets.
914 	 * 2. oif also should be the same.
915 	 */
916 	if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
917 #ifdef CONFIG_IPV6_SUBTREES
918 	    ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
919 #endif
920 	   (!(fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) &&
921 	      (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex))) {
922 		dst_release(dst);
923 		dst = NULL;
924 	}
925 
926 out:
927 	return dst;
928 }
929 
930 static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
931 			       struct dst_entry **dst, struct flowi6 *fl6)
932 {
933 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
934 	struct neighbour *n;
935 	struct rt6_info *rt;
936 #endif
937 	int err;
938 	int flags = 0;
939 
940 	/* The correct way to handle this would be to do
941 	 * ip6_route_get_saddr, and then ip6_route_output; however,
942 	 * the route-specific preferred source forces the
943 	 * ip6_route_output call _before_ ip6_route_get_saddr.
944 	 *
945 	 * In source specific routing (no src=any default route),
946 	 * ip6_route_output will fail given a src=any saddr, so
947 	 * we try it again later.
948 	 */
949 	if (ipv6_addr_any(&fl6->saddr) && (!*dst || !(*dst)->error)) {
950 		struct rt6_info *rt;
951 		bool had_dst = *dst != NULL;
952 
953 		if (!had_dst)
954 			*dst = ip6_route_output(net, sk, fl6);
955 		rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;
956 		err = ip6_route_get_saddr(net, rt, &fl6->daddr,
957 					  sk ? inet6_sk(sk)->srcprefs : 0,
958 					  &fl6->saddr);
959 		if (err)
960 			goto out_err_release;
961 
962 		/* If we had an erroneous initial result, pretend it
963 		 * never existed and let the SA-enabled version take
964 		 * over.
965 		 */
966 		if (!had_dst && (*dst)->error) {
967 			dst_release(*dst);
968 			*dst = NULL;
969 		}
970 
971 		if (fl6->flowi6_oif)
972 			flags |= RT6_LOOKUP_F_IFACE;
973 	}
974 
975 	if (!*dst)
976 		*dst = ip6_route_output_flags(net, sk, fl6, flags);
977 
978 	err = (*dst)->error;
979 	if (err)
980 		goto out_err_release;
981 
982 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
983 	/*
984 	 * Here, if the dst entry we've looked up
985 	 * has a neighbour entry that is in the INCOMPLETE
986 	 * state and the src address from the flow is
987 	 * marked as OPTIMISTIC, we release the found
988 	 * dst entry and replace it with the dst entry
989 	 * of the nexthop router.
990 	 */
991 	rt = (struct rt6_info *) *dst;
992 	rcu_read_lock_bh();
993 	n = __ipv6_neigh_lookup_noref(rt->dst.dev,
994 				      rt6_nexthop(rt, &fl6->daddr));
995 	err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
996 	rcu_read_unlock_bh();
997 
998 	if (err) {
999 		struct inet6_ifaddr *ifp;
1000 		struct flowi6 fl_gw6;
1001 		int redirect;
1002 
1003 		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
1004 				      (*dst)->dev, 1);
1005 
1006 		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
1007 		if (ifp)
1008 			in6_ifa_put(ifp);
1009 
1010 		if (redirect) {
1011 			/*
1012 			 * We need to get the dst entry for the
1013 			 * default router instead
1014 			 */
1015 			dst_release(*dst);
1016 			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
1017 			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
1018 			*dst = ip6_route_output(net, sk, &fl_gw6);
1019 			err = (*dst)->error;
1020 			if (err)
1021 				goto out_err_release;
1022 		}
1023 	}
1024 #endif
1025 	if (ipv6_addr_v4mapped(&fl6->saddr) &&
1026 	    !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
1027 		err = -EAFNOSUPPORT;
1028 		goto out_err_release;
1029 	}
1030 
1031 	return 0;
1032 
1033 out_err_release:
1034 	dst_release(*dst);
1035 	*dst = NULL;
1036 
1037 	if (err == -ENETUNREACH)
1038 		IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
1039 	return err;
1040 }
1041 
1042 /**
1043  *	ip6_dst_lookup - perform route lookup on flow
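 *	@net: network namespace this route lookup is performed in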
1044  *	@sk: socket which provides route info
1045  *	@dst: pointer to dst_entry * for result
1046  *	@fl6: flow to lookup
1047  *
1048  *	This function performs a route lookup on the given flow.
1049  *
1050  *	It returns zero on success, or a standard errno code on error.
1051  */
1052 int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
1053 		   struct flowi6 *fl6)
1054 {
1055 	*dst = NULL;
1056 	return ip6_dst_lookup_tail(net, sk, dst, fl6);
1057 }
1058 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1059 
1060 /**
1061  *	ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1062  *	@sk: socket which provides route info
1063  *	@fl6: flow to lookup
1064  *	@final_dst: final destination address for ipsec lookup
1065  *
1066  *	This function performs a route lookup on the given flow.
1067  *
1068  *	It returns a valid dst pointer on success, or a pointer encoded
1069  *	error code.
1070  */
1071 struct dst_entry *ip6_dst_lookup_flow(const struct sock *sk, struct flowi6 *fl6,
1072 				      const struct in6_addr *final_dst)
1073 {
1074 	struct dst_entry *dst = NULL;
1075 	int err;
1076 
1077 	err = ip6_dst_lookup_tail(sock_net(sk), sk, &dst, fl6);
1078 	if (err)
1079 		return ERR_PTR(err);
1080 	if (final_dst)
1081 		fl6->daddr = *final_dst;
1082 
1083 	return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1084 }
1085 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
1086 
1087 /**
1088  *	ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1089  *	@sk: socket which provides the dst cache and route info
1090  *	@fl6: flow to lookup
1091  *	@final_dst: final destination address for ipsec lookup
1092  *
1093  *	This function performs a route lookup on the given flow with the
1094  *	possibility of using the cached route in the socket if it is valid.
1095  *	It will take the socket dst lock when operating on the dst cache.
1096  *	As a result, this function can only be used in process context.
1097  *
1098  *	It returns a valid dst pointer on success, or a pointer encoded
1099  *	error code.
1100  */
1101 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1102 					 const struct in6_addr *final_dst)
1103 {
1104 	struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1105 
1106 	dst = ip6_sk_dst_check(sk, dst, fl6);
1107 	if (!dst)
1108 		dst = ip6_dst_lookup_flow(sk, fl6, final_dst);
1109 
1110 	return dst;
1111 }
1112 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1113 
1114 static inline int ip6_ufo_append_data(struct sock *sk,
1115 			struct sk_buff_head *queue,
1116 			int getfrag(void *from, char *to, int offset, int len,
1117 			int odd, struct sk_buff *skb),
1118 			void *from, int length, int hh_len, int fragheaderlen,
1119 			int exthdrlen, int transhdrlen, int mtu,
1120 			unsigned int flags, const struct flowi6 *fl6)
1121 
1122 {
1123 	struct sk_buff *skb;
1124 	int err;
1125 
1126 	/* The network device supports UDP large send offload, so
1127 	 * create one single skb containing the complete UDP
1128 	 * datagram.
1129 	 */
1130 	skb = skb_peek_tail(queue);
1131 	if (!skb) {
1132 		skb = sock_alloc_send_skb(sk,
1133 			hh_len + fragheaderlen + transhdrlen + 20,
1134 			(flags & MSG_DONTWAIT), &err);
1135 		if (!skb)
1136 			return err;
1137 
1138 		/* reserve space for Hardware header */
1139 		skb_reserve(skb, hh_len);
1140 
1141 		/* create space for UDP/IP header */
1142 		skb_put(skb, fragheaderlen + transhdrlen);
1143 
1144 		/* initialize network header pointer */
1145 		skb_set_network_header(skb, exthdrlen);
1146 
1147 		/* initialize protocol header pointer */
1148 		skb->transport_header = skb->network_header + fragheaderlen;
1149 
1150 		skb->protocol = htons(ETH_P_IPV6);
1151 		skb->csum = 0;
1152 
1153 		if (flags & MSG_CONFIRM)
1154 			skb_set_dst_pending_confirm(skb, 1);
1155 
1156 		__skb_queue_tail(queue, skb);
1157 	} else if (skb_is_gso(skb)) {
1158 		goto append;
1159 	}
1160 
1161 	skb->ip_summed = CHECKSUM_PARTIAL;
1162 	/* Specify the length of each IPv6 datagram fragment.
1163 	 * It has to be a multiple of 8.
1164 	 */
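	/* Example (assuming a 1500-byte MTU and fragheaderlen = 40, i.e.
	 * just the IPv6 header): gso_size = (1500 - 40 - 8) & ~7 = 1448,
	 * so each resulting fragment carries 1448 bytes of UDP payload.
	 */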
1165 	skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
1166 				     sizeof(struct frag_hdr)) & ~7;
1167 	skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
1168 	skb_shinfo(skb)->ip6_frag_id = ipv6_select_ident(sock_net(sk),
1169 							 &fl6->daddr,
1170 							 &fl6->saddr);
1171 
1172 append:
1173 	return skb_append_datato_frags(sk, skb, getfrag, from,
1174 				       (length - transhdrlen));
1175 }
1176 
1177 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1178 					       gfp_t gfp)
1179 {
1180 	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1181 }
1182 
1183 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1184 						gfp_t gfp)
1185 {
1186 	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1187 }
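
/* Both helpers rely on the IPv6 extension-header length encoding: the
 * hdrlen field counts 8-octet units beyond the first 8 octets, so the
 * total size is (hdrlen + 1) * 8 bytes. Example: hdrlen = 2 means a
 * 24-byte header.
 */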
1188 
1189 static void ip6_append_data_mtu(unsigned int *mtu,
1190 				int *maxfraglen,
1191 				unsigned int fragheaderlen,
1192 				struct sk_buff *skb,
1193 				struct rt6_info *rt,
1194 				unsigned int orig_mtu)
1195 {
1196 	if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1197 		if (!skb) {
1198 			/* first fragment, reserve header_len */
1199 			*mtu = orig_mtu - rt->dst.header_len;
1200 
1201 		} else {
1202 			/*
1203 			 * this fragment is not the first, so the header
1204 			 * space is regarded as data space.
1205 			 */
1206 			*mtu = orig_mtu;
1207 		}
1208 		*maxfraglen = ((*mtu - fragheaderlen) & ~7)
1209 			      + fragheaderlen - sizeof(struct frag_hdr);
1210 	}
1211 }
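
/* Worked example (illustrative values): orig_mtu = 1500, fragheaderlen =
 * 40, no XFRM tunnel headers. For fragments after the first, mtu = 1500
 * and maxfraglen = ((1500 - 40) & ~7) + 40 - 8 = 1488, which keeps every
 * fragment's payload a multiple of 8 octets.
 */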
1212 
1213 static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
1214 			  struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
1215 			  struct rt6_info *rt, struct flowi6 *fl6)
1216 {
1217 	struct ipv6_pinfo *np = inet6_sk(sk);
1218 	unsigned int mtu;
1219 	struct ipv6_txoptions *opt = ipc6->opt;
1220 
1221 	/*
1222 	 * setup for corking
1223 	 */
1224 	if (opt) {
1225 		if (WARN_ON(v6_cork->opt))
1226 			return -EINVAL;
1227 
1228 		v6_cork->opt = kzalloc(opt->tot_len, sk->sk_allocation);
1229 		if (unlikely(!v6_cork->opt))
1230 			return -ENOBUFS;
1231 
1232 		v6_cork->opt->tot_len = opt->tot_len;
1233 		v6_cork->opt->opt_flen = opt->opt_flen;
1234 		v6_cork->opt->opt_nflen = opt->opt_nflen;
1235 
1236 		v6_cork->opt->dst0opt = ip6_opt_dup(opt->dst0opt,
1237 						    sk->sk_allocation);
1238 		if (opt->dst0opt && !v6_cork->opt->dst0opt)
1239 			return -ENOBUFS;
1240 
1241 		v6_cork->opt->dst1opt = ip6_opt_dup(opt->dst1opt,
1242 						    sk->sk_allocation);
1243 		if (opt->dst1opt && !v6_cork->opt->dst1opt)
1244 			return -ENOBUFS;
1245 
1246 		v6_cork->opt->hopopt = ip6_opt_dup(opt->hopopt,
1247 						   sk->sk_allocation);
1248 		if (opt->hopopt && !v6_cork->opt->hopopt)
1249 			return -ENOBUFS;
1250 
1251 		v6_cork->opt->srcrt = ip6_rthdr_dup(opt->srcrt,
1252 						    sk->sk_allocation);
1253 		if (opt->srcrt && !v6_cork->opt->srcrt)
1254 			return -ENOBUFS;
1255 
1256 		/* need source address above --miyazawa */
1257 	}
1258 	dst_hold(&rt->dst);
1259 	cork->base.dst = &rt->dst;
1260 	cork->fl.u.ip6 = *fl6;
1261 	v6_cork->hop_limit = ipc6->hlimit;
1262 	v6_cork->tclass = ipc6->tclass;
1263 	if (rt->dst.flags & DST_XFRM_TUNNEL)
1264 		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1265 		      rt->dst.dev->mtu : dst_mtu(&rt->dst);
1266 	else
1267 		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1268 		      rt->dst.dev->mtu : dst_mtu(rt->dst.path);
1269 	if (np->frag_size < mtu) {
1270 		if (np->frag_size)
1271 			mtu = np->frag_size;
1272 	}
1273 	cork->base.fragsize = mtu;
1274 	if (dst_allfrag(rt->dst.path))
1275 		cork->base.flags |= IPCORK_ALLFRAG;
1276 	cork->base.length = 0;
1277 
1278 	return 0;
1279 }
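
/* MTU selection above, by example (hypothetical values): with
 * IPV6_PMTUDISC_PROBE the device MTU (say 1500) is used even if the path
 * MTU is smaller; otherwise the dst's path MTU (say 1400) wins. A
 * smaller nonzero per-socket IPV6_MTU (np->frag_size) caps either result.
 */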
1280 
1281 static int __ip6_append_data(struct sock *sk,
1282 			     struct flowi6 *fl6,
1283 			     struct sk_buff_head *queue,
1284 			     struct inet_cork *cork,
1285 			     struct inet6_cork *v6_cork,
1286 			     struct page_frag *pfrag,
1287 			     int getfrag(void *from, char *to, int offset,
1288 					 int len, int odd, struct sk_buff *skb),
1289 			     void *from, int length, int transhdrlen,
1290 			     unsigned int flags, struct ipcm6_cookie *ipc6,
1291 			     const struct sockcm_cookie *sockc)
1292 {
1293 	struct sk_buff *skb, *skb_prev = NULL;
1294 	unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu;
1295 	int exthdrlen = 0;
1296 	int dst_exthdrlen = 0;
1297 	int hh_len;
1298 	int copy;
1299 	int err;
1300 	int offset = 0;
1301 	__u8 tx_flags = 0;
1302 	u32 tskey = 0;
1303 	struct rt6_info *rt = (struct rt6_info *)cork->dst;
1304 	struct ipv6_txoptions *opt = v6_cork->opt;
1305 	int csummode = CHECKSUM_NONE;
1306 	unsigned int maxnonfragsize, headersize;
1307 
1308 	skb = skb_peek_tail(queue);
1309 	if (!skb) {
1310 		exthdrlen = opt ? opt->opt_flen : 0;
1311 		dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
1312 	}
1313 
1314 	mtu = cork->fragsize;
1315 	orig_mtu = mtu;
1316 
1317 	hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1318 
1319 	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1320 			(opt ? opt->opt_nflen : 0);
1321 	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
1322 		     sizeof(struct frag_hdr);
1323 
1324 	headersize = sizeof(struct ipv6hdr) +
1325 		     (opt ? opt->opt_flen + opt->opt_nflen : 0) +
1326 		     (dst_allfrag(&rt->dst) ?
1327 		      sizeof(struct frag_hdr) : 0) +
1328 		     rt->rt6i_nfheader_len;
1329 
1330 	if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
1331 	    (sk->sk_protocol == IPPROTO_UDP ||
1332 	     sk->sk_protocol == IPPROTO_RAW)) {
1333 		ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
1334 				sizeof(struct ipv6hdr));
1335 		goto emsgsize;
1336 	}
1337 
1338 	if (ip6_sk_ignore_df(sk))
1339 		maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
1340 	else
1341 		maxnonfragsize = mtu;
1342 
1343 	if (cork->length + length > maxnonfragsize - headersize) {
1344 emsgsize:
1345 		ipv6_local_error(sk, EMSGSIZE, fl6,
1346 				 mtu - headersize +
1347 				 sizeof(struct ipv6hdr));
1348 		return -EMSGSIZE;
1349 	}
1350 
1351 	/* CHECKSUM_PARTIAL only with no extension headers and when
1352 	 * we are not going to fragment
1353 	 */
1354 	if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
1355 	    headersize == sizeof(struct ipv6hdr) &&
1356 	    length <= mtu - headersize &&
1357 	    !(flags & MSG_MORE) &&
1358 	    rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
1359 		csummode = CHECKSUM_PARTIAL;
1360 
1361 	if (sk->sk_type == SOCK_DGRAM || sk->sk_type == SOCK_RAW) {
1362 		sock_tx_timestamp(sk, sockc->tsflags, &tx_flags);
1363 		if (tx_flags & SKBTX_ANY_SW_TSTAMP &&
1364 		    sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
1365 			tskey = sk->sk_tskey++;
1366 	}
1367 
1368 	/*
1369 	 * Let's try using as much space as possible.
1370 	 * Use MTU if total length of the message fits into the MTU.
1371 	 * Otherwise, we need to reserve fragment header and
1372 	 * fragment alignment (= 8-15 octets, in total).
1373 	 *
1374 	 * Note that we may need to "move" the data from the tail
1375 	 * of the buffer to the new fragment when we split
1376 	 * the message.
1377 	 *
1378 	 * FIXME: It may be fragmented into multiple chunks
1379 	 *        at once if non-fragmentable extension headers
1380 	 *        are too large.
1381 	 * --yoshfuji
1382 	 */
1383 
1384 	cork->length += length;
1385 	if ((((length + fragheaderlen) > mtu) ||
1386 	     (skb && skb_is_gso(skb))) &&
1387 	    (sk->sk_protocol == IPPROTO_UDP) &&
1388 	    (rt->dst.dev->features & NETIF_F_UFO) && !rt->dst.header_len &&
1389 	    (sk->sk_type == SOCK_DGRAM) && !udp_get_no_check6_tx(sk)) {
1390 		err = ip6_ufo_append_data(sk, queue, getfrag, from, length,
1391 					  hh_len, fragheaderlen, exthdrlen,
1392 					  transhdrlen, mtu, flags, fl6);
1393 		if (err)
1394 			goto error;
1395 		return 0;
1396 	}
1397 
1398 	if (!skb)
1399 		goto alloc_new_skb;
1400 
1401 	while (length > 0) {
1402 		/* Check if the remaining data fits into current packet. */
1403 		copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1404 		if (copy < length)
1405 			copy = maxfraglen - skb->len;
1406 
1407 		if (copy <= 0) {
1408 			char *data;
1409 			unsigned int datalen;
1410 			unsigned int fraglen;
1411 			unsigned int fraggap;
1412 			unsigned int alloclen;
1413 alloc_new_skb:
1414 			/* There's no room in the current skb */
1415 			if (skb)
1416 				fraggap = skb->len - maxfraglen;
1417 			else
1418 				fraggap = 0;
1419 			/* update mtu and maxfraglen if necessary */
1420 			if (!skb || !skb_prev)
1421 				ip6_append_data_mtu(&mtu, &maxfraglen,
1422 						    fragheaderlen, skb, rt,
1423 						    orig_mtu);
1424 
1425 			skb_prev = skb;
1426 
1427 			/*
1428 			 * If remaining data exceeds the mtu,
1429 			 * we know we need more fragment(s).
1430 			 */
1431 			datalen = length + fraggap;
1432 
1433 			if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1434 				datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
1435 			if ((flags & MSG_MORE) &&
1436 			    !(rt->dst.dev->features&NETIF_F_SG))
1437 				alloclen = mtu;
1438 			else
1439 				alloclen = datalen + fragheaderlen;
1440 
1441 			alloclen += dst_exthdrlen;
1442 
1443 			if (datalen != length + fraggap) {
1444 				/*
1445 				 * this is not the last fragment, the trailer
1446 				 * space is regarded as data space.
1447 				 */
1448 				datalen += rt->dst.trailer_len;
1449 			}
1450 
1451 			alloclen += rt->dst.trailer_len;
1452 			fraglen = datalen + fragheaderlen;
1453 
1454 			/*
1455 			 * We just reserve space for the fragment header.
1456 			 * Note: this may be an overallocation if the message
1457 			 * (without MSG_MORE) fits into the MTU.
1458 			 */
1459 			alloclen += sizeof(struct frag_hdr);
1460 
1461 			if (transhdrlen) {
1462 				skb = sock_alloc_send_skb(sk,
1463 						alloclen + hh_len,
1464 						(flags & MSG_DONTWAIT), &err);
1465 			} else {
1466 				skb = NULL;
1467 				if (atomic_read(&sk->sk_wmem_alloc) <=
1468 				    2 * sk->sk_sndbuf)
1469 					skb = sock_wmalloc(sk,
1470 							   alloclen + hh_len, 1,
1471 							   sk->sk_allocation);
1472 				if (unlikely(!skb))
1473 					err = -ENOBUFS;
1474 			}
1475 			if (!skb)
1476 				goto error;
1477 			/*
1478 			 *	Fill in the control structures
1479 			 */
1480 			skb->protocol = htons(ETH_P_IPV6);
1481 			skb->ip_summed = csummode;
1482 			skb->csum = 0;
1483 			/* reserve for fragmentation and ipsec header */
1484 			skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1485 				    dst_exthdrlen);
1486 
1487 			/* Only the initial fragment is time stamped */
1488 			skb_shinfo(skb)->tx_flags = tx_flags;
1489 			tx_flags = 0;
1490 			skb_shinfo(skb)->tskey = tskey;
1491 			tskey = 0;
1492 
1493 			/*
1494 			 *	Find where to start putting bytes
1495 			 */
1496 			data = skb_put(skb, fraglen);
1497 			skb_set_network_header(skb, exthdrlen);
1498 			data += fragheaderlen;
1499 			skb->transport_header = (skb->network_header +
1500 						 fragheaderlen);
1501 			if (fraggap) {
1502 				skb->csum = skb_copy_and_csum_bits(
1503 					skb_prev, maxfraglen,
1504 					data + transhdrlen, fraggap, 0);
1505 				skb_prev->csum = csum_sub(skb_prev->csum,
1506 							  skb->csum);
1507 				data += fraggap;
1508 				pskb_trim_unique(skb_prev, maxfraglen);
1509 			}
1510 			copy = datalen - transhdrlen - fraggap;
1511 
1512 			if (copy < 0) {
1513 				err = -EINVAL;
1514 				kfree_skb(skb);
1515 				goto error;
1516 			} else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
1517 				err = -EFAULT;
1518 				kfree_skb(skb);
1519 				goto error;
1520 			}
1521 
1522 			offset += copy;
1523 			length -= datalen - fraggap;
1524 			transhdrlen = 0;
1525 			exthdrlen = 0;
1526 			dst_exthdrlen = 0;
1527 
1528 			if ((flags & MSG_CONFIRM) && !skb_prev)
1529 				skb_set_dst_pending_confirm(skb, 1);
1530 
1531 			/*
1532 			 * Put the packet on the pending queue
1533 			 */
1534 			__skb_queue_tail(queue, skb);
1535 			continue;
1536 		}
1537 
1538 		if (copy > length)
1539 			copy = length;
1540 
1541 		if (!(rt->dst.dev->features&NETIF_F_SG)) {
1542 			unsigned int off;
1543 
1544 			off = skb->len;
1545 			if (getfrag(from, skb_put(skb, copy),
1546 						offset, copy, off, skb) < 0) {
1547 				__skb_trim(skb, off);
1548 				err = -EFAULT;
1549 				goto error;
1550 			}
1551 		} else {
1552 			int i = skb_shinfo(skb)->nr_frags;
1553 
1554 			err = -ENOMEM;
1555 			if (!sk_page_frag_refill(sk, pfrag))
1556 				goto error;
1557 
1558 			if (!skb_can_coalesce(skb, i, pfrag->page,
1559 					      pfrag->offset)) {
1560 				err = -EMSGSIZE;
1561 				if (i == MAX_SKB_FRAGS)
1562 					goto error;
1563 
1564 				__skb_fill_page_desc(skb, i, pfrag->page,
1565 						     pfrag->offset, 0);
1566 				skb_shinfo(skb)->nr_frags = ++i;
1567 				get_page(pfrag->page);
1568 			}
1569 			copy = min_t(int, copy, pfrag->size - pfrag->offset);
1570 			if (getfrag(from,
1571 				    page_address(pfrag->page) + pfrag->offset,
1572 				    offset, copy, skb->len, skb) < 0)
1573 				goto error_efault;
1574 
1575 			pfrag->offset += copy;
1576 			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1577 			skb->len += copy;
1578 			skb->data_len += copy;
1579 			skb->truesize += copy;
1580 			atomic_add(copy, &sk->sk_wmem_alloc);
1581 		}
1582 		offset += copy;
1583 		length -= copy;
1584 	}
1585 
1586 	return 0;
1587 
1588 error_efault:
1589 	err = -EFAULT;
1590 error:
1591 	cork->length -= length;
1592 	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1593 	return err;
1594 }
1595 
1596 int ip6_append_data(struct sock *sk,
1597 		    int getfrag(void *from, char *to, int offset, int len,
1598 				int odd, struct sk_buff *skb),
1599 		    void *from, int length, int transhdrlen,
1600 		    struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1601 		    struct rt6_info *rt, unsigned int flags,
1602 		    const struct sockcm_cookie *sockc)
1603 {
1604 	struct inet_sock *inet = inet_sk(sk);
1605 	struct ipv6_pinfo *np = inet6_sk(sk);
1606 	int exthdrlen;
1607 	int err;
1608 
1609 	if (flags&MSG_PROBE)
1610 		return 0;
1611 	if (skb_queue_empty(&sk->sk_write_queue)) {
1612 		/*
1613 		 * setup for corking
1614 		 */
1615 		err = ip6_setup_cork(sk, &inet->cork, &np->cork,
1616 				     ipc6, rt, fl6);
1617 		if (err)
1618 			return err;
1619 
1620 		exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1621 		length += exthdrlen;
1622 		transhdrlen += exthdrlen;
1623 	} else {
1624 		fl6 = &inet->cork.fl.u.ip6;
1625 		transhdrlen = 0;
1626 	}
1627 
1628 	return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base,
1629 				 &np->cork, sk_page_frag(sk), getfrag,
1630 				 from, length, transhdrlen, flags, ipc6, sockc);
1631 }
1632 EXPORT_SYMBOL_GPL(ip6_append_data);
1633 
1634 static void ip6_cork_release(struct inet_cork_full *cork,
1635 			     struct inet6_cork *v6_cork)
1636 {
1637 	if (v6_cork->opt) {
1638 		kfree(v6_cork->opt->dst0opt);
1639 		kfree(v6_cork->opt->dst1opt);
1640 		kfree(v6_cork->opt->hopopt);
1641 		kfree(v6_cork->opt->srcrt);
1642 		kfree(v6_cork->opt);
1643 		v6_cork->opt = NULL;
1644 	}
1645 
1646 	if (cork->base.dst) {
1647 		dst_release(cork->base.dst);
1648 		cork->base.dst = NULL;
1649 		cork->base.flags &= ~IPCORK_ALLFRAG;
1650 	}
1651 	memset(&cork->fl, 0, sizeof(cork->fl));
1652 }
1653 
1654 struct sk_buff *__ip6_make_skb(struct sock *sk,
1655 			       struct sk_buff_head *queue,
1656 			       struct inet_cork_full *cork,
1657 			       struct inet6_cork *v6_cork)
1658 {
1659 	struct sk_buff *skb, *tmp_skb;
1660 	struct sk_buff **tail_skb;
1661 	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1662 	struct ipv6_pinfo *np = inet6_sk(sk);
1663 	struct net *net = sock_net(sk);
1664 	struct ipv6hdr *hdr;
1665 	struct ipv6_txoptions *opt = v6_cork->opt;
1666 	struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
1667 	struct flowi6 *fl6 = &cork->fl.u.ip6;
1668 	unsigned char proto = fl6->flowi6_proto;
1669 
1670 	skb = __skb_dequeue(queue);
1671 	if (!skb)
1672 		goto out;
1673 	tail_skb = &(skb_shinfo(skb)->frag_list);
1674 
1675 	/* move skb->data from the ext header to the IP header */
1676 	if (skb->data < skb_network_header(skb))
1677 		__skb_pull(skb, skb_network_offset(skb));
1678 	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
1679 		__skb_pull(tmp_skb, skb_network_header_len(skb));
1680 		*tail_skb = tmp_skb;
1681 		tail_skb = &(tmp_skb->next);
1682 		skb->len += tmp_skb->len;
1683 		skb->data_len += tmp_skb->len;
1684 		skb->truesize += tmp_skb->truesize;
1685 		tmp_skb->destructor = NULL;
1686 		tmp_skb->sk = NULL;
1687 	}
1688 
1689 	/* Allow local fragmentation. */
1690 	skb->ignore_df = ip6_sk_ignore_df(sk);
1691 
1692 	*final_dst = fl6->daddr;
1693 	__skb_pull(skb, skb_network_header_len(skb));
1694 	if (opt && opt->opt_flen)
1695 		ipv6_push_frag_opts(skb, opt, &proto);
1696 	if (opt && opt->opt_nflen)
1697 		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);
1698 
1699 	skb_push(skb, sizeof(struct ipv6hdr));
1700 	skb_reset_network_header(skb);
1701 	hdr = ipv6_hdr(skb);
1702 
1703 	ip6_flow_hdr(hdr, v6_cork->tclass,
1704 		     ip6_make_flowlabel(net, skb, fl6->flowlabel,
1705 					np->autoflowlabel, fl6));
1706 	hdr->hop_limit = v6_cork->hop_limit;
1707 	hdr->nexthdr = proto;
1708 	hdr->saddr = fl6->saddr;
1709 	hdr->daddr = *final_dst;
1710 
1711 	skb->priority = sk->sk_priority;
1712 	skb->mark = sk->sk_mark;
1713 
1714 	skb_dst_set(skb, dst_clone(&rt->dst));
1715 	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1716 	if (proto == IPPROTO_ICMPV6) {
1717 		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1718 
1719 		ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
1720 		ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
1721 	}
1722 
1723 	ip6_cork_release(cork, v6_cork);
1724 out:
1725 	return skb;
1726 }
1727 
1728 int ip6_send_skb(struct sk_buff *skb)
1729 {
1730 	struct net *net = sock_net(skb->sk);
1731 	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
1732 	int err;
1733 
1734 	err = ip6_local_out(net, skb->sk, skb);
1735 	if (err) {
1736 		if (err > 0)
1737 			err = net_xmit_errno(err);
1738 		if (err)
1739 			IP6_INC_STATS(net, rt->rt6i_idev,
1740 				      IPSTATS_MIB_OUTDISCARDS);
1741 	}
1742 
1743 	return err;
1744 }
1745 
1746 int ip6_push_pending_frames(struct sock *sk)
1747 {
1748 	struct sk_buff *skb;
1749 
1750 	skb = ip6_finish_skb(sk);
1751 	if (!skb)
1752 		return 0;
1753 
1754 	return ip6_send_skb(skb);
1755 }
1756 EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
1757 
1758 static void __ip6_flush_pending_frames(struct sock *sk,
1759 				       struct sk_buff_head *queue,
1760 				       struct inet_cork_full *cork,
1761 				       struct inet6_cork *v6_cork)
1762 {
1763 	struct sk_buff *skb;
1764 
1765 	while ((skb = __skb_dequeue_tail(queue)) != NULL) {
1766 		if (skb_dst(skb))
1767 			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1768 				      IPSTATS_MIB_OUTDISCARDS);
1769 		kfree_skb(skb);
1770 	}
1771 
1772 	ip6_cork_release(cork, v6_cork);
1773 }
1774 
1775 void ip6_flush_pending_frames(struct sock *sk)
1776 {
1777 	__ip6_flush_pending_frames(sk, &sk->sk_write_queue,
1778 				   &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
1779 }
1780 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
1781 
1782 struct sk_buff *ip6_make_skb(struct sock *sk,
1783 			     int getfrag(void *from, char *to, int offset,
1784 					 int len, int odd, struct sk_buff *skb),
1785 			     void *from, int length, int transhdrlen,
1786 			     struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1787 			     struct rt6_info *rt, unsigned int flags,
1788 			     const struct sockcm_cookie *sockc)
1789 {
1790 	struct inet_cork_full cork;
1791 	struct inet6_cork v6_cork;
1792 	struct sk_buff_head queue;
1793 	int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1794 	int err;
1795 
1796 	if (flags & MSG_PROBE)
1797 		return NULL;
1798 
1799 	__skb_queue_head_init(&queue);
1800 
1801 	cork.base.flags = 0;
1802 	cork.base.addr = 0;
1803 	cork.base.opt = NULL;
1804 	v6_cork.opt = NULL;
1805 	err = ip6_setup_cork(sk, &cork, &v6_cork, ipc6, rt, fl6);
1806 	if (err)
1807 		return ERR_PTR(err);
1808 
1809 	if (ipc6->dontfrag < 0)
1810 		ipc6->dontfrag = inet6_sk(sk)->dontfrag;
1811 
1812 	err = __ip6_append_data(sk, fl6, &queue, &cork.base, &v6_cork,
1813 				&current->task_frag, getfrag, from,
1814 				length + exthdrlen, transhdrlen + exthdrlen,
1815 				flags, ipc6, sockc);
1816 	if (err) {
1817 		__ip6_flush_pending_frames(sk, &queue, &cork, &v6_cork);
1818 		return ERR_PTR(err);
1819 	}
1820 
1821 	return __ip6_make_skb(sk, &queue, &cork, &v6_cork);
1822 }
1823