xref: /openbmc/linux/net/ipv6/ip6_output.c (revision 9dae47aba0a055f761176d9297371d5bb24289ec)
1 /*
2  *	IPv6 output functions
3  *	Linux INET6 implementation
4  *
5  *	Authors:
6  *	Pedro Roque		<roque@di.fc.ul.pt>
7  *
8  *	Based on linux/net/ipv4/ip_output.c
9  *
10  *	This program is free software; you can redistribute it and/or
11  *      modify it under the terms of the GNU General Public License
12  *      as published by the Free Software Foundation; either version
13  *      2 of the License, or (at your option) any later version.
14  *
15  *	Changes:
16  *	A.N.Kuznetsov	:	arithmetic in fragmentation.
17  *				extension headers are implemented.
18  *				route changes now work.
19  *				ip6_forward does not confuse sniffers.
20  *				etc.
21  *
22  *      H. von Brand    :       Added missing #include <linux/string.h>
23  *	Imran Patel	:	frag id should be in NBO
24  *      Kazunori MIYAZAWA @USAGI
25  *			:       add ip6_append_data and related functions
26  *				for datagram xmit
27  */
28 
29 #include <linux/errno.h>
30 #include <linux/kernel.h>
31 #include <linux/string.h>
32 #include <linux/socket.h>
33 #include <linux/net.h>
34 #include <linux/netdevice.h>
35 #include <linux/if_arp.h>
36 #include <linux/in6.h>
37 #include <linux/tcp.h>
38 #include <linux/route.h>
39 #include <linux/module.h>
40 #include <linux/slab.h>
41 
42 #include <linux/bpf-cgroup.h>
43 #include <linux/netfilter.h>
44 #include <linux/netfilter_ipv6.h>
45 
46 #include <net/sock.h>
47 #include <net/snmp.h>
48 
49 #include <net/ipv6.h>
50 #include <net/ndisc.h>
51 #include <net/protocol.h>
52 #include <net/ip6_route.h>
53 #include <net/addrconf.h>
54 #include <net/rawv6.h>
55 #include <net/icmp.h>
56 #include <net/xfrm.h>
57 #include <net/checksum.h>
58 #include <linux/mroute6.h>
59 #include <net/l3mdev.h>
60 #include <net/lwtunnel.h>
61 
62 static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
63 {
64 	struct dst_entry *dst = skb_dst(skb);
65 	struct net_device *dev = dst->dev;
66 	struct neighbour *neigh;
67 	struct in6_addr *nexthop;
68 	int ret;
69 
70 	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
71 		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
72 
73 		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
74 		    ((mroute6_socket(net, skb) &&
75 		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
76 		     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
77 					 &ipv6_hdr(skb)->saddr))) {
78 			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
79 
80 			/* Do not check for IFF_ALLMULTI; multicast routing
81 			   is not supported in any case.
82 			 */
83 			if (newskb)
84 				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
85 					net, sk, newskb, NULL, newskb->dev,
86 					dev_loopback_xmit);
87 
88 			if (ipv6_hdr(skb)->hop_limit == 0) {
89 				IP6_INC_STATS(net, idev,
90 					      IPSTATS_MIB_OUTDISCARDS);
91 				kfree_skb(skb);
92 				return 0;
93 			}
94 		}
95 
96 		IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);
97 
98 		if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
99 		    IPV6_ADDR_SCOPE_NODELOCAL &&
100 		    !(dev->flags & IFF_LOOPBACK)) {
101 			kfree_skb(skb);
102 			return 0;
103 		}
104 	}
105 
106 	if (lwtunnel_xmit_redirect(dst->lwtstate)) {
107 		int res = lwtunnel_xmit(skb);
108 
109 		if (res < 0 || res == LWTUNNEL_XMIT_DONE)
110 			return res;
111 	}
112 
113 	rcu_read_lock_bh();
114 	nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
115 	neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
116 	if (unlikely(!neigh))
117 		neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
118 	if (!IS_ERR(neigh)) {
119 		sock_confirm_neigh(skb, neigh);
120 		ret = neigh_output(neigh, skb);
121 		rcu_read_unlock_bh();
122 		return ret;
123 	}
124 	rcu_read_unlock_bh();
125 
126 	IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
127 	kfree_skb(skb);
128 	return -EINVAL;
129 }
130 
131 static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
132 {
133 	int ret;
134 
135 	ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
136 	if (ret) {
137 		kfree_skb(skb);
138 		return ret;
139 	}
140 
141 #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
142 	/* Policy lookup after SNAT yielded a new policy */
143 	if (skb_dst(skb)->xfrm) {
144 		IPCB(skb)->flags |= IPSKB_REROUTED;
145 		return dst_output(net, sk, skb);
146 	}
147 #endif
148 
149 	if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
150 	    dst_allfrag(skb_dst(skb)) ||
151 	    (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
152 		return ip6_fragment(net, sk, skb, ip6_finish_output2);
153 	else
154 		return ip6_finish_output2(net, sk, skb);
155 }
156 
157 int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
158 {
159 	struct net_device *dev = skb_dst(skb)->dev;
160 	struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
161 
162 	skb->protocol = htons(ETH_P_IPV6);
163 	skb->dev = dev;
164 
165 	if (unlikely(idev->cnf.disable_ipv6)) {
166 		IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
167 		kfree_skb(skb);
168 		return 0;
169 	}
170 
171 	return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
172 			    net, sk, skb, NULL, dev,
173 			    ip6_finish_output,
174 			    !(IP6CB(skb)->flags & IP6SKB_REROUTED));
175 }
176 
177 static bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np)
178 {
179 	if (!np->autoflowlabel_set)
180 		return ip6_default_np_autolabel(net);
181 	else
182 		return np->autoflowlabel;
183 }
184 
185 /*
186  * xmit an sk_buff (used by TCP, SCTP and DCCP)
187  * Note : socket lock is not held for SYNACK packets, but the skb may be
188  * modified by calls to skb_set_owner_w() and ipv6_local_error(),
189  * which use proper atomic operations or spinlocks.
190  */
191 int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
192 	     __u32 mark, struct ipv6_txoptions *opt, int tclass)
193 {
194 	struct net *net = sock_net(sk);
195 	const struct ipv6_pinfo *np = inet6_sk(sk);
196 	struct in6_addr *first_hop = &fl6->daddr;
197 	struct dst_entry *dst = skb_dst(skb);
198 	struct ipv6hdr *hdr;
199 	u8  proto = fl6->flowi6_proto;
200 	int seg_len = skb->len;
201 	int hlimit = -1;
202 	u32 mtu;
203 
204 	if (opt) {
205 		unsigned int head_room;
206 
207 		/* First: exthdrs may take lots of space (~8K for now);
208 		   MAX_HEADER is not enough.
209 		 */
210 		head_room = opt->opt_nflen + opt->opt_flen;
211 		seg_len += head_room;
212 		head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
213 
214 		if (skb_headroom(skb) < head_room) {
215 			struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
216 			if (!skb2) {
217 				IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
218 					      IPSTATS_MIB_OUTDISCARDS);
219 				kfree_skb(skb);
220 				return -ENOBUFS;
221 			}
222 			consume_skb(skb);
223 			skb = skb2;
224 			/* skb_set_owner_w() changes sk->sk_wmem_alloc atomically,
225 			 * it is safe to call in our context (socket lock not held)
226 			 */
227 			skb_set_owner_w(skb, (struct sock *)sk);
228 		}
229 		if (opt->opt_flen)
230 			ipv6_push_frag_opts(skb, opt, &proto);
231 		if (opt->opt_nflen)
232 			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
233 					     &fl6->saddr);
234 	}
235 
236 	skb_push(skb, sizeof(struct ipv6hdr));
237 	skb_reset_network_header(skb);
238 	hdr = ipv6_hdr(skb);
239 
240 	/*
241 	 *	Fill in the IPv6 header
242 	 */
243 	if (np)
244 		hlimit = np->hop_limit;
245 	if (hlimit < 0)
246 		hlimit = ip6_dst_hoplimit(dst);
247 
248 	ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
249 				ip6_autoflowlabel(net, np), fl6));
250 
251 	hdr->payload_len = htons(seg_len);
252 	hdr->nexthdr = proto;
253 	hdr->hop_limit = hlimit;
254 
255 	hdr->saddr = fl6->saddr;
256 	hdr->daddr = *first_hop;
257 
258 	skb->protocol = htons(ETH_P_IPV6);
259 	skb->priority = sk->sk_priority;
260 	skb->mark = mark;
261 
262 	mtu = dst_mtu(dst);
263 	if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
264 		IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
265 			      IPSTATS_MIB_OUT, skb->len);
266 
267 		/* if the egress device is enslaved to an L3 master device, pass the
268 		 * skb to its handler for processing
269 		 */
270 		skb = l3mdev_ip6_out((struct sock *)sk, skb);
271 		if (unlikely(!skb))
272 			return 0;
273 
274 		/* hooks should never assume socket lock is held.
275 		 * we promote our socket to non const
276 		 */
277 		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
278 			       net, (struct sock *)sk, skb, NULL, dst->dev,
279 			       dst_output);
280 	}
281 
282 	skb->dev = dst->dev;
283 	/* ipv6_local_error() does not require socket lock,
284 	 * we promote our socket to non const
285 	 */
286 	ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);
287 
288 	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
289 	kfree_skb(skb);
290 	return -EMSGSIZE;
291 }
292 EXPORT_SYMBOL(ip6_xmit);
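
/*
 * Editor's illustrative sketch, not part of this file: a connection
 * oriented protocol hands ip6_xmit() a fully built segment with the
 * route already attached to the skb.  The helper below only shows the
 * call convention of ip6_xmit() as defined above; the helper name and
 * the choice of a NULL option block and tclass 0 are assumptions, not
 * kernel API.
 */
static inline int example_stream_xmit(struct sock *sk, struct sk_buff *skb,
				      struct flowi6 *fl6)
{
	/* The caller must have set skb_dst(skb); ip6_xmit() reads it. */
	return ip6_xmit(sk, skb, fl6, sk->sk_mark, NULL, 0);
}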
293 
294 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
295 {
296 	struct ip6_ra_chain *ra;
297 	struct sock *last = NULL;
298 
299 	read_lock(&ip6_ra_lock);
300 	for (ra = ip6_ra_chain; ra; ra = ra->next) {
301 		struct sock *sk = ra->sk;
302 		if (sk && ra->sel == sel &&
303 		    (!sk->sk_bound_dev_if ||
304 		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
305 			if (last) {
306 				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
307 				if (skb2)
308 					rawv6_rcv(last, skb2);
309 			}
310 			last = sk;
311 		}
312 	}
313 
314 	if (last) {
315 		rawv6_rcv(last, skb);
316 		read_unlock(&ip6_ra_lock);
317 		return 1;
318 	}
319 	read_unlock(&ip6_ra_lock);
320 	return 0;
321 }
322 
323 static int ip6_forward_proxy_check(struct sk_buff *skb)
324 {
325 	struct ipv6hdr *hdr = ipv6_hdr(skb);
326 	u8 nexthdr = hdr->nexthdr;
327 	__be16 frag_off;
328 	int offset;
329 
330 	if (ipv6_ext_hdr(nexthdr)) {
331 		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
332 		if (offset < 0)
333 			return 0;
334 	} else
335 		offset = sizeof(struct ipv6hdr);
336 
337 	if (nexthdr == IPPROTO_ICMPV6) {
338 		struct icmp6hdr *icmp6;
339 
340 		if (!pskb_may_pull(skb, (skb_network_header(skb) +
341 					 offset + 1 - skb->data)))
342 			return 0;
343 
344 		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
345 
346 		switch (icmp6->icmp6_type) {
347 		case NDISC_ROUTER_SOLICITATION:
348 		case NDISC_ROUTER_ADVERTISEMENT:
349 		case NDISC_NEIGHBOUR_SOLICITATION:
350 		case NDISC_NEIGHBOUR_ADVERTISEMENT:
351 		case NDISC_REDIRECT:
352 			/* For unicast neighbour discovery messages
353 			 * destined to the proxied address, pass them to
354 			 * the input function.
355 			 */
356 			return 1;
357 		default:
358 			break;
359 		}
360 	}
361 
362 	/*
363 	 * The proxying router can't forward traffic sent to a link-local
364 	 * address, so signal the sender and discard the packet. This
365 	 * behavior is clarified by the MIPv6 specification.
366 	 */
367 	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
368 		dst_link_failure(skb);
369 		return -1;
370 	}
371 
372 	return 0;
373 }
374 
375 static inline int ip6_forward_finish(struct net *net, struct sock *sk,
376 				     struct sk_buff *skb)
377 {
378 	return dst_output(net, sk, skb);
379 }
380 
381 static unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst)
382 {
383 	unsigned int mtu;
384 	struct inet6_dev *idev;
385 
386 	if (dst_metric_locked(dst, RTAX_MTU)) {
387 		mtu = dst_metric_raw(dst, RTAX_MTU);
388 		if (mtu)
389 			return mtu;
390 	}
391 
392 	mtu = IPV6_MIN_MTU;
393 	rcu_read_lock();
394 	idev = __in6_dev_get(dst->dev);
395 	if (idev)
396 		mtu = idev->cnf.mtu6;
397 	rcu_read_unlock();
398 
399 	return mtu;
400 }
401 
402 static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
403 {
404 	if (skb->len <= mtu)
405 		return false;
406 
407 	/* ipv6 conntrack defrag sets max_frag_size + ignore_df */
408 	if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
409 		return true;
410 
411 	if (skb->ignore_df)
412 		return false;
413 
414 	if (skb_is_gso(skb) && skb_gso_validate_mtu(skb, mtu))
415 		return false;
416 
417 	return true;
418 }
419 
420 int ip6_forward(struct sk_buff *skb)
421 {
422 	struct dst_entry *dst = skb_dst(skb);
423 	struct ipv6hdr *hdr = ipv6_hdr(skb);
424 	struct inet6_skb_parm *opt = IP6CB(skb);
425 	struct net *net = dev_net(dst->dev);
426 	u32 mtu;
427 
428 	if (net->ipv6.devconf_all->forwarding == 0)
429 		goto error;
430 
431 	if (skb->pkt_type != PACKET_HOST)
432 		goto drop;
433 
434 	if (unlikely(skb->sk))
435 		goto drop;
436 
437 	if (skb_warn_if_lro(skb))
438 		goto drop;
439 
440 	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
441 		__IP6_INC_STATS(net, ip6_dst_idev(dst),
442 				IPSTATS_MIB_INDISCARDS);
443 		goto drop;
444 	}
445 
446 	skb_forward_csum(skb);
447 
448 	/*
449 	 *	We do NOT do any processing on
450 	 *	RA packets; we push them to user level AS IS
451 	 *	without any warranty that the application will be able
452 	 *	to interpret them. The reason is that we
453 	 *	cannot do anything clever here.
454 	 *
455 	 *	We are not the end node, so if the packet contains
456 	 *	AH/ESP, we cannot do anything with it.
457 	 *	Defragmentation would also be a mistake; RA packets
458 	 *	cannot be fragmented, because there is no guarantee
459 	 *	that different fragments will follow the same path. --ANK
460 	 */
461 	if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
462 		if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
463 			return 0;
464 	}
465 
466 	/*
467 	 *	check and decrement ttl
468 	 */
469 	if (hdr->hop_limit <= 1) {
470 		/* Force OUTPUT device used as source address */
471 		skb->dev = dst->dev;
472 		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
473 		__IP6_INC_STATS(net, ip6_dst_idev(dst),
474 				IPSTATS_MIB_INHDRERRORS);
475 
476 		kfree_skb(skb);
477 		return -ETIMEDOUT;
478 	}
479 
480 	/* XXX: idev->cnf.proxy_ndp? */
481 	if (net->ipv6.devconf_all->proxy_ndp &&
482 	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
483 		int proxied = ip6_forward_proxy_check(skb);
484 		if (proxied > 0)
485 			return ip6_input(skb);
486 		else if (proxied < 0) {
487 			__IP6_INC_STATS(net, ip6_dst_idev(dst),
488 					IPSTATS_MIB_INDISCARDS);
489 			goto drop;
490 		}
491 	}
492 
493 	if (!xfrm6_route_forward(skb)) {
494 		__IP6_INC_STATS(net, ip6_dst_idev(dst),
495 				IPSTATS_MIB_INDISCARDS);
496 		goto drop;
497 	}
498 	dst = skb_dst(skb);
499 
500 	/* IPv6 specs say nothing about it, but it is clear that we cannot
501 	   send redirects to source routed frames.
502 	   We don't send redirects to frames decapsulated from IPsec.
503 	 */
504 	if (skb->dev == dst->dev && opt->srcrt == 0 && !skb_sec_path(skb)) {
505 		struct in6_addr *target = NULL;
506 		struct inet_peer *peer;
507 		struct rt6_info *rt;
508 
509 		/*
510 		 *	incoming and outgoing devices are the same
511 		 *	send a redirect.
512 		 */
513 
514 		rt = (struct rt6_info *) dst;
515 		if (rt->rt6i_flags & RTF_GATEWAY)
516 			target = &rt->rt6i_gateway;
517 		else
518 			target = &hdr->daddr;
519 
520 		peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);
521 
522 		/* Limit redirects both by destination (here)
523 		   and by source (inside ndisc_send_redirect)
524 		 */
525 		if (inet_peer_xrlim_allow(peer, 1*HZ))
526 			ndisc_send_redirect(skb, target);
527 		if (peer)
528 			inet_putpeer(peer);
529 	} else {
530 		int addrtype = ipv6_addr_type(&hdr->saddr);
531 
532 		/* This check is security critical. */
533 		if (addrtype == IPV6_ADDR_ANY ||
534 		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
535 			goto error;
536 		if (addrtype & IPV6_ADDR_LINKLOCAL) {
537 			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
538 				    ICMPV6_NOT_NEIGHBOUR, 0);
539 			goto error;
540 		}
541 	}
542 
543 	mtu = ip6_dst_mtu_forward(dst);
544 	if (mtu < IPV6_MIN_MTU)
545 		mtu = IPV6_MIN_MTU;
546 
547 	if (ip6_pkt_too_big(skb, mtu)) {
548 		/* Again, force OUTPUT device used as source address */
549 		skb->dev = dst->dev;
550 		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
551 		__IP6_INC_STATS(net, ip6_dst_idev(dst),
552 				IPSTATS_MIB_INTOOBIGERRORS);
553 		__IP6_INC_STATS(net, ip6_dst_idev(dst),
554 				IPSTATS_MIB_FRAGFAILS);
555 		kfree_skb(skb);
556 		return -EMSGSIZE;
557 	}
558 
559 	if (skb_cow(skb, dst->dev->hard_header_len)) {
560 		__IP6_INC_STATS(net, ip6_dst_idev(dst),
561 				IPSTATS_MIB_OUTDISCARDS);
562 		goto drop;
563 	}
564 
565 	hdr = ipv6_hdr(skb);
566 
567 	/* Mangling hops number delayed to point after skb COW */
568 
569 	hdr->hop_limit--;
570 
571 	__IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
572 	__IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
573 	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
574 		       net, NULL, skb, skb->dev, dst->dev,
575 		       ip6_forward_finish);
576 
577 error:
578 	__IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
579 drop:
580 	kfree_skb(skb);
581 	return -EINVAL;
582 }
583 
584 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
585 {
586 	to->pkt_type = from->pkt_type;
587 	to->priority = from->priority;
588 	to->protocol = from->protocol;
589 	skb_dst_drop(to);
590 	skb_dst_set(to, dst_clone(skb_dst(from)));
591 	to->dev = from->dev;
592 	to->mark = from->mark;
593 
594 #ifdef CONFIG_NET_SCHED
595 	to->tc_index = from->tc_index;
596 #endif
597 	nf_copy(to, from);
598 	skb_copy_secmark(to, from);
599 }
600 
601 int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
602 		 int (*output)(struct net *, struct sock *, struct sk_buff *))
603 {
604 	struct sk_buff *frag;
605 	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
606 	struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
607 				inet6_sk(skb->sk) : NULL;
608 	struct ipv6hdr *tmp_hdr;
609 	struct frag_hdr *fh;
610 	unsigned int mtu, hlen, left, len;
611 	int hroom, troom;
612 	__be32 frag_id;
613 	int ptr, offset = 0, err = 0;
614 	u8 *prevhdr, nexthdr = 0;
615 
616 	err = ip6_find_1stfragopt(skb, &prevhdr);
617 	if (err < 0)
618 		goto fail;
619 	hlen = err;
620 	nexthdr = *prevhdr;
621 
622 	mtu = ip6_skb_dst_mtu(skb);
623 
624 	/* We must not fragment if the socket is set to force MTU discovery
625 	 * or if the skb is not generated by a local socket.
626 	 */
627 	if (unlikely(!skb->ignore_df && skb->len > mtu))
628 		goto fail_toobig;
629 
630 	if (IP6CB(skb)->frag_max_size) {
631 		if (IP6CB(skb)->frag_max_size > mtu)
632 			goto fail_toobig;
633 
634 		/* don't send fragments larger than what we received */
635 		mtu = IP6CB(skb)->frag_max_size;
636 		if (mtu < IPV6_MIN_MTU)
637 			mtu = IPV6_MIN_MTU;
638 	}
639 
640 	if (np && np->frag_size < mtu) {
641 		if (np->frag_size)
642 			mtu = np->frag_size;
643 	}
644 	if (mtu < hlen + sizeof(struct frag_hdr) + 8)
645 		goto fail_toobig;
646 	mtu -= hlen + sizeof(struct frag_hdr);
647 
648 	frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
649 				    &ipv6_hdr(skb)->saddr);
650 
651 	if (skb->ip_summed == CHECKSUM_PARTIAL &&
652 	    (err = skb_checksum_help(skb)))
653 		goto fail;
654 
655 	hroom = LL_RESERVED_SPACE(rt->dst.dev);
656 	if (skb_has_frag_list(skb)) {
657 		unsigned int first_len = skb_pagelen(skb);
658 		struct sk_buff *frag2;
659 
660 		if (first_len - hlen > mtu ||
661 		    ((first_len - hlen) & 7) ||
662 		    skb_cloned(skb) ||
663 		    skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
664 			goto slow_path;
665 
666 		skb_walk_frags(skb, frag) {
667 			/* Correct geometry. */
668 			if (frag->len > mtu ||
669 			    ((frag->len & 7) && frag->next) ||
670 			    skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
671 				goto slow_path_clean;
672 
673 			/* Partially cloned skb? */
674 			if (skb_shared(frag))
675 				goto slow_path_clean;
676 
677 			BUG_ON(frag->sk);
678 			if (skb->sk) {
679 				frag->sk = skb->sk;
680 				frag->destructor = sock_wfree;
681 			}
682 			skb->truesize -= frag->truesize;
683 		}
684 
685 		err = 0;
686 		offset = 0;
687 		/* BUILD HEADER */
688 
689 		*prevhdr = NEXTHDR_FRAGMENT;
690 		tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
691 		if (!tmp_hdr) {
692 			err = -ENOMEM;
693 			goto fail;
694 		}
695 		frag = skb_shinfo(skb)->frag_list;
696 		skb_frag_list_init(skb);
697 
698 		__skb_pull(skb, hlen);
699 		fh = __skb_push(skb, sizeof(struct frag_hdr));
700 		__skb_push(skb, hlen);
701 		skb_reset_network_header(skb);
702 		memcpy(skb_network_header(skb), tmp_hdr, hlen);
703 
704 		fh->nexthdr = nexthdr;
705 		fh->reserved = 0;
706 		fh->frag_off = htons(IP6_MF);
707 		fh->identification = frag_id;
708 
709 		first_len = skb_pagelen(skb);
710 		skb->data_len = first_len - skb_headlen(skb);
711 		skb->len = first_len;
712 		ipv6_hdr(skb)->payload_len = htons(first_len -
713 						   sizeof(struct ipv6hdr));
714 
715 		for (;;) {
716 			/* Prepare the header of the next frame
717 			 * before the previous one goes down. */
718 			if (frag) {
719 				frag->ip_summed = CHECKSUM_NONE;
720 				skb_reset_transport_header(frag);
721 				fh = __skb_push(frag, sizeof(struct frag_hdr));
722 				__skb_push(frag, hlen);
723 				skb_reset_network_header(frag);
724 				memcpy(skb_network_header(frag), tmp_hdr,
725 				       hlen);
726 				offset += skb->len - hlen - sizeof(struct frag_hdr);
727 				fh->nexthdr = nexthdr;
728 				fh->reserved = 0;
729 				fh->frag_off = htons(offset);
730 				if (frag->next)
731 					fh->frag_off |= htons(IP6_MF);
732 				fh->identification = frag_id;
733 				ipv6_hdr(frag)->payload_len =
734 						htons(frag->len -
735 						      sizeof(struct ipv6hdr));
736 				ip6_copy_metadata(frag, skb);
737 			}
738 
739 			err = output(net, sk, skb);
740 			if (!err)
741 				IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
742 					      IPSTATS_MIB_FRAGCREATES);
743 
744 			if (err || !frag)
745 				break;
746 
747 			skb = frag;
748 			frag = skb->next;
749 			skb->next = NULL;
750 		}
751 
752 		kfree(tmp_hdr);
753 
754 		if (err == 0) {
755 			IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
756 				      IPSTATS_MIB_FRAGOKS);
757 			return 0;
758 		}
759 
760 		kfree_skb_list(frag);
761 
762 		IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
763 			      IPSTATS_MIB_FRAGFAILS);
764 		return err;
765 
766 slow_path_clean:
767 		skb_walk_frags(skb, frag2) {
768 			if (frag2 == frag)
769 				break;
770 			frag2->sk = NULL;
771 			frag2->destructor = NULL;
772 			skb->truesize += frag2->truesize;
773 		}
774 	}
775 
776 slow_path:
777 	left = skb->len - hlen;		/* Space per frame */
778 	ptr = hlen;			/* Where to start from */
779 
780 	/*
781 	 *	Fragment the datagram.
782 	 */
783 
784 	troom = rt->dst.dev->needed_tailroom;
785 
786 	/*
787 	 *	Keep copying data until we run out.
788 	 */
789 	while (left > 0)	{
790 		u8 *fragnexthdr_offset;
791 
792 		len = left;
793 		/* IF: it doesn't fit, use 'mtu' - the data space left */
794 		if (len > mtu)
795 			len = mtu;
796 		/* IF: we are not sending up to and including the packet end,
797 		   then align the next start on an eight-byte boundary */
798 		if (len < left)	{
799 			len &= ~7;
800 		}
801 
802 		/* Allocate buffer */
803 		frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
804 				 hroom + troom, GFP_ATOMIC);
805 		if (!frag) {
806 			err = -ENOMEM;
807 			goto fail;
808 		}
809 
810 		/*
811 		 *	Set up data on packet
812 		 */
813 
814 		ip6_copy_metadata(frag, skb);
815 		skb_reserve(frag, hroom);
816 		skb_put(frag, len + hlen + sizeof(struct frag_hdr));
817 		skb_reset_network_header(frag);
818 		fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
819 		frag->transport_header = (frag->network_header + hlen +
820 					  sizeof(struct frag_hdr));
821 
822 		/*
823 		 *	Charge the memory for the fragment to any owner
824 		 *	it might possess
825 		 */
826 		if (skb->sk)
827 			skb_set_owner_w(frag, skb->sk);
828 
829 		/*
830 		 *	Copy the packet header into the new buffer.
831 		 */
832 		skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
833 
834 		fragnexthdr_offset = skb_network_header(frag);
835 		fragnexthdr_offset += prevhdr - skb_network_header(skb);
836 		*fragnexthdr_offset = NEXTHDR_FRAGMENT;
837 
838 		/*
839 		 *	Build fragment header.
840 		 */
841 		fh->nexthdr = nexthdr;
842 		fh->reserved = 0;
843 		fh->identification = frag_id;
844 
845 		/*
846 		 *	Copy a block of the IP datagram.
847 		 */
848 		BUG_ON(skb_copy_bits(skb, ptr, skb_transport_header(frag),
849 				     len));
850 		left -= len;
851 
852 		fh->frag_off = htons(offset);
853 		if (left > 0)
854 			fh->frag_off |= htons(IP6_MF);
855 		ipv6_hdr(frag)->payload_len = htons(frag->len -
856 						    sizeof(struct ipv6hdr));
857 
858 		ptr += len;
859 		offset += len;
860 
861 		/*
862 		 *	Put this fragment into the sending queue.
863 		 */
864 		err = output(net, sk, frag);
865 		if (err)
866 			goto fail;
867 
868 		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
869 			      IPSTATS_MIB_FRAGCREATES);
870 	}
871 	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
872 		      IPSTATS_MIB_FRAGOKS);
873 	consume_skb(skb);
874 	return err;
875 
876 fail_toobig:
877 	if (skb->sk && dst_allfrag(skb_dst(skb)))
878 		sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);
879 
880 	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
881 	err = -EMSGSIZE;
882 
883 fail:
884 	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
885 		      IPSTATS_MIB_FRAGFAILS);
886 	kfree_skb(skb);
887 	return err;
888 }
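
/*
 * Editor's worked example, not part of this file: the slow path above
 * subtracts the unfragmentable part and the fragment header from the
 * MTU, then rounds every non-final fragment down to a multiple of 8.
 * With a 1500 byte MTU and a bare 40 byte IPv6 header that leaves
 * 1500 - (40 + 8) = 1452 usable bytes, i.e. 1448 bytes of payload per
 * non-final fragment.  The helper name is hypothetical.
 */
static inline unsigned int example_frag_payload(unsigned int mtu,
						unsigned int hlen)
{
	/* The unfragmentable part and the fragment header are not payload. */
	unsigned int len = mtu - hlen - sizeof(struct frag_hdr);

	/* Every fragment but the last must carry a multiple of 8 bytes. */
	return len & ~7U;
}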
889 
890 static inline int ip6_rt_check(const struct rt6key *rt_key,
891 			       const struct in6_addr *fl_addr,
892 			       const struct in6_addr *addr_cache)
893 {
894 	return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
895 		(!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
896 }
897 
898 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
899 					  struct dst_entry *dst,
900 					  const struct flowi6 *fl6)
901 {
902 	struct ipv6_pinfo *np = inet6_sk(sk);
903 	struct rt6_info *rt;
904 
905 	if (!dst)
906 		goto out;
907 
908 	if (dst->ops->family != AF_INET6) {
909 		dst_release(dst);
910 		return NULL;
911 	}
912 
913 	rt = (struct rt6_info *)dst;
914 	/* Yes, checking route validity in the non-connected
915 	 * case is not very simple. Take into account
916 	 * that we do not support routing by source, TOS,
917 	 * or MSG_DONTROUTE		--ANK (980726)
918 	 *
919 	 * 1. ip6_rt_check(): If the route was a host route,
920 	 *    check that the cached destination is current.
921 	 *    If it is a network route, we still may
922 	 *    check its validity using a saved pointer
923 	 *    to the last used address: daddr_cache.
924 	 *    We do not want to save the whole address now
925 	 *    (because the main consumer of this service
926 	 *    is TCP, which does not have this problem),
927 	 *    so this last trick works only on connected
928 	 *    sockets.
929 	 * 2. oif also should be the same.
930 	 */
931 	if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
932 #ifdef CONFIG_IPV6_SUBTREES
933 	    ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
934 #endif
935 	   (!(fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) &&
936 	      (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex))) {
937 		dst_release(dst);
938 		dst = NULL;
939 	}
940 
941 out:
942 	return dst;
943 }
944 
945 static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
946 			       struct dst_entry **dst, struct flowi6 *fl6)
947 {
948 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
949 	struct neighbour *n;
950 	struct rt6_info *rt;
951 #endif
952 	int err;
953 	int flags = 0;
954 
955 	/* The correct way to handle this would be to do
956 	 * ip6_route_get_saddr, and then ip6_route_output; however,
957 	 * the route-specific preferred source forces the
958 	 * ip6_route_output call _before_ ip6_route_get_saddr.
959 	 *
960 	 * In source specific routing (no src=any default route),
961 	 * ip6_route_output will fail given src=any saddr, though, so
962 	 * that's why we try it again later.
963 	 */
964 	if (ipv6_addr_any(&fl6->saddr) && (!*dst || !(*dst)->error)) {
965 		struct rt6_info *rt;
966 		bool had_dst = *dst != NULL;
967 
968 		if (!had_dst)
969 			*dst = ip6_route_output(net, sk, fl6);
970 		rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;
971 		err = ip6_route_get_saddr(net, rt, &fl6->daddr,
972 					  sk ? inet6_sk(sk)->srcprefs : 0,
973 					  &fl6->saddr);
974 		if (err)
975 			goto out_err_release;
976 
977 		/* If we had an erroneous initial result, pretend it
978 		 * never existed and let the SA-enabled version take
979 		 * over.
980 		 */
981 		if (!had_dst && (*dst)->error) {
982 			dst_release(*dst);
983 			*dst = NULL;
984 		}
985 
986 		if (fl6->flowi6_oif)
987 			flags |= RT6_LOOKUP_F_IFACE;
988 	}
989 
990 	if (!*dst)
991 		*dst = ip6_route_output_flags(net, sk, fl6, flags);
992 
993 	err = (*dst)->error;
994 	if (err)
995 		goto out_err_release;
996 
997 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
998 	/*
999 	 * Here, if the dst entry we've looked up
1000 	 * has a neighbour entry that is in the INCOMPLETE
1001 	 * state and the src address from the flow is
1002 	 * marked as OPTIMISTIC, we release the found
1003 	 * dst entry and replace it with the
1004 	 * dst entry of the nexthop router.
1005 	 */
1006 	rt = (struct rt6_info *) *dst;
1007 	rcu_read_lock_bh();
1008 	n = __ipv6_neigh_lookup_noref(rt->dst.dev,
1009 				      rt6_nexthop(rt, &fl6->daddr));
1010 	err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
1011 	rcu_read_unlock_bh();
1012 
1013 	if (err) {
1014 		struct inet6_ifaddr *ifp;
1015 		struct flowi6 fl_gw6;
1016 		int redirect;
1017 
1018 		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
1019 				      (*dst)->dev, 1);
1020 
1021 		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
1022 		if (ifp)
1023 			in6_ifa_put(ifp);
1024 
1025 		if (redirect) {
1026 			/*
1027 			 * We need to get the dst entry for the
1028 			 * default router instead
1029 			 */
1030 			dst_release(*dst);
1031 			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
1032 			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
1033 			*dst = ip6_route_output(net, sk, &fl_gw6);
1034 			err = (*dst)->error;
1035 			if (err)
1036 				goto out_err_release;
1037 		}
1038 	}
1039 #endif
1040 	if (ipv6_addr_v4mapped(&fl6->saddr) &&
1041 	    !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
1042 		err = -EAFNOSUPPORT;
1043 		goto out_err_release;
1044 	}
1045 
1046 	return 0;
1047 
1048 out_err_release:
1049 	dst_release(*dst);
1050 	*dst = NULL;
1051 
1052 	if (err == -ENETUNREACH)
1053 		IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
1054 	return err;
1055 }
1056 
1057 /**
1058  *	ip6_dst_lookup - perform route lookup on flow
1059  *	@sk: socket which provides route info
1060  *	@dst: pointer to dst_entry * for result
1061  *	@fl6: flow to lookup
1062  *
1063  *	This function performs a route lookup on the given flow.
1064  *
1065  *	It returns zero on success, or a standard errno code on error.
1066  */
1067 int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
1068 		   struct flowi6 *fl6)
1069 {
1070 	*dst = NULL;
1071 	return ip6_dst_lookup_tail(net, sk, dst, fl6);
1072 }
1073 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1074 
1075 /**
1076  *	ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1077  *	@sk: socket which provides route info
1078  *	@fl6: flow to lookup
1079  *	@final_dst: final destination address for ipsec lookup
1080  *
1081  *	This function performs a route lookup on the given flow.
1082  *
1083  *	It returns a valid dst pointer on success, or a pointer encoded
1084  *	error code.
1085  */
1086 struct dst_entry *ip6_dst_lookup_flow(const struct sock *sk, struct flowi6 *fl6,
1087 				      const struct in6_addr *final_dst)
1088 {
1089 	struct dst_entry *dst = NULL;
1090 	int err;
1091 
1092 	err = ip6_dst_lookup_tail(sock_net(sk), sk, &dst, fl6);
1093 	if (err)
1094 		return ERR_PTR(err);
1095 	if (final_dst)
1096 		fl6->daddr = *final_dst;
1097 
1098 	return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1099 }
1100 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
1101 
1102 /**
1103  *	ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1104  *	@sk: socket which provides the dst cache and route info
1105  *	@fl6: flow to lookup
1106  *	@final_dst: final destination address for ipsec lookup
1107  *
1108  *	This function performs a route lookup on the given flow with the
1109  *	possibility of using the cached route in the socket if it is valid.
1110  *	It will take the socket dst lock when operating on the dst cache.
1111  *	As a result, this function can only be used in process context.
1112  *
1113  *	It returns a valid dst pointer on success, or a pointer encoded
1114  *	error code.
1115  */
1116 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1117 					 const struct in6_addr *final_dst)
1118 {
1119 	struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1120 
1121 	dst = ip6_sk_dst_check(sk, dst, fl6);
1122 	if (!dst)
1123 		dst = ip6_dst_lookup_flow(sk, fl6, final_dst);
1124 
1125 	return dst;
1126 }
1127 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
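
/*
 * Editor's illustrative sketch, not part of this file: a connected
 * datagram socket would normally resolve its route through the cached
 * lookup above, falling back to the full (IPsec aware) lookup when the
 * cached dst is stale.  Only functions defined in this file are used;
 * the helper name is hypothetical.
 */
static inline struct dst_entry *example_datagram_route(struct sock *sk,
							struct flowi6 *fl6)
{
	/* Cached route first; on a miss this falls through to the full
	 * lookup.  Errors come back as an ERR_PTR() value.
	 */
	return ip6_sk_dst_lookup_flow(sk, fl6, &fl6->daddr);
}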
1128 
1129 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1130 					       gfp_t gfp)
1131 {
1132 	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1133 }
1134 
1135 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1136 						gfp_t gfp)
1137 {
1138 	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1139 }
1140 
1141 static void ip6_append_data_mtu(unsigned int *mtu,
1142 				int *maxfraglen,
1143 				unsigned int fragheaderlen,
1144 				struct sk_buff *skb,
1145 				struct rt6_info *rt,
1146 				unsigned int orig_mtu)
1147 {
1148 	if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1149 		if (!skb) {
1150 			/* first fragment, reserve header_len */
1151 			*mtu = orig_mtu - rt->dst.header_len;
1152 
1153 		} else {
1154 			/*
1155 			 * this fragment is not the first; the header
1156 			 * space is regarded as data space.
1157 			 */
1158 			*mtu = orig_mtu;
1159 		}
1160 		*maxfraglen = ((*mtu - fragheaderlen) & ~7)
1161 			      + fragheaderlen - sizeof(struct frag_hdr);
1162 	}
1163 }
1164 
1165 static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
1166 			  struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
1167 			  struct rt6_info *rt, struct flowi6 *fl6)
1168 {
1169 	struct ipv6_pinfo *np = inet6_sk(sk);
1170 	unsigned int mtu;
1171 	struct ipv6_txoptions *opt = ipc6->opt;
1172 
1173 	/*
1174 	 * setup for corking
1175 	 */
1176 	if (opt) {
1177 		if (WARN_ON(v6_cork->opt))
1178 			return -EINVAL;
1179 
1180 		v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
1181 		if (unlikely(!v6_cork->opt))
1182 			return -ENOBUFS;
1183 
1184 		v6_cork->opt->tot_len = sizeof(*opt);
1185 		v6_cork->opt->opt_flen = opt->opt_flen;
1186 		v6_cork->opt->opt_nflen = opt->opt_nflen;
1187 
1188 		v6_cork->opt->dst0opt = ip6_opt_dup(opt->dst0opt,
1189 						    sk->sk_allocation);
1190 		if (opt->dst0opt && !v6_cork->opt->dst0opt)
1191 			return -ENOBUFS;
1192 
1193 		v6_cork->opt->dst1opt = ip6_opt_dup(opt->dst1opt,
1194 						    sk->sk_allocation);
1195 		if (opt->dst1opt && !v6_cork->opt->dst1opt)
1196 			return -ENOBUFS;
1197 
1198 		v6_cork->opt->hopopt = ip6_opt_dup(opt->hopopt,
1199 						   sk->sk_allocation);
1200 		if (opt->hopopt && !v6_cork->opt->hopopt)
1201 			return -ENOBUFS;
1202 
1203 		v6_cork->opt->srcrt = ip6_rthdr_dup(opt->srcrt,
1204 						    sk->sk_allocation);
1205 		if (opt->srcrt && !v6_cork->opt->srcrt)
1206 			return -ENOBUFS;
1207 
1208 		/* need source address above --miyazawa */
1209 	}
1210 	dst_hold(&rt->dst);
1211 	cork->base.dst = &rt->dst;
1212 	cork->fl.u.ip6 = *fl6;
1213 	v6_cork->hop_limit = ipc6->hlimit;
1214 	v6_cork->tclass = ipc6->tclass;
1215 	if (rt->dst.flags & DST_XFRM_TUNNEL)
1216 		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1217 		      rt->dst.dev->mtu : dst_mtu(&rt->dst);
1218 	else
1219 		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1220 		      rt->dst.dev->mtu : dst_mtu(xfrm_dst_path(&rt->dst));
1221 	if (np->frag_size < mtu) {
1222 		if (np->frag_size)
1223 			mtu = np->frag_size;
1224 	}
1225 	cork->base.fragsize = mtu;
1226 	if (dst_allfrag(xfrm_dst_path(&rt->dst)))
1227 		cork->base.flags |= IPCORK_ALLFRAG;
1228 	cork->base.length = 0;
1229 
1230 	return 0;
1231 }
1232 
1233 static int __ip6_append_data(struct sock *sk,
1234 			     struct flowi6 *fl6,
1235 			     struct sk_buff_head *queue,
1236 			     struct inet_cork *cork,
1237 			     struct inet6_cork *v6_cork,
1238 			     struct page_frag *pfrag,
1239 			     int getfrag(void *from, char *to, int offset,
1240 					 int len, int odd, struct sk_buff *skb),
1241 			     void *from, int length, int transhdrlen,
1242 			     unsigned int flags, struct ipcm6_cookie *ipc6,
1243 			     const struct sockcm_cookie *sockc)
1244 {
1245 	struct sk_buff *skb, *skb_prev = NULL;
1246 	unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu;
1247 	int exthdrlen = 0;
1248 	int dst_exthdrlen = 0;
1249 	int hh_len;
1250 	int copy;
1251 	int err;
1252 	int offset = 0;
1253 	__u8 tx_flags = 0;
1254 	u32 tskey = 0;
1255 	struct rt6_info *rt = (struct rt6_info *)cork->dst;
1256 	struct ipv6_txoptions *opt = v6_cork->opt;
1257 	int csummode = CHECKSUM_NONE;
1258 	unsigned int maxnonfragsize, headersize;
1259 
1260 	skb = skb_peek_tail(queue);
1261 	if (!skb) {
1262 		exthdrlen = opt ? opt->opt_flen : 0;
1263 		dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
1264 	}
1265 
1266 	mtu = cork->fragsize;
1267 	orig_mtu = mtu;
1268 
1269 	hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1270 
1271 	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1272 			(opt ? opt->opt_nflen : 0);
1273 	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
1274 		     sizeof(struct frag_hdr);
1275 
1276 	headersize = sizeof(struct ipv6hdr) +
1277 		     (opt ? opt->opt_flen + opt->opt_nflen : 0) +
1278 		     (dst_allfrag(&rt->dst) ?
1279 		      sizeof(struct frag_hdr) : 0) +
1280 		     rt->rt6i_nfheader_len;
1281 
1282 	if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
1283 	    (sk->sk_protocol == IPPROTO_UDP ||
1284 	     sk->sk_protocol == IPPROTO_RAW)) {
1285 		ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
1286 				sizeof(struct ipv6hdr));
1287 		goto emsgsize;
1288 	}
1289 
1290 	if (ip6_sk_ignore_df(sk))
1291 		maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
1292 	else
1293 		maxnonfragsize = mtu;
1294 
1295 	if (cork->length + length > maxnonfragsize - headersize) {
1296 emsgsize:
1297 		ipv6_local_error(sk, EMSGSIZE, fl6,
1298 				 mtu - headersize +
1299 				 sizeof(struct ipv6hdr));
1300 		return -EMSGSIZE;
1301 	}
1302 
1303 	/* CHECKSUM_PARTIAL only with no extension headers and when
1304 	 * we are not going to fragment
1305 	 */
1306 	if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
1307 	    headersize == sizeof(struct ipv6hdr) &&
1308 	    length <= mtu - headersize &&
1309 	    !(flags & MSG_MORE) &&
1310 	    rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
1311 		csummode = CHECKSUM_PARTIAL;
1312 
1313 	if (sk->sk_type == SOCK_DGRAM || sk->sk_type == SOCK_RAW) {
1314 		sock_tx_timestamp(sk, sockc->tsflags, &tx_flags);
1315 		if (tx_flags & SKBTX_ANY_SW_TSTAMP &&
1316 		    sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
1317 			tskey = sk->sk_tskey++;
1318 	}
1319 
1320 	/*
1321 	 * Let's try using as much space as possible.
1322 	 * Use MTU if total length of the message fits into the MTU.
1323 	 * Otherwise, we need to reserve fragment header and
1324 	 * fragment alignment (= 8-15 octets, in total).
1325 	 *
1326 	 * Note that we may need to "move" the data from the tail
1327 	 * of the buffer to the new fragment when we split
1328 	 * the message.
1329 	 *
1330 	 * FIXME: It may be fragmented into multiple chunks
1331 	 *        at once if non-fragmentable extension headers
1332 	 *        are too large.
1333 	 * --yoshfuji
1334 	 */
1335 
1336 	cork->length += length;
1337 	if (!skb)
1338 		goto alloc_new_skb;
1339 
1340 	while (length > 0) {
1341 		/* Check if the remaining data fits into current packet. */
1342 		copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1343 		if (copy < length)
1344 			copy = maxfraglen - skb->len;
1345 
1346 		if (copy <= 0) {
1347 			char *data;
1348 			unsigned int datalen;
1349 			unsigned int fraglen;
1350 			unsigned int fraggap;
1351 			unsigned int alloclen;
1352 alloc_new_skb:
1353 			/* There's no room in the current skb */
1354 			if (skb)
1355 				fraggap = skb->len - maxfraglen;
1356 			else
1357 				fraggap = 0;
1358 			/* update mtu and maxfraglen if necessary */
1359 			if (!skb || !skb_prev)
1360 				ip6_append_data_mtu(&mtu, &maxfraglen,
1361 						    fragheaderlen, skb, rt,
1362 						    orig_mtu);
1363 
1364 			skb_prev = skb;
1365 
1366 			/*
1367 			 * If remaining data exceeds the mtu,
1368 			 * we know we need more fragment(s).
1369 			 */
1370 			datalen = length + fraggap;
1371 
1372 			if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1373 				datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
1374 			if ((flags & MSG_MORE) &&
1375 			    !(rt->dst.dev->features&NETIF_F_SG))
1376 				alloclen = mtu;
1377 			else
1378 				alloclen = datalen + fragheaderlen;
1379 
1380 			alloclen += dst_exthdrlen;
1381 
1382 			if (datalen != length + fraggap) {
1383 				/*
1384 				 * this is not the last fragment, the trailer
1385 				 * space is regarded as data space.
1386 				 */
1387 				datalen += rt->dst.trailer_len;
1388 			}
1389 
1390 			alloclen += rt->dst.trailer_len;
1391 			fraglen = datalen + fragheaderlen;
1392 
1393 			/*
1394 			 * We just reserve space for fragment header.
1395 			 * Note: this may be overallocation if the message
1396 			 * (without MSG_MORE) fits into the MTU.
1397 			 */
1398 			alloclen += sizeof(struct frag_hdr);
1399 
1400 			copy = datalen - transhdrlen - fraggap;
1401 			if (copy < 0) {
1402 				err = -EINVAL;
1403 				goto error;
1404 			}
1405 			if (transhdrlen) {
1406 				skb = sock_alloc_send_skb(sk,
1407 						alloclen + hh_len,
1408 						(flags & MSG_DONTWAIT), &err);
1409 			} else {
1410 				skb = NULL;
1411 				if (refcount_read(&sk->sk_wmem_alloc) <=
1412 				    2 * sk->sk_sndbuf)
1413 					skb = sock_wmalloc(sk,
1414 							   alloclen + hh_len, 1,
1415 							   sk->sk_allocation);
1416 				if (unlikely(!skb))
1417 					err = -ENOBUFS;
1418 			}
1419 			if (!skb)
1420 				goto error;
1421 			/*
1422 			 *	Fill in the control structures
1423 			 */
1424 			skb->protocol = htons(ETH_P_IPV6);
1425 			skb->ip_summed = csummode;
1426 			skb->csum = 0;
1427 			/* reserve for fragmentation and ipsec header */
1428 			skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1429 				    dst_exthdrlen);
1430 
1431 			/* Only the initial fragment is time stamped */
1432 			skb_shinfo(skb)->tx_flags = tx_flags;
1433 			tx_flags = 0;
1434 			skb_shinfo(skb)->tskey = tskey;
1435 			tskey = 0;
1436 
1437 			/*
1438 			 *	Find where to start putting bytes
1439 			 */
1440 			data = skb_put(skb, fraglen);
1441 			skb_set_network_header(skb, exthdrlen);
1442 			data += fragheaderlen;
1443 			skb->transport_header = (skb->network_header +
1444 						 fragheaderlen);
1445 			if (fraggap) {
1446 				skb->csum = skb_copy_and_csum_bits(
1447 					skb_prev, maxfraglen,
1448 					data + transhdrlen, fraggap, 0);
1449 				skb_prev->csum = csum_sub(skb_prev->csum,
1450 							  skb->csum);
1451 				data += fraggap;
1452 				pskb_trim_unique(skb_prev, maxfraglen);
1453 			}
1454 			if (copy > 0 &&
1455 			    getfrag(from, data + transhdrlen, offset,
1456 				    copy, fraggap, skb) < 0) {
1457 				err = -EFAULT;
1458 				kfree_skb(skb);
1459 				goto error;
1460 			}
1461 
1462 			offset += copy;
1463 			length -= datalen - fraggap;
1464 			transhdrlen = 0;
1465 			exthdrlen = 0;
1466 			dst_exthdrlen = 0;
1467 
1468 			if ((flags & MSG_CONFIRM) && !skb_prev)
1469 				skb_set_dst_pending_confirm(skb, 1);
1470 
1471 			/*
1472 			 * Put the packet on the pending queue
1473 			 */
1474 			__skb_queue_tail(queue, skb);
1475 			continue;
1476 		}
1477 
1478 		if (copy > length)
1479 			copy = length;
1480 
1481 		if (!(rt->dst.dev->features&NETIF_F_SG)) {
1482 			unsigned int off;
1483 
1484 			off = skb->len;
1485 			if (getfrag(from, skb_put(skb, copy),
1486 						offset, copy, off, skb) < 0) {
1487 				__skb_trim(skb, off);
1488 				err = -EFAULT;
1489 				goto error;
1490 			}
1491 		} else {
1492 			int i = skb_shinfo(skb)->nr_frags;
1493 
1494 			err = -ENOMEM;
1495 			if (!sk_page_frag_refill(sk, pfrag))
1496 				goto error;
1497 
1498 			if (!skb_can_coalesce(skb, i, pfrag->page,
1499 					      pfrag->offset)) {
1500 				err = -EMSGSIZE;
1501 				if (i == MAX_SKB_FRAGS)
1502 					goto error;
1503 
1504 				__skb_fill_page_desc(skb, i, pfrag->page,
1505 						     pfrag->offset, 0);
1506 				skb_shinfo(skb)->nr_frags = ++i;
1507 				get_page(pfrag->page);
1508 			}
1509 			copy = min_t(int, copy, pfrag->size - pfrag->offset);
1510 			if (getfrag(from,
1511 				    page_address(pfrag->page) + pfrag->offset,
1512 				    offset, copy, skb->len, skb) < 0)
1513 				goto error_efault;
1514 
1515 			pfrag->offset += copy;
1516 			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1517 			skb->len += copy;
1518 			skb->data_len += copy;
1519 			skb->truesize += copy;
1520 			refcount_add(copy, &sk->sk_wmem_alloc);
1521 		}
1522 		offset += copy;
1523 		length -= copy;
1524 	}
1525 
1526 	return 0;
1527 
1528 error_efault:
1529 	err = -EFAULT;
1530 error:
1531 	cork->length -= length;
1532 	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1533 	return err;
1534 }
1535 
1536 int ip6_append_data(struct sock *sk,
1537 		    int getfrag(void *from, char *to, int offset, int len,
1538 				int odd, struct sk_buff *skb),
1539 		    void *from, int length, int transhdrlen,
1540 		    struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1541 		    struct rt6_info *rt, unsigned int flags,
1542 		    const struct sockcm_cookie *sockc)
1543 {
1544 	struct inet_sock *inet = inet_sk(sk);
1545 	struct ipv6_pinfo *np = inet6_sk(sk);
1546 	int exthdrlen;
1547 	int err;
1548 
1549 	if (flags&MSG_PROBE)
1550 		return 0;
1551 	if (skb_queue_empty(&sk->sk_write_queue)) {
1552 		/*
1553 		 * setup for corking
1554 		 */
1555 		err = ip6_setup_cork(sk, &inet->cork, &np->cork,
1556 				     ipc6, rt, fl6);
1557 		if (err)
1558 			return err;
1559 
1560 		exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1561 		length += exthdrlen;
1562 		transhdrlen += exthdrlen;
1563 	} else {
1564 		fl6 = &inet->cork.fl.u.ip6;
1565 		transhdrlen = 0;
1566 	}
1567 
1568 	return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base,
1569 				 &np->cork, sk_page_frag(sk), getfrag,
1570 				 from, length, transhdrlen, flags, ipc6, sockc);
1571 }
1572 EXPORT_SYMBOL_GPL(ip6_append_data);
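
/*
 * Editor's illustrative sketch, not part of this file: the corked
 * datagram path appends payload under the socket lock and then either
 * pushes or flushes the pending queue.  ip6_push_pending_frames() and
 * ip6_flush_pending_frames() are declared in the IPv6 headers and
 * defined later in this file; the helper name and the argument
 * plumbing are assumptions made for the example.
 */
static inline int example_corked_send(struct sock *sk,
				      int getfrag(void *from, char *to,
						  int offset, int len,
						  int odd, struct sk_buff *skb),
				      void *from, int length,
				      struct ipcm6_cookie *ipc6,
				      struct flowi6 *fl6, struct rt6_info *rt,
				      unsigned int flags,
				      const struct sockcm_cookie *sockc)
{
	int err;

	lock_sock(sk);
	err = ip6_append_data(sk, getfrag, from, length, 0, ipc6, fl6, rt,
			      flags, sockc);
	if (err)
		ip6_flush_pending_frames(sk);
	else if (!(flags & MSG_MORE))
		err = ip6_push_pending_frames(sk);
	release_sock(sk);

	return err;
}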
1573 
1574 static void ip6_cork_release(struct inet_cork_full *cork,
1575 			     struct inet6_cork *v6_cork)
1576 {
1577 	if (v6_cork->opt) {
1578 		kfree(v6_cork->opt->dst0opt);
1579 		kfree(v6_cork->opt->dst1opt);
1580 		kfree(v6_cork->opt->hopopt);
1581 		kfree(v6_cork->opt->srcrt);
1582 		kfree(v6_cork->opt);
1583 		v6_cork->opt = NULL;
1584 	}
1585 
1586 	if (cork->base.dst) {
1587 		dst_release(cork->base.dst);
1588 		cork->base.dst = NULL;
1589 		cork->base.flags &= ~IPCORK_ALLFRAG;
1590 	}
1591 	memset(&cork->fl, 0, sizeof(cork->fl));
1592 }
1593 
1594 struct sk_buff *__ip6_make_skb(struct sock *sk,
1595 			       struct sk_buff_head *queue,
1596 			       struct inet_cork_full *cork,
1597 			       struct inet6_cork *v6_cork)
1598 {
1599 	struct sk_buff *skb, *tmp_skb;
1600 	struct sk_buff **tail_skb;
1601 	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1602 	struct ipv6_pinfo *np = inet6_sk(sk);
1603 	struct net *net = sock_net(sk);
1604 	struct ipv6hdr *hdr;
1605 	struct ipv6_txoptions *opt = v6_cork->opt;
1606 	struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
1607 	struct flowi6 *fl6 = &cork->fl.u.ip6;
1608 	unsigned char proto = fl6->flowi6_proto;
1609 
1610 	skb = __skb_dequeue(queue);
1611 	if (!skb)
1612 		goto out;
1613 	tail_skb = &(skb_shinfo(skb)->frag_list);
1614 
1615 	/* move skb->data to ip header from ext header */
1616 	if (skb->data < skb_network_header(skb))
1617 		__skb_pull(skb, skb_network_offset(skb));
1618 	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
1619 		__skb_pull(tmp_skb, skb_network_header_len(skb));
1620 		*tail_skb = tmp_skb;
1621 		tail_skb = &(tmp_skb->next);
1622 		skb->len += tmp_skb->len;
1623 		skb->data_len += tmp_skb->len;
1624 		skb->truesize += tmp_skb->truesize;
1625 		tmp_skb->destructor = NULL;
1626 		tmp_skb->sk = NULL;
1627 	}
1628 
1629 	/* Allow local fragmentation. */
1630 	skb->ignore_df = ip6_sk_ignore_df(sk);
1631 
1632 	*final_dst = fl6->daddr;
1633 	__skb_pull(skb, skb_network_header_len(skb));
1634 	if (opt && opt->opt_flen)
1635 		ipv6_push_frag_opts(skb, opt, &proto);
1636 	if (opt && opt->opt_nflen)
1637 		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);
1638 
1639 	skb_push(skb, sizeof(struct ipv6hdr));
1640 	skb_reset_network_header(skb);
1641 	hdr = ipv6_hdr(skb);
1642 
1643 	ip6_flow_hdr(hdr, v6_cork->tclass,
1644 		     ip6_make_flowlabel(net, skb, fl6->flowlabel,
1645 					ip6_autoflowlabel(net, np), fl6));
1646 	hdr->hop_limit = v6_cork->hop_limit;
1647 	hdr->nexthdr = proto;
1648 	hdr->saddr = fl6->saddr;
1649 	hdr->daddr = *final_dst;
1650 
1651 	skb->priority = sk->sk_priority;
1652 	skb->mark = sk->sk_mark;
1653 
1654 	skb_dst_set(skb, dst_clone(&rt->dst));
1655 	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1656 	if (proto == IPPROTO_ICMPV6) {
1657 		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1658 
1659 		ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
1660 		ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
1661 	}
1662 
1663 	ip6_cork_release(cork, v6_cork);
1664 out:
1665 	return skb;
1666 }
1667 
1668 int ip6_send_skb(struct sk_buff *skb)
1669 {
1670 	struct net *net = sock_net(skb->sk);
1671 	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
1672 	int err;
1673 
1674 	err = ip6_local_out(net, skb->sk, skb);
1675 	if (err) {
1676 		if (err > 0)
1677 			err = net_xmit_errno(err);
1678 		if (err)
1679 			IP6_INC_STATS(net, rt->rt6i_idev,
1680 				      IPSTATS_MIB_OUTDISCARDS);
1681 	}
1682 
1683 	return err;
1684 }
1685 
1686 int ip6_push_pending_frames(struct sock *sk)
1687 {
1688 	struct sk_buff *skb;
1689 
1690 	skb = ip6_finish_skb(sk);
1691 	if (!skb)
1692 		return 0;
1693 
1694 	return ip6_send_skb(skb);
1695 }
1696 EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
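
/*
 * Editor's illustrative sketch, not part of this file: the uncorked
 * fast path builds the whole datagram in one call and sends it without
 * touching sk->sk_write_queue.  ip6_make_skb() is defined at the end of
 * this file and declared in the IPv6 headers; it returns NULL for
 * MSG_PROBE and an ERR_PTR() on failure.  The helper name is
 * hypothetical.
 */
static inline int example_uncorked_send(struct sock *sk,
					int getfrag(void *from, char *to,
						    int offset, int len,
						    int odd, struct sk_buff *skb),
					void *from, int length,
					struct ipcm6_cookie *ipc6,
					struct flowi6 *fl6, struct rt6_info *rt,
					unsigned int flags,
					const struct sockcm_cookie *sockc)
{
	struct sk_buff *skb;

	skb = ip6_make_skb(sk, getfrag, from, length, 0, ipc6, fl6, rt,
			   flags, sockc);
	if (!skb)
		return 0;
	if (IS_ERR(skb))
		return PTR_ERR(skb);

	return ip6_send_skb(skb);
}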
1697 
1698 static void __ip6_flush_pending_frames(struct sock *sk,
1699 				       struct sk_buff_head *queue,
1700 				       struct inet_cork_full *cork,
1701 				       struct inet6_cork *v6_cork)
1702 {
1703 	struct sk_buff *skb;
1704 
1705 	while ((skb = __skb_dequeue_tail(queue)) != NULL) {
1706 		if (skb_dst(skb))
1707 			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1708 				      IPSTATS_MIB_OUTDISCARDS);
1709 		kfree_skb(skb);
1710 	}
1711 
1712 	ip6_cork_release(cork, v6_cork);
1713 }
1714 
1715 void ip6_flush_pending_frames(struct sock *sk)
1716 {
1717 	__ip6_flush_pending_frames(sk, &sk->sk_write_queue,
1718 				   &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
1719 }
1720 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
1721 
1722 struct sk_buff *ip6_make_skb(struct sock *sk,
1723 			     int getfrag(void *from, char *to, int offset,
1724 					 int len, int odd, struct sk_buff *skb),
1725 			     void *from, int length, int transhdrlen,
1726 			     struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1727 			     struct rt6_info *rt, unsigned int flags,
1728 			     const struct sockcm_cookie *sockc)
1729 {
1730 	struct inet_cork_full cork;
1731 	struct inet6_cork v6_cork;
1732 	struct sk_buff_head queue;
1733 	int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1734 	int err;
1735 
1736 	if (flags & MSG_PROBE)
1737 		return NULL;
1738 
1739 	__skb_queue_head_init(&queue);
1740 
1741 	cork.base.flags = 0;
1742 	cork.base.addr = 0;
1743 	cork.base.opt = NULL;
1744 	v6_cork.opt = NULL;
1745 	err = ip6_setup_cork(sk, &cork, &v6_cork, ipc6, rt, fl6);
1746 	if (err)
1747 		return ERR_PTR(err);
1748 
1749 	if (ipc6->dontfrag < 0)
1750 		ipc6->dontfrag = inet6_sk(sk)->dontfrag;
1751 
1752 	err = __ip6_append_data(sk, fl6, &queue, &cork.base, &v6_cork,
1753 				&current->task_frag, getfrag, from,
1754 				length + exthdrlen, transhdrlen + exthdrlen,
1755 				flags, ipc6, sockc);
1756 	if (err) {
1757 		__ip6_flush_pending_frames(sk, &queue, &cork, &v6_cork);
1758 		return ERR_PTR(err);
1759 	}
1760 
1761 	return __ip6_make_skb(sk, &queue, &cork, &v6_cork);
1762 }
1763