xref: /openbmc/linux/net/ipv6/ip6_output.c (revision 4bce6fce)
1 /*
2  *	IPv6 output functions
3  *	Linux INET6 implementation
4  *
5  *	Authors:
6  *	Pedro Roque		<roque@di.fc.ul.pt>
7  *
8  *	Based on linux/net/ipv4/ip_output.c
9  *
10  *	This program is free software; you can redistribute it and/or
11  *      modify it under the terms of the GNU General Public License
12  *      as published by the Free Software Foundation; either version
13  *      2 of the License, or (at your option) any later version.
14  *
15  *	Changes:
16  *	A.N.Kuznetsov	:	arithmetic in fragmentation.
17  *				extension headers are implemented.
18  *				route changes now work.
19  *				ip6_forward does not confuse sniffers.
20  *				etc.
21  *
22  *      H. von Brand    :       Added missing #include <linux/string.h>
23  *	Imran Patel	:	frag id should be in NBO
24  *      Kazunori MIYAZAWA @USAGI
25  *			:       add ip6_append_data and related functions
26  *				for datagram xmit
27  */
28 
29 #include <linux/errno.h>
30 #include <linux/kernel.h>
31 #include <linux/string.h>
32 #include <linux/socket.h>
33 #include <linux/net.h>
34 #include <linux/netdevice.h>
35 #include <linux/if_arp.h>
36 #include <linux/in6.h>
37 #include <linux/tcp.h>
38 #include <linux/route.h>
39 #include <linux/module.h>
40 #include <linux/slab.h>
41 
42 #include <linux/netfilter.h>
43 #include <linux/netfilter_ipv6.h>
44 
45 #include <net/sock.h>
46 #include <net/snmp.h>
47 
48 #include <net/ipv6.h>
49 #include <net/ndisc.h>
50 #include <net/protocol.h>
51 #include <net/ip6_route.h>
52 #include <net/addrconf.h>
53 #include <net/rawv6.h>
54 #include <net/icmp.h>
55 #include <net/xfrm.h>
56 #include <net/checksum.h>
57 #include <linux/mroute6.h>
58 
59 static int ip6_finish_output2(struct sock *sk, struct sk_buff *skb)
60 {
61 	struct dst_entry *dst = skb_dst(skb);
62 	struct net_device *dev = dst->dev;
63 	struct neighbour *neigh;
64 	struct in6_addr *nexthop;
65 	int ret;
66 
67 	skb->protocol = htons(ETH_P_IPV6);
68 	skb->dev = dev;
69 
70 	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
71 		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
72 
73 		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
74 		    ((mroute6_socket(dev_net(dev), skb) &&
75 		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
76 		     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
77 					 &ipv6_hdr(skb)->saddr))) {
78 			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
79 
80 			/* Do not check for IFF_ALLMULTI; multicast routing
81 			   is not supported in any case.
82 			 */
83 			if (newskb)
84 				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
85 					sk, newskb, NULL, newskb->dev,
86 					dev_loopback_xmit);
87 
88 			if (ipv6_hdr(skb)->hop_limit == 0) {
89 				IP6_INC_STATS(dev_net(dev), idev,
90 					      IPSTATS_MIB_OUTDISCARDS);
91 				kfree_skb(skb);
92 				return 0;
93 			}
94 		}
95 
96 		IP6_UPD_PO_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCAST,
97 				skb->len);
98 
99 		if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
100 		    IPV6_ADDR_SCOPE_NODELOCAL &&
101 		    !(dev->flags & IFF_LOOPBACK)) {
102 			kfree_skb(skb);
103 			return 0;
104 		}
105 	}
106 
107 	rcu_read_lock_bh();
108 	nexthop = rt6_nexthop((struct rt6_info *)dst);
109 	neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
110 	if (unlikely(!neigh))
111 		neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
112 	if (!IS_ERR(neigh)) {
113 		ret = dst_neigh_output(dst, neigh, skb);
114 		rcu_read_unlock_bh();
115 		return ret;
116 	}
117 	rcu_read_unlock_bh();
118 
119 	IP6_INC_STATS(dev_net(dst->dev),
120 		      ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
121 	kfree_skb(skb);
122 	return -EINVAL;
123 }
124 
125 static int ip6_finish_output(struct sock *sk, struct sk_buff *skb)
126 {
127 	if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
128 	    dst_allfrag(skb_dst(skb)) ||
129 	    (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
130 		return ip6_fragment(sk, skb, ip6_finish_output2);
131 	else
132 		return ip6_finish_output2(sk, skb);
133 }
134 
135 int ip6_output(struct sock *sk, struct sk_buff *skb)
136 {
137 	struct net_device *dev = skb_dst(skb)->dev;
138 	struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
139 	if (unlikely(idev->cnf.disable_ipv6)) {
140 		IP6_INC_STATS(dev_net(dev), idev,
141 			      IPSTATS_MIB_OUTDISCARDS);
142 		kfree_skb(skb);
143 		return 0;
144 	}
145 
146 	return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, sk, skb,
147 			    NULL, dev,
148 			    ip6_finish_output,
149 			    !(IP6CB(skb)->flags & IP6SKB_REROUTED));
150 }
151 
152 /*
153  *	xmit an sk_buff (used by TCP, SCTP and DCCP)
154  */
155 
156 int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
157 	     struct ipv6_txoptions *opt, int tclass)
158 {
159 	struct net *net = sock_net(sk);
160 	struct ipv6_pinfo *np = inet6_sk(sk);
161 	struct in6_addr *first_hop = &fl6->daddr;
162 	struct dst_entry *dst = skb_dst(skb);
163 	struct ipv6hdr *hdr;
164 	u8  proto = fl6->flowi6_proto;
165 	int seg_len = skb->len;
166 	int hlimit = -1;
167 	u32 mtu;
168 
169 	if (opt) {
170 		unsigned int head_room;
171 
172 		/* First: extension headers may take lots of space (~8K for now);
173 		   MAX_HEADER is not enough.
174 		 */
175 		head_room = opt->opt_nflen + opt->opt_flen;
176 		seg_len += head_room;
177 		head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
178 
179 		if (skb_headroom(skb) < head_room) {
180 			struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
181 			if (!skb2) {
182 				IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
183 					      IPSTATS_MIB_OUTDISCARDS);
184 				kfree_skb(skb);
185 				return -ENOBUFS;
186 			}
187 			consume_skb(skb);
188 			skb = skb2;
189 			skb_set_owner_w(skb, sk);
190 		}
191 		if (opt->opt_flen)
192 			ipv6_push_frag_opts(skb, opt, &proto);
193 		if (opt->opt_nflen)
194 			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
195 	}
196 
197 	skb_push(skb, sizeof(struct ipv6hdr));
198 	skb_reset_network_header(skb);
199 	hdr = ipv6_hdr(skb);
200 
201 	/*
202 	 *	Fill in the IPv6 header
203 	 */
204 	if (np)
205 		hlimit = np->hop_limit;
206 	if (hlimit < 0)
207 		hlimit = ip6_dst_hoplimit(dst);
208 
209 	ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
210 						     np->autoflowlabel));
211 
212 	hdr->payload_len = htons(seg_len);
213 	hdr->nexthdr = proto;
214 	hdr->hop_limit = hlimit;
215 
216 	hdr->saddr = fl6->saddr;
217 	hdr->daddr = *first_hop;
218 
219 	skb->protocol = htons(ETH_P_IPV6);
220 	skb->priority = sk->sk_priority;
221 	skb->mark = sk->sk_mark;
222 
223 	mtu = dst_mtu(dst);
224 	if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
225 		IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
226 			      IPSTATS_MIB_OUT, skb->len);
227 		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, sk, skb,
228 			       NULL, dst->dev, dst_output_sk);
229 	}
230 
231 	skb->dev = dst->dev;
232 	ipv6_local_error(sk, EMSGSIZE, fl6, mtu);
233 	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
234 	kfree_skb(skb);
235 	return -EMSGSIZE;
236 }
237 EXPORT_SYMBOL(ip6_xmit);
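
/*
 * Hedged usage sketch (editorial, not part of this file): a connected,
 * stream-style caller is expected to resolve a route, attach it to the
 * skb, and only then hand the packet to ip6_xmit(), roughly as below.
 * The helper name and field choices are illustrative.
 */
static int example_stream_xmit(struct sock *sk, struct sk_buff *skb,
			       struct flowi6 *fl6, struct dst_entry *dst)
{
	struct ipv6_pinfo *np = inet6_sk(sk);

	/* ip6_xmit() reads the route from skb_dst(), so set it first */
	skb_dst_set(skb, dst_clone(dst));

	/* prepends extension headers (np->opt) and the IPv6 header */
	return ip6_xmit(sk, skb, fl6, np->opt, np->tclass);
}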
238 
239 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
240 {
241 	struct ip6_ra_chain *ra;
242 	struct sock *last = NULL;
243 
244 	read_lock(&ip6_ra_lock);
245 	for (ra = ip6_ra_chain; ra; ra = ra->next) {
246 		struct sock *sk = ra->sk;
247 		if (sk && ra->sel == sel &&
248 		    (!sk->sk_bound_dev_if ||
249 		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
250 			if (last) {
251 				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
252 				if (skb2)
253 					rawv6_rcv(last, skb2);
254 			}
255 			last = sk;
256 		}
257 	}
258 
259 	if (last) {
260 		rawv6_rcv(last, skb);
261 		read_unlock(&ip6_ra_lock);
262 		return 1;
263 	}
264 	read_unlock(&ip6_ra_lock);
265 	return 0;
266 }
267 
268 static int ip6_forward_proxy_check(struct sk_buff *skb)
269 {
270 	struct ipv6hdr *hdr = ipv6_hdr(skb);
271 	u8 nexthdr = hdr->nexthdr;
272 	__be16 frag_off;
273 	int offset;
274 
275 	if (ipv6_ext_hdr(nexthdr)) {
276 		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
277 		if (offset < 0)
278 			return 0;
279 	} else
280 		offset = sizeof(struct ipv6hdr);
281 
282 	if (nexthdr == IPPROTO_ICMPV6) {
283 		struct icmp6hdr *icmp6;
284 
285 		if (!pskb_may_pull(skb, (skb_network_header(skb) +
286 					 offset + 1 - skb->data)))
287 			return 0;
288 
289 		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
290 
291 		switch (icmp6->icmp6_type) {
292 		case NDISC_ROUTER_SOLICITATION:
293 		case NDISC_ROUTER_ADVERTISEMENT:
294 		case NDISC_NEIGHBOUR_SOLICITATION:
295 		case NDISC_NEIGHBOUR_ADVERTISEMENT:
296 		case NDISC_REDIRECT:
297 			/* For unicast neighbour discovery messages destined
298 			 * to the proxied address, pass them to the input
299 			 * function.
300 			 */
301 			return 1;
302 		default:
303 			break;
304 		}
305 	}
306 
307 	/*
308 	 * The proxying router can't forward traffic sent to a link-local
309 	 * address, so signal the sender and discard the packet. This
310 	 * behavior is clarified by the MIPv6 specification.
311 	 */
312 	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
313 		dst_link_failure(skb);
314 		return -1;
315 	}
316 
317 	return 0;
318 }
319 
320 static inline int ip6_forward_finish(struct sock *sk, struct sk_buff *skb)
321 {
322 	skb_sender_cpu_clear(skb);
323 	return dst_output_sk(sk, skb);
324 }
325 
326 static unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst)
327 {
328 	unsigned int mtu;
329 	struct inet6_dev *idev;
330 
331 	if (dst_metric_locked(dst, RTAX_MTU)) {
332 		mtu = dst_metric_raw(dst, RTAX_MTU);
333 		if (mtu)
334 			return mtu;
335 	}
336 
337 	mtu = IPV6_MIN_MTU;
338 	rcu_read_lock();
339 	idev = __in6_dev_get(dst->dev);
340 	if (idev)
341 		mtu = idev->cnf.mtu6;
342 	rcu_read_unlock();
343 
344 	return mtu;
345 }
346 
347 static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
348 {
349 	if (skb->len <= mtu)
350 		return false;
351 
352 	/* ipv6 conntrack defrag sets max_frag_size + ignore_df */
353 	if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
354 		return true;
355 
356 	if (skb->ignore_df)
357 		return false;
358 
359 	if (skb_is_gso(skb) && skb_gso_network_seglen(skb) <= mtu)
360 		return false;
361 
362 	return true;
363 }
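
/*
 * Illustrative cases (editorial): with mtu == 1280, a 1400 byte non-GSO
 * skb with ignore_df clear is "too big", so ip6_forward() replies with
 * ICMPV6_PKT_TOOBIG; a GSO skb whose per-segment network length is
 * <= 1280 is let through and segmented further down the stack.
 */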
364 
365 int ip6_forward(struct sk_buff *skb)
366 {
367 	struct dst_entry *dst = skb_dst(skb);
368 	struct ipv6hdr *hdr = ipv6_hdr(skb);
369 	struct inet6_skb_parm *opt = IP6CB(skb);
370 	struct net *net = dev_net(dst->dev);
371 	u32 mtu;
372 
373 	if (net->ipv6.devconf_all->forwarding == 0)
374 		goto error;
375 
376 	if (skb->pkt_type != PACKET_HOST)
377 		goto drop;
378 
379 	if (skb_warn_if_lro(skb))
380 		goto drop;
381 
382 	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
383 		IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
384 				 IPSTATS_MIB_INDISCARDS);
385 		goto drop;
386 	}
387 
388 	skb_forward_csum(skb);
389 
390 	/*
391 	 *	We do NOT do any processing on
392 	 *	RA packets; we push them to user level AS IS,
393 	 *	with no guarantee that the application will be able
394 	 *	to interpret them. The reason is that we
395 	 *	cannot do anything clever here.
396 	 *
397 	 *	We are not the end node, so if the packet contains
398 	 *	AH/ESP we cannot do anything with it.
399 	 *	Defragmentation would also be a mistake: RA packets
400 	 *	must not be fragmented, because there is no guarantee
401 	 *	that different fragments will go along one path. --ANK
402 	 */
403 	if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
404 		if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
405 			return 0;
406 	}
407 
408 	/*
409 	 *	check and decrement ttl
410 	 */
411 	if (hdr->hop_limit <= 1) {
412 		/* Force the OUTPUT device to be used as the source address */
413 		skb->dev = dst->dev;
414 		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
415 		IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
416 				 IPSTATS_MIB_INHDRERRORS);
417 
418 		kfree_skb(skb);
419 		return -ETIMEDOUT;
420 	}
421 
422 	/* XXX: idev->cnf.proxy_ndp? */
423 	if (net->ipv6.devconf_all->proxy_ndp &&
424 	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
425 		int proxied = ip6_forward_proxy_check(skb);
426 		if (proxied > 0)
427 			return ip6_input(skb);
428 		else if (proxied < 0) {
429 			IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
430 					 IPSTATS_MIB_INDISCARDS);
431 			goto drop;
432 		}
433 	}
434 
435 	if (!xfrm6_route_forward(skb)) {
436 		IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
437 				 IPSTATS_MIB_INDISCARDS);
438 		goto drop;
439 	}
440 	dst = skb_dst(skb);
441 
442 	/* The IPv6 specs say nothing about it, but it is clear that we cannot
443 	   send redirects to source-routed frames.
444 	   We don't send redirects to frames decapsulated from IPsec.
445 	 */
446 	if (skb->dev == dst->dev && opt->srcrt == 0 && !skb_sec_path(skb)) {
447 		struct in6_addr *target = NULL;
448 		struct inet_peer *peer;
449 		struct rt6_info *rt;
450 
451 		/*
452 		 *	incoming and outgoing devices are the same;
453 		 *	send a redirect.
454 		 */
455 
456 		rt = (struct rt6_info *) dst;
457 		if (rt->rt6i_flags & RTF_GATEWAY)
458 			target = &rt->rt6i_gateway;
459 		else
460 			target = &hdr->daddr;
461 
462 		peer = inet_getpeer_v6(net->ipv6.peers, &rt->rt6i_dst.addr, 1);
463 
464 		/* Limit redirects both by destination (here)
465 		   and by source (inside ndisc_send_redirect)
466 		 */
467 		if (inet_peer_xrlim_allow(peer, 1*HZ))
468 			ndisc_send_redirect(skb, target);
469 		if (peer)
470 			inet_putpeer(peer);
471 	} else {
472 		int addrtype = ipv6_addr_type(&hdr->saddr);
473 
474 		/* This check is security critical. */
475 		if (addrtype == IPV6_ADDR_ANY ||
476 		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
477 			goto error;
478 		if (addrtype & IPV6_ADDR_LINKLOCAL) {
479 			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
480 				    ICMPV6_NOT_NEIGHBOUR, 0);
481 			goto error;
482 		}
483 	}
484 
485 	mtu = ip6_dst_mtu_forward(dst);
486 	if (mtu < IPV6_MIN_MTU)
487 		mtu = IPV6_MIN_MTU;
488 
489 	if (ip6_pkt_too_big(skb, mtu)) {
490 		/* Again, force the OUTPUT device to be used as the source address */
491 		skb->dev = dst->dev;
492 		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
493 		IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
494 				 IPSTATS_MIB_INTOOBIGERRORS);
495 		IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
496 				 IPSTATS_MIB_FRAGFAILS);
497 		kfree_skb(skb);
498 		return -EMSGSIZE;
499 	}
500 
501 	if (skb_cow(skb, dst->dev->hard_header_len)) {
502 		IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
503 				 IPSTATS_MIB_OUTDISCARDS);
504 		goto drop;
505 	}
506 
507 	hdr = ipv6_hdr(skb);
508 
509 	/* Decrementing the hop limit is delayed until after the skb COW */
510 
511 	hdr->hop_limit--;
512 
513 	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
514 	IP6_ADD_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
515 	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, NULL, skb,
516 		       skb->dev, dst->dev,
517 		       ip6_forward_finish);
518 
519 error:
520 	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
521 drop:
522 	kfree_skb(skb);
523 	return -EINVAL;
524 }
525 
526 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
527 {
528 	to->pkt_type = from->pkt_type;
529 	to->priority = from->priority;
530 	to->protocol = from->protocol;
531 	skb_dst_drop(to);
532 	skb_dst_set(to, dst_clone(skb_dst(from)));
533 	to->dev = from->dev;
534 	to->mark = from->mark;
535 
536 #ifdef CONFIG_NET_SCHED
537 	to->tc_index = from->tc_index;
538 #endif
539 	nf_copy(to, from);
540 	skb_copy_secmark(to, from);
541 }
542 
543 int ip6_fragment(struct sock *sk, struct sk_buff *skb,
544 		 int (*output)(struct sock *, struct sk_buff *))
545 {
546 	struct sk_buff *frag;
547 	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
548 	struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
549 				inet6_sk(skb->sk) : NULL;
550 	struct ipv6hdr *tmp_hdr;
551 	struct frag_hdr *fh;
552 	unsigned int mtu, hlen, left, len;
553 	int hroom, troom;
554 	__be32 frag_id = 0;
555 	int ptr, offset = 0, err = 0;
556 	u8 *prevhdr, nexthdr = 0;
557 	struct net *net = dev_net(skb_dst(skb)->dev);
558 
559 	hlen = ip6_find_1stfragopt(skb, &prevhdr);
560 	nexthdr = *prevhdr;
561 
562 	mtu = ip6_skb_dst_mtu(skb);
563 
564 	/* We must not fragment if the socket is set to force MTU discovery
565 	 * or if the skb was not generated by a local socket.
566 	 */
567 	if (unlikely(!skb->ignore_df && skb->len > mtu) ||
568 		     (IP6CB(skb)->frag_max_size &&
569 		      IP6CB(skb)->frag_max_size > mtu)) {
570 		if (skb->sk && dst_allfrag(skb_dst(skb)))
571 			sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);
572 
573 		skb->dev = skb_dst(skb)->dev;
574 		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
575 		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
576 			      IPSTATS_MIB_FRAGFAILS);
577 		kfree_skb(skb);
578 		return -EMSGSIZE;
579 	}
580 
581 	if (np && np->frag_size < mtu) {
582 		if (np->frag_size)
583 			mtu = np->frag_size;
584 	}
585 	mtu -= hlen + sizeof(struct frag_hdr);
586 
587 	if (skb_has_frag_list(skb)) {
588 		int first_len = skb_pagelen(skb);
589 		struct sk_buff *frag2;
590 
591 		if (first_len - hlen > mtu ||
592 		    ((first_len - hlen) & 7) ||
593 		    skb_cloned(skb))
594 			goto slow_path;
595 
596 		skb_walk_frags(skb, frag) {
597 			/* Correct geometry. */
598 			if (frag->len > mtu ||
599 			    ((frag->len & 7) && frag->next) ||
600 			    skb_headroom(frag) < hlen)
601 				goto slow_path_clean;
602 
603 			/* Partially cloned skb? */
604 			if (skb_shared(frag))
605 				goto slow_path_clean;
606 
607 			BUG_ON(frag->sk);
608 			if (skb->sk) {
609 				frag->sk = skb->sk;
610 				frag->destructor = sock_wfree;
611 			}
612 			skb->truesize -= frag->truesize;
613 		}
614 
615 		err = 0;
616 		offset = 0;
617 		frag = skb_shinfo(skb)->frag_list;
618 		skb_frag_list_init(skb);
619 		/* BUILD HEADER */
620 
621 		*prevhdr = NEXTHDR_FRAGMENT;
622 		tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
623 		if (!tmp_hdr) {
624 			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
625 				      IPSTATS_MIB_FRAGFAILS);
626 			return -ENOMEM;
627 		}
628 
629 		__skb_pull(skb, hlen);
630 		fh = (struct frag_hdr *)__skb_push(skb, sizeof(struct frag_hdr));
631 		__skb_push(skb, hlen);
632 		skb_reset_network_header(skb);
633 		memcpy(skb_network_header(skb), tmp_hdr, hlen);
634 
635 		ipv6_select_ident(net, fh, rt);
636 		fh->nexthdr = nexthdr;
637 		fh->reserved = 0;
638 		fh->frag_off = htons(IP6_MF);
639 		frag_id = fh->identification;
640 
641 		first_len = skb_pagelen(skb);
642 		skb->data_len = first_len - skb_headlen(skb);
643 		skb->len = first_len;
644 		ipv6_hdr(skb)->payload_len = htons(first_len -
645 						   sizeof(struct ipv6hdr));
646 
647 		dst_hold(&rt->dst);
648 
649 		for (;;) {
650 			/* Prepare the header of the next frame
651 			 * before the previous one goes down. */
652 			if (frag) {
653 				frag->ip_summed = CHECKSUM_NONE;
654 				skb_reset_transport_header(frag);
655 				fh = (struct frag_hdr *)__skb_push(frag, sizeof(struct frag_hdr));
656 				__skb_push(frag, hlen);
657 				skb_reset_network_header(frag);
658 				memcpy(skb_network_header(frag), tmp_hdr,
659 				       hlen);
660 				offset += skb->len - hlen - sizeof(struct frag_hdr);
661 				fh->nexthdr = nexthdr;
662 				fh->reserved = 0;
663 				fh->frag_off = htons(offset);
664 				if (frag->next)
665 					fh->frag_off |= htons(IP6_MF);
666 				fh->identification = frag_id;
667 				ipv6_hdr(frag)->payload_len =
668 						htons(frag->len -
669 						      sizeof(struct ipv6hdr));
670 				ip6_copy_metadata(frag, skb);
671 			}
672 
673 			err = output(sk, skb);
674 			if (!err)
675 				IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
676 					      IPSTATS_MIB_FRAGCREATES);
677 
678 			if (err || !frag)
679 				break;
680 
681 			skb = frag;
682 			frag = skb->next;
683 			skb->next = NULL;
684 		}
685 
686 		kfree(tmp_hdr);
687 
688 		if (err == 0) {
689 			IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
690 				      IPSTATS_MIB_FRAGOKS);
691 			ip6_rt_put(rt);
692 			return 0;
693 		}
694 
695 		kfree_skb_list(frag);
696 
697 		IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
698 			      IPSTATS_MIB_FRAGFAILS);
699 		ip6_rt_put(rt);
700 		return err;
701 
702 slow_path_clean:
703 		skb_walk_frags(skb, frag2) {
704 			if (frag2 == frag)
705 				break;
706 			frag2->sk = NULL;
707 			frag2->destructor = NULL;
708 			skb->truesize += frag2->truesize;
709 		}
710 	}
711 
712 slow_path:
713 	if ((skb->ip_summed == CHECKSUM_PARTIAL) &&
714 	    skb_checksum_help(skb))
715 		goto fail;
716 
717 	left = skb->len - hlen;		/* Space per frame */
718 	ptr = hlen;			/* Where to start from */
719 
720 	/*
721 	 *	Fragment the datagram.
722 	 */
723 
724 	*prevhdr = NEXTHDR_FRAGMENT;
725 	hroom = LL_RESERVED_SPACE(rt->dst.dev);
726 	troom = rt->dst.dev->needed_tailroom;
727 
728 	/*
729 	 *	Keep copying data until we run out.
730 	 */
731 	while (left > 0)	{
732 		len = left;
733 		/* IF: it doesn't fit, use 'mtu' - the data space left */
734 		if (len > mtu)
735 			len = mtu;
736 		/* IF: we are not sending up to and including the packet end,
737 		   then align the next start on an eight-byte boundary */
738 		if (len < left)	{
739 			len &= ~7;
740 		}
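
		/*
		 * Worked example (editorial): with a 1500 byte link MTU and a
		 * bare 40 byte IPv6 header (hlen == 40), mtu was reduced above
		 * to 1500 - 40 - 8 = 1452, and 1452 & ~7 == 1448, so every
		 * fragment except the last carries 1448 bytes of fragmentable
		 * payload (40 + 8 + 1448 = 1496 bytes on the wire).
		 */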
741 
742 		/* Allocate buffer */
743 		frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
744 				 hroom + troom, GFP_ATOMIC);
745 		if (!frag) {
746 			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
747 				      IPSTATS_MIB_FRAGFAILS);
748 			err = -ENOMEM;
749 			goto fail;
750 		}
751 
752 		/*
753 		 *	Set up data on packet
754 		 */
755 
756 		ip6_copy_metadata(frag, skb);
757 		skb_reserve(frag, hroom);
758 		skb_put(frag, len + hlen + sizeof(struct frag_hdr));
759 		skb_reset_network_header(frag);
760 		fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
761 		frag->transport_header = (frag->network_header + hlen +
762 					  sizeof(struct frag_hdr));
763 
764 		/*
765 		 *	Charge the memory for the fragment to any owner
766 		 *	it might possess
767 		 */
768 		if (skb->sk)
769 			skb_set_owner_w(frag, skb->sk);
770 
771 		/*
772 		 *	Copy the packet header into the new buffer.
773 		 */
774 		skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
775 
776 		/*
777 		 *	Build fragment header.
778 		 */
779 		fh->nexthdr = nexthdr;
780 		fh->reserved = 0;
781 		if (!frag_id) {
782 			ipv6_select_ident(net, fh, rt);
783 			frag_id = fh->identification;
784 		} else
785 			fh->identification = frag_id;
786 
787 		/*
788 		 *	Copy a block of the IP datagram.
789 		 */
790 		BUG_ON(skb_copy_bits(skb, ptr, skb_transport_header(frag),
791 				     len));
792 		left -= len;
793 
794 		fh->frag_off = htons(offset);
795 		if (left > 0)
796 			fh->frag_off |= htons(IP6_MF);
797 		ipv6_hdr(frag)->payload_len = htons(frag->len -
798 						    sizeof(struct ipv6hdr));
799 
800 		ptr += len;
801 		offset += len;
802 
803 		/*
804 		 *	Put this fragment into the sending queue.
805 		 */
806 		err = output(sk, frag);
807 		if (err)
808 			goto fail;
809 
810 		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
811 			      IPSTATS_MIB_FRAGCREATES);
812 	}
813 	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
814 		      IPSTATS_MIB_FRAGOKS);
815 	consume_skb(skb);
816 	return err;
817 
818 fail:
819 	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
820 		      IPSTATS_MIB_FRAGFAILS);
821 	kfree_skb(skb);
822 	return err;
823 }
824 
825 static inline int ip6_rt_check(const struct rt6key *rt_key,
826 			       const struct in6_addr *fl_addr,
827 			       const struct in6_addr *addr_cache)
828 {
829 	return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
830 		(!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
831 }
832 
833 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
834 					  struct dst_entry *dst,
835 					  const struct flowi6 *fl6)
836 {
837 	struct ipv6_pinfo *np = inet6_sk(sk);
838 	struct rt6_info *rt;
839 
840 	if (!dst)
841 		goto out;
842 
843 	if (dst->ops->family != AF_INET6) {
844 		dst_release(dst);
845 		return NULL;
846 	}
847 
848 	rt = (struct rt6_info *)dst;
849 	/* Yes, checking route validity in the not-connected
850 	 * case is not very simple. Take into account
851 	 * that we do not support routing by source, TOS,
852 	 * or MSG_DONTROUTE		--ANK (980726)
853 	 *
854 	 * 1. ip6_rt_check(): If the route was a host route,
855 	 *    check that the cached destination is current.
856 	 *    If it is a network route, we may still
857 	 *    check its validity using a saved pointer
858 	 *    to the last used address: daddr_cache.
859 	 *    We do not want to save the whole address now
860 	 *    (because the main consumer of this service
861 	 *    is TCP, which does not have this problem),
862 	 *    so this last trick works only on connected
863 	 *    sockets.
864 	 * 2. oif should also be the same.
865 	 */
866 	if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
867 #ifdef CONFIG_IPV6_SUBTREES
868 	    ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
869 #endif
870 	    (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) {
871 		dst_release(dst);
872 		dst = NULL;
873 	}
874 
875 out:
876 	return dst;
877 }
878 
879 static int ip6_dst_lookup_tail(struct sock *sk,
880 			       struct dst_entry **dst, struct flowi6 *fl6)
881 {
882 	struct net *net = sock_net(sk);
883 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
884 	struct neighbour *n;
885 	struct rt6_info *rt;
886 #endif
887 	int err;
888 
889 	if (!*dst)
890 		*dst = ip6_route_output(net, sk, fl6);
891 
892 	err = (*dst)->error;
893 	if (err)
894 		goto out_err_release;
895 
896 	if (ipv6_addr_any(&fl6->saddr)) {
897 		struct rt6_info *rt = (struct rt6_info *) *dst;
898 		err = ip6_route_get_saddr(net, rt, &fl6->daddr,
899 					  sk ? inet6_sk(sk)->srcprefs : 0,
900 					  &fl6->saddr);
901 		if (err)
902 			goto out_err_release;
903 	}
904 
905 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
906 	/*
907 	 * If the dst entry we've looked up
908 	 * has a neighbour entry that is in the INCOMPLETE
909 	 * state and the src address from the flow is
910 	 * marked as OPTIMISTIC, we release the found
911 	 * dst entry and replace it with the
912 	 * dst entry of the nexthop router instead.
913 	 */
914 	rt = (struct rt6_info *) *dst;
915 	rcu_read_lock_bh();
916 	n = __ipv6_neigh_lookup_noref(rt->dst.dev, rt6_nexthop(rt));
917 	err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
918 	rcu_read_unlock_bh();
919 
920 	if (err) {
921 		struct inet6_ifaddr *ifp;
922 		struct flowi6 fl_gw6;
923 		int redirect;
924 
925 		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
926 				      (*dst)->dev, 1);
927 
928 		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
929 		if (ifp)
930 			in6_ifa_put(ifp);
931 
932 		if (redirect) {
933 			/*
934 			 * We need to get the dst entry for the
935 			 * default router instead
936 			 */
937 			dst_release(*dst);
938 			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
939 			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
940 			*dst = ip6_route_output(net, sk, &fl_gw6);
941 			err = (*dst)->error;
942 			if (err)
943 				goto out_err_release;
944 		}
945 	}
946 #endif
947 
948 	return 0;
949 
950 out_err_release:
951 	if (err == -ENETUNREACH)
952 		IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
953 	dst_release(*dst);
954 	*dst = NULL;
955 	return err;
956 }
957 
958 /**
959  *	ip6_dst_lookup - perform route lookup on flow
960  *	@sk: socket which provides route info
961  *	@dst: pointer to dst_entry * for result
962  *	@fl6: flow to lookup
963  *
964  *	This function performs a route lookup on the given flow.
965  *
966  *	It returns zero on success, or a standard errno code on error.
967  */
968 int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi6 *fl6)
969 {
970 	*dst = NULL;
971 	return ip6_dst_lookup_tail(sk, dst, fl6);
972 }
973 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
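
/*
 * Hedged usage sketch (editorial, not part of this file): a caller that
 * only needs the path MTU for a flow might use ip6_dst_lookup() as
 * below; the helper name is illustrative.
 */
static int example_flow_mtu(struct sock *sk, struct flowi6 *fl6, u32 *mtu)
{
	struct dst_entry *dst;
	int err;

	err = ip6_dst_lookup(sk, &dst, fl6);
	if (err)
		return err;	/* *dst was already released and cleared */

	*mtu = dst_mtu(dst);
	dst_release(dst);
	return 0;
}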
974 
975 /**
976  *	ip6_dst_lookup_flow - perform route lookup on flow with ipsec
977  *	@sk: socket which provides route info
978  *	@fl6: flow to lookup
979  *	@final_dst: final destination address for ipsec lookup
980  *
981  *	This function performs a route lookup on the given flow.
982  *
983 	 *	It returns a valid dst pointer on success, or a pointer-encoded
984  *	error code.
985  */
986 struct dst_entry *ip6_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
987 				      const struct in6_addr *final_dst)
988 {
989 	struct dst_entry *dst = NULL;
990 	int err;
991 
992 	err = ip6_dst_lookup_tail(sk, &dst, fl6);
993 	if (err)
994 		return ERR_PTR(err);
995 	if (final_dst)
996 		fl6->daddr = *final_dst;
997 
998 	return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
999 }
1000 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
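
/*
 * Hedged usage note (editorial): because xfrm_lookup_route() encodes
 * errors into the returned pointer, callers check IS_ERR() rather than
 * NULL, e.g.:
 *
 *	dst = ip6_dst_lookup_flow(sk, &fl6, final_p);
 *	if (IS_ERR(dst))
 *		return PTR_ERR(dst);
 */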
1001 
1002 /**
1003  *	ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1004  *	@sk: socket which provides the dst cache and route info
1005  *	@fl6: flow to lookup
1006  *	@final_dst: final destination address for ipsec lookup
1007  *
1008  *	This function performs a route lookup on the given flow with the
1009  *	possibility of using the cached route in the socket if it is valid.
1010  *	It will take the socket dst lock when operating on the dst cache.
1011  *	As a result, this function can only be used in process context.
1012  *
1013 	 *	It returns a valid dst pointer on success, or a pointer-encoded
1014  *	error code.
1015  */
1016 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1017 					 const struct in6_addr *final_dst)
1018 {
1019 	struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1020 	int err;
1021 
1022 	dst = ip6_sk_dst_check(sk, dst, fl6);
1023 
1024 	err = ip6_dst_lookup_tail(sk, &dst, fl6);
1025 	if (err)
1026 		return ERR_PTR(err);
1027 	if (final_dst)
1028 		fl6->daddr = *final_dst;
1029 
1030 	return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1031 }
1032 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
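
/*
 * Hedged usage sketch (editorial): a connected UDP-style sender would
 * typically reuse the socket's cached route via this helper before
 * building its packet; names below are illustrative.
 */
static int example_connected_send_prep(struct sock *sk, struct flowi6 *fl6,
				       struct dst_entry **dstp)
{
	/* falls back to a full route lookup if the cached dst is stale */
	struct dst_entry *dst = ip6_sk_dst_lookup_flow(sk, fl6, NULL);

	if (IS_ERR(dst))
		return PTR_ERR(dst);

	*dstp = dst;	/* caller owns a reference; dst_release() when done */
	return 0;
}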
1033 
1034 static inline int ip6_ufo_append_data(struct sock *sk,
1035 			struct sk_buff_head *queue,
1036 			int getfrag(void *from, char *to, int offset, int len,
1037 			int odd, struct sk_buff *skb),
1038 			void *from, int length, int hh_len, int fragheaderlen,
1039 			int transhdrlen, int mtu, unsigned int flags,
1040 			struct rt6_info *rt)
1041 
1042 {
1043 	struct sk_buff *skb;
1044 	struct frag_hdr fhdr;
1045 	int err;
1046 
1047 	/* The network device supports UDP large send offload, so
1048 	 * create one single skb containing the complete
1049 	 * UDP datagram.
1050 	 */
1051 	skb = skb_peek_tail(queue);
1052 	if (!skb) {
1053 		skb = sock_alloc_send_skb(sk,
1054 			hh_len + fragheaderlen + transhdrlen + 20,
1055 			(flags & MSG_DONTWAIT), &err);
1056 		if (!skb)
1057 			return err;
1058 
1059 		/* reserve space for Hardware header */
1060 		skb_reserve(skb, hh_len);
1061 
1062 		/* create space for UDP/IP header */
1063 		skb_put(skb, fragheaderlen + transhdrlen);
1064 
1065 		/* initialize network header pointer */
1066 		skb_reset_network_header(skb);
1067 
1068 		/* initialize protocol header pointer */
1069 		skb->transport_header = skb->network_header + fragheaderlen;
1070 
1071 		skb->protocol = htons(ETH_P_IPV6);
1072 		skb->csum = 0;
1073 
1074 		__skb_queue_tail(queue, skb);
1075 	} else if (skb_is_gso(skb)) {
1076 		goto append;
1077 	}
1078 
1079 	skb->ip_summed = CHECKSUM_PARTIAL;
1080 	/* Specify the length of each IPv6 datagram fragment.
1081 	 * It has to be a multiple of 8.
1082 	 */
1083 	skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
1084 				     sizeof(struct frag_hdr)) & ~7;
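	/*
	 * Worked example (editorial): with mtu == 1500 and
	 * fragheaderlen == 40, gso_size becomes
	 * (1500 - 40 - 8) & ~7 == 1448, a multiple of 8 as required.
	 */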
1085 	skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
1086 	ipv6_select_ident(sock_net(sk), &fhdr, rt);
1087 	skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
1088 
1089 append:
1090 	return skb_append_datato_frags(sk, skb, getfrag, from,
1091 				       (length - transhdrlen));
1092 }
1093 
1094 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1095 					       gfp_t gfp)
1096 {
1097 	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1098 }
1099 
1100 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1101 						gfp_t gfp)
1102 {
1103 	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1104 }
1105 
1106 static void ip6_append_data_mtu(unsigned int *mtu,
1107 				int *maxfraglen,
1108 				unsigned int fragheaderlen,
1109 				struct sk_buff *skb,
1110 				struct rt6_info *rt,
1111 				unsigned int orig_mtu)
1112 {
1113 	if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1114 		if (!skb) {
1115 			/* first fragment, reserve header_len */
1116 			*mtu = orig_mtu - rt->dst.header_len;
1117 
1118 		} else {
1119 			/*
1120 			 * this fragment is not the first; the header
1121 			 * space is regarded as data space.
1122 			 */
1123 			*mtu = orig_mtu;
1124 		}
1125 		*maxfraglen = ((*mtu - fragheaderlen) & ~7)
1126 			      + fragheaderlen - sizeof(struct frag_hdr);
1127 	}
1128 }
1129 
1130 static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
1131 			  struct inet6_cork *v6_cork,
1132 			  int hlimit, int tclass, struct ipv6_txoptions *opt,
1133 			  struct rt6_info *rt, struct flowi6 *fl6)
1134 {
1135 	struct ipv6_pinfo *np = inet6_sk(sk);
1136 	unsigned int mtu;
1137 
1138 	/*
1139 	 * setup for corking
1140 	 */
1141 	if (opt) {
1142 		if (WARN_ON(v6_cork->opt))
1143 			return -EINVAL;
1144 
1145 		v6_cork->opt = kzalloc(opt->tot_len, sk->sk_allocation);
1146 		if (unlikely(!v6_cork->opt))
1147 			return -ENOBUFS;
1148 
1149 		v6_cork->opt->tot_len = opt->tot_len;
1150 		v6_cork->opt->opt_flen = opt->opt_flen;
1151 		v6_cork->opt->opt_nflen = opt->opt_nflen;
1152 
1153 		v6_cork->opt->dst0opt = ip6_opt_dup(opt->dst0opt,
1154 						    sk->sk_allocation);
1155 		if (opt->dst0opt && !v6_cork->opt->dst0opt)
1156 			return -ENOBUFS;
1157 
1158 		v6_cork->opt->dst1opt = ip6_opt_dup(opt->dst1opt,
1159 						    sk->sk_allocation);
1160 		if (opt->dst1opt && !v6_cork->opt->dst1opt)
1161 			return -ENOBUFS;
1162 
1163 		v6_cork->opt->hopopt = ip6_opt_dup(opt->hopopt,
1164 						   sk->sk_allocation);
1165 		if (opt->hopopt && !v6_cork->opt->hopopt)
1166 			return -ENOBUFS;
1167 
1168 		v6_cork->opt->srcrt = ip6_rthdr_dup(opt->srcrt,
1169 						    sk->sk_allocation);
1170 		if (opt->srcrt && !v6_cork->opt->srcrt)
1171 			return -ENOBUFS;
1172 
1173 		/* need source address above. --miyazawa */
1174 	}
1175 	dst_hold(&rt->dst);
1176 	cork->base.dst = &rt->dst;
1177 	cork->fl.u.ip6 = *fl6;
1178 	v6_cork->hop_limit = hlimit;
1179 	v6_cork->tclass = tclass;
1180 	if (rt->dst.flags & DST_XFRM_TUNNEL)
1181 		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1182 		      rt->dst.dev->mtu : dst_mtu(&rt->dst);
1183 	else
1184 		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1185 		      rt->dst.dev->mtu : dst_mtu(rt->dst.path);
1186 	if (np->frag_size < mtu) {
1187 		if (np->frag_size)
1188 			mtu = np->frag_size;
1189 	}
1190 	cork->base.fragsize = mtu;
1191 	if (dst_allfrag(rt->dst.path))
1192 		cork->base.flags |= IPCORK_ALLFRAG;
1193 	cork->base.length = 0;
1194 
1195 	return 0;
1196 }
1197 
1198 static int __ip6_append_data(struct sock *sk,
1199 			     struct flowi6 *fl6,
1200 			     struct sk_buff_head *queue,
1201 			     struct inet_cork *cork,
1202 			     struct inet6_cork *v6_cork,
1203 			     struct page_frag *pfrag,
1204 			     int getfrag(void *from, char *to, int offset,
1205 					 int len, int odd, struct sk_buff *skb),
1206 			     void *from, int length, int transhdrlen,
1207 			     unsigned int flags, int dontfrag)
1208 {
1209 	struct sk_buff *skb, *skb_prev = NULL;
1210 	unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu;
1211 	int exthdrlen = 0;
1212 	int dst_exthdrlen = 0;
1213 	int hh_len;
1214 	int copy;
1215 	int err;
1216 	int offset = 0;
1217 	__u8 tx_flags = 0;
1218 	u32 tskey = 0;
1219 	struct rt6_info *rt = (struct rt6_info *)cork->dst;
1220 	struct ipv6_txoptions *opt = v6_cork->opt;
1221 	int csummode = CHECKSUM_NONE;
1222 
1223 	skb = skb_peek_tail(queue);
1224 	if (!skb) {
1225 		exthdrlen = opt ? opt->opt_flen : 0;
1226 		dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
1227 	}
1228 
1229 	mtu = cork->fragsize;
1230 	orig_mtu = mtu;
1231 
1232 	hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1233 
1234 	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1235 			(opt ? opt->opt_nflen : 0);
1236 	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
1237 		     sizeof(struct frag_hdr);
1238 
1239 	if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
1240 		unsigned int maxnonfragsize, headersize;
1241 
1242 		headersize = sizeof(struct ipv6hdr) +
1243 			     (opt ? opt->opt_flen + opt->opt_nflen : 0) +
1244 			     (dst_allfrag(&rt->dst) ?
1245 			      sizeof(struct frag_hdr) : 0) +
1246 			     rt->rt6i_nfheader_len;
1247 
1248 		if (ip6_sk_ignore_df(sk))
1249 			maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
1250 		else
1251 			maxnonfragsize = mtu;
1252 
1253 		/* dontfrag active */
1254 		if ((cork->length + length > mtu - headersize) && dontfrag &&
1255 		    (sk->sk_protocol == IPPROTO_UDP ||
1256 		     sk->sk_protocol == IPPROTO_RAW)) {
1257 			ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
1258 						   sizeof(struct ipv6hdr));
1259 			goto emsgsize;
1260 		}
1261 
1262 		if (cork->length + length > maxnonfragsize - headersize) {
1263 emsgsize:
1264 			ipv6_local_error(sk, EMSGSIZE, fl6,
1265 					 mtu - headersize +
1266 					 sizeof(struct ipv6hdr));
1267 			return -EMSGSIZE;
1268 		}
1269 	}
1270 
1271 	if (sk->sk_type == SOCK_DGRAM || sk->sk_type == SOCK_RAW) {
1272 		sock_tx_timestamp(sk, &tx_flags);
1273 		if (tx_flags & SKBTX_ANY_SW_TSTAMP &&
1274 		    sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
1275 			tskey = sk->sk_tskey++;
1276 	}
1277 
1278 	/* If this is the first and only packet and the device
1279 	 * supports checksum offloading, let's use it.
1280 	 */
1281 	if (!skb && sk->sk_protocol == IPPROTO_UDP &&
1282 	    length + fragheaderlen < mtu &&
1283 	    rt->dst.dev->features & NETIF_F_V6_CSUM &&
1284 	    !exthdrlen)
1285 		csummode = CHECKSUM_PARTIAL;
1286 	/*
1287 	 * Let's try using as much space as possible.
1288 	 * Use MTU if total length of the message fits into the MTU.
1289 	 * Otherwise, we need to reserve fragment header and
1290 	 * fragment alignment (= 8-15 octets, in total).
1291 	 *
1292 	 * Note that we may need to "move" the data from the tail
1293 	 * of the buffer to the new fragment when we split
1294 	 * the message.
1295 	 *
1296 	 * FIXME: It may be fragmented into multiple chunks
1297 	 *        at once if non-fragmentable extension headers
1298 	 *        are too large.
1299 	 * --yoshfuji
1300 	 */
1301 
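	/*
	 * Worked example (editorial): with mtu == 1500 and a bare IPv6
	 * header (fragheaderlen == 40), maxfraglen computed above is
	 * ((1500 - 40) & ~7) + 40 - 8 == 1488, i.e. 1448 bytes of
	 * fragmentable payload per fragment (a multiple of 8), and a
	 * 40 + 8 + 1448 == 1496 byte packet once the fragment header is
	 * inserted.
	 */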
1302 	cork->length += length;
1303 	if (((length > mtu) ||
1304 	     (skb && skb_is_gso(skb))) &&
1305 	    (sk->sk_protocol == IPPROTO_UDP) &&
1306 	    (rt->dst.dev->features & NETIF_F_UFO) &&
1307 	    (sk->sk_type == SOCK_DGRAM)) {
1308 		err = ip6_ufo_append_data(sk, queue, getfrag, from, length,
1309 					  hh_len, fragheaderlen,
1310 					  transhdrlen, mtu, flags, rt);
1311 		if (err)
1312 			goto error;
1313 		return 0;
1314 	}
1315 
1316 	if (!skb)
1317 		goto alloc_new_skb;
1318 
1319 	while (length > 0) {
1320 		/* Check if the remaining data fits into current packet. */
1321 		copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1322 		if (copy < length)
1323 			copy = maxfraglen - skb->len;
1324 
1325 		if (copy <= 0) {
1326 			char *data;
1327 			unsigned int datalen;
1328 			unsigned int fraglen;
1329 			unsigned int fraggap;
1330 			unsigned int alloclen;
1331 alloc_new_skb:
1332 			/* There's no room in the current skb */
1333 			if (skb)
1334 				fraggap = skb->len - maxfraglen;
1335 			else
1336 				fraggap = 0;
1337 			/* update mtu and maxfraglen if necessary */
1338 			if (!skb || !skb_prev)
1339 				ip6_append_data_mtu(&mtu, &maxfraglen,
1340 						    fragheaderlen, skb, rt,
1341 						    orig_mtu);
1342 
1343 			skb_prev = skb;
1344 
1345 			/*
1346 			 * If remaining data exceeds the mtu,
1347 			 * we know we need more fragment(s).
1348 			 */
1349 			datalen = length + fraggap;
1350 
1351 			if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1352 				datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
1353 			if ((flags & MSG_MORE) &&
1354 			    !(rt->dst.dev->features&NETIF_F_SG))
1355 				alloclen = mtu;
1356 			else
1357 				alloclen = datalen + fragheaderlen;
1358 
1359 			alloclen += dst_exthdrlen;
1360 
1361 			if (datalen != length + fraggap) {
1362 				/*
1363 				 * this is not the last fragment; the trailer
1364 				 * space is regarded as data space.
1365 				 */
1366 				datalen += rt->dst.trailer_len;
1367 			}
1368 
1369 			alloclen += rt->dst.trailer_len;
1370 			fraglen = datalen + fragheaderlen;
1371 
1372 			/*
1373 			 * We just reserve space for the fragment header.
1374 			 * Note: this may be an overallocation if the message
1375 			 * (without MSG_MORE) fits into the MTU.
1376 			 */
1377 			alloclen += sizeof(struct frag_hdr);
1378 
1379 			if (transhdrlen) {
1380 				skb = sock_alloc_send_skb(sk,
1381 						alloclen + hh_len,
1382 						(flags & MSG_DONTWAIT), &err);
1383 			} else {
1384 				skb = NULL;
1385 				if (atomic_read(&sk->sk_wmem_alloc) <=
1386 				    2 * sk->sk_sndbuf)
1387 					skb = sock_wmalloc(sk,
1388 							   alloclen + hh_len, 1,
1389 							   sk->sk_allocation);
1390 				if (unlikely(!skb))
1391 					err = -ENOBUFS;
1392 			}
1393 			if (!skb)
1394 				goto error;
1395 			/*
1396 			 *	Fill in the control structures
1397 			 */
1398 			skb->protocol = htons(ETH_P_IPV6);
1399 			skb->ip_summed = csummode;
1400 			skb->csum = 0;
1401 			/* reserve for fragmentation and ipsec header */
1402 			skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1403 				    dst_exthdrlen);
1404 
1405 			/* Only the initial fragment is time stamped */
1406 			skb_shinfo(skb)->tx_flags = tx_flags;
1407 			tx_flags = 0;
1408 			skb_shinfo(skb)->tskey = tskey;
1409 			tskey = 0;
1410 
1411 			/*
1412 			 *	Find where to start putting bytes
1413 			 */
1414 			data = skb_put(skb, fraglen);
1415 			skb_set_network_header(skb, exthdrlen);
1416 			data += fragheaderlen;
1417 			skb->transport_header = (skb->network_header +
1418 						 fragheaderlen);
1419 			if (fraggap) {
1420 				skb->csum = skb_copy_and_csum_bits(
1421 					skb_prev, maxfraglen,
1422 					data + transhdrlen, fraggap, 0);
1423 				skb_prev->csum = csum_sub(skb_prev->csum,
1424 							  skb->csum);
1425 				data += fraggap;
1426 				pskb_trim_unique(skb_prev, maxfraglen);
1427 			}
1428 			copy = datalen - transhdrlen - fraggap;
1429 
1430 			if (copy < 0) {
1431 				err = -EINVAL;
1432 				kfree_skb(skb);
1433 				goto error;
1434 			} else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
1435 				err = -EFAULT;
1436 				kfree_skb(skb);
1437 				goto error;
1438 			}
1439 
1440 			offset += copy;
1441 			length -= datalen - fraggap;
1442 			transhdrlen = 0;
1443 			exthdrlen = 0;
1444 			dst_exthdrlen = 0;
1445 
1446 			/*
1447 			 * Put the packet on the pending queue
1448 			 */
1449 			__skb_queue_tail(queue, skb);
1450 			continue;
1451 		}
1452 
1453 		if (copy > length)
1454 			copy = length;
1455 
1456 		if (!(rt->dst.dev->features&NETIF_F_SG)) {
1457 			unsigned int off;
1458 
1459 			off = skb->len;
1460 			if (getfrag(from, skb_put(skb, copy),
1461 						offset, copy, off, skb) < 0) {
1462 				__skb_trim(skb, off);
1463 				err = -EFAULT;
1464 				goto error;
1465 			}
1466 		} else {
1467 			int i = skb_shinfo(skb)->nr_frags;
1468 
1469 			err = -ENOMEM;
1470 			if (!sk_page_frag_refill(sk, pfrag))
1471 				goto error;
1472 
1473 			if (!skb_can_coalesce(skb, i, pfrag->page,
1474 					      pfrag->offset)) {
1475 				err = -EMSGSIZE;
1476 				if (i == MAX_SKB_FRAGS)
1477 					goto error;
1478 
1479 				__skb_fill_page_desc(skb, i, pfrag->page,
1480 						     pfrag->offset, 0);
1481 				skb_shinfo(skb)->nr_frags = ++i;
1482 				get_page(pfrag->page);
1483 			}
1484 			copy = min_t(int, copy, pfrag->size - pfrag->offset);
1485 			if (getfrag(from,
1486 				    page_address(pfrag->page) + pfrag->offset,
1487 				    offset, copy, skb->len, skb) < 0)
1488 				goto error_efault;
1489 
1490 			pfrag->offset += copy;
1491 			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1492 			skb->len += copy;
1493 			skb->data_len += copy;
1494 			skb->truesize += copy;
1495 			atomic_add(copy, &sk->sk_wmem_alloc);
1496 		}
1497 		offset += copy;
1498 		length -= copy;
1499 	}
1500 
1501 	return 0;
1502 
1503 error_efault:
1504 	err = -EFAULT;
1505 error:
1506 	cork->length -= length;
1507 	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1508 	return err;
1509 }
1510 
1511 int ip6_append_data(struct sock *sk,
1512 		    int getfrag(void *from, char *to, int offset, int len,
1513 				int odd, struct sk_buff *skb),
1514 		    void *from, int length, int transhdrlen, int hlimit,
1515 		    int tclass, struct ipv6_txoptions *opt, struct flowi6 *fl6,
1516 		    struct rt6_info *rt, unsigned int flags, int dontfrag)
1517 {
1518 	struct inet_sock *inet = inet_sk(sk);
1519 	struct ipv6_pinfo *np = inet6_sk(sk);
1520 	int exthdrlen;
1521 	int err;
1522 
1523 	if (flags&MSG_PROBE)
1524 		return 0;
1525 	if (skb_queue_empty(&sk->sk_write_queue)) {
1526 		/*
1527 		 * setup for corking
1528 		 */
1529 		err = ip6_setup_cork(sk, &inet->cork, &np->cork, hlimit,
1530 				     tclass, opt, rt, fl6);
1531 		if (err)
1532 			return err;
1533 
1534 		exthdrlen = (opt ? opt->opt_flen : 0);
1535 		length += exthdrlen;
1536 		transhdrlen += exthdrlen;
1537 	} else {
1538 		fl6 = &inet->cork.fl.u.ip6;
1539 		transhdrlen = 0;
1540 	}
1541 
1542 	return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base,
1543 				 &np->cork, sk_page_frag(sk), getfrag,
1544 				 from, length, transhdrlen, flags, dontfrag);
1545 }
1546 EXPORT_SYMBOL_GPL(ip6_append_data);
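
/*
 * Hedged usage sketch (editorial, not part of this file): datagram
 * senders typically cork payload with ip6_append_data() and then emit
 * it with ip6_push_pending_frames(), flushing the queue on error. The
 * hop limit (64), traffic class (0) and helper name are illustrative;
 * ip_generic_getfrag() (declared in net/ip.h) is the stock getfrag
 * callback that copies from a struct msghdr.
 */
static int example_send_datagram(struct sock *sk, struct msghdr *msg,
				 size_t len, struct flowi6 *fl6,
				 struct rt6_info *rt)
{
	int err;

	lock_sock(sk);
	err = ip6_append_data(sk, ip_generic_getfrag, msg, len, 0,
			      64, 0, NULL, fl6, rt, MSG_DONTWAIT, 0);
	if (err)
		ip6_flush_pending_frames(sk);
	else
		err = ip6_push_pending_frames(sk);
	release_sock(sk);
	return err;
}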
1547 
1548 static void ip6_cork_release(struct inet_cork_full *cork,
1549 			     struct inet6_cork *v6_cork)
1550 {
1551 	if (v6_cork->opt) {
1552 		kfree(v6_cork->opt->dst0opt);
1553 		kfree(v6_cork->opt->dst1opt);
1554 		kfree(v6_cork->opt->hopopt);
1555 		kfree(v6_cork->opt->srcrt);
1556 		kfree(v6_cork->opt);
1557 		v6_cork->opt = NULL;
1558 	}
1559 
1560 	if (cork->base.dst) {
1561 		dst_release(cork->base.dst);
1562 		cork->base.dst = NULL;
1563 		cork->base.flags &= ~IPCORK_ALLFRAG;
1564 	}
1565 	memset(&cork->fl, 0, sizeof(cork->fl));
1566 }
1567 
1568 struct sk_buff *__ip6_make_skb(struct sock *sk,
1569 			       struct sk_buff_head *queue,
1570 			       struct inet_cork_full *cork,
1571 			       struct inet6_cork *v6_cork)
1572 {
1573 	struct sk_buff *skb, *tmp_skb;
1574 	struct sk_buff **tail_skb;
1575 	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1576 	struct ipv6_pinfo *np = inet6_sk(sk);
1577 	struct net *net = sock_net(sk);
1578 	struct ipv6hdr *hdr;
1579 	struct ipv6_txoptions *opt = v6_cork->opt;
1580 	struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
1581 	struct flowi6 *fl6 = &cork->fl.u.ip6;
1582 	unsigned char proto = fl6->flowi6_proto;
1583 
1584 	skb = __skb_dequeue(queue);
1585 	if (!skb)
1586 		goto out;
1587 	tail_skb = &(skb_shinfo(skb)->frag_list);
1588 
1589 	/* move skb->data to ip header from ext header */
1590 	if (skb->data < skb_network_header(skb))
1591 		__skb_pull(skb, skb_network_offset(skb));
1592 	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
1593 		__skb_pull(tmp_skb, skb_network_header_len(skb));
1594 		*tail_skb = tmp_skb;
1595 		tail_skb = &(tmp_skb->next);
1596 		skb->len += tmp_skb->len;
1597 		skb->data_len += tmp_skb->len;
1598 		skb->truesize += tmp_skb->truesize;
1599 		tmp_skb->destructor = NULL;
1600 		tmp_skb->sk = NULL;
1601 	}
1602 
1603 	/* Allow local fragmentation. */
1604 	skb->ignore_df = ip6_sk_ignore_df(sk);
1605 
1606 	*final_dst = fl6->daddr;
1607 	__skb_pull(skb, skb_network_header_len(skb));
1608 	if (opt && opt->opt_flen)
1609 		ipv6_push_frag_opts(skb, opt, &proto);
1610 	if (opt && opt->opt_nflen)
1611 		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);
1612 
1613 	skb_push(skb, sizeof(struct ipv6hdr));
1614 	skb_reset_network_header(skb);
1615 	hdr = ipv6_hdr(skb);
1616 
1617 	ip6_flow_hdr(hdr, v6_cork->tclass,
1618 		     ip6_make_flowlabel(net, skb, fl6->flowlabel,
1619 					np->autoflowlabel));
1620 	hdr->hop_limit = v6_cork->hop_limit;
1621 	hdr->nexthdr = proto;
1622 	hdr->saddr = fl6->saddr;
1623 	hdr->daddr = *final_dst;
1624 
1625 	skb->priority = sk->sk_priority;
1626 	skb->mark = sk->sk_mark;
1627 
1628 	skb_dst_set(skb, dst_clone(&rt->dst));
1629 	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1630 	if (proto == IPPROTO_ICMPV6) {
1631 		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1632 
1633 		ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
1634 		ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
1635 	}
1636 
1637 	ip6_cork_release(cork, v6_cork);
1638 out:
1639 	return skb;
1640 }
1641 
1642 int ip6_send_skb(struct sk_buff *skb)
1643 {
1644 	struct net *net = sock_net(skb->sk);
1645 	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
1646 	int err;
1647 
1648 	err = ip6_local_out(skb);
1649 	if (err) {
1650 		if (err > 0)
1651 			err = net_xmit_errno(err);
1652 		if (err)
1653 			IP6_INC_STATS(net, rt->rt6i_idev,
1654 				      IPSTATS_MIB_OUTDISCARDS);
1655 	}
1656 
1657 	return err;
1658 }
1659 
1660 int ip6_push_pending_frames(struct sock *sk)
1661 {
1662 	struct sk_buff *skb;
1663 
1664 	skb = ip6_finish_skb(sk);
1665 	if (!skb)
1666 		return 0;
1667 
1668 	return ip6_send_skb(skb);
1669 }
1670 EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
1671 
1672 static void __ip6_flush_pending_frames(struct sock *sk,
1673 				       struct sk_buff_head *queue,
1674 				       struct inet_cork_full *cork,
1675 				       struct inet6_cork *v6_cork)
1676 {
1677 	struct sk_buff *skb;
1678 
1679 	while ((skb = __skb_dequeue_tail(queue)) != NULL) {
1680 		if (skb_dst(skb))
1681 			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1682 				      IPSTATS_MIB_OUTDISCARDS);
1683 		kfree_skb(skb);
1684 	}
1685 
1686 	ip6_cork_release(cork, v6_cork);
1687 }
1688 
1689 void ip6_flush_pending_frames(struct sock *sk)
1690 {
1691 	__ip6_flush_pending_frames(sk, &sk->sk_write_queue,
1692 				   &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
1693 }
1694 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
1695 
1696 struct sk_buff *ip6_make_skb(struct sock *sk,
1697 			     int getfrag(void *from, char *to, int offset,
1698 					 int len, int odd, struct sk_buff *skb),
1699 			     void *from, int length, int transhdrlen,
1700 			     int hlimit, int tclass,
1701 			     struct ipv6_txoptions *opt, struct flowi6 *fl6,
1702 			     struct rt6_info *rt, unsigned int flags,
1703 			     int dontfrag)
1704 {
1705 	struct inet_cork_full cork;
1706 	struct inet6_cork v6_cork;
1707 	struct sk_buff_head queue;
1708 	int exthdrlen = (opt ? opt->opt_flen : 0);
1709 	int err;
1710 
1711 	if (flags & MSG_PROBE)
1712 		return NULL;
1713 
1714 	__skb_queue_head_init(&queue);
1715 
1716 	cork.base.flags = 0;
1717 	cork.base.addr = 0;
1718 	cork.base.opt = NULL;
1719 	v6_cork.opt = NULL;
1720 	err = ip6_setup_cork(sk, &cork, &v6_cork, hlimit, tclass, opt, rt, fl6);
1721 	if (err)
1722 		return ERR_PTR(err);
1723 
1724 	if (dontfrag < 0)
1725 		dontfrag = inet6_sk(sk)->dontfrag;
1726 
1727 	err = __ip6_append_data(sk, fl6, &queue, &cork.base, &v6_cork,
1728 				&current->task_frag, getfrag, from,
1729 				length + exthdrlen, transhdrlen + exthdrlen,
1730 				flags, dontfrag);
1731 	if (err) {
1732 		__ip6_flush_pending_frames(sk, &queue, &cork, &v6_cork);
1733 		return ERR_PTR(err);
1734 	}
1735 
1736 	return __ip6_make_skb(sk, &queue, &cork, &v6_cork);
1737 }
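
/*
 * Hedged usage sketch (editorial): ip6_make_skb() is the lockless,
 * single-shot counterpart of the append/push pair above; a caller
 * builds the whole datagram in one call and transmits it with
 * ip6_send_skb(). Hop limit, traffic class and the helper name are
 * illustrative; ip_generic_getfrag() is declared in net/ip.h.
 */
static int example_send_oneshot(struct sock *sk, struct msghdr *msg,
				size_t len, struct flowi6 *fl6,
				struct rt6_info *rt)
{
	struct sk_buff *skb;

	skb = ip6_make_skb(sk, ip_generic_getfrag, msg, len, 0,
			   64, 0, NULL, fl6, rt, MSG_DONTWAIT, -1);
	if (IS_ERR_OR_NULL(skb))
		return PTR_ERR(skb);	/* NULL (MSG_PROBE) yields 0 */

	return ip6_send_skb(skb);
}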
1738