xref: /openbmc/linux/net/ipv6/ip6_output.c (revision f42b3800)
1 /*
2  *	IPv6 output functions
3  *	Linux INET6 implementation
4  *
5  *	Authors:
6  *	Pedro Roque		<roque@di.fc.ul.pt>
7  *
8  *	$Id: ip6_output.c,v 1.34 2002/02/01 22:01:04 davem Exp $
9  *
10  *	Based on linux/net/ipv4/ip_output.c
11  *
12  *	This program is free software; you can redistribute it and/or
13  *      modify it under the terms of the GNU General Public License
14  *      as published by the Free Software Foundation; either version
15  *      2 of the License, or (at your option) any later version.
16  *
17  *	Changes:
18  *	A.N.Kuznetsov	:	airthmetics in fragmentation.
19  *				extension headers are implemented.
20  *				route changes now work.
21  *				ip6_forward does not confuse sniffers.
22  *				etc.
23  *
24  *      H. von Brand    :       Added missing #include <linux/string.h>
25  *	Imran Patel	: 	frag id should be in NBO
26  *      Kazunori MIYAZAWA @USAGI
27  *			:       add ip6_append_data and related functions
28  *				for datagram xmit
29  */
30 
31 #include <linux/errno.h>
32 #include <linux/kernel.h>
33 #include <linux/string.h>
34 #include <linux/socket.h>
35 #include <linux/net.h>
36 #include <linux/netdevice.h>
37 #include <linux/if_arp.h>
38 #include <linux/in6.h>
39 #include <linux/tcp.h>
40 #include <linux/route.h>
41 #include <linux/module.h>
42 
43 #include <linux/netfilter.h>
44 #include <linux/netfilter_ipv6.h>
45 
46 #include <net/sock.h>
47 #include <net/snmp.h>
48 
49 #include <net/ipv6.h>
50 #include <net/ndisc.h>
51 #include <net/protocol.h>
52 #include <net/ip6_route.h>
53 #include <net/addrconf.h>
54 #include <net/rawv6.h>
55 #include <net/icmp.h>
56 #include <net/xfrm.h>
57 #include <net/checksum.h>
58 #include <linux/mroute6.h>
59 
60 static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));
61 
62 static __inline__ void ipv6_select_ident(struct sk_buff *skb, struct frag_hdr *fhdr)
63 {
64 	static u32 ipv6_fragmentation_id = 1;
65 	static DEFINE_SPINLOCK(ip6_id_lock);
66 
67 	spin_lock_bh(&ip6_id_lock);
68 	fhdr->identification = htonl(ipv6_fragmentation_id);
69 	if (++ipv6_fragmentation_id == 0)
70 		ipv6_fragmentation_id = 1;
71 	spin_unlock_bh(&ip6_id_lock);
72 }
73 
74 int __ip6_local_out(struct sk_buff *skb)
75 {
76 	int len;
77 
78 	len = skb->len - sizeof(struct ipv6hdr);
79 	if (len > IPV6_MAXPLEN)
80 		len = 0;
81 	ipv6_hdr(skb)->payload_len = htons(len);
82 
83 	return nf_hook(PF_INET6, NF_INET_LOCAL_OUT, skb, NULL, skb->dst->dev,
84 		       dst_output);
85 }
86 
87 int ip6_local_out(struct sk_buff *skb)
88 {
89 	int err;
90 
91 	err = __ip6_local_out(skb);
92 	if (likely(err == 1))
93 		err = dst_output(skb);
94 
95 	return err;
96 }
97 EXPORT_SYMBOL_GPL(ip6_local_out);
98 
99 static int ip6_output_finish(struct sk_buff *skb)
100 {
101 	struct dst_entry *dst = skb->dst;
102 
103 	if (dst->hh)
104 		return neigh_hh_output(dst->hh, skb);
105 	else if (dst->neighbour)
106 		return dst->neighbour->output(skb);
107 
108 	IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
109 	kfree_skb(skb);
110 	return -EINVAL;
111 
112 }
113 
114 /* dev_loopback_xmit for use with netfilter. */
115 static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
116 {
117 	skb_reset_mac_header(newskb);
118 	__skb_pull(newskb, skb_network_offset(newskb));
119 	newskb->pkt_type = PACKET_LOOPBACK;
120 	newskb->ip_summed = CHECKSUM_UNNECESSARY;
121 	BUG_TRAP(newskb->dst);
122 
123 	netif_rx(newskb);
124 	return 0;
125 }
126 
127 
128 static int ip6_output2(struct sk_buff *skb)
129 {
130 	struct dst_entry *dst = skb->dst;
131 	struct net_device *dev = dst->dev;
132 
133 	skb->protocol = htons(ETH_P_IPV6);
134 	skb->dev = dev;
135 
136 	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
137 		struct ipv6_pinfo* np = skb->sk ? inet6_sk(skb->sk) : NULL;
138 		struct inet6_dev *idev = ip6_dst_idev(skb->dst);
139 
140 		if (!(dev->flags & IFF_LOOPBACK) && (!np || np->mc_loop) &&
141 		    ((mroute6_socket && !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
142 		     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
143 					 &ipv6_hdr(skb)->saddr))) {
144 			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
145 
146 			/* Do not check for IFF_ALLMULTI; multicast routing
147 			   is not supported in any case.
148 			 */
149 			if (newskb)
150 				NF_HOOK(PF_INET6, NF_INET_POST_ROUTING, newskb,
151 					NULL, newskb->dev,
152 					ip6_dev_loopback_xmit);
153 
154 			if (ipv6_hdr(skb)->hop_limit == 0) {
155 				IP6_INC_STATS(idev, IPSTATS_MIB_OUTDISCARDS);
156 				kfree_skb(skb);
157 				return 0;
158 			}
159 		}
160 
161 		IP6_INC_STATS(idev, IPSTATS_MIB_OUTMCASTPKTS);
162 	}
163 
164 	return NF_HOOK(PF_INET6, NF_INET_POST_ROUTING, skb, NULL, skb->dev,
165 		       ip6_output_finish);
166 }
167 
168 static inline int ip6_skb_dst_mtu(struct sk_buff *skb)
169 {
170 	struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
171 
172 	return (np && np->pmtudisc == IPV6_PMTUDISC_PROBE) ?
173 	       skb->dst->dev->mtu : dst_mtu(skb->dst);
174 }
175 
176 int ip6_output(struct sk_buff *skb)
177 {
178 	if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
179 				dst_allfrag(skb->dst))
180 		return ip6_fragment(skb, ip6_output2);
181 	else
182 		return ip6_output2(skb);
183 }
184 
185 /*
186  *	xmit an sk_buff (used by TCP)
187  */
188 
189 int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
190 	     struct ipv6_txoptions *opt, int ipfragok)
191 {
192 	struct ipv6_pinfo *np = inet6_sk(sk);
193 	struct in6_addr *first_hop = &fl->fl6_dst;
194 	struct dst_entry *dst = skb->dst;
195 	struct ipv6hdr *hdr;
196 	u8  proto = fl->proto;
197 	int seg_len = skb->len;
198 	int hlimit, tclass;
199 	u32 mtu;
200 
201 	if (opt) {
202 		unsigned int head_room;
203 
204 		/* First: exthdrs may take lots of space (~8K for now)
205 		   MAX_HEADER is not enough.
206 		 */
207 		head_room = opt->opt_nflen + opt->opt_flen;
208 		seg_len += head_room;
209 		head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
210 
211 		if (skb_headroom(skb) < head_room) {
212 			struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
213 			if (skb2 == NULL) {
214 				IP6_INC_STATS(ip6_dst_idev(skb->dst),
215 					      IPSTATS_MIB_OUTDISCARDS);
216 				kfree_skb(skb);
217 				return -ENOBUFS;
218 			}
219 			kfree_skb(skb);
220 			skb = skb2;
221 			if (sk)
222 				skb_set_owner_w(skb, sk);
223 		}
224 		if (opt->opt_flen)
225 			ipv6_push_frag_opts(skb, opt, &proto);
226 		if (opt->opt_nflen)
227 			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
228 	}
229 
230 	skb_push(skb, sizeof(struct ipv6hdr));
231 	skb_reset_network_header(skb);
232 	hdr = ipv6_hdr(skb);
233 
234 	/*
235 	 *	Fill in the IPv6 header
236 	 */
237 
238 	hlimit = -1;
239 	if (np)
240 		hlimit = np->hop_limit;
241 	if (hlimit < 0)
242 		hlimit = ip6_dst_hoplimit(dst);
243 
244 	tclass = -1;
245 	if (np)
246 		tclass = np->tclass;
247 	if (tclass < 0)
248 		tclass = 0;
249 
250 	*(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl->fl6_flowlabel;
251 
252 	hdr->payload_len = htons(seg_len);
253 	hdr->nexthdr = proto;
254 	hdr->hop_limit = hlimit;
255 
256 	ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
257 	ipv6_addr_copy(&hdr->daddr, first_hop);
258 
259 	skb->priority = sk->sk_priority;
260 	skb->mark = sk->sk_mark;
261 
262 	mtu = dst_mtu(dst);
263 	if ((skb->len <= mtu) || ipfragok || skb_is_gso(skb)) {
264 		IP6_INC_STATS(ip6_dst_idev(skb->dst),
265 			      IPSTATS_MIB_OUTREQUESTS);
266 		return NF_HOOK(PF_INET6, NF_INET_LOCAL_OUT, skb, NULL, dst->dev,
267 				dst_output);
268 	}
269 
270 	if (net_ratelimit())
271 		printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
272 	skb->dev = dst->dev;
273 	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
274 	IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_FRAGFAILS);
275 	kfree_skb(skb);
276 	return -EMSGSIZE;
277 }
278 
279 EXPORT_SYMBOL(ip6_xmit);
280 
281 /*
282  *	To avoid extra problems ND packets are send through this
283  *	routine. It's code duplication but I really want to avoid
284  *	extra checks since ipv6_build_header is used by TCP (which
285  *	is for us performance critical)
286  */
287 
288 int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
289 	       const struct in6_addr *saddr, const struct in6_addr *daddr,
290 	       int proto, int len)
291 {
292 	struct ipv6_pinfo *np = inet6_sk(sk);
293 	struct ipv6hdr *hdr;
294 	int totlen;
295 
296 	skb->protocol = htons(ETH_P_IPV6);
297 	skb->dev = dev;
298 
299 	totlen = len + sizeof(struct ipv6hdr);
300 
301 	skb_reset_network_header(skb);
302 	skb_put(skb, sizeof(struct ipv6hdr));
303 	hdr = ipv6_hdr(skb);
304 
305 	*(__be32*)hdr = htonl(0x60000000);
306 
307 	hdr->payload_len = htons(len);
308 	hdr->nexthdr = proto;
309 	hdr->hop_limit = np->hop_limit;
310 
311 	ipv6_addr_copy(&hdr->saddr, saddr);
312 	ipv6_addr_copy(&hdr->daddr, daddr);
313 
314 	return 0;
315 }
316 
317 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
318 {
319 	struct ip6_ra_chain *ra;
320 	struct sock *last = NULL;
321 
322 	read_lock(&ip6_ra_lock);
323 	for (ra = ip6_ra_chain; ra; ra = ra->next) {
324 		struct sock *sk = ra->sk;
325 		if (sk && ra->sel == sel &&
326 		    (!sk->sk_bound_dev_if ||
327 		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
328 			if (last) {
329 				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
330 				if (skb2)
331 					rawv6_rcv(last, skb2);
332 			}
333 			last = sk;
334 		}
335 	}
336 
337 	if (last) {
338 		rawv6_rcv(last, skb);
339 		read_unlock(&ip6_ra_lock);
340 		return 1;
341 	}
342 	read_unlock(&ip6_ra_lock);
343 	return 0;
344 }
345 
346 static int ip6_forward_proxy_check(struct sk_buff *skb)
347 {
348 	struct ipv6hdr *hdr = ipv6_hdr(skb);
349 	u8 nexthdr = hdr->nexthdr;
350 	int offset;
351 
352 	if (ipv6_ext_hdr(nexthdr)) {
353 		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr);
354 		if (offset < 0)
355 			return 0;
356 	} else
357 		offset = sizeof(struct ipv6hdr);
358 
359 	if (nexthdr == IPPROTO_ICMPV6) {
360 		struct icmp6hdr *icmp6;
361 
362 		if (!pskb_may_pull(skb, (skb_network_header(skb) +
363 					 offset + 1 - skb->data)))
364 			return 0;
365 
366 		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
367 
368 		switch (icmp6->icmp6_type) {
369 		case NDISC_ROUTER_SOLICITATION:
370 		case NDISC_ROUTER_ADVERTISEMENT:
371 		case NDISC_NEIGHBOUR_SOLICITATION:
372 		case NDISC_NEIGHBOUR_ADVERTISEMENT:
373 		case NDISC_REDIRECT:
374 			/* For reaction involving unicast neighbor discovery
375 			 * message destined to the proxied address, pass it to
376 			 * input function.
377 			 */
378 			return 1;
379 		default:
380 			break;
381 		}
382 	}
383 
384 	/*
385 	 * The proxying router can't forward traffic sent to a link-local
386 	 * address, so signal the sender and discard the packet. This
387 	 * behavior is clarified by the MIPv6 specification.
388 	 */
389 	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
390 		dst_link_failure(skb);
391 		return -1;
392 	}
393 
394 	return 0;
395 }
396 
397 static inline int ip6_forward_finish(struct sk_buff *skb)
398 {
399 	return dst_output(skb);
400 }
401 
402 int ip6_forward(struct sk_buff *skb)
403 {
404 	struct dst_entry *dst = skb->dst;
405 	struct ipv6hdr *hdr = ipv6_hdr(skb);
406 	struct inet6_skb_parm *opt = IP6CB(skb);
407 	struct net *net = dev_net(dst->dev);
408 
409 	if (ipv6_devconf.forwarding == 0)
410 		goto error;
411 
412 	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
413 		IP6_INC_STATS(ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
414 		goto drop;
415 	}
416 
417 	skb_forward_csum(skb);
418 
419 	/*
420 	 *	We DO NOT make any processing on
421 	 *	RA packets, pushing them to user level AS IS
422 	 *	without ane WARRANTY that application will be able
423 	 *	to interpret them. The reason is that we
424 	 *	cannot make anything clever here.
425 	 *
426 	 *	We are not end-node, so that if packet contains
427 	 *	AH/ESP, we cannot make anything.
428 	 *	Defragmentation also would be mistake, RA packets
429 	 *	cannot be fragmented, because there is no warranty
430 	 *	that different fragments will go along one path. --ANK
431 	 */
432 	if (opt->ra) {
433 		u8 *ptr = skb_network_header(skb) + opt->ra;
434 		if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
435 			return 0;
436 	}
437 
438 	/*
439 	 *	check and decrement ttl
440 	 */
441 	if (hdr->hop_limit <= 1) {
442 		/* Force OUTPUT device used as source address */
443 		skb->dev = dst->dev;
444 		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT,
445 			    0, skb->dev);
446 		IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);
447 
448 		kfree_skb(skb);
449 		return -ETIMEDOUT;
450 	}
451 
452 	/* XXX: idev->cnf.proxy_ndp? */
453 	if (ipv6_devconf.proxy_ndp &&
454 	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
455 		int proxied = ip6_forward_proxy_check(skb);
456 		if (proxied > 0)
457 			return ip6_input(skb);
458 		else if (proxied < 0) {
459 			IP6_INC_STATS(ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
460 			goto drop;
461 		}
462 	}
463 
464 	if (!xfrm6_route_forward(skb)) {
465 		IP6_INC_STATS(ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
466 		goto drop;
467 	}
468 	dst = skb->dst;
469 
470 	/* IPv6 specs say nothing about it, but it is clear that we cannot
471 	   send redirects to source routed frames.
472 	   We don't send redirects to frames decapsulated from IPsec.
473 	 */
474 	if (skb->dev == dst->dev && dst->neighbour && opt->srcrt == 0 &&
475 	    !skb->sp) {
476 		struct in6_addr *target = NULL;
477 		struct rt6_info *rt;
478 		struct neighbour *n = dst->neighbour;
479 
480 		/*
481 		 *	incoming and outgoing devices are the same
482 		 *	send a redirect.
483 		 */
484 
485 		rt = (struct rt6_info *) dst;
486 		if ((rt->rt6i_flags & RTF_GATEWAY))
487 			target = (struct in6_addr*)&n->primary_key;
488 		else
489 			target = &hdr->daddr;
490 
491 		/* Limit redirects both by destination (here)
492 		   and by source (inside ndisc_send_redirect)
493 		 */
494 		if (xrlim_allow(dst, 1*HZ))
495 			ndisc_send_redirect(skb, n, target);
496 	} else {
497 		int addrtype = ipv6_addr_type(&hdr->saddr);
498 
499 		/* This check is security critical. */
500 		if (addrtype & (IPV6_ADDR_MULTICAST|IPV6_ADDR_LOOPBACK))
501 			goto error;
502 		if (addrtype & IPV6_ADDR_LINKLOCAL) {
503 			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
504 				ICMPV6_NOT_NEIGHBOUR, 0, skb->dev);
505 			goto error;
506 		}
507 	}
508 
509 	if (skb->len > dst_mtu(dst)) {
510 		/* Again, force OUTPUT device used as source address */
511 		skb->dev = dst->dev;
512 		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, dst_mtu(dst), skb->dev);
513 		IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS);
514 		IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS);
515 		kfree_skb(skb);
516 		return -EMSGSIZE;
517 	}
518 
519 	if (skb_cow(skb, dst->dev->hard_header_len)) {
520 		IP6_INC_STATS(ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
521 		goto drop;
522 	}
523 
524 	hdr = ipv6_hdr(skb);
525 
526 	/* Mangling hops number delayed to point after skb COW */
527 
528 	hdr->hop_limit--;
529 
530 	IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
531 	return NF_HOOK(PF_INET6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
532 		       ip6_forward_finish);
533 
534 error:
535 	IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
536 drop:
537 	kfree_skb(skb);
538 	return -EINVAL;
539 }
540 
541 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
542 {
543 	to->pkt_type = from->pkt_type;
544 	to->priority = from->priority;
545 	to->protocol = from->protocol;
546 	dst_release(to->dst);
547 	to->dst = dst_clone(from->dst);
548 	to->dev = from->dev;
549 	to->mark = from->mark;
550 
551 #ifdef CONFIG_NET_SCHED
552 	to->tc_index = from->tc_index;
553 #endif
554 	nf_copy(to, from);
555 #if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
556     defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
557 	to->nf_trace = from->nf_trace;
558 #endif
559 	skb_copy_secmark(to, from);
560 }
561 
562 int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
563 {
564 	u16 offset = sizeof(struct ipv6hdr);
565 	struct ipv6_opt_hdr *exthdr =
566 				(struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1);
567 	unsigned int packet_len = skb->tail - skb->network_header;
568 	int found_rhdr = 0;
569 	*nexthdr = &ipv6_hdr(skb)->nexthdr;
570 
571 	while (offset + 1 <= packet_len) {
572 
573 		switch (**nexthdr) {
574 
575 		case NEXTHDR_HOP:
576 			break;
577 		case NEXTHDR_ROUTING:
578 			found_rhdr = 1;
579 			break;
580 		case NEXTHDR_DEST:
581 #if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
582 			if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0)
583 				break;
584 #endif
585 			if (found_rhdr)
586 				return offset;
587 			break;
588 		default :
589 			return offset;
590 		}
591 
592 		offset += ipv6_optlen(exthdr);
593 		*nexthdr = &exthdr->nexthdr;
594 		exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) +
595 						 offset);
596 	}
597 
598 	return offset;
599 }
600 
601 static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
602 {
603 	struct net_device *dev;
604 	struct sk_buff *frag;
605 	struct rt6_info *rt = (struct rt6_info*)skb->dst;
606 	struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
607 	struct ipv6hdr *tmp_hdr;
608 	struct frag_hdr *fh;
609 	unsigned int mtu, hlen, left, len;
610 	__be32 frag_id = 0;
611 	int ptr, offset = 0, err=0;
612 	u8 *prevhdr, nexthdr = 0;
613 
614 	dev = rt->u.dst.dev;
615 	hlen = ip6_find_1stfragopt(skb, &prevhdr);
616 	nexthdr = *prevhdr;
617 
618 	mtu = ip6_skb_dst_mtu(skb);
619 
620 	/* We must not fragment if the socket is set to force MTU discovery
621 	 * or if the skb it not generated by a local socket.  (This last
622 	 * check should be redundant, but it's free.)
623 	 */
624 	if (!skb->local_df) {
625 		skb->dev = skb->dst->dev;
626 		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
627 		IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_FRAGFAILS);
628 		kfree_skb(skb);
629 		return -EMSGSIZE;
630 	}
631 
632 	if (np && np->frag_size < mtu) {
633 		if (np->frag_size)
634 			mtu = np->frag_size;
635 	}
636 	mtu -= hlen + sizeof(struct frag_hdr);
637 
638 	if (skb_shinfo(skb)->frag_list) {
639 		int first_len = skb_pagelen(skb);
640 		int truesizes = 0;
641 
642 		if (first_len - hlen > mtu ||
643 		    ((first_len - hlen) & 7) ||
644 		    skb_cloned(skb))
645 			goto slow_path;
646 
647 		for (frag = skb_shinfo(skb)->frag_list; frag; frag = frag->next) {
648 			/* Correct geometry. */
649 			if (frag->len > mtu ||
650 			    ((frag->len & 7) && frag->next) ||
651 			    skb_headroom(frag) < hlen)
652 			    goto slow_path;
653 
654 			/* Partially cloned skb? */
655 			if (skb_shared(frag))
656 				goto slow_path;
657 
658 			BUG_ON(frag->sk);
659 			if (skb->sk) {
660 				sock_hold(skb->sk);
661 				frag->sk = skb->sk;
662 				frag->destructor = sock_wfree;
663 				truesizes += frag->truesize;
664 			}
665 		}
666 
667 		err = 0;
668 		offset = 0;
669 		frag = skb_shinfo(skb)->frag_list;
670 		skb_shinfo(skb)->frag_list = NULL;
671 		/* BUILD HEADER */
672 
673 		*prevhdr = NEXTHDR_FRAGMENT;
674 		tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
675 		if (!tmp_hdr) {
676 			IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_FRAGFAILS);
677 			return -ENOMEM;
678 		}
679 
680 		__skb_pull(skb, hlen);
681 		fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr));
682 		__skb_push(skb, hlen);
683 		skb_reset_network_header(skb);
684 		memcpy(skb_network_header(skb), tmp_hdr, hlen);
685 
686 		ipv6_select_ident(skb, fh);
687 		fh->nexthdr = nexthdr;
688 		fh->reserved = 0;
689 		fh->frag_off = htons(IP6_MF);
690 		frag_id = fh->identification;
691 
692 		first_len = skb_pagelen(skb);
693 		skb->data_len = first_len - skb_headlen(skb);
694 		skb->truesize -= truesizes;
695 		skb->len = first_len;
696 		ipv6_hdr(skb)->payload_len = htons(first_len -
697 						   sizeof(struct ipv6hdr));
698 
699 		dst_hold(&rt->u.dst);
700 
701 		for (;;) {
702 			/* Prepare header of the next frame,
703 			 * before previous one went down. */
704 			if (frag) {
705 				frag->ip_summed = CHECKSUM_NONE;
706 				skb_reset_transport_header(frag);
707 				fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr));
708 				__skb_push(frag, hlen);
709 				skb_reset_network_header(frag);
710 				memcpy(skb_network_header(frag), tmp_hdr,
711 				       hlen);
712 				offset += skb->len - hlen - sizeof(struct frag_hdr);
713 				fh->nexthdr = nexthdr;
714 				fh->reserved = 0;
715 				fh->frag_off = htons(offset);
716 				if (frag->next != NULL)
717 					fh->frag_off |= htons(IP6_MF);
718 				fh->identification = frag_id;
719 				ipv6_hdr(frag)->payload_len =
720 						htons(frag->len -
721 						      sizeof(struct ipv6hdr));
722 				ip6_copy_metadata(frag, skb);
723 			}
724 
725 			err = output(skb);
726 			if(!err)
727 				IP6_INC_STATS(ip6_dst_idev(&rt->u.dst), IPSTATS_MIB_FRAGCREATES);
728 
729 			if (err || !frag)
730 				break;
731 
732 			skb = frag;
733 			frag = skb->next;
734 			skb->next = NULL;
735 		}
736 
737 		kfree(tmp_hdr);
738 
739 		if (err == 0) {
740 			IP6_INC_STATS(ip6_dst_idev(&rt->u.dst), IPSTATS_MIB_FRAGOKS);
741 			dst_release(&rt->u.dst);
742 			return 0;
743 		}
744 
745 		while (frag) {
746 			skb = frag->next;
747 			kfree_skb(frag);
748 			frag = skb;
749 		}
750 
751 		IP6_INC_STATS(ip6_dst_idev(&rt->u.dst), IPSTATS_MIB_FRAGFAILS);
752 		dst_release(&rt->u.dst);
753 		return err;
754 	}
755 
756 slow_path:
757 	left = skb->len - hlen;		/* Space per frame */
758 	ptr = hlen;			/* Where to start from */
759 
760 	/*
761 	 *	Fragment the datagram.
762 	 */
763 
764 	*prevhdr = NEXTHDR_FRAGMENT;
765 
766 	/*
767 	 *	Keep copying data until we run out.
768 	 */
769 	while(left > 0)	{
770 		len = left;
771 		/* IF: it doesn't fit, use 'mtu' - the data space left */
772 		if (len > mtu)
773 			len = mtu;
774 		/* IF: we are not sending upto and including the packet end
775 		   then align the next start on an eight byte boundary */
776 		if (len < left)	{
777 			len &= ~7;
778 		}
779 		/*
780 		 *	Allocate buffer.
781 		 */
782 
783 		if ((frag = alloc_skb(len+hlen+sizeof(struct frag_hdr)+LL_RESERVED_SPACE(rt->u.dst.dev), GFP_ATOMIC)) == NULL) {
784 			NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
785 			IP6_INC_STATS(ip6_dst_idev(skb->dst),
786 				      IPSTATS_MIB_FRAGFAILS);
787 			err = -ENOMEM;
788 			goto fail;
789 		}
790 
791 		/*
792 		 *	Set up data on packet
793 		 */
794 
795 		ip6_copy_metadata(frag, skb);
796 		skb_reserve(frag, LL_RESERVED_SPACE(rt->u.dst.dev));
797 		skb_put(frag, len + hlen + sizeof(struct frag_hdr));
798 		skb_reset_network_header(frag);
799 		fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
800 		frag->transport_header = (frag->network_header + hlen +
801 					  sizeof(struct frag_hdr));
802 
803 		/*
804 		 *	Charge the memory for the fragment to any owner
805 		 *	it might possess
806 		 */
807 		if (skb->sk)
808 			skb_set_owner_w(frag, skb->sk);
809 
810 		/*
811 		 *	Copy the packet header into the new buffer.
812 		 */
813 		skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
814 
815 		/*
816 		 *	Build fragment header.
817 		 */
818 		fh->nexthdr = nexthdr;
819 		fh->reserved = 0;
820 		if (!frag_id) {
821 			ipv6_select_ident(skb, fh);
822 			frag_id = fh->identification;
823 		} else
824 			fh->identification = frag_id;
825 
826 		/*
827 		 *	Copy a block of the IP datagram.
828 		 */
829 		if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len))
830 			BUG();
831 		left -= len;
832 
833 		fh->frag_off = htons(offset);
834 		if (left > 0)
835 			fh->frag_off |= htons(IP6_MF);
836 		ipv6_hdr(frag)->payload_len = htons(frag->len -
837 						    sizeof(struct ipv6hdr));
838 
839 		ptr += len;
840 		offset += len;
841 
842 		/*
843 		 *	Put this fragment into the sending queue.
844 		 */
845 		err = output(frag);
846 		if (err)
847 			goto fail;
848 
849 		IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_FRAGCREATES);
850 	}
851 	IP6_INC_STATS(ip6_dst_idev(skb->dst),
852 		      IPSTATS_MIB_FRAGOKS);
853 	kfree_skb(skb);
854 	return err;
855 
856 fail:
857 	IP6_INC_STATS(ip6_dst_idev(skb->dst),
858 		      IPSTATS_MIB_FRAGFAILS);
859 	kfree_skb(skb);
860 	return err;
861 }
862 
863 static inline int ip6_rt_check(struct rt6key *rt_key,
864 			       struct in6_addr *fl_addr,
865 			       struct in6_addr *addr_cache)
866 {
867 	return ((rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
868 		(addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache)));
869 }
870 
871 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
872 					  struct dst_entry *dst,
873 					  struct flowi *fl)
874 {
875 	struct ipv6_pinfo *np = inet6_sk(sk);
876 	struct rt6_info *rt = (struct rt6_info *)dst;
877 
878 	if (!dst)
879 		goto out;
880 
881 	/* Yes, checking route validity in not connected
882 	 * case is not very simple. Take into account,
883 	 * that we do not support routing by source, TOS,
884 	 * and MSG_DONTROUTE 		--ANK (980726)
885 	 *
886 	 * 1. ip6_rt_check(): If route was host route,
887 	 *    check that cached destination is current.
888 	 *    If it is network route, we still may
889 	 *    check its validity using saved pointer
890 	 *    to the last used address: daddr_cache.
891 	 *    We do not want to save whole address now,
892 	 *    (because main consumer of this service
893 	 *    is tcp, which has not this problem),
894 	 *    so that the last trick works only on connected
895 	 *    sockets.
896 	 * 2. oif also should be the same.
897 	 */
898 	if (ip6_rt_check(&rt->rt6i_dst, &fl->fl6_dst, np->daddr_cache) ||
899 #ifdef CONFIG_IPV6_SUBTREES
900 	    ip6_rt_check(&rt->rt6i_src, &fl->fl6_src, np->saddr_cache) ||
901 #endif
902 	    (fl->oif && fl->oif != dst->dev->ifindex)) {
903 		dst_release(dst);
904 		dst = NULL;
905 	}
906 
907 out:
908 	return dst;
909 }
910 
911 static int ip6_dst_lookup_tail(struct sock *sk,
912 			       struct dst_entry **dst, struct flowi *fl)
913 {
914 	int err;
915 	struct net *net = sock_net(sk);
916 
917 	if (*dst == NULL)
918 		*dst = ip6_route_output(net, sk, fl);
919 
920 	if ((err = (*dst)->error))
921 		goto out_err_release;
922 
923 	if (ipv6_addr_any(&fl->fl6_src)) {
924 		err = ipv6_dev_get_saddr(ip6_dst_idev(*dst)->dev,
925 					 &fl->fl6_dst,
926 					 sk ? inet6_sk(sk)->srcprefs : 0,
927 					 &fl->fl6_src);
928 		if (err)
929 			goto out_err_release;
930 	}
931 
932 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
933 		/*
934 		 * Here if the dst entry we've looked up
935 		 * has a neighbour entry that is in the INCOMPLETE
936 		 * state and the src address from the flow is
937 		 * marked as OPTIMISTIC, we release the found
938 		 * dst entry and replace it instead with the
939 		 * dst entry of the nexthop router
940 		 */
941 		if (!((*dst)->neighbour->nud_state & NUD_VALID)) {
942 			struct inet6_ifaddr *ifp;
943 			struct flowi fl_gw;
944 			int redirect;
945 
946 			ifp = ipv6_get_ifaddr(net, &fl->fl6_src,
947 					      (*dst)->dev, 1);
948 
949 			redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
950 			if (ifp)
951 				in6_ifa_put(ifp);
952 
953 			if (redirect) {
954 				/*
955 				 * We need to get the dst entry for the
956 				 * default router instead
957 				 */
958 				dst_release(*dst);
959 				memcpy(&fl_gw, fl, sizeof(struct flowi));
960 				memset(&fl_gw.fl6_dst, 0, sizeof(struct in6_addr));
961 				*dst = ip6_route_output(net, sk, &fl_gw);
962 				if ((err = (*dst)->error))
963 					goto out_err_release;
964 			}
965 		}
966 #endif
967 
968 	return 0;
969 
970 out_err_release:
971 	if (err == -ENETUNREACH)
972 		IP6_INC_STATS_BH(NULL, IPSTATS_MIB_OUTNOROUTES);
973 	dst_release(*dst);
974 	*dst = NULL;
975 	return err;
976 }
977 
978 /**
979  *	ip6_dst_lookup - perform route lookup on flow
980  *	@sk: socket which provides route info
981  *	@dst: pointer to dst_entry * for result
982  *	@fl: flow to lookup
983  *
984  *	This function performs a route lookup on the given flow.
985  *
986  *	It returns zero on success, or a standard errno code on error.
987  */
988 int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
989 {
990 	*dst = NULL;
991 	return ip6_dst_lookup_tail(sk, dst, fl);
992 }
993 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
994 
995 /**
996  *	ip6_sk_dst_lookup - perform socket cached route lookup on flow
997  *	@sk: socket which provides the dst cache and route info
998  *	@dst: pointer to dst_entry * for result
999  *	@fl: flow to lookup
1000  *
1001  *	This function performs a route lookup on the given flow with the
1002  *	possibility of using the cached route in the socket if it is valid.
1003  *	It will take the socket dst lock when operating on the dst cache.
1004  *	As a result, this function can only be used in process context.
1005  *
1006  *	It returns zero on success, or a standard errno code on error.
1007  */
1008 int ip6_sk_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
1009 {
1010 	*dst = NULL;
1011 	if (sk) {
1012 		*dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1013 		*dst = ip6_sk_dst_check(sk, *dst, fl);
1014 	}
1015 
1016 	return ip6_dst_lookup_tail(sk, dst, fl);
1017 }
1018 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup);
1019 
1020 static inline int ip6_ufo_append_data(struct sock *sk,
1021 			int getfrag(void *from, char *to, int offset, int len,
1022 			int odd, struct sk_buff *skb),
1023 			void *from, int length, int hh_len, int fragheaderlen,
1024 			int transhdrlen, int mtu,unsigned int flags)
1025 
1026 {
1027 	struct sk_buff *skb;
1028 	int err;
1029 
1030 	/* There is support for UDP large send offload by network
1031 	 * device, so create one single skb packet containing complete
1032 	 * udp datagram
1033 	 */
1034 	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
1035 		skb = sock_alloc_send_skb(sk,
1036 			hh_len + fragheaderlen + transhdrlen + 20,
1037 			(flags & MSG_DONTWAIT), &err);
1038 		if (skb == NULL)
1039 			return -ENOMEM;
1040 
1041 		/* reserve space for Hardware header */
1042 		skb_reserve(skb, hh_len);
1043 
1044 		/* create space for UDP/IP header */
1045 		skb_put(skb,fragheaderlen + transhdrlen);
1046 
1047 		/* initialize network header pointer */
1048 		skb_reset_network_header(skb);
1049 
1050 		/* initialize protocol header pointer */
1051 		skb->transport_header = skb->network_header + fragheaderlen;
1052 
1053 		skb->ip_summed = CHECKSUM_PARTIAL;
1054 		skb->csum = 0;
1055 		sk->sk_sndmsg_off = 0;
1056 	}
1057 
1058 	err = skb_append_datato_frags(sk,skb, getfrag, from,
1059 				      (length - transhdrlen));
1060 	if (!err) {
1061 		struct frag_hdr fhdr;
1062 
1063 		/* specify the length of each IP datagram fragment*/
1064 		skb_shinfo(skb)->gso_size = mtu - fragheaderlen -
1065 					    sizeof(struct frag_hdr);
1066 		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
1067 		ipv6_select_ident(skb, &fhdr);
1068 		skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
1069 		__skb_queue_tail(&sk->sk_write_queue, skb);
1070 
1071 		return 0;
1072 	}
1073 	/* There is not enough support do UPD LSO,
1074 	 * so follow normal path
1075 	 */
1076 	kfree_skb(skb);
1077 
1078 	return err;
1079 }
1080 
1081 int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
1082 	int offset, int len, int odd, struct sk_buff *skb),
1083 	void *from, int length, int transhdrlen,
1084 	int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi *fl,
1085 	struct rt6_info *rt, unsigned int flags)
1086 {
1087 	struct inet_sock *inet = inet_sk(sk);
1088 	struct ipv6_pinfo *np = inet6_sk(sk);
1089 	struct sk_buff *skb;
1090 	unsigned int maxfraglen, fragheaderlen;
1091 	int exthdrlen;
1092 	int hh_len;
1093 	int mtu;
1094 	int copy;
1095 	int err;
1096 	int offset = 0;
1097 	int csummode = CHECKSUM_NONE;
1098 
1099 	if (flags&MSG_PROBE)
1100 		return 0;
1101 	if (skb_queue_empty(&sk->sk_write_queue)) {
1102 		/*
1103 		 * setup for corking
1104 		 */
1105 		if (opt) {
1106 			if (np->cork.opt == NULL) {
1107 				np->cork.opt = kmalloc(opt->tot_len,
1108 						       sk->sk_allocation);
1109 				if (unlikely(np->cork.opt == NULL))
1110 					return -ENOBUFS;
1111 			} else if (np->cork.opt->tot_len < opt->tot_len) {
1112 				printk(KERN_DEBUG "ip6_append_data: invalid option length\n");
1113 				return -EINVAL;
1114 			}
1115 			memcpy(np->cork.opt, opt, opt->tot_len);
1116 			inet->cork.flags |= IPCORK_OPT;
1117 			/* need source address above miyazawa*/
1118 		}
1119 		dst_hold(&rt->u.dst);
1120 		inet->cork.dst = &rt->u.dst;
1121 		inet->cork.fl = *fl;
1122 		np->cork.hop_limit = hlimit;
1123 		np->cork.tclass = tclass;
1124 		mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
1125 		      rt->u.dst.dev->mtu : dst_mtu(rt->u.dst.path);
1126 		if (np->frag_size < mtu) {
1127 			if (np->frag_size)
1128 				mtu = np->frag_size;
1129 		}
1130 		inet->cork.fragsize = mtu;
1131 		if (dst_allfrag(rt->u.dst.path))
1132 			inet->cork.flags |= IPCORK_ALLFRAG;
1133 		inet->cork.length = 0;
1134 		sk->sk_sndmsg_page = NULL;
1135 		sk->sk_sndmsg_off = 0;
1136 		exthdrlen = rt->u.dst.header_len + (opt ? opt->opt_flen : 0) -
1137 			    rt->rt6i_nfheader_len;
1138 		length += exthdrlen;
1139 		transhdrlen += exthdrlen;
1140 	} else {
1141 		rt = (struct rt6_info *)inet->cork.dst;
1142 		fl = &inet->cork.fl;
1143 		if (inet->cork.flags & IPCORK_OPT)
1144 			opt = np->cork.opt;
1145 		transhdrlen = 0;
1146 		exthdrlen = 0;
1147 		mtu = inet->cork.fragsize;
1148 	}
1149 
1150 	hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
1151 
1152 	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1153 			(opt ? opt->opt_nflen : 0);
1154 	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);
1155 
1156 	if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
1157 		if (inet->cork.length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
1158 			ipv6_local_error(sk, EMSGSIZE, fl, mtu-exthdrlen);
1159 			return -EMSGSIZE;
1160 		}
1161 	}
1162 
1163 	/*
1164 	 * Let's try using as much space as possible.
1165 	 * Use MTU if total length of the message fits into the MTU.
1166 	 * Otherwise, we need to reserve fragment header and
1167 	 * fragment alignment (= 8-15 octects, in total).
1168 	 *
1169 	 * Note that we may need to "move" the data from the tail of
1170 	 * of the buffer to the new fragment when we split
1171 	 * the message.
1172 	 *
1173 	 * FIXME: It may be fragmented into multiple chunks
1174 	 *        at once if non-fragmentable extension headers
1175 	 *        are too large.
1176 	 * --yoshfuji
1177 	 */
1178 
1179 	inet->cork.length += length;
1180 	if (((length > mtu) && (sk->sk_protocol == IPPROTO_UDP)) &&
1181 	    (rt->u.dst.dev->features & NETIF_F_UFO)) {
1182 
1183 		err = ip6_ufo_append_data(sk, getfrag, from, length, hh_len,
1184 					  fragheaderlen, transhdrlen, mtu,
1185 					  flags);
1186 		if (err)
1187 			goto error;
1188 		return 0;
1189 	}
1190 
1191 	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
1192 		goto alloc_new_skb;
1193 
1194 	while (length > 0) {
1195 		/* Check if the remaining data fits into current packet. */
1196 		copy = (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1197 		if (copy < length)
1198 			copy = maxfraglen - skb->len;
1199 
1200 		if (copy <= 0) {
1201 			char *data;
1202 			unsigned int datalen;
1203 			unsigned int fraglen;
1204 			unsigned int fraggap;
1205 			unsigned int alloclen;
1206 			struct sk_buff *skb_prev;
1207 alloc_new_skb:
1208 			skb_prev = skb;
1209 
1210 			/* There's no room in the current skb */
1211 			if (skb_prev)
1212 				fraggap = skb_prev->len - maxfraglen;
1213 			else
1214 				fraggap = 0;
1215 
1216 			/*
1217 			 * If remaining data exceeds the mtu,
1218 			 * we know we need more fragment(s).
1219 			 */
1220 			datalen = length + fraggap;
1221 			if (datalen > (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1222 				datalen = maxfraglen - fragheaderlen;
1223 
1224 			fraglen = datalen + fragheaderlen;
1225 			if ((flags & MSG_MORE) &&
1226 			    !(rt->u.dst.dev->features&NETIF_F_SG))
1227 				alloclen = mtu;
1228 			else
1229 				alloclen = datalen + fragheaderlen;
1230 
1231 			/*
1232 			 * The last fragment gets additional space at tail.
1233 			 * Note: we overallocate on fragments with MSG_MODE
1234 			 * because we have no idea if we're the last one.
1235 			 */
1236 			if (datalen == length + fraggap)
1237 				alloclen += rt->u.dst.trailer_len;
1238 
1239 			/*
1240 			 * We just reserve space for fragment header.
1241 			 * Note: this may be overallocation if the message
1242 			 * (without MSG_MORE) fits into the MTU.
1243 			 */
1244 			alloclen += sizeof(struct frag_hdr);
1245 
1246 			if (transhdrlen) {
1247 				skb = sock_alloc_send_skb(sk,
1248 						alloclen + hh_len,
1249 						(flags & MSG_DONTWAIT), &err);
1250 			} else {
1251 				skb = NULL;
1252 				if (atomic_read(&sk->sk_wmem_alloc) <=
1253 				    2 * sk->sk_sndbuf)
1254 					skb = sock_wmalloc(sk,
1255 							   alloclen + hh_len, 1,
1256 							   sk->sk_allocation);
1257 				if (unlikely(skb == NULL))
1258 					err = -ENOBUFS;
1259 			}
1260 			if (skb == NULL)
1261 				goto error;
1262 			/*
1263 			 *	Fill in the control structures
1264 			 */
1265 			skb->ip_summed = csummode;
1266 			skb->csum = 0;
1267 			/* reserve for fragmentation */
1268 			skb_reserve(skb, hh_len+sizeof(struct frag_hdr));
1269 
1270 			/*
1271 			 *	Find where to start putting bytes
1272 			 */
1273 			data = skb_put(skb, fraglen);
1274 			skb_set_network_header(skb, exthdrlen);
1275 			data += fragheaderlen;
1276 			skb->transport_header = (skb->network_header +
1277 						 fragheaderlen);
1278 			if (fraggap) {
1279 				skb->csum = skb_copy_and_csum_bits(
1280 					skb_prev, maxfraglen,
1281 					data + transhdrlen, fraggap, 0);
1282 				skb_prev->csum = csum_sub(skb_prev->csum,
1283 							  skb->csum);
1284 				data += fraggap;
1285 				pskb_trim_unique(skb_prev, maxfraglen);
1286 			}
1287 			copy = datalen - transhdrlen - fraggap;
1288 			if (copy < 0) {
1289 				err = -EINVAL;
1290 				kfree_skb(skb);
1291 				goto error;
1292 			} else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
1293 				err = -EFAULT;
1294 				kfree_skb(skb);
1295 				goto error;
1296 			}
1297 
1298 			offset += copy;
1299 			length -= datalen - fraggap;
1300 			transhdrlen = 0;
1301 			exthdrlen = 0;
1302 			csummode = CHECKSUM_NONE;
1303 
1304 			/*
1305 			 * Put the packet on the pending queue
1306 			 */
1307 			__skb_queue_tail(&sk->sk_write_queue, skb);
1308 			continue;
1309 		}
1310 
1311 		if (copy > length)
1312 			copy = length;
1313 
1314 		if (!(rt->u.dst.dev->features&NETIF_F_SG)) {
1315 			unsigned int off;
1316 
1317 			off = skb->len;
1318 			if (getfrag(from, skb_put(skb, copy),
1319 						offset, copy, off, skb) < 0) {
1320 				__skb_trim(skb, off);
1321 				err = -EFAULT;
1322 				goto error;
1323 			}
1324 		} else {
1325 			int i = skb_shinfo(skb)->nr_frags;
1326 			skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
1327 			struct page *page = sk->sk_sndmsg_page;
1328 			int off = sk->sk_sndmsg_off;
1329 			unsigned int left;
1330 
1331 			if (page && (left = PAGE_SIZE - off) > 0) {
1332 				if (copy >= left)
1333 					copy = left;
1334 				if (page != frag->page) {
1335 					if (i == MAX_SKB_FRAGS) {
1336 						err = -EMSGSIZE;
1337 						goto error;
1338 					}
1339 					get_page(page);
1340 					skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
1341 					frag = &skb_shinfo(skb)->frags[i];
1342 				}
1343 			} else if(i < MAX_SKB_FRAGS) {
1344 				if (copy > PAGE_SIZE)
1345 					copy = PAGE_SIZE;
1346 				page = alloc_pages(sk->sk_allocation, 0);
1347 				if (page == NULL) {
1348 					err = -ENOMEM;
1349 					goto error;
1350 				}
1351 				sk->sk_sndmsg_page = page;
1352 				sk->sk_sndmsg_off = 0;
1353 
1354 				skb_fill_page_desc(skb, i, page, 0, 0);
1355 				frag = &skb_shinfo(skb)->frags[i];
1356 			} else {
1357 				err = -EMSGSIZE;
1358 				goto error;
1359 			}
1360 			if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
1361 				err = -EFAULT;
1362 				goto error;
1363 			}
1364 			sk->sk_sndmsg_off += copy;
1365 			frag->size += copy;
1366 			skb->len += copy;
1367 			skb->data_len += copy;
1368 			skb->truesize += copy;
1369 			atomic_add(copy, &sk->sk_wmem_alloc);
1370 		}
1371 		offset += copy;
1372 		length -= copy;
1373 	}
1374 	return 0;
1375 error:
1376 	inet->cork.length -= length;
1377 	IP6_INC_STATS(rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1378 	return err;
1379 }
1380 
1381 static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np)
1382 {
1383 	inet->cork.flags &= ~IPCORK_OPT;
1384 	kfree(np->cork.opt);
1385 	np->cork.opt = NULL;
1386 	if (inet->cork.dst) {
1387 		dst_release(inet->cork.dst);
1388 		inet->cork.dst = NULL;
1389 		inet->cork.flags &= ~IPCORK_ALLFRAG;
1390 	}
1391 	memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
1392 }
1393 
1394 int ip6_push_pending_frames(struct sock *sk)
1395 {
1396 	struct sk_buff *skb, *tmp_skb;
1397 	struct sk_buff **tail_skb;
1398 	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1399 	struct inet_sock *inet = inet_sk(sk);
1400 	struct ipv6_pinfo *np = inet6_sk(sk);
1401 	struct ipv6hdr *hdr;
1402 	struct ipv6_txoptions *opt = np->cork.opt;
1403 	struct rt6_info *rt = (struct rt6_info *)inet->cork.dst;
1404 	struct flowi *fl = &inet->cork.fl;
1405 	unsigned char proto = fl->proto;
1406 	int err = 0;
1407 
1408 	if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
1409 		goto out;
1410 	tail_skb = &(skb_shinfo(skb)->frag_list);
1411 
1412 	/* move skb->data to ip header from ext header */
1413 	if (skb->data < skb_network_header(skb))
1414 		__skb_pull(skb, skb_network_offset(skb));
1415 	while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
1416 		__skb_pull(tmp_skb, skb_network_header_len(skb));
1417 		*tail_skb = tmp_skb;
1418 		tail_skb = &(tmp_skb->next);
1419 		skb->len += tmp_skb->len;
1420 		skb->data_len += tmp_skb->len;
1421 		skb->truesize += tmp_skb->truesize;
1422 		__sock_put(tmp_skb->sk);
1423 		tmp_skb->destructor = NULL;
1424 		tmp_skb->sk = NULL;
1425 	}
1426 
1427 	/* Allow local fragmentation. */
1428 	if (np->pmtudisc < IPV6_PMTUDISC_DO)
1429 		skb->local_df = 1;
1430 
1431 	ipv6_addr_copy(final_dst, &fl->fl6_dst);
1432 	__skb_pull(skb, skb_network_header_len(skb));
1433 	if (opt && opt->opt_flen)
1434 		ipv6_push_frag_opts(skb, opt, &proto);
1435 	if (opt && opt->opt_nflen)
1436 		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);
1437 
1438 	skb_push(skb, sizeof(struct ipv6hdr));
1439 	skb_reset_network_header(skb);
1440 	hdr = ipv6_hdr(skb);
1441 
1442 	*(__be32*)hdr = fl->fl6_flowlabel |
1443 		     htonl(0x60000000 | ((int)np->cork.tclass << 20));
1444 
1445 	hdr->hop_limit = np->cork.hop_limit;
1446 	hdr->nexthdr = proto;
1447 	ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
1448 	ipv6_addr_copy(&hdr->daddr, final_dst);
1449 
1450 	skb->priority = sk->sk_priority;
1451 	skb->mark = sk->sk_mark;
1452 
1453 	skb->dst = dst_clone(&rt->u.dst);
1454 	IP6_INC_STATS(rt->rt6i_idev, IPSTATS_MIB_OUTREQUESTS);
1455 	if (proto == IPPROTO_ICMPV6) {
1456 		struct inet6_dev *idev = ip6_dst_idev(skb->dst);
1457 
1458 		ICMP6MSGOUT_INC_STATS_BH(idev, icmp6_hdr(skb)->icmp6_type);
1459 		ICMP6_INC_STATS_BH(idev, ICMP6_MIB_OUTMSGS);
1460 	}
1461 
1462 	err = ip6_local_out(skb);
1463 	if (err) {
1464 		if (err > 0)
1465 			err = np->recverr ? net_xmit_errno(err) : 0;
1466 		if (err)
1467 			goto error;
1468 	}
1469 
1470 out:
1471 	ip6_cork_release(inet, np);
1472 	return err;
1473 error:
1474 	goto out;
1475 }
1476 
1477 void ip6_flush_pending_frames(struct sock *sk)
1478 {
1479 	struct sk_buff *skb;
1480 
1481 	while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
1482 		if (skb->dst)
1483 			IP6_INC_STATS(ip6_dst_idev(skb->dst),
1484 				      IPSTATS_MIB_OUTDISCARDS);
1485 		kfree_skb(skb);
1486 	}
1487 
1488 	ip6_cork_release(inet_sk(sk), inet6_sk(sk));
1489 }
1490