xref: /openbmc/linux/net/ipv6/ip6_output.c (revision 367b8112)
/*
 *	IPv6 output functions
 *	Linux INET6 implementation
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	Based on linux/net/ipv4/ip_output.c
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 *
 *	Changes:
 *	A.N.Kuznetsov	:	arithmetic in fragmentation.
 *				extension headers are implemented.
 *				route changes now work.
 *				ip6_forward does not confuse sniffers.
 *				etc.
 *
 *	H. von Brand	:	Added missing #include <linux/string.h>
 *	Imran Patel	:	frag id should be in NBO
 *	Kazunori MIYAZAWA @USAGI
 *			:	add ip6_append_data and related functions
 *				for datagram xmit
 */

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/in6.h>
#include <linux/tcp.h>
#include <linux/route.h>
#include <linux/module.h>

#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>

#include <net/sock.h>
#include <net/snmp.h>

#include <net/ipv6.h>
#include <net/ndisc.h>
#include <net/protocol.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
#include <net/rawv6.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/checksum.h>
#include <linux/mroute6.h>

static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));

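/*
 * Assign a fragment identification to @fhdr.  IDs are drawn from a
 * single global counter under a spinlock; 0 is skipped so that a zero
 * ID can be used elsewhere to mean "not yet selected" (see the slow
 * path of ip6_fragment()).
 */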
static __inline__ void ipv6_select_ident(struct sk_buff *skb, struct frag_hdr *fhdr)
{
	static u32 ipv6_fragmentation_id = 1;
	static DEFINE_SPINLOCK(ip6_id_lock);

	spin_lock_bh(&ip6_id_lock);
	fhdr->identification = htonl(ipv6_fragmentation_id);
	if (++ipv6_fragmentation_id == 0)
		ipv6_fragmentation_id = 1;
	spin_unlock_bh(&ip6_id_lock);
}

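/*
 * Finish a locally generated packet: fill in payload_len (zero when the
 * payload exceeds IPV6_MAXPLEN, i.e. a jumbogram, which carries its
 * length in a hop-by-hop option instead) and run the netfilter
 * LOCAL_OUT hook.  A return value of 1 means the hook accepted the
 * packet and the caller must still hand it to dst_output();
 * ip6_local_out() below does both steps for callers that have a fully
 * built packet with a valid skb->dst.
 */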
int __ip6_local_out(struct sk_buff *skb)
{
	int len;

	len = skb->len - sizeof(struct ipv6hdr);
	if (len > IPV6_MAXPLEN)
		len = 0;
	ipv6_hdr(skb)->payload_len = htons(len);

	return nf_hook(PF_INET6, NF_INET_LOCAL_OUT, skb, NULL, skb->dst->dev,
		       dst_output);
}

int ip6_local_out(struct sk_buff *skb)
{
	int err;

	err = __ip6_local_out(skb);
	if (likely(err == 1))
		err = dst_output(skb);

	return err;
}
EXPORT_SYMBOL_GPL(ip6_local_out);

static int ip6_output_finish(struct sk_buff *skb)
{
	struct dst_entry *dst = skb->dst;

	if (dst->hh)
		return neigh_hh_output(dst->hh, skb);
	else if (dst->neighbour)
		return dst->neighbour->output(skb);

	IP6_INC_STATS_BH(dev_net(dst->dev),
			 ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
	kfree_skb(skb);
	return -EINVAL;
}

/* dev_loopback_xmit for use with netfilter. */
static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
{
	skb_reset_mac_header(newskb);
	__skb_pull(newskb, skb_network_offset(newskb));
	newskb->pkt_type = PACKET_LOOPBACK;
	newskb->ip_summed = CHECKSUM_UNNECESSARY;
	WARN_ON(!newskb->dst);

	netif_rx(newskb);
	return 0;
}


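/*
 * Second stage of output: for multicast destinations, loop a copy of
 * the packet back to local listeners (unless the socket disabled
 * IPV6_MULTICAST_LOOP) before running the POST_ROUTING hook.  A
 * hop_limit of 0 on a multicast packet means "deliver locally only",
 * so the original is discarded once the loopback copy has been queued.
 */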
static int ip6_output2(struct sk_buff *skb)
{
	struct dst_entry *dst = skb->dst;
	struct net_device *dev = dst->dev;

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
		struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
		struct inet6_dev *idev = ip6_dst_idev(skb->dst);

		if (!(dev->flags & IFF_LOOPBACK) && (!np || np->mc_loop) &&
		    ((mroute6_socket && !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
		     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
					 &ipv6_hdr(skb)->saddr))) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

			/* Do not check for IFF_ALLMULTI; multicast routing
			   is not supported in any case.
			 */
			if (newskb)
				NF_HOOK(PF_INET6, NF_INET_POST_ROUTING, newskb,
					NULL, newskb->dev,
					ip6_dev_loopback_xmit);

			if (ipv6_hdr(skb)->hop_limit == 0) {
				IP6_INC_STATS(dev_net(dev), idev,
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return 0;
			}
		}

		IP6_INC_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCASTPKTS);
	}

	return NF_HOOK(PF_INET6, NF_INET_POST_ROUTING, skb, NULL, skb->dev,
		       ip6_output_finish);
}

static inline int ip6_skb_dst_mtu(struct sk_buff *skb)
{
	struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;

	return (np && np->pmtudisc == IPV6_PMTUDISC_PROBE) ?
	       skb->dst->dev->mtu : dst_mtu(skb->dst);
}

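/*
 * Entry point installed as dst->output for locally generated packets:
 * drop if IPv6 is administratively disabled on the egress device,
 * fragment when the packet exceeds the path MTU (and is not GSO), or
 * when the route demands a fragment header on every packet
 * (dst_allfrag, set after learning a path MTU below the IPv6 minimum),
 * else send as-is.
 */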
int ip6_output(struct sk_buff *skb)
{
	struct inet6_dev *idev = ip6_dst_idev(skb->dst);
	if (unlikely(idev->cnf.disable_ipv6)) {
		IP6_INC_STATS(dev_net(skb->dst->dev), idev,
			      IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
		return 0;
	}

	if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
	    dst_allfrag(skb->dst))
		return ip6_fragment(skb, ip6_output2);
	else
		return ip6_output2(skb);
}

/*
 *	xmit an sk_buff (used by TCP)
 */

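/*
 * A rough sketch of how a transport protocol drives this function
 * (illustrative only; see the TCP/IPv6 code for the real thing):
 *
 *	struct flowi fl;	// filled in by the caller
 *	...
 *	err = ip6_xmit(sk, skb, &fl, np->opt, 0);
 *
 * The caller must have attached a routed dst to skb->dst and reserved
 * enough headroom for the IPv6 header; ip6_xmit() prepends the header,
 * pushes any extension headers from @opt, and hands the packet to the
 * LOCAL_OUT netfilter hook.
 */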
int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
	     struct ipv6_txoptions *opt, int ipfragok)
{
	struct net *net = sock_net(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct in6_addr *first_hop = &fl->fl6_dst;
	struct dst_entry *dst = skb->dst;
	struct ipv6hdr *hdr;
	u8  proto = fl->proto;
	int seg_len = skb->len;
	int hlimit, tclass;
	u32 mtu;

	if (opt) {
		unsigned int head_room;

		/* First: extension headers may take lots of space (~8K for
		   now), so MAX_HEADER is not enough.
		 */
		head_room = opt->opt_nflen + opt->opt_flen;
		seg_len += head_room;
		head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);

		if (skb_headroom(skb) < head_room) {
			struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
			if (skb2 == NULL) {
				IP6_INC_STATS(net, ip6_dst_idev(skb->dst),
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return -ENOBUFS;
			}
			kfree_skb(skb);
			skb = skb2;
			if (sk)
				skb_set_owner_w(skb, sk);
		}
		if (opt->opt_flen)
			ipv6_push_frag_opts(skb, opt, &proto);
		if (opt->opt_nflen)
			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
	}

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	/* Allow local fragmentation. */
	if (ipfragok)
		skb->local_df = 1;

	/*
	 *	Fill in the IPv6 header
	 */

	hlimit = -1;
	if (np)
		hlimit = np->hop_limit;
	if (hlimit < 0)
		hlimit = ip6_dst_hoplimit(dst);

	tclass = -1;
	if (np)
		tclass = np->tclass;
	if (tclass < 0)
		tclass = 0;

	*(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl->fl6_flowlabel;

	hdr->payload_len = htons(seg_len);
	hdr->nexthdr = proto;
	hdr->hop_limit = hlimit;

	ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
	ipv6_addr_copy(&hdr->daddr, first_hop);

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	mtu = dst_mtu(dst);
	if ((skb->len <= mtu) || skb->local_df || skb_is_gso(skb)) {
		IP6_INC_STATS(net, ip6_dst_idev(skb->dst),
			      IPSTATS_MIB_OUTREQUESTS);
		return NF_HOOK(PF_INET6, NF_INET_LOCAL_OUT, skb, NULL, dst->dev,
			       dst_output);
	}

	if (net_ratelimit())
		printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
	skb->dev = dst->dev;
	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
	IP6_INC_STATS(net, ip6_dst_idev(skb->dst), IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return -EMSGSIZE;
}
EXPORT_SYMBOL(ip6_xmit);

/*
 *	To avoid extra problems ND packets are sent through this
 *	routine.  It is code duplication, but we really want to avoid
 *	putting extra checks in the general header-building path, since
 *	that path is used by TCP (which is performance-critical for us).
 */

int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
	       const struct in6_addr *saddr, const struct in6_addr *daddr,
	       int proto, int len)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct ipv6hdr *hdr;
	int totlen;

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	totlen = len + sizeof(struct ipv6hdr);

	skb_reset_network_header(skb);
	skb_put(skb, sizeof(struct ipv6hdr));
	hdr = ipv6_hdr(skb);

	*(__be32 *)hdr = htonl(0x60000000);

	hdr->payload_len = htons(len);
	hdr->nexthdr = proto;
	hdr->hop_limit = np->hop_limit;

	ipv6_addr_copy(&hdr->saddr, saddr);
	ipv6_addr_copy(&hdr->daddr, daddr);

	return 0;
}

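/*
 * Deliver a packet carrying a Router Alert option to every raw socket
 * that registered interest in this RA value (@sel) via the
 * IPV6_ROUTER_ALERT socket option.  Each matching socket but the last
 * gets a clone; the last one consumes the original skb, in which case
 * 1 is returned and the caller must not forward the packet further.
 */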
static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
	struct ip6_ra_chain *ra;
	struct sock *last = NULL;

	read_lock(&ip6_ra_lock);
	for (ra = ip6_ra_chain; ra; ra = ra->next) {
		struct sock *sk = ra->sk;
		if (sk && ra->sel == sel &&
		    (!sk->sk_bound_dev_if ||
		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
			if (last) {
				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
				if (skb2)
					rawv6_rcv(last, skb2);
			}
			last = sk;
		}
	}

	if (last) {
		rawv6_rcv(last, skb);
		read_unlock(&ip6_ra_lock);
		return 1;
	}
	read_unlock(&ip6_ra_lock);
	return 0;
}

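/*
 * Decide what to do with a packet whose destination is proxied by this
 * node (proxy NDP): returns 1 if the packet is a neighbour discovery
 * message that must be handed to local input, 0 if it should simply be
 * forwarded, and -1 if it must be dropped (link-local destinations
 * cannot be proxied, so the sender is signalled via dst_link_failure()).
 */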
static int ip6_forward_proxy_check(struct sk_buff *skb)
{
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	u8 nexthdr = hdr->nexthdr;
	int offset;

	if (ipv6_ext_hdr(nexthdr)) {
		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr);
		if (offset < 0)
			return 0;
	} else
		offset = sizeof(struct ipv6hdr);

	if (nexthdr == IPPROTO_ICMPV6) {
		struct icmp6hdr *icmp6;

		if (!pskb_may_pull(skb, (skb_network_header(skb) +
					 offset + 1 - skb->data)))
			return 0;

		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);

		switch (icmp6->icmp6_type) {
		case NDISC_ROUTER_SOLICITATION:
		case NDISC_ROUTER_ADVERTISEMENT:
		case NDISC_NEIGHBOUR_SOLICITATION:
		case NDISC_NEIGHBOUR_ADVERTISEMENT:
		case NDISC_REDIRECT:
			/* Unicast neighbour discovery messages destined
			 * to the proxied address must be passed to the
			 * input function.
			 */
			return 1;
		default:
			break;
		}
	}

	/*
	 * The proxying router can't forward traffic sent to a link-local
	 * address, so signal the sender and discard the packet. This
	 * behavior is clarified by the MIPv6 specification.
	 */
	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
		dst_link_failure(skb);
		return -1;
	}

	return 0;
}

static inline int ip6_forward_finish(struct sk_buff *skb)
{
	return dst_output(skb);
}

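/*
 * Forwarding path proper.  In order: check that forwarding is enabled
 * and the packet passed LRO/IPsec policy, divert Router Alert packets
 * to interested local sockets, send Time Exceeded when the hop limit
 * would reach zero, honour proxy NDP, possibly emit a Redirect when
 * the packet leaves through the interface it arrived on, enforce the
 * path MTU with Packet Too Big, and finally decrement hop_limit (after
 * skb_cow(), so a shared header is never modified) and run the FORWARD
 * netfilter hook.
 */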
int ip6_forward(struct sk_buff *skb)
{
	struct dst_entry *dst = skb->dst;
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	struct inet6_skb_parm *opt = IP6CB(skb);
	struct net *net = dev_net(dst->dev);

	if (net->ipv6.devconf_all->forwarding == 0)
		goto error;

	if (skb_warn_if_lro(skb))
		goto drop;

	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
		goto drop;
	}

	skb_forward_csum(skb);

	/*
	 *	We DO NOT make any processing on
	 *	RA packets, pushing them to user level AS IS
	 *	without any WARRANTY that application will be able
	 *	to interpret them. The reason is that we
	 *	cannot make anything clever here.
	 *
	 *	We are not an end node, so if the packet contains
	 *	AH/ESP we cannot do anything with it either.
	 *	Defragmentation would also be a mistake: RA packets
	 *	cannot be fragmented, because there is no guarantee
	 *	that different fragments will go along one path. --ANK
	 */
	if (opt->ra) {
		u8 *ptr = skb_network_header(skb) + opt->ra;
		if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
			return 0;
	}

	/*
	 *	check and decrement hop limit
	 */
	if (hdr->hop_limit <= 1) {
		/* Force OUTPUT device so its address is used as the
		 * source of the ICMP error. */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT,
			    0, skb->dev);
		IP6_INC_STATS_BH(net,
				 ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);

		kfree_skb(skb);
		return -ETIMEDOUT;
	}

	/* XXX: idev->cnf.proxy_ndp? */
	if (net->ipv6.devconf_all->proxy_ndp &&
	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
		int proxied = ip6_forward_proxy_check(skb);
		if (proxied > 0)
			return ip6_input(skb);
		else if (proxied < 0) {
			IP6_INC_STATS(net, ip6_dst_idev(dst),
				      IPSTATS_MIB_INDISCARDS);
			goto drop;
		}
	}

	if (!xfrm6_route_forward(skb)) {
		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
		goto drop;
	}
	dst = skb->dst;

	/* IPv6 specs say nothing about it, but it is clear that we cannot
	   send redirects to source-routed frames.
	   We don't send redirects to frames decapsulated from IPsec.
	 */
	if (skb->dev == dst->dev && dst->neighbour && opt->srcrt == 0 &&
	    !skb->sp) {
		struct in6_addr *target = NULL;
		struct rt6_info *rt;
		struct neighbour *n = dst->neighbour;

		/*
		 *	incoming and outgoing devices are the same;
		 *	send a redirect.
		 */

		rt = (struct rt6_info *) dst;
		if ((rt->rt6i_flags & RTF_GATEWAY))
			target = (struct in6_addr *)&n->primary_key;
		else
			target = &hdr->daddr;

		/* Limit redirects both by destination (here)
		   and by source (inside ndisc_send_redirect)
		 */
		if (xrlim_allow(dst, 1*HZ))
			ndisc_send_redirect(skb, n, target);
	} else {
		int addrtype = ipv6_addr_type(&hdr->saddr);

		/* This check is security critical. */
		if (addrtype == IPV6_ADDR_ANY ||
		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
			goto error;
		if (addrtype & IPV6_ADDR_LINKLOCAL) {
			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
				    ICMPV6_NOT_NEIGHBOUR, 0, skb->dev);
			goto error;
		}
	}

	if (skb->len > dst_mtu(dst)) {
		/* Again, force OUTPUT device to source the ICMP error. */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, dst_mtu(dst), skb->dev);
		IP6_INC_STATS_BH(net,
				 ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS);
		IP6_INC_STATS_BH(net,
				 ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	if (skb_cow(skb, dst->dev->hard_header_len)) {
		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
		goto drop;
	}

	hdr = ipv6_hdr(skb);

	/* Mangling the hop limit is delayed to this point, after skb COW */

	hdr->hop_limit--;

	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
	return NF_HOOK(PF_INET6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
		       ip6_forward_finish);

error:
	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
drop:
	kfree_skb(skb);
	return -EINVAL;
}

static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	dst_release(to->dst);
	to->dst = dst_clone(from->dst);
	to->dev = from->dev;
	to->mark = from->mark;

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
	nf_copy(to, from);
#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
    defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
	to->nf_trace = from->nf_trace;
#endif
	skb_copy_secmark(to, from);
}

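/*
 * Find the offset at which a fragment header would have to be
 * inserted, i.e. the length of the "unfragmentable part" (RFC 2460,
 * section 4.5): the IPv6 header plus hop-by-hop and routing headers,
 * including destination options that precede a routing header or, with
 * Mobile IPv6, carry a Home Address option.  On return, *nexthdr points
 * to the nexthdr byte that must be rewritten to NEXTHDR_FRAGMENT.
 */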
int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
{
	u16 offset = sizeof(struct ipv6hdr);
	struct ipv6_opt_hdr *exthdr =
				(struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1);
	unsigned int packet_len = skb->tail - skb->network_header;
	int found_rhdr = 0;
	*nexthdr = &ipv6_hdr(skb)->nexthdr;

	while (offset + 1 <= packet_len) {

		switch (**nexthdr) {

		case NEXTHDR_HOP:
			break;
		case NEXTHDR_ROUTING:
			found_rhdr = 1;
			break;
		case NEXTHDR_DEST:
#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
			if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0)
				break;
#endif
			if (found_rhdr)
				return offset;
			break;
		default:
			return offset;
		}

		offset += ipv6_optlen(exthdr);
		*nexthdr = &exthdr->nexthdr;
		exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) +
						 offset);
	}

	return offset;
}

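/*
 * Fragment @skb and feed each fragment to @output.  Two strategies:
 * the fast path reuses the buffers already hanging off frag_list (as
 * queued by ip6_append_data()) when their geometry matches, merely
 * prepending a copy of the unfragmentable part and a fragment header
 * to each; the slow path allocates a fresh skb per fragment and copies
 * the payload into it.  The fragment offset field is expressed in
 * 8-octet units, which is why every fragment but the last must be a
 * multiple of 8 bytes long.
 */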
static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
{
	struct sk_buff *frag;
	struct rt6_info *rt = (struct rt6_info *)skb->dst;
	struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
	struct ipv6hdr *tmp_hdr;
	struct frag_hdr *fh;
	unsigned int mtu, hlen, left, len;
	__be32 frag_id = 0;
	int ptr, offset = 0, err = 0;
	u8 *prevhdr, nexthdr = 0;
	struct net *net = dev_net(skb->dst->dev);

	hlen = ip6_find_1stfragopt(skb, &prevhdr);
	nexthdr = *prevhdr;

	mtu = ip6_skb_dst_mtu(skb);

	/* We must not fragment if the socket is set to force MTU discovery
	 * or if the skb is not generated by a local socket.  (This last
	 * check should be redundant, but it's free.)
	 */
	if (!skb->local_df) {
		skb->dev = skb->dst->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
		IP6_INC_STATS(net, ip6_dst_idev(skb->dst),
			      IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	if (np && np->frag_size < mtu) {
		if (np->frag_size)
			mtu = np->frag_size;
	}
	mtu -= hlen + sizeof(struct frag_hdr);

	if (skb_shinfo(skb)->frag_list) {
		int first_len = skb_pagelen(skb);
		int truesizes = 0;

		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    skb_cloned(skb))
			goto slow_path;

		for (frag = skb_shinfo(skb)->frag_list; frag; frag = frag->next) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < hlen)
				goto slow_path;

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path;

			BUG_ON(frag->sk);
			if (skb->sk) {
				sock_hold(skb->sk);
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
				truesizes += frag->truesize;
			}
		}

		err = 0;
		offset = 0;
		frag = skb_shinfo(skb)->frag_list;
		skb_shinfo(skb)->frag_list = NULL;
		/* BUILD HEADER */

		*prevhdr = NEXTHDR_FRAGMENT;
		tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
		if (!tmp_hdr) {
			IP6_INC_STATS(net, ip6_dst_idev(skb->dst),
				      IPSTATS_MIB_FRAGFAILS);
			return -ENOMEM;
		}

		__skb_pull(skb, hlen);
		fh = (struct frag_hdr *)__skb_push(skb, sizeof(struct frag_hdr));
		__skb_push(skb, hlen);
		skb_reset_network_header(skb);
		memcpy(skb_network_header(skb), tmp_hdr, hlen);

		ipv6_select_ident(skb, fh);
		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		fh->frag_off = htons(IP6_MF);
		frag_id = fh->identification;

		first_len = skb_pagelen(skb);
		skb->data_len = first_len - skb_headlen(skb);
		skb->truesize -= truesizes;
		skb->len = first_len;
		ipv6_hdr(skb)->payload_len = htons(first_len -
						   sizeof(struct ipv6hdr));

		dst_hold(&rt->u.dst);

		for (;;) {
			/* Prepare the header of the next frame,
			 * before the previous one goes down. */
			if (frag) {
				frag->ip_summed = CHECKSUM_NONE;
				skb_reset_transport_header(frag);
				fh = (struct frag_hdr *)__skb_push(frag, sizeof(struct frag_hdr));
				__skb_push(frag, hlen);
				skb_reset_network_header(frag);
				memcpy(skb_network_header(frag), tmp_hdr,
				       hlen);
				offset += skb->len - hlen - sizeof(struct frag_hdr);
				fh->nexthdr = nexthdr;
				fh->reserved = 0;
				fh->frag_off = htons(offset);
				if (frag->next != NULL)
					fh->frag_off |= htons(IP6_MF);
				fh->identification = frag_id;
				ipv6_hdr(frag)->payload_len =
						htons(frag->len -
						      sizeof(struct ipv6hdr));
				ip6_copy_metadata(frag, skb);
			}

			err = output(skb);
			if (!err)
				IP6_INC_STATS(net, ip6_dst_idev(&rt->u.dst),
					      IPSTATS_MIB_FRAGCREATES);

			if (err || !frag)
				break;

			skb = frag;
			frag = skb->next;
			skb->next = NULL;
		}

		kfree(tmp_hdr);

		if (err == 0) {
			IP6_INC_STATS(net, ip6_dst_idev(&rt->u.dst),
				      IPSTATS_MIB_FRAGOKS);
			dst_release(&rt->u.dst);
			return 0;
		}

		while (frag) {
			skb = frag->next;
			kfree_skb(frag);
			frag = skb;
		}

		IP6_INC_STATS(net, ip6_dst_idev(&rt->u.dst),
			      IPSTATS_MIB_FRAGFAILS);
		dst_release(&rt->u.dst);
		return err;
	}

slow_path:
	left = skb->len - hlen;		/* Space per frame */
	ptr = hlen;			/* Where to start from */

	/*
	 *	Fragment the datagram.
	 */

	*prevhdr = NEXTHDR_FRAGMENT;

	/*
	 *	Keep copying data until we run out.
	 */
	while (left > 0) {
		len = left;
		/* IF: it doesn't fit, use 'mtu' - the data space left */
		if (len > mtu)
			len = mtu;
		/* IF: we are not sending up to and including the packet end
		   then align the next start on an eight byte boundary */
		if (len < left) {
			len &= ~7;
		}
		/*
		 *	Allocate buffer.
		 */

		if ((frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
				      LL_ALLOCATED_SPACE(rt->u.dst.dev),
				      GFP_ATOMIC)) == NULL) {
			NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
			IP6_INC_STATS(net, ip6_dst_idev(skb->dst),
				      IPSTATS_MIB_FRAGFAILS);
			err = -ENOMEM;
			goto fail;
		}

		/*
		 *	Set up data on packet
		 */

		ip6_copy_metadata(frag, skb);
		skb_reserve(frag, LL_RESERVED_SPACE(rt->u.dst.dev));
		skb_put(frag, len + hlen + sizeof(struct frag_hdr));
		skb_reset_network_header(frag);
		fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
		frag->transport_header = (frag->network_header + hlen +
					  sizeof(struct frag_hdr));

		/*
		 *	Charge the memory for the fragment to any owner
		 *	it might possess
		 */
		if (skb->sk)
			skb_set_owner_w(frag, skb->sk);

		/*
		 *	Copy the packet header into the new buffer.
		 */
		skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);

		/*
		 *	Build fragment header.
		 */
		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		if (!frag_id) {
			ipv6_select_ident(skb, fh);
			frag_id = fh->identification;
		} else
			fh->identification = frag_id;

		/*
		 *	Copy a block of the IP datagram.
		 */
		if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len))
			BUG();
		left -= len;

		fh->frag_off = htons(offset);
		if (left > 0)
			fh->frag_off |= htons(IP6_MF);
		ipv6_hdr(frag)->payload_len = htons(frag->len -
						    sizeof(struct ipv6hdr));

		ptr += len;
		offset += len;

		/*
		 *	Put this fragment into the sending queue.
		 */
		err = output(frag);
		if (err)
			goto fail;

		IP6_INC_STATS(net, ip6_dst_idev(skb->dst),
			      IPSTATS_MIB_FRAGCREATES);
	}
	IP6_INC_STATS(net, ip6_dst_idev(skb->dst),
		      IPSTATS_MIB_FRAGOKS);
	kfree_skb(skb);
	return err;

fail:
	IP6_INC_STATS(net, ip6_dst_idev(skb->dst),
		      IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return err;
}

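/*
 * Helper for ip6_sk_dst_check(): returns true when the cached route
 * can NOT be validated against @fl_addr, i.e. the route is neither a
 * host route for that exact address nor covered by the socket's cached
 * peer address (@addr_cache).
 */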
static inline int ip6_rt_check(struct rt6key *rt_key,
			       struct in6_addr *fl_addr,
			       struct in6_addr *addr_cache)
{
	return ((rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
		(addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache)));
}

static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
					  struct dst_entry *dst,
					  struct flowi *fl)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct rt6_info *rt = (struct rt6_info *)dst;

	if (!dst)
		goto out;

	/* Yes, checking route validity in the unconnected
	 * case is not very simple. Take into account
	 * that we do not support routing by source, TOS,
	 * and MSG_DONTROUTE		--ANK (980726)
	 *
	 * 1. ip6_rt_check(): If the route was a host route,
	 *    check that the cached destination is current.
	 *    If it is a network route, we still may
	 *    check its validity using the saved pointer
	 *    to the last used address: daddr_cache.
	 *    We do not want to save the whole address now
	 *    (because the main consumer of this service
	 *    is TCP, which does not have this problem),
	 *    so this last trick works only on connected
	 *    sockets.
	 * 2. oif also should be the same.
	 */
	if (ip6_rt_check(&rt->rt6i_dst, &fl->fl6_dst, np->daddr_cache) ||
#ifdef CONFIG_IPV6_SUBTREES
	    ip6_rt_check(&rt->rt6i_src, &fl->fl6_src, np->saddr_cache) ||
#endif
	    (fl->oif && fl->oif != dst->dev->ifindex)) {
		dst_release(dst);
		dst = NULL;
	}

out:
	return dst;
}

static int ip6_dst_lookup_tail(struct sock *sk,
			       struct dst_entry **dst, struct flowi *fl)
{
	int err;
	struct net *net = sock_net(sk);

	if (*dst == NULL)
		*dst = ip6_route_output(net, sk, fl);

	if ((err = (*dst)->error))
		goto out_err_release;

	if (ipv6_addr_any(&fl->fl6_src)) {
		err = ipv6_dev_get_saddr(net, ip6_dst_idev(*dst)->dev,
					 &fl->fl6_dst,
					 sk ? inet6_sk(sk)->srcprefs : 0,
					 &fl->fl6_src);
		if (err)
			goto out_err_release;
	}

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	/*
	 * If the dst entry we have looked up has a neighbour
	 * entry that is not in a VALID state and the source
	 * address from the flow is marked as OPTIMISTIC, we
	 * release the found dst entry and replace it with the
	 * dst entry of the nexthop router instead.
	 */
	if ((*dst)->neighbour && !((*dst)->neighbour->nud_state & NUD_VALID)) {
		struct inet6_ifaddr *ifp;
		struct flowi fl_gw;
		int redirect;

		ifp = ipv6_get_ifaddr(net, &fl->fl6_src,
				      (*dst)->dev, 1);

		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
		if (ifp)
			in6_ifa_put(ifp);

		if (redirect) {
			/*
			 * We need to get the dst entry for the
			 * default router instead
			 */
			dst_release(*dst);
			memcpy(&fl_gw, fl, sizeof(struct flowi));
			memset(&fl_gw.fl6_dst, 0, sizeof(struct in6_addr));
			*dst = ip6_route_output(net, sk, &fl_gw);
			if ((err = (*dst)->error))
				goto out_err_release;
		}
	}
#endif

	return 0;

out_err_release:
	if (err == -ENETUNREACH)
		IP6_INC_STATS_BH(net, NULL, IPSTATS_MIB_OUTNOROUTES);
	dst_release(*dst);
	*dst = NULL;
	return err;
}

/**
 *	ip6_dst_lookup - perform route lookup on flow
 *	@sk: socket which provides route info
 *	@dst: pointer to dst_entry * for result
 *	@fl: flow to lookup
 *
 *	This function performs a route lookup on the given flow.
 *
 *	It returns zero on success, or a standard errno code on error.
 */
int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
{
	*dst = NULL;
	return ip6_dst_lookup_tail(sk, dst, fl);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup);

/**
 *	ip6_sk_dst_lookup - perform socket cached route lookup on flow
 *	@sk: socket which provides the dst cache and route info
 *	@dst: pointer to dst_entry * for result
 *	@fl: flow to lookup
 *
 *	This function performs a route lookup on the given flow with the
 *	possibility of using the cached route in the socket if it is valid.
 *	It will take the socket dst lock when operating on the dst cache.
 *	As a result, this function can only be used in process context.
 *
 *	It returns zero on success, or a standard errno code on error.
 */
int ip6_sk_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
{
	*dst = NULL;
	if (sk) {
		*dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
		*dst = ip6_sk_dst_check(sk, *dst, fl);
	}

	return ip6_dst_lookup_tail(sk, dst, fl);
}
EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup);

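/*
 * UFO path: instead of fragmenting in software, queue one oversized
 * skb and record the fragment geometry in the gso fields; the packet
 * is then split into wire-sized fragments either by the UFO-capable
 * device itself or by the GSO layer just before transmission.
 */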
static inline int ip6_ufo_append_data(struct sock *sk,
			int getfrag(void *from, char *to, int offset, int len,
			int odd, struct sk_buff *skb),
			void *from, int length, int hh_len, int fragheaderlen,
			int transhdrlen, int mtu, unsigned int flags)

{
	struct sk_buff *skb;
	int err;

	/* There is support for UDP large send offload by the network
	 * device, so create one single skb packet containing the complete
	 * UDP datagram.
	 */
	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
		skb = sock_alloc_send_skb(sk,
			hh_len + fragheaderlen + transhdrlen + 20,
			(flags & MSG_DONTWAIT), &err);
		if (skb == NULL)
			return -ENOMEM;

		/* reserve space for the hardware header */
		skb_reserve(skb, hh_len);

		/* create space for the UDP/IP header */
		skb_put(skb, fragheaderlen + transhdrlen);

		/* initialize the network header pointer */
		skb_reset_network_header(skb);

		/* initialize the protocol header pointer */
		skb->transport_header = skb->network_header + fragheaderlen;

		skb->ip_summed = CHECKSUM_PARTIAL;
		skb->csum = 0;
		sk->sk_sndmsg_off = 0;
	}

	err = skb_append_datato_frags(sk, skb, getfrag, from,
				      (length - transhdrlen));
	if (!err) {
		struct frag_hdr fhdr;

		/* Specify the length of each IP datagram fragment. */
		skb_shinfo(skb)->gso_size = mtu - fragheaderlen -
					    sizeof(struct frag_hdr);
		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
		ipv6_select_ident(skb, &fhdr);
		skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
		__skb_queue_tail(&sk->sk_write_queue, skb);

		return 0;
	}
	/* There is not enough support to do UDP LSO,
	 * so follow the normal path.
	 */
	kfree_skb(skb);

	return err;
}

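/*
 * ip6_append_data() queues data on sk->sk_write_queue, packetized into
 * MTU-sized skbs with room reserved for a fragment header; nothing hits
 * the wire until ip6_push_pending_frames() is called.  A rough sketch
 * of the calling convention used by the datagram protocols (illustrative
 * only; error handling, corking and locking elided):
 *
 *	err = ip6_append_data(sk, getfrag, msg->msg_iov, len,
 *			      transhdrlen, hlimit, tclass, opt, &fl,
 *			      (struct rt6_info *)dst, msg->msg_flags);
 *	if (err)
 *		ip6_flush_pending_frames(sk);
 *	else if (!(msg->msg_flags & MSG_MORE))
 *		err = ip6_push_pending_frames(sk);
 */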
int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
	int offset, int len, int odd, struct sk_buff *skb),
	void *from, int length, int transhdrlen,
	int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi *fl,
	struct rt6_info *rt, unsigned int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct sk_buff *skb;
	unsigned int maxfraglen, fragheaderlen;
	int exthdrlen;
	int hh_len;
	int mtu;
	int copy;
	int err;
	int offset = 0;
	int csummode = CHECKSUM_NONE;

	if (flags&MSG_PROBE)
		return 0;
	if (skb_queue_empty(&sk->sk_write_queue)) {
		/*
		 * setup for corking
		 */
		if (opt) {
			if (np->cork.opt == NULL) {
				np->cork.opt = kmalloc(opt->tot_len,
						       sk->sk_allocation);
				if (unlikely(np->cork.opt == NULL))
					return -ENOBUFS;
			} else if (np->cork.opt->tot_len < opt->tot_len) {
				printk(KERN_DEBUG "ip6_append_data: invalid option length\n");
				return -EINVAL;
			}
			memcpy(np->cork.opt, opt, opt->tot_len);
			inet->cork.flags |= IPCORK_OPT;
			/* need source address above miyazawa */
		}
		dst_hold(&rt->u.dst);
		inet->cork.dst = &rt->u.dst;
		inet->cork.fl = *fl;
		np->cork.hop_limit = hlimit;
		np->cork.tclass = tclass;
		mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
		      rt->u.dst.dev->mtu : dst_mtu(rt->u.dst.path);
		if (np->frag_size < mtu) {
			if (np->frag_size)
				mtu = np->frag_size;
		}
		inet->cork.fragsize = mtu;
		if (dst_allfrag(rt->u.dst.path))
			inet->cork.flags |= IPCORK_ALLFRAG;
		inet->cork.length = 0;
		sk->sk_sndmsg_page = NULL;
		sk->sk_sndmsg_off = 0;
		exthdrlen = rt->u.dst.header_len + (opt ? opt->opt_flen : 0) -
			    rt->rt6i_nfheader_len;
		length += exthdrlen;
		transhdrlen += exthdrlen;
	} else {
		rt = (struct rt6_info *)inet->cork.dst;
		fl = &inet->cork.fl;
		if (inet->cork.flags & IPCORK_OPT)
			opt = np->cork.opt;
		transhdrlen = 0;
		exthdrlen = 0;
		mtu = inet->cork.fragsize;
	}

	hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);

	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
			(opt ? opt->opt_nflen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
		     sizeof(struct frag_hdr);

	if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
		if (inet->cork.length + length > sizeof(struct ipv6hdr) +
		    IPV6_MAXPLEN - fragheaderlen) {
			ipv6_local_error(sk, EMSGSIZE, fl, mtu - exthdrlen);
			return -EMSGSIZE;
		}
	}

	/*
	 * Let's try using as much space as possible.
	 * Use the MTU if the total length of the message fits into it.
	 * Otherwise, we need to reserve a fragment header and
	 * fragment alignment (= 8-15 octets, in total).
	 *
	 * Note that we may need to "move" the data from the tail
	 * of the buffer to the new fragment when we split
	 * the message.
	 *
	 * FIXME: It may be fragmented into multiple chunks
	 *        at once if non-fragmentable extension headers
	 *        are too large.
	 * --yoshfuji
	 */

	inet->cork.length += length;
	if (((length > mtu) && (sk->sk_protocol == IPPROTO_UDP)) &&
	    (rt->u.dst.dev->features & NETIF_F_UFO)) {

		err = ip6_ufo_append_data(sk, getfrag, from, length, hh_len,
					  fragheaderlen, transhdrlen, mtu,
					  flags);
		if (err)
			goto error;
		return 0;
	}


	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into the current packet. */
		copy = (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;

		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen;
			struct sk_buff *skb_prev;
alloc_new_skb:
			skb_prev = skb;

			/* There's no room in the current skb */
			if (skb_prev)
				fraggap = skb_prev->len - maxfraglen;
			else
				fraggap = 0;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;
			if (datalen > (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
				datalen = maxfraglen - fragheaderlen;

			fraglen = datalen + fragheaderlen;
			if ((flags & MSG_MORE) &&
			    !(rt->u.dst.dev->features & NETIF_F_SG))
				alloclen = mtu;
			else
				alloclen = datalen + fragheaderlen;

			/*
			 * The last fragment gets additional space at the tail.
			 * Note: we overallocate on fragments with MSG_MORE
			 * because we have no idea whether we are the last one.
			 */
			if (datalen == length + fraggap)
				alloclen += rt->u.dst.trailer_len;

			/*
			 * We just reserve space for the fragment header.
			 * Note: this may be an overallocation if the message
			 * (without MSG_MORE) fits into the MTU.
			 */
			alloclen += sizeof(struct frag_hdr);

			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk,
						alloclen + hh_len,
						(flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				if (atomic_read(&sk->sk_wmem_alloc) <=
				    2 * sk->sk_sndbuf)
					skb = sock_wmalloc(sk,
							   alloclen + hh_len, 1,
							   sk->sk_allocation);
				if (unlikely(skb == NULL))
					err = -ENOBUFS;
			}
			if (skb == NULL)
				goto error;
			/*
			 *	Fill in the control structures
			 */
			skb->ip_summed = csummode;
			skb->csum = 0;
			/* reserve for fragmentation */
			skb_reserve(skb, hh_len + sizeof(struct frag_hdr));

			/*
			 *	Find where to start putting bytes
			 */
			data = skb_put(skb, fraglen);
			skb_set_network_header(skb, exthdrlen);
			data += fragheaderlen;
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}
			copy = datalen - transhdrlen - fraggap;
			if (copy < 0) {
				err = -EINVAL;
				kfree_skb(skb);
				goto error;
			} else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= datalen - fraggap;
			transhdrlen = 0;
			exthdrlen = 0;
			csummode = CHECKSUM_NONE;

			/*
			 * Put the packet on the pending queue
			 */
			__skb_queue_tail(&sk->sk_write_queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		if (!(rt->u.dst.dev->features & NETIF_F_SG)) {
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
				    offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else {
			int i = skb_shinfo(skb)->nr_frags;
			skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
			struct page *page = sk->sk_sndmsg_page;
			int off = sk->sk_sndmsg_off;
			unsigned int left;

			if (page && (left = PAGE_SIZE - off) > 0) {
				if (copy >= left)
					copy = left;
				if (page != frag->page) {
					if (i == MAX_SKB_FRAGS) {
						err = -EMSGSIZE;
						goto error;
					}
					get_page(page);
					skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
					frag = &skb_shinfo(skb)->frags[i];
				}
			} else if (i < MAX_SKB_FRAGS) {
				if (copy > PAGE_SIZE)
					copy = PAGE_SIZE;
				page = alloc_pages(sk->sk_allocation, 0);
				if (page == NULL) {
					err = -ENOMEM;
					goto error;
				}
				sk->sk_sndmsg_page = page;
				sk->sk_sndmsg_off = 0;

				skb_fill_page_desc(skb, i, page, 0, 0);
				frag = &skb_shinfo(skb)->frags[i];
			} else {
				err = -EMSGSIZE;
				goto error;
			}
			if (getfrag(from, page_address(frag->page) +
				    frag->page_offset + frag->size,
				    offset, copy, skb->len, skb) < 0) {
				err = -EFAULT;
				goto error;
			}
			sk->sk_sndmsg_off += copy;
			frag->size += copy;
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			atomic_add(copy, &sk->sk_wmem_alloc);
		}
		offset += copy;
		length -= copy;
	}
	return 0;
error:
	inet->cork.length -= length;
	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	return err;
}

static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np)
{
	inet->cork.flags &= ~IPCORK_OPT;
	kfree(np->cork.opt);
	np->cork.opt = NULL;
	if (inet->cork.dst) {
		dst_release(inet->cork.dst);
		inet->cork.dst = NULL;
		inet->cork.flags &= ~IPCORK_ALLFRAG;
	}
	memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
}

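/*
 * Transmit everything queued by ip6_append_data(): the queued skbs are
 * chained onto the first one via frag_list (which is exactly the shape
 * the fast path of ip6_fragment() expects), the IPv6 header is built
 * from the corked flow/options, and the result is handed to
 * ip6_local_out().  The cork is released whether or not transmission
 * succeeds.
 */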
int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct net *net = sock_net(sk);
	struct ipv6hdr *hdr;
	struct ipv6_txoptions *opt = np->cork.opt;
	struct rt6_info *rt = (struct rt6_info *)inet->cork.dst;
	struct flowi *fl = &inet->cork.fl;
	unsigned char proto = fl->proto;
	int err = 0;

	if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to the IP header, past the extension headers */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		__sock_put(tmp_skb->sk);
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Allow local fragmentation. */
	if (np->pmtudisc < IPV6_PMTUDISC_DO)
		skb->local_df = 1;

	ipv6_addr_copy(final_dst, &fl->fl6_dst);
	__skb_pull(skb, skb_network_header_len(skb));
	if (opt && opt->opt_flen)
		ipv6_push_frag_opts(skb, opt, &proto);
	if (opt && opt->opt_nflen)
		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	*(__be32 *)hdr = fl->fl6_flowlabel |
			 htonl(0x60000000 | ((int)np->cork.tclass << 20));

	hdr->hop_limit = np->cork.hop_limit;
	hdr->nexthdr = proto;
	ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
	ipv6_addr_copy(&hdr->daddr, final_dst);

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	skb->dst = dst_clone(&rt->u.dst);
	IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTREQUESTS);
	if (proto == IPPROTO_ICMPV6) {
		struct inet6_dev *idev = ip6_dst_idev(skb->dst);

		ICMP6MSGOUT_INC_STATS_BH(net, idev, icmp6_hdr(skb)->icmp6_type);
		ICMP6_INC_STATS_BH(net, idev, ICMP6_MIB_OUTMSGS);
	}

	err = ip6_local_out(skb);
	if (err) {
		if (err > 0)
			err = np->recverr ? net_xmit_errno(err) : 0;
		if (err)
			goto error;
	}

out:
	ip6_cork_release(inet, np);
	return err;
error:
	goto out;
}

void ip6_flush_pending_frames(struct sock *sk)
{
	struct sk_buff *skb;

	while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
		if (skb->dst)
			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb->dst),
				      IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
	}

	ip6_cork_release(inet_sk(sk), inet6_sk(sk));
}
1517