xref: /openbmc/linux/net/ipv6/ip6_output.c (revision 3bd653c8455bc7991bae77968702b31c8f5df883)
/*
 *	IPv6 output functions
 *	Linux INET6 implementation
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	Based on linux/net/ipv4/ip_output.c
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 *
 *	Changes:
 *	A.N.Kuznetsov	:	arithmetics in fragmentation.
 *				extension headers are implemented.
 *				route changes now work.
 *				ip6_forward does not confuse sniffers.
 *				etc.
 *
 *	H. von Brand	:	Added missing #include <linux/string.h>
 *	Imran Patel	:	frag id should be in NBO
 *	Kazunori MIYAZAWA @USAGI
 *			:	add ip6_append_data and related functions
 *				for datagram xmit
 */

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/in6.h>
#include <linux/tcp.h>
#include <linux/route.h>
#include <linux/module.h>

#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>

#include <net/sock.h>
#include <net/snmp.h>

#include <net/ipv6.h>
#include <net/ndisc.h>
#include <net/protocol.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
#include <net/rawv6.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/checksum.h>
#include <linux/mroute6.h>

static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));

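/*
 * Pick the identification value for an IPv6 fragment header.  A single
 * global counter, protected by a spinlock, is shared by all flows; zero
 * is skipped so that callers can treat a zero frag_id as "not yet
 * selected".
 */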
static __inline__ void ipv6_select_ident(struct sk_buff *skb, struct frag_hdr *fhdr)
{
	static u32 ipv6_fragmentation_id = 1;
	static DEFINE_SPINLOCK(ip6_id_lock);

	spin_lock_bh(&ip6_id_lock);
	fhdr->identification = htonl(ipv6_fragmentation_id);
	if (++ipv6_fragmentation_id == 0)
		ipv6_fragmentation_id = 1;
	spin_unlock_bh(&ip6_id_lock);
}

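/*
 * Fill in the IPv6 payload length and run the packet through the
 * NF_INET_LOCAL_OUT netfilter hook.  A return value of 1 means the
 * hook accepted the packet and the caller must transmit it itself.
 */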
int __ip6_local_out(struct sk_buff *skb)
{
	int len;

	len = skb->len - sizeof(struct ipv6hdr);
	if (len > IPV6_MAXPLEN)
		len = 0;
	ipv6_hdr(skb)->payload_len = htons(len);

	return nf_hook(PF_INET6, NF_INET_LOCAL_OUT, skb, NULL, skb->dst->dev,
		       dst_output);
}

int ip6_local_out(struct sk_buff *skb)
{
	int err;

	err = __ip6_local_out(skb);
	if (likely(err == 1))
		err = dst_output(skb);

	return err;
}
EXPORT_SYMBOL_GPL(ip6_local_out);

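/*
 * Hand the packet to the neighbour layer: use the cached hardware
 * header if one exists, otherwise let the neighbour's output function
 * resolve it.  With neither available the packet cannot be sent.
 */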
static int ip6_output_finish(struct sk_buff *skb)
{
	struct dst_entry *dst = skb->dst;

	if (dst->hh)
		return neigh_hh_output(dst->hh, skb);
	else if (dst->neighbour)
		return dst->neighbour->output(skb);

	IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
	kfree_skb(skb);
	return -EINVAL;
}

/* dev_loopback_xmit for use with netfilter. */
static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
{
	skb_reset_mac_header(newskb);
	__skb_pull(newskb, skb_network_offset(newskb));
	newskb->pkt_type = PACKET_LOOPBACK;
	newskb->ip_summed = CHECKSUM_UNNECESSARY;
	WARN_ON(!newskb->dst);

	netif_rx(newskb);
	return 0;
}

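/*
 * For a multicast destination, loop a copy of the packet back to local
 * listeners (and to the multicast routing daemon, when one is running)
 * before running NF_INET_POST_ROUTING on the original.
 */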
static int ip6_output2(struct sk_buff *skb)
{
	struct dst_entry *dst = skb->dst;
	struct net_device *dev = dst->dev;

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
		struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
		struct inet6_dev *idev = ip6_dst_idev(skb->dst);

		if (!(dev->flags & IFF_LOOPBACK) && (!np || np->mc_loop) &&
		    ((mroute6_socket && !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
		     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
					 &ipv6_hdr(skb)->saddr))) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

			/* Do not check for IFF_ALLMULTI; multicast routing
			   is not supported in any case.
			 */
			if (newskb)
				NF_HOOK(PF_INET6, NF_INET_POST_ROUTING, newskb,
					NULL, newskb->dev,
					ip6_dev_loopback_xmit);

			if (ipv6_hdr(skb)->hop_limit == 0) {
				IP6_INC_STATS(dev_net(dev), idev,
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return 0;
			}
		}

		IP6_INC_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCASTPKTS);
	}

	return NF_HOOK(PF_INET6, NF_INET_POST_ROUTING, skb, NULL, skb->dev,
		       ip6_output_finish);
}

static inline int ip6_skb_dst_mtu(struct sk_buff *skb)
{
	struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;

	return (np && np->pmtudisc == IPV6_PMTUDISC_PROBE) ?
	       skb->dst->dev->mtu : dst_mtu(skb->dst);
}

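/*
 * Output entry point reached via dst_output().  Drop the packet if
 * IPv6 is administratively disabled on the device; fragment it when it
 * exceeds the path MTU (or when the route demands per-packet
 * fragmentation via allfrag).
 */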
int ip6_output(struct sk_buff *skb)
{
	struct inet6_dev *idev = ip6_dst_idev(skb->dst);
	if (unlikely(idev->cnf.disable_ipv6)) {
		IP6_INC_STATS(dev_net(skb->dst->dev), idev,
			      IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
		return 0;
	}

	if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
				dst_allfrag(skb->dst))
		return ip6_fragment(skb, ip6_output2);
	else
		return ip6_output2(skb);
}

/*
 *	xmit an sk_buff (used by TCP)
 */

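/*
 * Build the IPv6 header for an already routed skb and hand it to
 * NF_INET_LOCAL_OUT.  @opt carries extension headers to push in front
 * of the payload; @ipfragok permits local fragmentation of the result.
 */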
int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
	     struct ipv6_txoptions *opt, int ipfragok)
{
	struct net *net = sock_net(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct in6_addr *first_hop = &fl->fl6_dst;
	struct dst_entry *dst = skb->dst;
	struct ipv6hdr *hdr;
	u8  proto = fl->proto;
	int seg_len = skb->len;
	int hlimit, tclass;
	u32 mtu;

	if (opt) {
		unsigned int head_room;

		/* First: exthdrs may take lots of space (~8K for now);
		   MAX_HEADER is not enough.
		 */
		head_room = opt->opt_nflen + opt->opt_flen;
		seg_len += head_room;
		head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);

		if (skb_headroom(skb) < head_room) {
			struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
			if (skb2 == NULL) {
				IP6_INC_STATS(net, ip6_dst_idev(skb->dst),
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return -ENOBUFS;
			}
			kfree_skb(skb);
			skb = skb2;
			if (sk)
				skb_set_owner_w(skb, sk);
		}
		if (opt->opt_flen)
			ipv6_push_frag_opts(skb, opt, &proto);
		if (opt->opt_nflen)
			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
	}

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	/* Allow local fragmentation. */
	if (ipfragok)
		skb->local_df = 1;

	/*
	 *	Fill in the IPv6 header
	 */

	hlimit = -1;
	if (np)
		hlimit = np->hop_limit;
	if (hlimit < 0)
		hlimit = ip6_dst_hoplimit(dst);

	tclass = -1;
	if (np)
		tclass = np->tclass;
	if (tclass < 0)
		tclass = 0;

	*(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl->fl6_flowlabel;

	hdr->payload_len = htons(seg_len);
	hdr->nexthdr = proto;
	hdr->hop_limit = hlimit;

	ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
	ipv6_addr_copy(&hdr->daddr, first_hop);

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	mtu = dst_mtu(dst);
	if ((skb->len <= mtu) || skb->local_df || skb_is_gso(skb)) {
		IP6_INC_STATS(net, ip6_dst_idev(skb->dst),
			      IPSTATS_MIB_OUTREQUESTS);
		return NF_HOOK(PF_INET6, NF_INET_LOCAL_OUT, skb, NULL, dst->dev,
				dst_output);
	}

	if (net_ratelimit())
		printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
	skb->dev = dst->dev;
	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
	IP6_INC_STATS(net, ip6_dst_idev(skb->dst), IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return -EMSGSIZE;
}
EXPORT_SYMBOL(ip6_xmit);

/*
 *	To avoid extra problems ND packets are sent through this
 *	routine. It's code duplication, but I really want to avoid
 *	extra checks, since ipv6_build_header is used by TCP (which
 *	is performance critical for us)
 */
int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
	       const struct in6_addr *saddr, const struct in6_addr *daddr,
	       int proto, int len)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct ipv6hdr *hdr;
	int totlen;

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	totlen = len + sizeof(struct ipv6hdr);

	skb_reset_network_header(skb);
	skb_put(skb, sizeof(struct ipv6hdr));
	hdr = ipv6_hdr(skb);

	*(__be32 *)hdr = htonl(0x60000000);

	hdr->payload_len = htons(len);
	hdr->nexthdr = proto;
	hdr->hop_limit = np->hop_limit;

	ipv6_addr_copy(&hdr->saddr, saddr);
	ipv6_addr_copy(&hdr->daddr, daddr);

	return 0;
}

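/*
 * Deliver a Router Alert packet to every raw socket that registered a
 * matching alert value via IPV6_ROUTER_ALERT.  Returns 1 when the
 * packet was consumed by at least one socket.
 */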
static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
	struct ip6_ra_chain *ra;
	struct sock *last = NULL;

	read_lock(&ip6_ra_lock);
	for (ra = ip6_ra_chain; ra; ra = ra->next) {
		struct sock *sk = ra->sk;
		if (sk && ra->sel == sel &&
		    (!sk->sk_bound_dev_if ||
		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
			if (last) {
				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
				if (skb2)
					rawv6_rcv(last, skb2);
			}
			last = sk;
		}
	}

	if (last) {
		rawv6_rcv(last, skb);
		read_unlock(&ip6_ra_lock);
		return 1;
	}
	read_unlock(&ip6_ra_lock);
	return 0;
}

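/*
 * Decide what to do with a packet addressed to a destination we proxy:
 * unicast neighbour discovery messages are passed to local input
 * (return 1), link-local destinations are rejected (return -1), and
 * anything else is forwarded as usual (return 0).
 */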
static int ip6_forward_proxy_check(struct sk_buff *skb)
{
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	u8 nexthdr = hdr->nexthdr;
	int offset;

	if (ipv6_ext_hdr(nexthdr)) {
		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr);
		if (offset < 0)
			return 0;
	} else
		offset = sizeof(struct ipv6hdr);

	if (nexthdr == IPPROTO_ICMPV6) {
		struct icmp6hdr *icmp6;

		if (!pskb_may_pull(skb, (skb_network_header(skb) +
					 offset + 1 - skb->data)))
			return 0;

		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);

		switch (icmp6->icmp6_type) {
		case NDISC_ROUTER_SOLICITATION:
		case NDISC_ROUTER_ADVERTISEMENT:
		case NDISC_NEIGHBOUR_SOLICITATION:
		case NDISC_NEIGHBOUR_ADVERTISEMENT:
		case NDISC_REDIRECT:
			/* A unicast neighbour discovery message destined
			 * to the proxied address must be passed to the
			 * input function.
			 */
			return 1;
		default:
			break;
		}
	}

	/*
	 * The proxying router can't forward traffic sent to a link-local
	 * address, so signal the sender and discard the packet. This
	 * behavior is clarified by the MIPv6 specification.
	 */
	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
		dst_link_failure(skb);
		return -1;
	}

	return 0;
}

static inline int ip6_forward_finish(struct sk_buff *skb)
{
	return dst_output(skb);
}

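/*
 * Main IPv6 forwarding path: validate and decrement the hop limit,
 * handle NDP proxying and Router Alert options, emit redirects when
 * appropriate, enforce the path MTU, and finally hand the packet to
 * the NF_INET_FORWARD hook.
 */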
int ip6_forward(struct sk_buff *skb)
{
	struct dst_entry *dst = skb->dst;
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	struct inet6_skb_parm *opt = IP6CB(skb);
	struct net *net = dev_net(dst->dev);

	if (net->ipv6.devconf_all->forwarding == 0)
		goto error;

	if (skb_warn_if_lro(skb))
		goto drop;

	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
		goto drop;
	}

	skb_forward_csum(skb);

	/*
	 *	We do not process RA (Router Alert) packets; we push them
	 *	to user level AS IS, without any warranty that the
	 *	application will be able to interpret them. The reason is
	 *	that we cannot do anything clever here.
	 *
	 *	We are not the end node, so if the packet contains
	 *	AH/ESP we cannot do anything either.
	 *	Defragmentation would also be a mistake: RA packets
	 *	cannot be fragmented, because there is no guarantee
	 *	that different fragments will follow one path. --ANK
	 */
	if (opt->ra) {
		u8 *ptr = skb_network_header(skb) + opt->ra;
		if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
			return 0;
	}

	/*
	 *	check and decrement hop limit
	 */
	if (hdr->hop_limit <= 1) {
		/* Force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT,
			    0, skb->dev);
		IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);

		kfree_skb(skb);
		return -ETIMEDOUT;
	}

	/* XXX: idev->cnf.proxy_ndp? */
	if (net->ipv6.devconf_all->proxy_ndp &&
	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
		int proxied = ip6_forward_proxy_check(skb);
		if (proxied > 0)
			return ip6_input(skb);
		else if (proxied < 0) {
			IP6_INC_STATS(net, ip6_dst_idev(dst),
				      IPSTATS_MIB_INDISCARDS);
			goto drop;
		}
	}

	if (!xfrm6_route_forward(skb)) {
		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
		goto drop;
	}
	dst = skb->dst;

	/* IPv6 specs say nothing about it, but it is clear that we cannot
	   send redirects to source routed frames.
	   We don't send redirects to frames decapsulated from IPsec.
	 */
	if (skb->dev == dst->dev && dst->neighbour && opt->srcrt == 0 &&
	    !skb->sp) {
		struct in6_addr *target = NULL;
		struct rt6_info *rt;
		struct neighbour *n = dst->neighbour;

		/*
		 *	incoming and outgoing devices are the same;
		 *	send a redirect.
		 */

		rt = (struct rt6_info *) dst;
		if ((rt->rt6i_flags & RTF_GATEWAY))
			target = (struct in6_addr *)&n->primary_key;
		else
			target = &hdr->daddr;

		/* Limit redirects both by destination (here)
		   and by source (inside ndisc_send_redirect)
		 */
		if (xrlim_allow(dst, 1*HZ))
			ndisc_send_redirect(skb, n, target);
	} else {
		int addrtype = ipv6_addr_type(&hdr->saddr);

		/* This check is security critical. */
		if (addrtype == IPV6_ADDR_ANY ||
		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
			goto error;
		if (addrtype & IPV6_ADDR_LINKLOCAL) {
			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
				ICMPV6_NOT_NEIGHBOUR, 0, skb->dev);
			goto error;
		}
	}

	if (skb->len > dst_mtu(dst)) {
		/* Again, force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, dst_mtu(dst), skb->dev);
		IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS);
		IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	if (skb_cow(skb, dst->dev->hard_header_len)) {
		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
		goto drop;
	}

	hdr = ipv6_hdr(skb);

	/* Mangling the hop limit is delayed to the point after skb COW */

	hdr->hop_limit--;

	IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
	return NF_HOOK(PF_INET6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
		       ip6_forward_finish);

error:
	IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
drop:
	kfree_skb(skb);
	return -EINVAL;
}

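/*
 * Copy the per-packet metadata that each fragment must inherit from
 * the original skb (routing, netfilter and traffic-control state).
 */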
static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	dst_release(to->dst);
	to->dst = dst_clone(from->dst);
	to->dev = from->dev;
	to->mark = from->mark;

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
	nf_copy(to, from);
#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
    defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
	to->nf_trace = from->nf_trace;
#endif
	skb_copy_secmark(to, from);
}

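/*
 * Walk the extension header chain to find the offset at which a
 * fragment header has to be inserted: hop-by-hop, routing and (for
 * Mobile IPv6 home address options) destination option headers belong
 * to the unfragmentable part and must precede it.
 */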
int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
{
	u16 offset = sizeof(struct ipv6hdr);
	struct ipv6_opt_hdr *exthdr =
				(struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1);
	unsigned int packet_len = skb->tail - skb->network_header;
	int found_rhdr = 0;
	*nexthdr = &ipv6_hdr(skb)->nexthdr;

	while (offset + 1 <= packet_len) {

		switch (**nexthdr) {

		case NEXTHDR_HOP:
			break;
		case NEXTHDR_ROUTING:
			found_rhdr = 1;
			break;
		case NEXTHDR_DEST:
#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
			if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0)
				break;
#endif
			if (found_rhdr)
				return offset;
			break;
		default:
			return offset;
		}

		offset += ipv6_optlen(exthdr);
		*nexthdr = &exthdr->nexthdr;
		exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) +
						 offset);
	}

	return offset;
}

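/*
 * Fragment an oversized packet.  The fast path re-uses the skbs
 * already chained on frag_list when their geometry allows it; the
 * slow path (below) allocates a fresh skb per fragment and copies the
 * payload into it.
 */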
static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
{
	struct sk_buff *frag;
	struct rt6_info *rt = (struct rt6_info *)skb->dst;
	struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
	struct ipv6hdr *tmp_hdr;
	struct frag_hdr *fh;
	unsigned int mtu, hlen, left, len;
	__be32 frag_id = 0;
	int ptr, offset = 0, err = 0;
	u8 *prevhdr, nexthdr = 0;
	struct net *net = dev_net(skb->dst->dev);

	hlen = ip6_find_1stfragopt(skb, &prevhdr);
	nexthdr = *prevhdr;

	mtu = ip6_skb_dst_mtu(skb);

	/* We must not fragment if the socket is set to force MTU discovery
	 * or if the skb was not generated by a local socket.  (This last
	 * check should be redundant, but it's free.)
	 */
	if (!skb->local_df) {
		skb->dev = skb->dst->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
		IP6_INC_STATS(net, ip6_dst_idev(skb->dst),
			      IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	if (np && np->frag_size < mtu) {
		if (np->frag_size)
			mtu = np->frag_size;
	}
	mtu -= hlen + sizeof(struct frag_hdr);

	if (skb_shinfo(skb)->frag_list) {
		int first_len = skb_pagelen(skb);
		int truesizes = 0;

		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    skb_cloned(skb))
			goto slow_path;

		for (frag = skb_shinfo(skb)->frag_list; frag; frag = frag->next) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < hlen)
				goto slow_path;

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path;

			BUG_ON(frag->sk);
			if (skb->sk) {
				sock_hold(skb->sk);
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
				truesizes += frag->truesize;
			}
		}

		err = 0;
		offset = 0;
		frag = skb_shinfo(skb)->frag_list;
		skb_shinfo(skb)->frag_list = NULL;
		/* BUILD HEADER */

		*prevhdr = NEXTHDR_FRAGMENT;
		tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
		if (!tmp_hdr) {
			IP6_INC_STATS(net, ip6_dst_idev(skb->dst),
				      IPSTATS_MIB_FRAGFAILS);
			return -ENOMEM;
		}

		__skb_pull(skb, hlen);
		fh = (struct frag_hdr *)__skb_push(skb, sizeof(struct frag_hdr));
		__skb_push(skb, hlen);
		skb_reset_network_header(skb);
		memcpy(skb_network_header(skb), tmp_hdr, hlen);

		ipv6_select_ident(skb, fh);
		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		fh->frag_off = htons(IP6_MF);
		frag_id = fh->identification;

		first_len = skb_pagelen(skb);
		skb->data_len = first_len - skb_headlen(skb);
		skb->truesize -= truesizes;
		skb->len = first_len;
		ipv6_hdr(skb)->payload_len = htons(first_len -
						   sizeof(struct ipv6hdr));

		dst_hold(&rt->u.dst);

		for (;;) {
			/* Prepare the header of the next frame
			 * before the previous one goes down. */
			if (frag) {
				frag->ip_summed = CHECKSUM_NONE;
				skb_reset_transport_header(frag);
				fh = (struct frag_hdr *)__skb_push(frag, sizeof(struct frag_hdr));
				__skb_push(frag, hlen);
				skb_reset_network_header(frag);
				memcpy(skb_network_header(frag), tmp_hdr,
				       hlen);
				offset += skb->len - hlen - sizeof(struct frag_hdr);
				fh->nexthdr = nexthdr;
				fh->reserved = 0;
				fh->frag_off = htons(offset);
				if (frag->next != NULL)
					fh->frag_off |= htons(IP6_MF);
				fh->identification = frag_id;
				ipv6_hdr(frag)->payload_len =
						htons(frag->len -
						      sizeof(struct ipv6hdr));
				ip6_copy_metadata(frag, skb);
			}

			err = output(skb);
			if (!err)
				IP6_INC_STATS(net, ip6_dst_idev(&rt->u.dst),
					      IPSTATS_MIB_FRAGCREATES);

			if (err || !frag)
				break;

			skb = frag;
			frag = skb->next;
			skb->next = NULL;
		}

		kfree(tmp_hdr);

		if (err == 0) {
			IP6_INC_STATS(net, ip6_dst_idev(&rt->u.dst),
				      IPSTATS_MIB_FRAGOKS);
			dst_release(&rt->u.dst);
			return 0;
		}

		while (frag) {
			skb = frag->next;
			kfree_skb(frag);
			frag = skb;
		}

		IP6_INC_STATS(net, ip6_dst_idev(&rt->u.dst),
			      IPSTATS_MIB_FRAGFAILS);
		dst_release(&rt->u.dst);
		return err;
	}

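	/*
	 * Slow path: allocate a new skb for every fragment and copy the
	 * payload into it piece by piece.
	 */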
slow_path:
	left = skb->len - hlen;		/* Space per frame */
	ptr = hlen;			/* Where to start from */

	/*
	 *	Fragment the datagram.
	 */

	*prevhdr = NEXTHDR_FRAGMENT;

	/*
	 *	Keep copying data until we run out.
	 */
	while (left > 0) {
		len = left;
		/* IF: it doesn't fit, use 'mtu' - the data space left */
		if (len > mtu)
			len = mtu;
		/* IF: we are not sending up to and including the packet end
		   then align the next start on an eight byte boundary */
		if (len < left) {
			len &= ~7;
		}
		/*
		 *	Allocate buffer.
		 */

		if ((frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
				      LL_ALLOCATED_SPACE(rt->u.dst.dev),
				      GFP_ATOMIC)) == NULL) {
			NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
			IP6_INC_STATS(net, ip6_dst_idev(skb->dst),
				      IPSTATS_MIB_FRAGFAILS);
			err = -ENOMEM;
			goto fail;
		}

		/*
		 *	Set up data on packet
		 */

		ip6_copy_metadata(frag, skb);
		skb_reserve(frag, LL_RESERVED_SPACE(rt->u.dst.dev));
		skb_put(frag, len + hlen + sizeof(struct frag_hdr));
		skb_reset_network_header(frag);
		fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
		frag->transport_header = (frag->network_header + hlen +
					  sizeof(struct frag_hdr));

		/*
		 *	Charge the memory for the fragment to any owner
		 *	it might possess
		 */
		if (skb->sk)
			skb_set_owner_w(frag, skb->sk);

		/*
		 *	Copy the packet header into the new buffer.
		 */
		skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);

		/*
		 *	Build fragment header.
		 */
		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		if (!frag_id) {
			ipv6_select_ident(skb, fh);
			frag_id = fh->identification;
		} else
			fh->identification = frag_id;

		/*
		 *	Copy a block of the IP datagram.
		 */
		if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len))
			BUG();
		left -= len;

		fh->frag_off = htons(offset);
		if (left > 0)
			fh->frag_off |= htons(IP6_MF);
		ipv6_hdr(frag)->payload_len = htons(frag->len -
						    sizeof(struct ipv6hdr));

		ptr += len;
		offset += len;

		/*
		 *	Put this fragment into the sending queue.
		 */
		err = output(frag);
		if (err)
			goto fail;

		IP6_INC_STATS(net, ip6_dst_idev(skb->dst),
			      IPSTATS_MIB_FRAGCREATES);
	}
	IP6_INC_STATS(net, ip6_dst_idev(skb->dst),
		      IPSTATS_MIB_FRAGOKS);
	kfree_skb(skb);
	return err;

fail:
	IP6_INC_STATS(net, ip6_dst_idev(skb->dst),
		      IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return err;
}

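/*
 * Return non-zero when the cached route can no longer be trusted for
 * this flow: it is neither a host route to the flow's address nor
 * confirmed by the address the socket last used.
 */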
static inline int ip6_rt_check(struct rt6key *rt_key,
			       struct in6_addr *fl_addr,
			       struct in6_addr *addr_cache)
{
	return ((rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
		(addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache)));
}

static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
					  struct dst_entry *dst,
					  struct flowi *fl)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct rt6_info *rt = (struct rt6_info *)dst;

	if (!dst)
		goto out;

	/* Yes, checking route validity in the non-connected
	 * case is not very simple. Take into account
	 * that we do not support routing by source, TOS,
	 * and MSG_DONTROUTE		--ANK (980726)
	 *
	 * 1. ip6_rt_check(): If route was host route,
	 *    check that cached destination is current.
	 *    If it is network route, we still may
	 *    check its validity using saved pointer
	 *    to the last used address: daddr_cache.
	 *    We do not want to save whole address now,
	 *    (because main consumer of this service
	 *    is tcp, which does not have this problem),
	 *    so that the last trick works only on connected
	 *    sockets.
	 * 2. oif also should be the same.
	 */
	if (ip6_rt_check(&rt->rt6i_dst, &fl->fl6_dst, np->daddr_cache) ||
#ifdef CONFIG_IPV6_SUBTREES
	    ip6_rt_check(&rt->rt6i_src, &fl->fl6_src, np->saddr_cache) ||
#endif
	    (fl->oif && fl->oif != dst->dev->ifindex)) {
		dst_release(dst);
		dst = NULL;
	}

out:
	return dst;
}

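/*
 * Perform the actual route lookup and, when the flow still lacks a
 * source address, select one.  With optimistic DAD the lookup may be
 * redirected to the default router while our own address is still
 * tentative.
 */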
static int ip6_dst_lookup_tail(struct sock *sk,
			       struct dst_entry **dst, struct flowi *fl)
{
	int err;
	struct net *net = sock_net(sk);

	if (*dst == NULL)
		*dst = ip6_route_output(net, sk, fl);

	if ((err = (*dst)->error))
		goto out_err_release;

	if (ipv6_addr_any(&fl->fl6_src)) {
		err = ipv6_dev_get_saddr(net, ip6_dst_idev(*dst)->dev,
					 &fl->fl6_dst,
					 sk ? inet6_sk(sk)->srcprefs : 0,
					 &fl->fl6_src);
		if (err)
			goto out_err_release;
	}

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	/*
	 * If the dst entry we've looked up has a neighbour entry that
	 * is in the INCOMPLETE state and the source address from the
	 * flow is marked as OPTIMISTIC, we release the found dst entry
	 * and replace it with the dst entry of the nexthop router.
	 */
	if ((*dst)->neighbour && !((*dst)->neighbour->nud_state & NUD_VALID)) {
		struct inet6_ifaddr *ifp;
		struct flowi fl_gw;
		int redirect;

		ifp = ipv6_get_ifaddr(net, &fl->fl6_src,
				      (*dst)->dev, 1);

		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
		if (ifp)
			in6_ifa_put(ifp);

		if (redirect) {
			/*
			 * We need to get the dst entry for the
			 * default router instead
			 */
			dst_release(*dst);
			memcpy(&fl_gw, fl, sizeof(struct flowi));
			memset(&fl_gw.fl6_dst, 0, sizeof(struct in6_addr));
			*dst = ip6_route_output(net, sk, &fl_gw);
			if ((err = (*dst)->error))
				goto out_err_release;
		}
	}
#endif

	return 0;

out_err_release:
	if (err == -ENETUNREACH)
		IP6_INC_STATS_BH(NULL, IPSTATS_MIB_OUTNOROUTES);
	dst_release(*dst);
	*dst = NULL;
	return err;
}

/**
 *	ip6_dst_lookup - perform route lookup on flow
 *	@sk: socket which provides route info
 *	@dst: pointer to dst_entry * for result
 *	@fl: flow to lookup
 *
 *	This function performs a route lookup on the given flow.
 *
 *	It returns zero on success, or a standard errno code on error.
 */
int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
{
	*dst = NULL;
	return ip6_dst_lookup_tail(sk, dst, fl);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup);

/**
 *	ip6_sk_dst_lookup - perform socket cached route lookup on flow
 *	@sk: socket which provides the dst cache and route info
 *	@dst: pointer to dst_entry * for result
 *	@fl: flow to lookup
 *
 *	This function performs a route lookup on the given flow with the
 *	possibility of using the cached route in the socket if it is valid.
 *	It will take the socket dst lock when operating on the dst cache.
 *	As a result, this function can only be used in process context.
 *
 *	It returns zero on success, or a standard errno code on error.
 */
int ip6_sk_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
{
	*dst = NULL;
	if (sk) {
		*dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
		*dst = ip6_sk_dst_check(sk, *dst, fl);
	}

	return ip6_dst_lookup_tail(sk, dst, fl);
}
EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup);

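/*
 * UFO path for ip6_append_data(): build one large skb and let the
 * device (or the software GSO layer) split it into fragments later,
 * recording the fragment id and size in the shared info block.
 */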
static inline int ip6_ufo_append_data(struct sock *sk,
			int getfrag(void *from, char *to, int offset, int len,
			int odd, struct sk_buff *skb),
			void *from, int length, int hh_len, int fragheaderlen,
			int transhdrlen, int mtu, unsigned int flags)

{
	struct sk_buff *skb;
	int err;

	/* The network device supports UDP large send offload, so create
	 * one single skb containing the complete udp datagram
	 */
	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
		skb = sock_alloc_send_skb(sk,
			hh_len + fragheaderlen + transhdrlen + 20,
			(flags & MSG_DONTWAIT), &err);
		if (skb == NULL)
			return -ENOMEM;

		/* reserve space for Hardware header */
		skb_reserve(skb, hh_len);

		/* create space for UDP/IP header */
		skb_put(skb, fragheaderlen + transhdrlen);

		/* initialize network header pointer */
		skb_reset_network_header(skb);

		/* initialize protocol header pointer */
		skb->transport_header = skb->network_header + fragheaderlen;

		skb->ip_summed = CHECKSUM_PARTIAL;
		skb->csum = 0;
		sk->sk_sndmsg_off = 0;
	}

	err = skb_append_datato_frags(sk, skb, getfrag, from,
				      (length - transhdrlen));
	if (!err) {
		struct frag_hdr fhdr;

		/* specify the length of each IP datagram fragment */
		skb_shinfo(skb)->gso_size = mtu - fragheaderlen -
					    sizeof(struct frag_hdr);
		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
		ipv6_select_ident(skb, &fhdr);
		skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
		__skb_queue_tail(&sk->sk_write_queue, skb);

		return 0;
	}
	/* There is not enough support to do UDP LSO,
	 * so follow the normal path
	 */
	kfree_skb(skb);

	return err;
}

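/*
 * Queue data on the socket's write queue until the caller pushes or
 * flushes the pending frames.  The first call sets up the cork state
 * (route, options, MTU); subsequent calls keep appending to it.
 */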
int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
	int offset, int len, int odd, struct sk_buff *skb),
	void *from, int length, int transhdrlen,
	int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi *fl,
	struct rt6_info *rt, unsigned int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct sk_buff *skb;
	unsigned int maxfraglen, fragheaderlen;
	int exthdrlen;
	int hh_len;
	int mtu;
	int copy;
	int err;
	int offset = 0;
	int csummode = CHECKSUM_NONE;

	if (flags & MSG_PROBE)
		return 0;
	if (skb_queue_empty(&sk->sk_write_queue)) {
		/*
		 * setup for corking
		 */
		if (opt) {
			if (np->cork.opt == NULL) {
				np->cork.opt = kmalloc(opt->tot_len,
						       sk->sk_allocation);
				if (unlikely(np->cork.opt == NULL))
					return -ENOBUFS;
			} else if (np->cork.opt->tot_len < opt->tot_len) {
				printk(KERN_DEBUG "ip6_append_data: invalid option length\n");
				return -EINVAL;
			}
			memcpy(np->cork.opt, opt, opt->tot_len);
			inet->cork.flags |= IPCORK_OPT;
			/* need source address above miyazawa*/
		}
		dst_hold(&rt->u.dst);
		inet->cork.dst = &rt->u.dst;
		inet->cork.fl = *fl;
		np->cork.hop_limit = hlimit;
		np->cork.tclass = tclass;
		mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
		      rt->u.dst.dev->mtu : dst_mtu(rt->u.dst.path);
		if (np->frag_size < mtu) {
			if (np->frag_size)
				mtu = np->frag_size;
		}
		inet->cork.fragsize = mtu;
		if (dst_allfrag(rt->u.dst.path))
			inet->cork.flags |= IPCORK_ALLFRAG;
		inet->cork.length = 0;
		sk->sk_sndmsg_page = NULL;
		sk->sk_sndmsg_off = 0;
		exthdrlen = rt->u.dst.header_len + (opt ? opt->opt_flen : 0) -
			    rt->rt6i_nfheader_len;
		length += exthdrlen;
		transhdrlen += exthdrlen;
	} else {
		rt = (struct rt6_info *)inet->cork.dst;
		fl = &inet->cork.fl;
		if (inet->cork.flags & IPCORK_OPT)
			opt = np->cork.opt;
		transhdrlen = 0;
		exthdrlen = 0;
		mtu = inet->cork.fragsize;
	}

	hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);

	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
			(opt ? opt->opt_nflen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);

	if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
		if (inet->cork.length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
			ipv6_local_error(sk, EMSGSIZE, fl, mtu - exthdrlen);
			return -EMSGSIZE;
		}
	}

	/*
	 * Let's try using as much space as possible.
	 * Use MTU if total length of the message fits into the MTU.
	 * Otherwise, we need to reserve fragment header and
	 * fragment alignment (= 8-15 octets, in total).
	 *
	 * Note that we may need to "move" the data from the tail
	 * of the buffer to the new fragment when we split
	 * the message.
	 *
	 * FIXME: It may be fragmented into multiple chunks
	 *        at once if non-fragmentable extension headers
	 *        are too large.
	 * --yoshfuji
	 */

	inet->cork.length += length;
	if (((length > mtu) && (sk->sk_protocol == IPPROTO_UDP)) &&
	    (rt->u.dst.dev->features & NETIF_F_UFO)) {

		err = ip6_ufo_append_data(sk, getfrag, from, length, hh_len,
					  fragheaderlen, transhdrlen, mtu,
					  flags);
		if (err)
			goto error;
		return 0;
	}

	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
		goto alloc_new_skb;

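	/*
	 * Append data to the pending queue, growing the tail skb when
	 * there is room and starting a new, fragment-aligned skb
	 * otherwise.
	 */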
	while (length > 0) {
		/* Check if the remaining data fits into current packet. */
		copy = (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;

		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen;
			struct sk_buff *skb_prev;
alloc_new_skb:
			skb_prev = skb;

			/* There's no room in the current skb */
			if (skb_prev)
				fraggap = skb_prev->len - maxfraglen;
			else
				fraggap = 0;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;
			if (datalen > (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
				datalen = maxfraglen - fragheaderlen;

			fraglen = datalen + fragheaderlen;
			if ((flags & MSG_MORE) &&
			    !(rt->u.dst.dev->features & NETIF_F_SG))
				alloclen = mtu;
			else
				alloclen = datalen + fragheaderlen;

			/*
			 * The last fragment gets additional space at tail.
			 * Note: we overallocate on fragments with MSG_MORE
			 * because we have no idea if we're the last one.
			 */
			if (datalen == length + fraggap)
				alloclen += rt->u.dst.trailer_len;

			/*
			 * We just reserve space for fragment header.
			 * Note: this may be overallocation if the message
			 * (without MSG_MORE) fits into the MTU.
			 */
			alloclen += sizeof(struct frag_hdr);

			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk,
						alloclen + hh_len,
						(flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				if (atomic_read(&sk->sk_wmem_alloc) <=
				    2 * sk->sk_sndbuf)
					skb = sock_wmalloc(sk,
							   alloclen + hh_len, 1,
							   sk->sk_allocation);
				if (unlikely(skb == NULL))
					err = -ENOBUFS;
			}
			if (skb == NULL)
				goto error;
			/*
			 *	Fill in the control structures
			 */
			skb->ip_summed = csummode;
			skb->csum = 0;
			/* reserve for fragmentation */
			skb_reserve(skb, hh_len + sizeof(struct frag_hdr));

			/*
			 *	Find where to start putting bytes
			 */
			data = skb_put(skb, fraglen);
			skb_set_network_header(skb, exthdrlen);
			data += fragheaderlen;
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}
			copy = datalen - transhdrlen - fraggap;
			if (copy < 0) {
				err = -EINVAL;
				kfree_skb(skb);
				goto error;
			} else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= datalen - fraggap;
			transhdrlen = 0;
			exthdrlen = 0;
			csummode = CHECKSUM_NONE;

			/*
			 * Put the packet on the pending queue
			 */
			__skb_queue_tail(&sk->sk_write_queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		if (!(rt->u.dst.dev->features & NETIF_F_SG)) {
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
						offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else {
			int i = skb_shinfo(skb)->nr_frags;
			skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
			struct page *page = sk->sk_sndmsg_page;
			int off = sk->sk_sndmsg_off;
			unsigned int left;

			if (page && (left = PAGE_SIZE - off) > 0) {
				if (copy >= left)
					copy = left;
				if (page != frag->page) {
					if (i == MAX_SKB_FRAGS) {
						err = -EMSGSIZE;
						goto error;
					}
					get_page(page);
					skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
					frag = &skb_shinfo(skb)->frags[i];
				}
			} else if (i < MAX_SKB_FRAGS) {
				if (copy > PAGE_SIZE)
					copy = PAGE_SIZE;
				page = alloc_pages(sk->sk_allocation, 0);
				if (page == NULL) {
					err = -ENOMEM;
					goto error;
				}
				sk->sk_sndmsg_page = page;
				sk->sk_sndmsg_off = 0;

				skb_fill_page_desc(skb, i, page, 0, 0);
				frag = &skb_shinfo(skb)->frags[i];
			} else {
				err = -EMSGSIZE;
				goto error;
			}
			if (getfrag(from, page_address(frag->page) + frag->page_offset + frag->size, offset, copy, skb->len, skb) < 0) {
				err = -EFAULT;
				goto error;
			}
			sk->sk_sndmsg_off += copy;
			frag->size += copy;
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			atomic_add(copy, &sk->sk_wmem_alloc);
		}
		offset += copy;
		length -= copy;
	}
	return 0;
error:
	inet->cork.length -= length;
	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	return err;
}

static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np)
{
	inet->cork.flags &= ~IPCORK_OPT;
	kfree(np->cork.opt);
	np->cork.opt = NULL;
	if (inet->cork.dst) {
		dst_release(inet->cork.dst);
		inet->cork.dst = NULL;
		inet->cork.flags &= ~IPCORK_ALLFRAG;
	}
	memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
}

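/*
 * Splice every skb queued by ip6_append_data() into one packet
 * (chained on frag_list), prepend the IPv6 header and send the result
 * via ip6_local_out().
 */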
int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct net *net = sock_net(sk);
	struct ipv6hdr *hdr;
	struct ipv6_txoptions *opt = np->cork.opt;
	struct rt6_info *rt = (struct rt6_info *)inet->cork.dst;
	struct flowi *fl = &inet->cork.fl;
	unsigned char proto = fl->proto;
	int err = 0;

	if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		__sock_put(tmp_skb->sk);
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Allow local fragmentation. */
	if (np->pmtudisc < IPV6_PMTUDISC_DO)
		skb->local_df = 1;

	ipv6_addr_copy(final_dst, &fl->fl6_dst);
	__skb_pull(skb, skb_network_header_len(skb));
	if (opt && opt->opt_flen)
		ipv6_push_frag_opts(skb, opt, &proto);
	if (opt && opt->opt_nflen)
		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	*(__be32 *)hdr = fl->fl6_flowlabel |
		     htonl(0x60000000 | ((int)np->cork.tclass << 20));

	hdr->hop_limit = np->cork.hop_limit;
	hdr->nexthdr = proto;
	ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
	ipv6_addr_copy(&hdr->daddr, final_dst);

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	skb->dst = dst_clone(&rt->u.dst);
	IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTREQUESTS);
	if (proto == IPPROTO_ICMPV6) {
		struct inet6_dev *idev = ip6_dst_idev(skb->dst);

		ICMP6MSGOUT_INC_STATS_BH(idev, icmp6_hdr(skb)->icmp6_type);
		ICMP6_INC_STATS_BH(idev, ICMP6_MIB_OUTMSGS);
	}

	err = ip6_local_out(skb);
	if (err) {
		if (err > 0)
			err = np->recverr ? net_xmit_errno(err) : 0;
		if (err)
			goto error;
	}

out:
	ip6_cork_release(inet, np);
	return err;
error:
	goto out;
}

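/*
 * Throw away everything queued on the socket, accounting each buffer
 * as an output discard, and release the cork state.
 */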
void ip6_flush_pending_frames(struct sock *sk)
{
	struct sk_buff *skb;

	while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
		if (skb->dst)
			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb->dst),
				      IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
	}

	ip6_cork_release(inet_sk(sk), inet6_sk(sk));
}
1513