xref: /openbmc/linux/net/ipv6/ip6_output.c (revision 4800cd83)
1 /*
2  *	IPv6 output functions
3  *	Linux INET6 implementation
4  *
5  *	Authors:
6  *	Pedro Roque		<roque@di.fc.ul.pt>
7  *
8  *	Based on linux/net/ipv4/ip_output.c
9  *
10  *	This program is free software; you can redistribute it and/or
11  *      modify it under the terms of the GNU General Public License
12  *      as published by the Free Software Foundation; either version
13  *      2 of the License, or (at your option) any later version.
14  *
15  *	Changes:
16  *	A.N.Kuznetsov	:	arithmetic in fragmentation.
17  *				extension headers are implemented.
18  *				route changes now work.
19  *				ip6_forward does not confuse sniffers.
20  *				etc.
21  *
22  *      H. von Brand    :       Added missing #include <linux/string.h>
23  *	Imran Patel	: 	frag id should be in NBO
24  *      Kazunori MIYAZAWA @USAGI
25  *			:       add ip6_append_data and related functions
26  *				for datagram xmit
27  */
28 
29 #include <linux/errno.h>
30 #include <linux/kernel.h>
31 #include <linux/string.h>
32 #include <linux/socket.h>
33 #include <linux/net.h>
34 #include <linux/netdevice.h>
35 #include <linux/if_arp.h>
36 #include <linux/in6.h>
37 #include <linux/tcp.h>
38 #include <linux/route.h>
39 #include <linux/module.h>
40 #include <linux/slab.h>
41 
42 #include <linux/netfilter.h>
43 #include <linux/netfilter_ipv6.h>
44 
45 #include <net/sock.h>
46 #include <net/snmp.h>
47 
48 #include <net/ipv6.h>
49 #include <net/ndisc.h>
50 #include <net/protocol.h>
51 #include <net/ip6_route.h>
52 #include <net/addrconf.h>
53 #include <net/rawv6.h>
54 #include <net/icmp.h>
55 #include <net/xfrm.h>
56 #include <net/checksum.h>
57 #include <linux/mroute6.h>
58 
59 int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));
60 
61 int __ip6_local_out(struct sk_buff *skb)
62 {
63 	int len;
64 
65 	len = skb->len - sizeof(struct ipv6hdr);
66 	if (len > IPV6_MAXPLEN)
67 		len = 0;
68 	ipv6_hdr(skb)->payload_len = htons(len);
69 
70 	return nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
71 		       skb_dst(skb)->dev, dst_output);
72 }
73 
74 int ip6_local_out(struct sk_buff *skb)
75 {
76 	int err;
77 
78 	err = __ip6_local_out(skb);
79 	if (likely(err == 1))
80 		err = dst_output(skb);
81 
82 	return err;
83 }
84 EXPORT_SYMBOL_GPL(ip6_local_out);
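
/*
 * A minimal usage sketch (not part of the original file): a caller
 * that has built a complete IPv6 packet, with skb->data at the network
 * header and a valid dst attached, can hand it off like this; the
 * construction of "skb" itself is assumed:
 *
 *	int err = ip6_local_out(skb);
 *	if (err < 0)
 *		pr_debug("ip6_local_out failed: %d\n", err);
 *
 * Note that __ip6_local_out() stores 0 in payload_len when the payload
 * exceeds IPV6_MAXPLEN (65535); that is the jumbogram convention, where
 * a Jumbo Payload hop-by-hop option carries the real length.
 */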
85 
86 /* dev_loopback_xmit for use with netfilter. */
87 static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
88 {
89 	skb_reset_mac_header(newskb);
90 	__skb_pull(newskb, skb_network_offset(newskb));
91 	newskb->pkt_type = PACKET_LOOPBACK;
92 	newskb->ip_summed = CHECKSUM_UNNECESSARY;
93 	WARN_ON(!skb_dst(newskb));
94 
95 	netif_rx_ni(newskb);
96 	return 0;
97 }
98 
99 static int ip6_finish_output2(struct sk_buff *skb)
100 {
101 	struct dst_entry *dst = skb_dst(skb);
102 	struct net_device *dev = dst->dev;
103 
104 	skb->protocol = htons(ETH_P_IPV6);
105 	skb->dev = dev;
106 
107 	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
108 		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
109 
110 		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(skb->sk) &&
111 		    ((mroute6_socket(dev_net(dev), skb) &&
112 		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
113 		     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
114 					 &ipv6_hdr(skb)->saddr))) {
115 			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
116 
117 			/* Do not check for IFF_ALLMULTI; multicast routing
118 			   is not supported in any case.
119 			 */
120 			if (newskb)
121 				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
122 					newskb, NULL, newskb->dev,
123 					ip6_dev_loopback_xmit);
124 
125 			if (ipv6_hdr(skb)->hop_limit == 0) {
126 				IP6_INC_STATS(dev_net(dev), idev,
127 					      IPSTATS_MIB_OUTDISCARDS);
128 				kfree_skb(skb);
129 				return 0;
130 			}
131 		}
132 
133 		IP6_UPD_PO_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCAST,
134 				skb->len);
135 	}
136 
137 	if (dst->hh)
138 		return neigh_hh_output(dst->hh, skb);
139 	else if (dst->neighbour)
140 		return dst->neighbour->output(skb);
141 
142 	IP6_INC_STATS_BH(dev_net(dst->dev),
143 			 ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
144 	kfree_skb(skb);
145 	return -EINVAL;
146 }
147 
148 static int ip6_finish_output(struct sk_buff *skb)
149 {
150 	if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
151 	    dst_allfrag(skb_dst(skb)))
152 		return ip6_fragment(skb, ip6_finish_output2);
153 	else
154 		return ip6_finish_output2(skb);
155 }
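
/*
 * In short: a locally generated packet is fragmented here only when it
 * is larger than the path MTU and is not a GSO packet (GSO packets are
 * segmented later, at the device layer), or when dst_allfrag() says the
 * path requires a fragment header on every packet (the RFC 2460 §5
 * behaviour used when the path MTU is below IPV6_MIN_MTU).
 */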
156 
157 int ip6_output(struct sk_buff *skb)
158 {
159 	struct net_device *dev = skb_dst(skb)->dev;
160 	struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
161 	if (unlikely(idev->cnf.disable_ipv6)) {
162 		IP6_INC_STATS(dev_net(dev), idev,
163 			      IPSTATS_MIB_OUTDISCARDS);
164 		kfree_skb(skb);
165 		return 0;
166 	}
167 
168 	return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, skb, NULL, dev,
169 			    ip6_finish_output,
170 			    !(IP6CB(skb)->flags & IP6SKB_REROUTED));
171 }
172 
173 /*
174  *	xmit an sk_buff (used by TCP, SCTP and DCCP)
175  */
176 
177 int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
178 	     struct ipv6_txoptions *opt)
179 {
180 	struct net *net = sock_net(sk);
181 	struct ipv6_pinfo *np = inet6_sk(sk);
182 	struct in6_addr *first_hop = &fl->fl6_dst;
183 	struct dst_entry *dst = skb_dst(skb);
184 	struct ipv6hdr *hdr;
185 	u8  proto = fl->proto;
186 	int seg_len = skb->len;
187 	int hlimit = -1;
188 	int tclass = 0;
189 	u32 mtu;
190 
191 	if (opt) {
192 		unsigned int head_room;
193 
194 		/* First: extension headers may take lots of space
195 		   (~8K for now); MAX_HEADER is not enough.
196 		 */
197 		head_room = opt->opt_nflen + opt->opt_flen;
198 		seg_len += head_room;
199 		head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
200 
201 		if (skb_headroom(skb) < head_room) {
202 			struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
203 			if (skb2 == NULL) {
204 				IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
205 					      IPSTATS_MIB_OUTDISCARDS);
206 				kfree_skb(skb);
207 				return -ENOBUFS;
208 			}
209 			kfree_skb(skb);
210 			skb = skb2;
211 			skb_set_owner_w(skb, sk);
212 		}
213 		if (opt->opt_flen)
214 			ipv6_push_frag_opts(skb, opt, &proto);
215 		if (opt->opt_nflen)
216 			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
217 	}
218 
219 	skb_push(skb, sizeof(struct ipv6hdr));
220 	skb_reset_network_header(skb);
221 	hdr = ipv6_hdr(skb);
222 
223 	/*
224 	 *	Fill in the IPv6 header
225 	 */
226 	if (np) {
227 		tclass = np->tclass;
228 		hlimit = np->hop_limit;
229 	}
230 	if (hlimit < 0)
231 		hlimit = ip6_dst_hoplimit(dst);
232 
233 	*(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl->fl6_flowlabel;
234 
235 	hdr->payload_len = htons(seg_len);
236 	hdr->nexthdr = proto;
237 	hdr->hop_limit = hlimit;
238 
239 	ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
240 	ipv6_addr_copy(&hdr->daddr, first_hop);
241 
242 	skb->priority = sk->sk_priority;
243 	skb->mark = sk->sk_mark;
244 
245 	mtu = dst_mtu(dst);
246 	if ((skb->len <= mtu) || skb->local_df || skb_is_gso(skb)) {
247 		IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
248 			      IPSTATS_MIB_OUT, skb->len);
249 		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
250 			       dst->dev, dst_output);
251 	}
252 
253 	if (net_ratelimit())
254 		printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
255 	skb->dev = dst->dev;
256 	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
257 	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
258 	kfree_skb(skb);
259 	return -EMSGSIZE;
260 }
261 
262 EXPORT_SYMBOL(ip6_xmit);
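
/*
 * Bit layout of the first 32-bit word written by ip6_xmit() above
 * (RFC 2460 §3): 4 bits of version, 8 bits of traffic class, then a
 * 20-bit flow label. htonl(0x60000000 | (tclass << 20)) sets
 * Version = 6 and places the traffic class in bits 27-20; the flow
 * label, already in network byte order in fl->fl6_flowlabel, is then
 * OR-ed into the low 20 bits. For example, tclass = 0x28 with a zero
 * flow label yields the on-wire word 0x62800000.
 */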
263 
264 /*
265  *	To avoid extra problems, ND packets are sent through this
266  *	routine. It's code duplication, but I really want to avoid
267  *	extra checks since ipv6_build_header is used by TCP (which
268  *	is performance critical for us).
269  */
270 
271 int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
272 	       const struct in6_addr *saddr, const struct in6_addr *daddr,
273 	       int proto, int len)
274 {
275 	struct ipv6_pinfo *np = inet6_sk(sk);
276 	struct ipv6hdr *hdr;
277 	int totlen;
278 
279 	skb->protocol = htons(ETH_P_IPV6);
280 	skb->dev = dev;
281 
282 	totlen = len + sizeof(struct ipv6hdr);
283 
284 	skb_reset_network_header(skb);
285 	skb_put(skb, sizeof(struct ipv6hdr));
286 	hdr = ipv6_hdr(skb);
287 
288 	*(__be32*)hdr = htonl(0x60000000);
289 
290 	hdr->payload_len = htons(len);
291 	hdr->nexthdr = proto;
292 	hdr->hop_limit = np->hop_limit;
293 
294 	ipv6_addr_copy(&hdr->saddr, saddr);
295 	ipv6_addr_copy(&hdr->daddr, daddr);
296 
297 	return 0;
298 }
299 
300 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
301 {
302 	struct ip6_ra_chain *ra;
303 	struct sock *last = NULL;
304 
305 	read_lock(&ip6_ra_lock);
306 	for (ra = ip6_ra_chain; ra; ra = ra->next) {
307 		struct sock *sk = ra->sk;
308 		if (sk && ra->sel == sel &&
309 		    (!sk->sk_bound_dev_if ||
310 		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
311 			if (last) {
312 				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
313 				if (skb2)
314 					rawv6_rcv(last, skb2);
315 			}
316 			last = sk;
317 		}
318 	}
319 
320 	if (last) {
321 		rawv6_rcv(last, skb);
322 		read_unlock(&ip6_ra_lock);
323 		return 1;
324 	}
325 	read_unlock(&ip6_ra_lock);
326 	return 0;
327 }
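
/*
 * The loop above is the usual "N listeners, N-1 clones" delivery
 * pattern: every matching Router Alert socket except the last gets a
 * clone, and the original skb goes to the final match, saving one
 * clone per call. Returning 1 tells ip6_forward() that the packet was
 * consumed by a userspace listener and must not be forwarded further.
 */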
328 
329 static int ip6_forward_proxy_check(struct sk_buff *skb)
330 {
331 	struct ipv6hdr *hdr = ipv6_hdr(skb);
332 	u8 nexthdr = hdr->nexthdr;
333 	int offset;
334 
335 	if (ipv6_ext_hdr(nexthdr)) {
336 		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr);
337 		if (offset < 0)
338 			return 0;
339 	} else
340 		offset = sizeof(struct ipv6hdr);
341 
342 	if (nexthdr == IPPROTO_ICMPV6) {
343 		struct icmp6hdr *icmp6;
344 
345 		if (!pskb_may_pull(skb, (skb_network_header(skb) +
346 					 offset + 1 - skb->data)))
347 			return 0;
348 
349 		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
350 
351 		switch (icmp6->icmp6_type) {
352 		case NDISC_ROUTER_SOLICITATION:
353 		case NDISC_ROUTER_ADVERTISEMENT:
354 		case NDISC_NEIGHBOUR_SOLICITATION:
355 		case NDISC_NEIGHBOUR_ADVERTISEMENT:
356 		case NDISC_REDIRECT:
357 			/* Unicast neighbour discovery messages destined
358 			 * to the proxied address are passed to the input
359 			 * function.
360 			 */
361 			return 1;
362 		default:
363 			break;
364 		}
365 	}
366 
367 	/*
368 	 * The proxying router can't forward traffic sent to a link-local
369 	 * address, so signal the sender and discard the packet. This
370 	 * behavior is clarified by the MIPv6 specification.
371 	 */
372 	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
373 		dst_link_failure(skb);
374 		return -1;
375 	}
376 
377 	return 0;
378 }
379 
380 static inline int ip6_forward_finish(struct sk_buff *skb)
381 {
382 	return dst_output(skb);
383 }
384 
385 int ip6_forward(struct sk_buff *skb)
386 {
387 	struct dst_entry *dst = skb_dst(skb);
388 	struct ipv6hdr *hdr = ipv6_hdr(skb);
389 	struct inet6_skb_parm *opt = IP6CB(skb);
390 	struct net *net = dev_net(dst->dev);
391 	u32 mtu;
392 
393 	if (net->ipv6.devconf_all->forwarding == 0)
394 		goto error;
395 
396 	if (skb_warn_if_lro(skb))
397 		goto drop;
398 
399 	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
400 		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
401 		goto drop;
402 	}
403 
404 	if (skb->pkt_type != PACKET_HOST)
405 		goto drop;
406 
407 	skb_forward_csum(skb);
408 
409 	/*
410 	 *	We do NOT do any processing on RA packets,
411 	 *	pushing them to user level AS IS, with no
412 	 *	warranty that the application will be able
413 	 *	to interpret them. The reason is that we
414 	 *	cannot do anything clever here.
415 	 *
416 	 *	We are not the end node, so if a packet contains
417 	 *	AH/ESP we cannot do anything with it.
418 	 *	Defragmentation would also be a mistake; RA packets
419 	 *	must not be fragmented, because there is no guarantee
420 	 *	that different fragments will travel along one path. --ANK
421 	 */
422 	if (opt->ra) {
423 		u8 *ptr = skb_network_header(skb) + opt->ra;
424 		if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
425 			return 0;
426 	}
427 
428 	/*
429 	 *	check and decrement the hop limit
430 	 */
431 	if (hdr->hop_limit <= 1) {
432 		/* Force the output device to be used for the source address */
433 		skb->dev = dst->dev;
434 		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
435 		IP6_INC_STATS_BH(net,
436 				 ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);
437 
438 		kfree_skb(skb);
439 		return -ETIMEDOUT;
440 	}
441 
442 	/* XXX: idev->cnf.proxy_ndp? */
443 	if (net->ipv6.devconf_all->proxy_ndp &&
444 	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
445 		int proxied = ip6_forward_proxy_check(skb);
446 		if (proxied > 0)
447 			return ip6_input(skb);
448 		else if (proxied < 0) {
449 			IP6_INC_STATS(net, ip6_dst_idev(dst),
450 				      IPSTATS_MIB_INDISCARDS);
451 			goto drop;
452 		}
453 	}
454 
455 	if (!xfrm6_route_forward(skb)) {
456 		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
457 		goto drop;
458 	}
459 	dst = skb_dst(skb);
460 
461 	/* The IPv6 specs say nothing about it, but it is clear that we
462 	   cannot send redirects to source-routed frames.
463 	   We don't send redirects to frames decapsulated from IPsec.
464 	 */
465 	if (skb->dev == dst->dev && dst->neighbour && opt->srcrt == 0 &&
466 	    !skb_sec_path(skb)) {
467 		struct in6_addr *target = NULL;
468 		struct rt6_info *rt;
469 		struct neighbour *n = dst->neighbour;
470 
471 		/*
472 		 *	incoming and outgoing devices are the same;
473 		 *	send a redirect.
474 		 */
475 
476 		rt = (struct rt6_info *) dst;
477 		if ((rt->rt6i_flags & RTF_GATEWAY))
478 			target = (struct in6_addr*)&n->primary_key;
479 		else
480 			target = &hdr->daddr;
481 
482 		/* Limit redirects both by destination (here)
483 		   and by source (inside ndisc_send_redirect)
484 		 */
485 		if (xrlim_allow(dst, 1*HZ))
486 			ndisc_send_redirect(skb, n, target);
487 	} else {
488 		int addrtype = ipv6_addr_type(&hdr->saddr);
489 
490 		/* This check is security critical. */
491 		if (addrtype == IPV6_ADDR_ANY ||
492 		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
493 			goto error;
494 		if (addrtype & IPV6_ADDR_LINKLOCAL) {
495 			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
496 				    ICMPV6_NOT_NEIGHBOUR, 0);
497 			goto error;
498 		}
499 	}
500 
501 	mtu = dst_mtu(dst);
502 	if (mtu < IPV6_MIN_MTU)
503 		mtu = IPV6_MIN_MTU;
504 
505 	if (skb->len > mtu && !skb_is_gso(skb)) {
506 		/* Again, force OUTPUT device used as source address */
507 		skb->dev = dst->dev;
508 		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
509 		IP6_INC_STATS_BH(net,
510 				 ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS);
511 		IP6_INC_STATS_BH(net,
512 				 ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS);
513 		kfree_skb(skb);
514 		return -EMSGSIZE;
515 	}
516 
517 	if (skb_cow(skb, dst->dev->hard_header_len)) {
518 		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
519 		goto drop;
520 	}
521 
522 	hdr = ipv6_hdr(skb);
523 
524 	/* Mangling the hop limit is delayed until after the skb COW */
525 
526 	hdr->hop_limit--;
527 
528 	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
529 	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
530 		       ip6_forward_finish);
531 
532 error:
533 	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
534 drop:
535 	kfree_skb(skb);
536 	return -EINVAL;
537 }
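
/*
 * Redirect target selection above follows RFC 4861 §8: if the route
 * goes through a gateway (RTF_GATEWAY), the redirect announces the
 * gateway address (n->primary_key); otherwise the destination is
 * on-link and the redirect announces the destination address itself,
 * telling the sender that its peer is a direct neighbour.
 */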
538 
539 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
540 {
541 	to->pkt_type = from->pkt_type;
542 	to->priority = from->priority;
543 	to->protocol = from->protocol;
544 	skb_dst_drop(to);
545 	skb_dst_set(to, dst_clone(skb_dst(from)));
546 	to->dev = from->dev;
547 	to->mark = from->mark;
548 
549 #ifdef CONFIG_NET_SCHED
550 	to->tc_index = from->tc_index;
551 #endif
552 	nf_copy(to, from);
553 #if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
554     defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
555 	to->nf_trace = from->nf_trace;
556 #endif
557 	skb_copy_secmark(to, from);
558 }
559 
560 int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
561 {
562 	u16 offset = sizeof(struct ipv6hdr);
563 	struct ipv6_opt_hdr *exthdr =
564 				(struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1);
565 	unsigned int packet_len = skb->tail - skb->network_header;
566 	int found_rhdr = 0;
567 	*nexthdr = &ipv6_hdr(skb)->nexthdr;
568 
569 	while (offset + 1 <= packet_len) {
570 
571 		switch (**nexthdr) {
572 
573 		case NEXTHDR_HOP:
574 			break;
575 		case NEXTHDR_ROUTING:
576 			found_rhdr = 1;
577 			break;
578 		case NEXTHDR_DEST:
579 #if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
580 			if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0)
581 				break;
582 #endif
583 			if (found_rhdr)
584 				return offset;
585 			break;
586 		default:
587 			return offset;
588 		}
589 
590 		offset += ipv6_optlen(exthdr);
591 		*nexthdr = &exthdr->nexthdr;
592 		exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) +
593 						 offset);
594 	}
595 
596 	return offset;
597 }
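
/*
 * The walk above relies on every IPv6 extension header encoding its
 * length in 8-octet units, excluding the first 8 octets, so
 * ipv6_optlen() is essentially (exthdr->hdrlen + 1) << 3. For example,
 * a Routing header with hdrlen == 2 occupies 24 bytes and the next
 * header starts at offset + 24. A Fragment header must be inserted
 * after any Routing header (and after a Destination Options header
 * carrying a Home Address option in the MIPv6 case), which is why the
 * loop keeps scanning once found_rhdr is set.
 */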
598 
599 int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
600 {
601 	struct sk_buff *frag;
602 	struct rt6_info *rt = (struct rt6_info*)skb_dst(skb);
603 	struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
604 	struct ipv6hdr *tmp_hdr;
605 	struct frag_hdr *fh;
606 	unsigned int mtu, hlen, left, len;
607 	__be32 frag_id = 0;
608 	int ptr, offset = 0, err = 0;
609 	u8 *prevhdr, nexthdr = 0;
610 	struct net *net = dev_net(skb_dst(skb)->dev);
611 
612 	hlen = ip6_find_1stfragopt(skb, &prevhdr);
613 	nexthdr = *prevhdr;
614 
615 	mtu = ip6_skb_dst_mtu(skb);
616 
617 	/* We must not fragment if the socket is set to force MTU discovery
618 	 * or if the skb was not generated by a local socket.
619 	 */
620 	if (!skb->local_df && skb->len > mtu) {
621 		skb->dev = skb_dst(skb)->dev;
622 		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
623 		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
624 			      IPSTATS_MIB_FRAGFAILS);
625 		kfree_skb(skb);
626 		return -EMSGSIZE;
627 	}
628 
629 	if (np && np->frag_size < mtu) {
630 		if (np->frag_size)
631 			mtu = np->frag_size;
632 	}
633 	mtu -= hlen + sizeof(struct frag_hdr);
634 
635 	if (skb_has_frag_list(skb)) {
636 		int first_len = skb_pagelen(skb);
637 		struct sk_buff *frag2;
638 
639 		if (first_len - hlen > mtu ||
640 		    ((first_len - hlen) & 7) ||
641 		    skb_cloned(skb))
642 			goto slow_path;
643 
644 		skb_walk_frags(skb, frag) {
645 			/* Correct geometry. */
646 			if (frag->len > mtu ||
647 			    ((frag->len & 7) && frag->next) ||
648 			    skb_headroom(frag) < hlen)
649 				goto slow_path_clean;
650 
651 			/* Partially cloned skb? */
652 			if (skb_shared(frag))
653 				goto slow_path_clean;
654 
655 			BUG_ON(frag->sk);
656 			if (skb->sk) {
657 				frag->sk = skb->sk;
658 				frag->destructor = sock_wfree;
659 			}
660 			skb->truesize -= frag->truesize;
661 		}
662 
663 		err = 0;
664 		offset = 0;
665 		frag = skb_shinfo(skb)->frag_list;
666 		skb_frag_list_init(skb);
667 		/* BUILD HEADER */
668 
669 		*prevhdr = NEXTHDR_FRAGMENT;
670 		tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
671 		if (!tmp_hdr) {
672 			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
673 				      IPSTATS_MIB_FRAGFAILS);
674 			return -ENOMEM;
675 		}
676 
677 		__skb_pull(skb, hlen);
678 		fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr));
679 		__skb_push(skb, hlen);
680 		skb_reset_network_header(skb);
681 		memcpy(skb_network_header(skb), tmp_hdr, hlen);
682 
683 		ipv6_select_ident(fh);
684 		fh->nexthdr = nexthdr;
685 		fh->reserved = 0;
686 		fh->frag_off = htons(IP6_MF);
687 		frag_id = fh->identification;
688 
689 		first_len = skb_pagelen(skb);
690 		skb->data_len = first_len - skb_headlen(skb);
691 		skb->len = first_len;
692 		ipv6_hdr(skb)->payload_len = htons(first_len -
693 						   sizeof(struct ipv6hdr));
694 
695 		dst_hold(&rt->dst);
696 
697 		for (;;) {
698 			/* Prepare the header of the next frame
699 			 * before the previous one goes out. */
700 			if (frag) {
701 				frag->ip_summed = CHECKSUM_NONE;
702 				skb_reset_transport_header(frag);
703 				fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr));
704 				__skb_push(frag, hlen);
705 				skb_reset_network_header(frag);
706 				memcpy(skb_network_header(frag), tmp_hdr,
707 				       hlen);
708 				offset += skb->len - hlen - sizeof(struct frag_hdr);
709 				fh->nexthdr = nexthdr;
710 				fh->reserved = 0;
711 				fh->frag_off = htons(offset);
712 				if (frag->next != NULL)
713 					fh->frag_off |= htons(IP6_MF);
714 				fh->identification = frag_id;
715 				ipv6_hdr(frag)->payload_len =
716 						htons(frag->len -
717 						      sizeof(struct ipv6hdr));
718 				ip6_copy_metadata(frag, skb);
719 			}
720 
721 			err = output(skb);
722 			if (!err)
723 				IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
724 					      IPSTATS_MIB_FRAGCREATES);
725 
726 			if (err || !frag)
727 				break;
728 
729 			skb = frag;
730 			frag = skb->next;
731 			skb->next = NULL;
732 		}
733 
734 		kfree(tmp_hdr);
735 
736 		if (err == 0) {
737 			IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
738 				      IPSTATS_MIB_FRAGOKS);
739 			dst_release(&rt->dst);
740 			return 0;
741 		}
742 
743 		while (frag) {
744 			skb = frag->next;
745 			kfree_skb(frag);
746 			frag = skb;
747 		}
748 
749 		IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
750 			      IPSTATS_MIB_FRAGFAILS);
751 		dst_release(&rt->dst);
752 		return err;
753 
754 slow_path_clean:
755 		skb_walk_frags(skb, frag2) {
756 			if (frag2 == frag)
757 				break;
758 			frag2->sk = NULL;
759 			frag2->destructor = NULL;
760 			skb->truesize += frag2->truesize;
761 		}
762 	}
763 
764 slow_path:
765 	left = skb->len - hlen;		/* Space per frame */
766 	ptr = hlen;			/* Where to start from */
767 
768 	/*
769 	 *	Fragment the datagram.
770 	 */
771 
772 	*prevhdr = NEXTHDR_FRAGMENT;
773 
774 	/*
775 	 *	Keep copying data until we run out.
776 	 */
777 	while (left > 0) {
778 		len = left;
779 		/* IF: it doesn't fit, use 'mtu' - the data space left */
780 		if (len > mtu)
781 			len = mtu;
782 		/* IF: we are not sending up to and including the packet end,
783 		   then align the next start on an eight-byte boundary */
784 		if (len < left)	{
785 			len &= ~7;
786 		}
787 		/*
788 		 *	Allocate buffer.
789 		 */
790 
791 		if ((frag = alloc_skb(len+hlen+sizeof(struct frag_hdr)+LL_ALLOCATED_SPACE(rt->dst.dev), GFP_ATOMIC)) == NULL) {
792 			NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
793 			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
794 				      IPSTATS_MIB_FRAGFAILS);
795 			err = -ENOMEM;
796 			goto fail;
797 		}
798 
799 		/*
800 		 *	Set up data on packet
801 		 */
802 
803 		ip6_copy_metadata(frag, skb);
804 		skb_reserve(frag, LL_RESERVED_SPACE(rt->dst.dev));
805 		skb_put(frag, len + hlen + sizeof(struct frag_hdr));
806 		skb_reset_network_header(frag);
807 		fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
808 		frag->transport_header = (frag->network_header + hlen +
809 					  sizeof(struct frag_hdr));
810 
811 		/*
812 		 *	Charge the memory for the fragment to any owner
813 		 *	it might possess
814 		 */
815 		if (skb->sk)
816 			skb_set_owner_w(frag, skb->sk);
817 
818 		/*
819 		 *	Copy the packet header into the new buffer.
820 		 */
821 		skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
822 
823 		/*
824 		 *	Build fragment header.
825 		 */
826 		fh->nexthdr = nexthdr;
827 		fh->reserved = 0;
828 		if (!frag_id) {
829 			ipv6_select_ident(fh);
830 			frag_id = fh->identification;
831 		} else
832 			fh->identification = frag_id;
833 
834 		/*
835 		 *	Copy a block of the IP datagram.
836 		 */
837 		if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len))
838 			BUG();
839 		left -= len;
840 
841 		fh->frag_off = htons(offset);
842 		if (left > 0)
843 			fh->frag_off |= htons(IP6_MF);
844 		ipv6_hdr(frag)->payload_len = htons(frag->len -
845 						    sizeof(struct ipv6hdr));
846 
847 		ptr += len;
848 		offset += len;
849 
850 		/*
851 		 *	Put this fragment into the sending queue.
852 		 */
853 		err = output(frag);
854 		if (err)
855 			goto fail;
856 
857 		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
858 			      IPSTATS_MIB_FRAGCREATES);
859 	}
860 	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
861 		      IPSTATS_MIB_FRAGOKS);
862 	kfree_skb(skb);
863 	return err;
864 
865 fail:
866 	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
867 		      IPSTATS_MIB_FRAGFAILS);
868 	kfree_skb(skb);
869 	return err;
870 }
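
/*
 * A worked example of the offset arithmetic above: the Fragment header
 * stores the offset in 8-octet units in the upper 13 bits of frag_off,
 * with the low 3 bits used for flags (IP6_MF). Because every non-final
 * fragment's length is rounded down to a multiple of 8 ("len &= ~7"),
 * the byte offset always has its low 3 bits clear and can be stored
 * directly: byte_offset == units << 3, which is exactly the wire
 * encoding. With 1448 bytes of payload space per fragment, a 4000-byte
 * payload is cut at byte offsets 0, 1448 and 2896, and only the last
 * fragment has IP6_MF clear.
 */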
871 
872 static inline int ip6_rt_check(struct rt6key *rt_key,
873 			       struct in6_addr *fl_addr,
874 			       struct in6_addr *addr_cache)
875 {
876 	return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
877 		(addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache));
878 }
879 
880 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
881 					  struct dst_entry *dst,
882 					  struct flowi *fl)
883 {
884 	struct ipv6_pinfo *np = inet6_sk(sk);
885 	struct rt6_info *rt = (struct rt6_info *)dst;
886 
887 	if (!dst)
888 		goto out;
889 
890 	/* Yes, checking route validity in the unconnected
891 	 * case is not very simple. Take into account
892 	 * that we do not support routing by source, TOS,
893 	 * or MSG_DONTROUTE 		--ANK (980726)
894 	 *
895 	 * 1. ip6_rt_check(): If the route was a host route,
896 	 *    check that the cached destination is current.
897 	 *    If it is a network route, we can still
898 	 *    check its validity using a saved pointer
899 	 *    to the last used address: daddr_cache.
900 	 *    We do not want to save the whole address now
901 	 *    (because the main consumer of this service
902 	 *    is TCP, which does not have this problem),
903 	 *    so this last trick works only on connected
904 	 *    sockets.
905 	 * 2. oif should also be the same.
906 	 */
907 	if (ip6_rt_check(&rt->rt6i_dst, &fl->fl6_dst, np->daddr_cache) ||
908 #ifdef CONFIG_IPV6_SUBTREES
909 	    ip6_rt_check(&rt->rt6i_src, &fl->fl6_src, np->saddr_cache) ||
910 #endif
911 	    (fl->oif && fl->oif != dst->dev->ifindex)) {
912 		dst_release(dst);
913 		dst = NULL;
914 	}
915 
916 out:
917 	return dst;
918 }
919 
920 static int ip6_dst_lookup_tail(struct sock *sk,
921 			       struct dst_entry **dst, struct flowi *fl)
922 {
923 	int err;
924 	struct net *net = sock_net(sk);
925 
926 	if (*dst == NULL)
927 		*dst = ip6_route_output(net, sk, fl);
928 
929 	if ((err = (*dst)->error))
930 		goto out_err_release;
931 
932 	if (ipv6_addr_any(&fl->fl6_src)) {
933 		err = ipv6_dev_get_saddr(net, ip6_dst_idev(*dst)->dev,
934 					 &fl->fl6_dst,
935 					 sk ? inet6_sk(sk)->srcprefs : 0,
936 					 &fl->fl6_src);
937 		if (err)
938 			goto out_err_release;
939 	}
940 
941 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
942 	/*
943 	 * Here, if the dst entry we've looked up has a
944 	 * neighbour entry that is in the INCOMPLETE state
945 	 * and the source address from the flow is marked
946 	 * as OPTIMISTIC, we release the dst entry we found
947 	 * and replace it with the dst entry of the
948 	 * nexthop router instead.
949 	 */
950 	if ((*dst)->neighbour && !((*dst)->neighbour->nud_state & NUD_VALID)) {
951 		struct inet6_ifaddr *ifp;
952 		struct flowi fl_gw;
953 		int redirect;
954 
955 		ifp = ipv6_get_ifaddr(net, &fl->fl6_src,
956 				      (*dst)->dev, 1);
957 
958 		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
959 		if (ifp)
960 			in6_ifa_put(ifp);
961 
962 		if (redirect) {
963 			/*
964 			 * We need to get the dst entry for the
965 			 * default router instead
966 			 */
967 			dst_release(*dst);
968 			memcpy(&fl_gw, fl, sizeof(struct flowi));
969 			memset(&fl_gw.fl6_dst, 0, sizeof(struct in6_addr));
970 			*dst = ip6_route_output(net, sk, &fl_gw);
971 			if ((err = (*dst)->error))
972 				goto out_err_release;
973 		}
974 	}
975 #endif
976 
977 	return 0;
978 
979 out_err_release:
980 	if (err == -ENETUNREACH)
981 		IP6_INC_STATS_BH(net, NULL, IPSTATS_MIB_OUTNOROUTES);
982 	dst_release(*dst);
983 	*dst = NULL;
984 	return err;
985 }
986 
987 /**
988  *	ip6_dst_lookup - perform route lookup on flow
989  *	@sk: socket which provides route info
990  *	@dst: pointer to dst_entry * for result
991  *	@fl: flow to lookup
992  *
993  *	This function performs a route lookup on the given flow.
994  *
995  *	It returns zero on success, or a standard errno code on error.
996  */
997 int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
998 {
999 	*dst = NULL;
1000 	return ip6_dst_lookup_tail(sk, dst, fl);
1001 }
1002 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
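
/*
 * A minimal usage sketch (not from this file; "daddr" is a hypothetical
 * destination the caller already knows). The flow is zeroed, the
 * destination filled in, and the result must be dropped with
 * dst_release() when the caller is done with it:
 *
 *	struct flowi fl;
 *	struct dst_entry *dst;
 *	int err;
 *
 *	memset(&fl, 0, sizeof(fl));
 *	fl.proto = IPPROTO_UDP;
 *	ipv6_addr_copy(&fl.fl6_dst, &daddr);
 *	err = ip6_dst_lookup(sk, &dst, &fl);
 *	if (err)
 *		return err;
 *	...
 *	dst_release(dst);
 *
 * ip6_dst_lookup_tail() also fills in fl.fl6_src via source address
 * selection when the caller left it as the unspecified address (::).
 */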
1003 
1004 /**
1005  *	ip6_sk_dst_lookup - perform socket cached route lookup on flow
1006  *	@sk: socket which provides the dst cache and route info
1007  *	@dst: pointer to dst_entry * for result
1008  *	@fl: flow to lookup
1009  *
1010  *	This function performs a route lookup on the given flow with the
1011  *	possibility of using the cached route in the socket if it is valid.
1012  *	It will take the socket dst lock when operating on the dst cache.
1013  *	As a result, this function can only be used in process context.
1014  *
1015  *	It returns zero on success, or a standard errno code on error.
1016  */
1017 int ip6_sk_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
1018 {
1019 	*dst = NULL;
1020 	if (sk) {
1021 		*dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1022 		*dst = ip6_sk_dst_check(sk, *dst, fl);
1023 	}
1024 
1025 	return ip6_dst_lookup_tail(sk, dst, fl);
1026 }
1027 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup);
1028 
1029 static inline int ip6_ufo_append_data(struct sock *sk,
1030 			int getfrag(void *from, char *to, int offset, int len,
1031 			int odd, struct sk_buff *skb),
1032 			void *from, int length, int hh_len, int fragheaderlen,
1033 			int transhdrlen, int mtu, unsigned int flags)
1034 
1035 {
1036 	struct sk_buff *skb;
1037 	int err;
1038 
1039 	/* The network device supports UDP large send offload, so
1040 	 * create one single skb packet containing the complete
1041 	 * UDP datagram.
1042 	 */
1043 	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
1044 		skb = sock_alloc_send_skb(sk,
1045 			hh_len + fragheaderlen + transhdrlen + 20,
1046 			(flags & MSG_DONTWAIT), &err);
1047 		if (skb == NULL)
1048 			return -ENOMEM;
1049 
1050 		/* reserve space for the hardware header */
1051 		skb_reserve(skb, hh_len);
1052 
1053 		/* create space for UDP/IP header */
1054 		skb_put(skb, fragheaderlen + transhdrlen);
1055 
1056 		/* initialize network header pointer */
1057 		skb_reset_network_header(skb);
1058 
1059 		/* initialize protocol header pointer */
1060 		skb->transport_header = skb->network_header + fragheaderlen;
1061 
1062 		skb->ip_summed = CHECKSUM_PARTIAL;
1063 		skb->csum = 0;
1064 		sk->sk_sndmsg_off = 0;
1065 	}
1066 
1067 	err = skb_append_datato_frags(sk, skb, getfrag, from,
1068 				      (length - transhdrlen));
1069 	if (!err) {
1070 		struct frag_hdr fhdr;
1071 
1072 		/* Specify the length of each IPv6 datagram fragment.
1073 		 * It has to be a multiple of 8.
1074 		 */
1075 		skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
1076 					     sizeof(struct frag_hdr)) & ~7;
1077 		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
1078 		ipv6_select_ident(&fhdr);
1079 		skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
1080 		__skb_queue_tail(&sk->sk_write_queue, skb);
1081 
1082 		return 0;
1083 	}
1084 	/* There is not enough support to do UDP LSO,
1085 	 * so follow the normal path.
1086 	 */
1087 	kfree_skb(skb);
1088 
1089 	return err;
1090 }
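
/*
 * Worked example for the gso_size computation above, assuming a
 * 1500-byte MTU and no extension headers: fragheaderlen is 40 (just
 * the IPv6 header), so (1500 - 40 - 8) & ~7 = 1448. The device then
 * emits fragments whose payloads are 1448 bytes, a multiple of 8 as
 * the fragment offset encoding requires.
 */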
1091 
1092 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1093 					       gfp_t gfp)
1094 {
1095 	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1096 }
1097 
1098 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1099 						gfp_t gfp)
1100 {
1101 	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1102 }
1103 
1104 int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
1105 	int offset, int len, int odd, struct sk_buff *skb),
1106 	void *from, int length, int transhdrlen,
1107 	int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi *fl,
1108 	struct rt6_info *rt, unsigned int flags, int dontfrag)
1109 {
1110 	struct inet_sock *inet = inet_sk(sk);
1111 	struct ipv6_pinfo *np = inet6_sk(sk);
1112 	struct sk_buff *skb;
1113 	unsigned int maxfraglen, fragheaderlen;
1114 	int exthdrlen;
1115 	int hh_len;
1116 	int mtu;
1117 	int copy;
1118 	int err;
1119 	int offset = 0;
1120 	int csummode = CHECKSUM_NONE;
1121 
1122 	if (flags&MSG_PROBE)
1123 		return 0;
1124 	if (skb_queue_empty(&sk->sk_write_queue)) {
1125 		/*
1126 		 * setup for corking
1127 		 */
1128 		if (opt) {
1129 			if (WARN_ON(np->cork.opt))
1130 				return -EINVAL;
1131 
1132 			np->cork.opt = kmalloc(opt->tot_len, sk->sk_allocation);
1133 			if (unlikely(np->cork.opt == NULL))
1134 				return -ENOBUFS;
1135 
1136 			np->cork.opt->tot_len = opt->tot_len;
1137 			np->cork.opt->opt_flen = opt->opt_flen;
1138 			np->cork.opt->opt_nflen = opt->opt_nflen;
1139 
1140 			np->cork.opt->dst0opt = ip6_opt_dup(opt->dst0opt,
1141 							    sk->sk_allocation);
1142 			if (opt->dst0opt && !np->cork.opt->dst0opt)
1143 				return -ENOBUFS;
1144 
1145 			np->cork.opt->dst1opt = ip6_opt_dup(opt->dst1opt,
1146 							    sk->sk_allocation);
1147 			if (opt->dst1opt && !np->cork.opt->dst1opt)
1148 				return -ENOBUFS;
1149 
1150 			np->cork.opt->hopopt = ip6_opt_dup(opt->hopopt,
1151 							   sk->sk_allocation);
1152 			if (opt->hopopt && !np->cork.opt->hopopt)
1153 				return -ENOBUFS;
1154 
1155 			np->cork.opt->srcrt = ip6_rthdr_dup(opt->srcrt,
1156 							    sk->sk_allocation);
1157 			if (opt->srcrt && !np->cork.opt->srcrt)
1158 				return -ENOBUFS;
1159 
1160 			/* need source address above --miyazawa */
1161 		}
1162 		dst_hold(&rt->dst);
1163 		inet->cork.dst = &rt->dst;
1164 		inet->cork.fl = *fl;
1165 		np->cork.hop_limit = hlimit;
1166 		np->cork.tclass = tclass;
1167 		mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
1168 		      rt->dst.dev->mtu : dst_mtu(rt->dst.path);
1169 		if (np->frag_size < mtu) {
1170 			if (np->frag_size)
1171 				mtu = np->frag_size;
1172 		}
1173 		inet->cork.fragsize = mtu;
1174 		if (dst_allfrag(rt->dst.path))
1175 			inet->cork.flags |= IPCORK_ALLFRAG;
1176 		inet->cork.length = 0;
1177 		sk->sk_sndmsg_page = NULL;
1178 		sk->sk_sndmsg_off = 0;
1179 		exthdrlen = rt->dst.header_len + (opt ? opt->opt_flen : 0) -
1180 			    rt->rt6i_nfheader_len;
1181 		length += exthdrlen;
1182 		transhdrlen += exthdrlen;
1183 	} else {
1184 		rt = (struct rt6_info *)inet->cork.dst;
1185 		fl = &inet->cork.fl;
1186 		opt = np->cork.opt;
1187 		transhdrlen = 0;
1188 		exthdrlen = 0;
1189 		mtu = inet->cork.fragsize;
1190 	}
1191 
1192 	hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1193 
1194 	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1195 			(opt ? opt->opt_nflen : 0);
1196 	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);
1197 
1198 	if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
1199 		if (inet->cork.length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
1200 			ipv6_local_error(sk, EMSGSIZE, fl, mtu-exthdrlen);
1201 			return -EMSGSIZE;
1202 		}
1203 	}
1204 
1205 	/*
1206 	 * Let's try using as much space as possible.
1207 	 * Use MTU if total length of the message fits into the MTU.
1208 	 * Otherwise, we need to reserve the fragment header and
1209 	 * fragment alignment (= 8-15 octets, in total).
1210 	 *
1211 	 * Note that we may need to "move" the data from the tail
1212 	 * of the buffer to the new fragment when we split
1213 	 * the message.
1214 	 *
1215 	 * FIXME: It may be fragmented into multiple chunks
1216 	 *        at once if non-fragmentable extension headers
1217 	 *        are too large.
1218 	 * --yoshfuji
1219 	 */
1220 
1221 	inet->cork.length += length;
1222 	if (length > mtu) {
1223 		int proto = sk->sk_protocol;
1224 		if (dontfrag && (proto == IPPROTO_UDP || proto == IPPROTO_RAW)) {
1225 			ipv6_local_rxpmtu(sk, fl, mtu-exthdrlen);
1226 			return -EMSGSIZE;
1227 		}
1228 
1229 		if (proto == IPPROTO_UDP &&
1230 		    (rt->dst.dev->features & NETIF_F_UFO)) {
1231 
1232 			err = ip6_ufo_append_data(sk, getfrag, from, length,
1233 						  hh_len, fragheaderlen,
1234 						  transhdrlen, mtu, flags);
1235 			if (err)
1236 				goto error;
1237 			return 0;
1238 		}
1239 	}
1240 
1241 	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
1242 		goto alloc_new_skb;
1243 
1244 	while (length > 0) {
1245 		/* Check if the remaining data fits into current packet. */
1246 		copy = (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1247 		if (copy < length)
1248 			copy = maxfraglen - skb->len;
1249 
1250 		if (copy <= 0) {
1251 			char *data;
1252 			unsigned int datalen;
1253 			unsigned int fraglen;
1254 			unsigned int fraggap;
1255 			unsigned int alloclen;
1256 			struct sk_buff *skb_prev;
1257 alloc_new_skb:
1258 			skb_prev = skb;
1259 
1260 			/* There's no room in the current skb */
1261 			if (skb_prev)
1262 				fraggap = skb_prev->len - maxfraglen;
1263 			else
1264 				fraggap = 0;
1265 
1266 			/*
1267 			 * If remaining data exceeds the mtu,
1268 			 * we know we need more fragment(s).
1269 			 */
1270 			datalen = length + fraggap;
1271 			if (datalen > (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1272 				datalen = maxfraglen - fragheaderlen;
1273 
1274 			fraglen = datalen + fragheaderlen;
1275 			if ((flags & MSG_MORE) &&
1276 			    !(rt->dst.dev->features&NETIF_F_SG))
1277 				alloclen = mtu;
1278 			else
1279 				alloclen = datalen + fragheaderlen;
1280 
1281 			/*
1282 			 * The last fragment gets additional space at tail.
1283 			 * Note: we overallocate on fragments with MSG_MORE
1284 			 * because we have no idea if we're the last one.
1285 			 */
1286 			if (datalen == length + fraggap)
1287 				alloclen += rt->dst.trailer_len;
1288 
1289 			/*
1290 			 * We just reserve space for fragment header.
1291 			 * Note: this may be overallocation if the message
1292 			 * (without MSG_MORE) fits into the MTU.
1293 			 */
1294 			alloclen += sizeof(struct frag_hdr);
1295 
1296 			if (transhdrlen) {
1297 				skb = sock_alloc_send_skb(sk,
1298 						alloclen + hh_len,
1299 						(flags & MSG_DONTWAIT), &err);
1300 			} else {
1301 				skb = NULL;
1302 				if (atomic_read(&sk->sk_wmem_alloc) <=
1303 				    2 * sk->sk_sndbuf)
1304 					skb = sock_wmalloc(sk,
1305 							   alloclen + hh_len, 1,
1306 							   sk->sk_allocation);
1307 				if (unlikely(skb == NULL))
1308 					err = -ENOBUFS;
1309 			}
1310 			if (skb == NULL)
1311 				goto error;
1312 			/*
1313 			 *	Fill in the control structures
1314 			 */
1315 			skb->ip_summed = csummode;
1316 			skb->csum = 0;
1317 			/* reserve for fragmentation */
1318 			skb_reserve(skb, hh_len+sizeof(struct frag_hdr));
1319 
1320 			/*
1321 			 *	Find where to start putting bytes
1322 			 */
1323 			data = skb_put(skb, fraglen);
1324 			skb_set_network_header(skb, exthdrlen);
1325 			data += fragheaderlen;
1326 			skb->transport_header = (skb->network_header +
1327 						 fragheaderlen);
1328 			if (fraggap) {
1329 				skb->csum = skb_copy_and_csum_bits(
1330 					skb_prev, maxfraglen,
1331 					data + transhdrlen, fraggap, 0);
1332 				skb_prev->csum = csum_sub(skb_prev->csum,
1333 							  skb->csum);
1334 				data += fraggap;
1335 				pskb_trim_unique(skb_prev, maxfraglen);
1336 			}
1337 			copy = datalen - transhdrlen - fraggap;
1338 			if (copy < 0) {
1339 				err = -EINVAL;
1340 				kfree_skb(skb);
1341 				goto error;
1342 			} else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
1343 				err = -EFAULT;
1344 				kfree_skb(skb);
1345 				goto error;
1346 			}
1347 
1348 			offset += copy;
1349 			length -= datalen - fraggap;
1350 			transhdrlen = 0;
1351 			exthdrlen = 0;
1352 			csummode = CHECKSUM_NONE;
1353 
1354 			/*
1355 			 * Put the packet on the pending queue
1356 			 */
1357 			__skb_queue_tail(&sk->sk_write_queue, skb);
1358 			continue;
1359 		}
1360 
1361 		if (copy > length)
1362 			copy = length;
1363 
1364 		if (!(rt->dst.dev->features&NETIF_F_SG)) {
1365 			unsigned int off;
1366 
1367 			off = skb->len;
1368 			if (getfrag(from, skb_put(skb, copy),
1369 						offset, copy, off, skb) < 0) {
1370 				__skb_trim(skb, off);
1371 				err = -EFAULT;
1372 				goto error;
1373 			}
1374 		} else {
1375 			int i = skb_shinfo(skb)->nr_frags;
1376 			skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
1377 			struct page *page = sk->sk_sndmsg_page;
1378 			int off = sk->sk_sndmsg_off;
1379 			unsigned int left;
1380 
1381 			if (page && (left = PAGE_SIZE - off) > 0) {
1382 				if (copy >= left)
1383 					copy = left;
1384 				if (page != frag->page) {
1385 					if (i == MAX_SKB_FRAGS) {
1386 						err = -EMSGSIZE;
1387 						goto error;
1388 					}
1389 					get_page(page);
1390 					skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
1391 					frag = &skb_shinfo(skb)->frags[i];
1392 				}
1393 			} else if (i < MAX_SKB_FRAGS) {
1394 				if (copy > PAGE_SIZE)
1395 					copy = PAGE_SIZE;
1396 				page = alloc_pages(sk->sk_allocation, 0);
1397 				if (page == NULL) {
1398 					err = -ENOMEM;
1399 					goto error;
1400 				}
1401 				sk->sk_sndmsg_page = page;
1402 				sk->sk_sndmsg_off = 0;
1403 
1404 				skb_fill_page_desc(skb, i, page, 0, 0);
1405 				frag = &skb_shinfo(skb)->frags[i];
1406 			} else {
1407 				err = -EMSGSIZE;
1408 				goto error;
1409 			}
1410 			if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
1411 				err = -EFAULT;
1412 				goto error;
1413 			}
1414 			sk->sk_sndmsg_off += copy;
1415 			frag->size += copy;
1416 			skb->len += copy;
1417 			skb->data_len += copy;
1418 			skb->truesize += copy;
1419 			atomic_add(copy, &sk->sk_wmem_alloc);
1420 		}
1421 		offset += copy;
1422 		length -= copy;
1423 	}
1424 	return 0;
1425 error:
1426 	inet->cork.length -= length;
1427 	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1428 	return err;
1429 }
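
/*
 * Worked example for the geometry set up at the top of
 * ip6_append_data(), assuming mtu = 1500 and no extension headers:
 * fragheaderlen = 40, so maxfraglen = ((1500 - 40) & ~7) + 40 - 8 =
 * 1488. maxfraglen is the largest packet (IPv6 header included,
 * fragment header excluded) such that, once ip6_fragment() inserts the
 * 8-byte fragment header, each fragment still fits in the MTU with an
 * 8-octet-aligned payload.
 */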
1430 
1431 static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np)
1432 {
1433 	if (np->cork.opt) {
1434 		kfree(np->cork.opt->dst0opt);
1435 		kfree(np->cork.opt->dst1opt);
1436 		kfree(np->cork.opt->hopopt);
1437 		kfree(np->cork.opt->srcrt);
1438 		kfree(np->cork.opt);
1439 		np->cork.opt = NULL;
1440 	}
1441 
1442 	if (inet->cork.dst) {
1443 		dst_release(inet->cork.dst);
1444 		inet->cork.dst = NULL;
1445 		inet->cork.flags &= ~IPCORK_ALLFRAG;
1446 	}
1447 	memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
1448 }
1449 
1450 int ip6_push_pending_frames(struct sock *sk)
1451 {
1452 	struct sk_buff *skb, *tmp_skb;
1453 	struct sk_buff **tail_skb;
1454 	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1455 	struct inet_sock *inet = inet_sk(sk);
1456 	struct ipv6_pinfo *np = inet6_sk(sk);
1457 	struct net *net = sock_net(sk);
1458 	struct ipv6hdr *hdr;
1459 	struct ipv6_txoptions *opt = np->cork.opt;
1460 	struct rt6_info *rt = (struct rt6_info *)inet->cork.dst;
1461 	struct flowi *fl = &inet->cork.fl;
1462 	unsigned char proto = fl->proto;
1463 	int err = 0;
1464 
1465 	if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
1466 		goto out;
1467 	tail_skb = &(skb_shinfo(skb)->frag_list);
1468 
1469 	/* move skb->data to the IP header (from the ext header) */
1470 	if (skb->data < skb_network_header(skb))
1471 		__skb_pull(skb, skb_network_offset(skb));
1472 	while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
1473 		__skb_pull(tmp_skb, skb_network_header_len(skb));
1474 		*tail_skb = tmp_skb;
1475 		tail_skb = &(tmp_skb->next);
1476 		skb->len += tmp_skb->len;
1477 		skb->data_len += tmp_skb->len;
1478 		skb->truesize += tmp_skb->truesize;
1479 		tmp_skb->destructor = NULL;
1480 		tmp_skb->sk = NULL;
1481 	}
1482 
1483 	/* Allow local fragmentation. */
1484 	if (np->pmtudisc < IPV6_PMTUDISC_DO)
1485 		skb->local_df = 1;
1486 
1487 	ipv6_addr_copy(final_dst, &fl->fl6_dst);
1488 	__skb_pull(skb, skb_network_header_len(skb));
1489 	if (opt && opt->opt_flen)
1490 		ipv6_push_frag_opts(skb, opt, &proto);
1491 	if (opt && opt->opt_nflen)
1492 		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);
1493 
1494 	skb_push(skb, sizeof(struct ipv6hdr));
1495 	skb_reset_network_header(skb);
1496 	hdr = ipv6_hdr(skb);
1497 
1498 	*(__be32*)hdr = fl->fl6_flowlabel |
1499 		     htonl(0x60000000 | ((int)np->cork.tclass << 20));
1500 
1501 	hdr->hop_limit = np->cork.hop_limit;
1502 	hdr->nexthdr = proto;
1503 	ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
1504 	ipv6_addr_copy(&hdr->daddr, final_dst);
1505 
1506 	skb->priority = sk->sk_priority;
1507 	skb->mark = sk->sk_mark;
1508 
1509 	skb_dst_set(skb, dst_clone(&rt->dst));
1510 	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1511 	if (proto == IPPROTO_ICMPV6) {
1512 		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1513 
1514 		ICMP6MSGOUT_INC_STATS_BH(net, idev, icmp6_hdr(skb)->icmp6_type);
1515 		ICMP6_INC_STATS_BH(net, idev, ICMP6_MIB_OUTMSGS);
1516 	}
1517 
1518 	err = ip6_local_out(skb);
1519 	if (err) {
1520 		if (err > 0)
1521 			err = net_xmit_errno(err);
1522 		if (err)
1523 			goto error;
1524 	}
1525 
1526 out:
1527 	ip6_cork_release(inet, np);
1528 	return err;
1529 error:
1530 	IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1531 	goto out;
1532 }
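
/*
 * A minimal sketch of the corked-send flow these functions implement
 * (hypothetical caller; getfrag, flow and route setup are elided, and
 * the variables here are assumptions, not part of this file):
 *
 *	err = ip6_append_data(sk, getfrag, msg->msg_iov, len, 0,
 *			      hlimit, tclass, opt, &fl, rt,
 *			      msg->msg_flags, dontfrag);
 *	if (err)
 *		ip6_flush_pending_frames(sk);
 *	else if (!(msg->msg_flags & MSG_MORE))
 *		err = ip6_push_pending_frames(sk);
 *
 * This mirrors how the datagram protocols drive the pair: data
 * accumulates on sk_write_queue, and the IPv6 header is only built and
 * the packet only sent when the cork is released.
 */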
1533 
1534 void ip6_flush_pending_frames(struct sock *sk)
1535 {
1536 	struct sk_buff *skb;
1537 
1538 	while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
1539 		if (skb_dst(skb))
1540 			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1541 				      IPSTATS_MIB_OUTDISCARDS);
1542 		kfree_skb(skb);
1543 	}
1544 
1545 	ip6_cork_release(inet_sk(sk), inet6_sk(sk));
1546 }
1547