xref: /openbmc/linux/net/ipv6/ip6_output.c (revision baa7eb025ab14f3cba2e35c0a8648f9c9f01d24f)
1 /*
2  *	IPv6 output functions
3  *	Linux INET6 implementation
4  *
5  *	Authors:
6  *	Pedro Roque		<roque@di.fc.ul.pt>
7  *
8  *	Based on linux/net/ipv4/ip_output.c
9  *
10  *	This program is free software; you can redistribute it and/or
11  *      modify it under the terms of the GNU General Public License
12  *      as published by the Free Software Foundation; either version
13  *      2 of the License, or (at your option) any later version.
14  *
15  *	Changes:
16  *	A.N.Kuznetsov	:	arithmetic in fragmentation.
17  *				extension headers are implemented.
18  *				route changes now work.
19  *				ip6_forward does not confuse sniffers.
20  *				etc.
21  *
22  *      H. von Brand    :       Added missing #include <linux/string.h>
23  *	Imran Patel	: 	frag id should be in NBO
24  *      Kazunori MIYAZAWA @USAGI
25  *			:       add ip6_append_data and related functions
26  *				for datagram xmit
27  */
28 
29 #include <linux/errno.h>
30 #include <linux/kernel.h>
31 #include <linux/string.h>
32 #include <linux/socket.h>
33 #include <linux/net.h>
34 #include <linux/netdevice.h>
35 #include <linux/if_arp.h>
36 #include <linux/in6.h>
37 #include <linux/tcp.h>
38 #include <linux/route.h>
39 #include <linux/module.h>
40 #include <linux/slab.h>
41 
42 #include <linux/netfilter.h>
43 #include <linux/netfilter_ipv6.h>
44 
45 #include <net/sock.h>
46 #include <net/snmp.h>
47 
48 #include <net/ipv6.h>
49 #include <net/ndisc.h>
50 #include <net/protocol.h>
51 #include <net/ip6_route.h>
52 #include <net/addrconf.h>
53 #include <net/rawv6.h>
54 #include <net/icmp.h>
55 #include <net/xfrm.h>
56 #include <net/checksum.h>
57 #include <linux/mroute6.h>
58 
59 int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));
60 
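/*
 * Rough sketch of what __ip6_local_out() below does, assuming a plain
 * 1500-byte skb with no jumbo payload:
 *
 *	len = skb->len - sizeof(struct ipv6hdr);	// 1500 - 40 = 1460
 *	ipv6_hdr(skb)->payload_len = htons(len);	// 0x05 0xb4 on the wire
 *
 * If len exceeded IPV6_MAXPLEN (65535), payload_len would be set to 0,
 * which is the value a jumbogram would carry.  nf_hook() returns 1 when
 * the NF_INET_LOCAL_OUT hook accepts the packet, which is why
 * ip6_local_out() only calls dst_output() when it sees err == 1.
 */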
61 int __ip6_local_out(struct sk_buff *skb)
62 {
63 	int len;
64 
65 	len = skb->len - sizeof(struct ipv6hdr);
66 	if (len > IPV6_MAXPLEN)
67 		len = 0;
68 	ipv6_hdr(skb)->payload_len = htons(len);
69 
70 	return nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
71 		       skb_dst(skb)->dev, dst_output);
72 }
73 
74 int ip6_local_out(struct sk_buff *skb)
75 {
76 	int err;
77 
78 	err = __ip6_local_out(skb);
79 	if (likely(err == 1))
80 		err = dst_output(skb);
81 
82 	return err;
83 }
84 EXPORT_SYMBOL_GPL(ip6_local_out);
85 
86 /* dev_loopback_xmit for use with netfilter. */
87 static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
88 {
89 	skb_reset_mac_header(newskb);
90 	__skb_pull(newskb, skb_network_offset(newskb));
91 	newskb->pkt_type = PACKET_LOOPBACK;
92 	newskb->ip_summed = CHECKSUM_UNNECESSARY;
93 	WARN_ON(!skb_dst(newskb));
94 
95 	netif_rx_ni(newskb);
96 	return 0;
97 }
98 
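/*
 * Sketch of the multicast handling in ip6_finish_output2() below, under
 * the assumption that a locally generated packet is sent to a group the
 * host itself has joined and the socket has multicast loopback enabled:
 *
 *	- sk_mc_loop(skb->sk) is true and ipv6_chk_mcast_addr() matches, so
 *	  a clone of the skb is fed back through NF_INET_POST_ROUTING and
 *	  ip6_dev_loopback_xmit(), i.e. it is delivered locally via
 *	  netif_rx_ni() as a PACKET_LOOPBACK frame.
 *	- The original skb still goes out on the wire through dst->hh /
 *	  dst->neighbour below, unless hop_limit is 0, in which case it is
 *	  counted as an OUTDISCARD and dropped.
 */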
99 static int ip6_finish_output2(struct sk_buff *skb)
100 {
101 	struct dst_entry *dst = skb_dst(skb);
102 	struct net_device *dev = dst->dev;
103 
104 	skb->protocol = htons(ETH_P_IPV6);
105 	skb->dev = dev;
106 
107 	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
108 		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
109 
110 		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(skb->sk) &&
111 		    ((mroute6_socket(dev_net(dev), skb) &&
112 		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
113 		     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
114 					 &ipv6_hdr(skb)->saddr))) {
115 			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
116 
117 			/* Do not check for IFF_ALLMULTI; multicast routing
118 			   is not supported in any case.
119 			 */
120 			if (newskb)
121 				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
122 					newskb, NULL, newskb->dev,
123 					ip6_dev_loopback_xmit);
124 
125 			if (ipv6_hdr(skb)->hop_limit == 0) {
126 				IP6_INC_STATS(dev_net(dev), idev,
127 					      IPSTATS_MIB_OUTDISCARDS);
128 				kfree_skb(skb);
129 				return 0;
130 			}
131 		}
132 
133 		IP6_UPD_PO_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCAST,
134 				skb->len);
135 	}
136 
137 	if (dst->hh)
138 		return neigh_hh_output(dst->hh, skb);
139 	else if (dst->neighbour)
140 		return dst->neighbour->output(skb);
141 
142 	IP6_INC_STATS_BH(dev_net(dst->dev),
143 			 ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
144 	kfree_skb(skb);
145 	return -EINVAL;
146 }
147 
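/*
 * ip6_finish_output() below decides between fragmenting and sending as-is.
 * A minimal sketch of the decision, assuming a non-GSO skb:
 *
 *	if (skb->len > ip6_skb_dst_mtu(skb) || dst_allfrag(skb_dst(skb)))
 *		ip6_fragment(skb, ip6_finish_output2);
 *	else
 *		ip6_finish_output2(skb);
 *
 * dst_allfrag() is read here as covering the case where the peer reported
 * a path MTU below IPV6_MIN_MTU, so the packet keeps the minimum MTU but
 * must carry a fragment header (an assumption; it is not spelled out in
 * this file).
 */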
148 static int ip6_finish_output(struct sk_buff *skb)
149 {
150 	if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
151 	    dst_allfrag(skb_dst(skb)))
152 		return ip6_fragment(skb, ip6_finish_output2);
153 	else
154 		return ip6_finish_output2(skb);
155 }
156 
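/*
 * ip6_output() below is typically installed as the dst output routine for
 * IPv6 routes (the assignment lives in the route code, not here).  It
 * drops the packet early when IPv6 is administratively disabled on the
 * egress device and otherwise runs NF_INET_POST_ROUTING; the
 * NF_HOOK_COND() condition skips that hook for packets already flagged
 * IP6SKB_REROUTED, so they are not evaluated by it twice.
 */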
157 int ip6_output(struct sk_buff *skb)
158 {
159 	struct net_device *dev = skb_dst(skb)->dev;
160 	struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
161 	if (unlikely(idev->cnf.disable_ipv6)) {
162 		IP6_INC_STATS(dev_net(dev), idev,
163 			      IPSTATS_MIB_OUTDISCARDS);
164 		kfree_skb(skb);
165 		return 0;
166 	}
167 
168 	return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, skb, NULL, dev,
169 			    ip6_finish_output,
170 			    !(IP6CB(skb)->flags & IP6SKB_REROUTED));
171 }
172 
173 /*
174  *	xmit an sk_buff (used by TCP, SCTP and DCCP)
175  */
176 
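/*
 * The first 32 bits of the IPv6 header built in ip6_xmit() below pack the
 * version, traffic class and flow label.  A worked example, assuming
 * tclass = 0x20 and a zero flow label:
 *
 *	*(__be32 *)hdr = htonl(0x60000000 | (0x20 << 20)) | 0;
 *		       = htonl(0x62000000)
 *
 * i.e. version 6 in the top nibble, the traffic class in bits 27..20 and
 * the 20-bit flow label (already in network byte order in
 * fl->fl6_flowlabel) in the low bits.
 */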
177 int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
178 	     struct ipv6_txoptions *opt)
179 {
180 	struct net *net = sock_net(sk);
181 	struct ipv6_pinfo *np = inet6_sk(sk);
182 	struct in6_addr *first_hop = &fl->fl6_dst;
183 	struct dst_entry *dst = skb_dst(skb);
184 	struct ipv6hdr *hdr;
185 	u8  proto = fl->proto;
186 	int seg_len = skb->len;
187 	int hlimit = -1;
188 	int tclass = 0;
189 	u32 mtu;
190 
191 	if (opt) {
192 		unsigned int head_room;
193 
194 		/* First: exthdrs may take lots of space (~8K for now);
195 		   MAX_HEADER is not enough.
196 		 */
197 		head_room = opt->opt_nflen + opt->opt_flen;
198 		seg_len += head_room;
199 		head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
200 
201 		if (skb_headroom(skb) < head_room) {
202 			struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
203 			if (skb2 == NULL) {
204 				IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
205 					      IPSTATS_MIB_OUTDISCARDS);
206 				kfree_skb(skb);
207 				return -ENOBUFS;
208 			}
209 			kfree_skb(skb);
210 			skb = skb2;
211 			skb_set_owner_w(skb, sk);
212 		}
213 		if (opt->opt_flen)
214 			ipv6_push_frag_opts(skb, opt, &proto);
215 		if (opt->opt_nflen)
216 			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
217 	}
218 
219 	skb_push(skb, sizeof(struct ipv6hdr));
220 	skb_reset_network_header(skb);
221 	hdr = ipv6_hdr(skb);
222 
223 	/*
224 	 *	Fill in the IPv6 header
225 	 */
226 	if (np) {
227 		tclass = np->tclass;
228 		hlimit = np->hop_limit;
229 	}
230 	if (hlimit < 0)
231 		hlimit = ip6_dst_hoplimit(dst);
232 
233 	*(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl->fl6_flowlabel;
234 
235 	hdr->payload_len = htons(seg_len);
236 	hdr->nexthdr = proto;
237 	hdr->hop_limit = hlimit;
238 
239 	ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
240 	ipv6_addr_copy(&hdr->daddr, first_hop);
241 
242 	skb->priority = sk->sk_priority;
243 	skb->mark = sk->sk_mark;
244 
245 	mtu = dst_mtu(dst);
246 	if ((skb->len <= mtu) || skb->local_df || skb_is_gso(skb)) {
247 		IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
248 			      IPSTATS_MIB_OUT, skb->len);
249 		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
250 			       dst->dev, dst_output);
251 	}
252 
253 	if (net_ratelimit())
254 		printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
255 	skb->dev = dst->dev;
256 	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
257 	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
258 	kfree_skb(skb);
259 	return -EMSGSIZE;
260 }
261 
262 EXPORT_SYMBOL(ip6_xmit);
263 
264 /*
265  *	To avoid extra problems, ND packets are sent through this
266  *	routine. It's code duplication, but I really want to avoid
267  *	extra checks, since ipv6_build_header is used by TCP (which
268  *	is performance critical for us).
269  */
270 
271 int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
272 	       const struct in6_addr *saddr, const struct in6_addr *daddr,
273 	       int proto, int len)
274 {
275 	struct ipv6_pinfo *np = inet6_sk(sk);
276 	struct ipv6hdr *hdr;
277 	int totlen;
278 
279 	skb->protocol = htons(ETH_P_IPV6);
280 	skb->dev = dev;
281 
282 	totlen = len + sizeof(struct ipv6hdr);
283 
284 	skb_reset_network_header(skb);
285 	skb_put(skb, sizeof(struct ipv6hdr));
286 	hdr = ipv6_hdr(skb);
287 
288 	*(__be32 *)hdr = htonl(0x60000000);
289 
290 	hdr->payload_len = htons(len);
291 	hdr->nexthdr = proto;
292 	hdr->hop_limit = np->hop_limit;
293 
294 	ipv6_addr_copy(&hdr->saddr, saddr);
295 	ipv6_addr_copy(&hdr->daddr, daddr);
296 
297 	return 0;
298 }
299 
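/*
 * ip6_call_ra_chain() below delivers packets carrying a Router Alert
 * option to raw sockets that registered for that alert value.  The
 * ip6_ra_chain list is maintained elsewhere, presumably via the
 * IPV6_ROUTER_ALERT socket option (an assumption; it is not defined in
 * this file).  Every matching socket but the last gets a clone of the
 * skb; the last one consumes the original, and a return value of 1 tells
 * ip6_forward() to stop processing the packet.
 */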
300 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
301 {
302 	struct ip6_ra_chain *ra;
303 	struct sock *last = NULL;
304 
305 	read_lock(&ip6_ra_lock);
306 	for (ra = ip6_ra_chain; ra; ra = ra->next) {
307 		struct sock *sk = ra->sk;
308 		if (sk && ra->sel == sel &&
309 		    (!sk->sk_bound_dev_if ||
310 		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
311 			if (last) {
312 				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
313 				if (skb2)
314 					rawv6_rcv(last, skb2);
315 			}
316 			last = sk;
317 		}
318 	}
319 
320 	if (last) {
321 		rawv6_rcv(last, skb);
322 		read_unlock(&ip6_ra_lock);
323 		return 1;
324 	}
325 	read_unlock(&ip6_ra_lock);
326 	return 0;
327 }
328 
329 static int ip6_forward_proxy_check(struct sk_buff *skb)
330 {
331 	struct ipv6hdr *hdr = ipv6_hdr(skb);
332 	u8 nexthdr = hdr->nexthdr;
333 	int offset;
334 
335 	if (ipv6_ext_hdr(nexthdr)) {
336 		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr);
337 		if (offset < 0)
338 			return 0;
339 	} else
340 		offset = sizeof(struct ipv6hdr);
341 
342 	if (nexthdr == IPPROTO_ICMPV6) {
343 		struct icmp6hdr *icmp6;
344 
345 		if (!pskb_may_pull(skb, (skb_network_header(skb) +
346 					 offset + 1 - skb->data)))
347 			return 0;
348 
349 		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
350 
351 		switch (icmp6->icmp6_type) {
352 		case NDISC_ROUTER_SOLICITATION:
353 		case NDISC_ROUTER_ADVERTISEMENT:
354 		case NDISC_NEIGHBOUR_SOLICITATION:
355 		case NDISC_NEIGHBOUR_ADVERTISEMENT:
356 		case NDISC_REDIRECT:
357 			/* For a reaction involving a unicast neighbor discovery
358 			 * message destined to the proxied address, pass it to
359 			 * the input function.
360 			 */
361 			return 1;
362 		default:
363 			break;
364 		}
365 	}
366 
367 	/*
368 	 * The proxying router can't forward traffic sent to a link-local
369 	 * address, so signal the sender and discard the packet. This
370 	 * behavior is clarified by the MIPv6 specification.
371 	 */
372 	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
373 		dst_link_failure(skb);
374 		return -1;
375 	}
376 
377 	return 0;
378 }
379 
380 static inline int ip6_forward_finish(struct sk_buff *skb)
381 {
382 	return dst_output(skb);
383 }
384 
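/*
 * Forwarding path overview (a condensed reading of ip6_forward() below,
 * not a substitute for it):
 *
 *	1. Drop unless net->ipv6.devconf_all->forwarding is enabled.
 *	2. Punt Router Alert packets to ip6_call_ra_chain().
 *	3. hop_limit <= 1: send ICMPV6_TIME_EXCEED and drop.
 *	4. Proxy-NDP targets are handed back to ip6_input().
 *	5. Possibly emit a redirect when the packet leaves on the
 *	   interface it arrived on.
 *	6. skb->len > MTU (and not GSO): send ICMPV6_PKT_TOOBIG and drop.
 *	7. Otherwise decrement hop_limit after skb_cow() and hand the
 *	   packet to the NF_INET_FORWARD hook, then to dst_output() via
 *	   ip6_forward_finish().
 */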
385 int ip6_forward(struct sk_buff *skb)
386 {
387 	struct dst_entry *dst = skb_dst(skb);
388 	struct ipv6hdr *hdr = ipv6_hdr(skb);
389 	struct inet6_skb_parm *opt = IP6CB(skb);
390 	struct net *net = dev_net(dst->dev);
391 	u32 mtu;
392 
393 	if (net->ipv6.devconf_all->forwarding == 0)
394 		goto error;
395 
396 	if (skb_warn_if_lro(skb))
397 		goto drop;
398 
399 	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
400 		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
401 		goto drop;
402 	}
403 
404 	skb_forward_csum(skb);
405 
406 	/*
407 	 *	We DO NOT do any processing on
408 	 *	RA packets; we push them to user level AS IS
409 	 *	without any guarantee that the application will be able
410 	 *	to interpret them. The reason is that we
411 	 *	cannot do anything clever here.
412 	 *
413 	 *	We are not the end node, so if the packet contains
414 	 *	AH/ESP we cannot do anything with it.
415 	 *	Defragmentation would also be a mistake; RA packets
416 	 *	cannot be fragmented, because there is no guarantee
417 	 *	that different fragments will go along one path. --ANK
418 	 */
419 	if (opt->ra) {
420 		u8 *ptr = skb_network_header(skb) + opt->ra;
421 		if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
422 			return 0;
423 	}
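	/*
	 * Sketch of the Router Alert value extraction above: opt->ra holds
	 * the offset of the RA option inside the network header (filled in
	 * by the hop-by-hop option parser elsewhere), ptr[0]/ptr[1] are the
	 * option type and length, and ptr[2..3] carry the 16-bit alert
	 * value in network order, hence (ptr[2] << 8) + ptr[3].
	 */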
424 
425 	/*
426 	 *	check and decrement ttl
427 	 */
428 	if (hdr->hop_limit <= 1) {
429 		/* Force the OUTPUT device to be used when choosing the source address */
430 		skb->dev = dst->dev;
431 		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
432 		IP6_INC_STATS_BH(net,
433 				 ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);
434 
435 		kfree_skb(skb);
436 		return -ETIMEDOUT;
437 	}
438 
439 	/* XXX: idev->cnf.proxy_ndp? */
440 	if (net->ipv6.devconf_all->proxy_ndp &&
441 	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
442 		int proxied = ip6_forward_proxy_check(skb);
443 		if (proxied > 0)
444 			return ip6_input(skb);
445 		else if (proxied < 0) {
446 			IP6_INC_STATS(net, ip6_dst_idev(dst),
447 				      IPSTATS_MIB_INDISCARDS);
448 			goto drop;
449 		}
450 	}
451 
452 	if (!xfrm6_route_forward(skb)) {
453 		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
454 		goto drop;
455 	}
456 	dst = skb_dst(skb);
457 
458 	/* IPv6 specs say nothing about it, but it is clear that we cannot
459 	   send redirects to source routed frames.
460 	   We don't send redirects to frames decapsulated from IPsec.
461 	 */
462 	if (skb->dev == dst->dev && dst->neighbour && opt->srcrt == 0 &&
463 	    !skb_sec_path(skb)) {
464 		struct in6_addr *target = NULL;
465 		struct rt6_info *rt;
466 		struct neighbour *n = dst->neighbour;
467 
468 		/*
469 		 *	incoming and outgoing devices are the same;
470 		 *	send a redirect.
471 		 */
472 
473 		rt = (struct rt6_info *) dst;
474 		if ((rt->rt6i_flags & RTF_GATEWAY))
475 			target = (struct in6_addr*)&n->primary_key;
476 		else
477 			target = &hdr->daddr;
478 
479 		/* Limit redirects both by destination (here)
480 		   and by source (inside ndisc_send_redirect)
481 		 */
482 		if (xrlim_allow(dst, 1*HZ))
483 			ndisc_send_redirect(skb, n, target);
484 	} else {
485 		int addrtype = ipv6_addr_type(&hdr->saddr);
486 
487 		/* This check is security critical. */
488 		if (addrtype == IPV6_ADDR_ANY ||
489 		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
490 			goto error;
491 		if (addrtype & IPV6_ADDR_LINKLOCAL) {
492 			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
493 				    ICMPV6_NOT_NEIGHBOUR, 0);
494 			goto error;
495 		}
496 	}
497 
498 	mtu = dst_mtu(dst);
499 	if (mtu < IPV6_MIN_MTU)
500 		mtu = IPV6_MIN_MTU;
501 
502 	if (skb->len > mtu && !skb_is_gso(skb)) {
503 		/* Again, force the OUTPUT device to be used when choosing the source address */
504 		skb->dev = dst->dev;
505 		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
506 		IP6_INC_STATS_BH(net,
507 				 ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS);
508 		IP6_INC_STATS_BH(net,
509 				 ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS);
510 		kfree_skb(skb);
511 		return -EMSGSIZE;
512 	}
513 
514 	if (skb_cow(skb, dst->dev->hard_header_len)) {
515 		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
516 		goto drop;
517 	}
518 
519 	hdr = ipv6_hdr(skb);
520 
521 	/* Mangling hops number delayed to point after skb COW */
522 
523 	hdr->hop_limit--;
524 
525 	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
526 	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
527 		       ip6_forward_finish);
528 
529 error:
530 	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
531 drop:
532 	kfree_skb(skb);
533 	return -EINVAL;
534 }
535 
536 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
537 {
538 	to->pkt_type = from->pkt_type;
539 	to->priority = from->priority;
540 	to->protocol = from->protocol;
541 	skb_dst_drop(to);
542 	skb_dst_set(to, dst_clone(skb_dst(from)));
543 	to->dev = from->dev;
544 	to->mark = from->mark;
545 
546 #ifdef CONFIG_NET_SCHED
547 	to->tc_index = from->tc_index;
548 #endif
549 	nf_copy(to, from);
550 #if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
551     defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
552 	to->nf_trace = from->nf_trace;
553 #endif
554 	skb_copy_secmark(to, from);
555 }
556 
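/*
 * ip6_find_1stfragopt() below walks the extension header chain and
 * returns the offset at which a fragment header has to be inserted:
 * hop-by-hop options, a routing header, and any destination options
 * header seen before the routing header (or one carrying a home address
 * option in the MIPv6 case) stay in the unfragmentable part; a
 * destination options header that follows a routing header starts the
 * fragmentable part.  On return, *nexthdr points at the nexthdr byte of
 * the last header of the unfragmentable part, so the caller can rewrite
 * it to NEXTHDR_FRAGMENT.
 */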
557 int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
558 {
559 	u16 offset = sizeof(struct ipv6hdr);
560 	struct ipv6_opt_hdr *exthdr =
561 				(struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1);
562 	unsigned int packet_len = skb->tail - skb->network_header;
563 	int found_rhdr = 0;
564 	*nexthdr = &ipv6_hdr(skb)->nexthdr;
565 
566 	while (offset + 1 <= packet_len) {
567 
568 		switch (**nexthdr) {
569 
570 		case NEXTHDR_HOP:
571 			break;
572 		case NEXTHDR_ROUTING:
573 			found_rhdr = 1;
574 			break;
575 		case NEXTHDR_DEST:
576 #if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
577 			if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0)
578 				break;
579 #endif
580 			if (found_rhdr)
581 				return offset;
582 			break;
583 		default:
584 			return offset;
585 		}
586 
587 		offset += ipv6_optlen(exthdr);
588 		*nexthdr = &exthdr->nexthdr;
589 		exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) +
590 						 offset);
591 	}
592 
593 	return offset;
594 }
595 
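/*
 * Worked MTU arithmetic for ip6_fragment() below, assuming a 1500-byte
 * link MTU, no extension headers (hlen = 40) and no per-socket frag_size:
 *
 *	mtu -= hlen + sizeof(struct frag_hdr);	// 1500 - 40 - 8 = 1452
 *
 * In the slow path each fragment then carries at most 1452 payload bytes,
 * rounded down to a multiple of 8 for every fragment but the last
 * ("len &= ~7"), i.e. 1448 bytes, plus the copied 40-byte unfragmentable
 * part and an 8-byte fragment header.  The fast path is taken only when
 * the skb already carries a well-formed frag_list (each fragment fits the
 * per-fragment budget and is 8-byte aligned) and nothing is cloned or
 * shared.
 */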
596 int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
597 {
598 	struct sk_buff *frag;
599 	struct rt6_info *rt = (struct rt6_info*)skb_dst(skb);
600 	struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
601 	struct ipv6hdr *tmp_hdr;
602 	struct frag_hdr *fh;
603 	unsigned int mtu, hlen, left, len;
604 	__be32 frag_id = 0;
605 	int ptr, offset = 0, err = 0;
606 	u8 *prevhdr, nexthdr = 0;
607 	struct net *net = dev_net(skb_dst(skb)->dev);
608 
609 	hlen = ip6_find_1stfragopt(skb, &prevhdr);
610 	nexthdr = *prevhdr;
611 
612 	mtu = ip6_skb_dst_mtu(skb);
613 
614 	/* We must not fragment if the socket is set to force MTU discovery
615 	 * or if the skb was not generated by a local socket.
616 	 */
617 	if (!skb->local_df && skb->len > mtu) {
618 		skb->dev = skb_dst(skb)->dev;
619 		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
620 		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
621 			      IPSTATS_MIB_FRAGFAILS);
622 		kfree_skb(skb);
623 		return -EMSGSIZE;
624 	}
625 
626 	if (np && np->frag_size < mtu) {
627 		if (np->frag_size)
628 			mtu = np->frag_size;
629 	}
630 	mtu -= hlen + sizeof(struct frag_hdr);
631 
632 	if (skb_has_frag_list(skb)) {
633 		int first_len = skb_pagelen(skb);
634 		struct sk_buff *frag2;
635 
636 		if (first_len - hlen > mtu ||
637 		    ((first_len - hlen) & 7) ||
638 		    skb_cloned(skb))
639 			goto slow_path;
640 
641 		skb_walk_frags(skb, frag) {
642 			/* Correct geometry. */
643 			if (frag->len > mtu ||
644 			    ((frag->len & 7) && frag->next) ||
645 			    skb_headroom(frag) < hlen)
646 				goto slow_path_clean;
647 
648 			/* Partially cloned skb? */
649 			if (skb_shared(frag))
650 				goto slow_path_clean;
651 
652 			BUG_ON(frag->sk);
653 			if (skb->sk) {
654 				frag->sk = skb->sk;
655 				frag->destructor = sock_wfree;
656 			}
657 			skb->truesize -= frag->truesize;
658 		}
659 
660 		err = 0;
661 		offset = 0;
662 		frag = skb_shinfo(skb)->frag_list;
663 		skb_frag_list_init(skb);
664 		/* BUILD HEADER */
665 
666 		*prevhdr = NEXTHDR_FRAGMENT;
667 		tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
668 		if (!tmp_hdr) {
669 			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
670 				      IPSTATS_MIB_FRAGFAILS);
671 			return -ENOMEM;
672 		}
673 
674 		__skb_pull(skb, hlen);
675 		fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr));
676 		__skb_push(skb, hlen);
677 		skb_reset_network_header(skb);
678 		memcpy(skb_network_header(skb), tmp_hdr, hlen);
679 
680 		ipv6_select_ident(fh);
681 		fh->nexthdr = nexthdr;
682 		fh->reserved = 0;
683 		fh->frag_off = htons(IP6_MF);
684 		frag_id = fh->identification;
685 
686 		first_len = skb_pagelen(skb);
687 		skb->data_len = first_len - skb_headlen(skb);
688 		skb->len = first_len;
689 		ipv6_hdr(skb)->payload_len = htons(first_len -
690 						   sizeof(struct ipv6hdr));
691 
692 		dst_hold(&rt->dst);
693 
694 		for (;;) {
695 			/* Prepare the header of the next frame
696 			 * before the previous one has gone out. */
697 			if (frag) {
698 				frag->ip_summed = CHECKSUM_NONE;
699 				skb_reset_transport_header(frag);
700 				fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr));
701 				__skb_push(frag, hlen);
702 				skb_reset_network_header(frag);
703 				memcpy(skb_network_header(frag), tmp_hdr,
704 				       hlen);
705 				offset += skb->len - hlen - sizeof(struct frag_hdr);
706 				fh->nexthdr = nexthdr;
707 				fh->reserved = 0;
708 				fh->frag_off = htons(offset);
709 				if (frag->next != NULL)
710 					fh->frag_off |= htons(IP6_MF);
711 				fh->identification = frag_id;
712 				ipv6_hdr(frag)->payload_len =
713 						htons(frag->len -
714 						      sizeof(struct ipv6hdr));
715 				ip6_copy_metadata(frag, skb);
716 			}
717 
718 			err = output(skb);
719 			if (!err)
720 				IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
721 					      IPSTATS_MIB_FRAGCREATES);
722 
723 			if (err || !frag)
724 				break;
725 
726 			skb = frag;
727 			frag = skb->next;
728 			skb->next = NULL;
729 		}
730 
731 		kfree(tmp_hdr);
732 
733 		if (err == 0) {
734 			IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
735 				      IPSTATS_MIB_FRAGOKS);
736 			dst_release(&rt->dst);
737 			return 0;
738 		}
739 
740 		while (frag) {
741 			skb = frag->next;
742 			kfree_skb(frag);
743 			frag = skb;
744 		}
745 
746 		IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
747 			      IPSTATS_MIB_FRAGFAILS);
748 		dst_release(&rt->dst);
749 		return err;
750 
751 slow_path_clean:
752 		skb_walk_frags(skb, frag2) {
753 			if (frag2 == frag)
754 				break;
755 			frag2->sk = NULL;
756 			frag2->destructor = NULL;
757 			skb->truesize += frag2->truesize;
758 		}
759 	}
760 
761 slow_path:
762 	left = skb->len - hlen;		/* Space per frame */
763 	ptr = hlen;			/* Where to start from */
764 
765 	/*
766 	 *	Fragment the datagram.
767 	 */
768 
769 	*prevhdr = NEXTHDR_FRAGMENT;
770 
771 	/*
772 	 *	Keep copying data until we run out.
773 	 */
774 	while (left > 0) {
775 		len = left;
776 		/* IF: it doesn't fit, use 'mtu' - the data space left */
777 		if (len > mtu)
778 			len = mtu;
779 		/* IF: we are not sending up to and including the packet end,
780 		   then align the next start on an eight byte boundary */
781 		if (len < left)	{
782 			len &= ~7;
783 		}
784 		/*
785 		 *	Allocate buffer.
786 		 */
787 
788 		if ((frag = alloc_skb(len+hlen+sizeof(struct frag_hdr)+LL_ALLOCATED_SPACE(rt->dst.dev), GFP_ATOMIC)) == NULL) {
789 			NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
790 			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
791 				      IPSTATS_MIB_FRAGFAILS);
792 			err = -ENOMEM;
793 			goto fail;
794 		}
795 
796 		/*
797 		 *	Set up data on packet
798 		 */
799 
800 		ip6_copy_metadata(frag, skb);
801 		skb_reserve(frag, LL_RESERVED_SPACE(rt->dst.dev));
802 		skb_put(frag, len + hlen + sizeof(struct frag_hdr));
803 		skb_reset_network_header(frag);
804 		fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
805 		frag->transport_header = (frag->network_header + hlen +
806 					  sizeof(struct frag_hdr));
807 
808 		/*
809 		 *	Charge the memory for the fragment to any owner
810 		 *	it might possess
811 		 */
812 		if (skb->sk)
813 			skb_set_owner_w(frag, skb->sk);
814 
815 		/*
816 		 *	Copy the packet header into the new buffer.
817 		 */
818 		skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
819 
820 		/*
821 		 *	Build fragment header.
822 		 */
823 		fh->nexthdr = nexthdr;
824 		fh->reserved = 0;
825 		if (!frag_id) {
826 			ipv6_select_ident(fh);
827 			frag_id = fh->identification;
828 		} else
829 			fh->identification = frag_id;
830 
831 		/*
832 		 *	Copy a block of the IP datagram.
833 		 */
834 		if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len))
835 			BUG();
836 		left -= len;
837 
838 		fh->frag_off = htons(offset);
839 		if (left > 0)
840 			fh->frag_off |= htons(IP6_MF);
841 		ipv6_hdr(frag)->payload_len = htons(frag->len -
842 						    sizeof(struct ipv6hdr));
843 
844 		ptr += len;
845 		offset += len;
846 
847 		/*
848 		 *	Put this fragment into the sending queue.
849 		 */
850 		err = output(frag);
851 		if (err)
852 			goto fail;
853 
854 		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
855 			      IPSTATS_MIB_FRAGCREATES);
856 	}
857 	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
858 		      IPSTATS_MIB_FRAGOKS);
859 	kfree_skb(skb);
860 	return err;
861 
862 fail:
863 	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
864 		      IPSTATS_MIB_FRAGFAILS);
865 	kfree_skb(skb);
866 	return err;
867 }
868 
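/*
 * ip6_rt_check() below reports a *mismatch*: it returns nonzero when the
 * cached route can no longer be trusted for fl_addr, i.e. the route is
 * not a /128 host route for that exact address and the socket's cached
 * address (daddr_cache / saddr_cache) does not match either.
 * ip6_sk_dst_check() then drops the cached dst on such a mismatch, or
 * when the flow's oif differs from the cached device.
 */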
869 static inline int ip6_rt_check(struct rt6key *rt_key,
870 			       struct in6_addr *fl_addr,
871 			       struct in6_addr *addr_cache)
872 {
873 	return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
874 		(addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache));
875 }
876 
877 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
878 					  struct dst_entry *dst,
879 					  struct flowi *fl)
880 {
881 	struct ipv6_pinfo *np = inet6_sk(sk);
882 	struct rt6_info *rt = (struct rt6_info *)dst;
883 
884 	if (!dst)
885 		goto out;
886 
887 	/* Yes, checking route validity in the not-connected
888 	 * case is not very simple. Take into account
889 	 * that we do not support routing by source, TOS,
890 	 * and MSG_DONTROUTE 		--ANK (980726)
891 	 *
892 	 * 1. ip6_rt_check(): If the route was a host route,
893 	 *    check that the cached destination is current.
894 	 *    If it is a network route, we may still
895 	 *    check its validity using a saved pointer
896 	 *    to the last used address: daddr_cache.
897 	 *    We do not want to save the whole address now
898 	 *    (because the main consumer of this service
899 	 *    is TCP, which does not have this problem),
900 	 *    so the last trick works only on connected
901 	 *    sockets.
902 	 * 2. The oif should also be the same.
903 	 */
904 	if (ip6_rt_check(&rt->rt6i_dst, &fl->fl6_dst, np->daddr_cache) ||
905 #ifdef CONFIG_IPV6_SUBTREES
906 	    ip6_rt_check(&rt->rt6i_src, &fl->fl6_src, np->saddr_cache) ||
907 #endif
908 	    (fl->oif && fl->oif != dst->dev->ifindex)) {
909 		dst_release(dst);
910 		dst = NULL;
911 	}
912 
913 out:
914 	return dst;
915 }
916 
917 static int ip6_dst_lookup_tail(struct sock *sk,
918 			       struct dst_entry **dst, struct flowi *fl)
919 {
920 	int err;
921 	struct net *net = sock_net(sk);
922 
923 	if (*dst == NULL)
924 		*dst = ip6_route_output(net, sk, fl);
925 
926 	if ((err = (*dst)->error))
927 		goto out_err_release;
928 
929 	if (ipv6_addr_any(&fl->fl6_src)) {
930 		err = ipv6_dev_get_saddr(net, ip6_dst_idev(*dst)->dev,
931 					 &fl->fl6_dst,
932 					 sk ? inet6_sk(sk)->srcprefs : 0,
933 					 &fl->fl6_src);
934 		if (err)
935 			goto out_err_release;
936 	}
937 
938 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
939 	/*
940 	 * Here, if the dst entry we've looked up
941 	 * has a neighbour entry that is in the INCOMPLETE
942 	 * state and the source address from the flow is
943 	 * marked as OPTIMISTIC, we release the dst entry
944 	 * we found and replace it with the
945 	 * dst entry of the nexthop router.
946 	 */
947 	if ((*dst)->neighbour && !((*dst)->neighbour->nud_state & NUD_VALID)) {
948 		struct inet6_ifaddr *ifp;
949 		struct flowi fl_gw;
950 		int redirect;
951 
952 		ifp = ipv6_get_ifaddr(net, &fl->fl6_src,
953 				      (*dst)->dev, 1);
954 
955 		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
956 		if (ifp)
957 			in6_ifa_put(ifp);
958 
959 		if (redirect) {
960 			/*
961 			 * We need to get the dst entry for the
962 			 * default router instead
963 			 */
964 			dst_release(*dst);
965 			memcpy(&fl_gw, fl, sizeof(struct flowi));
966 			memset(&fl_gw.fl6_dst, 0, sizeof(struct in6_addr));
967 			*dst = ip6_route_output(net, sk, &fl_gw);
968 			if ((err = (*dst)->error))
969 				goto out_err_release;
970 		}
971 	}
972 #endif
973 
974 	return 0;
975 
976 out_err_release:
977 	if (err == -ENETUNREACH)
978 		IP6_INC_STATS_BH(net, NULL, IPSTATS_MIB_OUTNOROUTES);
979 	dst_release(*dst);
980 	*dst = NULL;
981 	return err;
982 }
983 
984 /**
985  *	ip6_dst_lookup - perform route lookup on flow
986  *	@sk: socket which provides route info
987  *	@dst: pointer to dst_entry * for result
988  *	@fl: flow to lookup
989  *
990  *	This function performs a route lookup on the given flow.
991  *
992  *	It returns zero on success, or a standard errno code on error.
993  */
994 int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
995 {
996 	*dst = NULL;
997 	return ip6_dst_lookup_tail(sk, dst, fl);
998 }
999 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1000 
1001 /**
1002  *	ip6_sk_dst_lookup - perform socket cached route lookup on flow
1003  *	@sk: socket which provides the dst cache and route info
1004  *	@dst: pointer to dst_entry * for result
1005  *	@fl: flow to lookup
1006  *
1007  *	This function performs a route lookup on the given flow with the
1008  *	possibility of using the cached route in the socket if it is valid.
1009  *	It will take the socket dst lock when operating on the dst cache.
1010  *	As a result, this function can only be used in process context.
1011  *
1012  *	It returns zero on success, or a standard errno code on error.
1013  */
1014 int ip6_sk_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
1015 {
1016 	*dst = NULL;
1017 	if (sk) {
1018 		*dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1019 		*dst = ip6_sk_dst_check(sk, *dst, fl);
1020 	}
1021 
1022 	return ip6_dst_lookup_tail(sk, dst, fl);
1023 }
1024 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup);
1025 
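/*
 * gso_size arithmetic for the UFO path below, assuming a 1500-byte MTU
 * and fragheaderlen = 40 (plain IPv6 header, no extension headers):
 *
 *	gso_size = (mtu - fragheaderlen - sizeof(struct frag_hdr)) & ~7;
 *		 = (1500 - 40 - 8) & ~7 = 1448
 *
 * so the device (or the software GSO fallback) later splits the single
 * large skb into on-the-wire fragments whose payloads are multiples of 8,
 * all sharing the identification chosen by ipv6_select_ident().
 */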
1026 static inline int ip6_ufo_append_data(struct sock *sk,
1027 			int getfrag(void *from, char *to, int offset, int len,
1028 			int odd, struct sk_buff *skb),
1029 			void *from, int length, int hh_len, int fragheaderlen,
1030 			int transhdrlen, int mtu, unsigned int flags)
1031 
1032 {
1033 	struct sk_buff *skb;
1034 	int err;
1035 
1036 	/* There is support for UDP large send offload by the network
1037 	 * device, so create one single skb containing the complete
1038 	 * UDP datagram.
1039 	 */
1040 	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
1041 		skb = sock_alloc_send_skb(sk,
1042 			hh_len + fragheaderlen + transhdrlen + 20,
1043 			(flags & MSG_DONTWAIT), &err);
1044 		if (skb == NULL)
1045 			return -ENOMEM;
1046 
1047 		/* reserve space for Hardware header */
1048 		skb_reserve(skb, hh_len);
1049 
1050 		/* create space for UDP/IP header */
1051 		skb_put(skb, fragheaderlen + transhdrlen);
1052 
1053 		/* initialize network header pointer */
1054 		skb_reset_network_header(skb);
1055 
1056 		/* initialize protocol header pointer */
1057 		skb->transport_header = skb->network_header + fragheaderlen;
1058 
1059 		skb->ip_summed = CHECKSUM_PARTIAL;
1060 		skb->csum = 0;
1061 		sk->sk_sndmsg_off = 0;
1062 	}
1063 
1064 	err = skb_append_datato_frags(sk, skb, getfrag, from,
1065 				      (length - transhdrlen));
1066 	if (!err) {
1067 		struct frag_hdr fhdr;
1068 
1069 		/* Specify the length of each IPv6 datagram fragment.
1070 		 * It has to be a multiple of 8.
1071 		 */
1072 		skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
1073 					     sizeof(struct frag_hdr)) & ~7;
1074 		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
1075 		ipv6_select_ident(&fhdr);
1076 		skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
1077 		__skb_queue_tail(&sk->sk_write_queue, skb);
1078 
1079 		return 0;
1080 	}
1081 	/* There is not enough support to do UDP LSO,
1082 	 * so follow the normal path.
1083 	 */
1084 	kfree_skb(skb);
1085 
1086 	return err;
1087 }
1088 
1089 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1090 					       gfp_t gfp)
1091 {
1092 	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1093 }
1094 
1095 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1096 						gfp_t gfp)
1097 {
1098 	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1099 }
1100 
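/*
 * Fragment-size bookkeeping used throughout ip6_append_data() below, as a
 * worked example with mtu = 1500 and fragheaderlen = 40:
 *
 *	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen
 *		     - sizeof(struct frag_hdr);
 *		   = (1460 & ~7) + 40 - 8 = 1456 + 32 = 1488
 *
 * i.e. each skb queued on sk_write_queue may grow to 1488 bytes of IPv6
 * header plus payload, leaving room for the 8-byte fragment header that
 * ip6_fragment() may have to insert later while keeping fragment payloads
 * 8-byte aligned.
 */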
1101 int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
1102 	int offset, int len, int odd, struct sk_buff *skb),
1103 	void *from, int length, int transhdrlen,
1104 	int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi *fl,
1105 	struct rt6_info *rt, unsigned int flags, int dontfrag)
1106 {
1107 	struct inet_sock *inet = inet_sk(sk);
1108 	struct ipv6_pinfo *np = inet6_sk(sk);
1109 	struct sk_buff *skb;
1110 	unsigned int maxfraglen, fragheaderlen;
1111 	int exthdrlen;
1112 	int hh_len;
1113 	int mtu;
1114 	int copy;
1115 	int err;
1116 	int offset = 0;
1117 	int csummode = CHECKSUM_NONE;
1118 
1119 	if (flags&MSG_PROBE)
1120 		return 0;
1121 	if (skb_queue_empty(&sk->sk_write_queue)) {
1122 		/*
1123 		 * setup for corking
1124 		 */
1125 		if (opt) {
1126 			if (WARN_ON(np->cork.opt))
1127 				return -EINVAL;
1128 
1129 			np->cork.opt = kmalloc(opt->tot_len, sk->sk_allocation);
1130 			if (unlikely(np->cork.opt == NULL))
1131 				return -ENOBUFS;
1132 
1133 			np->cork.opt->tot_len = opt->tot_len;
1134 			np->cork.opt->opt_flen = opt->opt_flen;
1135 			np->cork.opt->opt_nflen = opt->opt_nflen;
1136 
1137 			np->cork.opt->dst0opt = ip6_opt_dup(opt->dst0opt,
1138 							    sk->sk_allocation);
1139 			if (opt->dst0opt && !np->cork.opt->dst0opt)
1140 				return -ENOBUFS;
1141 
1142 			np->cork.opt->dst1opt = ip6_opt_dup(opt->dst1opt,
1143 							    sk->sk_allocation);
1144 			if (opt->dst1opt && !np->cork.opt->dst1opt)
1145 				return -ENOBUFS;
1146 
1147 			np->cork.opt->hopopt = ip6_opt_dup(opt->hopopt,
1148 							   sk->sk_allocation);
1149 			if (opt->hopopt && !np->cork.opt->hopopt)
1150 				return -ENOBUFS;
1151 
1152 			np->cork.opt->srcrt = ip6_rthdr_dup(opt->srcrt,
1153 							    sk->sk_allocation);
1154 			if (opt->srcrt && !np->cork.opt->srcrt)
1155 				return -ENOBUFS;
1156 
1157 			/* need source address above --miyazawa */
1158 		}
1159 		dst_hold(&rt->dst);
1160 		inet->cork.dst = &rt->dst;
1161 		inet->cork.fl = *fl;
1162 		np->cork.hop_limit = hlimit;
1163 		np->cork.tclass = tclass;
1164 		mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
1165 		      rt->dst.dev->mtu : dst_mtu(rt->dst.path);
1166 		if (np->frag_size < mtu) {
1167 			if (np->frag_size)
1168 				mtu = np->frag_size;
1169 		}
1170 		inet->cork.fragsize = mtu;
1171 		if (dst_allfrag(rt->dst.path))
1172 			inet->cork.flags |= IPCORK_ALLFRAG;
1173 		inet->cork.length = 0;
1174 		sk->sk_sndmsg_page = NULL;
1175 		sk->sk_sndmsg_off = 0;
1176 		exthdrlen = rt->dst.header_len + (opt ? opt->opt_flen : 0) -
1177 			    rt->rt6i_nfheader_len;
1178 		length += exthdrlen;
1179 		transhdrlen += exthdrlen;
1180 	} else {
1181 		rt = (struct rt6_info *)inet->cork.dst;
1182 		fl = &inet->cork.fl;
1183 		opt = np->cork.opt;
1184 		transhdrlen = 0;
1185 		exthdrlen = 0;
1186 		mtu = inet->cork.fragsize;
1187 	}
1188 
1189 	hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1190 
1191 	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1192 			(opt ? opt->opt_nflen : 0);
1193 	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);
1194 
1195 	if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
1196 		if (inet->cork.length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
1197 			ipv6_local_error(sk, EMSGSIZE, fl, mtu-exthdrlen);
1198 			return -EMSGSIZE;
1199 		}
1200 	}
1201 
1202 	/*
1203 	 * Let's try using as much space as possible.
1204 	 * Use MTU if total length of the message fits into the MTU.
1205 	 * Otherwise, we need to reserve fragment header and
1206 	 * fragment alignment (= 8-15 octets, in total).
1207 	 *
1208 	 * Note that we may need to "move" the data from the tail
1209 	 * of the buffer to the new fragment when we split
1210 	 * the message.
1211 	 *
1212 	 * FIXME: It may be fragmented into multiple chunks
1213 	 *        at once if non-fragmentable extension headers
1214 	 *        are too large.
1215 	 * --yoshfuji
1216 	 */
1217 
1218 	inet->cork.length += length;
1219 	if (length > mtu) {
1220 		int proto = sk->sk_protocol;
1221 		if (dontfrag && (proto == IPPROTO_UDP || proto == IPPROTO_RAW)) {
1222 			ipv6_local_rxpmtu(sk, fl, mtu-exthdrlen);
1223 			return -EMSGSIZE;
1224 		}
1225 
1226 		if (proto == IPPROTO_UDP &&
1227 		    (rt->dst.dev->features & NETIF_F_UFO)) {
1228 
1229 			err = ip6_ufo_append_data(sk, getfrag, from, length,
1230 						  hh_len, fragheaderlen,
1231 						  transhdrlen, mtu, flags);
1232 			if (err)
1233 				goto error;
1234 			return 0;
1235 		}
1236 	}
1237 
1238 	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
1239 		goto alloc_new_skb;
1240 
1241 	while (length > 0) {
1242 		/* Check if the remaining data fits into current packet. */
1243 		copy = (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1244 		if (copy < length)
1245 			copy = maxfraglen - skb->len;
1246 
1247 		if (copy <= 0) {
1248 			char *data;
1249 			unsigned int datalen;
1250 			unsigned int fraglen;
1251 			unsigned int fraggap;
1252 			unsigned int alloclen;
1253 			struct sk_buff *skb_prev;
1254 alloc_new_skb:
1255 			skb_prev = skb;
1256 
1257 			/* There's no room in the current skb */
1258 			if (skb_prev)
1259 				fraggap = skb_prev->len - maxfraglen;
1260 			else
1261 				fraggap = 0;
1262 
1263 			/*
1264 			 * If remaining data exceeds the mtu,
1265 			 * we know we need more fragment(s).
1266 			 */
1267 			datalen = length + fraggap;
1268 			if (datalen > (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1269 				datalen = maxfraglen - fragheaderlen;
1270 
1271 			fraglen = datalen + fragheaderlen;
1272 			if ((flags & MSG_MORE) &&
1273 			    !(rt->dst.dev->features&NETIF_F_SG))
1274 				alloclen = mtu;
1275 			else
1276 				alloclen = datalen + fragheaderlen;
1277 
1278 			/*
1279 			 * The last fragment gets additional space at the tail.
1280 			 * Note: we overallocate on fragments with MSG_MORE
1281 			 * because we have no idea if we're the last one.
1282 			 */
1283 			if (datalen == length + fraggap)
1284 				alloclen += rt->dst.trailer_len;
1285 
1286 			/*
1287 			 * We just reserve space for fragment header.
1288 			 * Note: this may be overallocation if the message
1289 			 * (without MSG_MORE) fits into the MTU.
1290 			 */
1291 			alloclen += sizeof(struct frag_hdr);
1292 
1293 			if (transhdrlen) {
1294 				skb = sock_alloc_send_skb(sk,
1295 						alloclen + hh_len,
1296 						(flags & MSG_DONTWAIT), &err);
1297 			} else {
1298 				skb = NULL;
1299 				if (atomic_read(&sk->sk_wmem_alloc) <=
1300 				    2 * sk->sk_sndbuf)
1301 					skb = sock_wmalloc(sk,
1302 							   alloclen + hh_len, 1,
1303 							   sk->sk_allocation);
1304 				if (unlikely(skb == NULL))
1305 					err = -ENOBUFS;
1306 			}
1307 			if (skb == NULL)
1308 				goto error;
1309 			/*
1310 			 *	Fill in the control structures
1311 			 */
1312 			skb->ip_summed = csummode;
1313 			skb->csum = 0;
1314 			/* reserve for fragmentation */
1315 			skb_reserve(skb, hh_len+sizeof(struct frag_hdr));
1316 
1317 			/*
1318 			 *	Find where to start putting bytes
1319 			 */
1320 			data = skb_put(skb, fraglen);
1321 			skb_set_network_header(skb, exthdrlen);
1322 			data += fragheaderlen;
1323 			skb->transport_header = (skb->network_header +
1324 						 fragheaderlen);
1325 			if (fraggap) {
1326 				skb->csum = skb_copy_and_csum_bits(
1327 					skb_prev, maxfraglen,
1328 					data + transhdrlen, fraggap, 0);
1329 				skb_prev->csum = csum_sub(skb_prev->csum,
1330 							  skb->csum);
1331 				data += fraggap;
1332 				pskb_trim_unique(skb_prev, maxfraglen);
1333 			}
1334 			copy = datalen - transhdrlen - fraggap;
1335 			if (copy < 0) {
1336 				err = -EINVAL;
1337 				kfree_skb(skb);
1338 				goto error;
1339 			} else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
1340 				err = -EFAULT;
1341 				kfree_skb(skb);
1342 				goto error;
1343 			}
1344 
1345 			offset += copy;
1346 			length -= datalen - fraggap;
1347 			transhdrlen = 0;
1348 			exthdrlen = 0;
1349 			csummode = CHECKSUM_NONE;
1350 
1351 			/*
1352 			 * Put the packet on the pending queue
1353 			 */
1354 			__skb_queue_tail(&sk->sk_write_queue, skb);
1355 			continue;
1356 		}
1357 
1358 		if (copy > length)
1359 			copy = length;
1360 
1361 		if (!(rt->dst.dev->features&NETIF_F_SG)) {
1362 			unsigned int off;
1363 
1364 			off = skb->len;
1365 			if (getfrag(from, skb_put(skb, copy),
1366 						offset, copy, off, skb) < 0) {
1367 				__skb_trim(skb, off);
1368 				err = -EFAULT;
1369 				goto error;
1370 			}
1371 		} else {
1372 			int i = skb_shinfo(skb)->nr_frags;
1373 			skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
1374 			struct page *page = sk->sk_sndmsg_page;
1375 			int off = sk->sk_sndmsg_off;
1376 			unsigned int left;
1377 
1378 			if (page && (left = PAGE_SIZE - off) > 0) {
1379 				if (copy >= left)
1380 					copy = left;
1381 				if (page != frag->page) {
1382 					if (i == MAX_SKB_FRAGS) {
1383 						err = -EMSGSIZE;
1384 						goto error;
1385 					}
1386 					get_page(page);
1387 					skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
1388 					frag = &skb_shinfo(skb)->frags[i];
1389 				}
1390 			} else if (i < MAX_SKB_FRAGS) {
1391 				if (copy > PAGE_SIZE)
1392 					copy = PAGE_SIZE;
1393 				page = alloc_pages(sk->sk_allocation, 0);
1394 				if (page == NULL) {
1395 					err = -ENOMEM;
1396 					goto error;
1397 				}
1398 				sk->sk_sndmsg_page = page;
1399 				sk->sk_sndmsg_off = 0;
1400 
1401 				skb_fill_page_desc(skb, i, page, 0, 0);
1402 				frag = &skb_shinfo(skb)->frags[i];
1403 			} else {
1404 				err = -EMSGSIZE;
1405 				goto error;
1406 			}
1407 			if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
1408 				err = -EFAULT;
1409 				goto error;
1410 			}
1411 			sk->sk_sndmsg_off += copy;
1412 			frag->size += copy;
1413 			skb->len += copy;
1414 			skb->data_len += copy;
1415 			skb->truesize += copy;
1416 			atomic_add(copy, &sk->sk_wmem_alloc);
1417 		}
1418 		offset += copy;
1419 		length -= copy;
1420 	}
1421 	return 0;
1422 error:
1423 	inet->cork.length -= length;
1424 	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1425 	return err;
1426 }
1427 
1428 static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np)
1429 {
1430 	if (np->cork.opt) {
1431 		kfree(np->cork.opt->dst0opt);
1432 		kfree(np->cork.opt->dst1opt);
1433 		kfree(np->cork.opt->hopopt);
1434 		kfree(np->cork.opt->srcrt);
1435 		kfree(np->cork.opt);
1436 		np->cork.opt = NULL;
1437 	}
1438 
1439 	if (inet->cork.dst) {
1440 		dst_release(inet->cork.dst);
1441 		inet->cork.dst = NULL;
1442 		inet->cork.flags &= ~IPCORK_ALLFRAG;
1443 	}
1444 	memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
1445 }
1446 
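/*
 * ip6_push_pending_frames() below glues the corked queue back together:
 * the first skb on sk_write_queue becomes the head, every further skb is
 * chained onto its frag_list (with its duplicate IPv6/extension headers
 * pulled off), and only then is the final IPv6 header pushed and the
 * packet handed to ip6_local_out().  A rough sketch of the flow-label
 * word it builds, assuming cork.tclass = 0 and no flow label:
 *
 *	*(__be32 *)hdr = fl->fl6_flowlabel | htonl(0x60000000);	// version 6
 */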
1447 int ip6_push_pending_frames(struct sock *sk)
1448 {
1449 	struct sk_buff *skb, *tmp_skb;
1450 	struct sk_buff **tail_skb;
1451 	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1452 	struct inet_sock *inet = inet_sk(sk);
1453 	struct ipv6_pinfo *np = inet6_sk(sk);
1454 	struct net *net = sock_net(sk);
1455 	struct ipv6hdr *hdr;
1456 	struct ipv6_txoptions *opt = np->cork.opt;
1457 	struct rt6_info *rt = (struct rt6_info *)inet->cork.dst;
1458 	struct flowi *fl = &inet->cork.fl;
1459 	unsigned char proto = fl->proto;
1460 	int err = 0;
1461 
1462 	if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
1463 		goto out;
1464 	tail_skb = &(skb_shinfo(skb)->frag_list);
1465 
1466 	/* move skb->data to ip header from ext header */
1467 	if (skb->data < skb_network_header(skb))
1468 		__skb_pull(skb, skb_network_offset(skb));
1469 	while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
1470 		__skb_pull(tmp_skb, skb_network_header_len(skb));
1471 		*tail_skb = tmp_skb;
1472 		tail_skb = &(tmp_skb->next);
1473 		skb->len += tmp_skb->len;
1474 		skb->data_len += tmp_skb->len;
1475 		skb->truesize += tmp_skb->truesize;
1476 		tmp_skb->destructor = NULL;
1477 		tmp_skb->sk = NULL;
1478 	}
1479 
1480 	/* Allow local fragmentation. */
1481 	if (np->pmtudisc < IPV6_PMTUDISC_DO)
1482 		skb->local_df = 1;
1483 
1484 	ipv6_addr_copy(final_dst, &fl->fl6_dst);
1485 	__skb_pull(skb, skb_network_header_len(skb));
1486 	if (opt && opt->opt_flen)
1487 		ipv6_push_frag_opts(skb, opt, &proto);
1488 	if (opt && opt->opt_nflen)
1489 		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);
1490 
1491 	skb_push(skb, sizeof(struct ipv6hdr));
1492 	skb_reset_network_header(skb);
1493 	hdr = ipv6_hdr(skb);
1494 
1495 	*(__be32 *)hdr = fl->fl6_flowlabel |
1496 		     htonl(0x60000000 | ((int)np->cork.tclass << 20));
1497 
1498 	hdr->hop_limit = np->cork.hop_limit;
1499 	hdr->nexthdr = proto;
1500 	ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
1501 	ipv6_addr_copy(&hdr->daddr, final_dst);
1502 
1503 	skb->priority = sk->sk_priority;
1504 	skb->mark = sk->sk_mark;
1505 
1506 	skb_dst_set(skb, dst_clone(&rt->dst));
1507 	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1508 	if (proto == IPPROTO_ICMPV6) {
1509 		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1510 
1511 		ICMP6MSGOUT_INC_STATS_BH(net, idev, icmp6_hdr(skb)->icmp6_type);
1512 		ICMP6_INC_STATS_BH(net, idev, ICMP6_MIB_OUTMSGS);
1513 	}
1514 
1515 	err = ip6_local_out(skb);
1516 	if (err) {
1517 		if (err > 0)
1518 			err = net_xmit_errno(err);
1519 		if (err)
1520 			goto error;
1521 	}
1522 
1523 out:
1524 	ip6_cork_release(inet, np);
1525 	return err;
1526 error:
1527 	IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1528 	goto out;
1529 }
1530 
1531 void ip6_flush_pending_frames(struct sock *sk)
1532 {
1533 	struct sk_buff *skb;
1534 
1535 	while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
1536 		if (skb_dst(skb))
1537 			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1538 				      IPSTATS_MIB_OUTDISCARDS);
1539 		kfree_skb(skb);
1540 	}
1541 
1542 	ip6_cork_release(inet_sk(sk), inet6_sk(sk));
1543 }
1544