xref: /openbmc/linux/net/ipv6/ip6_output.c (revision 65cf840f)
1 /*
2  *	IPv6 output functions
3  *	Linux INET6 implementation
4  *
5  *	Authors:
6  *	Pedro Roque		<roque@di.fc.ul.pt>
7  *
8  *	Based on linux/net/ipv4/ip_output.c
9  *
10  *	This program is free software; you can redistribute it and/or
11  *      modify it under the terms of the GNU General Public License
12  *      as published by the Free Software Foundation; either version
13  *      2 of the License, or (at your option) any later version.
14  *
15  *	Changes:
16  *	A.N.Kuznetsov	:	arithmetic in fragmentation.
17  *				extension headers are implemented.
18  *				route changes now work.
19  *				ip6_forward does not confuse sniffers.
20  *				etc.
21  *
22  *      H. von Brand    :       Added missing #include <linux/string.h>
23  *	Imran Patel	: 	frag id should be in NBO
24  *      Kazunori MIYAZAWA @USAGI
25  *			:       add ip6_append_data and related functions
26  *				for datagram xmit
27  */
28 
29 #include <linux/errno.h>
30 #include <linux/kernel.h>
31 #include <linux/string.h>
32 #include <linux/socket.h>
33 #include <linux/net.h>
34 #include <linux/netdevice.h>
35 #include <linux/if_arp.h>
36 #include <linux/in6.h>
37 #include <linux/tcp.h>
38 #include <linux/route.h>
39 #include <linux/module.h>
40 #include <linux/slab.h>
41 
42 #include <linux/netfilter.h>
43 #include <linux/netfilter_ipv6.h>
44 
45 #include <net/sock.h>
46 #include <net/snmp.h>
47 
48 #include <net/ipv6.h>
49 #include <net/ndisc.h>
50 #include <net/protocol.h>
51 #include <net/ip6_route.h>
52 #include <net/addrconf.h>
53 #include <net/rawv6.h>
54 #include <net/icmp.h>
55 #include <net/xfrm.h>
56 #include <net/checksum.h>
57 #include <linux/mroute6.h>
58 
59 static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));
60 
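/* Fill in the IPv6 payload length and run the packet through the
 * NF_INET_LOCAL_OUT netfilter hook.  payload_len is a 16-bit field,
 * so a packet larger than IPV6_MAXPLEN (65535 bytes) is a jumbogram
 * and gets payload_len = 0, per RFC 2675.
 */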
61 int __ip6_local_out(struct sk_buff *skb)
62 {
63 	int len;
64 
65 	len = skb->len - sizeof(struct ipv6hdr);
66 	if (len > IPV6_MAXPLEN)
67 		len = 0;
68 	ipv6_hdr(skb)->payload_len = htons(len);
69 
70 	return nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
71 		       skb_dst(skb)->dev, dst_output);
72 }
73 
74 int ip6_local_out(struct sk_buff *skb)
75 {
76 	int err;
77 
78 	err = __ip6_local_out(skb);
79 	if (likely(err == 1))
80 		err = dst_output(skb);
81 
82 	return err;
83 }
84 EXPORT_SYMBOL_GPL(ip6_local_out);
85 
86 /* dev_loopback_xmit for use with netfilter. */
87 static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
88 {
89 	skb_reset_mac_header(newskb);
90 	__skb_pull(newskb, skb_network_offset(newskb));
91 	newskb->pkt_type = PACKET_LOOPBACK;
92 	newskb->ip_summed = CHECKSUM_UNNECESSARY;
93 	WARN_ON(!skb_dst(newskb));
94 
95 	netif_rx_ni(newskb);
96 	return 0;
97 }
98 
99 static int ip6_finish_output2(struct sk_buff *skb)
100 {
101 	struct dst_entry *dst = skb_dst(skb);
102 	struct net_device *dev = dst->dev;
103 
104 	skb->protocol = htons(ETH_P_IPV6);
105 	skb->dev = dev;
106 
107 	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
108 		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
109 
110 		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(skb->sk) &&
111 		    ((mroute6_socket(dev_net(dev), skb) &&
112 		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
113 		     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
114 					 &ipv6_hdr(skb)->saddr))) {
115 			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
116 
117 			/* Do not check for IFF_ALLMULTI; multicast routing
118 			   is not supported in any case.
119 			 */
120 			if (newskb)
121 				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
122 					newskb, NULL, newskb->dev,
123 					ip6_dev_loopback_xmit);
124 
125 			if (ipv6_hdr(skb)->hop_limit == 0) {
126 				IP6_INC_STATS(dev_net(dev), idev,
127 					      IPSTATS_MIB_OUTDISCARDS);
128 				kfree_skb(skb);
129 				return 0;
130 			}
131 		}
132 
133 		IP6_UPD_PO_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCAST,
134 				skb->len);
135 	}
136 
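	/* Use the cached hardware header if one exists; otherwise go
	 * through the neighbour's output function, which may still need
	 * to resolve the link-layer address.
	 */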
137 	if (dst->hh)
138 		return neigh_hh_output(dst->hh, skb);
139 	else if (dst->neighbour)
140 		return dst->neighbour->output(skb);
141 
142 	IP6_INC_STATS_BH(dev_net(dst->dev),
143 			 ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
144 	kfree_skb(skb);
145 	return -EINVAL;
146 }
147 
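/* With IPV6_PMTUDISC_PROBE the socket is probing the path MTU itself,
 * so use the device MTU instead of the (possibly smaller) MTU cached
 * on the route.
 */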
148 static inline int ip6_skb_dst_mtu(struct sk_buff *skb)
149 {
150 	struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
151 
152 	return (np && np->pmtudisc == IPV6_PMTUDISC_PROBE) ?
153 	       skb_dst(skb)->dev->mtu : dst_mtu(skb_dst(skb));
154 }
155 
156 static int ip6_finish_output(struct sk_buff *skb)
157 {
158 	if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
159 	    dst_allfrag(skb_dst(skb)))
160 		return ip6_fragment(skb, ip6_finish_output2);
161 	else
162 		return ip6_finish_output2(skb);
163 }
164 
165 int ip6_output(struct sk_buff *skb)
166 {
167 	struct net_device *dev = skb_dst(skb)->dev;
168 	struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
169 	if (unlikely(idev->cnf.disable_ipv6)) {
170 		IP6_INC_STATS(dev_net(dev), idev,
171 			      IPSTATS_MIB_OUTDISCARDS);
172 		kfree_skb(skb);
173 		return 0;
174 	}
175 
176 	return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, skb, NULL, dev,
177 			    ip6_finish_output,
178 			    !(IP6CB(skb)->flags & IP6SKB_REROUTED));
179 }
180 
181 /*
182  *	xmit an sk_buff (used by TCP, SCTP and DCCP)
183  */
184 
185 int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
186 	     struct ipv6_txoptions *opt)
187 {
188 	struct net *net = sock_net(sk);
189 	struct ipv6_pinfo *np = inet6_sk(sk);
190 	struct in6_addr *first_hop = &fl->fl6_dst;
191 	struct dst_entry *dst = skb_dst(skb);
192 	struct ipv6hdr *hdr;
193 	u8  proto = fl->proto;
194 	int seg_len = skb->len;
195 	int hlimit = -1;
196 	int tclass = 0;
197 	u32 mtu;
198 
199 	if (opt) {
200 		unsigned int head_room;
201 
202 		/* First: exthdrs may take lots of space (~8K for now);
203 		   MAX_HEADER is not enough.
204 		 */
205 		head_room = opt->opt_nflen + opt->opt_flen;
206 		seg_len += head_room;
207 		head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
208 
209 		if (skb_headroom(skb) < head_room) {
210 			struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
211 			if (skb2 == NULL) {
212 				IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
213 					      IPSTATS_MIB_OUTDISCARDS);
214 				kfree_skb(skb);
215 				return -ENOBUFS;
216 			}
217 			kfree_skb(skb);
218 			skb = skb2;
219 			skb_set_owner_w(skb, sk);
220 		}
221 		if (opt->opt_flen)
222 			ipv6_push_frag_opts(skb, opt, &proto);
223 		if (opt->opt_nflen)
224 			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
225 	}
226 
227 	skb_push(skb, sizeof(struct ipv6hdr));
228 	skb_reset_network_header(skb);
229 	hdr = ipv6_hdr(skb);
230 
231 	/*
232 	 *	Fill in the IPv6 header
233 	 */
234 	if (np) {
235 		tclass = np->tclass;
236 		hlimit = np->hop_limit;
237 	}
238 	if (hlimit < 0)
239 		hlimit = ip6_dst_hoplimit(dst);
240 
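	/* The first 4 bytes of the header are version (4 bits), traffic
	 * class (8 bits) and flow label (20 bits); e.g. tclass == 0x10
	 * yields htonl(0x61000000) before the (already network-order)
	 * flow label is OR'ed in.
	 */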
241 	*(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl->fl6_flowlabel;
242 
243 	hdr->payload_len = htons(seg_len);
244 	hdr->nexthdr = proto;
245 	hdr->hop_limit = hlimit;
246 
247 	ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
248 	ipv6_addr_copy(&hdr->daddr, first_hop);
249 
250 	skb->priority = sk->sk_priority;
251 	skb->mark = sk->sk_mark;
252 
253 	mtu = dst_mtu(dst);
254 	if ((skb->len <= mtu) || skb->local_df || skb_is_gso(skb)) {
255 		IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
256 			      IPSTATS_MIB_OUT, skb->len);
257 		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
258 			       dst->dev, dst_output);
259 	}
260 
261 	if (net_ratelimit())
262 		printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
263 	skb->dev = dst->dev;
264 	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
265 	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
266 	kfree_skb(skb);
267 	return -EMSGSIZE;
268 }
269 
270 EXPORT_SYMBOL(ip6_xmit);
271 
272 /*
273  *	To avoid extra problems ND packets are sent through this
274  *	routine. It's code duplication but I really want to avoid
275  *	extra checks since ipv6_build_header is used by TCP (which
276  *	is performance critical for us)
277  */
278 
279 int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
280 	       const struct in6_addr *saddr, const struct in6_addr *daddr,
281 	       int proto, int len)
282 {
283 	struct ipv6_pinfo *np = inet6_sk(sk);
284 	struct ipv6hdr *hdr;
285 	int totlen;
286 
287 	skb->protocol = htons(ETH_P_IPV6);
288 	skb->dev = dev;
289 
290 	totlen = len + sizeof(struct ipv6hdr);
291 
292 	skb_reset_network_header(skb);
293 	skb_put(skb, sizeof(struct ipv6hdr));
294 	hdr = ipv6_hdr(skb);
295 
296 	*(__be32 *)hdr = htonl(0x60000000);
297 
298 	hdr->payload_len = htons(len);
299 	hdr->nexthdr = proto;
300 	hdr->hop_limit = np->hop_limit;
301 
302 	ipv6_addr_copy(&hdr->saddr, saddr);
303 	ipv6_addr_copy(&hdr->daddr, daddr);
304 
305 	return 0;
306 }
307 
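/* Deliver the packet to every raw socket that registered (via the
 * IPV6_ROUTER_ALERT option) for router-alert value 'sel'.  The skb is
 * cloned for all matching sockets but the last, which consumes the
 * original.  Returns 1 if the packet was consumed, 0 otherwise.
 */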
308 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
309 {
310 	struct ip6_ra_chain *ra;
311 	struct sock *last = NULL;
312 
313 	read_lock(&ip6_ra_lock);
314 	for (ra = ip6_ra_chain; ra; ra = ra->next) {
315 		struct sock *sk = ra->sk;
316 		if (sk && ra->sel == sel &&
317 		    (!sk->sk_bound_dev_if ||
318 		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
319 			if (last) {
320 				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
321 				if (skb2)
322 					rawv6_rcv(last, skb2);
323 			}
324 			last = sk;
325 		}
326 	}
327 
328 	if (last) {
329 		rawv6_rcv(last, skb);
330 		read_unlock(&ip6_ra_lock);
331 		return 1;
332 	}
333 	read_unlock(&ip6_ra_lock);
334 	return 0;
335 }
336 
337 static int ip6_forward_proxy_check(struct sk_buff *skb)
338 {
339 	struct ipv6hdr *hdr = ipv6_hdr(skb);
340 	u8 nexthdr = hdr->nexthdr;
341 	int offset;
342 
343 	if (ipv6_ext_hdr(nexthdr)) {
344 		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr);
345 		if (offset < 0)
346 			return 0;
347 	} else
348 		offset = sizeof(struct ipv6hdr);
349 
350 	if (nexthdr == IPPROTO_ICMPV6) {
351 		struct icmp6hdr *icmp6;
352 
353 		if (!pskb_may_pull(skb, (skb_network_header(skb) +
354 					 offset + 1 - skb->data)))
355 			return 0;
356 
357 		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
358 
359 		switch (icmp6->icmp6_type) {
360 		case NDISC_ROUTER_SOLICITATION:
361 		case NDISC_ROUTER_ADVERTISEMENT:
362 		case NDISC_NEIGHBOUR_SOLICITATION:
363 		case NDISC_NEIGHBOUR_ADVERTISEMENT:
364 		case NDISC_REDIRECT:
365 			/* For unicast neighbor discovery messages destined
366 			 * to the proxied address, pass them to the input
367 			 * function.
368 			 */
369 			return 1;
370 		default:
371 			break;
372 		}
373 	}
374 
375 	/*
376 	 * The proxying router can't forward traffic sent to a link-local
377 	 * address, so signal the sender and discard the packet. This
378 	 * behavior is clarified by the MIPv6 specification.
379 	 */
380 	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
381 		dst_link_failure(skb);
382 		return -1;
383 	}
384 
385 	return 0;
386 }
387 
388 static inline int ip6_forward_finish(struct sk_buff *skb)
389 {
390 	return dst_output(skb);
391 }
392 
393 int ip6_forward(struct sk_buff *skb)
394 {
395 	struct dst_entry *dst = skb_dst(skb);
396 	struct ipv6hdr *hdr = ipv6_hdr(skb);
397 	struct inet6_skb_parm *opt = IP6CB(skb);
398 	struct net *net = dev_net(dst->dev);
399 	u32 mtu;
400 
401 	if (net->ipv6.devconf_all->forwarding == 0)
402 		goto error;
403 
404 	if (skb_warn_if_lro(skb))
405 		goto drop;
406 
407 	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
408 		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
409 		goto drop;
410 	}
411 
412 	skb_forward_csum(skb);
413 
414 	/*
415 	 *	We do NOT do any processing on
416 	 *	RA packets; we push them to user level AS IS,
417 	 *	without any guarantee that the application will be able
418 	 *	to interpret them. The reason is that we
419 	 *	cannot do anything clever here.
420 	 *
421 	 *	We are not an end node, so if the packet contains
422 	 *	AH/ESP we cannot do anything with it.
423 	 *	Defragmentation would also be a mistake; RA packets
424 	 *	cannot be fragmented, because there is no guarantee
425 	 *	that different fragments will go along one path. --ANK
426 	 */
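	/* opt->ra is the offset of the Router Alert option inside the
	 * hop-by-hop header; bytes 2 and 3 of the option carry the
	 * 16-bit alert value used to select the interested sockets.
	 */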
427 	if (opt->ra) {
428 		u8 *ptr = skb_network_header(skb) + opt->ra;
429 		if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
430 			return 0;
431 	}
432 
433 	/*
434 	 *	check and decrement ttl
435 	 */
436 	if (hdr->hop_limit <= 1) {
437 		/* Force the output device to be used as the source-address device */
438 		skb->dev = dst->dev;
439 		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
440 		IP6_INC_STATS_BH(net,
441 				 ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);
442 
443 		kfree_skb(skb);
444 		return -ETIMEDOUT;
445 	}
446 
447 	/* XXX: idev->cnf.proxy_ndp? */
448 	if (net->ipv6.devconf_all->proxy_ndp &&
449 	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
450 		int proxied = ip6_forward_proxy_check(skb);
451 		if (proxied > 0)
452 			return ip6_input(skb);
453 		else if (proxied < 0) {
454 			IP6_INC_STATS(net, ip6_dst_idev(dst),
455 				      IPSTATS_MIB_INDISCARDS);
456 			goto drop;
457 		}
458 	}
459 
460 	if (!xfrm6_route_forward(skb)) {
461 		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
462 		goto drop;
463 	}
464 	dst = skb_dst(skb);
465 
466 	/* IPv6 specs say nothing about it, but it is clear that we cannot
467 	   send redirects to source routed frames.
468 	   We don't send redirects to frames decapsulated from IPsec.
469 	 */
470 	if (skb->dev == dst->dev && dst->neighbour && opt->srcrt == 0 &&
471 	    !skb_sec_path(skb)) {
472 		struct in6_addr *target = NULL;
473 		struct rt6_info *rt;
474 		struct neighbour *n = dst->neighbour;
475 
476 		/*
477 		 *	incoming and outgoing devices are the same;
478 		 *	send a redirect.
479 		 */
480 
481 		rt = (struct rt6_info *) dst;
482 		if ((rt->rt6i_flags & RTF_GATEWAY))
483 			target = (struct in6_addr*)&n->primary_key;
484 		else
485 			target = &hdr->daddr;
486 
487 		/* Limit redirects both by destination (here)
488 		   and by source (inside ndisc_send_redirect)
489 		 */
490 		if (xrlim_allow(dst, 1*HZ))
491 			ndisc_send_redirect(skb, n, target);
492 	} else {
493 		int addrtype = ipv6_addr_type(&hdr->saddr);
494 
495 		/* This check is security critical. */
496 		if (addrtype == IPV6_ADDR_ANY ||
497 		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
498 			goto error;
499 		if (addrtype & IPV6_ADDR_LINKLOCAL) {
500 			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
501 				    ICMPV6_NOT_NEIGHBOUR, 0);
502 			goto error;
503 		}
504 	}
505 
506 	mtu = dst_mtu(dst);
507 	if (mtu < IPV6_MIN_MTU)
508 		mtu = IPV6_MIN_MTU;
509 
510 	if (skb->len > mtu && !skb_is_gso(skb)) {
511 		/* Again, force the output device to be used as the source-address device */
512 		skb->dev = dst->dev;
513 		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
514 		IP6_INC_STATS_BH(net,
515 				 ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS);
516 		IP6_INC_STATS_BH(net,
517 				 ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS);
518 		kfree_skb(skb);
519 		return -EMSGSIZE;
520 	}
521 
522 	if (skb_cow(skb, dst->dev->hard_header_len)) {
523 		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
524 		goto drop;
525 	}
526 
527 	hdr = ipv6_hdr(skb);
528 
529 	/* Mangling the hop limit is delayed until after the skb COW */
530 
531 	hdr->hop_limit--;
532 
533 	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
534 	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
535 		       ip6_forward_finish);
536 
537 error:
538 	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
539 drop:
540 	kfree_skb(skb);
541 	return -EINVAL;
542 }
543 
544 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
545 {
546 	to->pkt_type = from->pkt_type;
547 	to->priority = from->priority;
548 	to->protocol = from->protocol;
549 	skb_dst_drop(to);
550 	skb_dst_set(to, dst_clone(skb_dst(from)));
551 	to->dev = from->dev;
552 	to->mark = from->mark;
553 
554 #ifdef CONFIG_NET_SCHED
555 	to->tc_index = from->tc_index;
556 #endif
557 	nf_copy(to, from);
558 #if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
559     defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
560 	to->nf_trace = from->nf_trace;
561 #endif
562 	skb_copy_secmark(to, from);
563 }
564 
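/* Walk the extension header chain to find where a Fragment header has
 * to be inserted: after any hop-by-hop and routing headers (and, with
 * MIPv6, a destination options header carrying a Home Address option),
 * which all belong to the unfragmentable part.  Returns that offset and
 * points *nexthdr at the nexthdr byte that must be patched.
 */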
565 int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
566 {
567 	u16 offset = sizeof(struct ipv6hdr);
568 	struct ipv6_opt_hdr *exthdr =
569 				(struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1);
570 	unsigned int packet_len = skb->tail - skb->network_header;
571 	int found_rhdr = 0;
572 	*nexthdr = &ipv6_hdr(skb)->nexthdr;
573 
574 	while (offset + 1 <= packet_len) {
575 
576 		switch (**nexthdr) {
577 
578 		case NEXTHDR_HOP:
579 			break;
580 		case NEXTHDR_ROUTING:
581 			found_rhdr = 1;
582 			break;
583 		case NEXTHDR_DEST:
584 #if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
585 			if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0)
586 				break;
587 #endif
588 			if (found_rhdr)
589 				return offset;
590 			break;
591 		default:
592 			return offset;
593 		}
594 
595 		offset += ipv6_optlen(exthdr);
596 		*nexthdr = &exthdr->nexthdr;
597 		exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) +
598 						 offset);
599 	}
600 
601 	return offset;
602 }
603 
604 static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
605 {
606 	struct sk_buff *frag;
607 	struct rt6_info *rt = (struct rt6_info*)skb_dst(skb);
608 	struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
609 	struct ipv6hdr *tmp_hdr;
610 	struct frag_hdr *fh;
611 	unsigned int mtu, hlen, left, len;
612 	__be32 frag_id = 0;
613 	int ptr, offset = 0, err = 0;
614 	u8 *prevhdr, nexthdr = 0;
615 	struct net *net = dev_net(skb_dst(skb)->dev);
616 
617 	hlen = ip6_find_1stfragopt(skb, &prevhdr);
618 	nexthdr = *prevhdr;
619 
620 	mtu = ip6_skb_dst_mtu(skb);
621 
622 	/* We must not fragment if the socket is set to force MTU discovery
623 	 * or if the skb was not generated by a local socket.
624 	 */
625 	if (!skb->local_df && skb->len > mtu) {
626 		skb->dev = skb_dst(skb)->dev;
627 		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
628 		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
629 			      IPSTATS_MIB_FRAGFAILS);
630 		kfree_skb(skb);
631 		return -EMSGSIZE;
632 	}
633 
634 	if (np && np->frag_size < mtu) {
635 		if (np->frag_size)
636 			mtu = np->frag_size;
637 	}
638 	mtu -= hlen + sizeof(struct frag_hdr);
639 
640 	if (skb_has_frags(skb)) {
641 		int first_len = skb_pagelen(skb);
642 		int truesizes = 0;
643 
644 		if (first_len - hlen > mtu ||
645 		    ((first_len - hlen) & 7) ||
646 		    skb_cloned(skb))
647 			goto slow_path;
648 
649 		skb_walk_frags(skb, frag) {
650 			/* Check for correct geometry. */
651 			if (frag->len > mtu ||
652 			    ((frag->len & 7) && frag->next) ||
653 			    skb_headroom(frag) < hlen)
654 				goto slow_path;
655 
656 			/* Partially cloned skb? */
657 			if (skb_shared(frag))
658 				goto slow_path;
659 
660 			BUG_ON(frag->sk);
661 			if (skb->sk) {
662 				frag->sk = skb->sk;
663 				frag->destructor = sock_wfree;
664 				truesizes += frag->truesize;
665 			}
666 		}
667 
668 		err = 0;
669 		offset = 0;
670 		frag = skb_shinfo(skb)->frag_list;
671 		skb_frag_list_init(skb);
672 		/* BUILD HEADER */
673 
674 		*prevhdr = NEXTHDR_FRAGMENT;
675 		tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
676 		if (!tmp_hdr) {
677 			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
678 				      IPSTATS_MIB_FRAGFAILS);
679 			return -ENOMEM;
680 		}
681 
682 		__skb_pull(skb, hlen);
683 		fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr));
684 		__skb_push(skb, hlen);
685 		skb_reset_network_header(skb);
686 		memcpy(skb_network_header(skb), tmp_hdr, hlen);
687 
688 		ipv6_select_ident(fh);
689 		fh->nexthdr = nexthdr;
690 		fh->reserved = 0;
691 		fh->frag_off = htons(IP6_MF);
692 		frag_id = fh->identification;
693 
694 		first_len = skb_pagelen(skb);
695 		skb->data_len = first_len - skb_headlen(skb);
696 		skb->truesize -= truesizes;
697 		skb->len = first_len;
698 		ipv6_hdr(skb)->payload_len = htons(first_len -
699 						   sizeof(struct ipv6hdr));
700 
701 		dst_hold(&rt->u.dst);
702 
703 		for (;;) {
704 			/* Prepare the header of the next frame
705 			 * before the previous one goes down. */
706 			if (frag) {
707 				frag->ip_summed = CHECKSUM_NONE;
708 				skb_reset_transport_header(frag);
709 				fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr));
710 				__skb_push(frag, hlen);
711 				skb_reset_network_header(frag);
712 				memcpy(skb_network_header(frag), tmp_hdr,
713 				       hlen);
714 				offset += skb->len - hlen - sizeof(struct frag_hdr);
715 				fh->nexthdr = nexthdr;
716 				fh->reserved = 0;
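				/* offset is always a multiple of 8, so
				 * its low three bits are free for the
				 * flags; on the wire the high 13 bits
				 * hold the offset in 8-octet units and
				 * IP6_MF is the lowest bit.
				 */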
717 				fh->frag_off = htons(offset);
718 				if (frag->next != NULL)
719 					fh->frag_off |= htons(IP6_MF);
720 				fh->identification = frag_id;
721 				ipv6_hdr(frag)->payload_len =
722 						htons(frag->len -
723 						      sizeof(struct ipv6hdr));
724 				ip6_copy_metadata(frag, skb);
725 			}
726 
727 			err = output(skb);
728 			if (!err)
729 				IP6_INC_STATS(net, ip6_dst_idev(&rt->u.dst),
730 					      IPSTATS_MIB_FRAGCREATES);
731 
732 			if (err || !frag)
733 				break;
734 
735 			skb = frag;
736 			frag = skb->next;
737 			skb->next = NULL;
738 		}
739 
740 		kfree(tmp_hdr);
741 
742 		if (err == 0) {
743 			IP6_INC_STATS(net, ip6_dst_idev(&rt->u.dst),
744 				      IPSTATS_MIB_FRAGOKS);
745 			dst_release(&rt->u.dst);
746 			return 0;
747 		}
748 
749 		while (frag) {
750 			skb = frag->next;
751 			kfree_skb(frag);
752 			frag = skb;
753 		}
754 
755 		IP6_INC_STATS(net, ip6_dst_idev(&rt->u.dst),
756 			      IPSTATS_MIB_FRAGFAILS);
757 		dst_release(&rt->u.dst);
758 		return err;
759 	}
760 
761 slow_path:
762 	left = skb->len - hlen;		/* Space per frame */
763 	ptr = hlen;			/* Where to start from */
764 
765 	/*
766 	 *	Fragment the datagram.
767 	 */
768 
769 	*prevhdr = NEXTHDR_FRAGMENT;
770 
771 	/*
772 	 *	Keep copying data until we run out.
773 	 */
774 	while (left > 0) {
775 		len = left;
776 		/* IF: it doesn't fit, use 'mtu' - the data space left */
777 		if (len > mtu)
778 			len = mtu;
779 		/* IF: we are not sending up to and including the packet end
780 		   then align the next start on an eight byte boundary */
781 		if (len < left)	{
782 			len &= ~7;
783 		}
784 		/*
785 		 *	Allocate buffer.
786 		 */
787 
788 		if ((frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) + LL_ALLOCATED_SPACE(rt->u.dst.dev), GFP_ATOMIC)) == NULL) {
789 			NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
790 			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
791 				      IPSTATS_MIB_FRAGFAILS);
792 			err = -ENOMEM;
793 			goto fail;
794 		}
795 
796 		/*
797 		 *	Set up data on packet
798 		 */
799 
800 		ip6_copy_metadata(frag, skb);
801 		skb_reserve(frag, LL_RESERVED_SPACE(rt->u.dst.dev));
802 		skb_put(frag, len + hlen + sizeof(struct frag_hdr));
803 		skb_reset_network_header(frag);
804 		fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
805 		frag->transport_header = (frag->network_header + hlen +
806 					  sizeof(struct frag_hdr));
807 
808 		/*
809 		 *	Charge the memory for the fragment to any owner
810 		 *	it might possess
811 		 */
812 		if (skb->sk)
813 			skb_set_owner_w(frag, skb->sk);
814 
815 		/*
816 		 *	Copy the packet header into the new buffer.
817 		 */
818 		skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
819 
820 		/*
821 		 *	Build fragment header.
822 		 */
823 		fh->nexthdr = nexthdr;
824 		fh->reserved = 0;
825 		if (!frag_id) {
826 			ipv6_select_ident(fh);
827 			frag_id = fh->identification;
828 		} else
829 			fh->identification = frag_id;
830 
831 		/*
832 		 *	Copy a block of the IP datagram.
833 		 */
834 		if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len))
835 			BUG();
836 		left -= len;
837 
838 		fh->frag_off = htons(offset);
839 		if (left > 0)
840 			fh->frag_off |= htons(IP6_MF);
841 		ipv6_hdr(frag)->payload_len = htons(frag->len -
842 						    sizeof(struct ipv6hdr));
843 
844 		ptr += len;
845 		offset += len;
846 
847 		/*
848 		 *	Put this fragment into the sending queue.
849 		 */
850 		err = output(frag);
851 		if (err)
852 			goto fail;
853 
854 		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
855 			      IPSTATS_MIB_FRAGCREATES);
856 	}
857 	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
858 		      IPSTATS_MIB_FRAGOKS);
859 	kfree_skb(skb);
860 	return err;
861 
862 fail:
863 	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
864 		      IPSTATS_MIB_FRAGFAILS);
865 	kfree_skb(skb);
866 	return err;
867 }
868 
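/* Return true when the cached route can NOT be validated against
 * fl_addr: it is neither a matching host route (plen == 128 with an
 * equal key) nor equal to the socket's cached peer address.
 */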
869 static inline int ip6_rt_check(struct rt6key *rt_key,
870 			       struct in6_addr *fl_addr,
871 			       struct in6_addr *addr_cache)
872 {
873 	return ((rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
874 		(addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache)));
875 }
876 
877 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
878 					  struct dst_entry *dst,
879 					  struct flowi *fl)
880 {
881 	struct ipv6_pinfo *np = inet6_sk(sk);
882 	struct rt6_info *rt = (struct rt6_info *)dst;
883 
884 	if (!dst)
885 		goto out;
886 
887 	/* Yes, checking route validity in the unconnected
888 	 * case is not very simple. Take into account
889 	 * that we do not support routing by source, TOS,
890 	 * or MSG_DONTROUTE		--ANK (980726)
891 	 *
892 	 * 1. ip6_rt_check(): If the route was a host route,
893 	 *    check that the cached destination is current.
894 	 *    If it is a network route, we still may
895 	 *    check its validity using a saved pointer
896 	 *    to the last used address: daddr_cache.
897 	 *    We do not want to save the whole address now
898 	 *    (because the main consumer of this service
899 	 *    is TCP, which does not have this problem),
900 	 *    so this last trick works only on connected
901 	 *    sockets.
902 	 * 2. oif also should be the same.
903 	 */
904 	if (ip6_rt_check(&rt->rt6i_dst, &fl->fl6_dst, np->daddr_cache) ||
905 #ifdef CONFIG_IPV6_SUBTREES
906 	    ip6_rt_check(&rt->rt6i_src, &fl->fl6_src, np->saddr_cache) ||
907 #endif
908 	    (fl->oif && fl->oif != dst->dev->ifindex)) {
909 		dst_release(dst);
910 		dst = NULL;
911 	}
912 
913 out:
914 	return dst;
915 }
916 
917 static int ip6_dst_lookup_tail(struct sock *sk,
918 			       struct dst_entry **dst, struct flowi *fl)
919 {
920 	int err;
921 	struct net *net = sock_net(sk);
922 
923 	if (*dst == NULL)
924 		*dst = ip6_route_output(net, sk, fl);
925 
926 	if ((err = (*dst)->error))
927 		goto out_err_release;
928 
929 	if (ipv6_addr_any(&fl->fl6_src)) {
930 		err = ipv6_dev_get_saddr(net, ip6_dst_idev(*dst)->dev,
931 					 &fl->fl6_dst,
932 					 sk ? inet6_sk(sk)->srcprefs : 0,
933 					 &fl->fl6_src);
934 		if (err)
935 			goto out_err_release;
936 	}
937 
938 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
939 	/*
940 	 * If the dst entry we've looked up
941 	 * has a neighbour entry that is in the INCOMPLETE
942 	 * state and the src address from the flow is
943 	 * marked as OPTIMISTIC, we release the found
944 	 * dst entry and replace it with the
945 	 * dst entry of the nexthop router.
946 	 */
947 	if ((*dst)->neighbour && !((*dst)->neighbour->nud_state & NUD_VALID)) {
948 		struct inet6_ifaddr *ifp;
949 		struct flowi fl_gw;
950 		int redirect;
951 
952 		ifp = ipv6_get_ifaddr(net, &fl->fl6_src,
953 				      (*dst)->dev, 1);
954 
955 		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
956 		if (ifp)
957 			in6_ifa_put(ifp);
958 
959 		if (redirect) {
960 			/*
961 			 * We need to get the dst entry for the
962 			 * default router instead
963 			 */
964 			dst_release(*dst);
965 			memcpy(&fl_gw, fl, sizeof(struct flowi));
966 			memset(&fl_gw.fl6_dst, 0, sizeof(struct in6_addr));
967 			*dst = ip6_route_output(net, sk, &fl_gw);
968 			if ((err = (*dst)->error))
969 				goto out_err_release;
970 		}
971 	}
972 #endif
973 
974 	return 0;
975 
976 out_err_release:
977 	if (err == -ENETUNREACH)
978 		IP6_INC_STATS_BH(net, NULL, IPSTATS_MIB_OUTNOROUTES);
979 	dst_release(*dst);
980 	*dst = NULL;
981 	return err;
982 }
983 
984 /**
985  *	ip6_dst_lookup - perform route lookup on flow
986  *	@sk: socket which provides route info
987  *	@dst: pointer to dst_entry * for result
988  *	@fl: flow to lookup
989  *
990  *	This function performs a route lookup on the given flow.
991  *
992  *	It returns zero on success, or a standard errno code on error.
993  */
994 int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
995 {
996 	*dst = NULL;
997 	return ip6_dst_lookup_tail(sk, dst, fl);
998 }
999 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
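/* A typical caller (sketch) fills in a flowi and releases the dst
 * when done:
 *
 *	struct dst_entry *dst;
 *	int err = ip6_dst_lookup(sk, &dst, &fl);
 *
 *	if (err)
 *		return err;
 *	...
 *	dst_release(dst);
 */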
1000 
1001 /**
1002  *	ip6_sk_dst_lookup - perform socket cached route lookup on flow
1003  *	@sk: socket which provides the dst cache and route info
1004  *	@dst: pointer to dst_entry * for result
1005  *	@fl: flow to lookup
1006  *
1007  *	This function performs a route lookup on the given flow with the
1008  *	possibility of using the cached route in the socket if it is valid.
1009  *	It will take the socket dst lock when operating on the dst cache.
1010  *	As a result, this function can only be used in process context.
1011  *
1012  *	It returns zero on success, or a standard errno code on error.
1013  */
1014 int ip6_sk_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
1015 {
1016 	*dst = NULL;
1017 	if (sk) {
1018 		*dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1019 		*dst = ip6_sk_dst_check(sk, *dst, fl);
1020 	}
1021 
1022 	return ip6_dst_lookup_tail(sk, dst, fl);
1023 }
1024 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup);
1025 
1026 static inline int ip6_ufo_append_data(struct sock *sk,
1027 			int getfrag(void *from, char *to, int offset, int len,
1028 			int odd, struct sk_buff *skb),
1029 			void *from, int length, int hh_len, int fragheaderlen,
1030 			int transhdrlen, int mtu, unsigned int flags)
1031 
1032 {
1033 	struct sk_buff *skb;
1034 	int err;
1035 
1036 	/* The network device supports UDP large send offload, so
1037 	 * create one single skb containing the complete
1038 	 * UDP datagram.
1039 	 */
1040 	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
1041 		skb = sock_alloc_send_skb(sk,
1042 			hh_len + fragheaderlen + transhdrlen + 20,
1043 			(flags & MSG_DONTWAIT), &err);
1044 		if (skb == NULL)
1045 			return -ENOMEM;
1046 
1047 		/* reserve space for Hardware header */
1048 		skb_reserve(skb, hh_len);
1049 
1050 		/* create space for UDP/IP header */
1051 		skb_put(skb, fragheaderlen + transhdrlen);
1052 
1053 		/* initialize network header pointer */
1054 		skb_reset_network_header(skb);
1055 
1056 		/* initialize protocol header pointer */
1057 		skb->transport_header = skb->network_header + fragheaderlen;
1058 
1059 		skb->ip_summed = CHECKSUM_PARTIAL;
1060 		skb->csum = 0;
1061 		sk->sk_sndmsg_off = 0;
1062 	}
1063 
1064 	err = skb_append_datato_frags(sk, skb, getfrag, from,
1065 				      (length - transhdrlen));
1066 	if (!err) {
1067 		struct frag_hdr fhdr;
1068 
1069 		/* Specify the length of each IPv6 datagram fragment.
1070 		 * It has to be a multiple of 8.
1071 		 */
1072 		skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
1073 					     sizeof(struct frag_hdr)) & ~7;
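		/* e.g. an MTU of 1500 with a bare IPv6 header
		 * (fragheaderlen == 40) gives (1500 - 40 - 8) & ~7
		 * = 1448 bytes of payload per fragment.
		 */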
1074 		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
1075 		ipv6_select_ident(&fhdr);
1076 		skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
1077 		__skb_queue_tail(&sk->sk_write_queue, skb);
1078 
1079 		return 0;
1080 	}
1081 	/* There is not enough support to do UDP LSO,
1082 	 * so follow the normal path.
1083 	 */
1084 	kfree_skb(skb);
1085 
1086 	return err;
1087 }
1088 
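/* Extension headers encode their length as 8-octet units not counting
 * the first 8 octets, so a full copy is (hdrlen + 1) * 8 bytes.
 */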
1089 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1090 					       gfp_t gfp)
1091 {
1092 	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1093 }
1094 
1095 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1096 						gfp_t gfp)
1097 {
1098 	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1099 }
1100 
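/* Append data to the pending queue on sk->sk_write_queue, split into
 * fragment-sized skbs.  The first call on an empty queue sets up the
 * cork (route, options, hop limit, MTU); the queued skbs are later
 * sent by ip6_push_pending_frames() or dropped by
 * ip6_flush_pending_frames().
 */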
1101 int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
1102 	int offset, int len, int odd, struct sk_buff *skb),
1103 	void *from, int length, int transhdrlen,
1104 	int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi *fl,
1105 	struct rt6_info *rt, unsigned int flags, int dontfrag)
1106 {
1107 	struct inet_sock *inet = inet_sk(sk);
1108 	struct ipv6_pinfo *np = inet6_sk(sk);
1109 	struct sk_buff *skb;
1110 	unsigned int maxfraglen, fragheaderlen;
1111 	int exthdrlen;
1112 	int hh_len;
1113 	int mtu;
1114 	int copy;
1115 	int err;
1116 	int offset = 0;
1117 	int csummode = CHECKSUM_NONE;
1118 
1119 	if (flags & MSG_PROBE)
1120 		return 0;
1121 	if (skb_queue_empty(&sk->sk_write_queue)) {
1122 		/*
1123 		 * setup for corking
1124 		 */
1125 		if (opt) {
1126 			if (WARN_ON(np->cork.opt))
1127 				return -EINVAL;
1128 
1129 			np->cork.opt = kmalloc(opt->tot_len, sk->sk_allocation);
1130 			if (unlikely(np->cork.opt == NULL))
1131 				return -ENOBUFS;
1132 
1133 			np->cork.opt->tot_len = opt->tot_len;
1134 			np->cork.opt->opt_flen = opt->opt_flen;
1135 			np->cork.opt->opt_nflen = opt->opt_nflen;
1136 
1137 			np->cork.opt->dst0opt = ip6_opt_dup(opt->dst0opt,
1138 							    sk->sk_allocation);
1139 			if (opt->dst0opt && !np->cork.opt->dst0opt)
1140 				return -ENOBUFS;
1141 
1142 			np->cork.opt->dst1opt = ip6_opt_dup(opt->dst1opt,
1143 							    sk->sk_allocation);
1144 			if (opt->dst1opt && !np->cork.opt->dst1opt)
1145 				return -ENOBUFS;
1146 
1147 			np->cork.opt->hopopt = ip6_opt_dup(opt->hopopt,
1148 							   sk->sk_allocation);
1149 			if (opt->hopopt && !np->cork.opt->hopopt)
1150 				return -ENOBUFS;
1151 
1152 			np->cork.opt->srcrt = ip6_rthdr_dup(opt->srcrt,
1153 							    sk->sk_allocation);
1154 			if (opt->srcrt && !np->cork.opt->srcrt)
1155 				return -ENOBUFS;
1156 
1157 			/* need source address above --miyazawa */
1158 		}
1159 		dst_hold(&rt->u.dst);
1160 		inet->cork.dst = &rt->u.dst;
1161 		inet->cork.fl = *fl;
1162 		np->cork.hop_limit = hlimit;
1163 		np->cork.tclass = tclass;
1164 		mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
1165 		      rt->u.dst.dev->mtu : dst_mtu(rt->u.dst.path);
1166 		if (np->frag_size < mtu) {
1167 			if (np->frag_size)
1168 				mtu = np->frag_size;
1169 		}
1170 		inet->cork.fragsize = mtu;
1171 		if (dst_allfrag(rt->u.dst.path))
1172 			inet->cork.flags |= IPCORK_ALLFRAG;
1173 		inet->cork.length = 0;
1174 		sk->sk_sndmsg_page = NULL;
1175 		sk->sk_sndmsg_off = 0;
1176 		exthdrlen = rt->u.dst.header_len + (opt ? opt->opt_flen : 0) -
1177 			    rt->rt6i_nfheader_len;
1178 		length += exthdrlen;
1179 		transhdrlen += exthdrlen;
1180 	} else {
1181 		rt = (struct rt6_info *)inet->cork.dst;
1182 		fl = &inet->cork.fl;
1183 		opt = np->cork.opt;
1184 		transhdrlen = 0;
1185 		exthdrlen = 0;
1186 		mtu = inet->cork.fragsize;
1187 	}
1188 
1189 	hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
1190 
1191 	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1192 			(opt ? opt->opt_nflen : 0);
1193 	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);
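	/* e.g. mtu == 1500 and fragheaderlen == 40 give
	 * maxfraglen = ((1460 & ~7) + 40) - 8 = 1488, which leaves room
	 * for the 8-byte Fragment header within the MTU.
	 */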
1194 
1195 	if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
1196 		if (inet->cork.length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
1197 			ipv6_local_error(sk, EMSGSIZE, fl, mtu-exthdrlen);
1198 			return -EMSGSIZE;
1199 		}
1200 	}
1201 
1202 	/*
1203 	 * Let's try using as much space as possible.
1204 	 * Use MTU if total length of the message fits into the MTU.
1205 	 * Otherwise, we need to reserve fragment header and
1206 	 * fragment alignment (= 8-15 octets, in total).
1207 	 *
1208 	 * Note that we may need to "move" the data from the tail
1209 	 * of the buffer to the new fragment when we split
1210 	 * the message.
1211 	 *
1212 	 * FIXME: It may be fragmented into multiple chunks
1213 	 *        at once if non-fragmentable extension headers
1214 	 *        are too large.
1215 	 * --yoshfuji
1216 	 */
1217 
1218 	inet->cork.length += length;
1219 	if (length > mtu) {
1220 		int proto = sk->sk_protocol;
1221 		if (dontfrag && (proto == IPPROTO_UDP || proto == IPPROTO_RAW)) {
1222 			ipv6_local_rxpmtu(sk, fl, mtu-exthdrlen);
1223 			return -EMSGSIZE;
1224 		}
1225 
1226 		if (proto == IPPROTO_UDP &&
1227 		    (rt->u.dst.dev->features & NETIF_F_UFO)) {
1228 
1229 			err = ip6_ufo_append_data(sk, getfrag, from, length,
1230 						  hh_len, fragheaderlen,
1231 						  transhdrlen, mtu, flags);
1232 			if (err)
1233 				goto error;
1234 			return 0;
1235 		}
1236 	}
1237 
1238 	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
1239 		goto alloc_new_skb;
1240 
1241 	while (length > 0) {
1242 		/* Check if the remaining data fits into current packet. */
1243 		copy = (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1244 		if (copy < length)
1245 			copy = maxfraglen - skb->len;
1246 
1247 		if (copy <= 0) {
1248 			char *data;
1249 			unsigned int datalen;
1250 			unsigned int fraglen;
1251 			unsigned int fraggap;
1252 			unsigned int alloclen;
1253 			struct sk_buff *skb_prev;
1254 alloc_new_skb:
1255 			skb_prev = skb;
1256 
1257 			/* There's no room in the current skb */
1258 			if (skb_prev)
1259 				fraggap = skb_prev->len - maxfraglen;
1260 			else
1261 				fraggap = 0;
1262 
1263 			/*
1264 			 * If remaining data exceeds the mtu,
1265 			 * we know we need more fragment(s).
1266 			 */
1267 			datalen = length + fraggap;
1268 			if (datalen > (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1269 				datalen = maxfraglen - fragheaderlen;
1270 
1271 			fraglen = datalen + fragheaderlen;
1272 			if ((flags & MSG_MORE) &&
1273 			    !(rt->u.dst.dev->features & NETIF_F_SG))
1274 				alloclen = mtu;
1275 			else
1276 				alloclen = datalen + fragheaderlen;
1277 
1278 			/*
1279 			 * The last fragment gets additional space at tail.
1280 			 * Note: we overallocate on fragments with MSG_MORE
1281 			 * because we have no idea if we're the last one.
1282 			 */
1283 			if (datalen == length + fraggap)
1284 				alloclen += rt->u.dst.trailer_len;
1285 
1286 			/*
1287 			 * We just reserve space for fragment header.
1288 			 * Note: this may be overallocation if the message
1289 			 * (without MSG_MORE) fits into the MTU.
1290 			 */
1291 			alloclen += sizeof(struct frag_hdr);
1292 
1293 			if (transhdrlen) {
1294 				skb = sock_alloc_send_skb(sk,
1295 						alloclen + hh_len,
1296 						(flags & MSG_DONTWAIT), &err);
1297 			} else {
1298 				skb = NULL;
1299 				if (atomic_read(&sk->sk_wmem_alloc) <=
1300 				    2 * sk->sk_sndbuf)
1301 					skb = sock_wmalloc(sk,
1302 							   alloclen + hh_len, 1,
1303 							   sk->sk_allocation);
1304 				if (unlikely(skb == NULL))
1305 					err = -ENOBUFS;
1306 			}
1307 			if (skb == NULL)
1308 				goto error;
1309 			/*
1310 			 *	Fill in the control structures
1311 			 */
1312 			skb->ip_summed = csummode;
1313 			skb->csum = 0;
1314 			/* reserve for fragmentation */
1315 			skb_reserve(skb, hh_len+sizeof(struct frag_hdr));
1316 
1317 			/*
1318 			 *	Find where to start putting bytes
1319 			 */
1320 			data = skb_put(skb, fraglen);
1321 			skb_set_network_header(skb, exthdrlen);
1322 			data += fragheaderlen;
1323 			skb->transport_header = (skb->network_header +
1324 						 fragheaderlen);
1325 			if (fraggap) {
1326 				skb->csum = skb_copy_and_csum_bits(
1327 					skb_prev, maxfraglen,
1328 					data + transhdrlen, fraggap, 0);
1329 				skb_prev->csum = csum_sub(skb_prev->csum,
1330 							  skb->csum);
1331 				data += fraggap;
1332 				pskb_trim_unique(skb_prev, maxfraglen);
1333 			}
1334 			copy = datalen - transhdrlen - fraggap;
1335 			if (copy < 0) {
1336 				err = -EINVAL;
1337 				kfree_skb(skb);
1338 				goto error;
1339 			} else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
1340 				err = -EFAULT;
1341 				kfree_skb(skb);
1342 				goto error;
1343 			}
1344 
1345 			offset += copy;
1346 			length -= datalen - fraggap;
1347 			transhdrlen = 0;
1348 			exthdrlen = 0;
1349 			csummode = CHECKSUM_NONE;
1350 
1351 			/*
1352 			 * Put the packet on the pending queue
1353 			 */
1354 			__skb_queue_tail(&sk->sk_write_queue, skb);
1355 			continue;
1356 		}
1357 
1358 		if (copy > length)
1359 			copy = length;
1360 
1361 		if (!(rt->u.dst.dev->features & NETIF_F_SG)) {
1362 			unsigned int off;
1363 
1364 			off = skb->len;
1365 			if (getfrag(from, skb_put(skb, copy),
1366 						offset, copy, off, skb) < 0) {
1367 				__skb_trim(skb, off);
1368 				err = -EFAULT;
1369 				goto error;
1370 			}
1371 		} else {
1372 			int i = skb_shinfo(skb)->nr_frags;
1373 			skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
1374 			struct page *page = sk->sk_sndmsg_page;
1375 			int off = sk->sk_sndmsg_off;
1376 			unsigned int left;
1377 
1378 			if (page && (left = PAGE_SIZE - off) > 0) {
1379 				if (copy >= left)
1380 					copy = left;
1381 				if (page != frag->page) {
1382 					if (i == MAX_SKB_FRAGS) {
1383 						err = -EMSGSIZE;
1384 						goto error;
1385 					}
1386 					get_page(page);
1387 					skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
1388 					frag = &skb_shinfo(skb)->frags[i];
1389 				}
1390 			} else if (i < MAX_SKB_FRAGS) {
1391 				if (copy > PAGE_SIZE)
1392 					copy = PAGE_SIZE;
1393 				page = alloc_pages(sk->sk_allocation, 0);
1394 				if (page == NULL) {
1395 					err = -ENOMEM;
1396 					goto error;
1397 				}
1398 				sk->sk_sndmsg_page = page;
1399 				sk->sk_sndmsg_off = 0;
1400 
1401 				skb_fill_page_desc(skb, i, page, 0, 0);
1402 				frag = &skb_shinfo(skb)->frags[i];
1403 			} else {
1404 				err = -EMSGSIZE;
1405 				goto error;
1406 			}
1407 			if (getfrag(from, page_address(frag->page) + frag->page_offset + frag->size, offset, copy, skb->len, skb) < 0) {
1408 				err = -EFAULT;
1409 				goto error;
1410 			}
1411 			sk->sk_sndmsg_off += copy;
1412 			frag->size += copy;
1413 			skb->len += copy;
1414 			skb->data_len += copy;
1415 			skb->truesize += copy;
1416 			atomic_add(copy, &sk->sk_wmem_alloc);
1417 		}
1418 		offset += copy;
1419 		length -= copy;
1420 	}
1421 	return 0;
1422 error:
1423 	inet->cork.length -= length;
1424 	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1425 	return err;
1426 }
1427 
1428 static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np)
1429 {
1430 	if (np->cork.opt) {
1431 		kfree(np->cork.opt->dst0opt);
1432 		kfree(np->cork.opt->dst1opt);
1433 		kfree(np->cork.opt->hopopt);
1434 		kfree(np->cork.opt->srcrt);
1435 		kfree(np->cork.opt);
1436 		np->cork.opt = NULL;
1437 	}
1438 
1439 	if (inet->cork.dst) {
1440 		dst_release(inet->cork.dst);
1441 		inet->cork.dst = NULL;
1442 		inet->cork.flags &= ~IPCORK_ALLFRAG;
1443 	}
1444 	memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
1445 }
1446 
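/* Splice the skbs queued by ip6_append_data() into one packet with a
 * frag_list, prepend the IPv6 header and options, and hand the result
 * to ip6_local_out().
 */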
1447 int ip6_push_pending_frames(struct sock *sk)
1448 {
1449 	struct sk_buff *skb, *tmp_skb;
1450 	struct sk_buff **tail_skb;
1451 	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1452 	struct inet_sock *inet = inet_sk(sk);
1453 	struct ipv6_pinfo *np = inet6_sk(sk);
1454 	struct net *net = sock_net(sk);
1455 	struct ipv6hdr *hdr;
1456 	struct ipv6_txoptions *opt = np->cork.opt;
1457 	struct rt6_info *rt = (struct rt6_info *)inet->cork.dst;
1458 	struct flowi *fl = &inet->cork.fl;
1459 	unsigned char proto = fl->proto;
1460 	int err = 0;
1461 
1462 	if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
1463 		goto out;
1464 	tail_skb = &(skb_shinfo(skb)->frag_list);
1465 
1466 	/* move skb->data to ip header from ext header */
1467 	if (skb->data < skb_network_header(skb))
1468 		__skb_pull(skb, skb_network_offset(skb));
1469 	while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
1470 		__skb_pull(tmp_skb, skb_network_header_len(skb));
1471 		*tail_skb = tmp_skb;
1472 		tail_skb = &(tmp_skb->next);
1473 		skb->len += tmp_skb->len;
1474 		skb->data_len += tmp_skb->len;
1475 		skb->truesize += tmp_skb->truesize;
1476 		tmp_skb->destructor = NULL;
1477 		tmp_skb->sk = NULL;
1478 	}
1479 
1480 	/* Allow local fragmentation. */
1481 	if (np->pmtudisc < IPV6_PMTUDISC_DO)
1482 		skb->local_df = 1;
1483 
1484 	ipv6_addr_copy(final_dst, &fl->fl6_dst);
1485 	__skb_pull(skb, skb_network_header_len(skb));
1486 	if (opt && opt->opt_flen)
1487 		ipv6_push_frag_opts(skb, opt, &proto);
1488 	if (opt && opt->opt_nflen)
1489 		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);
1490 
1491 	skb_push(skb, sizeof(struct ipv6hdr));
1492 	skb_reset_network_header(skb);
1493 	hdr = ipv6_hdr(skb);
1494 
1495 	*(__be32 *)hdr = fl->fl6_flowlabel |
1496 		     htonl(0x60000000 | ((int)np->cork.tclass << 20));
1497 
1498 	hdr->hop_limit = np->cork.hop_limit;
1499 	hdr->nexthdr = proto;
1500 	ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
1501 	ipv6_addr_copy(&hdr->daddr, final_dst);
1502 
1503 	skb->priority = sk->sk_priority;
1504 	skb->mark = sk->sk_mark;
1505 
1506 	skb_dst_set(skb, dst_clone(&rt->u.dst));
1507 	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1508 	if (proto == IPPROTO_ICMPV6) {
1509 		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1510 
1511 		ICMP6MSGOUT_INC_STATS_BH(net, idev, icmp6_hdr(skb)->icmp6_type);
1512 		ICMP6_INC_STATS_BH(net, idev, ICMP6_MIB_OUTMSGS);
1513 	}
1514 
1515 	err = ip6_local_out(skb);
1516 	if (err) {
1517 		if (err > 0)
1518 			err = net_xmit_errno(err);
1519 		if (err)
1520 			goto error;
1521 	}
1522 
1523 out:
1524 	ip6_cork_release(inet, np);
1525 	return err;
1526 error:
1527 	IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1528 	goto out;
1529 }
1530 
1531 void ip6_flush_pending_frames(struct sock *sk)
1532 {
1533 	struct sk_buff *skb;
1534 
1535 	while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
1536 		if (skb_dst(skb))
1537 			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1538 				      IPSTATS_MIB_OUTDISCARDS);
1539 		kfree_skb(skb);
1540 	}
1541 
1542 	ip6_cork_release(inet_sk(sk), inet6_sk(sk));
1543 }
1544