xref: /openbmc/linux/net/ipv6/ip6_output.c (revision 0b0588d42b2774734b51525fe6550d77f8ea9bc0)
/*
 *	IPv6 output functions
 *	Linux INET6 implementation
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	Based on linux/net/ipv4/ip_output.c
 *
 *	This program is free software; you can redistribute it and/or
 *      modify it under the terms of the GNU General Public License
 *      as published by the Free Software Foundation; either version
 *      2 of the License, or (at your option) any later version.
 *
 *	Changes:
 *	A.N.Kuznetsov	:	arithmetics in fragmentation.
 *				extension headers are implemented.
 *				route changes now work.
 *				ip6_forward does not confuse sniffers.
 *				etc.
 *
 *      H. von Brand    :       Added missing #include <linux/string.h>
 *	Imran Patel	: 	frag id should be in NBO
 *      Kazunori MIYAZAWA @USAGI
 *			:       add ip6_append_data and related functions
 *				for datagram xmit
 */

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/in6.h>
#include <linux/tcp.h>
#include <linux/route.h>
#include <linux/module.h>

#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>

#include <net/sock.h>
#include <net/snmp.h>

#include <net/ipv6.h>
#include <net/ndisc.h>
#include <net/protocol.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
#include <net/rawv6.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/checksum.h>
#include <linux/mroute6.h>

static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));

static __inline__ void ipv6_select_ident(struct sk_buff *skb, struct frag_hdr *fhdr)
{
	static u32 ipv6_fragmentation_id = 1;
	static DEFINE_SPINLOCK(ip6_id_lock);

	spin_lock_bh(&ip6_id_lock);
	fhdr->identification = htonl(ipv6_fragmentation_id);
	if (++ipv6_fragmentation_id == 0)
		ipv6_fragmentation_id = 1;
	spin_unlock_bh(&ip6_id_lock);
}

int __ip6_local_out(struct sk_buff *skb)
{
	int len;

	len = skb->len - sizeof(struct ipv6hdr);
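	/* A payload larger than IPV6_MAXPLEN cannot be represented in the
	 * 16-bit payload_len field; per RFC 2675 a jumbogram carries 0
	 * here and the real length in a Hop-by-Hop Jumbo Payload option.
	 */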
	if (len > IPV6_MAXPLEN)
		len = 0;
	ipv6_hdr(skb)->payload_len = htons(len);

	return nf_hook(PF_INET6, NF_INET_LOCAL_OUT, skb, NULL, skb->dst->dev,
		       dst_output);
}

int ip6_local_out(struct sk_buff *skb)
{
	int err;

	err = __ip6_local_out(skb);
	if (likely(err == 1))
		err = dst_output(skb);

	return err;
}
EXPORT_SYMBOL_GPL(ip6_local_out);

static int ip6_output_finish(struct sk_buff *skb)
{
	struct dst_entry *dst = skb->dst;

	if (dst->hh)
		return neigh_hh_output(dst->hh, skb);
	else if (dst->neighbour)
		return dst->neighbour->output(skb);

	IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
	kfree_skb(skb);
	return -EINVAL;

}

/* dev_loopback_xmit for use with netfilter. */
static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
{
	skb_reset_mac_header(newskb);
	__skb_pull(newskb, skb_network_offset(newskb));
	newskb->pkt_type = PACKET_LOOPBACK;
	newskb->ip_summed = CHECKSUM_UNNECESSARY;
	WARN_ON(!newskb->dst);

	netif_rx(newskb);
	return 0;
}


static int ip6_output2(struct sk_buff *skb)
{
	struct dst_entry *dst = skb->dst;
	struct net_device *dev = dst->dev;

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
		struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
		struct inet6_dev *idev = ip6_dst_idev(skb->dst);

		if (!(dev->flags & IFF_LOOPBACK) && (!np || np->mc_loop) &&
		    ((mroute6_socket && !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
		     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
					 &ipv6_hdr(skb)->saddr))) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

			/* Do not check for IFF_ALLMULTI; multicast routing
			   is not supported in any case.
			 */
			if (newskb)
				NF_HOOK(PF_INET6, NF_INET_POST_ROUTING, newskb,
					NULL, newskb->dev,
					ip6_dev_loopback_xmit);

			if (ipv6_hdr(skb)->hop_limit == 0) {
				IP6_INC_STATS(idev, IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return 0;
			}
		}

		IP6_INC_STATS(idev, IPSTATS_MIB_OUTMCASTPKTS);
	}

	return NF_HOOK(PF_INET6, NF_INET_POST_ROUTING, skb, NULL, skb->dev,
		       ip6_output_finish);
}
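
/*
 * Illustrative sketch (not part of the original file): the np->mc_loop
 * test in ip6_output2() is what the IPV6_MULTICAST_LOOP socket option
 * controls from user space. A sender that does not want its own
 * multicast packets looped back could clear it roughly like this
 * (user-space code, shown here only for context):
 */
#if 0
#include <sys/socket.h>
#include <netinet/in.h>

static void disable_mcast_loopback(int sock)
{
	int loop = 0;	/* 0: do not loop our multicast traffic back */

	setsockopt(sock, IPPROTO_IPV6, IPV6_MULTICAST_LOOP,
		   &loop, sizeof(loop));
}
#endif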

static inline int ip6_skb_dst_mtu(struct sk_buff *skb)
{
	struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;

	return (np && np->pmtudisc == IPV6_PMTUDISC_PROBE) ?
	       skb->dst->dev->mtu : dst_mtu(skb->dst);
}

int ip6_output(struct sk_buff *skb)
{
	struct inet6_dev *idev = ip6_dst_idev(skb->dst);
	if (unlikely(idev->cnf.disable_ipv6)) {
		IP6_INC_STATS(idev, IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
		return 0;
	}

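	/* GSO packets may legitimately exceed the path MTU at this point;
	 * they are segmented to MTU-sized frames further down the stack,
	 * so only oversized non-GSO packets (or allfrag destinations)
	 * go through ip6_fragment().
	 */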
	if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
				dst_allfrag(skb->dst))
		return ip6_fragment(skb, ip6_output2);
	else
		return ip6_output2(skb);
}

/*
 *	xmit an sk_buff (used by TCP)
 */

int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
	     struct ipv6_txoptions *opt, int ipfragok)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct in6_addr *first_hop = &fl->fl6_dst;
	struct dst_entry *dst = skb->dst;
	struct ipv6hdr *hdr;
	u8  proto = fl->proto;
	int seg_len = skb->len;
	int hlimit, tclass;
	u32 mtu;

	if (opt) {
		unsigned int head_room;

		/* First: exthdrs may take lots of space (~8K for now)
		   MAX_HEADER is not enough.
		 */
		head_room = opt->opt_nflen + opt->opt_flen;
		seg_len += head_room;
		head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);

		if (skb_headroom(skb) < head_room) {
			struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
			if (skb2 == NULL) {
				IP6_INC_STATS(ip6_dst_idev(skb->dst),
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return -ENOBUFS;
			}
			kfree_skb(skb);
			skb = skb2;
			if (sk)
				skb_set_owner_w(skb, sk);
		}
		if (opt->opt_flen)
			ipv6_push_frag_opts(skb, opt, &proto);
		if (opt->opt_nflen)
			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
	}

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	/* Allow local fragmentation. */
	if (ipfragok)
		skb->local_df = 1;

	/*
	 *	Fill in the IPv6 header
	 */

	hlimit = -1;
	if (np)
		hlimit = np->hop_limit;
	if (hlimit < 0)
		hlimit = ip6_dst_hoplimit(dst);

	tclass = -1;
	if (np)
		tclass = np->tclass;
	if (tclass < 0)
		tclass = 0;

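	/* First 32 bits of the IPv6 header: version (6) in the top four
	 * bits, traffic class in the next eight, flow label in the
	 * low 20.
	 */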
	*(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl->fl6_flowlabel;

	hdr->payload_len = htons(seg_len);
	hdr->nexthdr = proto;
	hdr->hop_limit = hlimit;

	ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
	ipv6_addr_copy(&hdr->daddr, first_hop);

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	mtu = dst_mtu(dst);
	if ((skb->len <= mtu) || skb->local_df || skb_is_gso(skb)) {
		IP6_INC_STATS(ip6_dst_idev(skb->dst),
			      IPSTATS_MIB_OUTREQUESTS);
		return NF_HOOK(PF_INET6, NF_INET_LOCAL_OUT, skb, NULL, dst->dev,
				dst_output);
	}

	if (net_ratelimit())
		printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
	skb->dev = dst->dev;
	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
	IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return -EMSGSIZE;
}

EXPORT_SYMBOL(ip6_xmit);
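
/*
 * Illustrative sketch (not part of the original file): a connected
 * transport such as TCP builds its transport header first and then
 * hands the skb to ip6_xmit() with the socket's cached flow and
 * options. Roughly (error handling elided; the local names are
 * assumptions, not the real TCP code):
 */
#if 0
static int example_transport_xmit(struct sock *sk, struct sk_buff *skb)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct flowi fl;

	memset(&fl, 0, sizeof(fl));
	fl.proto = IPPROTO_TCP;
	ipv6_addr_copy(&fl.fl6_dst, &np->daddr);
	ipv6_addr_copy(&fl.fl6_src, &np->saddr);

	/* ipfragok == 0: an oversized packet is bounced with
	 * ICMPV6_PKT_TOOBIG instead of being fragmented locally. */
	return ip6_xmit(sk, skb, &fl, np->opt, 0);
}
#endif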

/*
 *	To avoid extra problems ND packets are sent through this
 *	routine. It's code duplication but I really want to avoid
 *	extra checks since ipv6_build_header is used by TCP (which
 *	is performance critical for us)
 */

int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
	       const struct in6_addr *saddr, const struct in6_addr *daddr,
	       int proto, int len)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct ipv6hdr *hdr;
	int totlen;

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	totlen = len + sizeof(struct ipv6hdr);

	skb_reset_network_header(skb);
	skb_put(skb, sizeof(struct ipv6hdr));
	hdr = ipv6_hdr(skb);

	*(__be32 *)hdr = htonl(0x60000000);

	hdr->payload_len = htons(len);
	hdr->nexthdr = proto;
	hdr->hop_limit = np->hop_limit;

	ipv6_addr_copy(&hdr->saddr, saddr);
	ipv6_addr_copy(&hdr->daddr, daddr);

	return 0;
}

static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
	struct ip6_ra_chain *ra;
	struct sock *last = NULL;

	read_lock(&ip6_ra_lock);
	for (ra = ip6_ra_chain; ra; ra = ra->next) {
		struct sock *sk = ra->sk;
		if (sk && ra->sel == sel &&
		    (!sk->sk_bound_dev_if ||
		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
			if (last) {
				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
				if (skb2)
					rawv6_rcv(last, skb2);
			}
			last = sk;
		}
	}

	if (last) {
		rawv6_rcv(last, skb);
		read_unlock(&ip6_ra_lock);
		return 1;
	}
	read_unlock(&ip6_ra_lock);
	return 0;
}

static int ip6_forward_proxy_check(struct sk_buff *skb)
{
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	u8 nexthdr = hdr->nexthdr;
	int offset;

	if (ipv6_ext_hdr(nexthdr)) {
		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr);
		if (offset < 0)
			return 0;
	} else
		offset = sizeof(struct ipv6hdr);

	if (nexthdr == IPPROTO_ICMPV6) {
		struct icmp6hdr *icmp6;

		if (!pskb_may_pull(skb, (skb_network_header(skb) +
					 offset + 1 - skb->data)))
			return 0;

		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);

		switch (icmp6->icmp6_type) {
		case NDISC_ROUTER_SOLICITATION:
		case NDISC_ROUTER_ADVERTISEMENT:
		case NDISC_NEIGHBOUR_SOLICITATION:
		case NDISC_NEIGHBOUR_ADVERTISEMENT:
		case NDISC_REDIRECT:
			/* Unicast neighbour discovery messages destined
			 * to the proxied address must be passed to the
			 * input function.
			 */
			return 1;
		default:
			break;
		}
	}

	/*
	 * The proxying router can't forward traffic sent to a link-local
	 * address, so signal the sender and discard the packet. This
	 * behavior is clarified by the MIPv6 specification.
	 */
	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
		dst_link_failure(skb);
		return -1;
	}

	return 0;
}

static inline int ip6_forward_finish(struct sk_buff *skb)
{
	return dst_output(skb);
}

int ip6_forward(struct sk_buff *skb)
{
	struct dst_entry *dst = skb->dst;
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	struct inet6_skb_parm *opt = IP6CB(skb);
	struct net *net = dev_net(dst->dev);

	if (net->ipv6.devconf_all->forwarding == 0)
		goto error;

	if (skb_warn_if_lro(skb))
		goto drop;

	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
		IP6_INC_STATS(ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
		goto drop;
	}

	skb_forward_csum(skb);

	/*
	 *	We DO NOT make any processing on
	 *	RA packets, pushing them to user level AS IS
	 *	without any WARRANTY that application will be able
	 *	to interpret them. The reason is that we
	 *	cannot make anything clever here.
	 *
	 *	We are not an end node, so if the packet contains
	 *	AH/ESP, we cannot do anything.
	 *	Defragmentation would also be a mistake; RA packets
	 *	cannot be fragmented, because there is no warranty
	 *	that different fragments will go along one path. --ANK
	 */
	if (opt->ra) {
		u8 *ptr = skb_network_header(skb) + opt->ra;
		if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
			return 0;
	}

	/*
	 *	check and decrement ttl
	 */
	if (hdr->hop_limit <= 1) {
		/* Force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT,
			    0, skb->dev);
		IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);

		kfree_skb(skb);
		return -ETIMEDOUT;
	}

	/* XXX: idev->cnf.proxy_ndp? */
	if (net->ipv6.devconf_all->proxy_ndp &&
	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
		int proxied = ip6_forward_proxy_check(skb);
		if (proxied > 0)
			return ip6_input(skb);
		else if (proxied < 0) {
			IP6_INC_STATS(ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
			goto drop;
		}
	}

	if (!xfrm6_route_forward(skb)) {
		IP6_INC_STATS(ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
		goto drop;
	}
	dst = skb->dst;

	/* IPv6 specs say nothing about it, but it is clear that we cannot
	   send redirects to source routed frames.
	   We don't send redirects to frames decapsulated from IPsec.
	 */
	if (skb->dev == dst->dev && dst->neighbour && opt->srcrt == 0 &&
	    !skb->sp) {
		struct in6_addr *target = NULL;
		struct rt6_info *rt;
		struct neighbour *n = dst->neighbour;

		/*
		 *	incoming and outgoing devices are the same;
		 *	send a redirect.
		 */

		rt = (struct rt6_info *) dst;
		if ((rt->rt6i_flags & RTF_GATEWAY))
			target = (struct in6_addr *)&n->primary_key;
		else
			target = &hdr->daddr;

		/* Limit redirects both by destination (here)
		   and by source (inside ndisc_send_redirect)
		 */
		if (xrlim_allow(dst, 1*HZ))
			ndisc_send_redirect(skb, n, target);
	} else {
		int addrtype = ipv6_addr_type(&hdr->saddr);

		/* This check is security critical. */
		if (addrtype == IPV6_ADDR_ANY ||
		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
			goto error;
		if (addrtype & IPV6_ADDR_LINKLOCAL) {
			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
				ICMPV6_NOT_NEIGHBOUR, 0, skb->dev);
			goto error;
		}
	}

	if (skb->len > dst_mtu(dst)) {
		/* Again, force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, dst_mtu(dst), skb->dev);
		IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS);
		IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	if (skb_cow(skb, dst->dev->hard_header_len)) {
		IP6_INC_STATS(ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
		goto drop;
	}

	hdr = ipv6_hdr(skb);

	/* Mangling hops number delayed to point after skb COW */

	hdr->hop_limit--;

	IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
	return NF_HOOK(PF_INET6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
		       ip6_forward_finish);

error:
	IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
drop:
	kfree_skb(skb);
	return -EINVAL;
}

static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	dst_release(to->dst);
	to->dst = dst_clone(from->dst);
	to->dev = from->dev;
	to->mark = from->mark;

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
	nf_copy(to, from);
#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
    defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
	to->nf_trace = from->nf_trace;
#endif
	skb_copy_secmark(to, from);
}

int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
{
	u16 offset = sizeof(struct ipv6hdr);
	struct ipv6_opt_hdr *exthdr =
				(struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1);
	unsigned int packet_len = skb->tail - skb->network_header;
	int found_rhdr = 0;
	*nexthdr = &ipv6_hdr(skb)->nexthdr;

	while (offset + 1 <= packet_len) {

		switch (**nexthdr) {

		case NEXTHDR_HOP:
			break;
		case NEXTHDR_ROUTING:
			found_rhdr = 1;
			break;
		case NEXTHDR_DEST:
#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
			if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0)
				break;
#endif
			if (found_rhdr)
				return offset;
			break;
		default:
			return offset;
		}

		offset += ipv6_optlen(exthdr);
		*nexthdr = &exthdr->nexthdr;
		exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) +
						 offset);
	}

	return offset;
}
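
/*
 * Worked example (illustrative, not part of the original file): for a
 * packet laid out as
 *
 *	IPv6 | Hop-by-Hop | Routing | Dest opts | TCP | data
 *
 * the walk above stops at the Destination Options header (a Routing
 * header has already been seen, and no Home Address option is
 * present), so the returned offset is where the Fragment header must
 * be inserted, and *nexthdr points at the Routing header's nexthdr
 * byte, which ip6_fragment() overwrites with NEXTHDR_FRAGMENT.
 */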

static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
{
	struct sk_buff *frag;
	struct rt6_info *rt = (struct rt6_info *)skb->dst;
	struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
	struct ipv6hdr *tmp_hdr;
	struct frag_hdr *fh;
	unsigned int mtu, hlen, left, len;
	__be32 frag_id = 0;
	int ptr, offset = 0, err = 0;
	u8 *prevhdr, nexthdr = 0;

	hlen = ip6_find_1stfragopt(skb, &prevhdr);
	nexthdr = *prevhdr;

	mtu = ip6_skb_dst_mtu(skb);

	/* We must not fragment if the socket is set to force MTU discovery
	 * or if the skb is not generated by a local socket.  (This last
	 * check should be redundant, but it's free.)
	 */
	if (!skb->local_df) {
		skb->dev = skb->dst->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
		IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	if (np && np->frag_size < mtu) {
		if (np->frag_size)
			mtu = np->frag_size;
	}
	mtu -= hlen + sizeof(struct frag_hdr);

	if (skb_shinfo(skb)->frag_list) {
		int first_len = skb_pagelen(skb);
		int truesizes = 0;

		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    skb_cloned(skb))
			goto slow_path;

		for (frag = skb_shinfo(skb)->frag_list; frag; frag = frag->next) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < hlen)
				goto slow_path;

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path;

			BUG_ON(frag->sk);
			if (skb->sk) {
				sock_hold(skb->sk);
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
				truesizes += frag->truesize;
			}
		}

		err = 0;
		offset = 0;
		frag = skb_shinfo(skb)->frag_list;
		skb_shinfo(skb)->frag_list = NULL;
		/* BUILD HEADER */

		*prevhdr = NEXTHDR_FRAGMENT;
		tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
		if (!tmp_hdr) {
			IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_FRAGFAILS);
			return -ENOMEM;
		}

		__skb_pull(skb, hlen);
		fh = (struct frag_hdr *)__skb_push(skb, sizeof(struct frag_hdr));
		__skb_push(skb, hlen);
		skb_reset_network_header(skb);
		memcpy(skb_network_header(skb), tmp_hdr, hlen);

		ipv6_select_ident(skb, fh);
		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		fh->frag_off = htons(IP6_MF);
		frag_id = fh->identification;

		first_len = skb_pagelen(skb);
		skb->data_len = first_len - skb_headlen(skb);
		skb->truesize -= truesizes;
		skb->len = first_len;
		ipv6_hdr(skb)->payload_len = htons(first_len -
						   sizeof(struct ipv6hdr));

		dst_hold(&rt->u.dst);

		for (;;) {
			/* Prepare header of the next frame,
			 * before previous one went down. */
			if (frag) {
				frag->ip_summed = CHECKSUM_NONE;
				skb_reset_transport_header(frag);
				fh = (struct frag_hdr *)__skb_push(frag, sizeof(struct frag_hdr));
				__skb_push(frag, hlen);
				skb_reset_network_header(frag);
				memcpy(skb_network_header(frag), tmp_hdr,
				       hlen);
				offset += skb->len - hlen - sizeof(struct frag_hdr);
				fh->nexthdr = nexthdr;
				fh->reserved = 0;
				fh->frag_off = htons(offset);
				if (frag->next != NULL)
					fh->frag_off |= htons(IP6_MF);
				fh->identification = frag_id;
				ipv6_hdr(frag)->payload_len =
						htons(frag->len -
						      sizeof(struct ipv6hdr));
				ip6_copy_metadata(frag, skb);
			}

			err = output(skb);
			if (!err)
				IP6_INC_STATS(ip6_dst_idev(&rt->u.dst), IPSTATS_MIB_FRAGCREATES);

			if (err || !frag)
				break;

			skb = frag;
			frag = skb->next;
			skb->next = NULL;
		}

		kfree(tmp_hdr);

		if (err == 0) {
			IP6_INC_STATS(ip6_dst_idev(&rt->u.dst), IPSTATS_MIB_FRAGOKS);
			dst_release(&rt->u.dst);
			return 0;
		}

		while (frag) {
			skb = frag->next;
			kfree_skb(frag);
			frag = skb;
		}

		IP6_INC_STATS(ip6_dst_idev(&rt->u.dst), IPSTATS_MIB_FRAGFAILS);
		dst_release(&rt->u.dst);
		return err;
	}

slow_path:
	left = skb->len - hlen;		/* Space per frame */
	ptr = hlen;			/* Where to start from */

	/*
	 *	Fragment the datagram.
	 */

	*prevhdr = NEXTHDR_FRAGMENT;

	/*
	 *	Keep copying data until we run out.
	 */
	while (left > 0) {
		len = left;
		/* IF: it doesn't fit, use 'mtu' - the data space left */
		if (len > mtu)
			len = mtu;
		/* IF: we are not sending up to and including the packet end
		   then align the next start on an eight byte boundary */
		if (len < left) {
			len &= ~7;
		}
		/*
		 *	Allocate buffer.
		 */

		if ((frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) + LL_ALLOCATED_SPACE(rt->u.dst.dev), GFP_ATOMIC)) == NULL) {
			NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
			IP6_INC_STATS(ip6_dst_idev(skb->dst),
				      IPSTATS_MIB_FRAGFAILS);
			err = -ENOMEM;
			goto fail;
		}

		/*
		 *	Set up data on packet
		 */

		ip6_copy_metadata(frag, skb);
		skb_reserve(frag, LL_RESERVED_SPACE(rt->u.dst.dev));
		skb_put(frag, len + hlen + sizeof(struct frag_hdr));
		skb_reset_network_header(frag);
		fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
		frag->transport_header = (frag->network_header + hlen +
					  sizeof(struct frag_hdr));

		/*
		 *	Charge the memory for the fragment to any owner
		 *	it might possess
		 */
		if (skb->sk)
			skb_set_owner_w(frag, skb->sk);

		/*
		 *	Copy the packet header into the new buffer.
		 */
		skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);

		/*
		 *	Build fragment header.
		 */
		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		if (!frag_id) {
			ipv6_select_ident(skb, fh);
			frag_id = fh->identification;
		} else
			fh->identification = frag_id;

		/*
		 *	Copy a block of the IP datagram.
		 */
		if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len))
			BUG();
		left -= len;

		fh->frag_off = htons(offset);
		if (left > 0)
			fh->frag_off |= htons(IP6_MF);
		ipv6_hdr(frag)->payload_len = htons(frag->len -
						    sizeof(struct ipv6hdr));

		ptr += len;
		offset += len;

		/*
		 *	Put this fragment into the sending queue.
		 */
		err = output(frag);
		if (err)
			goto fail;

		IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_FRAGCREATES);
	}
	IP6_INC_STATS(ip6_dst_idev(skb->dst),
		      IPSTATS_MIB_FRAGOKS);
	kfree_skb(skb);
	return err;

fail:
	IP6_INC_STATS(ip6_dst_idev(skb->dst),
		      IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return err;
}
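
/*
 * Worked example (illustrative, not part of the original file): with
 * a 1500-byte MTU and a bare IPv6 header (hlen == 40), the arithmetic
 * above leaves mtu = 1500 - 40 - 8 = 1452 bytes of payload per
 * fragment. On the slow path, every non-final fragment is further
 * trimmed to a multiple of 8 (len &= ~7, i.e. 1448 here), since the
 * fragment offset field counts in 8-octet units.
 */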

static inline int ip6_rt_check(struct rt6key *rt_key,
			       struct in6_addr *fl_addr,
			       struct in6_addr *addr_cache)
{
	return ((rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
		(addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache)));
}

static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
					  struct dst_entry *dst,
					  struct flowi *fl)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct rt6_info *rt = (struct rt6_info *)dst;

	if (!dst)
		goto out;

	/* Yes, checking route validity in the not connected
	 * case is not very simple. Take into account
	 * that we do not support routing by source, TOS,
	 * and MSG_DONTROUTE 		--ANK (980726)
	 *
	 * 1. ip6_rt_check(): If route was host route,
	 *    check that cached destination is current.
	 *    If it is network route, we still may
	 *    check its validity using saved pointer
	 *    to the last used address: daddr_cache.
	 *    We do not want to save whole address now,
	 *    (because main consumer of this service
	 *    is tcp, which does not have this problem),
	 *    so that the last trick works only on connected
	 *    sockets.
	 * 2. oif also should be the same.
	 */
	if (ip6_rt_check(&rt->rt6i_dst, &fl->fl6_dst, np->daddr_cache) ||
#ifdef CONFIG_IPV6_SUBTREES
	    ip6_rt_check(&rt->rt6i_src, &fl->fl6_src, np->saddr_cache) ||
#endif
	    (fl->oif && fl->oif != dst->dev->ifindex)) {
		dst_release(dst);
		dst = NULL;
	}

out:
	return dst;
}

static int ip6_dst_lookup_tail(struct sock *sk,
			       struct dst_entry **dst, struct flowi *fl)
{
	int err;
	struct net *net = sock_net(sk);

	if (*dst == NULL)
		*dst = ip6_route_output(net, sk, fl);

	if ((err = (*dst)->error))
		goto out_err_release;

	if (ipv6_addr_any(&fl->fl6_src)) {
		err = ipv6_dev_get_saddr(net, ip6_dst_idev(*dst)->dev,
					 &fl->fl6_dst,
					 sk ? inet6_sk(sk)->srcprefs : 0,
					 &fl->fl6_src);
		if (err)
			goto out_err_release;
	}

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	/*
	 * Here if the dst entry we've looked up
	 * has a neighbour entry that is in the INCOMPLETE
	 * state and the src address from the flow is
	 * marked as OPTIMISTIC, we release the found
	 * dst entry and replace it instead with the
	 * dst entry of the nexthop router
	 */
	if ((*dst)->neighbour && !((*dst)->neighbour->nud_state & NUD_VALID)) {
		struct inet6_ifaddr *ifp;
		struct flowi fl_gw;
		int redirect;

		ifp = ipv6_get_ifaddr(net, &fl->fl6_src,
				      (*dst)->dev, 1);

		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
		if (ifp)
			in6_ifa_put(ifp);

		if (redirect) {
			/*
			 * We need to get the dst entry for the
			 * default router instead
			 */
			dst_release(*dst);
			memcpy(&fl_gw, fl, sizeof(struct flowi));
			memset(&fl_gw.fl6_dst, 0, sizeof(struct in6_addr));
			*dst = ip6_route_output(net, sk, &fl_gw);
			if ((err = (*dst)->error))
				goto out_err_release;
		}
	}
#endif

	return 0;

out_err_release:
	if (err == -ENETUNREACH)
		IP6_INC_STATS_BH(NULL, IPSTATS_MIB_OUTNOROUTES);
	dst_release(*dst);
	*dst = NULL;
	return err;
}

/**
 *	ip6_dst_lookup - perform route lookup on flow
 *	@sk: socket which provides route info
 *	@dst: pointer to dst_entry * for result
 *	@fl: flow to lookup
 *
 *	This function performs a route lookup on the given flow.
 *
 *	It returns zero on success, or a standard errno code on error.
 */
int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
{
	*dst = NULL;
	return ip6_dst_lookup_tail(sk, dst, fl);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup);
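
/*
 * Illustrative sketch (not part of the original file): callers
 * normally follow ip6_dst_lookup() with an xfrm lookup so that IPsec
 * policy can substitute a transformed route. Roughly (error handling
 * elided; the helper name is an assumption):
 */
#if 0
static struct dst_entry *example_route_lookup(struct sock *sk,
					      struct flowi *fl)
{
	struct dst_entry *dst;

	if (ip6_dst_lookup(sk, &dst, fl))
		return NULL;

	/* May replace dst with an IPsec-transformed route. */
	if (xfrm_lookup(&dst, fl, sk, 0) < 0)
		return NULL;

	return dst;
}
#endif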

/**
 *	ip6_sk_dst_lookup - perform socket cached route lookup on flow
 *	@sk: socket which provides the dst cache and route info
 *	@dst: pointer to dst_entry * for result
 *	@fl: flow to lookup
 *
 *	This function performs a route lookup on the given flow with the
 *	possibility of using the cached route in the socket if it is valid.
 *	It will take the socket dst lock when operating on the dst cache.
 *	As a result, this function can only be used in process context.
 *
 *	It returns zero on success, or a standard errno code on error.
 */
int ip6_sk_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
{
	*dst = NULL;
	if (sk) {
		*dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
		*dst = ip6_sk_dst_check(sk, *dst, fl);
	}

	return ip6_dst_lookup_tail(sk, dst, fl);
}
EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup);

static inline int ip6_ufo_append_data(struct sock *sk,
			int getfrag(void *from, char *to, int offset, int len,
			int odd, struct sk_buff *skb),
			void *from, int length, int hh_len, int fragheaderlen,
			int transhdrlen, int mtu, unsigned int flags)

{
	struct sk_buff *skb;
	int err;

	/* The network device supports UDP large send offload, so build a
	 * single skb carrying the complete UDP datagram.
	 */
	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
		skb = sock_alloc_send_skb(sk,
			hh_len + fragheaderlen + transhdrlen + 20,
			(flags & MSG_DONTWAIT), &err);
		if (skb == NULL)
			return -ENOMEM;

		/* reserve space for Hardware header */
		skb_reserve(skb, hh_len);

		/* create space for UDP/IP header */
		skb_put(skb, fragheaderlen + transhdrlen);

		/* initialize network header pointer */
		skb_reset_network_header(skb);

		/* initialize protocol header pointer */
		skb->transport_header = skb->network_header + fragheaderlen;

		skb->ip_summed = CHECKSUM_PARTIAL;
		skb->csum = 0;
		sk->sk_sndmsg_off = 0;
	}

	err = skb_append_datato_frags(sk, skb, getfrag, from,
				      (length - transhdrlen));
	if (!err) {
		struct frag_hdr fhdr;

		/* specify the length of each IP datagram fragment */
		skb_shinfo(skb)->gso_size = mtu - fragheaderlen -
					    sizeof(struct frag_hdr);
		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
		ipv6_select_ident(skb, &fhdr);
		skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
		__skb_queue_tail(&sk->sk_write_queue, skb);

		return 0;
	}
	/* There is not enough support to do UDP LSO,
	 * so follow the normal path.
	 */
	kfree_skb(skb);

	return err;
}

int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
	int offset, int len, int odd, struct sk_buff *skb),
	void *from, int length, int transhdrlen,
	int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi *fl,
	struct rt6_info *rt, unsigned int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct sk_buff *skb;
	unsigned int maxfraglen, fragheaderlen;
	int exthdrlen;
	int hh_len;
	int mtu;
	int copy;
	int err;
	int offset = 0;
	int csummode = CHECKSUM_NONE;

	if (flags & MSG_PROBE)
		return 0;
	if (skb_queue_empty(&sk->sk_write_queue)) {
		/*
		 * setup for corking
		 */
		if (opt) {
			if (np->cork.opt == NULL) {
				np->cork.opt = kmalloc(opt->tot_len,
						       sk->sk_allocation);
				if (unlikely(np->cork.opt == NULL))
					return -ENOBUFS;
			} else if (np->cork.opt->tot_len < opt->tot_len) {
				printk(KERN_DEBUG "ip6_append_data: invalid option length\n");
				return -EINVAL;
			}
			memcpy(np->cork.opt, opt, opt->tot_len);
			inet->cork.flags |= IPCORK_OPT;
			/* need source address above miyazawa */
		}
		dst_hold(&rt->u.dst);
		inet->cork.dst = &rt->u.dst;
		inet->cork.fl = *fl;
		np->cork.hop_limit = hlimit;
		np->cork.tclass = tclass;
		mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
		      rt->u.dst.dev->mtu : dst_mtu(rt->u.dst.path);
		if (np->frag_size < mtu) {
			if (np->frag_size)
				mtu = np->frag_size;
		}
		inet->cork.fragsize = mtu;
		if (dst_allfrag(rt->u.dst.path))
			inet->cork.flags |= IPCORK_ALLFRAG;
		inet->cork.length = 0;
		sk->sk_sndmsg_page = NULL;
		sk->sk_sndmsg_off = 0;
		exthdrlen = rt->u.dst.header_len + (opt ? opt->opt_flen : 0) -
			    rt->rt6i_nfheader_len;
		length += exthdrlen;
		transhdrlen += exthdrlen;
	} else {
		rt = (struct rt6_info *)inet->cork.dst;
		fl = &inet->cork.fl;
		if (inet->cork.flags & IPCORK_OPT)
			opt = np->cork.opt;
		transhdrlen = 0;
		exthdrlen = 0;
		mtu = inet->cork.fragsize;
	}

	hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);

	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
			(opt ? opt->opt_nflen : 0);
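	/* Per-fragment length limit: round the fragmentable part down to
	 * a multiple of 8 (fragment offsets are expressed in 8-octet
	 * units) and reserve room for the fragment header itself.
	 */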
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);

	if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
		if (inet->cork.length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
			ipv6_local_error(sk, EMSGSIZE, fl, mtu - exthdrlen);
			return -EMSGSIZE;
		}
	}

	/*
	 * Let's try using as much space as possible.
	 * Use MTU if total length of the message fits into the MTU.
	 * Otherwise, we need to reserve fragment header and
	 * fragment alignment (= 8-15 octets, in total).
	 *
	 * Note that we may need to "move" the data from the tail
	 * of the buffer to the new fragment when we split
	 * the message.
	 *
	 * FIXME: It may be fragmented into multiple chunks
	 *        at once if non-fragmentable extension headers
	 *        are too large.
	 * --yoshfuji
	 */

	inet->cork.length += length;
	if (((length > mtu) && (sk->sk_protocol == IPPROTO_UDP)) &&
	    (rt->u.dst.dev->features & NETIF_F_UFO)) {

		err = ip6_ufo_append_data(sk, getfrag, from, length, hh_len,
					  fragheaderlen, transhdrlen, mtu,
					  flags);
		if (err)
			goto error;
		return 0;
	}

	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into current packet. */
		copy = (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;

		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen;
			struct sk_buff *skb_prev;
alloc_new_skb:
			skb_prev = skb;

			/* There's no room in the current skb */
			if (skb_prev)
				fraggap = skb_prev->len - maxfraglen;
			else
				fraggap = 0;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;
			if (datalen > (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
				datalen = maxfraglen - fragheaderlen;

			fraglen = datalen + fragheaderlen;
			if ((flags & MSG_MORE) &&
			    !(rt->u.dst.dev->features & NETIF_F_SG))
				alloclen = mtu;
			else
				alloclen = datalen + fragheaderlen;

			/*
			 * The last fragment gets additional space at tail.
			 * Note: we overallocate on fragments with MSG_MORE
			 * because we have no idea if we're the last one.
			 */
			if (datalen == length + fraggap)
				alloclen += rt->u.dst.trailer_len;

			/*
			 * We just reserve space for fragment header.
			 * Note: this may be overallocation if the message
			 * (without MSG_MORE) fits into the MTU.
			 */
			alloclen += sizeof(struct frag_hdr);

			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk,
						alloclen + hh_len,
						(flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				if (atomic_read(&sk->sk_wmem_alloc) <=
				    2 * sk->sk_sndbuf)
					skb = sock_wmalloc(sk,
							   alloclen + hh_len, 1,
							   sk->sk_allocation);
				if (unlikely(skb == NULL))
					err = -ENOBUFS;
			}
			if (skb == NULL)
				goto error;
			/*
			 *	Fill in the control structures
			 */
			skb->ip_summed = csummode;
			skb->csum = 0;
			/* reserve for fragmentation */
			skb_reserve(skb, hh_len + sizeof(struct frag_hdr));

			/*
			 *	Find where to start putting bytes
			 */
			data = skb_put(skb, fraglen);
			skb_set_network_header(skb, exthdrlen);
			data += fragheaderlen;
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}
			copy = datalen - transhdrlen - fraggap;
			if (copy < 0) {
				err = -EINVAL;
				kfree_skb(skb);
				goto error;
			} else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= datalen - fraggap;
			transhdrlen = 0;
			exthdrlen = 0;
			csummode = CHECKSUM_NONE;

			/*
			 * Put the packet on the pending queue
			 */
			__skb_queue_tail(&sk->sk_write_queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		if (!(rt->u.dst.dev->features & NETIF_F_SG)) {
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
						offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else {
			int i = skb_shinfo(skb)->nr_frags;
			skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
			struct page *page = sk->sk_sndmsg_page;
			int off = sk->sk_sndmsg_off;
			unsigned int left;

			if (page && (left = PAGE_SIZE - off) > 0) {
				if (copy >= left)
					copy = left;
				if (page != frag->page) {
					if (i == MAX_SKB_FRAGS) {
						err = -EMSGSIZE;
						goto error;
					}
					get_page(page);
					skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
					frag = &skb_shinfo(skb)->frags[i];
				}
			} else if (i < MAX_SKB_FRAGS) {
				if (copy > PAGE_SIZE)
					copy = PAGE_SIZE;
				page = alloc_pages(sk->sk_allocation, 0);
				if (page == NULL) {
					err = -ENOMEM;
					goto error;
				}
				sk->sk_sndmsg_page = page;
				sk->sk_sndmsg_off = 0;

				skb_fill_page_desc(skb, i, page, 0, 0);
				frag = &skb_shinfo(skb)->frags[i];
			} else {
				err = -EMSGSIZE;
				goto error;
			}
			if (getfrag(from, page_address(frag->page) + frag->page_offset + frag->size, offset, copy, skb->len, skb) < 0) {
				err = -EFAULT;
				goto error;
			}
			sk->sk_sndmsg_off += copy;
			frag->size += copy;
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			atomic_add(copy, &sk->sk_wmem_alloc);
		}
		offset += copy;
		length -= copy;
	}
	return 0;
error:
	inet->cork.length -= length;
	IP6_INC_STATS(rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	return err;
}
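
/*
 * Illustrative sketch (not part of the original file): datagram
 * senders use ip6_append_data() and ip6_push_pending_frames() as a
 * pair; data may be queued over several calls (corking/MSG_MORE) and
 * goes out as one datagram when pushed. Roughly (locking and error
 * handling reduced to the essentials; the wrapper is an assumption):
 */
#if 0
static int example_sendmsg(struct sock *sk, struct iovec *iov, int len,
			   int hlimit, int tclass,
			   struct ipv6_txoptions *opt, struct flowi *fl,
			   struct rt6_info *rt)
{
	int err;

	lock_sock(sk);
	err = ip6_append_data(sk, ip_generic_getfrag, iov, len,
			      0 /* transhdrlen: no transport header here */,
			      hlimit, tclass, opt, fl, rt, 0 /* flags */);
	if (err)
		ip6_flush_pending_frames(sk);
	else
		err = ip6_push_pending_frames(sk);
	release_sock(sk);

	return err;
}
#endif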

static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np)
{
	inet->cork.flags &= ~IPCORK_OPT;
	kfree(np->cork.opt);
	np->cork.opt = NULL;
	if (inet->cork.dst) {
		dst_release(inet->cork.dst);
		inet->cork.dst = NULL;
		inet->cork.flags &= ~IPCORK_ALLFRAG;
	}
	memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
}

int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct ipv6hdr *hdr;
	struct ipv6_txoptions *opt = np->cork.opt;
	struct rt6_info *rt = (struct rt6_info *)inet->cork.dst;
	struct flowi *fl = &inet->cork.fl;
	unsigned char proto = fl->proto;
	int err = 0;

	if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		__sock_put(tmp_skb->sk);
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Allow local fragmentation. */
	if (np->pmtudisc < IPV6_PMTUDISC_DO)
		skb->local_df = 1;

	ipv6_addr_copy(final_dst, &fl->fl6_dst);
	__skb_pull(skb, skb_network_header_len(skb));
	if (opt && opt->opt_flen)
		ipv6_push_frag_opts(skb, opt, &proto);
	if (opt && opt->opt_nflen)
		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	*(__be32 *)hdr = fl->fl6_flowlabel |
		     htonl(0x60000000 | ((int)np->cork.tclass << 20));

	hdr->hop_limit = np->cork.hop_limit;
	hdr->nexthdr = proto;
	ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
	ipv6_addr_copy(&hdr->daddr, final_dst);

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	skb->dst = dst_clone(&rt->u.dst);
	IP6_INC_STATS(rt->rt6i_idev, IPSTATS_MIB_OUTREQUESTS);
	if (proto == IPPROTO_ICMPV6) {
		struct inet6_dev *idev = ip6_dst_idev(skb->dst);

		ICMP6MSGOUT_INC_STATS_BH(idev, icmp6_hdr(skb)->icmp6_type);
		ICMP6_INC_STATS_BH(idev, ICMP6_MIB_OUTMSGS);
	}

	err = ip6_local_out(skb);
	if (err) {
		if (err > 0)
			err = np->recverr ? net_xmit_errno(err) : 0;
		if (err)
			goto error;
	}

out:
	ip6_cork_release(inet, np);
	return err;
error:
	goto out;
}

void ip6_flush_pending_frames(struct sock *sk)
{
	struct sk_buff *skb;

	while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
		if (skb->dst)
			IP6_INC_STATS(ip6_dst_idev(skb->dst),
				      IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
	}

	ip6_cork_release(inet_sk(sk), inet6_sk(sk));
}
1501