xref: /openbmc/linux/net/ipv6/ip6_output.c (revision 81d67439)
/*
 *	IPv6 output functions
 *	Linux INET6 implementation
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	Based on linux/net/ipv4/ip_output.c
 *
 *	This program is free software; you can redistribute it and/or
 *      modify it under the terms of the GNU General Public License
 *      as published by the Free Software Foundation; either version
 *      2 of the License, or (at your option) any later version.
 *
 *	Changes:
 *	A.N.Kuznetsov	:	arithmetic in fragmentation.
 *				extension headers are implemented.
 *				route changes now work.
 *				ip6_forward does not confuse sniffers.
 *				etc.
 *
 *      H. von Brand    :       Added missing #include <linux/string.h>
 *	Imran Patel	:	frag id should be in NBO
 *      Kazunori MIYAZAWA @USAGI
 *			:       add ip6_append_data and related functions
 *				for datagram xmit
 */

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/in6.h>
#include <linux/tcp.h>
#include <linux/route.h>
#include <linux/module.h>
#include <linux/slab.h>

#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>

#include <net/sock.h>
#include <net/snmp.h>

#include <net/ipv6.h>
#include <net/ndisc.h>
#include <net/protocol.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
#include <net/rawv6.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/checksum.h>
#include <linux/mroute6.h>

int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));

int __ip6_local_out(struct sk_buff *skb)
{
	int len;

	len = skb->len - sizeof(struct ipv6hdr);
	if (len > IPV6_MAXPLEN)
		len = 0;
	ipv6_hdr(skb)->payload_len = htons(len);

	return nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
		       skb_dst(skb)->dev, dst_output);
}

int ip6_local_out(struct sk_buff *skb)
{
	int err;

	err = __ip6_local_out(skb);
	if (likely(err == 1))
		err = dst_output(skb);

	return err;
}
EXPORT_SYMBOL_GPL(ip6_local_out);
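
/*
 * Note on the return convention above: nf_hook() returns 1 when no
 * NF_INET_LOCAL_OUT hook stole or dropped the packet, which is why
 * ip6_local_out() only continues to dst_output() on err == 1.  Also
 * note that __ip6_local_out() writes a payload_len of 0 when the
 * payload exceeds IPV6_MAXPLEN; presumably callers building such
 * oversized packets are expected to carry a Jumbo Payload option
 * (RFC 2675) instead of a meaningful payload_len.
 */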

/* dev_loopback_xmit for use with netfilter. */
static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
{
	skb_reset_mac_header(newskb);
	__skb_pull(newskb, skb_network_offset(newskb));
	newskb->pkt_type = PACKET_LOOPBACK;
	newskb->ip_summed = CHECKSUM_UNNECESSARY;
	WARN_ON(!skb_dst(newskb));

	netif_rx_ni(newskb);
	return 0;
}

static int ip6_finish_output2(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct net_device *dev = dst->dev;
	struct neighbour *neigh;

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(skb->sk) &&
		    ((mroute6_socket(dev_net(dev), skb) &&
		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
		     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
					 &ipv6_hdr(skb)->saddr))) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

			/* Do not check for IFF_ALLMULTI; multicast routing
			   is not supported in any case.
			 */
			if (newskb)
				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
					newskb, NULL, newskb->dev,
					ip6_dev_loopback_xmit);

			if (ipv6_hdr(skb)->hop_limit == 0) {
				IP6_INC_STATS(dev_net(dev), idev,
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return 0;
			}
		}

		IP6_UPD_PO_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCAST,
				skb->len);
	}

	neigh = dst_get_neighbour(dst);
	if (neigh)
		return neigh_output(neigh, skb);

	IP6_INC_STATS_BH(dev_net(dst->dev),
			 ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
	kfree_skb(skb);
	return -EINVAL;
}

static int ip6_finish_output(struct sk_buff *skb)
{
	if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
	    dst_allfrag(skb_dst(skb)))
		return ip6_fragment(skb, ip6_finish_output2);
	else
		return ip6_finish_output2(skb);
}
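
/*
 * The decision above: fragment when the packet exceeds the path MTU and
 * is not a GSO packet (GSO packets are segmented to size further down
 * the stack), or when dst_allfrag() is set on the route - typically
 * because a Packet Too Big below the IPv6 minimum MTU was received, in
 * which case every packet must carry a fragment header (RFC 2460,
 * sec. 5).
 */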

int ip6_output(struct sk_buff *skb)
{
	struct net_device *dev = skb_dst(skb)->dev;
	struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
	if (unlikely(idev->cnf.disable_ipv6)) {
		IP6_INC_STATS(dev_net(dev), idev,
			      IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
		return 0;
	}

	return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, skb, NULL, dev,
			    ip6_finish_output,
			    !(IP6CB(skb)->flags & IP6SKB_REROUTED));
}
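
/*
 * NF_HOOK_COND only traverses NF_INET_POST_ROUTING when the condition
 * is true; packets flagged IP6SKB_REROUTED have already been through
 * the hook once (e.g. after an xfrm transformation rerouted them) and
 * go straight to ip6_finish_output() instead of being seen twice by
 * netfilter.
 */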

/*
 *	xmit an sk_buff (used by TCP, SCTP and DCCP)
 */

int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
	     struct ipv6_txoptions *opt)
{
	struct net *net = sock_net(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct in6_addr *first_hop = &fl6->daddr;
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr;
	u8  proto = fl6->flowi6_proto;
	int seg_len = skb->len;
	int hlimit = -1;
	int tclass = 0;
	u32 mtu;

	if (opt) {
		unsigned int head_room;

		/* First: exthdrs may take lots of space (~8K for now);
		   MAX_HEADER is not enough.
		 */
		head_room = opt->opt_nflen + opt->opt_flen;
		seg_len += head_room;
		head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);

		if (skb_headroom(skb) < head_room) {
			struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
			if (skb2 == NULL) {
				IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return -ENOBUFS;
			}
			kfree_skb(skb);
			skb = skb2;
			skb_set_owner_w(skb, sk);
		}
		if (opt->opt_flen)
			ipv6_push_frag_opts(skb, opt, &proto);
		if (opt->opt_nflen)
			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
	}

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	/*
	 *	Fill in the IPv6 header
	 */
	if (np) {
		tclass = np->tclass;
		hlimit = np->hop_limit;
	}
	if (hlimit < 0)
		hlimit = ip6_dst_hoplimit(dst);

	*(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl6->flowlabel;

	hdr->payload_len = htons(seg_len);
	hdr->nexthdr = proto;
	hdr->hop_limit = hlimit;

	ipv6_addr_copy(&hdr->saddr, &fl6->saddr);
	ipv6_addr_copy(&hdr->daddr, first_hop);

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	mtu = dst_mtu(dst);
	if ((skb->len <= mtu) || skb->local_df || skb_is_gso(skb)) {
		IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_OUT, skb->len);
		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
			       dst->dev, dst_output);
	}

	if (net_ratelimit())
		printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
	skb->dev = dst->dev;
	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return -EMSGSIZE;
}

EXPORT_SYMBOL(ip6_xmit);
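
/*
 * Usage sketch (illustrative only): a connection-oriented caller is
 * expected to resolve the route first and attach it to the skb before
 * calling ip6_xmit(), roughly:
 *
 *	struct flowi6 fl6 = { .flowi6_proto = IPPROTO_TCP };
 *	struct dst_entry *dst;
 *
 *	dst = ip6_dst_lookup_flow(sk, &fl6, NULL, false);
 *	if (IS_ERR(dst))
 *		return PTR_ERR(dst);
 *	skb_dst_set(skb, dst_clone(dst));
 *	err = ip6_xmit(sk, skb, &fl6, inet6_sk(sk)->opt);
 *
 * The flowi6 contents and the txoptions argument above are placeholder
 * values; real callers (TCP, DCCP, SCTP) fill them from their own
 * connection state.
 */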

/*
 *	To avoid extra problems ND packets are sent through this
 *	routine. It's code duplication, but I really want to avoid
 *	extra checks since ipv6_build_header is used by TCP (which
 *	is performance critical for us)
 */

int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
	       const struct in6_addr *saddr, const struct in6_addr *daddr,
	       int proto, int len)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct ipv6hdr *hdr;

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	skb_reset_network_header(skb);
	skb_put(skb, sizeof(struct ipv6hdr));
	hdr = ipv6_hdr(skb);

	*(__be32*)hdr = htonl(0x60000000);

	hdr->payload_len = htons(len);
	hdr->nexthdr = proto;
	hdr->hop_limit = np->hop_limit;

	ipv6_addr_copy(&hdr->saddr, saddr);
	ipv6_addr_copy(&hdr->daddr, daddr);

	return 0;
}

static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
	struct ip6_ra_chain *ra;
	struct sock *last = NULL;

	read_lock(&ip6_ra_lock);
	for (ra = ip6_ra_chain; ra; ra = ra->next) {
		struct sock *sk = ra->sk;
		if (sk && ra->sel == sel &&
		    (!sk->sk_bound_dev_if ||
		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
			if (last) {
				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
				if (skb2)
					rawv6_rcv(last, skb2);
			}
			last = sk;
		}
	}

	if (last) {
		rawv6_rcv(last, skb);
		read_unlock(&ip6_ra_lock);
		return 1;
	}
	read_unlock(&ip6_ra_lock);
	return 0;
}
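
/*
 * Delivery pattern above: every matching Router Alert socket except the
 * last gets its own clone of the skb, and the final match consumes the
 * original, so a return of 1 tells the caller the packet is gone.
 */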

static int ip6_forward_proxy_check(struct sk_buff *skb)
{
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	u8 nexthdr = hdr->nexthdr;
	int offset;

	if (ipv6_ext_hdr(nexthdr)) {
		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr);
		if (offset < 0)
			return 0;
	} else
		offset = sizeof(struct ipv6hdr);

	if (nexthdr == IPPROTO_ICMPV6) {
		struct icmp6hdr *icmp6;

		if (!pskb_may_pull(skb, (skb_network_header(skb) +
					 offset + 1 - skb->data)))
			return 0;

		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);

		switch (icmp6->icmp6_type) {
		case NDISC_ROUTER_SOLICITATION:
		case NDISC_ROUTER_ADVERTISEMENT:
		case NDISC_NEIGHBOUR_SOLICITATION:
		case NDISC_NEIGHBOUR_ADVERTISEMENT:
		case NDISC_REDIRECT:
			/* For unicast neighbor discovery messages destined
			 * to the proxied address, pass them to the input
			 * function.
			 */
			return 1;
		default:
			break;
		}
	}

	/*
	 * The proxying router can't forward traffic sent to a link-local
	 * address, so signal the sender and discard the packet. This
	 * behavior is clarified by the MIPv6 specification.
	 */
	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
		dst_link_failure(skb);
		return -1;
	}

	return 0;
}
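
/*
 * Return values of ip6_forward_proxy_check(): 1 means the packet is a
 * unicast neighbour discovery message for a proxied address and should
 * be handed to local input, 0 means forward it normally, and -1 means
 * the destination is link-local and the packet must be dropped.
 */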

static inline int ip6_forward_finish(struct sk_buff *skb)
{
	return dst_output(skb);
}

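/*
 * ip6_forward() below implements the forwarding path: it checks that
 * forwarding is enabled and that the packet was addressed to us at L2
 * (PACKET_HOST), hands Router Alert packets to interested raw sockets,
 * enforces the hop limit and the path MTU (IPv6 routers never fragment
 * in transit), optionally answers for proxied NDP targets, and finally
 * decrements hop_limit and re-queues the packet through the
 * NF_INET_FORWARD netfilter hook.
 */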
int ip6_forward(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	struct inet6_skb_parm *opt = IP6CB(skb);
	struct net *net = dev_net(dst->dev);
	struct neighbour *n;
	u32 mtu;

	if (net->ipv6.devconf_all->forwarding == 0)
		goto error;

	if (skb_warn_if_lro(skb))
		goto drop;

	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
		goto drop;
	}

	if (skb->pkt_type != PACKET_HOST)
		goto drop;

	skb_forward_csum(skb);

	/*
	 *	We DO NOT make any processing on
	 *	RA packets, pushing them to user level AS IS
	 *	without any WARRANTY that the application will be able
	 *	to interpret them. The reason is that we
	 *	cannot make anything clever here.
	 *
	 *	We are not an end node, so if the packet contains
	 *	AH/ESP we cannot do anything.
	 *	Defragmentation would also be a mistake; RA packets
	 *	cannot be fragmented, because there is no guarantee
	 *	that different fragments will go along one path. --ANK
	 */
	if (opt->ra) {
		u8 *ptr = skb_network_header(skb) + opt->ra;
		if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
			return 0;
	}

	/*
	 *	check and decrement hop limit
	 */
	if (hdr->hop_limit <= 1) {
		/* Force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
		IP6_INC_STATS_BH(net,
				 ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);

		kfree_skb(skb);
		return -ETIMEDOUT;
	}

	/* XXX: idev->cnf.proxy_ndp? */
	if (net->ipv6.devconf_all->proxy_ndp &&
	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
		int proxied = ip6_forward_proxy_check(skb);
		if (proxied > 0)
			return ip6_input(skb);
		else if (proxied < 0) {
			IP6_INC_STATS(net, ip6_dst_idev(dst),
				      IPSTATS_MIB_INDISCARDS);
			goto drop;
		}
	}

	if (!xfrm6_route_forward(skb)) {
		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
		goto drop;
	}
	dst = skb_dst(skb);

	/* The IPv6 specs say nothing about it, but it is clear that we
	   cannot send redirects for source-routed frames.
	   We also don't send redirects for frames decapsulated from IPsec.
	 */
	n = dst_get_neighbour(dst);
	if (skb->dev == dst->dev && n && opt->srcrt == 0 && !skb_sec_path(skb)) {
		struct in6_addr *target = NULL;
		struct rt6_info *rt;

		/*
		 *	incoming and outgoing devices are the same,
		 *	so send a redirect.
		 */

		rt = (struct rt6_info *) dst;
		if ((rt->rt6i_flags & RTF_GATEWAY))
			target = (struct in6_addr*)&n->primary_key;
		else
			target = &hdr->daddr;

		if (!rt->rt6i_peer)
			rt6_bind_peer(rt, 1);

		/* Limit redirects both by destination (here)
		   and by source (inside ndisc_send_redirect)
		 */
		if (inet_peer_xrlim_allow(rt->rt6i_peer, 1*HZ))
			ndisc_send_redirect(skb, n, target);
	} else {
		int addrtype = ipv6_addr_type(&hdr->saddr);

		/* This check is security critical. */
		if (addrtype == IPV6_ADDR_ANY ||
		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
			goto error;
		if (addrtype & IPV6_ADDR_LINKLOCAL) {
			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
				    ICMPV6_NOT_NEIGHBOUR, 0);
			goto error;
		}
	}

	mtu = dst_mtu(dst);
	if (mtu < IPV6_MIN_MTU)
		mtu = IPV6_MIN_MTU;

	if (skb->len > mtu && !skb_is_gso(skb)) {
		/* Again, force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
		IP6_INC_STATS_BH(net,
				 ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS);
		IP6_INC_STATS_BH(net,
				 ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	if (skb_cow(skb, dst->dev->hard_header_len)) {
		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
		goto drop;
	}

	hdr = ipv6_hdr(skb);

	/* Mangling the hop limit is delayed until after the skb COW */

	hdr->hop_limit--;

	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
		       ip6_forward_finish);

error:
	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
drop:
	kfree_skb(skb);
	return -EINVAL;
}

static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	skb_dst_drop(to);
	skb_dst_set(to, dst_clone(skb_dst(from)));
	to->dev = from->dev;
	to->mark = from->mark;

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
	nf_copy(to, from);
#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
    defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
	to->nf_trace = from->nf_trace;
#endif
	skb_copy_secmark(to, from);
}

int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
{
	u16 offset = sizeof(struct ipv6hdr);
	struct ipv6_opt_hdr *exthdr =
				(struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1);
	unsigned int packet_len = skb->tail - skb->network_header;
	int found_rhdr = 0;
	*nexthdr = &ipv6_hdr(skb)->nexthdr;

	while (offset + 1 <= packet_len) {

		switch (**nexthdr) {

		case NEXTHDR_HOP:
			break;
		case NEXTHDR_ROUTING:
			found_rhdr = 1;
			break;
		case NEXTHDR_DEST:
#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
			if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0)
				break;
#endif
			if (found_rhdr)
				return offset;
			break;
		default:
			return offset;
		}

		offset += ipv6_optlen(exthdr);
		*nexthdr = &exthdr->nexthdr;
		exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) +
						 offset);
	}

	return offset;
}
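
/*
 * ip6_find_1stfragopt() returns the length of the "unfragmentable part"
 * of the packet (RFC 2460, sec. 4.5): hop-by-hop and routing headers
 * always stay in front of the fragment header, a destination options
 * header stays there until a routing header has been seen (or when it
 * carries a Mobile IPv6 home address option), and any other header type
 * ends the walk.  *nexthdr is left pointing at the nexthdr byte into
 * which NEXTHDR_FRAGMENT must be spliced.
 */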

void ipv6_select_ident(struct frag_hdr *fhdr, struct rt6_info *rt)
{
	static atomic_t ipv6_fragmentation_id;
	int old, new;

	if (rt) {
		struct inet_peer *peer;

		if (!rt->rt6i_peer)
			rt6_bind_peer(rt, 1);
		peer = rt->rt6i_peer;
		if (peer) {
			fhdr->identification = htonl(inet_getid(peer, 0));
			return;
		}
	}
	do {
		old = atomic_read(&ipv6_fragmentation_id);
		new = old + 1;
		if (!new)
			new = 1;
	} while (atomic_cmpxchg(&ipv6_fragmentation_id, old, new) != old);
	fhdr->identification = htonl(new);
}
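
/*
 * ip6_fragment() below uses two strategies.  The fast path applies when
 * the data already sits in a frag_list of well-sized, unshared skbs:
 * the fragment header is spliced into each piece in place and no
 * payload is copied.  Otherwise the slow path allocates a fresh skb per
 * fragment and copies the payload out with skb_copy_bits(), keeping
 * every fragment but the last a multiple of 8 bytes as required by the
 * fragment offset encoding.
 */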

int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
{
	struct sk_buff *frag;
	struct rt6_info *rt = (struct rt6_info*)skb_dst(skb);
	struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
	struct ipv6hdr *tmp_hdr;
	struct frag_hdr *fh;
	unsigned int mtu, hlen, left, len;
	__be32 frag_id = 0;
	int ptr, offset = 0, err = 0;
	u8 *prevhdr, nexthdr = 0;
	struct net *net = dev_net(skb_dst(skb)->dev);

	hlen = ip6_find_1stfragopt(skb, &prevhdr);
	nexthdr = *prevhdr;

	mtu = ip6_skb_dst_mtu(skb);

	/* We must not fragment if the socket is set to force MTU discovery
	 * or if the skb was not generated by a local socket.
	 */
	if (!skb->local_df && skb->len > mtu) {
		skb->dev = skb_dst(skb)->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	if (np && np->frag_size < mtu) {
		if (np->frag_size)
			mtu = np->frag_size;
	}
	mtu -= hlen + sizeof(struct frag_hdr);

	if (skb_has_frag_list(skb)) {
		int first_len = skb_pagelen(skb);
		struct sk_buff *frag2;

		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    skb_cloned(skb))
			goto slow_path;

		skb_walk_frags(skb, frag) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < hlen)
				goto slow_path_clean;

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path_clean;

			BUG_ON(frag->sk);
			if (skb->sk) {
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
			}
			skb->truesize -= frag->truesize;
		}

		err = 0;
		offset = 0;
		frag = skb_shinfo(skb)->frag_list;
		skb_frag_list_init(skb);
		/* BUILD HEADER */

		*prevhdr = NEXTHDR_FRAGMENT;
		tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
		if (!tmp_hdr) {
			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_FRAGFAILS);
			return -ENOMEM;
		}

		__skb_pull(skb, hlen);
		fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr));
		__skb_push(skb, hlen);
		skb_reset_network_header(skb);
		memcpy(skb_network_header(skb), tmp_hdr, hlen);

		ipv6_select_ident(fh, rt);
		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		fh->frag_off = htons(IP6_MF);
		frag_id = fh->identification;

		first_len = skb_pagelen(skb);
		skb->data_len = first_len - skb_headlen(skb);
		skb->len = first_len;
		ipv6_hdr(skb)->payload_len = htons(first_len -
						   sizeof(struct ipv6hdr));

		dst_hold(&rt->dst);

		for (;;) {
			/* Prepare header of the next frame,
			 * before previous one went down. */
			if (frag) {
				frag->ip_summed = CHECKSUM_NONE;
				skb_reset_transport_header(frag);
				fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr));
				__skb_push(frag, hlen);
				skb_reset_network_header(frag);
				memcpy(skb_network_header(frag), tmp_hdr,
				       hlen);
				offset += skb->len - hlen - sizeof(struct frag_hdr);
				fh->nexthdr = nexthdr;
				fh->reserved = 0;
				fh->frag_off = htons(offset);
				if (frag->next != NULL)
					fh->frag_off |= htons(IP6_MF);
				fh->identification = frag_id;
				ipv6_hdr(frag)->payload_len =
						htons(frag->len -
						      sizeof(struct ipv6hdr));
				ip6_copy_metadata(frag, skb);
			}

			err = output(skb);
			if (!err)
				IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
					      IPSTATS_MIB_FRAGCREATES);

			if (err || !frag)
				break;

			skb = frag;
			frag = skb->next;
			skb->next = NULL;
		}

		kfree(tmp_hdr);

		if (err == 0) {
			IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
				      IPSTATS_MIB_FRAGOKS);
			dst_release(&rt->dst);
			return 0;
		}

		while (frag) {
			skb = frag->next;
			kfree_skb(frag);
			frag = skb;
		}

		IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
			      IPSTATS_MIB_FRAGFAILS);
		dst_release(&rt->dst);
		return err;

slow_path_clean:
		skb_walk_frags(skb, frag2) {
			if (frag2 == frag)
				break;
			frag2->sk = NULL;
			frag2->destructor = NULL;
			skb->truesize += frag2->truesize;
		}
	}

slow_path:
	left = skb->len - hlen;		/* Space per frame */
	ptr = hlen;			/* Where to start from */

	/*
	 *	Fragment the datagram.
	 */

	*prevhdr = NEXTHDR_FRAGMENT;

	/*
	 *	Keep copying data until we run out.
	 */
	while (left > 0) {
		len = left;
		/* IF: it doesn't fit, use 'mtu' - the data space left */
		if (len > mtu)
			len = mtu;
		/* IF: we are not sending up to and including the packet end
		   then align the next start on an eight byte boundary */
		if (len < left) {
			len &= ~7;
		}
		/*
		 *	Allocate buffer.
		 */

		if ((frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) + LL_ALLOCATED_SPACE(rt->dst.dev), GFP_ATOMIC)) == NULL) {
			NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_FRAGFAILS);
			err = -ENOMEM;
			goto fail;
		}

		/*
		 *	Set up data on packet
		 */

		ip6_copy_metadata(frag, skb);
		skb_reserve(frag, LL_RESERVED_SPACE(rt->dst.dev));
		skb_put(frag, len + hlen + sizeof(struct frag_hdr));
		skb_reset_network_header(frag);
		fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
		frag->transport_header = (frag->network_header + hlen +
					  sizeof(struct frag_hdr));

		/*
		 *	Charge the memory for the fragment to any owner
		 *	it might possess
		 */
		if (skb->sk)
			skb_set_owner_w(frag, skb->sk);

		/*
		 *	Copy the packet header into the new buffer.
		 */
		skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);

		/*
		 *	Build fragment header.
		 */
		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		if (!frag_id) {
			ipv6_select_ident(fh, rt);
			frag_id = fh->identification;
		} else
			fh->identification = frag_id;

		/*
		 *	Copy a block of the IP datagram.
		 */
		if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len))
			BUG();
		left -= len;

		fh->frag_off = htons(offset);
		if (left > 0)
			fh->frag_off |= htons(IP6_MF);
		ipv6_hdr(frag)->payload_len = htons(frag->len -
						    sizeof(struct ipv6hdr));

		ptr += len;
		offset += len;

		/*
		 *	Put this fragment into the sending queue.
		 */
		err = output(frag);
		if (err)
			goto fail;

		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_FRAGCREATES);
	}
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGOKS);
	kfree_skb(skb);
	return err;

fail:
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return err;
}

static inline int ip6_rt_check(const struct rt6key *rt_key,
			       const struct in6_addr *fl_addr,
			       const struct in6_addr *addr_cache)
{
	return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
		(addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache));
}

static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
					  struct dst_entry *dst,
					  const struct flowi6 *fl6)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct rt6_info *rt = (struct rt6_info *)dst;

	if (!dst)
		goto out;

	/* Yes, checking route validity in the not connected
	 * case is not very simple. Take into account
	 * that we do not support routing by source, TOS,
	 * and MSG_DONTROUTE		--ANK (980726)
	 *
	 * 1. ip6_rt_check(): If the route was a host route,
	 *    check that the cached destination is current.
	 *    If it is a network route, we still may
	 *    check its validity using the saved pointer
	 *    to the last used address: daddr_cache.
	 *    We do not want to save the whole address now
	 *    (because the main consumer of this service
	 *    is tcp, which does not have this problem),
	 *    so this last trick works only on connected
	 *    sockets.
	 * 2. oif also should be the same.
	 */
	if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
#ifdef CONFIG_IPV6_SUBTREES
	    ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
#endif
	    (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) {
		dst_release(dst);
		dst = NULL;
	}

out:
	return dst;
}

static int ip6_dst_lookup_tail(struct sock *sk,
			       struct dst_entry **dst, struct flowi6 *fl6)
{
	struct net *net = sock_net(sk);
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	struct neighbour *n;
#endif
	int err;

	if (*dst == NULL)
		*dst = ip6_route_output(net, sk, fl6);

	if ((err = (*dst)->error))
		goto out_err_release;

	if (ipv6_addr_any(&fl6->saddr)) {
		struct rt6_info *rt = (struct rt6_info *) *dst;
		err = ip6_route_get_saddr(net, rt, &fl6->daddr,
					  sk ? inet6_sk(sk)->srcprefs : 0,
					  &fl6->saddr);
		if (err)
			goto out_err_release;
	}

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	/*
	 * Here if the dst entry we've looked up
	 * has a neighbour entry that is in the INCOMPLETE
	 * state and the src address from the flow is
	 * marked as OPTIMISTIC, we release the found
	 * dst entry and replace it instead with the
	 * dst entry of the nexthop router
	 */
	n = dst_get_neighbour(*dst);
	if (n && !(n->nud_state & NUD_VALID)) {
		struct inet6_ifaddr *ifp;
		struct flowi6 fl_gw6;
		int redirect;

		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
				      (*dst)->dev, 1);

		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
		if (ifp)
			in6_ifa_put(ifp);

		if (redirect) {
			/*
			 * We need to get the dst entry for the
			 * default router instead
			 */
			dst_release(*dst);
			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
			*dst = ip6_route_output(net, sk, &fl_gw6);
			if ((err = (*dst)->error))
				goto out_err_release;
		}
	}
#endif

	return 0;

out_err_release:
	if (err == -ENETUNREACH)
		IP6_INC_STATS_BH(net, NULL, IPSTATS_MIB_OUTNOROUTES);
	dst_release(*dst);
	*dst = NULL;
	return err;
}

/**
 *	ip6_dst_lookup - perform route lookup on flow
 *	@sk: socket which provides route info
 *	@dst: pointer to dst_entry * for result
 *	@fl6: flow to lookup
 *
 *	This function performs a route lookup on the given flow.
 *
 *	It returns zero on success, or a standard errno code on error.
 */
int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi6 *fl6)
{
	*dst = NULL;
	return ip6_dst_lookup_tail(sk, dst, fl6);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup);

/**
 *	ip6_dst_lookup_flow - perform route lookup on flow with ipsec
 *	@sk: socket which provides route info
 *	@fl6: flow to lookup
 *	@final_dst: final destination address for ipsec lookup
 *	@can_sleep: we are in a sleepable context
 *
 *	This function performs a route lookup on the given flow.
 *
 *	It returns a valid dst pointer on success, or a pointer encoded
 *	error code.
 */
struct dst_entry *ip6_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
				      const struct in6_addr *final_dst,
				      bool can_sleep)
{
	struct dst_entry *dst = NULL;
	int err;

	err = ip6_dst_lookup_tail(sk, &dst, fl6);
	if (err)
		return ERR_PTR(err);
	if (final_dst)
		ipv6_addr_copy(&fl6->daddr, final_dst);
	if (can_sleep)
		fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;

	return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
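
/*
 * Unlike ip6_dst_lookup(), the _flow variants return the dst itself
 * (possibly an xfrm bundle) rather than an errno, so callers check the
 * result with IS_ERR()/PTR_ERR(), e.g.:
 *
 *	dst = ip6_dst_lookup_flow(sk, &fl6, &final, false);
 *	if (IS_ERR(dst))
 *		return PTR_ERR(dst);
 */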

/**
 *	ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
 *	@sk: socket which provides the dst cache and route info
 *	@fl6: flow to lookup
 *	@final_dst: final destination address for ipsec lookup
 *	@can_sleep: we are in a sleepable context
 *
 *	This function performs a route lookup on the given flow with the
 *	possibility of using the cached route in the socket if it is valid.
 *	It will take the socket dst lock when operating on the dst cache.
 *	As a result, this function can only be used in process context.
 *
 *	It returns a valid dst pointer on success, or a pointer encoded
 *	error code.
 */
struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
					 const struct in6_addr *final_dst,
					 bool can_sleep)
{
	struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
	int err;

	dst = ip6_sk_dst_check(sk, dst, fl6);

	err = ip6_dst_lookup_tail(sk, &dst, fl6);
	if (err)
		return ERR_PTR(err);
	if (final_dst)
		ipv6_addr_copy(&fl6->daddr, final_dst);
	if (can_sleep)
		fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;

	return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
}
EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);

static inline int ip6_ufo_append_data(struct sock *sk,
			int getfrag(void *from, char *to, int offset, int len,
			int odd, struct sk_buff *skb),
			void *from, int length, int hh_len, int fragheaderlen,
			int transhdrlen, int mtu, unsigned int flags,
			struct rt6_info *rt)

{
	struct sk_buff *skb;
	int err;

	/* There is support for UDP large send offload by network
	 * device, so create one single skb packet containing the
	 * complete udp datagram
	 */
	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
		skb = sock_alloc_send_skb(sk,
			hh_len + fragheaderlen + transhdrlen + 20,
			(flags & MSG_DONTWAIT), &err);
		if (skb == NULL)
			return -ENOMEM;

		/* reserve space for Hardware header */
		skb_reserve(skb, hh_len);

		/* create space for UDP/IP header */
		skb_put(skb, fragheaderlen + transhdrlen);

		/* initialize network header pointer */
		skb_reset_network_header(skb);

		/* initialize protocol header pointer */
		skb->transport_header = skb->network_header + fragheaderlen;

		skb->ip_summed = CHECKSUM_PARTIAL;
		skb->csum = 0;
	}

	err = skb_append_datato_frags(sk, skb, getfrag, from,
				      (length - transhdrlen));
	if (!err) {
		struct frag_hdr fhdr;

		/* Specify the length of each IPv6 datagram fragment.
		 * It has to be a multiple of 8.
		 */
		skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
					     sizeof(struct frag_hdr)) & ~7;
		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
		ipv6_select_ident(&fhdr, rt);
		skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
		__skb_queue_tail(&sk->sk_write_queue, skb);

		return 0;
	}
	/* There is not enough support to do UDP LSO,
	 * so follow the normal path
	 */
	kfree_skb(skb);

	return err;
}

static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
					       gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
						gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

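/*
 * ip6_append_data() below implements the corking model shared with
 * IPv4: the first call on an empty sk_write_queue captures the route,
 * flow, options and MTU into the cork, and each call then appends user
 * data to the queued skbs, starting a new skb whenever the current one
 * reaches maxfraglen.  Nothing is transmitted here; the queue is later
 * turned into one packet and sent by ip6_push_pending_frames(), or
 * discarded by ip6_flush_pending_frames().
 */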
int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
	int offset, int len, int odd, struct sk_buff *skb),
	void *from, int length, int transhdrlen,
	int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi6 *fl6,
	struct rt6_info *rt, unsigned int flags, int dontfrag)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct inet_cork *cork;
	struct sk_buff *skb;
	unsigned int maxfraglen, fragheaderlen;
	int exthdrlen;
	int hh_len;
	int mtu;
	int copy;
	int err;
	int offset = 0;
	int csummode = CHECKSUM_NONE;
	__u8 tx_flags = 0;

	if (flags & MSG_PROBE)
		return 0;
	cork = &inet->cork.base;
	if (skb_queue_empty(&sk->sk_write_queue)) {
		/*
		 * setup for corking
		 */
		if (opt) {
			if (WARN_ON(np->cork.opt))
				return -EINVAL;

			np->cork.opt = kmalloc(opt->tot_len, sk->sk_allocation);
			if (unlikely(np->cork.opt == NULL))
				return -ENOBUFS;

			np->cork.opt->tot_len = opt->tot_len;
			np->cork.opt->opt_flen = opt->opt_flen;
			np->cork.opt->opt_nflen = opt->opt_nflen;

			np->cork.opt->dst0opt = ip6_opt_dup(opt->dst0opt,
							    sk->sk_allocation);
			if (opt->dst0opt && !np->cork.opt->dst0opt)
				return -ENOBUFS;

			np->cork.opt->dst1opt = ip6_opt_dup(opt->dst1opt,
							    sk->sk_allocation);
			if (opt->dst1opt && !np->cork.opt->dst1opt)
				return -ENOBUFS;

			np->cork.opt->hopopt = ip6_opt_dup(opt->hopopt,
							   sk->sk_allocation);
			if (opt->hopopt && !np->cork.opt->hopopt)
				return -ENOBUFS;

			np->cork.opt->srcrt = ip6_rthdr_dup(opt->srcrt,
							    sk->sk_allocation);
			if (opt->srcrt && !np->cork.opt->srcrt)
				return -ENOBUFS;

			/* need source address above miyazawa */
		}
		dst_hold(&rt->dst);
		cork->dst = &rt->dst;
		inet->cork.fl.u.ip6 = *fl6;
		np->cork.hop_limit = hlimit;
		np->cork.tclass = tclass;
		mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
		      rt->dst.dev->mtu : dst_mtu(rt->dst.path);
		if (np->frag_size < mtu) {
			if (np->frag_size)
				mtu = np->frag_size;
		}
		cork->fragsize = mtu;
		if (dst_allfrag(rt->dst.path))
			cork->flags |= IPCORK_ALLFRAG;
		cork->length = 0;
		sk->sk_sndmsg_page = NULL;
		sk->sk_sndmsg_off = 0;
		exthdrlen = rt->dst.header_len + (opt ? opt->opt_flen : 0) -
			    rt->rt6i_nfheader_len;
		length += exthdrlen;
		transhdrlen += exthdrlen;
	} else {
		rt = (struct rt6_info *)cork->dst;
		fl6 = &inet->cork.fl.u.ip6;
		opt = np->cork.opt;
		transhdrlen = 0;
		exthdrlen = 0;
		mtu = cork->fragsize;
	}

	hh_len = LL_RESERVED_SPACE(rt->dst.dev);

	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
			(opt ? opt->opt_nflen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);

	if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
		if (cork->length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
			ipv6_local_error(sk, EMSGSIZE, fl6, mtu-exthdrlen);
			return -EMSGSIZE;
		}
	}

	/* For UDP, check if TX timestamp is enabled */
	if (sk->sk_type == SOCK_DGRAM) {
		err = sock_tx_timestamp(sk, &tx_flags);
		if (err)
			goto error;
	}

	/*
	 * Let's try using as much space as possible.
	 * Use MTU if total length of the message fits into the MTU.
	 * Otherwise, we need to reserve fragment header and
	 * fragment alignment (= 8-15 octets, in total).
	 *
	 * Note that we may need to "move" the data from the tail
	 * of the buffer to the new fragment when we split
	 * the message.
	 *
	 * FIXME: It may be fragmented into multiple chunks
	 *        at once if non-fragmentable extension headers
	 *        are too large.
	 * --yoshfuji
	 */

	cork->length += length;
	if (length > mtu) {
		int proto = sk->sk_protocol;
		if (dontfrag && (proto == IPPROTO_UDP || proto == IPPROTO_RAW)) {
			ipv6_local_rxpmtu(sk, fl6, mtu-exthdrlen);
			return -EMSGSIZE;
		}

		if (proto == IPPROTO_UDP &&
		    (rt->dst.dev->features & NETIF_F_UFO)) {

			err = ip6_ufo_append_data(sk, getfrag, from, length,
						  hh_len, fragheaderlen,
						  transhdrlen, mtu, flags, rt);
			if (err)
				goto error;
			return 0;
		}
	}

	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into current packet. */
		copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;

		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen;
			struct sk_buff *skb_prev;
alloc_new_skb:
			skb_prev = skb;

			/* There's no room in the current skb */
			if (skb_prev)
				fraggap = skb_prev->len - maxfraglen;
			else
				fraggap = 0;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;
			if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
				datalen = maxfraglen - fragheaderlen;

			fraglen = datalen + fragheaderlen;
			if ((flags & MSG_MORE) &&
			    !(rt->dst.dev->features & NETIF_F_SG))
				alloclen = mtu;
			else
				alloclen = datalen + fragheaderlen;

			/*
			 * The last fragment gets additional space at tail.
			 * Note: we overallocate on fragments with MSG_MORE
			 * because we have no idea if we're the last one.
			 */
			if (datalen == length + fraggap)
				alloclen += rt->dst.trailer_len;

			/*
			 * We just reserve space for fragment header.
			 * Note: this may be overallocation if the message
			 * (without MSG_MORE) fits into the MTU.
			 */
			alloclen += sizeof(struct frag_hdr);

			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk,
						alloclen + hh_len,
						(flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				if (atomic_read(&sk->sk_wmem_alloc) <=
				    2 * sk->sk_sndbuf)
					skb = sock_wmalloc(sk,
							   alloclen + hh_len, 1,
							   sk->sk_allocation);
				if (unlikely(skb == NULL))
					err = -ENOBUFS;
				else {
					/* Only the initial fragment
					 * is time stamped.
					 */
					tx_flags = 0;
				}
			}
			if (skb == NULL)
				goto error;
			/*
			 *	Fill in the control structures
			 */
			skb->ip_summed = csummode;
			skb->csum = 0;
			/* reserve for fragmentation */
			skb_reserve(skb, hh_len + sizeof(struct frag_hdr));

			if (sk->sk_type == SOCK_DGRAM)
				skb_shinfo(skb)->tx_flags = tx_flags;

			/*
			 *	Find where to start putting bytes
			 */
			data = skb_put(skb, fraglen);
			skb_set_network_header(skb, exthdrlen);
			data += fragheaderlen;
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}
			copy = datalen - transhdrlen - fraggap;
			if (copy < 0) {
				err = -EINVAL;
				kfree_skb(skb);
				goto error;
			} else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= datalen - fraggap;
			transhdrlen = 0;
			exthdrlen = 0;
			csummode = CHECKSUM_NONE;

			/*
			 * Put the packet on the pending queue
			 */
			__skb_queue_tail(&sk->sk_write_queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		if (!(rt->dst.dev->features & NETIF_F_SG)) {
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
						offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else {
			int i = skb_shinfo(skb)->nr_frags;
			skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
			struct page *page = sk->sk_sndmsg_page;
			int off = sk->sk_sndmsg_off;
			unsigned int left;

			if (page && (left = PAGE_SIZE - off) > 0) {
				if (copy >= left)
					copy = left;
				if (page != frag->page) {
					if (i == MAX_SKB_FRAGS) {
						err = -EMSGSIZE;
						goto error;
					}
					get_page(page);
					skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
					frag = &skb_shinfo(skb)->frags[i];
				}
			} else if (i < MAX_SKB_FRAGS) {
				if (copy > PAGE_SIZE)
					copy = PAGE_SIZE;
				page = alloc_pages(sk->sk_allocation, 0);
				if (page == NULL) {
					err = -ENOMEM;
					goto error;
				}
				sk->sk_sndmsg_page = page;
				sk->sk_sndmsg_off = 0;

				skb_fill_page_desc(skb, i, page, 0, 0);
				frag = &skb_shinfo(skb)->frags[i];
			} else {
				err = -EMSGSIZE;
				goto error;
			}
			if (getfrag(from, page_address(frag->page) + frag->page_offset + frag->size, offset, copy, skb->len, skb) < 0) {
				err = -EFAULT;
				goto error;
			}
			sk->sk_sndmsg_off += copy;
			frag->size += copy;
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			atomic_add(copy, &sk->sk_wmem_alloc);
		}
		offset += copy;
		length -= copy;
	}
	return 0;
error:
	cork->length -= length;
	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	return err;
}

static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np)
{
	if (np->cork.opt) {
		kfree(np->cork.opt->dst0opt);
		kfree(np->cork.opt->dst1opt);
		kfree(np->cork.opt->hopopt);
		kfree(np->cork.opt->srcrt);
		kfree(np->cork.opt);
		np->cork.opt = NULL;
	}

	if (inet->cork.base.dst) {
		dst_release(inet->cork.base.dst);
		inet->cork.base.dst = NULL;
		inet->cork.base.flags &= ~IPCORK_ALLFRAG;
	}
	memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
}

int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct net *net = sock_net(sk);
	struct ipv6hdr *hdr;
	struct ipv6_txoptions *opt = np->cork.opt;
	struct rt6_info *rt = (struct rt6_info *)inet->cork.base.dst;
	struct flowi6 *fl6 = &inet->cork.fl.u.ip6;
	unsigned char proto = fl6->flowi6_proto;
	int err = 0;

	if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Allow local fragmentation. */
	if (np->pmtudisc < IPV6_PMTUDISC_DO)
		skb->local_df = 1;

	ipv6_addr_copy(final_dst, &fl6->daddr);
	__skb_pull(skb, skb_network_header_len(skb));
	if (opt && opt->opt_flen)
		ipv6_push_frag_opts(skb, opt, &proto);
	if (opt && opt->opt_nflen)
		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	*(__be32*)hdr = fl6->flowlabel |
		     htonl(0x60000000 | ((int)np->cork.tclass << 20));

	hdr->hop_limit = np->cork.hop_limit;
	hdr->nexthdr = proto;
	ipv6_addr_copy(&hdr->saddr, &fl6->saddr);
	ipv6_addr_copy(&hdr->daddr, final_dst);

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	skb_dst_set(skb, dst_clone(&rt->dst));
	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
	if (proto == IPPROTO_ICMPV6) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		ICMP6MSGOUT_INC_STATS_BH(net, idev, icmp6_hdr(skb)->icmp6_type);
		ICMP6_INC_STATS_BH(net, idev, ICMP6_MIB_OUTMSGS);
	}

	err = ip6_local_out(skb);
	if (err) {
		if (err > 0)
			err = net_xmit_errno(err);
		if (err)
			goto error;
	}

out:
	ip6_cork_release(inet, np);
	return err;
error:
	IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	goto out;
}
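
/*
 * Usage sketch (illustrative only): a datagram sendmsg() path typically
 * pairs the calls like this, flushing the queue if appending fails:
 *
 *	err = ip6_append_data(sk, getfrag, msg->msg_iov, len, 0,
 *			      hlimit, tclass, opt, &fl6, rt,
 *			      msg->msg_flags, dontfrag);
 *	if (err)
 *		ip6_flush_pending_frames(sk);
 *	else if (!(msg->msg_flags & MSG_MORE))
 *		err = ip6_push_pending_frames(sk);
 *
 * The variable names above are placeholders; see udpv6_sendmsg() for a
 * real caller.
 */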

void ip6_flush_pending_frames(struct sock *sk)
{
	struct sk_buff *skb;

	while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
		if (skb_dst(skb))
			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
	}

	ip6_cork_release(inet_sk(sk), inet6_sk(sk));
}
1637