/*
 *	IPv6 output functions
 *	Linux INET6 implementation
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	Based on linux/net/ipv4/ip_output.c
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 *
 *	Changes:
 *	A.N.Kuznetsov	:	arithmetic in fragmentation.
 *				extension headers are implemented.
 *				route changes now work.
 *				ip6_forward does not confuse sniffers.
 *				etc.
 *
 *	H. von Brand	:	Added missing #include <linux/string.h>
 *	Imran Patel	:	frag id should be in NBO
 *	Kazunori MIYAZAWA @USAGI
 *			:	add ip6_append_data and related functions
 *				for datagram xmit
 */

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/in6.h>
#include <linux/tcp.h>
#include <linux/route.h>
#include <linux/module.h>
#include <linux/slab.h>

#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>

#include <net/sock.h>
#include <net/snmp.h>

#include <net/ipv6.h>
#include <net/ndisc.h>
#include <net/protocol.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
#include <net/rawv6.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/checksum.h>
#include <linux/mroute6.h>

int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));

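/*
 * Fill in the payload length and run the LOCAL_OUT netfilter hook.
 * The length field is zeroed when the payload exceeds IPV6_MAXPLEN:
 * jumbograms (RFC 2675) carry their real length in a Jumbo Payload
 * option instead.  A return value of 1 means the hook accepted the
 * packet, in which case ip6_local_out() continues with dst_output().
 */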
int __ip6_local_out(struct sk_buff *skb)
{
	int len;

	len = skb->len - sizeof(struct ipv6hdr);
	if (len > IPV6_MAXPLEN)
		len = 0;
	ipv6_hdr(skb)->payload_len = htons(len);

	return nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
		       skb_dst(skb)->dev, dst_output);
}

int ip6_local_out(struct sk_buff *skb)
{
	int err;

	err = __ip6_local_out(skb);
	if (likely(err == 1))
		err = dst_output(skb);

	return err;
}
EXPORT_SYMBOL_GPL(ip6_local_out);

/* dev_loopback_xmit for use with netfilter. */
static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
{
	skb_reset_mac_header(newskb);
	__skb_pull(newskb, skb_network_offset(newskb));
	newskb->pkt_type = PACKET_LOOPBACK;
	newskb->ip_summed = CHECKSUM_UNNECESSARY;
	WARN_ON(!skb_dst(newskb));

	netif_rx_ni(newskb);
	return 0;
}

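/*
 * Final step of the output path: hand the packet to the neighbour
 * layer for transmission.  Multicast packets that must also be heard
 * locally (a multicast router socket or a local group member) are
 * cloned and looped back through ip6_dev_loopback_xmit() first.
 */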
static int ip6_finish_output2(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct net_device *dev = dst->dev;
	struct neighbour *neigh;

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(skb->sk) &&
		    ((mroute6_socket(dev_net(dev), skb) &&
		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
		     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
					 &ipv6_hdr(skb)->saddr))) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

			/* Do not check for IFF_ALLMULTI; multicast routing
			 * is not supported in any case.
			 */
			if (newskb)
				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
					newskb, NULL, newskb->dev,
					ip6_dev_loopback_xmit);

			if (ipv6_hdr(skb)->hop_limit == 0) {
				IP6_INC_STATS(dev_net(dev), idev,
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return 0;
			}
		}

		IP6_UPD_PO_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCAST,
				skb->len);
	}

	rcu_read_lock();
	neigh = dst_get_neighbour_noref(dst);
	if (neigh) {
		int res = neigh_output(neigh, skb);

		rcu_read_unlock();
		return res;
	}
	rcu_read_unlock();
	IP6_INC_STATS_BH(dev_net(dst->dev),
			 ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
	kfree_skb(skb);
	return -EINVAL;
}

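/*
 * Fragment when the packet exceeds the path MTU and is not GSO, or
 * when the destination requires a fragment header on every packet
 * (dst_allfrag, set after a router reported a path MTU below the
 * IPv6 minimum of 1280 bytes, cf. RFC 2460 section 5).
 */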
static int ip6_finish_output(struct sk_buff *skb)
{
	if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
	    dst_allfrag(skb_dst(skb)))
		return ip6_fragment(skb, ip6_finish_output2);
	else
		return ip6_finish_output2(skb);
}

int ip6_output(struct sk_buff *skb)
{
	struct net_device *dev = skb_dst(skb)->dev;
	struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

	if (unlikely(idev->cnf.disable_ipv6)) {
		IP6_INC_STATS(dev_net(dev), idev,
			      IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
		return 0;
	}

	return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, skb, NULL, dev,
			    ip6_finish_output,
			    !(IP6CB(skb)->flags & IP6SKB_REROUTED));
}

/*
 *	xmit an sk_buff (used by TCP, SCTP and DCCP)
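 *
 *	The caller has already routed the skb.  This function prepends
 *	any extension headers given in @opt, builds the IPv6 header and
 *	runs the packet through the LOCAL_OUT netfilter hook.  Packets
 *	larger than the path MTU (unless local_df or GSO applies) are
 *	answered with an ICMPV6_PKT_TOOBIG error to the local sender
 *	and dropped with -EMSGSIZE.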
 */

int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
	     struct ipv6_txoptions *opt, int tclass)
{
	struct net *net = sock_net(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct in6_addr *first_hop = &fl6->daddr;
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr;
	u8  proto = fl6->flowi6_proto;
	int seg_len = skb->len;
	int hlimit = -1;
	u32 mtu;

	if (opt) {
		unsigned int head_room;

		/* First: the extension headers may take lots of space
		 * (~8K for now); MAX_HEADER is not enough.
		 */
		head_room = opt->opt_nflen + opt->opt_flen;
		seg_len += head_room;
		head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);

		if (skb_headroom(skb) < head_room) {
			struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
			if (skb2 == NULL) {
				IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return -ENOBUFS;
			}
			kfree_skb(skb);
			skb = skb2;
			skb_set_owner_w(skb, sk);
		}
		if (opt->opt_flen)
			ipv6_push_frag_opts(skb, opt, &proto);
		if (opt->opt_nflen)
			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
	}

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	/*
	 *	Fill in the IPv6 header
	 */
	if (np)
		hlimit = np->hop_limit;
	if (hlimit < 0)
		hlimit = ip6_dst_hoplimit(dst);

	*(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl6->flowlabel;

	hdr->payload_len = htons(seg_len);
	hdr->nexthdr = proto;
	hdr->hop_limit = hlimit;

	hdr->saddr = fl6->saddr;
	hdr->daddr = *first_hop;

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	mtu = dst_mtu(dst);
	if ((skb->len <= mtu) || skb->local_df || skb_is_gso(skb)) {
		IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_OUT, skb->len);
		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
			       dst->dev, dst_output);
	}

	if (net_ratelimit())
		printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
	skb->dev = dst->dev;
	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return -EMSGSIZE;
}
EXPORT_SYMBOL(ip6_xmit);

/*
 *	To avoid extra problems ND packets are sent through this
 *	routine. It's code duplication, but I really want to avoid
 *	extra checks, since ipv6_build_header is used by TCP (which
 *	is performance-critical for us)
 */

int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
	       const struct in6_addr *saddr, const struct in6_addr *daddr,
	       int proto, int len)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct ipv6hdr *hdr;

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	skb_reset_network_header(skb);
	skb_put(skb, sizeof(struct ipv6hdr));
	hdr = ipv6_hdr(skb);

	*(__be32 *)hdr = htonl(0x60000000);

	hdr->payload_len = htons(len);
	hdr->nexthdr = proto;
	hdr->hop_limit = np->hop_limit;

	hdr->saddr = *saddr;
	hdr->daddr = *daddr;

	return 0;
}

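/*
 * Deliver a packet carrying a Router Alert option to every raw socket
 * that registered a matching value with the IPV6_ROUTER_ALERT socket
 * option.  The last matching socket consumes the original skb, earlier
 * ones get clones.  Returns 1 if the packet was delivered this way.
 */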
static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
	struct ip6_ra_chain *ra;
	struct sock *last = NULL;

	read_lock(&ip6_ra_lock);
	for (ra = ip6_ra_chain; ra; ra = ra->next) {
		struct sock *sk = ra->sk;
		if (sk && ra->sel == sel &&
		    (!sk->sk_bound_dev_if ||
		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
			if (last) {
				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
				if (skb2)
					rawv6_rcv(last, skb2);
			}
			last = sk;
		}
	}

	if (last) {
		rawv6_rcv(last, skb);
		read_unlock(&ip6_ra_lock);
		return 1;
	}
	read_unlock(&ip6_ra_lock);
	return 0;
}

static int ip6_forward_proxy_check(struct sk_buff *skb)
{
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	u8 nexthdr = hdr->nexthdr;
	__be16 frag_off;
	int offset;

	if (ipv6_ext_hdr(nexthdr)) {
		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
		if (offset < 0)
			return 0;
	} else
		offset = sizeof(struct ipv6hdr);

	if (nexthdr == IPPROTO_ICMPV6) {
		struct icmp6hdr *icmp6;

		if (!pskb_may_pull(skb, (skb_network_header(skb) +
					 offset + 1 - skb->data)))
			return 0;

		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);

		switch (icmp6->icmp6_type) {
		case NDISC_ROUTER_SOLICITATION:
		case NDISC_ROUTER_ADVERTISEMENT:
		case NDISC_NEIGHBOUR_SOLICITATION:
		case NDISC_NEIGHBOUR_ADVERTISEMENT:
		case NDISC_REDIRECT:
			/* Pass a unicast neighbor discovery message
			 * destined to the proxied address to the input
			 * function.
			 */
			return 1;
		default:
			break;
		}
	}

	/*
	 * The proxying router can't forward traffic sent to a link-local
	 * address, so signal the sender and discard the packet. This
	 * behavior is clarified by the MIPv6 specification.
	 */
	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
		dst_link_failure(skb);
		return -1;
	}

	return 0;
}

static inline int ip6_forward_finish(struct sk_buff *skb)
{
	return dst_output(skb);
}

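/*
 * Forwarding path: validate the packet (forwarding enabled, no LRO,
 * xfrm forward policy, addressed to us at L2), divert Router Alert
 * packets and proxied neighbour discovery, enforce the hop limit and
 * the outgoing MTU, possibly emit a redirect, and finally decrement
 * hop_limit and run the FORWARD netfilter hook.
 */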
int ip6_forward(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	struct inet6_skb_parm *opt = IP6CB(skb);
	struct net *net = dev_net(dst->dev);
	u32 mtu;

	if (net->ipv6.devconf_all->forwarding == 0)
		goto error;

	if (skb_warn_if_lro(skb))
		goto drop;

	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
		goto drop;
	}

	if (skb->pkt_type != PACKET_HOST)
		goto drop;

	skb_forward_csum(skb);

	/*
	 *	We do NOT do any processing on Router Alert packets,
	 *	pushing them to user level AS IS without any warranty
	 *	that the application will be able to interpret them.
	 *	The reason is that we cannot do anything clever here:
	 *
	 *	We are not the end node, so if the packet contains
	 *	AH/ESP we cannot do anything with it.  Defragmentation
	 *	would also be a mistake; RA packets must not be
	 *	fragmented, because there is no warranty that different
	 *	fragments will travel along the same path. --ANK
	 */
	if (opt->ra) {
		u8 *ptr = skb_network_header(skb) + opt->ra;
		if (ip6_call_ra_chain(skb, (ptr[2] << 8) + ptr[3]))
			return 0;
	}

	/*
	 *	check and decrement hop limit
	 */
	if (hdr->hop_limit <= 1) {
		/* Force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
		IP6_INC_STATS_BH(net,
				 ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);

		kfree_skb(skb);
		return -ETIMEDOUT;
	}

	/* XXX: idev->cnf.proxy_ndp? */
	if (net->ipv6.devconf_all->proxy_ndp &&
	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
		int proxied = ip6_forward_proxy_check(skb);
		if (proxied > 0)
			return ip6_input(skb);
		else if (proxied < 0) {
			IP6_INC_STATS(net, ip6_dst_idev(dst),
				      IPSTATS_MIB_INDISCARDS);
			goto drop;
		}
	}

	if (!xfrm6_route_forward(skb)) {
		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
		goto drop;
	}
	dst = skb_dst(skb);

	/* The IPv6 specs say nothing about it, but it is clear that we
	 * cannot send redirects for source-routed frames.  We also don't
	 * send redirects for frames decapsulated from IPsec.
	 */
	if (skb->dev == dst->dev && opt->srcrt == 0 && !skb_sec_path(skb)) {
		struct in6_addr *target = NULL;
		struct rt6_info *rt;

		/*
		 *	incoming and outgoing devices are the same,
		 *	so send a redirect.
		 */

		rt = (struct rt6_info *) dst;
		if (rt->rt6i_flags & RTF_GATEWAY)
			target = &rt->rt6i_gateway;
		else
			target = &hdr->daddr;

		if (!rt->rt6i_peer)
			rt6_bind_peer(rt, 1);

		/* Limit redirects both by destination (here)
		 * and by source (inside ndisc_send_redirect)
		 */
		if (inet_peer_xrlim_allow(rt->rt6i_peer, 1*HZ))
			ndisc_send_redirect(skb, target);
	} else {
		int addrtype = ipv6_addr_type(&hdr->saddr);

		/* This check is security critical. */
		if (addrtype == IPV6_ADDR_ANY ||
		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
			goto error;
		if (addrtype & IPV6_ADDR_LINKLOCAL) {
			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
				    ICMPV6_NOT_NEIGHBOUR, 0);
			goto error;
		}
	}

	mtu = dst_mtu(dst);
	if (mtu < IPV6_MIN_MTU)
		mtu = IPV6_MIN_MTU;

	if (skb->len > mtu && !skb_is_gso(skb)) {
		/* Again, force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
		IP6_INC_STATS_BH(net,
				 ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS);
		IP6_INC_STATS_BH(net,
				 ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	if (skb_cow(skb, dst->dev->hard_header_len)) {
		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
		goto drop;
	}

	hdr = ipv6_hdr(skb);

	/* Decrementing the hop limit is delayed until after the skb COW */

	hdr->hop_limit--;

	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
		       ip6_forward_finish);

error:
	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
drop:
	kfree_skb(skb);
	return -EINVAL;
}

static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	skb_dst_drop(to);
	skb_dst_set(to, dst_clone(skb_dst(from)));
	to->dev = from->dev;
	to->mark = from->mark;

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
	nf_copy(to, from);
#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
    defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
	to->nf_trace = from->nf_trace;
#endif
	skb_copy_secmark(to, from);
}

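/*
 * Find where a fragment header has to be inserted: skip over the
 * unfragmentable extension headers, i.e. hop-by-hop and routing
 * headers plus any destination options that precede a routing header.
 * Returns the offset behind them and points *nexthdr at the field
 * that must be patched to NEXTHDR_FRAGMENT.
 */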
int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
{
	u16 offset = sizeof(struct ipv6hdr);
	struct ipv6_opt_hdr *exthdr =
				(struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1);
	unsigned int packet_len = skb->tail - skb->network_header;
	int found_rhdr = 0;
	*nexthdr = &ipv6_hdr(skb)->nexthdr;

	while (offset + 1 <= packet_len) {

		switch (**nexthdr) {

		case NEXTHDR_HOP:
			break;
		case NEXTHDR_ROUTING:
			found_rhdr = 1;
			break;
		case NEXTHDR_DEST:
#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
			if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0)
				break;
#endif
			if (found_rhdr)
				return offset;
			break;
		default:
			return offset;
		}

		offset += ipv6_optlen(exthdr);
		*nexthdr = &exthdr->nexthdr;
		exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) +
						 offset);
	}

	return offset;
}

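/*
 * Choose a fragment identification: use the per-destination counter
 * of the route's inetpeer when one is available, otherwise fall back
 * to a global atomic counter that skips the value zero.
 */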
void ipv6_select_ident(struct frag_hdr *fhdr, struct rt6_info *rt)
{
	static atomic_t ipv6_fragmentation_id;
	int old, new;

	if (rt && !(rt->dst.flags & DST_NOPEER)) {
		struct inet_peer *peer;

		if (!rt->rt6i_peer)
			rt6_bind_peer(rt, 1);
		peer = rt->rt6i_peer;
		if (peer) {
			fhdr->identification = htonl(inet_getid(peer, 0));
			return;
		}
	}
	do {
		old = atomic_read(&ipv6_fragmentation_id);
		new = old + 1;
		if (!new)
			new = 1;
	} while (atomic_cmpxchg(&ipv6_fragmentation_id, old, new) != old);
	fhdr->identification = htonl(new);
}

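/*
 * Two strategies: the fast path reuses the skbs already chained on
 * the frag_list (header manipulation only, no data copies) when their
 * geometry matches the MTU; otherwise the slow path allocates a fresh
 * skb per fragment and copies the data into it.
 */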
int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
{
	struct sk_buff *frag;
	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
	struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
	struct ipv6hdr *tmp_hdr;
	struct frag_hdr *fh;
	unsigned int mtu, hlen, left, len;
	int hroom, troom;
	__be32 frag_id = 0;
	int ptr, offset = 0, err = 0;
	u8 *prevhdr, nexthdr = 0;
	struct net *net = dev_net(skb_dst(skb)->dev);

	hlen = ip6_find_1stfragopt(skb, &prevhdr);
	nexthdr = *prevhdr;

	mtu = ip6_skb_dst_mtu(skb);

	/* We must not fragment if the socket is set to force MTU discovery
	 * or if the skb is not generated by a local socket.
	 */
	if (!skb->local_df && skb->len > mtu) {
		skb->dev = skb_dst(skb)->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	if (np && np->frag_size < mtu) {
		if (np->frag_size)
			mtu = np->frag_size;
	}
	mtu -= hlen + sizeof(struct frag_hdr);

	if (skb_has_frag_list(skb)) {
		int first_len = skb_pagelen(skb);
		struct sk_buff *frag2;

		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    skb_cloned(skb))
			goto slow_path;

		skb_walk_frags(skb, frag) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < hlen)
				goto slow_path_clean;

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path_clean;

			BUG_ON(frag->sk);
			if (skb->sk) {
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
			}
			skb->truesize -= frag->truesize;
		}

		err = 0;
		offset = 0;
		frag = skb_shinfo(skb)->frag_list;
		skb_frag_list_init(skb);
		/* BUILD HEADER */

		*prevhdr = NEXTHDR_FRAGMENT;
		tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
		if (!tmp_hdr) {
			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_FRAGFAILS);
			return -ENOMEM;
		}

		__skb_pull(skb, hlen);
		fh = (struct frag_hdr *)__skb_push(skb, sizeof(struct frag_hdr));
		__skb_push(skb, hlen);
		skb_reset_network_header(skb);
		memcpy(skb_network_header(skb), tmp_hdr, hlen);

		ipv6_select_ident(fh, rt);
		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		fh->frag_off = htons(IP6_MF);
		frag_id = fh->identification;

		first_len = skb_pagelen(skb);
		skb->data_len = first_len - skb_headlen(skb);
		skb->len = first_len;
		ipv6_hdr(skb)->payload_len = htons(first_len -
						   sizeof(struct ipv6hdr));

		dst_hold(&rt->dst);

		for (;;) {
			/* Prepare the header of the next frame
			 * before the previous one is sent.
			 */
			if (frag) {
				frag->ip_summed = CHECKSUM_NONE;
				skb_reset_transport_header(frag);
				fh = (struct frag_hdr *)__skb_push(frag, sizeof(struct frag_hdr));
				__skb_push(frag, hlen);
				skb_reset_network_header(frag);
				memcpy(skb_network_header(frag), tmp_hdr,
				       hlen);
				offset += skb->len - hlen - sizeof(struct frag_hdr);
				fh->nexthdr = nexthdr;
				fh->reserved = 0;
				fh->frag_off = htons(offset);
				if (frag->next != NULL)
					fh->frag_off |= htons(IP6_MF);
				fh->identification = frag_id;
				ipv6_hdr(frag)->payload_len =
						htons(frag->len -
						      sizeof(struct ipv6hdr));
				ip6_copy_metadata(frag, skb);
			}

			err = output(skb);
			if (!err)
				IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
					      IPSTATS_MIB_FRAGCREATES);

			if (err || !frag)
				break;

			skb = frag;
			frag = skb->next;
			skb->next = NULL;
		}

		kfree(tmp_hdr);

		if (err == 0) {
			IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
				      IPSTATS_MIB_FRAGOKS);
			dst_release(&rt->dst);
			return 0;
		}

		while (frag) {
			skb = frag->next;
			kfree_skb(frag);
			frag = skb;
		}

		IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
			      IPSTATS_MIB_FRAGFAILS);
		dst_release(&rt->dst);
		return err;

slow_path_clean:
		skb_walk_frags(skb, frag2) {
			if (frag2 == frag)
				break;
			frag2->sk = NULL;
			frag2->destructor = NULL;
			skb->truesize += frag2->truesize;
		}
	}

slow_path:
	left = skb->len - hlen;		/* Space per frame */
	ptr = hlen;			/* Where to start from */

	/*
	 *	Fragment the datagram.
	 */

	*prevhdr = NEXTHDR_FRAGMENT;
	hroom = LL_RESERVED_SPACE(rt->dst.dev);
	troom = rt->dst.dev->needed_tailroom;

	/*
	 *	Keep copying data until we run out.
	 */
	while (left > 0) {
		len = left;
		/* IF: it doesn't fit, use 'mtu' - the data space left */
		if (len > mtu)
			len = mtu;
		/* IF: we are not sending up to and including the packet end
		 * then align the next start on an eight byte boundary
		 */
		if (len < left)
			len &= ~7;
		/*
		 *	Allocate buffer.
		 */

		if ((frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
				      hroom + troom, GFP_ATOMIC)) == NULL) {
			NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_FRAGFAILS);
			err = -ENOMEM;
			goto fail;
		}

		/*
		 *	Set up data on packet
		 */

		ip6_copy_metadata(frag, skb);
		skb_reserve(frag, hroom);
		skb_put(frag, len + hlen + sizeof(struct frag_hdr));
		skb_reset_network_header(frag);
		fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
		frag->transport_header = (frag->network_header + hlen +
					  sizeof(struct frag_hdr));

		/*
		 *	Charge the memory for the fragment to any owner
		 *	it might possess
		 */
		if (skb->sk)
			skb_set_owner_w(frag, skb->sk);

		/*
		 *	Copy the packet header into the new buffer.
		 */
		skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);

		/*
		 *	Build fragment header.
		 */
		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		if (!frag_id) {
			ipv6_select_ident(fh, rt);
			frag_id = fh->identification;
		} else
			fh->identification = frag_id;

		/*
		 *	Copy a block of the IP datagram.
		 */
		if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len))
			BUG();
		left -= len;

		fh->frag_off = htons(offset);
		if (left > 0)
			fh->frag_off |= htons(IP6_MF);
		ipv6_hdr(frag)->payload_len = htons(frag->len -
						    sizeof(struct ipv6hdr));

		ptr += len;
		offset += len;

		/*
		 *	Put this fragment into the sending queue.
		 */
		err = output(frag);
		if (err)
			goto fail;

		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_FRAGCREATES);
	}
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGOKS);
	kfree_skb(skb);
	return err;

fail:
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return err;
}

static inline int ip6_rt_check(const struct rt6key *rt_key,
			       const struct in6_addr *fl_addr,
			       const struct in6_addr *addr_cache)
{
	return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
		(addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache));
}

static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
					  struct dst_entry *dst,
					  const struct flowi6 *fl6)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct rt6_info *rt = (struct rt6_info *)dst;

	if (!dst)
		goto out;

	/* Yes, checking route validity in the not-connected case is not
	 * very simple. Take into account that we do not support routing
	 * by source, TOS, or MSG_DONTROUTE.		--ANK (980726)
	 *
	 * 1. ip6_rt_check(): If the route was a host route, check that
	 *    the cached destination is current.  If it is a network
	 *    route, we still may check its validity using the saved
	 *    pointer to the last used address: daddr_cache.  We do not
	 *    want to save the whole address now (because the main
	 *    consumer of this service is TCP, which does not have this
	 *    problem), so the last trick works only on connected
	 *    sockets.
	 * 2. oif also should be the same.
	 */
	if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
#ifdef CONFIG_IPV6_SUBTREES
	    ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
#endif
	    (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) {
		dst_release(dst);
		dst = NULL;
	}

out:
	return dst;
}

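/*
 * Common tail of the dst lookups below: resolve the route if the
 * caller did not pass one in, fill in an unspecified source address,
 * and (with optimistic DAD) retry through the default router when the
 * chosen source address is still optimistic and the next hop is not
 * yet validated.
 */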
static int ip6_dst_lookup_tail(struct sock *sk,
			       struct dst_entry **dst, struct flowi6 *fl6)
{
	struct net *net = sock_net(sk);
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	struct neighbour *n;
#endif
	int err;

	if (*dst == NULL)
		*dst = ip6_route_output(net, sk, fl6);

	if ((err = (*dst)->error))
		goto out_err_release;

	if (ipv6_addr_any(&fl6->saddr)) {
		struct rt6_info *rt = (struct rt6_info *) *dst;
		err = ip6_route_get_saddr(net, rt, &fl6->daddr,
					  sk ? inet6_sk(sk)->srcprefs : 0,
					  &fl6->saddr);
		if (err)
			goto out_err_release;
	}

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	/*
	 * Here if the dst entry we've looked up
	 * has a neighbour entry that is in the INCOMPLETE
	 * state and the src address from the flow is
	 * marked as OPTIMISTIC, we release the found
	 * dst entry and replace it instead with the
	 * dst entry of the nexthop router
	 */
	rcu_read_lock();
	n = dst_get_neighbour_noref(*dst);
	if (n && !(n->nud_state & NUD_VALID)) {
		struct inet6_ifaddr *ifp;
		struct flowi6 fl_gw6;
		int redirect;

		rcu_read_unlock();
		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
				      (*dst)->dev, 1);

		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
		if (ifp)
			in6_ifa_put(ifp);

		if (redirect) {
			/*
			 * We need to get the dst entry for the
			 * default router instead
			 */
			dst_release(*dst);
			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
			*dst = ip6_route_output(net, sk, &fl_gw6);
			if ((err = (*dst)->error))
				goto out_err_release;
		}
	} else {
		rcu_read_unlock();
	}
#endif

	return 0;

out_err_release:
	if (err == -ENETUNREACH)
		IP6_INC_STATS_BH(net, NULL, IPSTATS_MIB_OUTNOROUTES);
	dst_release(*dst);
	*dst = NULL;
	return err;
}

/**
 *	ip6_dst_lookup - perform route lookup on flow
 *	@sk: socket which provides route info
 *	@dst: pointer to dst_entry * for result
 *	@fl6: flow to lookup
 *
 *	This function performs a route lookup on the given flow.
 *
 *	It returns zero on success, or a standard errno code on error.
 */
int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi6 *fl6)
{
	*dst = NULL;
	return ip6_dst_lookup_tail(sk, dst, fl6);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup);

/**
 *	ip6_dst_lookup_flow - perform route lookup on flow with ipsec
 *	@sk: socket which provides route info
 *	@fl6: flow to lookup
 *	@final_dst: final destination address for ipsec lookup
 *	@can_sleep: we are in a sleepable context
 *
 *	This function performs a route lookup on the given flow.
 *
 *	It returns a valid dst pointer on success, or a pointer-encoded
 *	error code.
 */
struct dst_entry *ip6_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
				      const struct in6_addr *final_dst,
				      bool can_sleep)
{
	struct dst_entry *dst = NULL;
	int err;

	err = ip6_dst_lookup_tail(sk, &dst, fl6);
	if (err)
		return ERR_PTR(err);
	if (final_dst)
		fl6->daddr = *final_dst;
	if (can_sleep)
		fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;

	return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);

/**
 *	ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
 *	@sk: socket which provides the dst cache and route info
 *	@fl6: flow to lookup
 *	@final_dst: final destination address for ipsec lookup
 *	@can_sleep: we are in a sleepable context
 *
 *	This function performs a route lookup on the given flow with the
 *	possibility of using the cached route in the socket if it is valid.
 *	It will take the socket dst lock when operating on the dst cache.
 *	As a result, this function can only be used in process context.
 *
 *	It returns a valid dst pointer on success, or a pointer-encoded
 *	error code.
 */
struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
					 const struct in6_addr *final_dst,
					 bool can_sleep)
{
	struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
	int err;

	dst = ip6_sk_dst_check(sk, dst, fl6);

	err = ip6_dst_lookup_tail(sk, &dst, fl6);
	if (err)
		return ERR_PTR(err);
	if (final_dst)
		fl6->daddr = *final_dst;
	if (can_sleep)
		fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;

	return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
}
EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);

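/*
 * UDP fragmentation offload: queue the whole datagram as one large
 * skb and let the device segment it.  gso_size is the largest
 * multiple of eight fitting in the MTU after the fragment header,
 * since IPv6 fragment offsets are expressed in 8-byte units.
 */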
static inline int ip6_ufo_append_data(struct sock *sk,
			int getfrag(void *from, char *to, int offset, int len,
			int odd, struct sk_buff *skb),
			void *from, int length, int hh_len, int fragheaderlen,
			int transhdrlen, int mtu, unsigned int flags,
			struct rt6_info *rt)
{
	struct sk_buff *skb;
	int err;

	/* There is support for UDP large send offload by the network
	 * device, so create one single skb packet containing the
	 * complete UDP datagram.
	 */
	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
		skb = sock_alloc_send_skb(sk,
			hh_len + fragheaderlen + transhdrlen + 20,
			(flags & MSG_DONTWAIT), &err);
		if (skb == NULL)
			return err;

		/* reserve space for the hardware header */
		skb_reserve(skb, hh_len);

		/* create space for the UDP/IP header */
		skb_put(skb, fragheaderlen + transhdrlen);

		/* initialize the network header pointer */
		skb_reset_network_header(skb);

		/* initialize the protocol header pointer */
		skb->transport_header = skb->network_header + fragheaderlen;

		skb->ip_summed = CHECKSUM_PARTIAL;
		skb->csum = 0;
	}

	err = skb_append_datato_frags(sk, skb, getfrag, from,
				      (length - transhdrlen));
	if (!err) {
		struct frag_hdr fhdr;

		/* Specify the length of each IPv6 datagram fragment.
		 * It has to be a multiple of 8.
		 */
		skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
					     sizeof(struct frag_hdr)) & ~7;
		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
		ipv6_select_ident(&fhdr, rt);
		skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
		__skb_queue_tail(&sk->sk_write_queue, skb);

		return 0;
	}
	/* There is not enough support to do UDP LSO,
	 * so follow the normal path.
	 */
	kfree_skb(skb);

	return err;
}

static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
					       gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
						gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

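/*
 * Corked transmission: the first call on an empty write queue copies
 * the options and route into the cork and computes the MTU; later
 * calls keep appending to the queue.  Data is split at maxfraglen
 * boundaries so that the queued skbs become the fragments when
 * ip6_push_pending_frames() finally sends them, and page fragments
 * are used when the device supports scatter-gather.
 */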
int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
	int offset, int len, int odd, struct sk_buff *skb),
	void *from, int length, int transhdrlen,
	int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi6 *fl6,
	struct rt6_info *rt, unsigned int flags, int dontfrag)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct inet_cork *cork;
	struct sk_buff *skb;
	unsigned int maxfraglen, fragheaderlen;
	int exthdrlen;
	int dst_exthdrlen;
	int hh_len;
	int mtu;
	int copy;
	int err;
	int offset = 0;
	int csummode = CHECKSUM_NONE;
	__u8 tx_flags = 0;

	if (flags & MSG_PROBE)
		return 0;
	cork = &inet->cork.base;
	if (skb_queue_empty(&sk->sk_write_queue)) {
		/*
		 * setup for corking
		 */
		if (opt) {
			if (WARN_ON(np->cork.opt))
				return -EINVAL;

			np->cork.opt = kmalloc(opt->tot_len, sk->sk_allocation);
			if (unlikely(np->cork.opt == NULL))
				return -ENOBUFS;

			np->cork.opt->tot_len = opt->tot_len;
			np->cork.opt->opt_flen = opt->opt_flen;
			np->cork.opt->opt_nflen = opt->opt_nflen;

			np->cork.opt->dst0opt = ip6_opt_dup(opt->dst0opt,
							    sk->sk_allocation);
			if (opt->dst0opt && !np->cork.opt->dst0opt)
				return -ENOBUFS;

			np->cork.opt->dst1opt = ip6_opt_dup(opt->dst1opt,
							    sk->sk_allocation);
			if (opt->dst1opt && !np->cork.opt->dst1opt)
				return -ENOBUFS;

			np->cork.opt->hopopt = ip6_opt_dup(opt->hopopt,
							   sk->sk_allocation);
			if (opt->hopopt && !np->cork.opt->hopopt)
				return -ENOBUFS;

			np->cork.opt->srcrt = ip6_rthdr_dup(opt->srcrt,
							    sk->sk_allocation);
			if (opt->srcrt && !np->cork.opt->srcrt)
				return -ENOBUFS;

			/* need source address above miyazawa*/
		}
		dst_hold(&rt->dst);
		cork->dst = &rt->dst;
		inet->cork.fl.u.ip6 = *fl6;
		np->cork.hop_limit = hlimit;
		np->cork.tclass = tclass;
		mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
		      rt->dst.dev->mtu : dst_mtu(&rt->dst);
		if (np->frag_size < mtu) {
			if (np->frag_size)
				mtu = np->frag_size;
		}
		cork->fragsize = mtu;
		if (dst_allfrag(rt->dst.path))
			cork->flags |= IPCORK_ALLFRAG;
		cork->length = 0;
		sk->sk_sndmsg_page = NULL;
		sk->sk_sndmsg_off = 0;
		exthdrlen = (opt ? opt->opt_flen : 0) - rt->rt6i_nfheader_len;
		length += exthdrlen;
		transhdrlen += exthdrlen;
		dst_exthdrlen = rt->dst.header_len;
	} else {
		rt = (struct rt6_info *)cork->dst;
		fl6 = &inet->cork.fl.u.ip6;
		opt = np->cork.opt;
		transhdrlen = 0;
		exthdrlen = 0;
		dst_exthdrlen = 0;
		mtu = cork->fragsize;
	}

	hh_len = LL_RESERVED_SPACE(rt->dst.dev);

	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
			(opt ? opt->opt_nflen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);

	if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
		if (cork->length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
			ipv6_local_error(sk, EMSGSIZE, fl6, mtu - exthdrlen);
			return -EMSGSIZE;
		}
	}

	/* For UDP, check if TX timestamping is enabled */
	if (sk->sk_type == SOCK_DGRAM) {
		err = sock_tx_timestamp(sk, &tx_flags);
		if (err)
			goto error;
	}

	/*
	 * Let's try using as much space as possible.
	 * Use the MTU if the total length of the message fits into it.
	 * Otherwise, we need to reserve fragment header and
	 * fragment alignment (= 8-15 octets, in total).
	 *
	 * Note that we may need to "move" the data from the tail
	 * of the buffer to the new fragment when we split
	 * the message.
	 *
	 * FIXME: It may be fragmented into multiple chunks
	 *        at once if non-fragmentable extension headers
	 *        are too large.
	 * --yoshfuji
	 */

	cork->length += length;
	if (length > mtu) {
		int proto = sk->sk_protocol;
		if (dontfrag && (proto == IPPROTO_UDP || proto == IPPROTO_RAW)) {
			ipv6_local_rxpmtu(sk, fl6, mtu - exthdrlen);
			return -EMSGSIZE;
		}

		if (proto == IPPROTO_UDP &&
		    (rt->dst.dev->features & NETIF_F_UFO)) {

			err = ip6_ufo_append_data(sk, getfrag, from, length,
						  hh_len, fragheaderlen,
						  transhdrlen, mtu, flags, rt);
			if (err)
				goto error;
			return 0;
		}
	}

	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into the current packet. */
		copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;

		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen;
			struct sk_buff *skb_prev;
alloc_new_skb:
			skb_prev = skb;

			/* There's no room in the current skb */
			if (skb_prev)
				fraggap = skb_prev->len - maxfraglen;
			else
				fraggap = 0;

			/*
			 * If the remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;
			if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
				datalen = maxfraglen - fragheaderlen;

			fraglen = datalen + fragheaderlen;
			if ((flags & MSG_MORE) &&
			    !(rt->dst.dev->features & NETIF_F_SG))
				alloclen = mtu;
			else
				alloclen = datalen + fragheaderlen;

			alloclen += dst_exthdrlen;

			/*
			 * The last fragment gets additional space at the tail.
			 * Note: we overallocate on fragments with MSG_MORE
			 * because we have no idea if we're the last one.
			 */
			if (datalen == length + fraggap)
				alloclen += rt->dst.trailer_len;

			/*
			 * We just reserve space for the fragment header.
			 * Note: this may be overallocation if the message
			 * (without MSG_MORE) fits into the MTU.
			 */
			alloclen += sizeof(struct frag_hdr);

			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk,
						alloclen + hh_len,
						(flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				if (atomic_read(&sk->sk_wmem_alloc) <=
				    2 * sk->sk_sndbuf)
					skb = sock_wmalloc(sk,
							   alloclen + hh_len, 1,
							   sk->sk_allocation);
				if (unlikely(skb == NULL))
					err = -ENOBUFS;
				else {
					/* Only the initial fragment
					 * is time stamped.
					 */
					tx_flags = 0;
				}
			}
			if (skb == NULL)
				goto error;
			/*
			 *	Fill in the control structures
			 */
			skb->ip_summed = csummode;
			skb->csum = 0;
			/* reserve for fragmentation and ipsec header */
			skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
				    dst_exthdrlen);

			if (sk->sk_type == SOCK_DGRAM)
				skb_shinfo(skb)->tx_flags = tx_flags;

			/*
			 *	Find where to start putting bytes
			 */
			data = skb_put(skb, fraglen);
			skb_set_network_header(skb, exthdrlen);
			data += fragheaderlen;
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}
			copy = datalen - transhdrlen - fraggap;

			if (copy < 0) {
				err = -EINVAL;
				kfree_skb(skb);
				goto error;
			} else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= datalen - fraggap;
			transhdrlen = 0;
			exthdrlen = 0;
			dst_exthdrlen = 0;
			csummode = CHECKSUM_NONE;

			/*
			 * Put the packet on the pending queue
			 */
			__skb_queue_tail(&sk->sk_write_queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		if (!(rt->dst.dev->features & NETIF_F_SG)) {
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
						offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else {
			int i = skb_shinfo(skb)->nr_frags;
			skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
			struct page *page = sk->sk_sndmsg_page;
			int off = sk->sk_sndmsg_off;
			unsigned int left;

			if (page && (left = PAGE_SIZE - off) > 0) {
				if (copy >= left)
					copy = left;
				if (page != skb_frag_page(frag)) {
					if (i == MAX_SKB_FRAGS) {
						err = -EMSGSIZE;
						goto error;
					}
					skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
					skb_frag_ref(skb, i);
					frag = &skb_shinfo(skb)->frags[i];
				}
			} else if (i < MAX_SKB_FRAGS) {
				if (copy > PAGE_SIZE)
					copy = PAGE_SIZE;
				page = alloc_pages(sk->sk_allocation, 0);
				if (page == NULL) {
					err = -ENOMEM;
					goto error;
				}
				sk->sk_sndmsg_page = page;
				sk->sk_sndmsg_off = 0;

				skb_fill_page_desc(skb, i, page, 0, 0);
				frag = &skb_shinfo(skb)->frags[i];
			} else {
				err = -EMSGSIZE;
				goto error;
			}
			if (getfrag(from,
				    skb_frag_address(frag) + skb_frag_size(frag),
				    offset, copy, skb->len, skb) < 0) {
				err = -EFAULT;
				goto error;
			}
			sk->sk_sndmsg_off += copy;
			skb_frag_size_add(frag, copy);
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			atomic_add(copy, &sk->sk_wmem_alloc);
		}
		offset += copy;
		length -= copy;
	}
	return 0;
error:
	cork->length -= length;
	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	return err;
}

static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np)
{
	if (np->cork.opt) {
		kfree(np->cork.opt->dst0opt);
		kfree(np->cork.opt->dst1opt);
		kfree(np->cork.opt->hopopt);
		kfree(np->cork.opt->srcrt);
		kfree(np->cork.opt);
		np->cork.opt = NULL;
	}

	if (inet->cork.base.dst) {
		dst_release(inet->cork.base.dst);
		inet->cork.base.dst = NULL;
		inet->cork.base.flags &= ~IPCORK_ALLFRAG;
	}
	memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
}

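/*
 * Flush the corked queue: merge all queued skbs into the first one's
 * frag_list, prepend the extension headers and the IPv6 header, and
 * hand the result to ip6_local_out().  The cork is released in every
 * case, including errors.
 */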
int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct net *net = sock_net(sk);
	struct ipv6hdr *hdr;
	struct ipv6_txoptions *opt = np->cork.opt;
	struct rt6_info *rt = (struct rt6_info *)inet->cork.base.dst;
	struct flowi6 *fl6 = &inet->cork.fl.u.ip6;
	unsigned char proto = fl6->flowi6_proto;
	int err = 0;

	if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Allow local fragmentation. */
	if (np->pmtudisc < IPV6_PMTUDISC_DO)
		skb->local_df = 1;

	*final_dst = fl6->daddr;
	__skb_pull(skb, skb_network_header_len(skb));
	if (opt && opt->opt_flen)
		ipv6_push_frag_opts(skb, opt, &proto);
	if (opt && opt->opt_nflen)
		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	*(__be32 *)hdr = fl6->flowlabel |
		     htonl(0x60000000 | ((int)np->cork.tclass << 20));

	hdr->hop_limit = np->cork.hop_limit;
	hdr->nexthdr = proto;
	hdr->saddr = fl6->saddr;
	hdr->daddr = *final_dst;

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	skb_dst_set(skb, dst_clone(&rt->dst));
	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
	if (proto == IPPROTO_ICMPV6) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		ICMP6MSGOUT_INC_STATS_BH(net, idev, icmp6_hdr(skb)->icmp6_type);
		ICMP6_INC_STATS_BH(net, idev, ICMP6_MIB_OUTMSGS);
	}

	err = ip6_local_out(skb);
	if (err) {
		if (err > 0)
			err = net_xmit_errno(err);
		if (err)
			goto error;
	}

out:
	ip6_cork_release(inet, np);
	return err;
error:
	IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	goto out;
}

void ip6_flush_pending_frames(struct sock *sk)
{
	struct sk_buff *skb;

	while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
		if (skb_dst(skb))
			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
	}

	ip6_cork_release(inet_sk(sk), inet6_sk(sk));
}
1655