xref: /openbmc/linux/net/ipv6/ip6_output.c (revision 87c2ce3b)
1 /*
2  *	IPv6 output functions
3  *	Linux INET6 implementation
4  *
5  *	Authors:
6  *	Pedro Roque		<roque@di.fc.ul.pt>
7  *
8  *	$Id: ip6_output.c,v 1.34 2002/02/01 22:01:04 davem Exp $
9  *
10  *	Based on linux/net/ipv4/ip_output.c
11  *
12  *	This program is free software; you can redistribute it and/or
13  *      modify it under the terms of the GNU General Public License
14  *      as published by the Free Software Foundation; either version
15  *      2 of the License, or (at your option) any later version.
16  *
17  *	Changes:
18  *	A.N.Kuznetsov	:	arithmetic in fragmentation.
19  *				extension headers are implemented.
20  *				route changes now work.
21  *				ip6_forward does not confuse sniffers.
22  *				etc.
23  *
24  *      H. von Brand    :       Added missing #include <linux/string.h>
25  *	Imran Patel	: 	frag id should be in NBO
26  *      Kazunori MIYAZAWA @USAGI
27  *			:       add ip6_append_data and related functions
28  *				for datagram xmit
29  */
30 
31 #include <linux/config.h>
32 #include <linux/errno.h>
33 #include <linux/types.h>
34 #include <linux/string.h>
35 #include <linux/socket.h>
36 #include <linux/net.h>
37 #include <linux/netdevice.h>
38 #include <linux/if_arp.h>
39 #include <linux/in6.h>
40 #include <linux/tcp.h>
41 #include <linux/route.h>
42 
43 #include <linux/netfilter.h>
44 #include <linux/netfilter_ipv6.h>
45 
46 #include <net/sock.h>
47 #include <net/snmp.h>
48 
49 #include <net/ipv6.h>
50 #include <net/ndisc.h>
51 #include <net/protocol.h>
52 #include <net/ip6_route.h>
53 #include <net/addrconf.h>
54 #include <net/rawv6.h>
55 #include <net/icmp.h>
56 #include <net/xfrm.h>
57 #include <net/checksum.h>
58 
59 static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));
60 
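/*
 * Pick the Identification value for a new Fragment header.  A single
 * global counter is shared by all flows; it is advanced under a
 * spinlock and wraps around while skipping zero.
 */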
61 static __inline__ void ipv6_select_ident(struct sk_buff *skb, struct frag_hdr *fhdr)
62 {
63 	static u32 ipv6_fragmentation_id = 1;
64 	static DEFINE_SPINLOCK(ip6_id_lock);
65 
66 	spin_lock_bh(&ip6_id_lock);
67 	fhdr->identification = htonl(ipv6_fragmentation_id);
68 	if (++ipv6_fragmentation_id == 0)
69 		ipv6_fragmentation_id = 1;
70 	spin_unlock_bh(&ip6_id_lock);
71 }
72 
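/*
 * Final step of transmission: prepend the cached link-layer header if
 * the destination has one (hh_cache), otherwise let the neighbour
 * entry resolve and emit the frame.  Without either, the packet is
 * counted as OUTNOROUTES and dropped.
 */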
73 static inline int ip6_output_finish(struct sk_buff *skb)
74 {
75 
76 	struct dst_entry *dst = skb->dst;
77 	struct hh_cache *hh = dst->hh;
78 
79 	if (hh) {
80 		int hh_alen;
81 
82 		read_lock_bh(&hh->hh_lock);
83 		hh_alen = HH_DATA_ALIGN(hh->hh_len);
84 		memcpy(skb->data - hh_alen, hh->hh_data, hh_alen);
85 		read_unlock_bh(&hh->hh_lock);
86 		skb_push(skb, hh->hh_len);
87 		return hh->hh_output(skb);
88 	} else if (dst->neighbour)
89 		return dst->neighbour->output(skb);
90 
91 	IP6_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
92 	kfree_skb(skb);
93 	return -EINVAL;
94 
95 }
96 
97 /* dev_loopback_xmit for use with netfilter. */
98 static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
99 {
100 	newskb->mac.raw = newskb->data;
101 	__skb_pull(newskb, newskb->nh.raw - newskb->data);
102 	newskb->pkt_type = PACKET_LOOPBACK;
103 	newskb->ip_summed = CHECKSUM_UNNECESSARY;
104 	BUG_TRAP(newskb->dst);
105 
106 	netif_rx(newskb);
107 	return 0;
108 }
109 
110 
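/*
 * Post-routing transmit hook.  For multicast destinations a clone of
 * the packet is looped back to the local stack when the sending socket
 * has multicast loopback enabled and we have joined the group on this
 * device; packets with a zero hop limit are then discarded instead of
 * being put on the wire.
 */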
111 static int ip6_output2(struct sk_buff *skb)
112 {
113 	struct dst_entry *dst = skb->dst;
114 	struct net_device *dev = dst->dev;
115 
116 	skb->protocol = htons(ETH_P_IPV6);
117 	skb->dev = dev;
118 
119 	if (ipv6_addr_is_multicast(&skb->nh.ipv6h->daddr)) {
120 		struct ipv6_pinfo* np = skb->sk ? inet6_sk(skb->sk) : NULL;
121 
122 		if (!(dev->flags & IFF_LOOPBACK) && (!np || np->mc_loop) &&
123 		    ipv6_chk_mcast_addr(dev, &skb->nh.ipv6h->daddr,
124 				&skb->nh.ipv6h->saddr)) {
125 			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
126 
127 			/* Do not check for IFF_ALLMULTI; multicast routing
128 			   is not supported in any case.
129 			 */
130 			if (newskb)
131 				NF_HOOK(PF_INET6, NF_IP6_POST_ROUTING, newskb, NULL,
132 					newskb->dev,
133 					ip6_dev_loopback_xmit);
134 
135 			if (skb->nh.ipv6h->hop_limit == 0) {
136 				IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
137 				kfree_skb(skb);
138 				return 0;
139 			}
140 		}
141 
142 		IP6_INC_STATS(IPSTATS_MIB_OUTMCASTPKTS);
143 	}
144 
145 	return NF_HOOK(PF_INET6, NF_IP6_POST_ROUTING, skb, NULL, skb->dev, ip6_output_finish);
146 }
147 
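/*
 * Fragment before transmission when the packet exceeds the path MTU
 * (unless UFO is handling the segmentation) or when the route demands
 * fragmentation of every packet (dst_allfrag).
 */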
148 int ip6_output(struct sk_buff *skb)
149 {
150 	if ((skb->len > dst_mtu(skb->dst) && !skb_shinfo(skb)->ufo_size) ||
151 				dst_allfrag(skb->dst))
152 		return ip6_fragment(skb, ip6_output2);
153 	else
154 		return ip6_output2(skb);
155 }
156 
157 /*
158  *	xmit an sk_buff (used by TCP)
159  */
160 
161 int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
162 	     struct ipv6_txoptions *opt, int ipfragok)
163 {
164 	struct ipv6_pinfo *np = sk ? inet6_sk(sk) : NULL;
165 	struct in6_addr *first_hop = &fl->fl6_dst;
166 	struct dst_entry *dst = skb->dst;
167 	struct ipv6hdr *hdr;
168 	u8  proto = fl->proto;
169 	int seg_len = skb->len;
170 	int hlimit, tclass;
171 	u32 mtu;
172 
173 	if (opt) {
174 		int head_room;
175 
176 		/* First: exthdrs may take lots of space (~8K for now)
177 		   MAX_HEADER is not enough.
178 		 */
179 		head_room = opt->opt_nflen + opt->opt_flen;
180 		seg_len += head_room;
181 		head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
182 
183 		if (skb_headroom(skb) < head_room) {
184 			struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
185 			kfree_skb(skb);
186 			skb = skb2;
187 			if (skb == NULL) {
188 				IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
189 				return -ENOBUFS;
190 			}
191 			if (sk)
192 				skb_set_owner_w(skb, sk);
193 		}
194 		if (opt->opt_flen)
195 			ipv6_push_frag_opts(skb, opt, &proto);
196 		if (opt->opt_nflen)
197 			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
198 	}
199 
200 	hdr = skb->nh.ipv6h = (struct ipv6hdr*)skb_push(skb, sizeof(struct ipv6hdr));
201 
202 	/*
203 	 *	Fill in the IPv6 header
204 	 */
205 
206 	hlimit = -1;
207 	if (np)
208 		hlimit = np->hop_limit;
209 	if (hlimit < 0)
210 		hlimit = dst_metric(dst, RTAX_HOPLIMIT);
211 	if (hlimit < 0)
212 		hlimit = ipv6_get_hoplimit(dst->dev);
213 
214 	tclass = -1;
215 	if (np)
216 		tclass = np->tclass;
217 	if (tclass < 0)
218 		tclass = 0;
219 
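	/*
	 * First 32 bits of the header: 4-bit version (6), 8-bit traffic
	 * class, 20-bit flow label.  0x60000000 sets the version field;
	 * the flow label is already stored in network byte order.
	 */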
220 	*(u32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl->fl6_flowlabel;
221 
222 	hdr->payload_len = htons(seg_len);
223 	hdr->nexthdr = proto;
224 	hdr->hop_limit = hlimit;
225 
226 	ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
227 	ipv6_addr_copy(&hdr->daddr, first_hop);
228 
229 	skb->priority = sk->sk_priority;
230 
231 	mtu = dst_mtu(dst);
232 	if ((skb->len <= mtu) || ipfragok) {
233 		IP6_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
234 		return NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, dst->dev,
235 				dst_output);
236 	}
237 
238 	if (net_ratelimit())
239 		printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
240 	skb->dev = dst->dev;
241 	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
242 	IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
243 	kfree_skb(skb);
244 	return -EMSGSIZE;
245 }
246 
247 /*
248  *	To avoid extra problems, ND packets are sent through this
249  *	routine. It is code duplication, but I really want to avoid
250  *	extra checks, since ipv6_build_header is used by TCP (which
251  *	is performance critical for us).
252  */
253 
254 int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
255 	       struct in6_addr *saddr, struct in6_addr *daddr,
256 	       int proto, int len)
257 {
258 	struct ipv6_pinfo *np = inet6_sk(sk);
259 	struct ipv6hdr *hdr;
260 	int totlen;
261 
262 	skb->protocol = htons(ETH_P_IPV6);
263 	skb->dev = dev;
264 
265 	totlen = len + sizeof(struct ipv6hdr);
266 
267 	hdr = (struct ipv6hdr *) skb_put(skb, sizeof(struct ipv6hdr));
268 	skb->nh.ipv6h = hdr;
269 
270 	*(u32*)hdr = htonl(0x60000000);
271 
272 	hdr->payload_len = htons(len);
273 	hdr->nexthdr = proto;
274 	hdr->hop_limit = np->hop_limit;
275 
276 	ipv6_addr_copy(&hdr->saddr, saddr);
277 	ipv6_addr_copy(&hdr->daddr, daddr);
278 
279 	return 0;
280 }
281 
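/*
 * Deliver a packet carrying a Router Alert option to every raw socket
 * that registered for this alert value (IPV6_ROUTER_ALERT).  All but
 * the last matching socket receive a clone; the last one consumes the
 * original skb, in which case 1 is returned.
 */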
282 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
283 {
284 	struct ip6_ra_chain *ra;
285 	struct sock *last = NULL;
286 
287 	read_lock(&ip6_ra_lock);
288 	for (ra = ip6_ra_chain; ra; ra = ra->next) {
289 		struct sock *sk = ra->sk;
290 		if (sk && ra->sel == sel &&
291 		    (!sk->sk_bound_dev_if ||
292 		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
293 			if (last) {
294 				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
295 				if (skb2)
296 					rawv6_rcv(last, skb2);
297 			}
298 			last = sk;
299 		}
300 	}
301 
302 	if (last) {
303 		rawv6_rcv(last, skb);
304 		read_unlock(&ip6_ra_lock);
305 		return 1;
306 	}
307 	read_unlock(&ip6_ra_lock);
308 	return 0;
309 }
310 
311 static inline int ip6_forward_finish(struct sk_buff *skb)
312 {
313 	return dst_output(skb);
314 }
315 
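/*
 * Forwarding path: hand Router Alert packets to user space untouched,
 * enforce the hop limit and the outgoing MTU, optionally emit an ICMPv6
 * redirect when the packet leaves on the interface it arrived on, and
 * only decrement hop_limit once the header is private (after skb_cow).
 */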
316 int ip6_forward(struct sk_buff *skb)
317 {
318 	struct dst_entry *dst = skb->dst;
319 	struct ipv6hdr *hdr = skb->nh.ipv6h;
320 	struct inet6_skb_parm *opt = IP6CB(skb);
321 
322 	if (ipv6_devconf.forwarding == 0)
323 		goto error;
324 
325 	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
326 		IP6_INC_STATS(IPSTATS_MIB_INDISCARDS);
327 		goto drop;
328 	}
329 
330 	skb->ip_summed = CHECKSUM_NONE;
331 
332 	/*
333 	 *	We DO NOT do any processing on
334 	 *	RA packets, pushing them to user level AS IS
335 	 *	without any guarantee that the application will be able
336 	 *	to interpret them. The reason is that we
337 	 *	cannot do anything clever here.
338 	 *
339 	 *	We are not the end-node, so if the packet contains
340 	 *	AH/ESP, we cannot do anything with it.
341 	 *	Defragmentation would also be a mistake; RA packets
342 	 *	cannot be fragmented, because there is no guarantee
343 	 *	that different fragments will travel along one path. --ANK
344 	 */
345 	if (opt->ra) {
346 		u8 *ptr = skb->nh.raw + opt->ra;
347 		if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
348 			return 0;
349 	}
350 
351 	/*
352 	 *	check and decrement ttl
353 	 */
354 	if (hdr->hop_limit <= 1) {
355 		/* Force OUTPUT device used as source address */
356 		skb->dev = dst->dev;
357 		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT,
358 			    0, skb->dev);
359 
360 		kfree_skb(skb);
361 		return -ETIMEDOUT;
362 	}
363 
364 	if (!xfrm6_route_forward(skb)) {
365 		IP6_INC_STATS(IPSTATS_MIB_INDISCARDS);
366 		goto drop;
367 	}
368 	dst = skb->dst;
369 
370 	/* IPv6 specs say nothing about it, but it is clear that we cannot
371 	   send redirects to source routed frames.
372 	 */
373 	if (skb->dev == dst->dev && dst->neighbour && opt->srcrt == 0) {
374 		struct in6_addr *target = NULL;
375 		struct rt6_info *rt;
376 		struct neighbour *n = dst->neighbour;
377 
378 		/*
379 		 *	incoming and outgoing devices are the same;
380 		 *	send a redirect.
381 		 */
382 
383 		rt = (struct rt6_info *) dst;
384 		if ((rt->rt6i_flags & RTF_GATEWAY))
385 			target = (struct in6_addr*)&n->primary_key;
386 		else
387 			target = &hdr->daddr;
388 
389 		/* Limit redirects both by destination (here)
390 		   and by source (inside ndisc_send_redirect)
391 		 */
392 		if (xrlim_allow(dst, 1*HZ))
393 			ndisc_send_redirect(skb, n, target);
394 	} else if (ipv6_addr_type(&hdr->saddr)&(IPV6_ADDR_MULTICAST|IPV6_ADDR_LOOPBACK
395 						|IPV6_ADDR_LINKLOCAL)) {
396 		/* This check is security critical. */
397 		goto error;
398 	}
399 
400 	if (skb->len > dst_mtu(dst)) {
401 		/* Again, force OUTPUT device used as source address */
402 		skb->dev = dst->dev;
403 		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, dst_mtu(dst), skb->dev);
404 		IP6_INC_STATS_BH(IPSTATS_MIB_INTOOBIGERRORS);
405 		IP6_INC_STATS_BH(IPSTATS_MIB_FRAGFAILS);
406 		kfree_skb(skb);
407 		return -EMSGSIZE;
408 	}
409 
410 	if (skb_cow(skb, dst->dev->hard_header_len)) {
411 		IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
412 		goto drop;
413 	}
414 
415 	hdr = skb->nh.ipv6h;
416 
417 	/* Mangling hops number delayed to point after skb COW */
418 
419 	hdr->hop_limit--;
420 
421 	IP6_INC_STATS_BH(IPSTATS_MIB_OUTFORWDATAGRAMS);
422 	return NF_HOOK(PF_INET6, NF_IP6_FORWARD, skb, skb->dev, dst->dev, ip6_forward_finish);
423 
424 error:
425 	IP6_INC_STATS_BH(IPSTATS_MIB_INADDRERRORS);
426 drop:
427 	kfree_skb(skb);
428 	return -EINVAL;
429 }
430 
431 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
432 {
433 	to->pkt_type = from->pkt_type;
434 	to->priority = from->priority;
435 	to->protocol = from->protocol;
436 	dst_release(to->dst);
437 	to->dst = dst_clone(from->dst);
438 	to->dev = from->dev;
439 
440 #ifdef CONFIG_NET_SCHED
441 	to->tc_index = from->tc_index;
442 #endif
443 #ifdef CONFIG_NETFILTER
444 	to->nfmark = from->nfmark;
445 	/* Connection association is same as pre-frag packet */
446 	nf_conntrack_put(to->nfct);
447 	to->nfct = from->nfct;
448 	nf_conntrack_get(to->nfct);
449 	to->nfctinfo = from->nfctinfo;
450 #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
451 	nf_conntrack_put_reasm(to->nfct_reasm);
452 	to->nfct_reasm = from->nfct_reasm;
453 	nf_conntrack_get_reasm(to->nfct_reasm);
454 #endif
455 #ifdef CONFIG_BRIDGE_NETFILTER
456 	nf_bridge_put(to->nf_bridge);
457 	to->nf_bridge = from->nf_bridge;
458 	nf_bridge_get(to->nf_bridge);
459 #endif
460 #endif
461 }
462 
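/*
 * Walk the extension-header chain to find where a Fragment header has
 * to be inserted: after any Hop-by-Hop, Routing and Destination options
 * that belong to the unfragmentable part.  Returns the byte offset of
 * the insertion point and sets *nexthdr to the "next header" field that
 * precedes it, so the caller can patch in NEXTHDR_FRAGMENT.
 */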
463 int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
464 {
465 	u16 offset = sizeof(struct ipv6hdr);
466 	struct ipv6_opt_hdr *exthdr = (struct ipv6_opt_hdr*)(skb->nh.ipv6h + 1);
467 	unsigned int packet_len = skb->tail - skb->nh.raw;
468 	int found_rhdr = 0;
469 	*nexthdr = &skb->nh.ipv6h->nexthdr;
470 
471 	while (offset + 1 <= packet_len) {
472 
473 		switch (**nexthdr) {
474 
475 		case NEXTHDR_HOP:
476 		case NEXTHDR_ROUTING:
477 		case NEXTHDR_DEST:
478 			if (**nexthdr == NEXTHDR_ROUTING) found_rhdr = 1;
479 			if (**nexthdr == NEXTHDR_DEST && found_rhdr) return offset;
480 			offset += ipv6_optlen(exthdr);
481 			*nexthdr = &exthdr->nexthdr;
482 			exthdr = (struct ipv6_opt_hdr*)(skb->nh.raw + offset);
483 			break;
484 		default:
485 			return offset;
486 		}
487 	}
488 
489 	return offset;
490 }
491 
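/*
 * Two strategies are used here.  The fast path walks an existing
 * frag_list: each element already has the right geometry, so only a
 * per-fragment IPv6 + Fragment header needs to be prepended.  If the
 * geometry is wrong or the skb is cloned/shared, the slow path below
 * allocates fresh skbs and copies the payload piece by piece.
 */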
492 static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
493 {
494 	struct net_device *dev;
495 	struct sk_buff *frag;
496 	struct rt6_info *rt = (struct rt6_info*)skb->dst;
497 	struct ipv6hdr *tmp_hdr;
498 	struct frag_hdr *fh;
499 	unsigned int mtu, hlen, left, len;
500 	u32 frag_id = 0;
501 	int ptr, offset = 0, err=0;
502 	u8 *prevhdr, nexthdr = 0;
503 
504 	dev = rt->u.dst.dev;
505 	hlen = ip6_find_1stfragopt(skb, &prevhdr);
506 	nexthdr = *prevhdr;
507 
508 	mtu = dst_mtu(&rt->u.dst) - hlen - sizeof(struct frag_hdr);
509 
510 	if (skb_shinfo(skb)->frag_list) {
511 		int first_len = skb_pagelen(skb);
512 
513 		if (first_len - hlen > mtu ||
514 		    ((first_len - hlen) & 7) ||
515 		    skb_cloned(skb))
516 			goto slow_path;
517 
518 		for (frag = skb_shinfo(skb)->frag_list; frag; frag = frag->next) {
519 			/* Correct geometry. */
520 			if (frag->len > mtu ||
521 			    ((frag->len & 7) && frag->next) ||
522 			    skb_headroom(frag) < hlen)
523 			    goto slow_path;
524 
525 			/* Partially cloned skb? */
526 			if (skb_shared(frag))
527 				goto slow_path;
528 
529 			BUG_ON(frag->sk);
530 			if (skb->sk) {
531 				sock_hold(skb->sk);
532 				frag->sk = skb->sk;
533 				frag->destructor = sock_wfree;
534 				skb->truesize -= frag->truesize;
535 			}
536 		}
537 
538 		err = 0;
539 		offset = 0;
540 		frag = skb_shinfo(skb)->frag_list;
541 		skb_shinfo(skb)->frag_list = NULL;
542 		/* BUILD HEADER */
543 
544 		tmp_hdr = kmalloc(hlen, GFP_ATOMIC);
545 		if (!tmp_hdr) {
546 			IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
547 			return -ENOMEM;
548 		}
549 
550 		*prevhdr = NEXTHDR_FRAGMENT;
551 		memcpy(tmp_hdr, skb->nh.raw, hlen);
552 		__skb_pull(skb, hlen);
553 		fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr));
554 		skb->nh.raw = __skb_push(skb, hlen);
555 		memcpy(skb->nh.raw, tmp_hdr, hlen);
556 
557 		ipv6_select_ident(skb, fh);
558 		fh->nexthdr = nexthdr;
559 		fh->reserved = 0;
560 		fh->frag_off = htons(IP6_MF);
561 		frag_id = fh->identification;
562 
563 		first_len = skb_pagelen(skb);
564 		skb->data_len = first_len - skb_headlen(skb);
565 		skb->len = first_len;
566 		skb->nh.ipv6h->payload_len = htons(first_len - sizeof(struct ipv6hdr));
567 
568 
569 		for (;;) {
570 			/* Prepare the header of the next frame
571 			 * before the previous one goes down. */
572 			if (frag) {
573 				frag->ip_summed = CHECKSUM_NONE;
574 				frag->h.raw = frag->data;
575 				fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr));
576 				frag->nh.raw = __skb_push(frag, hlen);
577 				memcpy(frag->nh.raw, tmp_hdr, hlen);
578 				offset += skb->len - hlen - sizeof(struct frag_hdr);
579 				fh->nexthdr = nexthdr;
580 				fh->reserved = 0;
581 				fh->frag_off = htons(offset);
582 				if (frag->next != NULL)
583 					fh->frag_off |= htons(IP6_MF);
584 				fh->identification = frag_id;
585 				frag->nh.ipv6h->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
586 				ip6_copy_metadata(frag, skb);
587 			}
588 
589 			err = output(skb);
590 			if (err || !frag)
591 				break;
592 
593 			skb = frag;
594 			frag = skb->next;
595 			skb->next = NULL;
596 		}
597 
598 		kfree(tmp_hdr);
599 
600 		if (err == 0) {
601 			IP6_INC_STATS(IPSTATS_MIB_FRAGOKS);
602 			return 0;
603 		}
604 
605 		while (frag) {
606 			skb = frag->next;
607 			kfree_skb(frag);
608 			frag = skb;
609 		}
610 
611 		IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
612 		return err;
613 	}
614 
615 slow_path:
616 	left = skb->len - hlen;		/* Space per frame */
617 	ptr = hlen;			/* Where to start from */
618 
619 	/*
620 	 *	Fragment the datagram.
621 	 */
622 
623 	*prevhdr = NEXTHDR_FRAGMENT;
624 
625 	/*
626 	 *	Keep copying data until we run out.
627 	 */
628 	while(left > 0)	{
629 		len = left;
630 		/* IF: it doesn't fit, use 'mtu' - the data space left */
631 		if (len > mtu)
632 			len = mtu;
633 		/* IF: we are not sending up to and including the packet end
634 		   then align the next start on an eight byte boundary */
635 		if (len < left)	{
636 			len &= ~7;
637 		}
638 		/*
639 		 *	Allocate buffer.
640 		 */
641 
642 		if ((frag = alloc_skb(len+hlen+sizeof(struct frag_hdr)+LL_RESERVED_SPACE(rt->u.dst.dev), GFP_ATOMIC)) == NULL) {
643 			NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
644 			IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
645 			err = -ENOMEM;
646 			goto fail;
647 		}
648 
649 		/*
650 		 *	Set up data on packet
651 		 */
652 
653 		ip6_copy_metadata(frag, skb);
654 		skb_reserve(frag, LL_RESERVED_SPACE(rt->u.dst.dev));
655 		skb_put(frag, len + hlen + sizeof(struct frag_hdr));
656 		frag->nh.raw = frag->data;
657 		fh = (struct frag_hdr*)(frag->data + hlen);
658 		frag->h.raw = frag->data + hlen + sizeof(struct frag_hdr);
659 
660 		/*
661 		 *	Charge the memory for the fragment to any owner
662 		 *	it might possess
663 		 */
664 		if (skb->sk)
665 			skb_set_owner_w(frag, skb->sk);
666 
667 		/*
668 		 *	Copy the packet header into the new buffer.
669 		 */
670 		memcpy(frag->nh.raw, skb->data, hlen);
671 
672 		/*
673 		 *	Build fragment header.
674 		 */
675 		fh->nexthdr = nexthdr;
676 		fh->reserved = 0;
677 		if (!frag_id) {
678 			ipv6_select_ident(skb, fh);
679 			frag_id = fh->identification;
680 		} else
681 			fh->identification = frag_id;
682 
683 		/*
684 		 *	Copy a block of the IP datagram.
685 		 */
686 		if (skb_copy_bits(skb, ptr, frag->h.raw, len))
687 			BUG();
688 		left -= len;
689 
690 		fh->frag_off = htons(offset);
691 		if (left > 0)
692 			fh->frag_off |= htons(IP6_MF);
693 		frag->nh.ipv6h->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
694 
695 		ptr += len;
696 		offset += len;
697 
698 		/*
699 		 *	Put this fragment into the sending queue.
700 		 */
701 
702 		IP6_INC_STATS(IPSTATS_MIB_FRAGCREATES);
703 
704 		err = output(frag);
705 		if (err)
706 			goto fail;
707 	}
708 	kfree_skb(skb);
709 	IP6_INC_STATS(IPSTATS_MIB_FRAGOKS);
710 	return err;
711 
712 fail:
713 	kfree_skb(skb);
714 	IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
715 	return err;
716 }
717 
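/*
 * Resolve the route for an outgoing flow.  A destination cached on the
 * socket is reused only if it still matches the flow (daddr for host
 * routes or daddr_cache for connected sockets, plus the outgoing
 * interface); otherwise a fresh routing lookup is done.  If the flow
 * has no source address yet, one is selected from the chosen route.
 */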
718 int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
719 {
720 	int err = 0;
721 
722 	*dst = NULL;
723 	if (sk) {
724 		struct ipv6_pinfo *np = inet6_sk(sk);
725 
726 		*dst = sk_dst_check(sk, np->dst_cookie);
727 		if (*dst) {
728 			struct rt6_info *rt = (struct rt6_info*)*dst;
729 
730 				/* Yes, checking route validity in the
731 				   unconnected case is not very simple. Take into
732 				   account that we do not support routing by
733 				   source, TOS, or MSG_DONTROUTE.	--ANK (980726)
734 
735 				   1. If the route is a host route, check that the
736 				      cached destination is current.
737 				      If it is a network route, we may still
738 				      check its validity using a saved pointer
739 				      to the last used address: daddr_cache.
740 				      We do not want to save the whole address now
741 				      (because the main consumer of this service
742 				       is TCP, which does not have this problem),
743 				      so this last trick works only on connected
744 				      sockets.
745 				   2. oif should also be the same.
746 				 */
747 
748 			if (((rt->rt6i_dst.plen != 128 ||
749 			      !ipv6_addr_equal(&fl->fl6_dst, &rt->rt6i_dst.addr))
750 			     && (np->daddr_cache == NULL ||
751 				 !ipv6_addr_equal(&fl->fl6_dst, np->daddr_cache)))
752 			    || (fl->oif && fl->oif != (*dst)->dev->ifindex)) {
753 				dst_release(*dst);
754 				*dst = NULL;
755 			}
756 		}
757 	}
758 
759 	if (*dst == NULL)
760 		*dst = ip6_route_output(sk, fl);
761 
762 	if ((err = (*dst)->error))
763 		goto out_err_release;
764 
765 	if (ipv6_addr_any(&fl->fl6_src)) {
766 		err = ipv6_get_saddr(*dst, &fl->fl6_dst, &fl->fl6_src);
767 
768 		if (err)
769 			goto out_err_release;
770 	}
771 
772 	return 0;
773 
774 out_err_release:
775 	dst_release(*dst);
776 	*dst = NULL;
777 	return err;
778 }
779 
780 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
781 
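/*
 * UFO path of ip6_append_data: build (or extend) a single large skb and
 * let the hardware segment it.  The fragment size the device should use
 * and the Fragment-header identification are recorded in the shared
 * info so the driver can generate the on-wire fragments.
 */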
782 static inline int ip6_ufo_append_data(struct sock *sk,
783 			int getfrag(void *from, char *to, int offset, int len,
784 			int odd, struct sk_buff *skb),
785 			void *from, int length, int hh_len, int fragheaderlen,
786 			int transhdrlen, int mtu, unsigned int flags)
787 
788 {
789 	struct sk_buff *skb;
790 	int err;
791 
792 	/* The network device supports UDP large send offload, so
793 	 * create a single skb containing the complete UDP
794 	 * datagram.
795 	 */
796 	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
797 		skb = sock_alloc_send_skb(sk,
798 			hh_len + fragheaderlen + transhdrlen + 20,
799 			(flags & MSG_DONTWAIT), &err);
800 		if (skb == NULL)
801 			return -ENOMEM;
802 
803 		/* reserve space for Hardware header */
804 		skb_reserve(skb, hh_len);
805 
806 		/* create space for UDP/IP header */
807 		skb_put(skb, fragheaderlen + transhdrlen);
808 
809 		/* initialize network header pointer */
810 		skb->nh.raw = skb->data;
811 
812 		/* initialize protocol header pointer */
813 		skb->h.raw = skb->data + fragheaderlen;
814 
815 		skb->ip_summed = CHECKSUM_HW;
816 		skb->csum = 0;
817 		sk->sk_sndmsg_off = 0;
818 	}
819 
820 	err = skb_append_datato_frags(sk, skb, getfrag, from,
821 				      (length - transhdrlen));
822 	if (!err) {
823 		struct frag_hdr fhdr;
824 
825 		/* specify the length of each IP datagram fragment */
826 		skb_shinfo(skb)->ufo_size = (mtu - fragheaderlen) -
827 						sizeof(struct frag_hdr);
828 		ipv6_select_ident(skb, &fhdr);
829 		skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
830 		__skb_queue_tail(&sk->sk_write_queue, skb);
831 
832 		return 0;
833 	}
834 	/* There is not enough support to do UDP LSO,
835 	 * so follow the normal path.
836 	 */
837 	kfree_skb(skb);
838 
839 	return err;
840 }
841 
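/*
 * Queue data on the socket for a corked IPv6 packet.  The first call
 * records the cork state (route, options, hop limit, traffic class,
 * fragment size); later calls append more data.  Payload is laid out in
 * maxfraglen-sized pieces so that ip6_push_pending_frames() can turn
 * the write queue into a packet whose fragments split on 8-byte
 * boundaries.
 */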
842 int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
843 	int offset, int len, int odd, struct sk_buff *skb),
844 	void *from, int length, int transhdrlen,
845 	int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi *fl,
846 	struct rt6_info *rt, unsigned int flags)
847 {
848 	struct inet_sock *inet = inet_sk(sk);
849 	struct ipv6_pinfo *np = inet6_sk(sk);
850 	struct sk_buff *skb;
851 	unsigned int maxfraglen, fragheaderlen;
852 	int exthdrlen;
853 	int hh_len;
854 	int mtu;
855 	int copy;
856 	int err;
857 	int offset = 0;
858 	int csummode = CHECKSUM_NONE;
859 
860 	if (flags&MSG_PROBE)
861 		return 0;
862 	if (skb_queue_empty(&sk->sk_write_queue)) {
863 		/*
864 		 * setup for corking
865 		 */
866 		if (opt) {
867 			if (np->cork.opt == NULL) {
868 				np->cork.opt = kmalloc(opt->tot_len,
869 						       sk->sk_allocation);
870 				if (unlikely(np->cork.opt == NULL))
871 					return -ENOBUFS;
872 			} else if (np->cork.opt->tot_len < opt->tot_len) {
873 				printk(KERN_DEBUG "ip6_append_data: invalid option length\n");
874 				return -EINVAL;
875 			}
876 			memcpy(np->cork.opt, opt, opt->tot_len);
877 			inet->cork.flags |= IPCORK_OPT;
878 			/* need source address above --miyazawa */
879 		}
880 		dst_hold(&rt->u.dst);
881 		np->cork.rt = rt;
882 		inet->cork.fl = *fl;
883 		np->cork.hop_limit = hlimit;
884 		np->cork.tclass = tclass;
885 		inet->cork.fragsize = mtu = dst_mtu(rt->u.dst.path);
886 		if (dst_allfrag(rt->u.dst.path))
887 			inet->cork.flags |= IPCORK_ALLFRAG;
888 		inet->cork.length = 0;
889 		sk->sk_sndmsg_page = NULL;
890 		sk->sk_sndmsg_off = 0;
891 		exthdrlen = rt->u.dst.header_len + (opt ? opt->opt_flen : 0);
892 		length += exthdrlen;
893 		transhdrlen += exthdrlen;
894 	} else {
895 		rt = np->cork.rt;
896 		fl = &inet->cork.fl;
897 		if (inet->cork.flags & IPCORK_OPT)
898 			opt = np->cork.opt;
899 		transhdrlen = 0;
900 		exthdrlen = 0;
901 		mtu = inet->cork.fragsize;
902 	}
903 
904 	hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
905 
906 	fragheaderlen = sizeof(struct ipv6hdr) + (opt ? opt->opt_nflen : 0);
907 	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);
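	/*
	 * Worked example (hypothetical numbers): with a 1500-byte MTU and
	 * a plain IPv6 header (fragheaderlen = 40), maxfraglen is
	 * ((1500 - 40) & ~7) + 40 - 8 = 1456 + 40 - 8 = 1488, i.e. each
	 * non-final fragment carries 1448 bytes of payload after the
	 * Fragment header.
	 */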
908 
909 	if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
910 		if (inet->cork.length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
911 			ipv6_local_error(sk, EMSGSIZE, fl, mtu-exthdrlen);
912 			return -EMSGSIZE;
913 		}
914 	}
915 
916 	/*
917 	 * Let's try using as much space as possible.
918 	 * Use MTU if total length of the message fits into the MTU.
919 	 * Otherwise, we need to reserve fragment header and
920 	 * fragment alignment (= 8-15 octets, in total).
921 	 *
922 	 * Note that we may need to "move" the data from the tail
923 	 * of the buffer to the new fragment when we split
924 	 * the message.
925 	 *
926 	 * FIXME: It may be fragmented into multiple chunks
927 	 *        at once if non-fragmentable extension headers
928 	 *        are too large.
929 	 * --yoshfuji
930 	 */
931 
932 	inet->cork.length += length;
933 	if (((length > mtu) && (sk->sk_protocol == IPPROTO_UDP)) &&
934 	    (rt->u.dst.dev->features & NETIF_F_UFO)) {
935 
936 		if (ip6_ufo_append_data(sk, getfrag, from, length, hh_len,
937 				fragheaderlen, transhdrlen, mtu, flags))
938 			goto error;
939 
940 		return 0;
941 	}
942 
943 	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
944 		goto alloc_new_skb;
945 
946 	while (length > 0) {
947 		/* Check if the remaining data fits into current packet. */
948 		copy = (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
949 		if (copy < length)
950 			copy = maxfraglen - skb->len;
951 
952 		if (copy <= 0) {
953 			char *data;
954 			unsigned int datalen;
955 			unsigned int fraglen;
956 			unsigned int fraggap;
957 			unsigned int alloclen;
958 			struct sk_buff *skb_prev;
959 alloc_new_skb:
960 			skb_prev = skb;
961 
962 			/* There's no room in the current skb */
963 			if (skb_prev)
964 				fraggap = skb_prev->len - maxfraglen;
965 			else
966 				fraggap = 0;
967 
968 			/*
969 			 * If remaining data exceeds the mtu,
970 			 * we know we need more fragment(s).
971 			 */
972 			datalen = length + fraggap;
973 			if (datalen > (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
974 				datalen = maxfraglen - fragheaderlen;
975 
976 			fraglen = datalen + fragheaderlen;
977 			if ((flags & MSG_MORE) &&
978 			    !(rt->u.dst.dev->features&NETIF_F_SG))
979 				alloclen = mtu;
980 			else
981 				alloclen = datalen + fragheaderlen;
982 
983 			/*
984 			 * The last fragment gets additional space at tail.
985 			 * Note: we overallocate on fragments with MSG_MORE
986 			 * because we have no idea if we're the last one.
987 			 */
988 			if (datalen == length + fraggap)
989 				alloclen += rt->u.dst.trailer_len;
990 
991 			/*
992 			 * We just reserve space for fragment header.
993 			 * Note: this may be overallocation if the message
994 			 * (without MSG_MORE) fits into the MTU.
995 			 */
996 			alloclen += sizeof(struct frag_hdr);
997 
998 			if (transhdrlen) {
999 				skb = sock_alloc_send_skb(sk,
1000 						alloclen + hh_len,
1001 						(flags & MSG_DONTWAIT), &err);
1002 			} else {
1003 				skb = NULL;
1004 				if (atomic_read(&sk->sk_wmem_alloc) <=
1005 				    2 * sk->sk_sndbuf)
1006 					skb = sock_wmalloc(sk,
1007 							   alloclen + hh_len, 1,
1008 							   sk->sk_allocation);
1009 				if (unlikely(skb == NULL))
1010 					err = -ENOBUFS;
1011 			}
1012 			if (skb == NULL)
1013 				goto error;
1014 			/*
1015 			 *	Fill in the control structures
1016 			 */
1017 			skb->ip_summed = csummode;
1018 			skb->csum = 0;
1019 			/* reserve for fragmentation */
1020 			skb_reserve(skb, hh_len+sizeof(struct frag_hdr));
1021 
1022 			/*
1023 			 *	Find where to start putting bytes
1024 			 */
1025 			data = skb_put(skb, fraglen);
1026 			skb->nh.raw = data + exthdrlen;
1027 			data += fragheaderlen;
1028 			skb->h.raw = data + exthdrlen;
1029 
1030 			if (fraggap) {
1031 				skb->csum = skb_copy_and_csum_bits(
1032 					skb_prev, maxfraglen,
1033 					data + transhdrlen, fraggap, 0);
1034 				skb_prev->csum = csum_sub(skb_prev->csum,
1035 							  skb->csum);
1036 				data += fraggap;
1037 				skb_trim(skb_prev, maxfraglen);
1038 			}
1039 			copy = datalen - transhdrlen - fraggap;
1040 			if (copy < 0) {
1041 				err = -EINVAL;
1042 				kfree_skb(skb);
1043 				goto error;
1044 			} else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
1045 				err = -EFAULT;
1046 				kfree_skb(skb);
1047 				goto error;
1048 			}
1049 
1050 			offset += copy;
1051 			length -= datalen - fraggap;
1052 			transhdrlen = 0;
1053 			exthdrlen = 0;
1054 			csummode = CHECKSUM_NONE;
1055 
1056 			/*
1057 			 * Put the packet on the pending queue
1058 			 */
1059 			__skb_queue_tail(&sk->sk_write_queue, skb);
1060 			continue;
1061 		}
1062 
1063 		if (copy > length)
1064 			copy = length;
1065 
1066 		if (!(rt->u.dst.dev->features&NETIF_F_SG)) {
1067 			unsigned int off;
1068 
1069 			off = skb->len;
1070 			if (getfrag(from, skb_put(skb, copy),
1071 						offset, copy, off, skb) < 0) {
1072 				__skb_trim(skb, off);
1073 				err = -EFAULT;
1074 				goto error;
1075 			}
1076 		} else {
1077 			int i = skb_shinfo(skb)->nr_frags;
1078 			skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
1079 			struct page *page = sk->sk_sndmsg_page;
1080 			int off = sk->sk_sndmsg_off;
1081 			unsigned int left;
1082 
1083 			if (page && (left = PAGE_SIZE - off) > 0) {
1084 				if (copy >= left)
1085 					copy = left;
1086 				if (page != frag->page) {
1087 					if (i == MAX_SKB_FRAGS) {
1088 						err = -EMSGSIZE;
1089 						goto error;
1090 					}
1091 					get_page(page);
1092 					skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
1093 					frag = &skb_shinfo(skb)->frags[i];
1094 				}
1095 			} else if (i < MAX_SKB_FRAGS) {
1096 				if (copy > PAGE_SIZE)
1097 					copy = PAGE_SIZE;
1098 				page = alloc_pages(sk->sk_allocation, 0);
1099 				if (page == NULL) {
1100 					err = -ENOMEM;
1101 					goto error;
1102 				}
1103 				sk->sk_sndmsg_page = page;
1104 				sk->sk_sndmsg_off = 0;
1105 
1106 				skb_fill_page_desc(skb, i, page, 0, 0);
1107 				frag = &skb_shinfo(skb)->frags[i];
1108 				skb->truesize += PAGE_SIZE;
1109 				atomic_add(PAGE_SIZE, &sk->sk_wmem_alloc);
1110 			} else {
1111 				err = -EMSGSIZE;
1112 				goto error;
1113 			}
1114 			if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
1115 				err = -EFAULT;
1116 				goto error;
1117 			}
1118 			sk->sk_sndmsg_off += copy;
1119 			frag->size += copy;
1120 			skb->len += copy;
1121 			skb->data_len += copy;
1122 		}
1123 		offset += copy;
1124 		length -= copy;
1125 	}
1126 	return 0;
1127 error:
1128 	inet->cork.length -= length;
1129 	IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
1130 	return err;
1131 }
1132 
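/*
 * Turn the socket's write queue into one IPv6 packet: chain the queued
 * skbs onto the head skb's frag_list, push any extension headers and
 * the IPv6 header recorded in the cork, and hand the result to
 * NF_IP6_LOCAL_OUT / dst_output().  Fragmentation, if needed, happens
 * later in ip6_output().
 */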
1133 int ip6_push_pending_frames(struct sock *sk)
1134 {
1135 	struct sk_buff *skb, *tmp_skb;
1136 	struct sk_buff **tail_skb;
1137 	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1138 	struct inet_sock *inet = inet_sk(sk);
1139 	struct ipv6_pinfo *np = inet6_sk(sk);
1140 	struct ipv6hdr *hdr;
1141 	struct ipv6_txoptions *opt = np->cork.opt;
1142 	struct rt6_info *rt = np->cork.rt;
1143 	struct flowi *fl = &inet->cork.fl;
1144 	unsigned char proto = fl->proto;
1145 	int err = 0;
1146 
1147 	if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
1148 		goto out;
1149 	tail_skb = &(skb_shinfo(skb)->frag_list);
1150 
1151 	/* move skb->data to ip header from ext header */
1152 	if (skb->data < skb->nh.raw)
1153 		__skb_pull(skb, skb->nh.raw - skb->data);
1154 	while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
1155 		__skb_pull(tmp_skb, skb->h.raw - skb->nh.raw);
1156 		*tail_skb = tmp_skb;
1157 		tail_skb = &(tmp_skb->next);
1158 		skb->len += tmp_skb->len;
1159 		skb->data_len += tmp_skb->len;
1160 		skb->truesize += tmp_skb->truesize;
1161 		__sock_put(tmp_skb->sk);
1162 		tmp_skb->destructor = NULL;
1163 		tmp_skb->sk = NULL;
1164 	}
1165 
1166 	ipv6_addr_copy(final_dst, &fl->fl6_dst);
1167 	__skb_pull(skb, skb->h.raw - skb->nh.raw);
1168 	if (opt && opt->opt_flen)
1169 		ipv6_push_frag_opts(skb, opt, &proto);
1170 	if (opt && opt->opt_nflen)
1171 		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);
1172 
1173 	skb->nh.ipv6h = hdr = (struct ipv6hdr*) skb_push(skb, sizeof(struct ipv6hdr));
1174 
1175 	*(u32*)hdr = fl->fl6_flowlabel |
1176 		     htonl(0x60000000 | ((int)np->cork.tclass << 20));
1177 
1178 	if (skb->len <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN)
1179 		hdr->payload_len = htons(skb->len - sizeof(struct ipv6hdr));
1180 	else
1181 		hdr->payload_len = 0;
1182 	hdr->hop_limit = np->cork.hop_limit;
1183 	hdr->nexthdr = proto;
1184 	ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
1185 	ipv6_addr_copy(&hdr->daddr, final_dst);
1186 
1187 	skb->priority = sk->sk_priority;
1188 
1189 	skb->dst = dst_clone(&rt->u.dst);
1190 	IP6_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
1191 	err = NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, skb->dst->dev, dst_output);
1192 	if (err) {
1193 		if (err > 0)
1194 			err = np->recverr ? net_xmit_errno(err) : 0;
1195 		if (err)
1196 			goto error;
1197 	}
1198 
1199 out:
1200 	inet->cork.flags &= ~IPCORK_OPT;
1201 	kfree(np->cork.opt);
1202 	np->cork.opt = NULL;
1203 	if (np->cork.rt) {
1204 		dst_release(&np->cork.rt->u.dst);
1205 		np->cork.rt = NULL;
1206 		inet->cork.flags &= ~IPCORK_ALLFRAG;
1207 	}
1208 	memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
1209 	return err;
1210 error:
1211 	goto out;
1212 }
1213 
1214 void ip6_flush_pending_frames(struct sock *sk)
1215 {
1216 	struct inet_sock *inet = inet_sk(sk);
1217 	struct ipv6_pinfo *np = inet6_sk(sk);
1218 	struct sk_buff *skb;
1219 
1220 	while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
1221 		IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
1222 		kfree_skb(skb);
1223 	}
1224 
1225 	inet->cork.flags &= ~IPCORK_OPT;
1226 
1227 	kfree(np->cork.opt);
1228 	np->cork.opt = NULL;
1229 	if (np->cork.rt) {
1230 		dst_release(&np->cork.rt->u.dst);
1231 		np->cork.rt = NULL;
1232 		inet->cork.flags &= ~IPCORK_ALLFRAG;
1233 	}
1234 	memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
1235 }
1236