xref: /openbmc/linux/net/ipv6/ip6_output.c (revision 1da177e4)
1 /*
2  *	IPv6 output functions
3  *	Linux INET6 implementation
4  *
5  *	Authors:
6  *	Pedro Roque		<roque@di.fc.ul.pt>
7  *
8  *	$Id: ip6_output.c,v 1.34 2002/02/01 22:01:04 davem Exp $
9  *
10  *	Based on linux/net/ipv4/ip_output.c
11  *
12  *	This program is free software; you can redistribute it and/or
13  *      modify it under the terms of the GNU General Public License
14  *      as published by the Free Software Foundation; either version
15  *      2 of the License, or (at your option) any later version.
16  *
17  *	Changes:
18  *	A.N.Kuznetsov	:	arithmetic in fragmentation.
19  *				extension headers are implemented.
20  *				route changes now work.
21  *				ip6_forward does not confuse sniffers.
22  *				etc.
23  *
24  *      H. von Brand    :       Added missing #include <linux/string.h>
25  *	Imran Patel	: 	frag id should be in NBO
26  *      Kazunori MIYAZAWA @USAGI
27  *			:       add ip6_append_data and related functions
28  *				for datagram xmit
29  */
30 
31 #include <linux/config.h>
32 #include <linux/errno.h>
33 #include <linux/types.h>
34 #include <linux/string.h>
35 #include <linux/socket.h>
36 #include <linux/net.h>
37 #include <linux/netdevice.h>
38 #include <linux/if_arp.h>
39 #include <linux/in6.h>
40 #include <linux/tcp.h>
41 #include <linux/route.h>
42 
43 #include <linux/netfilter.h>
44 #include <linux/netfilter_ipv6.h>
45 
46 #include <net/sock.h>
47 #include <net/snmp.h>
48 
49 #include <net/ipv6.h>
50 #include <net/ndisc.h>
51 #include <net/protocol.h>
52 #include <net/ip6_route.h>
53 #include <net/addrconf.h>
54 #include <net/rawv6.h>
55 #include <net/icmp.h>
56 #include <net/xfrm.h>
57 #include <net/checksum.h>
58 
59 static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));
60 
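/*
 * Pick the next value from a single global fragment-ID counter
 * (serialized by ip6_id_lock) and store it, in network byte order,
 * in the fragment header.  The counter skips zero when it wraps.
 */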
61 static __inline__ void ipv6_select_ident(struct sk_buff *skb, struct frag_hdr *fhdr)
62 {
63 	static u32 ipv6_fragmentation_id = 1;
64 	static DEFINE_SPINLOCK(ip6_id_lock);
65 
66 	spin_lock_bh(&ip6_id_lock);
67 	fhdr->identification = htonl(ipv6_fragmentation_id);
68 	if (++ipv6_fragmentation_id == 0)
69 		ipv6_fragmentation_id = 1;
70 	spin_unlock_bh(&ip6_id_lock);
71 }
72 
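/*
 * Last step of the output path: if the dst carries a cached hardware
 * header, copy it in front of the packet and call hh_output();
 * otherwise let the neighbour entry emit the packet.  With neither
 * available, count OutNoRoutes and drop.
 */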
73 static inline int ip6_output_finish(struct sk_buff *skb)
74 {
75 
76 	struct dst_entry *dst = skb->dst;
77 	struct hh_cache *hh = dst->hh;
78 
79 	if (hh) {
80 		int hh_alen;
81 
82 		read_lock_bh(&hh->hh_lock);
83 		hh_alen = HH_DATA_ALIGN(hh->hh_len);
84 		memcpy(skb->data - hh_alen, hh->hh_data, hh_alen);
85 		read_unlock_bh(&hh->hh_lock);
86 		skb_push(skb, hh->hh_len);
87 		return hh->hh_output(skb);
88 	} else if (dst->neighbour)
89 		return dst->neighbour->output(skb);
90 
91 	IP6_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
92 	kfree_skb(skb);
93 	return -EINVAL;
94 
95 }
96 
97 /* dev_loopback_xmit for use with netfilter. */
98 static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
99 {
100 	newskb->mac.raw = newskb->data;
101 	__skb_pull(newskb, newskb->nh.raw - newskb->data);
102 	newskb->pkt_type = PACKET_LOOPBACK;
103 	newskb->ip_summed = CHECKSUM_UNNECESSARY;
104 	BUG_TRAP(newskb->dst);
105 
106 	netif_rx(newskb);
107 	return 0;
108 }
109 
110 
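/*
 * Post-routing transmit: for multicast destinations, loop a clone back
 * through NF_IP6_POST_ROUTING when the outgoing device has joined the
 * group and the socket allows multicast loopback (a hop limit of zero
 * restricts the packet to that looped copy).  The original packet then
 * passes NF_IP6_POST_ROUTING on its way to ip6_output_finish().
 */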
111 static int ip6_output2(struct sk_buff *skb)
112 {
113 	struct dst_entry *dst = skb->dst;
114 	struct net_device *dev = dst->dev;
115 
116 	skb->protocol = htons(ETH_P_IPV6);
117 	skb->dev = dev;
118 
119 	if (ipv6_addr_is_multicast(&skb->nh.ipv6h->daddr)) {
120 		struct ipv6_pinfo* np = skb->sk ? inet6_sk(skb->sk) : NULL;
121 
122 		if (!(dev->flags & IFF_LOOPBACK) && (!np || np->mc_loop) &&
123 		    ipv6_chk_mcast_addr(dev, &skb->nh.ipv6h->daddr,
124 				&skb->nh.ipv6h->saddr)) {
125 			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
126 
127 			/* Do not check for IFF_ALLMULTI; multicast routing
128 			   is not supported in any case.
129 			 */
130 			if (newskb)
131 				NF_HOOK(PF_INET6, NF_IP6_POST_ROUTING, newskb, NULL,
132 					newskb->dev,
133 					ip6_dev_loopback_xmit);
134 
135 			if (skb->nh.ipv6h->hop_limit == 0) {
136 				IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
137 				kfree_skb(skb);
138 				return 0;
139 			}
140 		}
141 
142 		IP6_INC_STATS(IPSTATS_MIB_OUTMCASTPKTS);
143 	}
144 
145 	return NF_HOOK(PF_INET6, NF_IP6_POST_ROUTING, skb, NULL, skb->dev, ip6_output_finish);
146 }
147 
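/*
 * dst_output() entry point: fragment if the packet exceeds the route
 * MTU (or the route demands fragmenting everything), otherwise hand
 * the packet straight to ip6_output2().
 */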
148 int ip6_output(struct sk_buff *skb)
149 {
150 	if (skb->len > dst_mtu(skb->dst) || dst_allfrag(skb->dst))
151 		return ip6_fragment(skb, ip6_output2);
152 	else
153 		return ip6_output2(skb);
154 }
155 
156 #ifdef CONFIG_NETFILTER
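/*
 * Re-route a packet whose addresses may have been rewritten by
 * netfilter: rebuild a flow from the current IPv6 header and replace
 * skb->dst with a freshly looked-up route.
 */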
157 int ip6_route_me_harder(struct sk_buff *skb)
158 {
159 	struct ipv6hdr *iph = skb->nh.ipv6h;
160 	struct dst_entry *dst;
161 	struct flowi fl = {
162 		.oif = skb->sk ? skb->sk->sk_bound_dev_if : 0,
163 		.nl_u =
164 		{ .ip6_u =
165 		  { .daddr = iph->daddr,
166 		    .saddr = iph->saddr, } },
167 		.proto = iph->nexthdr,
168 	};
169 
170 	dst = ip6_route_output(skb->sk, &fl);
171 
172 	if (dst->error) {
173 		IP6_INC_STATS(IPSTATS_MIB_OUTNOROUTES);
174 		LIMIT_NETDEBUG(
175 			printk(KERN_DEBUG "ip6_route_me_harder: No more route.\n"));
176 		dst_release(dst);
177 		return -EINVAL;
178 	}
179 
180 	/* Drop old route. */
181 	dst_release(skb->dst);
182 
183 	skb->dst = dst;
184 	return 0;
185 }
186 #endif
187 
188 static inline int ip6_maybe_reroute(struct sk_buff *skb)
189 {
190 #ifdef CONFIG_NETFILTER
191 	if (skb->nfcache & NFC_ALTERED){
192 		if (ip6_route_me_harder(skb) != 0){
193 			kfree_skb(skb);
194 			return -EINVAL;
195 		}
196 	}
197 #endif /* CONFIG_NETFILTER */
198 	return dst_output(skb);
199 }
200 
201 /*
202  *	xmit an sk_buff (used by TCP)
203  */
204 
205 int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
206 	     struct ipv6_txoptions *opt, int ipfragok)
207 {
208 	struct ipv6_pinfo *np = sk ? inet6_sk(sk) : NULL;
209 	struct in6_addr *first_hop = &fl->fl6_dst;
210 	struct dst_entry *dst = skb->dst;
211 	struct ipv6hdr *hdr;
212 	u8  proto = fl->proto;
213 	int seg_len = skb->len;
214 	int hlimit;
215 	u32 mtu;
216 
217 	if (opt) {
218 		int head_room;
219 
220 		/* First: exthdrs may take lots of space (~8K for now);
221 		   MAX_HEADER is not enough.
222 		 */
223 		head_room = opt->opt_nflen + opt->opt_flen;
224 		seg_len += head_room;
225 		head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
226 
227 		if (skb_headroom(skb) < head_room) {
228 			struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
229 			kfree_skb(skb);
230 			skb = skb2;
231 			if (skb == NULL) {
232 				IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
233 				return -ENOBUFS;
234 			}
235 			if (sk)
236 				skb_set_owner_w(skb, sk);
237 		}
238 		if (opt->opt_flen)
239 			ipv6_push_frag_opts(skb, opt, &proto);
240 		if (opt->opt_nflen)
241 			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
242 	}
243 
244 	hdr = skb->nh.ipv6h = (struct ipv6hdr*)skb_push(skb, sizeof(struct ipv6hdr));
245 
246 	/*
247 	 *	Fill in the IPv6 header
248 	 */
249 
250 	*(u32*)hdr = htonl(0x60000000) | fl->fl6_flowlabel;
251 	hlimit = -1;
252 	if (np)
253 		hlimit = np->hop_limit;
254 	if (hlimit < 0)
255 		hlimit = dst_metric(dst, RTAX_HOPLIMIT);
256 	if (hlimit < 0)
257 		hlimit = ipv6_get_hoplimit(dst->dev);
258 
259 	hdr->payload_len = htons(seg_len);
260 	hdr->nexthdr = proto;
261 	hdr->hop_limit = hlimit;
262 
263 	ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
264 	ipv6_addr_copy(&hdr->daddr, first_hop);
265 
266 	mtu = dst_mtu(dst);
267 	if ((skb->len <= mtu) || ipfragok) {
268 		IP6_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
269 		return NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, dst->dev, ip6_maybe_reroute);
270 	}
271 
272 	if (net_ratelimit())
273 		printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
274 	skb->dev = dst->dev;
275 	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
276 	IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
277 	kfree_skb(skb);
278 	return -EMSGSIZE;
279 }
280 
281 /*
282  *	To avoid extra problems ND packets are sent through this
283  *	routine. It's code duplication, but I really want to avoid
284  *	extra checks since ipv6_build_header is used by TCP (which
285  *	is performance critical for us).
286  */
287 
288 int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
289 	       struct in6_addr *saddr, struct in6_addr *daddr,
290 	       int proto, int len)
291 {
292 	struct ipv6_pinfo *np = inet6_sk(sk);
293 	struct ipv6hdr *hdr;
294 	int totlen;
295 
296 	skb->protocol = htons(ETH_P_IPV6);
297 	skb->dev = dev;
298 
299 	totlen = len + sizeof(struct ipv6hdr);
300 
301 	hdr = (struct ipv6hdr *) skb_put(skb, sizeof(struct ipv6hdr));
302 	skb->nh.ipv6h = hdr;
303 
304 	*(u32*)hdr = htonl(0x60000000);
305 
306 	hdr->payload_len = htons(len);
307 	hdr->nexthdr = proto;
308 	hdr->hop_limit = np->hop_limit;
309 
310 	ipv6_addr_copy(&hdr->saddr, saddr);
311 	ipv6_addr_copy(&hdr->daddr, daddr);
312 
313 	return 0;
314 }
315 
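/*
 * Deliver a Router Alert packet to every raw socket on ip6_ra_chain
 * that registered (via IPV6_ROUTER_ALERT) for this RA value.  Returns
 * 1 if at least one socket consumed the skb, 0 otherwise.
 */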
316 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
317 {
318 	struct ip6_ra_chain *ra;
319 	struct sock *last = NULL;
320 
321 	read_lock(&ip6_ra_lock);
322 	for (ra = ip6_ra_chain; ra; ra = ra->next) {
323 		struct sock *sk = ra->sk;
324 		if (sk && ra->sel == sel) {
325 			if (last) {
326 				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
327 				if (skb2)
328 					rawv6_rcv(last, skb2);
329 			}
330 			last = sk;
331 		}
332 	}
333 
334 	if (last) {
335 		rawv6_rcv(last, skb);
336 		read_unlock(&ip6_ra_lock);
337 		return 1;
338 	}
339 	read_unlock(&ip6_ra_lock);
340 	return 0;
341 }
342 
343 static inline int ip6_forward_finish(struct sk_buff *skb)
344 {
345 	return dst_output(skb);
346 }
347 
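/*
 * Forwarding path: hand Router Alert packets to interested raw
 * sockets, check and decrement the hop limit, send a redirect when the
 * packet would leave through the interface it arrived on, enforce the
 * outgoing MTU, and finally pass the packet through NF_IP6_FORWARD.
 */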
348 int ip6_forward(struct sk_buff *skb)
349 {
350 	struct dst_entry *dst = skb->dst;
351 	struct ipv6hdr *hdr = skb->nh.ipv6h;
352 	struct inet6_skb_parm *opt = IP6CB(skb);
353 
354 	if (ipv6_devconf.forwarding == 0)
355 		goto error;
356 
357 	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
358 		IP6_INC_STATS(IPSTATS_MIB_INDISCARDS);
359 		goto drop;
360 	}
361 
362 	skb->ip_summed = CHECKSUM_NONE;
363 
364 	/*
365 	 *	We DO NOT do any processing on
366 	 *	RA packets; they are pushed to user level AS IS,
367 	 *	without any WARRANTY that the application will be able
368 	 *	to interpret them. The reason is that we
369 	 *	cannot do anything clever here.
370 	 *
371 	 *	We are not the end node, so if the packet contains
372 	 *	AH/ESP we cannot do anything with it.
373 	 *	Defragmentation would also be a mistake: RA packets
374 	 *	cannot be fragmented, because there is no warranty
375 	 *	that different fragments will go along one path. --ANK
376 	 */
377 	if (opt->ra) {
378 		u8 *ptr = skb->nh.raw + opt->ra;
379 		if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
380 			return 0;
381 	}
382 
383 	/*
384 	 *	check and decrement ttl
385 	 */
386 	if (hdr->hop_limit <= 1) {
387 		/* Force the OUTPUT device to be used for the source address */
388 		skb->dev = dst->dev;
389 		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT,
390 			    0, skb->dev);
391 
392 		kfree_skb(skb);
393 		return -ETIMEDOUT;
394 	}
395 
396 	if (!xfrm6_route_forward(skb)) {
397 		IP6_INC_STATS(IPSTATS_MIB_INDISCARDS);
398 		goto drop;
399 	}
400 	dst = skb->dst;
401 
402 	/* IPv6 specs say nothing about it, but it is clear that we cannot
403 	   send redirects to source routed frames.
404 	 */
405 	if (skb->dev == dst->dev && dst->neighbour && opt->srcrt == 0) {
406 		struct in6_addr *target = NULL;
407 		struct rt6_info *rt;
408 		struct neighbour *n = dst->neighbour;
409 
410 		/*
411 		 *	incoming and outgoing devices are the same;
412 		 *	send a redirect.
413 		 */
414 
415 		rt = (struct rt6_info *) dst;
416 		if ((rt->rt6i_flags & RTF_GATEWAY))
417 			target = (struct in6_addr*)&n->primary_key;
418 		else
419 			target = &hdr->daddr;
420 
421 		/* Limit redirects both by destination (here)
422 		   and by source (inside ndisc_send_redirect)
423 		 */
424 		if (xrlim_allow(dst, 1*HZ))
425 			ndisc_send_redirect(skb, n, target);
426 	} else if (ipv6_addr_type(&hdr->saddr)&(IPV6_ADDR_MULTICAST|IPV6_ADDR_LOOPBACK
427 						|IPV6_ADDR_LINKLOCAL)) {
428 		/* This check is security critical. */
429 		goto error;
430 	}
431 
432 	if (skb->len > dst_mtu(dst)) {
433 		/* Again, force the OUTPUT device to be used for the source address */
434 		skb->dev = dst->dev;
435 		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, dst_mtu(dst), skb->dev);
436 		IP6_INC_STATS_BH(IPSTATS_MIB_INTOOBIGERRORS);
437 		IP6_INC_STATS_BH(IPSTATS_MIB_FRAGFAILS);
438 		kfree_skb(skb);
439 		return -EMSGSIZE;
440 	}
441 
442 	if (skb_cow(skb, dst->dev->hard_header_len)) {
443 		IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
444 		goto drop;
445 	}
446 
447 	hdr = skb->nh.ipv6h;
448 
449 	/* Decrementing the hop limit is delayed until after the skb COW */
450 
451 	hdr->hop_limit--;
452 
453 	IP6_INC_STATS_BH(IPSTATS_MIB_OUTFORWDATAGRAMS);
454 	return NF_HOOK(PF_INET6, NF_IP6_FORWARD, skb, skb->dev, dst->dev, ip6_forward_finish);
455 
456 error:
457 	IP6_INC_STATS_BH(IPSTATS_MIB_INADDRERRORS);
458 drop:
459 	kfree_skb(skb);
460 	return -EINVAL;
461 }
462 
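/*
 * Propagate per-packet metadata (packet type, priority, dst reference,
 * traffic-control and netfilter state) from the original skb to a
 * freshly built fragment.
 */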
463 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
464 {
465 	to->pkt_type = from->pkt_type;
466 	to->priority = from->priority;
467 	to->protocol = from->protocol;
468 	to->security = from->security;
469 	dst_release(to->dst);
470 	to->dst = dst_clone(from->dst);
471 	to->dev = from->dev;
472 
473 #ifdef CONFIG_NET_SCHED
474 	to->tc_index = from->tc_index;
475 #endif
476 #ifdef CONFIG_NETFILTER
477 	to->nfmark = from->nfmark;
478 	/* Connection association is the same as for the pre-frag packet */
479 	to->nfct = from->nfct;
480 	nf_conntrack_get(to->nfct);
481 	to->nfctinfo = from->nfctinfo;
482 #ifdef CONFIG_BRIDGE_NETFILTER
483 	nf_bridge_put(to->nf_bridge);
484 	to->nf_bridge = from->nf_bridge;
485 	nf_bridge_get(to->nf_bridge);
486 #endif
487 #ifdef CONFIG_NETFILTER_DEBUG
488 	to->nf_debug = from->nf_debug;
489 #endif
490 #endif
491 }
492 
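/*
 * Walk the unfragmentable part of the extension header chain
 * (hop-by-hop, routing, and any destination options that precede a
 * routing header) and return the offset where a fragment header must
 * be inserted; *nexthdr is left pointing at the nexthdr byte to patch.
 */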
493 int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
494 {
495 	u16 offset = sizeof(struct ipv6hdr);
496 	struct ipv6_opt_hdr *exthdr = (struct ipv6_opt_hdr*)(skb->nh.ipv6h + 1);
497 	unsigned int packet_len = skb->tail - skb->nh.raw;
498 	int found_rhdr = 0;
499 	*nexthdr = &skb->nh.ipv6h->nexthdr;
500 
501 	while (offset + 1 <= packet_len) {
502 
503 		switch (**nexthdr) {
504 
505 		case NEXTHDR_HOP:
506 		case NEXTHDR_ROUTING:
507 		case NEXTHDR_DEST:
508 			if (**nexthdr == NEXTHDR_ROUTING) found_rhdr = 1;
509 			if (**nexthdr == NEXTHDR_DEST && found_rhdr) return offset;
510 			offset += ipv6_optlen(exthdr);
511 			*nexthdr = &exthdr->nexthdr;
512 			exthdr = (struct ipv6_opt_hdr*)(skb->nh.raw + offset);
513 			break;
514 		default:
515 			return offset;
516 		}
517 	}
518 
519 	return offset;
520 }
521 
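/*
 * Split an oversized packet.  The fast path reuses an existing
 * frag_list when every fragment already has suitable length, headroom
 * and ownership; otherwise the slow path copies the payload into newly
 * allocated skbs, each prefixed with a copy of the unfragmentable
 * headers and a fragment header.
 */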
522 static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
523 {
524 	struct net_device *dev;
525 	struct sk_buff *frag;
526 	struct rt6_info *rt = (struct rt6_info*)skb->dst;
527 	struct ipv6hdr *tmp_hdr;
528 	struct frag_hdr *fh;
529 	unsigned int mtu, hlen, left, len;
530 	u32 frag_id = 0;
531 	int ptr, offset = 0, err=0;
532 	u8 *prevhdr, nexthdr = 0;
533 
534 	dev = rt->u.dst.dev;
535 	hlen = ip6_find_1stfragopt(skb, &prevhdr);
536 	nexthdr = *prevhdr;
537 
538 	mtu = dst_mtu(&rt->u.dst) - hlen - sizeof(struct frag_hdr);
539 
540 	if (skb_shinfo(skb)->frag_list) {
541 		int first_len = skb_pagelen(skb);
542 
543 		if (first_len - hlen > mtu ||
544 		    ((first_len - hlen) & 7) ||
545 		    skb_cloned(skb))
546 			goto slow_path;
547 
548 		for (frag = skb_shinfo(skb)->frag_list; frag; frag = frag->next) {
549 			/* Correct geometry. */
550 			if (frag->len > mtu ||
551 			    ((frag->len & 7) && frag->next) ||
552 			    skb_headroom(frag) < hlen)
553 			    goto slow_path;
554 
555 			/* Correct socket ownership. */
556 			if (frag->sk == NULL)
557 				goto slow_path;
558 
559 			/* Partially cloned skb? */
560 			if (skb_shared(frag))
561 				goto slow_path;
562 		}
563 
564 		err = 0;
565 		offset = 0;
566 		frag = skb_shinfo(skb)->frag_list;
567 		skb_shinfo(skb)->frag_list = NULL;
568 		/* BUILD HEADER */
569 
570 		tmp_hdr = kmalloc(hlen, GFP_ATOMIC);
571 		if (!tmp_hdr) {
572 			IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
573 			return -ENOMEM;
574 		}
575 
576 		*prevhdr = NEXTHDR_FRAGMENT;
577 		memcpy(tmp_hdr, skb->nh.raw, hlen);
578 		__skb_pull(skb, hlen);
579 		fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr));
580 		skb->nh.raw = __skb_push(skb, hlen);
581 		memcpy(skb->nh.raw, tmp_hdr, hlen);
582 
583 		ipv6_select_ident(skb, fh);
584 		fh->nexthdr = nexthdr;
585 		fh->reserved = 0;
586 		fh->frag_off = htons(IP6_MF);
587 		frag_id = fh->identification;
588 
589 		first_len = skb_pagelen(skb);
590 		skb->data_len = first_len - skb_headlen(skb);
591 		skb->len = first_len;
592 		skb->nh.ipv6h->payload_len = htons(first_len - sizeof(struct ipv6hdr));
593 
594 
595 		for (;;) {
596 			/* Prepare the header of the next fragment
597 			 * before the previous one goes down. */
598 			if (frag) {
599 				frag->ip_summed = CHECKSUM_NONE;
600 				frag->h.raw = frag->data;
601 				fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr));
602 				frag->nh.raw = __skb_push(frag, hlen);
603 				memcpy(frag->nh.raw, tmp_hdr, hlen);
604 				offset += skb->len - hlen - sizeof(struct frag_hdr);
605 				fh->nexthdr = nexthdr;
606 				fh->reserved = 0;
607 				fh->frag_off = htons(offset);
608 				if (frag->next != NULL)
609 					fh->frag_off |= htons(IP6_MF);
610 				fh->identification = frag_id;
611 				frag->nh.ipv6h->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
612 				ip6_copy_metadata(frag, skb);
613 			}
614 
615 			err = output(skb);
616 			if (err || !frag)
617 				break;
618 
619 			skb = frag;
620 			frag = skb->next;
621 			skb->next = NULL;
622 		}
623 
624 		if (tmp_hdr)
625 			kfree(tmp_hdr);
626 
627 		if (err == 0) {
628 			IP6_INC_STATS(IPSTATS_MIB_FRAGOKS);
629 			return 0;
630 		}
631 
632 		while (frag) {
633 			skb = frag->next;
634 			kfree_skb(frag);
635 			frag = skb;
636 		}
637 
638 		IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
639 		return err;
640 	}
641 
642 slow_path:
643 	left = skb->len - hlen;		/* Space per frame */
644 	ptr = hlen;			/* Where to start from */
645 
646 	/*
647 	 *	Fragment the datagram.
648 	 */
649 
650 	*prevhdr = NEXTHDR_FRAGMENT;
651 
652 	/*
653 	 *	Keep copying data until we run out.
654 	 */
655 	while(left > 0)	{
656 		len = left;
657 		/* IF: it doesn't fit, use 'mtu' - the data space left */
658 		if (len > mtu)
659 			len = mtu;
660 		/* IF: we are not sending up to and including the packet end,
661 		   then align the next start on an eight byte boundary */
662 		if (len < left)	{
663 			len &= ~7;
664 		}
665 		/*
666 		 *	Allocate buffer.
667 		 */
668 
669 		if ((frag = alloc_skb(len+hlen+sizeof(struct frag_hdr)+LL_RESERVED_SPACE(rt->u.dst.dev), GFP_ATOMIC)) == NULL) {
670 			NETDEBUG(printk(KERN_INFO "IPv6: frag: no memory for new fragment!\n"));
671 			IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
672 			err = -ENOMEM;
673 			goto fail;
674 		}
675 
676 		/*
677 		 *	Set up data on packet
678 		 */
679 
680 		ip6_copy_metadata(frag, skb);
681 		skb_reserve(frag, LL_RESERVED_SPACE(rt->u.dst.dev));
682 		skb_put(frag, len + hlen + sizeof(struct frag_hdr));
683 		frag->nh.raw = frag->data;
684 		fh = (struct frag_hdr*)(frag->data + hlen);
685 		frag->h.raw = frag->data + hlen + sizeof(struct frag_hdr);
686 
687 		/*
688 		 *	Charge the memory for the fragment to any owner
689 		 *	it might possess
690 		 */
691 		if (skb->sk)
692 			skb_set_owner_w(frag, skb->sk);
693 
694 		/*
695 		 *	Copy the packet header into the new buffer.
696 		 */
697 		memcpy(frag->nh.raw, skb->data, hlen);
698 
699 		/*
700 		 *	Build fragment header.
701 		 */
702 		fh->nexthdr = nexthdr;
703 		fh->reserved = 0;
704 		if (!frag_id) {
705 			ipv6_select_ident(skb, fh);
706 			frag_id = fh->identification;
707 		} else
708 			fh->identification = frag_id;
709 
710 		/*
711 		 *	Copy a block of the IP datagram.
712 		 */
713 		if (skb_copy_bits(skb, ptr, frag->h.raw, len))
714 			BUG();
715 		left -= len;
716 
717 		fh->frag_off = htons(offset);
718 		if (left > 0)
719 			fh->frag_off |= htons(IP6_MF);
720 		frag->nh.ipv6h->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
721 
722 		ptr += len;
723 		offset += len;
724 
725 		/*
726 		 *	Put this fragment into the sending queue.
727 		 */
728 
729 		IP6_INC_STATS(IPSTATS_MIB_FRAGCREATES);
730 
731 		err = output(frag);
732 		if (err)
733 			goto fail;
734 	}
735 	kfree_skb(skb);
736 	IP6_INC_STATS(IPSTATS_MIB_FRAGOKS);
737 	return err;
738 
739 fail:
740 	kfree_skb(skb);
741 	IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
742 	return err;
743 }
744 
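/*
 * Resolve the route for a flow: revalidate any dst cached on the
 * socket (destination and oif must still match), fall back to
 * ip6_route_output(), and choose a source address when fl6_src is
 * unspecified.
 */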
745 int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
746 {
747 	int err = 0;
748 
749 	*dst = NULL;
750 	if (sk) {
751 		struct ipv6_pinfo *np = inet6_sk(sk);
752 
753 		*dst = sk_dst_check(sk, np->dst_cookie);
754 		if (*dst) {
755 			struct rt6_info *rt = (struct rt6_info*)*dst;
756 
757 				/* Yes, checking route validity in the not-connected
758 				   case is not very simple. Take into account
759 				   that we do not support routing by source, TOS,
760 				   or MSG_DONTROUTE		--ANK (980726)
761 
762 				   1. If the route was a host route, check that the
763 				      cached destination is current.
764 				      If it is a network route, we can still
765 				      check its validity using a saved pointer
766 				      to the last used address: daddr_cache.
767 				      We do not want to save the whole address now
768 				      (because the main consumer of this service
769 				       is TCP, which does not have this problem),
770 				      so this trick works only on connected
771 				      sockets.
772 				   2. oif should also be the same.
773 				 */
774 
775 			if (((rt->rt6i_dst.plen != 128 ||
776 			      !ipv6_addr_equal(&fl->fl6_dst, &rt->rt6i_dst.addr))
777 			     && (np->daddr_cache == NULL ||
778 				 !ipv6_addr_equal(&fl->fl6_dst, np->daddr_cache)))
779 			    || (fl->oif && fl->oif != (*dst)->dev->ifindex)) {
780 				dst_release(*dst);
781 				*dst = NULL;
782 			}
783 		}
784 	}
785 
786 	if (*dst == NULL)
787 		*dst = ip6_route_output(sk, fl);
788 
789 	if ((err = (*dst)->error))
790 		goto out_err_release;
791 
792 	if (ipv6_addr_any(&fl->fl6_src)) {
793 		err = ipv6_get_saddr(*dst, &fl->fl6_dst, &fl->fl6_src);
794 
795 		if (err) {
796 #if IP6_DEBUG >= 2
797 			printk(KERN_DEBUG "ip6_dst_lookup: "
798 			       "no available source address\n");
799 #endif
800 			goto out_err_release;
801 		}
802 	}
803 
804 	return 0;
805 
806 out_err_release:
807 	dst_release(*dst);
808 	*dst = NULL;
809 	return err;
810 }
811 
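/*
 * Corked transmission helper for datagram sockets: append user data to
 * sk_write_queue in MTU-sized chunks (leaving room for a fragment
 * header), copying it in with the caller-supplied getfrag() callback.
 * The queued skbs are later assembled and sent by
 * ip6_push_pending_frames().
 */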
812 int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb),
813 		    void *from, int length, int transhdrlen,
814 		    int hlimit, struct ipv6_txoptions *opt, struct flowi *fl, struct rt6_info *rt,
815 		    unsigned int flags)
816 {
817 	struct inet_sock *inet = inet_sk(sk);
818 	struct ipv6_pinfo *np = inet6_sk(sk);
819 	struct sk_buff *skb;
820 	unsigned int maxfraglen, fragheaderlen;
821 	int exthdrlen;
822 	int hh_len;
823 	int mtu;
824 	int copy;
825 	int err;
826 	int offset = 0;
827 	int csummode = CHECKSUM_NONE;
828 
829 	if (flags&MSG_PROBE)
830 		return 0;
831 	if (skb_queue_empty(&sk->sk_write_queue)) {
832 		/*
833 		 * setup for corking
834 		 */
835 		if (opt) {
836 			if (np->cork.opt == NULL) {
837 				np->cork.opt = kmalloc(opt->tot_len,
838 						       sk->sk_allocation);
839 				if (unlikely(np->cork.opt == NULL))
840 					return -ENOBUFS;
841 			} else if (np->cork.opt->tot_len < opt->tot_len) {
842 				printk(KERN_DEBUG "ip6_append_data: invalid option length\n");
843 				return -EINVAL;
844 			}
845 			memcpy(np->cork.opt, opt, opt->tot_len);
846 			inet->cork.flags |= IPCORK_OPT;
847 			/* need source address above miyazawa*/
848 		}
849 		dst_hold(&rt->u.dst);
850 		np->cork.rt = rt;
851 		inet->cork.fl = *fl;
852 		np->cork.hop_limit = hlimit;
853 		inet->cork.fragsize = mtu = dst_mtu(rt->u.dst.path);
854 		if (dst_allfrag(rt->u.dst.path))
855 			inet->cork.flags |= IPCORK_ALLFRAG;
856 		inet->cork.length = 0;
857 		sk->sk_sndmsg_page = NULL;
858 		sk->sk_sndmsg_off = 0;
859 		exthdrlen = rt->u.dst.header_len + (opt ? opt->opt_flen : 0);
860 		length += exthdrlen;
861 		transhdrlen += exthdrlen;
862 	} else {
863 		rt = np->cork.rt;
864 		fl = &inet->cork.fl;
865 		if (inet->cork.flags & IPCORK_OPT)
866 			opt = np->cork.opt;
867 		transhdrlen = 0;
868 		exthdrlen = 0;
869 		mtu = inet->cork.fragsize;
870 	}
871 
872 	hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
873 
874 	fragheaderlen = sizeof(struct ipv6hdr) + (opt ? opt->opt_nflen : 0);
875 	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);
876 
877 	if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
878 		if (inet->cork.length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
879 			ipv6_local_error(sk, EMSGSIZE, fl, mtu-exthdrlen);
880 			return -EMSGSIZE;
881 		}
882 	}
883 
884 	/*
885 	 * Let's try using as much space as possible.
886 	 * Use the MTU if the total length of the message fits into the MTU.
887 	 * Otherwise, we need to reserve space for the fragment header and
888 	 * fragment alignment (= 8-15 octets, in total).
889 	 *
890 	 * Note that we may need to "move" the data from the tail
891 	 * of the buffer to the new fragment when we split
892 	 * the message.
893 	 *
894 	 * FIXME: It may be fragmented into multiple chunks
895 	 *        at once if non-fragmentable extension headers
896 	 *        are too large.
897 	 * --yoshfuji
898 	 */
899 
900 	inet->cork.length += length;
901 
902 	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
903 		goto alloc_new_skb;
904 
905 	while (length > 0) {
906 		/* Check if the remaining data fits into current packet. */
907 		copy = (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
908 		if (copy < length)
909 			copy = maxfraglen - skb->len;
910 
911 		if (copy <= 0) {
912 			char *data;
913 			unsigned int datalen;
914 			unsigned int fraglen;
915 			unsigned int fraggap;
916 			unsigned int alloclen;
917 			struct sk_buff *skb_prev;
918 alloc_new_skb:
919 			skb_prev = skb;
920 
921 			/* There's no room in the current skb */
922 			if (skb_prev)
923 				fraggap = skb_prev->len - maxfraglen;
924 			else
925 				fraggap = 0;
926 
927 			/*
928 			 * If remaining data exceeds the mtu,
929 			 * we know we need more fragment(s).
930 			 */
931 			datalen = length + fraggap;
932 			if (datalen > (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
933 				datalen = maxfraglen - fragheaderlen;
934 
935 			fraglen = datalen + fragheaderlen;
936 			if ((flags & MSG_MORE) &&
937 			    !(rt->u.dst.dev->features&NETIF_F_SG))
938 				alloclen = mtu;
939 			else
940 				alloclen = datalen + fragheaderlen;
941 
942 			/*
943 			 * The last fragment gets additional space at tail.
944 	 * Note: we overallocate on fragments with MSG_MORE
945 			 * because we have no idea if we're the last one.
946 			 */
947 			if (datalen == length + fraggap)
948 				alloclen += rt->u.dst.trailer_len;
949 
950 			/*
951 			 * We just reserve space for fragment header.
952 			 * Note: this may be overallocation if the message
953 			 * (without MSG_MORE) fits into the MTU.
954 			 */
955 			alloclen += sizeof(struct frag_hdr);
956 
957 			if (transhdrlen) {
958 				skb = sock_alloc_send_skb(sk,
959 						alloclen + hh_len,
960 						(flags & MSG_DONTWAIT), &err);
961 			} else {
962 				skb = NULL;
963 				if (atomic_read(&sk->sk_wmem_alloc) <=
964 				    2 * sk->sk_sndbuf)
965 					skb = sock_wmalloc(sk,
966 							   alloclen + hh_len, 1,
967 							   sk->sk_allocation);
968 				if (unlikely(skb == NULL))
969 					err = -ENOBUFS;
970 			}
971 			if (skb == NULL)
972 				goto error;
973 			/*
974 			 *	Fill in the control structures
975 			 */
976 			skb->ip_summed = csummode;
977 			skb->csum = 0;
978 			/* reserve for fragmentation */
979 			skb_reserve(skb, hh_len+sizeof(struct frag_hdr));
980 
981 			/*
982 			 *	Find where to start putting bytes
983 			 */
984 			data = skb_put(skb, fraglen);
985 			skb->nh.raw = data + exthdrlen;
986 			data += fragheaderlen;
987 			skb->h.raw = data + exthdrlen;
988 
989 			if (fraggap) {
990 				skb->csum = skb_copy_and_csum_bits(
991 					skb_prev, maxfraglen,
992 					data + transhdrlen, fraggap, 0);
993 				skb_prev->csum = csum_sub(skb_prev->csum,
994 							  skb->csum);
995 				data += fraggap;
996 				skb_trim(skb_prev, maxfraglen);
997 			}
998 			copy = datalen - transhdrlen - fraggap;
999 			if (copy < 0) {
1000 				err = -EINVAL;
1001 				kfree_skb(skb);
1002 				goto error;
1003 			} else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
1004 				err = -EFAULT;
1005 				kfree_skb(skb);
1006 				goto error;
1007 			}
1008 
1009 			offset += copy;
1010 			length -= datalen - fraggap;
1011 			transhdrlen = 0;
1012 			exthdrlen = 0;
1013 			csummode = CHECKSUM_NONE;
1014 
1015 			/*
1016 			 * Put the packet on the pending queue
1017 			 */
1018 			__skb_queue_tail(&sk->sk_write_queue, skb);
1019 			continue;
1020 		}
1021 
1022 		if (copy > length)
1023 			copy = length;
1024 
1025 		if (!(rt->u.dst.dev->features&NETIF_F_SG)) {
1026 			unsigned int off;
1027 
1028 			off = skb->len;
1029 			if (getfrag(from, skb_put(skb, copy),
1030 						offset, copy, off, skb) < 0) {
1031 				__skb_trim(skb, off);
1032 				err = -EFAULT;
1033 				goto error;
1034 			}
1035 		} else {
1036 			int i = skb_shinfo(skb)->nr_frags;
1037 			skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
1038 			struct page *page = sk->sk_sndmsg_page;
1039 			int off = sk->sk_sndmsg_off;
1040 			unsigned int left;
1041 
1042 			if (page && (left = PAGE_SIZE - off) > 0) {
1043 				if (copy >= left)
1044 					copy = left;
1045 				if (page != frag->page) {
1046 					if (i == MAX_SKB_FRAGS) {
1047 						err = -EMSGSIZE;
1048 						goto error;
1049 					}
1050 					get_page(page);
1051 					skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
1052 					frag = &skb_shinfo(skb)->frags[i];
1053 				}
1054 			} else if(i < MAX_SKB_FRAGS) {
1055 				if (copy > PAGE_SIZE)
1056 					copy = PAGE_SIZE;
1057 				page = alloc_pages(sk->sk_allocation, 0);
1058 				if (page == NULL) {
1059 					err = -ENOMEM;
1060 					goto error;
1061 				}
1062 				sk->sk_sndmsg_page = page;
1063 				sk->sk_sndmsg_off = 0;
1064 
1065 				skb_fill_page_desc(skb, i, page, 0, 0);
1066 				frag = &skb_shinfo(skb)->frags[i];
1067 				skb->truesize += PAGE_SIZE;
1068 				atomic_add(PAGE_SIZE, &sk->sk_wmem_alloc);
1069 			} else {
1070 				err = -EMSGSIZE;
1071 				goto error;
1072 			}
1073 			if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
1074 				err = -EFAULT;
1075 				goto error;
1076 			}
1077 			sk->sk_sndmsg_off += copy;
1078 			frag->size += copy;
1079 			skb->len += copy;
1080 			skb->data_len += copy;
1081 		}
1082 		offset += copy;
1083 		length -= copy;
1084 	}
1085 	return 0;
1086 error:
1087 	inet->cork.length -= length;
1088 	IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
1089 	return err;
1090 }
1091 
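/*
 * Merge the skbs queued by ip6_append_data() into a single packet
 * (chained on frag_list), push any extension headers and the IPv6
 * header, and send it to dst_output() via NF_IP6_LOCAL_OUT.
 */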
1092 int ip6_push_pending_frames(struct sock *sk)
1093 {
1094 	struct sk_buff *skb, *tmp_skb;
1095 	struct sk_buff **tail_skb;
1096 	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1097 	struct inet_sock *inet = inet_sk(sk);
1098 	struct ipv6_pinfo *np = inet6_sk(sk);
1099 	struct ipv6hdr *hdr;
1100 	struct ipv6_txoptions *opt = np->cork.opt;
1101 	struct rt6_info *rt = np->cork.rt;
1102 	struct flowi *fl = &inet->cork.fl;
1103 	unsigned char proto = fl->proto;
1104 	int err = 0;
1105 
1106 	if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
1107 		goto out;
1108 	tail_skb = &(skb_shinfo(skb)->frag_list);
1109 
1110 	/* move skb->data to ip header from ext header */
1111 	if (skb->data < skb->nh.raw)
1112 		__skb_pull(skb, skb->nh.raw - skb->data);
1113 	while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
1114 		__skb_pull(tmp_skb, skb->h.raw - skb->nh.raw);
1115 		*tail_skb = tmp_skb;
1116 		tail_skb = &(tmp_skb->next);
1117 		skb->len += tmp_skb->len;
1118 		skb->data_len += tmp_skb->len;
1119 #if 0 /* Logically correct, but useless work, ip_fragment() will have to undo */
1120 		skb->truesize += tmp_skb->truesize;
1121 		__sock_put(tmp_skb->sk);
1122 		tmp_skb->destructor = NULL;
1123 		tmp_skb->sk = NULL;
1124 #endif
1125 	}
1126 
1127 	ipv6_addr_copy(final_dst, &fl->fl6_dst);
1128 	__skb_pull(skb, skb->h.raw - skb->nh.raw);
1129 	if (opt && opt->opt_flen)
1130 		ipv6_push_frag_opts(skb, opt, &proto);
1131 	if (opt && opt->opt_nflen)
1132 		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);
1133 
1134 	skb->nh.ipv6h = hdr = (struct ipv6hdr*) skb_push(skb, sizeof(struct ipv6hdr));
1135 
1136 	*(u32*)hdr = fl->fl6_flowlabel | htonl(0x60000000);
1137 
1138 	if (skb->len <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN)
1139 		hdr->payload_len = htons(skb->len - sizeof(struct ipv6hdr));
1140 	else
1141 		hdr->payload_len = 0;
1142 	hdr->hop_limit = np->cork.hop_limit;
1143 	hdr->nexthdr = proto;
1144 	ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
1145 	ipv6_addr_copy(&hdr->daddr, final_dst);
1146 
1147 	skb->dst = dst_clone(&rt->u.dst);
1148 	IP6_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
1149 	err = NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, skb->dst->dev, dst_output);
1150 	if (err) {
1151 		if (err > 0)
1152 			err = inet->recverr ? net_xmit_errno(err) : 0;
1153 		if (err)
1154 			goto error;
1155 	}
1156 
1157 out:
1158 	inet->cork.flags &= ~IPCORK_OPT;
1159 	if (np->cork.opt) {
1160 		kfree(np->cork.opt);
1161 		np->cork.opt = NULL;
1162 	}
1163 	if (np->cork.rt) {
1164 		dst_release(&np->cork.rt->u.dst);
1165 		np->cork.rt = NULL;
1166 		inet->cork.flags &= ~IPCORK_ALLFRAG;
1167 	}
1168 	memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
1169 	return err;
1170 error:
1171 	goto out;
1172 }
1173 
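/*
 * Discard everything queued by ip6_append_data() and reset the cork
 * state without transmitting.
 */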
1174 void ip6_flush_pending_frames(struct sock *sk)
1175 {
1176 	struct inet_sock *inet = inet_sk(sk);
1177 	struct ipv6_pinfo *np = inet6_sk(sk);
1178 	struct sk_buff *skb;
1179 
1180 	while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
1181 		IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
1182 		kfree_skb(skb);
1183 	}
1184 
1185 	inet->cork.flags &= ~IPCORK_OPT;
1186 
1187 	if (np->cork.opt) {
1188 		kfree(np->cork.opt);
1189 		np->cork.opt = NULL;
1190 	}
1191 	if (np->cork.rt) {
1192 		dst_release(&np->cork.rt->u.dst);
1193 		np->cork.rt = NULL;
1194 		inet->cork.flags &= ~IPCORK_ALLFRAG;
1195 	}
1196 	memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
1197 }
1198