xref: /openbmc/linux/net/ipv6/ip6_output.c (revision d5cb9783536a41df9f9cba5b0a1d78047ed787f7)
1 /*
2  *	IPv6 output functions
3  *	Linux INET6 implementation
4  *
5  *	Authors:
6  *	Pedro Roque		<roque@di.fc.ul.pt>
7  *
8  *	$Id: ip6_output.c,v 1.34 2002/02/01 22:01:04 davem Exp $
9  *
10  *	Based on linux/net/ipv4/ip_output.c
11  *
12  *	This program is free software; you can redistribute it and/or
13  *      modify it under the terms of the GNU General Public License
14  *      as published by the Free Software Foundation; either version
15  *      2 of the License, or (at your option) any later version.
16  *
17  *	Changes:
18  *	A.N.Kuznetsov	:	arithmetic in fragmentation.
19  *				extension headers are implemented.
20  *				route changes now work.
21  *				ip6_forward does not confuse sniffers.
22  *				etc.
23  *
24  *      H. von Brand    :       Added missing #include <linux/string.h>
25  *	Imran Patel	: 	frag id should be in NBO
26  *      Kazunori MIYAZAWA @USAGI
27  *			:       add ip6_append_data and related functions
28  *				for datagram xmit
29  */
30 
31 #include <linux/config.h>
32 #include <linux/errno.h>
33 #include <linux/types.h>
34 #include <linux/string.h>
35 #include <linux/socket.h>
36 #include <linux/net.h>
37 #include <linux/netdevice.h>
38 #include <linux/if_arp.h>
39 #include <linux/in6.h>
40 #include <linux/tcp.h>
41 #include <linux/route.h>
42 
43 #include <linux/netfilter.h>
44 #include <linux/netfilter_ipv6.h>
45 
46 #include <net/sock.h>
47 #include <net/snmp.h>
48 
49 #include <net/ipv6.h>
50 #include <net/ndisc.h>
51 #include <net/protocol.h>
52 #include <net/ip6_route.h>
53 #include <net/addrconf.h>
54 #include <net/rawv6.h>
55 #include <net/icmp.h>
56 #include <net/xfrm.h>
57 #include <net/checksum.h>
58 
59 static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));
60 
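/*
 * Pick the next value from a single global fragmentation-ID counter,
 * serialized by a spinlock, and store it (in network byte order) in the
 * Fragment header.  Zero is skipped, so a stored ID is never 0.
 */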
61 static __inline__ void ipv6_select_ident(struct sk_buff *skb, struct frag_hdr *fhdr)
62 {
63 	static u32 ipv6_fragmentation_id = 1;
64 	static DEFINE_SPINLOCK(ip6_id_lock);
65 
66 	spin_lock_bh(&ip6_id_lock);
67 	fhdr->identification = htonl(ipv6_fragmentation_id);
68 	if (++ipv6_fragmentation_id == 0)
69 		ipv6_fragmentation_id = 1;
70 	spin_unlock_bh(&ip6_id_lock);
71 }
72 
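/*
 * Final step of transmission: if the destination has a cached hardware
 * header, copy it in front of the packet and hand the skb to the cached
 * output routine; otherwise fall back to the neighbour's output function.
 */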
73 static inline int ip6_output_finish(struct sk_buff *skb)
74 {
75 
76 	struct dst_entry *dst = skb->dst;
77 	struct hh_cache *hh = dst->hh;
78 
79 	if (hh) {
80 		int hh_alen;
81 
82 		read_lock_bh(&hh->hh_lock);
83 		hh_alen = HH_DATA_ALIGN(hh->hh_len);
84 		memcpy(skb->data - hh_alen, hh->hh_data, hh_alen);
85 		read_unlock_bh(&hh->hh_lock);
86 		skb_push(skb, hh->hh_len);
87 		return hh->hh_output(skb);
88 	} else if (dst->neighbour)
89 		return dst->neighbour->output(skb);
90 
91 	IP6_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
92 	kfree_skb(skb);
93 	return -EINVAL;
94 
95 }
96 
97 /* dev_loopback_xmit for use with netfilter. */
98 static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
99 {
100 	newskb->mac.raw = newskb->data;
101 	__skb_pull(newskb, newskb->nh.raw - newskb->data);
102 	newskb->pkt_type = PACKET_LOOPBACK;
103 	newskb->ip_summed = CHECKSUM_UNNECESSARY;
104 	BUG_TRAP(newskb->dst);
105 
106 	netif_rx(newskb);
107 	return 0;
108 }
109 
110 
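/*
 * Second half of ip6_output(): for multicast destinations, loop a copy
 * of the packet back locally when a local listener exists, then pass
 * the packet to the POST_ROUTING netfilter hook.
 */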
111 static int ip6_output2(struct sk_buff *skb)
112 {
113 	struct dst_entry *dst = skb->dst;
114 	struct net_device *dev = dst->dev;
115 
116 	skb->protocol = htons(ETH_P_IPV6);
117 	skb->dev = dev;
118 
119 	if (ipv6_addr_is_multicast(&skb->nh.ipv6h->daddr)) {
120 		struct ipv6_pinfo* np = skb->sk ? inet6_sk(skb->sk) : NULL;
121 
122 		if (!(dev->flags & IFF_LOOPBACK) && (!np || np->mc_loop) &&
123 		    ipv6_chk_mcast_addr(dev, &skb->nh.ipv6h->daddr,
124 				&skb->nh.ipv6h->saddr)) {
125 			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
126 
127 			/* Do not check for IFF_ALLMULTI; multicast routing
128 			   is not supported in any case.
129 			 */
130 			if (newskb)
131 				NF_HOOK(PF_INET6, NF_IP6_POST_ROUTING, newskb, NULL,
132 					newskb->dev,
133 					ip6_dev_loopback_xmit);
134 
135 			if (skb->nh.ipv6h->hop_limit == 0) {
136 				IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
137 				kfree_skb(skb);
138 				return 0;
139 			}
140 		}
141 
142 		IP6_INC_STATS(IPSTATS_MIB_OUTMCASTPKTS);
143 	}
144 
145 	return NF_HOOK(PF_INET6, NF_IP6_POST_ROUTING, skb, NULL, skb->dev, ip6_output_finish);
146 }
147 
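/*
 * Fragment the packet if it exceeds the path MTU (and is not a UFO
 * packet that the device will segment itself), or if the destination
 * requires a Fragment header on every packet (dst_allfrag); otherwise
 * send it out directly via ip6_output2().
 */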
148 int ip6_output(struct sk_buff *skb)
149 {
150 	if ((skb->len > dst_mtu(skb->dst) && !skb_shinfo(skb)->ufo_size) ||
151 				dst_allfrag(skb->dst))
152 		return ip6_fragment(skb, ip6_output2);
153 	else
154 		return ip6_output2(skb);
155 }
156 
157 /*
158  *	xmit an sk_buff (used by TCP)
159  */
160 
161 int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
162 	     struct ipv6_txoptions *opt, int ipfragok)
163 {
164 	struct ipv6_pinfo *np = sk ? inet6_sk(sk) : NULL;
165 	struct in6_addr *first_hop = &fl->fl6_dst;
166 	struct dst_entry *dst = skb->dst;
167 	struct ipv6hdr *hdr;
168 	u8  proto = fl->proto;
169 	int seg_len = skb->len;
170 	int hlimit, tclass;
171 	u32 mtu;
172 
173 	if (opt) {
174 		int head_room;
175 
176 		/* First: exthdrs may take lots of space (~8K for now);
177 		   MAX_HEADER is not enough.
178 		 */
179 		head_room = opt->opt_nflen + opt->opt_flen;
180 		seg_len += head_room;
181 		head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
182 
183 		if (skb_headroom(skb) < head_room) {
184 			struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
185 			kfree_skb(skb);
186 			skb = skb2;
187 			if (skb == NULL) {
188 				IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
189 				return -ENOBUFS;
190 			}
191 			if (sk)
192 				skb_set_owner_w(skb, sk);
193 		}
194 		if (opt->opt_flen)
195 			ipv6_push_frag_opts(skb, opt, &proto);
196 		if (opt->opt_nflen)
197 			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
198 	}
199 
200 	hdr = skb->nh.ipv6h = (struct ipv6hdr*)skb_push(skb, sizeof(struct ipv6hdr));
201 
202 	/*
203 	 *	Fill in the IPv6 header
204 	 */
205 
206 	hlimit = -1;
207 	if (np)
208 		hlimit = np->hop_limit;
209 	if (hlimit < 0)
210 		hlimit = dst_metric(dst, RTAX_HOPLIMIT);
211 	if (hlimit < 0)
212 		hlimit = ipv6_get_hoplimit(dst->dev);
213 
214 	tclass = -1;
215 	if (np)
216 		tclass = np->tclass;
217 	if (tclass < 0)
218 		tclass = 0;
219 
220 	*(u32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl->fl6_flowlabel;
221 
222 	hdr->payload_len = htons(seg_len);
223 	hdr->nexthdr = proto;
224 	hdr->hop_limit = hlimit;
225 
226 	ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
227 	ipv6_addr_copy(&hdr->daddr, first_hop);
228 
229 	mtu = dst_mtu(dst);
230 	if ((skb->len <= mtu) || ipfragok) {
231 		IP6_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
232 		return NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, dst->dev,
233 				dst_output);
234 	}
235 
236 	if (net_ratelimit())
237 		printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
238 	skb->dev = dst->dev;
239 	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
240 	IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
241 	kfree_skb(skb);
242 	return -EMSGSIZE;
243 }
244 
245 /*
246  *	To avoid extra problems ND packets are sent through this
247  *	routine. It's code duplication, but I really want to avoid
248  *	extra checks since ipv6_build_header is used by TCP (which
249  *	is performance critical for us)
250  */
251 
252 int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
253 	       struct in6_addr *saddr, struct in6_addr *daddr,
254 	       int proto, int len)
255 {
256 	struct ipv6_pinfo *np = inet6_sk(sk);
257 	struct ipv6hdr *hdr;
258 	int totlen;
259 
260 	skb->protocol = htons(ETH_P_IPV6);
261 	skb->dev = dev;
262 
263 	totlen = len + sizeof(struct ipv6hdr);
264 
265 	hdr = (struct ipv6hdr *) skb_put(skb, sizeof(struct ipv6hdr));
266 	skb->nh.ipv6h = hdr;
267 
268 	*(u32*)hdr = htonl(0x60000000);
269 
270 	hdr->payload_len = htons(len);
271 	hdr->nexthdr = proto;
272 	hdr->hop_limit = np->hop_limit;
273 
274 	ipv6_addr_copy(&hdr->saddr, saddr);
275 	ipv6_addr_copy(&hdr->daddr, daddr);
276 
277 	return 0;
278 }
279 
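/*
 * Deliver a packet carrying a Router Alert option to every raw socket
 * that registered interest in this alert value (via the
 * IPV6_ROUTER_ALERT socket option).  Returns 1 if the packet was
 * consumed by at least one socket.
 */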
280 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
281 {
282 	struct ip6_ra_chain *ra;
283 	struct sock *last = NULL;
284 
285 	read_lock(&ip6_ra_lock);
286 	for (ra = ip6_ra_chain; ra; ra = ra->next) {
287 		struct sock *sk = ra->sk;
288 		if (sk && ra->sel == sel &&
289 		    (!sk->sk_bound_dev_if ||
290 		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
291 			if (last) {
292 				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
293 				if (skb2)
294 					rawv6_rcv(last, skb2);
295 			}
296 			last = sk;
297 		}
298 	}
299 
300 	if (last) {
301 		rawv6_rcv(last, skb);
302 		read_unlock(&ip6_ra_lock);
303 		return 1;
304 	}
305 	read_unlock(&ip6_ra_lock);
306 	return 0;
307 }
308 
309 static inline int ip6_forward_finish(struct sk_buff *skb)
310 {
311 	return dst_output(skb);
312 }
313 
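/*
 * Forward a packet that is not addressed to this host: run the usual
 * sanity checks (forwarding enabled, XFRM policy, hop limit, MTU),
 * possibly emit an ICMPv6 redirect, decrement the hop limit and hand
 * the packet to the netfilter FORWARD hook.
 */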
314 int ip6_forward(struct sk_buff *skb)
315 {
316 	struct dst_entry *dst = skb->dst;
317 	struct ipv6hdr *hdr = skb->nh.ipv6h;
318 	struct inet6_skb_parm *opt = IP6CB(skb);
319 
320 	if (ipv6_devconf.forwarding == 0)
321 		goto error;
322 
323 	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
324 		IP6_INC_STATS(IPSTATS_MIB_INDISCARDS);
325 		goto drop;
326 	}
327 
328 	skb->ip_summed = CHECKSUM_NONE;
329 
330 	/*
331 	 *	We do NOT do any processing on
332 	 *	RA packets; we push them to user level AS IS,
333 	 *	without any warranty that the application will be able
334 	 *	to interpret them. The reason is that we
335 	 *	cannot do anything clever here.
336 	 *
337 	 *	We are not the end node, so if the packet contains
338 	 *	AH/ESP, we cannot do anything.
339 	 *	Defragmentation would also be a mistake: RA packets
340 	 *	cannot be fragmented, because there is no guarantee
341 	 *	that different fragments will go along one path. --ANK
342 	 */
343 	if (opt->ra) {
344 		u8 *ptr = skb->nh.raw + opt->ra;
345 		if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
346 			return 0;
347 	}
348 
349 	/*
350 	 *	check and decrement ttl
351 	 */
352 	if (hdr->hop_limit <= 1) {
353 		/* Force the OUTPUT device to be used for source address selection */
354 		skb->dev = dst->dev;
355 		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT,
356 			    0, skb->dev);
357 
358 		kfree_skb(skb);
359 		return -ETIMEDOUT;
360 	}
361 
362 	if (!xfrm6_route_forward(skb)) {
363 		IP6_INC_STATS(IPSTATS_MIB_INDISCARDS);
364 		goto drop;
365 	}
366 	dst = skb->dst;
367 
368 	/* IPv6 specs say nothing about it, but it is clear that we cannot
369 	   send redirects to source routed frames.
370 	 */
371 	if (skb->dev == dst->dev && dst->neighbour && opt->srcrt == 0) {
372 		struct in6_addr *target = NULL;
373 		struct rt6_info *rt;
374 		struct neighbour *n = dst->neighbour;
375 
376 		/*
377 		 *	incoming and outgoing devices are the same
378 		 *	send a redirect.
379 		 */
380 
381 		rt = (struct rt6_info *) dst;
382 		if ((rt->rt6i_flags & RTF_GATEWAY))
383 			target = (struct in6_addr*)&n->primary_key;
384 		else
385 			target = &hdr->daddr;
386 
387 		/* Limit redirects both by destination (here)
388 		   and by source (inside ndisc_send_redirect)
389 		 */
390 		if (xrlim_allow(dst, 1*HZ))
391 			ndisc_send_redirect(skb, n, target);
392 	} else if (ipv6_addr_type(&hdr->saddr)&(IPV6_ADDR_MULTICAST|IPV6_ADDR_LOOPBACK
393 						|IPV6_ADDR_LINKLOCAL)) {
394 		/* This check is security critical. */
395 		goto error;
396 	}
397 
398 	if (skb->len > dst_mtu(dst)) {
399 		/* Again, force the OUTPUT device to be used for source address selection */
400 		skb->dev = dst->dev;
401 		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, dst_mtu(dst), skb->dev);
402 		IP6_INC_STATS_BH(IPSTATS_MIB_INTOOBIGERRORS);
403 		IP6_INC_STATS_BH(IPSTATS_MIB_FRAGFAILS);
404 		kfree_skb(skb);
405 		return -EMSGSIZE;
406 	}
407 
408 	if (skb_cow(skb, dst->dev->hard_header_len)) {
409 		IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
410 		goto drop;
411 	}
412 
413 	hdr = skb->nh.ipv6h;
414 
415 	/* Decrementing the hop limit is delayed until after the skb COW */
416 
417 	hdr->hop_limit--;
418 
419 	IP6_INC_STATS_BH(IPSTATS_MIB_OUTFORWDATAGRAMS);
420 	return NF_HOOK(PF_INET6, NF_IP6_FORWARD, skb, skb->dev, dst->dev, ip6_forward_finish);
421 
422 error:
423 	IP6_INC_STATS_BH(IPSTATS_MIB_INADDRERRORS);
424 drop:
425 	kfree_skb(skb);
426 	return -EINVAL;
427 }
428 
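/*
 * Copy per-packet metadata (priority, device, dst, netfilter state, ...)
 * from the original packet to a newly built fragment.
 */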
429 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
430 {
431 	to->pkt_type = from->pkt_type;
432 	to->priority = from->priority;
433 	to->protocol = from->protocol;
434 	dst_release(to->dst);
435 	to->dst = dst_clone(from->dst);
436 	to->dev = from->dev;
437 
438 #ifdef CONFIG_NET_SCHED
439 	to->tc_index = from->tc_index;
440 #endif
441 #ifdef CONFIG_NETFILTER
442 	to->nfmark = from->nfmark;
443 	/* Connection association is the same as for the pre-fragmented packet */
444 	to->nfct = from->nfct;
445 	nf_conntrack_get(to->nfct);
446 	to->nfctinfo = from->nfctinfo;
447 #ifdef CONFIG_BRIDGE_NETFILTER
448 	nf_bridge_put(to->nf_bridge);
449 	to->nf_bridge = from->nf_bridge;
450 	nf_bridge_get(to->nf_bridge);
451 #endif
452 #endif
453 }
454 
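/*
 * Walk the extension headers that must precede the Fragment header
 * (Hop-by-Hop, Routing, and Destination Options when a Routing header
 * has been seen) and return the offset at which the Fragment header
 * should be inserted.  *nexthdr is left pointing at the Next Header
 * byte that has to be overwritten with NEXTHDR_FRAGMENT.
 */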
455 int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
456 {
457 	u16 offset = sizeof(struct ipv6hdr);
458 	struct ipv6_opt_hdr *exthdr = (struct ipv6_opt_hdr*)(skb->nh.ipv6h + 1);
459 	unsigned int packet_len = skb->tail - skb->nh.raw;
460 	int found_rhdr = 0;
461 	*nexthdr = &skb->nh.ipv6h->nexthdr;
462 
463 	while (offset + 1 <= packet_len) {
464 
465 		switch (**nexthdr) {
466 
467 		case NEXTHDR_HOP:
468 		case NEXTHDR_ROUTING:
469 		case NEXTHDR_DEST:
470 			if (**nexthdr == NEXTHDR_ROUTING) found_rhdr = 1;
471 			if (**nexthdr == NEXTHDR_DEST && found_rhdr) return offset;
472 			offset += ipv6_optlen(exthdr);
473 			*nexthdr = &exthdr->nexthdr;
474 			exthdr = (struct ipv6_opt_hdr*)(skb->nh.raw + offset);
475 			break;
476 		default:
477 			return offset;
478 		}
479 	}
480 
481 	return offset;
482 }
483 
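/*
 * Fragment an IPv6 packet locally.  The fast path reuses an existing
 * frag_list (every element already has the right geometry) and only
 * prepends a copy of the unfragmentable headers plus a Fragment header
 * to each piece; the slow path allocates fresh skbs and copies the
 * payload out in MTU-sized, 8-byte-aligned chunks.
 */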
484 static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
485 {
486 	struct net_device *dev;
487 	struct sk_buff *frag;
488 	struct rt6_info *rt = (struct rt6_info*)skb->dst;
489 	struct ipv6hdr *tmp_hdr;
490 	struct frag_hdr *fh;
491 	unsigned int mtu, hlen, left, len;
492 	u32 frag_id = 0;
493 	int ptr, offset = 0, err=0;
494 	u8 *prevhdr, nexthdr = 0;
495 
496 	dev = rt->u.dst.dev;
497 	hlen = ip6_find_1stfragopt(skb, &prevhdr);
498 	nexthdr = *prevhdr;
499 
500 	mtu = dst_mtu(&rt->u.dst) - hlen - sizeof(struct frag_hdr);
501 
502 	if (skb_shinfo(skb)->frag_list) {
503 		int first_len = skb_pagelen(skb);
504 
505 		if (first_len - hlen > mtu ||
506 		    ((first_len - hlen) & 7) ||
507 		    skb_cloned(skb))
508 			goto slow_path;
509 
510 		for (frag = skb_shinfo(skb)->frag_list; frag; frag = frag->next) {
511 			/* Correct geometry. */
512 			if (frag->len > mtu ||
513 			    ((frag->len & 7) && frag->next) ||
514 			    skb_headroom(frag) < hlen)
515 			    goto slow_path;
516 
517 			/* Partially cloned skb? */
518 			if (skb_shared(frag))
519 				goto slow_path;
520 
521 			BUG_ON(frag->sk);
522 			if (skb->sk) {
523 				sock_hold(skb->sk);
524 				frag->sk = skb->sk;
525 				frag->destructor = sock_wfree;
526 				skb->truesize -= frag->truesize;
527 			}
528 		}
529 
530 		err = 0;
531 		offset = 0;
532 		frag = skb_shinfo(skb)->frag_list;
533 		skb_shinfo(skb)->frag_list = NULL;
534 		/* BUILD HEADER */
535 
536 		tmp_hdr = kmalloc(hlen, GFP_ATOMIC);
537 		if (!tmp_hdr) {
538 			IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
539 			return -ENOMEM;
540 		}
541 
542 		*prevhdr = NEXTHDR_FRAGMENT;
543 		memcpy(tmp_hdr, skb->nh.raw, hlen);
544 		__skb_pull(skb, hlen);
545 		fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr));
546 		skb->nh.raw = __skb_push(skb, hlen);
547 		memcpy(skb->nh.raw, tmp_hdr, hlen);
548 
549 		ipv6_select_ident(skb, fh);
550 		fh->nexthdr = nexthdr;
551 		fh->reserved = 0;
552 		fh->frag_off = htons(IP6_MF);
553 		frag_id = fh->identification;
554 
555 		first_len = skb_pagelen(skb);
556 		skb->data_len = first_len - skb_headlen(skb);
557 		skb->len = first_len;
558 		skb->nh.ipv6h->payload_len = htons(first_len - sizeof(struct ipv6hdr));
559 
560 
561 		for (;;) {
562 			/* Prepare the header of the next frame
563 			 * before the previous one goes down. */
564 			if (frag) {
565 				frag->ip_summed = CHECKSUM_NONE;
566 				frag->h.raw = frag->data;
567 				fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr));
568 				frag->nh.raw = __skb_push(frag, hlen);
569 				memcpy(frag->nh.raw, tmp_hdr, hlen);
570 				offset += skb->len - hlen - sizeof(struct frag_hdr);
571 				fh->nexthdr = nexthdr;
572 				fh->reserved = 0;
573 				fh->frag_off = htons(offset);
574 				if (frag->next != NULL)
575 					fh->frag_off |= htons(IP6_MF);
576 				fh->identification = frag_id;
577 				frag->nh.ipv6h->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
578 				ip6_copy_metadata(frag, skb);
579 			}
580 
581 			err = output(skb);
582 			if (err || !frag)
583 				break;
584 
585 			skb = frag;
586 			frag = skb->next;
587 			skb->next = NULL;
588 		}
589 
590 		if (tmp_hdr)
591 			kfree(tmp_hdr);
592 
593 		if (err == 0) {
594 			IP6_INC_STATS(IPSTATS_MIB_FRAGOKS);
595 			return 0;
596 		}
597 
598 		while (frag) {
599 			skb = frag->next;
600 			kfree_skb(frag);
601 			frag = skb;
602 		}
603 
604 		IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
605 		return err;
606 	}
607 
608 slow_path:
609 	left = skb->len - hlen;		/* Space per frame */
610 	ptr = hlen;			/* Where to start from */
611 
612 	/*
613 	 *	Fragment the datagram.
614 	 */
615 
616 	*prevhdr = NEXTHDR_FRAGMENT;
617 
618 	/*
619 	 *	Keep copying data until we run out.
620 	 */
621 	while(left > 0)	{
622 		len = left;
623 		/* IF: it doesn't fit, use 'mtu' - the data space left */
624 		if (len > mtu)
625 			len = mtu;
626 		/* IF: we are not sending up to and including the packet end
627 		   then align the next start on an eight byte boundary */
628 		if (len < left)	{
629 			len &= ~7;
630 		}
631 		/*
632 		 *	Allocate buffer.
633 		 */
634 
635 		if ((frag = alloc_skb(len+hlen+sizeof(struct frag_hdr)+LL_RESERVED_SPACE(rt->u.dst.dev), GFP_ATOMIC)) == NULL) {
636 			NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
637 			IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
638 			err = -ENOMEM;
639 			goto fail;
640 		}
641 
642 		/*
643 		 *	Set up data on packet
644 		 */
645 
646 		ip6_copy_metadata(frag, skb);
647 		skb_reserve(frag, LL_RESERVED_SPACE(rt->u.dst.dev));
648 		skb_put(frag, len + hlen + sizeof(struct frag_hdr));
649 		frag->nh.raw = frag->data;
650 		fh = (struct frag_hdr*)(frag->data + hlen);
651 		frag->h.raw = frag->data + hlen + sizeof(struct frag_hdr);
652 
653 		/*
654 		 *	Charge the memory for the fragment to any owner
655 		 *	it might possess
656 		 */
657 		if (skb->sk)
658 			skb_set_owner_w(frag, skb->sk);
659 
660 		/*
661 		 *	Copy the packet header into the new buffer.
662 		 */
663 		memcpy(frag->nh.raw, skb->data, hlen);
664 
665 		/*
666 		 *	Build fragment header.
667 		 */
668 		fh->nexthdr = nexthdr;
669 		fh->reserved = 0;
670 		if (!frag_id) {
671 			ipv6_select_ident(skb, fh);
672 			frag_id = fh->identification;
673 		} else
674 			fh->identification = frag_id;
675 
676 		/*
677 		 *	Copy a block of the IP datagram.
678 		 */
679 		if (skb_copy_bits(skb, ptr, frag->h.raw, len))
680 			BUG();
681 		left -= len;
682 
683 		fh->frag_off = htons(offset);
684 		if (left > 0)
685 			fh->frag_off |= htons(IP6_MF);
686 		frag->nh.ipv6h->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
687 
688 		ptr += len;
689 		offset += len;
690 
691 		/*
692 		 *	Put this fragment into the sending queue.
693 		 */
694 
695 		IP6_INC_STATS(IPSTATS_MIB_FRAGCREATES);
696 
697 		err = output(frag);
698 		if (err)
699 			goto fail;
700 	}
701 	kfree_skb(skb);
702 	IP6_INC_STATS(IPSTATS_MIB_FRAGOKS);
703 	return err;
704 
705 fail:
706 	kfree_skb(skb);
707 	IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
708 	return err;
709 }
710 
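/*
 * Look up a route for the flow described by @fl.  A dst cached on the
 * socket is reused when it is still valid for this destination and
 * outgoing interface; otherwise a fresh routing lookup is done.  If no
 * source address was given, one is selected from the chosen device.
 */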
711 int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
712 {
713 	int err = 0;
714 
715 	*dst = NULL;
716 	if (sk) {
717 		struct ipv6_pinfo *np = inet6_sk(sk);
718 
719 		*dst = sk_dst_check(sk, np->dst_cookie);
720 		if (*dst) {
721 			struct rt6_info *rt = (struct rt6_info*)*dst;
722 
723 				/* Yes, checking route validity in the non-connected
724 				   case is not very simple. Take into account that
725 				   we do not support routing by source, TOS,
726 				   and MSG_DONTROUTE 		--ANK (980726)
727 
728 				   1. If the route was a host route, check that the
729 				      cached destination is current.
730 				      If it is a network route, we may still
731 				      check its validity using a saved pointer
732 				      to the last used address: daddr_cache.
733 				      We do not want to save the whole address now
734 				      (because the main consumer of this service
735 				       is TCP, which does not have this problem),
736 				      so this last trick works only on connected
737 				      sockets.
738 				   2. oif should also be the same.
739 				 */
740 
741 			if (((rt->rt6i_dst.plen != 128 ||
742 			      !ipv6_addr_equal(&fl->fl6_dst, &rt->rt6i_dst.addr))
743 			     && (np->daddr_cache == NULL ||
744 				 !ipv6_addr_equal(&fl->fl6_dst, np->daddr_cache)))
745 			    || (fl->oif && fl->oif != (*dst)->dev->ifindex)) {
746 				dst_release(*dst);
747 				*dst = NULL;
748 			}
749 		}
750 	}
751 
752 	if (*dst == NULL)
753 		*dst = ip6_route_output(sk, fl);
754 
755 	if ((err = (*dst)->error))
756 		goto out_err_release;
757 
758 	if (ipv6_addr_any(&fl->fl6_src)) {
759 		err = ipv6_get_saddr(*dst, &fl->fl6_dst, &fl->fl6_src);
760 
761 		if (err)
762 			goto out_err_release;
763 	}
764 
765 	return 0;
766 
767 out_err_release:
768 	dst_release(*dst);
769 	*dst = NULL;
770 	return err;
771 }
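
/*
 * UFO path for ip6_append_data(): build one large skb holding the whole
 * UDP datagram and record the per-fragment payload size in ufo_size so
 * that the device (or a software fallback) performs the fragmentation.
 */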
772 inline int ip6_ufo_append_data(struct sock *sk,
773 			int getfrag(void *from, char *to, int offset, int len,
774 			int odd, struct sk_buff *skb),
775 			void *from, int length, int hh_len, int fragheaderlen,
776 			int transhdrlen, int mtu, unsigned int flags)
777 
778 {
779 	struct sk_buff *skb;
780 	int err;
781 
782 	/* There is support for UDP large send offload by the network
783 	 * device, so create one single skb containing the complete
784 	 * UDP datagram
785 	 */
786 	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
787 		skb = sock_alloc_send_skb(sk,
788 			hh_len + fragheaderlen + transhdrlen + 20,
789 			(flags & MSG_DONTWAIT), &err);
790 		if (skb == NULL)
791 			return -ENOMEM;
792 
793 		/* reserve space for Hardware header */
794 		skb_reserve(skb, hh_len);
795 
796 		/* create space for UDP/IP header */
797 		skb_put(skb, fragheaderlen + transhdrlen);
798 
799 		/* initialize network header pointer */
800 		skb->nh.raw = skb->data;
801 
802 		/* initialize protocol header pointer */
803 		skb->h.raw = skb->data + fragheaderlen;
804 
805 		skb->ip_summed = CHECKSUM_HW;
806 		skb->csum = 0;
807 		sk->sk_sndmsg_off = 0;
808 	}
809 
810 	err = skb_append_datato_frags(sk, skb, getfrag, from,
811 				      (length - transhdrlen));
812 	if (!err) {
813 		struct frag_hdr fhdr;
814 
815 		/* specify the length of each IP datagram fragment */
816 		skb_shinfo(skb)->ufo_size = (mtu - fragheaderlen) -
817 						sizeof(struct frag_hdr);
818 		ipv6_select_ident(skb, &fhdr);
819 		skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
820 		__skb_queue_tail(&sk->sk_write_queue, skb);
821 
822 		return 0;
823 	}
824 	/* There is not enough support to do UDP LSO,
825 	 * so follow the normal path
826 	 */
827 	kfree_skb(skb);
828 
829 	return err;
830 }
831 
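/*
 * Append data to the pending (corked) queue of the socket.  The first
 * call sets up the cork state (options, route, hop limit, traffic
 * class, MTU); subsequent calls keep filling the last skb and start new
 * fragments as needed.  ip6_push_pending_frames() finally builds the
 * IPv6 header and transmits the queue.
 */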
832 int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
833 	int offset, int len, int odd, struct sk_buff *skb),
834 	void *from, int length, int transhdrlen,
835 	int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi *fl,
836 	struct rt6_info *rt, unsigned int flags)
837 {
838 	struct inet_sock *inet = inet_sk(sk);
839 	struct ipv6_pinfo *np = inet6_sk(sk);
840 	struct sk_buff *skb;
841 	unsigned int maxfraglen, fragheaderlen;
842 	int exthdrlen;
843 	int hh_len;
844 	int mtu;
845 	int copy;
846 	int err;
847 	int offset = 0;
848 	int csummode = CHECKSUM_NONE;
849 
850 	if (flags&MSG_PROBE)
851 		return 0;
852 	if (skb_queue_empty(&sk->sk_write_queue)) {
853 		/*
854 		 * setup for corking
855 		 */
856 		if (opt) {
857 			if (np->cork.opt == NULL) {
858 				np->cork.opt = kmalloc(opt->tot_len,
859 						       sk->sk_allocation);
860 				if (unlikely(np->cork.opt == NULL))
861 					return -ENOBUFS;
862 			} else if (np->cork.opt->tot_len < opt->tot_len) {
863 				printk(KERN_DEBUG "ip6_append_data: invalid option length\n");
864 				return -EINVAL;
865 			}
866 			memcpy(np->cork.opt, opt, opt->tot_len);
867 			inet->cork.flags |= IPCORK_OPT;
868 			/* need source address above --miyazawa */
869 		}
870 		dst_hold(&rt->u.dst);
871 		np->cork.rt = rt;
872 		inet->cork.fl = *fl;
873 		np->cork.hop_limit = hlimit;
874 		np->cork.tclass = tclass;
875 		inet->cork.fragsize = mtu = dst_mtu(rt->u.dst.path);
876 		if (dst_allfrag(rt->u.dst.path))
877 			inet->cork.flags |= IPCORK_ALLFRAG;
878 		inet->cork.length = 0;
879 		sk->sk_sndmsg_page = NULL;
880 		sk->sk_sndmsg_off = 0;
881 		exthdrlen = rt->u.dst.header_len + (opt ? opt->opt_flen : 0);
882 		length += exthdrlen;
883 		transhdrlen += exthdrlen;
884 	} else {
885 		rt = np->cork.rt;
886 		fl = &inet->cork.fl;
887 		if (inet->cork.flags & IPCORK_OPT)
888 			opt = np->cork.opt;
889 		transhdrlen = 0;
890 		exthdrlen = 0;
891 		mtu = inet->cork.fragsize;
892 	}
893 
894 	hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
895 
896 	fragheaderlen = sizeof(struct ipv6hdr) + (opt ? opt->opt_nflen : 0);
897 	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);
898 
899 	if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
900 		if (inet->cork.length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
901 			ipv6_local_error(sk, EMSGSIZE, fl, mtu-exthdrlen);
902 			return -EMSGSIZE;
903 		}
904 	}
905 
906 	/*
907 	 * Let's try using as much space as possible.
908 	 * Use MTU if total length of the message fits into the MTU.
909 	 * Otherwise, we need to reserve fragment header and
910 	 * fragment alignment (= 8-15 octets, in total).
911 	 *
912 	 * Note that we may need to "move" the data from the tail
913 	 * of the buffer to the new fragment when we split
914 	 * the message.
915 	 *
916 	 * FIXME: It may be fragmented into multiple chunks
917 	 *        at once if non-fragmentable extension headers
918 	 *        are too large.
919 	 * --yoshfuji
920 	 */
921 
922 	inet->cork.length += length;
923 	if (((length > mtu) && (sk->sk_protocol == IPPROTO_UDP)) &&
924 	    (rt->u.dst.dev->features & NETIF_F_UFO)) {
925 
926 		if (ip6_ufo_append_data(sk, getfrag, from, length, hh_len,
927 				fragheaderlen, transhdrlen, mtu, flags))
928 			goto error;
929 
930 		return 0;
931 	}
932 
933 	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
934 		goto alloc_new_skb;
935 
936 	while (length > 0) {
937 		/* Check if the remaining data fits into current packet. */
938 		copy = (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
939 		if (copy < length)
940 			copy = maxfraglen - skb->len;
941 
942 		if (copy <= 0) {
943 			char *data;
944 			unsigned int datalen;
945 			unsigned int fraglen;
946 			unsigned int fraggap;
947 			unsigned int alloclen;
948 			struct sk_buff *skb_prev;
949 alloc_new_skb:
950 			skb_prev = skb;
951 
952 			/* There's no room in the current skb */
953 			if (skb_prev)
954 				fraggap = skb_prev->len - maxfraglen;
955 			else
956 				fraggap = 0;
957 
958 			/*
959 			 * If remaining data exceeds the mtu,
960 			 * we know we need more fragment(s).
961 			 */
962 			datalen = length + fraggap;
963 			if (datalen > (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
964 				datalen = maxfraglen - fragheaderlen;
965 
966 			fraglen = datalen + fragheaderlen;
967 			if ((flags & MSG_MORE) &&
968 			    !(rt->u.dst.dev->features&NETIF_F_SG))
969 				alloclen = mtu;
970 			else
971 				alloclen = datalen + fragheaderlen;
972 
973 			/*
974 			 * The last fragment gets additional space at tail.
975 			 * Note: we overallocate on fragments with MSG_MORE
976 			 * because we have no idea if we're the last one.
977 			 */
978 			if (datalen == length + fraggap)
979 				alloclen += rt->u.dst.trailer_len;
980 
981 			/*
982 			 * We just reserve space for fragment header.
983 			 * Note: this may be overallocation if the message
984 			 * (without MSG_MORE) fits into the MTU.
985 			 */
986 			alloclen += sizeof(struct frag_hdr);
987 
988 			if (transhdrlen) {
989 				skb = sock_alloc_send_skb(sk,
990 						alloclen + hh_len,
991 						(flags & MSG_DONTWAIT), &err);
992 			} else {
993 				skb = NULL;
994 				if (atomic_read(&sk->sk_wmem_alloc) <=
995 				    2 * sk->sk_sndbuf)
996 					skb = sock_wmalloc(sk,
997 							   alloclen + hh_len, 1,
998 							   sk->sk_allocation);
999 				if (unlikely(skb == NULL))
1000 					err = -ENOBUFS;
1001 			}
1002 			if (skb == NULL)
1003 				goto error;
1004 			/*
1005 			 *	Fill in the control structures
1006 			 */
1007 			skb->ip_summed = csummode;
1008 			skb->csum = 0;
1009 			/* reserve for fragmentation */
1010 			skb_reserve(skb, hh_len+sizeof(struct frag_hdr));
1011 
1012 			/*
1013 			 *	Find where to start putting bytes
1014 			 */
1015 			data = skb_put(skb, fraglen);
1016 			skb->nh.raw = data + exthdrlen;
1017 			data += fragheaderlen;
1018 			skb->h.raw = data + exthdrlen;
1019 
1020 			if (fraggap) {
1021 				skb->csum = skb_copy_and_csum_bits(
1022 					skb_prev, maxfraglen,
1023 					data + transhdrlen, fraggap, 0);
1024 				skb_prev->csum = csum_sub(skb_prev->csum,
1025 							  skb->csum);
1026 				data += fraggap;
1027 				skb_trim(skb_prev, maxfraglen);
1028 			}
1029 			copy = datalen - transhdrlen - fraggap;
1030 			if (copy < 0) {
1031 				err = -EINVAL;
1032 				kfree_skb(skb);
1033 				goto error;
1034 			} else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
1035 				err = -EFAULT;
1036 				kfree_skb(skb);
1037 				goto error;
1038 			}
1039 
1040 			offset += copy;
1041 			length -= datalen - fraggap;
1042 			transhdrlen = 0;
1043 			exthdrlen = 0;
1044 			csummode = CHECKSUM_NONE;
1045 
1046 			/*
1047 			 * Put the packet on the pending queue
1048 			 */
1049 			__skb_queue_tail(&sk->sk_write_queue, skb);
1050 			continue;
1051 		}
1052 
1053 		if (copy > length)
1054 			copy = length;
1055 
1056 		if (!(rt->u.dst.dev->features&NETIF_F_SG)) {
1057 			unsigned int off;
1058 
1059 			off = skb->len;
1060 			if (getfrag(from, skb_put(skb, copy),
1061 						offset, copy, off, skb) < 0) {
1062 				__skb_trim(skb, off);
1063 				err = -EFAULT;
1064 				goto error;
1065 			}
1066 		} else {
1067 			int i = skb_shinfo(skb)->nr_frags;
1068 			skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
1069 			struct page *page = sk->sk_sndmsg_page;
1070 			int off = sk->sk_sndmsg_off;
1071 			unsigned int left;
1072 
1073 			if (page && (left = PAGE_SIZE - off) > 0) {
1074 				if (copy >= left)
1075 					copy = left;
1076 				if (page != frag->page) {
1077 					if (i == MAX_SKB_FRAGS) {
1078 						err = -EMSGSIZE;
1079 						goto error;
1080 					}
1081 					get_page(page);
1082 					skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
1083 					frag = &skb_shinfo(skb)->frags[i];
1084 				}
1085 			} else if (i < MAX_SKB_FRAGS) {
1086 				if (copy > PAGE_SIZE)
1087 					copy = PAGE_SIZE;
1088 				page = alloc_pages(sk->sk_allocation, 0);
1089 				if (page == NULL) {
1090 					err = -ENOMEM;
1091 					goto error;
1092 				}
1093 				sk->sk_sndmsg_page = page;
1094 				sk->sk_sndmsg_off = 0;
1095 
1096 				skb_fill_page_desc(skb, i, page, 0, 0);
1097 				frag = &skb_shinfo(skb)->frags[i];
1098 				skb->truesize += PAGE_SIZE;
1099 				atomic_add(PAGE_SIZE, &sk->sk_wmem_alloc);
1100 			} else {
1101 				err = -EMSGSIZE;
1102 				goto error;
1103 			}
1104 			if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
1105 				err = -EFAULT;
1106 				goto error;
1107 			}
1108 			sk->sk_sndmsg_off += copy;
1109 			frag->size += copy;
1110 			skb->len += copy;
1111 			skb->data_len += copy;
1112 		}
1113 		offset += copy;
1114 		length -= copy;
1115 	}
1116 	return 0;
1117 error:
1118 	inet->cork.length -= length;
1119 	IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
1120 	return err;
1121 }
1122 
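/*
 * Take all skbs queued by ip6_append_data(), chain the trailing ones
 * onto the first skb's frag_list, push the extension headers and the
 * IPv6 header, and send the result through the LOCAL_OUT netfilter hook.
 */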
1123 int ip6_push_pending_frames(struct sock *sk)
1124 {
1125 	struct sk_buff *skb, *tmp_skb;
1126 	struct sk_buff **tail_skb;
1127 	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1128 	struct inet_sock *inet = inet_sk(sk);
1129 	struct ipv6_pinfo *np = inet6_sk(sk);
1130 	struct ipv6hdr *hdr;
1131 	struct ipv6_txoptions *opt = np->cork.opt;
1132 	struct rt6_info *rt = np->cork.rt;
1133 	struct flowi *fl = &inet->cork.fl;
1134 	unsigned char proto = fl->proto;
1135 	int err = 0;
1136 
1137 	if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
1138 		goto out;
1139 	tail_skb = &(skb_shinfo(skb)->frag_list);
1140 
1141 	/* advance skb->data from the ext header area to the IPv6 header */
1142 	if (skb->data < skb->nh.raw)
1143 		__skb_pull(skb, skb->nh.raw - skb->data);
1144 	while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
1145 		__skb_pull(tmp_skb, skb->h.raw - skb->nh.raw);
1146 		*tail_skb = tmp_skb;
1147 		tail_skb = &(tmp_skb->next);
1148 		skb->len += tmp_skb->len;
1149 		skb->data_len += tmp_skb->len;
1150 		skb->truesize += tmp_skb->truesize;
1151 		__sock_put(tmp_skb->sk);
1152 		tmp_skb->destructor = NULL;
1153 		tmp_skb->sk = NULL;
1154 	}
1155 
1156 	ipv6_addr_copy(final_dst, &fl->fl6_dst);
1157 	__skb_pull(skb, skb->h.raw - skb->nh.raw);
1158 	if (opt && opt->opt_flen)
1159 		ipv6_push_frag_opts(skb, opt, &proto);
1160 	if (opt && opt->opt_nflen)
1161 		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);
1162 
1163 	skb->nh.ipv6h = hdr = (struct ipv6hdr*) skb_push(skb, sizeof(struct ipv6hdr));
1164 
1165 	*(u32*)hdr = fl->fl6_flowlabel |
1166 		     htonl(0x60000000 | ((int)np->cork.tclass << 20));
1167 
1168 	if (skb->len <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN)
1169 		hdr->payload_len = htons(skb->len - sizeof(struct ipv6hdr));
1170 	else
1171 		hdr->payload_len = 0;
1172 	hdr->hop_limit = np->cork.hop_limit;
1173 	hdr->nexthdr = proto;
1174 	ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
1175 	ipv6_addr_copy(&hdr->daddr, final_dst);
1176 
1177 	skb->dst = dst_clone(&rt->u.dst);
1178 	IP6_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
1179 	err = NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, skb->dst->dev, dst_output);
1180 	if (err) {
1181 		if (err > 0)
1182 			err = np->recverr ? net_xmit_errno(err) : 0;
1183 		if (err)
1184 			goto error;
1185 	}
1186 
1187 out:
1188 	inet->cork.flags &= ~IPCORK_OPT;
1189 	if (np->cork.opt) {
1190 		kfree(np->cork.opt);
1191 		np->cork.opt = NULL;
1192 	}
1193 	if (np->cork.rt) {
1194 		dst_release(&np->cork.rt->u.dst);
1195 		np->cork.rt = NULL;
1196 		inet->cork.flags &= ~IPCORK_ALLFRAG;
1197 	}
1198 	memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
1199 	return err;
1200 error:
1201 	goto out;
1202 }
1203 
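/*
 * Drop every pending skb on the cork queue and reset the cork state;
 * used when an error aborts a corked transmission.
 */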
1204 void ip6_flush_pending_frames(struct sock *sk)
1205 {
1206 	struct inet_sock *inet = inet_sk(sk);
1207 	struct ipv6_pinfo *np = inet6_sk(sk);
1208 	struct sk_buff *skb;
1209 
1210 	while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
1211 		IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
1212 		kfree_skb(skb);
1213 	}
1214 
1215 	inet->cork.flags &= ~IPCORK_OPT;
1216 
1217 	if (np->cork.opt) {
1218 		kfree(np->cork.opt);
1219 		np->cork.opt = NULL;
1220 	}
1221 	if (np->cork.rt) {
1222 		dst_release(&np->cork.rt->u.dst);
1223 		np->cork.rt = NULL;
1224 		inet->cork.flags &= ~IPCORK_ALLFRAG;
1225 	}
1226 	memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
1227 }
1228