/*
 *	IPv6 output functions
 *	Linux INET6 implementation
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	Based on linux/net/ipv4/ip_output.c
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 *
 *	Changes:
 *	A.N.Kuznetsov	:	arithmetics in fragmentation.
 *				extension headers are implemented.
 *				route changes now work.
 *				ip6_forward does not confuse sniffers.
 *				etc.
 *
 *	H. von Brand	:	Added missing #include <linux/string.h>
 *	Imran Patel	:	frag id should be in NBO
 *	Kazunori MIYAZAWA @USAGI
 *			:	add ip6_append_data and related functions
 *				for datagram xmit
 */

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/in6.h>
#include <linux/tcp.h>
#include <linux/route.h>
#include <linux/module.h>

#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>

#include <net/sock.h>
#include <net/snmp.h>

#include <net/ipv6.h>
#include <net/ndisc.h>
#include <net/protocol.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
#include <net/rawv6.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/checksum.h>
#include <linux/mroute6.h>

static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));

static __inline__ void ipv6_select_ident(struct sk_buff *skb, struct frag_hdr *fhdr)
{
	static u32 ipv6_fragmentation_id = 1;
	static DEFINE_SPINLOCK(ip6_id_lock);

	spin_lock_bh(&ip6_id_lock);
	fhdr->identification = htonl(ipv6_fragmentation_id);
	if (++ipv6_fragmentation_id == 0)
		ipv6_fragmentation_id = 1;
	spin_unlock_bh(&ip6_id_lock);
}
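
/*
 * A sketch of the wrap behaviour above: the counter hands out
 * 1, 2, 3, ... and, once it reaches 0xffffffff, the increment wraps
 * it to 0, which the if-statement immediately bumps back to 1, so a
 * fragment identification of 0 is never put on the wire.
 */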

int __ip6_local_out(struct sk_buff *skb)
{
	int len;

	len = skb->len - sizeof(struct ipv6hdr);
	if (len > IPV6_MAXPLEN)
		len = 0;
	ipv6_hdr(skb)->payload_len = htons(len);

	return nf_hook(PF_INET6, NF_INET_LOCAL_OUT, skb, NULL, skb->dst->dev,
		       dst_output);
}
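
/*
 * For illustration: payload_len counts everything after the fixed
 * 40-byte IPv6 header, so a 1280-byte packet leaves here with
 * payload_len == htons(1240).  Lengths above IPV6_MAXPLEN (65535)
 * are encoded as 0, the convention jumbograms use (RFC 2675), where
 * a hop-by-hop Jumbo Payload option carries the real length.
 */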

int ip6_local_out(struct sk_buff *skb)
{
	int err;

	err = __ip6_local_out(skb);
	if (likely(err == 1))
		err = dst_output(skb);

	return err;
}
EXPORT_SYMBOL_GPL(ip6_local_out);

static int ip6_output_finish(struct sk_buff *skb)
{
	struct dst_entry *dst = skb->dst;

	if (dst->hh)
		return neigh_hh_output(dst->hh, skb);
	else if (dst->neighbour)
		return dst->neighbour->output(skb);

	IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
	kfree_skb(skb);
	return -EINVAL;
}

/* dev_loopback_xmit for use with netfilter. */
static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
{
	skb_reset_mac_header(newskb);
	__skb_pull(newskb, skb_network_offset(newskb));
	newskb->pkt_type = PACKET_LOOPBACK;
	newskb->ip_summed = CHECKSUM_UNNECESSARY;
	WARN_ON(!newskb->dst);

	netif_rx(newskb);
	return 0;
}
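
/*
 * The loopback copy re-enters the stack through netif_rx(), which is
 * how a multicast sender that is also a group member sees its own
 * packet on the input path.  PACKET_LOOPBACK lets receive-side code
 * tell it apart from packets that arrived on the wire, and
 * CHECKSUM_UNNECESSARY avoids re-verifying a checksum we computed
 * ourselves.
 */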

static int ip6_output2(struct sk_buff *skb)
{
	struct dst_entry *dst = skb->dst;
	struct net_device *dev = dst->dev;

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
		struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
		struct inet6_dev *idev = ip6_dst_idev(skb->dst);

		if (!(dev->flags & IFF_LOOPBACK) && (!np || np->mc_loop) &&
		    ((mroute6_socket && !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
		     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
					 &ipv6_hdr(skb)->saddr))) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

			/* Do not check for IFF_ALLMULTI; multicast routing
			   is not supported in any case.
			 */
			if (newskb)
				NF_HOOK(PF_INET6, NF_INET_POST_ROUTING, newskb,
					NULL, newskb->dev,
					ip6_dev_loopback_xmit);

			if (ipv6_hdr(skb)->hop_limit == 0) {
				IP6_INC_STATS(idev, IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return 0;
			}
		}

		IP6_INC_STATS(idev, IPSTATS_MIB_OUTMCASTPKTS);
	}

	return NF_HOOK(PF_INET6, NF_INET_POST_ROUTING, skb, NULL, skb->dev,
		       ip6_output_finish);
}

static inline int ip6_skb_dst_mtu(struct sk_buff *skb)
{
	struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;

	return (np && np->pmtudisc == IPV6_PMTUDISC_PROBE) ?
	       skb->dst->dev->mtu : dst_mtu(skb->dst);
}

int ip6_output(struct sk_buff *skb)
{
	struct inet6_dev *idev = ip6_dst_idev(skb->dst);

	if (unlikely(idev->cnf.disable_ipv6)) {
		IP6_INC_STATS(idev, IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
		return 0;
	}

	if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
	    dst_allfrag(skb->dst))
		return ip6_fragment(skb, ip6_output2);
	else
		return ip6_output2(skb);
}
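
/*
 * A worked example of the decision above, assuming a non-GSO skb:
 * with skb->len == 1500 and a path MTU of 1280 the packet goes
 * through ip6_fragment(); at skb->len == 1000 it is handed straight
 * to ip6_output2().  dst_allfrag() forces a fragment header even on
 * small packets, e.g. after a Packet Too Big message advertised an
 * MTU below the IPv6 minimum of 1280.
 */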

/*
 *	xmit an sk_buff (used by TCP)
 */

int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
	     struct ipv6_txoptions *opt, int ipfragok)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct in6_addr *first_hop = &fl->fl6_dst;
	struct dst_entry *dst = skb->dst;
	struct ipv6hdr *hdr;
	u8  proto = fl->proto;
	int seg_len = skb->len;
	int hlimit, tclass;
	u32 mtu;

	if (opt) {
		unsigned int head_room;

		/* First: exthdrs may take lots of space (~8K for now)
		   MAX_HEADER is not enough.
		 */
		head_room = opt->opt_nflen + opt->opt_flen;
		seg_len += head_room;
		head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);

		if (skb_headroom(skb) < head_room) {
			struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
			if (skb2 == NULL) {
				IP6_INC_STATS(ip6_dst_idev(skb->dst),
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return -ENOBUFS;
			}
			kfree_skb(skb);
			skb = skb2;
			if (sk)
				skb_set_owner_w(skb, sk);
		}
		if (opt->opt_flen)
			ipv6_push_frag_opts(skb, opt, &proto);
		if (opt->opt_nflen)
			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
	}

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	/*
	 *	Fill in the IPv6 header
	 */

	hlimit = -1;
	if (np)
		hlimit = np->hop_limit;
	if (hlimit < 0)
		hlimit = ip6_dst_hoplimit(dst);

	tclass = -1;
	if (np)
		tclass = np->tclass;
	if (tclass < 0)
		tclass = 0;

	*(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl->fl6_flowlabel;
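	/*
	 * Layout of the 32 bits written above, in host order before
	 * htonl(): version (4 bits) | traffic class (8 bits) |
	 * flow label (20 bits).  0x60000000 is version 6 << 28; e.g.
	 * tclass == 0x1a yields htonl(0x61a00000), and fl6_flowlabel
	 * (already big-endian, low 20 bits) is OR-ed in afterwards.
	 */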

	hdr->payload_len = htons(seg_len);
	hdr->nexthdr = proto;
	hdr->hop_limit = hlimit;

	ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
	ipv6_addr_copy(&hdr->daddr, first_hop);

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	mtu = dst_mtu(dst);
	if ((skb->len <= mtu) || ipfragok || skb_is_gso(skb)) {
		IP6_INC_STATS(ip6_dst_idev(skb->dst),
			      IPSTATS_MIB_OUTREQUESTS);
		return NF_HOOK(PF_INET6, NF_INET_LOCAL_OUT, skb, NULL, dst->dev,
			       dst_output);
	}

	if (net_ratelimit())
		printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
	skb->dev = dst->dev;
	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
	IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return -EMSGSIZE;
}

EXPORT_SYMBOL(ip6_xmit);

/*
 *	To avoid extra problems, ND packets are sent through this
 *	routine.  It is code duplication, but we really want to avoid
 *	extra checks, since ipv6_build_header is used by TCP (which
 *	is performance critical for us).
 */

int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
	       const struct in6_addr *saddr, const struct in6_addr *daddr,
	       int proto, int len)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct ipv6hdr *hdr;
	int totlen;

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	totlen = len + sizeof(struct ipv6hdr);

	skb_reset_network_header(skb);
	skb_put(skb, sizeof(struct ipv6hdr));
	hdr = ipv6_hdr(skb);

	*(__be32 *)hdr = htonl(0x60000000);

	hdr->payload_len = htons(len);
	hdr->nexthdr = proto;
	hdr->hop_limit = np->hop_limit;

	ipv6_addr_copy(&hdr->saddr, saddr);
	ipv6_addr_copy(&hdr->daddr, daddr);

	return 0;
}

static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
	struct ip6_ra_chain *ra;
	struct sock *last = NULL;

	read_lock(&ip6_ra_lock);
	for (ra = ip6_ra_chain; ra; ra = ra->next) {
		struct sock *sk = ra->sk;
		if (sk && ra->sel == sel &&
		    (!sk->sk_bound_dev_if ||
		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
			if (last) {
				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
				if (skb2)
					rawv6_rcv(last, skb2);
			}
			last = sk;
		}
	}

	if (last) {
		rawv6_rcv(last, skb);
		read_unlock(&ip6_ra_lock);
		return 1;
	}
	read_unlock(&ip6_ra_lock);
	return 0;
}
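
/*
 * The delivery pattern above, spelled out: for N matching Router
 * Alert sockets, the first N-1 each receive a clone of the skb and
 * the last one receives the original, so exactly N-1 clones are made
 * and the caller gives up ownership when 1 is returned.  With no
 * match, the skb is untouched and 0 tells the caller to keep
 * forwarding it.
 */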

static int ip6_forward_proxy_check(struct sk_buff *skb)
{
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	u8 nexthdr = hdr->nexthdr;
	int offset;

	if (ipv6_ext_hdr(nexthdr)) {
		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr);
		if (offset < 0)
			return 0;
	} else
		offset = sizeof(struct ipv6hdr);

	if (nexthdr == IPPROTO_ICMPV6) {
		struct icmp6hdr *icmp6;

		if (!pskb_may_pull(skb, (skb_network_header(skb) +
					 offset + 1 - skb->data)))
			return 0;

		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);

		switch (icmp6->icmp6_type) {
		case NDISC_ROUTER_SOLICITATION:
		case NDISC_ROUTER_ADVERTISEMENT:
		case NDISC_NEIGHBOUR_SOLICITATION:
		case NDISC_NEIGHBOUR_ADVERTISEMENT:
		case NDISC_REDIRECT:
			/* A unicast neighbor discovery message destined
			 * to the proxied address is passed to the input
			 * function.
			 */
			return 1;
		default:
			break;
		}
	}

	/*
	 * The proxying router can't forward traffic sent to a link-local
	 * address, so signal the sender and discard the packet. This
	 * behavior is clarified by the MIPv6 specification.
	 */
	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
		dst_link_failure(skb);
		return -1;
	}

	return 0;
}

static inline int ip6_forward_finish(struct sk_buff *skb)
{
	return dst_output(skb);
}

int ip6_forward(struct sk_buff *skb)
{
	struct dst_entry *dst = skb->dst;
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	struct inet6_skb_parm *opt = IP6CB(skb);
	struct net *net = dev_net(dst->dev);

	if (net->ipv6.devconf_all->forwarding == 0)
		goto error;

	if (skb_warn_if_lro(skb))
		goto drop;

	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
		IP6_INC_STATS(ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
		goto drop;
	}

	skb_forward_csum(skb);

	/*
	 *	We do NOT process RA packets here; they are pushed to
	 *	user level AS IS, without any warranty that an
	 *	application will be able to interpret them, because
	 *	we cannot do anything clever with them here.
	 *
	 *	We are not an end node, so if the packet contains
	 *	AH/ESP we cannot do anything either.  Defragmentation
	 *	would also be a mistake: RA packets must not be
	 *	fragmented, because there is no warranty that
	 *	different fragments will go along one path. --ANK
	 */
	if (opt->ra) {
		u8 *ptr = skb_network_header(skb) + opt->ra;
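		/*
		 * ptr addresses the Router Alert option: type and
		 * length octets, then a 16-bit value in network byte
		 * order, so (ptr[2]<<8) + ptr[3] recovers that value
		 * in host order (value 0 means MLD, per RFC 2711).
		 */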
		if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
			return 0;
	}

	/*
	 *	check and decrement ttl
	 */
	if (hdr->hop_limit <= 1) {
		/* Force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT,
			    0, skb->dev);
		IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);

		kfree_skb(skb);
		return -ETIMEDOUT;
	}

	/* XXX: idev->cnf.proxy_ndp? */
	if (net->ipv6.devconf_all->proxy_ndp &&
	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
		int proxied = ip6_forward_proxy_check(skb);
		if (proxied > 0)
			return ip6_input(skb);
		else if (proxied < 0) {
			IP6_INC_STATS(ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
			goto drop;
		}
	}

	if (!xfrm6_route_forward(skb)) {
		IP6_INC_STATS(ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
		goto drop;
	}
	dst = skb->dst;

	/* The IPv6 specs say nothing about it, but it is clear that we
	   cannot send redirects to source routed frames.
	   We don't send redirects to frames decapsulated from IPsec.
	 */
	if (skb->dev == dst->dev && dst->neighbour && opt->srcrt == 0 &&
	    !skb->sp) {
		struct in6_addr *target = NULL;
		struct rt6_info *rt;
		struct neighbour *n = dst->neighbour;

		/*
		 *	incoming and outgoing devices are the same:
		 *	send a redirect.
		 */

		rt = (struct rt6_info *) dst;
		if ((rt->rt6i_flags & RTF_GATEWAY))
			target = (struct in6_addr *)&n->primary_key;
		else
			target = &hdr->daddr;

		/* Limit redirects both by destination (here)
		   and by source (inside ndisc_send_redirect)
		 */
		if (xrlim_allow(dst, 1*HZ))
			ndisc_send_redirect(skb, n, target);
	} else {
		int addrtype = ipv6_addr_type(&hdr->saddr);

		/* This check is security critical. */
		if (addrtype == IPV6_ADDR_ANY ||
		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
			goto error;
		if (addrtype & IPV6_ADDR_LINKLOCAL) {
			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
				    ICMPV6_NOT_NEIGHBOUR, 0, skb->dev);
			goto error;
		}
	}

	if (skb->len > dst_mtu(dst)) {
		/* Again, force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, dst_mtu(dst), skb->dev);
		IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS);
		IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	if (skb_cow(skb, dst->dev->hard_header_len)) {
		IP6_INC_STATS(ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
		goto drop;
	}

	hdr = ipv6_hdr(skb);

	/* Decrementing the hop limit is delayed until after the skb COW */

	hdr->hop_limit--;

	IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
	return NF_HOOK(PF_INET6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
		       ip6_forward_finish);

error:
	IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
drop:
	kfree_skb(skb);
	return -EINVAL;
}

static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	dst_release(to->dst);
	to->dst = dst_clone(from->dst);
	to->dev = from->dev;
	to->mark = from->mark;

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
	nf_copy(to, from);
#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
    defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
	to->nf_trace = from->nf_trace;
#endif
	skb_copy_secmark(to, from);
}

int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
{
	u16 offset = sizeof(struct ipv6hdr);
	struct ipv6_opt_hdr *exthdr =
				(struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1);
	unsigned int packet_len = skb->tail - skb->network_header;
	int found_rhdr = 0;
	*nexthdr = &ipv6_hdr(skb)->nexthdr;

	while (offset + 1 <= packet_len) {

		switch (**nexthdr) {

		case NEXTHDR_HOP:
			break;
		case NEXTHDR_ROUTING:
			found_rhdr = 1;
			break;
		case NEXTHDR_DEST:
#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
			if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0)
				break;
#endif
			if (found_rhdr)
				return offset;
			break;
		default:
			return offset;
		}

		offset += ipv6_optlen(exthdr);
		*nexthdr = &exthdr->nexthdr;
		exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) +
						 offset);
	}

	return offset;
}
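
/*
 * The walk above mirrors the RFC 2460 notion of the "unfragmentable
 * part": hop-by-hop, routing and (for Mobile IPv6, when a Home
 * Address option is present) destination options headers must stay
 * in front of the fragment header, so the returned offset is where
 * one can be inserted, with *nexthdr left pointing at the byte to be
 * rewritten to NEXTHDR_FRAGMENT.  For a plain TCP packet with no
 * extension headers this is simply sizeof(struct ipv6hdr), i.e. 40.
 */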

static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
{
	struct net_device *dev;
	struct sk_buff *frag;
	struct rt6_info *rt = (struct rt6_info *)skb->dst;
	struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
	struct ipv6hdr *tmp_hdr;
	struct frag_hdr *fh;
	unsigned int mtu, hlen, left, len;
	__be32 frag_id = 0;
	int ptr, offset = 0, err = 0;
	u8 *prevhdr, nexthdr = 0;

	dev = rt->u.dst.dev;
	hlen = ip6_find_1stfragopt(skb, &prevhdr);
	nexthdr = *prevhdr;

	mtu = ip6_skb_dst_mtu(skb);

	/* We must not fragment if the socket is set to force MTU discovery
	 * or if the skb is not generated by a local socket.  (This last
	 * check should be redundant, but it's free.)
	 */
	if (!skb->local_df) {
		skb->dev = skb->dst->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
		IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	if (np && np->frag_size < mtu) {
		if (np->frag_size)
			mtu = np->frag_size;
	}
	mtu -= hlen + sizeof(struct frag_hdr);
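
	/*
	 * Worked example, assuming no extension headers: a 1280-byte
	 * path MTU minus hlen == 40 (the IPv6 header) and the 8-byte
	 * fragment header leaves mtu == 1232 bytes of payload per
	 * fragment, before 8-byte alignment below.
	 */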

	if (skb_shinfo(skb)->frag_list) {
		int first_len = skb_pagelen(skb);
		int truesizes = 0;

		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    skb_cloned(skb))
			goto slow_path;

		for (frag = skb_shinfo(skb)->frag_list; frag; frag = frag->next) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < hlen)
				goto slow_path;

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path;

			BUG_ON(frag->sk);
			if (skb->sk) {
				sock_hold(skb->sk);
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
				truesizes += frag->truesize;
			}
		}

		err = 0;
		offset = 0;
		frag = skb_shinfo(skb)->frag_list;
		skb_shinfo(skb)->frag_list = NULL;
		/* BUILD HEADER */

		*prevhdr = NEXTHDR_FRAGMENT;
		tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
		if (!tmp_hdr) {
			IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_FRAGFAILS);
			return -ENOMEM;
		}

		__skb_pull(skb, hlen);
		fh = (struct frag_hdr *)__skb_push(skb, sizeof(struct frag_hdr));
		__skb_push(skb, hlen);
		skb_reset_network_header(skb);
		memcpy(skb_network_header(skb), tmp_hdr, hlen);

		ipv6_select_ident(skb, fh);
		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		fh->frag_off = htons(IP6_MF);
		frag_id = fh->identification;

		first_len = skb_pagelen(skb);
		skb->data_len = first_len - skb_headlen(skb);
		skb->truesize -= truesizes;
		skb->len = first_len;
		ipv6_hdr(skb)->payload_len = htons(first_len -
						   sizeof(struct ipv6hdr));

		dst_hold(&rt->u.dst);

		for (;;) {
			/* Prepare the header of the next frame,
			 * before the previous one goes down. */
			if (frag) {
				frag->ip_summed = CHECKSUM_NONE;
				skb_reset_transport_header(frag);
				fh = (struct frag_hdr *)__skb_push(frag, sizeof(struct frag_hdr));
				__skb_push(frag, hlen);
				skb_reset_network_header(frag);
				memcpy(skb_network_header(frag), tmp_hdr,
				       hlen);
				offset += skb->len - hlen - sizeof(struct frag_hdr);
				fh->nexthdr = nexthdr;
				fh->reserved = 0;
				fh->frag_off = htons(offset);
				if (frag->next != NULL)
					fh->frag_off |= htons(IP6_MF);
				fh->identification = frag_id;
				ipv6_hdr(frag)->payload_len =
						htons(frag->len -
						      sizeof(struct ipv6hdr));
				ip6_copy_metadata(frag, skb);
			}

			err = output(skb);
			if (!err)
				IP6_INC_STATS(ip6_dst_idev(&rt->u.dst), IPSTATS_MIB_FRAGCREATES);

			if (err || !frag)
				break;

			skb = frag;
			frag = skb->next;
			skb->next = NULL;
		}

		kfree(tmp_hdr);

		if (err == 0) {
			IP6_INC_STATS(ip6_dst_idev(&rt->u.dst), IPSTATS_MIB_FRAGOKS);
			dst_release(&rt->u.dst);
			return 0;
		}

		while (frag) {
			skb = frag->next;
			kfree_skb(frag);
			frag = skb;
		}

		IP6_INC_STATS(ip6_dst_idev(&rt->u.dst), IPSTATS_MIB_FRAGFAILS);
		dst_release(&rt->u.dst);
		return err;
	}

slow_path:
	left = skb->len - hlen;		/* Space per frame */
	ptr = hlen;			/* Where to start from */

	/*
	 *	Fragment the datagram.
	 */

	*prevhdr = NEXTHDR_FRAGMENT;

	/*
	 *	Keep copying data until we run out.
	 */
	while (left > 0) {
		len = left;
		/* IF: it doesn't fit, use 'mtu' - the data space left */
		if (len > mtu)
			len = mtu;
		/* IF: we are not sending up to and including the packet end
		   then align the next start on an eight byte boundary */
		if (len < left) {
			len &= ~7;
		}
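		/*
		 * e.g. 1232 usable bytes stay at 1232 (already a
		 * multiple of 8), while a value such as 1230 is
		 * rounded down to 1224, keeping every non-final
		 * fragment's offset 8-byte aligned as the frag_off
		 * encoding requires.
		 */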
		/*
		 *	Allocate buffer.
		 */

		if ((frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
				      LL_ALLOCATED_SPACE(rt->u.dst.dev),
				      GFP_ATOMIC)) == NULL) {
			NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
			IP6_INC_STATS(ip6_dst_idev(skb->dst),
				      IPSTATS_MIB_FRAGFAILS);
			err = -ENOMEM;
			goto fail;
		}

		/*
		 *	Set up data on packet
		 */

		ip6_copy_metadata(frag, skb);
		skb_reserve(frag, LL_RESERVED_SPACE(rt->u.dst.dev));
		skb_put(frag, len + hlen + sizeof(struct frag_hdr));
		skb_reset_network_header(frag);
		fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
		frag->transport_header = (frag->network_header + hlen +
					  sizeof(struct frag_hdr));

		/*
		 *	Charge the memory for the fragment to any owner
		 *	it might possess
		 */
		if (skb->sk)
			skb_set_owner_w(frag, skb->sk);

		/*
		 *	Copy the packet header into the new buffer.
		 */
		skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);

		/*
		 *	Build fragment header.
		 */
		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		if (!frag_id) {
			ipv6_select_ident(skb, fh);
			frag_id = fh->identification;
		} else
			fh->identification = frag_id;

		/*
		 *	Copy a block of the IP datagram.
		 */
		if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len))
			BUG();
		left -= len;

		fh->frag_off = htons(offset);
		if (left > 0)
			fh->frag_off |= htons(IP6_MF);
		ipv6_hdr(frag)->payload_len = htons(frag->len -
						    sizeof(struct ipv6hdr));

		ptr += len;
		offset += len;

		/*
		 *	Put this fragment into the sending queue.
		 */
		err = output(frag);
		if (err)
			goto fail;

		IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_FRAGCREATES);
	}
	IP6_INC_STATS(ip6_dst_idev(skb->dst),
		      IPSTATS_MIB_FRAGOKS);
	kfree_skb(skb);
	return err;

fail:
	IP6_INC_STATS(ip6_dst_idev(skb->dst),
		      IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return err;
}

static inline int ip6_rt_check(struct rt6key *rt_key,
			       struct in6_addr *fl_addr,
			       struct in6_addr *addr_cache)
{
	return ((rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
		(addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache)));
}
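
/*
 * A reading of the predicate above: it returns "this route may be
 * stale".  A host route (plen == 128) still matches only if its
 * address equals the flow's address; failing that, the cached
 * daddr_cache/saddr_cache pointer is consulted, so on a connected
 * socket either test can validate the route and the dst is kept.
 */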

static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
					  struct dst_entry *dst,
					  struct flowi *fl)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct rt6_info *rt = (struct rt6_info *)dst;

	if (!dst)
		goto out;

	/* Yes, checking route validity in the unconnected case is not
	 * very simple.  Take into account that we do not support
	 * routing by source, TOS, or MSG_DONTROUTE	--ANK (980726)
	 *
	 * 1. ip6_rt_check(): If the route was a host route, check that
	 *    the cached destination is current.  If it is a network
	 *    route, we still may check its validity using the saved
	 *    pointer to the last used address: daddr_cache.  We do not
	 *    want to save the whole address now (because the main
	 *    consumer of this service is TCP, which does not have this
	 *    problem), so the last trick works only on connected
	 *    sockets.
	 * 2. oif also should be the same.
	 */
	if (ip6_rt_check(&rt->rt6i_dst, &fl->fl6_dst, np->daddr_cache) ||
#ifdef CONFIG_IPV6_SUBTREES
	    ip6_rt_check(&rt->rt6i_src, &fl->fl6_src, np->saddr_cache) ||
#endif
	    (fl->oif && fl->oif != dst->dev->ifindex)) {
		dst_release(dst);
		dst = NULL;
	}

out:
	return dst;
}

static int ip6_dst_lookup_tail(struct sock *sk,
			       struct dst_entry **dst, struct flowi *fl)
{
	int err;
	struct net *net = sock_net(sk);

	if (*dst == NULL)
		*dst = ip6_route_output(net, sk, fl);

	if ((err = (*dst)->error))
		goto out_err_release;

	if (ipv6_addr_any(&fl->fl6_src)) {
		err = ipv6_dev_get_saddr(ip6_dst_idev(*dst)->dev,
					 &fl->fl6_dst,
					 sk ? inet6_sk(sk)->srcprefs : 0,
					 &fl->fl6_src);
		if (err)
			goto out_err_release;
	}

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	/*
	 * If the dst entry we've looked up has a neighbour entry that
	 * is in the INCOMPLETE state, and the source address from the
	 * flow is marked as OPTIMISTIC, we release the found dst entry
	 * and replace it with the dst entry of the nexthop router.
	 */
	if (!((*dst)->neighbour->nud_state & NUD_VALID)) {
		struct inet6_ifaddr *ifp;
		struct flowi fl_gw;
		int redirect;

		ifp = ipv6_get_ifaddr(net, &fl->fl6_src,
				      (*dst)->dev, 1);

		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
		if (ifp)
			in6_ifa_put(ifp);

		if (redirect) {
			/*
			 * We need to get the dst entry for the
			 * default router instead
			 */
			dst_release(*dst);
			memcpy(&fl_gw, fl, sizeof(struct flowi));
			memset(&fl_gw.fl6_dst, 0, sizeof(struct in6_addr));
			*dst = ip6_route_output(net, sk, &fl_gw);
			if ((err = (*dst)->error))
				goto out_err_release;
		}
	}
#endif

	return 0;

out_err_release:
	if (err == -ENETUNREACH)
		IP6_INC_STATS_BH(NULL, IPSTATS_MIB_OUTNOROUTES);
	dst_release(*dst);
	*dst = NULL;
	return err;
}

/**
 *	ip6_dst_lookup - perform route lookup on flow
 *	@sk: socket which provides route info
 *	@dst: pointer to dst_entry * for result
 *	@fl: flow to lookup
 *
 *	This function performs a route lookup on the given flow.
 *
 *	It returns zero on success, or a standard errno code on error.
 */
int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
{
	*dst = NULL;
	return ip6_dst_lookup_tail(sk, dst, fl);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup);

/**
 *	ip6_sk_dst_lookup - perform socket cached route lookup on flow
 *	@sk: socket which provides the dst cache and route info
 *	@dst: pointer to dst_entry * for result
 *	@fl: flow to lookup
 *
 *	This function performs a route lookup on the given flow with the
 *	possibility of using the cached route in the socket if it is valid.
 *	It will take the socket dst lock when operating on the dst cache.
 *	As a result, this function can only be used in process context.
 *
 *	It returns zero on success, or a standard errno code on error.
 */
int ip6_sk_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
{
	*dst = NULL;
	if (sk) {
		*dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
		*dst = ip6_sk_dst_check(sk, *dst, fl);
	}

	return ip6_dst_lookup_tail(sk, dst, fl);
}
EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup);
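
/*
 * A minimal calling sketch (hypothetical caller, for illustration
 * only; error handling trimmed, some_daddr is an assumed variable):
 *
 *	struct flowi fl = { .proto = IPPROTO_UDP };
 *	struct dst_entry *dst;
 *	int err;
 *
 *	ipv6_addr_copy(&fl.fl6_dst, &some_daddr);
 *	err = ip6_sk_dst_lookup(sk, &dst, &fl);
 *	if (err)
 *		return err;	(dst is NULL on failure)
 *	...use dst, then dst_release(dst) when done.
 */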

static inline int ip6_ufo_append_data(struct sock *sk,
			int getfrag(void *from, char *to, int offset, int len,
			int odd, struct sk_buff *skb),
			void *from, int length, int hh_len, int fragheaderlen,
			int transhdrlen, int mtu, unsigned int flags)
{
	struct sk_buff *skb;
	int err;

	/* There is support for UDP large send offload by the network
	 * device, so create one single skb packet containing the
	 * complete UDP datagram.
	 */
	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
		skb = sock_alloc_send_skb(sk,
			hh_len + fragheaderlen + transhdrlen + 20,
			(flags & MSG_DONTWAIT), &err);
		if (skb == NULL)
			return -ENOMEM;

		/* reserve space for the hardware header */
		skb_reserve(skb, hh_len);

		/* create space for the UDP/IP header */
		skb_put(skb, fragheaderlen + transhdrlen);

		/* initialize the network header pointer */
		skb_reset_network_header(skb);

		/* initialize the protocol header pointer */
		skb->transport_header = skb->network_header + fragheaderlen;

		skb->ip_summed = CHECKSUM_PARTIAL;
		skb->csum = 0;
		sk->sk_sndmsg_off = 0;
	}

	err = skb_append_datato_frags(sk, skb, getfrag, from,
				      (length - transhdrlen));
	if (!err) {
		struct frag_hdr fhdr;

		/* specify the length of each IP datagram fragment */
		skb_shinfo(skb)->gso_size = mtu - fragheaderlen -
					    sizeof(struct frag_hdr);
		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
		ipv6_select_ident(skb, &fhdr);
		skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
		__skb_queue_tail(&sk->sk_write_queue, skb);

		return 0;
	}
	/* There is not enough support to do UDP LSO,
	 * so follow the normal path.
	 */
	kfree_skb(skb);

	return err;
}
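
/*
 * The gso_size arithmetic, by example: with mtu == 1500 and
 * fragheaderlen == 40, each fragment may carry
 * 1500 - 40 - 8 == 1452 bytes of payload, and the device (or the
 * software GSO fallback) slices the single large skb on that
 * boundary, stamping ip6_frag_id into every piece.
 */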

int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
	int offset, int len, int odd, struct sk_buff *skb),
	void *from, int length, int transhdrlen,
	int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi *fl,
	struct rt6_info *rt, unsigned int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct sk_buff *skb;
	unsigned int maxfraglen, fragheaderlen;
	int exthdrlen;
	int hh_len;
	int mtu;
	int copy;
	int err;
	int offset = 0;
	int csummode = CHECKSUM_NONE;

	if (flags & MSG_PROBE)
		return 0;
	if (skb_queue_empty(&sk->sk_write_queue)) {
		/*
		 * setup for corking
		 */
		if (opt) {
			if (np->cork.opt == NULL) {
				np->cork.opt = kmalloc(opt->tot_len,
						       sk->sk_allocation);
				if (unlikely(np->cork.opt == NULL))
					return -ENOBUFS;
			} else if (np->cork.opt->tot_len < opt->tot_len) {
				printk(KERN_DEBUG "ip6_append_data: invalid option length\n");
				return -EINVAL;
			}
			memcpy(np->cork.opt, opt, opt->tot_len);
			inet->cork.flags |= IPCORK_OPT;
			/* need source address above miyazawa */
		}
		dst_hold(&rt->u.dst);
		inet->cork.dst = &rt->u.dst;
		inet->cork.fl = *fl;
		np->cork.hop_limit = hlimit;
		np->cork.tclass = tclass;
		mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
		      rt->u.dst.dev->mtu : dst_mtu(rt->u.dst.path);
		if (np->frag_size < mtu) {
			if (np->frag_size)
				mtu = np->frag_size;
		}
		inet->cork.fragsize = mtu;
		if (dst_allfrag(rt->u.dst.path))
			inet->cork.flags |= IPCORK_ALLFRAG;
		inet->cork.length = 0;
		sk->sk_sndmsg_page = NULL;
		sk->sk_sndmsg_off = 0;
		exthdrlen = rt->u.dst.header_len + (opt ? opt->opt_flen : 0) -
			    rt->rt6i_nfheader_len;
		length += exthdrlen;
		transhdrlen += exthdrlen;
	} else {
		rt = (struct rt6_info *)inet->cork.dst;
		fl = &inet->cork.fl;
		if (inet->cork.flags & IPCORK_OPT)
			opt = np->cork.opt;
		transhdrlen = 0;
		exthdrlen = 0;
		mtu = inet->cork.fragsize;
	}

	hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);

	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
			(opt ? opt->opt_nflen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);
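
	/*
	 * Example with mtu == 1500 and fragheaderlen == 40:
	 * maxfraglen = ((1460 & ~7) + 40) - 8 = 1456 + 40 - 8 = 1488,
	 * the largest packet that still leaves room for a fragment
	 * header while keeping the fragmentable part 8-byte aligned.
	 */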

	if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
		if (inet->cork.length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
			ipv6_local_error(sk, EMSGSIZE, fl, mtu - exthdrlen);
			return -EMSGSIZE;
		}
	}

	/*
	 * Let's try using as much space as possible.
	 * Use the MTU if the total length of the message fits into it.
	 * Otherwise, we need to reserve a fragment header and
	 * fragment alignment (= 8-15 octets, in total).
	 *
	 * Note that we may need to "move" the data from the tail
	 * of the buffer to the new fragment when we split
	 * the message.
	 *
	 * FIXME: It may be fragmented into multiple chunks
	 *        at once if non-fragmentable extension headers
	 *        are too large.
	 * --yoshfuji
	 */

	inet->cork.length += length;
	if (((length > mtu) && (sk->sk_protocol == IPPROTO_UDP)) &&
	    (rt->u.dst.dev->features & NETIF_F_UFO)) {

		err = ip6_ufo_append_data(sk, getfrag, from, length, hh_len,
					  fragheaderlen, transhdrlen, mtu,
					  flags);
		if (err)
			goto error;
		return 0;
	}

	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into the current packet. */
		copy = (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;

		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen;
			struct sk_buff *skb_prev;
alloc_new_skb:
			skb_prev = skb;

			/* There's no room in the current skb */
			if (skb_prev)
				fraggap = skb_prev->len - maxfraglen;
			else
				fraggap = 0;

			/*
			 * If the remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;
			if (datalen > (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
				datalen = maxfraglen - fragheaderlen;

			fraglen = datalen + fragheaderlen;
			if ((flags & MSG_MORE) &&
			    !(rt->u.dst.dev->features & NETIF_F_SG))
				alloclen = mtu;
			else
				alloclen = datalen + fragheaderlen;

			/*
			 * The last fragment gets additional space at the tail.
			 * Note: we overallocate on fragments with MSG_MORE
			 * because we have no idea if we're the last one.
			 */
			if (datalen == length + fraggap)
				alloclen += rt->u.dst.trailer_len;

			/*
			 * We just reserve space for the fragment header.
			 * Note: this may be an overallocation if the message
			 * (without MSG_MORE) fits into the MTU.
			 */
			alloclen += sizeof(struct frag_hdr);

			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk,
						alloclen + hh_len,
						(flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				if (atomic_read(&sk->sk_wmem_alloc) <=
				    2 * sk->sk_sndbuf)
					skb = sock_wmalloc(sk,
							   alloclen + hh_len, 1,
							   sk->sk_allocation);
				if (unlikely(skb == NULL))
					err = -ENOBUFS;
			}
			if (skb == NULL)
				goto error;
			/*
			 *	Fill in the control structures
			 */
			skb->ip_summed = csummode;
			skb->csum = 0;
			/* reserve for fragmentation */
			skb_reserve(skb, hh_len + sizeof(struct frag_hdr));

			/*
			 *	Find where to start putting bytes
			 */
			data = skb_put(skb, fraglen);
			skb_set_network_header(skb, exthdrlen);
			data += fragheaderlen;
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}
			copy = datalen - transhdrlen - fraggap;
			if (copy < 0) {
				err = -EINVAL;
				kfree_skb(skb);
				goto error;
			} else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= datalen - fraggap;
			transhdrlen = 0;
			exthdrlen = 0;
			csummode = CHECKSUM_NONE;

			/*
			 * Put the packet on the pending queue
			 */
			__skb_queue_tail(&sk->sk_write_queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		if (!(rt->u.dst.dev->features & NETIF_F_SG)) {
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
				    offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else {
			int i = skb_shinfo(skb)->nr_frags;
			skb_frag_t *frag = &skb_shinfo(skb)->frags[i - 1];
			struct page *page = sk->sk_sndmsg_page;
			int off = sk->sk_sndmsg_off;
			unsigned int left;

			if (page && (left = PAGE_SIZE - off) > 0) {
				if (copy >= left)
					copy = left;
				if (page != frag->page) {
					if (i == MAX_SKB_FRAGS) {
						err = -EMSGSIZE;
						goto error;
					}
					get_page(page);
					skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
					frag = &skb_shinfo(skb)->frags[i];
				}
			} else if (i < MAX_SKB_FRAGS) {
				if (copy > PAGE_SIZE)
					copy = PAGE_SIZE;
				page = alloc_pages(sk->sk_allocation, 0);
				if (page == NULL) {
					err = -ENOMEM;
					goto error;
				}
				sk->sk_sndmsg_page = page;
				sk->sk_sndmsg_off = 0;

				skb_fill_page_desc(skb, i, page, 0, 0);
				frag = &skb_shinfo(skb)->frags[i];
			} else {
				err = -EMSGSIZE;
				goto error;
			}
			if (getfrag(from,
				    page_address(frag->page) + frag->page_offset + frag->size,
				    offset, copy, skb->len, skb) < 0) {
				err = -EFAULT;
				goto error;
			}
			sk->sk_sndmsg_off += copy;
			frag->size += copy;
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			atomic_add(copy, &sk->sk_wmem_alloc);
		}
		offset += copy;
		length -= copy;
	}
	return 0;
error:
	inet->cork.length -= length;
	IP6_INC_STATS(rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	return err;
}

static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np)
{
	inet->cork.flags &= ~IPCORK_OPT;
	kfree(np->cork.opt);
	np->cork.opt = NULL;
	if (inet->cork.dst) {
		dst_release(inet->cork.dst);
		inet->cork.dst = NULL;
		inet->cork.flags &= ~IPCORK_ALLFRAG;
	}
	memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
}

int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct ipv6hdr *hdr;
	struct ipv6_txoptions *opt = np->cork.opt;
	struct rt6_info *rt = (struct rt6_info *)inet->cork.dst;
	struct flowi *fl = &inet->cork.fl;
	unsigned char proto = fl->proto;
	int err = 0;

	if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to the IP header, from the ext header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		__sock_put(tmp_skb->sk);
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Allow local fragmentation. */
	if (np->pmtudisc < IPV6_PMTUDISC_DO)
		skb->local_df = 1;

	ipv6_addr_copy(final_dst, &fl->fl6_dst);
	__skb_pull(skb, skb_network_header_len(skb));
	if (opt && opt->opt_flen)
		ipv6_push_frag_opts(skb, opt, &proto);
	if (opt && opt->opt_nflen)
		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	*(__be32 *)hdr = fl->fl6_flowlabel |
			 htonl(0x60000000 | ((int)np->cork.tclass << 20));

	hdr->hop_limit = np->cork.hop_limit;
	hdr->nexthdr = proto;
	ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
	ipv6_addr_copy(&hdr->daddr, final_dst);

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	skb->dst = dst_clone(&rt->u.dst);
	IP6_INC_STATS(rt->rt6i_idev, IPSTATS_MIB_OUTREQUESTS);
	if (proto == IPPROTO_ICMPV6) {
		struct inet6_dev *idev = ip6_dst_idev(skb->dst);

		ICMP6MSGOUT_INC_STATS_BH(idev, icmp6_hdr(skb)->icmp6_type);
		ICMP6_INC_STATS_BH(idev, ICMP6_MIB_OUTMSGS);
	}

	err = ip6_local_out(skb);
	if (err) {
		if (err > 0)
			err = np->recverr ? net_xmit_errno(err) : 0;
		if (err)
			goto error;
	}

out:
	ip6_cork_release(inet, np);
	return err;
error:
	goto out;
}

void ip6_flush_pending_frames(struct sock *sk)
{
	struct sk_buff *skb;

	while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
		if (skb->dst)
			IP6_INC_STATS(ip6_dst_idev(skb->dst),
				      IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
	}

	ip6_cork_release(inet_sk(sk), inet6_sk(sk));
}