1 /*
2  *	IPv6 output functions
3  *	Linux INET6 implementation
4  *
5  *	Authors:
6  *	Pedro Roque		<roque@di.fc.ul.pt>
7  *
8  *	Based on linux/net/ipv4/ip_output.c
9  *
10  *	This program is free software; you can redistribute it and/or
11  *      modify it under the terms of the GNU General Public License
12  *      as published by the Free Software Foundation; either version
13  *      2 of the License, or (at your option) any later version.
14  *
15  *	Changes:
16  *	A.N.Kuznetsov	:	arithmetic in fragmentation.
17  *				extension headers are implemented.
18  *				route changes now work.
19  *				ip6_forward does not confuse sniffers.
20  *				etc.
21  *
22  *      H. von Brand    :       Added missing #include <linux/string.h>
23  *	Imran Patel	: 	frag id should be in NBO
24  *      Kazunori MIYAZAWA @USAGI
25  *			:       add ip6_append_data and related functions
26  *				for datagram xmit
27  */
28 
29 #include <linux/errno.h>
30 #include <linux/kernel.h>
31 #include <linux/string.h>
32 #include <linux/socket.h>
33 #include <linux/net.h>
34 #include <linux/netdevice.h>
35 #include <linux/if_arp.h>
36 #include <linux/in6.h>
37 #include <linux/tcp.h>
38 #include <linux/route.h>
39 #include <linux/module.h>
40 
41 #include <linux/netfilter.h>
42 #include <linux/netfilter_ipv6.h>
43 
44 #include <net/sock.h>
45 #include <net/snmp.h>
46 
47 #include <net/ipv6.h>
48 #include <net/ndisc.h>
49 #include <net/protocol.h>
50 #include <net/ip6_route.h>
51 #include <net/addrconf.h>
52 #include <net/rawv6.h>
53 #include <net/icmp.h>
54 #include <net/xfrm.h>
55 #include <net/checksum.h>
56 #include <linux/mroute6.h>
57 
58 static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));
59 
60 int __ip6_local_out(struct sk_buff *skb)
61 {
62 	int len;
63 
64 	len = skb->len - sizeof(struct ipv6hdr);
65 	if (len > IPV6_MAXPLEN)
66 		len = 0;
67 	ipv6_hdr(skb)->payload_len = htons(len);
68 
69 	return nf_hook(PF_INET6, NF_INET_LOCAL_OUT, skb, NULL, skb_dst(skb)->dev,
70 		       dst_output);
71 }
72 
73 int ip6_local_out(struct sk_buff *skb)
74 {
75 	int err;
76 
77 	err = __ip6_local_out(skb);
78 	if (likely(err == 1))
79 		err = dst_output(skb);
80 
81 	return err;
82 }
83 EXPORT_SYMBOL_GPL(ip6_local_out);
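
/*
 * Illustrative sketch, not from the original source: __ip6_local_out()
 * relies on the nf_hook() convention that a return value of 1 means the
 * NF_INET_LOCAL_OUT verdict was NF_ACCEPT and no hook consumed the
 * packet, so the caller must run the continuation (dst_output()) itself,
 * which is exactly what ip6_local_out() does above.  A hypothetical
 * caller that has already attached a dst and built the IPv6 header:
 */
#if 0	/* hypothetical example, not built */
static int example_send_packet(struct sk_buff *skb)
{
	/* skb->data points at a complete struct ipv6hdr plus payload;
	 * skb_dst(skb) must already be valid, since __ip6_local_out()
	 * dereferences it for the output device.
	 */
	return ip6_local_out(skb);	/* fixes payload_len, runs LOCAL_OUT */
}
#endif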
84 
85 static int ip6_output_finish(struct sk_buff *skb)
86 {
87 	struct dst_entry *dst = skb_dst(skb);
88 
89 	if (dst->hh)
90 		return neigh_hh_output(dst->hh, skb);
91 	else if (dst->neighbour)
92 		return dst->neighbour->output(skb);
93 
94 	IP6_INC_STATS_BH(dev_net(dst->dev),
95 			 ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
96 	kfree_skb(skb);
97 	return -EINVAL;
98 
99 }
100 
101 /* dev_loopback_xmit for use with netfilter. */
102 static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
103 {
104 	skb_reset_mac_header(newskb);
105 	__skb_pull(newskb, skb_network_offset(newskb));
106 	newskb->pkt_type = PACKET_LOOPBACK;
107 	newskb->ip_summed = CHECKSUM_UNNECESSARY;
108 	WARN_ON(!skb_dst(newskb));
109 
110 	netif_rx(newskb);
111 	return 0;
112 }
113 
114 
115 static int ip6_output2(struct sk_buff *skb)
116 {
117 	struct dst_entry *dst = skb_dst(skb);
118 	struct net_device *dev = dst->dev;
119 
120 	skb->protocol = htons(ETH_P_IPV6);
121 	skb->dev = dev;
122 
123 	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
124 		struct ipv6_pinfo* np = skb->sk ? inet6_sk(skb->sk) : NULL;
125 		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
126 
127 		if (!(dev->flags & IFF_LOOPBACK) && (!np || np->mc_loop) &&
128 		    ((mroute6_socket(dev_net(dev)) &&
129 		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
130 		     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
131 					 &ipv6_hdr(skb)->saddr))) {
132 			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
133 
134 			/* Do not check for IFF_ALLMULTI; multicast routing
135 			   is not supported in any case.
136 			 */
137 			if (newskb)
138 				NF_HOOK(PF_INET6, NF_INET_POST_ROUTING, newskb,
139 					NULL, newskb->dev,
140 					ip6_dev_loopback_xmit);
141 
142 			if (ipv6_hdr(skb)->hop_limit == 0) {
143 				IP6_INC_STATS(dev_net(dev), idev,
144 					      IPSTATS_MIB_OUTDISCARDS);
145 				kfree_skb(skb);
146 				return 0;
147 			}
148 		}
149 
150 		IP6_UPD_PO_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCAST,
151 				skb->len);
152 	}
153 
154 	return NF_HOOK(PF_INET6, NF_INET_POST_ROUTING, skb, NULL, skb->dev,
155 		       ip6_output_finish);
156 }
157 
158 static inline int ip6_skb_dst_mtu(struct sk_buff *skb)
159 {
160 	struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
161 
162 	return (np && np->pmtudisc == IPV6_PMTUDISC_PROBE) ?
163 	       skb_dst(skb)->dev->mtu : dst_mtu(skb_dst(skb));
164 }
165 
166 int ip6_output(struct sk_buff *skb)
167 {
168 	struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
169 	if (unlikely(idev->cnf.disable_ipv6)) {
170 		IP6_INC_STATS(dev_net(skb_dst(skb)->dev), idev,
171 			      IPSTATS_MIB_OUTDISCARDS);
172 		kfree_skb(skb);
173 		return 0;
174 	}
175 
176 	if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
177 				dst_allfrag(skb_dst(skb)))
178 		return ip6_fragment(skb, ip6_output2);
179 	else
180 		return ip6_output2(skb);
181 }
182 
183 /*
184  *	xmit an sk_buff (used by TCP)
185  */
186 
187 int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
188 	     struct ipv6_txoptions *opt, int ipfragok)
189 {
190 	struct net *net = sock_net(sk);
191 	struct ipv6_pinfo *np = inet6_sk(sk);
192 	struct in6_addr *first_hop = &fl->fl6_dst;
193 	struct dst_entry *dst = skb_dst(skb);
194 	struct ipv6hdr *hdr;
195 	u8  proto = fl->proto;
196 	int seg_len = skb->len;
197 	int hlimit = -1;
198 	int tclass = 0;
199 	u32 mtu;
200 
201 	if (opt) {
202 		unsigned int head_room;
203 
204 		/* First: exthdrs may take lots of space (~8K for now);
205 		   MAX_HEADER is not enough.
206 		 */
207 		head_room = opt->opt_nflen + opt->opt_flen;
208 		seg_len += head_room;
209 		head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
210 
211 		if (skb_headroom(skb) < head_room) {
212 			struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
213 			if (skb2 == NULL) {
214 				IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
215 					      IPSTATS_MIB_OUTDISCARDS);
216 				kfree_skb(skb);
217 				return -ENOBUFS;
218 			}
219 			kfree_skb(skb);
220 			skb = skb2;
221 			if (sk)
222 				skb_set_owner_w(skb, sk);
223 		}
224 		if (opt->opt_flen)
225 			ipv6_push_frag_opts(skb, opt, &proto);
226 		if (opt->opt_nflen)
227 			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
228 	}
229 
230 	skb_push(skb, sizeof(struct ipv6hdr));
231 	skb_reset_network_header(skb);
232 	hdr = ipv6_hdr(skb);
233 
234 	/* Allow local fragmentation. */
235 	if (ipfragok)
236 		skb->local_df = 1;
237 
238 	/*
239 	 *	Fill in the IPv6 header
240 	 */
241 	if (np) {
242 		tclass = np->tclass;
243 		hlimit = np->hop_limit;
244 	}
245 	if (hlimit < 0)
246 		hlimit = ip6_dst_hoplimit(dst);
247 
248 	*(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl->fl6_flowlabel;
249 
250 	hdr->payload_len = htons(seg_len);
251 	hdr->nexthdr = proto;
252 	hdr->hop_limit = hlimit;
253 
254 	ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
255 	ipv6_addr_copy(&hdr->daddr, first_hop);
256 
257 	skb->priority = sk->sk_priority;
258 	skb->mark = sk->sk_mark;
259 
260 	mtu = dst_mtu(dst);
261 	if ((skb->len <= mtu) || skb->local_df || skb_is_gso(skb)) {
262 		IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
263 			      IPSTATS_MIB_OUT, skb->len);
264 		return NF_HOOK(PF_INET6, NF_INET_LOCAL_OUT, skb, NULL, dst->dev,
265 				dst_output);
266 	}
267 
268 	if (net_ratelimit())
269 		printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
270 	skb->dev = dst->dev;
271 	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
272 	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
273 	kfree_skb(skb);
274 	return -EMSGSIZE;
275 }
276 
277 EXPORT_SYMBOL(ip6_xmit);
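/*
 * Hedged usage sketch, not from the original source: a transport
 * protocol calls ip6_xmit() with a fully built transport segment, a
 * flow describing the addresses, and a dst already attached to the skb
 * (ip6_xmit() reads skb_dst(skb)).  This mirrors what inet6_csk_xmit()
 * does for TCP; the names below are hypothetical.
 */
#if 0	/* hypothetical example, not built */
static int example_transport_xmit(struct sock *sk, struct sk_buff *skb,
				  struct dst_entry *dst)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct flowi fl;

	memset(&fl, 0, sizeof(fl));
	fl.proto = sk->sk_protocol;
	ipv6_addr_copy(&fl.fl6_dst, &np->daddr);
	ipv6_addr_copy(&fl.fl6_src, &np->saddr);
	fl.oif = sk->sk_bound_dev_if;

	skb_dst_set(skb, dst_clone(dst));	/* ip6_xmit() needs this */
	return ip6_xmit(sk, skb, &fl, np->opt, 0);
}
#endif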
278 
279 /*
280  *	To avoid extra problems ND packets are sent through this
281  *	routine. It duplicates code, but I really want to avoid
282  *	extra checks since ipv6_build_header is used by TCP (which
283  *	is performance-critical for us).
284  */
285 
286 int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
287 	       const struct in6_addr *saddr, const struct in6_addr *daddr,
288 	       int proto, int len)
289 {
290 	struct ipv6_pinfo *np = inet6_sk(sk);
291 	struct ipv6hdr *hdr;
292 	int totlen;
293 
294 	skb->protocol = htons(ETH_P_IPV6);
295 	skb->dev = dev;
296 
297 	totlen = len + sizeof(struct ipv6hdr);
298 
299 	skb_reset_network_header(skb);
300 	skb_put(skb, sizeof(struct ipv6hdr));
301 	hdr = ipv6_hdr(skb);
302 
303 	*(__be32*)hdr = htonl(0x60000000);
304 
305 	hdr->payload_len = htons(len);
306 	hdr->nexthdr = proto;
307 	hdr->hop_limit = np->hop_limit;
308 
309 	ipv6_addr_copy(&hdr->saddr, saddr);
310 	ipv6_addr_copy(&hdr->daddr, daddr);
311 
312 	return 0;
313 }
314 
315 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
316 {
317 	struct ip6_ra_chain *ra;
318 	struct sock *last = NULL;
319 
320 	read_lock(&ip6_ra_lock);
321 	for (ra = ip6_ra_chain; ra; ra = ra->next) {
322 		struct sock *sk = ra->sk;
323 		if (sk && ra->sel == sel &&
324 		    (!sk->sk_bound_dev_if ||
325 		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
326 			if (last) {
327 				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
328 				if (skb2)
329 					rawv6_rcv(last, skb2);
330 			}
331 			last = sk;
332 		}
333 	}
334 
335 	if (last) {
336 		rawv6_rcv(last, skb);
337 		read_unlock(&ip6_ra_lock);
338 		return 1;
339 	}
340 	read_unlock(&ip6_ra_lock);
341 	return 0;
342 }
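
/*
 * Worked example (added note, not from the original source): the caller
 * in ip6_forward() passes sel = (ptr[2] << 8) + ptr[3] because opt->ra
 * is the offset of the Router Alert hop-by-hop option (RFC 2711) within
 * the network header, laid out as:
 *
 *	ptr[0]    option type (0x05, Router Alert)
 *	ptr[1]    option data length (2)
 *	ptr[2..3] 16-bit value in network order, e.g. 0 for MLD
 */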
343 
344 static int ip6_forward_proxy_check(struct sk_buff *skb)
345 {
346 	struct ipv6hdr *hdr = ipv6_hdr(skb);
347 	u8 nexthdr = hdr->nexthdr;
348 	int offset;
349 
350 	if (ipv6_ext_hdr(nexthdr)) {
351 		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr);
352 		if (offset < 0)
353 			return 0;
354 	} else
355 		offset = sizeof(struct ipv6hdr);
356 
357 	if (nexthdr == IPPROTO_ICMPV6) {
358 		struct icmp6hdr *icmp6;
359 
360 		if (!pskb_may_pull(skb, (skb_network_header(skb) +
361 					 offset + 1 - skb->data)))
362 			return 0;
363 
364 		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
365 
366 		switch (icmp6->icmp6_type) {
367 		case NDISC_ROUTER_SOLICITATION:
368 		case NDISC_ROUTER_ADVERTISEMENT:
369 		case NDISC_NEIGHBOUR_SOLICITATION:
370 		case NDISC_NEIGHBOUR_ADVERTISEMENT:
371 		case NDISC_REDIRECT:
372 			/* A unicast neighbor discovery message destined
373 			 * to the proxied address must be handled locally,
374 			 * so pass it to the input function.
375 			 */
376 			return 1;
377 		default:
378 			break;
379 		}
380 	}
381 
382 	/*
383 	 * The proxying router can't forward traffic sent to a link-local
384 	 * address, so signal the sender and discard the packet. This
385 	 * behavior is clarified by the MIPv6 specification.
386 	 */
387 	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
388 		dst_link_failure(skb);
389 		return -1;
390 	}
391 
392 	return 0;
393 }
394 
395 static inline int ip6_forward_finish(struct sk_buff *skb)
396 {
397 	return dst_output(skb);
398 }
399 
400 int ip6_forward(struct sk_buff *skb)
401 {
402 	struct dst_entry *dst = skb_dst(skb);
403 	struct ipv6hdr *hdr = ipv6_hdr(skb);
404 	struct inet6_skb_parm *opt = IP6CB(skb);
405 	struct net *net = dev_net(dst->dev);
406 
407 	if (net->ipv6.devconf_all->forwarding == 0)
408 		goto error;
409 
410 	if (skb_warn_if_lro(skb))
411 		goto drop;
412 
413 	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
414 		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
415 		goto drop;
416 	}
417 
418 	skb_forward_csum(skb);
419 
420 	/*
421 	 *	We DO NOT do any processing on
422 	 *	RA packets, pushing them to user level AS IS
423 	 *	without any warranty that the application will be
424 	 *	able to interpret them. The reason is that we
425 	 *	cannot do anything clever here.
426 	 *
427 	 *	We are not the end node, so if the packet contains
428 	 *	AH/ESP we cannot do anything with it.
429 	 *	Defragmentation would also be a mistake: RA packets
430 	 *	cannot be fragmented, because there is no guarantee
431 	 *	that different fragments will travel along one path. --ANK
432 	 */
433 	if (opt->ra) {
434 		u8 *ptr = skb_network_header(skb) + opt->ra;
435 		if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
436 			return 0;
437 	}
438 
439 	/*
440 	 *	check and decrement the hop limit
441 	 */
442 	if (hdr->hop_limit <= 1) {
443 		/* Force the OUTPUT device to be used for the ICMP source address */
444 		skb->dev = dst->dev;
445 		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT,
446 			    0, skb->dev);
447 		IP6_INC_STATS_BH(net,
448 				 ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);
449 
450 		kfree_skb(skb);
451 		return -ETIMEDOUT;
452 	}
453 
454 	/* XXX: idev->cnf.proxy_ndp? */
455 	if (net->ipv6.devconf_all->proxy_ndp &&
456 	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
457 		int proxied = ip6_forward_proxy_check(skb);
458 		if (proxied > 0)
459 			return ip6_input(skb);
460 		else if (proxied < 0) {
461 			IP6_INC_STATS(net, ip6_dst_idev(dst),
462 				      IPSTATS_MIB_INDISCARDS);
463 			goto drop;
464 		}
465 	}
466 
467 	if (!xfrm6_route_forward(skb)) {
468 		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
469 		goto drop;
470 	}
471 	dst = skb_dst(skb);
472 
473 	/* IPv6 specs say nothing about it, but it is clear that we cannot
474 	   send redirects to source-routed frames.
475 	   We don't send redirects to frames decapsulated from IPsec.
476 	 */
477 	if (skb->dev == dst->dev && dst->neighbour && opt->srcrt == 0 &&
478 	    !skb_sec_path(skb)) {
479 		struct in6_addr *target = NULL;
480 		struct rt6_info *rt;
481 		struct neighbour *n = dst->neighbour;
482 
483 		/*
484 		 *	incoming and outgoing devices are the same;
485 		 *	send a redirect.
486 		 */
487 
488 		rt = (struct rt6_info *) dst;
489 		if ((rt->rt6i_flags & RTF_GATEWAY))
490 			target = (struct in6_addr*)&n->primary_key;
491 		else
492 			target = &hdr->daddr;
493 
494 		/* Limit redirects both by destination (here)
495 		   and by source (inside ndisc_send_redirect)
496 		 */
497 		if (xrlim_allow(dst, 1*HZ))
498 			ndisc_send_redirect(skb, n, target);
499 	} else {
500 		int addrtype = ipv6_addr_type(&hdr->saddr);
501 
502 		/* This check is security critical. */
503 		if (addrtype == IPV6_ADDR_ANY ||
504 		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
505 			goto error;
506 		if (addrtype & IPV6_ADDR_LINKLOCAL) {
507 			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
508 				ICMPV6_NOT_NEIGHBOUR, 0, skb->dev);
509 			goto error;
510 		}
511 	}
512 
513 	if (skb->len > dst_mtu(dst)) {
514 		/* Again, force the OUTPUT device to be used for the ICMP source address */
515 		skb->dev = dst->dev;
516 		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, dst_mtu(dst), skb->dev);
517 		IP6_INC_STATS_BH(net,
518 				 ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS);
519 		IP6_INC_STATS_BH(net,
520 				 ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS);
521 		kfree_skb(skb);
522 		return -EMSGSIZE;
523 	}
524 
525 	if (skb_cow(skb, dst->dev->hard_header_len)) {
526 		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
527 		goto drop;
528 	}
529 
530 	hdr = ipv6_hdr(skb);
531 
532 	/* Decrementing the hop limit is delayed until after the skb COW above */
533 
534 	hdr->hop_limit--;
535 
536 	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
537 	return NF_HOOK(PF_INET6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
538 		       ip6_forward_finish);
539 
540 error:
541 	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
542 drop:
543 	kfree_skb(skb);
544 	return -EINVAL;
545 }
546 
547 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
548 {
549 	to->pkt_type = from->pkt_type;
550 	to->priority = from->priority;
551 	to->protocol = from->protocol;
552 	skb_dst_drop(to);
553 	skb_dst_set(to, dst_clone(skb_dst(from)));
554 	to->dev = from->dev;
555 	to->mark = from->mark;
556 
557 #ifdef CONFIG_NET_SCHED
558 	to->tc_index = from->tc_index;
559 #endif
560 	nf_copy(to, from);
561 #if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
562     defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
563 	to->nf_trace = from->nf_trace;
564 #endif
565 	skb_copy_secmark(to, from);
566 }
567 
568 int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
569 {
570 	u16 offset = sizeof(struct ipv6hdr);
571 	struct ipv6_opt_hdr *exthdr =
572 				(struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1);
573 	unsigned int packet_len = skb->tail - skb->network_header;
574 	int found_rhdr = 0;
575 	*nexthdr = &ipv6_hdr(skb)->nexthdr;
576 
577 	while (offset + 1 <= packet_len) {
578 
579 		switch (**nexthdr) {
580 
581 		case NEXTHDR_HOP:
582 			break;
583 		case NEXTHDR_ROUTING:
584 			found_rhdr = 1;
585 			break;
586 		case NEXTHDR_DEST:
587 #if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
588 			if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0)
589 				break;
590 #endif
591 			if (found_rhdr)
592 				return offset;
593 			break;
594 		default:
595 			return offset;
596 		}
597 
598 		offset += ipv6_optlen(exthdr);
599 		*nexthdr = &exthdr->nexthdr;
600 		exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) +
601 						 offset);
602 	}
603 
604 	return offset;
605 }
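
/*
 * Worked example (added note, not from the original source): for a
 * packet laid out as
 *
 *	IPv6 (40) | Hop-by-Hop (8) | Routing (24) | TCP ...
 *
 * the walk above skips the hop-by-hop header (offset 40 -> 48), notes
 * the routing header (found_rhdr = 1, offset 48 -> 72), and returns 72
 * from the default case with *nexthdr pointing at the routing header's
 * nexthdr field: the Fragment header belongs right after the
 * unfragmentable part, as RFC 2460 requires.
 */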
606 
607 static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
608 {
609 	struct sk_buff *frag;
610 	struct rt6_info *rt = (struct rt6_info*)skb_dst(skb);
611 	struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
612 	struct ipv6hdr *tmp_hdr;
613 	struct frag_hdr *fh;
614 	unsigned int mtu, hlen, left, len;
615 	__be32 frag_id = 0;
616 	int ptr, offset = 0, err=0;
617 	u8 *prevhdr, nexthdr = 0;
618 	struct net *net = dev_net(skb_dst(skb)->dev);
619 
620 	hlen = ip6_find_1stfragopt(skb, &prevhdr);
621 	nexthdr = *prevhdr;
622 
623 	mtu = ip6_skb_dst_mtu(skb);
624 
625 	/* We must not fragment if the socket is set to force MTU discovery
626 	 * or if the skb was not generated by a local socket.  (This last
627 	 * check should be redundant, but it's free.)
628 	 */
629 	if (!skb->local_df) {
630 		skb->dev = skb_dst(skb)->dev;
631 		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
632 		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
633 			      IPSTATS_MIB_FRAGFAILS);
634 		kfree_skb(skb);
635 		return -EMSGSIZE;
636 	}
637 
638 	if (np && np->frag_size < mtu) {
639 		if (np->frag_size)
640 			mtu = np->frag_size;
641 	}
642 	mtu -= hlen + sizeof(struct frag_hdr);
643 
644 	if (skb_has_frags(skb)) {
645 		int first_len = skb_pagelen(skb);
646 		int truesizes = 0;
647 
648 		if (first_len - hlen > mtu ||
649 		    ((first_len - hlen) & 7) ||
650 		    skb_cloned(skb))
651 			goto slow_path;
652 
653 		skb_walk_frags(skb, frag) {
654 			/* Check that the geometry is correct. */
655 			if (frag->len > mtu ||
656 			    ((frag->len & 7) && frag->next) ||
657 			    skb_headroom(frag) < hlen)
658 			    goto slow_path;
659 
660 			/* Partially cloned skb? */
661 			if (skb_shared(frag))
662 				goto slow_path;
663 
664 			BUG_ON(frag->sk);
665 			if (skb->sk) {
666 				frag->sk = skb->sk;
667 				frag->destructor = sock_wfree;
668 				truesizes += frag->truesize;
669 			}
670 		}
671 
672 		err = 0;
673 		offset = 0;
674 		frag = skb_shinfo(skb)->frag_list;
675 		skb_frag_list_init(skb);
676 		/* BUILD HEADER */
677 
678 		*prevhdr = NEXTHDR_FRAGMENT;
679 		tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
680 		if (!tmp_hdr) {
681 			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
682 				      IPSTATS_MIB_FRAGFAILS);
683 			return -ENOMEM;
684 		}
685 
686 		__skb_pull(skb, hlen);
687 		fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr));
688 		__skb_push(skb, hlen);
689 		skb_reset_network_header(skb);
690 		memcpy(skb_network_header(skb), tmp_hdr, hlen);
691 
692 		ipv6_select_ident(fh);
693 		fh->nexthdr = nexthdr;
694 		fh->reserved = 0;
695 		fh->frag_off = htons(IP6_MF);
696 		frag_id = fh->identification;
697 
698 		first_len = skb_pagelen(skb);
699 		skb->data_len = first_len - skb_headlen(skb);
700 		skb->truesize -= truesizes;
701 		skb->len = first_len;
702 		ipv6_hdr(skb)->payload_len = htons(first_len -
703 						   sizeof(struct ipv6hdr));
704 
705 		dst_hold(&rt->u.dst);
706 
707 		for (;;) {
708 			/* Prepare the header of the next frame
709 			 * before the previous one is sent. */
710 			if (frag) {
711 				frag->ip_summed = CHECKSUM_NONE;
712 				skb_reset_transport_header(frag);
713 				fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr));
714 				__skb_push(frag, hlen);
715 				skb_reset_network_header(frag);
716 				memcpy(skb_network_header(frag), tmp_hdr,
717 				       hlen);
718 				offset += skb->len - hlen - sizeof(struct frag_hdr);
719 				fh->nexthdr = nexthdr;
720 				fh->reserved = 0;
721 				fh->frag_off = htons(offset);
722 				if (frag->next != NULL)
723 					fh->frag_off |= htons(IP6_MF);
724 				fh->identification = frag_id;
725 				ipv6_hdr(frag)->payload_len =
726 						htons(frag->len -
727 						      sizeof(struct ipv6hdr));
728 				ip6_copy_metadata(frag, skb);
729 			}
730 
731 			err = output(skb);
732 			if (!err)
733 				IP6_INC_STATS(net, ip6_dst_idev(&rt->u.dst),
734 					      IPSTATS_MIB_FRAGCREATES);
735 
736 			if (err || !frag)
737 				break;
738 
739 			skb = frag;
740 			frag = skb->next;
741 			skb->next = NULL;
742 		}
743 
744 		kfree(tmp_hdr);
745 
746 		if (err == 0) {
747 			IP6_INC_STATS(net, ip6_dst_idev(&rt->u.dst),
748 				      IPSTATS_MIB_FRAGOKS);
749 			dst_release(&rt->u.dst);
750 			return 0;
751 		}
752 
753 		while (frag) {
754 			skb = frag->next;
755 			kfree_skb(frag);
756 			frag = skb;
757 		}
758 
759 		IP6_INC_STATS(net, ip6_dst_idev(&rt->u.dst),
760 			      IPSTATS_MIB_FRAGFAILS);
761 		dst_release(&rt->u.dst);
762 		return err;
763 	}
764 
765 slow_path:
766 	left = skb->len - hlen;		/* Data remaining to send */
767 	ptr = hlen;			/* Where to start from */
768 
769 	/*
770 	 *	Fragment the datagram.
771 	 */
772 
773 	*prevhdr = NEXTHDR_FRAGMENT;
774 
775 	/*
776 	 *	Keep copying data until we run out.
777 	 */
778 	while (left > 0) {
779 		len = left;
780 		/* IF: it doesn't fit, use 'mtu' - the data space left */
781 		if (len > mtu)
782 			len = mtu;
783 		/* IF: we are not sending up to and including the packet end,
784 		   then align the next start on an eight-byte boundary */
785 		if (len < left)	{
786 			len &= ~7;
787 		}
788 		/*
789 		 *	Allocate buffer.
790 		 */
791 
792 		if ((frag = alloc_skb(len+hlen+sizeof(struct frag_hdr)+LL_ALLOCATED_SPACE(rt->u.dst.dev), GFP_ATOMIC)) == NULL) {
793 			NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
794 			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
795 				      IPSTATS_MIB_FRAGFAILS);
796 			err = -ENOMEM;
797 			goto fail;
798 		}
799 
800 		/*
801 		 *	Set up data on packet
802 		 */
803 
804 		ip6_copy_metadata(frag, skb);
805 		skb_reserve(frag, LL_RESERVED_SPACE(rt->u.dst.dev));
806 		skb_put(frag, len + hlen + sizeof(struct frag_hdr));
807 		skb_reset_network_header(frag);
808 		fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
809 		frag->transport_header = (frag->network_header + hlen +
810 					  sizeof(struct frag_hdr));
811 
812 		/*
813 		 *	Charge the memory for the fragment to any owner
814 		 *	it might possess
815 		 */
816 		if (skb->sk)
817 			skb_set_owner_w(frag, skb->sk);
818 
819 		/*
820 		 *	Copy the packet header into the new buffer.
821 		 */
822 		skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
823 
824 		/*
825 		 *	Build fragment header.
826 		 */
827 		fh->nexthdr = nexthdr;
828 		fh->reserved = 0;
829 		if (!frag_id) {
830 			ipv6_select_ident(fh);
831 			frag_id = fh->identification;
832 		} else
833 			fh->identification = frag_id;
834 
835 		/*
836 		 *	Copy a block of the IP datagram.
837 		 */
838 		if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len))
839 			BUG();
840 		left -= len;
841 
842 		fh->frag_off = htons(offset);
843 		if (left > 0)
844 			fh->frag_off |= htons(IP6_MF);
845 		ipv6_hdr(frag)->payload_len = htons(frag->len -
846 						    sizeof(struct ipv6hdr));
847 
848 		ptr += len;
849 		offset += len;
850 
851 		/*
852 		 *	Put this fragment into the sending queue.
853 		 */
854 		err = output(frag);
855 		if (err)
856 			goto fail;
857 
858 		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
859 			      IPSTATS_MIB_FRAGCREATES);
860 	}
861 	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
862 		      IPSTATS_MIB_FRAGOKS);
863 	kfree_skb(skb);
864 	return err;
865 
866 fail:
867 	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
868 		      IPSTATS_MIB_FRAGFAILS);
869 	kfree_skb(skb);
870 	return err;
871 }
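
/*
 * Worked arithmetic (added note, not from the original source): with a
 * path MTU of 1500 and no extension headers (hlen = 40),
 * mtu -= hlen + sizeof(struct frag_hdr) leaves 1452 bytes of payload
 * per fragment; the slow path rounds every non-final fragment down to a
 * multiple of 8 (len &= ~7), giving 1448 bytes, so each wire fragment
 * is 40 + 8 + 1448 = 1496 bytes.  Storing the byte offset directly in
 * fh->frag_off works because the offset is a multiple of 8, leaving the
 * low three bits free for the IP6_MF flag.
 */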
872 
873 static inline int ip6_rt_check(struct rt6key *rt_key,
874 			       struct in6_addr *fl_addr,
875 			       struct in6_addr *addr_cache)
876 {
877 	return ((rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
878 		(addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache)));
879 }
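
/*
 * Added note: ip6_rt_check() returns nonzero -- "the cached route may
 * be stale" -- when fl_addr matches neither the /128 host-route key nor
 * the cached peer address, and 0 when either one confirms the cached
 * dst still covers the flow's destination.
 */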
880 
881 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
882 					  struct dst_entry *dst,
883 					  struct flowi *fl)
884 {
885 	struct ipv6_pinfo *np = inet6_sk(sk);
886 	struct rt6_info *rt = (struct rt6_info *)dst;
887 
888 	if (!dst)
889 		goto out;
890 
891 	/* Yes, checking route validity in the unconnected
892 	 * case is not very simple. Take into account
893 	 * that we do not support routing by source, TOS,
894 	 * or MSG_DONTROUTE 		--ANK (980726)
895 	 *
896 	 * 1. ip6_rt_check(): If the route was a host route,
897 	 *    check that the cached destination is current.
898 	 *    If it is a network route, we can still
899 	 *    check its validity using a saved pointer
900 	 *    to the last used address: daddr_cache.
901 	 *    We do not want to save the whole address now
902 	 *    (because the main consumer of this service
903 	 *    is TCP, which does not have this problem),
904 	 *    so this last trick works only on connected
905 	 *    sockets.
906 	 * 2. oif also should be the same.
907 	 */
908 	if (ip6_rt_check(&rt->rt6i_dst, &fl->fl6_dst, np->daddr_cache) ||
909 #ifdef CONFIG_IPV6_SUBTREES
910 	    ip6_rt_check(&rt->rt6i_src, &fl->fl6_src, np->saddr_cache) ||
911 #endif
912 	    (fl->oif && fl->oif != dst->dev->ifindex)) {
913 		dst_release(dst);
914 		dst = NULL;
915 	}
916 
917 out:
918 	return dst;
919 }
920 
921 static int ip6_dst_lookup_tail(struct sock *sk,
922 			       struct dst_entry **dst, struct flowi *fl)
923 {
924 	int err;
925 	struct net *net = sock_net(sk);
926 
927 	if (*dst == NULL)
928 		*dst = ip6_route_output(net, sk, fl);
929 
930 	if ((err = (*dst)->error))
931 		goto out_err_release;
932 
933 	if (ipv6_addr_any(&fl->fl6_src)) {
934 		err = ipv6_dev_get_saddr(net, ip6_dst_idev(*dst)->dev,
935 					 &fl->fl6_dst,
936 					 sk ? inet6_sk(sk)->srcprefs : 0,
937 					 &fl->fl6_src);
938 		if (err)
939 			goto out_err_release;
940 	}
941 
942 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
943 	/*
944 	 * If the dst entry we've looked up has a
945 	 * neighbour entry that is in the INCOMPLETE
946 	 * state and the src address from the flow is
947 	 * marked as OPTIMISTIC, we release the found
948 	 * dst entry and replace it with the dst entry
949 	 * of the nexthop router instead.
950 	 */
951 	if ((*dst)->neighbour && !((*dst)->neighbour->nud_state & NUD_VALID)) {
952 		struct inet6_ifaddr *ifp;
953 		struct flowi fl_gw;
954 		int redirect;
955 
956 		ifp = ipv6_get_ifaddr(net, &fl->fl6_src,
957 				      (*dst)->dev, 1);
958 
959 		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
960 		if (ifp)
961 			in6_ifa_put(ifp);
962 
963 		if (redirect) {
964 			/*
965 			 * We need to get the dst entry for the
966 			 * default router instead
967 			 */
968 			dst_release(*dst);
969 			memcpy(&fl_gw, fl, sizeof(struct flowi));
970 			memset(&fl_gw.fl6_dst, 0, sizeof(struct in6_addr));
971 			*dst = ip6_route_output(net, sk, &fl_gw);
972 			if ((err = (*dst)->error))
973 				goto out_err_release;
974 		}
975 	}
976 #endif
977 
978 	return 0;
979 
980 out_err_release:
981 	if (err == -ENETUNREACH)
982 		IP6_INC_STATS_BH(net, NULL, IPSTATS_MIB_OUTNOROUTES);
983 	dst_release(*dst);
984 	*dst = NULL;
985 	return err;
986 }
987 
988 /**
989  *	ip6_dst_lookup - perform route lookup on flow
990  *	@sk: socket which provides route info
991  *	@dst: pointer to dst_entry * for result
992  *	@fl: flow to lookup
993  *
994  *	This function performs a route lookup on the given flow.
995  *
996  *	It returns zero on success, or a standard errno code on error.
997  */
998 int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
999 {
1000 	*dst = NULL;
1001 	return ip6_dst_lookup_tail(sk, dst, fl);
1002 }
1003 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1004 
1005 /**
1006  *	ip6_sk_dst_lookup - perform socket cached route lookup on flow
1007  *	@sk: socket which provides the dst cache and route info
1008  *	@dst: pointer to dst_entry * for result
1009  *	@fl: flow to lookup
1010  *
1011  *	This function performs a route lookup on the given flow with the
1012  *	possibility of using the cached route in the socket if it is valid.
1013  *	It will take the socket dst lock when operating on the dst cache.
1014  *	As a result, this function can only be used in process context.
1015  *
1016  *	It returns zero on success, or a standard errno code on error.
1017  */
1018 int ip6_sk_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
1019 {
1020 	*dst = NULL;
1021 	if (sk) {
1022 		*dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1023 		*dst = ip6_sk_dst_check(sk, *dst, fl);
1024 	}
1025 
1026 	return ip6_dst_lookup_tail(sk, dst, fl);
1027 }
1028 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup);
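
/*
 * Hedged usage sketch, not from the original source: a datagram
 * protocol typically fills in a flow, lets the lookup pick a source
 * address if fl6_src is unspecified, and releases the dst when done.
 * Names are hypothetical and error handling is minimal.
 */
#if 0	/* hypothetical example, not built */
static int example_route_flow(struct sock *sk, struct flowi *fl)
{
	struct dst_entry *dst;
	int err;

	err = ip6_sk_dst_lookup(sk, &dst, fl);	/* may reuse cached dst */
	if (err)
		return err;

	/* ... transmit using dst ... */
	dst_release(dst);
	return 0;
}
#endif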
1029 
1030 static inline int ip6_ufo_append_data(struct sock *sk,
1031 			int getfrag(void *from, char *to, int offset, int len,
1032 			int odd, struct sk_buff *skb),
1033 			void *from, int length, int hh_len, int fragheaderlen,
1034 			int transhdrlen, int mtu, unsigned int flags
1035 
1036 {
1037 	struct sk_buff *skb;
1038 	int err;
1039 
1040 	/* The network device supports UDP large send offload,
1041 	 * so create a single skb containing the complete
1042 	 * UDP datagram.
1043 	 */
1044 	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
1045 		skb = sock_alloc_send_skb(sk,
1046 			hh_len + fragheaderlen + transhdrlen + 20,
1047 			(flags & MSG_DONTWAIT), &err);
1048 		if (skb == NULL)
1049 			return -ENOMEM;
1050 
1051 		/* reserve space for Hardware header */
1052 		skb_reserve(skb, hh_len);
1053 
1054 		/* create space for UDP/IP header */
1055 		skb_put(skb, fragheaderlen + transhdrlen);
1056 
1057 		/* initialize network header pointer */
1058 		skb_reset_network_header(skb);
1059 
1060 		/* initialize protocol header pointer */
1061 		skb->transport_header = skb->network_header + fragheaderlen;
1062 
1063 		skb->ip_summed = CHECKSUM_PARTIAL;
1064 		skb->csum = 0;
1065 		sk->sk_sndmsg_off = 0;
1066 	}
1067 
1068 	err = skb_append_datato_frags(sk, skb, getfrag, from,
1069 				      (length - transhdrlen));
1070 	if (!err) {
1071 		struct frag_hdr fhdr;
1072 
1073 		/* Specify the length of each IPv6 datagram fragment.
1074 		 * It has to be a multiple of 8.
1075 		 */
1076 		skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
1077 					     sizeof(struct frag_hdr)) & ~7;
1078 		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
1079 		ipv6_select_ident(&fhdr);
1080 		skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
1081 		__skb_queue_tail(&sk->sk_write_queue, skb);
1082 
1083 		return 0;
1084 	}
1085 	/* There is not enough support to do UDP LSO,
1086 	 * so follow the normal path.
1087 	 */
1088 	kfree_skb(skb);
1089 
1090 	return err;
1091 }
1092 
1093 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1094 					       gfp_t gfp)
1095 {
1096 	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1097 }
1098 
1099 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1100 						gfp_t gfp)
1101 {
1102 	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1103 }
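
/*
 * Added note: both _dup helpers size the copy as (hdrlen + 1) * 8
 * because IPv6 extension headers encode their length in 8-octet units,
 * excluding the first 8 octets; hdrlen == 1 thus means a 16-byte
 * header.
 */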
1104 
1105 int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
1106 	int offset, int len, int odd, struct sk_buff *skb),
1107 	void *from, int length, int transhdrlen,
1108 	int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi *fl,
1109 	struct rt6_info *rt, unsigned int flags)
1110 {
1111 	struct inet_sock *inet = inet_sk(sk);
1112 	struct ipv6_pinfo *np = inet6_sk(sk);
1113 	struct sk_buff *skb;
1114 	unsigned int maxfraglen, fragheaderlen;
1115 	int exthdrlen;
1116 	int hh_len;
1117 	int mtu;
1118 	int copy;
1119 	int err;
1120 	int offset = 0;
1121 	int csummode = CHECKSUM_NONE;
1122 
1123 	if (flags&MSG_PROBE)
1124 		return 0;
1125 	if (skb_queue_empty(&sk->sk_write_queue)) {
1126 		/*
1127 		 * setup for corking
1128 		 */
1129 		if (opt) {
1130 			if (WARN_ON(np->cork.opt))
1131 				return -EINVAL;
1132 
1133 			np->cork.opt = kmalloc(opt->tot_len, sk->sk_allocation);
1134 			if (unlikely(np->cork.opt == NULL))
1135 				return -ENOBUFS;
1136 
1137 			np->cork.opt->tot_len = opt->tot_len;
1138 			np->cork.opt->opt_flen = opt->opt_flen;
1139 			np->cork.opt->opt_nflen = opt->opt_nflen;
1140 
1141 			np->cork.opt->dst0opt = ip6_opt_dup(opt->dst0opt,
1142 							    sk->sk_allocation);
1143 			if (opt->dst0opt && !np->cork.opt->dst0opt)
1144 				return -ENOBUFS;
1145 
1146 			np->cork.opt->dst1opt = ip6_opt_dup(opt->dst1opt,
1147 							    sk->sk_allocation);
1148 			if (opt->dst1opt && !np->cork.opt->dst1opt)
1149 				return -ENOBUFS;
1150 
1151 			np->cork.opt->hopopt = ip6_opt_dup(opt->hopopt,
1152 							   sk->sk_allocation);
1153 			if (opt->hopopt && !np->cork.opt->hopopt)
1154 				return -ENOBUFS;
1155 
1156 			np->cork.opt->srcrt = ip6_rthdr_dup(opt->srcrt,
1157 							    sk->sk_allocation);
1158 			if (opt->srcrt && !np->cork.opt->srcrt)
1159 				return -ENOBUFS;
1160 
1161 			/* need source address above --miyazawa */
1162 		}
1163 		dst_hold(&rt->u.dst);
1164 		inet->cork.dst = &rt->u.dst;
1165 		inet->cork.fl = *fl;
1166 		np->cork.hop_limit = hlimit;
1167 		np->cork.tclass = tclass;
1168 		mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
1169 		      rt->u.dst.dev->mtu : dst_mtu(rt->u.dst.path);
1170 		if (np->frag_size < mtu) {
1171 			if (np->frag_size)
1172 				mtu = np->frag_size;
1173 		}
1174 		inet->cork.fragsize = mtu;
1175 		if (dst_allfrag(rt->u.dst.path))
1176 			inet->cork.flags |= IPCORK_ALLFRAG;
1177 		inet->cork.length = 0;
1178 		sk->sk_sndmsg_page = NULL;
1179 		sk->sk_sndmsg_off = 0;
1180 		exthdrlen = rt->u.dst.header_len + (opt ? opt->opt_flen : 0) -
1181 			    rt->rt6i_nfheader_len;
1182 		length += exthdrlen;
1183 		transhdrlen += exthdrlen;
1184 	} else {
1185 		rt = (struct rt6_info *)inet->cork.dst;
1186 		fl = &inet->cork.fl;
1187 		opt = np->cork.opt;
1188 		transhdrlen = 0;
1189 		exthdrlen = 0;
1190 		mtu = inet->cork.fragsize;
1191 	}
1192 
1193 	hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
1194 
1195 	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1196 			(opt ? opt->opt_nflen : 0);
1197 	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);
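	/*
	 * Worked arithmetic (added note, not from the original source):
	 * with mtu = 1500 and no extension headers, fragheaderlen = 40
	 * and
	 *	maxfraglen = ((1500 - 40) & ~7) + 40 - 8 = 1488,
	 * the largest skb length that still leaves room for a fragment
	 * header while keeping fragment payloads 8-byte aligned.
	 */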
1198 
1199 	if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
1200 		if (inet->cork.length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
1201 			ipv6_local_error(sk, EMSGSIZE, fl, mtu-exthdrlen);
1202 			return -EMSGSIZE;
1203 		}
1204 	}
1205 
1206 	/*
1207 	 * Let's try using as much space as possible.
1208 	 * Use MTU if total length of the message fits into the MTU.
1209 	 * Otherwise, we need to reserve fragment header and
1210 	 * fragment alignment (= 8-15 octets, in total).
1211 	 *
1212 	 * Note that we may need to "move" the data from the tail
1213 	 * of the buffer to the new fragment when we split
1214 	 * the message.
1215 	 *
1216 	 * FIXME: It may be fragmented into multiple chunks
1217 	 *        at once if non-fragmentable extension headers
1218 	 *        are too large.
1219 	 * --yoshfuji
1220 	 */
1221 
1222 	inet->cork.length += length;
1223 	if (((length > mtu) && (sk->sk_protocol == IPPROTO_UDP)) &&
1224 	    (rt->u.dst.dev->features & NETIF_F_UFO)) {
1225 
1226 		err = ip6_ufo_append_data(sk, getfrag, from, length, hh_len,
1227 					  fragheaderlen, transhdrlen, mtu,
1228 					  flags);
1229 		if (err)
1230 			goto error;
1231 		return 0;
1232 	}
1233 
1234 	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
1235 		goto alloc_new_skb;
1236 
1237 	while (length > 0) {
1238 		/* Check if the remaining data fits into current packet. */
1239 		copy = (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1240 		if (copy < length)
1241 			copy = maxfraglen - skb->len;
1242 
1243 		if (copy <= 0) {
1244 			char *data;
1245 			unsigned int datalen;
1246 			unsigned int fraglen;
1247 			unsigned int fraggap;
1248 			unsigned int alloclen;
1249 			struct sk_buff *skb_prev;
1250 alloc_new_skb:
1251 			skb_prev = skb;
1252 
1253 			/* There's no room in the current skb */
1254 			if (skb_prev)
1255 				fraggap = skb_prev->len - maxfraglen;
1256 			else
1257 				fraggap = 0;
1258 
1259 			/*
1260 			 * If remaining data exceeds the mtu,
1261 			 * we know we need more fragment(s).
1262 			 */
1263 			datalen = length + fraggap;
1264 			if (datalen > (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1265 				datalen = maxfraglen - fragheaderlen;
1266 
1267 			fraglen = datalen + fragheaderlen;
1268 			if ((flags & MSG_MORE) &&
1269 			    !(rt->u.dst.dev->features&NETIF_F_SG))
1270 				alloclen = mtu;
1271 			else
1272 				alloclen = datalen + fragheaderlen;
1273 
1274 			/*
1275 			 * The last fragment gets additional space at tail.
1276 			 * Note: we overallocate on fragments with MSG_MORE
1277 			 * because we have no idea if we're the last one.
1278 			 */
1279 			if (datalen == length + fraggap)
1280 				alloclen += rt->u.dst.trailer_len;
1281 
1282 			/*
1283 			 * We just reserve space for the fragment header.
1284 			 * Note: this may be an overallocation if the message
1285 			 * (without MSG_MORE) fits into the MTU.
1286 			 */
1287 			alloclen += sizeof(struct frag_hdr);
1288 
1289 			if (transhdrlen) {
1290 				skb = sock_alloc_send_skb(sk,
1291 						alloclen + hh_len,
1292 						(flags & MSG_DONTWAIT), &err);
1293 			} else {
1294 				skb = NULL;
1295 				if (atomic_read(&sk->sk_wmem_alloc) <=
1296 				    2 * sk->sk_sndbuf)
1297 					skb = sock_wmalloc(sk,
1298 							   alloclen + hh_len, 1,
1299 							   sk->sk_allocation);
1300 				if (unlikely(skb == NULL))
1301 					err = -ENOBUFS;
1302 			}
1303 			if (skb == NULL)
1304 				goto error;
1305 			/*
1306 			 *	Fill in the control structures
1307 			 */
1308 			skb->ip_summed = csummode;
1309 			skb->csum = 0;
1310 			/* reserve for fragmentation */
1311 			skb_reserve(skb, hh_len+sizeof(struct frag_hdr));
1312 
1313 			/*
1314 			 *	Find where to start putting bytes
1315 			 */
1316 			data = skb_put(skb, fraglen);
1317 			skb_set_network_header(skb, exthdrlen);
1318 			data += fragheaderlen;
1319 			skb->transport_header = (skb->network_header +
1320 						 fragheaderlen);
1321 			if (fraggap) {
1322 				skb->csum = skb_copy_and_csum_bits(
1323 					skb_prev, maxfraglen,
1324 					data + transhdrlen, fraggap, 0);
1325 				skb_prev->csum = csum_sub(skb_prev->csum,
1326 							  skb->csum);
1327 				data += fraggap;
1328 				pskb_trim_unique(skb_prev, maxfraglen);
1329 			}
1330 			copy = datalen - transhdrlen - fraggap;
1331 			if (copy < 0) {
1332 				err = -EINVAL;
1333 				kfree_skb(skb);
1334 				goto error;
1335 			} else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
1336 				err = -EFAULT;
1337 				kfree_skb(skb);
1338 				goto error;
1339 			}
1340 
1341 			offset += copy;
1342 			length -= datalen - fraggap;
1343 			transhdrlen = 0;
1344 			exthdrlen = 0;
1345 			csummode = CHECKSUM_NONE;
1346 
1347 			/*
1348 			 * Put the packet on the pending queue
1349 			 */
1350 			__skb_queue_tail(&sk->sk_write_queue, skb);
1351 			continue;
1352 		}
1353 
1354 		if (copy > length)
1355 			copy = length;
1356 
1357 		if (!(rt->u.dst.dev->features&NETIF_F_SG)) {
1358 			unsigned int off;
1359 
1360 			off = skb->len;
1361 			if (getfrag(from, skb_put(skb, copy),
1362 						offset, copy, off, skb) < 0) {
1363 				__skb_trim(skb, off);
1364 				err = -EFAULT;
1365 				goto error;
1366 			}
1367 		} else {
1368 			int i = skb_shinfo(skb)->nr_frags;
1369 			skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
1370 			struct page *page = sk->sk_sndmsg_page;
1371 			int off = sk->sk_sndmsg_off;
1372 			unsigned int left;
1373 
1374 			if (page && (left = PAGE_SIZE - off) > 0) {
1375 				if (copy >= left)
1376 					copy = left;
1377 				if (page != frag->page) {
1378 					if (i == MAX_SKB_FRAGS) {
1379 						err = -EMSGSIZE;
1380 						goto error;
1381 					}
1382 					get_page(page);
1383 					skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
1384 					frag = &skb_shinfo(skb)->frags[i];
1385 				}
1386 			} else if (i < MAX_SKB_FRAGS) {
1387 				if (copy > PAGE_SIZE)
1388 					copy = PAGE_SIZE;
1389 				page = alloc_pages(sk->sk_allocation, 0);
1390 				if (page == NULL) {
1391 					err = -ENOMEM;
1392 					goto error;
1393 				}
1394 				sk->sk_sndmsg_page = page;
1395 				sk->sk_sndmsg_off = 0;
1396 
1397 				skb_fill_page_desc(skb, i, page, 0, 0);
1398 				frag = &skb_shinfo(skb)->frags[i];
1399 			} else {
1400 				err = -EMSGSIZE;
1401 				goto error;
1402 			}
1403 			if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
1404 				err = -EFAULT;
1405 				goto error;
1406 			}
1407 			sk->sk_sndmsg_off += copy;
1408 			frag->size += copy;
1409 			skb->len += copy;
1410 			skb->data_len += copy;
1411 			skb->truesize += copy;
1412 			atomic_add(copy, &sk->sk_wmem_alloc);
1413 		}
1414 		offset += copy;
1415 		length -= copy;
1416 	}
1417 	return 0;
1418 error:
1419 	inet->cork.length -= length;
1420 	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1421 	return err;
1422 }
1423 
1424 static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np)
1425 {
1426 	if (np->cork.opt) {
1427 		kfree(np->cork.opt->dst0opt);
1428 		kfree(np->cork.opt->dst1opt);
1429 		kfree(np->cork.opt->hopopt);
1430 		kfree(np->cork.opt->srcrt);
1431 		kfree(np->cork.opt);
1432 		np->cork.opt = NULL;
1433 	}
1434 
1435 	if (inet->cork.dst) {
1436 		dst_release(inet->cork.dst);
1437 		inet->cork.dst = NULL;
1438 		inet->cork.flags &= ~IPCORK_ALLFRAG;
1439 	}
1440 	memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
1441 }
1442 
1443 int ip6_push_pending_frames(struct sock *sk)
1444 {
1445 	struct sk_buff *skb, *tmp_skb;
1446 	struct sk_buff **tail_skb;
1447 	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1448 	struct inet_sock *inet = inet_sk(sk);
1449 	struct ipv6_pinfo *np = inet6_sk(sk);
1450 	struct net *net = sock_net(sk);
1451 	struct ipv6hdr *hdr;
1452 	struct ipv6_txoptions *opt = np->cork.opt;
1453 	struct rt6_info *rt = (struct rt6_info *)inet->cork.dst;
1454 	struct flowi *fl = &inet->cork.fl;
1455 	unsigned char proto = fl->proto;
1456 	int err = 0;
1457 
1458 	if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
1459 		goto out;
1460 	tail_skb = &(skb_shinfo(skb)->frag_list);
1461 
1462 	/* move skb->data to ip header from ext header */
1463 	if (skb->data < skb_network_header(skb))
1464 		__skb_pull(skb, skb_network_offset(skb));
1465 	while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
1466 		__skb_pull(tmp_skb, skb_network_header_len(skb));
1467 		*tail_skb = tmp_skb;
1468 		tail_skb = &(tmp_skb->next);
1469 		skb->len += tmp_skb->len;
1470 		skb->data_len += tmp_skb->len;
1471 		skb->truesize += tmp_skb->truesize;
1472 		tmp_skb->destructor = NULL;
1473 		tmp_skb->sk = NULL;
1474 	}
1475 
1476 	/* Allow local fragmentation. */
1477 	if (np->pmtudisc < IPV6_PMTUDISC_DO)
1478 		skb->local_df = 1;
1479 
1480 	ipv6_addr_copy(final_dst, &fl->fl6_dst);
1481 	__skb_pull(skb, skb_network_header_len(skb));
1482 	if (opt && opt->opt_flen)
1483 		ipv6_push_frag_opts(skb, opt, &proto);
1484 	if (opt && opt->opt_nflen)
1485 		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);
1486 
1487 	skb_push(skb, sizeof(struct ipv6hdr));
1488 	skb_reset_network_header(skb);
1489 	hdr = ipv6_hdr(skb);
1490 
1491 	*(__be32*)hdr = fl->fl6_flowlabel |
1492 		     htonl(0x60000000 | ((int)np->cork.tclass << 20));
1493 
1494 	hdr->hop_limit = np->cork.hop_limit;
1495 	hdr->nexthdr = proto;
1496 	ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
1497 	ipv6_addr_copy(&hdr->daddr, final_dst);
1498 
1499 	skb->priority = sk->sk_priority;
1500 	skb->mark = sk->sk_mark;
1501 
1502 	skb_dst_set(skb, dst_clone(&rt->u.dst));
1503 	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1504 	if (proto == IPPROTO_ICMPV6) {
1505 		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1506 
1507 		ICMP6MSGOUT_INC_STATS_BH(net, idev, icmp6_hdr(skb)->icmp6_type);
1508 		ICMP6_INC_STATS_BH(net, idev, ICMP6_MIB_OUTMSGS);
1509 	}
1510 
1511 	err = ip6_local_out(skb);
1512 	if (err) {
1513 		if (err > 0)
1514 			err = net_xmit_errno(err);
1515 		if (err)
1516 			goto error;
1517 	}
1518 
1519 out:
1520 	ip6_cork_release(inet, np);
1521 	return err;
1522 error:
1523 	IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1524 	goto out;
1525 }
1526 
1527 void ip6_flush_pending_frames(struct sock *sk)
1528 {
1529 	struct sk_buff *skb;
1530 
1531 	while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
1532 		if (skb_dst(skb))
1533 			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1534 				      IPSTATS_MIB_OUTDISCARDS);
1535 		kfree_skb(skb);
1536 	}
1537 
1538 	ip6_cork_release(inet_sk(sk), inet6_sk(sk));
1539 }
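
/*
 * Hedged usage sketch, not from the original source: ip6_append_data(),
 * ip6_push_pending_frames() and ip6_flush_pending_frames() implement
 * IPv6 corking.  A datagram sendmsg() path uses them roughly as
 * follows; route/flow setup is elided, names are hypothetical, and a
 * real transport (cf. udpv6) fills in its own header -- the space
 * reserved via transhdrlen -- before pushing.
 */
#if 0	/* hypothetical example, not built */
static int example_dgram_sendmsg(struct sock *sk, struct flowi *fl,
				 struct rt6_info *rt, void *data, int len,
				 int hlimit, int tclass,
				 int getfrag(void *from, char *to, int offset,
					     int length, int odd,
					     struct sk_buff *skb))
{
	int err;

	lock_sock(sk);
	err = ip6_append_data(sk, getfrag, data, len,
			      sizeof(struct udphdr),	/* transhdrlen */
			      hlimit, tclass, NULL /* opt */, fl, rt,
			      0 /* flags */);
	if (err)
		ip6_flush_pending_frames(sk);	/* drop queued fragments */
	else
		err = ip6_push_pending_frames(sk); /* build hdr and xmit */
	release_sock(sk);
	return err;
}
#endif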
1540