/*
 *	IPv6 output functions
 *	Linux INET6 implementation
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	Based on linux/net/ipv4/ip_output.c
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 *
 *	Changes:
 *	A.N.Kuznetsov	:	arithmetic in fragmentation.
 *				extension headers are implemented.
 *				route changes now work.
 *				ip6_forward does not confuse sniffers.
 *				etc.
 *
 *	H. von Brand	:	Added missing #include <linux/string.h>
 *	Imran Patel	:	frag id should be in NBO
 *	Kazunori MIYAZAWA @USAGI
 *			:	add ip6_append_data and related functions
 *				for datagram xmit
 */

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/in6.h>
#include <linux/tcp.h>
#include <linux/route.h>
#include <linux/module.h>
#include <linux/slab.h>

#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>

#include <net/sock.h>
#include <net/snmp.h>

#include <net/ipv6.h>
#include <net/ndisc.h>
#include <net/protocol.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
#include <net/rawv6.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/checksum.h>
#include <linux/mroute6.h>

int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));

int __ip6_local_out(struct sk_buff *skb)
{
	int len;

	len = skb->len - sizeof(struct ipv6hdr);
	if (len > IPV6_MAXPLEN)
		len = 0;
	ipv6_hdr(skb)->payload_len = htons(len);

	return nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
		       skb_dst(skb)->dev, dst_output);
}

int ip6_local_out(struct sk_buff *skb)
{
	int err;

	err = __ip6_local_out(skb);
	if (likely(err == 1))
		err = dst_output(skb);

	return err;
}
EXPORT_SYMBOL_GPL(ip6_local_out);
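
/*
 * Illustrative sketch (added; not part of the original file): the
 * Payload Length rule applied by __ip6_local_out() above. The field
 * counts everything after the 40-byte fixed header; a payload larger
 * than IPV6_MAXPLEN (65535) cannot be represented in 16 bits, so the
 * field is left as 0, the value a jumbogram carries (RFC 2675).
 */
static inline __be16 example_payload_len(unsigned int skb_len)
{
	unsigned int len = skb_len - sizeof(struct ipv6hdr);

	if (len > IPV6_MAXPLEN)
		len = 0;	/* not representable; jumbogram case */
	return htons(len);
}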

/* dev_loopback_xmit for use with netfilter. */
static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
{
	skb_reset_mac_header(newskb);
	__skb_pull(newskb, skb_network_offset(newskb));
	newskb->pkt_type = PACKET_LOOPBACK;
	newskb->ip_summed = CHECKSUM_UNNECESSARY;
	WARN_ON(!skb_dst(newskb));

	netif_rx_ni(newskb);
	return 0;
}

static int ip6_finish_output2(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct net_device *dev = dst->dev;
	struct neighbour *neigh;

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(skb->sk) &&
		    ((mroute6_socket(dev_net(dev), skb) &&
		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
		     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
					 &ipv6_hdr(skb)->saddr))) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

			/* Do not check for IFF_ALLMULTI; multicast routing
			 * is not supported in any case.
			 */
			if (newskb)
				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
					newskb, NULL, newskb->dev,
					ip6_dev_loopback_xmit);

			if (ipv6_hdr(skb)->hop_limit == 0) {
				IP6_INC_STATS(dev_net(dev), idev,
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return 0;
			}
		}

		IP6_UPD_PO_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCAST,
				 skb->len);
	}

	neigh = dst->neighbour;
	if (neigh) {
		struct hh_cache *hh = &neigh->hh;

		if (hh->hh_len)
			return neigh_hh_output(hh, skb);
		else
			return dst->neighbour->output(skb);
	}
	IP6_INC_STATS_BH(dev_net(dst->dev),
			 ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
	kfree_skb(skb);
	return -EINVAL;
}

static int ip6_finish_output(struct sk_buff *skb)
{
	if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
	    dst_allfrag(skb_dst(skb)))
		return ip6_fragment(skb, ip6_finish_output2);
	else
		return ip6_finish_output2(skb);
}

int ip6_output(struct sk_buff *skb)
{
	struct net_device *dev = skb_dst(skb)->dev;
	struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

	if (unlikely(idev->cnf.disable_ipv6)) {
		IP6_INC_STATS(dev_net(dev), idev,
			      IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
		return 0;
	}

	return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, skb, NULL, dev,
			    ip6_finish_output,
			    !(IP6CB(skb)->flags & IP6SKB_REROUTED));
}

/*
 *	xmit an sk_buff (used by TCP, SCTP and DCCP)
 */

int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
	     struct ipv6_txoptions *opt)
{
	struct net *net = sock_net(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct in6_addr *first_hop = &fl6->daddr;
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr;
	u8  proto = fl6->flowi6_proto;
	int seg_len = skb->len;
	int hlimit = -1;
	int tclass = 0;
	u32 mtu;

	if (opt) {
		unsigned int head_room;

		/* First: extension headers may take lots of space (~8K for
		 * now); MAX_HEADER is not enough.
		 */
		head_room = opt->opt_nflen + opt->opt_flen;
		seg_len += head_room;
		head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);

		if (skb_headroom(skb) < head_room) {
			struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);

			if (skb2 == NULL) {
				IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return -ENOBUFS;
			}
			kfree_skb(skb);
			skb = skb2;
			skb_set_owner_w(skb, sk);
		}
		if (opt->opt_flen)
			ipv6_push_frag_opts(skb, opt, &proto);
		if (opt->opt_nflen)
			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
	}

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	/*
	 *	Fill in the IPv6 header
	 */
	if (np) {
		tclass = np->tclass;
		hlimit = np->hop_limit;
	}
	if (hlimit < 0)
		hlimit = ip6_dst_hoplimit(dst);

	*(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl6->flowlabel;

	hdr->payload_len = htons(seg_len);
	hdr->nexthdr = proto;
	hdr->hop_limit = hlimit;

	ipv6_addr_copy(&hdr->saddr, &fl6->saddr);
	ipv6_addr_copy(&hdr->daddr, first_hop);

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	mtu = dst_mtu(dst);
	if ((skb->len <= mtu) || skb->local_df || skb_is_gso(skb)) {
		IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
				 IPSTATS_MIB_OUT, skb->len);
		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
			       dst->dev, dst_output);
	}

	if (net_ratelimit())
		printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
	skb->dev = dst->dev;
	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return -EMSGSIZE;
}
EXPORT_SYMBOL(ip6_xmit);
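
/*
 * Minimal caller sketch (added; not part of the original file, and the
 * example_ name is hypothetical): roughly how a connected transport
 * socket might drive ip6_xmit(). The skb is assumed to already carry
 * a dst, e.g. one obtained via ip6_dst_lookup_flow() below.
 */
static inline int example_transport_xmit(struct sock *sk, struct sk_buff *skb)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct flowi6 fl6;

	memset(&fl6, 0, sizeof(fl6));
	fl6.flowi6_proto = sk->sk_protocol;
	fl6.flowi6_oif = sk->sk_bound_dev_if;
	fl6.flowlabel = np->flow_label;
	ipv6_addr_copy(&fl6.daddr, &np->daddr);
	ipv6_addr_copy(&fl6.saddr, &np->saddr);

	return ip6_xmit(sk, skb, &fl6, np->opt);
}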

/*
 *	To avoid extra problems ND packets are sent through this
 *	routine. It's code duplication, but I really want to avoid
 *	extra checks, since ipv6_build_header is used by TCP (which
 *	is performance critical for us)
 */

int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
	       const struct in6_addr *saddr, const struct in6_addr *daddr,
	       int proto, int len)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct ipv6hdr *hdr;

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	skb_reset_network_header(skb);
	skb_put(skb, sizeof(struct ipv6hdr));
	hdr = ipv6_hdr(skb);

	*(__be32 *)hdr = htonl(0x60000000);

	hdr->payload_len = htons(len);
	hdr->nexthdr = proto;
	hdr->hop_limit = np->hop_limit;

	ipv6_addr_copy(&hdr->saddr, saddr);
	ipv6_addr_copy(&hdr->daddr, daddr);

	return 0;
}

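/*
 * Note (added): the sel argument below is the Router Alert option's
 * 16-bit value (RFC 2711); ip6_forward() extracts it from ptr[2..3] of
 * the option inside the Hop-by-Hop header and it is matched against
 * the value each raw socket registered via IPV6_ROUTER_ALERT.
 */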
static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
	struct ip6_ra_chain *ra;
	struct sock *last = NULL;

	read_lock(&ip6_ra_lock);
	for (ra = ip6_ra_chain; ra; ra = ra->next) {
		struct sock *sk = ra->sk;

		if (sk && ra->sel == sel &&
		    (!sk->sk_bound_dev_if ||
		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
			if (last) {
				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);

				if (skb2)
					rawv6_rcv(last, skb2);
			}
			last = sk;
		}
	}

	if (last) {
		rawv6_rcv(last, skb);
		read_unlock(&ip6_ra_lock);
		return 1;
	}
	read_unlock(&ip6_ra_lock);
	return 0;
}

static int ip6_forward_proxy_check(struct sk_buff *skb)
{
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	u8 nexthdr = hdr->nexthdr;
	int offset;

	if (ipv6_ext_hdr(nexthdr)) {
		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr);
		if (offset < 0)
			return 0;
	} else
		offset = sizeof(struct ipv6hdr);

	if (nexthdr == IPPROTO_ICMPV6) {
		struct icmp6hdr *icmp6;

		if (!pskb_may_pull(skb, (skb_network_header(skb) +
					 offset + 1 - skb->data)))
			return 0;

		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);

		switch (icmp6->icmp6_type) {
		case NDISC_ROUTER_SOLICITATION:
		case NDISC_ROUTER_ADVERTISEMENT:
		case NDISC_NEIGHBOUR_SOLICITATION:
		case NDISC_NEIGHBOUR_ADVERTISEMENT:
		case NDISC_REDIRECT:
			/* For a reaction involving a unicast neighbour
			 * discovery message destined to the proxied address,
			 * pass it to the input function.
			 */
			return 1;
		default:
			break;
		}
	}

	/*
	 * The proxying router can't forward traffic sent to a link-local
	 * address, so signal the sender and discard the packet. This
	 * behavior is clarified by the MIPv6 specification.
	 */
	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
		dst_link_failure(skb);
		return -1;
	}

	return 0;
}

static inline int ip6_forward_finish(struct sk_buff *skb)
{
	return dst_output(skb);
}

int ip6_forward(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	struct inet6_skb_parm *opt = IP6CB(skb);
	struct net *net = dev_net(dst->dev);
	u32 mtu;

	if (net->ipv6.devconf_all->forwarding == 0)
		goto error;

	if (skb_warn_if_lro(skb))
		goto drop;

	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
		goto drop;
	}

	if (skb->pkt_type != PACKET_HOST)
		goto drop;

	skb_forward_csum(skb);

	/*
	 *	We DO NOT make any processing on
	 *	RA packets, pushing them to user level AS IS
	 *	without any WARRANTY that application will be able
	 *	to interpret them. The reason is that we
	 *	cannot make anything clever here.
	 *
	 *	We are not the end node, so if the packet contains
	 *	AH/ESP, we cannot do anything with it.
	 *	Defragmentation would also be a mistake; RA packets
	 *	cannot be fragmented, because there is no guarantee
	 *	that different fragments will go along one path. --ANK
	 */
	if (opt->ra) {
		u8 *ptr = skb_network_header(skb) + opt->ra;

		if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
			return 0;
	}

	/*
	 *	check and decrement ttl
	 */
	if (hdr->hop_limit <= 1) {
		/* Force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
		IP6_INC_STATS_BH(net,
				 ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);

		kfree_skb(skb);
		return -ETIMEDOUT;
	}

	/* XXX: idev->cnf.proxy_ndp? */
	if (net->ipv6.devconf_all->proxy_ndp &&
	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
		int proxied = ip6_forward_proxy_check(skb);

		if (proxied > 0)
			return ip6_input(skb);
		else if (proxied < 0) {
			IP6_INC_STATS(net, ip6_dst_idev(dst),
				      IPSTATS_MIB_INDISCARDS);
			goto drop;
		}
	}

	if (!xfrm6_route_forward(skb)) {
		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
		goto drop;
	}
	dst = skb_dst(skb);

	/* The IPv6 specs say nothing about it, but it is clear that we
	 * cannot send redirects for source-routed frames.
	 * We don't send redirects for frames decapsulated from IPsec.
	 */
	if (skb->dev == dst->dev && dst->neighbour && opt->srcrt == 0 &&
	    !skb_sec_path(skb)) {
		struct in6_addr *target = NULL;
		struct rt6_info *rt;
		struct neighbour *n = dst->neighbour;

		/*
		 *	incoming and outgoing devices are the same
		 *	send a redirect.
		 */

		rt = (struct rt6_info *) dst;
		if ((rt->rt6i_flags & RTF_GATEWAY))
			target = (struct in6_addr *)&n->primary_key;
		else
			target = &hdr->daddr;

		if (!rt->rt6i_peer)
			rt6_bind_peer(rt, 1);

		/* Limit redirects both by destination (here)
		 * and by source (inside ndisc_send_redirect)
		 */
		if (inet_peer_xrlim_allow(rt->rt6i_peer, 1*HZ))
			ndisc_send_redirect(skb, n, target);
	} else {
		int addrtype = ipv6_addr_type(&hdr->saddr);

		/* This check is security critical. */
		if (addrtype == IPV6_ADDR_ANY ||
		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
			goto error;
		if (addrtype & IPV6_ADDR_LINKLOCAL) {
			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
				    ICMPV6_NOT_NEIGHBOUR, 0);
			goto error;
		}
	}

	mtu = dst_mtu(dst);
	if (mtu < IPV6_MIN_MTU)
		mtu = IPV6_MIN_MTU;

	if (skb->len > mtu && !skb_is_gso(skb)) {
		/* Again, force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
		IP6_INC_STATS_BH(net,
				 ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS);
		IP6_INC_STATS_BH(net,
				 ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	if (skb_cow(skb, dst->dev->hard_header_len)) {
		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
		goto drop;
	}

	hdr = ipv6_hdr(skb);

	/* Mangling the hop limit is delayed to this point, after skb COW */

	hdr->hop_limit--;

	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
		       ip6_forward_finish);

error:
	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
drop:
	kfree_skb(skb);
	return -EINVAL;
}

static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	skb_dst_drop(to);
	skb_dst_set(to, dst_clone(skb_dst(from)));
	to->dev = from->dev;
	to->mark = from->mark;

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
	nf_copy(to, from);
#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
    defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
	to->nf_trace = from->nf_trace;
#endif
	skb_copy_secmark(to, from);
}

int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
{
	u16 offset = sizeof(struct ipv6hdr);
	struct ipv6_opt_hdr *exthdr =
				(struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1);
	unsigned int packet_len = skb->tail - skb->network_header;
	int found_rhdr = 0;

	*nexthdr = &ipv6_hdr(skb)->nexthdr;

	while (offset + 1 <= packet_len) {

		switch (**nexthdr) {

		case NEXTHDR_HOP:
			break;
		case NEXTHDR_ROUTING:
			found_rhdr = 1;
			break;
		case NEXTHDR_DEST:
#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
			if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0)
				break;
#endif
			if (found_rhdr)
				return offset;
			break;
		default:
			return offset;
		}

		offset += ipv6_optlen(exthdr);
		*nexthdr = &exthdr->nexthdr;
		exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) +
						 offset);
	}

	return offset;
}
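
/*
 * Illustration (added; not in the original file): for a packet laid out
 * as [IPv6][Hop-by-Hop][Routing][Dest opts][TCP], the loop above returns
 * the offset of the Destination Options header, i.e. just past the
 * Routing header. Per RFC 2460 the Fragment header must follow the
 * per-fragment (unfragmentable) part: Hop-by-Hop, Routing, and any
 * Destination Options header that precedes Routing (or that carries a
 * Home Address option, the MIPv6 case handled above).
 */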

int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
{
	struct sk_buff *frag;
	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
	struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
	struct ipv6hdr *tmp_hdr;
	struct frag_hdr *fh;
	unsigned int mtu, hlen, left, len;
	__be32 frag_id = 0;
	int ptr, offset = 0, err = 0;
	u8 *prevhdr, nexthdr = 0;
	struct net *net = dev_net(skb_dst(skb)->dev);

	hlen = ip6_find_1stfragopt(skb, &prevhdr);
	nexthdr = *prevhdr;

	mtu = ip6_skb_dst_mtu(skb);

	/* We must not fragment if the socket is set to force MTU discovery
	 * or if the skb is not generated by a local socket.
	 */
	if (!skb->local_df && skb->len > mtu) {
		skb->dev = skb_dst(skb)->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	if (np && np->frag_size < mtu) {
		if (np->frag_size)
			mtu = np->frag_size;
	}
	mtu -= hlen + sizeof(struct frag_hdr);

	if (skb_has_frag_list(skb)) {
		int first_len = skb_pagelen(skb);
		struct sk_buff *frag2;

		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    skb_cloned(skb))
			goto slow_path;

		skb_walk_frags(skb, frag) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < hlen)
				goto slow_path_clean;

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path_clean;

			BUG_ON(frag->sk);
			if (skb->sk) {
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
			}
			skb->truesize -= frag->truesize;
		}

		err = 0;
		offset = 0;
		frag = skb_shinfo(skb)->frag_list;
		skb_frag_list_init(skb);
		/* BUILD HEADER */

		*prevhdr = NEXTHDR_FRAGMENT;
		tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
		if (!tmp_hdr) {
			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_FRAGFAILS);
			return -ENOMEM;
		}

		__skb_pull(skb, hlen);
		fh = (struct frag_hdr *)__skb_push(skb, sizeof(struct frag_hdr));
		__skb_push(skb, hlen);
		skb_reset_network_header(skb);
		memcpy(skb_network_header(skb), tmp_hdr, hlen);

		ipv6_select_ident(fh);
		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		fh->frag_off = htons(IP6_MF);
		frag_id = fh->identification;

		first_len = skb_pagelen(skb);
		skb->data_len = first_len - skb_headlen(skb);
		skb->len = first_len;
		ipv6_hdr(skb)->payload_len = htons(first_len -
						   sizeof(struct ipv6hdr));

		dst_hold(&rt->dst);

		for (;;) {
			/* Prepare the header of the next frame,
			 * before the previous one is sent.
			 */
			if (frag) {
				frag->ip_summed = CHECKSUM_NONE;
				skb_reset_transport_header(frag);
				fh = (struct frag_hdr *)__skb_push(frag, sizeof(struct frag_hdr));
				__skb_push(frag, hlen);
				skb_reset_network_header(frag);
				memcpy(skb_network_header(frag), tmp_hdr,
				       hlen);
				offset += skb->len - hlen - sizeof(struct frag_hdr);
				fh->nexthdr = nexthdr;
				fh->reserved = 0;
				fh->frag_off = htons(offset);
				if (frag->next != NULL)
					fh->frag_off |= htons(IP6_MF);
				fh->identification = frag_id;
				ipv6_hdr(frag)->payload_len =
						htons(frag->len -
						      sizeof(struct ipv6hdr));
				ip6_copy_metadata(frag, skb);
			}

			err = output(skb);
			if (!err)
				IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
					      IPSTATS_MIB_FRAGCREATES);

			if (err || !frag)
				break;

			skb = frag;
			frag = skb->next;
			skb->next = NULL;
		}

		kfree(tmp_hdr);

		if (err == 0) {
			IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
				      IPSTATS_MIB_FRAGOKS);
			dst_release(&rt->dst);
			return 0;
		}

		while (frag) {
			skb = frag->next;
			kfree_skb(frag);
			frag = skb;
		}

		IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
			      IPSTATS_MIB_FRAGFAILS);
		dst_release(&rt->dst);
		return err;

slow_path_clean:
		skb_walk_frags(skb, frag2) {
			if (frag2 == frag)
				break;
			frag2->sk = NULL;
			frag2->destructor = NULL;
			skb->truesize += frag2->truesize;
		}
	}

slow_path:
	left = skb->len - hlen;		/* Space per frame */
	ptr = hlen;			/* Where to start from */

	/*
	 *	Fragment the datagram.
	 */

	*prevhdr = NEXTHDR_FRAGMENT;

	/*
	 *	Keep copying data until we run out.
	 */
	while (left > 0) {
		len = left;
		/* IF: it doesn't fit, use 'mtu' - the data space left */
		if (len > mtu)
			len = mtu;
		/* IF: we are not sending up to and including the packet end,
		   then align the next start on an eight byte boundary */
		if (len < left) {
			len &= ~7;
		}
		/*
		 *	Allocate buffer.
		 */

		frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
				 LL_ALLOCATED_SPACE(rt->dst.dev), GFP_ATOMIC);
		if (frag == NULL) {
			NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_FRAGFAILS);
			err = -ENOMEM;
			goto fail;
		}

		/*
		 *	Set up data on packet
		 */

		ip6_copy_metadata(frag, skb);
		skb_reserve(frag, LL_RESERVED_SPACE(rt->dst.dev));
		skb_put(frag, len + hlen + sizeof(struct frag_hdr));
		skb_reset_network_header(frag);
		fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
		frag->transport_header = (frag->network_header + hlen +
					  sizeof(struct frag_hdr));

		/*
		 *	Charge the memory for the fragment to any owner
		 *	it might possess
		 */
		if (skb->sk)
			skb_set_owner_w(frag, skb->sk);

		/*
		 *	Copy the packet header into the new buffer.
		 */
		skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);

		/*
		 *	Build fragment header.
		 */
		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		if (!frag_id) {
			ipv6_select_ident(fh);
			frag_id = fh->identification;
		} else
			fh->identification = frag_id;

		/*
		 *	Copy a block of the IP datagram.
		 */
		if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len))
			BUG();
		left -= len;

		fh->frag_off = htons(offset);
		if (left > 0)
			fh->frag_off |= htons(IP6_MF);
		ipv6_hdr(frag)->payload_len = htons(frag->len -
						    sizeof(struct ipv6hdr));

		ptr += len;
		offset += len;

		/*
		 *	Put this fragment into the sending queue.
		 */
		err = output(frag);
		if (err)
			goto fail;

		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_FRAGCREATES);
	}
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGOKS);
	kfree_skb(skb);
	return err;

fail:
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return err;
}
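
/*
 * Illustrative sketch (added; not part of the original file): the
 * per-fragment payload arithmetic used by the slow path above. With a
 * 1500-byte path MTU and no extension headers (hlen = 40), a fragment
 * can carry 1500 - 40 - 8 = 1452 bytes of payload; non-final fragments
 * round that down to a multiple of 8, giving 1448.
 */
static inline unsigned int example_frag_payload(unsigned int path_mtu,
						unsigned int hlen,
						unsigned int left)
{
	unsigned int len = path_mtu - hlen - sizeof(struct frag_hdr);

	if (len < left)
		len &= ~7;	/* non-final fragment: 8-byte aligned */
	else
		len = left;	/* final fragment takes the remainder */
	return len;
}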

static inline int ip6_rt_check(const struct rt6key *rt_key,
			       const struct in6_addr *fl_addr,
			       const struct in6_addr *addr_cache)
{
	return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
		(addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache));
}

static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
					  struct dst_entry *dst,
					  const struct flowi6 *fl6)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct rt6_info *rt = (struct rt6_info *)dst;

	if (!dst)
		goto out;

	/* Yes, checking route validity in the not-connected
	 * case is not very simple. Take into account
	 * that we do not support routing by source, TOS,
	 * and MSG_DONTROUTE		--ANK (980726)
	 *
	 * 1. ip6_rt_check(): If the route was a host route,
	 *    check that the cached destination is current.
	 *    If it is a network route, we still may
	 *    check its validity using the saved pointer
	 *    to the last used address: daddr_cache.
	 *    We do not want to save the whole address now
	 *    (because the main consumer of this service
	 *    is tcp, which does not have this problem),
	 *    so the last trick works only on connected
	 *    sockets.
	 * 2. oif also should be the same.
	 */
	if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
#ifdef CONFIG_IPV6_SUBTREES
	    ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
#endif
	    (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) {
		dst_release(dst);
		dst = NULL;
	}

out:
	return dst;
}

static int ip6_dst_lookup_tail(struct sock *sk,
			       struct dst_entry **dst, struct flowi6 *fl6)
{
	int err;
	struct net *net = sock_net(sk);

	if (*dst == NULL)
		*dst = ip6_route_output(net, sk, fl6);

	err = (*dst)->error;
	if (err)
		goto out_err_release;

	if (ipv6_addr_any(&fl6->saddr)) {
		struct rt6_info *rt = (struct rt6_info *) *dst;

		err = ip6_route_get_saddr(net, rt, &fl6->daddr,
					  sk ? inet6_sk(sk)->srcprefs : 0,
					  &fl6->saddr);
		if (err)
			goto out_err_release;
	}

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	/*
	 * Here, if the dst entry we've looked up has a neighbour entry
	 * that is in the INCOMPLETE state and the src address from the
	 * flow is marked as OPTIMISTIC, we release the found dst entry
	 * and replace it with the dst entry of the nexthop router.
	 */
	if ((*dst)->neighbour && !((*dst)->neighbour->nud_state & NUD_VALID)) {
		struct inet6_ifaddr *ifp;
		struct flowi6 fl_gw6;
		int redirect;

		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
				      (*dst)->dev, 1);

		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
		if (ifp)
			in6_ifa_put(ifp);

		if (redirect) {
			/*
			 * We need to get the dst entry for the
			 * default router instead.
			 */
			dst_release(*dst);
			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
			*dst = ip6_route_output(net, sk, &fl_gw6);
			err = (*dst)->error;
			if (err)
				goto out_err_release;
		}
	}
#endif

	return 0;

out_err_release:
	if (err == -ENETUNREACH)
		IP6_INC_STATS_BH(net, NULL, IPSTATS_MIB_OUTNOROUTES);
	dst_release(*dst);
	*dst = NULL;
	return err;
}

/**
 *	ip6_dst_lookup - perform route lookup on flow
 *	@sk: socket which provides route info
 *	@dst: pointer to dst_entry * for result
 *	@fl6: flow to lookup
 *
 *	This function performs a route lookup on the given flow.
 *
 *	It returns zero on success, or a standard errno code on error.
 */
int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi6 *fl6)
{
	*dst = NULL;
	return ip6_dst_lookup_tail(sk, dst, fl6);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup);
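
/*
 * Minimal usage sketch (added; not part of the original file, and
 * example_route_lookup is a hypothetical name): resolve a route for a
 * flow with ip6_dst_lookup() and release it when done.
 */
static inline int example_route_lookup(struct sock *sk,
				       const struct in6_addr *daddr)
{
	struct dst_entry *dst;
	struct flowi6 fl6;
	int err;

	memset(&fl6, 0, sizeof(fl6));
	fl6.flowi6_proto = sk->sk_protocol;
	ipv6_addr_copy(&fl6.daddr, daddr);

	err = ip6_dst_lookup(sk, &dst, &fl6);
	if (err)
		return err;	/* e.g. -ENETUNREACH */

	/* ... use dst, e.g. skb_dst_set(skb, dst) would consume it ... */
	dst_release(dst);
	return 0;
}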

/**
 *	ip6_dst_lookup_flow - perform route lookup on flow with ipsec
 *	@sk: socket which provides route info
 *	@fl6: flow to lookup
 *	@final_dst: final destination address for ipsec lookup
 *	@can_sleep: we are in a sleepable context
 *
 *	This function performs a route lookup on the given flow.
 *
 *	It returns a valid dst pointer on success, or a pointer encoded
 *	error code.
 */
struct dst_entry *ip6_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
				      const struct in6_addr *final_dst,
				      bool can_sleep)
{
	struct dst_entry *dst = NULL;
	int err;

	err = ip6_dst_lookup_tail(sk, &dst, fl6);
	if (err)
		return ERR_PTR(err);
	if (final_dst)
		ipv6_addr_copy(&fl6->daddr, final_dst);
	if (can_sleep)
		fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;

	return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);

/**
 *	ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
 *	@sk: socket which provides the dst cache and route info
 *	@fl6: flow to lookup
 *	@final_dst: final destination address for ipsec lookup
 *	@can_sleep: we are in a sleepable context
 *
 *	This function performs a route lookup on the given flow with the
 *	possibility of using the cached route in the socket if it is valid.
 *	It will take the socket dst lock when operating on the dst cache.
 *	As a result, this function can only be used in process context.
 *
 *	It returns a valid dst pointer on success, or a pointer encoded
 *	error code.
 */
struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
					 const struct in6_addr *final_dst,
					 bool can_sleep)
{
	struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
	int err;

	dst = ip6_sk_dst_check(sk, dst, fl6);

	err = ip6_dst_lookup_tail(sk, &dst, fl6);
	if (err)
		return ERR_PTR(err);
	if (final_dst)
		ipv6_addr_copy(&fl6->daddr, final_dst);
	if (can_sleep)
		fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;

	return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
}
EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
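
/*
 * Minimal usage sketch (added; not part of the original file): unlike
 * ip6_dst_lookup(), the _flow variants above return the dst itself or
 * an ERR_PTR()-encoded errno, so callers test the result with IS_ERR().
 */
static inline struct dst_entry *example_lookup_flow(struct sock *sk,
						    struct flowi6 *fl6)
{
	struct dst_entry *dst;

	dst = ip6_dst_lookup_flow(sk, fl6, NULL, false);
	if (IS_ERR(dst))
		return NULL;	/* PTR_ERR(dst) holds the errno */
	return dst;
}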

static inline int ip6_ufo_append_data(struct sock *sk,
			int getfrag(void *from, char *to, int offset, int len,
			int odd, struct sk_buff *skb),
			void *from, int length, int hh_len, int fragheaderlen,
			int transhdrlen, int mtu, unsigned int flags)

{
	struct sk_buff *skb;
	int err;

	/* There is support for UDP large send offload by the network
	 * device, so create one single skb packet containing the complete
	 * udp datagram
	 */
	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
		skb = sock_alloc_send_skb(sk,
			hh_len + fragheaderlen + transhdrlen + 20,
			(flags & MSG_DONTWAIT), &err);
		if (skb == NULL)
			return -ENOMEM;

		/* reserve space for the hardware header */
		skb_reserve(skb, hh_len);

		/* create space for the UDP/IP header */
		skb_put(skb, fragheaderlen + transhdrlen);

		/* initialize the network header pointer */
		skb_reset_network_header(skb);

		/* initialize the protocol header pointer */
		skb->transport_header = skb->network_header + fragheaderlen;

		skb->ip_summed = CHECKSUM_PARTIAL;
		skb->csum = 0;
	}

	err = skb_append_datato_frags(sk, skb, getfrag, from,
				      (length - transhdrlen));
	if (!err) {
		struct frag_hdr fhdr;

		/* Specify the length of each IPv6 datagram fragment.
		 * It has to be a multiple of 8.
		 */
		skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
					     sizeof(struct frag_hdr)) & ~7;
		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
		ipv6_select_ident(&fhdr);
		skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
		__skb_queue_tail(&sk->sk_write_queue, skb);

		return 0;
	}
	/* There is not enough support to do UDP LSO,
	 * so follow the normal path
	 */
	kfree_skb(skb);

	return err;
}

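/*
 * Note (added): both duplication helpers below size their copies as
 * (hdrlen + 1) * 8 because IPv6 extension headers encode their length
 * in 8-octet units, not counting the first 8 octets (RFC 2460).
 */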
static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
					       gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
						gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
	int offset, int len, int odd, struct sk_buff *skb),
	void *from, int length, int transhdrlen,
	int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi6 *fl6,
	struct rt6_info *rt, unsigned int flags, int dontfrag)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct inet_cork *cork;
	struct sk_buff *skb;
	unsigned int maxfraglen, fragheaderlen;
	int exthdrlen;
	int hh_len;
	int mtu;
	int copy;
	int err;
	int offset = 0;
	int csummode = CHECKSUM_NONE;
	__u8 tx_flags = 0;

	if (flags & MSG_PROBE)
		return 0;
	cork = &inet->cork.base;
	if (skb_queue_empty(&sk->sk_write_queue)) {
		/*
		 * setup for corking
		 */
		if (opt) {
			if (WARN_ON(np->cork.opt))
				return -EINVAL;

			np->cork.opt = kmalloc(opt->tot_len, sk->sk_allocation);
			if (unlikely(np->cork.opt == NULL))
				return -ENOBUFS;

			np->cork.opt->tot_len = opt->tot_len;
			np->cork.opt->opt_flen = opt->opt_flen;
			np->cork.opt->opt_nflen = opt->opt_nflen;

			np->cork.opt->dst0opt = ip6_opt_dup(opt->dst0opt,
							    sk->sk_allocation);
			if (opt->dst0opt && !np->cork.opt->dst0opt)
				return -ENOBUFS;

			np->cork.opt->dst1opt = ip6_opt_dup(opt->dst1opt,
							    sk->sk_allocation);
			if (opt->dst1opt && !np->cork.opt->dst1opt)
				return -ENOBUFS;

			np->cork.opt->hopopt = ip6_opt_dup(opt->hopopt,
							   sk->sk_allocation);
			if (opt->hopopt && !np->cork.opt->hopopt)
				return -ENOBUFS;

			np->cork.opt->srcrt = ip6_rthdr_dup(opt->srcrt,
							    sk->sk_allocation);
			if (opt->srcrt && !np->cork.opt->srcrt)
				return -ENOBUFS;

			/* need source address above -- miyazawa */
		}
		dst_hold(&rt->dst);
		cork->dst = &rt->dst;
		inet->cork.fl.u.ip6 = *fl6;
		np->cork.hop_limit = hlimit;
		np->cork.tclass = tclass;
		mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
		      rt->dst.dev->mtu : dst_mtu(rt->dst.path);
		if (np->frag_size < mtu) {
			if (np->frag_size)
				mtu = np->frag_size;
		}
		cork->fragsize = mtu;
		if (dst_allfrag(rt->dst.path))
			cork->flags |= IPCORK_ALLFRAG;
		cork->length = 0;
		sk->sk_sndmsg_page = NULL;
		sk->sk_sndmsg_off = 0;
		exthdrlen = rt->dst.header_len + (opt ? opt->opt_flen : 0) -
			    rt->rt6i_nfheader_len;
		length += exthdrlen;
		transhdrlen += exthdrlen;
	} else {
		rt = (struct rt6_info *)cork->dst;
		fl6 = &inet->cork.fl.u.ip6;
		opt = np->cork.opt;
		transhdrlen = 0;
		exthdrlen = 0;
		mtu = cork->fragsize;
	}

	hh_len = LL_RESERVED_SPACE(rt->dst.dev);

	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
			(opt ? opt->opt_nflen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);

	if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
		if (cork->length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
			ipv6_local_error(sk, EMSGSIZE, fl6, mtu - exthdrlen);
			return -EMSGSIZE;
		}
	}

	/* For UDP, check if TX timestamp is enabled */
	if (sk->sk_type == SOCK_DGRAM) {
		err = sock_tx_timestamp(sk, &tx_flags);
		if (err)
			goto error;
	}

	/*
	 * Let's try using as much space as possible.
	 * Use MTU if total length of the message fits into the MTU.
	 * Otherwise, we need to reserve fragment header and
	 * fragment alignment (= 8-15 octets, in total).
	 *
	 * Note that we may need to "move" the data from the tail
	 * of the buffer to the new fragment when we split
	 * the message.
	 *
	 * FIXME: It may be fragmented into multiple chunks
	 *        at once if non-fragmentable extension headers
	 *        are too large.
	 * --yoshfuji
	 */

	cork->length += length;
	if (length > mtu) {
		int proto = sk->sk_protocol;

		if (dontfrag && (proto == IPPROTO_UDP || proto == IPPROTO_RAW)) {
			ipv6_local_rxpmtu(sk, fl6, mtu - exthdrlen);
			return -EMSGSIZE;
		}

		if (proto == IPPROTO_UDP &&
		    (rt->dst.dev->features & NETIF_F_UFO)) {

			err = ip6_ufo_append_data(sk, getfrag, from, length,
						  hh_len, fragheaderlen,
						  transhdrlen, mtu, flags);
			if (err)
				goto error;
			return 0;
		}
	}

	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into the current packet. */
		copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;

		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen;
			struct sk_buff *skb_prev;
alloc_new_skb:
			skb_prev = skb;

			/* There's no room in the current skb */
			if (skb_prev)
				fraggap = skb_prev->len - maxfraglen;
			else
				fraggap = 0;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;
			if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
				datalen = maxfraglen - fragheaderlen;

			fraglen = datalen + fragheaderlen;
			if ((flags & MSG_MORE) &&
			    !(rt->dst.dev->features & NETIF_F_SG))
				alloclen = mtu;
			else
				alloclen = datalen + fragheaderlen;

			/*
			 * The last fragment gets additional space at the tail.
			 * Note: we overallocate on fragments with MSG_MORE
			 * because we have no idea if we're the last one.
			 */
			if (datalen == length + fraggap)
				alloclen += rt->dst.trailer_len;

			/*
			 * We just reserve space for the fragment header.
			 * Note: this may be overallocation if the message
			 * (without MSG_MORE) fits into the MTU.
			 */
			alloclen += sizeof(struct frag_hdr);

			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk,
						alloclen + hh_len,
						(flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				if (atomic_read(&sk->sk_wmem_alloc) <=
				    2 * sk->sk_sndbuf)
					skb = sock_wmalloc(sk,
							   alloclen + hh_len, 1,
							   sk->sk_allocation);
				if (unlikely(skb == NULL))
					err = -ENOBUFS;
				else {
					/* Only the initial fragment
					 * is time stamped.
					 */
					tx_flags = 0;
				}
			}
			if (skb == NULL)
				goto error;
			/*
			 *	Fill in the control structures
			 */
			skb->ip_summed = csummode;
			skb->csum = 0;
			/* reserve for fragmentation */
			skb_reserve(skb, hh_len + sizeof(struct frag_hdr));

			if (sk->sk_type == SOCK_DGRAM)
				skb_shinfo(skb)->tx_flags = tx_flags;

			/*
			 *	Find where to start putting bytes
			 */
			data = skb_put(skb, fraglen);
			skb_set_network_header(skb, exthdrlen);
			data += fragheaderlen;
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}
			copy = datalen - transhdrlen - fraggap;
			if (copy < 0) {
				err = -EINVAL;
				kfree_skb(skb);
				goto error;
			} else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= datalen - fraggap;
			transhdrlen = 0;
			exthdrlen = 0;
			csummode = CHECKSUM_NONE;

			/*
			 * Put the packet on the pending queue
			 */
			__skb_queue_tail(&sk->sk_write_queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		if (!(rt->dst.dev->features & NETIF_F_SG)) {
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
						offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else {
			int i = skb_shinfo(skb)->nr_frags;
			skb_frag_t *frag = &skb_shinfo(skb)->frags[i - 1];
			struct page *page = sk->sk_sndmsg_page;
			int off = sk->sk_sndmsg_off;
			unsigned int left;

			if (page && (left = PAGE_SIZE - off) > 0) {
				if (copy >= left)
					copy = left;
				if (page != frag->page) {
					if (i == MAX_SKB_FRAGS) {
						err = -EMSGSIZE;
						goto error;
					}
					get_page(page);
					skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
					frag = &skb_shinfo(skb)->frags[i];
				}
			} else if (i < MAX_SKB_FRAGS) {
				if (copy > PAGE_SIZE)
					copy = PAGE_SIZE;
				page = alloc_pages(sk->sk_allocation, 0);
				if (page == NULL) {
					err = -ENOMEM;
					goto error;
				}
				sk->sk_sndmsg_page = page;
				sk->sk_sndmsg_off = 0;

				skb_fill_page_desc(skb, i, page, 0, 0);
				frag = &skb_shinfo(skb)->frags[i];
			} else {
				err = -EMSGSIZE;
				goto error;
			}
			if (getfrag(from,
				    page_address(frag->page) + frag->page_offset + frag->size,
				    offset, copy, skb->len, skb) < 0) {
				err = -EFAULT;
				goto error;
			}
			sk->sk_sndmsg_off += copy;
			frag->size += copy;
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			atomic_add(copy, &sk->sk_wmem_alloc);
		}
		offset += copy;
		length -= copy;
	}
	return 0;

error:
	cork->length -= length;
	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	return err;
}

static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np)
{
	if (np->cork.opt) {
		kfree(np->cork.opt->dst0opt);
		kfree(np->cork.opt->dst1opt);
		kfree(np->cork.opt->hopopt);
		kfree(np->cork.opt->srcrt);
		kfree(np->cork.opt);
		np->cork.opt = NULL;
	}

	if (inet->cork.base.dst) {
		dst_release(inet->cork.base.dst);
		inet->cork.base.dst = NULL;
		inet->cork.base.flags &= ~IPCORK_ALLFRAG;
	}
	memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
}

int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct net *net = sock_net(sk);
	struct ipv6hdr *hdr;
	struct ipv6_txoptions *opt = np->cork.opt;
	struct rt6_info *rt = (struct rt6_info *)inet->cork.base.dst;
	struct flowi6 *fl6 = &inet->cork.fl.u.ip6;
	unsigned char proto = fl6->flowi6_proto;
	int err = 0;

	if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to the IP header, past the extension headers */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Allow local fragmentation. */
	if (np->pmtudisc < IPV6_PMTUDISC_DO)
		skb->local_df = 1;

	ipv6_addr_copy(final_dst, &fl6->daddr);
	__skb_pull(skb, skb_network_header_len(skb));
	if (opt && opt->opt_flen)
		ipv6_push_frag_opts(skb, opt, &proto);
	if (opt && opt->opt_nflen)
		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	*(__be32 *)hdr = fl6->flowlabel |
		     htonl(0x60000000 | ((int)np->cork.tclass << 20));

	hdr->hop_limit = np->cork.hop_limit;
	hdr->nexthdr = proto;
	ipv6_addr_copy(&hdr->saddr, &fl6->saddr);
	ipv6_addr_copy(&hdr->daddr, final_dst);

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	skb_dst_set(skb, dst_clone(&rt->dst));
	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
	if (proto == IPPROTO_ICMPV6) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		ICMP6MSGOUT_INC_STATS_BH(net, idev, icmp6_hdr(skb)->icmp6_type);
		ICMP6_INC_STATS_BH(net, idev, ICMP6_MIB_OUTMSGS);
	}

	err = ip6_local_out(skb);
	if (err) {
		if (err > 0)
			err = net_xmit_errno(err);
		if (err)
			goto error;
	}

out:
	ip6_cork_release(inet, np);
	return err;
error:
	IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	goto out;
}

void ip6_flush_pending_frames(struct sock *sk)
{
	struct sk_buff *skb;

	while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
		if (skb_dst(skb))
			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
	}

	ip6_cork_release(inet_sk(sk), inet6_sk(sk));
}
1611