/*
 *	IPv6 output functions
 *	Linux INET6 implementation
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	Based on linux/net/ipv4/ip_output.c
 *
 *	This program is free software; you can redistribute it and/or
 *      modify it under the terms of the GNU General Public License
 *      as published by the Free Software Foundation; either version
 *      2 of the License, or (at your option) any later version.
 *
 *	Changes:
 *	A.N.Kuznetsov	:	arithmetic in fragmentation.
 *				extension headers are implemented.
 *				route changes now work.
 *				ip6_forward does not confuse sniffers.
 *				etc.
 *
 *      H. von Brand    :       Added missing #include <linux/string.h>
 *	Imran Patel	: 	frag id should be in NBO
 *      Kazunori MIYAZAWA @USAGI
 *			:       add ip6_append_data and related functions
 *				for datagram xmit
 */
28 
29 #include <linux/errno.h>
30 #include <linux/kernel.h>
31 #include <linux/string.h>
32 #include <linux/socket.h>
33 #include <linux/net.h>
34 #include <linux/netdevice.h>
35 #include <linux/if_arp.h>
36 #include <linux/in6.h>
37 #include <linux/tcp.h>
38 #include <linux/route.h>
39 #include <linux/module.h>
40 
41 #include <linux/netfilter.h>
42 #include <linux/netfilter_ipv6.h>
43 
44 #include <net/sock.h>
45 #include <net/snmp.h>
46 
47 #include <net/ipv6.h>
48 #include <net/ndisc.h>
49 #include <net/protocol.h>
50 #include <net/ip6_route.h>
51 #include <net/addrconf.h>
52 #include <net/rawv6.h>
53 #include <net/icmp.h>
54 #include <net/xfrm.h>
55 #include <net/checksum.h>
56 #include <linux/mroute6.h>
57 
58 static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));
59 
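/*
 * Fill in the IPv6 payload length (zeroed when the payload exceeds
 * IPV6_MAXPLEN, as for jumbograms) and run the netfilter LOCAL_OUT
 * hook.  nf_hook() returns 1 for an "accept without queueing"
 * verdict, in which case ip6_local_out() goes on to dst_output().
 */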
int __ip6_local_out(struct sk_buff *skb)
{
	int len;

	len = skb->len - sizeof(struct ipv6hdr);
	if (len > IPV6_MAXPLEN)
		len = 0;
	ipv6_hdr(skb)->payload_len = htons(len);

	return nf_hook(PF_INET6, NF_INET_LOCAL_OUT, skb, NULL, skb_dst(skb)->dev,
		       dst_output);
}

int ip6_local_out(struct sk_buff *skb)
{
	int err;

	err = __ip6_local_out(skb);
	if (likely(err == 1))
		err = dst_output(skb);

	return err;
}
EXPORT_SYMBOL_GPL(ip6_local_out);

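/*
 * Hand the packet to the neighbour layer: use the cached hardware
 * header if there is one, otherwise let the neighbour entry build
 * it.  With neither available the frame cannot be emitted, so it is
 * counted as OUTNOROUTES and dropped.
 */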
static int ip6_output_finish(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);

	if (dst->hh)
		return neigh_hh_output(dst->hh, skb);
	else if (dst->neighbour)
		return dst->neighbour->output(skb);

	IP6_INC_STATS_BH(dev_net(dst->dev),
			 ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
	kfree_skb(skb);
	return -EINVAL;
}

/* dev_loopback_xmit for use with netfilter. */
static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
{
	skb_reset_mac_header(newskb);
	__skb_pull(newskb, skb_network_offset(newskb));
	newskb->pkt_type = PACKET_LOOPBACK;
	newskb->ip_summed = CHECKSUM_UNNECESSARY;
	WARN_ON(!skb_dst(newskb));

	netif_rx(newskb);
	return 0;
}

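/*
 * Post-routing output for one packet.  For multicast destinations a
 * clone may first be looped back to the local stack (when the
 * sending socket wants multicast loopback and either a multicast
 * router is active or this host is a member of the group), and a
 * hop limit of zero confines the packet to this host entirely.
 */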
static int ip6_output2(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct net_device *dev = dst->dev;

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(skb->sk) &&
		    ((mroute6_socket(dev_net(dev)) &&
		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
		     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
					 &ipv6_hdr(skb)->saddr))) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

			/* Do not check for IFF_ALLMULTI; multicast routing
			   is not supported in any case.
			 */
			if (newskb)
				NF_HOOK(PF_INET6, NF_INET_POST_ROUTING, newskb,
					NULL, newskb->dev,
					ip6_dev_loopback_xmit);

			if (ipv6_hdr(skb)->hop_limit == 0) {
				IP6_INC_STATS(dev_net(dev), idev,
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return 0;
			}
		}

		IP6_UPD_PO_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCAST,
				skb->len);
	}

	return NF_HOOK(PF_INET6, NF_INET_POST_ROUTING, skb, NULL, skb->dev,
		       ip6_output_finish);
}

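/*
 * MTU to fragment against: a socket probing the path MTU
 * (IPV6_PMTUDISC_PROBE) uses the raw device MTU, everyone else the
 * (possibly smaller) MTU cached on the route.
 */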
static inline int ip6_skb_dst_mtu(struct sk_buff *skb)
{
	struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;

	return (np && np->pmtudisc == IPV6_PMTUDISC_PROBE) ?
	       skb_dst(skb)->dev->mtu : dst_mtu(skb_dst(skb));
}

int ip6_output(struct sk_buff *skb)
{
	struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
	if (unlikely(idev->cnf.disable_ipv6)) {
		IP6_INC_STATS(dev_net(skb_dst(skb)->dev), idev,
			      IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
		return 0;
	}

	if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
				dst_allfrag(skb_dst(skb)))
		return ip6_fragment(skb, ip6_output2);
	else
		return ip6_output2(skb);
}

/*
 *	xmit an sk_buff (used by TCP)
 */

int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
	     struct ipv6_txoptions *opt, int ipfragok)
{
	struct net *net = sock_net(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct in6_addr *first_hop = &fl->fl6_dst;
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr;
	u8  proto = fl->proto;
	int seg_len = skb->len;
	int hlimit = -1;
	int tclass = 0;
	u32 mtu;

	if (opt) {
		unsigned int head_room;

		/* First: exthdrs may take lots of space (~8K for now);
		   MAX_HEADER is not enough.
		 */
		head_room = opt->opt_nflen + opt->opt_flen;
		seg_len += head_room;
		head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);

		if (skb_headroom(skb) < head_room) {
			struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
			if (skb2 == NULL) {
				IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return -ENOBUFS;
			}
			kfree_skb(skb);
			skb = skb2;
			if (sk)
				skb_set_owner_w(skb, sk);
		}
		if (opt->opt_flen)
			ipv6_push_frag_opts(skb, opt, &proto);
		if (opt->opt_nflen)
			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
	}

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	/* Allow local fragmentation. */
	if (ipfragok)
		skb->local_df = 1;

	/*
	 *	Fill in the IPv6 header
	 */
	if (np) {
		tclass = np->tclass;
		hlimit = np->hop_limit;
	}
	if (hlimit < 0)
		hlimit = ip6_dst_hoplimit(dst);

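	/*
	 * First 32-bit word of the header: 4-bit version, 8-bit
	 * traffic class, 20-bit flow label.  E.g. tclass 0x28 gives
	 * 0x60000000 | (0x28 << 20) = 0x62800000 before the flow
	 * label (already in network byte order) is OR-ed in.
	 */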
	*(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl->fl6_flowlabel;

	hdr->payload_len = htons(seg_len);
	hdr->nexthdr = proto;
	hdr->hop_limit = hlimit;

	ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
	ipv6_addr_copy(&hdr->daddr, first_hop);

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	mtu = dst_mtu(dst);
	if ((skb->len <= mtu) || skb->local_df || skb_is_gso(skb)) {
		IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_OUT, skb->len);
		return NF_HOOK(PF_INET6, NF_INET_LOCAL_OUT, skb, NULL, dst->dev,
				dst_output);
	}

	if (net_ratelimit())
		printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
	skb->dev = dst->dev;
	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return -EMSGSIZE;
}

EXPORT_SYMBOL(ip6_xmit);

/*
 *	To avoid extra problems ND packets are sent through this
 *	routine. It's code duplication but I really want to avoid
 *	extra checks since ipv6_build_header is used by TCP (which
 *	is performance-critical for us)
 */

int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
	       const struct in6_addr *saddr, const struct in6_addr *daddr,
	       int proto, int len)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct ipv6hdr *hdr;
	int totlen;

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	totlen = len + sizeof(struct ipv6hdr);

	skb_reset_network_header(skb);
	skb_put(skb, sizeof(struct ipv6hdr));
	hdr = ipv6_hdr(skb);

	*(__be32 *)hdr = htonl(0x60000000);

	hdr->payload_len = htons(len);
	hdr->nexthdr = proto;
	hdr->hop_limit = np->hop_limit;

	ipv6_addr_copy(&hdr->saddr, saddr);
	ipv6_addr_copy(&hdr->daddr, daddr);

	return 0;
}

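/*
 * Deliver a packet carrying a Router Alert option to every raw
 * socket registered for this alert value (via the IPV6_ROUTER_ALERT
 * sockopt).  Each listener but the last gets its own clone; the
 * last consumes the original, and a nonzero return tells the caller
 * the skb is gone.
 */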
static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
	struct ip6_ra_chain *ra;
	struct sock *last = NULL;

	read_lock(&ip6_ra_lock);
	for (ra = ip6_ra_chain; ra; ra = ra->next) {
		struct sock *sk = ra->sk;
		if (sk && ra->sel == sel &&
		    (!sk->sk_bound_dev_if ||
		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
			if (last) {
				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
				if (skb2)
					rawv6_rcv(last, skb2);
			}
			last = sk;
		}
	}

	if (last) {
		rawv6_rcv(last, skb);
		read_unlock(&ip6_ra_lock);
		return 1;
	}
	read_unlock(&ip6_ra_lock);
	return 0;
}

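/*
 * Decide what to do with a packet whose destination we proxy NDP
 * for: 1 means hand it to local input (unicast neighbour discovery
 * aimed at the proxied address), 0 means forward it, and -1 means
 * the link-local destination cannot be proxied, so the link failure
 * has been signalled and the packet must be dropped.
 */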
static int ip6_forward_proxy_check(struct sk_buff *skb)
{
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	u8 nexthdr = hdr->nexthdr;
	int offset;

	if (ipv6_ext_hdr(nexthdr)) {
		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr);
		if (offset < 0)
			return 0;
	} else
		offset = sizeof(struct ipv6hdr);

	if (nexthdr == IPPROTO_ICMPV6) {
		struct icmp6hdr *icmp6;

		if (!pskb_may_pull(skb, (skb_network_header(skb) +
					 offset + 1 - skb->data)))
			return 0;

		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);

		switch (icmp6->icmp6_type) {
		case NDISC_ROUTER_SOLICITATION:
		case NDISC_ROUTER_ADVERTISEMENT:
		case NDISC_NEIGHBOUR_SOLICITATION:
		case NDISC_NEIGHBOUR_ADVERTISEMENT:
		case NDISC_REDIRECT:
			/* Pass unicast neighbour discovery messages
			 * destined for the proxied address to the
			 * input function.
			 */
			return 1;
		default:
			break;
		}
	}

	/*
	 * The proxying router can't forward traffic sent to a link-local
	 * address, so signal the sender and discard the packet. This
	 * behavior is clarified by the MIPv6 specification.
	 */
	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
		dst_link_failure(skb);
		return -1;
	}

	return 0;
}

static inline int ip6_forward_finish(struct sk_buff *skb)
{
	return dst_output(skb);
}

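/*
 * Forwarding path: check that forwarding is enabled, divert Router
 * Alert packets to interested sockets, check and decrement the hop
 * limit, honour NDP proxying and IPsec policy, emit a redirect when
 * the packet leaves on the interface it arrived on, enforce the
 * path MTU, and finally run the netfilter FORWARD hook on the way
 * to dst_output().
 */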
int ip6_forward(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	struct inet6_skb_parm *opt = IP6CB(skb);
	struct net *net = dev_net(dst->dev);

	if (net->ipv6.devconf_all->forwarding == 0)
		goto error;

	if (skb_warn_if_lro(skb))
		goto drop;

	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
		goto drop;
	}

	skb_forward_csum(skb);

	/*
	 *	We DO NOT make any processing on
	 *	RA packets, pushing them to user level AS IS
	 *	without any WARRANTY that application will be able
	 *	to interpret them. The reason is that we
	 *	cannot make anything clever here.
	 *
	 *	We are not end-node, so that if packet contains
	 *	AH/ESP, we cannot make anything.
	 *	Defragmentation also would be a mistake; RA packets
	 *	cannot be fragmented, because there is no warranty
	 *	that different fragments will go along one path. --ANK
	 */
	if (opt->ra) {
		u8 *ptr = skb_network_header(skb) + opt->ra;
		if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
			return 0;
	}

	/*
	 *	check and decrement ttl
	 */
	if (hdr->hop_limit <= 1) {
		/* Force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT,
			    0, skb->dev);
		IP6_INC_STATS_BH(net,
				 ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);

		kfree_skb(skb);
		return -ETIMEDOUT;
	}

	/* XXX: idev->cnf.proxy_ndp? */
	if (net->ipv6.devconf_all->proxy_ndp &&
	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
		int proxied = ip6_forward_proxy_check(skb);
		if (proxied > 0)
			return ip6_input(skb);
		else if (proxied < 0) {
			IP6_INC_STATS(net, ip6_dst_idev(dst),
				      IPSTATS_MIB_INDISCARDS);
			goto drop;
		}
	}

	if (!xfrm6_route_forward(skb)) {
		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
		goto drop;
	}
	dst = skb_dst(skb);

	/* IPv6 specs say nothing about it, but it is clear that we cannot
	   send redirects to source routed frames.
	   We don't send redirects to frames decapsulated from IPsec.
	 */
	if (skb->dev == dst->dev && dst->neighbour && opt->srcrt == 0 &&
	    !skb_sec_path(skb)) {
		struct in6_addr *target = NULL;
		struct rt6_info *rt;
		struct neighbour *n = dst->neighbour;

		/*
		 *	incoming and outgoing devices are the same
		 *	send a redirect.
		 */

		rt = (struct rt6_info *) dst;
		if ((rt->rt6i_flags & RTF_GATEWAY))
			target = (struct in6_addr *)&n->primary_key;
		else
			target = &hdr->daddr;

		/* Limit redirects both by destination (here)
		   and by source (inside ndisc_send_redirect)
		 */
		if (xrlim_allow(dst, 1*HZ))
			ndisc_send_redirect(skb, n, target);
	} else {
		int addrtype = ipv6_addr_type(&hdr->saddr);

		/* This check is security critical. */
		if (addrtype == IPV6_ADDR_ANY ||
		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
			goto error;
		if (addrtype & IPV6_ADDR_LINKLOCAL) {
			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
				ICMPV6_NOT_NEIGHBOUR, 0, skb->dev);
			goto error;
		}
	}

	if (skb->len > dst_mtu(dst)) {
		/* Again, force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, dst_mtu(dst), skb->dev);
		IP6_INC_STATS_BH(net,
				 ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS);
		IP6_INC_STATS_BH(net,
				 ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	if (skb_cow(skb, dst->dev->hard_header_len)) {
		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
		goto drop;
	}

	hdr = ipv6_hdr(skb);

	/* Mangling hops number delayed to point after skb COW */

	hdr->hop_limit--;

	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
	return NF_HOOK(PF_INET6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
		       ip6_forward_finish);

error:
	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
drop:
	kfree_skb(skb);
	return -EINVAL;
}

static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	skb_dst_drop(to);
	skb_dst_set(to, dst_clone(skb_dst(from)));
	to->dev = from->dev;
	to->mark = from->mark;

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
	nf_copy(to, from);
#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
    defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
	to->nf_trace = from->nf_trace;
#endif
	skb_copy_secmark(to, from);
}

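/*
 * Measure the "unfragmentable part" of the packet: the IPv6 header
 * plus any extension headers that must precede the fragment header
 * (hop-by-hop, routing, and destination options that either precede
 * the routing header or carry a home address option).  On return
 * *nexthdr points at the nexthdr byte behind which the fragment
 * header will be spliced.
 */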
int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
{
	u16 offset = sizeof(struct ipv6hdr);
	struct ipv6_opt_hdr *exthdr =
				(struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1);
	unsigned int packet_len = skb->tail - skb->network_header;
	int found_rhdr = 0;
	*nexthdr = &ipv6_hdr(skb)->nexthdr;

	while (offset + 1 <= packet_len) {
		switch (**nexthdr) {
		case NEXTHDR_HOP:
			break;
		case NEXTHDR_ROUTING:
			found_rhdr = 1;
			break;
		case NEXTHDR_DEST:
#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
			if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0)
				break;
#endif
			if (found_rhdr)
				return offset;
			break;
		default:
			return offset;
		}

		offset += ipv6_optlen(exthdr);
		*nexthdr = &exthdr->nexthdr;
		exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) +
						 offset);
	}

	return offset;
}

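/*
 * Fragment an oversized packet and feed each fragment to @output.
 * The fast path reuses the skbs already chained on the frag_list,
 * prepending a copy of the unfragmentable part to each; when the
 * geometry does not allow that, the slow path allocates fresh skbs
 * and copies the payload out in MTU-sized, 8-byte-aligned blocks.
 */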
static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
{
	struct sk_buff *frag;
	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
	struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
	struct ipv6hdr *tmp_hdr;
	struct frag_hdr *fh;
	unsigned int mtu, hlen, left, len;
	__be32 frag_id = 0;
	int ptr, offset = 0, err = 0;
	u8 *prevhdr, nexthdr = 0;
	struct net *net = dev_net(skb_dst(skb)->dev);

	hlen = ip6_find_1stfragopt(skb, &prevhdr);
	nexthdr = *prevhdr;

	mtu = ip6_skb_dst_mtu(skb);

	/* We must not fragment if the socket is set to force MTU discovery
	 * or if the skb was not generated by a local socket.  (This last
	 * check should be redundant, but it's free.)
	 */
	if (!skb->local_df) {
		skb->dev = skb_dst(skb)->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	if (np && np->frag_size < mtu) {
		if (np->frag_size)
			mtu = np->frag_size;
	}
	mtu -= hlen + sizeof(struct frag_hdr);

	if (skb_has_frags(skb)) {
		int first_len = skb_pagelen(skb);
		int truesizes = 0;

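		/* Every fragment except the last must carry a payload
		 * that is a multiple of 8 bytes (the fragment offset
		 * field counts 8-byte units), fit within the MTU, and
		 * leave headroom for the headers we prepend; anything
		 * else takes the copying slow path.
		 */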
		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    skb_cloned(skb))
			goto slow_path;

		skb_walk_frags(skb, frag) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < hlen)
			    goto slow_path;

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path;

			BUG_ON(frag->sk);
			if (skb->sk) {
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
				truesizes += frag->truesize;
			}
		}

		err = 0;
		offset = 0;
		frag = skb_shinfo(skb)->frag_list;
		skb_frag_list_init(skb);
		/* BUILD HEADER */

		*prevhdr = NEXTHDR_FRAGMENT;
		tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
		if (!tmp_hdr) {
			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_FRAGFAILS);
			return -ENOMEM;
		}

		__skb_pull(skb, hlen);
		fh = (struct frag_hdr *)__skb_push(skb, sizeof(struct frag_hdr));
		__skb_push(skb, hlen);
		skb_reset_network_header(skb);
		memcpy(skb_network_header(skb), tmp_hdr, hlen);

		ipv6_select_ident(fh);
		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		fh->frag_off = htons(IP6_MF);
		frag_id = fh->identification;

		first_len = skb_pagelen(skb);
		skb->data_len = first_len - skb_headlen(skb);
		skb->truesize -= truesizes;
		skb->len = first_len;
		ipv6_hdr(skb)->payload_len = htons(first_len -
						   sizeof(struct ipv6hdr));

		dst_hold(&rt->u.dst);

		for (;;) {
			/* Prepare header of the next frame,
			 * before the previous one goes down. */
			if (frag) {
				frag->ip_summed = CHECKSUM_NONE;
				skb_reset_transport_header(frag);
				fh = (struct frag_hdr *)__skb_push(frag, sizeof(struct frag_hdr));
				__skb_push(frag, hlen);
				skb_reset_network_header(frag);
				memcpy(skb_network_header(frag), tmp_hdr,
				       hlen);
				offset += skb->len - hlen - sizeof(struct frag_hdr);
				fh->nexthdr = nexthdr;
				fh->reserved = 0;
				fh->frag_off = htons(offset);
				if (frag->next != NULL)
					fh->frag_off |= htons(IP6_MF);
				fh->identification = frag_id;
				ipv6_hdr(frag)->payload_len =
						htons(frag->len -
						      sizeof(struct ipv6hdr));
				ip6_copy_metadata(frag, skb);
			}

			err = output(skb);
			if (!err)
				IP6_INC_STATS(net, ip6_dst_idev(&rt->u.dst),
					      IPSTATS_MIB_FRAGCREATES);

			if (err || !frag)
				break;

			skb = frag;
			frag = skb->next;
			skb->next = NULL;
		}

		kfree(tmp_hdr);

		if (err == 0) {
			IP6_INC_STATS(net, ip6_dst_idev(&rt->u.dst),
				      IPSTATS_MIB_FRAGOKS);
			dst_release(&rt->u.dst);
			return 0;
		}

		while (frag) {
			skb = frag->next;
			kfree_skb(frag);
			frag = skb;
		}

		IP6_INC_STATS(net, ip6_dst_idev(&rt->u.dst),
			      IPSTATS_MIB_FRAGFAILS);
		dst_release(&rt->u.dst);
		return err;
	}

slow_path:
	left = skb->len - hlen;		/* Space per frame */
	ptr = hlen;			/* Where to start from */

	/*
	 *	Fragment the datagram.
	 */

	*prevhdr = NEXTHDR_FRAGMENT;

	/*
	 *	Keep copying data until we run out.
	 */
	while (left > 0) {
		len = left;
		/* IF: it doesn't fit, use 'mtu' - the data space left */
		if (len > mtu)
			len = mtu;
		/* IF: we are not sending up to and including the packet end
		   then align the next start on an eight byte boundary */
		if (len < left) {
			len &= ~7;
		}
		/*
		 *	Allocate buffer.
		 */

		if ((frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
				      LL_ALLOCATED_SPACE(rt->u.dst.dev),
				      GFP_ATOMIC)) == NULL) {
			NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_FRAGFAILS);
			err = -ENOMEM;
			goto fail;
		}

		/*
		 *	Set up data on packet
		 */

		ip6_copy_metadata(frag, skb);
		skb_reserve(frag, LL_RESERVED_SPACE(rt->u.dst.dev));
		skb_put(frag, len + hlen + sizeof(struct frag_hdr));
		skb_reset_network_header(frag);
		fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
		frag->transport_header = (frag->network_header + hlen +
					  sizeof(struct frag_hdr));

		/*
		 *	Charge the memory for the fragment to any owner
		 *	it might possess
		 */
		if (skb->sk)
			skb_set_owner_w(frag, skb->sk);

		/*
		 *	Copy the packet header into the new buffer.
		 */
		skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);

		/*
		 *	Build fragment header.
		 */
		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		if (!frag_id) {
			ipv6_select_ident(fh);
			frag_id = fh->identification;
		} else
			fh->identification = frag_id;

		/*
		 *	Copy a block of the IP datagram.
		 */
		if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len))
			BUG();
		left -= len;

		fh->frag_off = htons(offset);
		if (left > 0)
			fh->frag_off |= htons(IP6_MF);
		ipv6_hdr(frag)->payload_len = htons(frag->len -
						    sizeof(struct ipv6hdr));

		ptr += len;
		offset += len;

		/*
		 *	Put this fragment into the sending queue.
		 */
		err = output(frag);
		if (err)
			goto fail;

		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_FRAGCREATES);
	}
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGOKS);
	kfree_skb(skb);
	return err;

fail:
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return err;
}

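/*
 * Nonzero when a cached route can no longer be trusted for this
 * flow: a host route must still match the destination exactly,
 * while any other route is accepted only if the destination matches
 * the address it was last used for (addr_cache).
 */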
static inline int ip6_rt_check(struct rt6key *rt_key,
			       struct in6_addr *fl_addr,
			       struct in6_addr *addr_cache)
{
	return ((rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
		(addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache)));
}

static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
					  struct dst_entry *dst,
					  struct flowi *fl)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct rt6_info *rt = (struct rt6_info *)dst;

	if (!dst)
		goto out;

	/* Yes, checking route validity in the unconnected
	 * case is not very simple. Take into account
	 * that we do not support routing by source, TOS,
	 * and MSG_DONTROUTE		--ANK (980726)
	 *
	 * 1. ip6_rt_check(): If route was host route,
	 *    check that cached destination is current.
	 *    If it is network route, we still may
	 *    check its validity using saved pointer
	 *    to the last used address: daddr_cache.
	 *    We do not want to save whole address now,
	 *    (because main consumer of this service
	 *    is tcp, which does not have this problem),
	 *    so that the last trick works only on connected
	 *    sockets.
	 * 2. oif also should be the same.
	 */
	if (ip6_rt_check(&rt->rt6i_dst, &fl->fl6_dst, np->daddr_cache) ||
#ifdef CONFIG_IPV6_SUBTREES
	    ip6_rt_check(&rt->rt6i_src, &fl->fl6_src, np->saddr_cache) ||
#endif
	    (fl->oif && fl->oif != dst->dev->ifindex)) {
		dst_release(dst);
		dst = NULL;
	}

out:
	return dst;
}

static int ip6_dst_lookup_tail(struct sock *sk,
			       struct dst_entry **dst, struct flowi *fl)
{
	int err;
	struct net *net = sock_net(sk);

	if (*dst == NULL)
		*dst = ip6_route_output(net, sk, fl);

	if ((err = (*dst)->error))
		goto out_err_release;

	if (ipv6_addr_any(&fl->fl6_src)) {
		err = ipv6_dev_get_saddr(net, ip6_dst_idev(*dst)->dev,
					 &fl->fl6_dst,
					 sk ? inet6_sk(sk)->srcprefs : 0,
					 &fl->fl6_src);
		if (err)
			goto out_err_release;
	}

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	/*
	 * Here if the dst entry we've looked up
	 * has a neighbour entry that is in the INCOMPLETE
	 * state and the src address from the flow is
	 * marked as OPTIMISTIC, we release the found
	 * dst entry and replace it instead with the
	 * dst entry of the nexthop router
	 */
	if ((*dst)->neighbour && !((*dst)->neighbour->nud_state & NUD_VALID)) {
		struct inet6_ifaddr *ifp;
		struct flowi fl_gw;
		int redirect;

		ifp = ipv6_get_ifaddr(net, &fl->fl6_src,
				      (*dst)->dev, 1);

		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
		if (ifp)
			in6_ifa_put(ifp);

		if (redirect) {
			/*
			 * We need to get the dst entry for the
			 * default router instead
			 */
			dst_release(*dst);
			memcpy(&fl_gw, fl, sizeof(struct flowi));
			memset(&fl_gw.fl6_dst, 0, sizeof(struct in6_addr));
			*dst = ip6_route_output(net, sk, &fl_gw);
			if ((err = (*dst)->error))
				goto out_err_release;
		}
	}
#endif

	return 0;

out_err_release:
	if (err == -ENETUNREACH)
		IP6_INC_STATS_BH(net, NULL, IPSTATS_MIB_OUTNOROUTES);
	dst_release(*dst);
	*dst = NULL;
	return err;
}

/**
 *	ip6_dst_lookup - perform route lookup on flow
 *	@sk: socket which provides route info
 *	@dst: pointer to dst_entry * for result
 *	@fl: flow to lookup
 *
 *	This function performs a route lookup on the given flow.
 *
 *	It returns zero on success, or a standard errno code on error.
 */
int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
{
	*dst = NULL;
	return ip6_dst_lookup_tail(sk, dst, fl);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup);

/**
 *	ip6_sk_dst_lookup - perform socket cached route lookup on flow
 *	@sk: socket which provides the dst cache and route info
 *	@dst: pointer to dst_entry * for result
 *	@fl: flow to lookup
 *
 *	This function performs a route lookup on the given flow with the
 *	possibility of using the cached route in the socket if it is valid.
 *	It will take the socket dst lock when operating on the dst cache.
 *	As a result, this function can only be used in process context.
 *
 *	It returns zero on success, or a standard errno code on error.
 */
int ip6_sk_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
{
	*dst = NULL;
	if (sk) {
		*dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
		*dst = ip6_sk_dst_check(sk, *dst, fl);
	}

	return ip6_dst_lookup_tail(sk, dst, fl);
}
EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup);

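/*
 * UFO path for ip6_append_data(): rather than building MTU-sized
 * fragments in software, queue one large skb and let the device (or
 * the GSO layer) split it.  gso_size is the per-fragment payload,
 * rounded down to a multiple of 8 as the fragment header requires.
 */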
static inline int ip6_ufo_append_data(struct sock *sk,
			int getfrag(void *from, char *to, int offset, int len,
			int odd, struct sk_buff *skb),
			void *from, int length, int hh_len, int fragheaderlen,
			int transhdrlen, int mtu, unsigned int flags)
{
	struct sk_buff *skb;
	int err;

	/* There is support for UDP large send offload by the network
	 * device, so create one single skb packet containing the complete
	 * udp datagram
	 */
	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
		skb = sock_alloc_send_skb(sk,
			hh_len + fragheaderlen + transhdrlen + 20,
			(flags & MSG_DONTWAIT), &err);
		if (skb == NULL)
			return -ENOMEM;

		/* reserve space for hardware header */
		skb_reserve(skb, hh_len);

		/* create space for UDP/IP header */
		skb_put(skb, fragheaderlen + transhdrlen);

		/* initialize network header pointer */
		skb_reset_network_header(skb);

		/* initialize protocol header pointer */
		skb->transport_header = skb->network_header + fragheaderlen;

		skb->ip_summed = CHECKSUM_PARTIAL;
		skb->csum = 0;
		sk->sk_sndmsg_off = 0;
	}

	err = skb_append_datato_frags(sk, skb, getfrag, from,
				      (length - transhdrlen));
	if (!err) {
		struct frag_hdr fhdr;

		/* Specify the length of each IPv6 datagram fragment.
		 * It has to be a multiple of 8.
		 */
		skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
					     sizeof(struct frag_hdr)) & ~7;
		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
		ipv6_select_ident(&fhdr);
		skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
		__skb_queue_tail(&sk->sk_write_queue, skb);

		return 0;
	}
	/* There is not enough support to do UDP LSO,
	 * so follow the normal path
	 */
	kfree_skb(skb);

	return err;
}

static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
					       gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
						gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

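/*
 * Queue user data on sk->sk_write_queue, packetized into skbs no
 * larger than the path MTU.  The first call on an empty queue
 * "corks" the socket: it pins the route, duplicates the tx options,
 * and records hop limit, traffic class and fragment size.  Later
 * calls keep appending; nothing is transmitted until
 * ip6_push_pending_frames().
 */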
int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
	int offset, int len, int odd, struct sk_buff *skb),
	void *from, int length, int transhdrlen,
	int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi *fl,
	struct rt6_info *rt, unsigned int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct sk_buff *skb;
	unsigned int maxfraglen, fragheaderlen;
	int exthdrlen;
	int hh_len;
	int mtu;
	int copy;
	int err;
	int offset = 0;
	int csummode = CHECKSUM_NONE;

	if (flags & MSG_PROBE)
		return 0;
	if (skb_queue_empty(&sk->sk_write_queue)) {
		/*
		 * setup for corking
		 */
		if (opt) {
			if (WARN_ON(np->cork.opt))
				return -EINVAL;

			np->cork.opt = kmalloc(opt->tot_len, sk->sk_allocation);
			if (unlikely(np->cork.opt == NULL))
				return -ENOBUFS;

			np->cork.opt->tot_len = opt->tot_len;
			np->cork.opt->opt_flen = opt->opt_flen;
			np->cork.opt->opt_nflen = opt->opt_nflen;

			np->cork.opt->dst0opt = ip6_opt_dup(opt->dst0opt,
							    sk->sk_allocation);
			if (opt->dst0opt && !np->cork.opt->dst0opt)
				return -ENOBUFS;

			np->cork.opt->dst1opt = ip6_opt_dup(opt->dst1opt,
							    sk->sk_allocation);
			if (opt->dst1opt && !np->cork.opt->dst1opt)
				return -ENOBUFS;

			np->cork.opt->hopopt = ip6_opt_dup(opt->hopopt,
							   sk->sk_allocation);
			if (opt->hopopt && !np->cork.opt->hopopt)
				return -ENOBUFS;

			np->cork.opt->srcrt = ip6_rthdr_dup(opt->srcrt,
							    sk->sk_allocation);
			if (opt->srcrt && !np->cork.opt->srcrt)
				return -ENOBUFS;

			/* need source address above --miyazawa */
		}
		dst_hold(&rt->u.dst);
		inet->cork.dst = &rt->u.dst;
		inet->cork.fl = *fl;
		np->cork.hop_limit = hlimit;
		np->cork.tclass = tclass;
		mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
		      rt->u.dst.dev->mtu : dst_mtu(rt->u.dst.path);
		if (np->frag_size < mtu) {
			if (np->frag_size)
				mtu = np->frag_size;
		}
		inet->cork.fragsize = mtu;
		if (dst_allfrag(rt->u.dst.path))
			inet->cork.flags |= IPCORK_ALLFRAG;
		inet->cork.length = 0;
		sk->sk_sndmsg_page = NULL;
		sk->sk_sndmsg_off = 0;
		exthdrlen = rt->u.dst.header_len + (opt ? opt->opt_flen : 0) -
			    rt->rt6i_nfheader_len;
		length += exthdrlen;
		transhdrlen += exthdrlen;
	} else {
		rt = (struct rt6_info *)inet->cork.dst;
		fl = &inet->cork.fl;
		opt = np->cork.opt;
		transhdrlen = 0;
		exthdrlen = 0;
		mtu = inet->cork.fragsize;
	}

	hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);

	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
			(opt ? opt->opt_nflen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
		     sizeof(struct frag_hdr);

	if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
		if (inet->cork.length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
			ipv6_local_error(sk, EMSGSIZE, fl, mtu - exthdrlen);
			return -EMSGSIZE;
		}
	}

	/*
	 * Let's try using as much space as possible.
	 * Use MTU if total length of the message fits into the MTU.
	 * Otherwise, we need to reserve fragment header and
	 * fragment alignment (= 8-15 octets, in total).
	 *
	 * Note that we may need to "move" the data from the tail
	 * of the buffer to the new fragment when we split
	 * the message.
	 *
	 * FIXME: It may be fragmented into multiple chunks
	 *        at once if non-fragmentable extension headers
	 *        are too large.
	 * --yoshfuji
	 */

	inet->cork.length += length;
	if (((length > mtu) && (sk->sk_protocol == IPPROTO_UDP)) &&
	    (rt->u.dst.dev->features & NETIF_F_UFO)) {
		err = ip6_ufo_append_data(sk, getfrag, from, length, hh_len,
					  fragheaderlen, transhdrlen, mtu,
					  flags);
		if (err)
			goto error;
		return 0;
	}

	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into current packet. */
		copy = (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;

		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen;
			struct sk_buff *skb_prev;
alloc_new_skb:
			skb_prev = skb;

			/* There's no room in the current skb */
			if (skb_prev)
				fraggap = skb_prev->len - maxfraglen;
			else
				fraggap = 0;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;
			if (datalen > (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
				datalen = maxfraglen - fragheaderlen;

			fraglen = datalen + fragheaderlen;
			if ((flags & MSG_MORE) &&
			    !(rt->u.dst.dev->features & NETIF_F_SG))
				alloclen = mtu;
			else
				alloclen = datalen + fragheaderlen;

			/*
			 * The last fragment gets additional space at tail.
			 * Note: we overallocate on fragments with MSG_MORE
			 * because we have no idea if we're the last one.
			 */
			if (datalen == length + fraggap)
				alloclen += rt->u.dst.trailer_len;

			/*
			 * We just reserve space for fragment header.
			 * Note: this may be overallocation if the message
			 * (without MSG_MORE) fits into the MTU.
			 */
			alloclen += sizeof(struct frag_hdr);

			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk,
						alloclen + hh_len,
						(flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				if (atomic_read(&sk->sk_wmem_alloc) <=
				    2 * sk->sk_sndbuf)
					skb = sock_wmalloc(sk,
							   alloclen + hh_len, 1,
							   sk->sk_allocation);
				if (unlikely(skb == NULL))
					err = -ENOBUFS;
			}
			if (skb == NULL)
				goto error;
			/*
			 *	Fill in the control structures
			 */
			skb->ip_summed = csummode;
			skb->csum = 0;
			/* reserve for fragmentation */
			skb_reserve(skb, hh_len + sizeof(struct frag_hdr));

			/*
			 *	Find where to start putting bytes
			 */
			data = skb_put(skb, fraglen);
			skb_set_network_header(skb, exthdrlen);
			data += fragheaderlen;
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}
			copy = datalen - transhdrlen - fraggap;
			if (copy < 0) {
				err = -EINVAL;
				kfree_skb(skb);
				goto error;
			} else if (copy > 0 && getfrag(from, data + transhdrlen,
						       offset, copy, fraggap,
						       skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= datalen - fraggap;
			transhdrlen = 0;
			exthdrlen = 0;
			csummode = CHECKSUM_NONE;

			/*
			 * Put the packet on the pending queue
			 */
			__skb_queue_tail(&sk->sk_write_queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		if (!(rt->u.dst.dev->features & NETIF_F_SG)) {
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
						offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else {
			int i = skb_shinfo(skb)->nr_frags;
			skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
			struct page *page = sk->sk_sndmsg_page;
			int off = sk->sk_sndmsg_off;
			unsigned int left;

			if (page && (left = PAGE_SIZE - off) > 0) {
				if (copy >= left)
					copy = left;
				if (page != frag->page) {
					if (i == MAX_SKB_FRAGS) {
						err = -EMSGSIZE;
						goto error;
					}
					get_page(page);
					skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
					frag = &skb_shinfo(skb)->frags[i];
				}
			} else if (i < MAX_SKB_FRAGS) {
				if (copy > PAGE_SIZE)
					copy = PAGE_SIZE;
				page = alloc_pages(sk->sk_allocation, 0);
				if (page == NULL) {
					err = -ENOMEM;
					goto error;
				}
				sk->sk_sndmsg_page = page;
				sk->sk_sndmsg_off = 0;

				skb_fill_page_desc(skb, i, page, 0, 0);
				frag = &skb_shinfo(skb)->frags[i];
			} else {
				err = -EMSGSIZE;
				goto error;
			}
			if (getfrag(from,
				    page_address(frag->page) +
				    frag->page_offset + frag->size,
				    offset, copy, skb->len, skb) < 0) {
				err = -EFAULT;
				goto error;
			}
			sk->sk_sndmsg_off += copy;
			frag->size += copy;
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			atomic_add(copy, &sk->sk_wmem_alloc);
		}
		offset += copy;
		length -= copy;
	}
	return 0;
error:
	inet->cork.length -= length;
	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	return err;
}

static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np)
{
	if (np->cork.opt) {
		kfree(np->cork.opt->dst0opt);
		kfree(np->cork.opt->dst1opt);
		kfree(np->cork.opt->hopopt);
		kfree(np->cork.opt->srcrt);
		kfree(np->cork.opt);
		np->cork.opt = NULL;
	}

	if (inet->cork.dst) {
		dst_release(inet->cork.dst);
		inet->cork.dst = NULL;
		inet->cork.flags &= ~IPCORK_ALLFRAG;
	}
	memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
}

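/*
 * Splice everything queued by ip6_append_data() into a single skb
 * chain (the trailing skbs become the frag_list of the first),
 * prepend the extension headers and IPv6 header recorded at cork
 * time, and send the result through ip6_local_out().  The cork
 * state is released on both the success and error paths.
 */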
int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct net *net = sock_net(sk);
	struct ipv6hdr *hdr;
	struct ipv6_txoptions *opt = np->cork.opt;
	struct rt6_info *rt = (struct rt6_info *)inet->cork.dst;
	struct flowi *fl = &inet->cork.fl;
	unsigned char proto = fl->proto;
	int err = 0;

	if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Allow local fragmentation. */
	if (np->pmtudisc < IPV6_PMTUDISC_DO)
		skb->local_df = 1;

	ipv6_addr_copy(final_dst, &fl->fl6_dst);
	__skb_pull(skb, skb_network_header_len(skb));
	if (opt && opt->opt_flen)
		ipv6_push_frag_opts(skb, opt, &proto);
	if (opt && opt->opt_nflen)
		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	*(__be32 *)hdr = fl->fl6_flowlabel |
		     htonl(0x60000000 | ((int)np->cork.tclass << 20));

	hdr->hop_limit = np->cork.hop_limit;
	hdr->nexthdr = proto;
	ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
	ipv6_addr_copy(&hdr->daddr, final_dst);

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	skb_dst_set(skb, dst_clone(&rt->u.dst));
	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
	if (proto == IPPROTO_ICMPV6) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		ICMP6MSGOUT_INC_STATS_BH(net, idev, icmp6_hdr(skb)->icmp6_type);
		ICMP6_INC_STATS_BH(net, idev, ICMP6_MIB_OUTMSGS);
	}

	err = ip6_local_out(skb);
	if (err) {
		if (err > 0)
			err = net_xmit_errno(err);
		if (err)
			goto error;
	}

out:
	ip6_cork_release(inet, np);
	return err;
error:
	IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	goto out;
}

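/*
 * Drop everything still sitting on the write queue, counting each
 * routed skb as an output discard, and release the cork state.
 */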
void ip6_flush_pending_frames(struct sock *sk)
{
	struct sk_buff *skb;

	while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
		if (skb_dst(skb))
			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
	}

	ip6_cork_release(inet_sk(sk), inet6_sk(sk));
}
1539