xref: /openbmc/linux/net/ipv6/ip6_output.c (revision 0d456bad)
/*
 *	IPv6 output functions
 *	Linux INET6 implementation
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	Based on linux/net/ipv4/ip_output.c
 *
 *	This program is free software; you can redistribute it and/or
 *      modify it under the terms of the GNU General Public License
 *      as published by the Free Software Foundation; either version
 *      2 of the License, or (at your option) any later version.
 *
 *	Changes:
 *	A.N.Kuznetsov	:	arithmetic in fragmentation.
 *				extension headers are implemented.
 *				route changes now work.
 *				ip6_forward does not confuse sniffers.
 *				etc.
 *
 *      H. von Brand    :       Added missing #include <linux/string.h>
 *	Imran Patel	: 	frag id should be in NBO
 *      Kazunori MIYAZAWA @USAGI
 *			:       add ip6_append_data and related functions
 *				for datagram xmit
 */

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/in6.h>
#include <linux/tcp.h>
#include <linux/route.h>
#include <linux/module.h>
#include <linux/slab.h>

#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>

#include <net/sock.h>
#include <net/snmp.h>

#include <net/ipv6.h>
#include <net/ndisc.h>
#include <net/protocol.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
#include <net/rawv6.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/checksum.h>
#include <linux/mroute6.h>

int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));

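/* Fill in the final payload_len and run the netfilter LOCAL_OUT hook.
 * A length above IPV6_MAXPLEN is clamped to 0, the value a jumbogram
 * carries in the fixed header (RFC 2675).  nf_hook() returns 1 when the
 * packet may proceed, in which case ip6_local_out() below passes it on
 * to dst_output().
 */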
int __ip6_local_out(struct sk_buff *skb)
{
	int len;

	len = skb->len - sizeof(struct ipv6hdr);
	if (len > IPV6_MAXPLEN)
		len = 0;
	ipv6_hdr(skb)->payload_len = htons(len);

	return nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
		       skb_dst(skb)->dev, dst_output);
}

int ip6_local_out(struct sk_buff *skb)
{
	int err;

	err = __ip6_local_out(skb);
	if (likely(err == 1))
		err = dst_output(skb);

	return err;
}
EXPORT_SYMBOL_GPL(ip6_local_out);

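/* Final transmit step: loop back a copy of multicast packets that local
 * listeners (or a multicast router socket) need to see, account the
 * packet in the OutMcast counters, and hand it to the neighbour layer
 * for transmission on the wire.
 */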
static int ip6_finish_output2(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct net_device *dev = dst->dev;
	struct neighbour *neigh;
	struct rt6_info *rt;

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(skb->sk) &&
		    ((mroute6_socket(dev_net(dev), skb) &&
		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
		     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
					 &ipv6_hdr(skb)->saddr))) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

			/* Do not check for IFF_ALLMULTI; multicast routing
			   is not supported in any case.
			 */
			if (newskb)
				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
					newskb, NULL, newskb->dev,
					dev_loopback_xmit);

			if (ipv6_hdr(skb)->hop_limit == 0) {
				IP6_INC_STATS(dev_net(dev), idev,
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return 0;
			}
		}

		IP6_UPD_PO_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCAST,
				skb->len);
	}

	rt = (struct rt6_info *) dst;
	neigh = rt->n;
	if (neigh)
		return dst_neigh_output(dst, neigh, skb);

	IP6_INC_STATS_BH(dev_net(dst->dev),
			 ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
	kfree_skb(skb);
	return -EINVAL;
}

static int ip6_finish_output(struct sk_buff *skb)
{
	if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
	    dst_allfrag(skb_dst(skb)))
		return ip6_fragment(skb, ip6_finish_output2);
	else
		return ip6_finish_output2(skb);
}

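/* dst_output() entry point for IPv6.  Discards the packet if IPv6 is
 * administratively disabled on the outgoing device, otherwise runs the
 * POST_ROUTING hook (skipped for packets flagged IP6SKB_REROUTED) and
 * finishes by fragmenting the packet if it does not fit the path MTU.
 */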
int ip6_output(struct sk_buff *skb)
{
	struct net_device *dev = skb_dst(skb)->dev;
	struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

	if (unlikely(idev->cnf.disable_ipv6)) {
		IP6_INC_STATS(dev_net(dev), idev,
			      IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
		return 0;
	}

	return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, skb, NULL, dev,
			    ip6_finish_output,
			    !(IP6CB(skb)->flags & IP6SKB_REROUTED));
}

/*
 *	xmit an sk_buff (used by TCP, SCTP and DCCP)
 */

int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
	     struct ipv6_txoptions *opt, int tclass)
{
	struct net *net = sock_net(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct in6_addr *first_hop = &fl6->daddr;
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr;
	u8  proto = fl6->flowi6_proto;
	int seg_len = skb->len;
	int hlimit = -1;
	u32 mtu;

	if (opt) {
		unsigned int head_room;

		/* First: exthdrs may take lots of space (~8K for now);
		   MAX_HEADER is not enough.
		 */
		head_room = opt->opt_nflen + opt->opt_flen;
		seg_len += head_room;
		head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);

		if (skb_headroom(skb) < head_room) {
			struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
			if (skb2 == NULL) {
				IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return -ENOBUFS;
			}
			consume_skb(skb);
			skb = skb2;
			skb_set_owner_w(skb, sk);
		}
		if (opt->opt_flen)
			ipv6_push_frag_opts(skb, opt, &proto);
		if (opt->opt_nflen)
			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
	}

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	/*
	 *	Fill in the IPv6 header
	 */
	if (np)
		hlimit = np->hop_limit;
	if (hlimit < 0)
		hlimit = ip6_dst_hoplimit(dst);

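	/* The first 32 bits of the IPv6 header pack the version (4 bits,
	 * always 6), the traffic class (8 bits) and the flow label
	 * (20 bits): 0x60000000 supplies the version, tclass is shifted
	 * into bits 27-20, and fl6->flowlabel (already in network byte
	 * order) contributes the flow label bits.
	 */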
	*(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl6->flowlabel;

	hdr->payload_len = htons(seg_len);
	hdr->nexthdr = proto;
	hdr->hop_limit = hlimit;

	hdr->saddr = fl6->saddr;
	hdr->daddr = *first_hop;

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	mtu = dst_mtu(dst);
	if ((skb->len <= mtu) || skb->local_df || skb_is_gso(skb)) {
		IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_OUT, skb->len);
		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
			       dst->dev, dst_output);
	}

	net_dbg_ratelimited("IPv6: sending pkt_too_big to self\n");
	skb->dev = dst->dev;
	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return -EMSGSIZE;
}
EXPORT_SYMBOL(ip6_xmit);

/*
 *	To avoid extra problems ND packets are sent through this
 *	routine. It's code duplication but I really want to avoid
 *	extra checks since ipv6_build_header is used by TCP (which
 *	is performance critical for us)
 */

int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
	       const struct in6_addr *saddr, const struct in6_addr *daddr,
	       int proto, int len)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct ipv6hdr *hdr;

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	skb_reset_network_header(skb);
	skb_put(skb, sizeof(struct ipv6hdr));
	hdr = ipv6_hdr(skb);

	*(__be32 *)hdr = htonl(0x60000000);

	hdr->payload_len = htons(len);
	hdr->nexthdr = proto;
	hdr->hop_limit = np->hop_limit;

	hdr->saddr = *saddr;
	hdr->daddr = *daddr;

	return 0;
}

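/* Deliver a Router Alert packet to every raw socket that registered
 * interest in this RA value via the IPV6_ROUTER_ALERT socket option.
 * All but the last matching socket receive a clone; the last one
 * consumes the original skb.  Returns 1 if the packet was delivered.
 */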
static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
	struct ip6_ra_chain *ra;
	struct sock *last = NULL;

	read_lock(&ip6_ra_lock);
	for (ra = ip6_ra_chain; ra; ra = ra->next) {
		struct sock *sk = ra->sk;
		if (sk && ra->sel == sel &&
		    (!sk->sk_bound_dev_if ||
		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
			if (last) {
				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
				if (skb2)
					rawv6_rcv(last, skb2);
			}
			last = sk;
		}
	}

	if (last) {
		rawv6_rcv(last, skb);
		read_unlock(&ip6_ra_lock);
		return 1;
	}
	read_unlock(&ip6_ra_lock);
	return 0;
}

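/* Decide what to do with a packet whose destination is a proxied
 * address: 1 means deliver it locally (unicast neighbour discovery),
 * 0 means forward it, and -1 means it was rejected (link-local
 * destination) and must be dropped.
 */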
static int ip6_forward_proxy_check(struct sk_buff *skb)
{
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	u8 nexthdr = hdr->nexthdr;
	__be16 frag_off;
	int offset;

	if (ipv6_ext_hdr(nexthdr)) {
		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
		if (offset < 0)
			return 0;
	} else
		offset = sizeof(struct ipv6hdr);

	if (nexthdr == IPPROTO_ICMPV6) {
		struct icmp6hdr *icmp6;

		if (!pskb_may_pull(skb, (skb_network_header(skb) +
					 offset + 1 - skb->data)))
			return 0;

		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);

		switch (icmp6->icmp6_type) {
		case NDISC_ROUTER_SOLICITATION:
		case NDISC_ROUTER_ADVERTISEMENT:
		case NDISC_NEIGHBOUR_SOLICITATION:
		case NDISC_NEIGHBOUR_ADVERTISEMENT:
		case NDISC_REDIRECT:
			/* A unicast neighbour discovery message destined
			 * to the proxied address is passed to the input
			 * function.
			 */
			return 1;
		default:
			break;
		}
	}

	/*
	 * The proxying router can't forward traffic sent to a link-local
	 * address, so signal the sender and discard the packet. This
	 * behavior is clarified by the MIPv6 specification.
	 */
	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
		dst_link_failure(skb);
		return -1;
	}

	return 0;
}

static inline int ip6_forward_finish(struct sk_buff *skb)
{
	return dst_output(skb);
}

int ip6_forward(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	struct inet6_skb_parm *opt = IP6CB(skb);
	struct net *net = dev_net(dst->dev);
	u32 mtu;

	if (net->ipv6.devconf_all->forwarding == 0)
		goto error;

	if (skb_warn_if_lro(skb))
		goto drop;

	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
		goto drop;
	}

	if (skb->pkt_type != PACKET_HOST)
		goto drop;

	skb_forward_csum(skb);

	/*
	 *	We do NOT do any processing on RA packets; we push
	 *	them to user level AS IS, without any warranty that
	 *	the application will be able to interpret them.
	 *	The reason is that we cannot make anything clever here.
	 *
	 *	We are not the end node, so if the packet contains
	 *	AH/ESP we cannot do anything with it.
	 *	Defragmentation would also be a mistake; RA packets
	 *	cannot be fragmented, because there is no guarantee
	 *	that different fragments will go along one path. --ANK
	 */
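	/* opt->ra is the offset of the Router Alert option within the
	 * packet; bytes 2 and 3 of the option carry its 16-bit value.
	 */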
	if (opt->ra) {
		u8 *ptr = skb_network_header(skb) + opt->ra;
		if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
			return 0;
	}

	/*
	 *	check and decrement hop limit
	 */
	if (hdr->hop_limit <= 1) {
		/* Force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
		IP6_INC_STATS_BH(net,
				 ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);

		kfree_skb(skb);
		return -ETIMEDOUT;
	}

	/* XXX: idev->cnf.proxy_ndp? */
	if (net->ipv6.devconf_all->proxy_ndp &&
	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
		int proxied = ip6_forward_proxy_check(skb);
		if (proxied > 0)
			return ip6_input(skb);
		else if (proxied < 0) {
			IP6_INC_STATS(net, ip6_dst_idev(dst),
				      IPSTATS_MIB_INDISCARDS);
			goto drop;
		}
	}

	if (!xfrm6_route_forward(skb)) {
		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
		goto drop;
	}
	dst = skb_dst(skb);

	/* The IPv6 specs say nothing about it, but it is clear that we
	   cannot send redirects to source-routed frames.
	   We don't send redirects to frames decapsulated from IPsec.
	 */
	if (skb->dev == dst->dev && opt->srcrt == 0 && !skb_sec_path(skb)) {
		struct in6_addr *target = NULL;
		struct inet_peer *peer;
		struct rt6_info *rt;

		/*
		 *	incoming and outgoing devices are the same:
		 *	send a redirect.
		 */

		rt = (struct rt6_info *) dst;
		if (rt->rt6i_flags & RTF_GATEWAY)
			target = &rt->rt6i_gateway;
		else
			target = &hdr->daddr;

		peer = inet_getpeer_v6(net->ipv6.peers, &rt->rt6i_dst.addr, 1);

		/* Limit redirects both by destination (here)
		   and by source (inside ndisc_send_redirect)
		 */
		if (inet_peer_xrlim_allow(peer, 1*HZ))
			ndisc_send_redirect(skb, target);
		if (peer)
			inet_putpeer(peer);
	} else {
		int addrtype = ipv6_addr_type(&hdr->saddr);

		/* This check is security critical. */
		if (addrtype == IPV6_ADDR_ANY ||
		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
			goto error;
		if (addrtype & IPV6_ADDR_LINKLOCAL) {
			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
				    ICMPV6_NOT_NEIGHBOUR, 0);
			goto error;
		}
	}

	mtu = dst_mtu(dst);
	if (mtu < IPV6_MIN_MTU)
		mtu = IPV6_MIN_MTU;

	if ((!skb->local_df && skb->len > mtu && !skb_is_gso(skb)) ||
	    (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)) {
		/* Again, force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
		IP6_INC_STATS_BH(net,
				 ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS);
		IP6_INC_STATS_BH(net,
				 ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	if (skb_cow(skb, dst->dev->hard_header_len)) {
		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
		goto drop;
	}

	hdr = ipv6_hdr(skb);

	/* Decrementing the hop limit is delayed until after the skb COW */

	hdr->hop_limit--;

	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
	IP6_ADD_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
		       ip6_forward_finish);

error:
	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
drop:
	kfree_skb(skb);
	return -EINVAL;
}

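/* Copy the per-packet state that every fragment must share with the
 * original skb: packet type, priority, protocol, dst reference, device,
 * mark, traffic-control index, netfilter state and security mark.
 */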
static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	skb_dst_drop(to);
	skb_dst_set(to, dst_clone(skb_dst(from)));
	to->dev = from->dev;
	to->mark = from->mark;

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
	nf_copy(to, from);
#if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE)
	to->nf_trace = from->nf_trace;
#endif
	skb_copy_secmark(to, from);
}

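/* Fragment an IPv6 packet and feed each fragment to @output.  The fast
 * path below reuses an existing frag_list when its geometry already
 * matches (every fragment fits the MTU, all but the last are a multiple
 * of 8 bytes, and there is headroom for the headers); otherwise the
 * slow path allocates fresh skbs and copies the payload block by block.
 */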
int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
{
	struct sk_buff *frag;
	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
	struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
	struct ipv6hdr *tmp_hdr;
	struct frag_hdr *fh;
	unsigned int mtu, hlen, left, len;
	int hroom, troom;
	__be32 frag_id = 0;
	int ptr, offset = 0, err = 0;
	u8 *prevhdr, nexthdr = 0;
	struct net *net = dev_net(skb_dst(skb)->dev);

	hlen = ip6_find_1stfragopt(skb, &prevhdr);
	nexthdr = *prevhdr;

	mtu = ip6_skb_dst_mtu(skb);

	/* We must not fragment if the socket is set to force MTU discovery
	 * or if the skb is not generated by a local socket.
	 */
	if (unlikely(!skb->local_df && skb->len > mtu) ||
		     (IP6CB(skb)->frag_max_size &&
		      IP6CB(skb)->frag_max_size > mtu)) {
		if (skb->sk && dst_allfrag(skb_dst(skb)))
			sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);

		skb->dev = skb_dst(skb)->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	if (np && np->frag_size < mtu) {
		if (np->frag_size)
			mtu = np->frag_size;
	}
	mtu -= hlen + sizeof(struct frag_hdr);

	if (skb_has_frag_list(skb)) {
		int first_len = skb_pagelen(skb);
		struct sk_buff *frag2;

		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    skb_cloned(skb))
			goto slow_path;

		skb_walk_frags(skb, frag) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < hlen)
				goto slow_path_clean;

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path_clean;

			BUG_ON(frag->sk);
			if (skb->sk) {
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
			}
			skb->truesize -= frag->truesize;
		}

		err = 0;
		offset = 0;
		frag = skb_shinfo(skb)->frag_list;
		skb_frag_list_init(skb);
		/* BUILD HEADER */

		*prevhdr = NEXTHDR_FRAGMENT;
		tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
		if (!tmp_hdr) {
			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_FRAGFAILS);
			return -ENOMEM;
		}

		__skb_pull(skb, hlen);
		fh = (struct frag_hdr *)__skb_push(skb, sizeof(struct frag_hdr));
		__skb_push(skb, hlen);
		skb_reset_network_header(skb);
		memcpy(skb_network_header(skb), tmp_hdr, hlen);

		ipv6_select_ident(fh, rt);
		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		fh->frag_off = htons(IP6_MF);
		frag_id = fh->identification;

		first_len = skb_pagelen(skb);
		skb->data_len = first_len - skb_headlen(skb);
		skb->len = first_len;
		ipv6_hdr(skb)->payload_len = htons(first_len -
						   sizeof(struct ipv6hdr));

		dst_hold(&rt->dst);

		for (;;) {
			/* Prepare the header of the next frame,
			 * before the previous one goes down. */
			if (frag) {
				frag->ip_summed = CHECKSUM_NONE;
				skb_reset_transport_header(frag);
				fh = (struct frag_hdr *)__skb_push(frag, sizeof(struct frag_hdr));
				__skb_push(frag, hlen);
				skb_reset_network_header(frag);
				memcpy(skb_network_header(frag), tmp_hdr,
				       hlen);
				offset += skb->len - hlen - sizeof(struct frag_hdr);
				fh->nexthdr = nexthdr;
				fh->reserved = 0;
				fh->frag_off = htons(offset);
				if (frag->next != NULL)
					fh->frag_off |= htons(IP6_MF);
				fh->identification = frag_id;
				ipv6_hdr(frag)->payload_len =
						htons(frag->len -
						      sizeof(struct ipv6hdr));
				ip6_copy_metadata(frag, skb);
			}

			err = output(skb);
			if (!err)
				IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
					      IPSTATS_MIB_FRAGCREATES);

			if (err || !frag)
				break;

			skb = frag;
			frag = skb->next;
			skb->next = NULL;
		}

		kfree(tmp_hdr);

		if (err == 0) {
			IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
				      IPSTATS_MIB_FRAGOKS);
			ip6_rt_put(rt);
			return 0;
		}

		while (frag) {
			skb = frag->next;
			kfree_skb(frag);
			frag = skb;
		}

		IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
			      IPSTATS_MIB_FRAGFAILS);
		ip6_rt_put(rt);
		return err;

slow_path_clean:
		skb_walk_frags(skb, frag2) {
			if (frag2 == frag)
				break;
			frag2->sk = NULL;
			frag2->destructor = NULL;
			skb->truesize += frag2->truesize;
		}
	}

slow_path:
	if ((skb->ip_summed == CHECKSUM_PARTIAL) &&
	    skb_checksum_help(skb))
		goto fail;

	left = skb->len - hlen;		/* Space per frame */
	ptr = hlen;			/* Where to start from */

	/*
	 *	Fragment the datagram.
	 */

	*prevhdr = NEXTHDR_FRAGMENT;
	hroom = LL_RESERVED_SPACE(rt->dst.dev);
	troom = rt->dst.dev->needed_tailroom;

	/*
	 *	Keep copying data until we run out.
	 */
	while (left > 0) {
		len = left;
		/* IF: it doesn't fit, use 'mtu' - the data space left */
		if (len > mtu)
			len = mtu;
		/* IF: we are not sending up to and including the packet end
		   then align the next start on an eight-byte boundary */
		if (len < left) {
			len &= ~7;
		}
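		/* The fragment offset field counts 8-byte units (its low
		 * three bits are flags), so every fragment except the last
		 * must carry a multiple of 8 bytes of payload; masking with
		 * ~7 rounds len down accordingly, e.g. 1444 -> 1440.
		 */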
		/*
		 *	Allocate buffer.
		 */

		if ((frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
				      hroom + troom, GFP_ATOMIC)) == NULL) {
			NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_FRAGFAILS);
			err = -ENOMEM;
			goto fail;
		}

		/*
		 *	Set up data on packet
		 */

		ip6_copy_metadata(frag, skb);
		skb_reserve(frag, hroom);
		skb_put(frag, len + hlen + sizeof(struct frag_hdr));
		skb_reset_network_header(frag);
		fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
		frag->transport_header = (frag->network_header + hlen +
					  sizeof(struct frag_hdr));

		/*
		 *	Charge the memory for the fragment to any owner
		 *	it might possess
		 */
		if (skb->sk)
			skb_set_owner_w(frag, skb->sk);

		/*
		 *	Copy the packet header into the new buffer.
		 */
		skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);

		/*
		 *	Build fragment header.
		 */
		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		if (!frag_id) {
			ipv6_select_ident(fh, rt);
			frag_id = fh->identification;
		} else
			fh->identification = frag_id;

		/*
		 *	Copy a block of the IP datagram.
		 */
		if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len))
			BUG();
		left -= len;

		fh->frag_off = htons(offset);
		if (left > 0)
			fh->frag_off |= htons(IP6_MF);
		ipv6_hdr(frag)->payload_len = htons(frag->len -
						    sizeof(struct ipv6hdr));

		ptr += len;
		offset += len;

		/*
		 *	Put this fragment into the sending queue.
		 */
		err = output(frag);
		if (err)
			goto fail;

		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_FRAGCREATES);
	}
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGOKS);
	consume_skb(skb);
	return err;

fail:
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return err;
}

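/* Returns nonzero when the cached route can no longer be trusted for
 * fl_addr: it is neither an exact host (/128) match nor the address we
 * validated last time (addr_cache).
 */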
static inline int ip6_rt_check(const struct rt6key *rt_key,
			       const struct in6_addr *fl_addr,
			       const struct in6_addr *addr_cache)
{
	return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
		(addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache));
}

static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
					  struct dst_entry *dst,
					  const struct flowi6 *fl6)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct rt6_info *rt = (struct rt6_info *)dst;

	if (!dst)
		goto out;

	/* Yes, checking route validity in the not-connected
	 * case is not very simple. Take into account that
	 * we do not support routing by source, TOS,
	 * and MSG_DONTROUTE		--ANK (980726)
	 *
	 * 1. ip6_rt_check(): If the route was a host route,
	 *    check that the cached destination is current.
	 *    If it is a network route, we still may
	 *    check its validity using the saved pointer
	 *    to the last used address: daddr_cache.
	 *    We do not want to save the whole address now
	 *    (because the main consumer of this service
	 *    is TCP, which does not have this problem),
	 *    so the last trick works only on connected
	 *    sockets.
	 * 2. oif should also be the same.
	 */
	if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
#ifdef CONFIG_IPV6_SUBTREES
	    ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
#endif
	    (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) {
		dst_release(dst);
		dst = NULL;
	}

out:
	return dst;
}

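/* Common tail of the dst lookup helpers: route the flow if no dst was
 * supplied, choose a source address when the flow leaves it unspecified,
 * and (optionally, with optimistic DAD) retry the lookup through the
 * default router.
 */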
static int ip6_dst_lookup_tail(struct sock *sk,
			       struct dst_entry **dst, struct flowi6 *fl6)
{
	struct net *net = sock_net(sk);
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	struct neighbour *n;
	struct rt6_info *rt;
#endif
	int err;

	if (*dst == NULL)
		*dst = ip6_route_output(net, sk, fl6);

	if ((err = (*dst)->error))
		goto out_err_release;

	if (ipv6_addr_any(&fl6->saddr)) {
		struct rt6_info *rt = (struct rt6_info *) *dst;
		err = ip6_route_get_saddr(net, rt, &fl6->daddr,
					  sk ? inet6_sk(sk)->srcprefs : 0,
					  &fl6->saddr);
		if (err)
			goto out_err_release;
	}

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	/*
	 * If the dst entry we've looked up has a neighbour
	 * entry that is not in the VALID state and the source
	 * address from the flow is marked as OPTIMISTIC, we
	 * release the found dst entry and replace it with the
	 * dst entry of the nexthop router.
	 */
	rt = (struct rt6_info *) *dst;
	n = rt->n;
	if (n && !(n->nud_state & NUD_VALID)) {
		struct inet6_ifaddr *ifp;
		struct flowi6 fl_gw6;
		int redirect;

		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
				      (*dst)->dev, 1);

		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
		if (ifp)
			in6_ifa_put(ifp);

		if (redirect) {
			/*
			 * We need to get the dst entry for the
			 * default router instead
			 */
			dst_release(*dst);
			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
			*dst = ip6_route_output(net, sk, &fl_gw6);
			if ((err = (*dst)->error))
				goto out_err_release;
		}
	}
#endif

	return 0;

out_err_release:
	if (err == -ENETUNREACH)
		IP6_INC_STATS_BH(net, NULL, IPSTATS_MIB_OUTNOROUTES);
	dst_release(*dst);
	*dst = NULL;
	return err;
}

/**
 *	ip6_dst_lookup - perform route lookup on flow
 *	@sk: socket which provides route info
 *	@dst: pointer to dst_entry * for result
 *	@fl6: flow to lookup
 *
 *	This function performs a route lookup on the given flow.
 *
 *	It returns zero on success, or a standard errno code on error.
 */
int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi6 *fl6)
{
	*dst = NULL;
	return ip6_dst_lookup_tail(sk, dst, fl6);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup);

/**
 *	ip6_dst_lookup_flow - perform route lookup on flow with ipsec
 *	@sk: socket which provides route info
 *	@fl6: flow to lookup
 *	@final_dst: final destination address for ipsec lookup
 *	@can_sleep: we are in a sleepable context
 *
 *	This function performs a route lookup on the given flow.
 *
 *	It returns a valid dst pointer on success, or a pointer-encoded
 *	error code.
 */
struct dst_entry *ip6_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
				      const struct in6_addr *final_dst,
				      bool can_sleep)
{
	struct dst_entry *dst = NULL;
	int err;

	err = ip6_dst_lookup_tail(sk, &dst, fl6);
	if (err)
		return ERR_PTR(err);
	if (final_dst)
		fl6->daddr = *final_dst;
	if (can_sleep)
		fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;

	return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);

/**
 *	ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
 *	@sk: socket which provides the dst cache and route info
 *	@fl6: flow to lookup
 *	@final_dst: final destination address for ipsec lookup
 *	@can_sleep: we are in a sleepable context
 *
 *	This function performs a route lookup on the given flow with the
 *	possibility of using the cached route in the socket if it is valid.
 *	It will take the socket dst lock when operating on the dst cache.
 *	As a result, this function can only be used in process context.
 *
 *	It returns a valid dst pointer on success, or a pointer-encoded
 *	error code.
 */
struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
					 const struct in6_addr *final_dst,
					 bool can_sleep)
{
	struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
	int err;

	dst = ip6_sk_dst_check(sk, dst, fl6);

	err = ip6_dst_lookup_tail(sk, &dst, fl6);
	if (err)
		return ERR_PTR(err);
	if (final_dst)
		fl6->daddr = *final_dst;
	if (can_sleep)
		fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;

	return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
}
EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);

static inline int ip6_ufo_append_data(struct sock *sk,
			int getfrag(void *from, char *to, int offset, int len,
			int odd, struct sk_buff *skb),
			void *from, int length, int hh_len, int fragheaderlen,
			int transhdrlen, int mtu, unsigned int flags,
			struct rt6_info *rt)
{
	struct sk_buff *skb;
	int err;

	/* There is support for UDP large send offload by the network
	 * device, so create one single skb packet containing the complete
	 * UDP datagram.
	 */
	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
		skb = sock_alloc_send_skb(sk,
			hh_len + fragheaderlen + transhdrlen + 20,
			(flags & MSG_DONTWAIT), &err);
		if (skb == NULL)
			return err;

		/* reserve space for the hardware header */
		skb_reserve(skb, hh_len);

		/* create space for the UDP/IP header */
		skb_put(skb, fragheaderlen + transhdrlen);

		/* initialize the network header pointer */
		skb_reset_network_header(skb);

		/* initialize the protocol header pointer */
		skb->transport_header = skb->network_header + fragheaderlen;

		skb->ip_summed = CHECKSUM_PARTIAL;
		skb->csum = 0;
	}

	err = skb_append_datato_frags(sk, skb, getfrag, from,
				      (length - transhdrlen));
	if (!err) {
		struct frag_hdr fhdr;

		/* Specify the length of each IPv6 datagram fragment.
		 * It has to be a multiple of 8.
		 */
		skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
					     sizeof(struct frag_hdr)) & ~7;
		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
		ipv6_select_ident(&fhdr, rt);
		skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
		__skb_queue_tail(&sk->sk_write_queue, skb);

		return 0;
	}
	/* There is not enough support to do UDP LSO,
	 * so follow the normal path.
	 */
	kfree_skb(skb);

	return err;
}

static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
					       gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
						gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

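/* Recompute the per-fragment mtu and maxfraglen once the first fragment
 * has been queued: later fragments do not need the extra header_len
 * reserved for the first one, unless the route is an XFRM tunnel.
 */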
static void ip6_append_data_mtu(int *mtu,
				int *maxfraglen,
				unsigned int fragheaderlen,
				struct sk_buff *skb,
				struct rt6_info *rt)
{
	if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
		if (skb == NULL) {
			/* first fragment, reserve header_len */
			*mtu = *mtu - rt->dst.header_len;

		} else {
			/*
			 * this fragment is not the first one; the header
			 * space is regarded as data space.
			 */
			*mtu = dst_mtu(rt->dst.path);
		}
		*maxfraglen = ((*mtu - fragheaderlen) & ~7)
			      + fragheaderlen - sizeof(struct frag_hdr);
	}
}

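/* Append data to the pending (corked) queue of @sk.  On the first call
 * the cork state is set up from @opt, @rt and @fl6; subsequent calls
 * under the same cork reuse that state.  Data is packed into skbs of at
 * most maxfraglen bytes so that ip6_push_pending_frames() can later
 * emit them as ready-made fragments.
 */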
int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
	int offset, int len, int odd, struct sk_buff *skb),
	void *from, int length, int transhdrlen,
	int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi6 *fl6,
	struct rt6_info *rt, unsigned int flags, int dontfrag)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct inet_cork *cork;
	struct sk_buff *skb, *skb_prev = NULL;
	unsigned int maxfraglen, fragheaderlen;
	int exthdrlen;
	int dst_exthdrlen;
	int hh_len;
	int mtu;
	int copy;
	int err;
	int offset = 0;
	__u8 tx_flags = 0;

	if (flags & MSG_PROBE)
		return 0;
	cork = &inet->cork.base;
	if (skb_queue_empty(&sk->sk_write_queue)) {
		/*
		 * setup for corking
		 */
		if (opt) {
			if (WARN_ON(np->cork.opt))
				return -EINVAL;

			np->cork.opt = kmalloc(opt->tot_len, sk->sk_allocation);
			if (unlikely(np->cork.opt == NULL))
				return -ENOBUFS;

			np->cork.opt->tot_len = opt->tot_len;
			np->cork.opt->opt_flen = opt->opt_flen;
			np->cork.opt->opt_nflen = opt->opt_nflen;

			np->cork.opt->dst0opt = ip6_opt_dup(opt->dst0opt,
							    sk->sk_allocation);
			if (opt->dst0opt && !np->cork.opt->dst0opt)
				return -ENOBUFS;

			np->cork.opt->dst1opt = ip6_opt_dup(opt->dst1opt,
							    sk->sk_allocation);
			if (opt->dst1opt && !np->cork.opt->dst1opt)
				return -ENOBUFS;

			np->cork.opt->hopopt = ip6_opt_dup(opt->hopopt,
							   sk->sk_allocation);
			if (opt->hopopt && !np->cork.opt->hopopt)
				return -ENOBUFS;

			np->cork.opt->srcrt = ip6_rthdr_dup(opt->srcrt,
							    sk->sk_allocation);
			if (opt->srcrt && !np->cork.opt->srcrt)
				return -ENOBUFS;

			/* need source address above --miyazawa */
		}
		dst_hold(&rt->dst);
		cork->dst = &rt->dst;
		inet->cork.fl.u.ip6 = *fl6;
		np->cork.hop_limit = hlimit;
		np->cork.tclass = tclass;
		if (rt->dst.flags & DST_XFRM_TUNNEL)
			mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
			      rt->dst.dev->mtu : dst_mtu(&rt->dst);
		else
			mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
			      rt->dst.dev->mtu : dst_mtu(rt->dst.path);
		if (np->frag_size < mtu) {
			if (np->frag_size)
				mtu = np->frag_size;
		}
		cork->fragsize = mtu;
		if (dst_allfrag(rt->dst.path))
			cork->flags |= IPCORK_ALLFRAG;
		cork->length = 0;
		exthdrlen = (opt ? opt->opt_flen : 0) - rt->rt6i_nfheader_len;
		length += exthdrlen;
		transhdrlen += exthdrlen;
		dst_exthdrlen = rt->dst.header_len;
	} else {
		rt = (struct rt6_info *)cork->dst;
		fl6 = &inet->cork.fl.u.ip6;
		opt = np->cork.opt;
		transhdrlen = 0;
		exthdrlen = 0;
		dst_exthdrlen = 0;
		mtu = cork->fragsize;
	}

	hh_len = LL_RESERVED_SPACE(rt->dst.dev);

	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
			(opt ? opt->opt_nflen : 0);
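	/* Example: with mtu 1500 and a plain 40-byte IPv6 header,
	 * fragheaderlen = 40, the 8-byte-aligned payload budget is
	 * (1460 & ~7) = 1456, and reserving the 8-byte fragment header
	 * gives maxfraglen = 1456 + 40 - 8 = 1488 bytes per queued skb.
	 */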
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);

	if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
		if (cork->length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
			ipv6_local_error(sk, EMSGSIZE, fl6, mtu - exthdrlen);
			return -EMSGSIZE;
		}
	}

	/* For UDP, check if TX timestamp is enabled */
	if (sk->sk_type == SOCK_DGRAM) {
		err = sock_tx_timestamp(sk, &tx_flags);
		if (err)
			goto error;
	}

	/*
	 * Let's try using as much space as possible.
	 * Use the MTU if the total length of the message fits into it.
	 * Otherwise, we need to reserve the fragment header and
	 * fragment alignment (= 8-15 octets, in total).
	 *
	 * Note that we may need to "move" the data from the tail
	 * of the buffer to the new fragment when we split
	 * the message.
	 *
	 * FIXME: It may be fragmented into multiple chunks
	 *        at once if non-fragmentable extension headers
	 *        are too large.
	 * --yoshfuji
	 */

	cork->length += length;
	if (length > mtu) {
		int proto = sk->sk_protocol;
		if (dontfrag && (proto == IPPROTO_UDP || proto == IPPROTO_RAW)) {
			ipv6_local_rxpmtu(sk, fl6, mtu - exthdrlen);
			return -EMSGSIZE;
		}

		if (proto == IPPROTO_UDP &&
		    (rt->dst.dev->features & NETIF_F_UFO)) {

			err = ip6_ufo_append_data(sk, getfrag, from, length,
						  hh_len, fragheaderlen,
						  transhdrlen, mtu, flags, rt);
			if (err)
				goto error;
			return 0;
		}
	}

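	/* Main copy loop: each pass either appends up to `copy` bytes to
	 * the skb at the tail of the write queue, or (when copy <= 0,
	 * i.e. the tail skb is full) allocates a new fragment-sized skb,
	 * moving the unaligned tail of the previous one into it so that
	 * fragment payloads stay a multiple of 8 bytes.
	 */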
	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into the current packet. */
		copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;

		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen;
alloc_new_skb:
			/* There's no room in the current skb */
			if (skb)
				fraggap = skb->len - maxfraglen;
			else
				fraggap = 0;
			/* update mtu and maxfraglen if necessary */
			if (skb == NULL || skb_prev == NULL)
				ip6_append_data_mtu(&mtu, &maxfraglen,
						    fragheaderlen, skb, rt);

			skb_prev = skb;

			/*
			 * If the remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;

			if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
				datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
			if ((flags & MSG_MORE) &&
			    !(rt->dst.dev->features & NETIF_F_SG))
				alloclen = mtu;
			else
				alloclen = datalen + fragheaderlen;

			alloclen += dst_exthdrlen;

			if (datalen != length + fraggap) {
				/*
				 * this is not the last fragment; the trailer
				 * space is regarded as data space.
				 */
				datalen += rt->dst.trailer_len;
			}

			alloclen += rt->dst.trailer_len;
			fraglen = datalen + fragheaderlen;

			/*
			 * We just reserve space for the fragment header.
			 * Note: this may be an overallocation if the message
			 * (without MSG_MORE) fits into the MTU.
			 */
			alloclen += sizeof(struct frag_hdr);

			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk,
						alloclen + hh_len,
						(flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				if (atomic_read(&sk->sk_wmem_alloc) <=
				    2 * sk->sk_sndbuf)
					skb = sock_wmalloc(sk,
							   alloclen + hh_len, 1,
							   sk->sk_allocation);
				if (unlikely(skb == NULL))
					err = -ENOBUFS;
				else {
					/* Only the initial fragment
					 * is time stamped.
					 */
					tx_flags = 0;
				}
			}
			if (skb == NULL)
				goto error;
			/*
			 *	Fill in the control structures
			 */
			skb->ip_summed = CHECKSUM_NONE;
			skb->csum = 0;
			/* reserve space for the fragment and ipsec headers */
			skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
				    dst_exthdrlen);

			if (sk->sk_type == SOCK_DGRAM)
				skb_shinfo(skb)->tx_flags = tx_flags;

			/*
			 *	Find where to start putting bytes
			 */
			data = skb_put(skb, fraglen);
			skb_set_network_header(skb, exthdrlen);
			data += fragheaderlen;
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}
			copy = datalen - transhdrlen - fraggap;

			if (copy < 0) {
				err = -EINVAL;
				kfree_skb(skb);
				goto error;
			} else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= datalen - fraggap;
			transhdrlen = 0;
			exthdrlen = 0;
			dst_exthdrlen = 0;

			/*
			 * Put the packet on the pending queue
			 */
			__skb_queue_tail(&sk->sk_write_queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		if (!(rt->dst.dev->features & NETIF_F_SG)) {
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
						offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else {
			int i = skb_shinfo(skb)->nr_frags;
			struct page_frag *pfrag = sk_page_frag(sk);

			err = -ENOMEM;
			if (!sk_page_frag_refill(sk, pfrag))
				goto error;

			if (!skb_can_coalesce(skb, i, pfrag->page,
					      pfrag->offset)) {
				err = -EMSGSIZE;
				if (i == MAX_SKB_FRAGS)
					goto error;

				__skb_fill_page_desc(skb, i, pfrag->page,
						     pfrag->offset, 0);
				skb_shinfo(skb)->nr_frags = ++i;
				get_page(pfrag->page);
			}
			copy = min_t(int, copy, pfrag->size - pfrag->offset);
			if (getfrag(from,
				    page_address(pfrag->page) + pfrag->offset,
				    offset, copy, skb->len, skb) < 0)
				goto error_efault;

			pfrag->offset += copy;
			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			atomic_add(copy, &sk->sk_wmem_alloc);
		}
		offset += copy;
		length -= copy;
	}

	return 0;

error_efault:
	err = -EFAULT;
error:
	cork->length -= length;
	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	return err;
}
EXPORT_SYMBOL_GPL(ip6_append_data);

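/* Undo the cork state set up by ip6_append_data(): free the duplicated
 * extension-header options, drop the dst reference and clear the flow.
 */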
static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np)
{
	if (np->cork.opt) {
		kfree(np->cork.opt->dst0opt);
		kfree(np->cork.opt->dst1opt);
		kfree(np->cork.opt->hopopt);
		kfree(np->cork.opt->srcrt);
		kfree(np->cork.opt);
		np->cork.opt = NULL;
	}

	if (inet->cork.base.dst) {
		dst_release(inet->cork.base.dst);
		inet->cork.base.dst = NULL;
		inet->cork.base.flags &= ~IPCORK_ALLFRAG;
	}
	memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
}

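/* Join all skbs queued by ip6_append_data() into one packet chained via
 * frag_list, prepend the extension headers and the IPv6 header, update
 * the output statistics and hand the result to ip6_local_out().
 */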
int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct net *net = sock_net(sk);
	struct ipv6hdr *hdr;
	struct ipv6_txoptions *opt = np->cork.opt;
	struct rt6_info *rt = (struct rt6_info *)inet->cork.base.dst;
	struct flowi6 *fl6 = &inet->cork.fl.u.ip6;
	unsigned char proto = fl6->flowi6_proto;
	int err = 0;

	if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to the ip header from the ext header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Allow local fragmentation. */
	if (np->pmtudisc < IPV6_PMTUDISC_DO)
		skb->local_df = 1;

	*final_dst = fl6->daddr;
	__skb_pull(skb, skb_network_header_len(skb));
	if (opt && opt->opt_flen)
		ipv6_push_frag_opts(skb, opt, &proto);
	if (opt && opt->opt_nflen)
		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	*(__be32 *)hdr = fl6->flowlabel |
		     htonl(0x60000000 | ((int)np->cork.tclass << 20));

	hdr->hop_limit = np->cork.hop_limit;
	hdr->nexthdr = proto;
	hdr->saddr = fl6->saddr;
	hdr->daddr = *final_dst;

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	skb_dst_set(skb, dst_clone(&rt->dst));
	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
	if (proto == IPPROTO_ICMPV6) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		ICMP6MSGOUT_INC_STATS_BH(net, idev, icmp6_hdr(skb)->icmp6_type);
		ICMP6_INC_STATS_BH(net, idev, ICMP6_MIB_OUTMSGS);
	}

	err = ip6_local_out(skb);
	if (err) {
		if (err > 0)
			err = net_xmit_errno(err);
		if (err)
			goto error;
	}

out:
	ip6_cork_release(inet, np);
	return err;
error:
	IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	goto out;
}
EXPORT_SYMBOL_GPL(ip6_push_pending_frames);

void ip6_flush_pending_frames(struct sock *sk)
{
	struct sk_buff *skb;

	while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
		if (skb_dst(skb))
			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
	}

	ip6_cork_release(inet_sk(sk), inet6_sk(sk));
}
EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
1602