xref: /openbmc/linux/net/ipv6/ip6_output.c (revision 8ff374b9)
1 /*
2  *	IPv6 output functions
3  *	Linux INET6 implementation
4  *
5  *	Authors:
6  *	Pedro Roque		<roque@di.fc.ul.pt>
7  *
8  *	Based on linux/net/ipv4/ip_output.c
9  *
10  *	This program is free software; you can redistribute it and/or
11  *      modify it under the terms of the GNU General Public License
12  *      as published by the Free Software Foundation; either version
13  *      2 of the License, or (at your option) any later version.
14  *
15  *	Changes:
16  *	A.N.Kuznetsov	:	arithmetics in fragmentation.
17  *				extension headers are implemented.
18  *				route changes now work.
19  *				ip6_forward does not confuse sniffers.
20  *				etc.
21  *
22  *      H. von Brand    :       Added missing #include <linux/string.h>
23  *	Imran Patel	: 	frag id should be in NBO
24  *      Kazunori MIYAZAWA @USAGI
25  *			:       add ip6_append_data and related functions
26  *				for datagram xmit
27  */
28 
29 #include <linux/errno.h>
30 #include <linux/kernel.h>
31 #include <linux/string.h>
32 #include <linux/socket.h>
33 #include <linux/net.h>
34 #include <linux/netdevice.h>
35 #include <linux/if_arp.h>
36 #include <linux/in6.h>
37 #include <linux/tcp.h>
38 #include <linux/route.h>
39 #include <linux/module.h>
40 #include <linux/slab.h>
41 
42 #include <linux/netfilter.h>
43 #include <linux/netfilter_ipv6.h>
44 
45 #include <net/sock.h>
46 #include <net/snmp.h>
47 
48 #include <net/ipv6.h>
49 #include <net/ndisc.h>
50 #include <net/protocol.h>
51 #include <net/ip6_route.h>
52 #include <net/addrconf.h>
53 #include <net/rawv6.h>
54 #include <net/icmp.h>
55 #include <net/xfrm.h>
56 #include <net/checksum.h>
57 #include <linux/mroute6.h>
58 
59 static int ip6_finish_output2(struct sk_buff *skb)
60 {
61 	struct dst_entry *dst = skb_dst(skb);
62 	struct net_device *dev = dst->dev;
63 	struct neighbour *neigh;
64 	struct in6_addr *nexthop;
65 	int ret;
66 
67 	skb->protocol = htons(ETH_P_IPV6);
68 	skb->dev = dev;
69 
70 	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
71 		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
72 
73 		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(skb->sk) &&
74 		    ((mroute6_socket(dev_net(dev), skb) &&
75 		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
76 		     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
77 					 &ipv6_hdr(skb)->saddr))) {
78 			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
79 
80 			/* Do not check for IFF_ALLMULTI; multicast routing
81 			   is not supported in any case.
82 			 */
83 			if (newskb)
84 				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
85 					newskb, NULL, newskb->dev,
86 					dev_loopback_xmit);
87 
88 			if (ipv6_hdr(skb)->hop_limit == 0) {
89 				IP6_INC_STATS(dev_net(dev), idev,
90 					      IPSTATS_MIB_OUTDISCARDS);
91 				kfree_skb(skb);
92 				return 0;
93 			}
94 		}
95 
96 		IP6_UPD_PO_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCAST,
97 				skb->len);
98 
99 		if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
100 		    IPV6_ADDR_SCOPE_NODELOCAL &&
101 		    !(dev->flags & IFF_LOOPBACK)) {
102 			kfree_skb(skb);
103 			return 0;
104 		}
105 	}
106 
107 	rcu_read_lock_bh();
108 	nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
109 	neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
110 	if (unlikely(!neigh))
111 		neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
112 	if (!IS_ERR(neigh)) {
113 		ret = dst_neigh_output(dst, neigh, skb);
114 		rcu_read_unlock_bh();
115 		return ret;
116 	}
117 	rcu_read_unlock_bh();
118 
119 	IP6_INC_STATS_BH(dev_net(dst->dev),
120 			 ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
121 	kfree_skb(skb);
122 	return -EINVAL;
123 }
124 
125 static int ip6_finish_output(struct sk_buff *skb)
126 {
127 	if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
128 	    dst_allfrag(skb_dst(skb)))
129 		return ip6_fragment(skb, ip6_finish_output2);
130 	else
131 		return ip6_finish_output2(skb);
132 }
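/*
 * Added note (not in the original source): the test above reduces to
 * "fragment unless the packet already fits the path MTU or the device
 * will segment it in hardware".  Assuming a route MTU of 1500:
 *
 *	skb->len == 1400, !skb_is_gso(skb)  ->  ip6_finish_output2()
 *	skb->len == 9000, !skb_is_gso(skb)  ->  ip6_fragment()
 *	skb->len == 9000,  skb_is_gso(skb)  ->  ip6_finish_output2()
 *	dst_allfrag(skb_dst(skb)) true      ->  ip6_fragment() regardless
 */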
133 
134 int ip6_output(struct sk_buff *skb)
135 {
136 	struct net_device *dev = skb_dst(skb)->dev;
137 	struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
138 	if (unlikely(idev->cnf.disable_ipv6)) {
139 		IP6_INC_STATS(dev_net(dev), idev,
140 			      IPSTATS_MIB_OUTDISCARDS);
141 		kfree_skb(skb);
142 		return 0;
143 	}
144 
145 	return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, skb, NULL, dev,
146 			    ip6_finish_output,
147 			    !(IP6CB(skb)->flags & IP6SKB_REROUTED));
148 }
149 
150 /*
151  *	xmit an sk_buff (used by TCP, SCTP and DCCP)
152  */
153 
154 int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
155 	     struct ipv6_txoptions *opt, int tclass)
156 {
157 	struct net *net = sock_net(sk);
158 	struct ipv6_pinfo *np = inet6_sk(sk);
159 	struct in6_addr *first_hop = &fl6->daddr;
160 	struct dst_entry *dst = skb_dst(skb);
161 	struct ipv6hdr *hdr;
162 	u8  proto = fl6->flowi6_proto;
163 	int seg_len = skb->len;
164 	int hlimit = -1;
165 	u32 mtu;
166 
167 	if (opt) {
168 		unsigned int head_room;
169 
170 		/* First: exthdrs may take lots of space (~8K for now);
171 		   MAX_HEADER is not enough.
172 		 */
173 		head_room = opt->opt_nflen + opt->opt_flen;
174 		seg_len += head_room;
175 		head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
176 
177 		if (skb_headroom(skb) < head_room) {
178 			struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
179 			if (skb2 == NULL) {
180 				IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
181 					      IPSTATS_MIB_OUTDISCARDS);
182 				kfree_skb(skb);
183 				return -ENOBUFS;
184 			}
185 			consume_skb(skb);
186 			skb = skb2;
187 			skb_set_owner_w(skb, sk);
188 		}
189 		if (opt->opt_flen)
190 			ipv6_push_frag_opts(skb, opt, &proto);
191 		if (opt->opt_nflen)
192 			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
193 	}
194 
195 	skb_push(skb, sizeof(struct ipv6hdr));
196 	skb_reset_network_header(skb);
197 	hdr = ipv6_hdr(skb);
198 
199 	/*
200 	 *	Fill in the IPv6 header
201 	 */
202 	if (np)
203 		hlimit = np->hop_limit;
204 	if (hlimit < 0)
205 		hlimit = ip6_dst_hoplimit(dst);
206 
207 	ip6_flow_hdr(hdr, tclass, fl6->flowlabel);
208 
209 	hdr->payload_len = htons(seg_len);
210 	hdr->nexthdr = proto;
211 	hdr->hop_limit = hlimit;
212 
213 	hdr->saddr = fl6->saddr;
214 	hdr->daddr = *first_hop;
215 
216 	skb->protocol = htons(ETH_P_IPV6);
217 	skb->priority = sk->sk_priority;
218 	skb->mark = sk->sk_mark;
219 
220 	mtu = dst_mtu(dst);
221 	if ((skb->len <= mtu) || skb->local_df || skb_is_gso(skb)) {
222 		IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
223 			      IPSTATS_MIB_OUT, skb->len);
224 		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
225 			       dst->dev, dst_output);
226 	}
227 
228 	skb->dev = dst->dev;
229 	ipv6_local_error(sk, EMSGSIZE, fl6, mtu);
230 	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
231 	kfree_skb(skb);
232 	return -EMSGSIZE;
233 }
234 
235 EXPORT_SYMBOL(ip6_xmit);
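/*
 * A minimal, hypothetical caller sketch (added commentary, not from
 * this file): a connection-oriented transport is expected to attach a
 * routed dst to the skb, fill in a flowi6 and hand the packet to
 * ip6_xmit().  The helper name and parameters below are illustrative
 * assumptions only.
 */
#if 0	/* illustrative sketch, not compiled */
static int example_transport_xmit(struct sock *sk, struct sk_buff *skb,
				  const struct in6_addr *daddr,
				  const struct in6_addr *saddr)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct flowi6 fl6;

	memset(&fl6, 0, sizeof(fl6));
	fl6.flowi6_proto = sk->sk_protocol;
	fl6.daddr = *daddr;
	fl6.saddr = *saddr;
	fl6.flowi6_oif = sk->sk_bound_dev_if;

	/* the caller is assumed to have set skb_dst() already */
	return ip6_xmit(sk, skb, &fl6, np->opt, np->tclass);
}
#endif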
236 
237 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
238 {
239 	struct ip6_ra_chain *ra;
240 	struct sock *last = NULL;
241 
242 	read_lock(&ip6_ra_lock);
243 	for (ra = ip6_ra_chain; ra; ra = ra->next) {
244 		struct sock *sk = ra->sk;
245 		if (sk && ra->sel == sel &&
246 		    (!sk->sk_bound_dev_if ||
247 		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
248 			if (last) {
249 				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
250 				if (skb2)
251 					rawv6_rcv(last, skb2);
252 			}
253 			last = sk;
254 		}
255 	}
256 
257 	if (last) {
258 		rawv6_rcv(last, skb);
259 		read_unlock(&ip6_ra_lock);
260 		return 1;
261 	}
262 	read_unlock(&ip6_ra_lock);
263 	return 0;
264 }
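/*
 * Added note: the loop above hands a clone to every matching
 * router-alert socket except the last match, which receives the
 * original skb, avoiding one clone in the common single-listener case:
 *
 *	matching sockets: s1, s2, s3
 *	s1 <- skb_clone(skb)
 *	s2 <- skb_clone(skb)
 *	s3 <- skb (the original)
 *
 * Returning 1 tells the caller the packet was consumed here.
 */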
265 
266 static int ip6_forward_proxy_check(struct sk_buff *skb)
267 {
268 	struct ipv6hdr *hdr = ipv6_hdr(skb);
269 	u8 nexthdr = hdr->nexthdr;
270 	__be16 frag_off;
271 	int offset;
272 
273 	if (ipv6_ext_hdr(nexthdr)) {
274 		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
275 		if (offset < 0)
276 			return 0;
277 	} else
278 		offset = sizeof(struct ipv6hdr);
279 
280 	if (nexthdr == IPPROTO_ICMPV6) {
281 		struct icmp6hdr *icmp6;
282 
283 		if (!pskb_may_pull(skb, (skb_network_header(skb) +
284 					 offset + 1 - skb->data)))
285 			return 0;
286 
287 		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
288 
289 		switch (icmp6->icmp6_type) {
290 		case NDISC_ROUTER_SOLICITATION:
291 		case NDISC_ROUTER_ADVERTISEMENT:
292 		case NDISC_NEIGHBOUR_SOLICITATION:
293 		case NDISC_NEIGHBOUR_ADVERTISEMENT:
294 		case NDISC_REDIRECT:
295 			/* For reactions involving unicast neighbor discovery
296 			 * messages destined to the proxied address, pass them
297 			 * to the input function.
298 			 */
299 			return 1;
300 		default:
301 			break;
302 		}
303 	}
304 
305 	/*
306 	 * The proxying router can't forward traffic sent to a link-local
307 	 * address, so signal the sender and discard the packet. This
308 	 * behavior is clarified by the MIPv6 specification.
309 	 */
310 	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
311 		dst_link_failure(skb);
312 		return -1;
313 	}
314 
315 	return 0;
316 }
317 
318 static inline int ip6_forward_finish(struct sk_buff *skb)
319 {
320 	return dst_output(skb);
321 }
322 
323 int ip6_forward(struct sk_buff *skb)
324 {
325 	struct dst_entry *dst = skb_dst(skb);
326 	struct ipv6hdr *hdr = ipv6_hdr(skb);
327 	struct inet6_skb_parm *opt = IP6CB(skb);
328 	struct net *net = dev_net(dst->dev);
329 	u32 mtu;
330 
331 	if (net->ipv6.devconf_all->forwarding == 0)
332 		goto error;
333 
334 	if (skb_warn_if_lro(skb))
335 		goto drop;
336 
337 	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
338 		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
339 		goto drop;
340 	}
341 
342 	if (skb->pkt_type != PACKET_HOST)
343 		goto drop;
344 
345 	skb_forward_csum(skb);
346 
347 	/*
348 	 *	We DO NOT do any processing on
349 	 *	RA packets, pushing them to user level AS IS
350 	 *	without any warranty that the application will be able
351 	 *	to interpret them. The reason is that we
352 	 *	cannot do anything clever here.
353 	 *
354 	 *	We are not an end node, so if the packet contains
355 	 *	AH/ESP we cannot do anything with it.
356 	 *	Defragmentation would also be a mistake; RA packets
357 	 *	cannot be fragmented, because there is no guarantee
358 	 *	that different fragments will travel along one path. --ANK
359 	 */
360 	if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
361 		if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
362 			return 0;
363 	}
364 
365 	/*
366 	 *	check and decrement ttl
367 	 */
368 	if (hdr->hop_limit <= 1) {
369 		/* Force the output device to be used for source address selection */
370 		skb->dev = dst->dev;
371 		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
372 		IP6_INC_STATS_BH(net,
373 				 ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);
374 
375 		kfree_skb(skb);
376 		return -ETIMEDOUT;
377 	}
378 
379 	/* XXX: idev->cnf.proxy_ndp? */
380 	if (net->ipv6.devconf_all->proxy_ndp &&
381 	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
382 		int proxied = ip6_forward_proxy_check(skb);
383 		if (proxied > 0)
384 			return ip6_input(skb);
385 		else if (proxied < 0) {
386 			IP6_INC_STATS(net, ip6_dst_idev(dst),
387 				      IPSTATS_MIB_INDISCARDS);
388 			goto drop;
389 		}
390 	}
391 
392 	if (!xfrm6_route_forward(skb)) {
393 		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
394 		goto drop;
395 	}
396 	dst = skb_dst(skb);
397 
398 	/* IPv6 specs say nothing about it, but it is clear that we cannot
399 	   send redirects to source routed frames.
400 	   We don't send redirects to frames decapsulated from IPsec.
401 	 */
402 	if (skb->dev == dst->dev && opt->srcrt == 0 && !skb_sec_path(skb)) {
403 		struct in6_addr *target = NULL;
404 		struct inet_peer *peer;
405 		struct rt6_info *rt;
406 
407 		/*
408 		 *	incoming and outgoing devices are the same
409 		 *	send a redirect.
410 		 */
411 
412 		rt = (struct rt6_info *) dst;
413 		if (rt->rt6i_flags & RTF_GATEWAY)
414 			target = &rt->rt6i_gateway;
415 		else
416 			target = &hdr->daddr;
417 
418 		peer = inet_getpeer_v6(net->ipv6.peers, &rt->rt6i_dst.addr, 1);
419 
420 		/* Limit redirects both by destination (here)
421 		   and by source (inside ndisc_send_redirect)
422 		 */
423 		if (inet_peer_xrlim_allow(peer, 1*HZ))
424 			ndisc_send_redirect(skb, target);
425 		if (peer)
426 			inet_putpeer(peer);
427 	} else {
428 		int addrtype = ipv6_addr_type(&hdr->saddr);
429 
430 		/* This check is security critical. */
431 		if (addrtype == IPV6_ADDR_ANY ||
432 		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
433 			goto error;
434 		if (addrtype & IPV6_ADDR_LINKLOCAL) {
435 			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
436 				    ICMPV6_NOT_NEIGHBOUR, 0);
437 			goto error;
438 		}
439 	}
440 
441 	mtu = dst_mtu(dst);
442 	if (mtu < IPV6_MIN_MTU)
443 		mtu = IPV6_MIN_MTU;
444 
445 	if ((!skb->local_df && skb->len > mtu && !skb_is_gso(skb)) ||
446 	    (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)) {
447 		/* Again, force the output device to be used for source address selection */
448 		skb->dev = dst->dev;
449 		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
450 		IP6_INC_STATS_BH(net,
451 				 ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS);
452 		IP6_INC_STATS_BH(net,
453 				 ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS);
454 		kfree_skb(skb);
455 		return -EMSGSIZE;
456 	}
457 
458 	if (skb_cow(skb, dst->dev->hard_header_len)) {
459 		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
460 		goto drop;
461 	}
462 
463 	hdr = ipv6_hdr(skb);
464 
465 	/* Mangling hops number delayed to point after skb COW */
466 
467 	hdr->hop_limit--;
468 
469 	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
470 	IP6_ADD_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
471 	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
472 		       ip6_forward_finish);
473 
474 error:
475 	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
476 drop:
477 	kfree_skb(skb);
478 	return -EINVAL;
479 }
480 
481 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
482 {
483 	to->pkt_type = from->pkt_type;
484 	to->priority = from->priority;
485 	to->protocol = from->protocol;
486 	skb_dst_drop(to);
487 	skb_dst_set(to, dst_clone(skb_dst(from)));
488 	to->dev = from->dev;
489 	to->mark = from->mark;
490 
491 #ifdef CONFIG_NET_SCHED
492 	to->tc_index = from->tc_index;
493 #endif
494 	nf_copy(to, from);
495 #if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE)
496 	to->nf_trace = from->nf_trace;
497 #endif
498 	skb_copy_secmark(to, from);
499 }
500 
501 int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
502 {
503 	struct sk_buff *frag;
504 	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
505 	struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
506 	struct ipv6hdr *tmp_hdr;
507 	struct frag_hdr *fh;
508 	unsigned int mtu, hlen, left, len;
509 	int hroom, troom;
510 	__be32 frag_id = 0;
511 	int ptr, offset = 0, err = 0;
512 	u8 *prevhdr, nexthdr = 0;
513 	struct net *net = dev_net(skb_dst(skb)->dev);
514 
515 	hlen = ip6_find_1stfragopt(skb, &prevhdr);
516 	nexthdr = *prevhdr;
517 
518 	mtu = ip6_skb_dst_mtu(skb);
519 
520 	/* We must not fragment if the socket is set to force MTU discovery
521 	 * or if the skb was not generated by a local socket.
522 	 */
523 	if (unlikely(!skb->local_df && skb->len > mtu) ||
524 		     (IP6CB(skb)->frag_max_size &&
525 		      IP6CB(skb)->frag_max_size > mtu)) {
526 		if (skb->sk && dst_allfrag(skb_dst(skb)))
527 			sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);
528 
529 		skb->dev = skb_dst(skb)->dev;
530 		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
531 		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
532 			      IPSTATS_MIB_FRAGFAILS);
533 		kfree_skb(skb);
534 		return -EMSGSIZE;
535 	}
536 
537 	if (np && np->frag_size < mtu) {
538 		if (np->frag_size)
539 			mtu = np->frag_size;
540 	}
541 	mtu -= hlen + sizeof(struct frag_hdr);
542 
543 	if (skb_has_frag_list(skb)) {
544 		int first_len = skb_pagelen(skb);
545 		struct sk_buff *frag2;
546 
547 		if (first_len - hlen > mtu ||
548 		    ((first_len - hlen) & 7) ||
549 		    skb_cloned(skb))
550 			goto slow_path;
551 
552 		skb_walk_frags(skb, frag) {
553 			/* Correct geometry. */
554 			if (frag->len > mtu ||
555 			    ((frag->len & 7) && frag->next) ||
556 			    skb_headroom(frag) < hlen)
557 				goto slow_path_clean;
558 
559 			/* Partially cloned skb? */
560 			if (skb_shared(frag))
561 				goto slow_path_clean;
562 
563 			BUG_ON(frag->sk);
564 			if (skb->sk) {
565 				frag->sk = skb->sk;
566 				frag->destructor = sock_wfree;
567 			}
568 			skb->truesize -= frag->truesize;
569 		}
570 
571 		err = 0;
572 		offset = 0;
573 		frag = skb_shinfo(skb)->frag_list;
574 		skb_frag_list_init(skb);
575 		/* BUILD HEADER */
576 
577 		*prevhdr = NEXTHDR_FRAGMENT;
578 		tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
579 		if (!tmp_hdr) {
580 			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
581 				      IPSTATS_MIB_FRAGFAILS);
582 			return -ENOMEM;
583 		}
584 
585 		__skb_pull(skb, hlen);
586 		fh = (struct frag_hdr *)__skb_push(skb, sizeof(struct frag_hdr));
587 		__skb_push(skb, hlen);
588 		skb_reset_network_header(skb);
589 		memcpy(skb_network_header(skb), tmp_hdr, hlen);
590 
591 		ipv6_select_ident(fh, rt);
592 		fh->nexthdr = nexthdr;
593 		fh->reserved = 0;
594 		fh->frag_off = htons(IP6_MF);
595 		frag_id = fh->identification;
596 
597 		first_len = skb_pagelen(skb);
598 		skb->data_len = first_len - skb_headlen(skb);
599 		skb->len = first_len;
600 		ipv6_hdr(skb)->payload_len = htons(first_len -
601 						   sizeof(struct ipv6hdr));
602 
603 		dst_hold(&rt->dst);
604 
605 		for (;;) {
606 			/* Prepare the header of the next frame
607 			 * before the previous one goes down. */
608 			if (frag) {
609 				frag->ip_summed = CHECKSUM_NONE;
610 				skb_reset_transport_header(frag);
611 				fh = (struct frag_hdr *)__skb_push(frag, sizeof(struct frag_hdr));
612 				__skb_push(frag, hlen);
613 				skb_reset_network_header(frag);
614 				memcpy(skb_network_header(frag), tmp_hdr,
615 				       hlen);
616 				offset += skb->len - hlen - sizeof(struct frag_hdr);
617 				fh->nexthdr = nexthdr;
618 				fh->reserved = 0;
619 				fh->frag_off = htons(offset);
620 				if (frag->next != NULL)
621 					fh->frag_off |= htons(IP6_MF);
622 				fh->identification = frag_id;
623 				ipv6_hdr(frag)->payload_len =
624 						htons(frag->len -
625 						      sizeof(struct ipv6hdr));
626 				ip6_copy_metadata(frag, skb);
627 			}
628 
629 			err = output(skb);
630 			if (!err)
631 				IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
632 					      IPSTATS_MIB_FRAGCREATES);
633 
634 			if (err || !frag)
635 				break;
636 
637 			skb = frag;
638 			frag = skb->next;
639 			skb->next = NULL;
640 		}
641 
642 		kfree(tmp_hdr);
643 
644 		if (err == 0) {
645 			IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
646 				      IPSTATS_MIB_FRAGOKS);
647 			ip6_rt_put(rt);
648 			return 0;
649 		}
650 
651 		while (frag) {
652 			skb = frag->next;
653 			kfree_skb(frag);
654 			frag = skb;
655 		}
656 
657 		IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
658 			      IPSTATS_MIB_FRAGFAILS);
659 		ip6_rt_put(rt);
660 		return err;
661 
662 slow_path_clean:
663 		skb_walk_frags(skb, frag2) {
664 			if (frag2 == frag)
665 				break;
666 			frag2->sk = NULL;
667 			frag2->destructor = NULL;
668 			skb->truesize += frag2->truesize;
669 		}
670 	}
671 
672 slow_path:
673 	if ((skb->ip_summed == CHECKSUM_PARTIAL) &&
674 	    skb_checksum_help(skb))
675 		goto fail;
676 
677 	left = skb->len - hlen;		/* Space per frame */
678 	ptr = hlen;			/* Where to start from */
679 
680 	/*
681 	 *	Fragment the datagram.
682 	 */
683 
684 	*prevhdr = NEXTHDR_FRAGMENT;
685 	hroom = LL_RESERVED_SPACE(rt->dst.dev);
686 	troom = rt->dst.dev->needed_tailroom;
687 
688 	/*
689 	 *	Keep copying data until we run out.
690 	 */
691 	while (left > 0) {
692 		len = left;
693 		/* IF: it doesn't fit, use 'mtu' - the data space left */
694 		if (len > mtu)
695 			len = mtu;
696 		/* IF: we are not sending up to and including the packet end,
697 		   then align the next start on an eight-byte boundary */
698 		if (len < left)	{
699 			len &= ~7;
700 		}
701 		/*
702 		 *	Allocate buffer.
703 		 */
704 
705 		if ((frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
706 				      hroom + troom, GFP_ATOMIC)) == NULL) {
707 			NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
708 			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
709 				      IPSTATS_MIB_FRAGFAILS);
710 			err = -ENOMEM;
711 			goto fail;
712 		}
713 
714 		/*
715 		 *	Set up data on packet
716 		 */
717 
718 		ip6_copy_metadata(frag, skb);
719 		skb_reserve(frag, hroom);
720 		skb_put(frag, len + hlen + sizeof(struct frag_hdr));
721 		skb_reset_network_header(frag);
722 		fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
723 		frag->transport_header = (frag->network_header + hlen +
724 					  sizeof(struct frag_hdr));
725 
726 		/*
727 		 *	Charge the memory for the fragment to any owner
728 		 *	it might possess
729 		 */
730 		if (skb->sk)
731 			skb_set_owner_w(frag, skb->sk);
732 
733 		/*
734 		 *	Copy the packet header into the new buffer.
735 		 */
736 		skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
737 
738 		/*
739 		 *	Build fragment header.
740 		 */
741 		fh->nexthdr = nexthdr;
742 		fh->reserved = 0;
743 		if (!frag_id) {
744 			ipv6_select_ident(fh, rt);
745 			frag_id = fh->identification;
746 		} else
747 			fh->identification = frag_id;
748 
749 		/*
750 		 *	Copy a block of the IP datagram.
751 		 */
752 		if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len))
753 			BUG();
754 		left -= len;
755 
756 		fh->frag_off = htons(offset);
757 		if (left > 0)
758 			fh->frag_off |= htons(IP6_MF);
759 		ipv6_hdr(frag)->payload_len = htons(frag->len -
760 						    sizeof(struct ipv6hdr));
761 
762 		ptr += len;
763 		offset += len;
764 
765 		/*
766 		 *	Put this fragment into the sending queue.
767 		 */
768 		err = output(frag);
769 		if (err)
770 			goto fail;
771 
772 		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
773 			      IPSTATS_MIB_FRAGCREATES);
774 	}
775 	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
776 		      IPSTATS_MIB_FRAGOKS);
777 	consume_skb(skb);
778 	return err;
779 
780 fail:
781 	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
782 		      IPSTATS_MIB_FRAGFAILS);
783 	kfree_skb(skb);
784 	return err;
785 }
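/*
 * Worked example (added commentary, illustrative numbers): on the slow
 * path with an unfragmentable part of hlen = 40 (a bare IPv6 header)
 * and a path MTU of 1500, the per-fragment budget computed above is
 *
 *	mtu = 1500 - 40 - sizeof(struct frag_hdr) = 1452
 *	len = 1452 & ~7                           = 1448
 *
 * so a 4000-byte payload leaves as fragments of 1448, 1448 and 1104
 * bytes at offsets 0, 1448 and 2896.  Every offset is a multiple of 8,
 * as the fragment header format requires, and IP6_MF is set on all
 * fragments except the last.
 */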
786 
787 static inline int ip6_rt_check(const struct rt6key *rt_key,
788 			       const struct in6_addr *fl_addr,
789 			       const struct in6_addr *addr_cache)
790 {
791 	return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
792 		(addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache));
793 }
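/*
 * Added note: ip6_rt_check() returns nonzero when the cached route can
 * no longer be trusted for fl_addr.  The dst is kept only if either
 * (a) it is a /128 host route to exactly fl_addr, or (b) the socket's
 * cached peer address (daddr_cache/saddr_cache, maintained only for
 * connected sockets) still matches fl_addr.
 */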
794 
795 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
796 					  struct dst_entry *dst,
797 					  const struct flowi6 *fl6)
798 {
799 	struct ipv6_pinfo *np = inet6_sk(sk);
800 	struct rt6_info *rt;
801 
802 	if (!dst)
803 		goto out;
804 
805 	if (dst->ops->family != AF_INET6) {
806 		dst_release(dst);
807 		return NULL;
808 	}
809 
810 	rt = (struct rt6_info *)dst;
811 	/* Yes, checking route validity in the unconnected
812 	 * case is not very simple. Take into account
813 	 * that we do not support routing by source, TOS,
814 	 * or MSG_DONTROUTE 		--ANK (980726)
815 	 *
816 	 * 1. ip6_rt_check(): If the route was a host route,
817 	 *    check that the cached destination is current.
818 	 *    If it is a network route, we may still
819 	 *    check its validity using a saved pointer
820 	 *    to the last used address: daddr_cache.
821 	 *    We do not want to save the whole address now
822 	 *    (because the main consumer of this service
823 	 *    is TCP, which does not have this problem),
824 	 *    so the last trick works only on connected
825 	 *    sockets.
826 	 * 2. oif should also be the same.
827 	 */
828 	if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
829 #ifdef CONFIG_IPV6_SUBTREES
830 	    ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
831 #endif
832 	    (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) {
833 		dst_release(dst);
834 		dst = NULL;
835 	}
836 
837 out:
838 	return dst;
839 }
840 
841 static int ip6_dst_lookup_tail(struct sock *sk,
842 			       struct dst_entry **dst, struct flowi6 *fl6)
843 {
844 	struct net *net = sock_net(sk);
845 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
846 	struct neighbour *n;
847 	struct rt6_info *rt;
848 #endif
849 	int err;
850 
851 	if (*dst == NULL)
852 		*dst = ip6_route_output(net, sk, fl6);
853 
854 	if ((err = (*dst)->error))
855 		goto out_err_release;
856 
857 	if (ipv6_addr_any(&fl6->saddr)) {
858 		struct rt6_info *rt = (struct rt6_info *) *dst;
859 		err = ip6_route_get_saddr(net, rt, &fl6->daddr,
860 					  sk ? inet6_sk(sk)->srcprefs : 0,
861 					  &fl6->saddr);
862 		if (err)
863 			goto out_err_release;
864 	}
865 
866 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
867 	/*
868 	 * Here if the dst entry we've looked up
869 	 * has a neighbour entry that is in the INCOMPLETE
870 	 * state and the src address from the flow is
871 	 * marked as OPTIMISTIC, we release the found
872 	 * dst entry and replace it with the
873 	 * dst entry of the nexthop router.
874 	 */
875 	rt = (struct rt6_info *) *dst;
876 	rcu_read_lock_bh();
877 	n = __ipv6_neigh_lookup_noref(rt->dst.dev, rt6_nexthop(rt, &fl6->daddr));
878 	err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
879 	rcu_read_unlock_bh();
880 
881 	if (err) {
882 		struct inet6_ifaddr *ifp;
883 		struct flowi6 fl_gw6;
884 		int redirect;
885 
886 		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
887 				      (*dst)->dev, 1);
888 
889 		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
890 		if (ifp)
891 			in6_ifa_put(ifp);
892 
893 		if (redirect) {
894 			/*
895 			 * We need to get the dst entry for the
896 			 * default router instead
897 			 */
898 			dst_release(*dst);
899 			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
900 			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
901 			*dst = ip6_route_output(net, sk, &fl_gw6);
902 			if ((err = (*dst)->error))
903 				goto out_err_release;
904 		}
905 	}
906 #endif
907 
908 	return 0;
909 
910 out_err_release:
911 	if (err == -ENETUNREACH)
912 		IP6_INC_STATS_BH(net, NULL, IPSTATS_MIB_OUTNOROUTES);
913 	dst_release(*dst);
914 	*dst = NULL;
915 	return err;
916 }
917 
918 /**
919  *	ip6_dst_lookup - perform route lookup on flow
920  *	@sk: socket which provides route info
921  *	@dst: pointer to dst_entry * for result
922  *	@fl6: flow to lookup
923  *
924  *	This function performs a route lookup on the given flow.
925  *
926  *	It returns zero on success, or a standard errno code on error.
927  */
928 int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi6 *fl6)
929 {
930 	*dst = NULL;
931 	return ip6_dst_lookup_tail(sk, dst, fl6);
932 }
933 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
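/*
 * A minimal usage sketch (added commentary; the caller below is a
 * hypothetical example): the dst returned through *dst must be
 * released with dst_release() when the caller is done with it.
 */
#if 0	/* illustrative sketch, not compiled */
static int example_route_flow(struct sock *sk, struct flowi6 *fl6)
{
	struct dst_entry *dst;
	int err;

	err = ip6_dst_lookup(sk, &dst, fl6);
	if (err)
		return err;	/* on error *dst has been set to NULL */

	/* ... use the route, e.g. read dst_mtu(dst) ... */
	dst_release(dst);
	return 0;
}
#endif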
934 
935 /**
936  *	ip6_dst_lookup_flow - perform route lookup on flow with ipsec
937  *	@sk: socket which provides route info
938  *	@fl6: flow to lookup
939  *	@final_dst: final destination address for ipsec lookup
940  *	@can_sleep: we are in a sleepable context
941  *
942  *	This function performs a route lookup on the given flow.
943  *
944  *	It returns a valid dst pointer on success, or a pointer encoded
945  *	error code.
946  */
947 struct dst_entry *ip6_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
948 				      const struct in6_addr *final_dst,
949 				      bool can_sleep)
950 {
951 	struct dst_entry *dst = NULL;
952 	int err;
953 
954 	err = ip6_dst_lookup_tail(sk, &dst, fl6);
955 	if (err)
956 		return ERR_PTR(err);
957 	if (final_dst)
958 		fl6->daddr = *final_dst;
959 	if (can_sleep)
960 		fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;
961 
962 	return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
963 }
964 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
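/*
 * Usage sketch (added commentary): unlike ip6_dst_lookup(), errors are
 * pointer-encoded here, so callers test with IS_ERR()/PTR_ERR() rather
 * than checking an int return.
 */
#if 0	/* illustrative sketch, not compiled */
	dst = ip6_dst_lookup_flow(sk, &fl6, final_p, false);
	if (IS_ERR(dst)) {
		err = PTR_ERR(dst);
		goto out;
	}
#endif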
965 
966 /**
967  *	ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
968  *	@sk: socket which provides the dst cache and route info
969  *	@fl6: flow to lookup
970  *	@final_dst: final destination address for ipsec lookup
971  *	@can_sleep: we are in a sleepable context
972  *
973  *	This function performs a route lookup on the given flow with the
974  *	possibility of using the cached route in the socket if it is valid.
975  *	It will take the socket dst lock when operating on the dst cache.
976  *	As a result, this function can only be used in process context.
977  *
978  *	It returns a valid dst pointer on success, or a pointer encoded
979  *	error code.
980  */
981 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
982 					 const struct in6_addr *final_dst,
983 					 bool can_sleep)
984 {
985 	struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
986 	int err;
987 
988 	dst = ip6_sk_dst_check(sk, dst, fl6);
989 
990 	err = ip6_dst_lookup_tail(sk, &dst, fl6);
991 	if (err)
992 		return ERR_PTR(err);
993 	if (final_dst)
994 		fl6->daddr = *final_dst;
995 	if (can_sleep)
996 		fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;
997 
998 	return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
999 }
1000 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1001 
1002 static inline int ip6_ufo_append_data(struct sock *sk,
1003 			int getfrag(void *from, char *to, int offset, int len,
1004 			int odd, struct sk_buff *skb),
1005 			void *from, int length, int hh_len, int fragheaderlen,
1006 			int transhdrlen, int mtu, unsigned int flags,
1007 			struct rt6_info *rt)
1008 
1009 {
1010 	struct sk_buff *skb;
1011 	int err;
1012 
1013 	/* The network device supports UDP large send offload, so
1014 	 * create one single skb packet containing the complete
1015 	 * UDP datagram
1016 	 */
1017 	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
1018 		skb = sock_alloc_send_skb(sk,
1019 			hh_len + fragheaderlen + transhdrlen + 20,
1020 			(flags & MSG_DONTWAIT), &err);
1021 		if (skb == NULL)
1022 			return err;
1023 
1024 		/* reserve space for Hardware header */
1025 		skb_reserve(skb, hh_len);
1026 
1027 		/* create space for UDP/IP header */
1028 		skb_put(skb, fragheaderlen + transhdrlen);
1029 
1030 		/* initialize network header pointer */
1031 		skb_reset_network_header(skb);
1032 
1033 		/* initialize protocol header pointer */
1034 		skb->transport_header = skb->network_header + fragheaderlen;
1035 
1036 		skb->protocol = htons(ETH_P_IPV6);
1037 		skb->ip_summed = CHECKSUM_PARTIAL;
1038 		skb->csum = 0;
1039 	}
1040 
1041 	err = skb_append_datato_frags(sk, skb, getfrag, from,
1042 				      (length - transhdrlen));
1043 	if (!err) {
1044 		struct frag_hdr fhdr;
1045 
1046 		/* Specify the length of each IPv6 datagram fragment.
1047 		 * It has to be a multiple of 8.
1048 		 */
1049 		skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
1050 					     sizeof(struct frag_hdr)) & ~7;
1051 		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
1052 		ipv6_select_ident(&fhdr, rt);
1053 		skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
1054 		__skb_queue_tail(&sk->sk_write_queue, skb);
1055 
1056 		return 0;
1057 	}
1058 	/* There is not enough support to do UDP LSO,
1059 	 * so follow the normal path
1060 	 */
1061 	kfree_skb(skb);
1062 
1063 	return err;
1064 }
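/*
 * Worked example (added commentary, illustrative numbers): with
 * mtu = 1500 and fragheaderlen = 40 (a bare IPv6 header), the gso_size
 * chosen above is
 *
 *	(1500 - 40 - sizeof(struct frag_hdr)) & ~7 = 1452 & ~7 = 1448
 *
 * so each fragment the device emits carries 1448 payload bytes and all
 * fragment offsets remain multiples of 8.
 */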
1065 
1066 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1067 					       gfp_t gfp)
1068 {
1069 	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1070 }
1071 
1072 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1073 						gfp_t gfp)
1074 {
1075 	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1076 }
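/*
 * Added note: both dup helpers rely on the extension header length
 * encoding of RFC 2460, where hdrlen counts 8-octet units beyond the
 * first.  A routing header with hdrlen == 2, for example, occupies
 * (2 + 1) * 8 = 24 bytes, which is exactly what kmemdup() copies.
 */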
1077 
1078 static void ip6_append_data_mtu(unsigned int *mtu,
1079 				int *maxfraglen,
1080 				unsigned int fragheaderlen,
1081 				struct sk_buff *skb,
1082 				struct rt6_info *rt,
1083 				bool pmtuprobe)
1084 {
1085 	if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1086 		if (skb == NULL) {
1087 			/* first fragment, reserve header_len */
1088 			*mtu = *mtu - rt->dst.header_len;
1089 
1090 		} else {
1091 			/*
1092 			 * this fragment is not the first, so the header
1093 			 * space is regarded as data space.
1094 			 */
1095 			*mtu = min(*mtu, pmtuprobe ?
1096 				   rt->dst.dev->mtu :
1097 				   dst_mtu(rt->dst.path));
1098 		}
1099 		*maxfraglen = ((*mtu - fragheaderlen) & ~7)
1100 			      + fragheaderlen - sizeof(struct frag_hdr);
1101 	}
1102 }
1103 
1104 int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
1105 	int offset, int len, int odd, struct sk_buff *skb),
1106 	void *from, int length, int transhdrlen,
1107 	int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi6 *fl6,
1108 	struct rt6_info *rt, unsigned int flags, int dontfrag)
1109 {
1110 	struct inet_sock *inet = inet_sk(sk);
1111 	struct ipv6_pinfo *np = inet6_sk(sk);
1112 	struct inet_cork *cork;
1113 	struct sk_buff *skb, *skb_prev = NULL;
1114 	unsigned int maxfraglen, fragheaderlen, mtu;
1115 	int exthdrlen;
1116 	int dst_exthdrlen;
1117 	int hh_len;
1118 	int copy;
1119 	int err;
1120 	int offset = 0;
1121 	__u8 tx_flags = 0;
1122 
1123 	if (flags&MSG_PROBE)
1124 		return 0;
1125 	cork = &inet->cork.base;
1126 	if (skb_queue_empty(&sk->sk_write_queue)) {
1127 		/*
1128 		 * setup for corking
1129 		 */
1130 		if (opt) {
1131 			if (WARN_ON(np->cork.opt))
1132 				return -EINVAL;
1133 
1134 			np->cork.opt = kzalloc(opt->tot_len, sk->sk_allocation);
1135 			if (unlikely(np->cork.opt == NULL))
1136 				return -ENOBUFS;
1137 
1138 			np->cork.opt->tot_len = opt->tot_len;
1139 			np->cork.opt->opt_flen = opt->opt_flen;
1140 			np->cork.opt->opt_nflen = opt->opt_nflen;
1141 
1142 			np->cork.opt->dst0opt = ip6_opt_dup(opt->dst0opt,
1143 							    sk->sk_allocation);
1144 			if (opt->dst0opt && !np->cork.opt->dst0opt)
1145 				return -ENOBUFS;
1146 
1147 			np->cork.opt->dst1opt = ip6_opt_dup(opt->dst1opt,
1148 							    sk->sk_allocation);
1149 			if (opt->dst1opt && !np->cork.opt->dst1opt)
1150 				return -ENOBUFS;
1151 
1152 			np->cork.opt->hopopt = ip6_opt_dup(opt->hopopt,
1153 							   sk->sk_allocation);
1154 			if (opt->hopopt && !np->cork.opt->hopopt)
1155 				return -ENOBUFS;
1156 
1157 			np->cork.opt->srcrt = ip6_rthdr_dup(opt->srcrt,
1158 							    sk->sk_allocation);
1159 			if (opt->srcrt && !np->cork.opt->srcrt)
1160 				return -ENOBUFS;
1161 
1162 			/* need source address above. --miyazawa */
1163 		}
1164 		dst_hold(&rt->dst);
1165 		cork->dst = &rt->dst;
1166 		inet->cork.fl.u.ip6 = *fl6;
1167 		np->cork.hop_limit = hlimit;
1168 		np->cork.tclass = tclass;
1169 		if (rt->dst.flags & DST_XFRM_TUNNEL)
1170 			mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
1171 			      rt->dst.dev->mtu : dst_mtu(&rt->dst);
1172 		else
1173 			mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
1174 			      rt->dst.dev->mtu : dst_mtu(rt->dst.path);
1175 		if (np->frag_size < mtu) {
1176 			if (np->frag_size)
1177 				mtu = np->frag_size;
1178 		}
1179 		cork->fragsize = mtu;
1180 		if (dst_allfrag(rt->dst.path))
1181 			cork->flags |= IPCORK_ALLFRAG;
1182 		cork->length = 0;
1183 		exthdrlen = (opt ? opt->opt_flen : 0);
1184 		length += exthdrlen;
1185 		transhdrlen += exthdrlen;
1186 		dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
1187 	} else {
1188 		rt = (struct rt6_info *)cork->dst;
1189 		fl6 = &inet->cork.fl.u.ip6;
1190 		opt = np->cork.opt;
1191 		transhdrlen = 0;
1192 		exthdrlen = 0;
1193 		dst_exthdrlen = 0;
1194 		mtu = cork->fragsize;
1195 	}
1196 
1197 	hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1198 
1199 	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1200 			(opt ? opt->opt_nflen : 0);
1201 	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);
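	/*
	 * Worked example (added commentary, illustrative numbers): with
	 * mtu = 1500 and fragheaderlen = 40, the computation above gives
	 *
	 *	maxfraglen = ((1500 - 40) & ~7) + 40 - 8 = 1488
	 *
	 * i.e. each non-final fragment carries 1488 - 40 = 1448 payload
	 * bytes, leaving room for the 8-byte fragment header that
	 * ip6_fragment() will insert while keeping offsets 8-aligned.
	 */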
1202 
1203 	if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
1204 		if (cork->length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
1205 			ipv6_local_error(sk, EMSGSIZE, fl6, mtu-exthdrlen);
1206 			return -EMSGSIZE;
1207 		}
1208 	}
1209 
1210 	/* For UDP, check if TX timestamp is enabled */
1211 	if (sk->sk_type == SOCK_DGRAM)
1212 		sock_tx_timestamp(sk, &tx_flags);
1213 
1214 	/*
1215 	 * Let's try using as much space as possible.
1216 	 * Use MTU if total length of the message fits into the MTU.
1217 	 * Otherwise, we need to reserve the fragment header and
1218 	 * fragment alignment (= 8-15 octets, in total).
1219 	 *
1220 	 * Note that we may need to "move" the data from the tail
1221 	 * of the buffer to the new fragment when we split
1222 	 * the message.
1223 	 *
1224 	 * FIXME: It may be fragmented into multiple chunks
1225 	 *        at once if non-fragmentable extension headers
1226 	 *        are too large.
1227 	 * --yoshfuji
1228 	 */
1229 
1230 	cork->length += length;
1231 	if (length > mtu) {
1232 		int proto = sk->sk_protocol;
1233 		if (dontfrag && (proto == IPPROTO_UDP || proto == IPPROTO_RAW)) {
1234 			ipv6_local_rxpmtu(sk, fl6, mtu-exthdrlen);
1235 			return -EMSGSIZE;
1236 		}
1237 
1238 		if (proto == IPPROTO_UDP &&
1239 		    (rt->dst.dev->features & NETIF_F_UFO)) {
1240 
1241 			err = ip6_ufo_append_data(sk, getfrag, from, length,
1242 						  hh_len, fragheaderlen,
1243 						  transhdrlen, mtu, flags, rt);
1244 			if (err)
1245 				goto error;
1246 			return 0;
1247 		}
1248 	}
1249 
1250 	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
1251 		goto alloc_new_skb;
1252 
1253 	while (length > 0) {
1254 		/* Check if the remaining data fits into current packet. */
1255 		copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1256 		if (copy < length)
1257 			copy = maxfraglen - skb->len;
1258 
1259 		if (copy <= 0) {
1260 			char *data;
1261 			unsigned int datalen;
1262 			unsigned int fraglen;
1263 			unsigned int fraggap;
1264 			unsigned int alloclen;
1265 alloc_new_skb:
1266 			/* There's no room in the current skb */
1267 			if (skb)
1268 				fraggap = skb->len - maxfraglen;
1269 			else
1270 				fraggap = 0;
1271 			/* update mtu and maxfraglen if necessary */
1272 			if (skb == NULL || skb_prev == NULL)
1273 				ip6_append_data_mtu(&mtu, &maxfraglen,
1274 						    fragheaderlen, skb, rt,
1275 						    np->pmtudisc ==
1276 						    IPV6_PMTUDISC_PROBE);
1277 
1278 			skb_prev = skb;
1279 
1280 			/*
1281 			 * If remaining data exceeds the mtu,
1282 			 * we know we need more fragment(s).
1283 			 */
1284 			datalen = length + fraggap;
1285 
1286 			if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1287 				datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
1288 			if ((flags & MSG_MORE) &&
1289 			    !(rt->dst.dev->features&NETIF_F_SG))
1290 				alloclen = mtu;
1291 			else
1292 				alloclen = datalen + fragheaderlen;
1293 
1294 			alloclen += dst_exthdrlen;
1295 
1296 			if (datalen != length + fraggap) {
1297 				/*
1298 				 * this is not the last fragment, so the trailer
1299 				 * space is regarded as data space.
1300 				 */
1301 				datalen += rt->dst.trailer_len;
1302 			}
1303 
1304 			alloclen += rt->dst.trailer_len;
1305 			fraglen = datalen + fragheaderlen;
1306 
1307 			/*
1308 			 * We just reserve space for fragment header.
1309 			 * Note: this may be overallocation if the message
1310 			 * (without MSG_MORE) fits into the MTU.
1311 			 */
1312 			alloclen += sizeof(struct frag_hdr);
1313 
1314 			if (transhdrlen) {
1315 				skb = sock_alloc_send_skb(sk,
1316 						alloclen + hh_len,
1317 						(flags & MSG_DONTWAIT), &err);
1318 			} else {
1319 				skb = NULL;
1320 				if (atomic_read(&sk->sk_wmem_alloc) <=
1321 				    2 * sk->sk_sndbuf)
1322 					skb = sock_wmalloc(sk,
1323 							   alloclen + hh_len, 1,
1324 							   sk->sk_allocation);
1325 				if (unlikely(skb == NULL))
1326 					err = -ENOBUFS;
1327 				else {
1328 					/* Only the initial fragment
1329 					 * is time stamped.
1330 					 */
1331 					tx_flags = 0;
1332 				}
1333 			}
1334 			if (skb == NULL)
1335 				goto error;
1336 			/*
1337 			 *	Fill in the control structures
1338 			 */
1339 			skb->protocol = htons(ETH_P_IPV6);
1340 			skb->ip_summed = CHECKSUM_NONE;
1341 			skb->csum = 0;
1342 			/* reserve for fragmentation and ipsec header */
1343 			skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1344 				    dst_exthdrlen);
1345 
1346 			if (sk->sk_type == SOCK_DGRAM)
1347 				skb_shinfo(skb)->tx_flags = tx_flags;
1348 
1349 			/*
1350 			 *	Find where to start putting bytes
1351 			 */
1352 			data = skb_put(skb, fraglen);
1353 			skb_set_network_header(skb, exthdrlen);
1354 			data += fragheaderlen;
1355 			skb->transport_header = (skb->network_header +
1356 						 fragheaderlen);
1357 			if (fraggap) {
1358 				skb->csum = skb_copy_and_csum_bits(
1359 					skb_prev, maxfraglen,
1360 					data + transhdrlen, fraggap, 0);
1361 				skb_prev->csum = csum_sub(skb_prev->csum,
1362 							  skb->csum);
1363 				data += fraggap;
1364 				pskb_trim_unique(skb_prev, maxfraglen);
1365 			}
1366 			copy = datalen - transhdrlen - fraggap;
1367 
1368 			if (copy < 0) {
1369 				err = -EINVAL;
1370 				kfree_skb(skb);
1371 				goto error;
1372 			} else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
1373 				err = -EFAULT;
1374 				kfree_skb(skb);
1375 				goto error;
1376 			}
1377 
1378 			offset += copy;
1379 			length -= datalen - fraggap;
1380 			transhdrlen = 0;
1381 			exthdrlen = 0;
1382 			dst_exthdrlen = 0;
1383 
1384 			/*
1385 			 * Put the packet on the pending queue
1386 			 */
1387 			__skb_queue_tail(&sk->sk_write_queue, skb);
1388 			continue;
1389 		}
1390 
1391 		if (copy > length)
1392 			copy = length;
1393 
1394 		if (!(rt->dst.dev->features&NETIF_F_SG)) {
1395 			unsigned int off;
1396 
1397 			off = skb->len;
1398 			if (getfrag(from, skb_put(skb, copy),
1399 						offset, copy, off, skb) < 0) {
1400 				__skb_trim(skb, off);
1401 				err = -EFAULT;
1402 				goto error;
1403 			}
1404 		} else {
1405 			int i = skb_shinfo(skb)->nr_frags;
1406 			struct page_frag *pfrag = sk_page_frag(sk);
1407 
1408 			err = -ENOMEM;
1409 			if (!sk_page_frag_refill(sk, pfrag))
1410 				goto error;
1411 
1412 			if (!skb_can_coalesce(skb, i, pfrag->page,
1413 					      pfrag->offset)) {
1414 				err = -EMSGSIZE;
1415 				if (i == MAX_SKB_FRAGS)
1416 					goto error;
1417 
1418 				__skb_fill_page_desc(skb, i, pfrag->page,
1419 						     pfrag->offset, 0);
1420 				skb_shinfo(skb)->nr_frags = ++i;
1421 				get_page(pfrag->page);
1422 			}
1423 			copy = min_t(int, copy, pfrag->size - pfrag->offset);
1424 			if (getfrag(from,
1425 				    page_address(pfrag->page) + pfrag->offset,
1426 				    offset, copy, skb->len, skb) < 0)
1427 				goto error_efault;
1428 
1429 			pfrag->offset += copy;
1430 			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1431 			skb->len += copy;
1432 			skb->data_len += copy;
1433 			skb->truesize += copy;
1434 			atomic_add(copy, &sk->sk_wmem_alloc);
1435 		}
1436 		offset += copy;
1437 		length -= copy;
1438 	}
1439 
1440 	return 0;
1441 
1442 error_efault:
1443 	err = -EFAULT;
1444 error:
1445 	cork->length -= length;
1446 	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1447 	return err;
1448 }
1449 EXPORT_SYMBOL_GPL(ip6_append_data);
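/*
 * A minimal corking sketch (added commentary; the surrounding variable
 * names are hypothetical): datagram protocols queue payload with
 * ip6_append_data() and later flush the whole queue as one datagram
 * with ip6_push_pending_frames(), or discard it with
 * ip6_flush_pending_frames() on error.
 */
#if 0	/* illustrative sketch, not compiled */
	err = ip6_append_data(sk, getfrag, from, length, transhdrlen,
			      hlimit, tclass, opt, &fl6, rt,
			      msg->msg_flags, dontfrag);
	if (err)
		ip6_flush_pending_frames(sk);
	else if (!(msg->msg_flags & MSG_MORE))
		err = ip6_push_pending_frames(sk);
#endif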
1450 
1451 static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np)
1452 {
1453 	if (np->cork.opt) {
1454 		kfree(np->cork.opt->dst0opt);
1455 		kfree(np->cork.opt->dst1opt);
1456 		kfree(np->cork.opt->hopopt);
1457 		kfree(np->cork.opt->srcrt);
1458 		kfree(np->cork.opt);
1459 		np->cork.opt = NULL;
1460 	}
1461 
1462 	if (inet->cork.base.dst) {
1463 		dst_release(inet->cork.base.dst);
1464 		inet->cork.base.dst = NULL;
1465 		inet->cork.base.flags &= ~IPCORK_ALLFRAG;
1466 	}
1467 	memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
1468 }
1469 
1470 int ip6_push_pending_frames(struct sock *sk)
1471 {
1472 	struct sk_buff *skb, *tmp_skb;
1473 	struct sk_buff **tail_skb;
1474 	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1475 	struct inet_sock *inet = inet_sk(sk);
1476 	struct ipv6_pinfo *np = inet6_sk(sk);
1477 	struct net *net = sock_net(sk);
1478 	struct ipv6hdr *hdr;
1479 	struct ipv6_txoptions *opt = np->cork.opt;
1480 	struct rt6_info *rt = (struct rt6_info *)inet->cork.base.dst;
1481 	struct flowi6 *fl6 = &inet->cork.fl.u.ip6;
1482 	unsigned char proto = fl6->flowi6_proto;
1483 	int err = 0;
1484 
1485 	if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
1486 		goto out;
1487 	tail_skb = &(skb_shinfo(skb)->frag_list);
1488 
1489 	/* move skb->data to ip header from ext header */
1490 	if (skb->data < skb_network_header(skb))
1491 		__skb_pull(skb, skb_network_offset(skb));
1492 	while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
1493 		__skb_pull(tmp_skb, skb_network_header_len(skb));
1494 		*tail_skb = tmp_skb;
1495 		tail_skb = &(tmp_skb->next);
1496 		skb->len += tmp_skb->len;
1497 		skb->data_len += tmp_skb->len;
1498 		skb->truesize += tmp_skb->truesize;
1499 		tmp_skb->destructor = NULL;
1500 		tmp_skb->sk = NULL;
1501 	}
1502 
1503 	/* Allow local fragmentation. */
1504 	if (np->pmtudisc < IPV6_PMTUDISC_DO)
1505 		skb->local_df = 1;
1506 
1507 	*final_dst = fl6->daddr;
1508 	__skb_pull(skb, skb_network_header_len(skb));
1509 	if (opt && opt->opt_flen)
1510 		ipv6_push_frag_opts(skb, opt, &proto);
1511 	if (opt && opt->opt_nflen)
1512 		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);
1513 
1514 	skb_push(skb, sizeof(struct ipv6hdr));
1515 	skb_reset_network_header(skb);
1516 	hdr = ipv6_hdr(skb);
1517 
1518 	ip6_flow_hdr(hdr, np->cork.tclass, fl6->flowlabel);
1519 	hdr->hop_limit = np->cork.hop_limit;
1520 	hdr->nexthdr = proto;
1521 	hdr->saddr = fl6->saddr;
1522 	hdr->daddr = *final_dst;
1523 
1524 	skb->priority = sk->sk_priority;
1525 	skb->mark = sk->sk_mark;
1526 
1527 	skb_dst_set(skb, dst_clone(&rt->dst));
1528 	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1529 	if (proto == IPPROTO_ICMPV6) {
1530 		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1531 
1532 		ICMP6MSGOUT_INC_STATS_BH(net, idev, icmp6_hdr(skb)->icmp6_type);
1533 		ICMP6_INC_STATS_BH(net, idev, ICMP6_MIB_OUTMSGS);
1534 	}
1535 
1536 	err = ip6_local_out(skb);
1537 	if (err) {
1538 		if (err > 0)
1539 			err = net_xmit_errno(err);
1540 		if (err)
1541 			goto error;
1542 	}
1543 
1544 out:
1545 	ip6_cork_release(inet, np);
1546 	return err;
1547 error:
1548 	IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1549 	goto out;
1550 }
1551 EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
1552 
1553 void ip6_flush_pending_frames(struct sock *sk)
1554 {
1555 	struct sk_buff *skb;
1556 
1557 	while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
1558 		if (skb_dst(skb))
1559 			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1560 				      IPSTATS_MIB_OUTDISCARDS);
1561 		kfree_skb(skb);
1562 	}
1563 
1564 	ip6_cork_release(inet_sk(sk), inet6_sk(sk));
1565 }
1566 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
1567