/*
 *	IPv6 output functions
 *	Linux INET6 implementation
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	Based on linux/net/ipv4/ip_output.c
 *
 *	This program is free software; you can redistribute it and/or
 *      modify it under the terms of the GNU General Public License
 *      as published by the Free Software Foundation; either version
 *      2 of the License, or (at your option) any later version.
 *
 *	Changes:
 *	A.N.Kuznetsov	:	arithmetic in fragmentation.
 *				extension headers are implemented.
 *				route changes now work.
 *				ip6_forward does not confuse sniffers.
 *				etc.
 *
 *      H. von Brand    :       Added missing #include <linux/string.h>
 *	Imran Patel	: 	frag id should be in NBO
 *      Kazunori MIYAZAWA @USAGI
 *			:       add ip6_append_data and related functions
 *				for datagram xmit
 */

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/in6.h>
#include <linux/tcp.h>
#include <linux/route.h>
#include <linux/module.h>
#include <linux/slab.h>

#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>

#include <net/sock.h>
#include <net/snmp.h>

#include <net/ipv6.h>
#include <net/ndisc.h>
#include <net/protocol.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
#include <net/rawv6.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/checksum.h>
#include <linux/mroute6.h>

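/*
 * Finalise the IPv6 payload length and run the packet through the
 * NF_INET_LOCAL_OUT netfilter hook.  A payload larger than
 * IPV6_MAXPLEN is encoded as zero, the on-the-wire value used by
 * jumbograms (RFC 2675); the jumbo payload option itself is the
 * caller's business.  nf_hook() returns 1 when no hook consumed
 * the packet, which ip6_local_out() below uses to decide whether
 * to continue with dst_output().
 */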
int __ip6_local_out(struct sk_buff *skb)
{
	int len;

	len = skb->len - sizeof(struct ipv6hdr);
	if (len > IPV6_MAXPLEN)
		len = 0;
	ipv6_hdr(skb)->payload_len = htons(len);

	return nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
		       skb_dst(skb)->dev, dst_output);
}

int ip6_local_out(struct sk_buff *skb)
{
	int err;

	err = __ip6_local_out(skb);
	if (likely(err == 1))
		err = dst_output(skb);

	return err;
}
EXPORT_SYMBOL_GPL(ip6_local_out);

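/*
 * Final transmit step: resolve the IPv6 next hop to a neighbour
 * entry and hand the skb to the device layer.  Multicast packets
 * are additionally looped back to local listeners (and to the
 * multicast router socket) via dev_loopback_xmit() when
 * sk_mc_loop() allows it, and packets whose multicast scope is
 * node-local are never sent out on a non-loopback device.
 */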
static int ip6_finish_output2(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct net_device *dev = dst->dev;
	struct neighbour *neigh;
	struct in6_addr *nexthop;
	int ret;

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(skb->sk) &&
		    ((mroute6_socket(dev_net(dev), skb) &&
		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
		     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
					 &ipv6_hdr(skb)->saddr))) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

			/* Do not check for IFF_ALLMULTI; multicast routing
			   is not supported in any case.
			 */
			if (newskb)
				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
					newskb, NULL, newskb->dev,
					dev_loopback_xmit);

			if (ipv6_hdr(skb)->hop_limit == 0) {
				IP6_INC_STATS(dev_net(dev), idev,
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return 0;
			}
		}

		IP6_UPD_PO_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCAST,
				skb->len);

		if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
		    IPV6_ADDR_SCOPE_NODELOCAL &&
		    !(dev->flags & IFF_LOOPBACK)) {
			kfree_skb(skb);
			return 0;
		}
	}

	rcu_read_lock_bh();
	nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
	neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
	if (unlikely(!neigh))
		neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
	if (!IS_ERR(neigh)) {
		ret = dst_neigh_output(dst, neigh, skb);
		rcu_read_unlock_bh();
		return ret;
	}
	rcu_read_unlock_bh();

	IP6_INC_STATS_BH(dev_net(dst->dev),
			 ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
	kfree_skb(skb);
	return -EINVAL;
}

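/*
 * Fragment on output when the (non-GSO) packet exceeds the path
 * MTU, or unconditionally when the route demands per-packet
 * fragment headers (dst_allfrag(), as set when a path reported an
 * MTU below IPV6_MIN_MTU); otherwise transmit directly via
 * ip6_finish_output2().
 */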
static int ip6_finish_output(struct sk_buff *skb)
{
	if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
	    dst_allfrag(skb_dst(skb)))
		return ip6_fragment(skb, ip6_finish_output2);
	else
		return ip6_finish_output2(skb);
}

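/*
 * dst_output() entry point for locally generated packets.  Packets
 * are discarded early when IPv6 is administratively disabled on the
 * outgoing device (disable_ipv6); otherwise they traverse the
 * NF_INET_POST_ROUTING hook, which is skipped for packets netfilter
 * has already rerouted (IP6SKB_REROUTED).
 */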
int ip6_output(struct sk_buff *skb)
{
	struct net_device *dev = skb_dst(skb)->dev;
	struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

	if (unlikely(idev->cnf.disable_ipv6)) {
		IP6_INC_STATS(dev_net(dev), idev,
			      IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
		return 0;
	}

	return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, skb, NULL, dev,
			    ip6_finish_output,
			    !(IP6CB(skb)->flags & IP6SKB_REROUTED));
}

/*
 *	xmit an sk_buff (used by TCP, SCTP and DCCP)
 */

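/*
 * The caller provides a routed skb (skb_dst() already set) plus the
 * flow and any extension headers; ip6_xmit() prepends those headers
 * and the IPv6 header itself, then hands the packet to LOCAL_OUT.
 * A minimal caller, sketched from how the stream protocols use it
 * (the surrounding setup is illustrative, not taken from this file):
 *
 *	skb_dst_set(skb, dst);
 *	err = ip6_xmit(sk, skb, &fl6, np->opt, np->tclass);
 *
 * Oversized packets are not fragmented here: unless local_df or GSO
 * applies, the caller gets -EMSGSIZE and a local error is queued.
 */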
int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
	     struct ipv6_txoptions *opt, int tclass)
{
	struct net *net = sock_net(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct in6_addr *first_hop = &fl6->daddr;
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr;
	u8  proto = fl6->flowi6_proto;
	int seg_len = skb->len;
	int hlimit = -1;
	u32 mtu;

	if (opt) {
		unsigned int head_room;

		/* First: exthdrs may take lots of space (~8K for now);
		   MAX_HEADER is not enough.
		 */
		head_room = opt->opt_nflen + opt->opt_flen;
		seg_len += head_room;
		head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);

		if (skb_headroom(skb) < head_room) {
			struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
			if (skb2 == NULL) {
				IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return -ENOBUFS;
			}
			consume_skb(skb);
			skb = skb2;
			skb_set_owner_w(skb, sk);
		}
		if (opt->opt_flen)
			ipv6_push_frag_opts(skb, opt, &proto);
		if (opt->opt_nflen)
			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
	}

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	/*
	 *	Fill in the IPv6 header
	 */
	if (np)
		hlimit = np->hop_limit;
	if (hlimit < 0)
		hlimit = ip6_dst_hoplimit(dst);

	ip6_flow_hdr(hdr, tclass, fl6->flowlabel);

	hdr->payload_len = htons(seg_len);
	hdr->nexthdr = proto;
	hdr->hop_limit = hlimit;

	hdr->saddr = fl6->saddr;
	hdr->daddr = *first_hop;

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	mtu = dst_mtu(dst);
	if ((skb->len <= mtu) || skb->local_df || skb_is_gso(skb)) {
		IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_OUT, skb->len);
		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
			       dst->dev, dst_output);
	}

	skb->dev = dst->dev;
	ipv6_local_error(sk, EMSGSIZE, fl6, mtu);
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return -EMSGSIZE;
}
EXPORT_SYMBOL(ip6_xmit);

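/*
 * Deliver a Router Alert packet to every raw socket that registered
 * (via the IPV6_ROUTER_ALERT sockopt) for this alert value and is
 * bound to the receiving device, cloning the skb for all but the
 * last matching socket.  Returns 1 when the packet was consumed.
 */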
static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
	struct ip6_ra_chain *ra;
	struct sock *last = NULL;

	read_lock(&ip6_ra_lock);
	for (ra = ip6_ra_chain; ra; ra = ra->next) {
		struct sock *sk = ra->sk;
		if (sk && ra->sel == sel &&
		    (!sk->sk_bound_dev_if ||
		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
			if (last) {
				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
				if (skb2)
					rawv6_rcv(last, skb2);
			}
			last = sk;
		}
	}

	if (last) {
		rawv6_rcv(last, skb);
		read_unlock(&ip6_ra_lock);
		return 1;
	}
	read_unlock(&ip6_ra_lock);
	return 0;
}

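/*
 * Decide what to do with a packet whose destination is an address
 * we proxy (proxy_ndp): 1 means hand it to local input (unicast
 * neighbour discovery aimed at the proxied address), -1 means the
 * packet must be dropped (link-local destination, per MIPv6), and
 * 0 means it may be forwarded normally.
 */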
static int ip6_forward_proxy_check(struct sk_buff *skb)
{
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	u8 nexthdr = hdr->nexthdr;
	__be16 frag_off;
	int offset;

	if (ipv6_ext_hdr(nexthdr)) {
		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
		if (offset < 0)
			return 0;
	} else
		offset = sizeof(struct ipv6hdr);

	if (nexthdr == IPPROTO_ICMPV6) {
		struct icmp6hdr *icmp6;

		if (!pskb_may_pull(skb, (skb_network_header(skb) +
					 offset + 1 - skb->data)))
			return 0;

		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);

		switch (icmp6->icmp6_type) {
		case NDISC_ROUTER_SOLICITATION:
		case NDISC_ROUTER_ADVERTISEMENT:
		case NDISC_NEIGHBOUR_SOLICITATION:
		case NDISC_NEIGHBOUR_ADVERTISEMENT:
		case NDISC_REDIRECT:
			/* Unicast neighbour discovery messages destined
			 * to the proxied address are passed to the input
			 * function.
			 */
			return 1;
		default:
			break;
		}
	}

	/*
	 * The proxying router can't forward traffic sent to a link-local
	 * address, so signal the sender and discard the packet. This
	 * behavior is clarified by the MIPv6 specification.
	 */
	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
		dst_link_failure(skb);
		return -1;
	}

	return 0;
}

static inline int ip6_forward_finish(struct sk_buff *skb)
{
	return dst_output(skb);
}

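/*
 * Forwarding path.  In rough order: honour the per-namespace
 * forwarding sysctl, run XFRM policy checks, punt Router Alert
 * packets to interested raw sockets, send a time-exceeded ICMP
 * error when the hop limit runs out, handle proxy NDP, emit
 * redirects when the packet leaves on the interface it arrived on,
 * enforce the path MTU with "packet too big" errors, and finally
 * decrement hop_limit (after skb_cow()) before the
 * NF_INET_FORWARD hook.
 */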
int ip6_forward(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	struct inet6_skb_parm *opt = IP6CB(skb);
	struct net *net = dev_net(dst->dev);
	u32 mtu;

	if (net->ipv6.devconf_all->forwarding == 0)
		goto error;

	if (skb_warn_if_lro(skb))
		goto drop;

	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
		goto drop;
	}

	if (skb->pkt_type != PACKET_HOST)
		goto drop;

	skb_forward_csum(skb);

	/*
	 *	We DO NOT do any processing on RA packets; we push them
	 *	to user level AS IS, without any warranty that the
	 *	application will be able to interpret them. The reason
	 *	is that we cannot do anything clever here.
	 *
	 *	We are not an end node, so if the packet contains
	 *	AH/ESP we cannot do anything either.
	 *	Defragmentation would also be a mistake: RA packets
	 *	cannot be fragmented, because there is no warranty
	 *	that different fragments will go along one path. --ANK
	 */
	if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
		if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
			return 0;
	}

	/*
	 *	check and decrement hop limit
	 */
	if (hdr->hop_limit <= 1) {
		/* Force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
		IP6_INC_STATS_BH(net,
				 ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);

		kfree_skb(skb);
		return -ETIMEDOUT;
	}

	/* XXX: idev->cnf.proxy_ndp? */
	if (net->ipv6.devconf_all->proxy_ndp &&
	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
		int proxied = ip6_forward_proxy_check(skb);
		if (proxied > 0)
			return ip6_input(skb);
		else if (proxied < 0) {
			IP6_INC_STATS(net, ip6_dst_idev(dst),
				      IPSTATS_MIB_INDISCARDS);
			goto drop;
		}
	}

	if (!xfrm6_route_forward(skb)) {
		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
		goto drop;
	}
	dst = skb_dst(skb);

	/* The IPv6 specs say nothing about it, but it is clear that we
	   cannot send redirects for source routed frames.
	   We don't send redirects for frames decapsulated from IPsec.
	 */
	if (skb->dev == dst->dev && opt->srcrt == 0 && !skb_sec_path(skb)) {
		struct in6_addr *target = NULL;
		struct inet_peer *peer;
		struct rt6_info *rt;

		/*
		 *	incoming and outgoing devices are the same:
		 *	send a redirect.
		 */

		rt = (struct rt6_info *) dst;
		if (rt->rt6i_flags & RTF_GATEWAY)
			target = &rt->rt6i_gateway;
		else
			target = &hdr->daddr;

		peer = inet_getpeer_v6(net->ipv6.peers, &rt->rt6i_dst.addr, 1);

		/* Limit redirects both by destination (here)
		   and by source (inside ndisc_send_redirect)
		 */
		if (inet_peer_xrlim_allow(peer, 1*HZ))
			ndisc_send_redirect(skb, target);
		if (peer)
			inet_putpeer(peer);
	} else {
		int addrtype = ipv6_addr_type(&hdr->saddr);

		/* This check is security critical. */
		if (addrtype == IPV6_ADDR_ANY ||
		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
			goto error;
		if (addrtype & IPV6_ADDR_LINKLOCAL) {
			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
				    ICMPV6_NOT_NEIGHBOUR, 0);
			goto error;
		}
	}

	mtu = dst_mtu(dst);
	if (mtu < IPV6_MIN_MTU)
		mtu = IPV6_MIN_MTU;

	if ((!skb->local_df && skb->len > mtu && !skb_is_gso(skb)) ||
	    (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)) {
		/* Again, force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
		IP6_INC_STATS_BH(net,
				 ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS);
		IP6_INC_STATS_BH(net,
				 ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	if (skb_cow(skb, dst->dev->hard_header_len)) {
		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
		goto drop;
	}

	hdr = ipv6_hdr(skb);

	/* Mangling the hop limit is delayed to this point, after skb COW */

	hdr->hop_limit--;

	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
	IP6_ADD_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
		       ip6_forward_finish);

error:
	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
drop:
	kfree_skb(skb);
	return -EINVAL;
}

static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	skb_dst_drop(to);
	skb_dst_set(to, dst_clone(skb_dst(from)));
	to->dev = from->dev;
	to->mark = from->mark;

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
	nf_copy(to, from);
#if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE)
	to->nf_trace = from->nf_trace;
#endif
	skb_copy_secmark(to, from);
}

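/*
 * Two fragmentation strategies, mirroring ip_fragment() on the v4
 * side: a fast path that turns an existing, well-formed frag list
 * into fragments by prepending headers in place, and a slow path
 * that allocates a fresh skb per fragment and copies the payload
 * into it.  Every fragment of one datagram shares the
 * identification value chosen by ipv6_select_ident().
 */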
int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
{
	struct sk_buff *frag;
	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
	struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
	struct ipv6hdr *tmp_hdr;
	struct frag_hdr *fh;
	unsigned int mtu, hlen, left, len;
	int hroom, troom;
	__be32 frag_id = 0;
	int ptr, offset = 0, err = 0;
	u8 *prevhdr, nexthdr = 0;
	struct net *net = dev_net(skb_dst(skb)->dev);

	hlen = ip6_find_1stfragopt(skb, &prevhdr);
	nexthdr = *prevhdr;

	mtu = ip6_skb_dst_mtu(skb);

	/* We must not fragment if the socket is set to force MTU discovery
	 * or if the skb is not generated by a local socket.
	 */
	if (unlikely(!skb->local_df && skb->len > mtu) ||
		     (IP6CB(skb)->frag_max_size &&
		      IP6CB(skb)->frag_max_size > mtu)) {
		if (skb->sk && dst_allfrag(skb_dst(skb)))
			sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);

		skb->dev = skb_dst(skb)->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	if (np && np->frag_size < mtu) {
		if (np->frag_size)
			mtu = np->frag_size;
	}
	mtu -= hlen + sizeof(struct frag_hdr);

	if (skb_has_frag_list(skb)) {
		int first_len = skb_pagelen(skb);
		struct sk_buff *frag2;

		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    skb_cloned(skb))
			goto slow_path;

		skb_walk_frags(skb, frag) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < hlen)
				goto slow_path_clean;

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path_clean;

			BUG_ON(frag->sk);
			if (skb->sk) {
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
			}
			skb->truesize -= frag->truesize;
		}

		err = 0;
		offset = 0;
		frag = skb_shinfo(skb)->frag_list;
		skb_frag_list_init(skb);
		/* BUILD HEADER */

		*prevhdr = NEXTHDR_FRAGMENT;
		tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
		if (!tmp_hdr) {
			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_FRAGFAILS);
			return -ENOMEM;
		}

		__skb_pull(skb, hlen);
		fh = (struct frag_hdr *)__skb_push(skb, sizeof(struct frag_hdr));
		__skb_push(skb, hlen);
		skb_reset_network_header(skb);
		memcpy(skb_network_header(skb), tmp_hdr, hlen);

		ipv6_select_ident(fh, rt);
		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		fh->frag_off = htons(IP6_MF);
		frag_id = fh->identification;

		first_len = skb_pagelen(skb);
		skb->data_len = first_len - skb_headlen(skb);
		skb->len = first_len;
		ipv6_hdr(skb)->payload_len = htons(first_len -
						   sizeof(struct ipv6hdr));

		dst_hold(&rt->dst);

		for (;;) {
			/* Prepare header of the next frame,
			 * before previous one went down. */
			if (frag) {
				frag->ip_summed = CHECKSUM_NONE;
				skb_reset_transport_header(frag);
				fh = (struct frag_hdr *)__skb_push(frag, sizeof(struct frag_hdr));
				__skb_push(frag, hlen);
				skb_reset_network_header(frag);
				memcpy(skb_network_header(frag), tmp_hdr,
				       hlen);
				offset += skb->len - hlen - sizeof(struct frag_hdr);
				fh->nexthdr = nexthdr;
				fh->reserved = 0;
				fh->frag_off = htons(offset);
				if (frag->next != NULL)
					fh->frag_off |= htons(IP6_MF);
				fh->identification = frag_id;
				ipv6_hdr(frag)->payload_len =
						htons(frag->len -
						      sizeof(struct ipv6hdr));
				ip6_copy_metadata(frag, skb);
			}

			err = output(skb);
			if (!err)
				IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
					      IPSTATS_MIB_FRAGCREATES);

			if (err || !frag)
				break;

			skb = frag;
			frag = skb->next;
			skb->next = NULL;
		}

		kfree(tmp_hdr);

		if (err == 0) {
			IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
				      IPSTATS_MIB_FRAGOKS);
			ip6_rt_put(rt);
			return 0;
		}

		while (frag) {
			skb = frag->next;
			kfree_skb(frag);
			frag = skb;
		}

		IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
			      IPSTATS_MIB_FRAGFAILS);
		ip6_rt_put(rt);
		return err;

slow_path_clean:
		skb_walk_frags(skb, frag2) {
			if (frag2 == frag)
				break;
			frag2->sk = NULL;
			frag2->destructor = NULL;
			skb->truesize += frag2->truesize;
		}
	}

slow_path:
	if ((skb->ip_summed == CHECKSUM_PARTIAL) &&
	    skb_checksum_help(skb))
		goto fail;

	left = skb->len - hlen;		/* Space per frame */
	ptr = hlen;			/* Where to start from */

	/*
	 *	Fragment the datagram.
	 */

	*prevhdr = NEXTHDR_FRAGMENT;
	hroom = LL_RESERVED_SPACE(rt->dst.dev);
	troom = rt->dst.dev->needed_tailroom;

	/*
	 *	Keep copying data until we run out.
	 */
	while (left > 0) {
		len = left;
		/* IF: it doesn't fit, use 'mtu' - the data space left */
		if (len > mtu)
			len = mtu;
		/* IF: we are not sending up to and including the packet end
		   then align the next start on an eight byte boundary */
		if (len < left) {
			len &= ~7;
		}
		/*
		 *	Allocate buffer.
		 */

		if ((frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
				      hroom + troom, GFP_ATOMIC)) == NULL) {
			NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_FRAGFAILS);
			err = -ENOMEM;
			goto fail;
		}

		/*
		 *	Set up data on packet
		 */

		ip6_copy_metadata(frag, skb);
		skb_reserve(frag, hroom);
		skb_put(frag, len + hlen + sizeof(struct frag_hdr));
		skb_reset_network_header(frag);
		fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
		frag->transport_header = (frag->network_header + hlen +
					  sizeof(struct frag_hdr));

		/*
		 *	Charge the memory for the fragment to any owner
		 *	it might possess
		 */
		if (skb->sk)
			skb_set_owner_w(frag, skb->sk);

		/*
		 *	Copy the packet header into the new buffer.
		 */
		skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);

		/*
		 *	Build fragment header.
		 */
		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		if (!frag_id) {
			ipv6_select_ident(fh, rt);
			frag_id = fh->identification;
		} else
			fh->identification = frag_id;

		/*
		 *	Copy a block of the IP datagram.
		 */
		if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len))
			BUG();
		left -= len;

		fh->frag_off = htons(offset);
		if (left > 0)
			fh->frag_off |= htons(IP6_MF);
		ipv6_hdr(frag)->payload_len = htons(frag->len -
						    sizeof(struct ipv6hdr));

		ptr += len;
		offset += len;

		/*
		 *	Put this fragment into the sending queue.
		 */
		err = output(frag);
		if (err)
			goto fail;

		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_FRAGCREATES);
	}
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGOKS);
	consume_skb(skb);
	return err;

fail:
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return err;
}

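/*
 * A cached socket route is only reusable when it still matches the
 * flow.  ip6_rt_check() returns true ("stale") when a host route's
 * destination no longer equals the flow address and the saved
 * address cache doesn't vouch for it either.
 */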
static inline int ip6_rt_check(const struct rt6key *rt_key,
			       const struct in6_addr *fl_addr,
			       const struct in6_addr *addr_cache)
{
	return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
		(addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache));
}

static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
					  struct dst_entry *dst,
					  const struct flowi6 *fl6)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct rt6_info *rt;

	if (!dst)
		goto out;

	if (dst->ops->family != AF_INET6) {
		dst_release(dst);
		return NULL;
	}

	rt = (struct rt6_info *)dst;
	/* Yes, checking route validity in the unconnected
	 * case is not very simple. Take into account
	 * that we do not support routing by source, TOS,
	 * and MSG_DONTROUTE		--ANK (980726)
	 *
	 * 1. ip6_rt_check(): If route was host route,
	 *    check that cached destination is current.
	 *    If it is network route, we still may
	 *    check its validity using saved pointer
	 *    to the last used address: daddr_cache.
	 *    We do not want to save whole address now,
	 *    (because main consumer of this service
	 *    is tcp, which does not have this problem),
	 *    so that the last trick works only on connected
	 *    sockets.
	 * 2. oif also should be the same.
	 */
	if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
#ifdef CONFIG_IPV6_SUBTREES
	    ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
#endif
	    (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) {
		dst_release(dst);
		dst = NULL;
	}

out:
	return dst;
}

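/*
 * Common tail for the dst lookup helpers below: resolve the route
 * if the caller didn't supply one, pick a source address when the
 * flow left it unspecified, and (with CONFIG_IPV6_OPTIMISTIC_DAD)
 * re-route via the default router when the chosen source address
 * is still optimistic and the next hop's neighbour entry is not
 * yet valid.
 */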
static int ip6_dst_lookup_tail(struct sock *sk,
			       struct dst_entry **dst, struct flowi6 *fl6)
{
	struct net *net = sock_net(sk);
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	struct neighbour *n;
	struct rt6_info *rt;
#endif
	int err;

	if (*dst == NULL)
		*dst = ip6_route_output(net, sk, fl6);

	if ((err = (*dst)->error))
		goto out_err_release;

	if (ipv6_addr_any(&fl6->saddr)) {
		struct rt6_info *rt = (struct rt6_info *) *dst;
		err = ip6_route_get_saddr(net, rt, &fl6->daddr,
					  sk ? inet6_sk(sk)->srcprefs : 0,
					  &fl6->saddr);
		if (err)
			goto out_err_release;
	}

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	/*
	 * Here if the dst entry we've looked up
	 * has a neighbour entry that is in the INCOMPLETE
	 * state and the src address from the flow is
	 * marked as OPTIMISTIC, we release the found
	 * dst entry and replace it instead with the
	 * dst entry of the nexthop router
	 */
	rt = (struct rt6_info *) *dst;
	rcu_read_lock_bh();
	n = __ipv6_neigh_lookup_noref(rt->dst.dev, rt6_nexthop(rt, &fl6->daddr));
	err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
	rcu_read_unlock_bh();

	if (err) {
		struct inet6_ifaddr *ifp;
		struct flowi6 fl_gw6;
		int redirect;

		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
				      (*dst)->dev, 1);

		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
		if (ifp)
			in6_ifa_put(ifp);

		if (redirect) {
			/*
			 * We need to get the dst entry for the
			 * default router instead
			 */
			dst_release(*dst);
			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
			*dst = ip6_route_output(net, sk, &fl_gw6);
			if ((err = (*dst)->error))
				goto out_err_release;
		}
	}
#endif

	return 0;

out_err_release:
	if (err == -ENETUNREACH)
		IP6_INC_STATS_BH(net, NULL, IPSTATS_MIB_OUTNOROUTES);
	dst_release(*dst);
	*dst = NULL;
	return err;
}

/**
 *	ip6_dst_lookup - perform route lookup on flow
 *	@sk: socket which provides route info
 *	@dst: pointer to dst_entry * for result
 *	@fl6: flow to lookup
 *
 *	This function performs a route lookup on the given flow.
 *
 *	It returns zero on success, or a standard errno code on error.
 */
int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi6 *fl6)
{
	*dst = NULL;
	return ip6_dst_lookup_tail(sk, dst, fl6);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup);

/**
 *	ip6_dst_lookup_flow - perform route lookup on flow with ipsec
 *	@sk: socket which provides route info
 *	@fl6: flow to lookup
 *	@final_dst: final destination address for ipsec lookup
 *	@can_sleep: we are in a sleepable context
 *
 *	This function performs a route lookup on the given flow.
 *
 *	It returns a valid dst pointer on success, or a pointer-encoded
 *	error code.
 */
struct dst_entry *ip6_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
				      const struct in6_addr *final_dst,
				      bool can_sleep)
{
	struct dst_entry *dst = NULL;
	int err;

	err = ip6_dst_lookup_tail(sk, &dst, fl6);
	if (err)
		return ERR_PTR(err);
	if (final_dst)
		fl6->daddr = *final_dst;
	if (can_sleep)
		fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;

	return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);

/**
 *	ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
 *	@sk: socket which provides the dst cache and route info
 *	@fl6: flow to lookup
 *	@final_dst: final destination address for ipsec lookup
 *	@can_sleep: we are in a sleepable context
 *
 *	This function performs a route lookup on the given flow with the
 *	possibility of using the cached route in the socket if it is valid.
 *	It will take the socket dst lock when operating on the dst cache.
 *	As a result, this function can only be used in process context.
 *
 *	It returns a valid dst pointer on success, or a pointer-encoded
 *	error code.
 */
struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
					 const struct in6_addr *final_dst,
					 bool can_sleep)
{
	struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
	int err;

	dst = ip6_sk_dst_check(sk, dst, fl6);

	err = ip6_dst_lookup_tail(sk, &dst, fl6);
	if (err)
		return ERR_PTR(err);
	if (final_dst)
		fl6->daddr = *final_dst;
	if (can_sleep)
		fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;

	return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
}
EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);

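/*
 * UFO variant of the append path: instead of building one skb per
 * fragment, queue a single oversized skb carrying the whole UDP
 * datagram and let the device (or the software GSO layer) segment
 * it.  gso_size is the per-fragment payload size, rounded down to
 * the 8-byte granularity required by the fragment header.
 */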
static inline int ip6_ufo_append_data(struct sock *sk,
			int getfrag(void *from, char *to, int offset, int len,
			int odd, struct sk_buff *skb),
			void *from, int length, int hh_len, int fragheaderlen,
			int transhdrlen, int mtu, unsigned int flags,
			struct rt6_info *rt)

{
	struct sk_buff *skb;
	int err;

	/* There is support for UDP large send offload by network
	 * device, so create one single skb packet containing the complete
	 * udp datagram
	 */
	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
		skb = sock_alloc_send_skb(sk,
			hh_len + fragheaderlen + transhdrlen + 20,
			(flags & MSG_DONTWAIT), &err);
		if (skb == NULL)
			return err;

		/* reserve space for Hardware header */
		skb_reserve(skb, hh_len);

		/* create space for UDP/IP header */
		skb_put(skb, fragheaderlen + transhdrlen);

		/* initialize network header pointer */
		skb_reset_network_header(skb);

		/* initialize protocol header pointer */
		skb->transport_header = skb->network_header + fragheaderlen;

		skb->ip_summed = CHECKSUM_PARTIAL;
		skb->csum = 0;
	}

	err = skb_append_datato_frags(sk, skb, getfrag, from,
				      (length - transhdrlen));
	if (!err) {
		struct frag_hdr fhdr;

		/* Specify the length of each IPv6 datagram fragment.
		 * It has to be a multiple of 8.
		 */
		skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
					     sizeof(struct frag_hdr)) & ~7;
		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
		ipv6_select_ident(&fhdr, rt);
		skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
		__skb_queue_tail(&sk->sk_write_queue, skb);

		return 0;
	}
	/* There is not enough support to do UDP LSO,
	 * so follow the normal path
	 */
	kfree_skb(skb);

	return err;
}

static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
					       gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
						gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

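/*
 * Refresh the working MTU while appending: the first fragment must
 * reserve the route's header_len, while later fragments may use the
 * full device or path MTU, depending on whether the socket probes
 * PMTU itself.  Not applied to DST_XFRM_TUNNEL routes, which are
 * handled once at cork setup in ip6_append_data().
 */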
static void ip6_append_data_mtu(unsigned int *mtu,
				int *maxfraglen,
				unsigned int fragheaderlen,
				struct sk_buff *skb,
				struct rt6_info *rt,
				bool pmtuprobe)
{
	if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
		if (skb == NULL) {
			/* first fragment, reserve header_len */
			*mtu = *mtu - rt->dst.header_len;

		} else {
			/*
			 * this fragment is not first, the headers
			 * space is regarded as data space.
			 */
			*mtu = min(*mtu, pmtuprobe ?
				   rt->dst.dev->mtu :
				   dst_mtu(rt->dst.path));
		}
		*maxfraglen = ((*mtu - fragheaderlen) & ~7)
			      + fragheaderlen - sizeof(struct frag_hdr);
	}
}

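/*
 * Datagram transmit, part one: append user data to the socket's
 * write queue, splitting it into MTU-sized skbs (or one UFO skb)
 * and corking options, flow and route on the first call.  Nothing
 * hits the wire until ip6_push_pending_frames() is called; callers
 * therefore follow a pattern roughly like this (illustrative
 * sketch of how the UDP/raw senders use it, not code from this
 * file):
 *
 *	err = ip6_append_data(sk, getfrag, msg->msg_iov, ulen,
 *			      sizeof(struct udphdr), hlimit, tclass,
 *			      opt, &fl6, rt, msg->msg_flags, dontfrag);
 *	if (err)
 *		ip6_flush_pending_frames(sk);
 *	else if (!corkreq)
 *		err = ip6_push_pending_frames(sk);
 */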
int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
	int offset, int len, int odd, struct sk_buff *skb),
	void *from, int length, int transhdrlen,
	int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi6 *fl6,
	struct rt6_info *rt, unsigned int flags, int dontfrag)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct inet_cork *cork;
	struct sk_buff *skb, *skb_prev = NULL;
	unsigned int maxfraglen, fragheaderlen, mtu;
	int exthdrlen;
	int dst_exthdrlen;
	int hh_len;
	int copy;
	int err;
	int offset = 0;
	__u8 tx_flags = 0;

	if (flags & MSG_PROBE)
		return 0;
	cork = &inet->cork.base;
	if (skb_queue_empty(&sk->sk_write_queue)) {
		/*
		 * setup for corking
		 */
		if (opt) {
			if (WARN_ON(np->cork.opt))
				return -EINVAL;

			np->cork.opt = kzalloc(opt->tot_len, sk->sk_allocation);
			if (unlikely(np->cork.opt == NULL))
				return -ENOBUFS;

			np->cork.opt->tot_len = opt->tot_len;
			np->cork.opt->opt_flen = opt->opt_flen;
			np->cork.opt->opt_nflen = opt->opt_nflen;

			np->cork.opt->dst0opt = ip6_opt_dup(opt->dst0opt,
							    sk->sk_allocation);
			if (opt->dst0opt && !np->cork.opt->dst0opt)
				return -ENOBUFS;

			np->cork.opt->dst1opt = ip6_opt_dup(opt->dst1opt,
							    sk->sk_allocation);
			if (opt->dst1opt && !np->cork.opt->dst1opt)
				return -ENOBUFS;

			np->cork.opt->hopopt = ip6_opt_dup(opt->hopopt,
							   sk->sk_allocation);
			if (opt->hopopt && !np->cork.opt->hopopt)
				return -ENOBUFS;

			np->cork.opt->srcrt = ip6_rthdr_dup(opt->srcrt,
							    sk->sk_allocation);
			if (opt->srcrt && !np->cork.opt->srcrt)
				return -ENOBUFS;

			/* need source address above miyazawa */
		}
		dst_hold(&rt->dst);
		cork->dst = &rt->dst;
		inet->cork.fl.u.ip6 = *fl6;
		np->cork.hop_limit = hlimit;
		np->cork.tclass = tclass;
		if (rt->dst.flags & DST_XFRM_TUNNEL)
			mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
			      rt->dst.dev->mtu : dst_mtu(&rt->dst);
		else
			mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
			      rt->dst.dev->mtu : dst_mtu(rt->dst.path);
		if (np->frag_size < mtu) {
			if (np->frag_size)
				mtu = np->frag_size;
		}
		cork->fragsize = mtu;
		if (dst_allfrag(rt->dst.path))
			cork->flags |= IPCORK_ALLFRAG;
		cork->length = 0;
		exthdrlen = (opt ? opt->opt_flen : 0);
		length += exthdrlen;
		transhdrlen += exthdrlen;
		dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
	} else {
		rt = (struct rt6_info *)cork->dst;
		fl6 = &inet->cork.fl.u.ip6;
		opt = np->cork.opt;
		transhdrlen = 0;
		exthdrlen = 0;
		dst_exthdrlen = 0;
		mtu = cork->fragsize;
	}

	hh_len = LL_RESERVED_SPACE(rt->dst.dev);

	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
			(opt ? opt->opt_nflen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);

	if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
		if (cork->length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
			ipv6_local_error(sk, EMSGSIZE, fl6, mtu - exthdrlen);
			return -EMSGSIZE;
		}
	}

	/* For UDP, check if TX timestamp is enabled */
	if (sk->sk_type == SOCK_DGRAM)
		sock_tx_timestamp(sk, &tx_flags);

	/*
	 * Let's try using as much space as possible.
	 * Use MTU if total length of the message fits into the MTU.
	 * Otherwise, we need to reserve fragment header and
	 * fragment alignment (= 8-15 octets, in total).
	 *
	 * Note that we may need to "move" the data from the tail
	 * of the buffer to the new fragment when we split
	 * the message.
	 *
	 * FIXME: It may be fragmented into multiple chunks
	 *        at once if non-fragmentable extension headers
	 *        are too large.
	 * --yoshfuji
	 */

	cork->length += length;
	if (length > mtu) {
		int proto = sk->sk_protocol;
		if (dontfrag && (proto == IPPROTO_UDP || proto == IPPROTO_RAW)) {
			ipv6_local_rxpmtu(sk, fl6, mtu - exthdrlen);
			return -EMSGSIZE;
		}

		if (proto == IPPROTO_UDP &&
		    (rt->dst.dev->features & NETIF_F_UFO)) {

			err = ip6_ufo_append_data(sk, getfrag, from, length,
						  hh_len, fragheaderlen,
						  transhdrlen, mtu, flags, rt);
			if (err)
				goto error;
			return 0;
		}
	}

	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into current packet. */
		copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;

		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen;
alloc_new_skb:
			/* There's no room in the current skb */
			if (skb)
				fraggap = skb->len - maxfraglen;
			else
				fraggap = 0;
			/* update mtu and maxfraglen if necessary */
			if (skb == NULL || skb_prev == NULL)
				ip6_append_data_mtu(&mtu, &maxfraglen,
						    fragheaderlen, skb, rt,
						    np->pmtudisc ==
						    IPV6_PMTUDISC_PROBE);

			skb_prev = skb;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;

			if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
				datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
			if ((flags & MSG_MORE) &&
			    !(rt->dst.dev->features & NETIF_F_SG))
				alloclen = mtu;
			else
				alloclen = datalen + fragheaderlen;

			alloclen += dst_exthdrlen;

			if (datalen != length + fraggap) {
				/*
				 * this is not the last fragment, the trailer
				 * space is regarded as data space.
				 */
				datalen += rt->dst.trailer_len;
			}

			alloclen += rt->dst.trailer_len;
			fraglen = datalen + fragheaderlen;

			/*
			 * We just reserve space for fragment header.
			 * Note: this may be overallocation if the message
			 * (without MSG_MORE) fits into the MTU.
			 */
			alloclen += sizeof(struct frag_hdr);

			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk,
						alloclen + hh_len,
						(flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				if (atomic_read(&sk->sk_wmem_alloc) <=
				    2 * sk->sk_sndbuf)
					skb = sock_wmalloc(sk,
							   alloclen + hh_len, 1,
							   sk->sk_allocation);
				if (unlikely(skb == NULL))
					err = -ENOBUFS;
				else {
					/* Only the initial fragment
					 * is time stamped.
					 */
					tx_flags = 0;
				}
			}
			if (skb == NULL)
				goto error;
			/*
			 *	Fill in the control structures
			 */
			skb->ip_summed = CHECKSUM_NONE;
			skb->csum = 0;
			/* reserve for fragmentation and ipsec header */
			skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
				    dst_exthdrlen);

			if (sk->sk_type == SOCK_DGRAM)
				skb_shinfo(skb)->tx_flags = tx_flags;

			/*
			 *	Find where to start putting bytes
			 */
			data = skb_put(skb, fraglen);
			skb_set_network_header(skb, exthdrlen);
			data += fragheaderlen;
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}
			copy = datalen - transhdrlen - fraggap;

			if (copy < 0) {
				err = -EINVAL;
				kfree_skb(skb);
				goto error;
			} else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= datalen - fraggap;
			transhdrlen = 0;
			exthdrlen = 0;
			dst_exthdrlen = 0;

			/*
			 * Put the packet on the pending queue
			 */
			__skb_queue_tail(&sk->sk_write_queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		if (!(rt->dst.dev->features & NETIF_F_SG)) {
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
						offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else {
			int i = skb_shinfo(skb)->nr_frags;
			struct page_frag *pfrag = sk_page_frag(sk);

			err = -ENOMEM;
			if (!sk_page_frag_refill(sk, pfrag))
				goto error;

			if (!skb_can_coalesce(skb, i, pfrag->page,
					      pfrag->offset)) {
				err = -EMSGSIZE;
				if (i == MAX_SKB_FRAGS)
					goto error;

				__skb_fill_page_desc(skb, i, pfrag->page,
						     pfrag->offset, 0);
				skb_shinfo(skb)->nr_frags = ++i;
				get_page(pfrag->page);
			}
			copy = min_t(int, copy, pfrag->size - pfrag->offset);
			if (getfrag(from,
				    page_address(pfrag->page) + pfrag->offset,
				    offset, copy, skb->len, skb) < 0)
				goto error_efault;

			pfrag->offset += copy;
			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			atomic_add(copy, &sk->sk_wmem_alloc);
		}
		offset += copy;
		length -= copy;
	}

	return 0;

error_efault:
	err = -EFAULT;
error:
	cork->length -= length;
	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	return err;
}
EXPORT_SYMBOL_GPL(ip6_append_data);

static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np)
{
	if (np->cork.opt) {
		kfree(np->cork.opt->dst0opt);
		kfree(np->cork.opt->dst1opt);
		kfree(np->cork.opt->hopopt);
		kfree(np->cork.opt->srcrt);
		kfree(np->cork.opt);
		np->cork.opt = NULL;
	}

	if (inet->cork.base.dst) {
		dst_release(inet->cork.base.dst);
		inet->cork.base.dst = NULL;
		inet->cork.base.flags &= ~IPCORK_ALLFRAG;
	}
	memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
}

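/*
 * Datagram transmit, part two: collapse the queued skbs into one
 * packet (trailing skbs become the frag_list), prepend the corked
 * extension headers and the IPv6 header, and send the result
 * through ip6_local_out().  ip6_fragment() later re-splits along
 * the frag_list boundaries that ip6_append_data() prepared.
 */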
int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct net *net = sock_net(sk);
	struct ipv6hdr *hdr;
	struct ipv6_txoptions *opt = np->cork.opt;
	struct rt6_info *rt = (struct rt6_info *)inet->cork.base.dst;
	struct flowi6 *fl6 = &inet->cork.fl.u.ip6;
	unsigned char proto = fl6->flowi6_proto;
	int err = 0;

	if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Allow local fragmentation. */
	if (np->pmtudisc < IPV6_PMTUDISC_DO)
		skb->local_df = 1;

	*final_dst = fl6->daddr;
	__skb_pull(skb, skb_network_header_len(skb));
	if (opt && opt->opt_flen)
		ipv6_push_frag_opts(skb, opt, &proto);
	if (opt && opt->opt_nflen)
		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	ip6_flow_hdr(hdr, np->cork.tclass, fl6->flowlabel);
	hdr->hop_limit = np->cork.hop_limit;
	hdr->nexthdr = proto;
	hdr->saddr = fl6->saddr;
	hdr->daddr = *final_dst;

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	skb_dst_set(skb, dst_clone(&rt->dst));
	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
	if (proto == IPPROTO_ICMPV6) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		ICMP6MSGOUT_INC_STATS_BH(net, idev, icmp6_hdr(skb)->icmp6_type);
		ICMP6_INC_STATS_BH(net, idev, ICMP6_MIB_OUTMSGS);
	}

	err = ip6_local_out(skb);
	if (err) {
		if (err > 0)
			err = net_xmit_errno(err);
		if (err)
			goto error;
	}

out:
	ip6_cork_release(inet, np);
	return err;
error:
	IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	goto out;
}
EXPORT_SYMBOL_GPL(ip6_push_pending_frames);

void ip6_flush_pending_frames(struct sock *sk)
{
	struct sk_buff *skb;

	while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
		if (skb_dst(skb))
			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
	}

	ip6_cork_release(inet_sk(sk), inet6_sk(sk));
}
EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);