xref: /openbmc/linux/net/ipv6/ip6_output.c (revision 9d749629)
/*
 *	IPv6 output functions
 *	Linux INET6 implementation
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	Based on linux/net/ipv4/ip_output.c
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 *
 *	Changes:
 *	A.N.Kuznetsov	:	arithmetic in fragmentation.
 *				extension headers are implemented.
 *				route changes now work.
 *				ip6_forward does not confuse sniffers.
 *				etc.
 *
 *	H. von Brand	:	Added missing #include <linux/string.h>
 *	Imran Patel	:	frag id should be in NBO
 *	Kazunori MIYAZAWA @USAGI
 *			:	add ip6_append_data and related functions
 *				for datagram xmit
 */

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/in6.h>
#include <linux/tcp.h>
#include <linux/route.h>
#include <linux/module.h>
#include <linux/slab.h>

#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>

#include <net/sock.h>
#include <net/snmp.h>

#include <net/ipv6.h>
#include <net/ndisc.h>
#include <net/protocol.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
#include <net/rawv6.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/checksum.h>
#include <linux/mroute6.h>

int __ip6_local_out(struct sk_buff *skb)
{
	int len;

	len = skb->len - sizeof(struct ipv6hdr);
	if (len > IPV6_MAXPLEN)
		len = 0;
	ipv6_hdr(skb)->payload_len = htons(len);

	return nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
		       skb_dst(skb)->dev, dst_output);
}

int ip6_local_out(struct sk_buff *skb)
{
	int err;

	err = __ip6_local_out(skb);
	if (likely(err == 1))
		err = dst_output(skb);

	return err;
}
EXPORT_SYMBOL_GPL(ip6_local_out);
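
/* Example: an illustrative (hypothetical) caller of ip6_local_out().  The
 * packet must already carry a full IPv6 header and have a dst attached;
 * ip6_local_out() then fills in payload_len, runs the NF_INET_LOCAL_OUT
 * hook and hands the packet to dst_output().  A sketch of the calling
 * convention, assuming the caller built the header itself:
 *
 *	static int example_send(struct sk_buff *skb, struct dst_entry *dst)
 *	{
 *		skb_dst_set(skb, dst);
 *		skb_push(skb, sizeof(struct ipv6hdr));
 *		skb_reset_network_header(skb);
 *		... fill in all ipv6hdr fields except payload_len ...
 *		return ip6_local_out(skb);
 *	}
 */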

static int ip6_finish_output2(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct net_device *dev = dst->dev;
	struct neighbour *neigh;
	struct in6_addr *nexthop;
	int ret;

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(skb->sk) &&
		    ((mroute6_socket(dev_net(dev), skb) &&
		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
		     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
					 &ipv6_hdr(skb)->saddr))) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

			/* Do not check for IFF_ALLMULTI; multicast routing
			   is not supported in any case.
			 */
			if (newskb)
				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
					newskb, NULL, newskb->dev,
					dev_loopback_xmit);

			if (ipv6_hdr(skb)->hop_limit == 0) {
				IP6_INC_STATS(dev_net(dev), idev,
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return 0;
			}
		}

		IP6_UPD_PO_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCAST,
				skb->len);

		if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
		    IPV6_ADDR_SCOPE_NODELOCAL &&
		    !(dev->flags & IFF_LOOPBACK)) {
			kfree_skb(skb);
			return 0;
		}
	}

	rcu_read_lock_bh();
	nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
	neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
	if (unlikely(!neigh))
		neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
	if (!IS_ERR(neigh)) {
		ret = dst_neigh_output(dst, neigh, skb);
		rcu_read_unlock_bh();
		return ret;
	}
	rcu_read_unlock_bh();

	IP6_INC_STATS_BH(dev_net(dst->dev),
			 ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
	kfree_skb(skb);
	return -EINVAL;
}

static int ip6_finish_output(struct sk_buff *skb)
{
	if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
	    dst_allfrag(skb_dst(skb)))
		return ip6_fragment(skb, ip6_finish_output2);
	else
		return ip6_finish_output2(skb);
}
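
/* A minimal sketch of the test ip6_finish_output() applies (the helper
 * name below is hypothetical): a packet goes through ip6_fragment() when
 * it exceeds the path MTU and is not a GSO packet that the device will
 * segment itself, or when the route demands a fragment header on every
 * packet (dst_allfrag(), typically set after a reported path MTU below
 * IPV6_MIN_MTU):
 *
 *	static bool example_needs_frag(struct sk_buff *skb)
 *	{
 *		return (skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
 *		       dst_allfrag(skb_dst(skb));
 *	}
 */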

int ip6_output(struct sk_buff *skb)
{
	struct net_device *dev = skb_dst(skb)->dev;
	struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

	if (unlikely(idev->cnf.disable_ipv6)) {
		IP6_INC_STATS(dev_net(dev), idev,
			      IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
		return 0;
	}

	return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, skb, NULL, dev,
			    ip6_finish_output,
			    !(IP6CB(skb)->flags & IP6SKB_REROUTED));
}

/*
 *	xmit an sk_buff (used by TCP, SCTP and DCCP)
 */

int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
	     struct ipv6_txoptions *opt, int tclass)
{
	struct net *net = sock_net(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct in6_addr *first_hop = &fl6->daddr;
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr;
	u8  proto = fl6->flowi6_proto;
	int seg_len = skb->len;
	int hlimit = -1;
	u32 mtu;

	if (opt) {
		unsigned int head_room;

		/* First: exthdrs may take lots of space (~8K for now);
		   MAX_HEADER is not enough.
		 */
		head_room = opt->opt_nflen + opt->opt_flen;
		seg_len += head_room;
		head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);

		if (skb_headroom(skb) < head_room) {
			struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
			if (skb2 == NULL) {
				IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return -ENOBUFS;
			}
			consume_skb(skb);
			skb = skb2;
			skb_set_owner_w(skb, sk);
		}
		if (opt->opt_flen)
			ipv6_push_frag_opts(skb, opt, &proto);
		if (opt->opt_nflen)
			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
	}

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	/*
	 *	Fill in the IPv6 header
	 */
	if (np)
		hlimit = np->hop_limit;
	if (hlimit < 0)
		hlimit = ip6_dst_hoplimit(dst);

	ip6_flow_hdr(hdr, tclass, fl6->flowlabel);

	hdr->payload_len = htons(seg_len);
	hdr->nexthdr = proto;
	hdr->hop_limit = hlimit;

	hdr->saddr = fl6->saddr;
	hdr->daddr = *first_hop;

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	mtu = dst_mtu(dst);
	if ((skb->len <= mtu) || skb->local_df || skb_is_gso(skb)) {
		IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_OUT, skb->len);
		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
			       dst->dev, dst_output);
	}

	skb->dev = dst->dev;
	ipv6_local_error(sk, EMSGSIZE, fl6, mtu);
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return -EMSGSIZE;
}
EXPORT_SYMBOL(ip6_xmit);
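
/* Usage sketch (hypothetical helper): connection-oriented callers route
 * first, attach the dst to the skb, and then pass the same flow to
 * ip6_xmit(), which prepends the IPv6 header and runs LOCAL_OUT:
 *
 *	static int example_queue_xmit(struct sock *sk, struct sk_buff *skb,
 *				      struct flowi6 *fl6)
 *	{
 *		struct dst_entry *dst;
 *
 *		dst = ip6_dst_lookup_flow(sk, fl6, NULL, false);
 *		if (IS_ERR(dst))
 *			return PTR_ERR(dst);
 *		skb_dst_set(skb, dst);
 *		return ip6_xmit(sk, skb, fl6, NULL, 0);
 *	}
 */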

static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
	struct ip6_ra_chain *ra;
	struct sock *last = NULL;

	read_lock(&ip6_ra_lock);
	for (ra = ip6_ra_chain; ra; ra = ra->next) {
		struct sock *sk = ra->sk;
		if (sk && ra->sel == sel &&
		    (!sk->sk_bound_dev_if ||
		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
			if (last) {
				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
				if (skb2)
					rawv6_rcv(last, skb2);
			}
			last = sk;
		}
	}

	if (last) {
		rawv6_rcv(last, skb);
		read_unlock(&ip6_ra_lock);
		return 1;
	}
	read_unlock(&ip6_ra_lock);
	return 0;
}

static int ip6_forward_proxy_check(struct sk_buff *skb)
{
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	u8 nexthdr = hdr->nexthdr;
	__be16 frag_off;
	int offset;

	if (ipv6_ext_hdr(nexthdr)) {
		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
		if (offset < 0)
			return 0;
	} else
		offset = sizeof(struct ipv6hdr);

	if (nexthdr == IPPROTO_ICMPV6) {
		struct icmp6hdr *icmp6;

		if (!pskb_may_pull(skb, (skb_network_header(skb) +
					 offset + 1 - skb->data)))
			return 0;

		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);

		switch (icmp6->icmp6_type) {
		case NDISC_ROUTER_SOLICITATION:
		case NDISC_ROUTER_ADVERTISEMENT:
		case NDISC_NEIGHBOUR_SOLICITATION:
		case NDISC_NEIGHBOUR_ADVERTISEMENT:
		case NDISC_REDIRECT:
			/* A unicast neighbour discovery message destined
			 * to the proxied address must be passed to the
			 * input function.
			 */
			return 1;
		default:
			break;
		}
	}

	/*
	 * The proxying router can't forward traffic sent to a link-local
	 * address, so signal the sender and discard the packet. This
	 * behavior is clarified by the MIPv6 specification.
	 */
	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
		dst_link_failure(skb);
		return -1;
	}

	return 0;
}

static inline int ip6_forward_finish(struct sk_buff *skb)
{
	return dst_output(skb);
}

int ip6_forward(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	struct inet6_skb_parm *opt = IP6CB(skb);
	struct net *net = dev_net(dst->dev);
	u32 mtu;

	if (net->ipv6.devconf_all->forwarding == 0)
		goto error;

	if (skb_warn_if_lro(skb))
		goto drop;

	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
		goto drop;
	}

	if (skb->pkt_type != PACKET_HOST)
		goto drop;

	skb_forward_csum(skb);

	/*
	 *	We DO NOT do any processing on RA packets: we push them
	 *	to user level AS IS, without any warranty that the
	 *	application will be able to interpret them. The reason
	 *	is that we cannot do anything clever here.
	 *
	 *	We are not an end node, so if the packet contains
	 *	AH/ESP, we cannot do anything.
	 *	Defragmentation would also be a mistake: RA packets
	 *	cannot be fragmented, because there is no warranty
	 *	that different fragments will go along one path. --ANK
	 */
	if (opt->ra) {
		u8 *ptr = skb_network_header(skb) + opt->ra;
		if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
			return 0;
	}

	/*
	 *	check and decrement ttl
	 */
	if (hdr->hop_limit <= 1) {
		/* Force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
		IP6_INC_STATS_BH(net,
				 ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);

		kfree_skb(skb);
		return -ETIMEDOUT;
	}

	/* XXX: idev->cnf.proxy_ndp? */
	if (net->ipv6.devconf_all->proxy_ndp &&
	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
		int proxied = ip6_forward_proxy_check(skb);
		if (proxied > 0)
			return ip6_input(skb);
		else if (proxied < 0) {
			IP6_INC_STATS(net, ip6_dst_idev(dst),
				      IPSTATS_MIB_INDISCARDS);
			goto drop;
		}
	}

	if (!xfrm6_route_forward(skb)) {
		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
		goto drop;
	}
	dst = skb_dst(skb);

	/* The IPv6 specs say nothing about it, but it is clear that we
	   cannot send redirects for source routed frames.
	   We don't send redirects for frames decapsulated from IPsec.
	 */
	if (skb->dev == dst->dev && opt->srcrt == 0 && !skb_sec_path(skb)) {
		struct in6_addr *target = NULL;
		struct inet_peer *peer;
		struct rt6_info *rt;

		/*
		 *	incoming and outgoing devices are the same:
		 *	send a redirect.
		 */

		rt = (struct rt6_info *) dst;
		if (rt->rt6i_flags & RTF_GATEWAY)
			target = &rt->rt6i_gateway;
		else
			target = &hdr->daddr;

		peer = inet_getpeer_v6(net->ipv6.peers, &rt->rt6i_dst.addr, 1);

		/* Limit redirects both by destination (here)
		   and by source (inside ndisc_send_redirect)
		 */
		if (inet_peer_xrlim_allow(peer, 1*HZ))
			ndisc_send_redirect(skb, target);
		if (peer)
			inet_putpeer(peer);
	} else {
		int addrtype = ipv6_addr_type(&hdr->saddr);

		/* This check is security critical. */
		if (addrtype == IPV6_ADDR_ANY ||
		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
			goto error;
		if (addrtype & IPV6_ADDR_LINKLOCAL) {
			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
				    ICMPV6_NOT_NEIGHBOUR, 0);
			goto error;
		}
	}

	mtu = dst_mtu(dst);
	if (mtu < IPV6_MIN_MTU)
		mtu = IPV6_MIN_MTU;

	if ((!skb->local_df && skb->len > mtu && !skb_is_gso(skb)) ||
	    (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)) {
		/* Again, force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
		IP6_INC_STATS_BH(net,
				 ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS);
		IP6_INC_STATS_BH(net,
				 ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	if (skb_cow(skb, dst->dev->hard_header_len)) {
		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
		goto drop;
	}

	hdr = ipv6_hdr(skb);

	/* Mangling the hop count is delayed until after the skb COW */

	hdr->hop_limit--;

	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
	IP6_ADD_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
		       ip6_forward_finish);

error:
	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
drop:
	kfree_skb(skb);
	return -EINVAL;
}
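
/* Worked example (illustrative): the router-alert handling above reads the
 * two octets of option data in network byte order, so an MLD packet, whose
 * router alert carries the value 0, yields sel == (0x00 << 8) + 0x00 == 0,
 * which is then matched against ra->sel in ip6_call_ra_chain().  A
 * hypothetical helper spelling out the extraction:
 *
 *	static int example_ra_selector(const struct sk_buff *skb, int ra_off)
 *	{
 *		const u8 *ptr = skb_network_header(skb) + ra_off;
 *
 *		return (ptr[2] << 8) + ptr[3];
 *	}
 */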

static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	skb_dst_drop(to);
	skb_dst_set(to, dst_clone(skb_dst(from)));
	to->dev = from->dev;
	to->mark = from->mark;

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
	nf_copy(to, from);
#if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE)
	to->nf_trace = from->nf_trace;
#endif
	skb_copy_secmark(to, from);
}

int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
{
	struct sk_buff *frag;
	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
	struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
	struct ipv6hdr *tmp_hdr;
	struct frag_hdr *fh;
	unsigned int mtu, hlen, left, len;
	int hroom, troom;
	__be32 frag_id = 0;
	int ptr, offset = 0, err = 0;
	u8 *prevhdr, nexthdr = 0;
	struct net *net = dev_net(skb_dst(skb)->dev);

	hlen = ip6_find_1stfragopt(skb, &prevhdr);
	nexthdr = *prevhdr;

	mtu = ip6_skb_dst_mtu(skb);

	/* We must not fragment if the socket is set to force MTU discovery
	 * or if the skb is not generated by a local socket.
	 */
	if (unlikely(!skb->local_df && skb->len > mtu) ||
		     (IP6CB(skb)->frag_max_size &&
		      IP6CB(skb)->frag_max_size > mtu)) {
		if (skb->sk && dst_allfrag(skb_dst(skb)))
			sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);

		skb->dev = skb_dst(skb)->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	if (np && np->frag_size < mtu) {
		if (np->frag_size)
			mtu = np->frag_size;
	}
	mtu -= hlen + sizeof(struct frag_hdr);

	if (skb_has_frag_list(skb)) {
		int first_len = skb_pagelen(skb);
		struct sk_buff *frag2;

		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    skb_cloned(skb))
			goto slow_path;

		skb_walk_frags(skb, frag) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < hlen)
				goto slow_path_clean;

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path_clean;

			BUG_ON(frag->sk);
			if (skb->sk) {
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
			}
			skb->truesize -= frag->truesize;
		}

		err = 0;
		offset = 0;
		frag = skb_shinfo(skb)->frag_list;
		skb_frag_list_init(skb);
		/* BUILD HEADER */

		*prevhdr = NEXTHDR_FRAGMENT;
		tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
		if (!tmp_hdr) {
			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_FRAGFAILS);
			return -ENOMEM;
		}

		__skb_pull(skb, hlen);
		fh = (struct frag_hdr *)__skb_push(skb, sizeof(struct frag_hdr));
		__skb_push(skb, hlen);
		skb_reset_network_header(skb);
		memcpy(skb_network_header(skb), tmp_hdr, hlen);

		ipv6_select_ident(fh, rt);
		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		fh->frag_off = htons(IP6_MF);
		frag_id = fh->identification;

		first_len = skb_pagelen(skb);
		skb->data_len = first_len - skb_headlen(skb);
		skb->len = first_len;
		ipv6_hdr(skb)->payload_len = htons(first_len -
						   sizeof(struct ipv6hdr));

		dst_hold(&rt->dst);

		for (;;) {
			/* Prepare the header of the next frame
			 * before the previous one goes down. */
			if (frag) {
				frag->ip_summed = CHECKSUM_NONE;
				skb_reset_transport_header(frag);
				fh = (struct frag_hdr *)__skb_push(frag, sizeof(struct frag_hdr));
				__skb_push(frag, hlen);
				skb_reset_network_header(frag);
				memcpy(skb_network_header(frag), tmp_hdr,
				       hlen);
				offset += skb->len - hlen - sizeof(struct frag_hdr);
				fh->nexthdr = nexthdr;
				fh->reserved = 0;
				fh->frag_off = htons(offset);
				if (frag->next != NULL)
					fh->frag_off |= htons(IP6_MF);
				fh->identification = frag_id;
				ipv6_hdr(frag)->payload_len =
						htons(frag->len -
						      sizeof(struct ipv6hdr));
				ip6_copy_metadata(frag, skb);
			}

			err = output(skb);
			if (!err)
				IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
					      IPSTATS_MIB_FRAGCREATES);

			if (err || !frag)
				break;

			skb = frag;
			frag = skb->next;
			skb->next = NULL;
		}

		kfree(tmp_hdr);

		if (err == 0) {
			IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
				      IPSTATS_MIB_FRAGOKS);
			ip6_rt_put(rt);
			return 0;
		}

		while (frag) {
			skb = frag->next;
			kfree_skb(frag);
			frag = skb;
		}

		IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
			      IPSTATS_MIB_FRAGFAILS);
		ip6_rt_put(rt);
		return err;

slow_path_clean:
		skb_walk_frags(skb, frag2) {
			if (frag2 == frag)
				break;
			frag2->sk = NULL;
			frag2->destructor = NULL;
			skb->truesize += frag2->truesize;
		}
	}

slow_path:
	if ((skb->ip_summed == CHECKSUM_PARTIAL) &&
	    skb_checksum_help(skb))
		goto fail;

	left = skb->len - hlen;		/* Space per frame */
	ptr = hlen;			/* Where to start from */

	/*
	 *	Fragment the datagram.
	 */

	*prevhdr = NEXTHDR_FRAGMENT;
	hroom = LL_RESERVED_SPACE(rt->dst.dev);
	troom = rt->dst.dev->needed_tailroom;

	/*
	 *	Keep copying data until we run out.
	 */
	while (left > 0) {
		len = left;
		/* IF: it doesn't fit, use 'mtu' - the data space left */
		if (len > mtu)
			len = mtu;
		/* IF: we are not sending up to and including the packet end
		   then align the next start on an eight byte boundary */
		if (len < left) {
			len &= ~7;
		}
		/*
		 *	Allocate buffer.
		 */

		if ((frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
				      hroom + troom, GFP_ATOMIC)) == NULL) {
			NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_FRAGFAILS);
			err = -ENOMEM;
			goto fail;
		}

		/*
		 *	Set up data on packet
		 */

		ip6_copy_metadata(frag, skb);
		skb_reserve(frag, hroom);
		skb_put(frag, len + hlen + sizeof(struct frag_hdr));
		skb_reset_network_header(frag);
		fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
		frag->transport_header = (frag->network_header + hlen +
					  sizeof(struct frag_hdr));

		/*
		 *	Charge the memory for the fragment to any owner
		 *	it might possess
		 */
		if (skb->sk)
			skb_set_owner_w(frag, skb->sk);

		/*
		 *	Copy the packet header into the new buffer.
		 */
		skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);

		/*
		 *	Build fragment header.
		 */
		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		if (!frag_id) {
			ipv6_select_ident(fh, rt);
			frag_id = fh->identification;
		} else
			fh->identification = frag_id;

		/*
		 *	Copy a block of the IP datagram.
		 */
		if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len))
			BUG();
		left -= len;

		fh->frag_off = htons(offset);
		if (left > 0)
			fh->frag_off |= htons(IP6_MF);
		ipv6_hdr(frag)->payload_len = htons(frag->len -
						    sizeof(struct ipv6hdr));

		ptr += len;
		offset += len;

		/*
		 *	Put this fragment into the sending queue.
		 */
		err = output(frag);
		if (err)
			goto fail;

		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_FRAGCREATES);
	}
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGOKS);
	consume_skb(skb);
	return err;

fail:
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return err;
}
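
/* Worked example of the arithmetic above, assuming a 1500-byte link MTU
 * and a bare IPv6 header (hlen == 40): after "mtu -= hlen + sizeof(struct
 * frag_hdr)" each fragment may carry 1500 - 40 - 8 == 1452 payload bytes,
 * and the slow path rounds every non-final fragment down to a multiple of
 * eight (1452 & ~7 == 1448) because fragment offsets are expressed in
 * 8-byte units.  A hypothetical helper:
 *
 *	static unsigned int example_frag_payload(unsigned int mtu,
 *						 unsigned int hlen, bool last)
 *	{
 *		unsigned int len = mtu - hlen - sizeof(struct frag_hdr);
 *
 *		return last ? len : len & ~7;
 *	}
 */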

static inline int ip6_rt_check(const struct rt6key *rt_key,
			       const struct in6_addr *fl_addr,
			       const struct in6_addr *addr_cache)
{
	return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
		(addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache));
}

static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
					  struct dst_entry *dst,
					  const struct flowi6 *fl6)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct rt6_info *rt = (struct rt6_info *)dst;

	if (!dst)
		goto out;

	/* Yes, checking route validity in the not connected
	 * case is not very simple. Take into account
	 * that we do not support routing by source, TOS,
	 * and MSG_DONTROUTE		--ANK (980726)
	 *
	 * 1. ip6_rt_check(): If the route was a host route,
	 *    check that the cached destination is current.
	 *    If it is a network route, we still may
	 *    check its validity using the saved pointer
	 *    to the last used address: daddr_cache.
	 *    We do not want to save the whole address now
	 *    (because the main consumer of this service
	 *    is tcp, which does not have this problem),
	 *    so the last trick works only on connected
	 *    sockets.
	 * 2. oif also should be the same.
	 */
	if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
#ifdef CONFIG_IPV6_SUBTREES
	    ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
#endif
	    (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) {
		dst_release(dst);
		dst = NULL;
	}

out:
	return dst;
}

static int ip6_dst_lookup_tail(struct sock *sk,
			       struct dst_entry **dst, struct flowi6 *fl6)
{
	struct net *net = sock_net(sk);
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	struct neighbour *n;
	struct rt6_info *rt;
#endif
	int err;

	if (*dst == NULL)
		*dst = ip6_route_output(net, sk, fl6);

	if ((err = (*dst)->error))
		goto out_err_release;

	if (ipv6_addr_any(&fl6->saddr)) {
		struct rt6_info *rt = (struct rt6_info *) *dst;
		err = ip6_route_get_saddr(net, rt, &fl6->daddr,
					  sk ? inet6_sk(sk)->srcprefs : 0,
					  &fl6->saddr);
		if (err)
			goto out_err_release;
	}

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	/*
	 * If the dst entry we've looked up has a neighbour
	 * entry that is in the INCOMPLETE state and the src
	 * address from the flow is marked as OPTIMISTIC, we
	 * release the found dst entry and replace it instead
	 * with the dst entry of the nexthop router.
	 */
	rt = (struct rt6_info *) *dst;
	rcu_read_lock_bh();
	n = __ipv6_neigh_lookup_noref(rt->dst.dev, rt6_nexthop(rt, &fl6->daddr));
	err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
	rcu_read_unlock_bh();

	if (err) {
		struct inet6_ifaddr *ifp;
		struct flowi6 fl_gw6;
		int redirect;

		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
				      (*dst)->dev, 1);

		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
		if (ifp)
			in6_ifa_put(ifp);

		if (redirect) {
			/*
			 * We need to get the dst entry for the
			 * default router instead.
			 */
			dst_release(*dst);
			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
			*dst = ip6_route_output(net, sk, &fl_gw6);
			if ((err = (*dst)->error))
				goto out_err_release;
		}
	}
#endif

	return 0;

out_err_release:
	if (err == -ENETUNREACH)
		IP6_INC_STATS_BH(net, NULL, IPSTATS_MIB_OUTNOROUTES);
	dst_release(*dst);
	*dst = NULL;
	return err;
}

/**
 *	ip6_dst_lookup - perform route lookup on flow
 *	@sk: socket which provides route info
 *	@dst: pointer to dst_entry * for result
 *	@fl6: flow to lookup
 *
 *	This function performs a route lookup on the given flow.
 *
 *	It returns zero on success, or a standard errno code on error.
 */
int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi6 *fl6)
{
	*dst = NULL;
	return ip6_dst_lookup_tail(sk, dst, fl6);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup);

/**
 *	ip6_dst_lookup_flow - perform route lookup on flow with ipsec
 *	@sk: socket which provides route info
 *	@fl6: flow to lookup
 *	@final_dst: final destination address for ipsec lookup
 *	@can_sleep: we are in a sleepable context
 *
 *	This function performs a route lookup on the given flow.
 *
 *	It returns a valid dst pointer on success, or a pointer-encoded
 *	error code.
 */
struct dst_entry *ip6_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
				      const struct in6_addr *final_dst,
				      bool can_sleep)
{
	struct dst_entry *dst = NULL;
	int err;

	err = ip6_dst_lookup_tail(sk, &dst, fl6);
	if (err)
		return ERR_PTR(err);
	if (final_dst)
		fl6->daddr = *final_dst;
	if (can_sleep)
		fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;

	return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
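
/* Usage sketch: unlike ip6_dst_lookup(), the _flow variants return the dst
 * itself and encode failures in the pointer, so callers test with
 * IS_ERR()/PTR_ERR() rather than checking an errno return:
 *
 *	dst = ip6_dst_lookup_flow(sk, &fl6, final_p, false);
 *	if (IS_ERR(dst)) {
 *		err = PTR_ERR(dst);
 *		goto out;
 *	}
 *
 * (final_p here stands for an optional final destination, e.g. one taken
 * from a routing header; pass NULL when there is none.)
 */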

/**
 *	ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
 *	@sk: socket which provides the dst cache and route info
 *	@fl6: flow to lookup
 *	@final_dst: final destination address for ipsec lookup
 *	@can_sleep: we are in a sleepable context
 *
 *	This function performs a route lookup on the given flow with the
 *	possibility of using the cached route in the socket if it is valid.
 *	It will take the socket dst lock when operating on the dst cache.
 *	As a result, this function can only be used in process context.
 *
 *	It returns a valid dst pointer on success, or a pointer-encoded
 *	error code.
 */
struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
					 const struct in6_addr *final_dst,
					 bool can_sleep)
{
	struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
	int err;

	dst = ip6_sk_dst_check(sk, dst, fl6);

	err = ip6_dst_lookup_tail(sk, &dst, fl6);
	if (err)
		return ERR_PTR(err);
	if (final_dst)
		fl6->daddr = *final_dst;
	if (can_sleep)
		fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;

	return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
}
EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);

static inline int ip6_ufo_append_data(struct sock *sk,
			int getfrag(void *from, char *to, int offset, int len,
			int odd, struct sk_buff *skb),
			void *from, int length, int hh_len, int fragheaderlen,
			int transhdrlen, int mtu, unsigned int flags,
			struct rt6_info *rt)
{
	struct sk_buff *skb;
	int err;

	/* There is support for UDP large send offload by the network
	 * device, so create one single skb packet containing the complete
	 * udp datagram.
	 */
	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
		skb = sock_alloc_send_skb(sk,
			hh_len + fragheaderlen + transhdrlen + 20,
			(flags & MSG_DONTWAIT), &err);
		if (skb == NULL)
			return err;

		/* reserve space for the hardware header */
		skb_reserve(skb, hh_len);

		/* create space for the UDP/IP header */
		skb_put(skb, fragheaderlen + transhdrlen);

		/* initialize the network header pointer */
		skb_reset_network_header(skb);

		/* initialize the protocol header pointer */
		skb->transport_header = skb->network_header + fragheaderlen;

		skb->ip_summed = CHECKSUM_PARTIAL;
		skb->csum = 0;
	}

	err = skb_append_datato_frags(sk, skb, getfrag, from,
				      (length - transhdrlen));
	if (!err) {
		struct frag_hdr fhdr;

		/* Specify the length of each IPv6 datagram fragment.
		 * It has to be a multiple of 8.
		 */
		skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
					     sizeof(struct frag_hdr)) & ~7;
		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
		ipv6_select_ident(&fhdr, rt);
		skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
		__skb_queue_tail(&sk->sk_write_queue, skb);

		return 0;
	}
	/* There is not enough support to do UDP LSO,
	 * so follow the normal path.
	 */
	kfree_skb(skb);

	return err;
}

static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
					       gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
						gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

static void ip6_append_data_mtu(int *mtu,
				int *maxfraglen,
				unsigned int fragheaderlen,
				struct sk_buff *skb,
				struct rt6_info *rt)
{
	if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
		if (skb == NULL) {
			/* first fragment, reserve header_len */
			*mtu = *mtu - rt->dst.header_len;

		} else {
			/*
			 * this fragment is not the first; the header
			 * space is regarded as data space.
			 */
			*mtu = dst_mtu(rt->dst.path);
		}
		*maxfraglen = ((*mtu - fragheaderlen) & ~7)
			      + fragheaderlen - sizeof(struct frag_hdr);
	}
}
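
/* Worked example of the maxfraglen computation above, with mtu == 1500 and
 * fragheaderlen == 40 (a bare IPv6 header):
 *
 *	maxfraglen = ((1500 - 40) & ~7) + 40 - sizeof(struct frag_hdr)
 *		   = 1456 + 40 - 8
 *		   = 1488
 *
 * so each queued fragment may grow to 1488 bytes before a new one is
 * started, keeping the fragmentable part a multiple of eight as the
 * fragment-offset encoding requires.
 */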

int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
	int offset, int len, int odd, struct sk_buff *skb),
	void *from, int length, int transhdrlen,
	int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi6 *fl6,
	struct rt6_info *rt, unsigned int flags, int dontfrag)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct inet_cork *cork;
	struct sk_buff *skb, *skb_prev = NULL;
	unsigned int maxfraglen, fragheaderlen;
	int exthdrlen;
	int dst_exthdrlen;
	int hh_len;
	int mtu;
	int copy;
	int err;
	int offset = 0;
	__u8 tx_flags = 0;

	if (flags & MSG_PROBE)
		return 0;
	cork = &inet->cork.base;
	if (skb_queue_empty(&sk->sk_write_queue)) {
		/*
		 * setup for corking
		 */
		if (opt) {
			if (WARN_ON(np->cork.opt))
				return -EINVAL;

			np->cork.opt = kmalloc(opt->tot_len, sk->sk_allocation);
			if (unlikely(np->cork.opt == NULL))
				return -ENOBUFS;

			np->cork.opt->tot_len = opt->tot_len;
			np->cork.opt->opt_flen = opt->opt_flen;
			np->cork.opt->opt_nflen = opt->opt_nflen;

			np->cork.opt->dst0opt = ip6_opt_dup(opt->dst0opt,
							    sk->sk_allocation);
			if (opt->dst0opt && !np->cork.opt->dst0opt)
				return -ENOBUFS;

			np->cork.opt->dst1opt = ip6_opt_dup(opt->dst1opt,
							    sk->sk_allocation);
			if (opt->dst1opt && !np->cork.opt->dst1opt)
				return -ENOBUFS;

			np->cork.opt->hopopt = ip6_opt_dup(opt->hopopt,
							   sk->sk_allocation);
			if (opt->hopopt && !np->cork.opt->hopopt)
				return -ENOBUFS;

			np->cork.opt->srcrt = ip6_rthdr_dup(opt->srcrt,
							    sk->sk_allocation);
			if (opt->srcrt && !np->cork.opt->srcrt)
				return -ENOBUFS;

			/* need source address above --miyazawa */
		}
		dst_hold(&rt->dst);
		cork->dst = &rt->dst;
		inet->cork.fl.u.ip6 = *fl6;
		np->cork.hop_limit = hlimit;
		np->cork.tclass = tclass;
		if (rt->dst.flags & DST_XFRM_TUNNEL)
			mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
			      rt->dst.dev->mtu : dst_mtu(&rt->dst);
		else
			mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
			      rt->dst.dev->mtu : dst_mtu(rt->dst.path);
		if (np->frag_size < mtu) {
			if (np->frag_size)
				mtu = np->frag_size;
		}
		cork->fragsize = mtu;
		if (dst_allfrag(rt->dst.path))
			cork->flags |= IPCORK_ALLFRAG;
		cork->length = 0;
		exthdrlen = (opt ? opt->opt_flen : 0);
		length += exthdrlen;
		transhdrlen += exthdrlen;
		dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
	} else {
		rt = (struct rt6_info *)cork->dst;
		fl6 = &inet->cork.fl.u.ip6;
		opt = np->cork.opt;
		transhdrlen = 0;
		exthdrlen = 0;
		dst_exthdrlen = 0;
		mtu = cork->fragsize;
	}

	hh_len = LL_RESERVED_SPACE(rt->dst.dev);

	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
			(opt ? opt->opt_nflen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);

	if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
		if (cork->length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
			ipv6_local_error(sk, EMSGSIZE, fl6, mtu - exthdrlen);
			return -EMSGSIZE;
		}
	}

	/* For UDP, check if TX timestamping is enabled */
	if (sk->sk_type == SOCK_DGRAM) {
		err = sock_tx_timestamp(sk, &tx_flags);
		if (err)
			goto error;
	}

	/*
	 * Let's try using as much space as possible.
	 * Use MTU if the total length of the message fits into the MTU.
	 * Otherwise, we need to reserve fragment header and
	 * fragment alignment (= 8-15 octets, in total).
	 *
	 * Note that we may need to "move" the data from the tail
	 * of the buffer to the new fragment when we split
	 * the message.
	 *
	 * FIXME: It may be fragmented into multiple chunks
	 *        at once if non-fragmentable extension headers
	 *        are too large.
	 * --yoshfuji
	 */

	cork->length += length;
	if (length > mtu) {
		int proto = sk->sk_protocol;
		if (dontfrag && (proto == IPPROTO_UDP || proto == IPPROTO_RAW)) {
			ipv6_local_rxpmtu(sk, fl6, mtu - exthdrlen);
			return -EMSGSIZE;
		}

		if (proto == IPPROTO_UDP &&
		    (rt->dst.dev->features & NETIF_F_UFO)) {

			err = ip6_ufo_append_data(sk, getfrag, from, length,
						  hh_len, fragheaderlen,
						  transhdrlen, mtu, flags, rt);
			if (err)
				goto error;
			return 0;
		}
	}

	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into the current packet. */
		copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;

		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen;
alloc_new_skb:
			/* There's no room in the current skb */
			if (skb)
				fraggap = skb->len - maxfraglen;
			else
				fraggap = 0;
			/* update mtu and maxfraglen if necessary */
			if (skb == NULL || skb_prev == NULL)
				ip6_append_data_mtu(&mtu, &maxfraglen,
						    fragheaderlen, skb, rt);

			skb_prev = skb;

			/*
			 * If the remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;

			if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
				datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
			if ((flags & MSG_MORE) &&
			    !(rt->dst.dev->features & NETIF_F_SG))
				alloclen = mtu;
			else
				alloclen = datalen + fragheaderlen;

			alloclen += dst_exthdrlen;

			if (datalen != length + fraggap) {
				/*
				 * this is not the last fragment; the trailer
				 * space is regarded as data space.
				 */
				datalen += rt->dst.trailer_len;
			}

			alloclen += rt->dst.trailer_len;
			fraglen = datalen + fragheaderlen;

			/*
			 * We just reserve space for the fragment header.
			 * Note: this may be an overallocation if the message
			 * (without MSG_MORE) fits into the MTU.
			 */
			alloclen += sizeof(struct frag_hdr);

			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk,
						alloclen + hh_len,
						(flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				if (atomic_read(&sk->sk_wmem_alloc) <=
				    2 * sk->sk_sndbuf)
					skb = sock_wmalloc(sk,
							   alloclen + hh_len, 1,
							   sk->sk_allocation);
				if (unlikely(skb == NULL))
					err = -ENOBUFS;
				else {
					/* Only the initial fragment
					 * is time stamped.
					 */
					tx_flags = 0;
				}
			}
			if (skb == NULL)
				goto error;
			/*
			 *	Fill in the control structures
			 */
			skb->ip_summed = CHECKSUM_NONE;
			skb->csum = 0;
			/* reserve for fragmentation and ipsec header */
			skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
				    dst_exthdrlen);

			if (sk->sk_type == SOCK_DGRAM)
				skb_shinfo(skb)->tx_flags = tx_flags;

			/*
			 *	Find where to start putting bytes
			 */
			data = skb_put(skb, fraglen);
			skb_set_network_header(skb, exthdrlen);
			data += fragheaderlen;
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}
			copy = datalen - transhdrlen - fraggap;

			if (copy < 0) {
				err = -EINVAL;
				kfree_skb(skb);
				goto error;
			} else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= datalen - fraggap;
			transhdrlen = 0;
			exthdrlen = 0;
			dst_exthdrlen = 0;

			/*
			 * Put the packet on the pending queue
			 */
			__skb_queue_tail(&sk->sk_write_queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		if (!(rt->dst.dev->features & NETIF_F_SG)) {
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
						offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else {
			int i = skb_shinfo(skb)->nr_frags;
			struct page_frag *pfrag = sk_page_frag(sk);

			err = -ENOMEM;
			if (!sk_page_frag_refill(sk, pfrag))
				goto error;

			if (!skb_can_coalesce(skb, i, pfrag->page,
					      pfrag->offset)) {
				err = -EMSGSIZE;
				if (i == MAX_SKB_FRAGS)
					goto error;

				__skb_fill_page_desc(skb, i, pfrag->page,
						     pfrag->offset, 0);
				skb_shinfo(skb)->nr_frags = ++i;
				get_page(pfrag->page);
			}
			copy = min_t(int, copy, pfrag->size - pfrag->offset);
			if (getfrag(from,
				    page_address(pfrag->page) + pfrag->offset,
				    offset, copy, skb->len, skb) < 0)
				goto error_efault;

			pfrag->offset += copy;
			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			atomic_add(copy, &sk->sk_wmem_alloc);
		}
		offset += copy;
		length -= copy;
	}

	return 0;

error_efault:
	err = -EFAULT;
error:
	cork->length -= length;
	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	return err;
}
EXPORT_SYMBOL_GPL(ip6_append_data);
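
/* Usage sketch (a hypothetical datagram sender, modelled on how UDP uses
 * this API): ip6_append_data() only queues data on sk->sk_write_queue;
 * nothing is transmitted until ip6_push_pending_frames() builds the IPv6
 * header and calls ip6_local_out(), while ip6_flush_pending_frames() is
 * the error/abort path:
 *
 *	err = ip6_append_data(sk, getfrag, msg, len,
 *			      sizeof(struct udphdr), hlimit, tclass,
 *			      opt, &fl6, rt, msg->msg_flags, dontfrag);
 *	if (err)
 *		ip6_flush_pending_frames(sk);
 *	else if (!(msg->msg_flags & MSG_MORE))
 *		err = ip6_push_pending_frames(sk);
 */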

static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np)
{
	if (np->cork.opt) {
		kfree(np->cork.opt->dst0opt);
		kfree(np->cork.opt->dst1opt);
		kfree(np->cork.opt->hopopt);
		kfree(np->cork.opt->srcrt);
		kfree(np->cork.opt);
		np->cork.opt = NULL;
	}

	if (inet->cork.base.dst) {
		dst_release(inet->cork.base.dst);
		inet->cork.base.dst = NULL;
		inet->cork.base.flags &= ~IPCORK_ALLFRAG;
	}
	memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
}

int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct net *net = sock_net(sk);
	struct ipv6hdr *hdr;
	struct ipv6_txoptions *opt = np->cork.opt;
	struct rt6_info *rt = (struct rt6_info *)inet->cork.base.dst;
	struct flowi6 *fl6 = &inet->cork.fl.u.ip6;
	unsigned char proto = fl6->flowi6_proto;
	int err = 0;

	if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Allow local fragmentation. */
	if (np->pmtudisc < IPV6_PMTUDISC_DO)
		skb->local_df = 1;

	*final_dst = fl6->daddr;
	__skb_pull(skb, skb_network_header_len(skb));
	if (opt && opt->opt_flen)
		ipv6_push_frag_opts(skb, opt, &proto);
	if (opt && opt->opt_nflen)
		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	ip6_flow_hdr(hdr, np->cork.tclass, fl6->flowlabel);
	hdr->hop_limit = np->cork.hop_limit;
	hdr->nexthdr = proto;
	hdr->saddr = fl6->saddr;
	hdr->daddr = *final_dst;

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	skb_dst_set(skb, dst_clone(&rt->dst));
	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
	if (proto == IPPROTO_ICMPV6) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		ICMP6MSGOUT_INC_STATS_BH(net, idev, icmp6_hdr(skb)->icmp6_type);
		ICMP6_INC_STATS_BH(net, idev, ICMP6_MIB_OUTMSGS);
	}

	err = ip6_local_out(skb);
	if (err) {
		if (err > 0)
			err = net_xmit_errno(err);
		if (err)
			goto error;
	}

out:
	ip6_cork_release(inet, np);
	return err;
error:
	IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	goto out;
}
EXPORT_SYMBOL_GPL(ip6_push_pending_frames);

void ip6_flush_pending_frames(struct sock *sk)
{
	struct sk_buff *skb;

	while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
		if (skb_dst(skb))
			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
	}

	ip6_cork_release(inet_sk(sk), inet6_sk(sk));
}
EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
1583