/* xref: /openbmc/linux/net/ipv6/ip6_output.c (revision 63159f29be1df7f93563a8a0f78c5e65fc844ed6) */
/*
 *	IPv6 output functions
 *	Linux INET6 implementation
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	Based on linux/net/ipv4/ip_output.c
 *
 *	This program is free software; you can redistribute it and/or
 *      modify it under the terms of the GNU General Public License
 *      as published by the Free Software Foundation; either version
 *      2 of the License, or (at your option) any later version.
 *
 *	Changes:
 *	A.N.Kuznetsov	:	arithmetic in fragmentation.
 *				extension headers are implemented.
 *				route changes now work.
 *				ip6_forward does not confuse sniffers.
 *				etc.
 *
 *      H. von Brand    :       Added missing #include <linux/string.h>
 *	Imran Patel	:	frag id should be in NBO
 *      Kazunori MIYAZAWA @USAGI
 *			:       add ip6_append_data and related functions
 *				for datagram xmit
 */

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/in6.h>
#include <linux/tcp.h>
#include <linux/route.h>
#include <linux/module.h>
#include <linux/slab.h>

#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>

#include <net/sock.h>
#include <net/snmp.h>

#include <net/ipv6.h>
#include <net/ndisc.h>
#include <net/protocol.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
#include <net/rawv6.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/checksum.h>
#include <linux/mroute6.h>

static int ip6_finish_output2(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct net_device *dev = dst->dev;
	struct neighbour *neigh;
	struct in6_addr *nexthop;
	int ret;

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(skb->sk) &&
		    ((mroute6_socket(dev_net(dev), skb) &&
		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
		     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
					 &ipv6_hdr(skb)->saddr))) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

			/* Do not check for IFF_ALLMULTI; multicast routing
			   is not supported in any case.
			 */
			if (newskb)
				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
					newskb, NULL, newskb->dev,
					dev_loopback_xmit);

			if (ipv6_hdr(skb)->hop_limit == 0) {
				IP6_INC_STATS(dev_net(dev), idev,
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return 0;
			}
		}

		IP6_UPD_PO_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCAST,
				skb->len);

		if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
		    IPV6_ADDR_SCOPE_NODELOCAL &&
		    !(dev->flags & IFF_LOOPBACK)) {
			kfree_skb(skb);
			return 0;
		}
	}

	rcu_read_lock_bh();
	nexthop = rt6_nexthop((struct rt6_info *)dst);
	neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
	if (unlikely(!neigh))
		neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
	if (!IS_ERR(neigh)) {
		ret = dst_neigh_output(dst, neigh, skb);
		rcu_read_unlock_bh();
		return ret;
	}
	rcu_read_unlock_bh();

	IP6_INC_STATS(dev_net(dst->dev),
		      ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
	kfree_skb(skb);
	return -EINVAL;
}

static int ip6_finish_output(struct sk_buff *skb)
{
	if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
	    dst_allfrag(skb_dst(skb)) ||
	    (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
		return ip6_fragment(skb, ip6_finish_output2);
	else
		return ip6_finish_output2(skb);
}
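
/* Worked example of the decision in ip6_finish_output() above (the numbers
 * are illustrative, not from this file): a locally generated 2800-byte,
 * non-GSO packet on a route with a 1500-byte MTU exceeds
 * ip6_skb_dst_mtu() and is handed to ip6_fragment(); the same 2800 bytes
 * as a GSO packet go straight to ip6_finish_output2(), since segmentation
 * later brings each resulting packet under the MTU. dst_allfrag() (set,
 * roughly, when a reported path MTU fell below the IPv6 minimum of 1280,
 * so every packet must carry a fragment header) and a conntrack-recorded
 * frag_max_size force the fragmentation path even for smaller packets.
 */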

int ip6_output(struct sock *sk, struct sk_buff *skb)
{
	struct net_device *dev = skb_dst(skb)->dev;
	struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
	if (unlikely(idev->cnf.disable_ipv6)) {
		IP6_INC_STATS(dev_net(dev), idev,
			      IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
		return 0;
	}

	return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, skb, NULL, dev,
			    ip6_finish_output,
			    !(IP6CB(skb)->flags & IP6SKB_REROUTED));
}
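
/* Note on the NF_HOOK_COND call above: the NF_INET_POST_ROUTING chain is
 * only traversed when the condition argument is true, so packets flagged
 * IP6SKB_REROUTED (which already passed through it before being rerouted)
 * skip the hooks and go straight to ip6_finish_output().
 */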

/*
 *	xmit an sk_buff (used by TCP, SCTP and DCCP)
 */

int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
	     struct ipv6_txoptions *opt, int tclass)
{
	struct net *net = sock_net(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct in6_addr *first_hop = &fl6->daddr;
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr;
	u8  proto = fl6->flowi6_proto;
	int seg_len = skb->len;
	int hlimit = -1;
	u32 mtu;

	if (opt) {
		unsigned int head_room;

		/* First: exthdrs may take lots of space (~8K for now);
		   MAX_HEADER is not enough.
		 */
		head_room = opt->opt_nflen + opt->opt_flen;
		seg_len += head_room;
		head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
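		/* Illustrative arithmetic (assumed numbers, not from this
		 * file): with a 16-byte hop-by-hop option (opt_nflen = 16)
		 * and no fragmentable options (opt_flen = 0), head_room is
		 * 16 + 40 (IPv6 header) + LL_RESERVED_SPACE() for the link
		 * header, and seg_len grows by the 16 option bytes that are
		 * counted in payload_len below.
		 */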

		if (skb_headroom(skb) < head_room) {
			struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
			if (!skb2) {
				IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return -ENOBUFS;
			}
			consume_skb(skb);
			skb = skb2;
			skb_set_owner_w(skb, sk);
		}
		if (opt->opt_flen)
			ipv6_push_frag_opts(skb, opt, &proto);
		if (opt->opt_nflen)
			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
	}

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	/*
	 *	Fill in the IPv6 header
	 */
	if (np)
		hlimit = np->hop_limit;
	if (hlimit < 0)
		hlimit = ip6_dst_hoplimit(dst);

	ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
						     np->autoflowlabel));

	hdr->payload_len = htons(seg_len);
	hdr->nexthdr = proto;
	hdr->hop_limit = hlimit;

	hdr->saddr = fl6->saddr;
	hdr->daddr = *first_hop;

	skb->protocol = htons(ETH_P_IPV6);
	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	mtu = dst_mtu(dst);
	if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
		IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_OUT, skb->len);
		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
			       dst->dev, dst_output);
	}

	skb->dev = dst->dev;
	ipv6_local_error(sk, EMSGSIZE, fl6, mtu);
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return -EMSGSIZE;
}
EXPORT_SYMBOL(ip6_xmit);

static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
	struct ip6_ra_chain *ra;
	struct sock *last = NULL;

	read_lock(&ip6_ra_lock);
	for (ra = ip6_ra_chain; ra; ra = ra->next) {
		struct sock *sk = ra->sk;
		if (sk && ra->sel == sel &&
		    (!sk->sk_bound_dev_if ||
		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
			if (last) {
				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
				if (skb2)
					rawv6_rcv(last, skb2);
			}
			last = sk;
		}
	}

	if (last) {
		rawv6_rcv(last, skb);
		read_unlock(&ip6_ra_lock);
		return 1;
	}
	read_unlock(&ip6_ra_lock);
	return 0;
}
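
/* Delivery pattern in ip6_call_ra_chain() above: every matching Router
 * Alert socket except the last receives a clone, and the original skb is
 * delivered to the last match, so no copy is made in the common
 * single-listener case. The return value tells the caller whether the
 * packet was consumed.
 */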

static int ip6_forward_proxy_check(struct sk_buff *skb)
{
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	u8 nexthdr = hdr->nexthdr;
	__be16 frag_off;
	int offset;

	if (ipv6_ext_hdr(nexthdr)) {
		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
		if (offset < 0)
			return 0;
	} else
		offset = sizeof(struct ipv6hdr);

	if (nexthdr == IPPROTO_ICMPV6) {
		struct icmp6hdr *icmp6;

		if (!pskb_may_pull(skb, (skb_network_header(skb) +
					 offset + 1 - skb->data)))
			return 0;

		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);

		switch (icmp6->icmp6_type) {
		case NDISC_ROUTER_SOLICITATION:
		case NDISC_ROUTER_ADVERTISEMENT:
		case NDISC_NEIGHBOUR_SOLICITATION:
		case NDISC_NEIGHBOUR_ADVERTISEMENT:
		case NDISC_REDIRECT:
			/* Unicast neighbour discovery messages destined
			 * to the proxied address must be processed locally,
			 * so pass them to the input function.
			 */
			return 1;
		default:
			break;
		}
	}

	/*
	 * The proxying router can't forward traffic sent to a link-local
	 * address, so signal the sender and discard the packet. This
	 * behavior is clarified by the MIPv6 specification.
	 */
	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
		dst_link_failure(skb);
		return -1;
	}

	return 0;
}

static inline int ip6_forward_finish(struct sk_buff *skb)
{
	skb_sender_cpu_clear(skb);
	return dst_output(skb);
}

static unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst)
{
	unsigned int mtu;
	struct inet6_dev *idev;

	if (dst_metric_locked(dst, RTAX_MTU)) {
		mtu = dst_metric_raw(dst, RTAX_MTU);
		if (mtu)
			return mtu;
	}

	mtu = IPV6_MIN_MTU;
	rcu_read_lock();
	idev = __in6_dev_get(dst->dev);
	if (idev)
		mtu = idev->cnf.mtu6;
	rcu_read_unlock();

	return mtu;
}

static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
{
	if (skb->len <= mtu)
		return false;

	/* ipv6 conntrack defrag sets max_frag_size + ignore_df */
	if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
		return true;

	if (skb->ignore_df)
		return false;

	if (skb_is_gso(skb) && skb_gso_network_seglen(skb) <= mtu)
		return false;

	return true;
}

int ip6_forward(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	struct inet6_skb_parm *opt = IP6CB(skb);
	struct net *net = dev_net(dst->dev);
	u32 mtu;

	if (net->ipv6.devconf_all->forwarding == 0)
		goto error;

	if (skb->pkt_type != PACKET_HOST)
		goto drop;

	if (skb_warn_if_lro(skb))
		goto drop;

	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
		IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
				 IPSTATS_MIB_INDISCARDS);
		goto drop;
	}

	skb_forward_csum(skb);

	/*
	 *	We do NOT process RA packets; we push them to user
	 *	level AS IS without any warranty that the application
	 *	will be able to interpret them. The reason is that we
	 *	cannot do anything clever here.
	 *
	 *	We are not the end node, so if the packet contains
	 *	AH/ESP we cannot do anything with it.
	 *	Defragmentation would also be a mistake: RA packets
	 *	must not be fragmented, because there is no guarantee
	 *	that different fragments will go along one path. --ANK
	 */
	if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
		if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
			return 0;
	}

	/*
	 *	check and decrement ttl
	 */
	if (hdr->hop_limit <= 1) {
		/* Force the OUTPUT device to be used for the source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
		IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
				 IPSTATS_MIB_INHDRERRORS);

		kfree_skb(skb);
		return -ETIMEDOUT;
	}

	/* XXX: idev->cnf.proxy_ndp? */
	if (net->ipv6.devconf_all->proxy_ndp &&
	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
		int proxied = ip6_forward_proxy_check(skb);
		if (proxied > 0)
			return ip6_input(skb);
		else if (proxied < 0) {
			IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
					 IPSTATS_MIB_INDISCARDS);
			goto drop;
		}
	}

	if (!xfrm6_route_forward(skb)) {
		IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
				 IPSTATS_MIB_INDISCARDS);
		goto drop;
	}
	dst = skb_dst(skb);

	/* The IPv6 specs say nothing about it, but it is clear that we cannot
	   send redirects for source-routed frames.
	   Nor do we send redirects for frames decapsulated from IPsec.
	 */
	if (skb->dev == dst->dev && opt->srcrt == 0 && !skb_sec_path(skb)) {
		struct in6_addr *target = NULL;
		struct inet_peer *peer;
		struct rt6_info *rt;

		/*
		 *	incoming and outgoing devices are the same:
		 *	send a redirect.
		 */

		rt = (struct rt6_info *) dst;
		if (rt->rt6i_flags & RTF_GATEWAY)
			target = &rt->rt6i_gateway;
		else
			target = &hdr->daddr;

		peer = inet_getpeer_v6(net->ipv6.peers, &rt->rt6i_dst.addr, 1);

		/* Limit redirects both by destination (here)
		   and by source (inside ndisc_send_redirect)
		 */
		if (inet_peer_xrlim_allow(peer, 1*HZ))
			ndisc_send_redirect(skb, target);
		if (peer)
			inet_putpeer(peer);
	} else {
		int addrtype = ipv6_addr_type(&hdr->saddr);

		/* This check is security critical. */
		if (addrtype == IPV6_ADDR_ANY ||
		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
			goto error;
		if (addrtype & IPV6_ADDR_LINKLOCAL) {
			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
				    ICMPV6_NOT_NEIGHBOUR, 0);
			goto error;
		}
	}

	mtu = ip6_dst_mtu_forward(dst);
	if (mtu < IPV6_MIN_MTU)
		mtu = IPV6_MIN_MTU;

	if (ip6_pkt_too_big(skb, mtu)) {
		/* Again, force the OUTPUT device to be used for the source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
		IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
				 IPSTATS_MIB_INTOOBIGERRORS);
		IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
				 IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	if (skb_cow(skb, dst->dev->hard_header_len)) {
		IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
				 IPSTATS_MIB_OUTDISCARDS);
		goto drop;
	}

	hdr = ipv6_hdr(skb);

	/* Decrementing the hop limit is delayed until after the skb COW */

	hdr->hop_limit--;

	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
	IP6_ADD_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
		       ip6_forward_finish);

error:
	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
drop:
	kfree_skb(skb);
	return -EINVAL;
}

static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	skb_dst_drop(to);
	skb_dst_set(to, dst_clone(skb_dst(from)));
	to->dev = from->dev;
	to->mark = from->mark;

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
	nf_copy(to, from);
	skb_copy_secmark(to, from);
}

int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
{
	struct sk_buff *frag;
	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
	struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
	struct ipv6hdr *tmp_hdr;
	struct frag_hdr *fh;
	unsigned int mtu, hlen, left, len;
	int hroom, troom;
	__be32 frag_id = 0;
	int ptr, offset = 0, err = 0;
	u8 *prevhdr, nexthdr = 0;
	struct net *net = dev_net(skb_dst(skb)->dev);

	hlen = ip6_find_1stfragopt(skb, &prevhdr);
	nexthdr = *prevhdr;

	mtu = ip6_skb_dst_mtu(skb);

	/* We must not fragment if the socket is set to force MTU discovery
	 * or if the skb is not generated by a local socket.
	 */
	if (unlikely(!skb->ignore_df && skb->len > mtu) ||
		     (IP6CB(skb)->frag_max_size &&
		      IP6CB(skb)->frag_max_size > mtu)) {
		if (skb->sk && dst_allfrag(skb_dst(skb)))
			sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);

		skb->dev = skb_dst(skb)->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	if (np && np->frag_size < mtu) {
		if (np->frag_size)
			mtu = np->frag_size;
	}
	mtu -= hlen + sizeof(struct frag_hdr);
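	/* Illustrative arithmetic (assumed numbers, not from this file):
	 * with a 1500-byte path MTU and a bare 40-byte IPv6 header
	 * (hlen = 40), mtu is now 1500 - 40 - 8 = 1452 bytes of
	 * fragmentable payload per fragment; every fragment except the
	 * last must carry a multiple of 8 bytes.
	 */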

	if (skb_has_frag_list(skb)) {
		int first_len = skb_pagelen(skb);
		struct sk_buff *frag2;

		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    skb_cloned(skb))
			goto slow_path;

		skb_walk_frags(skb, frag) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < hlen)
				goto slow_path_clean;

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path_clean;

			BUG_ON(frag->sk);
			if (skb->sk) {
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
			}
			skb->truesize -= frag->truesize;
		}

		err = 0;
		offset = 0;
		frag = skb_shinfo(skb)->frag_list;
		skb_frag_list_init(skb);
		/* BUILD HEADER */

		*prevhdr = NEXTHDR_FRAGMENT;
		tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
		if (!tmp_hdr) {
			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_FRAGFAILS);
			return -ENOMEM;
		}

		__skb_pull(skb, hlen);
		fh = (struct frag_hdr *)__skb_push(skb, sizeof(struct frag_hdr));
		__skb_push(skb, hlen);
		skb_reset_network_header(skb);
		memcpy(skb_network_header(skb), tmp_hdr, hlen);

		ipv6_select_ident(net, fh, rt);
		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		fh->frag_off = htons(IP6_MF);
		frag_id = fh->identification;

		first_len = skb_pagelen(skb);
		skb->data_len = first_len - skb_headlen(skb);
		skb->len = first_len;
		ipv6_hdr(skb)->payload_len = htons(first_len -
						   sizeof(struct ipv6hdr));

		dst_hold(&rt->dst);

		for (;;) {
			/* Prepare the header of the next frame
			 * before the previous one is sent. */
			if (frag) {
				frag->ip_summed = CHECKSUM_NONE;
				skb_reset_transport_header(frag);
				fh = (struct frag_hdr *)__skb_push(frag, sizeof(struct frag_hdr));
				__skb_push(frag, hlen);
				skb_reset_network_header(frag);
				memcpy(skb_network_header(frag), tmp_hdr,
				       hlen);
				offset += skb->len - hlen - sizeof(struct frag_hdr);
				fh->nexthdr = nexthdr;
				fh->reserved = 0;
				fh->frag_off = htons(offset);
				if (frag->next != NULL)
					fh->frag_off |= htons(IP6_MF);
				fh->identification = frag_id;
				ipv6_hdr(frag)->payload_len =
						htons(frag->len -
						      sizeof(struct ipv6hdr));
				ip6_copy_metadata(frag, skb);
			}

			err = output(skb);
			if (!err)
				IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
					      IPSTATS_MIB_FRAGCREATES);

			if (err || !frag)
				break;

			skb = frag;
			frag = skb->next;
			skb->next = NULL;
		}

		kfree(tmp_hdr);

		if (err == 0) {
			IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
				      IPSTATS_MIB_FRAGOKS);
			ip6_rt_put(rt);
			return 0;
		}

		kfree_skb_list(frag);

		IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
			      IPSTATS_MIB_FRAGFAILS);
		ip6_rt_put(rt);
		return err;

slow_path_clean:
		skb_walk_frags(skb, frag2) {
			if (frag2 == frag)
				break;
			frag2->sk = NULL;
			frag2->destructor = NULL;
			skb->truesize += frag2->truesize;
		}
	}

slow_path:
	if ((skb->ip_summed == CHECKSUM_PARTIAL) &&
	    skb_checksum_help(skb))
		goto fail;

	left = skb->len - hlen;		/* Space per frame */
	ptr = hlen;			/* Where to start from */

	/*
	 *	Fragment the datagram.
	 */

	*prevhdr = NEXTHDR_FRAGMENT;
	hroom = LL_RESERVED_SPACE(rt->dst.dev);
	troom = rt->dst.dev->needed_tailroom;

	/*
	 *	Keep copying data until we run out.
	 */
	while (left > 0)	{
		len = left;
		/* IF: it doesn't fit, use 'mtu' - the data space left */
		if (len > mtu)
			len = mtu;
		/* IF: we are not sending up to and including the packet end,
		   then align the next start on an eight-byte boundary */
		if (len < left)	{
			len &= ~7;
		}
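		/* Illustrative arithmetic (assumed numbers, not from this
		 * file): with mtu = 1452 and left = 2900, the first pass
		 * yields len = 1452 & ~7 = 1448; on the second pass
		 * left = 1452 = len, so the final fragment carries the
		 * remaining 1452 bytes (unaligned is fine for the last one)
		 * with IP6_MF clear.
		 */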

		/* Allocate buffer */
		frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
				 hroom + troom, GFP_ATOMIC);
		if (!frag) {
			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_FRAGFAILS);
			err = -ENOMEM;
			goto fail;
		}

		/*
		 *	Set up data on packet
		 */

		ip6_copy_metadata(frag, skb);
		skb_reserve(frag, hroom);
		skb_put(frag, len + hlen + sizeof(struct frag_hdr));
		skb_reset_network_header(frag);
		fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
		frag->transport_header = (frag->network_header + hlen +
					  sizeof(struct frag_hdr));

		/*
		 *	Charge the memory for the fragment to any owner
		 *	it might possess
		 */
		if (skb->sk)
			skb_set_owner_w(frag, skb->sk);

		/*
		 *	Copy the packet header into the new buffer.
		 */
		skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);

		/*
		 *	Build fragment header.
		 */
		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		if (!frag_id) {
			ipv6_select_ident(net, fh, rt);
			frag_id = fh->identification;
		} else
			fh->identification = frag_id;

		/*
		 *	Copy a block of the IP datagram.
		 */
		BUG_ON(skb_copy_bits(skb, ptr, skb_transport_header(frag),
				     len));
		left -= len;

		fh->frag_off = htons(offset);
		if (left > 0)
			fh->frag_off |= htons(IP6_MF);
		ipv6_hdr(frag)->payload_len = htons(frag->len -
						    sizeof(struct ipv6hdr));

		ptr += len;
		offset += len;

		/*
		 *	Put this fragment into the sending queue.
		 */
		err = output(frag);
		if (err)
			goto fail;

		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_FRAGCREATES);
	}
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGOKS);
	consume_skb(skb);
	return err;

fail:
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return err;
}

static inline int ip6_rt_check(const struct rt6key *rt_key,
			       const struct in6_addr *fl_addr,
			       const struct in6_addr *addr_cache)
{
	return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
		(!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
}
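
/* Reading ip6_rt_check() above: a nonzero return means the cached route is
 * stale for this flow. A /128 host route is valid only while the flow's
 * destination still equals rt_key->addr; for a network route, validity
 * falls back to the socket's cached last-used address (addr_cache). A NULL
 * addr_cache (not-connected socket) therefore always reads as stale here.
 */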

static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
					  struct dst_entry *dst,
					  const struct flowi6 *fl6)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct rt6_info *rt;

	if (!dst)
		goto out;

	if (dst->ops->family != AF_INET6) {
		dst_release(dst);
		return NULL;
	}

	rt = (struct rt6_info *)dst;
	/* Yes, checking route validity in the not-connected
	 * case is not very simple. Take into account
	 * that we do not support routing by source, TOS,
	 * and MSG_DONTROUTE		--ANK (980726)
	 *
	 * 1. ip6_rt_check(): If the route was a host route,
	 *    check that the cached destination is current.
	 *    If it is a network route, we still may
	 *    check its validity using the saved pointer
	 *    to the last used address: daddr_cache.
	 *    We do not want to save the whole address now
	 *    (because the main consumer of this service
	 *    is TCP, which does not have this problem),
	 *    so this last trick works only on connected
	 *    sockets.
	 * 2. oif also should be the same.
	 */
	if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
#ifdef CONFIG_IPV6_SUBTREES
	    ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
#endif
	    (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) {
		dst_release(dst);
		dst = NULL;
	}

out:
	return dst;
}

static int ip6_dst_lookup_tail(struct sock *sk,
			       struct dst_entry **dst, struct flowi6 *fl6)
{
	struct net *net = sock_net(sk);
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	struct neighbour *n;
	struct rt6_info *rt;
#endif
	int err;

	if (!*dst)
		*dst = ip6_route_output(net, sk, fl6);

	err = (*dst)->error;
	if (err)
		goto out_err_release;

	if (ipv6_addr_any(&fl6->saddr)) {
		struct rt6_info *rt = (struct rt6_info *) *dst;
		err = ip6_route_get_saddr(net, rt, &fl6->daddr,
					  sk ? inet6_sk(sk)->srcprefs : 0,
					  &fl6->saddr);
		if (err)
			goto out_err_release;
	}

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	/*
	 * If the dst entry we've looked up has a neighbour
	 * entry that is in the INCOMPLETE state and the src
	 * address from the flow is marked as OPTIMISTIC, we
	 * release the found dst entry and replace it with the
	 * dst entry of the nexthop router instead.
	 */
	rt = (struct rt6_info *) *dst;
	rcu_read_lock_bh();
	n = __ipv6_neigh_lookup_noref(rt->dst.dev, rt6_nexthop(rt));
	err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
	rcu_read_unlock_bh();

	if (err) {
		struct inet6_ifaddr *ifp;
		struct flowi6 fl_gw6;
		int redirect;

		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
				      (*dst)->dev, 1);

		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
		if (ifp)
			in6_ifa_put(ifp);

		if (redirect) {
			/*
			 * We need to get the dst entry for the
			 * default router instead
			 */
			dst_release(*dst);
			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
			*dst = ip6_route_output(net, sk, &fl_gw6);
			err = (*dst)->error;
			if (err)
				goto out_err_release;
		}
	}
#endif

	return 0;

out_err_release:
	if (err == -ENETUNREACH)
		IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
	dst_release(*dst);
	*dst = NULL;
	return err;
}

/**
 *	ip6_dst_lookup - perform route lookup on flow
 *	@sk: socket which provides route info
 *	@dst: pointer to dst_entry * for result
 *	@fl6: flow to lookup
 *
 *	This function performs a route lookup on the given flow.
 *
 *	It returns zero on success, or a standard errno code on error.
 */
int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi6 *fl6)
{
	*dst = NULL;
	return ip6_dst_lookup_tail(sk, dst, fl6);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup);

/**
 *	ip6_dst_lookup_flow - perform route lookup on flow with ipsec
 *	@sk: socket which provides route info
 *	@fl6: flow to lookup
 *	@final_dst: final destination address for ipsec lookup
 *
 *	This function performs a route lookup on the given flow.
 *
 *	It returns a valid dst pointer on success, or a pointer encoded
 *	error code.
 */
struct dst_entry *ip6_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
				      const struct in6_addr *final_dst)
{
	struct dst_entry *dst = NULL;
	int err;

	err = ip6_dst_lookup_tail(sk, &dst, fl6);
	if (err)
		return ERR_PTR(err);
	if (final_dst)
		fl6->daddr = *final_dst;

	return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);

/**
 *	ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
 *	@sk: socket which provides the dst cache and route info
 *	@fl6: flow to lookup
 *	@final_dst: final destination address for ipsec lookup
 *
 *	This function performs a route lookup on the given flow with the
 *	possibility of using the cached route in the socket if it is valid.
 *	It will take the socket dst lock when operating on the dst cache.
 *	As a result, this function can only be used in process context.
 *
 *	It returns a valid dst pointer on success, or a pointer encoded
 *	error code.
 */
struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
					 const struct in6_addr *final_dst)
{
	struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
	int err;

	dst = ip6_sk_dst_check(sk, dst, fl6);

	err = ip6_dst_lookup_tail(sk, &dst, fl6);
	if (err)
		return ERR_PTR(err);
	if (final_dst)
		fl6->daddr = *final_dst;

	return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
}
EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);

static inline int ip6_ufo_append_data(struct sock *sk,
			struct sk_buff_head *queue,
			int getfrag(void *from, char *to, int offset, int len,
			int odd, struct sk_buff *skb),
			void *from, int length, int hh_len, int fragheaderlen,
			int transhdrlen, int mtu, unsigned int flags,
			struct rt6_info *rt)

{
	struct sk_buff *skb;
	struct frag_hdr fhdr;
	int err;

	/* There is support for UDP large send offload by the network
	 * device, so create one single skb packet containing the complete
	 * udp datagram
	 */
	skb = skb_peek_tail(queue);
	if (!skb) {
		skb = sock_alloc_send_skb(sk,
			hh_len + fragheaderlen + transhdrlen + 20,
			(flags & MSG_DONTWAIT), &err);
		if (!skb)
			return err;

		/* reserve space for the hardware header */
		skb_reserve(skb, hh_len);

		/* create space for the UDP/IP header */
		skb_put(skb, fragheaderlen + transhdrlen);

		/* initialize network header pointer */
		skb_reset_network_header(skb);

		/* initialize protocol header pointer */
		skb->transport_header = skb->network_header + fragheaderlen;

		skb->protocol = htons(ETH_P_IPV6);
		skb->csum = 0;

		__skb_queue_tail(queue, skb);
	} else if (skb_is_gso(skb)) {
		goto append;
	}

	skb->ip_summed = CHECKSUM_PARTIAL;
	/* Specify the length of each IPv6 datagram fragment.
	 * It has to be a multiple of 8.
	 */
	skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
				     sizeof(struct frag_hdr)) & ~7;
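	/* Illustrative arithmetic (assumed numbers, not from this file):
	 * with a 1500-byte MTU and a bare 40-byte IPv6 header
	 * (fragheaderlen = 40), gso_size = (1500 - 40 - 8) & ~7 = 1448,
	 * keeping each offloaded fragment payload a multiple of 8 as
	 * required.
	 */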
	skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
	ipv6_select_ident(sock_net(sk), &fhdr, rt);
	skb_shinfo(skb)->ip6_frag_id = fhdr.identification;

append:
	return skb_append_datato_frags(sk, skb, getfrag, from,
				       (length - transhdrlen));
}

static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
					       gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
						gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

static void ip6_append_data_mtu(unsigned int *mtu,
				int *maxfraglen,
				unsigned int fragheaderlen,
				struct sk_buff *skb,
				struct rt6_info *rt,
				unsigned int orig_mtu)
{
	if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
		if (!skb) {
			/* first fragment, reserve header_len */
			*mtu = orig_mtu - rt->dst.header_len;

		} else {
			/*
			 * this fragment is not the first; the header
			 * space is regarded as data space.
			 */
			*mtu = orig_mtu;
		}
		*maxfraglen = ((*mtu - fragheaderlen) & ~7)
			      + fragheaderlen - sizeof(struct frag_hdr);
	}
}

static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
			  struct inet6_cork *v6_cork,
			  int hlimit, int tclass, struct ipv6_txoptions *opt,
			  struct rt6_info *rt, struct flowi6 *fl6)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	unsigned int mtu;

	/*
	 * setup for corking
	 */
	if (opt) {
		if (WARN_ON(v6_cork->opt))
			return -EINVAL;

		v6_cork->opt = kzalloc(opt->tot_len, sk->sk_allocation);
		if (unlikely(!v6_cork->opt))
			return -ENOBUFS;

		v6_cork->opt->tot_len = opt->tot_len;
		v6_cork->opt->opt_flen = opt->opt_flen;
		v6_cork->opt->opt_nflen = opt->opt_nflen;

		v6_cork->opt->dst0opt = ip6_opt_dup(opt->dst0opt,
						    sk->sk_allocation);
		if (opt->dst0opt && !v6_cork->opt->dst0opt)
			return -ENOBUFS;

		v6_cork->opt->dst1opt = ip6_opt_dup(opt->dst1opt,
						    sk->sk_allocation);
		if (opt->dst1opt && !v6_cork->opt->dst1opt)
			return -ENOBUFS;

		v6_cork->opt->hopopt = ip6_opt_dup(opt->hopopt,
						   sk->sk_allocation);
		if (opt->hopopt && !v6_cork->opt->hopopt)
			return -ENOBUFS;

		v6_cork->opt->srcrt = ip6_rthdr_dup(opt->srcrt,
						    sk->sk_allocation);
		if (opt->srcrt && !v6_cork->opt->srcrt)
			return -ENOBUFS;

		/* need source address above --miyazawa */
	}
	dst_hold(&rt->dst);
	cork->base.dst = &rt->dst;
	cork->fl.u.ip6 = *fl6;
	v6_cork->hop_limit = hlimit;
	v6_cork->tclass = tclass;
	if (rt->dst.flags & DST_XFRM_TUNNEL)
		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
		      rt->dst.dev->mtu : dst_mtu(&rt->dst);
	else
		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
		      rt->dst.dev->mtu : dst_mtu(rt->dst.path);
	if (np->frag_size < mtu) {
		if (np->frag_size)
			mtu = np->frag_size;
	}
	cork->base.fragsize = mtu;
	if (dst_allfrag(rt->dst.path))
		cork->base.flags |= IPCORK_ALLFRAG;
	cork->base.length = 0;

	return 0;
}

static int __ip6_append_data(struct sock *sk,
			     struct flowi6 *fl6,
			     struct sk_buff_head *queue,
			     struct inet_cork *cork,
			     struct inet6_cork *v6_cork,
			     struct page_frag *pfrag,
			     int getfrag(void *from, char *to, int offset,
					 int len, int odd, struct sk_buff *skb),
			     void *from, int length, int transhdrlen,
			     unsigned int flags, int dontfrag)
{
	struct sk_buff *skb, *skb_prev = NULL;
	unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu;
	int exthdrlen = 0;
	int dst_exthdrlen = 0;
	int hh_len;
	int copy;
	int err;
	int offset = 0;
	__u8 tx_flags = 0;
	u32 tskey = 0;
	struct rt6_info *rt = (struct rt6_info *)cork->dst;
	struct ipv6_txoptions *opt = v6_cork->opt;
	int csummode = CHECKSUM_NONE;

	skb = skb_peek_tail(queue);
	if (!skb) {
		exthdrlen = opt ? opt->opt_flen : 0;
		dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
	}

	mtu = cork->fragsize;
	orig_mtu = mtu;

	hh_len = LL_RESERVED_SPACE(rt->dst.dev);

	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
			(opt ? opt->opt_nflen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
		     sizeof(struct frag_hdr);
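	/* Illustrative arithmetic (assumed numbers, not from this file):
	 * mtu = 1500 and fragheaderlen = 40 give
	 * maxfraglen = ((1500 - 40) & ~7) + 40 - 8 = 1488, i.e. the largest
	 * skb->len a non-final fragment may reach once room for the
	 * fragment header is accounted for.
	 */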

	if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
		unsigned int maxnonfragsize, headersize;

		headersize = sizeof(struct ipv6hdr) +
			     (opt ? opt->opt_flen + opt->opt_nflen : 0) +
			     (dst_allfrag(&rt->dst) ?
			      sizeof(struct frag_hdr) : 0) +
			     rt->rt6i_nfheader_len;

		if (ip6_sk_ignore_df(sk))
			maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
		else
			maxnonfragsize = mtu;

		/* dontfrag active */
		if ((cork->length + length > mtu - headersize) && dontfrag &&
		    (sk->sk_protocol == IPPROTO_UDP ||
		     sk->sk_protocol == IPPROTO_RAW)) {
			ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
						   sizeof(struct ipv6hdr));
			goto emsgsize;
		}

		if (cork->length + length > maxnonfragsize - headersize) {
emsgsize:
			ipv6_local_error(sk, EMSGSIZE, fl6,
					 mtu - headersize +
					 sizeof(struct ipv6hdr));
			return -EMSGSIZE;
		}
	}

	if (sk->sk_type == SOCK_DGRAM || sk->sk_type == SOCK_RAW) {
		sock_tx_timestamp(sk, &tx_flags);
		if (tx_flags & SKBTX_ANY_SW_TSTAMP &&
		    sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
			tskey = sk->sk_tskey++;
	}

	/* If this is the first and only packet and the device
	 * supports checksum offloading, let's use it.
	 */
	if (!skb && sk->sk_protocol == IPPROTO_UDP &&
	    length + fragheaderlen < mtu &&
	    rt->dst.dev->features & NETIF_F_V6_CSUM &&
	    !exthdrlen)
		csummode = CHECKSUM_PARTIAL;
	/*
	 * Let's try using as much space as possible.
	 * Use MTU if total length of the message fits into the MTU.
	 * Otherwise, we need to reserve fragment header and
	 * fragment alignment (= 8-15 octets, in total).
	 *
	 * Note that we may need to "move" the data from the tail
	 * of the buffer to the new fragment when we split
	 * the message.
	 *
	 * FIXME: It may be fragmented into multiple chunks
	 *        at once if non-fragmentable extension headers
	 *        are too large.
	 * --yoshfuji
	 */

	cork->length += length;
	if (((length > mtu) ||
	     (skb && skb_is_gso(skb))) &&
	    (sk->sk_protocol == IPPROTO_UDP) &&
	    (rt->dst.dev->features & NETIF_F_UFO) &&
	    (sk->sk_type == SOCK_DGRAM)) {
		err = ip6_ufo_append_data(sk, queue, getfrag, from, length,
					  hh_len, fragheaderlen,
					  transhdrlen, mtu, flags, rt);
		if (err)
			goto error;
		return 0;
	}

	if (!skb)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into the current packet. */
		copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;

		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen;
alloc_new_skb:
			/* There's no room in the current skb */
			if (skb)
				fraggap = skb->len - maxfraglen;
			else
				fraggap = 0;
			/* update mtu and maxfraglen if necessary */
			if (!skb || !skb_prev)
				ip6_append_data_mtu(&mtu, &maxfraglen,
						    fragheaderlen, skb, rt,
						    orig_mtu);

			skb_prev = skb;

			/*
			 * If the remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;

			if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
				datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
			if ((flags & MSG_MORE) &&
			    !(rt->dst.dev->features&NETIF_F_SG))
				alloclen = mtu;
			else
				alloclen = datalen + fragheaderlen;

			alloclen += dst_exthdrlen;

			if (datalen != length + fraggap) {
				/*
				 * this is not the last fragment; the trailer
				 * space is regarded as data space.
				 */
				datalen += rt->dst.trailer_len;
			}

			alloclen += rt->dst.trailer_len;
			fraglen = datalen + fragheaderlen;

			/*
			 * We just reserve space for the fragment header.
			 * Note: this may be an overallocation if the message
			 * (without MSG_MORE) fits into the MTU.
			 */
			alloclen += sizeof(struct frag_hdr);

			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk,
						alloclen + hh_len,
						(flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				if (atomic_read(&sk->sk_wmem_alloc) <=
				    2 * sk->sk_sndbuf)
					skb = sock_wmalloc(sk,
							   alloclen + hh_len, 1,
							   sk->sk_allocation);
				if (unlikely(!skb))
					err = -ENOBUFS;
			}
			if (!skb)
				goto error;
			/*
			 *	Fill in the control structures
			 */
			skb->protocol = htons(ETH_P_IPV6);
			skb->ip_summed = csummode;
			skb->csum = 0;
			/* reserve for fragmentation and ipsec header */
			skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
				    dst_exthdrlen);

			/* Only the initial fragment is time stamped */
			skb_shinfo(skb)->tx_flags = tx_flags;
			tx_flags = 0;
			skb_shinfo(skb)->tskey = tskey;
			tskey = 0;

			/*
			 *	Find where to start putting bytes
			 */
			data = skb_put(skb, fraglen);
			skb_set_network_header(skb, exthdrlen);
			data += fragheaderlen;
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}
			copy = datalen - transhdrlen - fraggap;

			if (copy < 0) {
				err = -EINVAL;
				kfree_skb(skb);
				goto error;
			} else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= datalen - fraggap;
			transhdrlen = 0;
			exthdrlen = 0;
			dst_exthdrlen = 0;

			/*
			 * Put the packet on the pending queue
			 */
			__skb_queue_tail(queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		if (!(rt->dst.dev->features&NETIF_F_SG)) {
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
						offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else {
			int i = skb_shinfo(skb)->nr_frags;

			err = -ENOMEM;
			if (!sk_page_frag_refill(sk, pfrag))
				goto error;

			if (!skb_can_coalesce(skb, i, pfrag->page,
					      pfrag->offset)) {
				err = -EMSGSIZE;
				if (i == MAX_SKB_FRAGS)
					goto error;

				__skb_fill_page_desc(skb, i, pfrag->page,
						     pfrag->offset, 0);
				skb_shinfo(skb)->nr_frags = ++i;
				get_page(pfrag->page);
			}
			copy = min_t(int, copy, pfrag->size - pfrag->offset);
			if (getfrag(from,
				    page_address(pfrag->page) + pfrag->offset,
				    offset, copy, skb->len, skb) < 0)
				goto error_efault;

			pfrag->offset += copy;
			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			atomic_add(copy, &sk->sk_wmem_alloc);
		}
		offset += copy;
		length -= copy;
	}

	return 0;

error_efault:
	err = -EFAULT;
error:
	cork->length -= length;
	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	return err;
}

int ip6_append_data(struct sock *sk,
		    int getfrag(void *from, char *to, int offset, int len,
				int odd, struct sk_buff *skb),
		    void *from, int length, int transhdrlen, int hlimit,
		    int tclass, struct ipv6_txoptions *opt, struct flowi6 *fl6,
		    struct rt6_info *rt, unsigned int flags, int dontfrag)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	int exthdrlen;
	int err;

	if (flags&MSG_PROBE)
		return 0;
	if (skb_queue_empty(&sk->sk_write_queue)) {
		/*
		 * setup for corking
		 */
		err = ip6_setup_cork(sk, &inet->cork, &np->cork, hlimit,
				     tclass, opt, rt, fl6);
		if (err)
			return err;

		exthdrlen = (opt ? opt->opt_flen : 0);
		length += exthdrlen;
		transhdrlen += exthdrlen;
	} else {
		fl6 = &inet->cork.fl.u.ip6;
		transhdrlen = 0;
	}

	return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base,
				 &np->cork, sk_page_frag(sk), getfrag,
				 from, length, transhdrlen, flags, dontfrag);
}
EXPORT_SYMBOL_GPL(ip6_append_data);

static void ip6_cork_release(struct inet_cork_full *cork,
			     struct inet6_cork *v6_cork)
{
	if (v6_cork->opt) {
		kfree(v6_cork->opt->dst0opt);
		kfree(v6_cork->opt->dst1opt);
		kfree(v6_cork->opt->hopopt);
		kfree(v6_cork->opt->srcrt);
		kfree(v6_cork->opt);
		v6_cork->opt = NULL;
	}

	if (cork->base.dst) {
		dst_release(cork->base.dst);
		cork->base.dst = NULL;
		cork->base.flags &= ~IPCORK_ALLFRAG;
	}
	memset(&cork->fl, 0, sizeof(cork->fl));
}

struct sk_buff *__ip6_make_skb(struct sock *sk,
			       struct sk_buff_head *queue,
			       struct inet_cork_full *cork,
			       struct inet6_cork *v6_cork)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct net *net = sock_net(sk);
	struct ipv6hdr *hdr;
	struct ipv6_txoptions *opt = v6_cork->opt;
	struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
	struct flowi6 *fl6 = &cork->fl.u.ip6;
	unsigned char proto = fl6->flowi6_proto;

	skb = __skb_dequeue(queue);
	if (!skb)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Allow local fragmentation. */
	skb->ignore_df = ip6_sk_ignore_df(sk);

	*final_dst = fl6->daddr;
	__skb_pull(skb, skb_network_header_len(skb));
	if (opt && opt->opt_flen)
		ipv6_push_frag_opts(skb, opt, &proto);
	if (opt && opt->opt_nflen)
		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	ip6_flow_hdr(hdr, v6_cork->tclass,
		     ip6_make_flowlabel(net, skb, fl6->flowlabel,
					np->autoflowlabel));
	hdr->hop_limit = v6_cork->hop_limit;
	hdr->nexthdr = proto;
	hdr->saddr = fl6->saddr;
	hdr->daddr = *final_dst;

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	skb_dst_set(skb, dst_clone(&rt->dst));
	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
	if (proto == IPPROTO_ICMPV6) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
		ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
	}

	ip6_cork_release(cork, v6_cork);
out:
	return skb;
}

int ip6_send_skb(struct sk_buff *skb)
{
	struct net *net = sock_net(skb->sk);
	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
	int err;

	err = ip6_local_out(skb);
	if (err) {
		if (err > 0)
			err = net_xmit_errno(err);
		if (err)
			IP6_INC_STATS(net, rt->rt6i_idev,
				      IPSTATS_MIB_OUTDISCARDS);
	}

	return err;
}

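/* Typical corked-send sequence driving this API (an illustrative sketch of
 * how datagram protocols use it, not code from this file):
 *
 *	err = ip6_append_data(sk, getfrag, msg, len, transhdrlen, hlimit,
 *			      tclass, opt, fl6, rt, flags, dontfrag);
 *	if (err)
 *		ip6_flush_pending_frames(sk);
 *	else if (!corked)
 *		err = ip6_push_pending_frames(sk);
 */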
int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb;

	skb = ip6_finish_skb(sk);
	if (!skb)
		return 0;

	return ip6_send_skb(skb);
}
EXPORT_SYMBOL_GPL(ip6_push_pending_frames);

static void __ip6_flush_pending_frames(struct sock *sk,
				       struct sk_buff_head *queue,
				       struct inet_cork_full *cork,
				       struct inet6_cork *v6_cork)
{
	struct sk_buff *skb;

	while ((skb = __skb_dequeue_tail(queue)) != NULL) {
		if (skb_dst(skb))
			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
	}

	ip6_cork_release(cork, v6_cork);
}

void ip6_flush_pending_frames(struct sock *sk)
{
	__ip6_flush_pending_frames(sk, &sk->sk_write_queue,
				   &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
}
EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);

struct sk_buff *ip6_make_skb(struct sock *sk,
			     int getfrag(void *from, char *to, int offset,
					 int len, int odd, struct sk_buff *skb),
			     void *from, int length, int transhdrlen,
			     int hlimit, int tclass,
			     struct ipv6_txoptions *opt, struct flowi6 *fl6,
			     struct rt6_info *rt, unsigned int flags,
			     int dontfrag)
{
	struct inet_cork_full cork;
	struct inet6_cork v6_cork;
	struct sk_buff_head queue;
	int exthdrlen = (opt ? opt->opt_flen : 0);
	int err;

	if (flags & MSG_PROBE)
		return NULL;

	__skb_queue_head_init(&queue);

	cork.base.flags = 0;
	cork.base.addr = 0;
	cork.base.opt = NULL;
	v6_cork.opt = NULL;
	err = ip6_setup_cork(sk, &cork, &v6_cork, hlimit, tclass, opt, rt, fl6);
	if (err)
		return ERR_PTR(err);

	if (dontfrag < 0)
		dontfrag = inet6_sk(sk)->dontfrag;

	err = __ip6_append_data(sk, fl6, &queue, &cork.base, &v6_cork,
				&current->task_frag, getfrag, from,
				length + exthdrlen, transhdrlen + exthdrlen,
				flags, dontfrag);
	if (err) {
		__ip6_flush_pending_frames(sk, &queue, &cork, &v6_cork);
		return ERR_PTR(err);
	}

	return __ip6_make_skb(sk, &queue, &cork, &v6_cork);
}