xref: /openbmc/linux/net/ipv6/ip6_output.c (revision 9b9c2cd4)
1 /*
2  *	IPv6 output functions
3  *	Linux INET6 implementation
4  *
5  *	Authors:
6  *	Pedro Roque		<roque@di.fc.ul.pt>
7  *
8  *	Based on linux/net/ipv4/ip_output.c
9  *
10  *	This program is free software; you can redistribute it and/or
11  *      modify it under the terms of the GNU General Public License
12  *      as published by the Free Software Foundation; either version
13  *      2 of the License, or (at your option) any later version.
14  *
15  *	Changes:
16  *	A.N.Kuznetsov	:	arithmetic in fragmentation.
17  *				extension headers are implemented.
18  *				route changes now work.
19  *				ip6_forward does not confuse sniffers.
20  *				etc.
21  *
22  *      H. von Brand    :       Added missing #include <linux/string.h>
23  *	Imran Patel	:	frag id should be in NBO (network byte order)
24  *      Kazunori MIYAZAWA @USAGI
25  *			:       add ip6_append_data and related functions
26  *				for datagram xmit
27  */
28 
29 #include <linux/errno.h>
30 #include <linux/kernel.h>
31 #include <linux/string.h>
32 #include <linux/socket.h>
33 #include <linux/net.h>
34 #include <linux/netdevice.h>
35 #include <linux/if_arp.h>
36 #include <linux/in6.h>
37 #include <linux/tcp.h>
38 #include <linux/route.h>
39 #include <linux/module.h>
40 #include <linux/slab.h>
41 
42 #include <linux/netfilter.h>
43 #include <linux/netfilter_ipv6.h>
44 
45 #include <net/sock.h>
46 #include <net/snmp.h>
47 
48 #include <net/ipv6.h>
49 #include <net/ndisc.h>
50 #include <net/protocol.h>
51 #include <net/ip6_route.h>
52 #include <net/addrconf.h>
53 #include <net/rawv6.h>
54 #include <net/icmp.h>
55 #include <net/xfrm.h>
56 #include <net/checksum.h>
57 #include <linux/mroute6.h>
58 #include <net/l3mdev.h>
59 
60 static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
61 {
62 	struct dst_entry *dst = skb_dst(skb);
63 	struct net_device *dev = dst->dev;
64 	struct neighbour *neigh;
65 	struct in6_addr *nexthop;
66 	int ret;
67 
68 	skb->protocol = htons(ETH_P_IPV6);
69 	skb->dev = dev;
70 
71 	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
72 		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
73 
74 		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
75 		    ((mroute6_socket(net, skb) &&
76 		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
77 		     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
78 					 &ipv6_hdr(skb)->saddr))) {
79 			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
80 
81 			/* Do not check for IFF_ALLMULTI; multicast routing
82 			   is not supported in any case.
83 			 */
84 			if (newskb)
85 				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
86 					net, sk, newskb, NULL, newskb->dev,
87 					dev_loopback_xmit);
88 
89 			if (ipv6_hdr(skb)->hop_limit == 0) {
90 				IP6_INC_STATS(net, idev,
91 					      IPSTATS_MIB_OUTDISCARDS);
92 				kfree_skb(skb);
93 				return 0;
94 			}
95 		}
96 
97 		IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);
98 
99 		if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
100 		    IPV6_ADDR_SCOPE_NODELOCAL &&
101 		    !(dev->flags & IFF_LOOPBACK)) {
102 			kfree_skb(skb);
103 			return 0;
104 		}
105 	}
106 
107 	rcu_read_lock_bh();
108 	nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
109 	neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
110 	if (unlikely(!neigh))
111 		neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
112 	if (!IS_ERR(neigh)) {
113 		ret = dst_neigh_output(dst, neigh, skb);
114 		rcu_read_unlock_bh();
115 		return ret;
116 	}
117 	rcu_read_unlock_bh();
118 
119 	IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
120 	kfree_skb(skb);
121 	return -EINVAL;
122 }
123 
124 static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
125 {
126 	if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
127 	    dst_allfrag(skb_dst(skb)) ||
128 	    (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
129 		return ip6_fragment(net, sk, skb, ip6_finish_output2);
130 	else
131 		return ip6_finish_output2(net, sk, skb);
132 }
133 
134 int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
135 {
136 	struct net_device *dev = skb_dst(skb)->dev;
137 	struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
138 
139 	if (unlikely(idev->cnf.disable_ipv6)) {
140 		IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
141 		kfree_skb(skb);
142 		return 0;
143 	}
144 
145 	return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
146 			    net, sk, skb, NULL, dev,
147 			    ip6_finish_output,
148 			    !(IP6CB(skb)->flags & IP6SKB_REROUTED));
149 }
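
/*
 * Editor's note: a minimal sketch (not part of the original file) of
 * how a caller typically reaches ip6_output().  It is installed as
 * dst->output for IPv6 routes, so it is normally entered indirectly
 * via dst_output() after the packet has traversed NF_INET_LOCAL_OUT.
 * The helper name below is hypothetical.
 */
static int example_local_out(struct net *net, struct sock *sk,
			     struct sk_buff *skb)
{
	/* skb must already carry a dst and a complete IPv6 header */
	return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
		       net, sk, skb, NULL, skb_dst(skb)->dev,
		       dst_output);	/* dst_output() -> ip6_output() */
}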
150 
151 /*
152  * xmit an sk_buff (used by TCP, SCTP and DCCP)
153  * Note: the socket lock is not held for SYNACK packets, but the socket
154  * itself might be modified by calls to skb_set_owner_w() and
155  * ipv6_local_error(), which use proper atomic operations or spinlocks.
156  */
157 int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
158 	     struct ipv6_txoptions *opt, int tclass)
159 {
160 	struct net *net = sock_net(sk);
161 	const struct ipv6_pinfo *np = inet6_sk(sk);
162 	struct in6_addr *first_hop = &fl6->daddr;
163 	struct dst_entry *dst = skb_dst(skb);
164 	struct ipv6hdr *hdr;
165 	u8  proto = fl6->flowi6_proto;
166 	int seg_len = skb->len;
167 	int hlimit = -1;
168 	u32 mtu;
169 
170 	if (opt) {
171 		unsigned int head_room;
172 
173 		/* First: exthdrs may take lots of space (~8K for now);
174 		   MAX_HEADER is not enough.
175 		 */
176 		head_room = opt->opt_nflen + opt->opt_flen;
177 		seg_len += head_room;
178 		head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
179 
180 		if (skb_headroom(skb) < head_room) {
181 			struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
182 			if (!skb2) {
183 				IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
184 					      IPSTATS_MIB_OUTDISCARDS);
185 				kfree_skb(skb);
186 				return -ENOBUFS;
187 			}
188 			consume_skb(skb);
189 			skb = skb2;
190 			/* skb_set_owner_w() changes sk->sk_wmem_alloc atomically,
191 			 * it is safe to call in our context (socket lock not held)
192 			 */
193 			skb_set_owner_w(skb, (struct sock *)sk);
194 		}
195 		if (opt->opt_flen)
196 			ipv6_push_frag_opts(skb, opt, &proto);
197 		if (opt->opt_nflen)
198 			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
199 	}
200 
201 	skb_push(skb, sizeof(struct ipv6hdr));
202 	skb_reset_network_header(skb);
203 	hdr = ipv6_hdr(skb);
204 
205 	/*
206 	 *	Fill in the IPv6 header
207 	 */
208 	if (np)
209 		hlimit = np->hop_limit;
210 	if (hlimit < 0)
211 		hlimit = ip6_dst_hoplimit(dst);
212 
213 	ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
214 						     np->autoflowlabel, fl6));
215 
216 	hdr->payload_len = htons(seg_len);
217 	hdr->nexthdr = proto;
218 	hdr->hop_limit = hlimit;
219 
220 	hdr->saddr = fl6->saddr;
221 	hdr->daddr = *first_hop;
222 
223 	skb->protocol = htons(ETH_P_IPV6);
224 	skb->priority = sk->sk_priority;
225 	skb->mark = sk->sk_mark;
226 
227 	mtu = dst_mtu(dst);
228 	if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
229 		IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
230 			      IPSTATS_MIB_OUT, skb->len);
231 		/* hooks should never assume socket lock is held.
232 		 * we promote our socket to non const
233 		 */
234 		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
235 			       net, (struct sock *)sk, skb, NULL, dst->dev,
236 			       dst_output);
237 	}
238 
239 	skb->dev = dst->dev;
240 	/* ipv6_local_error() does not require socket lock,
241 	 * we promote our socket to non const
242 	 */
243 	ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);
244 
245 	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
246 	kfree_skb(skb);
247 	return -EMSGSIZE;
248 }
249 EXPORT_SYMBOL(ip6_xmit);
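
/*
 * Editor's note: a hedged usage sketch for ip6_xmit(), not part of the
 * original file.  A connection-oriented caller typically fills in a
 * flowi6 describing the flow, attaches its cached route to the skb and
 * hands the fully built segment to ip6_xmit().  The helper below is
 * hypothetical and passes no extension headers (opt == NULL).
 */
static int example_ip6_xmit(struct sock *sk, struct sk_buff *skb,
			    struct dst_entry *dst)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct flowi6 fl6;

	memset(&fl6, 0, sizeof(fl6));
	fl6.flowi6_proto = sk->sk_protocol;
	fl6.daddr = sk->sk_v6_daddr;
	fl6.saddr = np->saddr;
	fl6.flowi6_oif = sk->sk_bound_dev_if;
	fl6.flowi6_mark = sk->sk_mark;

	skb_dst_set(skb, dst_clone(dst));
	return ip6_xmit(sk, skb, &fl6, NULL, np->tclass);
}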
250 
251 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
252 {
253 	struct ip6_ra_chain *ra;
254 	struct sock *last = NULL;
255 
256 	read_lock(&ip6_ra_lock);
257 	for (ra = ip6_ra_chain; ra; ra = ra->next) {
258 		struct sock *sk = ra->sk;
259 		if (sk && ra->sel == sel &&
260 		    (!sk->sk_bound_dev_if ||
261 		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
262 			if (last) {
263 				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
264 				if (skb2)
265 					rawv6_rcv(last, skb2);
266 			}
267 			last = sk;
268 		}
269 	}
270 
271 	if (last) {
272 		rawv6_rcv(last, skb);
273 		read_unlock(&ip6_ra_lock);
274 		return 1;
275 	}
276 	read_unlock(&ip6_ra_lock);
277 	return 0;
278 }
279 
280 static int ip6_forward_proxy_check(struct sk_buff *skb)
281 {
282 	struct ipv6hdr *hdr = ipv6_hdr(skb);
283 	u8 nexthdr = hdr->nexthdr;
284 	__be16 frag_off;
285 	int offset;
286 
287 	if (ipv6_ext_hdr(nexthdr)) {
288 		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
289 		if (offset < 0)
290 			return 0;
291 	} else
292 		offset = sizeof(struct ipv6hdr);
293 
294 	if (nexthdr == IPPROTO_ICMPV6) {
295 		struct icmp6hdr *icmp6;
296 
297 		if (!pskb_may_pull(skb, (skb_network_header(skb) +
298 					 offset + 1 - skb->data)))
299 			return 0;
300 
301 		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
302 
303 		switch (icmp6->icmp6_type) {
304 		case NDISC_ROUTER_SOLICITATION:
305 		case NDISC_ROUTER_ADVERTISEMENT:
306 		case NDISC_NEIGHBOUR_SOLICITATION:
307 		case NDISC_NEIGHBOUR_ADVERTISEMENT:
308 		case NDISC_REDIRECT:
309 			/* Unicast neighbour discovery messages destined
310 			 * to the proxied address must be passed to the
311 			 * input function.
312 			 */
313 			return 1;
314 		default:
315 			break;
316 		}
317 	}
318 
319 	/*
320 	 * The proxying router can't forward traffic sent to a link-local
321 	 * address, so signal the sender and discard the packet. This
322 	 * behavior is clarified by the MIPv6 specification.
323 	 */
324 	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
325 		dst_link_failure(skb);
326 		return -1;
327 	}
328 
329 	return 0;
330 }
331 
332 static inline int ip6_forward_finish(struct net *net, struct sock *sk,
333 				     struct sk_buff *skb)
334 {
335 	skb_sender_cpu_clear(skb);
336 	return dst_output(net, sk, skb);
337 }
338 
339 static unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst)
340 {
341 	unsigned int mtu;
342 	struct inet6_dev *idev;
343 
344 	if (dst_metric_locked(dst, RTAX_MTU)) {
345 		mtu = dst_metric_raw(dst, RTAX_MTU);
346 		if (mtu)
347 			return mtu;
348 	}
349 
350 	mtu = IPV6_MIN_MTU;
351 	rcu_read_lock();
352 	idev = __in6_dev_get(dst->dev);
353 	if (idev)
354 		mtu = idev->cnf.mtu6;
355 	rcu_read_unlock();
356 
357 	return mtu;
358 }
359 
360 static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
361 {
362 	if (skb->len <= mtu)
363 		return false;
364 
365 	/* ipv6 conntrack defrag sets max_frag_size + ignore_df */
366 	if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
367 		return true;
368 
369 	if (skb->ignore_df)
370 		return false;
371 
372 	if (skb_is_gso(skb) && skb_gso_network_seglen(skb) <= mtu)
373 		return false;
374 
375 	return true;
376 }
377 
378 int ip6_forward(struct sk_buff *skb)
379 {
380 	struct dst_entry *dst = skb_dst(skb);
381 	struct ipv6hdr *hdr = ipv6_hdr(skb);
382 	struct inet6_skb_parm *opt = IP6CB(skb);
383 	struct net *net = dev_net(dst->dev);
384 	u32 mtu;
385 
386 	if (net->ipv6.devconf_all->forwarding == 0)
387 		goto error;
388 
389 	if (skb->pkt_type != PACKET_HOST)
390 		goto drop;
391 
392 	if (unlikely(skb->sk))
393 		goto drop;
394 
395 	if (skb_warn_if_lro(skb))
396 		goto drop;
397 
398 	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
399 		IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
400 				 IPSTATS_MIB_INDISCARDS);
401 		goto drop;
402 	}
403 
404 	skb_forward_csum(skb);
405 
406 	/*
407 	 *	We DO NOT do any processing on RA packets;
408 	 *	we push them to user level AS IS without any
409 	 *	warranty that the application will be able
410 	 *	to interpret them.  The reason is that we
411 	 *	cannot make anything clever here.
412 	 *
413 	 *	We are not an end node, so if the packet contains
414 	 *	AH/ESP we cannot do anything.
415 	 *	Defragmentation would also be a mistake; RA packets
416 	 *	cannot be fragmented, because there is no warranty
417 	 *	that different fragments will go along one path. --ANK
418 	 */
419 	if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
420 		if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
421 			return 0;
422 	}
423 
424 	/*
425 	 *	Check and decrement the hop limit.
426 	 */
427 	if (hdr->hop_limit <= 1) {
428 		/* Force the OUTPUT device to be used for source address selection */
429 		skb->dev = dst->dev;
430 		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
431 		IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
432 				 IPSTATS_MIB_INHDRERRORS);
433 
434 		kfree_skb(skb);
435 		return -ETIMEDOUT;
436 	}
437 
438 	/* XXX: idev->cnf.proxy_ndp? */
439 	if (net->ipv6.devconf_all->proxy_ndp &&
440 	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
441 		int proxied = ip6_forward_proxy_check(skb);
442 		if (proxied > 0)
443 			return ip6_input(skb);
444 		else if (proxied < 0) {
445 			IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
446 					 IPSTATS_MIB_INDISCARDS);
447 			goto drop;
448 		}
449 	}
450 
451 	if (!xfrm6_route_forward(skb)) {
452 		IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
453 				 IPSTATS_MIB_INDISCARDS);
454 		goto drop;
455 	}
456 	dst = skb_dst(skb);
457 
458 	/* The IPv6 specs say nothing about it, but it is clear that we cannot
459 	   send redirects to source-routed frames.
460 	   We don't send redirects to frames decapsulated from IPsec.
461 	 */
462 	if (skb->dev == dst->dev && opt->srcrt == 0 && !skb_sec_path(skb)) {
463 		struct in6_addr *target = NULL;
464 		struct inet_peer *peer;
465 		struct rt6_info *rt;
466 
467 		/*
468 		 *	incoming and outgoing devices are the same
469 		 *	send a redirect.
470 		 */
471 
472 		rt = (struct rt6_info *) dst;
473 		if (rt->rt6i_flags & RTF_GATEWAY)
474 			target = &rt->rt6i_gateway;
475 		else
476 			target = &hdr->daddr;
477 
478 		peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);
479 
480 		/* Limit redirects both by destination (here)
481 		   and by source (inside ndisc_send_redirect)
482 		 */
483 		if (inet_peer_xrlim_allow(peer, 1*HZ))
484 			ndisc_send_redirect(skb, target);
485 		if (peer)
486 			inet_putpeer(peer);
487 	} else {
488 		int addrtype = ipv6_addr_type(&hdr->saddr);
489 
490 		/* This check is security critical. */
491 		if (addrtype == IPV6_ADDR_ANY ||
492 		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
493 			goto error;
494 		if (addrtype & IPV6_ADDR_LINKLOCAL) {
495 			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
496 				    ICMPV6_NOT_NEIGHBOUR, 0);
497 			goto error;
498 		}
499 	}
500 
501 	mtu = ip6_dst_mtu_forward(dst);
502 	if (mtu < IPV6_MIN_MTU)
503 		mtu = IPV6_MIN_MTU;
504 
505 	if (ip6_pkt_too_big(skb, mtu)) {
506 		/* Again, force the OUTPUT device to be used for source address selection */
507 		skb->dev = dst->dev;
508 		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
509 		IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
510 				 IPSTATS_MIB_INTOOBIGERRORS);
511 		IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
512 				 IPSTATS_MIB_FRAGFAILS);
513 		kfree_skb(skb);
514 		return -EMSGSIZE;
515 	}
516 
517 	if (skb_cow(skb, dst->dev->hard_header_len)) {
518 		IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
519 				 IPSTATS_MIB_OUTDISCARDS);
520 		goto drop;
521 	}
522 
523 	hdr = ipv6_hdr(skb);
524 
525 	/* Decrementing the hop limit is delayed until after the skb COW */
526 
527 	hdr->hop_limit--;
528 
529 	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
530 	IP6_ADD_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
531 	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
532 		       net, NULL, skb, skb->dev, dst->dev,
533 		       ip6_forward_finish);
534 
535 error:
536 	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
537 drop:
538 	kfree_skb(skb);
539 	return -EINVAL;
540 }
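
/*
 * Editor's note: an illustrative sketch, not part of the original
 * file.  ip6_forward() is not called directly by protocols; the route
 * lookup on input installs it as the dst input handler for non-local
 * destinations, so a received packet reaches it roughly like this
 * (a simplified stand-in for ip6_rcv_finish()):
 */
static int example_rcv_finish(struct net *net, struct sock *sk,
			      struct sk_buff *skb)
{
	if (!skb_dst(skb))
		ip6_route_input(skb);
	/* dst_input() dispatches to ip6_input() or ip6_forward() */
	return dst_input(skb);
}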
541 
542 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
543 {
544 	to->pkt_type = from->pkt_type;
545 	to->priority = from->priority;
546 	to->protocol = from->protocol;
547 	skb_dst_drop(to);
548 	skb_dst_set(to, dst_clone(skb_dst(from)));
549 	to->dev = from->dev;
550 	to->mark = from->mark;
551 
552 #ifdef CONFIG_NET_SCHED
553 	to->tc_index = from->tc_index;
554 #endif
555 	nf_copy(to, from);
556 	skb_copy_secmark(to, from);
557 }
558 
559 int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
560 		 int (*output)(struct net *, struct sock *, struct sk_buff *))
561 {
562 	struct sk_buff *frag;
563 	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
564 	struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
565 				inet6_sk(skb->sk) : NULL;
566 	struct ipv6hdr *tmp_hdr;
567 	struct frag_hdr *fh;
568 	unsigned int mtu, hlen, left, len;
569 	int hroom, troom;
570 	__be32 frag_id;
571 	int ptr, offset = 0, err = 0;
572 	u8 *prevhdr, nexthdr = 0;
573 
574 	hlen = ip6_find_1stfragopt(skb, &prevhdr);
575 	nexthdr = *prevhdr;
576 
577 	mtu = ip6_skb_dst_mtu(skb);
578 
579 	/* We must not fragment if the socket is set to force MTU discovery
580 	 * or if the skb is not generated by a local socket.
581 	 */
582 	if (unlikely(!skb->ignore_df && skb->len > mtu))
583 		goto fail_toobig;
584 
585 	if (IP6CB(skb)->frag_max_size) {
586 		if (IP6CB(skb)->frag_max_size > mtu)
587 			goto fail_toobig;
588 
589 		/* don't send fragments larger than what we received */
590 		mtu = IP6CB(skb)->frag_max_size;
591 		if (mtu < IPV6_MIN_MTU)
592 			mtu = IPV6_MIN_MTU;
593 	}
594 
595 	if (np && np->frag_size < mtu) {
596 		if (np->frag_size)
597 			mtu = np->frag_size;
598 	}
599 	if (mtu < hlen + sizeof(struct frag_hdr) + 8)
600 		goto fail_toobig;
601 	mtu -= hlen + sizeof(struct frag_hdr);
602 
603 	frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
604 				    &ipv6_hdr(skb)->saddr);
605 
606 	if (skb->ip_summed == CHECKSUM_PARTIAL &&
607 	    (err = skb_checksum_help(skb)))
608 		goto fail;
609 
610 	hroom = LL_RESERVED_SPACE(rt->dst.dev);
611 	if (skb_has_frag_list(skb)) {
612 		int first_len = skb_pagelen(skb);
613 		struct sk_buff *frag2;
614 
615 		if (first_len - hlen > mtu ||
616 		    ((first_len - hlen) & 7) ||
617 		    skb_cloned(skb) ||
618 		    skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
619 			goto slow_path;
620 
621 		skb_walk_frags(skb, frag) {
622 			/* Correct geometry. */
623 			if (frag->len > mtu ||
624 			    ((frag->len & 7) && frag->next) ||
625 			    skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
626 				goto slow_path_clean;
627 
628 			/* Partially cloned skb? */
629 			if (skb_shared(frag))
630 				goto slow_path_clean;
631 
632 			BUG_ON(frag->sk);
633 			if (skb->sk) {
634 				frag->sk = skb->sk;
635 				frag->destructor = sock_wfree;
636 			}
637 			skb->truesize -= frag->truesize;
638 		}
639 
640 		err = 0;
641 		offset = 0;
642 		/* BUILD HEADER */
643 
644 		*prevhdr = NEXTHDR_FRAGMENT;
645 		tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
646 		if (!tmp_hdr) {
647 			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
648 				      IPSTATS_MIB_FRAGFAILS);
649 			err = -ENOMEM;
650 			goto fail;
651 		}
652 		frag = skb_shinfo(skb)->frag_list;
653 		skb_frag_list_init(skb);
654 
655 		__skb_pull(skb, hlen);
656 		fh = (struct frag_hdr *)__skb_push(skb, sizeof(struct frag_hdr));
657 		__skb_push(skb, hlen);
658 		skb_reset_network_header(skb);
659 		memcpy(skb_network_header(skb), tmp_hdr, hlen);
660 
661 		fh->nexthdr = nexthdr;
662 		fh->reserved = 0;
663 		fh->frag_off = htons(IP6_MF);
664 		fh->identification = frag_id;
665 
666 		first_len = skb_pagelen(skb);
667 		skb->data_len = first_len - skb_headlen(skb);
668 		skb->len = first_len;
669 		ipv6_hdr(skb)->payload_len = htons(first_len -
670 						   sizeof(struct ipv6hdr));
671 
672 		dst_hold(&rt->dst);
673 
674 		for (;;) {
675 			/* Prepare the header of the next frame
676 			 * before the previous one goes down. */
677 			if (frag) {
678 				frag->ip_summed = CHECKSUM_NONE;
679 				skb_reset_transport_header(frag);
680 				fh = (struct frag_hdr *)__skb_push(frag, sizeof(struct frag_hdr));
681 				__skb_push(frag, hlen);
682 				skb_reset_network_header(frag);
683 				memcpy(skb_network_header(frag), tmp_hdr,
684 				       hlen);
685 				offset += skb->len - hlen - sizeof(struct frag_hdr);
686 				fh->nexthdr = nexthdr;
687 				fh->reserved = 0;
688 				fh->frag_off = htons(offset);
689 				if (frag->next)
690 					fh->frag_off |= htons(IP6_MF);
691 				fh->identification = frag_id;
692 				ipv6_hdr(frag)->payload_len =
693 						htons(frag->len -
694 						      sizeof(struct ipv6hdr));
695 				ip6_copy_metadata(frag, skb);
696 			}
697 
698 			err = output(net, sk, skb);
699 			if (!err)
700 				IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
701 					      IPSTATS_MIB_FRAGCREATES);
702 
703 			if (err || !frag)
704 				break;
705 
706 			skb = frag;
707 			frag = skb->next;
708 			skb->next = NULL;
709 		}
710 
711 		kfree(tmp_hdr);
712 
713 		if (err == 0) {
714 			IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
715 				      IPSTATS_MIB_FRAGOKS);
716 			ip6_rt_put(rt);
717 			return 0;
718 		}
719 
720 		kfree_skb_list(frag);
721 
722 		IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
723 			      IPSTATS_MIB_FRAGFAILS);
724 		ip6_rt_put(rt);
725 		return err;
726 
727 slow_path_clean:
728 		skb_walk_frags(skb, frag2) {
729 			if (frag2 == frag)
730 				break;
731 			frag2->sk = NULL;
732 			frag2->destructor = NULL;
733 			skb->truesize += frag2->truesize;
734 		}
735 	}
736 
737 slow_path:
738 	left = skb->len - hlen;		/* Space per frame */
739 	ptr = hlen;			/* Where to start from */
740 
741 	/*
742 	 *	Fragment the datagram.
743 	 */
744 
745 	*prevhdr = NEXTHDR_FRAGMENT;
746 	troom = rt->dst.dev->needed_tailroom;
747 
748 	/*
749 	 *	Keep copying data until we run out.
750 	 */
751 	while (left > 0)	{
752 		len = left;
753 		/* IF: it doesn't fit, use 'mtu' - the data space left */
754 		if (len > mtu)
755 			len = mtu;
756 		/* IF: we are not sending up to and including the packet end
757 		   then align the next start on an eight byte boundary */
758 		if (len < left)	{
759 			len &= ~7;
760 		}
761 
762 		/* Allocate buffer */
763 		frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
764 				 hroom + troom, GFP_ATOMIC);
765 		if (!frag) {
766 			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
767 				      IPSTATS_MIB_FRAGFAILS);
768 			err = -ENOMEM;
769 			goto fail;
770 		}
771 
772 		/*
773 		 *	Set up data on packet
774 		 */
775 
776 		ip6_copy_metadata(frag, skb);
777 		skb_reserve(frag, hroom);
778 		skb_put(frag, len + hlen + sizeof(struct frag_hdr));
779 		skb_reset_network_header(frag);
780 		fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
781 		frag->transport_header = (frag->network_header + hlen +
782 					  sizeof(struct frag_hdr));
783 
784 		/*
785 		 *	Charge the memory for the fragment to any owner
786 		 *	it might possess
787 		 */
788 		if (skb->sk)
789 			skb_set_owner_w(frag, skb->sk);
790 
791 		/*
792 		 *	Copy the packet header into the new buffer.
793 		 */
794 		skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
795 
796 		/*
797 		 *	Build fragment header.
798 		 */
799 		fh->nexthdr = nexthdr;
800 		fh->reserved = 0;
801 		fh->identification = frag_id;
802 
803 		/*
804 		 *	Copy a block of the IP datagram.
805 		 */
806 		BUG_ON(skb_copy_bits(skb, ptr, skb_transport_header(frag),
807 				     len));
808 		left -= len;
809 
810 		fh->frag_off = htons(offset);
811 		if (left > 0)
812 			fh->frag_off |= htons(IP6_MF);
813 		ipv6_hdr(frag)->payload_len = htons(frag->len -
814 						    sizeof(struct ipv6hdr));
815 
816 		ptr += len;
817 		offset += len;
818 
819 		/*
820 		 *	Put this fragment into the sending queue.
821 		 */
822 		err = output(net, sk, frag);
823 		if (err)
824 			goto fail;
825 
826 		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
827 			      IPSTATS_MIB_FRAGCREATES);
828 	}
829 	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
830 		      IPSTATS_MIB_FRAGOKS);
831 	consume_skb(skb);
832 	return err;
833 
834 fail_toobig:
835 	if (skb->sk && dst_allfrag(skb_dst(skb)))
836 		sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);
837 
838 	skb->dev = skb_dst(skb)->dev;
839 	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
840 	err = -EMSGSIZE;
841 
842 fail:
843 	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
844 		      IPSTATS_MIB_FRAGFAILS);
845 	kfree_skb(skb);
846 	return err;
847 }
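
/*
 * Editor's note: a hedged sketch of the ip6_fragment() calling
 * convention, not part of the original file.  The caller supplies an
 * output callback that ip6_fragment() invokes once per generated
 * fragment; ip6_finish_output() above uses ip6_finish_output2 the
 * same way, and an error from the callback aborts the remaining
 * fragments.
 */
static int example_output_with_frag(struct net *net, struct sock *sk,
				    struct sk_buff *skb)
{
	if (skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb))
		return ip6_fragment(net, sk, skb, ip6_finish_output2);
	return ip6_finish_output2(net, sk, skb);
}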
848 
849 static inline int ip6_rt_check(const struct rt6key *rt_key,
850 			       const struct in6_addr *fl_addr,
851 			       const struct in6_addr *addr_cache)
852 {
853 	return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
854 		(!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
855 }
856 
857 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
858 					  struct dst_entry *dst,
859 					  const struct flowi6 *fl6)
860 {
861 	struct ipv6_pinfo *np = inet6_sk(sk);
862 	struct rt6_info *rt;
863 
864 	if (!dst)
865 		goto out;
866 
867 	if (dst->ops->family != AF_INET6) {
868 		dst_release(dst);
869 		return NULL;
870 	}
871 
872 	rt = (struct rt6_info *)dst;
873 	/* Yes, checking route validity in the not-connected
874 	 * case is not very simple. Take into account
875 	 * that we do not support routing by source, TOS,
876 	 * or MSG_DONTROUTE		--ANK (980726)
877 	 *
878 	 * 1. ip6_rt_check(): If the route is a host route,
879 	 *    check that the cached destination is current.
880 	 *    If it is a network route, we can still
881 	 *    check its validity using a saved pointer
882 	 *    to the last used address: daddr_cache.
883 	 *    We do not want to save the whole address now
884 	 *    (because the main consumer of this service
885 	 *    is TCP, which does not have this problem),
886 	 *    so this last trick works only on connected
887 	 *    sockets.
888 	 * 2. oif should also be the same.
889 	 */
890 	if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
891 #ifdef CONFIG_IPV6_SUBTREES
892 	    ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
893 #endif
894 	   (!(fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) &&
895 	      (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex))) {
896 		dst_release(dst);
897 		dst = NULL;
898 	}
899 
900 out:
901 	return dst;
902 }
903 
904 static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
905 			       struct dst_entry **dst, struct flowi6 *fl6)
906 {
907 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
908 	struct neighbour *n;
909 	struct rt6_info *rt;
910 #endif
911 	int err;
912 
913 	/* The correct way to handle this would be to do
914 	 * ip6_route_get_saddr, and then ip6_route_output; however,
915 	 * the route-specific preferred source forces the
916 	 * ip6_route_output call _before_ ip6_route_get_saddr.
917 	 *
918 	 * In source specific routing (no src=any default route),
919 	 * ip6_route_output will fail given src=any saddr, though, so
920 	 * that's why we try it again later.
921 	 */
922 	if (ipv6_addr_any(&fl6->saddr) && (!*dst || !(*dst)->error)) {
923 		struct rt6_info *rt;
924 		bool had_dst = *dst != NULL;
925 
926 		if (!had_dst)
927 			*dst = ip6_route_output(net, sk, fl6);
928 		rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;
929 		err = ip6_route_get_saddr(net, rt, &fl6->daddr,
930 					  sk ? inet6_sk(sk)->srcprefs : 0,
931 					  &fl6->saddr);
932 		if (err)
933 			goto out_err_release;
934 
935 		/* If we had an erroneous initial result, pretend it
936 		 * never existed and let the SA-enabled version take
937 		 * over.
938 		 */
939 		if (!had_dst && (*dst)->error) {
940 			dst_release(*dst);
941 			*dst = NULL;
942 		}
943 	}
944 
945 	if (!*dst)
946 		*dst = ip6_route_output(net, sk, fl6);
947 
948 	err = (*dst)->error;
949 	if (err)
950 		goto out_err_release;
951 
952 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
953 	/*
954 	 * Here, if the dst entry we've looked up has a
955 	 * neighbour entry that is in the INCOMPLETE state
956 	 * and the src address from the flow is marked as
957 	 * OPTIMISTIC, we release the found dst entry and
958 	 * replace it instead with the dst entry of the
959 	 * nexthop router.
960 	 */
961 	rt = (struct rt6_info *) *dst;
962 	rcu_read_lock_bh();
963 	n = __ipv6_neigh_lookup_noref(rt->dst.dev,
964 				      rt6_nexthop(rt, &fl6->daddr));
965 	err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
966 	rcu_read_unlock_bh();
967 
968 	if (err) {
969 		struct inet6_ifaddr *ifp;
970 		struct flowi6 fl_gw6;
971 		int redirect;
972 
973 		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
974 				      (*dst)->dev, 1);
975 
976 		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
977 		if (ifp)
978 			in6_ifa_put(ifp);
979 
980 		if (redirect) {
981 			/*
982 			 * We need to get the dst entry for the
983 			 * default router instead
984 			 */
985 			dst_release(*dst);
986 			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
987 			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
988 			*dst = ip6_route_output(net, sk, &fl_gw6);
989 			err = (*dst)->error;
990 			if (err)
991 				goto out_err_release;
992 		}
993 	}
994 #endif
995 
996 	return 0;
997 
998 out_err_release:
999 	if (err == -ENETUNREACH)
1000 		IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
1001 	dst_release(*dst);
1002 	*dst = NULL;
1003 	return err;
1004 }
1005 
1006 /**
1007  *	ip6_dst_lookup - perform route lookup on flow
 *	@net: network namespace to perform the lookup in
1008  *	@sk: socket which provides route info
1009  *	@dst: pointer to dst_entry * for result
1010  *	@fl6: flow to lookup
1011  *
1012  *	This function performs a route lookup on the given flow.
1013  *
1014  *	It returns zero on success, or a standard errno code on error.
1015  */
1016 int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
1017 		   struct flowi6 *fl6)
1018 {
1019 	*dst = NULL;
1020 	return ip6_dst_lookup_tail(net, sk, dst, fl6);
1021 }
1022 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
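
/*
 * Editor's note: an illustrative caller of ip6_dst_lookup(), not part
 * of the original file.  On success *dst holds a referenced route and
 * fl6->saddr has been filled in if it was unspecified; the caller must
 * release the reference.  All names below are hypothetical.
 */
static int example_dst_lookup(struct net *net, struct sock *sk,
			      const struct in6_addr *daddr)
{
	struct dst_entry *dst;
	struct flowi6 fl6;
	int err;

	memset(&fl6, 0, sizeof(fl6));
	fl6.daddr = *daddr;	/* saddr left unspecified: chosen by lookup */

	err = ip6_dst_lookup(net, sk, &dst, &fl6);
	if (err)
		return err;

	/* ... use the route, e.g. dst_mtu(dst) ... */
	dst_release(dst);
	return 0;
}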
1023 
1024 /**
1025  *	ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1026  *	@sk: socket which provides route info
1027  *	@fl6: flow to lookup
1028  *	@final_dst: final destination address for ipsec lookup
1029  *
1030  *	This function performs a route lookup on the given flow.
1031  *
1032  *	It returns a valid dst pointer on success, or a pointer-encoded
1033  *	error code (test with IS_ERR()).
1034  */
1035 struct dst_entry *ip6_dst_lookup_flow(const struct sock *sk, struct flowi6 *fl6,
1036 				      const struct in6_addr *final_dst)
1037 {
1038 	struct dst_entry *dst = NULL;
1039 	int err;
1040 
1041 	err = ip6_dst_lookup_tail(sock_net(sk), sk, &dst, fl6);
1042 	if (err)
1043 		return ERR_PTR(err);
1044 	if (final_dst)
1045 		fl6->daddr = *final_dst;
1046 	if (!fl6->flowi6_oif)
1047 		fl6->flowi6_oif = l3mdev_fib_oif(dst->dev);
1048 
1049 	return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1050 }
1051 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
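
/*
 * Editor's note: a hedged sketch, not part of the original file.
 * Unlike ip6_dst_lookup(), ip6_dst_lookup_flow() returns the dst
 * directly and encodes failure as an ERR_PTR(), so callers must test
 * with IS_ERR().  A non-NULL final_dst replaces fl6->daddr after the
 * route lookup but before the xfrm lookup.
 */
static int example_lookup_flow(struct sock *sk, struct flowi6 *fl6)
{
	struct dst_entry *dst;

	dst = ip6_dst_lookup_flow(sk, fl6, NULL);
	if (IS_ERR(dst))
		return PTR_ERR(dst);

	dst_release(dst);
	return 0;
}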
1052 
1053 /**
1054  *	ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1055  *	@sk: socket which provides the dst cache and route info
1056  *	@fl6: flow to lookup
1057  *	@final_dst: final destination address for ipsec lookup
1058  *
1059  *	This function performs a route lookup on the given flow with the
1060  *	possibility of using the cached route in the socket if it is valid.
1061  *	It will take the socket dst lock when operating on the dst cache.
1062  *	As a result, this function can only be used in process context.
1063  *
1064  *	It returns a valid dst pointer on success, or a pointer-encoded
1065  *	error code (test with IS_ERR()).
1066  */
1067 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1068 					 const struct in6_addr *final_dst)
1069 {
1070 	struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1071 	int err;
1072 
1073 	dst = ip6_sk_dst_check(sk, dst, fl6);
1074 
1075 	err = ip6_dst_lookup_tail(sock_net(sk), sk, &dst, fl6);
1076 	if (err)
1077 		return ERR_PTR(err);
1078 	if (final_dst)
1079 		fl6->daddr = *final_dst;
1080 
1081 	return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1082 }
1083 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1084 
1085 static inline int ip6_ufo_append_data(struct sock *sk,
1086 			struct sk_buff_head *queue,
1087 			int getfrag(void *from, char *to, int offset, int len,
1088 			int odd, struct sk_buff *skb),
1089 			void *from, int length, int hh_len, int fragheaderlen,
1090 			int transhdrlen, int mtu, unsigned int flags,
1091 			const struct flowi6 *fl6)
1092 
1093 {
1094 	struct sk_buff *skb;
1095 	int err;
1096 
1097 	/* There is support for UDP large send offload by the network
1098 	 * device, so create one single skb containing the complete
1099 	 * UDP datagram.
1100 	 */
1101 	skb = skb_peek_tail(queue);
1102 	if (!skb) {
1103 		skb = sock_alloc_send_skb(sk,
1104 			hh_len + fragheaderlen + transhdrlen + 20,
1105 			(flags & MSG_DONTWAIT), &err);
1106 		if (!skb)
1107 			return err;
1108 
1109 		/* reserve space for Hardware header */
1110 		skb_reserve(skb, hh_len);
1111 
1112 		/* create space for UDP/IP header */
1113 		skb_put(skb, fragheaderlen + transhdrlen);
1114 
1115 		/* initialize network header pointer */
1116 		skb_reset_network_header(skb);
1117 
1118 		/* initialize protocol header pointer */
1119 		skb->transport_header = skb->network_header + fragheaderlen;
1120 
1121 		skb->protocol = htons(ETH_P_IPV6);
1122 		skb->csum = 0;
1123 
1124 		__skb_queue_tail(queue, skb);
1125 	} else if (skb_is_gso(skb)) {
1126 		goto append;
1127 	}
1128 
1129 	skb->ip_summed = CHECKSUM_PARTIAL;
1130 	/* Specify the length of each IPv6 datagram fragment.
1131 	 * It has to be a multiple of 8.
1132 	 */
1133 	skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
1134 				     sizeof(struct frag_hdr)) & ~7;
1135 	skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
1136 	skb_shinfo(skb)->ip6_frag_id = ipv6_select_ident(sock_net(sk),
1137 							 &fl6->daddr,
1138 							 &fl6->saddr);
1139 
1140 append:
1141 	return skb_append_datato_frags(sk, skb, getfrag, from,
1142 				       (length - transhdrlen));
1143 }
1144 
1145 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1146 					       gfp_t gfp)
1147 {
1148 	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1149 }
1150 
1151 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1152 						gfp_t gfp)
1153 {
1154 	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1155 }
1156 
1157 static void ip6_append_data_mtu(unsigned int *mtu,
1158 				int *maxfraglen,
1159 				unsigned int fragheaderlen,
1160 				struct sk_buff *skb,
1161 				struct rt6_info *rt,
1162 				unsigned int orig_mtu)
1163 {
1164 	if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1165 		if (!skb) {
1166 			/* first fragment, reserve header_len */
1167 			*mtu = orig_mtu - rt->dst.header_len;
1168 
1169 		} else {
1170 			/*
1171 			 * this fragment is not the first; the header
1172 			 * space is regarded as data space.
1173 			 */
1174 			*mtu = orig_mtu;
1175 		}
1176 		*maxfraglen = ((*mtu - fragheaderlen) & ~7)
1177 			      + fragheaderlen - sizeof(struct frag_hdr);
1178 	}
1179 }
1180 
1181 static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
1182 			  struct inet6_cork *v6_cork,
1183 			  int hlimit, int tclass, struct ipv6_txoptions *opt,
1184 			  struct rt6_info *rt, struct flowi6 *fl6)
1185 {
1186 	struct ipv6_pinfo *np = inet6_sk(sk);
1187 	unsigned int mtu;
1188 
1189 	/*
1190 	 * setup for corking
1191 	 */
1192 	if (opt) {
1193 		if (WARN_ON(v6_cork->opt))
1194 			return -EINVAL;
1195 
1196 		v6_cork->opt = kzalloc(opt->tot_len, sk->sk_allocation);
1197 		if (unlikely(!v6_cork->opt))
1198 			return -ENOBUFS;
1199 
1200 		v6_cork->opt->tot_len = opt->tot_len;
1201 		v6_cork->opt->opt_flen = opt->opt_flen;
1202 		v6_cork->opt->opt_nflen = opt->opt_nflen;
1203 
1204 		v6_cork->opt->dst0opt = ip6_opt_dup(opt->dst0opt,
1205 						    sk->sk_allocation);
1206 		if (opt->dst0opt && !v6_cork->opt->dst0opt)
1207 			return -ENOBUFS;
1208 
1209 		v6_cork->opt->dst1opt = ip6_opt_dup(opt->dst1opt,
1210 						    sk->sk_allocation);
1211 		if (opt->dst1opt && !v6_cork->opt->dst1opt)
1212 			return -ENOBUFS;
1213 
1214 		v6_cork->opt->hopopt = ip6_opt_dup(opt->hopopt,
1215 						   sk->sk_allocation);
1216 		if (opt->hopopt && !v6_cork->opt->hopopt)
1217 			return -ENOBUFS;
1218 
1219 		v6_cork->opt->srcrt = ip6_rthdr_dup(opt->srcrt,
1220 						    sk->sk_allocation);
1221 		if (opt->srcrt && !v6_cork->opt->srcrt)
1222 			return -ENOBUFS;
1223 
1224 		/* need source address above --miyazawa */
1225 	}
1226 	dst_hold(&rt->dst);
1227 	cork->base.dst = &rt->dst;
1228 	cork->fl.u.ip6 = *fl6;
1229 	v6_cork->hop_limit = hlimit;
1230 	v6_cork->tclass = tclass;
1231 	if (rt->dst.flags & DST_XFRM_TUNNEL)
1232 		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1233 		      rt->dst.dev->mtu : dst_mtu(&rt->dst);
1234 	else
1235 		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1236 		      rt->dst.dev->mtu : dst_mtu(rt->dst.path);
1237 	if (np->frag_size < mtu) {
1238 		if (np->frag_size)
1239 			mtu = np->frag_size;
1240 	}
1241 	cork->base.fragsize = mtu;
1242 	if (dst_allfrag(rt->dst.path))
1243 		cork->base.flags |= IPCORK_ALLFRAG;
1244 	cork->base.length = 0;
1245 
1246 	return 0;
1247 }
1248 
1249 static int __ip6_append_data(struct sock *sk,
1250 			     struct flowi6 *fl6,
1251 			     struct sk_buff_head *queue,
1252 			     struct inet_cork *cork,
1253 			     struct inet6_cork *v6_cork,
1254 			     struct page_frag *pfrag,
1255 			     int getfrag(void *from, char *to, int offset,
1256 					 int len, int odd, struct sk_buff *skb),
1257 			     void *from, int length, int transhdrlen,
1258 			     unsigned int flags, int dontfrag)
1259 {
1260 	struct sk_buff *skb, *skb_prev = NULL;
1261 	unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu;
1262 	int exthdrlen = 0;
1263 	int dst_exthdrlen = 0;
1264 	int hh_len;
1265 	int copy;
1266 	int err;
1267 	int offset = 0;
1268 	__u8 tx_flags = 0;
1269 	u32 tskey = 0;
1270 	struct rt6_info *rt = (struct rt6_info *)cork->dst;
1271 	struct ipv6_txoptions *opt = v6_cork->opt;
1272 	int csummode = CHECKSUM_NONE;
1273 	unsigned int maxnonfragsize, headersize;
1274 
1275 	skb = skb_peek_tail(queue);
1276 	if (!skb) {
1277 		exthdrlen = opt ? opt->opt_flen : 0;
1278 		dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
1279 	}
1280 
1281 	mtu = cork->fragsize;
1282 	orig_mtu = mtu;
1283 
1284 	hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1285 
1286 	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1287 			(opt ? opt->opt_nflen : 0);
1288 	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
1289 		     sizeof(struct frag_hdr);
1290 
1291 	headersize = sizeof(struct ipv6hdr) +
1292 		     (opt ? opt->opt_flen + opt->opt_nflen : 0) +
1293 		     (dst_allfrag(&rt->dst) ?
1294 		      sizeof(struct frag_hdr) : 0) +
1295 		     rt->rt6i_nfheader_len;
1296 
1297 	if (cork->length + length > mtu - headersize && dontfrag &&
1298 	    (sk->sk_protocol == IPPROTO_UDP ||
1299 	     sk->sk_protocol == IPPROTO_RAW)) {
1300 		ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
1301 				sizeof(struct ipv6hdr));
1302 		goto emsgsize;
1303 	}
1304 
1305 	if (ip6_sk_ignore_df(sk))
1306 		maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
1307 	else
1308 		maxnonfragsize = mtu;
1309 
1310 	if (cork->length + length > maxnonfragsize - headersize) {
1311 emsgsize:
1312 		ipv6_local_error(sk, EMSGSIZE, fl6,
1313 				 mtu - headersize +
1314 				 sizeof(struct ipv6hdr));
1315 		return -EMSGSIZE;
1316 	}
1317 
1318 	/* CHECKSUM_PARTIAL only with no extension headers and when
1319 	 * we are not going to fragment
1320 	 */
1321 	if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
1322 	    headersize == sizeof(struct ipv6hdr) &&
1323 	    length < mtu - headersize &&
1324 	    !(flags & MSG_MORE) &&
1325 	    rt->dst.dev->features & NETIF_F_V6_CSUM)
1326 		csummode = CHECKSUM_PARTIAL;
1327 
1328 	if (sk->sk_type == SOCK_DGRAM || sk->sk_type == SOCK_RAW) {
1329 		sock_tx_timestamp(sk, &tx_flags);
1330 		if (tx_flags & SKBTX_ANY_SW_TSTAMP &&
1331 		    sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
1332 			tskey = sk->sk_tskey++;
1333 	}
1334 
1335 	/*
1336 	 * Let's try using as much space as possible.
1337 	 * Use MTU if total length of the message fits into the MTU.
1338 	 * Otherwise, we need to reserve fragment header and
1339 	 * fragment alignment (= 8-15 octets, in total).
1340 	 *
1341 	 * Note that we may need to "move" the data from the tail
1342 	 * of the buffer to the new fragment when we split
1343 	 * the message.
1344 	 *
1345 	 * FIXME: It may be fragmented into multiple chunks
1346 	 *        at once if non-fragmentable extension headers
1347 	 *        are too large.
1348 	 * --yoshfuji
1349 	 */
1350 
1351 	cork->length += length;
1352 	if (((length > mtu) ||
1353 	     (skb && skb_is_gso(skb))) &&
1354 	    (sk->sk_protocol == IPPROTO_UDP) &&
1355 	    (rt->dst.dev->features & NETIF_F_UFO) &&
1356 	    (sk->sk_type == SOCK_DGRAM)) {
1357 		err = ip6_ufo_append_data(sk, queue, getfrag, from, length,
1358 					  hh_len, fragheaderlen,
1359 					  transhdrlen, mtu, flags, fl6);
1360 		if (err)
1361 			goto error;
1362 		return 0;
1363 	}
1364 
1365 	if (!skb)
1366 		goto alloc_new_skb;
1367 
1368 	while (length > 0) {
1369 		/* Check if the remaining data fits into current packet. */
1370 		copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1371 		if (copy < length)
1372 			copy = maxfraglen - skb->len;
1373 
1374 		if (copy <= 0) {
1375 			char *data;
1376 			unsigned int datalen;
1377 			unsigned int fraglen;
1378 			unsigned int fraggap;
1379 			unsigned int alloclen;
1380 alloc_new_skb:
1381 			/* There's no room in the current skb */
1382 			if (skb)
1383 				fraggap = skb->len - maxfraglen;
1384 			else
1385 				fraggap = 0;
1386 			/* update mtu and maxfraglen if necessary */
1387 			if (!skb || !skb_prev)
1388 				ip6_append_data_mtu(&mtu, &maxfraglen,
1389 						    fragheaderlen, skb, rt,
1390 						    orig_mtu);
1391 
1392 			skb_prev = skb;
1393 
1394 			/*
1395 			 * If remaining data exceeds the mtu,
1396 			 * we know we need more fragment(s).
1397 			 */
1398 			datalen = length + fraggap;
1399 
1400 			if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1401 				datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
1402 			if ((flags & MSG_MORE) &&
1403 			    !(rt->dst.dev->features&NETIF_F_SG))
1404 				alloclen = mtu;
1405 			else
1406 				alloclen = datalen + fragheaderlen;
1407 
1408 			alloclen += dst_exthdrlen;
1409 
1410 			if (datalen != length + fraggap) {
1411 				/*
1412 				 * this is not the last fragment, the trailer
1413 				 * space is regarded as data space.
1414 				 */
1415 				datalen += rt->dst.trailer_len;
1416 			}
1417 
1418 			alloclen += rt->dst.trailer_len;
1419 			fraglen = datalen + fragheaderlen;
1420 
1421 			/*
1422 			 * We just reserve space for the fragment header.
1423 			 * Note: this may be an overallocation if the message
1424 			 * (without MSG_MORE) fits into the MTU.
1425 			 */
1426 			alloclen += sizeof(struct frag_hdr);
1427 
1428 			if (transhdrlen) {
1429 				skb = sock_alloc_send_skb(sk,
1430 						alloclen + hh_len,
1431 						(flags & MSG_DONTWAIT), &err);
1432 			} else {
1433 				skb = NULL;
1434 				if (atomic_read(&sk->sk_wmem_alloc) <=
1435 				    2 * sk->sk_sndbuf)
1436 					skb = sock_wmalloc(sk,
1437 							   alloclen + hh_len, 1,
1438 							   sk->sk_allocation);
1439 				if (unlikely(!skb))
1440 					err = -ENOBUFS;
1441 			}
1442 			if (!skb)
1443 				goto error;
1444 			/*
1445 			 *	Fill in the control structures
1446 			 */
1447 			skb->protocol = htons(ETH_P_IPV6);
1448 			skb->ip_summed = csummode;
1449 			skb->csum = 0;
1450 			/* reserve for fragmentation and ipsec header */
1451 			skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1452 				    dst_exthdrlen);
1453 
1454 			/* Only the initial fragment is time stamped */
1455 			skb_shinfo(skb)->tx_flags = tx_flags;
1456 			tx_flags = 0;
1457 			skb_shinfo(skb)->tskey = tskey;
1458 			tskey = 0;
1459 
1460 			/*
1461 			 *	Find where to start putting bytes
1462 			 */
1463 			data = skb_put(skb, fraglen);
1464 			skb_set_network_header(skb, exthdrlen);
1465 			data += fragheaderlen;
1466 			skb->transport_header = (skb->network_header +
1467 						 fragheaderlen);
1468 			if (fraggap) {
1469 				skb->csum = skb_copy_and_csum_bits(
1470 					skb_prev, maxfraglen,
1471 					data + transhdrlen, fraggap, 0);
1472 				skb_prev->csum = csum_sub(skb_prev->csum,
1473 							  skb->csum);
1474 				data += fraggap;
1475 				pskb_trim_unique(skb_prev, maxfraglen);
1476 			}
1477 			copy = datalen - transhdrlen - fraggap;
1478 
1479 			if (copy < 0) {
1480 				err = -EINVAL;
1481 				kfree_skb(skb);
1482 				goto error;
1483 			} else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
1484 				err = -EFAULT;
1485 				kfree_skb(skb);
1486 				goto error;
1487 			}
1488 
1489 			offset += copy;
1490 			length -= datalen - fraggap;
1491 			transhdrlen = 0;
1492 			exthdrlen = 0;
1493 			dst_exthdrlen = 0;
1494 
1495 			/*
1496 			 * Put the packet on the pending queue
1497 			 */
1498 			__skb_queue_tail(queue, skb);
1499 			continue;
1500 		}
1501 
1502 		if (copy > length)
1503 			copy = length;
1504 
1505 		if (!(rt->dst.dev->features&NETIF_F_SG)) {
1506 			unsigned int off;
1507 
1508 			off = skb->len;
1509 			if (getfrag(from, skb_put(skb, copy),
1510 						offset, copy, off, skb) < 0) {
1511 				__skb_trim(skb, off);
1512 				err = -EFAULT;
1513 				goto error;
1514 			}
1515 		} else {
1516 			int i = skb_shinfo(skb)->nr_frags;
1517 
1518 			err = -ENOMEM;
1519 			if (!sk_page_frag_refill(sk, pfrag))
1520 				goto error;
1521 
1522 			if (!skb_can_coalesce(skb, i, pfrag->page,
1523 					      pfrag->offset)) {
1524 				err = -EMSGSIZE;
1525 				if (i == MAX_SKB_FRAGS)
1526 					goto error;
1527 
1528 				__skb_fill_page_desc(skb, i, pfrag->page,
1529 						     pfrag->offset, 0);
1530 				skb_shinfo(skb)->nr_frags = ++i;
1531 				get_page(pfrag->page);
1532 			}
1533 			copy = min_t(int, copy, pfrag->size - pfrag->offset);
1534 			if (getfrag(from,
1535 				    page_address(pfrag->page) + pfrag->offset,
1536 				    offset, copy, skb->len, skb) < 0)
1537 				goto error_efault;
1538 
1539 			pfrag->offset += copy;
1540 			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1541 			skb->len += copy;
1542 			skb->data_len += copy;
1543 			skb->truesize += copy;
1544 			atomic_add(copy, &sk->sk_wmem_alloc);
1545 		}
1546 		offset += copy;
1547 		length -= copy;
1548 	}
1549 
1550 	return 0;
1551 
1552 error_efault:
1553 	err = -EFAULT;
1554 error:
1555 	cork->length -= length;
1556 	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1557 	return err;
1558 }
1559 
1560 int ip6_append_data(struct sock *sk,
1561 		    int getfrag(void *from, char *to, int offset, int len,
1562 				int odd, struct sk_buff *skb),
1563 		    void *from, int length, int transhdrlen, int hlimit,
1564 		    int tclass, struct ipv6_txoptions *opt, struct flowi6 *fl6,
1565 		    struct rt6_info *rt, unsigned int flags, int dontfrag)
1566 {
1567 	struct inet_sock *inet = inet_sk(sk);
1568 	struct ipv6_pinfo *np = inet6_sk(sk);
1569 	int exthdrlen;
1570 	int err;
1571 
1572 	if (flags&MSG_PROBE)
1573 		return 0;
1574 	if (skb_queue_empty(&sk->sk_write_queue)) {
1575 		/*
1576 		 * setup for corking
1577 		 */
1578 		err = ip6_setup_cork(sk, &inet->cork, &np->cork, hlimit,
1579 				     tclass, opt, rt, fl6);
1580 		if (err)
1581 			return err;
1582 
1583 		exthdrlen = (opt ? opt->opt_flen : 0);
1584 		length += exthdrlen;
1585 		transhdrlen += exthdrlen;
1586 	} else {
1587 		fl6 = &inet->cork.fl.u.ip6;
1588 		transhdrlen = 0;
1589 	}
1590 
1591 	return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base,
1592 				 &np->cork, sk_page_frag(sk), getfrag,
1593 				 from, length, transhdrlen, flags, dontfrag);
1594 }
1595 EXPORT_SYMBOL_GPL(ip6_append_data);
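
/*
 * Editor's note: a hedged sketch of the corked transmit pattern built
 * on ip6_append_data(), not part of the original file.  A datagram
 * protocol appends payload under the socket lock, then either pushes
 * the queued data out or flushes it on error.  ip_generic_getfrag()
 * is the stock getfrag helper for a struct msghdr source; the
 * surrounding helper and its parameter choices are illustrative only.
 */
static int example_append_and_send(struct sock *sk, struct msghdr *msg,
				   int len, struct flowi6 *fl6,
				   struct rt6_info *rt)
{
	int err;

	lock_sock(sk);
	err = ip6_append_data(sk, ip_generic_getfrag, msg, len,
			      0 /* transhdrlen */,
			      ip6_dst_hoplimit(&rt->dst), 0 /* tclass */,
			      NULL /* opt */, fl6, rt, MSG_DONTWAIT,
			      0 /* dontfrag */);
	if (err)
		ip6_flush_pending_frames(sk);
	else
		err = ip6_push_pending_frames(sk);
	release_sock(sk);
	return err;
}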
1596 
1597 static void ip6_cork_release(struct inet_cork_full *cork,
1598 			     struct inet6_cork *v6_cork)
1599 {
1600 	if (v6_cork->opt) {
1601 		kfree(v6_cork->opt->dst0opt);
1602 		kfree(v6_cork->opt->dst1opt);
1603 		kfree(v6_cork->opt->hopopt);
1604 		kfree(v6_cork->opt->srcrt);
1605 		kfree(v6_cork->opt);
1606 		v6_cork->opt = NULL;
1607 	}
1608 
1609 	if (cork->base.dst) {
1610 		dst_release(cork->base.dst);
1611 		cork->base.dst = NULL;
1612 		cork->base.flags &= ~IPCORK_ALLFRAG;
1613 	}
1614 	memset(&cork->fl, 0, sizeof(cork->fl));
1615 }
1616 
1617 struct sk_buff *__ip6_make_skb(struct sock *sk,
1618 			       struct sk_buff_head *queue,
1619 			       struct inet_cork_full *cork,
1620 			       struct inet6_cork *v6_cork)
1621 {
1622 	struct sk_buff *skb, *tmp_skb;
1623 	struct sk_buff **tail_skb;
1624 	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1625 	struct ipv6_pinfo *np = inet6_sk(sk);
1626 	struct net *net = sock_net(sk);
1627 	struct ipv6hdr *hdr;
1628 	struct ipv6_txoptions *opt = v6_cork->opt;
1629 	struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
1630 	struct flowi6 *fl6 = &cork->fl.u.ip6;
1631 	unsigned char proto = fl6->flowi6_proto;
1632 
1633 	skb = __skb_dequeue(queue);
1634 	if (!skb)
1635 		goto out;
1636 	tail_skb = &(skb_shinfo(skb)->frag_list);
1637 
1638 	/* move skb->data to ip header from ext header */
1639 	if (skb->data < skb_network_header(skb))
1640 		__skb_pull(skb, skb_network_offset(skb));
1641 	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
1642 		__skb_pull(tmp_skb, skb_network_header_len(skb));
1643 		*tail_skb = tmp_skb;
1644 		tail_skb = &(tmp_skb->next);
1645 		skb->len += tmp_skb->len;
1646 		skb->data_len += tmp_skb->len;
1647 		skb->truesize += tmp_skb->truesize;
1648 		tmp_skb->destructor = NULL;
1649 		tmp_skb->sk = NULL;
1650 	}
1651 
1652 	/* Allow local fragmentation. */
1653 	skb->ignore_df = ip6_sk_ignore_df(sk);
1654 
1655 	*final_dst = fl6->daddr;
1656 	__skb_pull(skb, skb_network_header_len(skb));
1657 	if (opt && opt->opt_flen)
1658 		ipv6_push_frag_opts(skb, opt, &proto);
1659 	if (opt && opt->opt_nflen)
1660 		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);
1661 
1662 	skb_push(skb, sizeof(struct ipv6hdr));
1663 	skb_reset_network_header(skb);
1664 	hdr = ipv6_hdr(skb);
1665 
1666 	ip6_flow_hdr(hdr, v6_cork->tclass,
1667 		     ip6_make_flowlabel(net, skb, fl6->flowlabel,
1668 					np->autoflowlabel, fl6));
1669 	hdr->hop_limit = v6_cork->hop_limit;
1670 	hdr->nexthdr = proto;
1671 	hdr->saddr = fl6->saddr;
1672 	hdr->daddr = *final_dst;
1673 
1674 	skb->priority = sk->sk_priority;
1675 	skb->mark = sk->sk_mark;
1676 
1677 	skb_dst_set(skb, dst_clone(&rt->dst));
1678 	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1679 	if (proto == IPPROTO_ICMPV6) {
1680 		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1681 
1682 		ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
1683 		ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
1684 	}
1685 
1686 	ip6_cork_release(cork, v6_cork);
1687 out:
1688 	return skb;
1689 }
1690 
1691 int ip6_send_skb(struct sk_buff *skb)
1692 {
1693 	struct net *net = sock_net(skb->sk);
1694 	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
1695 	int err;
1696 
1697 	err = ip6_local_out(net, skb->sk, skb);
1698 	if (err) {
1699 		if (err > 0)
1700 			err = net_xmit_errno(err);
1701 		if (err)
1702 			IP6_INC_STATS(net, rt->rt6i_idev,
1703 				      IPSTATS_MIB_OUTDISCARDS);
1704 	}
1705 
1706 	return err;
1707 }
1708 
1709 int ip6_push_pending_frames(struct sock *sk)
1710 {
1711 	struct sk_buff *skb;
1712 
1713 	skb = ip6_finish_skb(sk);
1714 	if (!skb)
1715 		return 0;
1716 
1717 	return ip6_send_skb(skb);
1718 }
1719 EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
1720 
1721 static void __ip6_flush_pending_frames(struct sock *sk,
1722 				       struct sk_buff_head *queue,
1723 				       struct inet_cork_full *cork,
1724 				       struct inet6_cork *v6_cork)
1725 {
1726 	struct sk_buff *skb;
1727 
1728 	while ((skb = __skb_dequeue_tail(queue)) != NULL) {
1729 		if (skb_dst(skb))
1730 			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1731 				      IPSTATS_MIB_OUTDISCARDS);
1732 		kfree_skb(skb);
1733 	}
1734 
1735 	ip6_cork_release(cork, v6_cork);
1736 }
1737 
1738 void ip6_flush_pending_frames(struct sock *sk)
1739 {
1740 	__ip6_flush_pending_frames(sk, &sk->sk_write_queue,
1741 				   &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
1742 }
1743 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
1744 
1745 struct sk_buff *ip6_make_skb(struct sock *sk,
1746 			     int getfrag(void *from, char *to, int offset,
1747 					 int len, int odd, struct sk_buff *skb),
1748 			     void *from, int length, int transhdrlen,
1749 			     int hlimit, int tclass,
1750 			     struct ipv6_txoptions *opt, struct flowi6 *fl6,
1751 			     struct rt6_info *rt, unsigned int flags,
1752 			     int dontfrag)
1753 {
1754 	struct inet_cork_full cork;
1755 	struct inet6_cork v6_cork;
1756 	struct sk_buff_head queue;
1757 	int exthdrlen = (opt ? opt->opt_flen : 0);
1758 	int err;
1759 
1760 	if (flags & MSG_PROBE)
1761 		return NULL;
1762 
1763 	__skb_queue_head_init(&queue);
1764 
1765 	cork.base.flags = 0;
1766 	cork.base.addr = 0;
1767 	cork.base.opt = NULL;
1768 	v6_cork.opt = NULL;
1769 	err = ip6_setup_cork(sk, &cork, &v6_cork, hlimit, tclass, opt, rt, fl6);
1770 	if (err)
1771 		return ERR_PTR(err);
1772 
1773 	if (dontfrag < 0)
1774 		dontfrag = inet6_sk(sk)->dontfrag;
1775 
1776 	err = __ip6_append_data(sk, fl6, &queue, &cork.base, &v6_cork,
1777 				&current->task_frag, getfrag, from,
1778 				length + exthdrlen, transhdrlen + exthdrlen,
1779 				flags, dontfrag);
1780 	if (err) {
1781 		__ip6_flush_pending_frames(sk, &queue, &cork, &v6_cork);
1782 		return ERR_PTR(err);
1783 	}
1784 
1785 	return __ip6_make_skb(sk, &queue, &cork, &v6_cork);
1786 }
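
/*
 * Editor's note: a hedged sketch, not part of the original file.
 * ip6_make_skb() is the uncorked counterpart of ip6_append_data(): it
 * assembles the complete packet on a private queue and returns it, so
 * a single-shot datagram can be built and sent without touching
 * sk->sk_write_queue.  Helper names and parameters are illustrative.
 */
static int example_make_and_send(struct sock *sk, struct msghdr *msg,
				 int len, struct flowi6 *fl6,
				 struct rt6_info *rt)
{
	struct sk_buff *skb;

	skb = ip6_make_skb(sk, ip_generic_getfrag, msg, len,
			   0 /* transhdrlen */,
			   ip6_dst_hoplimit(&rt->dst), 0 /* tclass */,
			   NULL /* opt */, fl6, rt, MSG_DONTWAIT,
			   0 /* dontfrag */);
	if (IS_ERR(skb))
		return PTR_ERR(skb);
	if (!skb)	/* only possible with MSG_PROBE */
		return 0;

	return ip6_send_skb(skb);
}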
1787