xref: /openbmc/linux/net/ipv6/ip6_output.c (revision 400c2a45)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  *	IPv6 output functions
4  *	Linux INET6 implementation
5  *
6  *	Authors:
7  *	Pedro Roque		<roque@di.fc.ul.pt>
8  *
9  *	Based on linux/net/ipv4/ip_output.c
10  *
11  *	Changes:
12  *	A.N.Kuznetsov	:	arithmetic in fragmentation.
13  *				extension headers are implemented.
14  *				route changes now work.
15  *				ip6_forward does not confuse sniffers.
16  *				etc.
17  *
18  *      H. von Brand    :       Added missing #include <linux/string.h>
19  *	Imran Patel	:	frag id should be in NBO
20  *      Kazunori MIYAZAWA @USAGI
21  *			:       add ip6_append_data and related functions
22  *				for datagram xmit
23  */
24 
25 #include <linux/errno.h>
26 #include <linux/kernel.h>
27 #include <linux/string.h>
28 #include <linux/socket.h>
29 #include <linux/net.h>
30 #include <linux/netdevice.h>
31 #include <linux/if_arp.h>
32 #include <linux/in6.h>
33 #include <linux/tcp.h>
34 #include <linux/route.h>
35 #include <linux/module.h>
36 #include <linux/slab.h>
37 
38 #include <linux/bpf-cgroup.h>
39 #include <linux/netfilter.h>
40 #include <linux/netfilter_ipv6.h>
41 
42 #include <net/sock.h>
43 #include <net/snmp.h>
44 
45 #include <net/ipv6.h>
46 #include <net/ndisc.h>
47 #include <net/protocol.h>
48 #include <net/ip6_route.h>
49 #include <net/addrconf.h>
50 #include <net/rawv6.h>
51 #include <net/icmp.h>
52 #include <net/xfrm.h>
53 #include <net/checksum.h>
54 #include <linux/mroute6.h>
55 #include <net/l3mdev.h>
56 #include <net/lwtunnel.h>
57 #include <net/ip_tunnels.h>
58 
59 static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
60 {
61 	struct dst_entry *dst = skb_dst(skb);
62 	struct net_device *dev = dst->dev;
63 	const struct in6_addr *nexthop;
64 	struct neighbour *neigh;
65 	int ret;
66 
67 	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
68 		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
69 
70 		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
71 		    ((mroute6_is_socket(net, skb) &&
72 		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
73 		     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
74 					 &ipv6_hdr(skb)->saddr))) {
75 			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
76 
77 			/* Do not check for IFF_ALLMULTI; multicast routing
78 			   is not supported in any case.
79 			 */
80 			if (newskb)
81 				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
82 					net, sk, newskb, NULL, newskb->dev,
83 					dev_loopback_xmit);
84 
85 			if (ipv6_hdr(skb)->hop_limit == 0) {
86 				IP6_INC_STATS(net, idev,
87 					      IPSTATS_MIB_OUTDISCARDS);
88 				kfree_skb(skb);
89 				return 0;
90 			}
91 		}
92 
93 		IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);
94 
95 		if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
96 		    IPV6_ADDR_SCOPE_NODELOCAL &&
97 		    !(dev->flags & IFF_LOOPBACK)) {
98 			kfree_skb(skb);
99 			return 0;
100 		}
101 	}
102 
103 	if (lwtunnel_xmit_redirect(dst->lwtstate)) {
104 		int res = lwtunnel_xmit(skb);
105 
106 		if (res < 0 || res == LWTUNNEL_XMIT_DONE)
107 			return res;
108 	}
109 
110 	rcu_read_lock_bh();
111 	nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
112 	neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
113 	if (unlikely(!neigh))
114 		neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
115 	if (!IS_ERR(neigh)) {
116 		sock_confirm_neigh(skb, neigh);
117 		ret = neigh_output(neigh, skb, false);
118 		rcu_read_unlock_bh();
119 		return ret;
120 	}
121 	rcu_read_unlock_bh();
122 
123 	IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
124 	kfree_skb(skb);
125 	return -EINVAL;
126 }
127 
128 static int
129 ip6_finish_output_gso_slowpath_drop(struct net *net, struct sock *sk,
130 				    struct sk_buff *skb, unsigned int mtu)
131 {
132 	struct sk_buff *segs, *nskb;
133 	netdev_features_t features;
134 	int ret = 0;
135 
136 	/* Please see corresponding comment in ip_finish_output_gso
137 	 * describing the cases where GSO segment length exceeds the
138 	 * egress MTU.
139 	 */
140 	features = netif_skb_features(skb);
141 	segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
142 	if (IS_ERR_OR_NULL(segs)) {
143 		kfree_skb(skb);
144 		return -ENOMEM;
145 	}
146 
147 	consume_skb(skb);
148 
149 	skb_list_walk_safe(segs, segs, nskb) {
150 		int err;
151 
152 		skb_mark_not_on_list(segs);
153 		err = ip6_fragment(net, sk, segs, ip6_finish_output2);
154 		if (err && ret == 0)
155 			ret = err;
156 	}
157 
158 	return ret;
159 }
160 
161 static int __ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
162 {
163 	unsigned int mtu;
164 
165 #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
166 	/* Policy lookup after SNAT yielded a new policy */
167 	if (skb_dst(skb)->xfrm) {
168 		IPCB(skb)->flags |= IPSKB_REROUTED;
169 		return dst_output(net, sk, skb);
170 	}
171 #endif
172 
173 	mtu = ip6_skb_dst_mtu(skb);
174 	if (skb_is_gso(skb) && !skb_gso_validate_network_len(skb, mtu))
175 		return ip6_finish_output_gso_slowpath_drop(net, sk, skb, mtu);
176 
177 	if ((skb->len > mtu && !skb_is_gso(skb)) ||
178 	    dst_allfrag(skb_dst(skb)) ||
179 	    (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
180 		return ip6_fragment(net, sk, skb, ip6_finish_output2);
181 	else
182 		return ip6_finish_output2(net, sk, skb);
183 }
184 
185 static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
186 {
187 	int ret;
188 
189 	ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
190 	switch (ret) {
191 	case NET_XMIT_SUCCESS:
192 		return __ip6_finish_output(net, sk, skb);
193 	case NET_XMIT_CN:
194 		return __ip6_finish_output(net, sk, skb) ? : ret;
195 	default:
196 		kfree_skb(skb);
197 		return ret;
198 	}
199 }
200 
201 int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
202 {
203 	struct net_device *dev = skb_dst(skb)->dev, *indev = skb->dev;
204 	struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
205 
206 	skb->protocol = htons(ETH_P_IPV6);
207 	skb->dev = dev;
208 
209 	if (unlikely(idev->cnf.disable_ipv6)) {
210 		IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
211 		kfree_skb(skb);
212 		return 0;
213 	}
214 
215 	return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
216 			    net, sk, skb, indev, dev,
217 			    ip6_finish_output,
218 			    !(IP6CB(skb)->flags & IP6SKB_REROUTED));
219 }
220 
221 bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np)
222 {
223 	if (!np->autoflowlabel_set)
224 		return ip6_default_np_autolabel(net);
225 	else
226 		return np->autoflowlabel;
227 }
228 
229 /*
230  * xmit an sk_buff (used by TCP, SCTP and DCCP)
231  * Note : socket lock is not held for SYNACK packets, but might be modified
232  * by calls to skb_set_owner_w() and ipv6_local_error(),
233  * which are using proper atomic operations or spinlocks.
234  */
235 int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
236 	     __u32 mark, struct ipv6_txoptions *opt, int tclass, u32 priority)
237 {
238 	struct net *net = sock_net(sk);
239 	const struct ipv6_pinfo *np = inet6_sk(sk);
240 	struct in6_addr *first_hop = &fl6->daddr;
241 	struct dst_entry *dst = skb_dst(skb);
242 	unsigned int head_room;
243 	struct ipv6hdr *hdr;
244 	u8  proto = fl6->flowi6_proto;
245 	int seg_len = skb->len;
246 	int hlimit = -1;
247 	u32 mtu;
248 
249 	head_room = sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
250 	if (opt)
251 		head_room += opt->opt_nflen + opt->opt_flen;
252 
253 	if (unlikely(skb_headroom(skb) < head_room)) {
254 		struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
255 		if (!skb2) {
256 			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
257 				      IPSTATS_MIB_OUTDISCARDS);
258 			kfree_skb(skb);
259 			return -ENOBUFS;
260 		}
261 		if (skb->sk)
262 			skb_set_owner_w(skb2, skb->sk);
263 		consume_skb(skb);
264 		skb = skb2;
265 	}
266 
267 	if (opt) {
268 		seg_len += opt->opt_nflen + opt->opt_flen;
269 
270 		if (opt->opt_flen)
271 			ipv6_push_frag_opts(skb, opt, &proto);
272 
273 		if (opt->opt_nflen)
274 			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
275 					     &fl6->saddr);
276 	}
277 
278 	skb_push(skb, sizeof(struct ipv6hdr));
279 	skb_reset_network_header(skb);
280 	hdr = ipv6_hdr(skb);
281 
282 	/*
283 	 *	Fill in the IPv6 header
284 	 */
285 	if (np)
286 		hlimit = np->hop_limit;
287 	if (hlimit < 0)
288 		hlimit = ip6_dst_hoplimit(dst);
289 
290 	ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
291 				ip6_autoflowlabel(net, np), fl6));
292 
293 	hdr->payload_len = htons(seg_len);
294 	hdr->nexthdr = proto;
295 	hdr->hop_limit = hlimit;
296 
297 	hdr->saddr = fl6->saddr;
298 	hdr->daddr = *first_hop;
299 
300 	skb->protocol = htons(ETH_P_IPV6);
301 	skb->priority = priority;
302 	skb->mark = mark;
303 
304 	mtu = dst_mtu(dst);
305 	if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
306 		IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
307 			      IPSTATS_MIB_OUT, skb->len);
308 
309 		/* if egress device is enslaved to an L3 master device pass the
310 		 * skb to its handler for processing
311 		 */
312 		skb = l3mdev_ip6_out((struct sock *)sk, skb);
313 		if (unlikely(!skb))
314 			return 0;
315 
316 		/* hooks should never assume socket lock is held.
317 		 * we promote our socket to non const
318 		 */
319 		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
320 			       net, (struct sock *)sk, skb, NULL, dst->dev,
321 			       dst_output);
322 	}
323 
324 	skb->dev = dst->dev;
325 	/* ipv6_local_error() does not require socket lock,
326 	 * we promote our socket to non const
327 	 */
328 	ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);
329 
330 	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
331 	kfree_skb(skb);
332 	return -EMSGSIZE;
333 }
334 EXPORT_SYMBOL(ip6_xmit);
335 
336 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
337 {
338 	struct ip6_ra_chain *ra;
339 	struct sock *last = NULL;
340 
341 	read_lock(&ip6_ra_lock);
342 	for (ra = ip6_ra_chain; ra; ra = ra->next) {
343 		struct sock *sk = ra->sk;
344 		if (sk && ra->sel == sel &&
345 		    (!sk->sk_bound_dev_if ||
346 		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
347 			struct ipv6_pinfo *np = inet6_sk(sk);
348 
349 			if (np && np->rtalert_isolate &&
350 			    !net_eq(sock_net(sk), dev_net(skb->dev))) {
351 				continue;
352 			}
353 			if (last) {
354 				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
355 				if (skb2)
356 					rawv6_rcv(last, skb2);
357 			}
358 			last = sk;
359 		}
360 	}
361 
362 	if (last) {
363 		rawv6_rcv(last, skb);
364 		read_unlock(&ip6_ra_lock);
365 		return 1;
366 	}
367 	read_unlock(&ip6_ra_lock);
368 	return 0;
369 }
370 
371 static int ip6_forward_proxy_check(struct sk_buff *skb)
372 {
373 	struct ipv6hdr *hdr = ipv6_hdr(skb);
374 	u8 nexthdr = hdr->nexthdr;
375 	__be16 frag_off;
376 	int offset;
377 
378 	if (ipv6_ext_hdr(nexthdr)) {
379 		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
380 		if (offset < 0)
381 			return 0;
382 	} else
383 		offset = sizeof(struct ipv6hdr);
384 
385 	if (nexthdr == IPPROTO_ICMPV6) {
386 		struct icmp6hdr *icmp6;
387 
388 		if (!pskb_may_pull(skb, (skb_network_header(skb) +
389 					 offset + 1 - skb->data)))
390 			return 0;
391 
392 		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
393 
394 		switch (icmp6->icmp6_type) {
395 		case NDISC_ROUTER_SOLICITATION:
396 		case NDISC_ROUTER_ADVERTISEMENT:
397 		case NDISC_NEIGHBOUR_SOLICITATION:
398 		case NDISC_NEIGHBOUR_ADVERTISEMENT:
399 		case NDISC_REDIRECT:
400 			/* For reaction involving unicast neighbor discovery
401 			 * message destined to the proxied address, pass it to
402 			 * input function.
403 			 */
404 			return 1;
405 		default:
406 			break;
407 		}
408 	}
409 
410 	/*
411 	 * The proxying router can't forward traffic sent to a link-local
412 	 * address, so signal the sender and discard the packet. This
413 	 * behavior is clarified by the MIPv6 specification.
414 	 */
415 	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
416 		dst_link_failure(skb);
417 		return -1;
418 	}
419 
420 	return 0;
421 }
422 
423 static inline int ip6_forward_finish(struct net *net, struct sock *sk,
424 				     struct sk_buff *skb)
425 {
426 	struct dst_entry *dst = skb_dst(skb);
427 
428 	__IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
429 	__IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
430 
431 #ifdef CONFIG_NET_SWITCHDEV
432 	if (skb->offload_l3_fwd_mark) {
433 		consume_skb(skb);
434 		return 0;
435 	}
436 #endif
437 
438 	skb->tstamp = 0;
439 	return dst_output(net, sk, skb);
440 }
441 
442 static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
443 {
444 	if (skb->len <= mtu)
445 		return false;
446 
447 	/* ipv6 conntrack defrag sets max_frag_size + ignore_df */
448 	if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
449 		return true;
450 
451 	if (skb->ignore_df)
452 		return false;
453 
454 	if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
455 		return false;
456 
457 	return true;
458 }
459 
460 int ip6_forward(struct sk_buff *skb)
461 {
462 	struct inet6_dev *idev = __in6_dev_get_safely(skb->dev);
463 	struct dst_entry *dst = skb_dst(skb);
464 	struct ipv6hdr *hdr = ipv6_hdr(skb);
465 	struct inet6_skb_parm *opt = IP6CB(skb);
466 	struct net *net = dev_net(dst->dev);
467 	u32 mtu;
468 
469 	if (net->ipv6.devconf_all->forwarding == 0)
470 		goto error;
471 
472 	if (skb->pkt_type != PACKET_HOST)
473 		goto drop;
474 
475 	if (unlikely(skb->sk))
476 		goto drop;
477 
478 	if (skb_warn_if_lro(skb))
479 		goto drop;
480 
481 	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
482 		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
483 		goto drop;
484 	}
485 
486 	skb_forward_csum(skb);
487 
488 	/*
489 	 *	We DO NOT make any processing on
490 	 *	RA packets, pushing them to user level AS IS
491 	 *	without ane WARRANTY that application will be able
492 	 *	to interpret them. The reason is that we
493 	 *	cannot make anything clever here.
494 	 *
495 	 *	We are not end-node, so that if packet contains
496 	 *	AH/ESP, we cannot make anything.
497 	 *	Defragmentation also would be mistake, RA packets
498 	 *	cannot be fragmented, because there is no warranty
499 	 *	that different fragments will go along one path. --ANK
500 	 */
501 	if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
502 		if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
503 			return 0;
504 	}
505 
506 	/*
507 	 *	check and decrement ttl
508 	 */
509 	if (hdr->hop_limit <= 1) {
510 		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
511 		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
512 
513 		kfree_skb(skb);
514 		return -ETIMEDOUT;
515 	}
516 
517 	/* XXX: idev->cnf.proxy_ndp? */
518 	if (net->ipv6.devconf_all->proxy_ndp &&
519 	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
520 		int proxied = ip6_forward_proxy_check(skb);
521 		if (proxied > 0)
522 			return ip6_input(skb);
523 		else if (proxied < 0) {
524 			__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
525 			goto drop;
526 		}
527 	}
528 
529 	if (!xfrm6_route_forward(skb)) {
530 		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
531 		goto drop;
532 	}
533 	dst = skb_dst(skb);
534 
535 	/* IPv6 specs say nothing about it, but it is clear that we cannot
536 	   send redirects to source routed frames.
537 	   We don't send redirects to frames decapsulated from IPsec.
538 	 */
539 	if (IP6CB(skb)->iif == dst->dev->ifindex &&
540 	    opt->srcrt == 0 && !skb_sec_path(skb)) {
541 		struct in6_addr *target = NULL;
542 		struct inet_peer *peer;
543 		struct rt6_info *rt;
544 
545 		/*
546 		 *	incoming and outgoing devices are the same
547 		 *	send a redirect.
548 		 */
549 
550 		rt = (struct rt6_info *) dst;
551 		if (rt->rt6i_flags & RTF_GATEWAY)
552 			target = &rt->rt6i_gateway;
553 		else
554 			target = &hdr->daddr;
555 
556 		peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);
557 
558 		/* Limit redirects both by destination (here)
559 		   and by source (inside ndisc_send_redirect)
560 		 */
561 		if (inet_peer_xrlim_allow(peer, 1*HZ))
562 			ndisc_send_redirect(skb, target);
563 		if (peer)
564 			inet_putpeer(peer);
565 	} else {
566 		int addrtype = ipv6_addr_type(&hdr->saddr);
567 
568 		/* This check is security critical. */
569 		if (addrtype == IPV6_ADDR_ANY ||
570 		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
571 			goto error;
572 		if (addrtype & IPV6_ADDR_LINKLOCAL) {
573 			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
574 				    ICMPV6_NOT_NEIGHBOUR, 0);
575 			goto error;
576 		}
577 	}
578 
579 	mtu = ip6_dst_mtu_forward(dst);
580 	if (mtu < IPV6_MIN_MTU)
581 		mtu = IPV6_MIN_MTU;
582 
583 	if (ip6_pkt_too_big(skb, mtu)) {
584 		/* Again, force OUTPUT device used as source address */
585 		skb->dev = dst->dev;
586 		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
587 		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INTOOBIGERRORS);
588 		__IP6_INC_STATS(net, ip6_dst_idev(dst),
589 				IPSTATS_MIB_FRAGFAILS);
590 		kfree_skb(skb);
591 		return -EMSGSIZE;
592 	}
593 
594 	if (skb_cow(skb, dst->dev->hard_header_len)) {
595 		__IP6_INC_STATS(net, ip6_dst_idev(dst),
596 				IPSTATS_MIB_OUTDISCARDS);
597 		goto drop;
598 	}
599 
600 	hdr = ipv6_hdr(skb);
601 
602 	/* Mangling hops number delayed to point after skb COW */
603 
604 	hdr->hop_limit--;
605 
606 	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
607 		       net, NULL, skb, skb->dev, dst->dev,
608 		       ip6_forward_finish);
609 
610 error:
611 	__IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
612 drop:
613 	kfree_skb(skb);
614 	return -EINVAL;
615 }
616 
617 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
618 {
619 	to->pkt_type = from->pkt_type;
620 	to->priority = from->priority;
621 	to->protocol = from->protocol;
622 	skb_dst_drop(to);
623 	skb_dst_set(to, dst_clone(skb_dst(from)));
624 	to->dev = from->dev;
625 	to->mark = from->mark;
626 
627 	skb_copy_hash(to, from);
628 
629 #ifdef CONFIG_NET_SCHED
630 	to->tc_index = from->tc_index;
631 #endif
632 	nf_copy(to, from);
633 	skb_ext_copy(to, from);
634 	skb_copy_secmark(to, from);
635 }
636 
637 int ip6_fraglist_init(struct sk_buff *skb, unsigned int hlen, u8 *prevhdr,
638 		      u8 nexthdr, __be32 frag_id,
639 		      struct ip6_fraglist_iter *iter)
640 {
641 	unsigned int first_len;
642 	struct frag_hdr *fh;
643 
644 	/* BUILD HEADER */
645 	*prevhdr = NEXTHDR_FRAGMENT;
646 	iter->tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
647 	if (!iter->tmp_hdr)
648 		return -ENOMEM;
649 
650 	iter->frag = skb_shinfo(skb)->frag_list;
651 	skb_frag_list_init(skb);
652 
653 	iter->offset = 0;
654 	iter->hlen = hlen;
655 	iter->frag_id = frag_id;
656 	iter->nexthdr = nexthdr;
657 
658 	__skb_pull(skb, hlen);
659 	fh = __skb_push(skb, sizeof(struct frag_hdr));
660 	__skb_push(skb, hlen);
661 	skb_reset_network_header(skb);
662 	memcpy(skb_network_header(skb), iter->tmp_hdr, hlen);
663 
664 	fh->nexthdr = nexthdr;
665 	fh->reserved = 0;
666 	fh->frag_off = htons(IP6_MF);
667 	fh->identification = frag_id;
668 
669 	first_len = skb_pagelen(skb);
670 	skb->data_len = first_len - skb_headlen(skb);
671 	skb->len = first_len;
672 	ipv6_hdr(skb)->payload_len = htons(first_len - sizeof(struct ipv6hdr));
673 
674 	return 0;
675 }
676 EXPORT_SYMBOL(ip6_fraglist_init);
677 
678 void ip6_fraglist_prepare(struct sk_buff *skb,
679 			  struct ip6_fraglist_iter *iter)
680 {
681 	struct sk_buff *frag = iter->frag;
682 	unsigned int hlen = iter->hlen;
683 	struct frag_hdr *fh;
684 
685 	frag->ip_summed = CHECKSUM_NONE;
686 	skb_reset_transport_header(frag);
687 	fh = __skb_push(frag, sizeof(struct frag_hdr));
688 	__skb_push(frag, hlen);
689 	skb_reset_network_header(frag);
690 	memcpy(skb_network_header(frag), iter->tmp_hdr, hlen);
691 	iter->offset += skb->len - hlen - sizeof(struct frag_hdr);
692 	fh->nexthdr = iter->nexthdr;
693 	fh->reserved = 0;
694 	fh->frag_off = htons(iter->offset);
695 	if (frag->next)
696 		fh->frag_off |= htons(IP6_MF);
697 	fh->identification = iter->frag_id;
698 	ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
699 	ip6_copy_metadata(frag, skb);
700 }
701 EXPORT_SYMBOL(ip6_fraglist_prepare);
702 
703 void ip6_frag_init(struct sk_buff *skb, unsigned int hlen, unsigned int mtu,
704 		   unsigned short needed_tailroom, int hdr_room, u8 *prevhdr,
705 		   u8 nexthdr, __be32 frag_id, struct ip6_frag_state *state)
706 {
707 	state->prevhdr = prevhdr;
708 	state->nexthdr = nexthdr;
709 	state->frag_id = frag_id;
710 
711 	state->hlen = hlen;
712 	state->mtu = mtu;
713 
714 	state->left = skb->len - hlen;	/* Space per frame */
715 	state->ptr = hlen;		/* Where to start from */
716 
717 	state->hroom = hdr_room;
718 	state->troom = needed_tailroom;
719 
720 	state->offset = 0;
721 }
722 EXPORT_SYMBOL(ip6_frag_init);
723 
724 struct sk_buff *ip6_frag_next(struct sk_buff *skb, struct ip6_frag_state *state)
725 {
726 	u8 *prevhdr = state->prevhdr, *fragnexthdr_offset;
727 	struct sk_buff *frag;
728 	struct frag_hdr *fh;
729 	unsigned int len;
730 
731 	len = state->left;
732 	/* IF: it doesn't fit, use 'mtu' - the data space left */
733 	if (len > state->mtu)
734 		len = state->mtu;
735 	/* IF: we are not sending up to and including the packet end
736 	   then align the next start on an eight byte boundary */
737 	if (len < state->left)
738 		len &= ~7;
739 
740 	/* Allocate buffer */
741 	frag = alloc_skb(len + state->hlen + sizeof(struct frag_hdr) +
742 			 state->hroom + state->troom, GFP_ATOMIC);
743 	if (!frag)
744 		return ERR_PTR(-ENOMEM);
745 
746 	/*
747 	 *	Set up data on packet
748 	 */
749 
750 	ip6_copy_metadata(frag, skb);
751 	skb_reserve(frag, state->hroom);
752 	skb_put(frag, len + state->hlen + sizeof(struct frag_hdr));
753 	skb_reset_network_header(frag);
754 	fh = (struct frag_hdr *)(skb_network_header(frag) + state->hlen);
755 	frag->transport_header = (frag->network_header + state->hlen +
756 				  sizeof(struct frag_hdr));
757 
758 	/*
759 	 *	Charge the memory for the fragment to any owner
760 	 *	it might possess
761 	 */
762 	if (skb->sk)
763 		skb_set_owner_w(frag, skb->sk);
764 
765 	/*
766 	 *	Copy the packet header into the new buffer.
767 	 */
768 	skb_copy_from_linear_data(skb, skb_network_header(frag), state->hlen);
769 
770 	fragnexthdr_offset = skb_network_header(frag);
771 	fragnexthdr_offset += prevhdr - skb_network_header(skb);
772 	*fragnexthdr_offset = NEXTHDR_FRAGMENT;
773 
774 	/*
775 	 *	Build fragment header.
776 	 */
777 	fh->nexthdr = state->nexthdr;
778 	fh->reserved = 0;
779 	fh->identification = state->frag_id;
780 
781 	/*
782 	 *	Copy a block of the IP datagram.
783 	 */
784 	BUG_ON(skb_copy_bits(skb, state->ptr, skb_transport_header(frag),
785 			     len));
786 	state->left -= len;
787 
788 	fh->frag_off = htons(state->offset);
789 	if (state->left > 0)
790 		fh->frag_off |= htons(IP6_MF);
791 	ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
792 
793 	state->ptr += len;
794 	state->offset += len;
795 
796 	return frag;
797 }
798 EXPORT_SYMBOL(ip6_frag_next);
799 
800 int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
801 		 int (*output)(struct net *, struct sock *, struct sk_buff *))
802 {
803 	struct sk_buff *frag;
804 	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
805 	struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
806 				inet6_sk(skb->sk) : NULL;
807 	struct ip6_frag_state state;
808 	unsigned int mtu, hlen, nexthdr_offset;
809 	ktime_t tstamp = skb->tstamp;
810 	int hroom, err = 0;
811 	__be32 frag_id;
812 	u8 *prevhdr, nexthdr = 0;
813 
814 	err = ip6_find_1stfragopt(skb, &prevhdr);
815 	if (err < 0)
816 		goto fail;
817 	hlen = err;
818 	nexthdr = *prevhdr;
819 	nexthdr_offset = prevhdr - skb_network_header(skb);
820 
821 	mtu = ip6_skb_dst_mtu(skb);
822 
823 	/* We must not fragment if the socket is set to force MTU discovery
824 	 * or if the skb it not generated by a local socket.
825 	 */
826 	if (unlikely(!skb->ignore_df && skb->len > mtu))
827 		goto fail_toobig;
828 
829 	if (IP6CB(skb)->frag_max_size) {
830 		if (IP6CB(skb)->frag_max_size > mtu)
831 			goto fail_toobig;
832 
833 		/* don't send fragments larger than what we received */
834 		mtu = IP6CB(skb)->frag_max_size;
835 		if (mtu < IPV6_MIN_MTU)
836 			mtu = IPV6_MIN_MTU;
837 	}
838 
839 	if (np && np->frag_size < mtu) {
840 		if (np->frag_size)
841 			mtu = np->frag_size;
842 	}
843 	if (mtu < hlen + sizeof(struct frag_hdr) + 8)
844 		goto fail_toobig;
845 	mtu -= hlen + sizeof(struct frag_hdr);
846 
847 	frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
848 				    &ipv6_hdr(skb)->saddr);
849 
850 	if (skb->ip_summed == CHECKSUM_PARTIAL &&
851 	    (err = skb_checksum_help(skb)))
852 		goto fail;
853 
854 	prevhdr = skb_network_header(skb) + nexthdr_offset;
855 	hroom = LL_RESERVED_SPACE(rt->dst.dev);
856 	if (skb_has_frag_list(skb)) {
857 		unsigned int first_len = skb_pagelen(skb);
858 		struct ip6_fraglist_iter iter;
859 		struct sk_buff *frag2;
860 
861 		if (first_len - hlen > mtu ||
862 		    ((first_len - hlen) & 7) ||
863 		    skb_cloned(skb) ||
864 		    skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
865 			goto slow_path;
866 
867 		skb_walk_frags(skb, frag) {
868 			/* Correct geometry. */
869 			if (frag->len > mtu ||
870 			    ((frag->len & 7) && frag->next) ||
871 			    skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
872 				goto slow_path_clean;
873 
874 			/* Partially cloned skb? */
875 			if (skb_shared(frag))
876 				goto slow_path_clean;
877 
878 			BUG_ON(frag->sk);
879 			if (skb->sk) {
880 				frag->sk = skb->sk;
881 				frag->destructor = sock_wfree;
882 			}
883 			skb->truesize -= frag->truesize;
884 		}
885 
886 		err = ip6_fraglist_init(skb, hlen, prevhdr, nexthdr, frag_id,
887 					&iter);
888 		if (err < 0)
889 			goto fail;
890 
891 		for (;;) {
892 			/* Prepare header of the next frame,
893 			 * before previous one went down. */
894 			if (iter.frag)
895 				ip6_fraglist_prepare(skb, &iter);
896 
897 			skb->tstamp = tstamp;
898 			err = output(net, sk, skb);
899 			if (!err)
900 				IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
901 					      IPSTATS_MIB_FRAGCREATES);
902 
903 			if (err || !iter.frag)
904 				break;
905 
906 			skb = ip6_fraglist_next(&iter);
907 		}
908 
909 		kfree(iter.tmp_hdr);
910 
911 		if (err == 0) {
912 			IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
913 				      IPSTATS_MIB_FRAGOKS);
914 			return 0;
915 		}
916 
917 		kfree_skb_list(iter.frag);
918 
919 		IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
920 			      IPSTATS_MIB_FRAGFAILS);
921 		return err;
922 
923 slow_path_clean:
924 		skb_walk_frags(skb, frag2) {
925 			if (frag2 == frag)
926 				break;
927 			frag2->sk = NULL;
928 			frag2->destructor = NULL;
929 			skb->truesize += frag2->truesize;
930 		}
931 	}
932 
933 slow_path:
934 	/*
935 	 *	Fragment the datagram.
936 	 */
937 
938 	ip6_frag_init(skb, hlen, mtu, rt->dst.dev->needed_tailroom,
939 		      LL_RESERVED_SPACE(rt->dst.dev), prevhdr, nexthdr, frag_id,
940 		      &state);
941 
942 	/*
943 	 *	Keep copying data until we run out.
944 	 */
945 
946 	while (state.left > 0) {
947 		frag = ip6_frag_next(skb, &state);
948 		if (IS_ERR(frag)) {
949 			err = PTR_ERR(frag);
950 			goto fail;
951 		}
952 
953 		/*
954 		 *	Put this fragment into the sending queue.
955 		 */
956 		frag->tstamp = tstamp;
957 		err = output(net, sk, frag);
958 		if (err)
959 			goto fail;
960 
961 		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
962 			      IPSTATS_MIB_FRAGCREATES);
963 	}
964 	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
965 		      IPSTATS_MIB_FRAGOKS);
966 	consume_skb(skb);
967 	return err;
968 
969 fail_toobig:
970 	if (skb->sk && dst_allfrag(skb_dst(skb)))
971 		sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);
972 
973 	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
974 	err = -EMSGSIZE;
975 
976 fail:
977 	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
978 		      IPSTATS_MIB_FRAGFAILS);
979 	kfree_skb(skb);
980 	return err;
981 }
982 
983 static inline int ip6_rt_check(const struct rt6key *rt_key,
984 			       const struct in6_addr *fl_addr,
985 			       const struct in6_addr *addr_cache)
986 {
987 	return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
988 		(!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
989 }
990 
991 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
992 					  struct dst_entry *dst,
993 					  const struct flowi6 *fl6)
994 {
995 	struct ipv6_pinfo *np = inet6_sk(sk);
996 	struct rt6_info *rt;
997 
998 	if (!dst)
999 		goto out;
1000 
1001 	if (dst->ops->family != AF_INET6) {
1002 		dst_release(dst);
1003 		return NULL;
1004 	}
1005 
1006 	rt = (struct rt6_info *)dst;
1007 	/* Yes, checking route validity in not connected
1008 	 * case is not very simple. Take into account,
1009 	 * that we do not support routing by source, TOS,
1010 	 * and MSG_DONTROUTE		--ANK (980726)
1011 	 *
1012 	 * 1. ip6_rt_check(): If route was host route,
1013 	 *    check that cached destination is current.
1014 	 *    If it is network route, we still may
1015 	 *    check its validity using saved pointer
1016 	 *    to the last used address: daddr_cache.
1017 	 *    We do not want to save whole address now,
1018 	 *    (because main consumer of this service
1019 	 *    is tcp, which has not this problem),
1020 	 *    so that the last trick works only on connected
1021 	 *    sockets.
1022 	 * 2. oif also should be the same.
1023 	 */
1024 	if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
1025 #ifdef CONFIG_IPV6_SUBTREES
1026 	    ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
1027 #endif
1028 	   (!(fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) &&
1029 	      (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex))) {
1030 		dst_release(dst);
1031 		dst = NULL;
1032 	}
1033 
1034 out:
1035 	return dst;
1036 }
1037 
/*
 * Common back end of the ip6_dst_lookup*() helpers in this file.
 *
 * @net: namespace handed to every route / address lookup done here
 * @sk:  socket passed to the route lookups; when non-NULL its srcprefs
 *       are used for source address selection
 * @dst: in/out route pointer.  A route lookup is performed here when *dst
 *       is NULL on entry.
 * @fl6: flow to route.  If fl6->saddr is the unspecified address it is
 *       filled in via ip6_route_get_saddr().
 *
 * Returns 0 with *dst pointing at a route whose ->error is 0, or a negative
 * errno, in which case *dst has been released and reset to NULL.
 */
static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
			       struct dst_entry **dst, struct flowi6 *fl6)
{
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	struct neighbour *n;
	struct rt6_info *rt;
#endif
	int err;
	int flags = 0;

	/* The correct way to handle this would be to do
	 * ip6_route_get_saddr, and then ip6_route_output; however,
	 * the route-specific preferred source forces the
	 * ip6_route_output call _before_ ip6_route_get_saddr.
	 *
	 * In source specific routing (no src=any default route),
	 * ip6_route_output will fail given src=any saddr, though, so
	 * that's why we try it again later.
	 */
	if (ipv6_addr_any(&fl6->saddr) && (!*dst || !(*dst)->error)) {
		struct fib6_info *from;
		struct rt6_info *rt;
		/* remember whether the caller supplied the route */
		bool had_dst = *dst != NULL;

		if (!had_dst)
			*dst = ip6_route_output(net, sk, fl6);
		/* an error route contributes no fib6_info to saddr selection */
		rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;

		/* rt->from is only dereferenced inside the RCU read section */
		rcu_read_lock();
		from = rt ? rcu_dereference(rt->from) : NULL;
		err = ip6_route_get_saddr(net, from, &fl6->daddr,
					  sk ? inet6_sk(sk)->srcprefs : 0,
					  &fl6->saddr);
		rcu_read_unlock();

		if (err)
			goto out_err_release;

		/* If we had an erroneous initial result, pretend it
		 * never existed and let the SA-enabled version take
		 * over.
		 */
		if (!had_dst && (*dst)->error) {
			dst_release(*dst);
			*dst = NULL;
		}

		/* only set on this path, i.e. when saddr was unspecified */
		if (fl6->flowi6_oif)
			flags |= RT6_LOOKUP_F_IFACE;
	}

	/* (re)do the lookup, now with whatever saddr has been selected */
	if (!*dst)
		*dst = ip6_route_output_flags(net, sk, fl6, flags);

	err = (*dst)->error;
	if (err)
		goto out_err_release;

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	/*
	 * Here if the dst entry we've looked up
	 * has a neighbour entry that is in the INCOMPLETE
	 * state and the src address from the flow is
	 * marked as OPTIMISTIC, we release the found
	 * dst entry and replace it instead with the
	 * dst entry of the nexthop router
	 */
	rt = (struct rt6_info *) *dst;
	rcu_read_lock_bh();
	n = __ipv6_neigh_lookup_noref(rt->dst.dev,
				      rt6_nexthop(rt, &fl6->daddr));
	/*
	 * err is used here only as a local trigger: a neighbour entry that
	 * exists but has no NUD_VALID bit set selects the block below.
	 */
	err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
	rcu_read_unlock_bh();

	if (err) {
		struct inet6_ifaddr *ifp;
		struct flowi6 fl_gw6;
		int redirect;

		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
				      (*dst)->dev, 1);

		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
		/* drop the reference returned by ipv6_get_ifaddr() */
		if (ifp)
			in6_ifa_put(ifp);

		if (redirect) {
			/*
			 * We need to get the dst entry for the
			 * default router instead
			 */
			dst_release(*dst);
			/* same flow, but with the destination zeroed */
			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
			*dst = ip6_route_output(net, sk, &fl_gw6);
			err = (*dst)->error;
			if (err)
				goto out_err_release;
		}
		/*
		 * Without a redirect the -EINVAL set above is not returned:
		 * execution falls through to the checks below and the
		 * function can still return 0.
		 */
	}
#endif
	/*
	 * A v4-mapped source address is only accepted together with a
	 * v4-mapped or unspecified destination address.
	 */
	if (ipv6_addr_v4mapped(&fl6->saddr) &&
	    !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
		err = -EAFNOSUPPORT;
		goto out_err_release;
	}

	return 0;

out_err_release:
	/* every failure path hands back *dst == NULL */
	dst_release(*dst);
	*dst = NULL;

	if (err == -ENETUNREACH)
		IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
	return err;
}
1155 
/**
 *	ip6_dst_lookup - perform route lookup on flow
 *	@net: Network namespace to perform lookup in
 *	@sk: socket which provides route info
 *	@dst: pointer to dst_entry * for result
 *	@fl6: flow to lookup
 *
 *	Any value held in *@dst on entry is discarded (it is overwritten
 *	with NULL, not released) and a fresh route lookup is performed for
 *	the given flow.
 *
 *	It returns zero on success, or a standard errno code on error.
 */
int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
		   struct flowi6 *fl6)
{
	int rc;

	/* start without a candidate route so the tail does the lookup */
	*dst = NULL;
	rc = ip6_dst_lookup_tail(net, sk, dst, fl6);

	return rc;
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1174 
1175 /**
1176  *	ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1177  *	@net: Network namespace to perform lookup in
1178  *	@sk: socket which provides route info
1179  *	@fl6: flow to lookup
1180  *	@final_dst: final destination address for ipsec lookup
1181  *
1182  *	This function performs a route lookup on the given flow.
1183  *
1184  *	It returns a valid dst pointer on success, or a pointer encoded
1185  *	error code.
1186  */
1187 struct dst_entry *ip6_dst_lookup_flow(struct net *net, const struct sock *sk, struct flowi6 *fl6,
1188 				      const struct in6_addr *final_dst)
1189 {
1190 	struct dst_entry *dst = NULL;
1191 	int err;
1192 
1193 	err = ip6_dst_lookup_tail(net, sk, &dst, fl6);
1194 	if (err)
1195 		return ERR_PTR(err);
1196 	if (final_dst)
1197 		fl6->daddr = *final_dst;
1198 
1199 	return xfrm_lookup_route(net, dst, flowi6_to_flowi(fl6), sk, 0);
1200 }
1201 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
1202 
1203 /**
1204  *	ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1205  *	@sk: socket which provides the dst cache and route info
1206  *	@fl6: flow to lookup
1207  *	@final_dst: final destination address for ipsec lookup
1208  *	@connected: whether @sk is connected or not
1209  *
1210  *	This function performs a route lookup on the given flow with the
1211  *	possibility of using the cached route in the socket if it is valid.
1212  *	It will take the socket dst lock when operating on the dst cache.
1213  *	As a result, this function can only be used in process context.
1214  *
1215  *	In addition, for a connected socket, cache the dst in the socket
1216  *	if the current cache is not valid.
1217  *
1218  *	It returns a valid dst pointer on success, or a pointer encoded
1219  *	error code.
1220  */
1221 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1222 					 const struct in6_addr *final_dst,
1223 					 bool connected)
1224 {
1225 	struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1226 
1227 	dst = ip6_sk_dst_check(sk, dst, fl6);
1228 	if (dst)
1229 		return dst;
1230 
1231 	dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_dst);
1232 	if (connected && !IS_ERR(dst))
1233 		ip6_sk_dst_store_flow(sk, dst_clone(dst), fl6);
1234 
1235 	return dst;
1236 }
1237 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1238 
1239 /**
1240  *      ip6_dst_lookup_tunnel - perform route lookup on tunnel
1241  *      @skb: Packet for which lookup is done
1242  *      @dev: Tunnel device
1243  *      @net: Network namespace of tunnel device
1244  *      @sock: Socket which provides route info
1245  *      @saddr: Memory to store the src ip address
1246  *      @info: Tunnel information
1247  *      @protocol: IP protocol
1248  *      @use_cache: Flag to enable cache usage
1249  *      This function performs a route lookup on a tunnel
1250  *
1251  *      It returns a valid dst pointer and stores src address to be used in
1252  *      tunnel in param saddr on success, else a pointer encoded error code.
1253  */
1254 
1255 struct dst_entry *ip6_dst_lookup_tunnel(struct sk_buff *skb,
1256 					struct net_device *dev,
1257 					struct net *net,
1258 					struct socket *sock,
1259 					struct in6_addr *saddr,
1260 					const struct ip_tunnel_info *info,
1261 					u8 protocol,
1262 					bool use_cache)
1263 {
1264 	struct dst_entry *dst = NULL;
1265 #ifdef CONFIG_DST_CACHE
1266 	struct dst_cache *dst_cache;
1267 #endif
1268 	struct flowi6 fl6;
1269 	__u8 prio;
1270 
1271 #ifdef CONFIG_DST_CACHE
1272 	dst_cache = (struct dst_cache *)&info->dst_cache;
1273 	if (use_cache) {
1274 		dst = dst_cache_get_ip6(dst_cache, saddr);
1275 		if (dst)
1276 			return dst;
1277 	}
1278 #endif
1279 	memset(&fl6, 0, sizeof(fl6));
1280 	fl6.flowi6_mark = skb->mark;
1281 	fl6.flowi6_proto = protocol;
1282 	fl6.daddr = info->key.u.ipv6.dst;
1283 	fl6.saddr = info->key.u.ipv6.src;
1284 	prio = info->key.tos;
1285 	fl6.flowlabel = ip6_make_flowinfo(RT_TOS(prio),
1286 					  info->key.label);
1287 
1288 	dst = ipv6_stub->ipv6_dst_lookup_flow(net, sock->sk, &fl6,
1289 					      NULL);
1290 	if (IS_ERR(dst)) {
1291 		netdev_dbg(dev, "no route to %pI6\n", &fl6.daddr);
1292 		return ERR_PTR(-ENETUNREACH);
1293 	}
1294 	if (dst->dev == dev) { /* is this necessary? */
1295 		netdev_dbg(dev, "circular route to %pI6\n", &fl6.daddr);
1296 		dst_release(dst);
1297 		return ERR_PTR(-ELOOP);
1298 	}
1299 #ifdef CONFIG_DST_CACHE
1300 	if (use_cache)
1301 		dst_cache_set_ip6(dst_cache, dst, &fl6.saddr);
1302 #endif
1303 	*saddr = fl6.saddr;
1304 	return dst;
1305 }
1306 EXPORT_SYMBOL_GPL(ip6_dst_lookup_tunnel);
1307 
1308 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1309 					       gfp_t gfp)
1310 {
1311 	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1312 }
1313 
1314 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1315 						gfp_t gfp)
1316 {
1317 	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1318 }
1319 
1320 static void ip6_append_data_mtu(unsigned int *mtu,
1321 				int *maxfraglen,
1322 				unsigned int fragheaderlen,
1323 				struct sk_buff *skb,
1324 				struct rt6_info *rt,
1325 				unsigned int orig_mtu)
1326 {
1327 	if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1328 		if (!skb) {
1329 			/* first fragment, reserve header_len */
1330 			*mtu = orig_mtu - rt->dst.header_len;
1331 
1332 		} else {
1333 			/*
1334 			 * this fragment is not first, the headers
1335 			 * space is regarded as data space.
1336 			 */
1337 			*mtu = orig_mtu;
1338 		}
1339 		*maxfraglen = ((*mtu - fragheaderlen) & ~7)
1340 			      + fragheaderlen - sizeof(struct frag_hdr);
1341 	}
1342 }
1343 
1344 static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
1345 			  struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
1346 			  struct rt6_info *rt, struct flowi6 *fl6)
1347 {
1348 	struct ipv6_pinfo *np = inet6_sk(sk);
1349 	unsigned int mtu;
1350 	struct ipv6_txoptions *opt = ipc6->opt;
1351 
1352 	/*
1353 	 * setup for corking
1354 	 */
1355 	if (opt) {
1356 		if (WARN_ON(v6_cork->opt))
1357 			return -EINVAL;
1358 
1359 		v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
1360 		if (unlikely(!v6_cork->opt))
1361 			return -ENOBUFS;
1362 
1363 		v6_cork->opt->tot_len = sizeof(*opt);
1364 		v6_cork->opt->opt_flen = opt->opt_flen;
1365 		v6_cork->opt->opt_nflen = opt->opt_nflen;
1366 
1367 		v6_cork->opt->dst0opt = ip6_opt_dup(opt->dst0opt,
1368 						    sk->sk_allocation);
1369 		if (opt->dst0opt && !v6_cork->opt->dst0opt)
1370 			return -ENOBUFS;
1371 
1372 		v6_cork->opt->dst1opt = ip6_opt_dup(opt->dst1opt,
1373 						    sk->sk_allocation);
1374 		if (opt->dst1opt && !v6_cork->opt->dst1opt)
1375 			return -ENOBUFS;
1376 
1377 		v6_cork->opt->hopopt = ip6_opt_dup(opt->hopopt,
1378 						   sk->sk_allocation);
1379 		if (opt->hopopt && !v6_cork->opt->hopopt)
1380 			return -ENOBUFS;
1381 
1382 		v6_cork->opt->srcrt = ip6_rthdr_dup(opt->srcrt,
1383 						    sk->sk_allocation);
1384 		if (opt->srcrt && !v6_cork->opt->srcrt)
1385 			return -ENOBUFS;
1386 
1387 		/* need source address above miyazawa*/
1388 	}
1389 	dst_hold(&rt->dst);
1390 	cork->base.dst = &rt->dst;
1391 	cork->fl.u.ip6 = *fl6;
1392 	v6_cork->hop_limit = ipc6->hlimit;
1393 	v6_cork->tclass = ipc6->tclass;
1394 	if (rt->dst.flags & DST_XFRM_TUNNEL)
1395 		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1396 		      READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst);
1397 	else
1398 		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1399 			READ_ONCE(rt->dst.dev->mtu) : dst_mtu(xfrm_dst_path(&rt->dst));
1400 	if (np->frag_size < mtu) {
1401 		if (np->frag_size)
1402 			mtu = np->frag_size;
1403 	}
1404 	if (mtu < IPV6_MIN_MTU)
1405 		return -EINVAL;
1406 	cork->base.fragsize = mtu;
1407 	cork->base.gso_size = ipc6->gso_size;
1408 	cork->base.tx_flags = 0;
1409 	cork->base.mark = ipc6->sockc.mark;
1410 	sock_tx_timestamp(sk, ipc6->sockc.tsflags, &cork->base.tx_flags);
1411 
1412 	if (dst_allfrag(xfrm_dst_path(&rt->dst)))
1413 		cork->base.flags |= IPCORK_ALLFRAG;
1414 	cork->base.length = 0;
1415 
1416 	cork->base.transmit_time = ipc6->sockc.transmit_time;
1417 
1418 	return 0;
1419 }
1420 
/*
 * __ip6_append_data - append data to the skbs pending on @queue
 *
 * @sk:          socket the skbs are charged to
 * @fl6:         flow; used in this function only when reporting errors
 * @queue:       list of pending skbs; data is added to its tail skb or to
 *               newly allocated skbs queued behind it
 * @cork:        base cork state (dst, fragsize, gso_size, flags, tx_flags,
 *               running length)
 * @v6_cork:     IPv6 cork state; only ->opt is read here
 * @pfrag:       page fragment used when data is placed in skb page frags
 * @getfrag:     callback that copies @len bytes at @offset of @from to @to;
 *               a negative return is treated as failure
 * @from:        opaque source handed to @getfrag (and to
 *               skb_zerocopy_iter_dgram() on the zerocopy path)
 * @length:      number of bytes to append, including @transhdrlen
 * @transhdrlen: header length accounted to the first skb allocated; it is
 *               reset to 0 after that skb
 * @flags:       MSG_* flags; MSG_MORE, MSG_ZEROCOPY, MSG_DONTWAIT and
 *               MSG_CONFIRM are examined
 * @ipc6:        only ->dontfrag is read here
 *
 * Returns 0 on success or a negative errno.  On the "error" path
 * cork->length is reduced by the bytes that were not appended; skbs already
 * queued stay on @queue.
 */
static int __ip6_append_data(struct sock *sk,
			     struct flowi6 *fl6,
			     struct sk_buff_head *queue,
			     struct inet_cork *cork,
			     struct inet6_cork *v6_cork,
			     struct page_frag *pfrag,
			     int getfrag(void *from, char *to, int offset,
					 int len, int odd, struct sk_buff *skb),
			     void *from, int length, int transhdrlen,
			     unsigned int flags, struct ipcm6_cookie *ipc6)
{
	struct sk_buff *skb, *skb_prev = NULL;
	unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
	struct ubuf_info *uarg = NULL;
	int exthdrlen = 0;
	int dst_exthdrlen = 0;
	int hh_len;
	int copy;
	int err;
	int offset = 0;
	u32 tskey = 0;
	struct rt6_info *rt = (struct rt6_info *)cork->dst;
	struct ipv6_txoptions *opt = v6_cork->opt;
	int csummode = CHECKSUM_NONE;
	unsigned int maxnonfragsize, headersize;
	/* truesize accumulated locally, added to sk_wmem_alloc on exit */
	unsigned int wmem_alloc_delta = 0;
	bool paged, extra_uref = false;

	/* extension header room is only needed when the queue is empty */
	skb = skb_peek_tail(queue);
	if (!skb) {
		exthdrlen = opt ? opt->opt_flen : 0;
		dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
	}

	/*
	 * When a GSO size is set, data goes into page frags and the size
	 * limit becomes IP6_MAX_MTU instead of the corked fragment size.
	 */
	paged = !!cork->gso_size;
	mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize;
	orig_mtu = mtu;

	if (cork->tx_flags & SKBTX_ANY_SW_TSTAMP &&
	    sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
		tskey = sk->sk_tskey++;

	hh_len = LL_RESERVED_SPACE(rt->dst.dev);

	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
			(opt ? opt->opt_nflen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
		     sizeof(struct frag_hdr);

	headersize = sizeof(struct ipv6hdr) +
		     (opt ? opt->opt_flen + opt->opt_nflen : 0) +
		     (dst_allfrag(&rt->dst) ?
		      sizeof(struct frag_hdr) : 0) +
		     rt->rt6i_nfheader_len;

	/* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit
	 * the first fragment
	 */
	if (headersize + transhdrlen > mtu)
		goto emsgsize;

	/* dontfrag on UDP/RAW: report the usable size and refuse */
	if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
	    (sk->sk_protocol == IPPROTO_UDP ||
	     sk->sk_protocol == IPPROTO_RAW)) {
		ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
				sizeof(struct ipv6hdr));
		goto emsgsize;
	}

	if (ip6_sk_ignore_df(sk))
		maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
	else
		maxnonfragsize = mtu;

	if (cork->length + length > maxnonfragsize - headersize) {
emsgsize:
		/* clamp at 0: headersize may exceed mtu on the first goto */
		pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0);
		ipv6_local_error(sk, EMSGSIZE, fl6, pmtu);
		return -EMSGSIZE;
	}

	/* CHECKSUM_PARTIAL only with no extension headers and when
	 * we are not going to fragment
	 */
	if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
	    headersize == sizeof(struct ipv6hdr) &&
	    length <= mtu - headersize &&
	    (!(flags & MSG_MORE) || cork->gso_size) &&
	    rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
		csummode = CHECKSUM_PARTIAL;

	if (flags & MSG_ZEROCOPY && length && sock_flag(sk, SOCK_ZEROCOPY)) {
		uarg = sock_zerocopy_realloc(sk, length, skb_zcopy(skb));
		if (!uarg)
			return -ENOBUFS;
		extra_uref = !skb_zcopy(skb);	/* only ref on new uarg */
		if (rt->dst.dev->features & NETIF_F_SG &&
		    csummode == CHECKSUM_PARTIAL) {
			paged = true;
		} else {
			/* no SG or no partial checksum: data will be copied */
			uarg->zerocopy = 0;
			skb_zcopy_set(skb, uarg, &extra_uref);
		}
	}

	/*
	 * Let's try using as much space as possible.
	 * Use MTU if total length of the message fits into the MTU.
	 * Otherwise, we need to reserve fragment header and
	 * fragment alignment (= 8-15 octets, in total).
	 *
	 * Note that we may need to "move" the data from the tail
	 * of the buffer to the new fragment when we split
	 * the message.
	 *
	 * FIXME: It may be fragmented into multiple chunks
	 *        at once if non-fragmentable extension headers
	 *        are too large.
	 * --yoshfuji
	 */

	cork->length += length;
	/* empty queue: jump straight into the allocation branch of the loop */
	if (!skb)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into current packet. */
		copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;

		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen;
			unsigned int pagedlen;
alloc_new_skb:
			/* There's no room in the current skb */
			/* fraggap: bytes of the current skb beyond maxfraglen */
			if (skb)
				fraggap = skb->len - maxfraglen;
			else
				fraggap = 0;
			/* update mtu and maxfraglen if necessary */
			if (!skb || !skb_prev)
				ip6_append_data_mtu(&mtu, &maxfraglen,
						    fragheaderlen, skb, rt,
						    orig_mtu);

			skb_prev = skb;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;

			if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
				datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
			fraglen = datalen + fragheaderlen;
			pagedlen = 0;

			/*
			 * Linear allocation size: a full mtu when more data is
			 * announced and the device lacks SG, the exact fragment
			 * length when not paged, otherwise at most MAX_HEADER
			 * with the remainder accounted as paged data.
			 */
			if ((flags & MSG_MORE) &&
			    !(rt->dst.dev->features&NETIF_F_SG))
				alloclen = mtu;
			else if (!paged)
				alloclen = fraglen;
			else {
				alloclen = min_t(int, fraglen, MAX_HEADER);
				pagedlen = fraglen - alloclen;
			}

			alloclen += dst_exthdrlen;

			if (datalen != length + fraggap) {
				/*
				 * this is not the last fragment, the trailer
				 * space is regarded as data space.
				 */
				datalen += rt->dst.trailer_len;
			}

			alloclen += rt->dst.trailer_len;
			fraglen = datalen + fragheaderlen;

			/*
			 * We just reserve space for fragment header.
			 * Note: this may be overallocation if the message
			 * (without MSG_MORE) fits into the MTU.
			 */
			alloclen += sizeof(struct frag_hdr);

			/* bytes to fetch from the caller into the linear area */
			copy = datalen - transhdrlen - fraggap - pagedlen;
			if (copy < 0) {
				err = -EINVAL;
				goto error;
			}
			if (transhdrlen) {
				/* first skb of the datagram */
				skb = sock_alloc_send_skb(sk,
						alloclen + hh_len,
						(flags & MSG_DONTWAIT), &err);
			} else {
				/*
				 * Later skbs are allocated directly, but only
				 * while write memory (including what this call
				 * has added so far) is within 2 * sk_sndbuf.
				 */
				skb = NULL;
				if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <=
				    2 * sk->sk_sndbuf)
					skb = alloc_skb(alloclen + hh_len,
							sk->sk_allocation);
				if (unlikely(!skb))
					err = -ENOBUFS;
			}
			if (!skb)
				goto error;
			/*
			 *	Fill in the control structures
			 */
			skb->protocol = htons(ETH_P_IPV6);
			skb->ip_summed = csummode;
			skb->csum = 0;
			/* reserve for fragmentation and ipsec header */
			skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
				    dst_exthdrlen);

			/*
			 *	Find where to start putting bytes
			 */
			data = skb_put(skb, fraglen - pagedlen);
			skb_set_network_header(skb, exthdrlen);
			data += fragheaderlen;
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			if (fraggap) {
				/*
				 * Move the excess tail of the previous skb into
				 * this one, adjust both checksums and trim the
				 * previous skb back to maxfraglen.
				 */
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}
			if (copy > 0 &&
			    getfrag(from, data + transhdrlen, offset,
				    copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= copy + transhdrlen;
			/* these apply to the first allocated skb only */
			transhdrlen = 0;
			exthdrlen = 0;
			dst_exthdrlen = 0;

			/* Only the initial fragment is time stamped */
			skb_shinfo(skb)->tx_flags = cork->tx_flags;
			cork->tx_flags = 0;
			skb_shinfo(skb)->tskey = tskey;
			tskey = 0;
			skb_zcopy_set(skb, uarg, &extra_uref);

			if ((flags & MSG_CONFIRM) && !skb_prev)
				skb_set_dst_pending_confirm(skb, 1);

			/*
			 * Put the packet on the pending queue
			 */
			if (!skb->destructor) {
				/* take ownership; the charge is settled on exit */
				skb->destructor = sock_wfree;
				skb->sk = sk;
				wmem_alloc_delta += skb->truesize;
			}
			__skb_queue_tail(queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		if (!(rt->dst.dev->features&NETIF_F_SG) &&
		    skb_tailroom(skb) >= copy) {
			/* no SG: copy into the linear tailroom */
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
						offset, copy, off, skb) < 0) {
				/* undo the skb_put() */
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else if (!uarg || !uarg->zerocopy) {
			/* copy into the page fragment attached as an skb frag */
			int i = skb_shinfo(skb)->nr_frags;

			err = -ENOMEM;
			if (!sk_page_frag_refill(sk, pfrag))
				goto error;

			if (!skb_can_coalesce(skb, i, pfrag->page,
					      pfrag->offset)) {
				err = -EMSGSIZE;
				if (i == MAX_SKB_FRAGS)
					goto error;

				/* start a new, initially empty frag on this page */
				__skb_fill_page_desc(skb, i, pfrag->page,
						     pfrag->offset, 0);
				skb_shinfo(skb)->nr_frags = ++i;
				get_page(pfrag->page);
			}
			copy = min_t(int, copy, pfrag->size - pfrag->offset);
			if (getfrag(from,
				    page_address(pfrag->page) + pfrag->offset,
				    offset, copy, skb->len, skb) < 0)
				goto error_efault;

			pfrag->offset += copy;
			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			wmem_alloc_delta += copy;
		} else {
			/* zerocopy path */
			err = skb_zerocopy_iter_dgram(skb, from, copy);
			if (err < 0)
				goto error;
		}
		offset += copy;
		length -= copy;
	}

	if (wmem_alloc_delta)
		refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
	return 0;

error_efault:
	err = -EFAULT;
error:
	if (uarg)
		sock_zerocopy_put_abort(uarg, extra_uref);
	/* only the bytes actually appended stay accounted in the cork */
	cork->length -= length;
	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
	return err;
}
1764 
1765 int ip6_append_data(struct sock *sk,
1766 		    int getfrag(void *from, char *to, int offset, int len,
1767 				int odd, struct sk_buff *skb),
1768 		    void *from, int length, int transhdrlen,
1769 		    struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1770 		    struct rt6_info *rt, unsigned int flags)
1771 {
1772 	struct inet_sock *inet = inet_sk(sk);
1773 	struct ipv6_pinfo *np = inet6_sk(sk);
1774 	int exthdrlen;
1775 	int err;
1776 
1777 	if (flags&MSG_PROBE)
1778 		return 0;
1779 	if (skb_queue_empty(&sk->sk_write_queue)) {
1780 		/*
1781 		 * setup for corking
1782 		 */
1783 		err = ip6_setup_cork(sk, &inet->cork, &np->cork,
1784 				     ipc6, rt, fl6);
1785 		if (err)
1786 			return err;
1787 
1788 		exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1789 		length += exthdrlen;
1790 		transhdrlen += exthdrlen;
1791 	} else {
1792 		fl6 = &inet->cork.fl.u.ip6;
1793 		transhdrlen = 0;
1794 	}
1795 
1796 	return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base,
1797 				 &np->cork, sk_page_frag(sk), getfrag,
1798 				 from, length, transhdrlen, flags, ipc6);
1799 }
1800 EXPORT_SYMBOL_GPL(ip6_append_data);
1801 
1802 static void ip6_cork_release(struct inet_cork_full *cork,
1803 			     struct inet6_cork *v6_cork)
1804 {
1805 	if (v6_cork->opt) {
1806 		kfree(v6_cork->opt->dst0opt);
1807 		kfree(v6_cork->opt->dst1opt);
1808 		kfree(v6_cork->opt->hopopt);
1809 		kfree(v6_cork->opt->srcrt);
1810 		kfree(v6_cork->opt);
1811 		v6_cork->opt = NULL;
1812 	}
1813 
1814 	if (cork->base.dst) {
1815 		dst_release(cork->base.dst);
1816 		cork->base.dst = NULL;
1817 		cork->base.flags &= ~IPCORK_ALLFRAG;
1818 	}
1819 	memset(&cork->fl, 0, sizeof(cork->fl));
1820 }
1821 
/*
 * Collapse the skbs pending on @queue into a single packet and prepend the
 * extension headers and the IPv6 header built from the cork state.
 *
 * @sk:      socket the packet belongs to (priority, flow label settings)
 * @queue:   pending skbs; emptied by this function
 * @cork:    cork holding the route, the flow, mark and transmit time
 * @v6_cork: cork holding the tx options, traffic class and hop limit
 *
 * Returns the assembled skb with its dst set, or NULL when @queue is empty.
 * The cork is released only when an skb is returned.
 */
struct sk_buff *__ip6_make_skb(struct sock *sk,
			       struct sk_buff_head *queue,
			       struct inet_cork_full *cork,
			       struct inet6_cork *v6_cork)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct net *net = sock_net(sk);
	struct ipv6hdr *hdr;
	struct ipv6_txoptions *opt = v6_cork->opt;
	struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
	struct flowi6 *fl6 = &cork->fl.u.ip6;
	unsigned char proto = fl6->flowi6_proto;

	skb = __skb_dequeue(queue);
	if (!skb)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	/*
	 * Every remaining skb is linked into the head skb's frag_list after
	 * being pulled by the head skb's network header length.  Its length
	 * and truesize are accounted on the head skb and its own socket
	 * ownership is cleared.
	 */
	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Allow local fragmentation. */
	skb->ignore_df = ip6_sk_ignore_df(sk);

	/*
	 * final_dst starts as the flow's daddr; it is passed by reference to
	 * ipv6_push_nfrag_opts(), and whatever it points at afterwards
	 * becomes the header's destination address below.
	 */
	*final_dst = fl6->daddr;
	__skb_pull(skb, skb_network_header_len(skb));
	if (opt && opt->opt_flen)
		ipv6_push_frag_opts(skb, opt, &proto);
	if (opt && opt->opt_nflen)
		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	ip6_flow_hdr(hdr, v6_cork->tclass,
		     ip6_make_flowlabel(net, skb, fl6->flowlabel,
					ip6_autoflowlabel(net, np), fl6));
	hdr->hop_limit = v6_cork->hop_limit;
	/* proto may have been rewritten by the option push helpers above */
	hdr->nexthdr = proto;
	hdr->saddr = fl6->saddr;
	hdr->daddr = *final_dst;

	skb->priority = sk->sk_priority;
	skb->mark = cork->base.mark;

	skb->tstamp = cork->base.transmit_time;

	/* the skb takes its own reference; the cork's one is dropped below */
	skb_dst_set(skb, dst_clone(&rt->dst));
	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
	if (proto == IPPROTO_ICMPV6) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
		ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
	}

	ip6_cork_release(cork, v6_cork);
out:
	return skb;
}
1897 
1898 int ip6_send_skb(struct sk_buff *skb)
1899 {
1900 	struct net *net = sock_net(skb->sk);
1901 	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
1902 	int err;
1903 
1904 	err = ip6_local_out(net, skb->sk, skb);
1905 	if (err) {
1906 		if (err > 0)
1907 			err = net_xmit_errno(err);
1908 		if (err)
1909 			IP6_INC_STATS(net, rt->rt6i_idev,
1910 				      IPSTATS_MIB_OUTDISCARDS);
1911 	}
1912 
1913 	return err;
1914 }
1915 
/*
 * Build the pending datagram of @sk with ip6_finish_skb() and send it.
 * Returns 0 when there is nothing to send, otherwise the result of
 * ip6_send_skb().
 */
int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb = ip6_finish_skb(sk);

	return skb ? ip6_send_skb(skb) : 0;
}
EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
1927 
1928 static void __ip6_flush_pending_frames(struct sock *sk,
1929 				       struct sk_buff_head *queue,
1930 				       struct inet_cork_full *cork,
1931 				       struct inet6_cork *v6_cork)
1932 {
1933 	struct sk_buff *skb;
1934 
1935 	while ((skb = __skb_dequeue_tail(queue)) != NULL) {
1936 		if (skb_dst(skb))
1937 			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1938 				      IPSTATS_MIB_OUTDISCARDS);
1939 		kfree_skb(skb);
1940 	}
1941 
1942 	ip6_cork_release(cork, v6_cork);
1943 }
1944 
1945 void ip6_flush_pending_frames(struct sock *sk)
1946 {
1947 	__ip6_flush_pending_frames(sk, &sk->sk_write_queue,
1948 				   &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
1949 }
1950 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
1951 
1952 struct sk_buff *ip6_make_skb(struct sock *sk,
1953 			     int getfrag(void *from, char *to, int offset,
1954 					 int len, int odd, struct sk_buff *skb),
1955 			     void *from, int length, int transhdrlen,
1956 			     struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1957 			     struct rt6_info *rt, unsigned int flags,
1958 			     struct inet_cork_full *cork)
1959 {
1960 	struct inet6_cork v6_cork;
1961 	struct sk_buff_head queue;
1962 	int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1963 	int err;
1964 
1965 	if (flags & MSG_PROBE)
1966 		return NULL;
1967 
1968 	__skb_queue_head_init(&queue);
1969 
1970 	cork->base.flags = 0;
1971 	cork->base.addr = 0;
1972 	cork->base.opt = NULL;
1973 	cork->base.dst = NULL;
1974 	v6_cork.opt = NULL;
1975 	err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt, fl6);
1976 	if (err) {
1977 		ip6_cork_release(cork, &v6_cork);
1978 		return ERR_PTR(err);
1979 	}
1980 	if (ipc6->dontfrag < 0)
1981 		ipc6->dontfrag = inet6_sk(sk)->dontfrag;
1982 
1983 	err = __ip6_append_data(sk, fl6, &queue, &cork->base, &v6_cork,
1984 				&current->task_frag, getfrag, from,
1985 				length + exthdrlen, transhdrlen + exthdrlen,
1986 				flags, ipc6);
1987 	if (err) {
1988 		__ip6_flush_pending_frames(sk, &queue, cork, &v6_cork);
1989 		return ERR_PTR(err);
1990 	}
1991 
1992 	return __ip6_make_skb(sk, &queue, cork, &v6_cork);
1993 }
1994