xref: /openbmc/linux/net/ipv6/ip6_output.c (revision 8dda2eac)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  *	IPv6 output functions
4  *	Linux INET6 implementation
5  *
6  *	Authors:
7  *	Pedro Roque		<roque@di.fc.ul.pt>
8  *
9  *	Based on linux/net/ipv4/ip_output.c
10  *
11  *	Changes:
12  *	A.N.Kuznetsov	:	arithmetic in fragmentation.
13  *				extension headers are implemented.
14  *				route changes now work.
15  *				ip6_forward does not confuse sniffers.
16  *				etc.
17  *
18  *      H. von Brand    :       Added missing #include <linux/string.h>
19  *	Imran Patel	:	frag id should be in NBO
20  *      Kazunori MIYAZAWA @USAGI
21  *			:       add ip6_append_data and related functions
22  *				for datagram xmit
23  */
24 
25 #include <linux/errno.h>
26 #include <linux/kernel.h>
27 #include <linux/string.h>
28 #include <linux/socket.h>
29 #include <linux/net.h>
30 #include <linux/netdevice.h>
31 #include <linux/if_arp.h>
32 #include <linux/in6.h>
33 #include <linux/tcp.h>
34 #include <linux/route.h>
35 #include <linux/module.h>
36 #include <linux/slab.h>
37 
38 #include <linux/bpf-cgroup.h>
39 #include <linux/netfilter.h>
40 #include <linux/netfilter_ipv6.h>
41 
42 #include <net/sock.h>
43 #include <net/snmp.h>
44 
45 #include <net/ipv6.h>
46 #include <net/ndisc.h>
47 #include <net/protocol.h>
48 #include <net/ip6_route.h>
49 #include <net/addrconf.h>
50 #include <net/rawv6.h>
51 #include <net/icmp.h>
52 #include <net/xfrm.h>
53 #include <net/checksum.h>
54 #include <linux/mroute6.h>
55 #include <net/l3mdev.h>
56 #include <net/lwtunnel.h>
57 #include <net/ip_tunnels.h>
58 
/* Final IPv6 transmit step: resolve the L2 neighbour for the route's
 * nexthop and hand the packet to it.  Multicast packets may first be
 * looped back to local listeners, and are dropped when their scope does
 * not permit transmission on this device.
 */
static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct net_device *dev = dst->dev;
	const struct in6_addr *nexthop;
	struct neighbour *neigh;
	int ret;

	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		/* Loop a copy back to local listeners when the socket wants
		 * multicast loopback and either a multicast-router socket is
		 * interested (and the packet was not already forwarded) or a
		 * local process has joined the destination group on this
		 * device.
		 */
		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
		    ((mroute6_is_socket(net, skb) &&
		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
		     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
					 &ipv6_hdr(skb)->saddr))) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

			/* Do not check for IFF_ALLMULTI; multicast routing
			   is not supported in any case.
			 */
			if (newskb)
				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
					net, sk, newskb, NULL, newskb->dev,
					dev_loopback_xmit);

			/* With a hop limit of zero the packet must not be
			 * put on the wire; only the loopback copy survives.
			 */
			if (ipv6_hdr(skb)->hop_limit == 0) {
				IP6_INC_STATS(net, idev,
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return 0;
			}
		}

		IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);

		/* Interface/node-local scope multicast must never leave the
		 * node through a non-loopback device.
		 */
		if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
		    IPV6_ADDR_SCOPE_NODELOCAL &&
		    !(dev->flags & IFF_LOOPBACK)) {
			kfree_skb(skb);
			return 0;
		}
	}

	/* A lightweight tunnel may take over transmission entirely. */
	if (lwtunnel_xmit_redirect(dst->lwtstate)) {
		int res = lwtunnel_xmit(skb);

		if (res < 0 || res == LWTUNNEL_XMIT_DONE)
			return res;
	}

	/* Look up (or create) the neighbour entry for the nexthop under
	 * RCU and transmit through it.
	 */
	rcu_read_lock_bh();
	nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
	neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
	if (unlikely(!neigh))
		neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
	if (!IS_ERR(neigh)) {
		sock_confirm_neigh(skb, neigh);
		ret = neigh_output(neigh, skb, false);
		rcu_read_unlock_bh();
		return ret;
	}
	rcu_read_unlock_bh();

	/* No neighbour entry could be created: count and drop. */
	IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
	kfree_skb(skb);
	return -EINVAL;
}
127 
128 static int
129 ip6_finish_output_gso_slowpath_drop(struct net *net, struct sock *sk,
130 				    struct sk_buff *skb, unsigned int mtu)
131 {
132 	struct sk_buff *segs, *nskb;
133 	netdev_features_t features;
134 	int ret = 0;
135 
136 	/* Please see corresponding comment in ip_finish_output_gso
137 	 * describing the cases where GSO segment length exceeds the
138 	 * egress MTU.
139 	 */
140 	features = netif_skb_features(skb);
141 	segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
142 	if (IS_ERR_OR_NULL(segs)) {
143 		kfree_skb(skb);
144 		return -ENOMEM;
145 	}
146 
147 	consume_skb(skb);
148 
149 	skb_list_walk_safe(segs, segs, nskb) {
150 		int err;
151 
152 		skb_mark_not_on_list(segs);
153 		err = ip6_fragment(net, sk, segs, ip6_finish_output2);
154 		if (err && ret == 0)
155 			ret = err;
156 	}
157 
158 	return ret;
159 }
160 
161 static int __ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
162 {
163 	unsigned int mtu;
164 
165 #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
166 	/* Policy lookup after SNAT yielded a new policy */
167 	if (skb_dst(skb)->xfrm) {
168 		IPCB(skb)->flags |= IPSKB_REROUTED;
169 		return dst_output(net, sk, skb);
170 	}
171 #endif
172 
173 	mtu = ip6_skb_dst_mtu(skb);
174 	if (skb_is_gso(skb) && !skb_gso_validate_network_len(skb, mtu))
175 		return ip6_finish_output_gso_slowpath_drop(net, sk, skb, mtu);
176 
177 	if ((skb->len > mtu && !skb_is_gso(skb)) ||
178 	    dst_allfrag(skb_dst(skb)) ||
179 	    (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
180 		return ip6_fragment(net, sk, skb, ip6_finish_output2);
181 	else
182 		return ip6_finish_output2(net, sk, skb);
183 }
184 
185 static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
186 {
187 	int ret;
188 
189 	ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
190 	switch (ret) {
191 	case NET_XMIT_SUCCESS:
192 		return __ip6_finish_output(net, sk, skb);
193 	case NET_XMIT_CN:
194 		return __ip6_finish_output(net, sk, skb) ? : ret;
195 	default:
196 		kfree_skb(skb);
197 		return ret;
198 	}
199 }
200 
201 int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
202 {
203 	struct net_device *dev = skb_dst(skb)->dev, *indev = skb->dev;
204 	struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
205 
206 	skb->protocol = htons(ETH_P_IPV6);
207 	skb->dev = dev;
208 
209 	if (unlikely(idev->cnf.disable_ipv6)) {
210 		IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
211 		kfree_skb(skb);
212 		return 0;
213 	}
214 
215 	return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
216 			    net, sk, skb, indev, dev,
217 			    ip6_finish_output,
218 			    !(IP6CB(skb)->flags & IP6SKB_REROUTED));
219 }
220 EXPORT_SYMBOL(ip6_output);
221 
222 bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np)
223 {
224 	if (!np->autoflowlabel_set)
225 		return ip6_default_np_autolabel(net);
226 	else
227 		return np->autoflowlabel;
228 }
229 
/*
 * xmit an sk_buff (used by TCP, SCTP and DCCP)
 * Note : socket lock is not held for SYNACK packets, but might be modified
 * by calls to skb_set_owner_w() and ipv6_local_error(),
 * which are using proper atomic operations or spinlocks.
 *
 * Builds the IPv6 header (and any extension headers in @opt) in front
 * of the payload already in @skb, then sends the packet through the
 * NF_INET_LOCAL_OUT hook.  Packets larger than the path MTU that may
 * not be fragmented are dropped with -EMSGSIZE.  Consumes @skb.
 */
int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
	     __u32 mark, struct ipv6_txoptions *opt, int tclass, u32 priority)
{
	struct net *net = sock_net(sk);
	const struct ipv6_pinfo *np = inet6_sk(sk);
	struct in6_addr *first_hop = &fl6->daddr;
	struct dst_entry *dst = skb_dst(skb);
	unsigned int head_room;
	struct ipv6hdr *hdr;
	u8  proto = fl6->flowi6_proto;
	int seg_len = skb->len;
	int hlimit = -1;
	u32 mtu;

	/* Headroom needed in front of the payload: IPv6 header, link
	 * layer, plus any extension headers supplied by the caller.
	 */
	head_room = sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
	if (opt)
		head_room += opt->opt_nflen + opt->opt_flen;

	/* Reallocate the head if the caller did not reserve enough. */
	if (unlikely(skb_headroom(skb) < head_room)) {
		struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
		if (!skb2) {
			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_OUTDISCARDS);
			kfree_skb(skb);
			return -ENOBUFS;
		}
		if (skb->sk)
			skb_set_owner_w(skb2, skb->sk);
		consume_skb(skb);
		skb = skb2;
	}

	/* Push extension headers: fragmentable part first, then the
	 * non-fragmentable part (which may also update first_hop).
	 */
	if (opt) {
		seg_len += opt->opt_nflen + opt->opt_flen;

		if (opt->opt_flen)
			ipv6_push_frag_opts(skb, opt, &proto);

		if (opt->opt_nflen)
			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
					     &fl6->saddr);
	}

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	/*
	 *	Fill in the IPv6 header
	 */
	if (np)
		hlimit = np->hop_limit;
	if (hlimit < 0)
		hlimit = ip6_dst_hoplimit(dst);

	ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
				ip6_autoflowlabel(net, np), fl6));

	hdr->payload_len = htons(seg_len);
	hdr->nexthdr = proto;
	hdr->hop_limit = hlimit;

	hdr->saddr = fl6->saddr;
	hdr->daddr = *first_hop;

	skb->protocol = htons(ETH_P_IPV6);
	skb->priority = priority;
	skb->mark = mark;

	mtu = dst_mtu(dst);
	if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
		IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_OUT, skb->len);

		/* if egress device is enslaved to an L3 master device pass the
		 * skb to its handler for processing
		 */
		skb = l3mdev_ip6_out((struct sock *)sk, skb);
		if (unlikely(!skb))
			return 0;

		/* hooks should never assume socket lock is held.
		 * we promote our socket to non const
		 */
		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
			       net, (struct sock *)sk, skb, NULL, dst->dev,
			       dst_output);
	}

	/* Too big and not fragmentable here: report EMSGSIZE to the
	 * owner and drop.
	 */
	skb->dev = dst->dev;
	/* ipv6_local_error() does not require socket lock,
	 * we promote our socket to non const
	 */
	ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);

	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return -EMSGSIZE;
}
EXPORT_SYMBOL(ip6_xmit);
336 
/* Deliver a Router Alert packet to every raw socket registered for this
 * RA selector value that may see packets from this device.  The last
 * matching socket consumes @skb itself; earlier matches receive clones.
 *
 * Returns 1 if the packet was delivered to at least one socket (the
 * caller must not forward it further), 0 if no socket matched.
 */
static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
	struct ip6_ra_chain *ra;
	struct sock *last = NULL;

	read_lock(&ip6_ra_lock);
	for (ra = ip6_ra_chain; ra; ra = ra->next) {
		struct sock *sk = ra->sk;
		/* Match on the RA selector and, for device-bound sockets,
		 * on the ingress device.
		 */
		if (sk && ra->sel == sel &&
		    (!sk->sk_bound_dev_if ||
		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
			struct ipv6_pinfo *np = inet6_sk(sk);

			/* rtalert_isolate: do not deliver alerts across
			 * network namespace boundaries.
			 */
			if (np && np->rtalert_isolate &&
			    !net_eq(sock_net(sk), dev_net(skb->dev))) {
				continue;
			}
			/* Deliver a clone to the previous match; keep the
			 * original skb for the final one.
			 */
			if (last) {
				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
				if (skb2)
					rawv6_rcv(last, skb2);
			}
			last = sk;
		}
	}

	if (last) {
		rawv6_rcv(last, skb);
		read_unlock(&ip6_ra_lock);
		return 1;
	}
	read_unlock(&ip6_ra_lock);
	return 0;
}
371 
/* Decide how to handle a to-be-forwarded packet whose destination is a
 * proxied (pneigh) address.
 *
 * Returns:
 *   1  - unicast neighbour-discovery ICMPv6 message: pass to local
 *        input instead of forwarding,
 *   0  - forward normally,
 *  -1  - link-local destination: signal link failure; caller drops.
 */
static int ip6_forward_proxy_check(struct sk_buff *skb)
{
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	u8 nexthdr = hdr->nexthdr;
	__be16 frag_off;
	int offset;

	/* Walk past any extension headers to the upper-layer protocol. */
	if (ipv6_ext_hdr(nexthdr)) {
		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
		if (offset < 0)
			return 0;
	} else
		offset = sizeof(struct ipv6hdr);

	if (nexthdr == IPPROTO_ICMPV6) {
		struct icmp6hdr *icmp6;

		/* Ensure at least the ICMPv6 type octet is in the linear
		 * area before reading it.
		 */
		if (!pskb_may_pull(skb, (skb_network_header(skb) +
					 offset + 1 - skb->data)))
			return 0;

		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);

		switch (icmp6->icmp6_type) {
		case NDISC_ROUTER_SOLICITATION:
		case NDISC_ROUTER_ADVERTISEMENT:
		case NDISC_NEIGHBOUR_SOLICITATION:
		case NDISC_NEIGHBOUR_ADVERTISEMENT:
		case NDISC_REDIRECT:
			/* For reaction involving unicast neighbor discovery
			 * message destined to the proxied address, pass it to
			 * input function.
			 */
			return 1;
		default:
			break;
		}
	}

	/*
	 * The proxying router can't forward traffic sent to a link-local
	 * address, so signal the sender and discard the packet. This
	 * behavior is clarified by the MIPv6 specification.
	 */
	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
		dst_link_failure(skb);
		return -1;
	}

	return 0;
}
423 
424 static inline int ip6_forward_finish(struct net *net, struct sock *sk,
425 				     struct sk_buff *skb)
426 {
427 	struct dst_entry *dst = skb_dst(skb);
428 
429 	__IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
430 	__IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
431 
432 #ifdef CONFIG_NET_SWITCHDEV
433 	if (skb->offload_l3_fwd_mark) {
434 		consume_skb(skb);
435 		return 0;
436 	}
437 #endif
438 
439 	skb->tstamp = 0;
440 	return dst_output(net, sk, skb);
441 }
442 
443 static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
444 {
445 	if (skb->len <= mtu)
446 		return false;
447 
448 	/* ipv6 conntrack defrag sets max_frag_size + ignore_df */
449 	if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
450 		return true;
451 
452 	if (skb->ignore_df)
453 		return false;
454 
455 	if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
456 		return false;
457 
458 	return true;
459 }
460 
/* Forward an IPv6 packet received on skb->dev.
 *
 * Validates the packet (forwarding enabled, not locally owned, hop
 * limit, xfrm policy, source address class), handles Router Alert and
 * NDISC-proxy special cases, may emit an ICMPv6 redirect, enforces the
 * path MTU, then decrements hop_limit and runs the NF_INET_FORWARD hook
 * with ip6_forward_finish() as the continuation.
 *
 * Consumes @skb on all error paths; returns 0 or a negative errno.
 */
int ip6_forward(struct sk_buff *skb)
{
	struct inet6_dev *idev = __in6_dev_get_safely(skb->dev);
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	struct inet6_skb_parm *opt = IP6CB(skb);
	struct net *net = dev_net(dst->dev);
	u32 mtu;

	if (net->ipv6.devconf_all->forwarding == 0)
		goto error;

	if (skb->pkt_type != PACKET_HOST)
		goto drop;

	if (unlikely(skb->sk))
		goto drop;

	if (skb_warn_if_lro(skb))
		goto drop;

	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
		goto drop;
	}

	skb_forward_csum(skb);

	/*
	 *	We DO NOT make any processing on
	 *	RA packets, pushing them to user level AS IS
	 *	without any WARRANTY that application will be able
	 *	to interpret them. The reason is that we
	 *	cannot make anything clever here.
	 *
	 *	We are not end-node, so that if packet contains
	 *	AH/ESP, we cannot make anything.
	 *	Defragmentation also would be mistake, RA packets
	 *	cannot be fragmented, because there is no warranty
	 *	that different fragments will go along one path. --ANK
	 */
	if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
		if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
			return 0;
	}

	/*
	 *	check and decrement ttl
	 */
	if (hdr->hop_limit <= 1) {
		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);

		kfree_skb(skb);
		return -ETIMEDOUT;
	}

	/* XXX: idev->cnf.proxy_ndp? */
	if (net->ipv6.devconf_all->proxy_ndp &&
	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
		int proxied = ip6_forward_proxy_check(skb);
		if (proxied > 0)
			return ip6_input(skb);
		else if (proxied < 0) {
			__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
			goto drop;
		}
	}

	if (!xfrm6_route_forward(skb)) {
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
		goto drop;
	}
	/* xfrm6_route_forward() may have replaced the route. */
	dst = skb_dst(skb);

	/* IPv6 specs say nothing about it, but it is clear that we cannot
	   send redirects to source routed frames.
	   We don't send redirects to frames decapsulated from IPsec.
	 */
	if (IP6CB(skb)->iif == dst->dev->ifindex &&
	    opt->srcrt == 0 && !skb_sec_path(skb)) {
		struct in6_addr *target = NULL;
		struct inet_peer *peer;
		struct rt6_info *rt;

		/*
		 *	incoming and outgoing devices are the same
		 *	send a redirect.
		 */

		rt = (struct rt6_info *) dst;
		if (rt->rt6i_flags & RTF_GATEWAY)
			target = &rt->rt6i_gateway;
		else
			target = &hdr->daddr;

		peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);

		/* Limit redirects both by destination (here)
		   and by source (inside ndisc_send_redirect)
		 */
		if (inet_peer_xrlim_allow(peer, 1*HZ))
			ndisc_send_redirect(skb, target);
		if (peer)
			inet_putpeer(peer);
	} else {
		int addrtype = ipv6_addr_type(&hdr->saddr);

		/* This check is security critical. */
		if (addrtype == IPV6_ADDR_ANY ||
		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
			goto error;
		if (addrtype & IPV6_ADDR_LINKLOCAL) {
			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
				    ICMPV6_NOT_NEIGHBOUR, 0);
			goto error;
		}
	}

	mtu = ip6_dst_mtu_forward(dst);
	if (mtu < IPV6_MIN_MTU)
		mtu = IPV6_MIN_MTU;

	if (ip6_pkt_too_big(skb, mtu)) {
		/* Again, force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INTOOBIGERRORS);
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	/* Ensure the header is writable before mangling hop_limit. */
	if (skb_cow(skb, dst->dev->hard_header_len)) {
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_OUTDISCARDS);
		goto drop;
	}

	hdr = ipv6_hdr(skb);

	/* Mangling hops number delayed to point after skb COW */

	hdr->hop_limit--;

	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
		       net, NULL, skb, skb->dev, dst->dev,
		       ip6_forward_finish);

error:
	__IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
drop:
	kfree_skb(skb);
	return -EINVAL;
}
617 
/* Copy per-packet metadata (routing, scheduling, netfilter, extensions,
 * security mark) from @from to a freshly allocated fragment @to, so the
 * fragment is handled exactly like the packet it was carved from.
 */
static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	/* Replace any dst on @to with a new reference to @from's. */
	skb_dst_drop(to);
	skb_dst_set(to, dst_clone(skb_dst(from)));
	to->dev = from->dev;
	to->mark = from->mark;

	skb_copy_hash(to, from);

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
	nf_copy(to, from);
	skb_ext_copy(to, from);
	skb_copy_secmark(to, from);
}
637 
/* Set up fast-path fragmentation for an skb whose frag_list already
 * holds correctly sized fragments: insert a fragment header into the
 * head skb and initialise @iter for ip6_fraglist_prepare() /
 * ip6_fraglist_next() to fix up the queued fragments one by one.
 *
 * @hlen is the length of the unfragmentable header part; @prevhdr
 * points at the next-header byte that must now announce the fragment
 * header.  Returns 0 on success or -ENOMEM.
 */
int ip6_fraglist_init(struct sk_buff *skb, unsigned int hlen, u8 *prevhdr,
		      u8 nexthdr, __be32 frag_id,
		      struct ip6_fraglist_iter *iter)
{
	unsigned int first_len;
	struct frag_hdr *fh;

	/* BUILD HEADER */
	*prevhdr = NEXTHDR_FRAGMENT;
	/* Keep a copy of the network headers: every fragment needs them. */
	iter->tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
	if (!iter->tmp_hdr)
		return -ENOMEM;

	/* Detach the frag list; the iterator now walks the chain. */
	iter->frag = skb_shinfo(skb)->frag_list;
	skb_frag_list_init(skb);

	iter->offset = 0;
	iter->hlen = hlen;
	iter->frag_id = frag_id;
	iter->nexthdr = nexthdr;

	/* Open a gap for the fragment header between the copied network
	 * headers and the head skb's payload.
	 */
	__skb_pull(skb, hlen);
	fh = __skb_push(skb, sizeof(struct frag_hdr));
	__skb_push(skb, hlen);
	skb_reset_network_header(skb);
	memcpy(skb_network_header(skb), iter->tmp_hdr, hlen);

	fh->nexthdr = nexthdr;
	fh->reserved = 0;
	fh->frag_off = htons(IP6_MF);	/* first fragment: more follow */
	fh->identification = frag_id;

	/* Shrink the head skb to its own linear + paged data only. */
	first_len = skb_pagelen(skb);
	skb->data_len = first_len - skb_headlen(skb);
	skb->len = first_len;
	ipv6_hdr(skb)->payload_len = htons(first_len - sizeof(struct ipv6hdr));

	return 0;
}
EXPORT_SYMBOL(ip6_fraglist_init);
678 
/* Turn the next queued fragment (@iter->frag) into a standalone IPv6
 * fragment: prepend the saved network headers plus a fragment header,
 * set its offset / MF bit / payload length, and copy packet metadata
 * from the current head skb (@skb).
 */
void ip6_fraglist_prepare(struct sk_buff *skb,
			  struct ip6_fraglist_iter *iter)
{
	struct sk_buff *frag = iter->frag;
	unsigned int hlen = iter->hlen;
	struct frag_hdr *fh;

	frag->ip_summed = CHECKSUM_NONE;
	skb_reset_transport_header(frag);
	fh = __skb_push(frag, sizeof(struct frag_hdr));
	__skb_push(frag, hlen);
	skb_reset_network_header(frag);
	memcpy(skb_network_header(frag), iter->tmp_hdr, hlen);
	/* Advance the running offset by the previous fragment's payload
	 * (its total length minus headers and fragment header).
	 */
	iter->offset += skb->len - hlen - sizeof(struct frag_hdr);
	fh->nexthdr = iter->nexthdr;
	fh->reserved = 0;
	fh->frag_off = htons(iter->offset);
	if (frag->next)
		fh->frag_off |= htons(IP6_MF);	/* not the last fragment */
	fh->identification = iter->frag_id;
	ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
	ip6_copy_metadata(frag, skb);
}
EXPORT_SYMBOL(ip6_fraglist_prepare);
703 
704 void ip6_frag_init(struct sk_buff *skb, unsigned int hlen, unsigned int mtu,
705 		   unsigned short needed_tailroom, int hdr_room, u8 *prevhdr,
706 		   u8 nexthdr, __be32 frag_id, struct ip6_frag_state *state)
707 {
708 	state->prevhdr = prevhdr;
709 	state->nexthdr = nexthdr;
710 	state->frag_id = frag_id;
711 
712 	state->hlen = hlen;
713 	state->mtu = mtu;
714 
715 	state->left = skb->len - hlen;	/* Space per frame */
716 	state->ptr = hlen;		/* Where to start from */
717 
718 	state->hroom = hdr_room;
719 	state->troom = needed_tailroom;
720 
721 	state->offset = 0;
722 }
723 EXPORT_SYMBOL(ip6_frag_init);
724 
/* Slow-path fragmentation: allocate and fill the next fragment of the
 * packet described by @state.  Updates @state (left/ptr/offset) on
 * success.  Returns the new fragment, or an ERR_PTR on allocation
 * failure.
 */
struct sk_buff *ip6_frag_next(struct sk_buff *skb, struct ip6_frag_state *state)
{
	u8 *prevhdr = state->prevhdr, *fragnexthdr_offset;
	struct sk_buff *frag;
	struct frag_hdr *fh;
	unsigned int len;

	len = state->left;
	/* IF: it doesn't fit, use 'mtu' - the data space left */
	if (len > state->mtu)
		len = state->mtu;
	/* IF: we are not sending up to and including the packet end
	   then align the next start on an eight byte boundary */
	if (len < state->left)
		len &= ~7;

	/* Allocate buffer */
	frag = alloc_skb(len + state->hlen + sizeof(struct frag_hdr) +
			 state->hroom + state->troom, GFP_ATOMIC);
	if (!frag)
		return ERR_PTR(-ENOMEM);

	/*
	 *	Set up data on packet
	 */

	ip6_copy_metadata(frag, skb);
	skb_reserve(frag, state->hroom);
	skb_put(frag, len + state->hlen + sizeof(struct frag_hdr));
	skb_reset_network_header(frag);
	fh = (struct frag_hdr *)(skb_network_header(frag) + state->hlen);
	frag->transport_header = (frag->network_header + state->hlen +
				  sizeof(struct frag_hdr));

	/*
	 *	Charge the memory for the fragment to any owner
	 *	it might possess
	 */
	if (skb->sk)
		skb_set_owner_w(frag, skb->sk);

	/*
	 *	Copy the packet header into the new buffer.
	 */
	skb_copy_from_linear_data(skb, skb_network_header(frag), state->hlen);

	/* Rewrite the next-header byte of the header preceding the
	 * fragmentable part so it points at the fragment header.
	 */
	fragnexthdr_offset = skb_network_header(frag);
	fragnexthdr_offset += prevhdr - skb_network_header(skb);
	*fragnexthdr_offset = NEXTHDR_FRAGMENT;

	/*
	 *	Build fragment header.
	 */
	fh->nexthdr = state->nexthdr;
	fh->reserved = 0;
	fh->identification = state->frag_id;

	/*
	 *	Copy a block of the IP datagram.
	 */
	BUG_ON(skb_copy_bits(skb, state->ptr, skb_transport_header(frag),
			     len));
	state->left -= len;

	fh->frag_off = htons(state->offset);
	if (state->left > 0)
		fh->frag_off |= htons(IP6_MF);	/* more fragments follow */
	ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));

	state->ptr += len;
	state->offset += len;

	return frag;
}
EXPORT_SYMBOL(ip6_frag_next);
800 
/* Fragment @skb according to the path MTU and hand every fragment to
 * @output (typically ip6_finish_output2).
 *
 * A fast path re-uses an existing, suitably sized frag_list; otherwise
 * the slow path allocates and copies each fragment.  Consumes @skb in
 * all cases.  Returns 0 on success or a negative errno.
 */
int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
		 int (*output)(struct net *, struct sock *, struct sk_buff *))
{
	struct sk_buff *frag;
	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
	struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
				inet6_sk(skb->sk) : NULL;
	struct ip6_frag_state state;
	unsigned int mtu, hlen, nexthdr_offset;
	ktime_t tstamp = skb->tstamp;
	int hroom, err = 0;
	__be32 frag_id;
	u8 *prevhdr, nexthdr = 0;

	/* Find the last header before the fragmentable part. */
	err = ip6_find_1stfragopt(skb, &prevhdr);
	if (err < 0)
		goto fail;
	hlen = err;
	nexthdr = *prevhdr;
	nexthdr_offset = prevhdr - skb_network_header(skb);

	mtu = ip6_skb_dst_mtu(skb);

	/* We must not fragment if the socket is set to force MTU discovery
	 * or if the skb it not generated by a local socket.
	 */
	if (unlikely(!skb->ignore_df && skb->len > mtu))
		goto fail_toobig;

	if (IP6CB(skb)->frag_max_size) {
		if (IP6CB(skb)->frag_max_size > mtu)
			goto fail_toobig;

		/* don't send fragments larger than what we received */
		mtu = IP6CB(skb)->frag_max_size;
		if (mtu < IPV6_MIN_MTU)
			mtu = IPV6_MIN_MTU;
	}

	/* Honour a smaller per-socket fragment size, if configured. */
	if (np && np->frag_size < mtu) {
		if (np->frag_size)
			mtu = np->frag_size;
	}
	if (mtu < hlen + sizeof(struct frag_hdr) + 8)
		goto fail_toobig;
	/* From here on, mtu is the payload budget per fragment. */
	mtu -= hlen + sizeof(struct frag_hdr);

	frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
				    &ipv6_hdr(skb)->saddr);

	if (skb->ip_summed == CHECKSUM_PARTIAL &&
	    (err = skb_checksum_help(skb)))
		goto fail;

	/* skb_checksum_help() may have reallocated the head: recompute
	 * prevhdr from the saved offset.
	 */
	prevhdr = skb_network_header(skb) + nexthdr_offset;
	hroom = LL_RESERVED_SPACE(rt->dst.dev);
	if (skb_has_frag_list(skb)) {
		unsigned int first_len = skb_pagelen(skb);
		struct ip6_fraglist_iter iter;
		struct sk_buff *frag2;

		/* Fast path only when every piece already fits the MTU
		 * and has room for the extra headers.
		 */
		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    skb_cloned(skb) ||
		    skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
			goto slow_path;

		skb_walk_frags(skb, frag) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
				goto slow_path_clean;

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path_clean;

			BUG_ON(frag->sk);
			if (skb->sk) {
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
			}
			skb->truesize -= frag->truesize;
		}

		err = ip6_fraglist_init(skb, hlen, prevhdr, nexthdr, frag_id,
					&iter);
		if (err < 0)
			goto fail;

		for (;;) {
			/* Prepare header of the next frame,
			 * before previous one went down. */
			if (iter.frag)
				ip6_fraglist_prepare(skb, &iter);

			skb->tstamp = tstamp;
			err = output(net, sk, skb);
			if (!err)
				IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
					      IPSTATS_MIB_FRAGCREATES);

			if (err || !iter.frag)
				break;

			skb = ip6_fraglist_next(&iter);
		}

		kfree(iter.tmp_hdr);

		if (err == 0) {
			IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
				      IPSTATS_MIB_FRAGOKS);
			return 0;
		}

		/* Output failed: free any fragments not yet sent. */
		kfree_skb_list(iter.frag);

		IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
			      IPSTATS_MIB_FRAGFAILS);
		return err;

slow_path_clean:
		/* Undo the socket-ownership transfer done above. */
		skb_walk_frags(skb, frag2) {
			if (frag2 == frag)
				break;
			frag2->sk = NULL;
			frag2->destructor = NULL;
			skb->truesize += frag2->truesize;
		}
	}

slow_path:
	/*
	 *	Fragment the datagram.
	 */

	ip6_frag_init(skb, hlen, mtu, rt->dst.dev->needed_tailroom,
		      LL_RESERVED_SPACE(rt->dst.dev), prevhdr, nexthdr, frag_id,
		      &state);

	/*
	 *	Keep copying data until we run out.
	 */

	while (state.left > 0) {
		frag = ip6_frag_next(skb, &state);
		if (IS_ERR(frag)) {
			err = PTR_ERR(frag);
			goto fail;
		}

		/*
		 *	Put this fragment into the sending queue.
		 */
		frag->tstamp = tstamp;
		err = output(net, sk, frag);
		if (err)
			goto fail;

		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_FRAGCREATES);
	}
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGOKS);
	consume_skb(skb);
	return err;

fail_toobig:
	/* On an allfrag dst, stop the socket from generating GSO
	 * packets that cannot be sent whole.
	 */
	if (skb->sk && dst_allfrag(skb_dst(skb)))
		sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);

	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
	err = -EMSGSIZE;

fail:
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return err;
}
983 
984 static inline int ip6_rt_check(const struct rt6key *rt_key,
985 			       const struct in6_addr *fl_addr,
986 			       const struct in6_addr *addr_cache)
987 {
988 	return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
989 		(!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
990 }
991 
/* Validate a socket's cached dst entry against the flow @fl6.
 *
 * Returns @dst when it may be reused for this flow, or NULL (after
 * releasing @dst) when the caller must do a fresh route lookup.
 */
static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
					  struct dst_entry *dst,
					  const struct flowi6 *fl6)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct rt6_info *rt;

	if (!dst)
		goto out;

	/* A cached dst of another address family can never match. */
	if (dst->ops->family != AF_INET6) {
		dst_release(dst);
		return NULL;
	}

	rt = (struct rt6_info *)dst;
	/* Yes, checking route validity in not connected
	 * case is not very simple. Take into account,
	 * that we do not support routing by source, TOS,
	 * and MSG_DONTROUTE		--ANK (980726)
	 *
	 * 1. ip6_rt_check(): If route was host route,
	 *    check that cached destination is current.
	 *    If it is network route, we still may
	 *    check its validity using saved pointer
	 *    to the last used address: daddr_cache.
	 *    We do not want to save whole address now,
	 *    (because main consumer of this service
	 *    is tcp, which has not this problem),
	 *    so that the last trick works only on connected
	 *    sockets.
	 * 2. oif also should be the same.
	 */
	if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
#ifdef CONFIG_IPV6_SUBTREES
	    ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
#endif
	   (!(fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) &&
	      (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex))) {
		dst_release(dst);
		dst = NULL;
	}

out:
	return dst;
}
1038 
/* Core of the flow-based route lookup: fill *dst for @fl6.
 *
 * Selects a source address first when the flow has none, optionally
 * reroutes to the default router when the chosen source is still
 * OPTIMISTIC (DAD not complete) and the nexthop neighbour is not valid,
 * and rejects mixed v4-mapped / plain-v6 address pairs.
 *
 * Returns 0 on success; on failure releases *dst, sets it to NULL and
 * returns a negative errno.
 */
static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
			       struct dst_entry **dst, struct flowi6 *fl6)
{
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	struct neighbour *n;
	struct rt6_info *rt;
#endif
	int err;
	int flags = 0;

	/* The correct way to handle this would be to do
	 * ip6_route_get_saddr, and then ip6_route_output; however,
	 * the route-specific preferred source forces the
	 * ip6_route_output call _before_ ip6_route_get_saddr.
	 *
	 * In source specific routing (no src=any default route),
	 * ip6_route_output will fail given src=any saddr, though, so
	 * that's why we try it again later.
	 */
	if (ipv6_addr_any(&fl6->saddr)) {
		struct fib6_info *from;
		struct rt6_info *rt;

		*dst = ip6_route_output(net, sk, fl6);
		rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;

		rcu_read_lock();
		from = rt ? rcu_dereference(rt->from) : NULL;
		err = ip6_route_get_saddr(net, from, &fl6->daddr,
					  sk ? inet6_sk(sk)->srcprefs : 0,
					  &fl6->saddr);
		rcu_read_unlock();

		if (err)
			goto out_err_release;

		/* If we had an erroneous initial result, pretend it
		 * never existed and let the SA-enabled version take
		 * over.
		 */
		if ((*dst)->error) {
			dst_release(*dst);
			*dst = NULL;
		}

		if (fl6->flowi6_oif)
			flags |= RT6_LOOKUP_F_IFACE;
	}

	/* Retry (or perform the first) lookup, now with a source set. */
	if (!*dst)
		*dst = ip6_route_output_flags(net, sk, fl6, flags);

	err = (*dst)->error;
	if (err)
		goto out_err_release;

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	/*
	 * Here if the dst entry we've looked up
	 * has a neighbour entry that is in the INCOMPLETE
	 * state and the src address from the flow is
	 * marked as OPTIMISTIC, we release the found
	 * dst entry and replace it instead with the
	 * dst entry of the nexthop router
	 */
	rt = (struct rt6_info *) *dst;
	rcu_read_lock_bh();
	n = __ipv6_neigh_lookup_noref(rt->dst.dev,
				      rt6_nexthop(rt, &fl6->daddr));
	err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
	rcu_read_unlock_bh();

	if (err) {
		struct inet6_ifaddr *ifp;
		struct flowi6 fl_gw6;
		int redirect;

		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
				      (*dst)->dev, 1);

		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
		if (ifp)
			in6_ifa_put(ifp);

		if (redirect) {
			/*
			 * We need to get the dst entry for the
			 * default router instead
			 */
			dst_release(*dst);
			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
			*dst = ip6_route_output(net, sk, &fl_gw6);
			err = (*dst)->error;
			if (err)
				goto out_err_release;
		}
	}
#endif
	/* A v4-mapped source is only valid with a v4-mapped (or
	 * unspecified) destination.
	 */
	if (ipv6_addr_v4mapped(&fl6->saddr) &&
	    !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
		err = -EAFNOSUPPORT;
		goto out_err_release;
	}

	return 0;

out_err_release:
	dst_release(*dst);
	*dst = NULL;

	if (err == -ENETUNREACH)
		IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
	return err;
}
1154 
/**
 *	ip6_dst_lookup - perform route lookup on flow
 *	@net: Network namespace to perform lookup in
 *	@sk: socket which provides route info
 *	@dst: pointer to dst_entry * for result
 *	@fl6: flow to lookup
 *
 *	This function performs a route lookup on the given flow.
 *	Any previous value of *@dst is overwritten without being
 *	released, so callers must not pass a held reference in.
 *
 *	It returns zero on success, or a standard errno code on error.
 */
int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
		   struct flowi6 *fl6)
{
	*dst = NULL;
	return ip6_dst_lookup_tail(net, sk, dst, fl6);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1173 
1174 /**
1175  *	ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1176  *	@net: Network namespace to perform lookup in
1177  *	@sk: socket which provides route info
1178  *	@fl6: flow to lookup
1179  *	@final_dst: final destination address for ipsec lookup
1180  *
1181  *	This function performs a route lookup on the given flow.
1182  *
1183  *	It returns a valid dst pointer on success, or a pointer encoded
1184  *	error code.
1185  */
1186 struct dst_entry *ip6_dst_lookup_flow(struct net *net, const struct sock *sk, struct flowi6 *fl6,
1187 				      const struct in6_addr *final_dst)
1188 {
1189 	struct dst_entry *dst = NULL;
1190 	int err;
1191 
1192 	err = ip6_dst_lookup_tail(net, sk, &dst, fl6);
1193 	if (err)
1194 		return ERR_PTR(err);
1195 	if (final_dst)
1196 		fl6->daddr = *final_dst;
1197 
1198 	return xfrm_lookup_route(net, dst, flowi6_to_flowi(fl6), sk, 0);
1199 }
1200 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
1201 
1202 /**
1203  *	ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1204  *	@sk: socket which provides the dst cache and route info
1205  *	@fl6: flow to lookup
1206  *	@final_dst: final destination address for ipsec lookup
1207  *	@connected: whether @sk is connected or not
1208  *
1209  *	This function performs a route lookup on the given flow with the
1210  *	possibility of using the cached route in the socket if it is valid.
1211  *	It will take the socket dst lock when operating on the dst cache.
1212  *	As a result, this function can only be used in process context.
1213  *
1214  *	In addition, for a connected socket, cache the dst in the socket
1215  *	if the current cache is not valid.
1216  *
1217  *	It returns a valid dst pointer on success, or a pointer encoded
1218  *	error code.
1219  */
1220 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1221 					 const struct in6_addr *final_dst,
1222 					 bool connected)
1223 {
1224 	struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1225 
1226 	dst = ip6_sk_dst_check(sk, dst, fl6);
1227 	if (dst)
1228 		return dst;
1229 
1230 	dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_dst);
1231 	if (connected && !IS_ERR(dst))
1232 		ip6_sk_dst_store_flow(sk, dst_clone(dst), fl6);
1233 
1234 	return dst;
1235 }
1236 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1237 
/**
 *      ip6_dst_lookup_tunnel - perform route lookup on tunnel
 *      @skb: Packet for which lookup is done
 *      @dev: Tunnel device
 *      @net: Network namespace of tunnel device
 *      @sock: Socket which provides route info
 *      @saddr: Memory to store the src ip address
 *      @info: Tunnel information
 *      @protocol: IP protocol
 *      @use_cache: Flag to enable cache usage
 *
 *      This function performs a route lookup on a tunnel
 *
 *      It returns a valid dst pointer and stores src address to be used in
 *      tunnel in param saddr on success, else a pointer encoded error code.
 */

struct dst_entry *ip6_dst_lookup_tunnel(struct sk_buff *skb,
					struct net_device *dev,
					struct net *net,
					struct socket *sock,
					struct in6_addr *saddr,
					const struct ip_tunnel_info *info,
					u8 protocol,
					bool use_cache)
{
	struct dst_entry *dst = NULL;
#ifdef CONFIG_DST_CACHE
	struct dst_cache *dst_cache;
#endif
	struct flowi6 fl6;
	__u8 prio;

#ifdef CONFIG_DST_CACHE
	/* Fast path: a cache hit also fills in *saddr. */
	dst_cache = (struct dst_cache *)&info->dst_cache;
	if (use_cache) {
		dst = dst_cache_get_ip6(dst_cache, saddr);
		if (dst)
			return dst;
	}
#endif
	/* Build the flow from the tunnel key carried in @info. */
	memset(&fl6, 0, sizeof(fl6));
	fl6.flowi6_mark = skb->mark;
	fl6.flowi6_proto = protocol;
	fl6.daddr = info->key.u.ipv6.dst;
	fl6.saddr = info->key.u.ipv6.src;
	prio = info->key.tos;
	fl6.flowlabel = ip6_make_flowinfo(RT_TOS(prio),
					  info->key.label);

	dst = ipv6_stub->ipv6_dst_lookup_flow(net, sock->sk, &fl6,
					      NULL);
	if (IS_ERR(dst)) {
		netdev_dbg(dev, "no route to %pI6\n", &fl6.daddr);
		return ERR_PTR(-ENETUNREACH);
	}
	/* Refuse a route that loops back through the tunnel itself. */
	if (dst->dev == dev) { /* is this necessary? */
		netdev_dbg(dev, "circular route to %pI6\n", &fl6.daddr);
		dst_release(dst);
		return ERR_PTR(-ELOOP);
	}
#ifdef CONFIG_DST_CACHE
	if (use_cache)
		dst_cache_set_ip6(dst_cache, dst, &fl6.saddr);
#endif
	*saddr = fl6.saddr;
	return dst;
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup_tunnel);
1306 
1307 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1308 					       gfp_t gfp)
1309 {
1310 	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1311 }
1312 
1313 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1314 						gfp_t gfp)
1315 {
1316 	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1317 }
1318 
1319 static void ip6_append_data_mtu(unsigned int *mtu,
1320 				int *maxfraglen,
1321 				unsigned int fragheaderlen,
1322 				struct sk_buff *skb,
1323 				struct rt6_info *rt,
1324 				unsigned int orig_mtu)
1325 {
1326 	if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1327 		if (!skb) {
1328 			/* first fragment, reserve header_len */
1329 			*mtu = orig_mtu - rt->dst.header_len;
1330 
1331 		} else {
1332 			/*
1333 			 * this fragment is not first, the headers
1334 			 * space is regarded as data space.
1335 			 */
1336 			*mtu = orig_mtu;
1337 		}
1338 		*maxfraglen = ((*mtu - fragheaderlen) & ~7)
1339 			      + fragheaderlen - sizeof(struct frag_hdr);
1340 	}
1341 }
1342 
/* Initialise cork state for a sendmsg sequence: duplicate the caller's
 * tx options into @v6_cork, take a reference on @rt, and record the
 * flow, hop limit, traffic class, mark and path MTU in the cork.
 *
 * Returns 0 on success or a negative errno.  On an option-dup failure
 * any partially duplicated options are left in @v6_cork; presumably
 * they are reclaimed by a later ip6_cork_release() (as done in
 * ip6_make_skb()) — TODO confirm for the ip6_append_data() path.
 */
static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
			  struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
			  struct rt6_info *rt, struct flowi6 *fl6)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	unsigned int mtu;
	struct ipv6_txoptions *opt = ipc6->opt;

	/*
	 * setup for corking
	 */
	if (opt) {
		/* Options must not already be corked. */
		if (WARN_ON(v6_cork->opt))
			return -EINVAL;

		v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
		if (unlikely(!v6_cork->opt))
			return -ENOBUFS;

		v6_cork->opt->tot_len = sizeof(*opt);
		v6_cork->opt->opt_flen = opt->opt_flen;
		v6_cork->opt->opt_nflen = opt->opt_nflen;

		/* Deep-copy each extension header so the caller's opt
		 * can go away while data is still being appended.
		 */
		v6_cork->opt->dst0opt = ip6_opt_dup(opt->dst0opt,
						    sk->sk_allocation);
		if (opt->dst0opt && !v6_cork->opt->dst0opt)
			return -ENOBUFS;

		v6_cork->opt->dst1opt = ip6_opt_dup(opt->dst1opt,
						    sk->sk_allocation);
		if (opt->dst1opt && !v6_cork->opt->dst1opt)
			return -ENOBUFS;

		v6_cork->opt->hopopt = ip6_opt_dup(opt->hopopt,
						   sk->sk_allocation);
		if (opt->hopopt && !v6_cork->opt->hopopt)
			return -ENOBUFS;

		v6_cork->opt->srcrt = ip6_rthdr_dup(opt->srcrt,
						    sk->sk_allocation);
		if (opt->srcrt && !v6_cork->opt->srcrt)
			return -ENOBUFS;

		/* need source address above miyazawa*/
	}
	dst_hold(&rt->dst);
	cork->base.dst = &rt->dst;
	cork->fl.u.ip6 = *fl6;
	v6_cork->hop_limit = ipc6->hlimit;
	v6_cork->tclass = ipc6->tclass;
	/* Pick the fragmentation MTU: device MTU when PMTU discovery is
	 * in probe mode, otherwise the (possibly xfrm-path) dst MTU.
	 */
	if (rt->dst.flags & DST_XFRM_TUNNEL)
		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
		      READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst);
	else
		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
			READ_ONCE(rt->dst.dev->mtu) : dst_mtu(xfrm_dst_path(&rt->dst));
	/* A smaller per-socket IPV6_MTU overrides the path MTU. */
	if (np->frag_size < mtu) {
		if (np->frag_size)
			mtu = np->frag_size;
	}
	if (mtu < IPV6_MIN_MTU)
		return -EINVAL;
	cork->base.fragsize = mtu;
	cork->base.gso_size = ipc6->gso_size;
	cork->base.tx_flags = 0;
	cork->base.mark = ipc6->sockc.mark;
	sock_tx_timestamp(sk, ipc6->sockc.tsflags, &cork->base.tx_flags);

	if (dst_allfrag(xfrm_dst_path(&rt->dst)))
		cork->base.flags |= IPCORK_ALLFRAG;
	cork->base.length = 0;

	cork->base.transmit_time = ipc6->sockc.transmit_time;

	return 0;
}
1419 
/* Core of ip6_append_data()/ip6_make_skb(): copy @length bytes obtained
 * via @getfrag onto @queue, splitting the data into MTU-sized skbs (or
 * one oversized skb when GSO is in use) according to the cork state.
 * @transhdrlen is non-zero only for the first call of a corked sequence
 * and reserves room for the transport header in the first skb.
 *
 * Returns 0 on success or a negative errno; on error the bytes that
 * were not queued are subtracted from cork->length again.
 */
static int __ip6_append_data(struct sock *sk,
			     struct flowi6 *fl6,
			     struct sk_buff_head *queue,
			     struct inet_cork *cork,
			     struct inet6_cork *v6_cork,
			     struct page_frag *pfrag,
			     int getfrag(void *from, char *to, int offset,
					 int len, int odd, struct sk_buff *skb),
			     void *from, int length, int transhdrlen,
			     unsigned int flags, struct ipcm6_cookie *ipc6)
{
	struct sk_buff *skb, *skb_prev = NULL;
	unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
	struct ubuf_info *uarg = NULL;
	int exthdrlen = 0;
	int dst_exthdrlen = 0;
	int hh_len;
	int copy;
	int err;
	int offset = 0;
	u32 tskey = 0;
	struct rt6_info *rt = (struct rt6_info *)cork->dst;
	struct ipv6_txoptions *opt = v6_cork->opt;
	int csummode = CHECKSUM_NONE;
	unsigned int maxnonfragsize, headersize;
	unsigned int wmem_alloc_delta = 0;
	bool paged, extra_uref = false;

	/* Empty queue means this is the very first append for the cork:
	 * extension header space is only accounted in the first skb.
	 */
	skb = skb_peek_tail(queue);
	if (!skb) {
		exthdrlen = opt ? opt->opt_flen : 0;
		dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
	}

	paged = !!cork->gso_size;
	mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize;
	orig_mtu = mtu;

	/* NOTE(review): sk_tskey is incremented without atomics here;
	 * presumably serialized by the socket lock — confirm.
	 */
	if (cork->tx_flags & SKBTX_ANY_SW_TSTAMP &&
	    sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
		tskey = sk->sk_tskey++;

	hh_len = LL_RESERVED_SPACE(rt->dst.dev);

	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
			(opt ? opt->opt_nflen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
		     sizeof(struct frag_hdr);

	headersize = sizeof(struct ipv6hdr) +
		     (opt ? opt->opt_flen + opt->opt_nflen : 0) +
		     (dst_allfrag(&rt->dst) ?
		      sizeof(struct frag_hdr) : 0) +
		     rt->rt6i_nfheader_len;

	/* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit
	 * the first fragment
	 */
	if (headersize + transhdrlen > mtu)
		goto emsgsize;

	/* IPV6_DONTFRAG: report the path MTU to the app instead of
	 * fragmenting (UDP/RAW only).
	 */
	if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
	    (sk->sk_protocol == IPPROTO_UDP ||
	     sk->sk_protocol == IPPROTO_RAW)) {
		ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
				sizeof(struct ipv6hdr));
		goto emsgsize;
	}

	if (ip6_sk_ignore_df(sk))
		maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
	else
		maxnonfragsize = mtu;

	if (cork->length + length > maxnonfragsize - headersize) {
emsgsize:
		pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0);
		ipv6_local_error(sk, EMSGSIZE, fl6, pmtu);
		return -EMSGSIZE;
	}

	/* CHECKSUM_PARTIAL only with no extension headers and when
	 * we are not going to fragment
	 */
	if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
	    headersize == sizeof(struct ipv6hdr) &&
	    length <= mtu - headersize &&
	    (!(flags & MSG_MORE) || cork->gso_size) &&
	    rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
		csummode = CHECKSUM_PARTIAL;

	/* MSG_ZEROCOPY: pin the user pages instead of copying when the
	 * device can do SG + checksum offload; otherwise fall back to
	 * copying but keep the uarg for completion notification.
	 */
	if (flags & MSG_ZEROCOPY && length && sock_flag(sk, SOCK_ZEROCOPY)) {
		uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb));
		if (!uarg)
			return -ENOBUFS;
		extra_uref = !skb_zcopy(skb);	/* only ref on new uarg */
		if (rt->dst.dev->features & NETIF_F_SG &&
		    csummode == CHECKSUM_PARTIAL) {
			paged = true;
		} else {
			uarg->zerocopy = 0;
			skb_zcopy_set(skb, uarg, &extra_uref);
		}
	}

	/*
	 * Let's try using as much space as possible.
	 * Use MTU if total length of the message fits into the MTU.
	 * Otherwise, we need to reserve fragment header and
	 * fragment alignment (= 8-15 octects, in total).
	 *
	 * Note that we may need to "move" the data from the tail
	 * of the buffer to the new fragment when we split
	 * the message.
	 *
	 * FIXME: It may be fragmented into multiple chunks
	 *        at once if non-fragmentable extension headers
	 *        are too large.
	 * --yoshfuji
	 */

	cork->length += length;
	if (!skb)
		goto alloc_new_skb;

	/* Main copy loop: fill the tail skb, allocating a new one each
	 * time the current skb reaches its fragment boundary.
	 */
	while (length > 0) {
		/* Check if the remaining data fits into current packet. */
		copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;

		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen, alloc_extra;
			unsigned int pagedlen;
alloc_new_skb:
			/* There's no room in the current skb */
			if (skb)
				fraggap = skb->len - maxfraglen;
			else
				fraggap = 0;
			/* update mtu and maxfraglen if necessary */
			if (!skb || !skb_prev)
				ip6_append_data_mtu(&mtu, &maxfraglen,
						    fragheaderlen, skb, rt,
						    orig_mtu);

			skb_prev = skb;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;

			if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
				datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
			fraglen = datalen + fragheaderlen;
			pagedlen = 0;

			alloc_extra = hh_len;
			alloc_extra += dst_exthdrlen;
			alloc_extra += rt->dst.trailer_len;

			/* We just reserve space for fragment header.
			 * Note: this may be overallocation if the message
			 * (without MSG_MORE) fits into the MTU.
			 */
			alloc_extra += sizeof(struct frag_hdr);

			if ((flags & MSG_MORE) &&
			    !(rt->dst.dev->features&NETIF_F_SG))
				alloclen = mtu;
			else if (!paged &&
				 (fraglen + alloc_extra < SKB_MAX_ALLOC ||
				  !(rt->dst.dev->features & NETIF_F_SG)))
				alloclen = fraglen;
			else {
				/* Paged path: only headers in the linear
				 * area, the rest goes into page frags.
				 */
				alloclen = min_t(int, fraglen, MAX_HEADER);
				pagedlen = fraglen - alloclen;
			}
			alloclen += alloc_extra;

			if (datalen != length + fraggap) {
				/*
				 * this is not the last fragment, the trailer
				 * space is regarded as data space.
				 */
				datalen += rt->dst.trailer_len;
			}

			fraglen = datalen + fragheaderlen;

			copy = datalen - transhdrlen - fraggap - pagedlen;
			if (copy < 0) {
				err = -EINVAL;
				goto error;
			}
			/* First skb charges wmem and may block; later skbs
			 * are best-effort within 2*sndbuf.
			 */
			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk, alloclen,
						(flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <=
				    2 * sk->sk_sndbuf)
					skb = alloc_skb(alloclen,
							sk->sk_allocation);
				if (unlikely(!skb))
					err = -ENOBUFS;
			}
			if (!skb)
				goto error;
			/*
			 *	Fill in the control structures
			 */
			skb->protocol = htons(ETH_P_IPV6);
			skb->ip_summed = csummode;
			skb->csum = 0;
			/* reserve for fragmentation and ipsec header */
			skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
				    dst_exthdrlen);

			/*
			 *	Find where to start putting bytes
			 */
			data = skb_put(skb, fraglen - pagedlen);
			skb_set_network_header(skb, exthdrlen);
			data += fragheaderlen;
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			/* Move the overhang of the previous skb into this
			 * one so every fragment ends on an 8-byte boundary.
			 */
			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}
			if (copy > 0 &&
			    getfrag(from, data + transhdrlen, offset,
				    copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= copy + transhdrlen;
			transhdrlen = 0;
			exthdrlen = 0;
			dst_exthdrlen = 0;

			/* Only the initial fragment is time stamped */
			skb_shinfo(skb)->tx_flags = cork->tx_flags;
			cork->tx_flags = 0;
			skb_shinfo(skb)->tskey = tskey;
			tskey = 0;
			skb_zcopy_set(skb, uarg, &extra_uref);

			if ((flags & MSG_CONFIRM) && !skb_prev)
				skb_set_dst_pending_confirm(skb, 1);

			/*
			 * Put the packet on the pending queue
			 */
			if (!skb->destructor) {
				skb->destructor = sock_wfree;
				skb->sk = sk;
				wmem_alloc_delta += skb->truesize;
			}
			__skb_queue_tail(queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		/* Non-SG device: append into the linear tailroom. */
		if (!(rt->dst.dev->features&NETIF_F_SG) &&
		    skb_tailroom(skb) >= copy) {
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
						offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else if (!uarg || !uarg->zerocopy) {
			/* Copy into (possibly coalesced) page frags. */
			int i = skb_shinfo(skb)->nr_frags;

			err = -ENOMEM;
			if (!sk_page_frag_refill(sk, pfrag))
				goto error;

			if (!skb_can_coalesce(skb, i, pfrag->page,
					      pfrag->offset)) {
				err = -EMSGSIZE;
				if (i == MAX_SKB_FRAGS)
					goto error;

				__skb_fill_page_desc(skb, i, pfrag->page,
						     pfrag->offset, 0);
				skb_shinfo(skb)->nr_frags = ++i;
				get_page(pfrag->page);
			}
			copy = min_t(int, copy, pfrag->size - pfrag->offset);
			if (getfrag(from,
				    page_address(pfrag->page) + pfrag->offset,
				    offset, copy, skb->len, skb) < 0)
				goto error_efault;

			pfrag->offset += copy;
			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			wmem_alloc_delta += copy;
		} else {
			/* True zerocopy: reference the user pages. */
			err = skb_zerocopy_iter_dgram(skb, from, copy);
			if (err < 0)
				goto error;
		}
		offset += copy;
		length -= copy;
	}

	if (wmem_alloc_delta)
		refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
	return 0;

error_efault:
	err = -EFAULT;
error:
	net_zcopy_put_abort(uarg, extra_uref);
	cork->length -= length;
	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
	return err;
}
1764 
1765 int ip6_append_data(struct sock *sk,
1766 		    int getfrag(void *from, char *to, int offset, int len,
1767 				int odd, struct sk_buff *skb),
1768 		    void *from, int length, int transhdrlen,
1769 		    struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1770 		    struct rt6_info *rt, unsigned int flags)
1771 {
1772 	struct inet_sock *inet = inet_sk(sk);
1773 	struct ipv6_pinfo *np = inet6_sk(sk);
1774 	int exthdrlen;
1775 	int err;
1776 
1777 	if (flags&MSG_PROBE)
1778 		return 0;
1779 	if (skb_queue_empty(&sk->sk_write_queue)) {
1780 		/*
1781 		 * setup for corking
1782 		 */
1783 		err = ip6_setup_cork(sk, &inet->cork, &np->cork,
1784 				     ipc6, rt, fl6);
1785 		if (err)
1786 			return err;
1787 
1788 		exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1789 		length += exthdrlen;
1790 		transhdrlen += exthdrlen;
1791 	} else {
1792 		fl6 = &inet->cork.fl.u.ip6;
1793 		transhdrlen = 0;
1794 	}
1795 
1796 	return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base,
1797 				 &np->cork, sk_page_frag(sk), getfrag,
1798 				 from, length, transhdrlen, flags, ipc6);
1799 }
1800 EXPORT_SYMBOL_GPL(ip6_append_data);
1801 
1802 static void ip6_cork_release(struct inet_cork_full *cork,
1803 			     struct inet6_cork *v6_cork)
1804 {
1805 	if (v6_cork->opt) {
1806 		kfree(v6_cork->opt->dst0opt);
1807 		kfree(v6_cork->opt->dst1opt);
1808 		kfree(v6_cork->opt->hopopt);
1809 		kfree(v6_cork->opt->srcrt);
1810 		kfree(v6_cork->opt);
1811 		v6_cork->opt = NULL;
1812 	}
1813 
1814 	if (cork->base.dst) {
1815 		dst_release(cork->base.dst);
1816 		cork->base.dst = NULL;
1817 		cork->base.flags &= ~IPCORK_ALLFRAG;
1818 	}
1819 	memset(&cork->fl, 0, sizeof(cork->fl));
1820 }
1821 
/* Collapse the queued fragments into one skb (chained via frag_list),
 * push any extension headers and the IPv6 header, fill in flow label,
 * hop limit and addresses, account OUT stats, and release the cork.
 *
 * Returns the finished skb, or NULL if the queue was empty (in which
 * case the cork is left untouched).
 */
struct sk_buff *__ip6_make_skb(struct sock *sk,
			       struct sk_buff_head *queue,
			       struct inet_cork_full *cork,
			       struct inet6_cork *v6_cork)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct net *net = sock_net(sk);
	struct ipv6hdr *hdr;
	struct ipv6_txoptions *opt = v6_cork->opt;
	struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
	struct flowi6 *fl6 = &cork->fl.u.ip6;
	unsigned char proto = fl6->flowi6_proto;

	skb = __skb_dequeue(queue);
	if (!skb)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	/* Chain the remaining queued skbs onto the first one's frag_list,
	 * folding their sizes into the head skb and dropping their
	 * individual socket ownership.
	 */
	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Allow local fragmentation. */
	skb->ignore_df = ip6_sk_ignore_df(sk);

	/* Keep the original daddr: pushing a routing header may rewrite
	 * fl6->daddr (see ipv6_push_nfrag_opts), but the IPv6 header's
	 * destination must be the final one.
	 */
	*final_dst = fl6->daddr;
	__skb_pull(skb, skb_network_header_len(skb));
	if (opt && opt->opt_flen)
		ipv6_push_frag_opts(skb, opt, &proto);
	if (opt && opt->opt_nflen)
		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	ip6_flow_hdr(hdr, v6_cork->tclass,
		     ip6_make_flowlabel(net, skb, fl6->flowlabel,
					ip6_autoflowlabel(net, np), fl6));
	hdr->hop_limit = v6_cork->hop_limit;
	hdr->nexthdr = proto;
	hdr->saddr = fl6->saddr;
	hdr->daddr = *final_dst;

	skb->priority = sk->sk_priority;
	skb->mark = cork->base.mark;

	skb->tstamp = cork->base.transmit_time;

	skb_dst_set(skb, dst_clone(&rt->dst));
	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
	if (proto == IPPROTO_ICMPV6) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
		ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
	}

	ip6_cork_release(cork, v6_cork);
out:
	return skb;
}
1897 
1898 int ip6_send_skb(struct sk_buff *skb)
1899 {
1900 	struct net *net = sock_net(skb->sk);
1901 	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
1902 	int err;
1903 
1904 	err = ip6_local_out(net, skb->sk, skb);
1905 	if (err) {
1906 		if (err > 0)
1907 			err = net_xmit_errno(err);
1908 		if (err)
1909 			IP6_INC_STATS(net, rt->rt6i_idev,
1910 				      IPSTATS_MIB_OUTDISCARDS);
1911 	}
1912 
1913 	return err;
1914 }
1915 
/* Finalize the socket's pending write queue into one packet and send
 * it.  Returns 0 when nothing was queued.
 */
int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb = ip6_finish_skb(sk);

	return skb ? ip6_send_skb(skb) : 0;
}
EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
1927 
1928 static void __ip6_flush_pending_frames(struct sock *sk,
1929 				       struct sk_buff_head *queue,
1930 				       struct inet_cork_full *cork,
1931 				       struct inet6_cork *v6_cork)
1932 {
1933 	struct sk_buff *skb;
1934 
1935 	while ((skb = __skb_dequeue_tail(queue)) != NULL) {
1936 		if (skb_dst(skb))
1937 			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1938 				      IPSTATS_MIB_OUTDISCARDS);
1939 		kfree_skb(skb);
1940 	}
1941 
1942 	ip6_cork_release(cork, v6_cork);
1943 }
1944 
/* Public wrapper: drop everything corked on @sk's write queue. */
void ip6_flush_pending_frames(struct sock *sk)
{
	__ip6_flush_pending_frames(sk, &sk->sk_write_queue,
				   &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
}
EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
1951 
1952 struct sk_buff *ip6_make_skb(struct sock *sk,
1953 			     int getfrag(void *from, char *to, int offset,
1954 					 int len, int odd, struct sk_buff *skb),
1955 			     void *from, int length, int transhdrlen,
1956 			     struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1957 			     struct rt6_info *rt, unsigned int flags,
1958 			     struct inet_cork_full *cork)
1959 {
1960 	struct inet6_cork v6_cork;
1961 	struct sk_buff_head queue;
1962 	int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1963 	int err;
1964 
1965 	if (flags & MSG_PROBE)
1966 		return NULL;
1967 
1968 	__skb_queue_head_init(&queue);
1969 
1970 	cork->base.flags = 0;
1971 	cork->base.addr = 0;
1972 	cork->base.opt = NULL;
1973 	cork->base.dst = NULL;
1974 	v6_cork.opt = NULL;
1975 	err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt, fl6);
1976 	if (err) {
1977 		ip6_cork_release(cork, &v6_cork);
1978 		return ERR_PTR(err);
1979 	}
1980 	if (ipc6->dontfrag < 0)
1981 		ipc6->dontfrag = inet6_sk(sk)->dontfrag;
1982 
1983 	err = __ip6_append_data(sk, fl6, &queue, &cork->base, &v6_cork,
1984 				&current->task_frag, getfrag, from,
1985 				length + exthdrlen, transhdrlen + exthdrlen,
1986 				flags, ipc6);
1987 	if (err) {
1988 		__ip6_flush_pending_frames(sk, &queue, cork, &v6_cork);
1989 		return ERR_PTR(err);
1990 	}
1991 
1992 	return __ip6_make_skb(sk, &queue, cork, &v6_cork);
1993 }
1994