// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *	IPv6 output functions
 *	Linux INET6 implementation
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	Based on linux/net/ipv4/ip_output.c
 *
 *	Changes:
 *	A.N.Kuznetsov	:	arithmetic in fragmentation.
 *				extension headers are implemented.
 *				route changes now work.
 *				ip6_forward does not confuse sniffers.
 *				etc.
 *
 *	H. von Brand	:	Added missing #include <linux/string.h>
 *	Imran Patel	:	frag id should be in NBO
 *	Kazunori MIYAZAWA @USAGI
 *			:	add ip6_append_data and related functions
 *				for datagram xmit
 */

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/in6.h>
#include <linux/tcp.h>
#include <linux/route.h>
#include <linux/module.h>
#include <linux/slab.h>

#include <linux/bpf-cgroup.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>

#include <net/sock.h>
#include <net/snmp.h>

#include <net/ipv6.h>
#include <net/ndisc.h>
#include <net/protocol.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
#include <net/rawv6.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/checksum.h>
#include <linux/mroute6.h>
#include <net/l3mdev.h>
#include <net/lwtunnel.h>
#include <net/ip_tunnels.h>

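/*
 * Output path in outline: locally generated and forwarded packets reach
 * ip6_output(), which runs the NF_INET_POST_ROUTING hook and then
 * ip6_finish_output() (cgroup BPF egress check).  __ip6_finish_output()
 * decides between plain transmission and fragmentation, and
 * ip6_finish_output2() resolves the nexthop neighbour and hands the skb
 * to the device layer via neigh_output().
 */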
static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct net_device *dev = dst->dev;
	struct inet6_dev *idev = ip6_dst_idev(dst);
	unsigned int hh_len = LL_RESERVED_SPACE(dev);
	const struct in6_addr *daddr, *nexthop;
	struct ipv6hdr *hdr;
	struct neighbour *neigh;
	int ret;

	/* Be paranoid, rather than too clever. */
	if (unlikely(hh_len > skb_headroom(skb)) && dev->header_ops) {
		skb = skb_expand_head(skb, hh_len);
		if (!skb) {
			IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
			return -ENOMEM;
		}
	}

	hdr = ipv6_hdr(skb);
	daddr = &hdr->daddr;
	if (ipv6_addr_is_multicast(daddr)) {
		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
		    ((mroute6_is_socket(net, skb) &&
		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
		     ipv6_chk_mcast_addr(dev, daddr, &hdr->saddr))) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

			/* Do not check for IFF_ALLMULTI; multicast routing
			 * is not supported in any case.
			 */
			if (newskb)
				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
					net, sk, newskb, NULL, newskb->dev,
					dev_loopback_xmit);

			if (hdr->hop_limit == 0) {
				IP6_INC_STATS(net, idev,
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return 0;
			}
		}

		IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);
		if (IPV6_ADDR_MC_SCOPE(daddr) <= IPV6_ADDR_SCOPE_NODELOCAL &&
		    !(dev->flags & IFF_LOOPBACK)) {
			kfree_skb(skb);
			return 0;
		}
	}

	if (lwtunnel_xmit_redirect(dst->lwtstate)) {
		int res = lwtunnel_xmit(skb);

		if (res < 0 || res == LWTUNNEL_XMIT_DONE)
			return res;
	}

	rcu_read_lock_bh();
	nexthop = rt6_nexthop((struct rt6_info *)dst, daddr);
	neigh = __ipv6_neigh_lookup_noref(dev, nexthop);

	if (unlikely(IS_ERR_OR_NULL(neigh))) {
		if (unlikely(!neigh))
			neigh = __neigh_create(&nd_tbl, nexthop, dev, false);
		if (IS_ERR(neigh)) {
			rcu_read_unlock_bh();
			IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTNOROUTES);
			kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_CREATEFAIL);
			return -EINVAL;
		}
	}
	sock_confirm_neigh(skb, neigh);
	ret = neigh_output(neigh, skb, false);
	rcu_read_unlock_bh();
	return ret;
}

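/* A GSO skb whose segments would not fit the egress MTU cannot be
 * handed down whole: segment it in software and run each resulting
 * skb through ip6_fragment() individually.
 */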
static int
ip6_finish_output_gso_slowpath_drop(struct net *net, struct sock *sk,
				    struct sk_buff *skb, unsigned int mtu)
{
	struct sk_buff *segs, *nskb;
	netdev_features_t features;
	int ret = 0;

	/* Please see corresponding comment in ip_finish_output_gso
	 * describing the cases where GSO segment length exceeds the
	 * egress MTU.
	 */
	features = netif_skb_features(skb);
	segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
	if (IS_ERR_OR_NULL(segs)) {
		kfree_skb(skb);
		return -ENOMEM;
	}

	consume_skb(skb);

	skb_list_walk_safe(segs, segs, nskb) {
		int err;

		skb_mark_not_on_list(segs);
		err = ip6_fragment(net, sk, segs, ip6_finish_output2);
		if (err && ret == 0)
			ret = err;
	}

	return ret;
}

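/* Fragment when the packet exceeds the path MTU (and is not GSO),
 * when the dst is marked allfrag, or when conntrack defragmented a
 * packet that originally arrived in smaller fragments
 * (frag_max_size); otherwise transmit directly.
 */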
static int __ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	unsigned int mtu;

#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
	/* Policy lookup after SNAT yielded a new policy */
	if (skb_dst(skb)->xfrm) {
		IP6CB(skb)->flags |= IP6SKB_REROUTED;
		return dst_output(net, sk, skb);
	}
#endif

	mtu = ip6_skb_dst_mtu(skb);
	if (skb_is_gso(skb) && !skb_gso_validate_network_len(skb, mtu))
		return ip6_finish_output_gso_slowpath_drop(net, sk, skb, mtu);

	if ((skb->len > mtu && !skb_is_gso(skb)) ||
	    dst_allfrag(skb_dst(skb)) ||
	    (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
		return ip6_fragment(net, sk, skb, ip6_finish_output2);
	else
		return ip6_finish_output2(net, sk, skb);
}

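/* Run the cgroup BPF egress program before transmission.
 * NET_XMIT_SUCCESS and NET_XMIT_CN let the packet continue (CN is
 * preserved as the return value so callers still see the congestion
 * hint); any other verdict drops the packet.
 */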
static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	int ret;

	ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
	switch (ret) {
	case NET_XMIT_SUCCESS:
	case NET_XMIT_CN:
		return __ip6_finish_output(net, sk, skb) ? : ret;
	default:
		kfree_skb_reason(skb, SKB_DROP_REASON_BPF_CGROUP_EGRESS);
		return ret;
	}
}

int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct net_device *dev = skb_dst(skb)->dev, *indev = skb->dev;
	struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	if (unlikely(idev->cnf.disable_ipv6)) {
		IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
		kfree_skb_reason(skb, SKB_DROP_REASON_IPV6DISABLED);
		return 0;
	}

	return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
			    net, sk, skb, indev, dev,
			    ip6_finish_output,
			    !(IP6CB(skb)->flags & IP6SKB_REROUTED));
}
EXPORT_SYMBOL(ip6_output);

bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np)
{
	if (!np->autoflowlabel_set)
		return ip6_default_np_autolabel(net);
	else
		return np->autoflowlabel;
}

/*
 * xmit an sk_buff (used by TCP, SCTP and DCCP)
 * Note: the socket lock is not held for SYNACK packets, but the skb might
 * be modified by calls to skb_set_owner_w() and ipv6_local_error(),
 * which use proper atomic operations or spinlocks.
 */
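/*
 * Rough shape of a connection-oriented caller (illustrative sketch
 * only; variable names are hypothetical and error handling is elided):
 *
 *	struct flowi6 fl6 = { .flowi6_proto = IPPROTO_TCP, ... };
 *	struct dst_entry *dst;
 *
 *	dst = ip6_dst_lookup_flow(net, sk, &fl6, NULL);
 *	skb_dst_set(skb, dst);	// ip6_xmit() takes the route from skb_dst()
 *	ip6_xmit(sk, skb, &fl6, sk->sk_mark, rcu_dereference(np->opt),
 *		 np->tclass, sk->sk_priority);
 */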
int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
	     __u32 mark, struct ipv6_txoptions *opt, int tclass, u32 priority)
{
	struct net *net = sock_net(sk);
	const struct ipv6_pinfo *np = inet6_sk(sk);
	struct in6_addr *first_hop = &fl6->daddr;
	struct dst_entry *dst = skb_dst(skb);
	struct net_device *dev = dst->dev;
	struct inet6_dev *idev = ip6_dst_idev(dst);
	unsigned int head_room;
	struct ipv6hdr *hdr;
	u8  proto = fl6->flowi6_proto;
	int seg_len = skb->len;
	int hlimit = -1;
	u32 mtu;

	head_room = sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dev);
	if (opt)
		head_room += opt->opt_nflen + opt->opt_flen;

	if (unlikely(head_room > skb_headroom(skb))) {
		skb = skb_expand_head(skb, head_room);
		if (!skb) {
			IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
			return -ENOBUFS;
		}
	}

	if (opt) {
		seg_len += opt->opt_nflen + opt->opt_flen;

		if (opt->opt_flen)
			ipv6_push_frag_opts(skb, opt, &proto);

		if (opt->opt_nflen)
			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
					     &fl6->saddr);
	}

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	/*
	 *	Fill in the IPv6 header
	 */
	if (np)
		hlimit = np->hop_limit;
	if (hlimit < 0)
		hlimit = ip6_dst_hoplimit(dst);

	ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
				ip6_autoflowlabel(net, np), fl6));

	hdr->payload_len = htons(seg_len);
	hdr->nexthdr = proto;
	hdr->hop_limit = hlimit;

	hdr->saddr = fl6->saddr;
	hdr->daddr = *first_hop;

	skb->protocol = htons(ETH_P_IPV6);
	skb->priority = priority;
	skb->mark = mark;

	mtu = dst_mtu(dst);
	if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
		IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUT, skb->len);

		/* if egress device is enslaved to an L3 master device pass the
		 * skb to its handler for processing
		 */
		skb = l3mdev_ip6_out((struct sock *)sk, skb);
		if (unlikely(!skb))
			return 0;

		/* hooks should never assume socket lock is held.
		 * we promote our socket to non const
		 */
		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
			       net, (struct sock *)sk, skb, NULL, dev,
			       dst_output);
	}

	skb->dev = dev;
	/* ipv6_local_error() does not require socket lock,
	 * we promote our socket to non const
	 */
	ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);

	IP6_INC_STATS(net, idev, IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return -EMSGSIZE;
}
EXPORT_SYMBOL(ip6_xmit);

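/* Deliver a packet carrying a Router Alert hop-by-hop option
 * (RFC 2711) to every raw socket that registered for this alert
 * value with the IPV6_ROUTER_ALERT socket option.  Returns 1 if at
 * least one socket consumed the skb, 0 if the caller still owns it.
 */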
static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
	struct ip6_ra_chain *ra;
	struct sock *last = NULL;

	read_lock(&ip6_ra_lock);
	for (ra = ip6_ra_chain; ra; ra = ra->next) {
		struct sock *sk = ra->sk;
		if (sk && ra->sel == sel &&
		    (!sk->sk_bound_dev_if ||
		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
			struct ipv6_pinfo *np = inet6_sk(sk);

			if (np && np->rtalert_isolate &&
			    !net_eq(sock_net(sk), dev_net(skb->dev))) {
				continue;
			}
			if (last) {
				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
				if (skb2)
					rawv6_rcv(last, skb2);
			}
			last = sk;
		}
	}

	if (last) {
		rawv6_rcv(last, skb);
		read_unlock(&ip6_ra_lock);
		return 1;
	}
	read_unlock(&ip6_ra_lock);
	return 0;
}

static int ip6_forward_proxy_check(struct sk_buff *skb)
{
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	u8 nexthdr = hdr->nexthdr;
	__be16 frag_off;
	int offset;

	if (ipv6_ext_hdr(nexthdr)) {
		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
		if (offset < 0)
			return 0;
	} else
		offset = sizeof(struct ipv6hdr);

	if (nexthdr == IPPROTO_ICMPV6) {
		struct icmp6hdr *icmp6;

		if (!pskb_may_pull(skb, (skb_network_header(skb) +
					 offset + 1 - skb->data)))
			return 0;

		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);

		switch (icmp6->icmp6_type) {
		case NDISC_ROUTER_SOLICITATION:
		case NDISC_ROUTER_ADVERTISEMENT:
		case NDISC_NEIGHBOUR_SOLICITATION:
		case NDISC_NEIGHBOUR_ADVERTISEMENT:
		case NDISC_REDIRECT:
			/* Pass unicast neighbour discovery messages that
			 * are destined to the proxied address to the input
			 * function.
			 */
			return 1;
		default:
			break;
		}
	}

	/*
	 * The proxying router can't forward traffic sent to a link-local
	 * address, so signal the sender and discard the packet. This
	 * behavior is clarified by the MIPv6 specification.
	 */
	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
		dst_link_failure(skb);
		return -1;
	}

	return 0;
}

static inline int ip6_forward_finish(struct net *net, struct sock *sk,
				     struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);

	__IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
	__IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);

#ifdef CONFIG_NET_SWITCHDEV
	if (skb->offload_l3_fwd_mark) {
		consume_skb(skb);
		return 0;
	}
#endif

	skb_clear_tstamp(skb);
	return dst_output(net, sk, skb);
}

static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
{
	if (skb->len <= mtu)
		return false;

	/* ipv6 conntrack defrag sets max_frag_size + ignore_df */
	if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
		return true;

	if (skb->ignore_df)
		return false;

	if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
		return false;

	return true;
}

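/* Forwarding entry point for packets not addressed to this host:
 * validates the hop limit, handles router alerts and proxy NDP,
 * sends redirects where permitted, enforces the path MTU, and
 * finally passes the packet through the NF_INET_FORWARD hook.
 */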
int ip6_forward(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	struct inet6_skb_parm *opt = IP6CB(skb);
	struct net *net = dev_net(dst->dev);
	struct inet6_dev *idev;
	SKB_DR(reason);
	u32 mtu;

	idev = __in6_dev_get_safely(dev_get_by_index_rcu(net, IP6CB(skb)->iif));
	if (net->ipv6.devconf_all->forwarding == 0)
		goto error;

	if (skb->pkt_type != PACKET_HOST)
		goto drop;

	if (unlikely(skb->sk))
		goto drop;

	if (skb_warn_if_lro(skb))
		goto drop;

	if (!net->ipv6.devconf_all->disable_policy &&
	    (!idev || !idev->cnf.disable_policy) &&
	    !xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
		goto drop;
	}

	skb_forward_csum(skb);

	/*
	 *	We do NOT do any processing on Router Alert packets;
	 *	they are pushed to user level AS IS, without any warranty
	 *	that the application will be able to interpret them.
	 *	The reason is that we cannot do anything clever here.
	 *
	 *	We are not an end node, so if the packet contains
	 *	AH/ESP we cannot do anything with it.
	 *	Defragmentation would also be a mistake: RA packets
	 *	must not be fragmented, because there is no guarantee
	 *	that different fragments will follow the same path. --ANK
	 */
	if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
		if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
			return 0;
	}

	/*
	 *	check and decrement hop limit
	 */
	if (hdr->hop_limit <= 1) {
		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);

		kfree_skb_reason(skb, SKB_DROP_REASON_IP_INHDR);
		return -ETIMEDOUT;
	}

	/* XXX: idev->cnf.proxy_ndp? */
	if (net->ipv6.devconf_all->proxy_ndp &&
	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
		int proxied = ip6_forward_proxy_check(skb);
		if (proxied > 0) {
			hdr->hop_limit--;
			return ip6_input(skb);
		} else if (proxied < 0) {
			__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
			goto drop;
		}
	}

	if (!xfrm6_route_forward(skb)) {
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
		SKB_DR_SET(reason, XFRM_POLICY);
		goto drop;
	}
	dst = skb_dst(skb);

	/* The IPv6 specs say nothing about it, but it is clear that we
	 * cannot send redirects for source routed frames.
	 * We don't send redirects for frames decapsulated from IPsec.
	 */
	if (IP6CB(skb)->iif == dst->dev->ifindex &&
	    opt->srcrt == 0 && !skb_sec_path(skb)) {
		struct in6_addr *target = NULL;
		struct inet_peer *peer;
		struct rt6_info *rt;

		/*
		 *	incoming and outgoing devices are the same:
		 *	send a redirect.
		 */

		rt = (struct rt6_info *) dst;
		if (rt->rt6i_flags & RTF_GATEWAY)
			target = &rt->rt6i_gateway;
		else
			target = &hdr->daddr;

		peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);

		/* Limit redirects both by destination (here)
		 * and by source (inside ndisc_send_redirect)
		 */
		if (inet_peer_xrlim_allow(peer, 1*HZ))
			ndisc_send_redirect(skb, target);
		if (peer)
			inet_putpeer(peer);
	} else {
		int addrtype = ipv6_addr_type(&hdr->saddr);

		/* This check is security critical. */
		if (addrtype == IPV6_ADDR_ANY ||
		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
			goto error;
		if (addrtype & IPV6_ADDR_LINKLOCAL) {
			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
				    ICMPV6_NOT_NEIGHBOUR, 0);
			goto error;
		}
	}

	mtu = ip6_dst_mtu_maybe_forward(dst, true);
	if (mtu < IPV6_MIN_MTU)
		mtu = IPV6_MIN_MTU;

	if (ip6_pkt_too_big(skb, mtu)) {
		/* Again, force the OUTPUT device to be used for the
		 * source address of the ICMP error.
		 */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INTOOBIGERRORS);
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_FRAGFAILS);
		kfree_skb_reason(skb, SKB_DROP_REASON_PKT_TOO_BIG);
		return -EMSGSIZE;
	}

	if (skb_cow(skb, dst->dev->hard_header_len)) {
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_OUTDISCARDS);
		goto drop;
	}

	hdr = ipv6_hdr(skb);

	/* Decrementing the hop limit is delayed until after the skb COW */

	hdr->hop_limit--;

	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
		       net, NULL, skb, skb->dev, dst->dev,
		       ip6_forward_finish);

error:
	__IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
	SKB_DR_SET(reason, IP_INADDRERRORS);
drop:
	kfree_skb_reason(skb, reason);
	return -EINVAL;
}

static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	skb_dst_drop(to);
	skb_dst_set(to, dst_clone(skb_dst(from)));
	to->dev = from->dev;
	to->mark = from->mark;

	skb_copy_hash(to, from);

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
	nf_copy(to, from);
	skb_ext_copy(to, from);
	skb_copy_secmark(to, from);
}

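/* Fast-path fragmentation helpers: when an skb already carries a
 * frag_list whose geometry matches the MTU, each list member becomes
 * one fragment.  ip6_fraglist_init() detaches the list and turns the
 * head skb into the first fragment; ip6_fraglist_prepare() then fixes
 * up each follow-on fragment (header copy, offset, MF flag, payload
 * length).
 */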
int ip6_fraglist_init(struct sk_buff *skb, unsigned int hlen, u8 *prevhdr,
		      u8 nexthdr, __be32 frag_id,
		      struct ip6_fraglist_iter *iter)
{
	unsigned int first_len;
	struct frag_hdr *fh;

	/* BUILD HEADER */
	*prevhdr = NEXTHDR_FRAGMENT;
	iter->tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
	if (!iter->tmp_hdr)
		return -ENOMEM;

	iter->frag = skb_shinfo(skb)->frag_list;
	skb_frag_list_init(skb);

	iter->offset = 0;
	iter->hlen = hlen;
	iter->frag_id = frag_id;
	iter->nexthdr = nexthdr;

	__skb_pull(skb, hlen);
	fh = __skb_push(skb, sizeof(struct frag_hdr));
	__skb_push(skb, hlen);
	skb_reset_network_header(skb);
	memcpy(skb_network_header(skb), iter->tmp_hdr, hlen);

	fh->nexthdr = nexthdr;
	fh->reserved = 0;
	fh->frag_off = htons(IP6_MF);
	fh->identification = frag_id;

	first_len = skb_pagelen(skb);
	skb->data_len = first_len - skb_headlen(skb);
	skb->len = first_len;
	ipv6_hdr(skb)->payload_len = htons(first_len - sizeof(struct ipv6hdr));

	return 0;
}
EXPORT_SYMBOL(ip6_fraglist_init);

void ip6_fraglist_prepare(struct sk_buff *skb,
			  struct ip6_fraglist_iter *iter)
{
	struct sk_buff *frag = iter->frag;
	unsigned int hlen = iter->hlen;
	struct frag_hdr *fh;

	frag->ip_summed = CHECKSUM_NONE;
	skb_reset_transport_header(frag);
	fh = __skb_push(frag, sizeof(struct frag_hdr));
	__skb_push(frag, hlen);
	skb_reset_network_header(frag);
	memcpy(skb_network_header(frag), iter->tmp_hdr, hlen);
	iter->offset += skb->len - hlen - sizeof(struct frag_hdr);
	fh->nexthdr = iter->nexthdr;
	fh->reserved = 0;
	fh->frag_off = htons(iter->offset);
	if (frag->next)
		fh->frag_off |= htons(IP6_MF);
	fh->identification = iter->frag_id;
	ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
	ip6_copy_metadata(frag, skb);
}
EXPORT_SYMBOL(ip6_fraglist_prepare);

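/* Slow-path fragmentation state: ip6_frag_init() records how much
 * payload remains and where to resume; ip6_frag_next() allocates and
 * fills one fragment per call until state->left reaches zero.
 */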
void ip6_frag_init(struct sk_buff *skb, unsigned int hlen, unsigned int mtu,
		   unsigned short needed_tailroom, int hdr_room, u8 *prevhdr,
		   u8 nexthdr, __be32 frag_id, struct ip6_frag_state *state)
{
	state->prevhdr = prevhdr;
	state->nexthdr = nexthdr;
	state->frag_id = frag_id;

	state->hlen = hlen;
	state->mtu = mtu;

	state->left = skb->len - hlen;	/* Space per frame */
	state->ptr = hlen;		/* Where to start from */

	state->hroom = hdr_room;
	state->troom = needed_tailroom;

	state->offset = 0;
}
EXPORT_SYMBOL(ip6_frag_init);

struct sk_buff *ip6_frag_next(struct sk_buff *skb, struct ip6_frag_state *state)
{
	u8 *prevhdr = state->prevhdr, *fragnexthdr_offset;
	struct sk_buff *frag;
	struct frag_hdr *fh;
	unsigned int len;

	len = state->left;
	/* IF: it doesn't fit, use 'mtu' - the data space left */
	if (len > state->mtu)
		len = state->mtu;
	/* IF: we are not sending up to and including the packet end
	 * then align the next start on an eight byte boundary
	 */
	if (len < state->left)
		len &= ~7;

	/* Allocate buffer */
	frag = alloc_skb(len + state->hlen + sizeof(struct frag_hdr) +
			 state->hroom + state->troom, GFP_ATOMIC);
	if (!frag)
		return ERR_PTR(-ENOMEM);

	/*
	 *	Set up data on packet
	 */

	ip6_copy_metadata(frag, skb);
	skb_reserve(frag, state->hroom);
	skb_put(frag, len + state->hlen + sizeof(struct frag_hdr));
	skb_reset_network_header(frag);
	fh = (struct frag_hdr *)(skb_network_header(frag) + state->hlen);
	frag->transport_header = (frag->network_header + state->hlen +
				  sizeof(struct frag_hdr));

	/*
	 *	Charge the memory for the fragment to any owner
	 *	it might possess
	 */
	if (skb->sk)
		skb_set_owner_w(frag, skb->sk);

	/*
	 *	Copy the packet header into the new buffer.
	 */
	skb_copy_from_linear_data(skb, skb_network_header(frag), state->hlen);

	fragnexthdr_offset = skb_network_header(frag);
	fragnexthdr_offset += prevhdr - skb_network_header(skb);
	*fragnexthdr_offset = NEXTHDR_FRAGMENT;

	/*
	 *	Build fragment header.
	 */
	fh->nexthdr = state->nexthdr;
	fh->reserved = 0;
	fh->identification = state->frag_id;

	/*
	 *	Copy a block of the IP datagram.
	 */
	BUG_ON(skb_copy_bits(skb, state->ptr, skb_transport_header(frag),
			     len));
	state->left -= len;

	fh->frag_off = htons(state->offset);
	if (state->left > 0)
		fh->frag_off |= htons(IP6_MF);
	ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));

	state->ptr += len;
	state->offset += len;

	return frag;
}
EXPORT_SYMBOL(ip6_frag_next);

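/* Fragment a datagram that exceeds the path MTU.  The usable payload
 * per fragment is mtu - hlen - sizeof(struct frag_hdr), rounded down
 * to a multiple of 8 for every fragment but the last.  For example,
 * with a 1500 byte MTU and a bare 40 byte IPv6 header:
 * 1500 - 40 - 8 = 1452, so each full fragment carries 1448 bytes of
 * payload (1452 & ~7).
 */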
int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
		 int (*output)(struct net *, struct sock *, struct sk_buff *))
{
	struct sk_buff *frag;
	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
	struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
				inet6_sk(skb->sk) : NULL;
	bool mono_delivery_time = skb->mono_delivery_time;
	struct ip6_frag_state state;
	unsigned int mtu, hlen, nexthdr_offset;
	ktime_t tstamp = skb->tstamp;
	int hroom, err = 0;
	__be32 frag_id;
	u8 *prevhdr, nexthdr = 0;

	err = ip6_find_1stfragopt(skb, &prevhdr);
	if (err < 0)
		goto fail;
	hlen = err;
	nexthdr = *prevhdr;
	nexthdr_offset = prevhdr - skb_network_header(skb);

	mtu = ip6_skb_dst_mtu(skb);

	/* We must not fragment if the socket is set to force MTU discovery
	 * or if the skb is not generated by a local socket.
	 */
	if (unlikely(!skb->ignore_df && skb->len > mtu))
		goto fail_toobig;

	if (IP6CB(skb)->frag_max_size) {
		if (IP6CB(skb)->frag_max_size > mtu)
			goto fail_toobig;

		/* don't send fragments larger than what we received */
		mtu = IP6CB(skb)->frag_max_size;
		if (mtu < IPV6_MIN_MTU)
			mtu = IPV6_MIN_MTU;
	}

	if (np && np->frag_size < mtu) {
		if (np->frag_size)
			mtu = np->frag_size;
	}
	if (mtu < hlen + sizeof(struct frag_hdr) + 8)
		goto fail_toobig;
	mtu -= hlen + sizeof(struct frag_hdr);

	frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
				    &ipv6_hdr(skb)->saddr);

	if (skb->ip_summed == CHECKSUM_PARTIAL &&
	    (err = skb_checksum_help(skb)))
		goto fail;

	prevhdr = skb_network_header(skb) + nexthdr_offset;
	hroom = LL_RESERVED_SPACE(rt->dst.dev);
	if (skb_has_frag_list(skb)) {
		unsigned int first_len = skb_pagelen(skb);
		struct ip6_fraglist_iter iter;
		struct sk_buff *frag2;

		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    skb_cloned(skb) ||
		    skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
			goto slow_path;

		skb_walk_frags(skb, frag) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
				goto slow_path_clean;

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path_clean;

			BUG_ON(frag->sk);
			if (skb->sk) {
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
			}
			skb->truesize -= frag->truesize;
		}

		err = ip6_fraglist_init(skb, hlen, prevhdr, nexthdr, frag_id,
					&iter);
		if (err < 0)
			goto fail;

		for (;;) {
			/* Prepare the header of the next frame
			 * before the previous one goes down.
			 */
			if (iter.frag)
				ip6_fraglist_prepare(skb, &iter);

			skb_set_delivery_time(skb, tstamp, mono_delivery_time);
			err = output(net, sk, skb);
			if (!err)
				IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
					      IPSTATS_MIB_FRAGCREATES);

			if (err || !iter.frag)
				break;

			skb = ip6_fraglist_next(&iter);
		}

		kfree(iter.tmp_hdr);

		if (err == 0) {
			IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
				      IPSTATS_MIB_FRAGOKS);
			return 0;
		}

		kfree_skb_list(iter.frag);

		IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
			      IPSTATS_MIB_FRAGFAILS);
		return err;

slow_path_clean:
		skb_walk_frags(skb, frag2) {
			if (frag2 == frag)
				break;
			frag2->sk = NULL;
			frag2->destructor = NULL;
			skb->truesize += frag2->truesize;
		}
	}

slow_path:
	/*
	 *	Fragment the datagram.
	 */

	ip6_frag_init(skb, hlen, mtu, rt->dst.dev->needed_tailroom,
		      LL_RESERVED_SPACE(rt->dst.dev), prevhdr, nexthdr, frag_id,
		      &state);

	/*
	 *	Keep copying data until we run out.
	 */

	while (state.left > 0) {
		frag = ip6_frag_next(skb, &state);
		if (IS_ERR(frag)) {
			err = PTR_ERR(frag);
			goto fail;
		}

		/*
		 *	Put this fragment into the sending queue.
		 */
		skb_set_delivery_time(frag, tstamp, mono_delivery_time);
		err = output(net, sk, frag);
		if (err)
			goto fail;

		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_FRAGCREATES);
	}
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGOKS);
	consume_skb(skb);
	return err;

fail_toobig:
	if (skb->sk && dst_allfrag(skb_dst(skb)))
		sk_gso_disable(skb->sk);

	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
	err = -EMSGSIZE;

fail:
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return err;
}

static inline int ip6_rt_check(const struct rt6key *rt_key,
			       const struct in6_addr *fl_addr,
			       const struct in6_addr *addr_cache)
{
	return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
		(!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
}

static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
					  struct dst_entry *dst,
					  const struct flowi6 *fl6)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct rt6_info *rt;

	if (!dst)
		goto out;

	if (dst->ops->family != AF_INET6) {
		dst_release(dst);
		return NULL;
	}

	rt = (struct rt6_info *)dst;
	/* Yes, checking route validity in the unconnected case
	 * is not very simple. Take into account that we do not
	 * support routing by source, TOS, and
	 * MSG_DONTROUTE		--ANK (980726)
	 *
	 * 1. ip6_rt_check(): If the route was a host route,
	 *    check that the cached destination is current.
	 *    If it is a network route, we still may check its
	 *    validity using a saved pointer to the last used
	 *    address: daddr_cache.
	 *    We do not want to save the whole address now
	 *    (because the main consumer of this service is
	 *    tcp, which does not have this problem), so the
	 *    last trick works only on connected sockets.
	 * 2. oif also should be the same.
	 */
	if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
#ifdef CONFIG_IPV6_SUBTREES
	    ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
#endif
	   (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) {
		dst_release(dst);
		dst = NULL;
	}

out:
	return dst;
}

static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
			       struct dst_entry **dst, struct flowi6 *fl6)
{
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	struct neighbour *n;
	struct rt6_info *rt;
#endif
	int err;
	int flags = 0;

	/* The correct way to handle this would be to do
	 * ip6_route_get_saddr, and then ip6_route_output; however,
	 * the route-specific preferred source forces the
	 * ip6_route_output call _before_ ip6_route_get_saddr.
	 *
	 * In source specific routing (no src=any default route),
	 * ip6_route_output will fail given src=any saddr, though, so
	 * that's why we try it again later.
	 */
	if (ipv6_addr_any(&fl6->saddr)) {
		struct fib6_info *from;
		struct rt6_info *rt;

		*dst = ip6_route_output(net, sk, fl6);
		rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;

		rcu_read_lock();
		from = rt ? rcu_dereference(rt->from) : NULL;
		err = ip6_route_get_saddr(net, from, &fl6->daddr,
					  sk ? inet6_sk(sk)->srcprefs : 0,
					  &fl6->saddr);
		rcu_read_unlock();

		if (err)
			goto out_err_release;

		/* If we had an erroneous initial result, pretend it
		 * never existed and let the SA-enabled version take
		 * over.
		 */
		if ((*dst)->error) {
			dst_release(*dst);
			*dst = NULL;
		}

		if (fl6->flowi6_oif)
			flags |= RT6_LOOKUP_F_IFACE;
	}

	if (!*dst)
		*dst = ip6_route_output_flags(net, sk, fl6, flags);

	err = (*dst)->error;
	if (err)
		goto out_err_release;

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	/*
	 * Here if the dst entry we've looked up
	 * has a neighbour entry that is in the INCOMPLETE
	 * state and the src address from the flow is
	 * marked as OPTIMISTIC, we release the found
	 * dst entry and replace it instead with the
	 * dst entry of the nexthop router
	 */
	rt = (struct rt6_info *) *dst;
	rcu_read_lock_bh();
	n = __ipv6_neigh_lookup_noref(rt->dst.dev,
				      rt6_nexthop(rt, &fl6->daddr));
	err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
	rcu_read_unlock_bh();

	if (err) {
		struct inet6_ifaddr *ifp;
		struct flowi6 fl_gw6;
		int redirect;

		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
				      (*dst)->dev, 1);

		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
		if (ifp)
			in6_ifa_put(ifp);

		if (redirect) {
			/*
			 * We need to get the dst entry for the
			 * default router instead
			 */
			dst_release(*dst);
			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
			*dst = ip6_route_output(net, sk, &fl_gw6);
			err = (*dst)->error;
			if (err)
				goto out_err_release;
		}
	}
#endif
	if (ipv6_addr_v4mapped(&fl6->saddr) &&
	    !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
		err = -EAFNOSUPPORT;
		goto out_err_release;
	}

	return 0;

out_err_release:
	dst_release(*dst);
	*dst = NULL;

	if (err == -ENETUNREACH)
		IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
	return err;
}

/**
 *	ip6_dst_lookup - perform route lookup on flow
 *	@net: Network namespace to perform lookup in
 *	@sk: socket which provides route info
 *	@dst: pointer to dst_entry * for result
 *	@fl6: flow to lookup
 *
 *	This function performs a route lookup on the given flow.
 *
 *	It returns zero on success, or a standard errno code on error.
 */
int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
		   struct flowi6 *fl6)
{
	*dst = NULL;
	return ip6_dst_lookup_tail(net, sk, dst, fl6);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup);

/**
 *	ip6_dst_lookup_flow - perform route lookup on flow with ipsec
 *	@net: Network namespace to perform lookup in
 *	@sk: socket which provides route info
 *	@fl6: flow to lookup
 *	@final_dst: final destination address for ipsec lookup
 *
 *	This function performs a route lookup on the given flow.
 *
 *	It returns a valid dst pointer on success, or a pointer encoded
 *	error code.
 */
struct dst_entry *ip6_dst_lookup_flow(struct net *net, const struct sock *sk, struct flowi6 *fl6,
				      const struct in6_addr *final_dst)
{
	struct dst_entry *dst = NULL;
	int err;

	err = ip6_dst_lookup_tail(net, sk, &dst, fl6);
	if (err)
		return ERR_PTR(err);
	if (final_dst)
		fl6->daddr = *final_dst;

	return xfrm_lookup_route(net, dst, flowi6_to_flowi(fl6), sk, 0);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);

/**
 *	ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
 *	@sk: socket which provides the dst cache and route info
 *	@fl6: flow to lookup
 *	@final_dst: final destination address for ipsec lookup
 *	@connected: whether @sk is connected or not
 *
 *	This function performs a route lookup on the given flow with the
 *	possibility of using the cached route in the socket if it is valid.
 *	It will take the socket dst lock when operating on the dst cache.
 *	As a result, this function can only be used in process context.
 *
 *	In addition, for a connected socket, cache the dst in the socket
 *	if the current cache is not valid.
 *
 *	It returns a valid dst pointer on success, or a pointer encoded
 *	error code.
 */
struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
					 const struct in6_addr *final_dst,
					 bool connected)
{
	struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);

	dst = ip6_sk_dst_check(sk, dst, fl6);
	if (dst)
		return dst;

	dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_dst);
	if (connected && !IS_ERR(dst))
		ip6_sk_dst_store_flow(sk, dst_clone(dst), fl6);

	return dst;
}
EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);

/**
 *	ip6_dst_lookup_tunnel - perform route lookup on tunnel
 *	@skb: Packet for which lookup is done
 *	@dev: Tunnel device
 *	@net: Network namespace of tunnel device
 *	@sock: Socket which provides route info
 *	@saddr: Memory to store the src ip address
 *	@info: Tunnel information
 *	@protocol: IP protocol
 *	@use_cache: Flag to enable cache usage
 *
 *	This function performs a route lookup on a tunnel.
 *
 *	It returns a valid dst pointer and stores the src address to be
 *	used in the tunnel in *@saddr on success, else a pointer encoded
 *	error code.
 */
struct dst_entry *ip6_dst_lookup_tunnel(struct sk_buff *skb,
					struct net_device *dev,
					struct net *net,
					struct socket *sock,
					struct in6_addr *saddr,
					const struct ip_tunnel_info *info,
					u8 protocol,
					bool use_cache)
{
	struct dst_entry *dst = NULL;
#ifdef CONFIG_DST_CACHE
	struct dst_cache *dst_cache;
#endif
	struct flowi6 fl6;
	__u8 prio;

#ifdef CONFIG_DST_CACHE
	dst_cache = (struct dst_cache *)&info->dst_cache;
	if (use_cache) {
		dst = dst_cache_get_ip6(dst_cache, saddr);
		if (dst)
			return dst;
	}
#endif
	memset(&fl6, 0, sizeof(fl6));
	fl6.flowi6_mark = skb->mark;
	fl6.flowi6_proto = protocol;
	fl6.daddr = info->key.u.ipv6.dst;
	fl6.saddr = info->key.u.ipv6.src;
	prio = info->key.tos;
	fl6.flowlabel = ip6_make_flowinfo(RT_TOS(prio),
					  info->key.label);

	dst = ipv6_stub->ipv6_dst_lookup_flow(net, sock->sk, &fl6,
					      NULL);
	if (IS_ERR(dst)) {
		netdev_dbg(dev, "no route to %pI6\n", &fl6.daddr);
		return ERR_PTR(-ENETUNREACH);
	}
	if (dst->dev == dev) { /* is this necessary? */
		netdev_dbg(dev, "circular route to %pI6\n", &fl6.daddr);
		dst_release(dst);
		return ERR_PTR(-ELOOP);
	}
#ifdef CONFIG_DST_CACHE
	if (use_cache)
		dst_cache_set_ip6(dst_cache, dst, &fl6.saddr);
#endif
	*saddr = fl6.saddr;
	return dst;
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup_tunnel);

static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
					       gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
						gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

static void ip6_append_data_mtu(unsigned int *mtu,
				int *maxfraglen,
				unsigned int fragheaderlen,
				struct sk_buff *skb,
				struct rt6_info *rt,
				unsigned int orig_mtu)
{
	if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
		if (!skb) {
			/* first fragment, reserve header_len */
			*mtu = orig_mtu - rt->dst.header_len;

		} else {
			/*
			 * this fragment is not first, the headers
			 * space is regarded as data space.
			 */
			*mtu = orig_mtu;
		}
		*maxfraglen = ((*mtu - fragheaderlen) & ~7)
			      + fragheaderlen - sizeof(struct frag_hdr);
	}
}

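/* Corking: ip6_setup_cork() snapshots per-datagram transmit state
 * (duplicated extension headers, hop limit, traffic class, fragment
 * size, ...) into the socket cork, so that several append calls can
 * later be flushed as a single datagram by __ip6_make_skb().
 */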
static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
			  struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
			  struct rt6_info *rt)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	unsigned int mtu;
	struct ipv6_txoptions *nopt, *opt = ipc6->opt;

	/* callers pass dst together with a reference, set it first so
	 * ip6_cork_release() can put it down even in case of an error.
	 */
	cork->base.dst = &rt->dst;

	/*
	 * setup for corking
	 */
	if (opt) {
		if (WARN_ON(v6_cork->opt))
			return -EINVAL;

		nopt = v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
		if (unlikely(!nopt))
			return -ENOBUFS;

		nopt->tot_len = sizeof(*opt);
		nopt->opt_flen = opt->opt_flen;
		nopt->opt_nflen = opt->opt_nflen;

		nopt->dst0opt = ip6_opt_dup(opt->dst0opt, sk->sk_allocation);
		if (opt->dst0opt && !nopt->dst0opt)
			return -ENOBUFS;

		nopt->dst1opt = ip6_opt_dup(opt->dst1opt, sk->sk_allocation);
		if (opt->dst1opt && !nopt->dst1opt)
			return -ENOBUFS;

		nopt->hopopt = ip6_opt_dup(opt->hopopt, sk->sk_allocation);
		if (opt->hopopt && !nopt->hopopt)
			return -ENOBUFS;

		nopt->srcrt = ip6_rthdr_dup(opt->srcrt, sk->sk_allocation);
		if (opt->srcrt && !nopt->srcrt)
			return -ENOBUFS;

		/* need source address above --miyazawa */
	}
	v6_cork->hop_limit = ipc6->hlimit;
	v6_cork->tclass = ipc6->tclass;
	if (rt->dst.flags & DST_XFRM_TUNNEL)
		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
		      READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst);
	else
		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
			READ_ONCE(rt->dst.dev->mtu) : dst_mtu(xfrm_dst_path(&rt->dst));
	if (np->frag_size < mtu) {
		if (np->frag_size)
			mtu = np->frag_size;
	}
	cork->base.fragsize = mtu;
	cork->base.gso_size = ipc6->gso_size;
	cork->base.tx_flags = 0;
	cork->base.mark = ipc6->sockc.mark;
	sock_tx_timestamp(sk, ipc6->sockc.tsflags, &cork->base.tx_flags);

	if (dst_allfrag(xfrm_dst_path(&rt->dst)))
		cork->base.flags |= IPCORK_ALLFRAG;
	cork->base.length = 0;

	cork->base.transmit_time = ipc6->sockc.transmit_time;

	return 0;
}

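/* Core of ip6_append_data()/ip6_make_skb(): copy caller data into one
 * or more queued skbs, splitting at maxfraglen so that any later
 * fragmentation lands on 8-byte boundaries.  Worked example with
 * mtu = 1500 and fragheaderlen = 40 (bare IPv6 header):
 *
 *	maxfraglen = ((1500 - 40) & ~7) + 40 - 8 = 1488
 *
 * i.e. each fragment-to-be holds 1448 bytes of payload after the
 * 40 byte network header, leaving room for an 8 byte fragment header.
 */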
static int __ip6_append_data(struct sock *sk,
			     struct sk_buff_head *queue,
			     struct inet_cork_full *cork_full,
			     struct inet6_cork *v6_cork,
			     struct page_frag *pfrag,
			     int getfrag(void *from, char *to, int offset,
					 int len, int odd, struct sk_buff *skb),
			     void *from, int length, int transhdrlen,
			     unsigned int flags, struct ipcm6_cookie *ipc6)
{
	struct sk_buff *skb, *skb_prev = NULL;
	struct inet_cork *cork = &cork_full->base;
	struct flowi6 *fl6 = &cork_full->fl.u.ip6;
	unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
	struct ubuf_info *uarg = NULL;
	int exthdrlen = 0;
	int dst_exthdrlen = 0;
	int hh_len;
	int copy;
	int err;
	int offset = 0;
	u32 tskey = 0;
	struct rt6_info *rt = (struct rt6_info *)cork->dst;
	struct ipv6_txoptions *opt = v6_cork->opt;
	int csummode = CHECKSUM_NONE;
	unsigned int maxnonfragsize, headersize;
	unsigned int wmem_alloc_delta = 0;
	bool paged, extra_uref = false;

	skb = skb_peek_tail(queue);
	if (!skb) {
		exthdrlen = opt ? opt->opt_flen : 0;
		dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
	}

	paged = !!cork->gso_size;
	mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize;
	orig_mtu = mtu;

	if (cork->tx_flags & SKBTX_ANY_SW_TSTAMP &&
	    sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
		tskey = atomic_inc_return(&sk->sk_tskey) - 1;

	hh_len = LL_RESERVED_SPACE(rt->dst.dev);

	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
			(opt ? opt->opt_nflen : 0);

	headersize = sizeof(struct ipv6hdr) +
		     (opt ? opt->opt_flen + opt->opt_nflen : 0) +
		     (dst_allfrag(&rt->dst) ?
		      sizeof(struct frag_hdr) : 0) +
		     rt->rt6i_nfheader_len;

	if (mtu <= fragheaderlen ||
	    ((mtu - fragheaderlen) & ~7) + fragheaderlen <= sizeof(struct frag_hdr))
		goto emsgsize;

	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
		     sizeof(struct frag_hdr);

	/* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit
	 * the first fragment
	 */
	if (headersize + transhdrlen > mtu)
		goto emsgsize;

	if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
	    (sk->sk_protocol == IPPROTO_UDP ||
	     sk->sk_protocol == IPPROTO_ICMPV6 ||
	     sk->sk_protocol == IPPROTO_RAW)) {
		ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
				sizeof(struct ipv6hdr));
		goto emsgsize;
	}

	if (ip6_sk_ignore_df(sk))
		maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
	else
		maxnonfragsize = mtu;

	if (cork->length + length > maxnonfragsize - headersize) {
emsgsize:
		pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0);
		ipv6_local_error(sk, EMSGSIZE, fl6, pmtu);
		return -EMSGSIZE;
	}

	/* CHECKSUM_PARTIAL only with no extension headers and when
	 * we are not going to fragment
	 */
	if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
	    headersize == sizeof(struct ipv6hdr) &&
	    length <= mtu - headersize &&
	    (!(flags & MSG_MORE) || cork->gso_size) &&
	    rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
		csummode = CHECKSUM_PARTIAL;

	if (flags & MSG_ZEROCOPY && length && sock_flag(sk, SOCK_ZEROCOPY)) {
		uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb));
		if (!uarg)
			return -ENOBUFS;
		extra_uref = !skb_zcopy(skb);	/* only ref on new uarg */
		if (rt->dst.dev->features & NETIF_F_SG &&
		    csummode == CHECKSUM_PARTIAL) {
			paged = true;
		} else {
			uarg->zerocopy = 0;
			skb_zcopy_set(skb, uarg, &extra_uref);
		}
	}

	/*
	 * Let's try using as much space as possible.
	 * Use MTU if total length of the message fits into the MTU.
	 * Otherwise, we need to reserve fragment header and
	 * fragment alignment (= 8-15 octets, in total).
	 *
	 * Note that we may need to "move" the data from the tail
	 * of the buffer to the new fragment when we split
	 * the message.
	 *
	 * FIXME: It may be fragmented into multiple chunks
	 *        at once if non-fragmentable extension headers
	 *        are too large.
	 * --yoshfuji
	 */

	cork->length += length;
	if (!skb)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into current packet. */
		copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;

		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen, alloc_extra;
			unsigned int pagedlen;
alloc_new_skb:
			/* There's no room in the current skb */
			if (skb)
				fraggap = skb->len - maxfraglen;
			else
				fraggap = 0;
			/* update mtu and maxfraglen if necessary */
			if (!skb || !skb_prev)
				ip6_append_data_mtu(&mtu, &maxfraglen,
						    fragheaderlen, skb, rt,
						    orig_mtu);

			skb_prev = skb;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;

			if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
				datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
			fraglen = datalen + fragheaderlen;
			pagedlen = 0;

			alloc_extra = hh_len;
			alloc_extra += dst_exthdrlen;
			alloc_extra += rt->dst.trailer_len;

			/* We just reserve space for fragment header.
			 * Note: this may be overallocation if the message
			 * (without MSG_MORE) fits into the MTU.
			 */
			alloc_extra += sizeof(struct frag_hdr);

			if ((flags & MSG_MORE) &&
			    !(rt->dst.dev->features&NETIF_F_SG))
				alloclen = mtu;
			else if (!paged &&
				 (fraglen + alloc_extra < SKB_MAX_ALLOC ||
				  !(rt->dst.dev->features & NETIF_F_SG)))
				alloclen = fraglen;
			else {
				alloclen = min_t(int, fraglen, MAX_HEADER);
				pagedlen = fraglen - alloclen;
			}
			alloclen += alloc_extra;

			if (datalen != length + fraggap) {
				/*
				 * this is not the last fragment, the trailer
				 * space is regarded as data space.
				 */
				datalen += rt->dst.trailer_len;
			}

			fraglen = datalen + fragheaderlen;

			copy = datalen - transhdrlen - fraggap - pagedlen;
			if (copy < 0) {
				err = -EINVAL;
				goto error;
			}
			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk, alloclen,
						(flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <=
				    2 * sk->sk_sndbuf)
					skb = alloc_skb(alloclen,
							sk->sk_allocation);
				if (unlikely(!skb))
					err = -ENOBUFS;
			}
			if (!skb)
				goto error;
			/*
			 *	Fill in the control structures
			 */
			skb->protocol = htons(ETH_P_IPV6);
			skb->ip_summed = csummode;
			skb->csum = 0;
			/* reserve for fragmentation and ipsec header */
			skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
				    dst_exthdrlen);

			/*
			 *	Find where to start putting bytes
			 */
			data = skb_put(skb, fraglen - pagedlen);
			skb_set_network_header(skb, exthdrlen);
			data += fragheaderlen;
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}
			if (copy > 0 &&
			    getfrag(from, data + transhdrlen, offset,
				    copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= copy + transhdrlen;
			transhdrlen = 0;
			exthdrlen = 0;
			dst_exthdrlen = 0;

			/* Only the initial fragment is time stamped */
			skb_shinfo(skb)->tx_flags = cork->tx_flags;
			cork->tx_flags = 0;
			skb_shinfo(skb)->tskey = tskey;
			tskey = 0;
			skb_zcopy_set(skb, uarg, &extra_uref);

			if ((flags & MSG_CONFIRM) && !skb_prev)
				skb_set_dst_pending_confirm(skb, 1);

			/*
			 * Put the packet on the pending queue
			 */
			if (!skb->destructor) {
				skb->destructor = sock_wfree;
				skb->sk = sk;
				wmem_alloc_delta += skb->truesize;
			}
			__skb_queue_tail(queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		if (!(rt->dst.dev->features&NETIF_F_SG) &&
		    skb_tailroom(skb) >= copy) {
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
						offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else if (!uarg || !uarg->zerocopy) {
			int i = skb_shinfo(skb)->nr_frags;

			err = -ENOMEM;
			if (!sk_page_frag_refill(sk, pfrag))
				goto error;

			if (!skb_can_coalesce(skb, i, pfrag->page,
					      pfrag->offset)) {
				err = -EMSGSIZE;
				if (i == MAX_SKB_FRAGS)
					goto error;

				__skb_fill_page_desc(skb, i, pfrag->page,
						     pfrag->offset, 0);
				skb_shinfo(skb)->nr_frags = ++i;
				get_page(pfrag->page);
			}
			copy = min_t(int, copy, pfrag->size - pfrag->offset);
			if (getfrag(from,
				    page_address(pfrag->page) + pfrag->offset,
				    offset, copy, skb->len, skb) < 0)
				goto error_efault;

			pfrag->offset += copy;
			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			wmem_alloc_delta += copy;
		} else {
			err = skb_zerocopy_iter_dgram(skb, from, copy);
			if (err < 0)
				goto error;
		}
		offset += copy;
		length -= copy;
	}

	if (wmem_alloc_delta)
		refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
	return 0;

error_efault:
	err = -EFAULT;
error:
	net_zcopy_put_abort(uarg, extra_uref);
	cork->length -= length;
	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
	return err;
}

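/* A rough sketch of the corked-send sequence used by datagram callers
 * (illustrative only; locking and error paths elided, the "corked"
 * flag is hypothetical):
 *
 *	lock_sock(sk);
 *	err = ip6_append_data(sk, getfrag, msg, len, transhdrlen,
 *			      &ipc6, &fl6, rt, msg->msg_flags);
 *	if (!err && !corked)
 *		err = ip6_push_pending_frames(sk);
 *	release_sock(sk);
 */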
int ip6_append_data(struct sock *sk,
		    int getfrag(void *from, char *to, int offset, int len,
				int odd, struct sk_buff *skb),
		    void *from, int length, int transhdrlen,
		    struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
		    struct rt6_info *rt, unsigned int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	int exthdrlen;
	int err;

	if (flags&MSG_PROBE)
		return 0;
	if (skb_queue_empty(&sk->sk_write_queue)) {
		/*
		 * setup for corking
		 */
		dst_hold(&rt->dst);
		err = ip6_setup_cork(sk, &inet->cork, &np->cork,
				     ipc6, rt);
		if (err)
			return err;

		inet->cork.fl.u.ip6 = *fl6;
		exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
		length += exthdrlen;
		transhdrlen += exthdrlen;
	} else {
		transhdrlen = 0;
	}

	return __ip6_append_data(sk, &sk->sk_write_queue, &inet->cork,
				 &np->cork, sk_page_frag(sk), getfrag,
				 from, length, transhdrlen, flags, ipc6);
}
EXPORT_SYMBOL_GPL(ip6_append_data);

static void ip6_cork_steal_dst(struct sk_buff *skb, struct inet_cork_full *cork)
{
	struct dst_entry *dst = cork->base.dst;

	cork->base.dst = NULL;
	cork->base.flags &= ~IPCORK_ALLFRAG;
	skb_dst_set(skb, dst);
}

static void ip6_cork_release(struct inet_cork_full *cork,
			     struct inet6_cork *v6_cork)
{
	if (v6_cork->opt) {
		struct ipv6_txoptions *opt = v6_cork->opt;

		kfree(opt->dst0opt);
		kfree(opt->dst1opt);
		kfree(opt->hopopt);
		kfree(opt->srcrt);
		kfree(opt);
		v6_cork->opt = NULL;
	}

	if (cork->base.dst) {
		dst_release(cork->base.dst);
		cork->base.dst = NULL;
		cork->base.flags &= ~IPCORK_ALLFRAG;
	}
}

struct sk_buff *__ip6_make_skb(struct sock *sk,
			       struct sk_buff_head *queue,
			       struct inet_cork_full *cork,
			       struct inet6_cork *v6_cork)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct in6_addr *final_dst;
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct net *net = sock_net(sk);
	struct ipv6hdr *hdr;
	struct ipv6_txoptions *opt = v6_cork->opt;
	struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
	struct flowi6 *fl6 = &cork->fl.u.ip6;
	unsigned char proto = fl6->flowi6_proto;

	skb = __skb_dequeue(queue);
	if (!skb)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Allow local fragmentation. */
	skb->ignore_df = ip6_sk_ignore_df(sk);
	__skb_pull(skb, skb_network_header_len(skb));

	final_dst = &fl6->daddr;
	if (opt && opt->opt_flen)
		ipv6_push_frag_opts(skb, opt, &proto);
	if (opt && opt->opt_nflen)
		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	ip6_flow_hdr(hdr, v6_cork->tclass,
		     ip6_make_flowlabel(net, skb, fl6->flowlabel,
					ip6_autoflowlabel(net, np), fl6));
	hdr->hop_limit = v6_cork->hop_limit;
	hdr->nexthdr = proto;
	hdr->saddr = fl6->saddr;
	hdr->daddr = *final_dst;

	skb->priority = sk->sk_priority;
	skb->mark = cork->base.mark;
	skb->tstamp = cork->base.transmit_time;

	ip6_cork_steal_dst(skb, cork);
	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
	if (proto == IPPROTO_ICMPV6) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
		ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
	}

	ip6_cork_release(cork, v6_cork);
out:
	return skb;
}

int ip6_send_skb(struct sk_buff *skb)
{
	struct net *net = sock_net(skb->sk);
	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
	int err;

	err = ip6_local_out(net, skb->sk, skb);
	if (err) {
		if (err > 0)
			err = net_xmit_errno(err);
		if (err)
			IP6_INC_STATS(net, rt->rt6i_idev,
				      IPSTATS_MIB_OUTDISCARDS);
	}

	return err;
}

int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb;

	skb = ip6_finish_skb(sk);
	if (!skb)
		return 0;

	return ip6_send_skb(skb);
}
EXPORT_SYMBOL_GPL(ip6_push_pending_frames);

static void __ip6_flush_pending_frames(struct sock *sk,
				       struct sk_buff_head *queue,
				       struct inet_cork_full *cork,
				       struct inet6_cork *v6_cork)
{
	struct sk_buff *skb;

	while ((skb = __skb_dequeue_tail(queue)) != NULL) {
		if (skb_dst(skb))
			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
	}

	ip6_cork_release(cork, v6_cork);
}

void ip6_flush_pending_frames(struct sock *sk)
{
	__ip6_flush_pending_frames(sk, &sk->sk_write_queue,
				   &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
}
EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);

struct sk_buff *ip6_make_skb(struct sock *sk,
			     int getfrag(void *from, char *to, int offset,
					 int len, int odd, struct sk_buff *skb),
			     void *from, int length, int transhdrlen,
			     struct ipcm6_cookie *ipc6, struct rt6_info *rt,
			     unsigned int flags, struct inet_cork_full *cork)
{
	struct inet6_cork v6_cork;
	struct sk_buff_head queue;
	int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
	int err;

	if (flags & MSG_PROBE) {
		dst_release(&rt->dst);
		return NULL;
	}

	__skb_queue_head_init(&queue);

	cork->base.flags = 0;
	cork->base.addr = 0;
	cork->base.opt = NULL;
	v6_cork.opt = NULL;
	err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt);
	if (err) {
		ip6_cork_release(cork, &v6_cork);
		return ERR_PTR(err);
	}
	if (ipc6->dontfrag < 0)
		ipc6->dontfrag = inet6_sk(sk)->dontfrag;

	err = __ip6_append_data(sk, &queue, cork, &v6_cork,
				&current->task_frag, getfrag, from,
				length + exthdrlen, transhdrlen + exthdrlen,
				flags, ipc6);
	if (err) {
		__ip6_flush_pending_frames(sk, &queue, cork, &v6_cork);
		return ERR_PTR(err);
	}

	return __ip6_make_skb(sk, &queue, cork, &v6_cork);
}
2019