1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  *	IPv6 output functions
4  *	Linux INET6 implementation
5  *
6  *	Authors:
7  *	Pedro Roque		<roque@di.fc.ul.pt>
8  *
9  *	Based on linux/net/ipv4/ip_output.c
10  *
11  *	Changes:
12  *	A.N.Kuznetsov	:	arithmetic in fragmentation.
13  *				extension headers are implemented.
14  *				route changes now work.
15  *				ip6_forward does not confuse sniffers.
16  *				etc.
17  *
18  *      H. von Brand    :       Added missing #include <linux/string.h>
19  *	Imran Patel	:	frag id should be in NBO
20  *      Kazunori MIYAZAWA @USAGI
21  *			:       add ip6_append_data and related functions
22  *				for datagram xmit
23  */
24 
25 #include <linux/errno.h>
26 #include <linux/kernel.h>
27 #include <linux/string.h>
28 #include <linux/socket.h>
29 #include <linux/net.h>
30 #include <linux/netdevice.h>
31 #include <linux/if_arp.h>
32 #include <linux/in6.h>
33 #include <linux/tcp.h>
34 #include <linux/route.h>
35 #include <linux/module.h>
36 #include <linux/slab.h>
37 
38 #include <linux/bpf-cgroup.h>
39 #include <linux/netfilter.h>
40 #include <linux/netfilter_ipv6.h>
41 
42 #include <net/sock.h>
43 #include <net/snmp.h>
44 
45 #include <net/ipv6.h>
46 #include <net/ndisc.h>
47 #include <net/protocol.h>
48 #include <net/ip6_route.h>
49 #include <net/addrconf.h>
50 #include <net/rawv6.h>
51 #include <net/icmp.h>
52 #include <net/xfrm.h>
53 #include <net/checksum.h>
54 #include <linux/mroute6.h>
55 #include <net/l3mdev.h>
56 #include <net/lwtunnel.h>
57 #include <net/ip_tunnels.h>
58 
59 static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
60 {
61 	struct dst_entry *dst = skb_dst(skb);
62 	struct net_device *dev = dst->dev;
63 	unsigned int hh_len = LL_RESERVED_SPACE(dev);
64 	int delta = hh_len - skb_headroom(skb);
65 	const struct in6_addr *nexthop;
66 	struct neighbour *neigh;
67 	int ret;
68 
69 	/* Be paranoid, rather than too clever. */
70 	if (unlikely(delta > 0) && dev->header_ops) {
71 		/* pskb_expand_head() might crash if the skb is shared */
72 		if (skb_shared(skb)) {
73 			struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
74 
75 			if (likely(nskb)) {
76 				if (skb->sk)
77 					skb_set_owner_w(nskb, skb->sk);
78 				consume_skb(skb);
79 			} else {
80 				kfree_skb(skb);
81 			}
82 			skb = nskb;
83 		}
84 		if (skb &&
85 		    pskb_expand_head(skb, SKB_DATA_ALIGN(delta), 0, GFP_ATOMIC)) {
86 			kfree_skb(skb);
87 			skb = NULL;
88 		}
89 		if (!skb) {
90 			IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
91 			return -ENOMEM;
92 		}
93 	}
94 
95 	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
96 		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
97 
98 		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
99 		    ((mroute6_is_socket(net, skb) &&
100 		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
101 		     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
102 					 &ipv6_hdr(skb)->saddr))) {
103 			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
104 
105 			/* Do not check for IFF_ALLMULTI; multicast routing
106 			   is not supported in any case.
107 			 */
108 			if (newskb)
109 				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
110 					net, sk, newskb, NULL, newskb->dev,
111 					dev_loopback_xmit);
112 
113 			if (ipv6_hdr(skb)->hop_limit == 0) {
114 				IP6_INC_STATS(net, idev,
115 					      IPSTATS_MIB_OUTDISCARDS);
116 				kfree_skb(skb);
117 				return 0;
118 			}
119 		}
120 
121 		IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);
122 
123 		if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
124 		    IPV6_ADDR_SCOPE_NODELOCAL &&
125 		    !(dev->flags & IFF_LOOPBACK)) {
126 			kfree_skb(skb);
127 			return 0;
128 		}
129 	}
130 
131 	if (lwtunnel_xmit_redirect(dst->lwtstate)) {
132 		int res = lwtunnel_xmit(skb);
133 
134 		if (res < 0 || res == LWTUNNEL_XMIT_DONE)
135 			return res;
136 	}
137 
138 	rcu_read_lock_bh();
139 	nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
140 	neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
141 	if (unlikely(!neigh))
142 		neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
143 	if (!IS_ERR(neigh)) {
144 		sock_confirm_neigh(skb, neigh);
145 		ret = neigh_output(neigh, skb, false);
146 		rcu_read_unlock_bh();
147 		return ret;
148 	}
149 	rcu_read_unlock_bh();
150 
151 	IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
152 	kfree_skb(skb);
153 	return -EINVAL;
154 }
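/* Editor's sketch (not in the original file): the headroom fix-up at the
 * top of ip6_finish_output2() exists because dst->dev may differ from the
 * device the skb was originally sized for.  A caller that only needed the
 * simple, non-shared case could rely on the existing helper instead:
 *
 *	if (skb_cow_head(skb, LL_RESERVED_SPACE(dev)))
 *		goto drop;	(assumed label, for illustration)
 *
 * The open-coded version above additionally copes with shared skbs and
 * accounts failures to IPSTATS_MIB_OUTDISCARDS.
 */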
155 
156 static int
157 ip6_finish_output_gso_slowpath_drop(struct net *net, struct sock *sk,
158 				    struct sk_buff *skb, unsigned int mtu)
159 {
160 	struct sk_buff *segs, *nskb;
161 	netdev_features_t features;
162 	int ret = 0;
163 
164 	/* Please see corresponding comment in ip_finish_output_gso
165 	 * describing the cases where GSO segment length exceeds the
166 	 * egress MTU.
167 	 */
168 	features = netif_skb_features(skb);
169 	segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
170 	if (IS_ERR_OR_NULL(segs)) {
171 		kfree_skb(skb);
172 		return -ENOMEM;
173 	}
174 
175 	consume_skb(skb);
176 
177 	skb_list_walk_safe(segs, segs, nskb) {
178 		int err;
179 
180 		skb_mark_not_on_list(segs);
181 		err = ip6_fragment(net, sk, segs, ip6_finish_output2);
182 		if (err && ret == 0)
183 			ret = err;
184 	}
185 
186 	return ret;
187 }
188 
189 static int __ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
190 {
191 	unsigned int mtu;
192 
193 #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
194 	/* Policy lookup after SNAT yielded a new policy */
195 	if (skb_dst(skb)->xfrm) {
196 		IPCB(skb)->flags |= IPSKB_REROUTED;
197 		return dst_output(net, sk, skb);
198 	}
199 #endif
200 
201 	mtu = ip6_skb_dst_mtu(skb);
202 	if (skb_is_gso(skb) && !skb_gso_validate_network_len(skb, mtu))
203 		return ip6_finish_output_gso_slowpath_drop(net, sk, skb, mtu);
204 
205 	if ((skb->len > mtu && !skb_is_gso(skb)) ||
206 	    dst_allfrag(skb_dst(skb)) ||
207 	    (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
208 		return ip6_fragment(net, sk, skb, ip6_finish_output2);
209 	else
210 		return ip6_finish_output2(net, sk, skb);
211 }
212 
213 static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
214 {
215 	int ret;
216 
217 	ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
218 	switch (ret) {
219 	case NET_XMIT_SUCCESS:
220 		return __ip6_finish_output(net, sk, skb);
221 	case NET_XMIT_CN:
222 		return __ip6_finish_output(net, sk, skb) ? : ret;
223 	default:
224 		kfree_skb(skb);
225 		return ret;
226 	}
227 }
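/* Editor's note (not in the original file): in ip6_finish_output() the
 * GNU "?:" extension means "return __ip6_finish_output()'s result unless
 * it is zero, else return ret" - so a packet that the egress BPF program
 * marked NET_XMIT_CN is still transmitted, and the congestion verdict is
 * reported to the caller only when transmission itself succeeded.
 */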
228 
229 int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
230 {
231 	struct net_device *dev = skb_dst(skb)->dev, *indev = skb->dev;
232 	struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
233 
234 	skb->protocol = htons(ETH_P_IPV6);
235 	skb->dev = dev;
236 
237 	if (unlikely(idev->cnf.disable_ipv6)) {
238 		IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
239 		kfree_skb(skb);
240 		return 0;
241 	}
242 
243 	return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
244 			    net, sk, skb, indev, dev,
245 			    ip6_finish_output,
246 			    !(IP6CB(skb)->flags & IP6SKB_REROUTED));
247 }
248 EXPORT_SYMBOL(ip6_output);
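/* Editor's sketch (not in the original file): ip6_output() is normally
 * reached through dst_output() once a route has been attached, rather
 * than called directly:
 *
 *	skb_dst_set(skb, dst);
 *	err = dst_output(net, sk, skb);
 *
 * Locally generated packets usually go through ip6_local_out(), which
 * runs the NF_INET_LOCAL_OUT hook and then calls dst_output().
 */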
249 
250 bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np)
251 {
252 	if (!np->autoflowlabel_set)
253 		return ip6_default_np_autolabel(net);
254 	else
255 		return np->autoflowlabel;
256 }
257 
258 /*
259  * xmit an sk_buff (used by TCP, SCTP and DCCP)
260  * Note: the socket lock is not held for SYNACK packets, but the socket
261  * may still be modified by calls to skb_set_owner_w() and
262  * ipv6_local_error(), which use proper atomic operations or spinlocks.
263  */
264 int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
265 	     __u32 mark, struct ipv6_txoptions *opt, int tclass, u32 priority)
266 {
267 	struct net *net = sock_net(sk);
268 	const struct ipv6_pinfo *np = inet6_sk(sk);
269 	struct in6_addr *first_hop = &fl6->daddr;
270 	struct dst_entry *dst = skb_dst(skb);
271 	unsigned int head_room;
272 	struct ipv6hdr *hdr;
273 	u8  proto = fl6->flowi6_proto;
274 	int seg_len = skb->len;
275 	int hlimit = -1;
276 	u32 mtu;
277 
278 	head_room = sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
279 	if (opt)
280 		head_room += opt->opt_nflen + opt->opt_flen;
281 
282 	if (unlikely(skb_headroom(skb) < head_room)) {
283 		struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
284 		if (!skb2) {
285 			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
286 				      IPSTATS_MIB_OUTDISCARDS);
287 			kfree_skb(skb);
288 			return -ENOBUFS;
289 		}
290 		if (skb->sk)
291 			skb_set_owner_w(skb2, skb->sk);
292 		consume_skb(skb);
293 		skb = skb2;
294 	}
295 
296 	if (opt) {
297 		seg_len += opt->opt_nflen + opt->opt_flen;
298 
299 		if (opt->opt_flen)
300 			ipv6_push_frag_opts(skb, opt, &proto);
301 
302 		if (opt->opt_nflen)
303 			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
304 					     &fl6->saddr);
305 	}
306 
307 	skb_push(skb, sizeof(struct ipv6hdr));
308 	skb_reset_network_header(skb);
309 	hdr = ipv6_hdr(skb);
310 
311 	/*
312 	 *	Fill in the IPv6 header
313 	 */
314 	if (np)
315 		hlimit = np->hop_limit;
316 	if (hlimit < 0)
317 		hlimit = ip6_dst_hoplimit(dst);
318 
319 	ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
320 				ip6_autoflowlabel(net, np), fl6));
321 
322 	hdr->payload_len = htons(seg_len);
323 	hdr->nexthdr = proto;
324 	hdr->hop_limit = hlimit;
325 
326 	hdr->saddr = fl6->saddr;
327 	hdr->daddr = *first_hop;
328 
329 	skb->protocol = htons(ETH_P_IPV6);
330 	skb->priority = priority;
331 	skb->mark = mark;
332 
333 	mtu = dst_mtu(dst);
334 	if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
335 		IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
336 			      IPSTATS_MIB_OUT, skb->len);
337 
338 		/* if the egress device is enslaved to an L3 master device, pass
339 		 * the skb to its handler for processing
340 		 */
341 		skb = l3mdev_ip6_out((struct sock *)sk, skb);
342 		if (unlikely(!skb))
343 			return 0;
344 
345 		/* hooks should never assume the socket lock is held,
346 		 * so we promote our socket to non-const
347 		 */
348 		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
349 			       net, (struct sock *)sk, skb, NULL, dst->dev,
350 			       dst_output);
351 	}
352 
353 	skb->dev = dst->dev;
354 	/* ipv6_local_error() does not require the socket lock,
355 	 * so we promote our socket to non-const
356 	 */
357 	ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);
358 
359 	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
360 	kfree_skb(skb);
361 	return -EMSGSIZE;
362 }
363 EXPORT_SYMBOL(ip6_xmit);
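/* Editor's sketch (not in the original file): a typical ip6_xmit() call,
 * roughly as TCPv6 transmits a packet (the np/opt naming is assumed):
 *
 *	rcu_read_lock();
 *	res = ip6_xmit(sk, skb, &fl6, sk->sk_mark, rcu_dereference(np->opt),
 *		       np->tclass, sk->sk_priority);
 *	rcu_read_unlock();
 *
 * The skb must already hold the transport header; ip6_xmit() prepends the
 * IPv6 header and any extension headers supplied via @opt.
 */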
364 
365 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
366 {
367 	struct ip6_ra_chain *ra;
368 	struct sock *last = NULL;
369 
370 	read_lock(&ip6_ra_lock);
371 	for (ra = ip6_ra_chain; ra; ra = ra->next) {
372 		struct sock *sk = ra->sk;
373 		if (sk && ra->sel == sel &&
374 		    (!sk->sk_bound_dev_if ||
375 		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
376 			struct ipv6_pinfo *np = inet6_sk(sk);
377 
378 			if (np && np->rtalert_isolate &&
379 			    !net_eq(sock_net(sk), dev_net(skb->dev))) {
380 				continue;
381 			}
382 			if (last) {
383 				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
384 				if (skb2)
385 					rawv6_rcv(last, skb2);
386 			}
387 			last = sk;
388 		}
389 	}
390 
391 	if (last) {
392 		rawv6_rcv(last, skb);
393 		read_unlock(&ip6_ra_lock);
394 		return 1;
395 	}
396 	read_unlock(&ip6_ra_lock);
397 	return 0;
398 }
399 
400 static int ip6_forward_proxy_check(struct sk_buff *skb)
401 {
402 	struct ipv6hdr *hdr = ipv6_hdr(skb);
403 	u8 nexthdr = hdr->nexthdr;
404 	__be16 frag_off;
405 	int offset;
406 
407 	if (ipv6_ext_hdr(nexthdr)) {
408 		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
409 		if (offset < 0)
410 			return 0;
411 	} else
412 		offset = sizeof(struct ipv6hdr);
413 
414 	if (nexthdr == IPPROTO_ICMPV6) {
415 		struct icmp6hdr *icmp6;
416 
417 		if (!pskb_may_pull(skb, (skb_network_header(skb) +
418 					 offset + 1 - skb->data)))
419 			return 0;
420 
421 		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
422 
423 		switch (icmp6->icmp6_type) {
424 		case NDISC_ROUTER_SOLICITATION:
425 		case NDISC_ROUTER_ADVERTISEMENT:
426 		case NDISC_NEIGHBOUR_SOLICITATION:
427 		case NDISC_NEIGHBOUR_ADVERTISEMENT:
428 		case NDISC_REDIRECT:
429 			/* For a unicast neighbour discovery message destined
430 			 * to the proxied address, pass it to the input
431 			 * function.
432 			 */
433 			return 1;
434 		default:
435 			break;
436 		}
437 	}
438 
439 	/*
440 	 * The proxying router can't forward traffic sent to a link-local
441 	 * address, so signal the sender and discard the packet. This
442 	 * behavior is clarified by the MIPv6 specification.
443 	 */
444 	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
445 		dst_link_failure(skb);
446 		return -1;
447 	}
448 
449 	return 0;
450 }
451 
452 static inline int ip6_forward_finish(struct net *net, struct sock *sk,
453 				     struct sk_buff *skb)
454 {
455 	struct dst_entry *dst = skb_dst(skb);
456 
457 	__IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
458 	__IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
459 
460 #ifdef CONFIG_NET_SWITCHDEV
461 	if (skb->offload_l3_fwd_mark) {
462 		consume_skb(skb);
463 		return 0;
464 	}
465 #endif
466 
467 	skb->tstamp = 0;
468 	return dst_output(net, sk, skb);
469 }
470 
471 static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
472 {
473 	if (skb->len <= mtu)
474 		return false;
475 
476 	/* ipv6 conntrack defrag sets max_frag_size + ignore_df */
477 	if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
478 		return true;
479 
480 	if (skb->ignore_df)
481 		return false;
482 
483 	if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
484 		return false;
485 
486 	return true;
487 }
488 
489 int ip6_forward(struct sk_buff *skb)
490 {
491 	struct inet6_dev *idev = __in6_dev_get_safely(skb->dev);
492 	struct dst_entry *dst = skb_dst(skb);
493 	struct ipv6hdr *hdr = ipv6_hdr(skb);
494 	struct inet6_skb_parm *opt = IP6CB(skb);
495 	struct net *net = dev_net(dst->dev);
496 	u32 mtu;
497 
498 	if (net->ipv6.devconf_all->forwarding == 0)
499 		goto error;
500 
501 	if (skb->pkt_type != PACKET_HOST)
502 		goto drop;
503 
504 	if (unlikely(skb->sk))
505 		goto drop;
506 
507 	if (skb_warn_if_lro(skb))
508 		goto drop;
509 
510 	if (!net->ipv6.devconf_all->disable_policy &&
511 	    !idev->cnf.disable_policy &&
512 	    !xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
513 		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
514 		goto drop;
515 	}
516 
517 	skb_forward_csum(skb);
518 
519 	/*
520 	 *	We do NOT do any processing on RA (Router Alert)
521 	 *	packets; we push them to user level AS IS, without
522 	 *	any warranty that the application will be able
523 	 *	to interpret them. The reason is that we
524 	 *	cannot do anything clever here.
525 	 *
526 	 *	We are not an end node, so if the packet contains
527 	 *	AH/ESP we cannot do anything with it.
528 	 *	Defragmentation would also be a mistake: RA packets
529 	 *	cannot be fragmented, because there is no guarantee
530 	 *	that different fragments will travel along one path. --ANK
531 	 */
532 	if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
533 		if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
534 			return 0;
535 	}
536 
537 	/*
538 	 *	check and decrement the hop limit
539 	 */
540 	if (hdr->hop_limit <= 1) {
541 		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
542 		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
543 
544 		kfree_skb(skb);
545 		return -ETIMEDOUT;
546 	}
547 
548 	/* XXX: idev->cnf.proxy_ndp? */
549 	if (net->ipv6.devconf_all->proxy_ndp &&
550 	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
551 		int proxied = ip6_forward_proxy_check(skb);
552 		if (proxied > 0) {
553 			hdr->hop_limit--;
554 			return ip6_input(skb);
555 		} else if (proxied < 0) {
556 			__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
557 			goto drop;
558 		}
559 	}
560 
561 	if (!xfrm6_route_forward(skb)) {
562 		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
563 		goto drop;
564 	}
565 	dst = skb_dst(skb);
566 
567 	/* The IPv6 specs say nothing about it, but it is clear that we cannot
568 	   send redirects for source-routed frames.
569 	   We also don't send redirects for frames decapsulated from IPsec.
570 	 */
571 	if (IP6CB(skb)->iif == dst->dev->ifindex &&
572 	    opt->srcrt == 0 && !skb_sec_path(skb)) {
573 		struct in6_addr *target = NULL;
574 		struct inet_peer *peer;
575 		struct rt6_info *rt;
576 
577 		/*
578 		 *	incoming and outgoing devices are the same,
579 		 *	so send a redirect.
580 		 */
581 
582 		rt = (struct rt6_info *) dst;
583 		if (rt->rt6i_flags & RTF_GATEWAY)
584 			target = &rt->rt6i_gateway;
585 		else
586 			target = &hdr->daddr;
587 
588 		peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);
589 
590 		/* Limit redirects both by destination (here)
591 		   and by source (inside ndisc_send_redirect)
592 		 */
593 		if (inet_peer_xrlim_allow(peer, 1*HZ))
594 			ndisc_send_redirect(skb, target);
595 		if (peer)
596 			inet_putpeer(peer);
597 	} else {
598 		int addrtype = ipv6_addr_type(&hdr->saddr);
599 
600 		/* This check is security critical. */
601 		if (addrtype == IPV6_ADDR_ANY ||
602 		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
603 			goto error;
604 		if (addrtype & IPV6_ADDR_LINKLOCAL) {
605 			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
606 				    ICMPV6_NOT_NEIGHBOUR, 0);
607 			goto error;
608 		}
609 	}
610 
611 	mtu = ip6_dst_mtu_forward(dst);
612 	if (mtu < IPV6_MIN_MTU)
613 		mtu = IPV6_MIN_MTU;
614 
615 	if (ip6_pkt_too_big(skb, mtu)) {
616 		/* Again, force the OUTPUT device to be used as the source address */
617 		skb->dev = dst->dev;
618 		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
619 		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INTOOBIGERRORS);
620 		__IP6_INC_STATS(net, ip6_dst_idev(dst),
621 				IPSTATS_MIB_FRAGFAILS);
622 		kfree_skb(skb);
623 		return -EMSGSIZE;
624 	}
625 
626 	if (skb_cow(skb, dst->dev->hard_header_len)) {
627 		__IP6_INC_STATS(net, ip6_dst_idev(dst),
628 				IPSTATS_MIB_OUTDISCARDS);
629 		goto drop;
630 	}
631 
632 	hdr = ipv6_hdr(skb);
633 
634 	/* Decrementing the hop limit is delayed until after the skb COW */
635 
636 	hdr->hop_limit--;
637 
638 	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
639 		       net, NULL, skb, skb->dev, dst->dev,
640 		       ip6_forward_finish);
641 
642 error:
643 	__IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
644 drop:
645 	kfree_skb(skb);
646 	return -EINVAL;
647 }
648 
649 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
650 {
651 	to->pkt_type = from->pkt_type;
652 	to->priority = from->priority;
653 	to->protocol = from->protocol;
654 	skb_dst_drop(to);
655 	skb_dst_set(to, dst_clone(skb_dst(from)));
656 	to->dev = from->dev;
657 	to->mark = from->mark;
658 
659 	skb_copy_hash(to, from);
660 
661 #ifdef CONFIG_NET_SCHED
662 	to->tc_index = from->tc_index;
663 #endif
664 	nf_copy(to, from);
665 	skb_ext_copy(to, from);
666 	skb_copy_secmark(to, from);
667 }
668 
669 int ip6_fraglist_init(struct sk_buff *skb, unsigned int hlen, u8 *prevhdr,
670 		      u8 nexthdr, __be32 frag_id,
671 		      struct ip6_fraglist_iter *iter)
672 {
673 	unsigned int first_len;
674 	struct frag_hdr *fh;
675 
676 	/* BUILD HEADER */
677 	*prevhdr = NEXTHDR_FRAGMENT;
678 	iter->tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
679 	if (!iter->tmp_hdr)
680 		return -ENOMEM;
681 
682 	iter->frag = skb_shinfo(skb)->frag_list;
683 	skb_frag_list_init(skb);
684 
685 	iter->offset = 0;
686 	iter->hlen = hlen;
687 	iter->frag_id = frag_id;
688 	iter->nexthdr = nexthdr;
689 
690 	__skb_pull(skb, hlen);
691 	fh = __skb_push(skb, sizeof(struct frag_hdr));
692 	__skb_push(skb, hlen);
693 	skb_reset_network_header(skb);
694 	memcpy(skb_network_header(skb), iter->tmp_hdr, hlen);
695 
696 	fh->nexthdr = nexthdr;
697 	fh->reserved = 0;
698 	fh->frag_off = htons(IP6_MF);
699 	fh->identification = frag_id;
700 
701 	first_len = skb_pagelen(skb);
702 	skb->data_len = first_len - skb_headlen(skb);
703 	skb->len = first_len;
704 	ipv6_hdr(skb)->payload_len = htons(first_len - sizeof(struct ipv6hdr));
705 
706 	return 0;
707 }
708 EXPORT_SYMBOL(ip6_fraglist_init);
709 
710 void ip6_fraglist_prepare(struct sk_buff *skb,
711 			  struct ip6_fraglist_iter *iter)
712 {
713 	struct sk_buff *frag = iter->frag;
714 	unsigned int hlen = iter->hlen;
715 	struct frag_hdr *fh;
716 
717 	frag->ip_summed = CHECKSUM_NONE;
718 	skb_reset_transport_header(frag);
719 	fh = __skb_push(frag, sizeof(struct frag_hdr));
720 	__skb_push(frag, hlen);
721 	skb_reset_network_header(frag);
722 	memcpy(skb_network_header(frag), iter->tmp_hdr, hlen);
723 	iter->offset += skb->len - hlen - sizeof(struct frag_hdr);
724 	fh->nexthdr = iter->nexthdr;
725 	fh->reserved = 0;
726 	fh->frag_off = htons(iter->offset);
727 	if (frag->next)
728 		fh->frag_off |= htons(IP6_MF);
729 	fh->identification = iter->frag_id;
730 	ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
731 	ip6_copy_metadata(frag, skb);
732 }
733 EXPORT_SYMBOL(ip6_fraglist_prepare);
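/* Editor's sketch (not in the original file): ip6_fraglist_init(),
 * ip6_fraglist_prepare() and ip6_fraglist_next() form one iterator; the
 * fast path of ip6_fragment() below drives it like this:
 *
 *	err = ip6_fraglist_init(skb, hlen, prevhdr, nexthdr, frag_id, &iter);
 *	if (err < 0)
 *		goto fail;
 *	for (;;) {
 *		if (iter.frag)
 *			ip6_fraglist_prepare(skb, &iter);
 *		err = output(net, sk, skb);
 *		if (err || !iter.frag)
 *			break;
 *		skb = ip6_fraglist_next(&iter);
 *	}
 *	kfree(iter.tmp_hdr);
 */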
734 
735 void ip6_frag_init(struct sk_buff *skb, unsigned int hlen, unsigned int mtu,
736 		   unsigned short needed_tailroom, int hdr_room, u8 *prevhdr,
737 		   u8 nexthdr, __be32 frag_id, struct ip6_frag_state *state)
738 {
739 	state->prevhdr = prevhdr;
740 	state->nexthdr = nexthdr;
741 	state->frag_id = frag_id;
742 
743 	state->hlen = hlen;
744 	state->mtu = mtu;
745 
746 	state->left = skb->len - hlen;	/* Space per frame */
747 	state->ptr = hlen;		/* Where to start from */
748 
749 	state->hroom = hdr_room;
750 	state->troom = needed_tailroom;
751 
752 	state->offset = 0;
753 }
754 EXPORT_SYMBOL(ip6_frag_init);
755 
756 struct sk_buff *ip6_frag_next(struct sk_buff *skb, struct ip6_frag_state *state)
757 {
758 	u8 *prevhdr = state->prevhdr, *fragnexthdr_offset;
759 	struct sk_buff *frag;
760 	struct frag_hdr *fh;
761 	unsigned int len;
762 
763 	len = state->left;
764 	/* IF: it doesn't fit, use 'mtu' - the data space left */
765 	if (len > state->mtu)
766 		len = state->mtu;
767 	/* IF: we are not sending up to and including the packet end
768 	   then align the next start on an eight byte boundary */
769 	if (len < state->left)
770 		len &= ~7;
771 
772 	/* Allocate buffer */
773 	frag = alloc_skb(len + state->hlen + sizeof(struct frag_hdr) +
774 			 state->hroom + state->troom, GFP_ATOMIC);
775 	if (!frag)
776 		return ERR_PTR(-ENOMEM);
777 
778 	/*
779 	 *	Set up data on packet
780 	 */
781 
782 	ip6_copy_metadata(frag, skb);
783 	skb_reserve(frag, state->hroom);
784 	skb_put(frag, len + state->hlen + sizeof(struct frag_hdr));
785 	skb_reset_network_header(frag);
786 	fh = (struct frag_hdr *)(skb_network_header(frag) + state->hlen);
787 	frag->transport_header = (frag->network_header + state->hlen +
788 				  sizeof(struct frag_hdr));
789 
790 	/*
791 	 *	Charge the memory for the fragment to any owner
792 	 *	it might possess
793 	 */
794 	if (skb->sk)
795 		skb_set_owner_w(frag, skb->sk);
796 
797 	/*
798 	 *	Copy the packet header into the new buffer.
799 	 */
800 	skb_copy_from_linear_data(skb, skb_network_header(frag), state->hlen);
801 
802 	fragnexthdr_offset = skb_network_header(frag);
803 	fragnexthdr_offset += prevhdr - skb_network_header(skb);
804 	*fragnexthdr_offset = NEXTHDR_FRAGMENT;
805 
806 	/*
807 	 *	Build fragment header.
808 	 */
809 	fh->nexthdr = state->nexthdr;
810 	fh->reserved = 0;
811 	fh->identification = state->frag_id;
812 
813 	/*
814 	 *	Copy a block of the IP datagram.
815 	 */
816 	BUG_ON(skb_copy_bits(skb, state->ptr, skb_transport_header(frag),
817 			     len));
818 	state->left -= len;
819 
820 	fh->frag_off = htons(state->offset);
821 	if (state->left > 0)
822 		fh->frag_off |= htons(IP6_MF);
823 	ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
824 
825 	state->ptr += len;
826 	state->offset += len;
827 
828 	return frag;
829 }
830 EXPORT_SYMBOL(ip6_frag_next);
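/* Editor's sketch (not in the original file): ip6_frag_init() and
 * ip6_frag_next() implement the slow path; ip6_fragment() below uses
 * them roughly as follows (the real code also frees the original skb
 * and updates SNMP counters on failure):
 *
 *	ip6_frag_init(skb, hlen, mtu, rt->dst.dev->needed_tailroom,
 *		      LL_RESERVED_SPACE(rt->dst.dev), prevhdr, nexthdr,
 *		      frag_id, &state);
 *	while (state.left > 0) {
 *		frag = ip6_frag_next(skb, &state);
 *		if (IS_ERR(frag))
 *			return PTR_ERR(frag);
 *		err = output(net, sk, frag);
 *		if (err)
 *			return err;
 *	}
 *	consume_skb(skb);
 */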
831 
832 int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
833 		 int (*output)(struct net *, struct sock *, struct sk_buff *))
834 {
835 	struct sk_buff *frag;
836 	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
837 	struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
838 				inet6_sk(skb->sk) : NULL;
839 	struct ip6_frag_state state;
840 	unsigned int mtu, hlen, nexthdr_offset;
841 	ktime_t tstamp = skb->tstamp;
842 	int hroom, err = 0;
843 	__be32 frag_id;
844 	u8 *prevhdr, nexthdr = 0;
845 
846 	err = ip6_find_1stfragopt(skb, &prevhdr);
847 	if (err < 0)
848 		goto fail;
849 	hlen = err;
850 	nexthdr = *prevhdr;
851 	nexthdr_offset = prevhdr - skb_network_header(skb);
852 
853 	mtu = ip6_skb_dst_mtu(skb);
854 
855 	/* We must not fragment if the socket is set to force MTU discovery
856 	 * or if the skb was not generated by a local socket.
857 	 */
858 	if (unlikely(!skb->ignore_df && skb->len > mtu))
859 		goto fail_toobig;
860 
861 	if (IP6CB(skb)->frag_max_size) {
862 		if (IP6CB(skb)->frag_max_size > mtu)
863 			goto fail_toobig;
864 
865 		/* don't send fragments larger than what we received */
866 		mtu = IP6CB(skb)->frag_max_size;
867 		if (mtu < IPV6_MIN_MTU)
868 			mtu = IPV6_MIN_MTU;
869 	}
870 
871 	if (np && np->frag_size < mtu) {
872 		if (np->frag_size)
873 			mtu = np->frag_size;
874 	}
875 	if (mtu < hlen + sizeof(struct frag_hdr) + 8)
876 		goto fail_toobig;
877 	mtu -= hlen + sizeof(struct frag_hdr);
878 
879 	frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
880 				    &ipv6_hdr(skb)->saddr);
881 
882 	if (skb->ip_summed == CHECKSUM_PARTIAL &&
883 	    (err = skb_checksum_help(skb)))
884 		goto fail;
885 
886 	prevhdr = skb_network_header(skb) + nexthdr_offset;
887 	hroom = LL_RESERVED_SPACE(rt->dst.dev);
888 	if (skb_has_frag_list(skb)) {
889 		unsigned int first_len = skb_pagelen(skb);
890 		struct ip6_fraglist_iter iter;
891 		struct sk_buff *frag2;
892 
893 		if (first_len - hlen > mtu ||
894 		    ((first_len - hlen) & 7) ||
895 		    skb_cloned(skb) ||
896 		    skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
897 			goto slow_path;
898 
899 		skb_walk_frags(skb, frag) {
900 			/* Correct geometry. */
901 			if (frag->len > mtu ||
902 			    ((frag->len & 7) && frag->next) ||
903 			    skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
904 				goto slow_path_clean;
905 
906 			/* Partially cloned skb? */
907 			if (skb_shared(frag))
908 				goto slow_path_clean;
909 
910 			BUG_ON(frag->sk);
911 			if (skb->sk) {
912 				frag->sk = skb->sk;
913 				frag->destructor = sock_wfree;
914 			}
915 			skb->truesize -= frag->truesize;
916 		}
917 
918 		err = ip6_fraglist_init(skb, hlen, prevhdr, nexthdr, frag_id,
919 					&iter);
920 		if (err < 0)
921 			goto fail;
922 
923 		for (;;) {
924 			/* Prepare the header of the next frame
925 			 * before the previous one goes down. */
926 			if (iter.frag)
927 				ip6_fraglist_prepare(skb, &iter);
928 
929 			skb->tstamp = tstamp;
930 			err = output(net, sk, skb);
931 			if (!err)
932 				IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
933 					      IPSTATS_MIB_FRAGCREATES);
934 
935 			if (err || !iter.frag)
936 				break;
937 
938 			skb = ip6_fraglist_next(&iter);
939 		}
940 
941 		kfree(iter.tmp_hdr);
942 
943 		if (err == 0) {
944 			IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
945 				      IPSTATS_MIB_FRAGOKS);
946 			return 0;
947 		}
948 
949 		kfree_skb_list(iter.frag);
950 
951 		IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
952 			      IPSTATS_MIB_FRAGFAILS);
953 		return err;
954 
955 slow_path_clean:
956 		skb_walk_frags(skb, frag2) {
957 			if (frag2 == frag)
958 				break;
959 			frag2->sk = NULL;
960 			frag2->destructor = NULL;
961 			skb->truesize += frag2->truesize;
962 		}
963 	}
964 
965 slow_path:
966 	/*
967 	 *	Fragment the datagram.
968 	 */
969 
970 	ip6_frag_init(skb, hlen, mtu, rt->dst.dev->needed_tailroom,
971 		      LL_RESERVED_SPACE(rt->dst.dev), prevhdr, nexthdr, frag_id,
972 		      &state);
973 
974 	/*
975 	 *	Keep copying data until we run out.
976 	 */
977 
978 	while (state.left > 0) {
979 		frag = ip6_frag_next(skb, &state);
980 		if (IS_ERR(frag)) {
981 			err = PTR_ERR(frag);
982 			goto fail;
983 		}
984 
985 		/*
986 		 *	Put this fragment into the sending queue.
987 		 */
988 		frag->tstamp = tstamp;
989 		err = output(net, sk, frag);
990 		if (err)
991 			goto fail;
992 
993 		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
994 			      IPSTATS_MIB_FRAGCREATES);
995 	}
996 	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
997 		      IPSTATS_MIB_FRAGOKS);
998 	consume_skb(skb);
999 	return err;
1000 
1001 fail_toobig:
1002 	if (skb->sk && dst_allfrag(skb_dst(skb)))
1003 		sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);
1004 
1005 	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
1006 	err = -EMSGSIZE;
1007 
1008 fail:
1009 	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
1010 		      IPSTATS_MIB_FRAGFAILS);
1011 	kfree_skb(skb);
1012 	return err;
1013 }
1014 
1015 static inline int ip6_rt_check(const struct rt6key *rt_key,
1016 			       const struct in6_addr *fl_addr,
1017 			       const struct in6_addr *addr_cache)
1018 {
1019 	return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
1020 		(!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
1021 }
1022 
1023 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
1024 					  struct dst_entry *dst,
1025 					  const struct flowi6 *fl6)
1026 {
1027 	struct ipv6_pinfo *np = inet6_sk(sk);
1028 	struct rt6_info *rt;
1029 
1030 	if (!dst)
1031 		goto out;
1032 
1033 	if (dst->ops->family != AF_INET6) {
1034 		dst_release(dst);
1035 		return NULL;
1036 	}
1037 
1038 	rt = (struct rt6_info *)dst;
1039 	/* Yes, checking route validity in the non-connected
1040 	 * case is not very simple. Take into account
1041 	 * that we do not support routing by source, TOS,
1042 	 * or MSG_DONTROUTE		--ANK (980726)
1043 	 *
1044 	 * 1. ip6_rt_check(): If the route was a host route,
1045 	 *    check that the cached destination is current.
1046 	 *    If it is a network route, we may still
1047 	 *    check its validity using a saved pointer
1048 	 *    to the last used address: daddr_cache.
1049 	 *    We do not want to save the whole address now
1050 	 *    (because the main consumer of this service
1051 	 *    is TCP, which does not have this problem),
1052 	 *    so this last trick works only on connected
1053 	 *    sockets.
1054 	 * 2. oif should also be the same.
1055 	 */
1056 	if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
1057 #ifdef CONFIG_IPV6_SUBTREES
1058 	    ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
1059 #endif
1060 	   (!(fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) &&
1061 	      (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex))) {
1062 		dst_release(dst);
1063 		dst = NULL;
1064 	}
1065 
1066 out:
1067 	return dst;
1068 }
1069 
1070 static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
1071 			       struct dst_entry **dst, struct flowi6 *fl6)
1072 {
1073 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
1074 	struct neighbour *n;
1075 	struct rt6_info *rt;
1076 #endif
1077 	int err;
1078 	int flags = 0;
1079 
1080 	/* The correct way to handle this would be to do
1081 	 * ip6_route_get_saddr, and then ip6_route_output; however,
1082 	 * the route-specific preferred source forces the
1083 	 * ip6_route_output call _before_ ip6_route_get_saddr.
1084 	 *
1085 	 * In source specific routing (no src=any default route),
1086 	 * ip6_route_output will fail given src=any saddr, though, so
1087 	 * that's why we try it again later.
1088 	 */
1089 	if (ipv6_addr_any(&fl6->saddr)) {
1090 		struct fib6_info *from;
1091 		struct rt6_info *rt;
1092 
1093 		*dst = ip6_route_output(net, sk, fl6);
1094 		rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;
1095 
1096 		rcu_read_lock();
1097 		from = rt ? rcu_dereference(rt->from) : NULL;
1098 		err = ip6_route_get_saddr(net, from, &fl6->daddr,
1099 					  sk ? inet6_sk(sk)->srcprefs : 0,
1100 					  &fl6->saddr);
1101 		rcu_read_unlock();
1102 
1103 		if (err)
1104 			goto out_err_release;
1105 
1106 		/* If we had an erroneous initial result, pretend it
1107 		 * never existed and let the SA-enabled version take
1108 		 * over.
1109 		 */
1110 		if ((*dst)->error) {
1111 			dst_release(*dst);
1112 			*dst = NULL;
1113 		}
1114 
1115 		if (fl6->flowi6_oif)
1116 			flags |= RT6_LOOKUP_F_IFACE;
1117 	}
1118 
1119 	if (!*dst)
1120 		*dst = ip6_route_output_flags(net, sk, fl6, flags);
1121 
1122 	err = (*dst)->error;
1123 	if (err)
1124 		goto out_err_release;
1125 
1126 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
1127 	/*
1128 	 * If the dst entry we've looked up here has a
1129 	 * neighbour entry that is in the INCOMPLETE state
1130 	 * and the source address from the flow is marked
1131 	 * as OPTIMISTIC, we release the found dst entry
1132 	 * and replace it with the dst entry of the
1133 	 * nexthop router instead.
1134 	 */
1135 	rt = (struct rt6_info *) *dst;
1136 	rcu_read_lock_bh();
1137 	n = __ipv6_neigh_lookup_noref(rt->dst.dev,
1138 				      rt6_nexthop(rt, &fl6->daddr));
1139 	err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
1140 	rcu_read_unlock_bh();
1141 
1142 	if (err) {
1143 		struct inet6_ifaddr *ifp;
1144 		struct flowi6 fl_gw6;
1145 		int redirect;
1146 
1147 		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
1148 				      (*dst)->dev, 1);
1149 
1150 		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
1151 		if (ifp)
1152 			in6_ifa_put(ifp);
1153 
1154 		if (redirect) {
1155 			/*
1156 			 * We need to get the dst entry for the
1157 			 * default router instead
1158 			 */
1159 			dst_release(*dst);
1160 			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
1161 			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
1162 			*dst = ip6_route_output(net, sk, &fl_gw6);
1163 			err = (*dst)->error;
1164 			if (err)
1165 				goto out_err_release;
1166 		}
1167 	}
1168 #endif
1169 	if (ipv6_addr_v4mapped(&fl6->saddr) &&
1170 	    !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
1171 		err = -EAFNOSUPPORT;
1172 		goto out_err_release;
1173 	}
1174 
1175 	return 0;
1176 
1177 out_err_release:
1178 	dst_release(*dst);
1179 	*dst = NULL;
1180 
1181 	if (err == -ENETUNREACH)
1182 		IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
1183 	return err;
1184 }
1185 
1186 /**
1187  *	ip6_dst_lookup - perform route lookup on flow
1188  *	@net: Network namespace to perform lookup in
1189  *	@sk: socket which provides route info
1190  *	@dst: pointer to dst_entry * for result
1191  *	@fl6: flow to lookup
1192  *
1193  *	This function performs a route lookup on the given flow.
1194  *
1195  *	It returns zero on success, or a standard errno code on error.
1196  */
1197 int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
1198 		   struct flowi6 *fl6)
1199 {
1200 	*dst = NULL;
1201 	return ip6_dst_lookup_tail(net, sk, dst, fl6);
1202 }
1203 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
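/* Editor's sketch (not in the original file): a minimal caller, assuming
 * fl6 has at least daddr (and optionally saddr/oif) filled in:
 *
 *	struct dst_entry *dst;
 *	int err = ip6_dst_lookup(net, sk, &dst, &fl6);
 *
 *	if (err)
 *		return err;
 *	...use dst...
 *	dst_release(dst);
 *
 * On failure *dst is set to NULL, so no release is needed in the error path.
 */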
1204 
1205 /**
1206  *	ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1207  *	@net: Network namespace to perform lookup in
1208  *	@sk: socket which provides route info
1209  *	@fl6: flow to lookup
1210  *	@final_dst: final destination address for ipsec lookup
1211  *
1212  *	This function performs a route lookup on the given flow.
1213  *
1214  *	It returns a valid dst pointer on success, or a pointer encoded
1215  *	error code.
1216  */
1217 struct dst_entry *ip6_dst_lookup_flow(struct net *net, const struct sock *sk, struct flowi6 *fl6,
1218 				      const struct in6_addr *final_dst)
1219 {
1220 	struct dst_entry *dst = NULL;
1221 	int err;
1222 
1223 	err = ip6_dst_lookup_tail(net, sk, &dst, fl6);
1224 	if (err)
1225 		return ERR_PTR(err);
1226 	if (final_dst)
1227 		fl6->daddr = *final_dst;
1228 
1229 	return xfrm_lookup_route(net, dst, flowi6_to_flowi(fl6), sk, 0);
1230 }
1231 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
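/* Editor's sketch (not in the original file): unlike ip6_dst_lookup(), the
 * _flow variant reports errors through the returned pointer itself:
 *
 *	dst = ip6_dst_lookup_flow(net, sk, &fl6, final_dst);
 *	if (IS_ERR(dst))
 *		return PTR_ERR(dst);
 */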
1232 
1233 /**
1234  *	ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1235  *	@sk: socket which provides the dst cache and route info
1236  *	@fl6: flow to lookup
1237  *	@final_dst: final destination address for ipsec lookup
1238  *	@connected: whether @sk is connected or not
1239  *
1240  *	This function performs a route lookup on the given flow with the
1241  *	possibility of using the cached route in the socket if it is valid.
1242  *	It will take the socket dst lock when operating on the dst cache.
1243  *	As a result, this function can only be used in process context.
1244  *
1245  *	In addition, for a connected socket, cache the dst in the socket
1246  *	if the current cache is not valid.
1247  *
1248  *	It returns a valid dst pointer on success, or a pointer encoded
1249  *	error code.
1250  */
1251 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1252 					 const struct in6_addr *final_dst,
1253 					 bool connected)
1254 {
1255 	struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1256 
1257 	dst = ip6_sk_dst_check(sk, dst, fl6);
1258 	if (dst)
1259 		return dst;
1260 
1261 	dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_dst);
1262 	if (connected && !IS_ERR(dst))
1263 		ip6_sk_dst_store_flow(sk, dst_clone(dst), fl6);
1264 
1265 	return dst;
1266 }
1267 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1268 
1269 /**
1270  *      ip6_dst_lookup_tunnel - perform route lookup on tunnel
1271  *      @skb: Packet for which lookup is done
1272  *      @dev: Tunnel device
1273  *      @net: Network namespace of tunnel device
1274  *      @sock: Socket which provides route info
1275  *      @saddr: Memory to store the src ip address
1276  *      @info: Tunnel information
1277  *      @protocol: IP protocol
1278  *      @use_cache: Flag to enable cache usage
1279  *      This function performs a route lookup on a tunnel
1280  *
1281  *      It returns a valid dst pointer and stores src address to be used in
1282  *      tunnel in param saddr on success, else a pointer encoded error code.
1283  */
1284 
1285 struct dst_entry *ip6_dst_lookup_tunnel(struct sk_buff *skb,
1286 					struct net_device *dev,
1287 					struct net *net,
1288 					struct socket *sock,
1289 					struct in6_addr *saddr,
1290 					const struct ip_tunnel_info *info,
1291 					u8 protocol,
1292 					bool use_cache)
1293 {
1294 	struct dst_entry *dst = NULL;
1295 #ifdef CONFIG_DST_CACHE
1296 	struct dst_cache *dst_cache;
1297 #endif
1298 	struct flowi6 fl6;
1299 	__u8 prio;
1300 
1301 #ifdef CONFIG_DST_CACHE
1302 	dst_cache = (struct dst_cache *)&info->dst_cache;
1303 	if (use_cache) {
1304 		dst = dst_cache_get_ip6(dst_cache, saddr);
1305 		if (dst)
1306 			return dst;
1307 	}
1308 #endif
1309 	memset(&fl6, 0, sizeof(fl6));
1310 	fl6.flowi6_mark = skb->mark;
1311 	fl6.flowi6_proto = protocol;
1312 	fl6.daddr = info->key.u.ipv6.dst;
1313 	fl6.saddr = info->key.u.ipv6.src;
1314 	prio = info->key.tos;
1315 	fl6.flowlabel = ip6_make_flowinfo(RT_TOS(prio),
1316 					  info->key.label);
1317 
1318 	dst = ipv6_stub->ipv6_dst_lookup_flow(net, sock->sk, &fl6,
1319 					      NULL);
1320 	if (IS_ERR(dst)) {
1321 		netdev_dbg(dev, "no route to %pI6\n", &fl6.daddr);
1322 		return ERR_PTR(-ENETUNREACH);
1323 	}
1324 	if (dst->dev == dev) { /* is this necessary? */
1325 		netdev_dbg(dev, "circular route to %pI6\n", &fl6.daddr);
1326 		dst_release(dst);
1327 		return ERR_PTR(-ELOOP);
1328 	}
1329 #ifdef CONFIG_DST_CACHE
1330 	if (use_cache)
1331 		dst_cache_set_ip6(dst_cache, dst, &fl6.saddr);
1332 #endif
1333 	*saddr = fl6.saddr;
1334 	return dst;
1335 }
1336 EXPORT_SYMBOL_GPL(ip6_dst_lookup_tunnel);
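/* Editor's sketch (not in the original file): a UDP tunnel driver might
 * use this on its transmit path, with @use_cache true only when the
 * tunnel key is static enough for the cached route to stay valid:
 *
 *	struct in6_addr saddr;
 *	struct dst_entry *dst;
 *
 *	dst = ip6_dst_lookup_tunnel(skb, dev, net, sock, &saddr, info,
 *				    IPPROTO_UDP, use_cache);
 *	if (IS_ERR(dst))
 *		return PTR_ERR(dst);
 */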
1337 
1338 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1339 					       gfp_t gfp)
1340 {
1341 	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1342 }
1343 
1344 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1345 						gfp_t gfp)
1346 {
1347 	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1348 }
1349 
1350 static void ip6_append_data_mtu(unsigned int *mtu,
1351 				int *maxfraglen,
1352 				unsigned int fragheaderlen,
1353 				struct sk_buff *skb,
1354 				struct rt6_info *rt,
1355 				unsigned int orig_mtu)
1356 {
1357 	if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1358 		if (!skb) {
1359 			/* first fragment, reserve header_len */
1360 			*mtu = orig_mtu - rt->dst.header_len;
1361 
1362 		} else {
1363 			/*
1364 			 * this fragment is not the first; the header
1365 			 * space is regarded as data space.
1366 			 */
1367 			*mtu = orig_mtu;
1368 		}
1369 		*maxfraglen = ((*mtu - fragheaderlen) & ~7)
1370 			      + fragheaderlen - sizeof(struct frag_hdr);
1371 	}
1372 }
1373 
1374 static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
1375 			  struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
1376 			  struct rt6_info *rt, struct flowi6 *fl6)
1377 {
1378 	struct ipv6_pinfo *np = inet6_sk(sk);
1379 	unsigned int mtu;
1380 	struct ipv6_txoptions *opt = ipc6->opt;
1381 
1382 	/*
1383 	 * setup for corking
1384 	 */
1385 	if (opt) {
1386 		if (WARN_ON(v6_cork->opt))
1387 			return -EINVAL;
1388 
1389 		v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
1390 		if (unlikely(!v6_cork->opt))
1391 			return -ENOBUFS;
1392 
1393 		v6_cork->opt->tot_len = sizeof(*opt);
1394 		v6_cork->opt->opt_flen = opt->opt_flen;
1395 		v6_cork->opt->opt_nflen = opt->opt_nflen;
1396 
1397 		v6_cork->opt->dst0opt = ip6_opt_dup(opt->dst0opt,
1398 						    sk->sk_allocation);
1399 		if (opt->dst0opt && !v6_cork->opt->dst0opt)
1400 			return -ENOBUFS;
1401 
1402 		v6_cork->opt->dst1opt = ip6_opt_dup(opt->dst1opt,
1403 						    sk->sk_allocation);
1404 		if (opt->dst1opt && !v6_cork->opt->dst1opt)
1405 			return -ENOBUFS;
1406 
1407 		v6_cork->opt->hopopt = ip6_opt_dup(opt->hopopt,
1408 						   sk->sk_allocation);
1409 		if (opt->hopopt && !v6_cork->opt->hopopt)
1410 			return -ENOBUFS;
1411 
1412 		v6_cork->opt->srcrt = ip6_rthdr_dup(opt->srcrt,
1413 						    sk->sk_allocation);
1414 		if (opt->srcrt && !v6_cork->opt->srcrt)
1415 			return -ENOBUFS;
1416 
1417 		/* need source address above --miyazawa */
1418 	}
1419 	dst_hold(&rt->dst);
1420 	cork->base.dst = &rt->dst;
1421 	cork->fl.u.ip6 = *fl6;
1422 	v6_cork->hop_limit = ipc6->hlimit;
1423 	v6_cork->tclass = ipc6->tclass;
1424 	if (rt->dst.flags & DST_XFRM_TUNNEL)
1425 		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1426 		      READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst);
1427 	else
1428 		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1429 			READ_ONCE(rt->dst.dev->mtu) : dst_mtu(xfrm_dst_path(&rt->dst));
1430 	if (np->frag_size < mtu) {
1431 		if (np->frag_size)
1432 			mtu = np->frag_size;
1433 	}
1434 	if (mtu < IPV6_MIN_MTU)
1435 		return -EINVAL;
1436 	cork->base.fragsize = mtu;
1437 	cork->base.gso_size = ipc6->gso_size;
1438 	cork->base.tx_flags = 0;
1439 	cork->base.mark = ipc6->sockc.mark;
1440 	sock_tx_timestamp(sk, ipc6->sockc.tsflags, &cork->base.tx_flags);
1441 
1442 	if (dst_allfrag(xfrm_dst_path(&rt->dst)))
1443 		cork->base.flags |= IPCORK_ALLFRAG;
1444 	cork->base.length = 0;
1445 
1446 	cork->base.transmit_time = ipc6->sockc.transmit_time;
1447 
1448 	return 0;
1449 }
1450 
1451 static int __ip6_append_data(struct sock *sk,
1452 			     struct flowi6 *fl6,
1453 			     struct sk_buff_head *queue,
1454 			     struct inet_cork *cork,
1455 			     struct inet6_cork *v6_cork,
1456 			     struct page_frag *pfrag,
1457 			     int getfrag(void *from, char *to, int offset,
1458 					 int len, int odd, struct sk_buff *skb),
1459 			     void *from, int length, int transhdrlen,
1460 			     unsigned int flags, struct ipcm6_cookie *ipc6)
1461 {
1462 	struct sk_buff *skb, *skb_prev = NULL;
1463 	unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
1464 	struct ubuf_info *uarg = NULL;
1465 	int exthdrlen = 0;
1466 	int dst_exthdrlen = 0;
1467 	int hh_len;
1468 	int copy;
1469 	int err;
1470 	int offset = 0;
1471 	u32 tskey = 0;
1472 	struct rt6_info *rt = (struct rt6_info *)cork->dst;
1473 	struct ipv6_txoptions *opt = v6_cork->opt;
1474 	int csummode = CHECKSUM_NONE;
1475 	unsigned int maxnonfragsize, headersize;
1476 	unsigned int wmem_alloc_delta = 0;
1477 	bool paged, extra_uref = false;
1478 
1479 	skb = skb_peek_tail(queue);
1480 	if (!skb) {
1481 		exthdrlen = opt ? opt->opt_flen : 0;
1482 		dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
1483 	}
1484 
1485 	paged = !!cork->gso_size;
1486 	mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize;
1487 	orig_mtu = mtu;
1488 
1489 	if (cork->tx_flags & SKBTX_ANY_SW_TSTAMP &&
1490 	    sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
1491 		tskey = sk->sk_tskey++;
1492 
1493 	hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1494 
1495 	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1496 			(opt ? opt->opt_nflen : 0);
1497 	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
1498 		     sizeof(struct frag_hdr);
1499 
1500 	headersize = sizeof(struct ipv6hdr) +
1501 		     (opt ? opt->opt_flen + opt->opt_nflen : 0) +
1502 		     (dst_allfrag(&rt->dst) ?
1503 		      sizeof(struct frag_hdr) : 0) +
1504 		     rt->rt6i_nfheader_len;
1505 
1506 	/* As per RFC 7112 section 5, the entire IPv6 Header Chain must fit in
1507 	 * the first fragment
1508 	 */
1509 	if (headersize + transhdrlen > mtu)
1510 		goto emsgsize;
1511 
1512 	if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
1513 	    (sk->sk_protocol == IPPROTO_UDP ||
1514 	     sk->sk_protocol == IPPROTO_RAW)) {
1515 		ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
1516 				sizeof(struct ipv6hdr));
1517 		goto emsgsize;
1518 	}
1519 
1520 	if (ip6_sk_ignore_df(sk))
1521 		maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
1522 	else
1523 		maxnonfragsize = mtu;
1524 
1525 	if (cork->length + length > maxnonfragsize - headersize) {
1526 emsgsize:
1527 		pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0);
1528 		ipv6_local_error(sk, EMSGSIZE, fl6, pmtu);
1529 		return -EMSGSIZE;
1530 	}
1531 
1532 	/* CHECKSUM_PARTIAL only with no extension headers and when
1533 	 * we are not going to fragment
1534 	 */
1535 	if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
1536 	    headersize == sizeof(struct ipv6hdr) &&
1537 	    length <= mtu - headersize &&
1538 	    (!(flags & MSG_MORE) || cork->gso_size) &&
1539 	    rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
1540 		csummode = CHECKSUM_PARTIAL;
1541 
1542 	if (flags & MSG_ZEROCOPY && length && sock_flag(sk, SOCK_ZEROCOPY)) {
1543 		uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb));
1544 		if (!uarg)
1545 			return -ENOBUFS;
1546 		extra_uref = !skb_zcopy(skb);	/* only ref on new uarg */
1547 		if (rt->dst.dev->features & NETIF_F_SG &&
1548 		    csummode == CHECKSUM_PARTIAL) {
1549 			paged = true;
1550 		} else {
1551 			uarg->zerocopy = 0;
1552 			skb_zcopy_set(skb, uarg, &extra_uref);
1553 		}
1554 	}
1555 
1556 	/*
1557 	 * Let's try using as much space as possible.
1558 	 * Use MTU if total length of the message fits into the MTU.
1559 	 * Otherwise, we need to reserve the fragment header and
1560 	 * fragment alignment (= 8-15 octets in total).
1561 	 *
1562 	 * Note that we may need to "move" the data from the tail
1563 	 * of the buffer to the new fragment when we split
1564 	 * the message.
1565 	 *
1566 	 * FIXME: It may be fragmented into multiple chunks
1567 	 *        at once if non-fragmentable extension headers
1568 	 *        are too large.
1569 	 * --yoshfuji
1570 	 */
1571 
1572 	cork->length += length;
1573 	if (!skb)
1574 		goto alloc_new_skb;
1575 
1576 	while (length > 0) {
1577 		/* Check if the remaining data fits into current packet. */
1578 		copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1579 		if (copy < length)
1580 			copy = maxfraglen - skb->len;
1581 
1582 		if (copy <= 0) {
1583 			char *data;
1584 			unsigned int datalen;
1585 			unsigned int fraglen;
1586 			unsigned int fraggap;
1587 			unsigned int alloclen, alloc_extra;
1588 			unsigned int pagedlen;
1589 alloc_new_skb:
1590 			/* There's no room in the current skb */
1591 			if (skb)
1592 				fraggap = skb->len - maxfraglen;
1593 			else
1594 				fraggap = 0;
1595 			/* update mtu and maxfraglen if necessary */
1596 			if (!skb || !skb_prev)
1597 				ip6_append_data_mtu(&mtu, &maxfraglen,
1598 						    fragheaderlen, skb, rt,
1599 						    orig_mtu);
1600 
1601 			skb_prev = skb;
1602 
1603 			/*
1604 			 * If remaining data exceeds the mtu,
1605 			 * we know we need more fragment(s).
1606 			 */
1607 			datalen = length + fraggap;
1608 
1609 			if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1610 				datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
1611 			fraglen = datalen + fragheaderlen;
1612 			pagedlen = 0;
1613 
1614 			alloc_extra = hh_len;
1615 			alloc_extra += dst_exthdrlen;
1616 			alloc_extra += rt->dst.trailer_len;
1617 
1618 			/* We just reserve space for the fragment header.
1619 			 * Note: this may be an over-allocation if the message
1620 			 * (without MSG_MORE) fits into the MTU.
1621 			 */
1622 			alloc_extra += sizeof(struct frag_hdr);
1623 
1624 			if ((flags & MSG_MORE) &&
1625 			    !(rt->dst.dev->features&NETIF_F_SG))
1626 				alloclen = mtu;
1627 			else if (!paged &&
1628 				 (fraglen + alloc_extra < SKB_MAX_ALLOC ||
1629 				  !(rt->dst.dev->features & NETIF_F_SG)))
1630 				alloclen = fraglen;
1631 			else {
1632 				alloclen = min_t(int, fraglen, MAX_HEADER);
1633 				pagedlen = fraglen - alloclen;
1634 			}
1635 			alloclen += alloc_extra;
1636 
1637 			if (datalen != length + fraggap) {
1638 				/*
1639 				 * this is not the last fragment; the trailer
1640 				 * space is regarded as data space.
1641 				 */
1642 				datalen += rt->dst.trailer_len;
1643 			}
1644 
1645 			fraglen = datalen + fragheaderlen;
1646 
1647 			copy = datalen - transhdrlen - fraggap - pagedlen;
1648 			if (copy < 0) {
1649 				err = -EINVAL;
1650 				goto error;
1651 			}
1652 			if (transhdrlen) {
1653 				skb = sock_alloc_send_skb(sk, alloclen,
1654 						(flags & MSG_DONTWAIT), &err);
1655 			} else {
1656 				skb = NULL;
1657 				if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <=
1658 				    2 * sk->sk_sndbuf)
1659 					skb = alloc_skb(alloclen,
1660 							sk->sk_allocation);
1661 				if (unlikely(!skb))
1662 					err = -ENOBUFS;
1663 			}
1664 			if (!skb)
1665 				goto error;
1666 			/*
1667 			 *	Fill in the control structures
1668 			 */
1669 			skb->protocol = htons(ETH_P_IPV6);
1670 			skb->ip_summed = csummode;
1671 			skb->csum = 0;
1672 			/* reserve for fragmentation and ipsec header */
1673 			skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1674 				    dst_exthdrlen);
1675 
1676 			/*
1677 			 *	Find where to start putting bytes
1678 			 */
1679 			data = skb_put(skb, fraglen - pagedlen);
1680 			skb_set_network_header(skb, exthdrlen);
1681 			data += fragheaderlen;
1682 			skb->transport_header = (skb->network_header +
1683 						 fragheaderlen);
1684 			if (fraggap) {
1685 				skb->csum = skb_copy_and_csum_bits(
1686 					skb_prev, maxfraglen,
1687 					data + transhdrlen, fraggap);
1688 				skb_prev->csum = csum_sub(skb_prev->csum,
1689 							  skb->csum);
1690 				data += fraggap;
1691 				pskb_trim_unique(skb_prev, maxfraglen);
1692 			}
1693 			if (copy > 0 &&
1694 			    getfrag(from, data + transhdrlen, offset,
1695 				    copy, fraggap, skb) < 0) {
1696 				err = -EFAULT;
1697 				kfree_skb(skb);
1698 				goto error;
1699 			}
1700 
1701 			offset += copy;
1702 			length -= copy + transhdrlen;
1703 			transhdrlen = 0;
1704 			exthdrlen = 0;
1705 			dst_exthdrlen = 0;
1706 
1707 			/* Only the initial fragment is time stamped */
1708 			skb_shinfo(skb)->tx_flags = cork->tx_flags;
1709 			cork->tx_flags = 0;
1710 			skb_shinfo(skb)->tskey = tskey;
1711 			tskey = 0;
1712 			skb_zcopy_set(skb, uarg, &extra_uref);
1713 
1714 			if ((flags & MSG_CONFIRM) && !skb_prev)
1715 				skb_set_dst_pending_confirm(skb, 1);
1716 
1717 			/*
1718 			 * Put the packet on the pending queue
1719 			 */
1720 			if (!skb->destructor) {
1721 				skb->destructor = sock_wfree;
1722 				skb->sk = sk;
1723 				wmem_alloc_delta += skb->truesize;
1724 			}
1725 			__skb_queue_tail(queue, skb);
1726 			continue;
1727 		}
1728 
1729 		if (copy > length)
1730 			copy = length;
1731 
1732 		if (!(rt->dst.dev->features&NETIF_F_SG) &&
1733 		    skb_tailroom(skb) >= copy) {
1734 			unsigned int off;
1735 
1736 			off = skb->len;
1737 			if (getfrag(from, skb_put(skb, copy),
1738 						offset, copy, off, skb) < 0) {
1739 				__skb_trim(skb, off);
1740 				err = -EFAULT;
1741 				goto error;
1742 			}
1743 		} else if (!uarg || !uarg->zerocopy) {
1744 			int i = skb_shinfo(skb)->nr_frags;
1745 
1746 			err = -ENOMEM;
1747 			if (!sk_page_frag_refill(sk, pfrag))
1748 				goto error;
1749 
1750 			if (!skb_can_coalesce(skb, i, pfrag->page,
1751 					      pfrag->offset)) {
1752 				err = -EMSGSIZE;
1753 				if (i == MAX_SKB_FRAGS)
1754 					goto error;
1755 
1756 				__skb_fill_page_desc(skb, i, pfrag->page,
1757 						     pfrag->offset, 0);
1758 				skb_shinfo(skb)->nr_frags = ++i;
1759 				get_page(pfrag->page);
1760 			}
1761 			copy = min_t(int, copy, pfrag->size - pfrag->offset);
1762 			if (getfrag(from,
1763 				    page_address(pfrag->page) + pfrag->offset,
1764 				    offset, copy, skb->len, skb) < 0)
1765 				goto error_efault;
1766 
1767 			pfrag->offset += copy;
1768 			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1769 			skb->len += copy;
1770 			skb->data_len += copy;
1771 			skb->truesize += copy;
1772 			wmem_alloc_delta += copy;
1773 		} else {
1774 			err = skb_zerocopy_iter_dgram(skb, from, copy);
1775 			if (err < 0)
1776 				goto error;
1777 		}
1778 		offset += copy;
1779 		length -= copy;
1780 	}
1781 
1782 	if (wmem_alloc_delta)
1783 		refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1784 	return 0;
1785 
1786 error_efault:
1787 	err = -EFAULT;
1788 error:
1789 	net_zcopy_put_abort(uarg, extra_uref);
1790 	cork->length -= length;
1791 	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1792 	refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1793 	return err;
1794 }
1795 
1796 int ip6_append_data(struct sock *sk,
1797 		    int getfrag(void *from, char *to, int offset, int len,
1798 				int odd, struct sk_buff *skb),
1799 		    void *from, int length, int transhdrlen,
1800 		    struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1801 		    struct rt6_info *rt, unsigned int flags)
1802 {
1803 	struct inet_sock *inet = inet_sk(sk);
1804 	struct ipv6_pinfo *np = inet6_sk(sk);
1805 	int exthdrlen;
1806 	int err;
1807 
1808 	if (flags&MSG_PROBE)
1809 		return 0;
1810 	if (skb_queue_empty(&sk->sk_write_queue)) {
1811 		/*
1812 		 * setup for corking
1813 		 */
1814 		err = ip6_setup_cork(sk, &inet->cork, &np->cork,
1815 				     ipc6, rt, fl6);
1816 		if (err)
1817 			return err;
1818 
1819 		exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1820 		length += exthdrlen;
1821 		transhdrlen += exthdrlen;
1822 	} else {
1823 		fl6 = &inet->cork.fl.u.ip6;
1824 		transhdrlen = 0;
1825 	}
1826 
1827 	return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base,
1828 				 &np->cork, sk_page_frag(sk), getfrag,
1829 				 from, length, transhdrlen, flags, ipc6);
1830 }
1831 EXPORT_SYMBOL_GPL(ip6_append_data);
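/* Editor's sketch (not in the original file): the classic corked-send
 * pattern, roughly as the raw and UDP sendmsg paths use it under
 * lock_sock():
 *
 *	err = ip6_append_data(sk, getfrag, msg, len, transhdrlen,
 *			      &ipc6, &fl6, rt, msg->msg_flags);
 *	if (err)
 *		ip6_flush_pending_frames(sk);
 *	else if (!(msg->msg_flags & MSG_MORE))
 *		err = ip6_push_pending_frames(sk);
 */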
1832 
1833 static void ip6_cork_release(struct inet_cork_full *cork,
1834 			     struct inet6_cork *v6_cork)
1835 {
1836 	if (v6_cork->opt) {
1837 		kfree(v6_cork->opt->dst0opt);
1838 		kfree(v6_cork->opt->dst1opt);
1839 		kfree(v6_cork->opt->hopopt);
1840 		kfree(v6_cork->opt->srcrt);
1841 		kfree(v6_cork->opt);
1842 		v6_cork->opt = NULL;
1843 	}
1844 
1845 	if (cork->base.dst) {
1846 		dst_release(cork->base.dst);
1847 		cork->base.dst = NULL;
1848 		cork->base.flags &= ~IPCORK_ALLFRAG;
1849 	}
1850 	memset(&cork->fl, 0, sizeof(cork->fl));
1851 }
1852 
1853 struct sk_buff *__ip6_make_skb(struct sock *sk,
1854 			       struct sk_buff_head *queue,
1855 			       struct inet_cork_full *cork,
1856 			       struct inet6_cork *v6_cork)
1857 {
1858 	struct sk_buff *skb, *tmp_skb;
1859 	struct sk_buff **tail_skb;
1860 	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1861 	struct ipv6_pinfo *np = inet6_sk(sk);
1862 	struct net *net = sock_net(sk);
1863 	struct ipv6hdr *hdr;
1864 	struct ipv6_txoptions *opt = v6_cork->opt;
1865 	struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
1866 	struct flowi6 *fl6 = &cork->fl.u.ip6;
1867 	unsigned char proto = fl6->flowi6_proto;
1868 
1869 	skb = __skb_dequeue(queue);
1870 	if (!skb)
1871 		goto out;
1872 	tail_skb = &(skb_shinfo(skb)->frag_list);
1873 
1874 	/* move skb->data to ip header from ext header */
1875 	if (skb->data < skb_network_header(skb))
1876 		__skb_pull(skb, skb_network_offset(skb));
1877 	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
1878 		__skb_pull(tmp_skb, skb_network_header_len(skb));
1879 		*tail_skb = tmp_skb;
1880 		tail_skb = &(tmp_skb->next);
1881 		skb->len += tmp_skb->len;
1882 		skb->data_len += tmp_skb->len;
1883 		skb->truesize += tmp_skb->truesize;
1884 		tmp_skb->destructor = NULL;
1885 		tmp_skb->sk = NULL;
1886 	}
1887 
1888 	/* Allow local fragmentation. */
1889 	skb->ignore_df = ip6_sk_ignore_df(sk);
1890 
1891 	*final_dst = fl6->daddr;
1892 	__skb_pull(skb, skb_network_header_len(skb));
1893 	if (opt && opt->opt_flen)
1894 		ipv6_push_frag_opts(skb, opt, &proto);
1895 	if (opt && opt->opt_nflen)
1896 		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);
1897 
1898 	skb_push(skb, sizeof(struct ipv6hdr));
1899 	skb_reset_network_header(skb);
1900 	hdr = ipv6_hdr(skb);
1901 
1902 	ip6_flow_hdr(hdr, v6_cork->tclass,
1903 		     ip6_make_flowlabel(net, skb, fl6->flowlabel,
1904 					ip6_autoflowlabel(net, np), fl6));
1905 	hdr->hop_limit = v6_cork->hop_limit;
1906 	hdr->nexthdr = proto;
1907 	hdr->saddr = fl6->saddr;
1908 	hdr->daddr = *final_dst;
1909 
1910 	skb->priority = sk->sk_priority;
1911 	skb->mark = cork->base.mark;
1912 
1913 	skb->tstamp = cork->base.transmit_time;
1914 
1915 	skb_dst_set(skb, dst_clone(&rt->dst));
1916 	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1917 	if (proto == IPPROTO_ICMPV6) {
1918 		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1919 
1920 		ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
1921 		ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
1922 	}
1923 
1924 	ip6_cork_release(cork, v6_cork);
1925 out:
1926 	return skb;
1927 }
1928 
1929 int ip6_send_skb(struct sk_buff *skb)
1930 {
1931 	struct net *net = sock_net(skb->sk);
1932 	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
1933 	int err;
1934 
1935 	err = ip6_local_out(net, skb->sk, skb);
1936 	if (err) {
1937 		if (err > 0)
1938 			err = net_xmit_errno(err);
1939 		if (err)
1940 			IP6_INC_STATS(net, rt->rt6i_idev,
1941 				      IPSTATS_MIB_OUTDISCARDS);
1942 	}
1943 
1944 	return err;
1945 }
1946 
1947 int ip6_push_pending_frames(struct sock *sk)
1948 {
1949 	struct sk_buff *skb;
1950 
1951 	skb = ip6_finish_skb(sk);
1952 	if (!skb)
1953 		return 0;
1954 
1955 	return ip6_send_skb(skb);
1956 }
1957 EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
1958 
1959 static void __ip6_flush_pending_frames(struct sock *sk,
1960 				       struct sk_buff_head *queue,
1961 				       struct inet_cork_full *cork,
1962 				       struct inet6_cork *v6_cork)
1963 {
1964 	struct sk_buff *skb;
1965 
1966 	while ((skb = __skb_dequeue_tail(queue)) != NULL) {
1967 		if (skb_dst(skb))
1968 			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1969 				      IPSTATS_MIB_OUTDISCARDS);
1970 		kfree_skb(skb);
1971 	}
1972 
1973 	ip6_cork_release(cork, v6_cork);
1974 }
1975 
1976 void ip6_flush_pending_frames(struct sock *sk)
1977 {
1978 	__ip6_flush_pending_frames(sk, &sk->sk_write_queue,
1979 				   &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
1980 }
1981 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
1982 
1983 struct sk_buff *ip6_make_skb(struct sock *sk,
1984 			     int getfrag(void *from, char *to, int offset,
1985 					 int len, int odd, struct sk_buff *skb),
1986 			     void *from, int length, int transhdrlen,
1987 			     struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1988 			     struct rt6_info *rt, unsigned int flags,
1989 			     struct inet_cork_full *cork)
1990 {
1991 	struct inet6_cork v6_cork;
1992 	struct sk_buff_head queue;
1993 	int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1994 	int err;
1995 
1996 	if (flags & MSG_PROBE)
1997 		return NULL;
1998 
1999 	__skb_queue_head_init(&queue);
2000 
2001 	cork->base.flags = 0;
2002 	cork->base.addr = 0;
2003 	cork->base.opt = NULL;
2004 	cork->base.dst = NULL;
2005 	v6_cork.opt = NULL;
2006 	err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt, fl6);
2007 	if (err) {
2008 		ip6_cork_release(cork, &v6_cork);
2009 		return ERR_PTR(err);
2010 	}
2011 	if (ipc6->dontfrag < 0)
2012 		ipc6->dontfrag = inet6_sk(sk)->dontfrag;
2013 
2014 	err = __ip6_append_data(sk, fl6, &queue, &cork->base, &v6_cork,
2015 				&current->task_frag, getfrag, from,
2016 				length + exthdrlen, transhdrlen + exthdrlen,
2017 				flags, ipc6);
2018 	if (err) {
2019 		__ip6_flush_pending_frames(sk, &queue, cork, &v6_cork);
2020 		return ERR_PTR(err);
2021 	}
2022 
2023 	return __ip6_make_skb(sk, &queue, cork, &v6_cork);
2024 }
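/* Editor's sketch (not in the original file): ip6_make_skb() is the
 * cork-less counterpart of ip6_append_data(), used e.g. by UDP when the
 * whole datagram is available at once:
 *
 *	skb = ip6_make_skb(sk, getfrag, msg, len, transhdrlen,
 *			   &ipc6, &fl6, rt, msg->msg_flags, &cork);
 *	if (IS_ERR_OR_NULL(skb))
 *		return PTR_ERR(skb);
 *	err = ip6_send_skb(skb);
 *
 * (NULL is returned for MSG_PROBE, for which PTR_ERR() yields 0; real
 * callers may handle that case separately.)
 */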
2025