xref: /openbmc/linux/net/ipv6/ip6_output.c (revision af8e6bbf66dad3eb92908f8076bf6a5bd60075f8)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  *	IPv6 output functions
4  *	Linux INET6 implementation
5  *
6  *	Authors:
7  *	Pedro Roque		<roque@di.fc.ul.pt>
8  *
9  *	Based on linux/net/ipv4/ip_output.c
10  *
11  *	Changes:
12  *	A.N.Kuznetsov	:	arithmetic in fragmentation.
13  *				extension headers are implemented.
14  *				route changes now work.
15  *				ip6_forward does not confuse sniffers.
16  *				etc.
17  *
18  *      H. von Brand    :       Added missing #include <linux/string.h>
19  *	Imran Patel	:	frag id should be in NBO
20  *      Kazunori MIYAZAWA @USAGI
21  *			:       add ip6_append_data and related functions
22  *				for datagram xmit
23  */
24 
25 #include <linux/errno.h>
26 #include <linux/kernel.h>
27 #include <linux/string.h>
28 #include <linux/socket.h>
29 #include <linux/net.h>
30 #include <linux/netdevice.h>
31 #include <linux/if_arp.h>
32 #include <linux/in6.h>
33 #include <linux/tcp.h>
34 #include <linux/route.h>
35 #include <linux/module.h>
36 #include <linux/slab.h>
37 
38 #include <linux/bpf-cgroup.h>
39 #include <linux/netfilter.h>
40 #include <linux/netfilter_ipv6.h>
41 
42 #include <net/sock.h>
43 #include <net/snmp.h>
44 
45 #include <net/gso.h>
46 #include <net/ipv6.h>
47 #include <net/ndisc.h>
48 #include <net/protocol.h>
49 #include <net/ip6_route.h>
50 #include <net/addrconf.h>
51 #include <net/rawv6.h>
52 #include <net/icmp.h>
53 #include <net/xfrm.h>
54 #include <net/checksum.h>
55 #include <linux/mroute6.h>
56 #include <net/l3mdev.h>
57 #include <net/lwtunnel.h>
58 #include <net/ip_tunnels.h>
59 
60 static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
61 {
62 	struct dst_entry *dst = skb_dst(skb);
63 	struct net_device *dev = dst->dev;
64 	struct inet6_dev *idev = ip6_dst_idev(dst);
65 	unsigned int hh_len = LL_RESERVED_SPACE(dev);
66 	const struct in6_addr *daddr, *nexthop;
67 	struct ipv6hdr *hdr;
68 	struct neighbour *neigh;
69 	int ret;
70 
71 	/* Be paranoid, rather than too clever. */
72 	if (unlikely(hh_len > skb_headroom(skb)) && dev->header_ops) {
73 		skb = skb_expand_head(skb, hh_len);
74 		if (!skb) {
75 			IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
76 			return -ENOMEM;
77 		}
78 	}
79 
80 	hdr = ipv6_hdr(skb);
81 	daddr = &hdr->daddr;
82 	if (ipv6_addr_is_multicast(daddr)) {
83 		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
84 		    ((mroute6_is_socket(net, skb) &&
85 		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
86 		     ipv6_chk_mcast_addr(dev, daddr, &hdr->saddr))) {
87 			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
88 
89 			/* Do not check for IFF_ALLMULTI; multicast routing
90 			   is not supported in any case.
91 			 */
92 			if (newskb)
93 				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
94 					net, sk, newskb, NULL, newskb->dev,
95 					dev_loopback_xmit);
96 
97 			if (hdr->hop_limit == 0) {
98 				IP6_INC_STATS(net, idev,
99 					      IPSTATS_MIB_OUTDISCARDS);
100 				kfree_skb(skb);
101 				return 0;
102 			}
103 		}
104 
105 		IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);
106 		if (IPV6_ADDR_MC_SCOPE(daddr) <= IPV6_ADDR_SCOPE_NODELOCAL &&
107 		    !(dev->flags & IFF_LOOPBACK)) {
108 			kfree_skb(skb);
109 			return 0;
110 		}
111 	}
112 
113 	if (lwtunnel_xmit_redirect(dst->lwtstate)) {
114 		int res = lwtunnel_xmit(skb);
115 
116 		if (res != LWTUNNEL_XMIT_CONTINUE)
117 			return res;
118 	}
119 
120 	IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUT, skb->len);
121 
122 	rcu_read_lock();
123 	nexthop = rt6_nexthop((struct rt6_info *)dst, daddr);
124 	neigh = __ipv6_neigh_lookup_noref(dev, nexthop);
125 
126 	if (unlikely(IS_ERR_OR_NULL(neigh))) {
127 		if (unlikely(!neigh))
128 			neigh = __neigh_create(&nd_tbl, nexthop, dev, false);
129 		if (IS_ERR(neigh)) {
130 			rcu_read_unlock();
131 			IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTNOROUTES);
132 			kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_CREATEFAIL);
133 			return -EINVAL;
134 		}
135 	}
136 	sock_confirm_neigh(skb, neigh);
137 	ret = neigh_output(neigh, skb, false);
138 	rcu_read_unlock();
139 	return ret;
140 }
141 
142 static int
143 ip6_finish_output_gso_slowpath_drop(struct net *net, struct sock *sk,
144 				    struct sk_buff *skb, unsigned int mtu)
145 {
146 	struct sk_buff *segs, *nskb;
147 	netdev_features_t features;
148 	int ret = 0;
149 
150 	/* Please see corresponding comment in ip_finish_output_gso
151 	 * describing the cases where GSO segment length exceeds the
152 	 * egress MTU.
153 	 */
154 	features = netif_skb_features(skb);
155 	segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
156 	if (IS_ERR_OR_NULL(segs)) {
157 		kfree_skb(skb);
158 		return -ENOMEM;
159 	}
160 
161 	consume_skb(skb);
162 
163 	skb_list_walk_safe(segs, segs, nskb) {
164 		int err;
165 
166 		skb_mark_not_on_list(segs);
167 		/* Last GSO segment can be smaller than gso_size (and MTU).
168 		 * Adding a fragment header would produce an "atomic fragment",
169 		 * which is considered harmful (RFC-8021). Avoid that.
170 		 */
171 		err = segs->len > mtu ?
172 			ip6_fragment(net, sk, segs, ip6_finish_output2) :
173 			ip6_finish_output2(net, sk, segs);
174 		if (err && ret == 0)
175 			ret = err;
176 	}
177 
178 	return ret;
179 }
180 
181 static int __ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
182 {
183 	unsigned int mtu;
184 
185 #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
186 	/* Policy lookup after SNAT yielded a new policy */
187 	if (skb_dst(skb)->xfrm) {
188 		IP6CB(skb)->flags |= IP6SKB_REROUTED;
189 		return dst_output(net, sk, skb);
190 	}
191 #endif
192 
193 	mtu = ip6_skb_dst_mtu(skb);
194 	if (skb_is_gso(skb) &&
195 	    !(IP6CB(skb)->flags & IP6SKB_FAKEJUMBO) &&
196 	    !skb_gso_validate_network_len(skb, mtu))
197 		return ip6_finish_output_gso_slowpath_drop(net, sk, skb, mtu);
198 
199 	if ((skb->len > mtu && !skb_is_gso(skb)) ||
200 	    dst_allfrag(skb_dst(skb)) ||
201 	    (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
202 		return ip6_fragment(net, sk, skb, ip6_finish_output2);
203 	else
204 		return ip6_finish_output2(net, sk, skb);
205 }
206 
207 static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
208 {
209 	int ret;
210 
211 	ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
212 	switch (ret) {
213 	case NET_XMIT_SUCCESS:
214 	case NET_XMIT_CN:
215 		return __ip6_finish_output(net, sk, skb) ? : ret;
216 	default:
217 		kfree_skb_reason(skb, SKB_DROP_REASON_BPF_CGROUP_EGRESS);
218 		return ret;
219 	}
220 }
221 
222 int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
223 {
224 	struct net_device *dev = skb_dst(skb)->dev, *indev = skb->dev;
225 	struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
226 
227 	skb->protocol = htons(ETH_P_IPV6);
228 	skb->dev = dev;
229 
230 	if (unlikely(idev->cnf.disable_ipv6)) {
231 		IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
232 		kfree_skb_reason(skb, SKB_DROP_REASON_IPV6DISABLED);
233 		return 0;
234 	}
235 
236 	return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
237 			    net, sk, skb, indev, dev,
238 			    ip6_finish_output,
239 			    !(IP6CB(skb)->flags & IP6SKB_REROUTED));
240 }
241 EXPORT_SYMBOL(ip6_output);
242 
243 bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np)
244 {
245 	if (!np->autoflowlabel_set)
246 		return ip6_default_np_autolabel(net);
247 	else
248 		return np->autoflowlabel;
249 }
250 
251 /*
252  * xmit an sk_buff (used by TCP, SCTP and DCCP)
253  * Note : socket lock is not held for SYNACK packets, but might be modified
254  * by calls to skb_set_owner_w() and ipv6_local_error(),
255  * which are using proper atomic operations or spinlocks.
256  */
257 int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
258 	     __u32 mark, struct ipv6_txoptions *opt, int tclass, u32 priority)
259 {
260 	struct net *net = sock_net(sk);
261 	const struct ipv6_pinfo *np = inet6_sk(sk);
262 	struct in6_addr *first_hop = &fl6->daddr;
263 	struct dst_entry *dst = skb_dst(skb);
264 	struct net_device *dev = dst->dev;
265 	struct inet6_dev *idev = ip6_dst_idev(dst);
266 	struct hop_jumbo_hdr *hop_jumbo;
267 	int hoplen = sizeof(*hop_jumbo);
268 	unsigned int head_room;
269 	struct ipv6hdr *hdr;
270 	u8  proto = fl6->flowi6_proto;
271 	int seg_len = skb->len;
272 	int hlimit = -1;
273 	u32 mtu;
274 
275 	head_room = sizeof(struct ipv6hdr) + hoplen + LL_RESERVED_SPACE(dev);
276 	if (opt)
277 		head_room += opt->opt_nflen + opt->opt_flen;
278 
279 	if (unlikely(head_room > skb_headroom(skb))) {
280 		skb = skb_expand_head(skb, head_room);
281 		if (!skb) {
282 			IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
283 			return -ENOBUFS;
284 		}
285 	}
286 
287 	if (opt) {
288 		seg_len += opt->opt_nflen + opt->opt_flen;
289 
290 		if (opt->opt_flen)
291 			ipv6_push_frag_opts(skb, opt, &proto);
292 
293 		if (opt->opt_nflen)
294 			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
295 					     &fl6->saddr);
296 	}
297 
298 	if (unlikely(seg_len > IPV6_MAXPLEN)) {
299 		hop_jumbo = skb_push(skb, hoplen);
300 
301 		hop_jumbo->nexthdr = proto;
302 		hop_jumbo->hdrlen = 0;
303 		hop_jumbo->tlv_type = IPV6_TLV_JUMBO;
304 		hop_jumbo->tlv_len = 4;
305 		hop_jumbo->jumbo_payload_len = htonl(seg_len + hoplen);
306 
307 		proto = IPPROTO_HOPOPTS;
308 		seg_len = 0;
309 		IP6CB(skb)->flags |= IP6SKB_FAKEJUMBO;
310 	}
311 
312 	skb_push(skb, sizeof(struct ipv6hdr));
313 	skb_reset_network_header(skb);
314 	hdr = ipv6_hdr(skb);
315 
316 	/*
317 	 *	Fill in the IPv6 header
318 	 */
319 	if (np)
320 		hlimit = np->hop_limit;
321 	if (hlimit < 0)
322 		hlimit = ip6_dst_hoplimit(dst);
323 
324 	ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
325 				ip6_autoflowlabel(net, np), fl6));
326 
327 	hdr->payload_len = htons(seg_len);
328 	hdr->nexthdr = proto;
329 	hdr->hop_limit = hlimit;
330 
331 	hdr->saddr = fl6->saddr;
332 	hdr->daddr = *first_hop;
333 
334 	skb->protocol = htons(ETH_P_IPV6);
335 	skb->priority = priority;
336 	skb->mark = mark;
337 
338 	mtu = dst_mtu(dst);
339 	if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
340 		IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTREQUESTS);
341 
342 		/* if egress device is enslaved to an L3 master device pass the
343 		 * skb to its handler for processing
344 		 */
345 		skb = l3mdev_ip6_out((struct sock *)sk, skb);
346 		if (unlikely(!skb))
347 			return 0;
348 
349 		/* hooks should never assume socket lock is held.
350 		 * we promote our socket to non const
351 		 */
352 		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
353 			       net, (struct sock *)sk, skb, NULL, dev,
354 			       dst_output);
355 	}
356 
357 	skb->dev = dev;
358 	/* ipv6_local_error() does not require socket lock,
359 	 * we promote our socket to non const
360 	 */
361 	ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);
362 
363 	IP6_INC_STATS(net, idev, IPSTATS_MIB_FRAGFAILS);
364 	kfree_skb(skb);
365 	return -EMSGSIZE;
366 }
367 EXPORT_SYMBOL(ip6_xmit);
368 
369 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
370 {
371 	struct ip6_ra_chain *ra;
372 	struct sock *last = NULL;
373 
374 	read_lock(&ip6_ra_lock);
375 	for (ra = ip6_ra_chain; ra; ra = ra->next) {
376 		struct sock *sk = ra->sk;
377 		if (sk && ra->sel == sel &&
378 		    (!sk->sk_bound_dev_if ||
379 		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
380 			struct ipv6_pinfo *np = inet6_sk(sk);
381 
382 			if (np && np->rtalert_isolate &&
383 			    !net_eq(sock_net(sk), dev_net(skb->dev))) {
384 				continue;
385 			}
386 			if (last) {
387 				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
388 				if (skb2)
389 					rawv6_rcv(last, skb2);
390 			}
391 			last = sk;
392 		}
393 	}
394 
395 	if (last) {
396 		rawv6_rcv(last, skb);
397 		read_unlock(&ip6_ra_lock);
398 		return 1;
399 	}
400 	read_unlock(&ip6_ra_lock);
401 	return 0;
402 }
403 
404 static int ip6_forward_proxy_check(struct sk_buff *skb)
405 {
406 	struct ipv6hdr *hdr = ipv6_hdr(skb);
407 	u8 nexthdr = hdr->nexthdr;
408 	__be16 frag_off;
409 	int offset;
410 
411 	if (ipv6_ext_hdr(nexthdr)) {
412 		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
413 		if (offset < 0)
414 			return 0;
415 	} else
416 		offset = sizeof(struct ipv6hdr);
417 
418 	if (nexthdr == IPPROTO_ICMPV6) {
419 		struct icmp6hdr *icmp6;
420 
421 		if (!pskb_may_pull(skb, (skb_network_header(skb) +
422 					 offset + 1 - skb->data)))
423 			return 0;
424 
425 		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
426 
427 		switch (icmp6->icmp6_type) {
428 		case NDISC_ROUTER_SOLICITATION:
429 		case NDISC_ROUTER_ADVERTISEMENT:
430 		case NDISC_NEIGHBOUR_SOLICITATION:
431 		case NDISC_NEIGHBOUR_ADVERTISEMENT:
432 		case NDISC_REDIRECT:
433 			/* For reaction involving unicast neighbor discovery
434 			 * message destined to the proxied address, pass it to
435 			 * input function.
436 			 */
437 			return 1;
438 		default:
439 			break;
440 		}
441 	}
442 
443 	/*
444 	 * The proxying router can't forward traffic sent to a link-local
445 	 * address, so signal the sender and discard the packet. This
446 	 * behavior is clarified by the MIPv6 specification.
447 	 */
448 	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
449 		dst_link_failure(skb);
450 		return -1;
451 	}
452 
453 	return 0;
454 }
455 
456 static inline int ip6_forward_finish(struct net *net, struct sock *sk,
457 				     struct sk_buff *skb)
458 {
459 	struct dst_entry *dst = skb_dst(skb);
460 
461 	__IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
462 
463 #ifdef CONFIG_NET_SWITCHDEV
464 	if (skb->offload_l3_fwd_mark) {
465 		consume_skb(skb);
466 		return 0;
467 	}
468 #endif
469 
470 	skb_clear_tstamp(skb);
471 	return dst_output(net, sk, skb);
472 }
473 
474 static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
475 {
476 	if (skb->len <= mtu)
477 		return false;
478 
479 	/* ipv6 conntrack defrag sets max_frag_size + ignore_df */
480 	if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
481 		return true;
482 
483 	if (skb->ignore_df)
484 		return false;
485 
486 	if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
487 		return false;
488 
489 	return true;
490 }
491 
/*
 * Forward an IPv6 packet that is not addressed to this host.
 *
 * Visible return values: 0 when a Router Alert listener consumed the
 * packet, -ETIMEDOUT when the hop limit is exhausted, the ip6_input()
 * result for proxied neighbour-discovery packets, -EMSGSIZE when the
 * packet exceeds the forwarding MTU, -EINVAL on every path through the
 * error/drop labels, and otherwise the result of the FORWARD netfilter
 * hook (which continues in ip6_forward_finish()).
 */
492 int ip6_forward(struct sk_buff *skb)
493 {
494 	struct dst_entry *dst = skb_dst(skb);
495 	struct ipv6hdr *hdr = ipv6_hdr(skb);
496 	struct inet6_skb_parm *opt = IP6CB(skb);
497 	struct net *net = dev_net(dst->dev);
498 	struct inet6_dev *idev;
499 	SKB_DR(reason);
500 	u32 mtu;
501 
	/* idev is the inet6_dev of the ingress interface; it is tested for
	 * NULL further down, so it may be absent.
	 */
502 	idev = __in6_dev_get_safely(dev_get_by_index_rcu(net, IP6CB(skb)->iif));
503 	if (net->ipv6.devconf_all->forwarding == 0)
504 		goto error;
505 
506 	if (skb->pkt_type != PACKET_HOST)
507 		goto drop;
508 
509 	if (unlikely(skb->sk))
510 		goto drop;
511 
512 	if (skb_warn_if_lro(skb))
513 		goto drop;
514 
515 	if (!net->ipv6.devconf_all->disable_policy &&
516 	    (!idev || !idev->cnf.disable_policy) &&
517 	    !xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
518 		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
519 		goto drop;
520 	}
521 
522 	skb_forward_csum(skb);
523 
524 	/*
525 	 *	We DO NOT make any processing on
526 	 *	RA packets, pushing them to user level AS IS
527 	 *	without any WARRANTY that application will be able
528 	 *	to interpret them. The reason is that we
529 	 *	cannot make anything clever here.
530 	 *
531 	 *	We are not end-node, so that if packet contains
532 	 *	AH/ESP, we cannot make anything.
533 	 *	Defragmentation also would be mistake, RA packets
534 	 *	cannot be fragmented, because there is no warranty
535 	 *	that different fragments will go along one path. --ANK
536 	 */
537 	if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
538 		if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
539 			return 0;
540 	}
541 
542 	/*
543 	 *	check and decrement ttl
544 	 */
545 	if (hdr->hop_limit <= 1) {
546 		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
547 		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
548 
549 		kfree_skb_reason(skb, SKB_DROP_REASON_IP_INHDR);
550 		return -ETIMEDOUT;
551 	}
552 
553 	/* XXX: idev->cnf.proxy_ndp? */
554 	if (net->ipv6.devconf_all->proxy_ndp &&
555 	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
556 		int proxied = ip6_forward_proxy_check(skb);
557 		if (proxied > 0) {
558 			/* It's tempting to decrease the hop limit
559 			 * here by 1, as we do at the end of the
560 			 * function too.
561 			 *
562 			 * But that would be incorrect, as proxying is
563 			 * not forwarding.  The ip6_input function
564 			 * will handle this packet locally, and it
565 			 * depends on the hop limit being unchanged.
566 			 *
567 			 * One example is the NDP hop limit, that
568 			 * always has to stay 255, but other would be
569 			 * similar checks around RA packets, where the
570 			 * user can even change the desired limit.
571 			 */
572 			return ip6_input(skb);
573 		} else if (proxied < 0) {
574 			__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
575 			goto drop;
576 		}
		/* NOTE(review): ip6_forward_proxy_check() calls
		 * pskb_may_pull(), yet hdr is not re-read until after
		 * skb_cow() below - confirm it cannot go stale in between.
		 */
577 	}
578 
579 	if (!xfrm6_route_forward(skb)) {
580 		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
581 		SKB_DR_SET(reason, XFRM_POLICY);
582 		goto drop;
583 	}
	/* Re-read the dst: presumably xfrm6_route_forward() may have
	 * replaced it - TODO confirm.
	 */
584 	dst = skb_dst(skb);
585 
586 	/* IPv6 specs say nothing about it, but it is clear that we cannot
587 	   send redirects to source routed frames.
588 	   We don't send redirects to frames decapsulated from IPsec.
589 	 */
590 	if (IP6CB(skb)->iif == dst->dev->ifindex &&
591 	    opt->srcrt == 0 && !skb_sec_path(skb)) {
592 		struct in6_addr *target = NULL;
593 		struct inet_peer *peer;
594 		struct rt6_info *rt;
595 
596 		/*
597 		 *	incoming and outgoing devices are the same
598 		 *	send a redirect.
599 		 */
600 
601 		rt = (struct rt6_info *) dst;
602 		if (rt->rt6i_flags & RTF_GATEWAY)
603 			target = &rt->rt6i_gateway;
604 		else
605 			target = &hdr->daddr;
606 
607 		peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);
608 
609 		/* Limit redirects both by destination (here)
610 		   and by source (inside ndisc_send_redirect)
611 		 */
612 		if (inet_peer_xrlim_allow(peer, 1*HZ))
613 			ndisc_send_redirect(skb, target);
614 		if (peer)
615 			inet_putpeer(peer);
616 	} else {
617 		int addrtype = ipv6_addr_type(&hdr->saddr);
618 
619 		/* This check is security critical. */
620 		if (addrtype == IPV6_ADDR_ANY ||
621 		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
622 			goto error;
623 		if (addrtype & IPV6_ADDR_LINKLOCAL) {
624 			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
625 				    ICMPV6_NOT_NEIGHBOUR, 0);
626 			goto error;
627 		}
628 	}
629 
	/* Never use a forwarding MTU below the IPv6 minimum. */
630 	mtu = ip6_dst_mtu_maybe_forward(dst, true);
631 	if (mtu < IPV6_MIN_MTU)
632 		mtu = IPV6_MIN_MTU;
633 
634 	if (ip6_pkt_too_big(skb, mtu)) {
635 		/* Again, force OUTPUT device used as source address */
636 		skb->dev = dst->dev;
637 		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
638 		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INTOOBIGERRORS);
639 		__IP6_INC_STATS(net, ip6_dst_idev(dst),
640 				IPSTATS_MIB_FRAGFAILS);
641 		kfree_skb_reason(skb, SKB_DROP_REASON_PKT_TOO_BIG);
642 		return -EMSGSIZE;
643 	}
644 
645 	if (skb_cow(skb, dst->dev->hard_header_len)) {
646 		__IP6_INC_STATS(net, ip6_dst_idev(dst),
647 				IPSTATS_MIB_OUTDISCARDS);
648 		goto drop;
649 	}
650 
	/* Reload the header pointer after skb_cow() before writing to it. */
651 	hdr = ipv6_hdr(skb);
652 
653 	/* Mangling hops number delayed to point after skb COW */
654 
655 	hdr->hop_limit--;
656 
657 	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
658 		       net, NULL, skb, skb->dev, dst->dev,
659 		       ip6_forward_finish);
660 
	/* error: count an address error and set the drop reason, then fall
	 * through to drop, which frees the skb and returns -EINVAL.
	 */
661 error:
662 	__IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
663 	SKB_DR_SET(reason, IP_INADDRERRORS);
664 drop:
665 	kfree_skb_reason(skb, reason);
666 	return -EINVAL;
667 }
668 
669 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
670 {
671 	to->pkt_type = from->pkt_type;
672 	to->priority = from->priority;
673 	to->protocol = from->protocol;
674 	skb_dst_drop(to);
675 	skb_dst_set(to, dst_clone(skb_dst(from)));
676 	to->dev = from->dev;
677 	to->mark = from->mark;
678 
679 	skb_copy_hash(to, from);
680 
681 #ifdef CONFIG_NET_SCHED
682 	to->tc_index = from->tc_index;
683 #endif
684 	nf_copy(to, from);
685 	skb_ext_copy(to, from);
686 	skb_copy_secmark(to, from);
687 }
688 
689 int ip6_fraglist_init(struct sk_buff *skb, unsigned int hlen, u8 *prevhdr,
690 		      u8 nexthdr, __be32 frag_id,
691 		      struct ip6_fraglist_iter *iter)
692 {
693 	unsigned int first_len;
694 	struct frag_hdr *fh;
695 
696 	/* BUILD HEADER */
697 	*prevhdr = NEXTHDR_FRAGMENT;
698 	iter->tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
699 	if (!iter->tmp_hdr)
700 		return -ENOMEM;
701 
702 	iter->frag = skb_shinfo(skb)->frag_list;
703 	skb_frag_list_init(skb);
704 
705 	iter->offset = 0;
706 	iter->hlen = hlen;
707 	iter->frag_id = frag_id;
708 	iter->nexthdr = nexthdr;
709 
710 	__skb_pull(skb, hlen);
711 	fh = __skb_push(skb, sizeof(struct frag_hdr));
712 	__skb_push(skb, hlen);
713 	skb_reset_network_header(skb);
714 	memcpy(skb_network_header(skb), iter->tmp_hdr, hlen);
715 
716 	fh->nexthdr = nexthdr;
717 	fh->reserved = 0;
718 	fh->frag_off = htons(IP6_MF);
719 	fh->identification = frag_id;
720 
721 	first_len = skb_pagelen(skb);
722 	skb->data_len = first_len - skb_headlen(skb);
723 	skb->len = first_len;
724 	ipv6_hdr(skb)->payload_len = htons(first_len - sizeof(struct ipv6hdr));
725 
726 	return 0;
727 }
728 EXPORT_SYMBOL(ip6_fraglist_init);
729 
730 void ip6_fraglist_prepare(struct sk_buff *skb,
731 			  struct ip6_fraglist_iter *iter)
732 {
733 	struct sk_buff *frag = iter->frag;
734 	unsigned int hlen = iter->hlen;
735 	struct frag_hdr *fh;
736 
737 	frag->ip_summed = CHECKSUM_NONE;
738 	skb_reset_transport_header(frag);
739 	fh = __skb_push(frag, sizeof(struct frag_hdr));
740 	__skb_push(frag, hlen);
741 	skb_reset_network_header(frag);
742 	memcpy(skb_network_header(frag), iter->tmp_hdr, hlen);
743 	iter->offset += skb->len - hlen - sizeof(struct frag_hdr);
744 	fh->nexthdr = iter->nexthdr;
745 	fh->reserved = 0;
746 	fh->frag_off = htons(iter->offset);
747 	if (frag->next)
748 		fh->frag_off |= htons(IP6_MF);
749 	fh->identification = iter->frag_id;
750 	ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
751 	ip6_copy_metadata(frag, skb);
752 }
753 EXPORT_SYMBOL(ip6_fraglist_prepare);
754 
755 void ip6_frag_init(struct sk_buff *skb, unsigned int hlen, unsigned int mtu,
756 		   unsigned short needed_tailroom, int hdr_room, u8 *prevhdr,
757 		   u8 nexthdr, __be32 frag_id, struct ip6_frag_state *state)
758 {
759 	state->prevhdr = prevhdr;
760 	state->nexthdr = nexthdr;
761 	state->frag_id = frag_id;
762 
763 	state->hlen = hlen;
764 	state->mtu = mtu;
765 
766 	state->left = skb->len - hlen;	/* Space per frame */
767 	state->ptr = hlen;		/* Where to start from */
768 
769 	state->hroom = hdr_room;
770 	state->troom = needed_tailroom;
771 
772 	state->offset = 0;
773 }
774 EXPORT_SYMBOL(ip6_frag_init);
775 
776 struct sk_buff *ip6_frag_next(struct sk_buff *skb, struct ip6_frag_state *state)
777 {
778 	u8 *prevhdr = state->prevhdr, *fragnexthdr_offset;
779 	struct sk_buff *frag;
780 	struct frag_hdr *fh;
781 	unsigned int len;
782 
783 	len = state->left;
784 	/* IF: it doesn't fit, use 'mtu' - the data space left */
785 	if (len > state->mtu)
786 		len = state->mtu;
787 	/* IF: we are not sending up to and including the packet end
788 	   then align the next start on an eight byte boundary */
789 	if (len < state->left)
790 		len &= ~7;
791 
792 	/* Allocate buffer */
793 	frag = alloc_skb(len + state->hlen + sizeof(struct frag_hdr) +
794 			 state->hroom + state->troom, GFP_ATOMIC);
795 	if (!frag)
796 		return ERR_PTR(-ENOMEM);
797 
798 	/*
799 	 *	Set up data on packet
800 	 */
801 
802 	ip6_copy_metadata(frag, skb);
803 	skb_reserve(frag, state->hroom);
804 	skb_put(frag, len + state->hlen + sizeof(struct frag_hdr));
805 	skb_reset_network_header(frag);
806 	fh = (struct frag_hdr *)(skb_network_header(frag) + state->hlen);
807 	frag->transport_header = (frag->network_header + state->hlen +
808 				  sizeof(struct frag_hdr));
809 
810 	/*
811 	 *	Charge the memory for the fragment to any owner
812 	 *	it might possess
813 	 */
814 	if (skb->sk)
815 		skb_set_owner_w(frag, skb->sk);
816 
817 	/*
818 	 *	Copy the packet header into the new buffer.
819 	 */
820 	skb_copy_from_linear_data(skb, skb_network_header(frag), state->hlen);
821 
822 	fragnexthdr_offset = skb_network_header(frag);
823 	fragnexthdr_offset += prevhdr - skb_network_header(skb);
824 	*fragnexthdr_offset = NEXTHDR_FRAGMENT;
825 
826 	/*
827 	 *	Build fragment header.
828 	 */
829 	fh->nexthdr = state->nexthdr;
830 	fh->reserved = 0;
831 	fh->identification = state->frag_id;
832 
833 	/*
834 	 *	Copy a block of the IP datagram.
835 	 */
836 	BUG_ON(skb_copy_bits(skb, state->ptr, skb_transport_header(frag),
837 			     len));
838 	state->left -= len;
839 
840 	fh->frag_off = htons(state->offset);
841 	if (state->left > 0)
842 		fh->frag_off |= htons(IP6_MF);
843 	ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
844 
845 	state->ptr += len;
846 	state->offset += len;
847 
848 	return frag;
849 }
850 EXPORT_SYMBOL(ip6_frag_next);
851 
/*
 * Fragment @skb and pass every fragment to @output.
 *
 * Two strategies are used. The fast path reuses an existing frag_list
 * when each element already has suitable length, alignment and headroom,
 * so only headers need to be prepended. Otherwise the slow path allocates
 * a new skb per fragment and copies the data into it.
 *
 * Returns 0 when all fragments were sent, -EMSGSIZE (after sending an
 * ICMPv6 Packet Too Big) when fragmenting is not permitted or not
 * possible, or the error from a helper or from @output. The fail labels
 * free @skb.
 */
852 int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
853 		 int (*output)(struct net *, struct sock *, struct sk_buff *))
854 {
855 	struct sk_buff *frag;
856 	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
857 	struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
858 				inet6_sk(skb->sk) : NULL;
859 	bool mono_delivery_time = skb->mono_delivery_time;
860 	struct ip6_frag_state state;
861 	unsigned int mtu, hlen, nexthdr_offset;
862 	ktime_t tstamp = skb->tstamp;
863 	int hroom, err = 0;
864 	__be32 frag_id;
865 	u8 *prevhdr, nexthdr = 0;
866 
	/* On success err is the length of the headers that precede the
	 * fragment header; prevhdr points at the "next header" byte that
	 * will be rewritten.
	 */
867 	err = ip6_find_1stfragopt(skb, &prevhdr);
868 	if (err < 0)
869 		goto fail;
870 	hlen = err;
871 	nexthdr = *prevhdr;
	/* Keep prevhdr as an offset so it can be recomputed further down. */
872 	nexthdr_offset = prevhdr - skb_network_header(skb);
873 
874 	mtu = ip6_skb_dst_mtu(skb);
875 
876 	/* We must not fragment if the socket is set to force MTU discovery
877 	 * or if the skb is not generated by a local socket.
878 	 */
879 	if (unlikely(!skb->ignore_df && skb->len > mtu))
880 		goto fail_toobig;
881 
882 	if (IP6CB(skb)->frag_max_size) {
883 		if (IP6CB(skb)->frag_max_size > mtu)
884 			goto fail_toobig;
885 
886 		/* don't send fragments larger than what we received */
887 		mtu = IP6CB(skb)->frag_max_size;
888 		if (mtu < IPV6_MIN_MTU)
889 			mtu = IPV6_MIN_MTU;
890 	}
891 
	/* A non-zero socket frag_size below the MTU lowers the limit. */
892 	if (np && np->frag_size < mtu) {
893 		if (np->frag_size)
894 			mtu = np->frag_size;
895 	}
	/* Each fragment needs room for the headers, the fragment header
	 * and at least 8 bytes of data.
	 */
896 	if (mtu < hlen + sizeof(struct frag_hdr) + 8)
897 		goto fail_toobig;
	/* From here on mtu is the data space available per fragment. */
898 	mtu -= hlen + sizeof(struct frag_hdr);
899 
900 	frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
901 				    &ipv6_hdr(skb)->saddr);
902 
903 	if (skb->ip_summed == CHECKSUM_PARTIAL &&
904 	    (err = skb_checksum_help(skb)))
905 		goto fail;
906 
	/* Recompute prevhdr from the saved offset.
	 * NOTE(review): presumably because skb_checksum_help() may have
	 * reallocated the header - confirm.
	 */
907 	prevhdr = skb_network_header(skb) + nexthdr_offset;
908 	hroom = LL_RESERVED_SPACE(rt->dst.dev);
909 	if (skb_has_frag_list(skb)) {
910 		unsigned int first_len = skb_pagelen(skb);
911 		struct ip6_fraglist_iter iter;
912 		struct sk_buff *frag2;
913 
		/* The head must itself form a valid first fragment: short
		 * enough, a multiple of 8 bytes, not cloned, and with
		 * headroom for the fragment header.
		 */
914 		if (first_len - hlen > mtu ||
915 		    ((first_len - hlen) & 7) ||
916 		    skb_cloned(skb) ||
917 		    skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
918 			goto slow_path;
919 
920 		skb_walk_frags(skb, frag) {
921 			/* Correct geometry. */
922 			if (frag->len > mtu ||
923 			    ((frag->len & 7) && frag->next) ||
924 			    skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
925 				goto slow_path_clean;
926 
927 			/* Partially cloned skb? */
928 			if (skb_shared(frag))
929 				goto slow_path_clean;
930 
931 			BUG_ON(frag->sk);
			/* Give each fragment its own socket ownership and
			 * move its truesize out of the head skb.
			 */
932 			if (skb->sk) {
933 				frag->sk = skb->sk;
934 				frag->destructor = sock_wfree;
935 			}
936 			skb->truesize -= frag->truesize;
937 		}
938 
939 		err = ip6_fraglist_init(skb, hlen, prevhdr, nexthdr, frag_id,
940 					&iter);
941 		if (err < 0)
942 			goto fail;
943 
944 		/* We prevent @rt from being freed. */
945 		rcu_read_lock();
946 
947 		for (;;) {
948 			/* Prepare header of the next frame,
949 			 * before previous one went down. */
950 			if (iter.frag)
951 				ip6_fraglist_prepare(skb, &iter);
952 
953 			skb_set_delivery_time(skb, tstamp, mono_delivery_time);
954 			err = output(net, sk, skb);
955 			if (!err)
956 				IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
957 					      IPSTATS_MIB_FRAGCREATES);
958 
959 			if (err || !iter.frag)
960 				break;
961 
962 			skb = ip6_fraglist_next(&iter);
963 		}
964 
965 		kfree(iter.tmp_hdr);
966 
967 		if (err == 0) {
968 			IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
969 				      IPSTATS_MIB_FRAGOKS);
970 			rcu_read_unlock();
971 			return 0;
972 		}
973 
		/* Output failed: release the fragments not yet sent. */
974 		kfree_skb_list(iter.frag);
975 
976 		IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
977 			      IPSTATS_MIB_FRAGFAILS);
978 		rcu_read_unlock();
979 		return err;
980 
		/* Undo the ownership and truesize changes made to the
		 * fragments visited before the one that failed the checks
		 * (frag), then fall through to the copying path.
		 */
981 slow_path_clean:
982 		skb_walk_frags(skb, frag2) {
983 			if (frag2 == frag)
984 				break;
985 			frag2->sk = NULL;
986 			frag2->destructor = NULL;
987 			skb->truesize += frag2->truesize;
988 		}
989 	}
990 
991 slow_path:
992 	/*
993 	 *	Fragment the datagram.
994 	 */
995 
996 	ip6_frag_init(skb, hlen, mtu, rt->dst.dev->needed_tailroom,
997 		      LL_RESERVED_SPACE(rt->dst.dev), prevhdr, nexthdr, frag_id,
998 		      &state);
999 
1000 	/*
1001 	 *	Keep copying data until we run out.
1002 	 */
1003 
1004 	while (state.left > 0) {
1005 		frag = ip6_frag_next(skb, &state);
1006 		if (IS_ERR(frag)) {
1007 			err = PTR_ERR(frag);
1008 			goto fail;
1009 		}
1010 
1011 		/*
1012 		 *	Put this fragment into the sending queue.
1013 		 */
1014 		skb_set_delivery_time(frag, tstamp, mono_delivery_time);
1015 		err = output(net, sk, frag);
1016 		if (err)
1017 			goto fail;
1018 
1019 		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
1020 			      IPSTATS_MIB_FRAGCREATES);
1021 	}
1022 	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
1023 		      IPSTATS_MIB_FRAGOKS);
	/* All data has been copied into fragments; drop the original. */
1024 	consume_skb(skb);
1025 	return err;
1026 
1027 fail_toobig:
1028 	if (skb->sk && dst_allfrag(skb_dst(skb)))
1029 		sk_gso_disable(skb->sk);
1030 
1031 	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
1032 	err = -EMSGSIZE;
1033 
1034 fail:
1035 	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
1036 		      IPSTATS_MIB_FRAGFAILS);
1037 	kfree_skb(skb);
1038 	return err;
1039 }
1040 
1041 static inline int ip6_rt_check(const struct rt6key *rt_key,
1042 			       const struct in6_addr *fl_addr,
1043 			       const struct in6_addr *addr_cache)
1044 {
1045 	return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
1046 		(!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
1047 }
1048 
1049 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
1050 					  struct dst_entry *dst,
1051 					  const struct flowi6 *fl6)
1052 {
1053 	struct ipv6_pinfo *np = inet6_sk(sk);
1054 	struct rt6_info *rt;
1055 
1056 	if (!dst)
1057 		goto out;
1058 
1059 	if (dst->ops->family != AF_INET6) {
1060 		dst_release(dst);
1061 		return NULL;
1062 	}
1063 
1064 	rt = (struct rt6_info *)dst;
1065 	/* Yes, checking route validity in not connected
1066 	 * case is not very simple. Take into account,
1067 	 * that we do not support routing by source, TOS,
1068 	 * and MSG_DONTROUTE		--ANK (980726)
1069 	 *
1070 	 * 1. ip6_rt_check(): If route was host route,
1071 	 *    check that cached destination is current.
1072 	 *    If it is network route, we still may
1073 	 *    check its validity using saved pointer
1074 	 *    to the last used address: daddr_cache.
1075 	 *    We do not want to save whole address now,
1076 	 *    (because main consumer of this service
1077 	 *    is tcp, which has not this problem),
1078 	 *    so that the last trick works only on connected
1079 	 *    sockets.
1080 	 * 2. oif also should be the same.
1081 	 */
1082 	if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
1083 #ifdef CONFIG_IPV6_SUBTREES
1084 	    ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
1085 #endif
1086 	   (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) {
1087 		dst_release(dst);
1088 		dst = NULL;
1089 	}
1090 
1091 out:
1092 	return dst;
1093 }
1094 
/*
 * Common backend for the ip6_dst_lookup*() helpers in this file.
 *
 * Looks up a route for @fl6 and stores it in *@dst. When the flow has no
 * source address (fl6->saddr is the any-address), a source address is
 * selected via ip6_route_get_saddr() and written to fl6->saddr.
 *
 * The callers visible in this file pass *@dst == NULL. A non-NULL *@dst
 * is used as-is by the second lookup step, but note that the "no source
 * address" branch overwrites *@dst unconditionally.
 *
 * Returns 0 with *@dst holding the route, or an error code with the
 * route released and *@dst set to NULL. -ENETUNREACH additionally bumps
 * IPSTATS_MIB_OUTNOROUTES.
 */
static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
			       struct dst_entry **dst, struct flowi6 *fl6)
{
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	struct neighbour *n;
	struct rt6_info *rt;
#endif
	int err;
	int flags = 0;

	/* The correct way to handle this would be to do
	 * ip6_route_get_saddr, and then ip6_route_output; however,
	 * the route-specific preferred source forces the
	 * ip6_route_output call _before_ ip6_route_get_saddr.
	 *
	 * In source specific routing (no src=any default route),
	 * ip6_route_output will fail given src=any saddr, though, so
	 * that's why we try it again later.
	 */
	if (ipv6_addr_any(&fl6->saddr)) {
		struct fib6_info *from;
		struct rt6_info *rt;

		*dst = ip6_route_output(net, sk, fl6);
		/* only a successful lookup can contribute a preferred source */
		rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;

		rcu_read_lock();
		from = rt ? rcu_dereference(rt->from) : NULL;
		err = ip6_route_get_saddr(net, from, &fl6->daddr,
					  sk ? inet6_sk(sk)->srcprefs : 0,
					  &fl6->saddr);
		rcu_read_unlock();

		if (err)
			goto out_err_release;

		/* If we had an erroneous initial result, pretend it
		 * never existed and let the SA-enabled version take
		 * over.
		 */
		if ((*dst)->error) {
			dst_release(*dst);
			*dst = NULL;
		}

		if (fl6->flowi6_oif)
			flags |= RT6_LOOKUP_F_IFACE;
	}

	/* (re)do the lookup, now with a source address in the flow */
	if (!*dst)
		*dst = ip6_route_output_flags(net, sk, fl6, flags);

	err = (*dst)->error;
	if (err)
		goto out_err_release;

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	/*
	 * Here if the dst entry we've looked up
	 * has a neighbour entry whose state is not
	 * NUD_VALID and the src address from the flow is
	 * marked as OPTIMISTIC, we release the found
	 * dst entry and replace it instead with the
	 * dst entry of the nexthop router
	 */
	rt = (struct rt6_info *) *dst;
	rcu_read_lock();
	n = __ipv6_neigh_lookup_noref(rt->dst.dev,
				      rt6_nexthop(rt, &fl6->daddr));
	/* err is only a local "neighbour not valid" marker at this point */
	err = n && !(READ_ONCE(n->nud_state) & NUD_VALID) ? -EINVAL : 0;
	rcu_read_unlock();

	if (err) {
		struct inet6_ifaddr *ifp;
		struct flowi6 fl_gw6;
		int redirect;

		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
				      (*dst)->dev, 1);

		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
		if (ifp)
			in6_ifa_put(ifp);

		if (redirect) {
			/*
			 * We need to get the dst entry for the
			 * default router instead
			 */
			dst_release(*dst);
			/* same flow, but with the destination cleared */
			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
			*dst = ip6_route_output(net, sk, &fl_gw6);
			err = (*dst)->error;
			if (err)
				goto out_err_release;
		}
	}
#endif
	/* a v4-mapped source is only accepted with a v4-mapped or any destination */
	if (ipv6_addr_v4mapped(&fl6->saddr) &&
	    !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
		err = -EAFNOSUPPORT;
		goto out_err_release;
	}

	return 0;

out_err_release:
	dst_release(*dst);
	*dst = NULL;

	if (err == -ENETUNREACH)
		IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
	return err;
}
1210 
1211 /**
1212  *	ip6_dst_lookup - perform route lookup on flow
1213  *	@net: Network namespace to perform lookup in
1214  *	@sk: socket which provides route info
1215  *	@dst: pointer to dst_entry * for result
1216  *	@fl6: flow to lookup
1217  *
1218  *	This function performs a route lookup on the given flow.
1219  *
1220  *	It returns zero on success, or a standard errno code on error.
1221  */
1222 int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
1223 		   struct flowi6 *fl6)
1224 {
1225 	*dst = NULL;
1226 	return ip6_dst_lookup_tail(net, sk, dst, fl6);
1227 }
1228 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1229 
1230 /**
1231  *	ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1232  *	@net: Network namespace to perform lookup in
1233  *	@sk: socket which provides route info
1234  *	@fl6: flow to lookup
1235  *	@final_dst: final destination address for ipsec lookup
1236  *
1237  *	This function performs a route lookup on the given flow.
1238  *
1239  *	It returns a valid dst pointer on success, or a pointer encoded
1240  *	error code.
1241  */
1242 struct dst_entry *ip6_dst_lookup_flow(struct net *net, const struct sock *sk, struct flowi6 *fl6,
1243 				      const struct in6_addr *final_dst)
1244 {
1245 	struct dst_entry *dst = NULL;
1246 	int err;
1247 
1248 	err = ip6_dst_lookup_tail(net, sk, &dst, fl6);
1249 	if (err)
1250 		return ERR_PTR(err);
1251 	if (final_dst)
1252 		fl6->daddr = *final_dst;
1253 
1254 	return xfrm_lookup_route(net, dst, flowi6_to_flowi(fl6), sk, 0);
1255 }
1256 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
1257 
1258 /**
1259  *	ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1260  *	@sk: socket which provides the dst cache and route info
1261  *	@fl6: flow to lookup
1262  *	@final_dst: final destination address for ipsec lookup
1263  *	@connected: whether @sk is connected or not
1264  *
1265  *	This function performs a route lookup on the given flow with the
1266  *	possibility of using the cached route in the socket if it is valid.
1267  *	It will take the socket dst lock when operating on the dst cache.
1268  *	As a result, this function can only be used in process context.
1269  *
1270  *	In addition, for a connected socket, cache the dst in the socket
1271  *	if the current cache is not valid.
1272  *
1273  *	It returns a valid dst pointer on success, or a pointer encoded
1274  *	error code.
1275  */
1276 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1277 					 const struct in6_addr *final_dst,
1278 					 bool connected)
1279 {
1280 	struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1281 
1282 	dst = ip6_sk_dst_check(sk, dst, fl6);
1283 	if (dst)
1284 		return dst;
1285 
1286 	dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_dst);
1287 	if (connected && !IS_ERR(dst))
1288 		ip6_sk_dst_store_flow(sk, dst_clone(dst), fl6);
1289 
1290 	return dst;
1291 }
1292 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1293 
1294 /**
1295  *      ip6_dst_lookup_tunnel - perform route lookup on tunnel
1296  *      @skb: Packet for which lookup is done
1297  *      @dev: Tunnel device
1298  *      @net: Network namespace of tunnel device
1299  *      @sock: Socket which provides route info
1300  *      @saddr: Memory to store the src ip address
1301  *      @info: Tunnel information
1302  *      @protocol: IP protocol
1303  *      @use_cache: Flag to enable cache usage
1304  *      This function performs a route lookup on a tunnel
1305  *
1306  *      It returns a valid dst pointer and stores src address to be used in
1307  *      tunnel in param saddr on success, else a pointer encoded error code.
1308  */
1309 
1310 struct dst_entry *ip6_dst_lookup_tunnel(struct sk_buff *skb,
1311 					struct net_device *dev,
1312 					struct net *net,
1313 					struct socket *sock,
1314 					struct in6_addr *saddr,
1315 					const struct ip_tunnel_info *info,
1316 					u8 protocol,
1317 					bool use_cache)
1318 {
1319 	struct dst_entry *dst = NULL;
1320 #ifdef CONFIG_DST_CACHE
1321 	struct dst_cache *dst_cache;
1322 #endif
1323 	struct flowi6 fl6;
1324 	__u8 prio;
1325 
1326 #ifdef CONFIG_DST_CACHE
1327 	dst_cache = (struct dst_cache *)&info->dst_cache;
1328 	if (use_cache) {
1329 		dst = dst_cache_get_ip6(dst_cache, saddr);
1330 		if (dst)
1331 			return dst;
1332 	}
1333 #endif
1334 	memset(&fl6, 0, sizeof(fl6));
1335 	fl6.flowi6_mark = skb->mark;
1336 	fl6.flowi6_proto = protocol;
1337 	fl6.daddr = info->key.u.ipv6.dst;
1338 	fl6.saddr = info->key.u.ipv6.src;
1339 	prio = info->key.tos;
1340 	fl6.flowlabel = ip6_make_flowinfo(prio, info->key.label);
1341 
1342 	dst = ipv6_stub->ipv6_dst_lookup_flow(net, sock->sk, &fl6,
1343 					      NULL);
1344 	if (IS_ERR(dst)) {
1345 		netdev_dbg(dev, "no route to %pI6\n", &fl6.daddr);
1346 		return ERR_PTR(-ENETUNREACH);
1347 	}
1348 	if (dst->dev == dev) { /* is this necessary? */
1349 		netdev_dbg(dev, "circular route to %pI6\n", &fl6.daddr);
1350 		dst_release(dst);
1351 		return ERR_PTR(-ELOOP);
1352 	}
1353 #ifdef CONFIG_DST_CACHE
1354 	if (use_cache)
1355 		dst_cache_set_ip6(dst_cache, dst, &fl6.saddr);
1356 #endif
1357 	*saddr = fl6.saddr;
1358 	return dst;
1359 }
1360 EXPORT_SYMBOL_GPL(ip6_dst_lookup_tunnel);
1361 
1362 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1363 					       gfp_t gfp)
1364 {
1365 	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1366 }
1367 
1368 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1369 						gfp_t gfp)
1370 {
1371 	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1372 }
1373 
1374 static void ip6_append_data_mtu(unsigned int *mtu,
1375 				int *maxfraglen,
1376 				unsigned int fragheaderlen,
1377 				struct sk_buff *skb,
1378 				struct rt6_info *rt,
1379 				unsigned int orig_mtu)
1380 {
1381 	if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1382 		if (!skb) {
1383 			/* first fragment, reserve header_len */
1384 			*mtu = orig_mtu - rt->dst.header_len;
1385 
1386 		} else {
1387 			/*
1388 			 * this fragment is not first, the headers
1389 			 * space is regarded as data space.
1390 			 */
1391 			*mtu = orig_mtu;
1392 		}
1393 		*maxfraglen = ((*mtu - fragheaderlen) & ~7)
1394 			      + fragheaderlen - sizeof(struct frag_hdr);
1395 	}
1396 }
1397 
1398 static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
1399 			  struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
1400 			  struct rt6_info *rt)
1401 {
1402 	struct ipv6_pinfo *np = inet6_sk(sk);
1403 	unsigned int mtu;
1404 	struct ipv6_txoptions *nopt, *opt = ipc6->opt;
1405 
1406 	/* callers pass dst together with a reference, set it first so
1407 	 * ip6_cork_release() can put it down even in case of an error.
1408 	 */
1409 	cork->base.dst = &rt->dst;
1410 
1411 	/*
1412 	 * setup for corking
1413 	 */
1414 	if (opt) {
1415 		if (WARN_ON(v6_cork->opt))
1416 			return -EINVAL;
1417 
1418 		nopt = v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
1419 		if (unlikely(!nopt))
1420 			return -ENOBUFS;
1421 
1422 		nopt->tot_len = sizeof(*opt);
1423 		nopt->opt_flen = opt->opt_flen;
1424 		nopt->opt_nflen = opt->opt_nflen;
1425 
1426 		nopt->dst0opt = ip6_opt_dup(opt->dst0opt, sk->sk_allocation);
1427 		if (opt->dst0opt && !nopt->dst0opt)
1428 			return -ENOBUFS;
1429 
1430 		nopt->dst1opt = ip6_opt_dup(opt->dst1opt, sk->sk_allocation);
1431 		if (opt->dst1opt && !nopt->dst1opt)
1432 			return -ENOBUFS;
1433 
1434 		nopt->hopopt = ip6_opt_dup(opt->hopopt, sk->sk_allocation);
1435 		if (opt->hopopt && !nopt->hopopt)
1436 			return -ENOBUFS;
1437 
1438 		nopt->srcrt = ip6_rthdr_dup(opt->srcrt, sk->sk_allocation);
1439 		if (opt->srcrt && !nopt->srcrt)
1440 			return -ENOBUFS;
1441 
1442 		/* need source address above miyazawa*/
1443 	}
1444 	v6_cork->hop_limit = ipc6->hlimit;
1445 	v6_cork->tclass = ipc6->tclass;
1446 	if (rt->dst.flags & DST_XFRM_TUNNEL)
1447 		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1448 		      READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst);
1449 	else
1450 		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1451 			READ_ONCE(rt->dst.dev->mtu) : dst_mtu(xfrm_dst_path(&rt->dst));
1452 	if (np->frag_size < mtu) {
1453 		if (np->frag_size)
1454 			mtu = np->frag_size;
1455 	}
1456 	cork->base.fragsize = mtu;
1457 	cork->base.gso_size = ipc6->gso_size;
1458 	cork->base.tx_flags = 0;
1459 	cork->base.mark = ipc6->sockc.mark;
1460 	sock_tx_timestamp(sk, ipc6->sockc.tsflags, &cork->base.tx_flags);
1461 
1462 	if (dst_allfrag(xfrm_dst_path(&rt->dst)))
1463 		cork->base.flags |= IPCORK_ALLFRAG;
1464 	cork->base.length = 0;
1465 
1466 	cork->base.transmit_time = ipc6->sockc.transmit_time;
1467 
1468 	return 0;
1469 }
1470 
/*
 * Append @length bytes, produced by @getfrag from @from, to the chain of
 * pending skbs on @queue, allocating new skbs as needed.
 *
 * @sk:		sending socket (accounting, allocation mode, protocol checks)
 * @queue:	queue of pending skbs; new skbs are added at the tail
 * @cork_full:	cork state; the route, fragment size, GSO size, flags,
 *		accumulated length and tx_flags are read from ->base, the
 *		flow from ->fl.u.ip6
 * @v6_cork:	IPv6 cork state; only ->opt is read here
 * @pfrag:	page fragment used for paged copies that are neither
 *		zerocopy nor spliced
 * @getfrag:	copy callback; a negative return is reported as -EFAULT
 * @from:	opaque source passed to @getfrag; treated as a struct msghdr
 *		on the MSG_ZEROCOPY and MSG_SPLICE_PAGES paths
 * @length:	number of bytes to append, including @transhdrlen
 * @transhdrlen: header bytes accounted to the first skb allocated here;
 *		reset to zero after that skb has been set up
 * @flags:	MSG_* flags (MSG_MORE, MSG_DONTWAIT, MSG_CONFIRM,
 *		MSG_ZEROCOPY and MSG_SPLICE_PAGES are examined)
 * @ipc6:	per-call cookie; only ->dontfrag is read here
 *
 * Returns 0 on success or a negative errno. Size violations are reported
 * to the socket through ipv6_local_error()/ipv6_local_rxpmtu() and return
 * -EMSGSIZE. On the common error path the bytes not yet appended are
 * subtracted from cork->length again and IPSTATS_MIB_OUTDISCARDS is
 * incremented.
 */
static int __ip6_append_data(struct sock *sk,
			     struct sk_buff_head *queue,
			     struct inet_cork_full *cork_full,
			     struct inet6_cork *v6_cork,
			     struct page_frag *pfrag,
			     int getfrag(void *from, char *to, int offset,
					 int len, int odd, struct sk_buff *skb),
			     void *from, size_t length, int transhdrlen,
			     unsigned int flags, struct ipcm6_cookie *ipc6)
{
	struct sk_buff *skb, *skb_prev = NULL;
	struct inet_cork *cork = &cork_full->base;
	struct flowi6 *fl6 = &cork_full->fl.u.ip6;
	unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
	struct ubuf_info *uarg = NULL;
	int exthdrlen = 0;
	int dst_exthdrlen = 0;
	int hh_len;
	int copy;
	int err;
	int offset = 0;
	bool zc = false;
	u32 tskey = 0;
	struct rt6_info *rt = (struct rt6_info *)cork->dst;
	struct ipv6_txoptions *opt = v6_cork->opt;
	int csummode = CHECKSUM_NONE;
	unsigned int maxnonfragsize, headersize;
	unsigned int wmem_alloc_delta = 0;
	bool paged, extra_uref = false;

	/* extension header space is only accounted for the very first skb */
	skb = skb_peek_tail(queue);
	if (!skb) {
		exthdrlen = opt ? opt->opt_flen : 0;
		dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
	}

	/* with GSO the datagram is built up to IP6_MAX_MTU using paged data */
	paged = !!cork->gso_size;
	mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize;
	orig_mtu = mtu;

	if (cork->tx_flags & SKBTX_ANY_TSTAMP &&
	    READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_OPT_ID)
		tskey = atomic_inc_return(&sk->sk_tskey) - 1;

	hh_len = LL_RESERVED_SPACE(rt->dst.dev);

	/* headers that are repeated in front of every fragment */
	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
			(opt ? opt->opt_nflen : 0);

	/* all headers of the datagram ahead of the transport header */
	headersize = sizeof(struct ipv6hdr) +
		     (opt ? opt->opt_flen + opt->opt_nflen : 0) +
		     (dst_allfrag(&rt->dst) ?
		      sizeof(struct frag_hdr) : 0) +
		     rt->rt6i_nfheader_len;

	/* guard the unsigned arithmetic used for maxfraglen below */
	if (mtu <= fragheaderlen ||
	    ((mtu - fragheaderlen) & ~7) + fragheaderlen <= sizeof(struct frag_hdr))
		goto emsgsize;

	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
		     sizeof(struct frag_hdr);

	/* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit
	 * the first fragment
	 */
	if (headersize + transhdrlen > mtu)
		goto emsgsize;

	if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
	    (sk->sk_protocol == IPPROTO_UDP ||
	     sk->sk_protocol == IPPROTO_ICMPV6 ||
	     sk->sk_protocol == IPPROTO_RAW)) {
		ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
				sizeof(struct ipv6hdr));
		goto emsgsize;
	}

	if (ip6_sk_ignore_df(sk))
		maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
	else
		maxnonfragsize = mtu;

	if (cork->length + length > maxnonfragsize - headersize) {
emsgsize:
		pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0);
		ipv6_local_error(sk, EMSGSIZE, fl6, pmtu);
		return -EMSGSIZE;
	}

	/* CHECKSUM_PARTIAL only with no extension headers and when
	 * we are not going to fragment
	 */
	if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
	    headersize == sizeof(struct ipv6hdr) &&
	    length <= mtu - headersize &&
	    (!(flags & MSG_MORE) || cork->gso_size) &&
	    rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
		csummode = CHECKSUM_PARTIAL;

	if ((flags & MSG_ZEROCOPY) && length) {
		struct msghdr *msg = from;

		if (getfrag == ip_generic_getfrag && msg->msg_ubuf) {
			/* a caller-supplied ubuf must match the one already attached */
			if (skb_zcopy(skb) && msg->msg_ubuf != skb_zcopy(skb))
				return -EINVAL;

			/* Leave uarg NULL if can't zerocopy, callers should
			 * be able to handle it.
			 */
			if ((rt->dst.dev->features & NETIF_F_SG) &&
			    csummode == CHECKSUM_PARTIAL) {
				paged = true;
				zc = true;
				uarg = msg->msg_ubuf;
			}
		} else if (sock_flag(sk, SOCK_ZEROCOPY)) {
			uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb));
			if (!uarg)
				return -ENOBUFS;
			extra_uref = !skb_zcopy(skb);	/* only ref on new uarg */
			if (rt->dst.dev->features & NETIF_F_SG &&
			    csummode == CHECKSUM_PARTIAL) {
				paged = true;
				zc = true;
			} else {
				/* fall back to copying, but keep the uarg attached */
				uarg_to_msgzc(uarg)->zerocopy = 0;
				skb_zcopy_set(skb, uarg, &extra_uref);
			}
		}
	} else if ((flags & MSG_SPLICE_PAGES) && length) {
		if (inet_test_bit(HDRINCL, sk))
			return -EPERM;
		if (rt->dst.dev->features & NETIF_F_SG &&
		    getfrag == ip_generic_getfrag)
			/* We need an empty buffer to attach stuff to */
			paged = true;
		else
			flags &= ~MSG_SPLICE_PAGES;
	}

	/*
	 * Let's try using as much space as possible.
	 * Use MTU if total length of the message fits into the MTU.
	 * Otherwise, we need to reserve fragment header and
	 * fragment alignment (= 8-15 octets, in total).
	 *
	 * Note that we may need to "move" the data from the tail
	 * of the buffer to the new fragment when we split
	 * the message.
	 *
	 * FIXME: It may be fragmented into multiple chunks
	 *        at once if non-fragmentable extension headers
	 *        are too large.
	 * --yoshfuji
	 */

	cork->length += length;
	if (!skb)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into current packet. */
		copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;

		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen, alloc_extra;
			unsigned int pagedlen;
alloc_new_skb:
			/* There's no room in the current skb */
			if (skb)
				fraggap = skb->len - maxfraglen;
			else
				fraggap = 0;
			/* update mtu and maxfraglen if necessary */
			if (!skb || !skb_prev)
				ip6_append_data_mtu(&mtu, &maxfraglen,
						    fragheaderlen, skb, rt,
						    orig_mtu);

			skb_prev = skb;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;

			if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
				datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
			fraglen = datalen + fragheaderlen;
			pagedlen = 0;

			alloc_extra = hh_len;
			alloc_extra += dst_exthdrlen;
			alloc_extra += rt->dst.trailer_len;

			/* We just reserve space for fragment header.
			 * Note: this may be overallocation if the message
			 * (without MSG_MORE) fits into the MTU.
			 */
			alloc_extra += sizeof(struct frag_hdr);

			/* choose between a full-MTU linear buffer, a linear
			 * buffer for exactly this fragment, or headers only
			 * with the payload carried in pages
			 */
			if ((flags & MSG_MORE) &&
			    !(rt->dst.dev->features&NETIF_F_SG))
				alloclen = mtu;
			else if (!paged &&
				 (fraglen + alloc_extra < SKB_MAX_ALLOC ||
				  !(rt->dst.dev->features & NETIF_F_SG)))
				alloclen = fraglen;
			else {
				alloclen = fragheaderlen + transhdrlen;
				pagedlen = datalen - transhdrlen;
			}
			alloclen += alloc_extra;

			if (datalen != length + fraggap) {
				/*
				 * this is not the last fragment, the trailer
				 * space is regarded as data space.
				 */
				datalen += rt->dst.trailer_len;
			}

			fraglen = datalen + fragheaderlen;

			copy = datalen - transhdrlen - fraggap - pagedlen;
			/* [!] NOTE: copy may be negative if pagedlen>0
			 * because then the equation may reduce to -fraggap.
			 */
			if (copy < 0 && !(flags & MSG_SPLICE_PAGES)) {
				err = -EINVAL;
				goto error;
			}
			/* the skb carrying the transport header uses
			 * sock_alloc_send_skb(); later ones are allocated
			 * directly, subject to the sk_sndbuf check below
			 */
			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk, alloclen,
						(flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <=
				    2 * sk->sk_sndbuf)
					skb = alloc_skb(alloclen,
							sk->sk_allocation);
				if (unlikely(!skb))
					err = -ENOBUFS;
			}
			if (!skb)
				goto error;
			/*
			 *	Fill in the control structures
			 */
			skb->protocol = htons(ETH_P_IPV6);
			skb->ip_summed = csummode;
			skb->csum = 0;
			/* reserve for fragmentation and ipsec header */
			skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
				    dst_exthdrlen);

			/*
			 *	Find where to start putting bytes
			 */
			data = skb_put(skb, fraglen - pagedlen);
			skb_set_network_header(skb, exthdrlen);
			data += fragheaderlen;
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			if (fraggap) {
				/* move the overhang of the previous skb into
				 * this one and fix up both checksums
				 */
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}
			if (copy > 0 &&
			    getfrag(from, data + transhdrlen, offset,
				    copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			} else if (flags & MSG_SPLICE_PAGES) {
				/* spliced payload is attached by the loop below */
				copy = 0;
			}

			offset += copy;
			length -= copy + transhdrlen;
			transhdrlen = 0;
			exthdrlen = 0;
			dst_exthdrlen = 0;

			/* Only the initial fragment is time stamped */
			skb_shinfo(skb)->tx_flags = cork->tx_flags;
			cork->tx_flags = 0;
			skb_shinfo(skb)->tskey = tskey;
			tskey = 0;
			skb_zcopy_set(skb, uarg, &extra_uref);

			if ((flags & MSG_CONFIRM) && !skb_prev)
				skb_set_dst_pending_confirm(skb, 1);

			/*
			 * Put the packet on the pending queue
			 */
			if (!skb->destructor) {
				skb->destructor = sock_wfree;
				skb->sk = sk;
				wmem_alloc_delta += skb->truesize;
			}
			__skb_queue_tail(queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		if (!(rt->dst.dev->features&NETIF_F_SG) &&
		    skb_tailroom(skb) >= copy) {
			/* no scatter-gather: copy into the linear tailroom */
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
						offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else if (flags & MSG_SPLICE_PAGES) {
			struct msghdr *msg = from;

			err = -EIO;
			if (WARN_ON_ONCE(copy > msg->msg_iter.count))
				goto error;

			err = skb_splice_from_iter(skb, &msg->msg_iter, copy,
						   sk->sk_allocation);
			if (err < 0)
				goto error;
			/* continue with the number of bytes actually spliced */
			copy = err;
			wmem_alloc_delta += copy;
		} else if (!zc) {
			/* copy into the page fragment and attach it as a frag */
			int i = skb_shinfo(skb)->nr_frags;

			err = -ENOMEM;
			if (!sk_page_frag_refill(sk, pfrag))
				goto error;

			skb_zcopy_downgrade_managed(skb);
			if (!skb_can_coalesce(skb, i, pfrag->page,
					      pfrag->offset)) {
				err = -EMSGSIZE;
				if (i == MAX_SKB_FRAGS)
					goto error;

				__skb_fill_page_desc(skb, i, pfrag->page,
						     pfrag->offset, 0);
				skb_shinfo(skb)->nr_frags = ++i;
				get_page(pfrag->page);
			}
			copy = min_t(int, copy, pfrag->size - pfrag->offset);
			if (getfrag(from,
				    page_address(pfrag->page) + pfrag->offset,
				    offset, copy, skb->len, skb) < 0)
				goto error_efault;

			pfrag->offset += copy;
			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			wmem_alloc_delta += copy;
		} else {
			err = skb_zerocopy_iter_dgram(skb, from, copy);
			if (err < 0)
				goto error;
		}
		offset += copy;
		length -= copy;
	}

	/* charge everything queued by this call to the socket in one go */
	if (wmem_alloc_delta)
		refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
	return 0;

error_efault:
	err = -EFAULT;
error:
	net_zcopy_put_abort(uarg, extra_uref);
	/* undo the accounting for the bytes that were not appended */
	cork->length -= length;
	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
	return err;
}
1869 
1870 int ip6_append_data(struct sock *sk,
1871 		    int getfrag(void *from, char *to, int offset, int len,
1872 				int odd, struct sk_buff *skb),
1873 		    void *from, size_t length, int transhdrlen,
1874 		    struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1875 		    struct rt6_info *rt, unsigned int flags)
1876 {
1877 	struct inet_sock *inet = inet_sk(sk);
1878 	struct ipv6_pinfo *np = inet6_sk(sk);
1879 	int exthdrlen;
1880 	int err;
1881 
1882 	if (flags&MSG_PROBE)
1883 		return 0;
1884 	if (skb_queue_empty(&sk->sk_write_queue)) {
1885 		/*
1886 		 * setup for corking
1887 		 */
1888 		dst_hold(&rt->dst);
1889 		err = ip6_setup_cork(sk, &inet->cork, &np->cork,
1890 				     ipc6, rt);
1891 		if (err)
1892 			return err;
1893 
1894 		inet->cork.fl.u.ip6 = *fl6;
1895 		exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1896 		length += exthdrlen;
1897 		transhdrlen += exthdrlen;
1898 	} else {
1899 		transhdrlen = 0;
1900 	}
1901 
1902 	return __ip6_append_data(sk, &sk->sk_write_queue, &inet->cork,
1903 				 &np->cork, sk_page_frag(sk), getfrag,
1904 				 from, length, transhdrlen, flags, ipc6);
1905 }
1906 EXPORT_SYMBOL_GPL(ip6_append_data);
1907 
1908 static void ip6_cork_steal_dst(struct sk_buff *skb, struct inet_cork_full *cork)
1909 {
1910 	struct dst_entry *dst = cork->base.dst;
1911 
1912 	cork->base.dst = NULL;
1913 	cork->base.flags &= ~IPCORK_ALLFRAG;
1914 	skb_dst_set(skb, dst);
1915 }
1916 
1917 static void ip6_cork_release(struct inet_cork_full *cork,
1918 			     struct inet6_cork *v6_cork)
1919 {
1920 	if (v6_cork->opt) {
1921 		struct ipv6_txoptions *opt = v6_cork->opt;
1922 
1923 		kfree(opt->dst0opt);
1924 		kfree(opt->dst1opt);
1925 		kfree(opt->hopopt);
1926 		kfree(opt->srcrt);
1927 		kfree(opt);
1928 		v6_cork->opt = NULL;
1929 	}
1930 
1931 	if (cork->base.dst) {
1932 		dst_release(cork->base.dst);
1933 		cork->base.dst = NULL;
1934 		cork->base.flags &= ~IPCORK_ALLFRAG;
1935 	}
1936 }
1937 
/*
 * Collapse the pending skbs on @queue into one packet and finish its
 * IPv6 header.
 *
 * The first dequeued skb becomes the head; every further skb is chained
 * onto the head's frag_list, with its length and truesize added to the
 * head and its destructor/sk cleared. Extension headers from
 * @v6_cork->opt and the IPv6 header are then pushed in front, using the
 * flow stored in @cork and the hop limit / traffic class from @v6_cork.
 *
 * The dst is moved from @cork to the skb, IPSTATS_MIB_OUTREQUESTS (and
 * the ICMPv6 out counters for ICMPv6 packets) are incremented, and the
 * cork is released.
 *
 * Returns the finished skb, or NULL when @queue is empty; in the latter
 * case the cork is left untouched.
 */
struct sk_buff *__ip6_make_skb(struct sock *sk,
			       struct sk_buff_head *queue,
			       struct inet_cork_full *cork,
			       struct inet6_cork *v6_cork)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct in6_addr *final_dst;
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct net *net = sock_net(sk);
	struct ipv6hdr *hdr;
	struct ipv6_txoptions *opt = v6_cork->opt;
	struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
	struct flowi6 *fl6 = &cork->fl.u.ip6;
	unsigned char proto = fl6->flowi6_proto;

	skb = __skb_dequeue(queue);
	if (!skb)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
		/* strip the per-fragment header room, sized from the head skb */
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		/* from here on only the head skb is tied to the socket */
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Allow local fragmentation. */
	skb->ignore_df = ip6_sk_ignore_df(sk);
	__skb_pull(skb, skb_network_header_len(skb));

	/* the option helpers may update proto and final_dst */
	final_dst = &fl6->daddr;
	if (opt && opt->opt_flen)
		ipv6_push_frag_opts(skb, opt, &proto);
	if (opt && opt->opt_nflen)
		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	ip6_flow_hdr(hdr, v6_cork->tclass,
		     ip6_make_flowlabel(net, skb, fl6->flowlabel,
					ip6_autoflowlabel(net, np), fl6));
	hdr->hop_limit = v6_cork->hop_limit;
	hdr->nexthdr = proto;
	hdr->saddr = fl6->saddr;
	hdr->daddr = *final_dst;

	skb->priority = sk->sk_priority;
	skb->mark = cork->base.mark;
	skb->tstamp = cork->base.transmit_time;

	/* the skb takes over the cork's dst; rt still points at that dst */
	ip6_cork_steal_dst(skb, cork);
	IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTREQUESTS);
	if (proto == IPPROTO_ICMPV6) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
		u8 icmp6_type;

		/* raw sockets without HDRINCL take the type from the flow,
		 * everything else reads it from the packet
		 */
		if (sk->sk_socket->type == SOCK_RAW &&
		   !inet_test_bit(HDRINCL, sk))
			icmp6_type = fl6->fl6_icmp_type;
		else
			icmp6_type = icmp6_hdr(skb)->icmp6_type;
		ICMP6MSGOUT_INC_STATS(net, idev, icmp6_type);
		ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
	}

	ip6_cork_release(cork, v6_cork);
out:
	return skb;
}
2018 
2019 int ip6_send_skb(struct sk_buff *skb)
2020 {
2021 	struct net *net = sock_net(skb->sk);
2022 	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
2023 	int err;
2024 
2025 	err = ip6_local_out(net, skb->sk, skb);
2026 	if (err) {
2027 		if (err > 0)
2028 			err = net_xmit_errno(err);
2029 		if (err)
2030 			IP6_INC_STATS(net, rt->rt6i_idev,
2031 				      IPSTATS_MIB_OUTDISCARDS);
2032 	}
2033 
2034 	return err;
2035 }
2036 
/*
 * Build one packet from the socket's pending frames and send it.
 * Returns 0 when nothing was pending, otherwise the result of
 * ip6_send_skb().
 */
int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb = ip6_finish_skb(sk);

	return skb ? ip6_send_skb(skb) : 0;
}
EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
2048 
2049 static void __ip6_flush_pending_frames(struct sock *sk,
2050 				       struct sk_buff_head *queue,
2051 				       struct inet_cork_full *cork,
2052 				       struct inet6_cork *v6_cork)
2053 {
2054 	struct sk_buff *skb;
2055 
2056 	while ((skb = __skb_dequeue_tail(queue)) != NULL) {
2057 		if (skb_dst(skb))
2058 			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
2059 				      IPSTATS_MIB_OUTDISCARDS);
2060 		kfree_skb(skb);
2061 	}
2062 
2063 	ip6_cork_release(cork, v6_cork);
2064 }
2065 
2066 void ip6_flush_pending_frames(struct sock *sk)
2067 {
2068 	__ip6_flush_pending_frames(sk, &sk->sk_write_queue,
2069 				   &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
2070 }
2071 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
2072 
2073 struct sk_buff *ip6_make_skb(struct sock *sk,
2074 			     int getfrag(void *from, char *to, int offset,
2075 					 int len, int odd, struct sk_buff *skb),
2076 			     void *from, size_t length, int transhdrlen,
2077 			     struct ipcm6_cookie *ipc6, struct rt6_info *rt,
2078 			     unsigned int flags, struct inet_cork_full *cork)
2079 {
2080 	struct inet6_cork v6_cork;
2081 	struct sk_buff_head queue;
2082 	int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
2083 	int err;
2084 
2085 	if (flags & MSG_PROBE) {
2086 		dst_release(&rt->dst);
2087 		return NULL;
2088 	}
2089 
2090 	__skb_queue_head_init(&queue);
2091 
2092 	cork->base.flags = 0;
2093 	cork->base.addr = 0;
2094 	cork->base.opt = NULL;
2095 	v6_cork.opt = NULL;
2096 	err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt);
2097 	if (err) {
2098 		ip6_cork_release(cork, &v6_cork);
2099 		return ERR_PTR(err);
2100 	}
2101 	if (ipc6->dontfrag < 0)
2102 		ipc6->dontfrag = inet6_sk(sk)->dontfrag;
2103 
2104 	err = __ip6_append_data(sk, &queue, cork, &v6_cork,
2105 				&current->task_frag, getfrag, from,
2106 				length + exthdrlen, transhdrlen + exthdrlen,
2107 				flags, ipc6);
2108 	if (err) {
2109 		__ip6_flush_pending_frames(sk, &queue, cork, &v6_cork);
2110 		return ERR_PTR(err);
2111 	}
2112 
2113 	return __ip6_make_skb(sk, &queue, cork, &v6_cork);
2114 }
2115