xref: /openbmc/linux/net/ipv6/ip6_output.c (revision f4356947f0297b0962fdd197672db7edf9f58be6)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  *	IPv6 output functions
4  *	Linux INET6 implementation
5  *
6  *	Authors:
7  *	Pedro Roque		<roque@di.fc.ul.pt>
8  *
9  *	Based on linux/net/ipv4/ip_output.c
10  *
11  *	Changes:
12  *	A.N.Kuznetsov	:	arithmetic in fragmentation.
13  *				extension headers are implemented.
14  *				route changes now work.
15  *				ip6_forward does not confuse sniffers.
16  *				etc.
17  *
18  *      H. von Brand    :       Added missing #include <linux/string.h>
19  *	Imran Patel	:	frag id should be in NBO
20  *      Kazunori MIYAZAWA @USAGI
21  *			:       add ip6_append_data and related functions
22  *				for datagram xmit
23  */
24 
25 #include <linux/errno.h>
26 #include <linux/kernel.h>
27 #include <linux/string.h>
28 #include <linux/socket.h>
29 #include <linux/net.h>
30 #include <linux/netdevice.h>
31 #include <linux/if_arp.h>
32 #include <linux/in6.h>
33 #include <linux/tcp.h>
34 #include <linux/route.h>
35 #include <linux/module.h>
36 #include <linux/slab.h>
37 
38 #include <linux/bpf-cgroup.h>
39 #include <linux/netfilter.h>
40 #include <linux/netfilter_ipv6.h>
41 
42 #include <net/sock.h>
43 #include <net/snmp.h>
44 
45 #include <net/ipv6.h>
46 #include <net/ndisc.h>
47 #include <net/protocol.h>
48 #include <net/ip6_route.h>
49 #include <net/addrconf.h>
50 #include <net/rawv6.h>
51 #include <net/icmp.h>
52 #include <net/xfrm.h>
53 #include <net/checksum.h>
54 #include <linux/mroute6.h>
55 #include <net/l3mdev.h>
56 #include <net/lwtunnel.h>
57 #include <net/ip_tunnels.h>
58 
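/*
 * Final transmit step: resolve (or create) the neighbour entry for the
 * route's next hop and hand the skb to the device layer.  Multicast
 * packets are looped back to local listeners where required, and
 * lightweight tunnel output redirects are honoured before neighbour
 * resolution.
 */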
59 static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
60 {
61 	struct dst_entry *dst = skb_dst(skb);
62 	struct net_device *dev = dst->dev;
63 	struct inet6_dev *idev = ip6_dst_idev(dst);
64 	unsigned int hh_len = LL_RESERVED_SPACE(dev);
65 	const struct in6_addr *daddr, *nexthop;
66 	struct ipv6hdr *hdr;
67 	struct neighbour *neigh;
68 	int ret;
69 
70 	/* Be paranoid, rather than too clever. */
71 	if (unlikely(hh_len > skb_headroom(skb)) && dev->header_ops) {
72 		skb = skb_expand_head(skb, hh_len);
73 		if (!skb) {
74 			IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
75 			return -ENOMEM;
76 		}
77 	}
78 
79 	hdr = ipv6_hdr(skb);
80 	daddr = &hdr->daddr;
81 	if (ipv6_addr_is_multicast(daddr)) {
82 		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
83 		    ((mroute6_is_socket(net, skb) &&
84 		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
85 		     ipv6_chk_mcast_addr(dev, daddr, &hdr->saddr))) {
86 			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
87 
88 			/* Do not check for IFF_ALLMULTI; multicast routing
89 			   is not supported in any case.
90 			 */
91 			if (newskb)
92 				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
93 					net, sk, newskb, NULL, newskb->dev,
94 					dev_loopback_xmit);
95 
96 			if (hdr->hop_limit == 0) {
97 				IP6_INC_STATS(net, idev,
98 					      IPSTATS_MIB_OUTDISCARDS);
99 				kfree_skb(skb);
100 				return 0;
101 			}
102 		}
103 
104 		IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);
105 		if (IPV6_ADDR_MC_SCOPE(daddr) <= IPV6_ADDR_SCOPE_NODELOCAL &&
106 		    !(dev->flags & IFF_LOOPBACK)) {
107 			kfree_skb(skb);
108 			return 0;
109 		}
110 	}
111 
112 	if (lwtunnel_xmit_redirect(dst->lwtstate)) {
113 		int res = lwtunnel_xmit(skb);
114 
115 		if (res < 0 || res == LWTUNNEL_XMIT_DONE)
116 			return res;
117 	}
118 
119 	rcu_read_lock();
120 	nexthop = rt6_nexthop((struct rt6_info *)dst, daddr);
121 	neigh = __ipv6_neigh_lookup_noref(dev, nexthop);
122 
123 	if (unlikely(IS_ERR_OR_NULL(neigh))) {
124 		if (unlikely(!neigh))
125 			neigh = __neigh_create(&nd_tbl, nexthop, dev, false);
126 		if (IS_ERR(neigh)) {
127 			rcu_read_unlock();
128 			IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTNOROUTES);
129 			kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_CREATEFAIL);
130 			return -EINVAL;
131 		}
132 	}
133 	sock_confirm_neigh(skb, neigh);
134 	ret = neigh_output(neigh, skb, false);
135 	rcu_read_unlock();
136 	return ret;
137 }
138 
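/*
 * Slow path for GSO packets whose segments would not fit the egress MTU:
 * software-segment the skb and run each resulting segment through
 * ip6_fragment() individually.  The first error encountered is kept as
 * the return value, but the remaining segments are still attempted.
 */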
139 static int
140 ip6_finish_output_gso_slowpath_drop(struct net *net, struct sock *sk,
141 				    struct sk_buff *skb, unsigned int mtu)
142 {
143 	struct sk_buff *segs, *nskb;
144 	netdev_features_t features;
145 	int ret = 0;
146 
147 	/* Please see corresponding comment in ip_finish_output_gso
148 	 * describing the cases where GSO segment length exceeds the
149 	 * egress MTU.
150 	 */
151 	features = netif_skb_features(skb);
152 	segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
153 	if (IS_ERR_OR_NULL(segs)) {
154 		kfree_skb(skb);
155 		return -ENOMEM;
156 	}
157 
158 	consume_skb(skb);
159 
160 	skb_list_walk_safe(segs, segs, nskb) {
161 		int err;
162 
163 		skb_mark_not_on_list(segs);
164 		err = ip6_fragment(net, sk, segs, ip6_finish_output2);
165 		if (err && ret == 0)
166 			ret = err;
167 	}
168 
169 	return ret;
170 }
171 
172 static int __ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
173 {
174 	unsigned int mtu;
175 
176 #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
177 	/* Policy lookup after SNAT yielded a new policy */
178 	if (skb_dst(skb)->xfrm) {
179 		IP6CB(skb)->flags |= IP6SKB_REROUTED;
180 		return dst_output(net, sk, skb);
181 	}
182 #endif
183 
184 	mtu = ip6_skb_dst_mtu(skb);
185 	if (skb_is_gso(skb) &&
186 	    !(IP6CB(skb)->flags & IP6SKB_FAKEJUMBO) &&
187 	    !skb_gso_validate_network_len(skb, mtu))
188 		return ip6_finish_output_gso_slowpath_drop(net, sk, skb, mtu);
189 
190 	if ((skb->len > mtu && !skb_is_gso(skb)) ||
191 	    dst_allfrag(skb_dst(skb)) ||
192 	    (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
193 		return ip6_fragment(net, sk, skb, ip6_finish_output2);
194 	else
195 		return ip6_finish_output2(net, sk, skb);
196 }
197 
198 static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
199 {
200 	int ret;
201 
202 	ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
203 	switch (ret) {
204 	case NET_XMIT_SUCCESS:
205 	case NET_XMIT_CN:
206 		return __ip6_finish_output(net, sk, skb) ? : ret;
207 	default:
208 		kfree_skb_reason(skb, SKB_DROP_REASON_BPF_CGROUP_EGRESS);
209 		return ret;
210 	}
211 }
212 
213 int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
214 {
215 	struct net_device *dev = skb_dst(skb)->dev, *indev = skb->dev;
216 	struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
217 
218 	skb->protocol = htons(ETH_P_IPV6);
219 	skb->dev = dev;
220 
221 	if (unlikely(idev->cnf.disable_ipv6)) {
222 		IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
223 		kfree_skb_reason(skb, SKB_DROP_REASON_IPV6DISABLED);
224 		return 0;
225 	}
226 
227 	return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
228 			    net, sk, skb, indev, dev,
229 			    ip6_finish_output,
230 			    !(IP6CB(skb)->flags & IP6SKB_REROUTED));
231 }
232 EXPORT_SYMBOL(ip6_output);
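
/*
 * ip6_output() is installed as the dst_entry output handler for IPv6
 * routes, so locally generated and forwarded packets normally reach it
 * through dst_output() rather than by a direct call, e.g. (illustrative):
 *
 *	skb_dst_set(skb, dst);
 *	return dst_output(net, sk, skb);
 *
 * It runs the NF_INET_POST_ROUTING hook (unless the packet was already
 * rerouted) and then hands the skb to ip6_finish_output().
 */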
233 
234 bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np)
235 {
236 	if (!np->autoflowlabel_set)
237 		return ip6_default_np_autolabel(net);
238 	else
239 		return np->autoflowlabel;
240 }
241 
242 /*
243  * xmit an sk_buff (used by TCP, SCTP and DCCP)
244  * Note : the socket lock is not held for SYNACK packets, but the socket
245  * might be modified by calls to skb_set_owner_w() and ipv6_local_error(),
246  * which use proper atomic operations or spinlocks.
247  */
248 int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
249 	     __u32 mark, struct ipv6_txoptions *opt, int tclass, u32 priority)
250 {
251 	struct net *net = sock_net(sk);
252 	const struct ipv6_pinfo *np = inet6_sk(sk);
253 	struct in6_addr *first_hop = &fl6->daddr;
254 	struct dst_entry *dst = skb_dst(skb);
255 	struct net_device *dev = dst->dev;
256 	struct inet6_dev *idev = ip6_dst_idev(dst);
257 	struct hop_jumbo_hdr *hop_jumbo;
258 	int hoplen = sizeof(*hop_jumbo);
259 	unsigned int head_room;
260 	struct ipv6hdr *hdr;
261 	u8  proto = fl6->flowi6_proto;
262 	int seg_len = skb->len;
263 	int hlimit = -1;
264 	u32 mtu;
265 
266 	head_room = sizeof(struct ipv6hdr) + hoplen + LL_RESERVED_SPACE(dev);
267 	if (opt)
268 		head_room += opt->opt_nflen + opt->opt_flen;
269 
270 	if (unlikely(head_room > skb_headroom(skb))) {
271 		skb = skb_expand_head(skb, head_room);
272 		if (!skb) {
273 			IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
274 			return -ENOBUFS;
275 		}
276 	}
277 
278 	if (opt) {
279 		seg_len += opt->opt_nflen + opt->opt_flen;
280 
281 		if (opt->opt_flen)
282 			ipv6_push_frag_opts(skb, opt, &proto);
283 
284 		if (opt->opt_nflen)
285 			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
286 					     &fl6->saddr);
287 	}
288 
289 	if (unlikely(seg_len > IPV6_MAXPLEN)) {
290 		hop_jumbo = skb_push(skb, hoplen);
291 
292 		hop_jumbo->nexthdr = proto;
293 		hop_jumbo->hdrlen = 0;
294 		hop_jumbo->tlv_type = IPV6_TLV_JUMBO;
295 		hop_jumbo->tlv_len = 4;
296 		hop_jumbo->jumbo_payload_len = htonl(seg_len + hoplen);
297 
298 		proto = IPPROTO_HOPOPTS;
299 		seg_len = 0;
300 		IP6CB(skb)->flags |= IP6SKB_FAKEJUMBO;
301 	}
302 
303 	skb_push(skb, sizeof(struct ipv6hdr));
304 	skb_reset_network_header(skb);
305 	hdr = ipv6_hdr(skb);
306 
307 	/*
308 	 *	Fill in the IPv6 header
309 	 */
310 	if (np)
311 		hlimit = np->hop_limit;
312 	if (hlimit < 0)
313 		hlimit = ip6_dst_hoplimit(dst);
314 
315 	ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
316 				ip6_autoflowlabel(net, np), fl6));
317 
318 	hdr->payload_len = htons(seg_len);
319 	hdr->nexthdr = proto;
320 	hdr->hop_limit = hlimit;
321 
322 	hdr->saddr = fl6->saddr;
323 	hdr->daddr = *first_hop;
324 
325 	skb->protocol = htons(ETH_P_IPV6);
326 	skb->priority = priority;
327 	skb->mark = mark;
328 
329 	mtu = dst_mtu(dst);
330 	if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
331 		IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUT, skb->len);
332 
333 		/* if egress device is enslaved to an L3 master device pass the
334 		 * skb to its handler for processing
335 		 */
336 		skb = l3mdev_ip6_out((struct sock *)sk, skb);
337 		if (unlikely(!skb))
338 			return 0;
339 
340 		/* hooks should never assume socket lock is held.
341 		 * we promote our socket to non const
342 		 */
343 		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
344 			       net, (struct sock *)sk, skb, NULL, dev,
345 			       dst_output);
346 	}
347 
348 	skb->dev = dev;
349 	/* ipv6_local_error() does not require socket lock,
350 	 * we promote our socket to non const
351 	 */
352 	ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);
353 
354 	IP6_INC_STATS(net, idev, IPSTATS_MIB_FRAGFAILS);
355 	kfree_skb(skb);
356 	return -EMSGSIZE;
357 }
358 EXPORT_SYMBOL(ip6_xmit);
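
/*
 * Illustrative use of ip6_xmit() (a sketch, not copied from any one
 * caller): a connection-oriented protocol such as TCP attaches its cached
 * route to the skb and then transmits:
 *
 *	skb_dst_set(skb, dst_clone(dst));
 *	err = ip6_xmit(sk, skb, &fl6, sk->sk_mark, opt, tclass, priority);
 *
 * ip6_xmit() pushes any extension headers from @opt, builds the IPv6
 * header, checks the MTU and runs the NF_INET_LOCAL_OUT hook.
 */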
359 
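/*
 * Deliver a Router Alert packet to every raw socket that registered a
 * matching IPV6_ROUTER_ALERT value (@sel) and is bound to the receiving
 * interface (or to no interface).  Returns 1 if at least one socket took
 * the packet (the last match consumes the original skb), 0 otherwise.
 */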
360 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
361 {
362 	struct ip6_ra_chain *ra;
363 	struct sock *last = NULL;
364 
365 	read_lock(&ip6_ra_lock);
366 	for (ra = ip6_ra_chain; ra; ra = ra->next) {
367 		struct sock *sk = ra->sk;
368 		if (sk && ra->sel == sel &&
369 		    (!sk->sk_bound_dev_if ||
370 		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
371 			struct ipv6_pinfo *np = inet6_sk(sk);
372 
373 			if (np && np->rtalert_isolate &&
374 			    !net_eq(sock_net(sk), dev_net(skb->dev))) {
375 				continue;
376 			}
377 			if (last) {
378 				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
379 				if (skb2)
380 					rawv6_rcv(last, skb2);
381 			}
382 			last = sk;
383 		}
384 	}
385 
386 	if (last) {
387 		rawv6_rcv(last, skb);
388 		read_unlock(&ip6_ra_lock);
389 		return 1;
390 	}
391 	read_unlock(&ip6_ra_lock);
392 	return 0;
393 }
394 
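/*
 * Decide how to handle a packet whose destination is a proxied (NDP proxy)
 * address: returns 1 if it is a neighbour discovery message that must be
 * delivered locally via ip6_input(), -1 if it has to be dropped (link-local
 * destination), and 0 if it may be forwarded normally.
 */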
395 static int ip6_forward_proxy_check(struct sk_buff *skb)
396 {
397 	struct ipv6hdr *hdr = ipv6_hdr(skb);
398 	u8 nexthdr = hdr->nexthdr;
399 	__be16 frag_off;
400 	int offset;
401 
402 	if (ipv6_ext_hdr(nexthdr)) {
403 		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
404 		if (offset < 0)
405 			return 0;
406 	} else
407 		offset = sizeof(struct ipv6hdr);
408 
409 	if (nexthdr == IPPROTO_ICMPV6) {
410 		struct icmp6hdr *icmp6;
411 
412 		if (!pskb_may_pull(skb, (skb_network_header(skb) +
413 					 offset + 1 - skb->data)))
414 			return 0;
415 
416 		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
417 
418 		switch (icmp6->icmp6_type) {
419 		case NDISC_ROUTER_SOLICITATION:
420 		case NDISC_ROUTER_ADVERTISEMENT:
421 		case NDISC_NEIGHBOUR_SOLICITATION:
422 		case NDISC_NEIGHBOUR_ADVERTISEMENT:
423 		case NDISC_REDIRECT:
424 			/* For unicast neighbour discovery messages destined
425 			 * to the proxied address, pass the packet to the
426 			 * input function.
427 			 */
428 			return 1;
429 		default:
430 			break;
431 		}
432 	}
433 
434 	/*
435 	 * The proxying router can't forward traffic sent to a link-local
436 	 * address, so signal the sender and discard the packet. This
437 	 * behavior is clarified by the MIPv6 specification.
438 	 */
439 	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
440 		dst_link_failure(skb);
441 		return -1;
442 	}
443 
444 	return 0;
445 }
446 
447 static inline int ip6_forward_finish(struct net *net, struct sock *sk,
448 				     struct sk_buff *skb)
449 {
450 	struct dst_entry *dst = skb_dst(skb);
451 
452 	__IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
453 	__IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
454 
455 #ifdef CONFIG_NET_SWITCHDEV
456 	if (skb->offload_l3_fwd_mark) {
457 		consume_skb(skb);
458 		return 0;
459 	}
460 #endif
461 
462 	skb_clear_tstamp(skb);
463 	return dst_output(net, sk, skb);
464 }
465 
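/*
 * Return true if the packet cannot be forwarded with the given MTU, taking
 * into account the conntrack defrag limit (frag_max_size), the ignore_df
 * flag and GSO packets whose individual segments still fit the MTU.
 */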
466 static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
467 {
468 	if (skb->len <= mtu)
469 		return false;
470 
471 	/* ipv6 conntrack defrag sets max_frag_size + ignore_df */
472 	if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
473 		return true;
474 
475 	if (skb->ignore_df)
476 		return false;
477 
478 	if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
479 		return false;
480 
481 	return true;
482 }
483 
484 int ip6_forward(struct sk_buff *skb)
485 {
486 	struct dst_entry *dst = skb_dst(skb);
487 	struct ipv6hdr *hdr = ipv6_hdr(skb);
488 	struct inet6_skb_parm *opt = IP6CB(skb);
489 	struct net *net = dev_net(dst->dev);
490 	struct inet6_dev *idev;
491 	SKB_DR(reason);
492 	u32 mtu;
493 
494 	idev = __in6_dev_get_safely(dev_get_by_index_rcu(net, IP6CB(skb)->iif));
495 	if (net->ipv6.devconf_all->forwarding == 0)
496 		goto error;
497 
498 	if (skb->pkt_type != PACKET_HOST)
499 		goto drop;
500 
501 	if (unlikely(skb->sk))
502 		goto drop;
503 
504 	if (skb_warn_if_lro(skb))
505 		goto drop;
506 
507 	if (!net->ipv6.devconf_all->disable_policy &&
508 	    (!idev || !idev->cnf.disable_policy) &&
509 	    !xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
510 		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
511 		goto drop;
512 	}
513 
514 	skb_forward_csum(skb);
515 
516 	/*
517 	 *	We do NOT do any processing on
518 	 *	RA packets, pushing them to user level AS IS
519 	 *	without any WARRANTY that the application will be able
520 	 *	to interpret them. The reason is that we
521 	 *	cannot do anything clever here.
522 	 *
523 	 *	We are not an end node, so if the packet contains
524 	 *	AH/ESP we cannot do anything with it.
525 	 *	Defragmentation would also be a mistake: RA packets
526 	 *	cannot be fragmented, because there is no guarantee
527 	 *	that different fragments will travel along the same path. --ANK
528 	 */
529 	if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
530 		if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
531 			return 0;
532 	}
533 
534 	/*
535 	 *	check and decrement ttl
536 	 */
537 	if (hdr->hop_limit <= 1) {
538 		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
539 		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
540 
541 		kfree_skb_reason(skb, SKB_DROP_REASON_IP_INHDR);
542 		return -ETIMEDOUT;
543 	}
544 
545 	/* XXX: idev->cnf.proxy_ndp? */
546 	if (net->ipv6.devconf_all->proxy_ndp &&
547 	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
548 		int proxied = ip6_forward_proxy_check(skb);
549 		if (proxied > 0) {
550 			/* It's tempting to decrease the hop limit
551 			 * here by 1, as we do at the end of the
552 			 * function too.
553 			 *
554 			 * But that would be incorrect, as proxying is
555 			 * not forwarding.  The ip6_input function
556 			 * will handle this packet locally, and it
557 			 * depends on the hop limit being unchanged.
558 			 *
559 			 * One example is the NDP hop limit, which
560 			 * always has to stay 255; another would be
561 			 * similar checks on RA packets, where the
562 			 * user can even change the desired limit.
563 			 */
564 			return ip6_input(skb);
565 		} else if (proxied < 0) {
566 			__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
567 			goto drop;
568 		}
569 	}
570 
571 	if (!xfrm6_route_forward(skb)) {
572 		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
573 		SKB_DR_SET(reason, XFRM_POLICY);
574 		goto drop;
575 	}
576 	dst = skb_dst(skb);
577 
578 	/* IPv6 specs say nothing about it, but it is clear that we cannot
579 	   send redirects to source routed frames.
580 	   We don't send redirects to frames decapsulated from IPsec.
581 	 */
582 	if (IP6CB(skb)->iif == dst->dev->ifindex &&
583 	    opt->srcrt == 0 && !skb_sec_path(skb)) {
584 		struct in6_addr *target = NULL;
585 		struct inet_peer *peer;
586 		struct rt6_info *rt;
587 
588 		/*
589 		 *	incoming and outgoing devices are the same,
590 		 *	so send a redirect.
591 		 */
592 
593 		rt = (struct rt6_info *) dst;
594 		if (rt->rt6i_flags & RTF_GATEWAY)
595 			target = &rt->rt6i_gateway;
596 		else
597 			target = &hdr->daddr;
598 
599 		peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);
600 
601 		/* Limit redirects both by destination (here)
602 		   and by source (inside ndisc_send_redirect)
603 		 */
604 		if (inet_peer_xrlim_allow(peer, 1*HZ))
605 			ndisc_send_redirect(skb, target);
606 		if (peer)
607 			inet_putpeer(peer);
608 	} else {
609 		int addrtype = ipv6_addr_type(&hdr->saddr);
610 
611 		/* This check is security critical. */
612 		if (addrtype == IPV6_ADDR_ANY ||
613 		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
614 			goto error;
615 		if (addrtype & IPV6_ADDR_LINKLOCAL) {
616 			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
617 				    ICMPV6_NOT_NEIGHBOUR, 0);
618 			goto error;
619 		}
620 	}
621 
622 	mtu = ip6_dst_mtu_maybe_forward(dst, true);
623 	if (mtu < IPV6_MIN_MTU)
624 		mtu = IPV6_MIN_MTU;
625 
626 	if (ip6_pkt_too_big(skb, mtu)) {
627 		/* Again, force OUTPUT device used as source address */
628 		skb->dev = dst->dev;
629 		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
630 		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INTOOBIGERRORS);
631 		__IP6_INC_STATS(net, ip6_dst_idev(dst),
632 				IPSTATS_MIB_FRAGFAILS);
633 		kfree_skb_reason(skb, SKB_DROP_REASON_PKT_TOO_BIG);
634 		return -EMSGSIZE;
635 	}
636 
637 	if (skb_cow(skb, dst->dev->hard_header_len)) {
638 		__IP6_INC_STATS(net, ip6_dst_idev(dst),
639 				IPSTATS_MIB_OUTDISCARDS);
640 		goto drop;
641 	}
642 
643 	hdr = ipv6_hdr(skb);
644 
645 	/* Mangling of the hop limit is delayed until after the skb COW */
646 
647 	hdr->hop_limit--;
648 
649 	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
650 		       net, NULL, skb, skb->dev, dst->dev,
651 		       ip6_forward_finish);
652 
653 error:
654 	__IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
655 	SKB_DR_SET(reason, IP_INADDRERRORS);
656 drop:
657 	kfree_skb_reason(skb, reason);
658 	return -EINVAL;
659 }
660 
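/*
 * Copy the per-packet metadata a fragment must inherit from the original
 * skb: packet type, priority, protocol, dst, device, mark, hash, tc index,
 * netfilter state, skb extensions and security mark.
 */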
661 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
662 {
663 	to->pkt_type = from->pkt_type;
664 	to->priority = from->priority;
665 	to->protocol = from->protocol;
666 	skb_dst_drop(to);
667 	skb_dst_set(to, dst_clone(skb_dst(from)));
668 	to->dev = from->dev;
669 	to->mark = from->mark;
670 
671 	skb_copy_hash(to, from);
672 
673 #ifdef CONFIG_NET_SCHED
674 	to->tc_index = from->tc_index;
675 #endif
676 	nf_copy(to, from);
677 	skb_ext_copy(to, from);
678 	skb_copy_secmark(to, from);
679 }
680 
681 int ip6_fraglist_init(struct sk_buff *skb, unsigned int hlen, u8 *prevhdr,
682 		      u8 nexthdr, __be32 frag_id,
683 		      struct ip6_fraglist_iter *iter)
684 {
685 	unsigned int first_len;
686 	struct frag_hdr *fh;
687 
688 	/* BUILD HEADER */
689 	*prevhdr = NEXTHDR_FRAGMENT;
690 	iter->tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
691 	if (!iter->tmp_hdr)
692 		return -ENOMEM;
693 
694 	iter->frag = skb_shinfo(skb)->frag_list;
695 	skb_frag_list_init(skb);
696 
697 	iter->offset = 0;
698 	iter->hlen = hlen;
699 	iter->frag_id = frag_id;
700 	iter->nexthdr = nexthdr;
701 
702 	__skb_pull(skb, hlen);
703 	fh = __skb_push(skb, sizeof(struct frag_hdr));
704 	__skb_push(skb, hlen);
705 	skb_reset_network_header(skb);
706 	memcpy(skb_network_header(skb), iter->tmp_hdr, hlen);
707 
708 	fh->nexthdr = nexthdr;
709 	fh->reserved = 0;
710 	fh->frag_off = htons(IP6_MF);
711 	fh->identification = frag_id;
712 
713 	first_len = skb_pagelen(skb);
714 	skb->data_len = first_len - skb_headlen(skb);
715 	skb->len = first_len;
716 	ipv6_hdr(skb)->payload_len = htons(first_len - sizeof(struct ipv6hdr));
717 
718 	return 0;
719 }
720 EXPORT_SYMBOL(ip6_fraglist_init);
721 
722 void ip6_fraglist_prepare(struct sk_buff *skb,
723 			  struct ip6_fraglist_iter *iter)
724 {
725 	struct sk_buff *frag = iter->frag;
726 	unsigned int hlen = iter->hlen;
727 	struct frag_hdr *fh;
728 
729 	frag->ip_summed = CHECKSUM_NONE;
730 	skb_reset_transport_header(frag);
731 	fh = __skb_push(frag, sizeof(struct frag_hdr));
732 	__skb_push(frag, hlen);
733 	skb_reset_network_header(frag);
734 	memcpy(skb_network_header(frag), iter->tmp_hdr, hlen);
735 	iter->offset += skb->len - hlen - sizeof(struct frag_hdr);
736 	fh->nexthdr = iter->nexthdr;
737 	fh->reserved = 0;
738 	fh->frag_off = htons(iter->offset);
739 	if (frag->next)
740 		fh->frag_off |= htons(IP6_MF);
741 	fh->identification = iter->frag_id;
742 	ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
743 	ip6_copy_metadata(frag, skb);
744 }
745 EXPORT_SYMBOL(ip6_fraglist_prepare);
746 
747 void ip6_frag_init(struct sk_buff *skb, unsigned int hlen, unsigned int mtu,
748 		   unsigned short needed_tailroom, int hdr_room, u8 *prevhdr,
749 		   u8 nexthdr, __be32 frag_id, struct ip6_frag_state *state)
750 {
751 	state->prevhdr = prevhdr;
752 	state->nexthdr = nexthdr;
753 	state->frag_id = frag_id;
754 
755 	state->hlen = hlen;
756 	state->mtu = mtu;
757 
758 	state->left = skb->len - hlen;	/* Space per frame */
759 	state->ptr = hlen;		/* Where to start from */
760 
761 	state->hroom = hdr_room;
762 	state->troom = needed_tailroom;
763 
764 	state->offset = 0;
765 }
766 EXPORT_SYMBOL(ip6_frag_init);
767 
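/*
 * Slow-path fragmentation: allocate and fill the next fragment described by
 * @state, copying up to state->mtu bytes of payload (8-byte aligned except
 * for the last fragment) from the original skb and building a fresh
 * fragment header.  Returns the new skb, or an ERR_PTR() on allocation
 * failure.
 */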
768 struct sk_buff *ip6_frag_next(struct sk_buff *skb, struct ip6_frag_state *state)
769 {
770 	u8 *prevhdr = state->prevhdr, *fragnexthdr_offset;
771 	struct sk_buff *frag;
772 	struct frag_hdr *fh;
773 	unsigned int len;
774 
775 	len = state->left;
776 	/* IF: it doesn't fit, use 'mtu' - the data space left */
777 	if (len > state->mtu)
778 		len = state->mtu;
779 	/* IF: we are not sending up to and including the packet end
780 	   then align the next start on an eight byte boundary */
781 	if (len < state->left)
782 		len &= ~7;
783 
784 	/* Allocate buffer */
785 	frag = alloc_skb(len + state->hlen + sizeof(struct frag_hdr) +
786 			 state->hroom + state->troom, GFP_ATOMIC);
787 	if (!frag)
788 		return ERR_PTR(-ENOMEM);
789 
790 	/*
791 	 *	Set up data on packet
792 	 */
793 
794 	ip6_copy_metadata(frag, skb);
795 	skb_reserve(frag, state->hroom);
796 	skb_put(frag, len + state->hlen + sizeof(struct frag_hdr));
797 	skb_reset_network_header(frag);
798 	fh = (struct frag_hdr *)(skb_network_header(frag) + state->hlen);
799 	frag->transport_header = (frag->network_header + state->hlen +
800 				  sizeof(struct frag_hdr));
801 
802 	/*
803 	 *	Charge the memory for the fragment to any owner
804 	 *	it might possess
805 	 */
806 	if (skb->sk)
807 		skb_set_owner_w(frag, skb->sk);
808 
809 	/*
810 	 *	Copy the packet header into the new buffer.
811 	 */
812 	skb_copy_from_linear_data(skb, skb_network_header(frag), state->hlen);
813 
814 	fragnexthdr_offset = skb_network_header(frag);
815 	fragnexthdr_offset += prevhdr - skb_network_header(skb);
816 	*fragnexthdr_offset = NEXTHDR_FRAGMENT;
817 
818 	/*
819 	 *	Build fragment header.
820 	 */
821 	fh->nexthdr = state->nexthdr;
822 	fh->reserved = 0;
823 	fh->identification = state->frag_id;
824 
825 	/*
826 	 *	Copy a block of the IP datagram.
827 	 */
828 	BUG_ON(skb_copy_bits(skb, state->ptr, skb_transport_header(frag),
829 			     len));
830 	state->left -= len;
831 
832 	fh->frag_off = htons(state->offset);
833 	if (state->left > 0)
834 		fh->frag_off |= htons(IP6_MF);
835 	ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
836 
837 	state->ptr += len;
838 	state->offset += len;
839 
840 	return frag;
841 }
842 EXPORT_SYMBOL(ip6_frag_next);
843 
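/*
 * Fragment @skb to the path MTU and pass every fragment to @output.  When
 * the skb carries a suitably laid out frag_list the fragments are produced
 * in place with the ip6_fraglist_*() helpers; otherwise each fragment is
 * allocated and copied on the slow path via ip6_frag_next().
 */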
844 int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
845 		 int (*output)(struct net *, struct sock *, struct sk_buff *))
846 {
847 	struct sk_buff *frag;
848 	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
849 	struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
850 				inet6_sk(skb->sk) : NULL;
851 	bool mono_delivery_time = skb->mono_delivery_time;
852 	struct ip6_frag_state state;
853 	unsigned int mtu, hlen, nexthdr_offset;
854 	ktime_t tstamp = skb->tstamp;
855 	int hroom, err = 0;
856 	__be32 frag_id;
857 	u8 *prevhdr, nexthdr = 0;
858 
859 	err = ip6_find_1stfragopt(skb, &prevhdr);
860 	if (err < 0)
861 		goto fail;
862 	hlen = err;
863 	nexthdr = *prevhdr;
864 	nexthdr_offset = prevhdr - skb_network_header(skb);
865 
866 	mtu = ip6_skb_dst_mtu(skb);
867 
868 	/* We must not fragment if the socket is set to force MTU discovery
869 	 * or if the skb is not generated by a local socket.
870 	 */
871 	if (unlikely(!skb->ignore_df && skb->len > mtu))
872 		goto fail_toobig;
873 
874 	if (IP6CB(skb)->frag_max_size) {
875 		if (IP6CB(skb)->frag_max_size > mtu)
876 			goto fail_toobig;
877 
878 		/* don't send fragments larger than what we received */
879 		mtu = IP6CB(skb)->frag_max_size;
880 		if (mtu < IPV6_MIN_MTU)
881 			mtu = IPV6_MIN_MTU;
882 	}
883 
884 	if (np && np->frag_size < mtu) {
885 		if (np->frag_size)
886 			mtu = np->frag_size;
887 	}
888 	if (mtu < hlen + sizeof(struct frag_hdr) + 8)
889 		goto fail_toobig;
890 	mtu -= hlen + sizeof(struct frag_hdr);
891 
892 	frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
893 				    &ipv6_hdr(skb)->saddr);
894 
895 	if (skb->ip_summed == CHECKSUM_PARTIAL &&
896 	    (err = skb_checksum_help(skb)))
897 		goto fail;
898 
899 	prevhdr = skb_network_header(skb) + nexthdr_offset;
900 	hroom = LL_RESERVED_SPACE(rt->dst.dev);
901 	if (skb_has_frag_list(skb)) {
902 		unsigned int first_len = skb_pagelen(skb);
903 		struct ip6_fraglist_iter iter;
904 		struct sk_buff *frag2;
905 
906 		if (first_len - hlen > mtu ||
907 		    ((first_len - hlen) & 7) ||
908 		    skb_cloned(skb) ||
909 		    skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
910 			goto slow_path;
911 
912 		skb_walk_frags(skb, frag) {
913 			/* Correct geometry. */
914 			if (frag->len > mtu ||
915 			    ((frag->len & 7) && frag->next) ||
916 			    skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
917 				goto slow_path_clean;
918 
919 			/* Partially cloned skb? */
920 			if (skb_shared(frag))
921 				goto slow_path_clean;
922 
923 			BUG_ON(frag->sk);
924 			if (skb->sk) {
925 				frag->sk = skb->sk;
926 				frag->destructor = sock_wfree;
927 			}
928 			skb->truesize -= frag->truesize;
929 		}
930 
931 		err = ip6_fraglist_init(skb, hlen, prevhdr, nexthdr, frag_id,
932 					&iter);
933 		if (err < 0)
934 			goto fail;
935 
936 		/* We prevent @rt from being freed. */
937 		rcu_read_lock();
938 
939 		for (;;) {
940 			/* Prepare header of the next frame,
941 			 * before the previous one goes down. */
942 			if (iter.frag)
943 				ip6_fraglist_prepare(skb, &iter);
944 
945 			skb_set_delivery_time(skb, tstamp, mono_delivery_time);
946 			err = output(net, sk, skb);
947 			if (!err)
948 				IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
949 					      IPSTATS_MIB_FRAGCREATES);
950 
951 			if (err || !iter.frag)
952 				break;
953 
954 			skb = ip6_fraglist_next(&iter);
955 		}
956 
957 		kfree(iter.tmp_hdr);
958 
959 		if (err == 0) {
960 			IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
961 				      IPSTATS_MIB_FRAGOKS);
962 			rcu_read_unlock();
963 			return 0;
964 		}
965 
966 		kfree_skb_list(iter.frag);
967 
968 		IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
969 			      IPSTATS_MIB_FRAGFAILS);
970 		rcu_read_unlock();
971 		return err;
972 
973 slow_path_clean:
974 		skb_walk_frags(skb, frag2) {
975 			if (frag2 == frag)
976 				break;
977 			frag2->sk = NULL;
978 			frag2->destructor = NULL;
979 			skb->truesize += frag2->truesize;
980 		}
981 	}
982 
983 slow_path:
984 	/*
985 	 *	Fragment the datagram.
986 	 */
987 
988 	ip6_frag_init(skb, hlen, mtu, rt->dst.dev->needed_tailroom,
989 		      LL_RESERVED_SPACE(rt->dst.dev), prevhdr, nexthdr, frag_id,
990 		      &state);
991 
992 	/*
993 	 *	Keep copying data until we run out.
994 	 */
995 
996 	while (state.left > 0) {
997 		frag = ip6_frag_next(skb, &state);
998 		if (IS_ERR(frag)) {
999 			err = PTR_ERR(frag);
1000 			goto fail;
1001 		}
1002 
1003 		/*
1004 		 *	Put this fragment into the sending queue.
1005 		 */
1006 		skb_set_delivery_time(frag, tstamp, mono_delivery_time);
1007 		err = output(net, sk, frag);
1008 		if (err)
1009 			goto fail;
1010 
1011 		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
1012 			      IPSTATS_MIB_FRAGCREATES);
1013 	}
1014 	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
1015 		      IPSTATS_MIB_FRAGOKS);
1016 	consume_skb(skb);
1017 	return err;
1018 
1019 fail_toobig:
1020 	if (skb->sk && dst_allfrag(skb_dst(skb)))
1021 		sk_gso_disable(skb->sk);
1022 
1023 	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
1024 	err = -EMSGSIZE;
1025 
1026 fail:
1027 	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
1028 		      IPSTATS_MIB_FRAGFAILS);
1029 	kfree_skb(skb);
1030 	return err;
1031 }
1032 
1033 static inline int ip6_rt_check(const struct rt6key *rt_key,
1034 			       const struct in6_addr *fl_addr,
1035 			       const struct in6_addr *addr_cache)
1036 {
1037 	return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
1038 		(!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
1039 }
1040 
1041 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
1042 					  struct dst_entry *dst,
1043 					  const struct flowi6 *fl6)
1044 {
1045 	struct ipv6_pinfo *np = inet6_sk(sk);
1046 	struct rt6_info *rt;
1047 
1048 	if (!dst)
1049 		goto out;
1050 
1051 	if (dst->ops->family != AF_INET6) {
1052 		dst_release(dst);
1053 		return NULL;
1054 	}
1055 
1056 	rt = (struct rt6_info *)dst;
1057 	/* Yes, checking route validity in the unconnected
1058 	 * case is not very simple. Take into account
1059 	 * that we do not support routing by source, TOS,
1060 	 * and MSG_DONTROUTE		--ANK (980726)
1061 	 *
1062 	 * 1. ip6_rt_check(): If the route was a host route,
1063 	 *    check that the cached destination is current.
1064 	 *    If it is a network route, we may still
1065 	 *    check its validity using a saved pointer
1066 	 *    to the last used address: daddr_cache.
1067 	 *    We do not want to save the whole address now
1068 	 *    (because the main consumer of this service
1069 	 *    is tcp, which does not have this problem),
1070 	 *    so the last trick works only on connected
1071 	 *    sockets.
1072 	 * 2. The oif should also be the same.
1073 	 */
1074 	if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
1075 #ifdef CONFIG_IPV6_SUBTREES
1076 	    ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
1077 #endif
1078 	   (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) {
1079 		dst_release(dst);
1080 		dst = NULL;
1081 	}
1082 
1083 out:
1084 	return dst;
1085 }
1086 
1087 static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
1088 			       struct dst_entry **dst, struct flowi6 *fl6)
1089 {
1090 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
1091 	struct neighbour *n;
1092 	struct rt6_info *rt;
1093 #endif
1094 	int err;
1095 	int flags = 0;
1096 
1097 	/* The correct way to handle this would be to do
1098 	 * ip6_route_get_saddr, and then ip6_route_output; however,
1099 	 * the route-specific preferred source forces the
1100 	 * ip6_route_output call _before_ ip6_route_get_saddr.
1101 	 *
1102 	 * In source specific routing (no src=any default route),
1103 	 * ip6_route_output will fail given src=any saddr, though, so
1104 	 * that's why we try it again later.
1105 	 */
1106 	if (ipv6_addr_any(&fl6->saddr)) {
1107 		struct fib6_info *from;
1108 		struct rt6_info *rt;
1109 
1110 		*dst = ip6_route_output(net, sk, fl6);
1111 		rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;
1112 
1113 		rcu_read_lock();
1114 		from = rt ? rcu_dereference(rt->from) : NULL;
1115 		err = ip6_route_get_saddr(net, from, &fl6->daddr,
1116 					  sk ? inet6_sk(sk)->srcprefs : 0,
1117 					  &fl6->saddr);
1118 		rcu_read_unlock();
1119 
1120 		if (err)
1121 			goto out_err_release;
1122 
1123 		/* If we had an erroneous initial result, pretend it
1124 		 * never existed and let the SA-enabled version take
1125 		 * over.
1126 		 */
1127 		if ((*dst)->error) {
1128 			dst_release(*dst);
1129 			*dst = NULL;
1130 		}
1131 
1132 		if (fl6->flowi6_oif)
1133 			flags |= RT6_LOOKUP_F_IFACE;
1134 	}
1135 
1136 	if (!*dst)
1137 		*dst = ip6_route_output_flags(net, sk, fl6, flags);
1138 
1139 	err = (*dst)->error;
1140 	if (err)
1141 		goto out_err_release;
1142 
1143 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
1144 	/*
1145 	 * Here if the dst entry we've looked up
1146 	 * has a neighbour entry that is in the INCOMPLETE
1147 	 * state and the src address from the flow is
1148 	 * marked as OPTIMISTIC, we release the found
1149 	 * dst entry and replace it instead with the
1150 	 * dst entry of the nexthop router
1151 	 */
1152 	rt = (struct rt6_info *) *dst;
1153 	rcu_read_lock();
1154 	n = __ipv6_neigh_lookup_noref(rt->dst.dev,
1155 				      rt6_nexthop(rt, &fl6->daddr));
1156 	err = n && !(READ_ONCE(n->nud_state) & NUD_VALID) ? -EINVAL : 0;
1157 	rcu_read_unlock();
1158 
1159 	if (err) {
1160 		struct inet6_ifaddr *ifp;
1161 		struct flowi6 fl_gw6;
1162 		int redirect;
1163 
1164 		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
1165 				      (*dst)->dev, 1);
1166 
1167 		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
1168 		if (ifp)
1169 			in6_ifa_put(ifp);
1170 
1171 		if (redirect) {
1172 			/*
1173 			 * We need to get the dst entry for the
1174 			 * default router instead
1175 			 */
1176 			dst_release(*dst);
1177 			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
1178 			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
1179 			*dst = ip6_route_output(net, sk, &fl_gw6);
1180 			err = (*dst)->error;
1181 			if (err)
1182 				goto out_err_release;
1183 		}
1184 	}
1185 #endif
1186 	if (ipv6_addr_v4mapped(&fl6->saddr) &&
1187 	    !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
1188 		err = -EAFNOSUPPORT;
1189 		goto out_err_release;
1190 	}
1191 
1192 	return 0;
1193 
1194 out_err_release:
1195 	dst_release(*dst);
1196 	*dst = NULL;
1197 
1198 	if (err == -ENETUNREACH)
1199 		IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
1200 	return err;
1201 }
1202 
1203 /**
1204  *	ip6_dst_lookup - perform route lookup on flow
1205  *	@net: Network namespace to perform lookup in
1206  *	@sk: socket which provides route info
1207  *	@dst: pointer to dst_entry * for result
1208  *	@fl6: flow to lookup
1209  *
1210  *	This function performs a route lookup on the given flow.
1211  *
1212  *	It returns zero on success, or a standard errno code on error.
1213  */
1214 int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
1215 		   struct flowi6 *fl6)
1216 {
1217 	*dst = NULL;
1218 	return ip6_dst_lookup_tail(net, sk, dst, fl6);
1219 }
1220 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1221 
1222 /**
1223  *	ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1224  *	@net: Network namespace to perform lookup in
1225  *	@sk: socket which provides route info
1226  *	@fl6: flow to lookup
1227  *	@final_dst: final destination address for ipsec lookup
1228  *
1229  *	This function performs a route lookup on the given flow.
1230  *
1231  *	It returns a valid dst pointer on success, or a pointer encoded
1232  *	error code.
1233  */
1234 struct dst_entry *ip6_dst_lookup_flow(struct net *net, const struct sock *sk, struct flowi6 *fl6,
1235 				      const struct in6_addr *final_dst)
1236 {
1237 	struct dst_entry *dst = NULL;
1238 	int err;
1239 
1240 	err = ip6_dst_lookup_tail(net, sk, &dst, fl6);
1241 	if (err)
1242 		return ERR_PTR(err);
1243 	if (final_dst)
1244 		fl6->daddr = *final_dst;
1245 
1246 	return xfrm_lookup_route(net, dst, flowi6_to_flowi(fl6), sk, 0);
1247 }
1248 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
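
/*
 * Illustrative call sequence for ip6_dst_lookup_flow() (a sketch; real
 * callers such as TCP add their own flow setup and error handling):
 *
 *	fl6.flowi6_proto = IPPROTO_TCP;
 *	fl6.daddr = *daddr;
 *	fl6.saddr = *saddr;
 *	dst = ip6_dst_lookup_flow(net, sk, &fl6, final_p);
 *	if (IS_ERR(dst))
 *		return PTR_ERR(dst);
 */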
1249 
1250 /**
1251  *	ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1252  *	@sk: socket which provides the dst cache and route info
1253  *	@fl6: flow to lookup
1254  *	@final_dst: final destination address for ipsec lookup
1255  *	@connected: whether @sk is connected or not
1256  *
1257  *	This function performs a route lookup on the given flow with the
1258  *	possibility of using the cached route in the socket if it is valid.
1259  *	It will take the socket dst lock when operating on the dst cache.
1260  *	As a result, this function can only be used in process context.
1261  *
1262  *	In addition, for a connected socket, cache the dst in the socket
1263  *	if the current cache is not valid.
1264  *
1265  *	It returns a valid dst pointer on success, or a pointer encoded
1266  *	error code.
1267  */
1268 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1269 					 const struct in6_addr *final_dst,
1270 					 bool connected)
1271 {
1272 	struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1273 
1274 	dst = ip6_sk_dst_check(sk, dst, fl6);
1275 	if (dst)
1276 		return dst;
1277 
1278 	dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_dst);
1279 	if (connected && !IS_ERR(dst))
1280 		ip6_sk_dst_store_flow(sk, dst_clone(dst), fl6);
1281 
1282 	return dst;
1283 }
1284 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1285 
1286 /**
1287  *      ip6_dst_lookup_tunnel - perform route lookup on tunnel
1288  *      @skb: Packet for which lookup is done
1289  *      @dev: Tunnel device
1290  *      @net: Network namespace of tunnel device
1291  *      @sock: Socket which provides route info
1292  *      @saddr: Memory to store the src ip address
1293  *      @info: Tunnel information
1294  *      @protocol: IP protocol
1295  *      @use_cache: Flag to enable cache usage
1296  *      This function performs a route lookup on a tunnel
1297  *
1298  *      It returns a valid dst pointer and stores src address to be used in
1299  *      tunnel in param saddr on success, else a pointer encoded error code.
1300  */
1301 
1302 struct dst_entry *ip6_dst_lookup_tunnel(struct sk_buff *skb,
1303 					struct net_device *dev,
1304 					struct net *net,
1305 					struct socket *sock,
1306 					struct in6_addr *saddr,
1307 					const struct ip_tunnel_info *info,
1308 					u8 protocol,
1309 					bool use_cache)
1310 {
1311 	struct dst_entry *dst = NULL;
1312 #ifdef CONFIG_DST_CACHE
1313 	struct dst_cache *dst_cache;
1314 #endif
1315 	struct flowi6 fl6;
1316 	__u8 prio;
1317 
1318 #ifdef CONFIG_DST_CACHE
1319 	dst_cache = (struct dst_cache *)&info->dst_cache;
1320 	if (use_cache) {
1321 		dst = dst_cache_get_ip6(dst_cache, saddr);
1322 		if (dst)
1323 			return dst;
1324 	}
1325 #endif
1326 	memset(&fl6, 0, sizeof(fl6));
1327 	fl6.flowi6_mark = skb->mark;
1328 	fl6.flowi6_proto = protocol;
1329 	fl6.daddr = info->key.u.ipv6.dst;
1330 	fl6.saddr = info->key.u.ipv6.src;
1331 	prio = info->key.tos;
1332 	fl6.flowlabel = ip6_make_flowinfo(prio, info->key.label);
1333 
1334 	dst = ipv6_stub->ipv6_dst_lookup_flow(net, sock->sk, &fl6,
1335 					      NULL);
1336 	if (IS_ERR(dst)) {
1337 		netdev_dbg(dev, "no route to %pI6\n", &fl6.daddr);
1338 		return ERR_PTR(-ENETUNREACH);
1339 	}
1340 	if (dst->dev == dev) { /* is this necessary? */
1341 		netdev_dbg(dev, "circular route to %pI6\n", &fl6.daddr);
1342 		dst_release(dst);
1343 		return ERR_PTR(-ELOOP);
1344 	}
1345 #ifdef CONFIG_DST_CACHE
1346 	if (use_cache)
1347 		dst_cache_set_ip6(dst_cache, dst, &fl6.saddr);
1348 #endif
1349 	*saddr = fl6.saddr;
1350 	return dst;
1351 }
1352 EXPORT_SYMBOL_GPL(ip6_dst_lookup_tunnel);
1353 
1354 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1355 					       gfp_t gfp)
1356 {
1357 	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1358 }
1359 
1360 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1361 						gfp_t gfp)
1362 {
1363 	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1364 }
1365 
1366 static void ip6_append_data_mtu(unsigned int *mtu,
1367 				int *maxfraglen,
1368 				unsigned int fragheaderlen,
1369 				struct sk_buff *skb,
1370 				struct rt6_info *rt,
1371 				unsigned int orig_mtu)
1372 {
1373 	if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1374 		if (!skb) {
1375 			/* first fragment, reserve header_len */
1376 			*mtu = orig_mtu - rt->dst.header_len;
1377 
1378 		} else {
1379 			/*
1380 			 * this fragment is not the first one, so the
1381 			 * header space is regarded as data space.
1382 			 */
1383 			*mtu = orig_mtu;
1384 		}
1385 		*maxfraglen = ((*mtu - fragheaderlen) & ~7)
1386 			      + fragheaderlen - sizeof(struct frag_hdr);
1387 	}
1388 }
1389 
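/*
 * Prepare the cork state for a run of ip6_append_data() calls: take over
 * the dst reference passed by the caller, duplicate the supplied tx
 * options, and record hop limit, traffic class, fragment size (MTU), GSO
 * size, mark, timestamping flags and transmit time.
 */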
1390 static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
1391 			  struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
1392 			  struct rt6_info *rt)
1393 {
1394 	struct ipv6_pinfo *np = inet6_sk(sk);
1395 	unsigned int mtu;
1396 	struct ipv6_txoptions *nopt, *opt = ipc6->opt;
1397 
1398 	/* callers pass dst together with a reference, set it first so
1399 	 * ip6_cork_release() can put it down even in case of an error.
1400 	 */
1401 	cork->base.dst = &rt->dst;
1402 
1403 	/*
1404 	 * setup for corking
1405 	 */
1406 	if (opt) {
1407 		if (WARN_ON(v6_cork->opt))
1408 			return -EINVAL;
1409 
1410 		nopt = v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
1411 		if (unlikely(!nopt))
1412 			return -ENOBUFS;
1413 
1414 		nopt->tot_len = sizeof(*opt);
1415 		nopt->opt_flen = opt->opt_flen;
1416 		nopt->opt_nflen = opt->opt_nflen;
1417 
1418 		nopt->dst0opt = ip6_opt_dup(opt->dst0opt, sk->sk_allocation);
1419 		if (opt->dst0opt && !nopt->dst0opt)
1420 			return -ENOBUFS;
1421 
1422 		nopt->dst1opt = ip6_opt_dup(opt->dst1opt, sk->sk_allocation);
1423 		if (opt->dst1opt && !nopt->dst1opt)
1424 			return -ENOBUFS;
1425 
1426 		nopt->hopopt = ip6_opt_dup(opt->hopopt, sk->sk_allocation);
1427 		if (opt->hopopt && !nopt->hopopt)
1428 			return -ENOBUFS;
1429 
1430 		nopt->srcrt = ip6_rthdr_dup(opt->srcrt, sk->sk_allocation);
1431 		if (opt->srcrt && !nopt->srcrt)
1432 			return -ENOBUFS;
1433 
1434 		/* need source address above --miyazawa */
1435 	}
1436 	v6_cork->hop_limit = ipc6->hlimit;
1437 	v6_cork->tclass = ipc6->tclass;
1438 	if (rt->dst.flags & DST_XFRM_TUNNEL)
1439 		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1440 		      READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst);
1441 	else
1442 		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1443 			READ_ONCE(rt->dst.dev->mtu) : dst_mtu(xfrm_dst_path(&rt->dst));
1444 	if (np->frag_size < mtu) {
1445 		if (np->frag_size)
1446 			mtu = np->frag_size;
1447 	}
1448 	cork->base.fragsize = mtu;
1449 	cork->base.gso_size = ipc6->gso_size;
1450 	cork->base.tx_flags = 0;
1451 	cork->base.mark = ipc6->sockc.mark;
1452 	sock_tx_timestamp(sk, ipc6->sockc.tsflags, &cork->base.tx_flags);
1453 
1454 	if (dst_allfrag(xfrm_dst_path(&rt->dst)))
1455 		cork->base.flags |= IPCORK_ALLFRAG;
1456 	cork->base.length = 0;
1457 
1458 	cork->base.transmit_time = ipc6->sockc.transmit_time;
1459 
1460 	return 0;
1461 }
1462 
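/*
 * Core of ip6_append_data(): append @length bytes from @from to @queue via
 * @getfrag, filling the tail skb where possible and allocating new,
 * MTU-sized skbs otherwise.  Handles MSG_ZEROCOPY, MSG_SPLICE_PAGES and
 * page-frag coalescing; the added truesize is accounted against the
 * socket's write allocation.
 */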
1463 static int __ip6_append_data(struct sock *sk,
1464 			     struct sk_buff_head *queue,
1465 			     struct inet_cork_full *cork_full,
1466 			     struct inet6_cork *v6_cork,
1467 			     struct page_frag *pfrag,
1468 			     int getfrag(void *from, char *to, int offset,
1469 					 int len, int odd, struct sk_buff *skb),
1470 			     void *from, size_t length, int transhdrlen,
1471 			     unsigned int flags, struct ipcm6_cookie *ipc6)
1472 {
1473 	struct sk_buff *skb, *skb_prev = NULL;
1474 	struct inet_cork *cork = &cork_full->base;
1475 	struct flowi6 *fl6 = &cork_full->fl.u.ip6;
1476 	unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
1477 	struct ubuf_info *uarg = NULL;
1478 	int exthdrlen = 0;
1479 	int dst_exthdrlen = 0;
1480 	int hh_len;
1481 	int copy;
1482 	int err;
1483 	int offset = 0;
1484 	bool zc = false;
1485 	u32 tskey = 0;
1486 	struct rt6_info *rt = (struct rt6_info *)cork->dst;
1487 	struct ipv6_txoptions *opt = v6_cork->opt;
1488 	int csummode = CHECKSUM_NONE;
1489 	unsigned int maxnonfragsize, headersize;
1490 	unsigned int wmem_alloc_delta = 0;
1491 	bool paged, extra_uref = false;
1492 
1493 	skb = skb_peek_tail(queue);
1494 	if (!skb) {
1495 		exthdrlen = opt ? opt->opt_flen : 0;
1496 		dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
1497 	}
1498 
1499 	paged = !!cork->gso_size;
1500 	mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize;
1501 	orig_mtu = mtu;
1502 
1503 	if (cork->tx_flags & SKBTX_ANY_TSTAMP &&
1504 	    sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
1505 		tskey = atomic_inc_return(&sk->sk_tskey) - 1;
1506 
1507 	hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1508 
1509 	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1510 			(opt ? opt->opt_nflen : 0);
1511 
1512 	headersize = sizeof(struct ipv6hdr) +
1513 		     (opt ? opt->opt_flen + opt->opt_nflen : 0) +
1514 		     (dst_allfrag(&rt->dst) ?
1515 		      sizeof(struct frag_hdr) : 0) +
1516 		     rt->rt6i_nfheader_len;
1517 
1518 	if (mtu <= fragheaderlen ||
1519 	    ((mtu - fragheaderlen) & ~7) + fragheaderlen <= sizeof(struct frag_hdr))
1520 		goto emsgsize;
1521 
1522 	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
1523 		     sizeof(struct frag_hdr);
1524 
1525 	/* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit
1526 	 * in the first fragment
1527 	 */
1528 	if (headersize + transhdrlen > mtu)
1529 		goto emsgsize;
1530 
1531 	if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
1532 	    (sk->sk_protocol == IPPROTO_UDP ||
1533 	     sk->sk_protocol == IPPROTO_ICMPV6 ||
1534 	     sk->sk_protocol == IPPROTO_RAW)) {
1535 		ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
1536 				sizeof(struct ipv6hdr));
1537 		goto emsgsize;
1538 	}
1539 
1540 	if (ip6_sk_ignore_df(sk))
1541 		maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
1542 	else
1543 		maxnonfragsize = mtu;
1544 
1545 	if (cork->length + length > maxnonfragsize - headersize) {
1546 emsgsize:
1547 		pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0);
1548 		ipv6_local_error(sk, EMSGSIZE, fl6, pmtu);
1549 		return -EMSGSIZE;
1550 	}
1551 
1552 	/* CHECKSUM_PARTIAL only with no extension headers and when
1553 	 * we are not going to fragment
1554 	 */
1555 	if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
1556 	    headersize == sizeof(struct ipv6hdr) &&
1557 	    length <= mtu - headersize &&
1558 	    (!(flags & MSG_MORE) || cork->gso_size) &&
1559 	    rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
1560 		csummode = CHECKSUM_PARTIAL;
1561 
1562 	if ((flags & MSG_ZEROCOPY) && length) {
1563 		struct msghdr *msg = from;
1564 
1565 		if (getfrag == ip_generic_getfrag && msg->msg_ubuf) {
1566 			if (skb_zcopy(skb) && msg->msg_ubuf != skb_zcopy(skb))
1567 				return -EINVAL;
1568 
1569 			/* Leave uarg NULL if can't zerocopy, callers should
1570 			 * be able to handle it.
1571 			 */
1572 			if ((rt->dst.dev->features & NETIF_F_SG) &&
1573 			    csummode == CHECKSUM_PARTIAL) {
1574 				paged = true;
1575 				zc = true;
1576 				uarg = msg->msg_ubuf;
1577 			}
1578 		} else if (sock_flag(sk, SOCK_ZEROCOPY)) {
1579 			uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb));
1580 			if (!uarg)
1581 				return -ENOBUFS;
1582 			extra_uref = !skb_zcopy(skb);	/* only ref on new uarg */
1583 			if (rt->dst.dev->features & NETIF_F_SG &&
1584 			    csummode == CHECKSUM_PARTIAL) {
1585 				paged = true;
1586 				zc = true;
1587 			} else {
1588 				uarg_to_msgzc(uarg)->zerocopy = 0;
1589 				skb_zcopy_set(skb, uarg, &extra_uref);
1590 			}
1591 		}
1592 	} else if ((flags & MSG_SPLICE_PAGES) && length) {
1593 		if (inet_sk(sk)->hdrincl)
1594 			return -EPERM;
1595 		if (rt->dst.dev->features & NETIF_F_SG)
1596 			/* We need an empty buffer to attach stuff to */
1597 			paged = true;
1598 		else
1599 			flags &= ~MSG_SPLICE_PAGES;
1600 	}
1601 
1602 	/*
1603 	 * Let's try using as much space as possible.
1604 	 * Use MTU if total length of the message fits into the MTU.
1605 	 * Otherwise, we need to reserve fragment header and
1606 	 * fragment alignment (= 8-15 octets, in total).
1607 	 *
1608 	 * Note that we may need to "move" the data from the tail
1609 	 * of the buffer to the new fragment when we split
1610 	 * the message.
1611 	 *
1612 	 * FIXME: It may be fragmented into multiple chunks
1613 	 *        at once if non-fragmentable extension headers
1614 	 *        are too large.
1615 	 * --yoshfuji
1616 	 */
1617 
1618 	cork->length += length;
1619 	if (!skb)
1620 		goto alloc_new_skb;
1621 
1622 	while (length > 0) {
1623 		/* Check if the remaining data fits into current packet. */
1624 		copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1625 		if (copy < length)
1626 			copy = maxfraglen - skb->len;
1627 
1628 		if (copy <= 0) {
1629 			char *data;
1630 			unsigned int datalen;
1631 			unsigned int fraglen;
1632 			unsigned int fraggap;
1633 			unsigned int alloclen, alloc_extra;
1634 			unsigned int pagedlen;
1635 alloc_new_skb:
1636 			/* There's no room in the current skb */
1637 			if (skb)
1638 				fraggap = skb->len - maxfraglen;
1639 			else
1640 				fraggap = 0;
1641 			/* update mtu and maxfraglen if necessary */
1642 			if (!skb || !skb_prev)
1643 				ip6_append_data_mtu(&mtu, &maxfraglen,
1644 						    fragheaderlen, skb, rt,
1645 						    orig_mtu);
1646 
1647 			skb_prev = skb;
1648 
1649 			/*
1650 			 * If remaining data exceeds the mtu,
1651 			 * we know we need more fragment(s).
1652 			 */
1653 			datalen = length + fraggap;
1654 
1655 			if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1656 				datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
1657 			fraglen = datalen + fragheaderlen;
1658 			pagedlen = 0;
1659 
1660 			alloc_extra = hh_len;
1661 			alloc_extra += dst_exthdrlen;
1662 			alloc_extra += rt->dst.trailer_len;
1663 
1664 			/* We just reserve space for fragment header.
1665 			 * Note: this may be overallocation if the message
1666 			 * (without MSG_MORE) fits into the MTU.
1667 			 */
1668 			alloc_extra += sizeof(struct frag_hdr);
1669 
1670 			if ((flags & MSG_MORE) &&
1671 			    !(rt->dst.dev->features&NETIF_F_SG))
1672 				alloclen = mtu;
1673 			else if (!paged &&
1674 				 (fraglen + alloc_extra < SKB_MAX_ALLOC ||
1675 				  !(rt->dst.dev->features & NETIF_F_SG)))
1676 				alloclen = fraglen;
1677 			else {
1678 				alloclen = fragheaderlen + transhdrlen;
1679 				pagedlen = datalen - transhdrlen;
1680 			}
1681 			alloclen += alloc_extra;
1682 
1683 			if (datalen != length + fraggap) {
1684 				/*
1685 				 * this is not the last fragment, so the
1686 				 * trailer space is regarded as data space.
1687 				 */
1688 				datalen += rt->dst.trailer_len;
1689 			}
1690 
1691 			fraglen = datalen + fragheaderlen;
1692 
1693 			copy = datalen - transhdrlen - fraggap - pagedlen;
1694 			if (copy < 0) {
1695 				err = -EINVAL;
1696 				goto error;
1697 			}
1698 			if (transhdrlen) {
1699 				skb = sock_alloc_send_skb(sk, alloclen,
1700 						(flags & MSG_DONTWAIT), &err);
1701 			} else {
1702 				skb = NULL;
1703 				if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <=
1704 				    2 * sk->sk_sndbuf)
1705 					skb = alloc_skb(alloclen,
1706 							sk->sk_allocation);
1707 				if (unlikely(!skb))
1708 					err = -ENOBUFS;
1709 			}
1710 			if (!skb)
1711 				goto error;
1712 			/*
1713 			 *	Fill in the control structures
1714 			 */
1715 			skb->protocol = htons(ETH_P_IPV6);
1716 			skb->ip_summed = csummode;
1717 			skb->csum = 0;
1718 			/* reserve for fragmentation and ipsec header */
1719 			skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1720 				    dst_exthdrlen);
1721 
1722 			/*
1723 			 *	Find where to start putting bytes
1724 			 */
1725 			data = skb_put(skb, fraglen - pagedlen);
1726 			skb_set_network_header(skb, exthdrlen);
1727 			data += fragheaderlen;
1728 			skb->transport_header = (skb->network_header +
1729 						 fragheaderlen);
1730 			if (fraggap) {
1731 				skb->csum = skb_copy_and_csum_bits(
1732 					skb_prev, maxfraglen,
1733 					data + transhdrlen, fraggap);
1734 				skb_prev->csum = csum_sub(skb_prev->csum,
1735 							  skb->csum);
1736 				data += fraggap;
1737 				pskb_trim_unique(skb_prev, maxfraglen);
1738 			}
1739 			if (copy > 0 &&
1740 			    getfrag(from, data + transhdrlen, offset,
1741 				    copy, fraggap, skb) < 0) {
1742 				err = -EFAULT;
1743 				kfree_skb(skb);
1744 				goto error;
1745 			}
1746 
1747 			offset += copy;
1748 			length -= copy + transhdrlen;
1749 			transhdrlen = 0;
1750 			exthdrlen = 0;
1751 			dst_exthdrlen = 0;
1752 
1753 			/* Only the initial fragment is time stamped */
1754 			skb_shinfo(skb)->tx_flags = cork->tx_flags;
1755 			cork->tx_flags = 0;
1756 			skb_shinfo(skb)->tskey = tskey;
1757 			tskey = 0;
1758 			skb_zcopy_set(skb, uarg, &extra_uref);
1759 
1760 			if ((flags & MSG_CONFIRM) && !skb_prev)
1761 				skb_set_dst_pending_confirm(skb, 1);
1762 
1763 			/*
1764 			 * Put the packet on the pending queue
1765 			 */
1766 			if (!skb->destructor) {
1767 				skb->destructor = sock_wfree;
1768 				skb->sk = sk;
1769 				wmem_alloc_delta += skb->truesize;
1770 			}
1771 			__skb_queue_tail(queue, skb);
1772 			continue;
1773 		}
1774 
1775 		if (copy > length)
1776 			copy = length;
1777 
1778 		if (!(rt->dst.dev->features&NETIF_F_SG) &&
1779 		    skb_tailroom(skb) >= copy) {
1780 			unsigned int off;
1781 
1782 			off = skb->len;
1783 			if (getfrag(from, skb_put(skb, copy),
1784 						offset, copy, off, skb) < 0) {
1785 				__skb_trim(skb, off);
1786 				err = -EFAULT;
1787 				goto error;
1788 			}
1789 		} else if (flags & MSG_SPLICE_PAGES) {
1790 			struct msghdr *msg = from;
1791 
1792 			err = skb_splice_from_iter(skb, &msg->msg_iter, copy,
1793 						   sk->sk_allocation);
1794 			if (err < 0)
1795 				goto error;
1796 			copy = err;
1797 			wmem_alloc_delta += copy;
1798 		} else if (!zc) {
1799 			int i = skb_shinfo(skb)->nr_frags;
1800 
1801 			err = -ENOMEM;
1802 			if (!sk_page_frag_refill(sk, pfrag))
1803 				goto error;
1804 
1805 			skb_zcopy_downgrade_managed(skb);
1806 			if (!skb_can_coalesce(skb, i, pfrag->page,
1807 					      pfrag->offset)) {
1808 				err = -EMSGSIZE;
1809 				if (i == MAX_SKB_FRAGS)
1810 					goto error;
1811 
1812 				__skb_fill_page_desc(skb, i, pfrag->page,
1813 						     pfrag->offset, 0);
1814 				skb_shinfo(skb)->nr_frags = ++i;
1815 				get_page(pfrag->page);
1816 			}
1817 			copy = min_t(int, copy, pfrag->size - pfrag->offset);
1818 			if (getfrag(from,
1819 				    page_address(pfrag->page) + pfrag->offset,
1820 				    offset, copy, skb->len, skb) < 0)
1821 				goto error_efault;
1822 
1823 			pfrag->offset += copy;
1824 			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1825 			skb->len += copy;
1826 			skb->data_len += copy;
1827 			skb->truesize += copy;
1828 			wmem_alloc_delta += copy;
1829 		} else {
1830 			err = skb_zerocopy_iter_dgram(skb, from, copy);
1831 			if (err < 0)
1832 				goto error;
1833 		}
1834 		offset += copy;
1835 		length -= copy;
1836 	}
1837 
1838 	if (wmem_alloc_delta)
1839 		refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1840 	return 0;
1841 
1842 error_efault:
1843 	err = -EFAULT;
1844 error:
1845 	net_zcopy_put_abort(uarg, extra_uref);
1846 	cork->length -= length;
1847 	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1848 	refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1849 	return err;
1850 }
1851 
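/*
 * ip6_append_data() queues data for later transmission on the socket.
 * The first call (empty sk_write_queue) sets up the cork state from the
 * supplied ipcm6 cookie, flow and route, and folds the extension header
 * length into length/transhdrlen; subsequent calls only append.  Nothing
 * is sent here: the queued fragments are built into a packet and sent by
 * ip6_push_pending_frames().  MSG_PROBE is a no-op.
 */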
1852 int ip6_append_data(struct sock *sk,
1853 		    int getfrag(void *from, char *to, int offset, int len,
1854 				int odd, struct sk_buff *skb),
1855 		    void *from, size_t length, int transhdrlen,
1856 		    struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1857 		    struct rt6_info *rt, unsigned int flags)
1858 {
1859 	struct inet_sock *inet = inet_sk(sk);
1860 	struct ipv6_pinfo *np = inet6_sk(sk);
1861 	int exthdrlen;
1862 	int err;
1863 
1864 	if (flags & MSG_PROBE)
1865 		return 0;
1866 	if (skb_queue_empty(&sk->sk_write_queue)) {
1867 		/*
1868 		 * setup for corking
1869 		 */
1870 		dst_hold(&rt->dst);
1871 		err = ip6_setup_cork(sk, &inet->cork, &np->cork,
1872 				     ipc6, rt);
1873 		if (err)
1874 			return err;
1875 
1876 		inet->cork.fl.u.ip6 = *fl6;
1877 		exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1878 		length += exthdrlen;
1879 		transhdrlen += exthdrlen;
1880 	} else {
1881 		transhdrlen = 0;
1882 	}
1883 
1884 	return __ip6_append_data(sk, &sk->sk_write_queue, &inet->cork,
1885 				 &np->cork, sk_page_frag(sk), getfrag,
1886 				 from, length, transhdrlen, flags, ipc6);
1887 }
1888 EXPORT_SYMBOL_GPL(ip6_append_data);
1889 
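/*
 * Move the route held by the cork onto the finished skb.  The reference
 * taken at cork setup time is transferred rather than duplicated, and
 * IPCORK_ALLFRAG no longer applies once the cork stops owning the dst.
 */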
1890 static void ip6_cork_steal_dst(struct sk_buff *skb, struct inet_cork_full *cork)
1891 {
1892 	struct dst_entry *dst = cork->base.dst;
1893 
1894 	cork->base.dst = NULL;
1895 	cork->base.flags &= ~IPCORK_ALLFRAG;
1896 	skb_dst_set(skb, dst);
1897 }
1898 
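/*
 * Release everything the cork still owns: the private copy of the
 * transmit-time IPv6 options and, if not already stolen, the route.
 */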
1899 static void ip6_cork_release(struct inet_cork_full *cork,
1900 			     struct inet6_cork *v6_cork)
1901 {
1902 	if (v6_cork->opt) {
1903 		struct ipv6_txoptions *opt = v6_cork->opt;
1904 
1905 		kfree(opt->dst0opt);
1906 		kfree(opt->dst1opt);
1907 		kfree(opt->hopopt);
1908 		kfree(opt->srcrt);
1909 		kfree(opt);
1910 		v6_cork->opt = NULL;
1911 	}
1912 
1913 	if (cork->base.dst) {
1914 		dst_release(cork->base.dst);
1915 		cork->base.dst = NULL;
1916 		cork->base.flags &= ~IPCORK_ALLFRAG;
1917 	}
1918 }
1919 
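/*
 * Turn the queued fragments into a single packet: chain them on the head
 * skb's frag_list, push the extension headers and the IPv6 header using
 * the values saved in the cork, account the output statistics and hand
 * the route over to the skb.  The cork is released before returning.
 */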
1920 struct sk_buff *__ip6_make_skb(struct sock *sk,
1921 			       struct sk_buff_head *queue,
1922 			       struct inet_cork_full *cork,
1923 			       struct inet6_cork *v6_cork)
1924 {
1925 	struct sk_buff *skb, *tmp_skb;
1926 	struct sk_buff **tail_skb;
1927 	struct in6_addr *final_dst;
1928 	struct ipv6_pinfo *np = inet6_sk(sk);
1929 	struct net *net = sock_net(sk);
1930 	struct ipv6hdr *hdr;
1931 	struct ipv6_txoptions *opt = v6_cork->opt;
1932 	struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
1933 	struct flowi6 *fl6 = &cork->fl.u.ip6;
1934 	unsigned char proto = fl6->flowi6_proto;
1935 
1936 	skb = __skb_dequeue(queue);
1937 	if (!skb)
1938 		goto out;
1939 	tail_skb = &(skb_shinfo(skb)->frag_list);
1940 
1941 	/* Move skb->data from the extension header area to the IP header. */
1942 	if (skb->data < skb_network_header(skb))
1943 		__skb_pull(skb, skb_network_offset(skb));
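	/*
	 * Chain the remaining queued skbs onto the head skb's frag_list,
	 * transferring their length and truesize; the head skb now owns
	 * the whole packet, so drop the per-fragment destructors.
	 */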
1944 	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
1945 		__skb_pull(tmp_skb, skb_network_header_len(skb));
1946 		*tail_skb = tmp_skb;
1947 		tail_skb = &(tmp_skb->next);
1948 		skb->len += tmp_skb->len;
1949 		skb->data_len += tmp_skb->len;
1950 		skb->truesize += tmp_skb->truesize;
1951 		tmp_skb->destructor = NULL;
1952 		tmp_skb->sk = NULL;
1953 	}
1954 
1955 	/* Allow local fragmentation. */
1956 	skb->ignore_df = ip6_sk_ignore_df(sk);
1957 	__skb_pull(skb, skb_network_header_len(skb));
1958 
1959 	final_dst = &fl6->daddr;
1960 	if (opt && opt->opt_flen)
1961 		ipv6_push_frag_opts(skb, opt, &proto);
1962 	if (opt && opt->opt_nflen)
1963 		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);
1964 
1965 	skb_push(skb, sizeof(struct ipv6hdr));
1966 	skb_reset_network_header(skb);
1967 	hdr = ipv6_hdr(skb);
1968 
1969 	ip6_flow_hdr(hdr, v6_cork->tclass,
1970 		     ip6_make_flowlabel(net, skb, fl6->flowlabel,
1971 					ip6_autoflowlabel(net, np), fl6));
1972 	hdr->hop_limit = v6_cork->hop_limit;
1973 	hdr->nexthdr = proto;
1974 	hdr->saddr = fl6->saddr;
1975 	hdr->daddr = *final_dst;
1976 
1977 	skb->priority = sk->sk_priority;
1978 	skb->mark = cork->base.mark;
1979 	skb->tstamp = cork->base.transmit_time;
1980 
1981 	ip6_cork_steal_dst(skb, cork);
1982 	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
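	/* ICMPv6 output is additionally counted per message type. */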
1983 	if (proto == IPPROTO_ICMPV6) {
1984 		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1985 		u8 icmp6_type;
1986 
1987 		if (sk->sk_socket->type == SOCK_RAW && !inet_sk(sk)->hdrincl)
1988 			icmp6_type = fl6->fl6_icmp_type;
1989 		else
1990 			icmp6_type = icmp6_hdr(skb)->icmp6_type;
1991 		ICMP6MSGOUT_INC_STATS(net, idev, icmp6_type);
1992 		ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
1993 	}
1994 
1995 	ip6_cork_release(cork, v6_cork);
1996 out:
1997 	return skb;
1998 }
1999 
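/*
 * Transmit a packet built by __ip6_make_skb()/ip6_finish_skb().  Positive
 * NET_XMIT codes from ip6_local_out() are mapped to errnos, and failures
 * are counted as output discards.
 */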
2000 int ip6_send_skb(struct sk_buff *skb)
2001 {
2002 	struct net *net = sock_net(skb->sk);
2003 	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
2004 	int err;
2005 
2006 	err = ip6_local_out(net, skb->sk, skb);
2007 	if (err) {
2008 		if (err > 0)
2009 			err = net_xmit_errno(err);
2010 		if (err)
2011 			IP6_INC_STATS(net, rt->rt6i_idev,
2012 				      IPSTATS_MIB_OUTDISCARDS);
2013 	}
2014 
2015 	return err;
2016 }
2017 
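/*
 * Flush the socket's pending write queue: build the queued data into a
 * packet with ip6_finish_skb() and send it.  Returns 0 when there was
 * nothing queued.
 */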
2018 int ip6_push_pending_frames(struct sock *sk)
2019 {
2020 	struct sk_buff *skb;
2021 
2022 	skb = ip6_finish_skb(sk);
2023 	if (!skb)
2024 		return 0;
2025 
2026 	return ip6_send_skb(skb);
2027 }
2028 EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
2029 
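/*
 * Drop every fragment still sitting on the queue, counting an output
 * discard for each one that already has a route attached, then release
 * the cork state.
 */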
2030 static void __ip6_flush_pending_frames(struct sock *sk,
2031 				       struct sk_buff_head *queue,
2032 				       struct inet_cork_full *cork,
2033 				       struct inet6_cork *v6_cork)
2034 {
2035 	struct sk_buff *skb;
2036 
2037 	while ((skb = __skb_dequeue_tail(queue)) != NULL) {
2038 		if (skb_dst(skb))
2039 			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
2040 				      IPSTATS_MIB_OUTDISCARDS);
2041 		kfree_skb(skb);
2042 	}
2043 
2044 	ip6_cork_release(cork, v6_cork);
2045 }
2046 
2047 void ip6_flush_pending_frames(struct sock *sk)
2048 {
2049 	__ip6_flush_pending_frames(sk, &sk->sk_write_queue,
2050 				   &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
2051 }
2052 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
2053 
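/*
 * Single-shot variant of ip6_append_data()/ip6_push_pending_frames() used
 * for uncorked sends: the data is appended onto a private queue with the
 * caller-supplied cork and immediately built into one skb, leaving
 * sk_write_queue untouched.  Returns NULL for MSG_PROBE and an ERR_PTR
 * on failure.
 */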
2054 struct sk_buff *ip6_make_skb(struct sock *sk,
2055 			     int getfrag(void *from, char *to, int offset,
2056 					 int len, int odd, struct sk_buff *skb),
2057 			     void *from, size_t length, int transhdrlen,
2058 			     struct ipcm6_cookie *ipc6, struct rt6_info *rt,
2059 			     unsigned int flags, struct inet_cork_full *cork)
2060 {
2061 	struct inet6_cork v6_cork;
2062 	struct sk_buff_head queue;
2063 	int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
2064 	int err;
2065 
2066 	if (flags & MSG_PROBE) {
2067 		dst_release(&rt->dst);
2068 		return NULL;
2069 	}
2070 
2071 	__skb_queue_head_init(&queue);
2072 
2073 	cork->base.flags = 0;
2074 	cork->base.addr = 0;
2075 	cork->base.opt = NULL;
2076 	v6_cork.opt = NULL;
2077 	err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt);
2078 	if (err) {
2079 		ip6_cork_release(cork, &v6_cork);
2080 		return ERR_PTR(err);
2081 	}
2082 	if (ipc6->dontfrag < 0)
2083 		ipc6->dontfrag = inet6_sk(sk)->dontfrag;
2084 
2085 	err = __ip6_append_data(sk, &queue, cork, &v6_cork,
2086 				&current->task_frag, getfrag, from,
2087 				length + exthdrlen, transhdrlen + exthdrlen,
2088 				flags, ipc6);
2089 	if (err) {
2090 		__ip6_flush_pending_frames(sk, &queue, cork, &v6_cork);
2091 		return ERR_PTR(err);
2092 	}
2093 
2094 	return __ip6_make_skb(sk, &queue, cork, &v6_cork);
2095 }
2096