xref: /openbmc/linux/net/ipv6/ip6_output.c (revision 72ed5d5624af384eaf74d84915810d54486a75e2)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  *	IPv6 output functions
4  *	Linux INET6 implementation
5  *
6  *	Authors:
7  *	Pedro Roque		<roque@di.fc.ul.pt>
8  *
9  *	Based on linux/net/ipv4/ip_output.c
10  *
11  *	Changes:
12  *	A.N.Kuznetsov	:	airthmetics in fragmentation.
13  *				extension headers are implemented.
14  *				route changes now work.
15  *				ip6_forward does not confuse sniffers.
16  *				etc.
17  *
18  *      H. von Brand    :       Added missing #include <linux/string.h>
19  *	Imran Patel	:	frag id should be in NBO
20  *      Kazunori MIYAZAWA @USAGI
21  *			:       add ip6_append_data and related functions
22  *				for datagram xmit
23  */
24 
25 #include <linux/errno.h>
26 #include <linux/kernel.h>
27 #include <linux/string.h>
28 #include <linux/socket.h>
29 #include <linux/net.h>
30 #include <linux/netdevice.h>
31 #include <linux/if_arp.h>
32 #include <linux/in6.h>
33 #include <linux/tcp.h>
34 #include <linux/route.h>
35 #include <linux/module.h>
36 #include <linux/slab.h>
37 
38 #include <linux/bpf-cgroup.h>
39 #include <linux/netfilter.h>
40 #include <linux/netfilter_ipv6.h>
41 
42 #include <net/sock.h>
43 #include <net/snmp.h>
44 
45 #include <net/ipv6.h>
46 #include <net/ndisc.h>
47 #include <net/protocol.h>
48 #include <net/ip6_route.h>
49 #include <net/addrconf.h>
50 #include <net/rawv6.h>
51 #include <net/icmp.h>
52 #include <net/xfrm.h>
53 #include <net/checksum.h>
54 #include <linux/mroute6.h>
55 #include <net/l3mdev.h>
56 #include <net/lwtunnel.h>
57 #include <net/ip_tunnels.h>
58 
59 static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
60 {
61 	struct dst_entry *dst = skb_dst(skb);
62 	struct net_device *dev = dst->dev;
63 	struct inet6_dev *idev = ip6_dst_idev(dst);
64 	unsigned int hh_len = LL_RESERVED_SPACE(dev);
65 	const struct in6_addr *daddr, *nexthop;
66 	struct ipv6hdr *hdr;
67 	struct neighbour *neigh;
68 	int ret;
69 
70 	/* Be paranoid, rather than too clever. */
71 	if (unlikely(hh_len > skb_headroom(skb)) && dev->header_ops) {
72 		skb = skb_expand_head(skb, hh_len);
73 		if (!skb) {
74 			IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
75 			return -ENOMEM;
76 		}
77 	}
78 
79 	hdr = ipv6_hdr(skb);
80 	daddr = &hdr->daddr;
81 	if (ipv6_addr_is_multicast(daddr)) {
82 		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
83 		    ((mroute6_is_socket(net, skb) &&
84 		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
85 		     ipv6_chk_mcast_addr(dev, daddr, &hdr->saddr))) {
86 			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
87 
88 			/* Do not check for IFF_ALLMULTI; multicast routing
89 			   is not supported in any case.
90 			 */
91 			if (newskb)
92 				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
93 					net, sk, newskb, NULL, newskb->dev,
94 					dev_loopback_xmit);
95 
96 			if (hdr->hop_limit == 0) {
97 				IP6_INC_STATS(net, idev,
98 					      IPSTATS_MIB_OUTDISCARDS);
99 				kfree_skb(skb);
100 				return 0;
101 			}
102 		}
103 
104 		IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);
105 		if (IPV6_ADDR_MC_SCOPE(daddr) <= IPV6_ADDR_SCOPE_NODELOCAL &&
106 		    !(dev->flags & IFF_LOOPBACK)) {
107 			kfree_skb(skb);
108 			return 0;
109 		}
110 	}
111 
112 	if (lwtunnel_xmit_redirect(dst->lwtstate)) {
113 		int res = lwtunnel_xmit(skb);
114 
115 		if (res < 0 || res == LWTUNNEL_XMIT_DONE)
116 			return res;
117 	}
118 
119 	rcu_read_lock_bh();
120 	nexthop = rt6_nexthop((struct rt6_info *)dst, daddr);
121 	neigh = __ipv6_neigh_lookup_noref(dev, nexthop);
122 
123 	if (unlikely(IS_ERR_OR_NULL(neigh))) {
124 		if (unlikely(!neigh))
125 			neigh = __neigh_create(&nd_tbl, nexthop, dev, false);
126 		if (IS_ERR(neigh)) {
127 			rcu_read_unlock_bh();
128 			IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTNOROUTES);
129 			kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_CREATEFAIL);
130 			return -EINVAL;
131 		}
132 	}
133 	sock_confirm_neigh(skb, neigh);
134 	ret = neigh_output(neigh, skb, false);
135 	rcu_read_unlock_bh();
136 	return ret;
137 }
138 
139 static int
140 ip6_finish_output_gso_slowpath_drop(struct net *net, struct sock *sk,
141 				    struct sk_buff *skb, unsigned int mtu)
142 {
143 	struct sk_buff *segs, *nskb;
144 	netdev_features_t features;
145 	int ret = 0;
146 
147 	/* Please see corresponding comment in ip_finish_output_gso
148 	 * describing the cases where GSO segment length exceeds the
149 	 * egress MTU.
150 	 */
151 	features = netif_skb_features(skb);
152 	segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
153 	if (IS_ERR_OR_NULL(segs)) {
154 		kfree_skb(skb);
155 		return -ENOMEM;
156 	}
157 
158 	consume_skb(skb);
159 
160 	skb_list_walk_safe(segs, segs, nskb) {
161 		int err;
162 
163 		skb_mark_not_on_list(segs);
164 		err = ip6_fragment(net, sk, segs, ip6_finish_output2);
165 		if (err && ret == 0)
166 			ret = err;
167 	}
168 
169 	return ret;
170 }
171 
172 static int __ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
173 {
174 	unsigned int mtu;
175 
176 #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
177 	/* Policy lookup after SNAT yielded a new policy */
178 	if (skb_dst(skb)->xfrm) {
179 		IP6CB(skb)->flags |= IP6SKB_REROUTED;
180 		return dst_output(net, sk, skb);
181 	}
182 #endif
183 
184 	mtu = ip6_skb_dst_mtu(skb);
185 	if (skb_is_gso(skb) &&
186 	    !(IP6CB(skb)->flags & IP6SKB_FAKEJUMBO) &&
187 	    !skb_gso_validate_network_len(skb, mtu))
188 		return ip6_finish_output_gso_slowpath_drop(net, sk, skb, mtu);
189 
190 	if ((skb->len > mtu && !skb_is_gso(skb)) ||
191 	    dst_allfrag(skb_dst(skb)) ||
192 	    (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
193 		return ip6_fragment(net, sk, skb, ip6_finish_output2);
194 	else
195 		return ip6_finish_output2(net, sk, skb);
196 }
197 
198 static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
199 {
200 	int ret;
201 
202 	ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
203 	switch (ret) {
204 	case NET_XMIT_SUCCESS:
205 	case NET_XMIT_CN:
206 		return __ip6_finish_output(net, sk, skb) ? : ret;
207 	default:
208 		kfree_skb_reason(skb, SKB_DROP_REASON_BPF_CGROUP_EGRESS);
209 		return ret;
210 	}
211 }
212 
213 int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
214 {
215 	struct net_device *dev = skb_dst(skb)->dev, *indev = skb->dev;
216 	struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
217 
218 	skb->protocol = htons(ETH_P_IPV6);
219 	skb->dev = dev;
220 
221 	if (unlikely(idev->cnf.disable_ipv6)) {
222 		IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
223 		kfree_skb_reason(skb, SKB_DROP_REASON_IPV6DISABLED);
224 		return 0;
225 	}
226 
227 	return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
228 			    net, sk, skb, indev, dev,
229 			    ip6_finish_output,
230 			    !(IP6CB(skb)->flags & IP6SKB_REROUTED));
231 }
232 EXPORT_SYMBOL(ip6_output);
233 
234 bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np)
235 {
236 	if (!np->autoflowlabel_set)
237 		return ip6_default_np_autolabel(net);
238 	else
239 		return np->autoflowlabel;
240 }
241 
242 /*
243  * xmit an sk_buff (used by TCP, SCTP and DCCP)
244  * Note : socket lock is not held for SYNACK packets, but might be modified
245  * by calls to skb_set_owner_w() and ipv6_local_error(),
246  * which are using proper atomic operations or spinlocks.
247  */
248 int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
249 	     __u32 mark, struct ipv6_txoptions *opt, int tclass, u32 priority)
250 {
251 	struct net *net = sock_net(sk);
252 	const struct ipv6_pinfo *np = inet6_sk(sk);
253 	struct in6_addr *first_hop = &fl6->daddr;
254 	struct dst_entry *dst = skb_dst(skb);
255 	struct net_device *dev = dst->dev;
256 	struct inet6_dev *idev = ip6_dst_idev(dst);
257 	struct hop_jumbo_hdr *hop_jumbo;
258 	int hoplen = sizeof(*hop_jumbo);
259 	unsigned int head_room;
260 	struct ipv6hdr *hdr;
261 	u8  proto = fl6->flowi6_proto;
262 	int seg_len = skb->len;
263 	int hlimit = -1;
264 	u32 mtu;
265 
266 	head_room = sizeof(struct ipv6hdr) + hoplen + LL_RESERVED_SPACE(dev);
267 	if (opt)
268 		head_room += opt->opt_nflen + opt->opt_flen;
269 
270 	if (unlikely(head_room > skb_headroom(skb))) {
271 		skb = skb_expand_head(skb, head_room);
272 		if (!skb) {
273 			IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
274 			return -ENOBUFS;
275 		}
276 	}
277 
278 	if (opt) {
279 		seg_len += opt->opt_nflen + opt->opt_flen;
280 
281 		if (opt->opt_flen)
282 			ipv6_push_frag_opts(skb, opt, &proto);
283 
284 		if (opt->opt_nflen)
285 			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
286 					     &fl6->saddr);
287 	}
288 
289 	if (unlikely(seg_len > IPV6_MAXPLEN)) {
290 		hop_jumbo = skb_push(skb, hoplen);
291 
292 		hop_jumbo->nexthdr = proto;
293 		hop_jumbo->hdrlen = 0;
294 		hop_jumbo->tlv_type = IPV6_TLV_JUMBO;
295 		hop_jumbo->tlv_len = 4;
296 		hop_jumbo->jumbo_payload_len = htonl(seg_len + hoplen);
297 
298 		proto = IPPROTO_HOPOPTS;
299 		seg_len = 0;
300 		IP6CB(skb)->flags |= IP6SKB_FAKEJUMBO;
301 	}
302 
303 	skb_push(skb, sizeof(struct ipv6hdr));
304 	skb_reset_network_header(skb);
305 	hdr = ipv6_hdr(skb);
306 
307 	/*
308 	 *	Fill in the IPv6 header
309 	 */
310 	if (np)
311 		hlimit = np->hop_limit;
312 	if (hlimit < 0)
313 		hlimit = ip6_dst_hoplimit(dst);
314 
315 	ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
316 				ip6_autoflowlabel(net, np), fl6));
317 
318 	hdr->payload_len = htons(seg_len);
319 	hdr->nexthdr = proto;
320 	hdr->hop_limit = hlimit;
321 
322 	hdr->saddr = fl6->saddr;
323 	hdr->daddr = *first_hop;
324 
325 	skb->protocol = htons(ETH_P_IPV6);
326 	skb->priority = priority;
327 	skb->mark = mark;
328 
329 	mtu = dst_mtu(dst);
330 	if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
331 		IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUT, skb->len);
332 
333 		/* if egress device is enslaved to an L3 master device pass the
334 		 * skb to its handler for processing
335 		 */
336 		skb = l3mdev_ip6_out((struct sock *)sk, skb);
337 		if (unlikely(!skb))
338 			return 0;
339 
340 		/* hooks should never assume socket lock is held.
341 		 * we promote our socket to non const
342 		 */
343 		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
344 			       net, (struct sock *)sk, skb, NULL, dev,
345 			       dst_output);
346 	}
347 
348 	skb->dev = dev;
349 	/* ipv6_local_error() does not require socket lock,
350 	 * we promote our socket to non const
351 	 */
352 	ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);
353 
354 	IP6_INC_STATS(net, idev, IPSTATS_MIB_FRAGFAILS);
355 	kfree_skb(skb);
356 	return -EMSGSIZE;
357 }
358 EXPORT_SYMBOL(ip6_xmit);
359 
360 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
361 {
362 	struct ip6_ra_chain *ra;
363 	struct sock *last = NULL;
364 
365 	read_lock(&ip6_ra_lock);
366 	for (ra = ip6_ra_chain; ra; ra = ra->next) {
367 		struct sock *sk = ra->sk;
368 		if (sk && ra->sel == sel &&
369 		    (!sk->sk_bound_dev_if ||
370 		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
371 			struct ipv6_pinfo *np = inet6_sk(sk);
372 
373 			if (np && np->rtalert_isolate &&
374 			    !net_eq(sock_net(sk), dev_net(skb->dev))) {
375 				continue;
376 			}
377 			if (last) {
378 				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
379 				if (skb2)
380 					rawv6_rcv(last, skb2);
381 			}
382 			last = sk;
383 		}
384 	}
385 
386 	if (last) {
387 		rawv6_rcv(last, skb);
388 		read_unlock(&ip6_ra_lock);
389 		return 1;
390 	}
391 	read_unlock(&ip6_ra_lock);
392 	return 0;
393 }
394 
395 static int ip6_forward_proxy_check(struct sk_buff *skb)
396 {
397 	struct ipv6hdr *hdr = ipv6_hdr(skb);
398 	u8 nexthdr = hdr->nexthdr;
399 	__be16 frag_off;
400 	int offset;
401 
402 	if (ipv6_ext_hdr(nexthdr)) {
403 		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
404 		if (offset < 0)
405 			return 0;
406 	} else
407 		offset = sizeof(struct ipv6hdr);
408 
409 	if (nexthdr == IPPROTO_ICMPV6) {
410 		struct icmp6hdr *icmp6;
411 
412 		if (!pskb_may_pull(skb, (skb_network_header(skb) +
413 					 offset + 1 - skb->data)))
414 			return 0;
415 
416 		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
417 
418 		switch (icmp6->icmp6_type) {
419 		case NDISC_ROUTER_SOLICITATION:
420 		case NDISC_ROUTER_ADVERTISEMENT:
421 		case NDISC_NEIGHBOUR_SOLICITATION:
422 		case NDISC_NEIGHBOUR_ADVERTISEMENT:
423 		case NDISC_REDIRECT:
424 			/* For reaction involving unicast neighbor discovery
425 			 * message destined to the proxied address, pass it to
426 			 * input function.
427 			 */
428 			return 1;
429 		default:
430 			break;
431 		}
432 	}
433 
434 	/*
435 	 * The proxying router can't forward traffic sent to a link-local
436 	 * address, so signal the sender and discard the packet. This
437 	 * behavior is clarified by the MIPv6 specification.
438 	 */
439 	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
440 		dst_link_failure(skb);
441 		return -1;
442 	}
443 
444 	return 0;
445 }
446 
447 static inline int ip6_forward_finish(struct net *net, struct sock *sk,
448 				     struct sk_buff *skb)
449 {
450 	struct dst_entry *dst = skb_dst(skb);
451 
452 	__IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
453 	__IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
454 
455 #ifdef CONFIG_NET_SWITCHDEV
456 	if (skb->offload_l3_fwd_mark) {
457 		consume_skb(skb);
458 		return 0;
459 	}
460 #endif
461 
462 	skb_clear_tstamp(skb);
463 	return dst_output(net, sk, skb);
464 }
465 
466 static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
467 {
468 	if (skb->len <= mtu)
469 		return false;
470 
471 	/* ipv6 conntrack defrag sets max_frag_size + ignore_df */
472 	if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
473 		return true;
474 
475 	if (skb->ignore_df)
476 		return false;
477 
478 	if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
479 		return false;
480 
481 	return true;
482 }
483 
484 int ip6_forward(struct sk_buff *skb)
485 {
486 	struct dst_entry *dst = skb_dst(skb);
487 	struct ipv6hdr *hdr = ipv6_hdr(skb);
488 	struct inet6_skb_parm *opt = IP6CB(skb);
489 	struct net *net = dev_net(dst->dev);
490 	struct inet6_dev *idev;
491 	SKB_DR(reason);
492 	u32 mtu;
493 
494 	idev = __in6_dev_get_safely(dev_get_by_index_rcu(net, IP6CB(skb)->iif));
495 	if (net->ipv6.devconf_all->forwarding == 0)
496 		goto error;
497 
498 	if (skb->pkt_type != PACKET_HOST)
499 		goto drop;
500 
501 	if (unlikely(skb->sk))
502 		goto drop;
503 
504 	if (skb_warn_if_lro(skb))
505 		goto drop;
506 
507 	if (!net->ipv6.devconf_all->disable_policy &&
508 	    (!idev || !idev->cnf.disable_policy) &&
509 	    !xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
510 		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
511 		goto drop;
512 	}
513 
514 	skb_forward_csum(skb);
515 
516 	/*
517 	 *	We DO NOT make any processing on
518 	 *	RA packets, pushing them to user level AS IS
519 	 *	without ane WARRANTY that application will be able
520 	 *	to interpret them. The reason is that we
521 	 *	cannot make anything clever here.
522 	 *
523 	 *	We are not end-node, so that if packet contains
524 	 *	AH/ESP, we cannot make anything.
525 	 *	Defragmentation also would be mistake, RA packets
526 	 *	cannot be fragmented, because there is no warranty
527 	 *	that different fragments will go along one path. --ANK
528 	 */
529 	if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
530 		if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
531 			return 0;
532 	}
533 
534 	/*
535 	 *	check and decrement ttl
536 	 */
537 	if (hdr->hop_limit <= 1) {
538 		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
539 		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
540 
541 		kfree_skb_reason(skb, SKB_DROP_REASON_IP_INHDR);
542 		return -ETIMEDOUT;
543 	}
544 
545 	/* XXX: idev->cnf.proxy_ndp? */
546 	if (net->ipv6.devconf_all->proxy_ndp &&
547 	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
548 		int proxied = ip6_forward_proxy_check(skb);
549 		if (proxied > 0) {
550 			/* It's tempting to decrease the hop limit
551 			 * here by 1, as we do at the end of the
552 			 * function too.
553 			 *
554 			 * But that would be incorrect, as proxying is
555 			 * not forwarding.  The ip6_input function
556 			 * will handle this packet locally, and it
557 			 * depends on the hop limit being unchanged.
558 			 *
559 			 * One example is the NDP hop limit, that
560 			 * always has to stay 255, but other would be
561 			 * similar checks around RA packets, where the
562 			 * user can even change the desired limit.
563 			 */
564 			return ip6_input(skb);
565 		} else if (proxied < 0) {
566 			__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
567 			goto drop;
568 		}
569 	}
570 
571 	if (!xfrm6_route_forward(skb)) {
572 		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
573 		SKB_DR_SET(reason, XFRM_POLICY);
574 		goto drop;
575 	}
576 	dst = skb_dst(skb);
577 
578 	/* IPv6 specs say nothing about it, but it is clear that we cannot
579 	   send redirects to source routed frames.
580 	   We don't send redirects to frames decapsulated from IPsec.
581 	 */
582 	if (IP6CB(skb)->iif == dst->dev->ifindex &&
583 	    opt->srcrt == 0 && !skb_sec_path(skb)) {
584 		struct in6_addr *target = NULL;
585 		struct inet_peer *peer;
586 		struct rt6_info *rt;
587 
588 		/*
589 		 *	incoming and outgoing devices are the same
590 		 *	send a redirect.
591 		 */
592 
593 		rt = (struct rt6_info *) dst;
594 		if (rt->rt6i_flags & RTF_GATEWAY)
595 			target = &rt->rt6i_gateway;
596 		else
597 			target = &hdr->daddr;
598 
599 		peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);
600 
601 		/* Limit redirects both by destination (here)
602 		   and by source (inside ndisc_send_redirect)
603 		 */
604 		if (inet_peer_xrlim_allow(peer, 1*HZ))
605 			ndisc_send_redirect(skb, target);
606 		if (peer)
607 			inet_putpeer(peer);
608 	} else {
609 		int addrtype = ipv6_addr_type(&hdr->saddr);
610 
611 		/* This check is security critical. */
612 		if (addrtype == IPV6_ADDR_ANY ||
613 		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
614 			goto error;
615 		if (addrtype & IPV6_ADDR_LINKLOCAL) {
616 			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
617 				    ICMPV6_NOT_NEIGHBOUR, 0);
618 			goto error;
619 		}
620 	}
621 
622 	mtu = ip6_dst_mtu_maybe_forward(dst, true);
623 	if (mtu < IPV6_MIN_MTU)
624 		mtu = IPV6_MIN_MTU;
625 
626 	if (ip6_pkt_too_big(skb, mtu)) {
627 		/* Again, force OUTPUT device used as source address */
628 		skb->dev = dst->dev;
629 		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
630 		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INTOOBIGERRORS);
631 		__IP6_INC_STATS(net, ip6_dst_idev(dst),
632 				IPSTATS_MIB_FRAGFAILS);
633 		kfree_skb_reason(skb, SKB_DROP_REASON_PKT_TOO_BIG);
634 		return -EMSGSIZE;
635 	}
636 
637 	if (skb_cow(skb, dst->dev->hard_header_len)) {
638 		__IP6_INC_STATS(net, ip6_dst_idev(dst),
639 				IPSTATS_MIB_OUTDISCARDS);
640 		goto drop;
641 	}
642 
643 	hdr = ipv6_hdr(skb);
644 
645 	/* Mangling hops number delayed to point after skb COW */
646 
647 	hdr->hop_limit--;
648 
649 	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
650 		       net, NULL, skb, skb->dev, dst->dev,
651 		       ip6_forward_finish);
652 
653 error:
654 	__IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
655 	SKB_DR_SET(reason, IP_INADDRERRORS);
656 drop:
657 	kfree_skb_reason(skb, reason);
658 	return -EINVAL;
659 }
660 
661 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
662 {
663 	to->pkt_type = from->pkt_type;
664 	to->priority = from->priority;
665 	to->protocol = from->protocol;
666 	skb_dst_drop(to);
667 	skb_dst_set(to, dst_clone(skb_dst(from)));
668 	to->dev = from->dev;
669 	to->mark = from->mark;
670 
671 	skb_copy_hash(to, from);
672 
673 #ifdef CONFIG_NET_SCHED
674 	to->tc_index = from->tc_index;
675 #endif
676 	nf_copy(to, from);
677 	skb_ext_copy(to, from);
678 	skb_copy_secmark(to, from);
679 }
680 
681 int ip6_fraglist_init(struct sk_buff *skb, unsigned int hlen, u8 *prevhdr,
682 		      u8 nexthdr, __be32 frag_id,
683 		      struct ip6_fraglist_iter *iter)
684 {
685 	unsigned int first_len;
686 	struct frag_hdr *fh;
687 
688 	/* BUILD HEADER */
689 	*prevhdr = NEXTHDR_FRAGMENT;
690 	iter->tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
691 	if (!iter->tmp_hdr)
692 		return -ENOMEM;
693 
694 	iter->frag = skb_shinfo(skb)->frag_list;
695 	skb_frag_list_init(skb);
696 
697 	iter->offset = 0;
698 	iter->hlen = hlen;
699 	iter->frag_id = frag_id;
700 	iter->nexthdr = nexthdr;
701 
702 	__skb_pull(skb, hlen);
703 	fh = __skb_push(skb, sizeof(struct frag_hdr));
704 	__skb_push(skb, hlen);
705 	skb_reset_network_header(skb);
706 	memcpy(skb_network_header(skb), iter->tmp_hdr, hlen);
707 
708 	fh->nexthdr = nexthdr;
709 	fh->reserved = 0;
710 	fh->frag_off = htons(IP6_MF);
711 	fh->identification = frag_id;
712 
713 	first_len = skb_pagelen(skb);
714 	skb->data_len = first_len - skb_headlen(skb);
715 	skb->len = first_len;
716 	ipv6_hdr(skb)->payload_len = htons(first_len - sizeof(struct ipv6hdr));
717 
718 	return 0;
719 }
720 EXPORT_SYMBOL(ip6_fraglist_init);
721 
722 void ip6_fraglist_prepare(struct sk_buff *skb,
723 			  struct ip6_fraglist_iter *iter)
724 {
725 	struct sk_buff *frag = iter->frag;
726 	unsigned int hlen = iter->hlen;
727 	struct frag_hdr *fh;
728 
729 	frag->ip_summed = CHECKSUM_NONE;
730 	skb_reset_transport_header(frag);
731 	fh = __skb_push(frag, sizeof(struct frag_hdr));
732 	__skb_push(frag, hlen);
733 	skb_reset_network_header(frag);
734 	memcpy(skb_network_header(frag), iter->tmp_hdr, hlen);
735 	iter->offset += skb->len - hlen - sizeof(struct frag_hdr);
736 	fh->nexthdr = iter->nexthdr;
737 	fh->reserved = 0;
738 	fh->frag_off = htons(iter->offset);
739 	if (frag->next)
740 		fh->frag_off |= htons(IP6_MF);
741 	fh->identification = iter->frag_id;
742 	ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
743 	ip6_copy_metadata(frag, skb);
744 }
745 EXPORT_SYMBOL(ip6_fraglist_prepare);
746 
747 void ip6_frag_init(struct sk_buff *skb, unsigned int hlen, unsigned int mtu,
748 		   unsigned short needed_tailroom, int hdr_room, u8 *prevhdr,
749 		   u8 nexthdr, __be32 frag_id, struct ip6_frag_state *state)
750 {
751 	state->prevhdr = prevhdr;
752 	state->nexthdr = nexthdr;
753 	state->frag_id = frag_id;
754 
755 	state->hlen = hlen;
756 	state->mtu = mtu;
757 
758 	state->left = skb->len - hlen;	/* Space per frame */
759 	state->ptr = hlen;		/* Where to start from */
760 
761 	state->hroom = hdr_room;
762 	state->troom = needed_tailroom;
763 
764 	state->offset = 0;
765 }
766 EXPORT_SYMBOL(ip6_frag_init);
767 
768 struct sk_buff *ip6_frag_next(struct sk_buff *skb, struct ip6_frag_state *state)
769 {
770 	u8 *prevhdr = state->prevhdr, *fragnexthdr_offset;
771 	struct sk_buff *frag;
772 	struct frag_hdr *fh;
773 	unsigned int len;
774 
775 	len = state->left;
776 	/* IF: it doesn't fit, use 'mtu' - the data space left */
777 	if (len > state->mtu)
778 		len = state->mtu;
779 	/* IF: we are not sending up to and including the packet end
780 	   then align the next start on an eight byte boundary */
781 	if (len < state->left)
782 		len &= ~7;
783 
784 	/* Allocate buffer */
785 	frag = alloc_skb(len + state->hlen + sizeof(struct frag_hdr) +
786 			 state->hroom + state->troom, GFP_ATOMIC);
787 	if (!frag)
788 		return ERR_PTR(-ENOMEM);
789 
790 	/*
791 	 *	Set up data on packet
792 	 */
793 
794 	ip6_copy_metadata(frag, skb);
795 	skb_reserve(frag, state->hroom);
796 	skb_put(frag, len + state->hlen + sizeof(struct frag_hdr));
797 	skb_reset_network_header(frag);
798 	fh = (struct frag_hdr *)(skb_network_header(frag) + state->hlen);
799 	frag->transport_header = (frag->network_header + state->hlen +
800 				  sizeof(struct frag_hdr));
801 
802 	/*
803 	 *	Charge the memory for the fragment to any owner
804 	 *	it might possess
805 	 */
806 	if (skb->sk)
807 		skb_set_owner_w(frag, skb->sk);
808 
809 	/*
810 	 *	Copy the packet header into the new buffer.
811 	 */
812 	skb_copy_from_linear_data(skb, skb_network_header(frag), state->hlen);
813 
814 	fragnexthdr_offset = skb_network_header(frag);
815 	fragnexthdr_offset += prevhdr - skb_network_header(skb);
816 	*fragnexthdr_offset = NEXTHDR_FRAGMENT;
817 
818 	/*
819 	 *	Build fragment header.
820 	 */
821 	fh->nexthdr = state->nexthdr;
822 	fh->reserved = 0;
823 	fh->identification = state->frag_id;
824 
825 	/*
826 	 *	Copy a block of the IP datagram.
827 	 */
828 	BUG_ON(skb_copy_bits(skb, state->ptr, skb_transport_header(frag),
829 			     len));
830 	state->left -= len;
831 
832 	fh->frag_off = htons(state->offset);
833 	if (state->left > 0)
834 		fh->frag_off |= htons(IP6_MF);
835 	ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
836 
837 	state->ptr += len;
838 	state->offset += len;
839 
840 	return frag;
841 }
842 EXPORT_SYMBOL(ip6_frag_next);
843 
844 int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
845 		 int (*output)(struct net *, struct sock *, struct sk_buff *))
846 {
847 	struct sk_buff *frag;
848 	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
849 	struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
850 				inet6_sk(skb->sk) : NULL;
851 	bool mono_delivery_time = skb->mono_delivery_time;
852 	struct ip6_frag_state state;
853 	unsigned int mtu, hlen, nexthdr_offset;
854 	ktime_t tstamp = skb->tstamp;
855 	int hroom, err = 0;
856 	__be32 frag_id;
857 	u8 *prevhdr, nexthdr = 0;
858 
859 	err = ip6_find_1stfragopt(skb, &prevhdr);
860 	if (err < 0)
861 		goto fail;
862 	hlen = err;
863 	nexthdr = *prevhdr;
864 	nexthdr_offset = prevhdr - skb_network_header(skb);
865 
866 	mtu = ip6_skb_dst_mtu(skb);
867 
868 	/* We must not fragment if the socket is set to force MTU discovery
869 	 * or if the skb it not generated by a local socket.
870 	 */
871 	if (unlikely(!skb->ignore_df && skb->len > mtu))
872 		goto fail_toobig;
873 
874 	if (IP6CB(skb)->frag_max_size) {
875 		if (IP6CB(skb)->frag_max_size > mtu)
876 			goto fail_toobig;
877 
878 		/* don't send fragments larger than what we received */
879 		mtu = IP6CB(skb)->frag_max_size;
880 		if (mtu < IPV6_MIN_MTU)
881 			mtu = IPV6_MIN_MTU;
882 	}
883 
884 	if (np && np->frag_size < mtu) {
885 		if (np->frag_size)
886 			mtu = np->frag_size;
887 	}
888 	if (mtu < hlen + sizeof(struct frag_hdr) + 8)
889 		goto fail_toobig;
890 	mtu -= hlen + sizeof(struct frag_hdr);
891 
892 	frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
893 				    &ipv6_hdr(skb)->saddr);
894 
895 	if (skb->ip_summed == CHECKSUM_PARTIAL &&
896 	    (err = skb_checksum_help(skb)))
897 		goto fail;
898 
899 	prevhdr = skb_network_header(skb) + nexthdr_offset;
900 	hroom = LL_RESERVED_SPACE(rt->dst.dev);
901 	if (skb_has_frag_list(skb)) {
902 		unsigned int first_len = skb_pagelen(skb);
903 		struct ip6_fraglist_iter iter;
904 		struct sk_buff *frag2;
905 
906 		if (first_len - hlen > mtu ||
907 		    ((first_len - hlen) & 7) ||
908 		    skb_cloned(skb) ||
909 		    skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
910 			goto slow_path;
911 
912 		skb_walk_frags(skb, frag) {
913 			/* Correct geometry. */
914 			if (frag->len > mtu ||
915 			    ((frag->len & 7) && frag->next) ||
916 			    skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
917 				goto slow_path_clean;
918 
919 			/* Partially cloned skb? */
920 			if (skb_shared(frag))
921 				goto slow_path_clean;
922 
923 			BUG_ON(frag->sk);
924 			if (skb->sk) {
925 				frag->sk = skb->sk;
926 				frag->destructor = sock_wfree;
927 			}
928 			skb->truesize -= frag->truesize;
929 		}
930 
931 		err = ip6_fraglist_init(skb, hlen, prevhdr, nexthdr, frag_id,
932 					&iter);
933 		if (err < 0)
934 			goto fail;
935 
936 		/* We prevent @rt from being freed. */
937 		rcu_read_lock();
938 
939 		for (;;) {
940 			/* Prepare header of the next frame,
941 			 * before previous one went down. */
942 			if (iter.frag)
943 				ip6_fraglist_prepare(skb, &iter);
944 
945 			skb_set_delivery_time(skb, tstamp, mono_delivery_time);
946 			err = output(net, sk, skb);
947 			if (!err)
948 				IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
949 					      IPSTATS_MIB_FRAGCREATES);
950 
951 			if (err || !iter.frag)
952 				break;
953 
954 			skb = ip6_fraglist_next(&iter);
955 		}
956 
957 		kfree(iter.tmp_hdr);
958 
959 		if (err == 0) {
960 			IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
961 				      IPSTATS_MIB_FRAGOKS);
962 			rcu_read_unlock();
963 			return 0;
964 		}
965 
966 		kfree_skb_list(iter.frag);
967 
968 		IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
969 			      IPSTATS_MIB_FRAGFAILS);
970 		rcu_read_unlock();
971 		return err;
972 
973 slow_path_clean:
974 		skb_walk_frags(skb, frag2) {
975 			if (frag2 == frag)
976 				break;
977 			frag2->sk = NULL;
978 			frag2->destructor = NULL;
979 			skb->truesize += frag2->truesize;
980 		}
981 	}
982 
983 slow_path:
984 	/*
985 	 *	Fragment the datagram.
986 	 */
987 
988 	ip6_frag_init(skb, hlen, mtu, rt->dst.dev->needed_tailroom,
989 		      LL_RESERVED_SPACE(rt->dst.dev), prevhdr, nexthdr, frag_id,
990 		      &state);
991 
992 	/*
993 	 *	Keep copying data until we run out.
994 	 */
995 
996 	while (state.left > 0) {
997 		frag = ip6_frag_next(skb, &state);
998 		if (IS_ERR(frag)) {
999 			err = PTR_ERR(frag);
1000 			goto fail;
1001 		}
1002 
1003 		/*
1004 		 *	Put this fragment into the sending queue.
1005 		 */
1006 		skb_set_delivery_time(frag, tstamp, mono_delivery_time);
1007 		err = output(net, sk, frag);
1008 		if (err)
1009 			goto fail;
1010 
1011 		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
1012 			      IPSTATS_MIB_FRAGCREATES);
1013 	}
1014 	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
1015 		      IPSTATS_MIB_FRAGOKS);
1016 	consume_skb(skb);
1017 	return err;
1018 
1019 fail_toobig:
1020 	if (skb->sk && dst_allfrag(skb_dst(skb)))
1021 		sk_gso_disable(skb->sk);
1022 
1023 	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
1024 	err = -EMSGSIZE;
1025 
1026 fail:
1027 	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
1028 		      IPSTATS_MIB_FRAGFAILS);
1029 	kfree_skb(skb);
1030 	return err;
1031 }
1032 
1033 static inline int ip6_rt_check(const struct rt6key *rt_key,
1034 			       const struct in6_addr *fl_addr,
1035 			       const struct in6_addr *addr_cache)
1036 {
1037 	return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
1038 		(!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
1039 }
1040 
1041 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
1042 					  struct dst_entry *dst,
1043 					  const struct flowi6 *fl6)
1044 {
1045 	struct ipv6_pinfo *np = inet6_sk(sk);
1046 	struct rt6_info *rt;
1047 
1048 	if (!dst)
1049 		goto out;
1050 
1051 	if (dst->ops->family != AF_INET6) {
1052 		dst_release(dst);
1053 		return NULL;
1054 	}
1055 
1056 	rt = (struct rt6_info *)dst;
1057 	/* Yes, checking route validity in not connected
1058 	 * case is not very simple. Take into account,
1059 	 * that we do not support routing by source, TOS,
1060 	 * and MSG_DONTROUTE		--ANK (980726)
1061 	 *
1062 	 * 1. ip6_rt_check(): If route was host route,
1063 	 *    check that cached destination is current.
1064 	 *    If it is network route, we still may
1065 	 *    check its validity using saved pointer
1066 	 *    to the last used address: daddr_cache.
1067 	 *    We do not want to save whole address now,
1068 	 *    (because main consumer of this service
1069 	 *    is tcp, which has not this problem),
1070 	 *    so that the last trick works only on connected
1071 	 *    sockets.
1072 	 * 2. oif also should be the same.
1073 	 */
1074 	if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
1075 #ifdef CONFIG_IPV6_SUBTREES
1076 	    ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
1077 #endif
1078 	   (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) {
1079 		dst_release(dst);
1080 		dst = NULL;
1081 	}
1082 
1083 out:
1084 	return dst;
1085 }
1086 
1087 static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
1088 			       struct dst_entry **dst, struct flowi6 *fl6)
1089 {
1090 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
1091 	struct neighbour *n;
1092 	struct rt6_info *rt;
1093 #endif
1094 	int err;
1095 	int flags = 0;
1096 
1097 	/* The correct way to handle this would be to do
1098 	 * ip6_route_get_saddr, and then ip6_route_output; however,
1099 	 * the route-specific preferred source forces the
1100 	 * ip6_route_output call _before_ ip6_route_get_saddr.
1101 	 *
1102 	 * In source specific routing (no src=any default route),
1103 	 * ip6_route_output will fail given src=any saddr, though, so
1104 	 * that's why we try it again later.
1105 	 */
1106 	if (ipv6_addr_any(&fl6->saddr)) {
1107 		struct fib6_info *from;
1108 		struct rt6_info *rt;
1109 
1110 		*dst = ip6_route_output(net, sk, fl6);
1111 		rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;
1112 
1113 		rcu_read_lock();
1114 		from = rt ? rcu_dereference(rt->from) : NULL;
1115 		err = ip6_route_get_saddr(net, from, &fl6->daddr,
1116 					  sk ? inet6_sk(sk)->srcprefs : 0,
1117 					  &fl6->saddr);
1118 		rcu_read_unlock();
1119 
1120 		if (err)
1121 			goto out_err_release;
1122 
1123 		/* If we had an erroneous initial result, pretend it
1124 		 * never existed and let the SA-enabled version take
1125 		 * over.
1126 		 */
1127 		if ((*dst)->error) {
1128 			dst_release(*dst);
1129 			*dst = NULL;
1130 		}
1131 
1132 		if (fl6->flowi6_oif)
1133 			flags |= RT6_LOOKUP_F_IFACE;
1134 	}
1135 
1136 	if (!*dst)
1137 		*dst = ip6_route_output_flags(net, sk, fl6, flags);
1138 
1139 	err = (*dst)->error;
1140 	if (err)
1141 		goto out_err_release;
1142 
1143 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
1144 	/*
1145 	 * Here if the dst entry we've looked up
1146 	 * has a neighbour entry that is in the INCOMPLETE
1147 	 * state and the src address from the flow is
1148 	 * marked as OPTIMISTIC, we release the found
1149 	 * dst entry and replace it instead with the
1150 	 * dst entry of the nexthop router
1151 	 */
1152 	rt = (struct rt6_info *) *dst;
1153 	rcu_read_lock_bh();
1154 	n = __ipv6_neigh_lookup_noref(rt->dst.dev,
1155 				      rt6_nexthop(rt, &fl6->daddr));
1156 	err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
1157 	rcu_read_unlock_bh();
1158 
1159 	if (err) {
1160 		struct inet6_ifaddr *ifp;
1161 		struct flowi6 fl_gw6;
1162 		int redirect;
1163 
1164 		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
1165 				      (*dst)->dev, 1);
1166 
1167 		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
1168 		if (ifp)
1169 			in6_ifa_put(ifp);
1170 
1171 		if (redirect) {
1172 			/*
1173 			 * We need to get the dst entry for the
1174 			 * default router instead
1175 			 */
1176 			dst_release(*dst);
1177 			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
1178 			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
1179 			*dst = ip6_route_output(net, sk, &fl_gw6);
1180 			err = (*dst)->error;
1181 			if (err)
1182 				goto out_err_release;
1183 		}
1184 	}
1185 #endif
1186 	if (ipv6_addr_v4mapped(&fl6->saddr) &&
1187 	    !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
1188 		err = -EAFNOSUPPORT;
1189 		goto out_err_release;
1190 	}
1191 
1192 	return 0;
1193 
1194 out_err_release:
1195 	dst_release(*dst);
1196 	*dst = NULL;
1197 
1198 	if (err == -ENETUNREACH)
1199 		IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
1200 	return err;
1201 }
1202 
1203 /**
1204  *	ip6_dst_lookup - perform route lookup on flow
1205  *	@net: Network namespace to perform lookup in
1206  *	@sk: socket which provides route info
1207  *	@dst: pointer to dst_entry * for result
1208  *	@fl6: flow to lookup
1209  *
1210  *	This function performs a route lookup on the given flow.
1211  *
1212  *	It returns zero on success, or a standard errno code on error.
1213  */
1214 int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
1215 		   struct flowi6 *fl6)
1216 {
1217 	*dst = NULL;
1218 	return ip6_dst_lookup_tail(net, sk, dst, fl6);
1219 }
1220 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1221 
1222 /**
1223  *	ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1224  *	@net: Network namespace to perform lookup in
1225  *	@sk: socket which provides route info
1226  *	@fl6: flow to lookup
1227  *	@final_dst: final destination address for ipsec lookup
1228  *
1229  *	This function performs a route lookup on the given flow.
1230  *
1231  *	It returns a valid dst pointer on success, or a pointer encoded
1232  *	error code.
1233  */
1234 struct dst_entry *ip6_dst_lookup_flow(struct net *net, const struct sock *sk, struct flowi6 *fl6,
1235 				      const struct in6_addr *final_dst)
1236 {
1237 	struct dst_entry *dst = NULL;
1238 	int err;
1239 
1240 	err = ip6_dst_lookup_tail(net, sk, &dst, fl6);
1241 	if (err)
1242 		return ERR_PTR(err);
1243 	if (final_dst)
1244 		fl6->daddr = *final_dst;
1245 
1246 	return xfrm_lookup_route(net, dst, flowi6_to_flowi(fl6), sk, 0);
1247 }
1248 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
1249 
1250 /**
1251  *	ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1252  *	@sk: socket which provides the dst cache and route info
1253  *	@fl6: flow to lookup
1254  *	@final_dst: final destination address for ipsec lookup
1255  *	@connected: whether @sk is connected or not
1256  *
1257  *	This function performs a route lookup on the given flow with the
1258  *	possibility of using the cached route in the socket if it is valid.
1259  *	It will take the socket dst lock when operating on the dst cache.
1260  *	As a result, this function can only be used in process context.
1261  *
1262  *	In addition, for a connected socket, cache the dst in the socket
1263  *	if the current cache is not valid.
1264  *
1265  *	It returns a valid dst pointer on success, or a pointer encoded
1266  *	error code.
1267  */
1268 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1269 					 const struct in6_addr *final_dst,
1270 					 bool connected)
1271 {
1272 	struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1273 
1274 	dst = ip6_sk_dst_check(sk, dst, fl6);
1275 	if (dst)
1276 		return dst;
1277 
1278 	dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_dst);
1279 	if (connected && !IS_ERR(dst))
1280 		ip6_sk_dst_store_flow(sk, dst_clone(dst), fl6);
1281 
1282 	return dst;
1283 }
1284 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1285 
1286 /**
1287  *      ip6_dst_lookup_tunnel - perform route lookup on tunnel
1288  *      @skb: Packet for which lookup is done
1289  *      @dev: Tunnel device
1290  *      @net: Network namespace of tunnel device
1291  *      @sock: Socket which provides route info
1292  *      @saddr: Memory to store the src ip address
1293  *      @info: Tunnel information
1294  *      @protocol: IP protocol
1295  *      @use_cache: Flag to enable cache usage
1296  *      This function performs a route lookup on a tunnel
1297  *
1298  *      It returns a valid dst pointer and stores src address to be used in
1299  *      tunnel in param saddr on success, else a pointer encoded error code.
1300  */
1301 
1302 struct dst_entry *ip6_dst_lookup_tunnel(struct sk_buff *skb,
1303 					struct net_device *dev,
1304 					struct net *net,
1305 					struct socket *sock,
1306 					struct in6_addr *saddr,
1307 					const struct ip_tunnel_info *info,
1308 					u8 protocol,
1309 					bool use_cache)
1310 {
1311 	struct dst_entry *dst = NULL;
1312 #ifdef CONFIG_DST_CACHE
1313 	struct dst_cache *dst_cache;
1314 #endif
1315 	struct flowi6 fl6;
1316 	__u8 prio;
1317 
1318 #ifdef CONFIG_DST_CACHE
1319 	dst_cache = (struct dst_cache *)&info->dst_cache;
1320 	if (use_cache) {
1321 		dst = dst_cache_get_ip6(dst_cache, saddr);
1322 		if (dst)
1323 			return dst;
1324 	}
1325 #endif
1326 	memset(&fl6, 0, sizeof(fl6));
1327 	fl6.flowi6_mark = skb->mark;
1328 	fl6.flowi6_proto = protocol;
1329 	fl6.daddr = info->key.u.ipv6.dst;
1330 	fl6.saddr = info->key.u.ipv6.src;
1331 	prio = info->key.tos;
1332 	fl6.flowlabel = ip6_make_flowinfo(prio, info->key.label);
1333 
1334 	dst = ipv6_stub->ipv6_dst_lookup_flow(net, sock->sk, &fl6,
1335 					      NULL);
1336 	if (IS_ERR(dst)) {
1337 		netdev_dbg(dev, "no route to %pI6\n", &fl6.daddr);
1338 		return ERR_PTR(-ENETUNREACH);
1339 	}
1340 	if (dst->dev == dev) { /* is this necessary? */
1341 		netdev_dbg(dev, "circular route to %pI6\n", &fl6.daddr);
1342 		dst_release(dst);
1343 		return ERR_PTR(-ELOOP);
1344 	}
1345 #ifdef CONFIG_DST_CACHE
1346 	if (use_cache)
1347 		dst_cache_set_ip6(dst_cache, dst, &fl6.saddr);
1348 #endif
1349 	*saddr = fl6.saddr;
1350 	return dst;
1351 }
1352 EXPORT_SYMBOL_GPL(ip6_dst_lookup_tunnel);
1353 
1354 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1355 					       gfp_t gfp)
1356 {
1357 	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1358 }
1359 
1360 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1361 						gfp_t gfp)
1362 {
1363 	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1364 }
1365 
1366 static void ip6_append_data_mtu(unsigned int *mtu,
1367 				int *maxfraglen,
1368 				unsigned int fragheaderlen,
1369 				struct sk_buff *skb,
1370 				struct rt6_info *rt,
1371 				unsigned int orig_mtu)
1372 {
1373 	if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1374 		if (!skb) {
1375 			/* first fragment, reserve header_len */
1376 			*mtu = orig_mtu - rt->dst.header_len;
1377 
1378 		} else {
1379 			/*
1380 			 * this fragment is not first, the headers
1381 			 * space is regarded as data space.
1382 			 */
1383 			*mtu = orig_mtu;
1384 		}
1385 		*maxfraglen = ((*mtu - fragheaderlen) & ~7)
1386 			      + fragheaderlen - sizeof(struct frag_hdr);
1387 	}
1388 }
1389 
1390 static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
1391 			  struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
1392 			  struct rt6_info *rt)
1393 {
1394 	struct ipv6_pinfo *np = inet6_sk(sk);
1395 	unsigned int mtu;
1396 	struct ipv6_txoptions *nopt, *opt = ipc6->opt;
1397 
1398 	/* callers pass dst together with a reference, set it first so
1399 	 * ip6_cork_release() can put it down even in case of an error.
1400 	 */
1401 	cork->base.dst = &rt->dst;
1402 
1403 	/*
1404 	 * setup for corking
1405 	 */
1406 	if (opt) {
1407 		if (WARN_ON(v6_cork->opt))
1408 			return -EINVAL;
1409 
1410 		nopt = v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
1411 		if (unlikely(!nopt))
1412 			return -ENOBUFS;
1413 
1414 		nopt->tot_len = sizeof(*opt);
1415 		nopt->opt_flen = opt->opt_flen;
1416 		nopt->opt_nflen = opt->opt_nflen;
1417 
1418 		nopt->dst0opt = ip6_opt_dup(opt->dst0opt, sk->sk_allocation);
1419 		if (opt->dst0opt && !nopt->dst0opt)
1420 			return -ENOBUFS;
1421 
1422 		nopt->dst1opt = ip6_opt_dup(opt->dst1opt, sk->sk_allocation);
1423 		if (opt->dst1opt && !nopt->dst1opt)
1424 			return -ENOBUFS;
1425 
1426 		nopt->hopopt = ip6_opt_dup(opt->hopopt, sk->sk_allocation);
1427 		if (opt->hopopt && !nopt->hopopt)
1428 			return -ENOBUFS;
1429 
1430 		nopt->srcrt = ip6_rthdr_dup(opt->srcrt, sk->sk_allocation);
1431 		if (opt->srcrt && !nopt->srcrt)
1432 			return -ENOBUFS;
1433 
1434 		/* need source address above miyazawa*/
1435 	}
1436 	v6_cork->hop_limit = ipc6->hlimit;
1437 	v6_cork->tclass = ipc6->tclass;
1438 	if (rt->dst.flags & DST_XFRM_TUNNEL)
1439 		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1440 		      READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst);
1441 	else
1442 		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1443 			READ_ONCE(rt->dst.dev->mtu) : dst_mtu(xfrm_dst_path(&rt->dst));
1444 	if (np->frag_size < mtu) {
1445 		if (np->frag_size)
1446 			mtu = np->frag_size;
1447 	}
1448 	cork->base.fragsize = mtu;
1449 	cork->base.gso_size = ipc6->gso_size;
1450 	cork->base.tx_flags = 0;
1451 	cork->base.mark = ipc6->sockc.mark;
1452 	sock_tx_timestamp(sk, ipc6->sockc.tsflags, &cork->base.tx_flags);
1453 
1454 	if (dst_allfrag(xfrm_dst_path(&rt->dst)))
1455 		cork->base.flags |= IPCORK_ALLFRAG;
1456 	cork->base.length = 0;
1457 
1458 	cork->base.transmit_time = ipc6->sockc.transmit_time;
1459 
1460 	return 0;
1461 }
1462 
1463 static int __ip6_append_data(struct sock *sk,
1464 			     struct sk_buff_head *queue,
1465 			     struct inet_cork_full *cork_full,
1466 			     struct inet6_cork *v6_cork,
1467 			     struct page_frag *pfrag,
1468 			     int getfrag(void *from, char *to, int offset,
1469 					 int len, int odd, struct sk_buff *skb),
1470 			     void *from, size_t length, int transhdrlen,
1471 			     unsigned int flags, struct ipcm6_cookie *ipc6)
1472 {
1473 	struct sk_buff *skb, *skb_prev = NULL;
1474 	struct inet_cork *cork = &cork_full->base;
1475 	struct flowi6 *fl6 = &cork_full->fl.u.ip6;
1476 	unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
1477 	struct ubuf_info *uarg = NULL;
1478 	int exthdrlen = 0;
1479 	int dst_exthdrlen = 0;
1480 	int hh_len;
1481 	int copy;
1482 	int err;
1483 	int offset = 0;
1484 	bool zc = false;
1485 	u32 tskey = 0;
1486 	struct rt6_info *rt = (struct rt6_info *)cork->dst;
1487 	struct ipv6_txoptions *opt = v6_cork->opt;
1488 	int csummode = CHECKSUM_NONE;
1489 	unsigned int maxnonfragsize, headersize;
1490 	unsigned int wmem_alloc_delta = 0;
1491 	bool paged, extra_uref = false;
1492 
1493 	skb = skb_peek_tail(queue);
1494 	if (!skb) {
1495 		exthdrlen = opt ? opt->opt_flen : 0;
1496 		dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
1497 	}
1498 
1499 	paged = !!cork->gso_size;
1500 	mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize;
1501 	orig_mtu = mtu;
1502 
1503 	if (cork->tx_flags & SKBTX_ANY_SW_TSTAMP &&
1504 	    sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
1505 		tskey = atomic_inc_return(&sk->sk_tskey) - 1;
1506 
1507 	hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1508 
1509 	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1510 			(opt ? opt->opt_nflen : 0);
1511 
1512 	headersize = sizeof(struct ipv6hdr) +
1513 		     (opt ? opt->opt_flen + opt->opt_nflen : 0) +
1514 		     (dst_allfrag(&rt->dst) ?
1515 		      sizeof(struct frag_hdr) : 0) +
1516 		     rt->rt6i_nfheader_len;
1517 
1518 	if (mtu <= fragheaderlen ||
1519 	    ((mtu - fragheaderlen) & ~7) + fragheaderlen <= sizeof(struct frag_hdr))
1520 		goto emsgsize;
1521 
1522 	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
1523 		     sizeof(struct frag_hdr);
1524 
1525 	/* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit
1526 	 * the first fragment
1527 	 */
1528 	if (headersize + transhdrlen > mtu)
1529 		goto emsgsize;
1530 
1531 	if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
1532 	    (sk->sk_protocol == IPPROTO_UDP ||
1533 	     sk->sk_protocol == IPPROTO_ICMPV6 ||
1534 	     sk->sk_protocol == IPPROTO_RAW)) {
1535 		ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
1536 				sizeof(struct ipv6hdr));
1537 		goto emsgsize;
1538 	}
1539 
1540 	if (ip6_sk_ignore_df(sk))
1541 		maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
1542 	else
1543 		maxnonfragsize = mtu;
1544 
1545 	if (cork->length + length > maxnonfragsize - headersize) {
1546 emsgsize:
1547 		pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0);
1548 		ipv6_local_error(sk, EMSGSIZE, fl6, pmtu);
1549 		return -EMSGSIZE;
1550 	}
1551 
1552 	/* CHECKSUM_PARTIAL only with no extension headers and when
1553 	 * we are not going to fragment
1554 	 */
1555 	if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
1556 	    headersize == sizeof(struct ipv6hdr) &&
1557 	    length <= mtu - headersize &&
1558 	    (!(flags & MSG_MORE) || cork->gso_size) &&
1559 	    rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
1560 		csummode = CHECKSUM_PARTIAL;
1561 
1562 	if ((flags & MSG_ZEROCOPY) && length) {
1563 		struct msghdr *msg = from;
1564 
1565 		if (getfrag == ip_generic_getfrag && msg->msg_ubuf) {
1566 			if (skb_zcopy(skb) && msg->msg_ubuf != skb_zcopy(skb))
1567 				return -EINVAL;
1568 
1569 			/* Leave uarg NULL if can't zerocopy, callers should
1570 			 * be able to handle it.
1571 			 */
1572 			if ((rt->dst.dev->features & NETIF_F_SG) &&
1573 			    csummode == CHECKSUM_PARTIAL) {
1574 				paged = true;
1575 				zc = true;
1576 				uarg = msg->msg_ubuf;
1577 			}
1578 		} else if (sock_flag(sk, SOCK_ZEROCOPY)) {
1579 			uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb));
1580 			if (!uarg)
1581 				return -ENOBUFS;
1582 			extra_uref = !skb_zcopy(skb);	/* only ref on new uarg */
1583 			if (rt->dst.dev->features & NETIF_F_SG &&
1584 			    csummode == CHECKSUM_PARTIAL) {
1585 				paged = true;
1586 				zc = true;
1587 			} else {
1588 				uarg_to_msgzc(uarg)->zerocopy = 0;
1589 				skb_zcopy_set(skb, uarg, &extra_uref);
1590 			}
1591 		}
1592 	}
1593 
1594 	/*
1595 	 * Let's try using as much space as possible.
1596 	 * Use MTU if total length of the message fits into the MTU.
1597 	 * Otherwise, we need to reserve fragment header and
1598 	 * fragment alignment (= 8-15 octects, in total).
1599 	 *
1600 	 * Note that we may need to "move" the data from the tail
1601 	 * of the buffer to the new fragment when we split
1602 	 * the message.
1603 	 *
1604 	 * FIXME: It may be fragmented into multiple chunks
1605 	 *        at once if non-fragmentable extension headers
1606 	 *        are too large.
1607 	 * --yoshfuji
1608 	 */
1609 
1610 	cork->length += length;
1611 	if (!skb)
1612 		goto alloc_new_skb;
1613 
1614 	while (length > 0) {
1615 		/* Check if the remaining data fits into current packet. */
1616 		copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1617 		if (copy < length)
1618 			copy = maxfraglen - skb->len;
1619 
1620 		if (copy <= 0) {
1621 			char *data;
1622 			unsigned int datalen;
1623 			unsigned int fraglen;
1624 			unsigned int fraggap;
1625 			unsigned int alloclen, alloc_extra;
1626 			unsigned int pagedlen;
1627 alloc_new_skb:
1628 			/* There's no room in the current skb */
1629 			if (skb)
1630 				fraggap = skb->len - maxfraglen;
1631 			else
1632 				fraggap = 0;
1633 			/* update mtu and maxfraglen if necessary */
1634 			if (!skb || !skb_prev)
1635 				ip6_append_data_mtu(&mtu, &maxfraglen,
1636 						    fragheaderlen, skb, rt,
1637 						    orig_mtu);
1638 
1639 			skb_prev = skb;
1640 
1641 			/*
1642 			 * If remaining data exceeds the mtu,
1643 			 * we know we need more fragment(s).
1644 			 */
1645 			datalen = length + fraggap;
1646 
1647 			if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1648 				datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
1649 			fraglen = datalen + fragheaderlen;
1650 			pagedlen = 0;
1651 
1652 			alloc_extra = hh_len;
1653 			alloc_extra += dst_exthdrlen;
1654 			alloc_extra += rt->dst.trailer_len;
1655 
1656 			/* We just reserve space for fragment header.
1657 			 * Note: this may be overallocation if the message
1658 			 * (without MSG_MORE) fits into the MTU.
1659 			 */
1660 			alloc_extra += sizeof(struct frag_hdr);
1661 
1662 			if ((flags & MSG_MORE) &&
1663 			    !(rt->dst.dev->features&NETIF_F_SG))
1664 				alloclen = mtu;
1665 			else if (!paged &&
1666 				 (fraglen + alloc_extra < SKB_MAX_ALLOC ||
1667 				  !(rt->dst.dev->features & NETIF_F_SG)))
1668 				alloclen = fraglen;
1669 			else {
1670 				alloclen = fragheaderlen + transhdrlen;
1671 				pagedlen = datalen - transhdrlen;
1672 			}
1673 			alloclen += alloc_extra;
1674 
1675 			if (datalen != length + fraggap) {
1676 				/*
1677 				 * this is not the last fragment, the trailer
1678 				 * space is regarded as data space.
1679 				 */
1680 				datalen += rt->dst.trailer_len;
1681 			}
1682 
1683 			fraglen = datalen + fragheaderlen;
1684 
1685 			copy = datalen - transhdrlen - fraggap - pagedlen;
1686 			if (copy < 0) {
1687 				err = -EINVAL;
1688 				goto error;
1689 			}
1690 			if (transhdrlen) {
1691 				skb = sock_alloc_send_skb(sk, alloclen,
1692 						(flags & MSG_DONTWAIT), &err);
1693 			} else {
1694 				skb = NULL;
1695 				if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <=
1696 				    2 * sk->sk_sndbuf)
1697 					skb = alloc_skb(alloclen,
1698 							sk->sk_allocation);
1699 				if (unlikely(!skb))
1700 					err = -ENOBUFS;
1701 			}
1702 			if (!skb)
1703 				goto error;
1704 			/*
1705 			 *	Fill in the control structures
1706 			 */
1707 			skb->protocol = htons(ETH_P_IPV6);
1708 			skb->ip_summed = csummode;
1709 			skb->csum = 0;
1710 			/* reserve for fragmentation and ipsec header */
1711 			skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1712 				    dst_exthdrlen);
1713 
1714 			/*
1715 			 *	Find where to start putting bytes
1716 			 */
1717 			data = skb_put(skb, fraglen - pagedlen);
1718 			skb_set_network_header(skb, exthdrlen);
1719 			data += fragheaderlen;
1720 			skb->transport_header = (skb->network_header +
1721 						 fragheaderlen);
1722 			if (fraggap) {
1723 				skb->csum = skb_copy_and_csum_bits(
1724 					skb_prev, maxfraglen,
1725 					data + transhdrlen, fraggap);
1726 				skb_prev->csum = csum_sub(skb_prev->csum,
1727 							  skb->csum);
1728 				data += fraggap;
1729 				pskb_trim_unique(skb_prev, maxfraglen);
1730 			}
1731 			if (copy > 0 &&
1732 			    getfrag(from, data + transhdrlen, offset,
1733 				    copy, fraggap, skb) < 0) {
1734 				err = -EFAULT;
1735 				kfree_skb(skb);
1736 				goto error;
1737 			}
1738 
1739 			offset += copy;
1740 			length -= copy + transhdrlen;
1741 			transhdrlen = 0;
1742 			exthdrlen = 0;
1743 			dst_exthdrlen = 0;
1744 
1745 			/* Only the initial fragment is time stamped */
1746 			skb_shinfo(skb)->tx_flags = cork->tx_flags;
1747 			cork->tx_flags = 0;
1748 			skb_shinfo(skb)->tskey = tskey;
1749 			tskey = 0;
1750 			skb_zcopy_set(skb, uarg, &extra_uref);
1751 
1752 			if ((flags & MSG_CONFIRM) && !skb_prev)
1753 				skb_set_dst_pending_confirm(skb, 1);
1754 
1755 			/*
1756 			 * Put the packet on the pending queue
1757 			 */
1758 			if (!skb->destructor) {
1759 				skb->destructor = sock_wfree;
1760 				skb->sk = sk;
1761 				wmem_alloc_delta += skb->truesize;
1762 			}
1763 			__skb_queue_tail(queue, skb);
1764 			continue;
1765 		}
1766 
1767 		if (copy > length)
1768 			copy = length;
1769 
1770 		if (!(rt->dst.dev->features&NETIF_F_SG) &&
1771 		    skb_tailroom(skb) >= copy) {
1772 			unsigned int off;
1773 
1774 			off = skb->len;
1775 			if (getfrag(from, skb_put(skb, copy),
1776 						offset, copy, off, skb) < 0) {
1777 				__skb_trim(skb, off);
1778 				err = -EFAULT;
1779 				goto error;
1780 			}
1781 		} else if (!zc) {
1782 			int i = skb_shinfo(skb)->nr_frags;
1783 
1784 			err = -ENOMEM;
1785 			if (!sk_page_frag_refill(sk, pfrag))
1786 				goto error;
1787 
1788 			skb_zcopy_downgrade_managed(skb);
1789 			if (!skb_can_coalesce(skb, i, pfrag->page,
1790 					      pfrag->offset)) {
1791 				err = -EMSGSIZE;
1792 				if (i == MAX_SKB_FRAGS)
1793 					goto error;
1794 
1795 				__skb_fill_page_desc(skb, i, pfrag->page,
1796 						     pfrag->offset, 0);
1797 				skb_shinfo(skb)->nr_frags = ++i;
1798 				get_page(pfrag->page);
1799 			}
1800 			copy = min_t(int, copy, pfrag->size - pfrag->offset);
1801 			if (getfrag(from,
1802 				    page_address(pfrag->page) + pfrag->offset,
1803 				    offset, copy, skb->len, skb) < 0)
1804 				goto error_efault;
1805 
1806 			pfrag->offset += copy;
1807 			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1808 			skb->len += copy;
1809 			skb->data_len += copy;
1810 			skb->truesize += copy;
1811 			wmem_alloc_delta += copy;
1812 		} else {
1813 			err = skb_zerocopy_iter_dgram(skb, from, copy);
1814 			if (err < 0)
1815 				goto error;
1816 		}
1817 		offset += copy;
1818 		length -= copy;
1819 	}
1820 
1821 	if (wmem_alloc_delta)
1822 		refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1823 	return 0;
1824 
1825 error_efault:
1826 	err = -EFAULT;
1827 error:
1828 	net_zcopy_put_abort(uarg, extra_uref);
1829 	cork->length -= length;
1830 	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1831 	refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1832 	return err;
1833 }
1834 
1835 int ip6_append_data(struct sock *sk,
1836 		    int getfrag(void *from, char *to, int offset, int len,
1837 				int odd, struct sk_buff *skb),
1838 		    void *from, size_t length, int transhdrlen,
1839 		    struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1840 		    struct rt6_info *rt, unsigned int flags)
1841 {
1842 	struct inet_sock *inet = inet_sk(sk);
1843 	struct ipv6_pinfo *np = inet6_sk(sk);
1844 	int exthdrlen;
1845 	int err;
1846 
1847 	if (flags&MSG_PROBE)
1848 		return 0;
1849 	if (skb_queue_empty(&sk->sk_write_queue)) {
1850 		/*
1851 		 * setup for corking
1852 		 */
1853 		dst_hold(&rt->dst);
1854 		err = ip6_setup_cork(sk, &inet->cork, &np->cork,
1855 				     ipc6, rt);
1856 		if (err)
1857 			return err;
1858 
1859 		inet->cork.fl.u.ip6 = *fl6;
1860 		exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1861 		length += exthdrlen;
1862 		transhdrlen += exthdrlen;
1863 	} else {
1864 		transhdrlen = 0;
1865 	}
1866 
1867 	return __ip6_append_data(sk, &sk->sk_write_queue, &inet->cork,
1868 				 &np->cork, sk_page_frag(sk), getfrag,
1869 				 from, length, transhdrlen, flags, ipc6);
1870 }
1871 EXPORT_SYMBOL_GPL(ip6_append_data);
1872 
1873 static void ip6_cork_steal_dst(struct sk_buff *skb, struct inet_cork_full *cork)
1874 {
1875 	struct dst_entry *dst = cork->base.dst;
1876 
1877 	cork->base.dst = NULL;
1878 	cork->base.flags &= ~IPCORK_ALLFRAG;
1879 	skb_dst_set(skb, dst);
1880 }
1881 
1882 static void ip6_cork_release(struct inet_cork_full *cork,
1883 			     struct inet6_cork *v6_cork)
1884 {
1885 	if (v6_cork->opt) {
1886 		struct ipv6_txoptions *opt = v6_cork->opt;
1887 
1888 		kfree(opt->dst0opt);
1889 		kfree(opt->dst1opt);
1890 		kfree(opt->hopopt);
1891 		kfree(opt->srcrt);
1892 		kfree(opt);
1893 		v6_cork->opt = NULL;
1894 	}
1895 
1896 	if (cork->base.dst) {
1897 		dst_release(cork->base.dst);
1898 		cork->base.dst = NULL;
1899 		cork->base.flags &= ~IPCORK_ALLFRAG;
1900 	}
1901 }
1902 
1903 struct sk_buff *__ip6_make_skb(struct sock *sk,
1904 			       struct sk_buff_head *queue,
1905 			       struct inet_cork_full *cork,
1906 			       struct inet6_cork *v6_cork)
1907 {
1908 	struct sk_buff *skb, *tmp_skb;
1909 	struct sk_buff **tail_skb;
1910 	struct in6_addr *final_dst;
1911 	struct ipv6_pinfo *np = inet6_sk(sk);
1912 	struct net *net = sock_net(sk);
1913 	struct ipv6hdr *hdr;
1914 	struct ipv6_txoptions *opt = v6_cork->opt;
1915 	struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
1916 	struct flowi6 *fl6 = &cork->fl.u.ip6;
1917 	unsigned char proto = fl6->flowi6_proto;
1918 
1919 	skb = __skb_dequeue(queue);
1920 	if (!skb)
1921 		goto out;
1922 	tail_skb = &(skb_shinfo(skb)->frag_list);
1923 
1924 	/* move skb->data to ip header from ext header */
1925 	if (skb->data < skb_network_header(skb))
1926 		__skb_pull(skb, skb_network_offset(skb));
1927 	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
1928 		__skb_pull(tmp_skb, skb_network_header_len(skb));
1929 		*tail_skb = tmp_skb;
1930 		tail_skb = &(tmp_skb->next);
1931 		skb->len += tmp_skb->len;
1932 		skb->data_len += tmp_skb->len;
1933 		skb->truesize += tmp_skb->truesize;
1934 		tmp_skb->destructor = NULL;
1935 		tmp_skb->sk = NULL;
1936 	}
1937 
1938 	/* Allow local fragmentation. */
1939 	skb->ignore_df = ip6_sk_ignore_df(sk);
1940 	__skb_pull(skb, skb_network_header_len(skb));
1941 
1942 	final_dst = &fl6->daddr;
1943 	if (opt && opt->opt_flen)
1944 		ipv6_push_frag_opts(skb, opt, &proto);
1945 	if (opt && opt->opt_nflen)
1946 		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);
1947 
1948 	skb_push(skb, sizeof(struct ipv6hdr));
1949 	skb_reset_network_header(skb);
1950 	hdr = ipv6_hdr(skb);
1951 
1952 	ip6_flow_hdr(hdr, v6_cork->tclass,
1953 		     ip6_make_flowlabel(net, skb, fl6->flowlabel,
1954 					ip6_autoflowlabel(net, np), fl6));
1955 	hdr->hop_limit = v6_cork->hop_limit;
1956 	hdr->nexthdr = proto;
1957 	hdr->saddr = fl6->saddr;
1958 	hdr->daddr = *final_dst;
1959 
1960 	skb->priority = sk->sk_priority;
1961 	skb->mark = cork->base.mark;
1962 	skb->tstamp = cork->base.transmit_time;
1963 
1964 	ip6_cork_steal_dst(skb, cork);
1965 	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1966 	if (proto == IPPROTO_ICMPV6) {
1967 		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1968 
1969 		ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
1970 		ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
1971 	}
1972 
1973 	ip6_cork_release(cork, v6_cork);
1974 out:
1975 	return skb;
1976 }
1977 
1978 int ip6_send_skb(struct sk_buff *skb)
1979 {
1980 	struct net *net = sock_net(skb->sk);
1981 	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
1982 	int err;
1983 
1984 	err = ip6_local_out(net, skb->sk, skb);
1985 	if (err) {
1986 		if (err > 0)
1987 			err = net_xmit_errno(err);
1988 		if (err)
1989 			IP6_INC_STATS(net, rt->rt6i_idev,
1990 				      IPSTATS_MIB_OUTDISCARDS);
1991 	}
1992 
1993 	return err;
1994 }
1995 
1996 int ip6_push_pending_frames(struct sock *sk)
1997 {
1998 	struct sk_buff *skb;
1999 
2000 	skb = ip6_finish_skb(sk);
2001 	if (!skb)
2002 		return 0;
2003 
2004 	return ip6_send_skb(skb);
2005 }
2006 EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
2007 
2008 static void __ip6_flush_pending_frames(struct sock *sk,
2009 				       struct sk_buff_head *queue,
2010 				       struct inet_cork_full *cork,
2011 				       struct inet6_cork *v6_cork)
2012 {
2013 	struct sk_buff *skb;
2014 
2015 	while ((skb = __skb_dequeue_tail(queue)) != NULL) {
2016 		if (skb_dst(skb))
2017 			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
2018 				      IPSTATS_MIB_OUTDISCARDS);
2019 		kfree_skb(skb);
2020 	}
2021 
2022 	ip6_cork_release(cork, v6_cork);
2023 }
2024 
2025 void ip6_flush_pending_frames(struct sock *sk)
2026 {
2027 	__ip6_flush_pending_frames(sk, &sk->sk_write_queue,
2028 				   &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
2029 }
2030 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
2031 
2032 struct sk_buff *ip6_make_skb(struct sock *sk,
2033 			     int getfrag(void *from, char *to, int offset,
2034 					 int len, int odd, struct sk_buff *skb),
2035 			     void *from, size_t length, int transhdrlen,
2036 			     struct ipcm6_cookie *ipc6, struct rt6_info *rt,
2037 			     unsigned int flags, struct inet_cork_full *cork)
2038 {
2039 	struct inet6_cork v6_cork;
2040 	struct sk_buff_head queue;
2041 	int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
2042 	int err;
2043 
2044 	if (flags & MSG_PROBE) {
2045 		dst_release(&rt->dst);
2046 		return NULL;
2047 	}
2048 
2049 	__skb_queue_head_init(&queue);
2050 
2051 	cork->base.flags = 0;
2052 	cork->base.addr = 0;
2053 	cork->base.opt = NULL;
2054 	v6_cork.opt = NULL;
2055 	err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt);
2056 	if (err) {
2057 		ip6_cork_release(cork, &v6_cork);
2058 		return ERR_PTR(err);
2059 	}
2060 	if (ipc6->dontfrag < 0)
2061 		ipc6->dontfrag = inet6_sk(sk)->dontfrag;
2062 
2063 	err = __ip6_append_data(sk, &queue, cork, &v6_cork,
2064 				&current->task_frag, getfrag, from,
2065 				length + exthdrlen, transhdrlen + exthdrlen,
2066 				flags, ipc6);
2067 	if (err) {
2068 		__ip6_flush_pending_frames(sk, &queue, cork, &v6_cork);
2069 		return ERR_PTR(err);
2070 	}
2071 
2072 	return __ip6_make_skb(sk, &queue, cork, &v6_cork);
2073 }
2074