xref: /openbmc/linux/net/ipv6/ip6_output.c (revision 69868c3b)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  *	IPv6 output functions
4  *	Linux INET6 implementation
5  *
6  *	Authors:
7  *	Pedro Roque		<roque@di.fc.ul.pt>
8  *
9  *	Based on linux/net/ipv4/ip_output.c
10  *
11  *	Changes:
12  *	A.N.Kuznetsov	:	arithmetic in fragmentation.
13  *				extension headers are implemented.
14  *				route changes now work.
15  *				ip6_forward does not confuse sniffers.
16  *				etc.
17  *
18  *      H. von Brand    :       Added missing #include <linux/string.h>
19  *	Imran Patel	:	frag id should be in NBO
20  *      Kazunori MIYAZAWA @USAGI
21  *			:       add ip6_append_data and related functions
22  *				for datagram xmit
23  */
24 
25 #include <linux/errno.h>
26 #include <linux/kernel.h>
27 #include <linux/string.h>
28 #include <linux/socket.h>
29 #include <linux/net.h>
30 #include <linux/netdevice.h>
31 #include <linux/if_arp.h>
32 #include <linux/in6.h>
33 #include <linux/tcp.h>
34 #include <linux/route.h>
35 #include <linux/module.h>
36 #include <linux/slab.h>
37 
38 #include <linux/bpf-cgroup.h>
39 #include <linux/netfilter.h>
40 #include <linux/netfilter_ipv6.h>
41 
42 #include <net/sock.h>
43 #include <net/snmp.h>
44 
45 #include <net/ipv6.h>
46 #include <net/ndisc.h>
47 #include <net/protocol.h>
48 #include <net/ip6_route.h>
49 #include <net/addrconf.h>
50 #include <net/rawv6.h>
51 #include <net/icmp.h>
52 #include <net/xfrm.h>
53 #include <net/checksum.h>
54 #include <linux/mroute6.h>
55 #include <net/l3mdev.h>
56 #include <net/lwtunnel.h>
57 #include <net/ip_tunnels.h>
58 
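/* Final link-layer transmit step for IPv6 output: make sure the skb has
 * enough headroom for the hardware header, loop multicast packets back to
 * local listeners when required, honour lwtunnel xmit redirects, then
 * resolve the next-hop neighbour and hand the skb to neigh_output().
 */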
59 static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
60 {
61 	struct dst_entry *dst = skb_dst(skb);
62 	struct net_device *dev = dst->dev;
63 	unsigned int hh_len = LL_RESERVED_SPACE(dev);
64 	int delta = hh_len - skb_headroom(skb);
65 	const struct in6_addr *nexthop;
66 	struct neighbour *neigh;
67 	int ret;
68 
69 	/* Be paranoid, rather than too clever. */
70 	if (unlikely(delta > 0) && dev->header_ops) {
71 		/* pskb_expand_head() might crash, if skb is shared */
72 		if (skb_shared(skb)) {
73 			struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
74 
75 			if (likely(nskb)) {
76 				if (skb->sk)
77 					skb_set_owner_w(nskb, skb->sk);
78 				consume_skb(skb);
79 			} else {
80 				kfree_skb(skb);
81 			}
82 			skb = nskb;
83 		}
84 		if (skb &&
85 		    pskb_expand_head(skb, SKB_DATA_ALIGN(delta), 0, GFP_ATOMIC)) {
86 			kfree_skb(skb);
87 			skb = NULL;
88 		}
89 		if (!skb) {
90 			IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
91 			return -ENOMEM;
92 		}
93 	}
94 
95 	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
96 		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
97 
98 		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
99 		    ((mroute6_is_socket(net, skb) &&
100 		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
101 		     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
102 					 &ipv6_hdr(skb)->saddr))) {
103 			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
104 
105 			/* Do not check for IFF_ALLMULTI; multicast routing
106 			   is not supported in any case.
107 			 */
108 			if (newskb)
109 				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
110 					net, sk, newskb, NULL, newskb->dev,
111 					dev_loopback_xmit);
112 
113 			if (ipv6_hdr(skb)->hop_limit == 0) {
114 				IP6_INC_STATS(net, idev,
115 					      IPSTATS_MIB_OUTDISCARDS);
116 				kfree_skb(skb);
117 				return 0;
118 			}
119 		}
120 
121 		IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);
122 
123 		if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
124 		    IPV6_ADDR_SCOPE_NODELOCAL &&
125 		    !(dev->flags & IFF_LOOPBACK)) {
126 			kfree_skb(skb);
127 			return 0;
128 		}
129 	}
130 
131 	if (lwtunnel_xmit_redirect(dst->lwtstate)) {
132 		int res = lwtunnel_xmit(skb);
133 
134 		if (res < 0 || res == LWTUNNEL_XMIT_DONE)
135 			return res;
136 	}
137 
138 	rcu_read_lock_bh();
139 	nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
140 	neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
141 	if (unlikely(!neigh))
142 		neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
143 	if (!IS_ERR(neigh)) {
144 		sock_confirm_neigh(skb, neigh);
145 		ret = neigh_output(neigh, skb, false);
146 		rcu_read_unlock_bh();
147 		return ret;
148 	}
149 	rcu_read_unlock_bh();
150 
151 	IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
152 	kfree_skb(skb);
153 	return -EINVAL;
154 }
155 
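/* Slow path for GSO packets whose segments would exceed the egress MTU:
 * segment the skb in software and feed each resulting segment through
 * ip6_fragment(), collecting the first error encountered.
 */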
156 static int
157 ip6_finish_output_gso_slowpath_drop(struct net *net, struct sock *sk,
158 				    struct sk_buff *skb, unsigned int mtu)
159 {
160 	struct sk_buff *segs, *nskb;
161 	netdev_features_t features;
162 	int ret = 0;
163 
164 	/* Please see corresponding comment in ip_finish_output_gso
165 	 * describing the cases where GSO segment length exceeds the
166 	 * egress MTU.
167 	 */
168 	features = netif_skb_features(skb);
169 	segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
170 	if (IS_ERR_OR_NULL(segs)) {
171 		kfree_skb(skb);
172 		return -ENOMEM;
173 	}
174 
175 	consume_skb(skb);
176 
177 	skb_list_walk_safe(segs, segs, nskb) {
178 		int err;
179 
180 		skb_mark_not_on_list(segs);
181 		err = ip6_fragment(net, sk, segs, ip6_finish_output2);
182 		if (err && ret == 0)
183 			ret = err;
184 	}
185 
186 	return ret;
187 }
188 
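/* Post-routing output step: packets that picked up an xfrm policy after
 * SNAT are re-routed through dst_output(), GSO packets that no longer fit
 * the MTU take the segmentation slow path above, oversized or allfrag
 * packets are fragmented, and everything else goes straight to
 * ip6_finish_output2().
 */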
189 static int __ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
190 {
191 	unsigned int mtu;
192 
193 #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
194 	/* Policy lookup after SNAT yielded a new policy */
195 	if (skb_dst(skb)->xfrm) {
196 		IPCB(skb)->flags |= IPSKB_REROUTED;
197 		return dst_output(net, sk, skb);
198 	}
199 #endif
200 
201 	mtu = ip6_skb_dst_mtu(skb);
202 	if (skb_is_gso(skb) && !skb_gso_validate_network_len(skb, mtu))
203 		return ip6_finish_output_gso_slowpath_drop(net, sk, skb, mtu);
204 
205 	if ((skb->len > mtu && !skb_is_gso(skb)) ||
206 	    dst_allfrag(skb_dst(skb)) ||
207 	    (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
208 		return ip6_fragment(net, sk, skb, ip6_finish_output2);
209 	else
210 		return ip6_finish_output2(net, sk, skb);
211 }
212 
213 static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
214 {
215 	int ret;
216 
217 	ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
218 	switch (ret) {
219 	case NET_XMIT_SUCCESS:
220 		return __ip6_finish_output(net, sk, skb);
221 	case NET_XMIT_CN:
222 		return __ip6_finish_output(net, sk, skb) ? : ret;
223 	default:
224 		kfree_skb(skb);
225 		return ret;
226 	}
227 }
228 
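/* dst_output() entry point for locally generated IPv6 packets: set the
 * outgoing device and protocol, drop the packet if IPv6 is disabled on
 * that device, then run the NF_INET_POST_ROUTING hook (skipped for
 * packets already marked IP6SKB_REROUTED) before ip6_finish_output().
 */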
229 int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
230 {
231 	struct net_device *dev = skb_dst(skb)->dev, *indev = skb->dev;
232 	struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
233 
234 	skb->protocol = htons(ETH_P_IPV6);
235 	skb->dev = dev;
236 
237 	if (unlikely(idev->cnf.disable_ipv6)) {
238 		IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
239 		kfree_skb(skb);
240 		return 0;
241 	}
242 
243 	return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
244 			    net, sk, skb, indev, dev,
245 			    ip6_finish_output,
246 			    !(IP6CB(skb)->flags & IP6SKB_REROUTED));
247 }
248 EXPORT_SYMBOL(ip6_output);
249 
250 bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np)
251 {
252 	if (!np->autoflowlabel_set)
253 		return ip6_default_np_autolabel(net);
254 	else
255 		return np->autoflowlabel;
256 }
257 
258 /*
259  * xmit an sk_buff (used by TCP, SCTP and DCCP)
260  * Note : the socket lock is not held for SYNACK packets, but the socket
261  * might still be modified by calls to skb_set_owner_w() and ipv6_local_error(),
262  * which use proper atomic operations or spinlocks.
263  */
264 int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
265 	     __u32 mark, struct ipv6_txoptions *opt, int tclass, u32 priority)
266 {
267 	struct net *net = sock_net(sk);
268 	const struct ipv6_pinfo *np = inet6_sk(sk);
269 	struct in6_addr *first_hop = &fl6->daddr;
270 	struct dst_entry *dst = skb_dst(skb);
271 	unsigned int head_room;
272 	struct ipv6hdr *hdr;
273 	u8  proto = fl6->flowi6_proto;
274 	int seg_len = skb->len;
275 	int hlimit = -1;
276 	u32 mtu;
277 
278 	head_room = sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
279 	if (opt)
280 		head_room += opt->opt_nflen + opt->opt_flen;
281 
282 	if (unlikely(skb_headroom(skb) < head_room)) {
283 		struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
284 		if (!skb2) {
285 			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
286 				      IPSTATS_MIB_OUTDISCARDS);
287 			kfree_skb(skb);
288 			return -ENOBUFS;
289 		}
290 		if (skb->sk)
291 			skb_set_owner_w(skb2, skb->sk);
292 		consume_skb(skb);
293 		skb = skb2;
294 	}
295 
296 	if (opt) {
297 		seg_len += opt->opt_nflen + opt->opt_flen;
298 
299 		if (opt->opt_flen)
300 			ipv6_push_frag_opts(skb, opt, &proto);
301 
302 		if (opt->opt_nflen)
303 			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
304 					     &fl6->saddr);
305 	}
306 
307 	skb_push(skb, sizeof(struct ipv6hdr));
308 	skb_reset_network_header(skb);
309 	hdr = ipv6_hdr(skb);
310 
311 	/*
312 	 *	Fill in the IPv6 header
313 	 */
314 	if (np)
315 		hlimit = np->hop_limit;
316 	if (hlimit < 0)
317 		hlimit = ip6_dst_hoplimit(dst);
318 
319 	ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
320 				ip6_autoflowlabel(net, np), fl6));
321 
322 	hdr->payload_len = htons(seg_len);
323 	hdr->nexthdr = proto;
324 	hdr->hop_limit = hlimit;
325 
326 	hdr->saddr = fl6->saddr;
327 	hdr->daddr = *first_hop;
328 
329 	skb->protocol = htons(ETH_P_IPV6);
330 	skb->priority = priority;
331 	skb->mark = mark;
332 
333 	mtu = dst_mtu(dst);
334 	if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
335 		IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
336 			      IPSTATS_MIB_OUT, skb->len);
337 
338 		/* if the egress device is enslaved to an L3 master device, pass the
339 		 * skb to its handler for processing
340 		 */
341 		skb = l3mdev_ip6_out((struct sock *)sk, skb);
342 		if (unlikely(!skb))
343 			return 0;
344 
345 		/* hooks should never assume the socket lock is held;
346 		 * we promote our socket to non-const.
347 		 */
348 		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
349 			       net, (struct sock *)sk, skb, NULL, dst->dev,
350 			       dst_output);
351 	}
352 
353 	skb->dev = dst->dev;
354 	/* ipv6_local_error() does not require the socket lock;
355 	 * we promote our socket to non-const.
356 	 */
357 	ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);
358 
359 	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
360 	kfree_skb(skb);
361 	return -EMSGSIZE;
362 }
363 EXPORT_SYMBOL(ip6_xmit);
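
/* A rough sketch of how a connection-oriented caller typically drives
 * ip6_xmit(); the exact flow setup varies per protocol (TCP, for example,
 * routes via its own helpers before getting here), so treat this only as
 * an illustration of the expected pre-conditions:
 *
 *	struct flowi6 fl6 = { .flowi6_proto = IPPROTO_TCP, ... };
 *	struct dst_entry *dst;
 *
 *	dst = ip6_dst_lookup_flow(net, sk, &fl6, NULL);
 *	if (!IS_ERR(dst)) {
 *		skb_dst_set(skb, dst);
 *		ip6_xmit(sk, skb, &fl6, sk->sk_mark, opt, tclass, priority);
 *	}
 */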
364 
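/* Deliver a Router Alert packet to every raw socket on ip6_ra_chain whose
 * selector matches; returns 1 and consumes the skb if at least one
 * listener took it, 0 otherwise.
 */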
365 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
366 {
367 	struct ip6_ra_chain *ra;
368 	struct sock *last = NULL;
369 
370 	read_lock(&ip6_ra_lock);
371 	for (ra = ip6_ra_chain; ra; ra = ra->next) {
372 		struct sock *sk = ra->sk;
373 		if (sk && ra->sel == sel &&
374 		    (!sk->sk_bound_dev_if ||
375 		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
376 			struct ipv6_pinfo *np = inet6_sk(sk);
377 
378 			if (np && np->rtalert_isolate &&
379 			    !net_eq(sock_net(sk), dev_net(skb->dev))) {
380 				continue;
381 			}
382 			if (last) {
383 				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
384 				if (skb2)
385 					rawv6_rcv(last, skb2);
386 			}
387 			last = sk;
388 		}
389 	}
390 
391 	if (last) {
392 		rawv6_rcv(last, skb);
393 		read_unlock(&ip6_ra_lock);
394 		return 1;
395 	}
396 	read_unlock(&ip6_ra_lock);
397 	return 0;
398 }
399 
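/* Decide how a packet addressed to a proxied (proxy NDP) address should be
 * handled: returns 1 for NDISC messages that must go to the local input
 * path, -1 when the packet has to be dropped (link-local destination),
 * and 0 when it can simply be forwarded.
 */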
400 static int ip6_forward_proxy_check(struct sk_buff *skb)
401 {
402 	struct ipv6hdr *hdr = ipv6_hdr(skb);
403 	u8 nexthdr = hdr->nexthdr;
404 	__be16 frag_off;
405 	int offset;
406 
407 	if (ipv6_ext_hdr(nexthdr)) {
408 		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
409 		if (offset < 0)
410 			return 0;
411 	} else
412 		offset = sizeof(struct ipv6hdr);
413 
414 	if (nexthdr == IPPROTO_ICMPV6) {
415 		struct icmp6hdr *icmp6;
416 
417 		if (!pskb_may_pull(skb, (skb_network_header(skb) +
418 					 offset + 1 - skb->data)))
419 			return 0;
420 
421 		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
422 
423 		switch (icmp6->icmp6_type) {
424 		case NDISC_ROUTER_SOLICITATION:
425 		case NDISC_ROUTER_ADVERTISEMENT:
426 		case NDISC_NEIGHBOUR_SOLICITATION:
427 		case NDISC_NEIGHBOUR_ADVERTISEMENT:
428 		case NDISC_REDIRECT:
429 			/* A unicast neighbor discovery message destined
430 			 * to the proxied address must be passed to the
431 			 * input function so that we can react to it.
432 			 */
433 			return 1;
434 		default:
435 			break;
436 		}
437 	}
438 
439 	/*
440 	 * The proxying router can't forward traffic sent to a link-local
441 	 * address, so signal the sender and discard the packet. This
442 	 * behavior is clarified by the MIPv6 specification.
443 	 */
444 	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
445 		dst_link_failure(skb);
446 		return -1;
447 	}
448 
449 	return 0;
450 }
451 
452 static inline int ip6_forward_finish(struct net *net, struct sock *sk,
453 				     struct sk_buff *skb)
454 {
455 	struct dst_entry *dst = skb_dst(skb);
456 
457 	__IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
458 	__IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
459 
460 #ifdef CONFIG_NET_SWITCHDEV
461 	if (skb->offload_l3_fwd_mark) {
462 		consume_skb(skb);
463 		return 0;
464 	}
465 #endif
466 
467 	skb->tstamp = 0;
468 	return dst_output(net, sk, skb);
469 }
470 
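/* Forwarding-time MTU check: the packet is too big when it exceeds the MTU
 * and cannot be excused by ignore_df (conntrack defrag case, provided its
 * recorded frag_max_size still fits) or by being a GSO packet whose
 * individual segments all fit.
 */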
471 static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
472 {
473 	if (skb->len <= mtu)
474 		return false;
475 
476 	/* ipv6 conntrack defrag sets max_frag_size + ignore_df */
477 	if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
478 		return true;
479 
480 	if (skb->ignore_df)
481 		return false;
482 
483 	if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
484 		return false;
485 
486 	return true;
487 }
488 
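/* Forwarding path entry point: validate that the packet may be forwarded
 * (forwarding enabled, not owned by a local socket, passes XFRM policy),
 * hand Router Alert packets to interested raw sockets, check and decrement
 * the hop limit, handle proxy NDP and redirects, enforce the path MTU, and
 * finally run the NF_INET_FORWARD hook towards ip6_forward_finish().
 */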
489 int ip6_forward(struct sk_buff *skb)
490 {
491 	struct inet6_dev *idev = __in6_dev_get_safely(skb->dev);
492 	struct dst_entry *dst = skb_dst(skb);
493 	struct ipv6hdr *hdr = ipv6_hdr(skb);
494 	struct inet6_skb_parm *opt = IP6CB(skb);
495 	struct net *net = dev_net(dst->dev);
496 	u32 mtu;
497 
498 	if (net->ipv6.devconf_all->forwarding == 0)
499 		goto error;
500 
501 	if (skb->pkt_type != PACKET_HOST)
502 		goto drop;
503 
504 	if (unlikely(skb->sk))
505 		goto drop;
506 
507 	if (skb_warn_if_lro(skb))
508 		goto drop;
509 
510 	if (!net->ipv6.devconf_all->disable_policy &&
511 	    !idev->cnf.disable_policy &&
512 	    !xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
513 		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
514 		goto drop;
515 	}
516 
517 	skb_forward_csum(skb);
518 
519 	/*
520 	 *	We do NOT do any processing on
521 	 *	RA packets; they are pushed to user level AS IS,
522 	 *	without any guarantee that the application will be able
523 	 *	to interpret them. The reason is that we
524 	 *	cannot do anything clever here.
525 	 *
526 	 *	We are not the end node, so if the packet contains
527 	 *	AH/ESP we cannot do anything with it.
528 	 *	Defragmentation would also be a mistake: RA packets
529 	 *	cannot be fragmented, because there is no guarantee
530 	 *	that different fragments will go along one path. --ANK
531 	 */
532 	if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
533 		if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
534 			return 0;
535 	}
536 
537 	/*
538 	 *	check and decrement ttl
539 	 */
540 	if (hdr->hop_limit <= 1) {
541 		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
542 		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
543 
544 		kfree_skb(skb);
545 		return -ETIMEDOUT;
546 	}
547 
548 	/* XXX: idev->cnf.proxy_ndp? */
549 	if (net->ipv6.devconf_all->proxy_ndp &&
550 	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
551 		int proxied = ip6_forward_proxy_check(skb);
552 		if (proxied > 0)
553 			return ip6_input(skb);
554 		else if (proxied < 0) {
555 			__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
556 			goto drop;
557 		}
558 	}
559 
560 	if (!xfrm6_route_forward(skb)) {
561 		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
562 		goto drop;
563 	}
564 	dst = skb_dst(skb);
565 
566 	/* IPv6 specs say nothing about it, but it is clear that we cannot
567 	   send redirects to source routed frames.
568 	   We don't send redirects to frames decapsulated from IPsec.
569 	 */
570 	if (IP6CB(skb)->iif == dst->dev->ifindex &&
571 	    opt->srcrt == 0 && !skb_sec_path(skb)) {
572 		struct in6_addr *target = NULL;
573 		struct inet_peer *peer;
574 		struct rt6_info *rt;
575 
576 		/*
577 		 *	incoming and outgoing devices are the same;
578 		 *	send a redirect.
579 		 */
580 
581 		rt = (struct rt6_info *) dst;
582 		if (rt->rt6i_flags & RTF_GATEWAY)
583 			target = &rt->rt6i_gateway;
584 		else
585 			target = &hdr->daddr;
586 
587 		peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);
588 
589 		/* Limit redirects both by destination (here)
590 		   and by source (inside ndisc_send_redirect)
591 		 */
592 		if (inet_peer_xrlim_allow(peer, 1*HZ))
593 			ndisc_send_redirect(skb, target);
594 		if (peer)
595 			inet_putpeer(peer);
596 	} else {
597 		int addrtype = ipv6_addr_type(&hdr->saddr);
598 
599 		/* This check is security critical. */
600 		if (addrtype == IPV6_ADDR_ANY ||
601 		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
602 			goto error;
603 		if (addrtype & IPV6_ADDR_LINKLOCAL) {
604 			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
605 				    ICMPV6_NOT_NEIGHBOUR, 0);
606 			goto error;
607 		}
608 	}
609 
610 	mtu = ip6_dst_mtu_forward(dst);
611 	if (mtu < IPV6_MIN_MTU)
612 		mtu = IPV6_MIN_MTU;
613 
614 	if (ip6_pkt_too_big(skb, mtu)) {
615 		/* Again, force OUTPUT device used as source address */
616 		skb->dev = dst->dev;
617 		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
618 		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INTOOBIGERRORS);
619 		__IP6_INC_STATS(net, ip6_dst_idev(dst),
620 				IPSTATS_MIB_FRAGFAILS);
621 		kfree_skb(skb);
622 		return -EMSGSIZE;
623 	}
624 
625 	if (skb_cow(skb, dst->dev->hard_header_len)) {
626 		__IP6_INC_STATS(net, ip6_dst_idev(dst),
627 				IPSTATS_MIB_OUTDISCARDS);
628 		goto drop;
629 	}
630 
631 	hdr = ipv6_hdr(skb);
632 
633 	/* Mangling the hop limit is delayed until after the skb COW */
634 
635 	hdr->hop_limit--;
636 
637 	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
638 		       net, NULL, skb, skb->dev, dst->dev,
639 		       ip6_forward_finish);
640 
641 error:
642 	__IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
643 drop:
644 	kfree_skb(skb);
645 	return -EINVAL;
646 }
647 
648 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
649 {
650 	to->pkt_type = from->pkt_type;
651 	to->priority = from->priority;
652 	to->protocol = from->protocol;
653 	skb_dst_drop(to);
654 	skb_dst_set(to, dst_clone(skb_dst(from)));
655 	to->dev = from->dev;
656 	to->mark = from->mark;
657 
658 	skb_copy_hash(to, from);
659 
660 #ifdef CONFIG_NET_SCHED
661 	to->tc_index = from->tc_index;
662 #endif
663 	nf_copy(to, from);
664 	skb_ext_copy(to, from);
665 	skb_copy_secmark(to, from);
666 }
667 
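/* Fast-path fragmentation helpers: when the packet already carries a
 * frag_list with suitably sized fragments, ip6_fraglist_init() turns the
 * head skb into the first fragment (inserting the Fragment header) and
 * ip6_fraglist_prepare() fixes up each queued fragment in turn; see the
 * fast path in ip6_fragment() below.
 */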
668 int ip6_fraglist_init(struct sk_buff *skb, unsigned int hlen, u8 *prevhdr,
669 		      u8 nexthdr, __be32 frag_id,
670 		      struct ip6_fraglist_iter *iter)
671 {
672 	unsigned int first_len;
673 	struct frag_hdr *fh;
674 
675 	/* BUILD HEADER */
676 	*prevhdr = NEXTHDR_FRAGMENT;
677 	iter->tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
678 	if (!iter->tmp_hdr)
679 		return -ENOMEM;
680 
681 	iter->frag = skb_shinfo(skb)->frag_list;
682 	skb_frag_list_init(skb);
683 
684 	iter->offset = 0;
685 	iter->hlen = hlen;
686 	iter->frag_id = frag_id;
687 	iter->nexthdr = nexthdr;
688 
689 	__skb_pull(skb, hlen);
690 	fh = __skb_push(skb, sizeof(struct frag_hdr));
691 	__skb_push(skb, hlen);
692 	skb_reset_network_header(skb);
693 	memcpy(skb_network_header(skb), iter->tmp_hdr, hlen);
694 
695 	fh->nexthdr = nexthdr;
696 	fh->reserved = 0;
697 	fh->frag_off = htons(IP6_MF);
698 	fh->identification = frag_id;
699 
700 	first_len = skb_pagelen(skb);
701 	skb->data_len = first_len - skb_headlen(skb);
702 	skb->len = first_len;
703 	ipv6_hdr(skb)->payload_len = htons(first_len - sizeof(struct ipv6hdr));
704 
705 	return 0;
706 }
707 EXPORT_SYMBOL(ip6_fraglist_init);
708 
709 void ip6_fraglist_prepare(struct sk_buff *skb,
710 			  struct ip6_fraglist_iter *iter)
711 {
712 	struct sk_buff *frag = iter->frag;
713 	unsigned int hlen = iter->hlen;
714 	struct frag_hdr *fh;
715 
716 	frag->ip_summed = CHECKSUM_NONE;
717 	skb_reset_transport_header(frag);
718 	fh = __skb_push(frag, sizeof(struct frag_hdr));
719 	__skb_push(frag, hlen);
720 	skb_reset_network_header(frag);
721 	memcpy(skb_network_header(frag), iter->tmp_hdr, hlen);
722 	iter->offset += skb->len - hlen - sizeof(struct frag_hdr);
723 	fh->nexthdr = iter->nexthdr;
724 	fh->reserved = 0;
725 	fh->frag_off = htons(iter->offset);
726 	if (frag->next)
727 		fh->frag_off |= htons(IP6_MF);
728 	fh->identification = iter->frag_id;
729 	ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
730 	ip6_copy_metadata(frag, skb);
731 }
732 EXPORT_SYMBOL(ip6_fraglist_prepare);
733 
734 void ip6_frag_init(struct sk_buff *skb, unsigned int hlen, unsigned int mtu,
735 		   unsigned short needed_tailroom, int hdr_room, u8 *prevhdr,
736 		   u8 nexthdr, __be32 frag_id, struct ip6_frag_state *state)
737 {
738 	state->prevhdr = prevhdr;
739 	state->nexthdr = nexthdr;
740 	state->frag_id = frag_id;
741 
742 	state->hlen = hlen;
743 	state->mtu = mtu;
744 
745 	state->left = skb->len - hlen;	/* Space per frame */
746 	state->ptr = hlen;		/* Where to start from */
747 
748 	state->hroom = hdr_room;
749 	state->troom = needed_tailroom;
750 
751 	state->offset = 0;
752 }
753 EXPORT_SYMBOL(ip6_frag_init);
754 
755 struct sk_buff *ip6_frag_next(struct sk_buff *skb, struct ip6_frag_state *state)
756 {
757 	u8 *prevhdr = state->prevhdr, *fragnexthdr_offset;
758 	struct sk_buff *frag;
759 	struct frag_hdr *fh;
760 	unsigned int len;
761 
762 	len = state->left;
763 	/* IF: it doesn't fit, use 'mtu' - the data space left */
764 	if (len > state->mtu)
765 		len = state->mtu;
766 	/* IF: we are not sending up to and including the packet end
767 	   then align the next start on an eight byte boundary */
768 	if (len < state->left)
769 		len &= ~7;
770 
771 	/* Allocate buffer */
772 	frag = alloc_skb(len + state->hlen + sizeof(struct frag_hdr) +
773 			 state->hroom + state->troom, GFP_ATOMIC);
774 	if (!frag)
775 		return ERR_PTR(-ENOMEM);
776 
777 	/*
778 	 *	Set up data on packet
779 	 */
780 
781 	ip6_copy_metadata(frag, skb);
782 	skb_reserve(frag, state->hroom);
783 	skb_put(frag, len + state->hlen + sizeof(struct frag_hdr));
784 	skb_reset_network_header(frag);
785 	fh = (struct frag_hdr *)(skb_network_header(frag) + state->hlen);
786 	frag->transport_header = (frag->network_header + state->hlen +
787 				  sizeof(struct frag_hdr));
788 
789 	/*
790 	 *	Charge the memory for the fragment to any owner
791 	 *	it might possess
792 	 */
793 	if (skb->sk)
794 		skb_set_owner_w(frag, skb->sk);
795 
796 	/*
797 	 *	Copy the packet header into the new buffer.
798 	 */
799 	skb_copy_from_linear_data(skb, skb_network_header(frag), state->hlen);
800 
801 	fragnexthdr_offset = skb_network_header(frag);
802 	fragnexthdr_offset += prevhdr - skb_network_header(skb);
803 	*fragnexthdr_offset = NEXTHDR_FRAGMENT;
804 
805 	/*
806 	 *	Build fragment header.
807 	 */
808 	fh->nexthdr = state->nexthdr;
809 	fh->reserved = 0;
810 	fh->identification = state->frag_id;
811 
812 	/*
813 	 *	Copy a block of the IP datagram.
814 	 */
815 	BUG_ON(skb_copy_bits(skb, state->ptr, skb_transport_header(frag),
816 			     len));
817 	state->left -= len;
818 
819 	fh->frag_off = htons(state->offset);
820 	if (state->left > 0)
821 		fh->frag_off |= htons(IP6_MF);
822 	ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
823 
824 	state->ptr += len;
825 	state->offset += len;
826 
827 	return frag;
828 }
829 EXPORT_SYMBOL(ip6_frag_next);
830 
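/* Fragment an IPv6 packet and pass each fragment to @output. The fast
 * path reuses an existing frag_list when the geometry allows it; otherwise
 * the slow path copies the payload into freshly allocated fragments via
 * ip6_frag_init()/ip6_frag_next().
 */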
831 int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
832 		 int (*output)(struct net *, struct sock *, struct sk_buff *))
833 {
834 	struct sk_buff *frag;
835 	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
836 	struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
837 				inet6_sk(skb->sk) : NULL;
838 	struct ip6_frag_state state;
839 	unsigned int mtu, hlen, nexthdr_offset;
840 	ktime_t tstamp = skb->tstamp;
841 	int hroom, err = 0;
842 	__be32 frag_id;
843 	u8 *prevhdr, nexthdr = 0;
844 
845 	err = ip6_find_1stfragopt(skb, &prevhdr);
846 	if (err < 0)
847 		goto fail;
848 	hlen = err;
849 	nexthdr = *prevhdr;
850 	nexthdr_offset = prevhdr - skb_network_header(skb);
851 
852 	mtu = ip6_skb_dst_mtu(skb);
853 
854 	/* We must not fragment if the socket is set to force MTU discovery
855 	 * or if the skb is not generated by a local socket.
856 	 */
857 	if (unlikely(!skb->ignore_df && skb->len > mtu))
858 		goto fail_toobig;
859 
860 	if (IP6CB(skb)->frag_max_size) {
861 		if (IP6CB(skb)->frag_max_size > mtu)
862 			goto fail_toobig;
863 
864 		/* don't send fragments larger than what we received */
865 		mtu = IP6CB(skb)->frag_max_size;
866 		if (mtu < IPV6_MIN_MTU)
867 			mtu = IPV6_MIN_MTU;
868 	}
869 
870 	if (np && np->frag_size < mtu) {
871 		if (np->frag_size)
872 			mtu = np->frag_size;
873 	}
874 	if (mtu < hlen + sizeof(struct frag_hdr) + 8)
875 		goto fail_toobig;
876 	mtu -= hlen + sizeof(struct frag_hdr);
877 
878 	frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
879 				    &ipv6_hdr(skb)->saddr);
880 
881 	if (skb->ip_summed == CHECKSUM_PARTIAL &&
882 	    (err = skb_checksum_help(skb)))
883 		goto fail;
884 
885 	prevhdr = skb_network_header(skb) + nexthdr_offset;
886 	hroom = LL_RESERVED_SPACE(rt->dst.dev);
887 	if (skb_has_frag_list(skb)) {
888 		unsigned int first_len = skb_pagelen(skb);
889 		struct ip6_fraglist_iter iter;
890 		struct sk_buff *frag2;
891 
892 		if (first_len - hlen > mtu ||
893 		    ((first_len - hlen) & 7) ||
894 		    skb_cloned(skb) ||
895 		    skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
896 			goto slow_path;
897 
898 		skb_walk_frags(skb, frag) {
899 			/* Correct geometry. */
900 			if (frag->len > mtu ||
901 			    ((frag->len & 7) && frag->next) ||
902 			    skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
903 				goto slow_path_clean;
904 
905 			/* Partially cloned skb? */
906 			if (skb_shared(frag))
907 				goto slow_path_clean;
908 
909 			BUG_ON(frag->sk);
910 			if (skb->sk) {
911 				frag->sk = skb->sk;
912 				frag->destructor = sock_wfree;
913 			}
914 			skb->truesize -= frag->truesize;
915 		}
916 
917 		err = ip6_fraglist_init(skb, hlen, prevhdr, nexthdr, frag_id,
918 					&iter);
919 		if (err < 0)
920 			goto fail;
921 
922 		for (;;) {
923 			/* Prepare the header of the next fragment
924 			 * before the previous one is sent. */
925 			if (iter.frag)
926 				ip6_fraglist_prepare(skb, &iter);
927 
928 			skb->tstamp = tstamp;
929 			err = output(net, sk, skb);
930 			if (!err)
931 				IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
932 					      IPSTATS_MIB_FRAGCREATES);
933 
934 			if (err || !iter.frag)
935 				break;
936 
937 			skb = ip6_fraglist_next(&iter);
938 		}
939 
940 		kfree(iter.tmp_hdr);
941 
942 		if (err == 0) {
943 			IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
944 				      IPSTATS_MIB_FRAGOKS);
945 			return 0;
946 		}
947 
948 		kfree_skb_list(iter.frag);
949 
950 		IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
951 			      IPSTATS_MIB_FRAGFAILS);
952 		return err;
953 
954 slow_path_clean:
955 		skb_walk_frags(skb, frag2) {
956 			if (frag2 == frag)
957 				break;
958 			frag2->sk = NULL;
959 			frag2->destructor = NULL;
960 			skb->truesize += frag2->truesize;
961 		}
962 	}
963 
964 slow_path:
965 	/*
966 	 *	Fragment the datagram.
967 	 */
968 
969 	ip6_frag_init(skb, hlen, mtu, rt->dst.dev->needed_tailroom,
970 		      LL_RESERVED_SPACE(rt->dst.dev), prevhdr, nexthdr, frag_id,
971 		      &state);
972 
973 	/*
974 	 *	Keep copying data until we run out.
975 	 */
976 
977 	while (state.left > 0) {
978 		frag = ip6_frag_next(skb, &state);
979 		if (IS_ERR(frag)) {
980 			err = PTR_ERR(frag);
981 			goto fail;
982 		}
983 
984 		/*
985 		 *	Put this fragment into the sending queue.
986 		 */
987 		frag->tstamp = tstamp;
988 		err = output(net, sk, frag);
989 		if (err)
990 			goto fail;
991 
992 		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
993 			      IPSTATS_MIB_FRAGCREATES);
994 	}
995 	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
996 		      IPSTATS_MIB_FRAGOKS);
997 	consume_skb(skb);
998 	return err;
999 
1000 fail_toobig:
1001 	if (skb->sk && dst_allfrag(skb_dst(skb)))
1002 		sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);
1003 
1004 	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
1005 	err = -EMSGSIZE;
1006 
1007 fail:
1008 	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
1009 		      IPSTATS_MIB_FRAGFAILS);
1010 	kfree_skb(skb);
1011 	return err;
1012 }
1013 
1014 static inline int ip6_rt_check(const struct rt6key *rt_key,
1015 			       const struct in6_addr *fl_addr,
1016 			       const struct in6_addr *addr_cache)
1017 {
1018 	return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
1019 		(!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
1020 }
1021 
1022 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
1023 					  struct dst_entry *dst,
1024 					  const struct flowi6 *fl6)
1025 {
1026 	struct ipv6_pinfo *np = inet6_sk(sk);
1027 	struct rt6_info *rt;
1028 
1029 	if (!dst)
1030 		goto out;
1031 
1032 	if (dst->ops->family != AF_INET6) {
1033 		dst_release(dst);
1034 		return NULL;
1035 	}
1036 
1037 	rt = (struct rt6_info *)dst;
1038 	/* Yes, checking route validity in the not-connected
1039 	 * case is not very simple. Keep in mind that we do
1040 	 * not support routing by source, TOS,
1041 	 * or MSG_DONTROUTE		--ANK (980726)
1042 	 *
1043 	 * 1. ip6_rt_check(): If the route is a host route,
1044 	 *    check that the cached destination is current.
1045 	 *    If it is a network route, we may still
1046 	 *    check its validity using a saved pointer
1047 	 *    to the last used address: daddr_cache.
1048 	 *    We do not want to save the whole address now
1049 	 *    (because the main consumer of this service
1050 	 *    is TCP, which does not have this problem),
1051 	 *    so this last trick works only on connected
1052 	 *    sockets.
1053 	 * 2. oif should also be the same.
1054 	 */
1055 	if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
1056 #ifdef CONFIG_IPV6_SUBTREES
1057 	    ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
1058 #endif
1059 	   (!(fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) &&
1060 	      (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex))) {
1061 		dst_release(dst);
1062 		dst = NULL;
1063 	}
1064 
1065 out:
1066 	return dst;
1067 }
1068 
1069 static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
1070 			       struct dst_entry **dst, struct flowi6 *fl6)
1071 {
1072 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
1073 	struct neighbour *n;
1074 	struct rt6_info *rt;
1075 #endif
1076 	int err;
1077 	int flags = 0;
1078 
1079 	/* The correct way to handle this would be to do
1080 	 * ip6_route_get_saddr, and then ip6_route_output; however,
1081 	 * the route-specific preferred source forces the
1082 	 * ip6_route_output call _before_ ip6_route_get_saddr.
1083 	 *
1084 	 * In source specific routing (no src=any default route),
1085 	 * ip6_route_output will fail given src=any saddr, though, so
1086 	 * that's why we try it again later.
1087 	 */
1088 	if (ipv6_addr_any(&fl6->saddr)) {
1089 		struct fib6_info *from;
1090 		struct rt6_info *rt;
1091 
1092 		*dst = ip6_route_output(net, sk, fl6);
1093 		rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;
1094 
1095 		rcu_read_lock();
1096 		from = rt ? rcu_dereference(rt->from) : NULL;
1097 		err = ip6_route_get_saddr(net, from, &fl6->daddr,
1098 					  sk ? inet6_sk(sk)->srcprefs : 0,
1099 					  &fl6->saddr);
1100 		rcu_read_unlock();
1101 
1102 		if (err)
1103 			goto out_err_release;
1104 
1105 		/* If we had an erroneous initial result, pretend it
1106 		 * never existed and let the SA-enabled version take
1107 		 * over.
1108 		 */
1109 		if ((*dst)->error) {
1110 			dst_release(*dst);
1111 			*dst = NULL;
1112 		}
1113 
1114 		if (fl6->flowi6_oif)
1115 			flags |= RT6_LOOKUP_F_IFACE;
1116 	}
1117 
1118 	if (!*dst)
1119 		*dst = ip6_route_output_flags(net, sk, fl6, flags);
1120 
1121 	err = (*dst)->error;
1122 	if (err)
1123 		goto out_err_release;
1124 
1125 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
1126 	/*
1127 	 * If the dst entry we've looked up has a neighbour
1128 	 * entry that is in the INCOMPLETE state, and the
1129 	 * source address from the flow is marked as
1130 	 * OPTIMISTIC, we release the found dst entry and
1131 	 * replace it instead with the dst entry of the
1132 	 * next-hop router.
1133 	 */
1134 	rt = (struct rt6_info *) *dst;
1135 	rcu_read_lock_bh();
1136 	n = __ipv6_neigh_lookup_noref(rt->dst.dev,
1137 				      rt6_nexthop(rt, &fl6->daddr));
1138 	err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
1139 	rcu_read_unlock_bh();
1140 
1141 	if (err) {
1142 		struct inet6_ifaddr *ifp;
1143 		struct flowi6 fl_gw6;
1144 		int redirect;
1145 
1146 		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
1147 				      (*dst)->dev, 1);
1148 
1149 		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
1150 		if (ifp)
1151 			in6_ifa_put(ifp);
1152 
1153 		if (redirect) {
1154 			/*
1155 			 * We need to get the dst entry for the
1156 			 * default router instead
1157 			 */
1158 			dst_release(*dst);
1159 			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
1160 			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
1161 			*dst = ip6_route_output(net, sk, &fl_gw6);
1162 			err = (*dst)->error;
1163 			if (err)
1164 				goto out_err_release;
1165 		}
1166 	}
1167 #endif
1168 	if (ipv6_addr_v4mapped(&fl6->saddr) &&
1169 	    !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
1170 		err = -EAFNOSUPPORT;
1171 		goto out_err_release;
1172 	}
1173 
1174 	return 0;
1175 
1176 out_err_release:
1177 	dst_release(*dst);
1178 	*dst = NULL;
1179 
1180 	if (err == -ENETUNREACH)
1181 		IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
1182 	return err;
1183 }
1184 
1185 /**
1186  *	ip6_dst_lookup - perform route lookup on flow
1187  *	@net: Network namespace to perform lookup in
1188  *	@sk: socket which provides route info
1189  *	@dst: pointer to dst_entry * for result
1190  *	@fl6: flow to lookup
1191  *
1192  *	This function performs a route lookup on the given flow.
1193  *
1194  *	It returns zero on success, or a standard errno code on error.
1195  */
1196 int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
1197 		   struct flowi6 *fl6)
1198 {
1199 	*dst = NULL;
1200 	return ip6_dst_lookup_tail(net, sk, dst, fl6);
1201 }
1202 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1203 
1204 /**
1205  *	ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1206  *	@net: Network namespace to perform lookup in
1207  *	@sk: socket which provides route info
1208  *	@fl6: flow to lookup
1209  *	@final_dst: final destination address for ipsec lookup
1210  *
1211  *	This function performs a route lookup on the given flow.
1212  *
1213  *	It returns a valid dst pointer on success, or a pointer encoded
1214  *	error code.
1215  */
1216 struct dst_entry *ip6_dst_lookup_flow(struct net *net, const struct sock *sk, struct flowi6 *fl6,
1217 				      const struct in6_addr *final_dst)
1218 {
1219 	struct dst_entry *dst = NULL;
1220 	int err;
1221 
1222 	err = ip6_dst_lookup_tail(net, sk, &dst, fl6);
1223 	if (err)
1224 		return ERR_PTR(err);
1225 	if (final_dst)
1226 		fl6->daddr = *final_dst;
1227 
1228 	return xfrm_lookup_route(net, dst, flowi6_to_flowi(fl6), sk, 0);
1229 }
1230 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
1231 
1232 /**
1233  *	ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1234  *	@sk: socket which provides the dst cache and route info
1235  *	@fl6: flow to lookup
1236  *	@final_dst: final destination address for ipsec lookup
1237  *	@connected: whether @sk is connected or not
1238  *
1239  *	This function performs a route lookup on the given flow with the
1240  *	possibility of using the cached route in the socket if it is valid.
1241  *	It will take the socket dst lock when operating on the dst cache.
1242  *	As a result, this function can only be used in process context.
1243  *
1244  *	In addition, for a connected socket, cache the dst in the socket
1245  *	if the current cache is not valid.
1246  *
1247  *	It returns a valid dst pointer on success, or a pointer encoded
1248  *	error code.
1249  */
1250 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1251 					 const struct in6_addr *final_dst,
1252 					 bool connected)
1253 {
1254 	struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1255 
1256 	dst = ip6_sk_dst_check(sk, dst, fl6);
1257 	if (dst)
1258 		return dst;
1259 
1260 	dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_dst);
1261 	if (connected && !IS_ERR(dst))
1262 		ip6_sk_dst_store_flow(sk, dst_clone(dst), fl6);
1263 
1264 	return dst;
1265 }
1266 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1267 
1268 /**
1269  *      ip6_dst_lookup_tunnel - perform route lookup on tunnel
1270  *      @skb: Packet for which lookup is done
1271  *      @dev: Tunnel device
1272  *      @net: Network namespace of tunnel device
1273  *      @sock: Socket which provides route info
1274  *      @saddr: Memory to store the src ip address
1275  *      @info: Tunnel information
1276  *      @protocol: IP protocol
1277  *      @use_cache: Flag to enable cache usage
1278  *      This function performs a route lookup on a tunnel
1279  *
1280  *      It returns a valid dst pointer and stores src address to be used in
1281  *      tunnel in param saddr on success, else a pointer encoded error code.
1282  */
1283 
1284 struct dst_entry *ip6_dst_lookup_tunnel(struct sk_buff *skb,
1285 					struct net_device *dev,
1286 					struct net *net,
1287 					struct socket *sock,
1288 					struct in6_addr *saddr,
1289 					const struct ip_tunnel_info *info,
1290 					u8 protocol,
1291 					bool use_cache)
1292 {
1293 	struct dst_entry *dst = NULL;
1294 #ifdef CONFIG_DST_CACHE
1295 	struct dst_cache *dst_cache;
1296 #endif
1297 	struct flowi6 fl6;
1298 	__u8 prio;
1299 
1300 #ifdef CONFIG_DST_CACHE
1301 	dst_cache = (struct dst_cache *)&info->dst_cache;
1302 	if (use_cache) {
1303 		dst = dst_cache_get_ip6(dst_cache, saddr);
1304 		if (dst)
1305 			return dst;
1306 	}
1307 #endif
1308 	memset(&fl6, 0, sizeof(fl6));
1309 	fl6.flowi6_mark = skb->mark;
1310 	fl6.flowi6_proto = protocol;
1311 	fl6.daddr = info->key.u.ipv6.dst;
1312 	fl6.saddr = info->key.u.ipv6.src;
1313 	prio = info->key.tos;
1314 	fl6.flowlabel = ip6_make_flowinfo(RT_TOS(prio),
1315 					  info->key.label);
1316 
1317 	dst = ipv6_stub->ipv6_dst_lookup_flow(net, sock->sk, &fl6,
1318 					      NULL);
1319 	if (IS_ERR(dst)) {
1320 		netdev_dbg(dev, "no route to %pI6\n", &fl6.daddr);
1321 		return ERR_PTR(-ENETUNREACH);
1322 	}
1323 	if (dst->dev == dev) { /* is this necessary? */
1324 		netdev_dbg(dev, "circular route to %pI6\n", &fl6.daddr);
1325 		dst_release(dst);
1326 		return ERR_PTR(-ELOOP);
1327 	}
1328 #ifdef CONFIG_DST_CACHE
1329 	if (use_cache)
1330 		dst_cache_set_ip6(dst_cache, dst, &fl6.saddr);
1331 #endif
1332 	*saddr = fl6.saddr;
1333 	return dst;
1334 }
1335 EXPORT_SYMBOL_GPL(ip6_dst_lookup_tunnel);
1336 
1337 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1338 					       gfp_t gfp)
1339 {
1340 	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1341 }
1342 
1343 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1344 						gfp_t gfp)
1345 {
1346 	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1347 }
1348 
1349 static void ip6_append_data_mtu(unsigned int *mtu,
1350 				int *maxfraglen,
1351 				unsigned int fragheaderlen,
1352 				struct sk_buff *skb,
1353 				struct rt6_info *rt,
1354 				unsigned int orig_mtu)
1355 {
1356 	if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1357 		if (!skb) {
1358 			/* first fragment, reserve header_len */
1359 			*mtu = orig_mtu - rt->dst.header_len;
1360 
1361 		} else {
1362 			/*
1363 			 * this fragment is not the first; the header
1364 			 * space is regarded as data space.
1365 			 */
1366 			*mtu = orig_mtu;
1367 		}
1368 		*maxfraglen = ((*mtu - fragheaderlen) & ~7)
1369 			      + fragheaderlen - sizeof(struct frag_hdr);
1370 	}
1371 }
1372 
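/* Capture everything needed to build the eventual IPv6 header into the
 * cork: duplicated extension headers, a reference to the route, the hop
 * limit and traffic class, and the fragment size derived from the path
 * MTU and np->frag_size.
 */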
1373 static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
1374 			  struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
1375 			  struct rt6_info *rt, struct flowi6 *fl6)
1376 {
1377 	struct ipv6_pinfo *np = inet6_sk(sk);
1378 	unsigned int mtu;
1379 	struct ipv6_txoptions *opt = ipc6->opt;
1380 
1381 	/*
1382 	 * setup for corking
1383 	 */
1384 	if (opt) {
1385 		if (WARN_ON(v6_cork->opt))
1386 			return -EINVAL;
1387 
1388 		v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
1389 		if (unlikely(!v6_cork->opt))
1390 			return -ENOBUFS;
1391 
1392 		v6_cork->opt->tot_len = sizeof(*opt);
1393 		v6_cork->opt->opt_flen = opt->opt_flen;
1394 		v6_cork->opt->opt_nflen = opt->opt_nflen;
1395 
1396 		v6_cork->opt->dst0opt = ip6_opt_dup(opt->dst0opt,
1397 						    sk->sk_allocation);
1398 		if (opt->dst0opt && !v6_cork->opt->dst0opt)
1399 			return -ENOBUFS;
1400 
1401 		v6_cork->opt->dst1opt = ip6_opt_dup(opt->dst1opt,
1402 						    sk->sk_allocation);
1403 		if (opt->dst1opt && !v6_cork->opt->dst1opt)
1404 			return -ENOBUFS;
1405 
1406 		v6_cork->opt->hopopt = ip6_opt_dup(opt->hopopt,
1407 						   sk->sk_allocation);
1408 		if (opt->hopopt && !v6_cork->opt->hopopt)
1409 			return -ENOBUFS;
1410 
1411 		v6_cork->opt->srcrt = ip6_rthdr_dup(opt->srcrt,
1412 						    sk->sk_allocation);
1413 		if (opt->srcrt && !v6_cork->opt->srcrt)
1414 			return -ENOBUFS;
1415 
1416 		/* need source address above miyazawa */
1417 	}
1418 	dst_hold(&rt->dst);
1419 	cork->base.dst = &rt->dst;
1420 	cork->fl.u.ip6 = *fl6;
1421 	v6_cork->hop_limit = ipc6->hlimit;
1422 	v6_cork->tclass = ipc6->tclass;
1423 	if (rt->dst.flags & DST_XFRM_TUNNEL)
1424 		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1425 		      READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst);
1426 	else
1427 		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1428 			READ_ONCE(rt->dst.dev->mtu) : dst_mtu(xfrm_dst_path(&rt->dst));
1429 	if (np->frag_size < mtu) {
1430 		if (np->frag_size)
1431 			mtu = np->frag_size;
1432 	}
1433 	if (mtu < IPV6_MIN_MTU)
1434 		return -EINVAL;
1435 	cork->base.fragsize = mtu;
1436 	cork->base.gso_size = ipc6->gso_size;
1437 	cork->base.tx_flags = 0;
1438 	cork->base.mark = ipc6->sockc.mark;
1439 	sock_tx_timestamp(sk, ipc6->sockc.tsflags, &cork->base.tx_flags);
1440 
1441 	if (dst_allfrag(xfrm_dst_path(&rt->dst)))
1442 		cork->base.flags |= IPCORK_ALLFRAG;
1443 	cork->base.length = 0;
1444 
1445 	cork->base.transmit_time = ipc6->sockc.transmit_time;
1446 
1447 	return 0;
1448 }
1449 
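/* Core of the corked-send path: append @length bytes from @from to the
 * queued skbs, topping up the last skb and allocating new ones so that
 * every skb (plus the headers added later) still fits into one fragment.
 * Data is pulled in through @getfrag, which may checksum as it copies.
 */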
1450 static int __ip6_append_data(struct sock *sk,
1451 			     struct flowi6 *fl6,
1452 			     struct sk_buff_head *queue,
1453 			     struct inet_cork *cork,
1454 			     struct inet6_cork *v6_cork,
1455 			     struct page_frag *pfrag,
1456 			     int getfrag(void *from, char *to, int offset,
1457 					 int len, int odd, struct sk_buff *skb),
1458 			     void *from, int length, int transhdrlen,
1459 			     unsigned int flags, struct ipcm6_cookie *ipc6)
1460 {
1461 	struct sk_buff *skb, *skb_prev = NULL;
1462 	unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
1463 	struct ubuf_info *uarg = NULL;
1464 	int exthdrlen = 0;
1465 	int dst_exthdrlen = 0;
1466 	int hh_len;
1467 	int copy;
1468 	int err;
1469 	int offset = 0;
1470 	u32 tskey = 0;
1471 	struct rt6_info *rt = (struct rt6_info *)cork->dst;
1472 	struct ipv6_txoptions *opt = v6_cork->opt;
1473 	int csummode = CHECKSUM_NONE;
1474 	unsigned int maxnonfragsize, headersize;
1475 	unsigned int wmem_alloc_delta = 0;
1476 	bool paged, extra_uref = false;
1477 
1478 	skb = skb_peek_tail(queue);
1479 	if (!skb) {
1480 		exthdrlen = opt ? opt->opt_flen : 0;
1481 		dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
1482 	}
1483 
1484 	paged = !!cork->gso_size;
1485 	mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize;
1486 	orig_mtu = mtu;
1487 
1488 	if (cork->tx_flags & SKBTX_ANY_SW_TSTAMP &&
1489 	    sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
1490 		tskey = sk->sk_tskey++;
1491 
1492 	hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1493 
1494 	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1495 			(opt ? opt->opt_nflen : 0);
1496 	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
1497 		     sizeof(struct frag_hdr);
1498 
1499 	headersize = sizeof(struct ipv6hdr) +
1500 		     (opt ? opt->opt_flen + opt->opt_nflen : 0) +
1501 		     (dst_allfrag(&rt->dst) ?
1502 		      sizeof(struct frag_hdr) : 0) +
1503 		     rt->rt6i_nfheader_len;
1504 
1505 	/* As per RFC 7112 section 5, the entire IPv6 Header Chain must fit
1506 	 * in the first fragment.
1507 	 */
1508 	if (headersize + transhdrlen > mtu)
1509 		goto emsgsize;
1510 
1511 	if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
1512 	    (sk->sk_protocol == IPPROTO_UDP ||
1513 	     sk->sk_protocol == IPPROTO_RAW)) {
1514 		ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
1515 				sizeof(struct ipv6hdr));
1516 		goto emsgsize;
1517 	}
1518 
1519 	if (ip6_sk_ignore_df(sk))
1520 		maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
1521 	else
1522 		maxnonfragsize = mtu;
1523 
1524 	if (cork->length + length > maxnonfragsize - headersize) {
1525 emsgsize:
1526 		pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0);
1527 		ipv6_local_error(sk, EMSGSIZE, fl6, pmtu);
1528 		return -EMSGSIZE;
1529 	}
1530 
1531 	/* CHECKSUM_PARTIAL only with no extension headers and when
1532 	 * we are not going to fragment
1533 	 */
1534 	if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
1535 	    headersize == sizeof(struct ipv6hdr) &&
1536 	    length <= mtu - headersize &&
1537 	    (!(flags & MSG_MORE) || cork->gso_size) &&
1538 	    rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
1539 		csummode = CHECKSUM_PARTIAL;
1540 
1541 	if (flags & MSG_ZEROCOPY && length && sock_flag(sk, SOCK_ZEROCOPY)) {
1542 		uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb));
1543 		if (!uarg)
1544 			return -ENOBUFS;
1545 		extra_uref = !skb_zcopy(skb);	/* only ref on new uarg */
1546 		if (rt->dst.dev->features & NETIF_F_SG &&
1547 		    csummode == CHECKSUM_PARTIAL) {
1548 			paged = true;
1549 		} else {
1550 			uarg->zerocopy = 0;
1551 			skb_zcopy_set(skb, uarg, &extra_uref);
1552 		}
1553 	}
1554 
1555 	/*
1556 	 * Let's try using as much space as possible.
1557 	 * Use MTU if total length of the message fits into the MTU.
1558 	 * Otherwise, we need to reserve fragment header and
1559 	 * fragment alignment (= 8-15 octets, in total).
1560 	 *
1561 	 * Note that we may need to "move" the data from the tail
1562 	 * of the buffer to the new fragment when we split
1563 	 * the message.
1564 	 *
1565 	 * FIXME: It may be fragmented into multiple chunks
1566 	 *        at once if non-fragmentable extension headers
1567 	 *        are too large.
1568 	 * --yoshfuji
1569 	 */
1570 
1571 	cork->length += length;
1572 	if (!skb)
1573 		goto alloc_new_skb;
1574 
1575 	while (length > 0) {
1576 		/* Check if the remaining data fits into current packet. */
1577 		copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1578 		if (copy < length)
1579 			copy = maxfraglen - skb->len;
1580 
1581 		if (copy <= 0) {
1582 			char *data;
1583 			unsigned int datalen;
1584 			unsigned int fraglen;
1585 			unsigned int fraggap;
1586 			unsigned int alloclen, alloc_extra;
1587 			unsigned int pagedlen;
1588 alloc_new_skb:
1589 			/* There's no room in the current skb */
1590 			if (skb)
1591 				fraggap = skb->len - maxfraglen;
1592 			else
1593 				fraggap = 0;
1594 			/* update mtu and maxfraglen if necessary */
1595 			if (!skb || !skb_prev)
1596 				ip6_append_data_mtu(&mtu, &maxfraglen,
1597 						    fragheaderlen, skb, rt,
1598 						    orig_mtu);
1599 
1600 			skb_prev = skb;
1601 
1602 			/*
1603 			 * If remaining data exceeds the mtu,
1604 			 * we know we need more fragment(s).
1605 			 */
1606 			datalen = length + fraggap;
1607 
1608 			if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1609 				datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
1610 			fraglen = datalen + fragheaderlen;
1611 			pagedlen = 0;
1612 
1613 			alloc_extra = hh_len;
1614 			alloc_extra += dst_exthdrlen;
1615 			alloc_extra += rt->dst.trailer_len;
1616 
1617 			/* We just reserve space for fragment header.
1618 			 * Note: this may be overallocation if the message
1619 			 * (without MSG_MORE) fits into the MTU.
1620 			 */
1621 			alloc_extra += sizeof(struct frag_hdr);
1622 
1623 			if ((flags & MSG_MORE) &&
1624 			    !(rt->dst.dev->features&NETIF_F_SG))
1625 				alloclen = mtu;
1626 			else if (!paged &&
1627 				 (fraglen + alloc_extra < SKB_MAX_ALLOC ||
1628 				  !(rt->dst.dev->features & NETIF_F_SG)))
1629 				alloclen = fraglen;
1630 			else {
1631 				alloclen = min_t(int, fraglen, MAX_HEADER);
1632 				pagedlen = fraglen - alloclen;
1633 			}
1634 			alloclen += alloc_extra;
1635 
1636 			if (datalen != length + fraggap) {
1637 				/*
1638 				 * this is not the last fragment; the trailer
1639 				 * space is regarded as data space.
1640 				 */
1641 				datalen += rt->dst.trailer_len;
1642 			}
1643 
1644 			fraglen = datalen + fragheaderlen;
1645 
1646 			copy = datalen - transhdrlen - fraggap - pagedlen;
1647 			if (copy < 0) {
1648 				err = -EINVAL;
1649 				goto error;
1650 			}
1651 			if (transhdrlen) {
1652 				skb = sock_alloc_send_skb(sk, alloclen,
1653 						(flags & MSG_DONTWAIT), &err);
1654 			} else {
1655 				skb = NULL;
1656 				if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <=
1657 				    2 * sk->sk_sndbuf)
1658 					skb = alloc_skb(alloclen,
1659 							sk->sk_allocation);
1660 				if (unlikely(!skb))
1661 					err = -ENOBUFS;
1662 			}
1663 			if (!skb)
1664 				goto error;
1665 			/*
1666 			 *	Fill in the control structures
1667 			 */
1668 			skb->protocol = htons(ETH_P_IPV6);
1669 			skb->ip_summed = csummode;
1670 			skb->csum = 0;
1671 			/* reserve for fragmentation and ipsec header */
1672 			skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1673 				    dst_exthdrlen);
1674 
1675 			/*
1676 			 *	Find where to start putting bytes
1677 			 */
1678 			data = skb_put(skb, fraglen - pagedlen);
1679 			skb_set_network_header(skb, exthdrlen);
1680 			data += fragheaderlen;
1681 			skb->transport_header = (skb->network_header +
1682 						 fragheaderlen);
1683 			if (fraggap) {
1684 				skb->csum = skb_copy_and_csum_bits(
1685 					skb_prev, maxfraglen,
1686 					data + transhdrlen, fraggap);
1687 				skb_prev->csum = csum_sub(skb_prev->csum,
1688 							  skb->csum);
1689 				data += fraggap;
1690 				pskb_trim_unique(skb_prev, maxfraglen);
1691 			}
1692 			if (copy > 0 &&
1693 			    getfrag(from, data + transhdrlen, offset,
1694 				    copy, fraggap, skb) < 0) {
1695 				err = -EFAULT;
1696 				kfree_skb(skb);
1697 				goto error;
1698 			}
1699 
1700 			offset += copy;
1701 			length -= copy + transhdrlen;
1702 			transhdrlen = 0;
1703 			exthdrlen = 0;
1704 			dst_exthdrlen = 0;
1705 
1706 			/* Only the initial fragment is time stamped */
1707 			skb_shinfo(skb)->tx_flags = cork->tx_flags;
1708 			cork->tx_flags = 0;
1709 			skb_shinfo(skb)->tskey = tskey;
1710 			tskey = 0;
1711 			skb_zcopy_set(skb, uarg, &extra_uref);
1712 
1713 			if ((flags & MSG_CONFIRM) && !skb_prev)
1714 				skb_set_dst_pending_confirm(skb, 1);
1715 
1716 			/*
1717 			 * Put the packet on the pending queue
1718 			 */
1719 			if (!skb->destructor) {
1720 				skb->destructor = sock_wfree;
1721 				skb->sk = sk;
1722 				wmem_alloc_delta += skb->truesize;
1723 			}
1724 			__skb_queue_tail(queue, skb);
1725 			continue;
1726 		}
1727 
1728 		if (copy > length)
1729 			copy = length;
1730 
1731 		if (!(rt->dst.dev->features&NETIF_F_SG) &&
1732 		    skb_tailroom(skb) >= copy) {
1733 			unsigned int off;
1734 
1735 			off = skb->len;
1736 			if (getfrag(from, skb_put(skb, copy),
1737 						offset, copy, off, skb) < 0) {
1738 				__skb_trim(skb, off);
1739 				err = -EFAULT;
1740 				goto error;
1741 			}
1742 		} else if (!uarg || !uarg->zerocopy) {
1743 			int i = skb_shinfo(skb)->nr_frags;
1744 
1745 			err = -ENOMEM;
1746 			if (!sk_page_frag_refill(sk, pfrag))
1747 				goto error;
1748 
1749 			if (!skb_can_coalesce(skb, i, pfrag->page,
1750 					      pfrag->offset)) {
1751 				err = -EMSGSIZE;
1752 				if (i == MAX_SKB_FRAGS)
1753 					goto error;
1754 
1755 				__skb_fill_page_desc(skb, i, pfrag->page,
1756 						     pfrag->offset, 0);
1757 				skb_shinfo(skb)->nr_frags = ++i;
1758 				get_page(pfrag->page);
1759 			}
1760 			copy = min_t(int, copy, pfrag->size - pfrag->offset);
1761 			if (getfrag(from,
1762 				    page_address(pfrag->page) + pfrag->offset,
1763 				    offset, copy, skb->len, skb) < 0)
1764 				goto error_efault;
1765 
1766 			pfrag->offset += copy;
1767 			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1768 			skb->len += copy;
1769 			skb->data_len += copy;
1770 			skb->truesize += copy;
1771 			wmem_alloc_delta += copy;
1772 		} else {
1773 			err = skb_zerocopy_iter_dgram(skb, from, copy);
1774 			if (err < 0)
1775 				goto error;
1776 		}
1777 		offset += copy;
1778 		length -= copy;
1779 	}
1780 
1781 	if (wmem_alloc_delta)
1782 		refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1783 	return 0;
1784 
1785 error_efault:
1786 	err = -EFAULT;
1787 error:
1788 	net_zcopy_put_abort(uarg, extra_uref);
1789 	cork->length -= length;
1790 	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1791 	refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1792 	return err;
1793 }
1794 
1795 int ip6_append_data(struct sock *sk,
1796 		    int getfrag(void *from, char *to, int offset, int len,
1797 				int odd, struct sk_buff *skb),
1798 		    void *from, int length, int transhdrlen,
1799 		    struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1800 		    struct rt6_info *rt, unsigned int flags)
1801 {
1802 	struct inet_sock *inet = inet_sk(sk);
1803 	struct ipv6_pinfo *np = inet6_sk(sk);
1804 	int exthdrlen;
1805 	int err;
1806 
1807 	if (flags&MSG_PROBE)
1808 		return 0;
1809 	if (skb_queue_empty(&sk->sk_write_queue)) {
1810 		/*
1811 		 * setup for corking
1812 		 */
1813 		err = ip6_setup_cork(sk, &inet->cork, &np->cork,
1814 				     ipc6, rt, fl6);
1815 		if (err)
1816 			return err;
1817 
1818 		exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1819 		length += exthdrlen;
1820 		transhdrlen += exthdrlen;
1821 	} else {
1822 		fl6 = &inet->cork.fl.u.ip6;
1823 		transhdrlen = 0;
1824 	}
1825 
1826 	return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base,
1827 				 &np->cork, sk_page_frag(sk), getfrag,
1828 				 from, length, transhdrlen, flags, ipc6);
1829 }
1830 EXPORT_SYMBOL_GPL(ip6_append_data);
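
/* Typical corked datagram transmit sequence (only a sketch; real callers
 * such as UDPv6 and raw sockets wrap this in their own locking, cork and
 * error handling, and "corkreq" below is a caller-defined flag):
 *
 *	lock_sock(sk);
 *	err = ip6_append_data(sk, getfrag, msg, len, transhdrlen,
 *			      &ipc6, &fl6, rt, msg->msg_flags);
 *	if (err)
 *		ip6_flush_pending_frames(sk);
 *	else if (!corkreq)
 *		err = ip6_push_pending_frames(sk);
 *	release_sock(sk);
 */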
1831 
1832 static void ip6_cork_release(struct inet_cork_full *cork,
1833 			     struct inet6_cork *v6_cork)
1834 {
1835 	if (v6_cork->opt) {
1836 		kfree(v6_cork->opt->dst0opt);
1837 		kfree(v6_cork->opt->dst1opt);
1838 		kfree(v6_cork->opt->hopopt);
1839 		kfree(v6_cork->opt->srcrt);
1840 		kfree(v6_cork->opt);
1841 		v6_cork->opt = NULL;
1842 	}
1843 
1844 	if (cork->base.dst) {
1845 		dst_release(cork->base.dst);
1846 		cork->base.dst = NULL;
1847 		cork->base.flags &= ~IPCORK_ALLFRAG;
1848 	}
1849 	memset(&cork->fl, 0, sizeof(cork->fl));
1850 }
1851 
1852 struct sk_buff *__ip6_make_skb(struct sock *sk,
1853 			       struct sk_buff_head *queue,
1854 			       struct inet_cork_full *cork,
1855 			       struct inet6_cork *v6_cork)
1856 {
1857 	struct sk_buff *skb, *tmp_skb;
1858 	struct sk_buff **tail_skb;
1859 	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1860 	struct ipv6_pinfo *np = inet6_sk(sk);
1861 	struct net *net = sock_net(sk);
1862 	struct ipv6hdr *hdr;
1863 	struct ipv6_txoptions *opt = v6_cork->opt;
1864 	struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
1865 	struct flowi6 *fl6 = &cork->fl.u.ip6;
1866 	unsigned char proto = fl6->flowi6_proto;
1867 
1868 	skb = __skb_dequeue(queue);
1869 	if (!skb)
1870 		goto out;
1871 	tail_skb = &(skb_shinfo(skb)->frag_list);
1872 
1873 	/* move skb->data to ip header from ext header */
1874 	if (skb->data < skb_network_header(skb))
1875 		__skb_pull(skb, skb_network_offset(skb));
1876 	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
1877 		__skb_pull(tmp_skb, skb_network_header_len(skb));
1878 		*tail_skb = tmp_skb;
1879 		tail_skb = &(tmp_skb->next);
1880 		skb->len += tmp_skb->len;
1881 		skb->data_len += tmp_skb->len;
1882 		skb->truesize += tmp_skb->truesize;
1883 		tmp_skb->destructor = NULL;
1884 		tmp_skb->sk = NULL;
1885 	}
1886 
1887 	/* Allow local fragmentation. */
1888 	skb->ignore_df = ip6_sk_ignore_df(sk);
1889 
1890 	*final_dst = fl6->daddr;
1891 	__skb_pull(skb, skb_network_header_len(skb));
1892 	if (opt && opt->opt_flen)
1893 		ipv6_push_frag_opts(skb, opt, &proto);
1894 	if (opt && opt->opt_nflen)
1895 		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);
1896 
1897 	skb_push(skb, sizeof(struct ipv6hdr));
1898 	skb_reset_network_header(skb);
1899 	hdr = ipv6_hdr(skb);
1900 
1901 	ip6_flow_hdr(hdr, v6_cork->tclass,
1902 		     ip6_make_flowlabel(net, skb, fl6->flowlabel,
1903 					ip6_autoflowlabel(net, np), fl6));
1904 	hdr->hop_limit = v6_cork->hop_limit;
1905 	hdr->nexthdr = proto;
1906 	hdr->saddr = fl6->saddr;
1907 	hdr->daddr = *final_dst;
1908 
1909 	skb->priority = sk->sk_priority;
1910 	skb->mark = cork->base.mark;
1911 
1912 	skb->tstamp = cork->base.transmit_time;
1913 
1914 	skb_dst_set(skb, dst_clone(&rt->dst));
1915 	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1916 	if (proto == IPPROTO_ICMPV6) {
1917 		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1918 
1919 		ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
1920 		ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
1921 	}
1922 
1923 	ip6_cork_release(cork, v6_cork);
1924 out:
1925 	return skb;
1926 }
1927 
1928 int ip6_send_skb(struct sk_buff *skb)
1929 {
1930 	struct net *net = sock_net(skb->sk);
1931 	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
1932 	int err;
1933 
1934 	err = ip6_local_out(net, skb->sk, skb);
1935 	if (err) {
1936 		if (err > 0)
1937 			err = net_xmit_errno(err);
1938 		if (err)
1939 			IP6_INC_STATS(net, rt->rt6i_idev,
1940 				      IPSTATS_MIB_OUTDISCARDS);
1941 	}
1942 
1943 	return err;
1944 }
1945 
1946 int ip6_push_pending_frames(struct sock *sk)
1947 {
1948 	struct sk_buff *skb;
1949 
1950 	skb = ip6_finish_skb(sk);
1951 	if (!skb)
1952 		return 0;
1953 
1954 	return ip6_send_skb(skb);
1955 }
1956 EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
1957 
1958 static void __ip6_flush_pending_frames(struct sock *sk,
1959 				       struct sk_buff_head *queue,
1960 				       struct inet_cork_full *cork,
1961 				       struct inet6_cork *v6_cork)
1962 {
1963 	struct sk_buff *skb;
1964 
1965 	while ((skb = __skb_dequeue_tail(queue)) != NULL) {
1966 		if (skb_dst(skb))
1967 			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1968 				      IPSTATS_MIB_OUTDISCARDS);
1969 		kfree_skb(skb);
1970 	}
1971 
1972 	ip6_cork_release(cork, v6_cork);
1973 }
1974 
1975 void ip6_flush_pending_frames(struct sock *sk)
1976 {
1977 	__ip6_flush_pending_frames(sk, &sk->sk_write_queue,
1978 				   &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
1979 }
1980 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
1981 
1982 struct sk_buff *ip6_make_skb(struct sock *sk,
1983 			     int getfrag(void *from, char *to, int offset,
1984 					 int len, int odd, struct sk_buff *skb),
1985 			     void *from, int length, int transhdrlen,
1986 			     struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1987 			     struct rt6_info *rt, unsigned int flags,
1988 			     struct inet_cork_full *cork)
1989 {
1990 	struct inet6_cork v6_cork;
1991 	struct sk_buff_head queue;
1992 	int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1993 	int err;
1994 
1995 	if (flags & MSG_PROBE)
1996 		return NULL;
1997 
1998 	__skb_queue_head_init(&queue);
1999 
2000 	cork->base.flags = 0;
2001 	cork->base.addr = 0;
2002 	cork->base.opt = NULL;
2003 	cork->base.dst = NULL;
2004 	v6_cork.opt = NULL;
2005 	err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt, fl6);
2006 	if (err) {
2007 		ip6_cork_release(cork, &v6_cork);
2008 		return ERR_PTR(err);
2009 	}
2010 	if (ipc6->dontfrag < 0)
2011 		ipc6->dontfrag = inet6_sk(sk)->dontfrag;
2012 
2013 	err = __ip6_append_data(sk, fl6, &queue, &cork->base, &v6_cork,
2014 				&current->task_frag, getfrag, from,
2015 				length + exthdrlen, transhdrlen + exthdrlen,
2016 				flags, ipc6);
2017 	if (err) {
2018 		__ip6_flush_pending_frames(sk, &queue, cork, &v6_cork);
2019 		return ERR_PTR(err);
2020 	}
2021 
2022 	return __ip6_make_skb(sk, &queue, cork, &v6_cork);
2023 }
2024