/*
 *	IPv6 output functions
 *	Linux INET6 implementation
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	Based on linux/net/ipv4/ip_output.c
 *
 *	This program is free software; you can redistribute it and/or
 *      modify it under the terms of the GNU General Public License
 *      as published by the Free Software Foundation; either version
 *      2 of the License, or (at your option) any later version.
 *
 *	Changes:
 *	A.N.Kuznetsov	:	arithmetics in fragmentation.
 *				extension headers are implemented.
 *				route changes now work.
 *				ip6_forward does not confuse sniffers.
 *				etc.
 *
 *      H. von Brand    :       Added missing #include <linux/string.h>
 *	Imran Patel	:	frag id should be in NBO
 *      Kazunori MIYAZAWA @USAGI
 *			:       add ip6_append_data and related functions
 *				for datagram xmit
 */

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/in6.h>
#include <linux/tcp.h>
#include <linux/route.h>
#include <linux/module.h>
#include <linux/slab.h>

#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>

#include <net/sock.h>
#include <net/snmp.h>

#include <net/ipv6.h>
#include <net/ndisc.h>
#include <net/protocol.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
#include <net/rawv6.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/checksum.h>
#include <linux/mroute6.h>

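/* Finish the output path: resolve the next-hop neighbour and hand the
 * packet to the device. Multicast packets may additionally be looped
 * back to local listeners, and are dropped if their scope is too small
 * to leave the node.
 */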
static int ip6_finish_output2(struct sock *sk, struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct net_device *dev = dst->dev;
	struct neighbour *neigh;
	struct in6_addr *nexthop;
	int ret;

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
		    ((mroute6_socket(dev_net(dev), skb) &&
		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
		     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
					 &ipv6_hdr(skb)->saddr))) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

			/* Do not check for IFF_ALLMULTI; multicast routing
			 * is not supported in any case.
			 */
			if (newskb)
				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
					sk, newskb, NULL, newskb->dev,
					dev_loopback_xmit);

			if (ipv6_hdr(skb)->hop_limit == 0) {
				IP6_INC_STATS(dev_net(dev), idev,
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return 0;
			}
		}

		IP6_UPD_PO_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCAST,
				skb->len);

		if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
		    IPV6_ADDR_SCOPE_NODELOCAL &&
		    !(dev->flags & IFF_LOOPBACK)) {
			kfree_skb(skb);
			return 0;
		}
	}

	rcu_read_lock_bh();
	nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
	neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
	if (unlikely(!neigh))
		neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
	if (!IS_ERR(neigh)) {
		ret = dst_neigh_output(dst, neigh, skb);
		rcu_read_unlock_bh();
		return ret;
	}
	rcu_read_unlock_bh();

	IP6_INC_STATS(dev_net(dst->dev),
		      ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
	kfree_skb(skb);
	return -EINVAL;
}

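/* Fragment the packet if it exceeds the path MTU and is not GSO, if the
 * destination requires fragmentation on every write (dst_allfrag), or if
 * conntrack recorded a smaller incoming fragment size; otherwise hand it
 * straight to ip6_finish_output2().
 */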
static int ip6_finish_output(struct sock *sk, struct sk_buff *skb)
{
	if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
	    dst_allfrag(skb_dst(skb)) ||
	    (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
		return ip6_fragment(sk, skb, ip6_finish_output2);
	else
		return ip6_finish_output2(sk, skb);
}

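/* Output entry point used as the dst output callback: discard the packet
 * if IPv6 is administratively disabled on the outgoing device, then run
 * the netfilter POST_ROUTING hook (unless the packet was rerouted) before
 * ip6_finish_output().
 */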
int ip6_output(struct sock *sk, struct sk_buff *skb)
{
	struct net_device *dev = skb_dst(skb)->dev;
	struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
	if (unlikely(idev->cnf.disable_ipv6)) {
		IP6_INC_STATS(dev_net(dev), idev,
			      IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
		return 0;
	}

	return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, sk, skb,
			    NULL, dev,
			    ip6_finish_output,
			    !(IP6CB(skb)->flags & IP6SKB_REROUTED));
}

/*
 *	xmit an sk_buff (used by TCP, SCTP and DCCP)
 */

int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
	     struct ipv6_txoptions *opt, int tclass)
{
	struct net *net = sock_net(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct in6_addr *first_hop = &fl6->daddr;
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr;
	u8  proto = fl6->flowi6_proto;
	int seg_len = skb->len;
	int hlimit = -1;
	u32 mtu;

	if (opt) {
		unsigned int head_room;

		/* First: exthdrs may take lots of space (~8K for now);
		 * MAX_HEADER is not enough.
		 */
		head_room = opt->opt_nflen + opt->opt_flen;
		seg_len += head_room;
		head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);

		if (skb_headroom(skb) < head_room) {
			struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
			if (!skb2) {
				IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return -ENOBUFS;
			}
			consume_skb(skb);
			skb = skb2;
			skb_set_owner_w(skb, sk);
		}
		if (opt->opt_flen)
			ipv6_push_frag_opts(skb, opt, &proto);
		if (opt->opt_nflen)
			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
	}

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	/*
	 *	Fill in the IPv6 header
	 */
	if (np)
		hlimit = np->hop_limit;
	if (hlimit < 0)
		hlimit = ip6_dst_hoplimit(dst);

	ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
						     np->autoflowlabel, fl6));

	hdr->payload_len = htons(seg_len);
	hdr->nexthdr = proto;
	hdr->hop_limit = hlimit;

	hdr->saddr = fl6->saddr;
	hdr->daddr = *first_hop;

	skb->protocol = htons(ETH_P_IPV6);
	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	mtu = dst_mtu(dst);
	if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
		IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_OUT, skb->len);
		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, sk, skb,
			       NULL, dst->dev, dst_output_sk);
	}

	skb->dev = dst->dev;
	ipv6_local_error(sk, EMSGSIZE, fl6, mtu);
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return -EMSGSIZE;
}
EXPORT_SYMBOL(ip6_xmit);

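/* Deliver a packet carrying a Router Alert option to every raw socket
 * registered for this alert value, honouring device bindings. Returns 1
 * if at least one socket consumed the packet, 0 otherwise.
 */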
static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
	struct ip6_ra_chain *ra;
	struct sock *last = NULL;

	read_lock(&ip6_ra_lock);
	for (ra = ip6_ra_chain; ra; ra = ra->next) {
		struct sock *sk = ra->sk;
		if (sk && ra->sel == sel &&
		    (!sk->sk_bound_dev_if ||
		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
			if (last) {
				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
				if (skb2)
					rawv6_rcv(last, skb2);
			}
			last = sk;
		}
	}

	if (last) {
		rawv6_rcv(last, skb);
		read_unlock(&ip6_ra_lock);
		return 1;
	}
	read_unlock(&ip6_ra_lock);
	return 0;
}

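/* Decide what to do with a packet whose destination is subject to proxy
 * NDP: returns 1 to hand it to local input (NDP messages), 0 to forward
 * it, and -1 when it must be discarded (link-local destination).
 */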
static int ip6_forward_proxy_check(struct sk_buff *skb)
{
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	u8 nexthdr = hdr->nexthdr;
	__be16 frag_off;
	int offset;

	if (ipv6_ext_hdr(nexthdr)) {
		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
		if (offset < 0)
			return 0;
	} else
		offset = sizeof(struct ipv6hdr);

	if (nexthdr == IPPROTO_ICMPV6) {
		struct icmp6hdr *icmp6;

		if (!pskb_may_pull(skb, (skb_network_header(skb) +
					 offset + 1 - skb->data)))
			return 0;

		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);

		switch (icmp6->icmp6_type) {
		case NDISC_ROUTER_SOLICITATION:
		case NDISC_ROUTER_ADVERTISEMENT:
		case NDISC_NEIGHBOUR_SOLICITATION:
		case NDISC_NEIGHBOUR_ADVERTISEMENT:
		case NDISC_REDIRECT:
			/* For reactions involving unicast neighbor discovery
			 * messages destined to the proxied address, pass
			 * them to the input function.
			 */
			return 1;
		default:
			break;
		}
	}

	/*
	 * The proxying router can't forward traffic sent to a link-local
	 * address, so signal the sender and discard the packet. This
	 * behavior is clarified by the MIPv6 specification.
	 */
	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
		dst_link_failure(skb);
		return -1;
	}

	return 0;
}

static inline int ip6_forward_finish(struct sock *sk, struct sk_buff *skb)
{
	skb_sender_cpu_clear(skb);
	return dst_output_sk(sk, skb);
}

static unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst)
{
	unsigned int mtu;
	struct inet6_dev *idev;

	if (dst_metric_locked(dst, RTAX_MTU)) {
		mtu = dst_metric_raw(dst, RTAX_MTU);
		if (mtu)
			return mtu;
	}

	mtu = IPV6_MIN_MTU;
	rcu_read_lock();
	idev = __in6_dev_get(dst->dev);
	if (idev)
		mtu = idev->cnf.mtu6;
	rcu_read_unlock();

	return mtu;
}

static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
{
	if (skb->len <= mtu)
		return false;

	/* ipv6 conntrack defrag sets max_frag_size + ignore_df */
	if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
		return true;

	if (skb->ignore_df)
		return false;

	if (skb_is_gso(skb) && skb_gso_network_seglen(skb) <= mtu)
		return false;

	return true;
}

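/* Forward a packet on behalf of another node: validate it (hop limit,
 * XFRM policy, source address class), honour Router Alert and proxy NDP,
 * send a redirect when the packet leaves through the device it arrived
 * on, enforce the path MTU, then decrement the hop limit and pass the
 * result to the netfilter FORWARD hook.
 */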
int ip6_forward(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	struct inet6_skb_parm *opt = IP6CB(skb);
	struct net *net = dev_net(dst->dev);
	u32 mtu;

	if (net->ipv6.devconf_all->forwarding == 0)
		goto error;

	if (skb->pkt_type != PACKET_HOST)
		goto drop;

	if (unlikely(skb->sk))
		goto drop;

	if (skb_warn_if_lro(skb))
		goto drop;

	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
		IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
				 IPSTATS_MIB_INDISCARDS);
		goto drop;
	}

	skb_forward_csum(skb);

	/*
	 *	We DO NOT do any processing on
	 *	RA packets; we push them to user level AS IS
	 *	without any warranty that the application will be able
	 *	to interpret them. The reason is that we
	 *	cannot make anything clever here.
	 *
	 *	We are not the end node, so if the packet contains
	 *	AH/ESP we cannot do anything.
	 *	Defragmentation would also be a mistake; RA packets
	 *	cannot be fragmented, because there is no warranty
	 *	that different fragments will go along one path. --ANK
	 */
	if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
		if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
			return 0;
	}

	/*
	 *	check and decrement hop limit
	 */
	if (hdr->hop_limit <= 1) {
		/* Force the output device's address to be used as the
		 * source address of the ICMP error.
		 */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
		IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
				 IPSTATS_MIB_INHDRERRORS);

		kfree_skb(skb);
		return -ETIMEDOUT;
	}

	/* XXX: idev->cnf.proxy_ndp? */
	if (net->ipv6.devconf_all->proxy_ndp &&
	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
		int proxied = ip6_forward_proxy_check(skb);
		if (proxied > 0)
			return ip6_input(skb);
		else if (proxied < 0) {
			IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
					 IPSTATS_MIB_INDISCARDS);
			goto drop;
		}
	}

	if (!xfrm6_route_forward(skb)) {
		IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
				 IPSTATS_MIB_INDISCARDS);
		goto drop;
	}
	dst = skb_dst(skb);

	/* The IPv6 specs say nothing about it, but it is clear that we
	 * cannot send redirects to source routed frames.
	 * We don't send redirects to frames decapsulated from IPsec.
	 */
	if (skb->dev == dst->dev && opt->srcrt == 0 && !skb_sec_path(skb)) {
		struct in6_addr *target = NULL;
		struct inet_peer *peer;
		struct rt6_info *rt;

		/*
		 *	incoming and outgoing devices are the same;
		 *	send a redirect.
		 */

		rt = (struct rt6_info *) dst;
		if (rt->rt6i_flags & RTF_GATEWAY)
			target = &rt->rt6i_gateway;
		else
			target = &hdr->daddr;

		peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);

		/* Limit redirects both by destination (here)
		 * and by source (inside ndisc_send_redirect)
		 */
		if (inet_peer_xrlim_allow(peer, 1*HZ))
			ndisc_send_redirect(skb, target);
		if (peer)
			inet_putpeer(peer);
	} else {
		int addrtype = ipv6_addr_type(&hdr->saddr);

		/* This check is security critical. */
		if (addrtype == IPV6_ADDR_ANY ||
		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
			goto error;
		if (addrtype & IPV6_ADDR_LINKLOCAL) {
			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
				    ICMPV6_NOT_NEIGHBOUR, 0);
			goto error;
		}
	}

	mtu = ip6_dst_mtu_forward(dst);
	if (mtu < IPV6_MIN_MTU)
		mtu = IPV6_MIN_MTU;

	if (ip6_pkt_too_big(skb, mtu)) {
		/* Again, force the output device to be used for the
		 * source address.
		 */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
		IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
				 IPSTATS_MIB_INTOOBIGERRORS);
		IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
				 IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	if (skb_cow(skb, dst->dev->hard_header_len)) {
		IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
				 IPSTATS_MIB_OUTDISCARDS);
		goto drop;
	}

	hdr = ipv6_hdr(skb);

	/* Mangling the hop limit is delayed until after the skb COW */

	hdr->hop_limit--;

	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
	IP6_ADD_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, NULL, skb,
		       skb->dev, dst->dev,
		       ip6_forward_finish);

error:
	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
drop:
	kfree_skb(skb);
	return -EINVAL;
}

static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	skb_dst_drop(to);
	skb_dst_set(to, dst_clone(skb_dst(from)));
	to->dev = from->dev;
	to->mark = from->mark;

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
	nf_copy(to, from);
	skb_copy_secmark(to, from);
}

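/* Fragment the packet and feed the pieces to @output. If the skb already
 * carries a suitably shaped frag_list, the list members are turned into
 * fragments in place (fast path); otherwise each fragment is allocated
 * and copied individually (slow path). Packets that may not be fragmented
 * trigger an ICMPV6_PKT_TOOBIG error instead.
 */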
int ip6_fragment(struct sock *sk, struct sk_buff *skb,
		 int (*output)(struct sock *, struct sk_buff *))
{
	struct sk_buff *frag;
	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
	struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
				inet6_sk(skb->sk) : NULL;
	struct ipv6hdr *tmp_hdr;
	struct frag_hdr *fh;
	unsigned int mtu, hlen, left, len;
	int hroom, troom;
	__be32 frag_id;
	int ptr, offset = 0, err = 0;
	u8 *prevhdr, nexthdr = 0;
	struct net *net = dev_net(skb_dst(skb)->dev);

	hlen = ip6_find_1stfragopt(skb, &prevhdr);
	nexthdr = *prevhdr;

	mtu = ip6_skb_dst_mtu(skb);

	/* We must not fragment if the socket is set to force MTU discovery
	 * or if the skb is not generated by a local socket.
	 */
	if (unlikely(!skb->ignore_df && skb->len > mtu))
		goto fail_toobig;

	if (IP6CB(skb)->frag_max_size) {
		if (IP6CB(skb)->frag_max_size > mtu)
			goto fail_toobig;

		/* don't send fragments larger than what we received */
		mtu = IP6CB(skb)->frag_max_size;
		if (mtu < IPV6_MIN_MTU)
			mtu = IPV6_MIN_MTU;
	}

	if (np && np->frag_size < mtu) {
		if (np->frag_size)
			mtu = np->frag_size;
	}
	mtu -= hlen + sizeof(struct frag_hdr);

	frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
				    &ipv6_hdr(skb)->saddr);

	hroom = LL_RESERVED_SPACE(rt->dst.dev);
	if (skb_has_frag_list(skb)) {
		int first_len = skb_pagelen(skb);
		struct sk_buff *frag2;

		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    skb_cloned(skb) ||
		    skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
			goto slow_path;

		skb_walk_frags(skb, frag) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
				goto slow_path_clean;

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path_clean;

			BUG_ON(frag->sk);
			if (skb->sk) {
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
			}
			skb->truesize -= frag->truesize;
		}

		err = 0;
		offset = 0;
		/* BUILD HEADER */

		*prevhdr = NEXTHDR_FRAGMENT;
		tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
		if (!tmp_hdr) {
			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_FRAGFAILS);
			err = -ENOMEM;
			goto fail;
		}
		frag = skb_shinfo(skb)->frag_list;
		skb_frag_list_init(skb);

		__skb_pull(skb, hlen);
		fh = (struct frag_hdr *)__skb_push(skb, sizeof(struct frag_hdr));
		__skb_push(skb, hlen);
		skb_reset_network_header(skb);
		memcpy(skb_network_header(skb), tmp_hdr, hlen);

		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		fh->frag_off = htons(IP6_MF);
		fh->identification = frag_id;

		first_len = skb_pagelen(skb);
		skb->data_len = first_len - skb_headlen(skb);
		skb->len = first_len;
		ipv6_hdr(skb)->payload_len = htons(first_len -
						   sizeof(struct ipv6hdr));

		dst_hold(&rt->dst);

		for (;;) {
			/* Prepare the header of the next frame,
			 * before the previous one goes down.
			 */
			if (frag) {
				frag->ip_summed = CHECKSUM_NONE;
				skb_reset_transport_header(frag);
				fh = (struct frag_hdr *)__skb_push(frag, sizeof(struct frag_hdr));
				__skb_push(frag, hlen);
				skb_reset_network_header(frag);
				memcpy(skb_network_header(frag), tmp_hdr,
				       hlen);
				offset += skb->len - hlen - sizeof(struct frag_hdr);
				fh->nexthdr = nexthdr;
				fh->reserved = 0;
				fh->frag_off = htons(offset);
				if (frag->next)
					fh->frag_off |= htons(IP6_MF);
				fh->identification = frag_id;
				ipv6_hdr(frag)->payload_len =
						htons(frag->len -
						      sizeof(struct ipv6hdr));
				ip6_copy_metadata(frag, skb);
			}

			err = output(sk, skb);
			if (!err)
				IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
					      IPSTATS_MIB_FRAGCREATES);

			if (err || !frag)
				break;

			skb = frag;
			frag = skb->next;
			skb->next = NULL;
		}

		kfree(tmp_hdr);

		if (err == 0) {
			IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
				      IPSTATS_MIB_FRAGOKS);
			ip6_rt_put(rt);
			return 0;
		}

		kfree_skb_list(frag);

		IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
			      IPSTATS_MIB_FRAGFAILS);
		ip6_rt_put(rt);
		return err;

slow_path_clean:
		skb_walk_frags(skb, frag2) {
			if (frag2 == frag)
				break;
			frag2->sk = NULL;
			frag2->destructor = NULL;
			skb->truesize += frag2->truesize;
		}
	}

slow_path:
	if ((skb->ip_summed == CHECKSUM_PARTIAL) &&
	    skb_checksum_help(skb))
		goto fail;

	left = skb->len - hlen;		/* Space per frame */
	ptr = hlen;			/* Where to start from */

	/*
	 *	Fragment the datagram.
	 */

	*prevhdr = NEXTHDR_FRAGMENT;
	troom = rt->dst.dev->needed_tailroom;

	/*
	 *	Keep copying data until we run out.
	 */
	while (left > 0) {
		len = left;
		/* IF: it doesn't fit, use 'mtu' - the data space left */
		if (len > mtu)
			len = mtu;
		/* IF: we are not sending up to and including the packet end,
		 * then align the next start on an eight byte boundary */
		if (len < left) {
			len &= ~7;
		}

		/* Allocate buffer */
		frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
				 hroom + troom, GFP_ATOMIC);
		if (!frag) {
			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_FRAGFAILS);
			err = -ENOMEM;
			goto fail;
		}

		/*
		 *	Set up data on packet
		 */

		ip6_copy_metadata(frag, skb);
		skb_reserve(frag, hroom);
		skb_put(frag, len + hlen + sizeof(struct frag_hdr));
		skb_reset_network_header(frag);
		fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
		frag->transport_header = (frag->network_header + hlen +
					  sizeof(struct frag_hdr));

		/*
		 *	Charge the memory for the fragment to any owner
		 *	it might possess
		 */
		if (skb->sk)
			skb_set_owner_w(frag, skb->sk);

		/*
		 *	Copy the packet header into the new buffer.
		 */
		skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);

		/*
		 *	Build fragment header.
		 */
		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		fh->identification = frag_id;

		/*
		 *	Copy a block of the IP datagram.
		 */
		BUG_ON(skb_copy_bits(skb, ptr, skb_transport_header(frag),
				     len));
		left -= len;

		fh->frag_off = htons(offset);
		if (left > 0)
			fh->frag_off |= htons(IP6_MF);
		ipv6_hdr(frag)->payload_len = htons(frag->len -
						    sizeof(struct ipv6hdr));

		ptr += len;
		offset += len;

		/*
		 *	Put this fragment into the sending queue.
		 */
		err = output(sk, frag);
		if (err)
			goto fail;

		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_FRAGCREATES);
	}
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGOKS);
	consume_skb(skb);
	return err;

fail_toobig:
	if (skb->sk && dst_allfrag(skb_dst(skb)))
		sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);

	skb->dev = skb_dst(skb)->dev;
	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
	err = -EMSGSIZE;

fail:
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return err;
}

static inline int ip6_rt_check(const struct rt6key *rt_key,
			       const struct in6_addr *fl_addr,
			       const struct in6_addr *addr_cache)
{
	return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
		(!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
}

static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
					  struct dst_entry *dst,
					  const struct flowi6 *fl6)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct rt6_info *rt;

	if (!dst)
		goto out;

	if (dst->ops->family != AF_INET6) {
		dst_release(dst);
		return NULL;
	}

	rt = (struct rt6_info *)dst;
	/* Yes, checking route validity in the not connected
	 * case is not very simple. Take into account that
	 * we do not support routing by source, TOS,
	 * and MSG_DONTROUTE		--ANK (980726)
	 *
	 * 1. ip6_rt_check(): If route was host route,
	 *    check that cached destination is current.
	 *    If it is network route, we still may
	 *    check its validity using saved pointer
	 *    to the last used address: daddr_cache.
	 *    We do not want to save the whole address now
	 *    (because the main consumer of this service
	 *    is TCP, which does not have this problem),
	 *    so the last trick works only on connected
	 *    sockets.
	 * 2. oif also should be the same.
	 */
	if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
#ifdef CONFIG_IPV6_SUBTREES
	    ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
#endif
	    (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) {
		dst_release(dst);
		dst = NULL;
	}

out:
	return dst;
}

static int ip6_dst_lookup_tail(struct net *net, struct sock *sk,
			       struct dst_entry **dst, struct flowi6 *fl6)
{
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	struct neighbour *n;
	struct rt6_info *rt;
#endif
	int err;

	/* The correct way to handle this would be to do
	 * ip6_route_get_saddr, and then ip6_route_output; however,
	 * the route-specific preferred source forces the
	 * ip6_route_output call _before_ ip6_route_get_saddr.
	 *
	 * In source specific routing (no src=any default route),
	 * ip6_route_output will fail given a src=any saddr, though,
	 * which is why we try it again later.
	 */
	if (ipv6_addr_any(&fl6->saddr) && (!*dst || !(*dst)->error)) {
		struct rt6_info *rt;
		bool had_dst = *dst != NULL;

		if (!had_dst)
			*dst = ip6_route_output(net, sk, fl6);
		rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;
		err = ip6_route_get_saddr(net, rt, &fl6->daddr,
					  sk ? inet6_sk(sk)->srcprefs : 0,
					  &fl6->saddr);
		if (err)
			goto out_err_release;

		/* If we had an erroneous initial result, pretend it
		 * never existed and let the SA-enabled version take
		 * over.
		 */
		if (!had_dst && (*dst)->error) {
			dst_release(*dst);
			*dst = NULL;
		}
	}

	if (!*dst)
		*dst = ip6_route_output(net, sk, fl6);

	err = (*dst)->error;
	if (err)
		goto out_err_release;

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	/*
	 * Here, if the dst entry we've looked up
	 * has a neighbour entry that is in the INCOMPLETE
	 * state and the src address from the flow is
	 * marked as OPTIMISTIC, we release the found
	 * dst entry and replace it instead with the
	 * dst entry of the nexthop router.
	 */
	rt = (struct rt6_info *) *dst;
	rcu_read_lock_bh();
	n = __ipv6_neigh_lookup_noref(rt->dst.dev,
				      rt6_nexthop(rt, &fl6->daddr));
	err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
	rcu_read_unlock_bh();

	if (err) {
		struct inet6_ifaddr *ifp;
		struct flowi6 fl_gw6;
		int redirect;

		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
				      (*dst)->dev, 1);

		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
		if (ifp)
			in6_ifa_put(ifp);

		if (redirect) {
			/*
			 * We need to get the dst entry for the
			 * default router instead.
			 */
			dst_release(*dst);
			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
			*dst = ip6_route_output(net, sk, &fl_gw6);
			err = (*dst)->error;
			if (err)
				goto out_err_release;
		}
	}
#endif

	return 0;

out_err_release:
	if (err == -ENETUNREACH)
		IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
	dst_release(*dst);
	*dst = NULL;
	return err;
}

/**
 *	ip6_dst_lookup - perform route lookup on flow
 *	@net: network namespace to perform the lookup in
 *	@sk: socket which provides route info
 *	@dst: pointer to dst_entry * for result
 *	@fl6: flow to lookup
 *
 *	This function performs a route lookup on the given flow.
 *
 *	It returns zero on success, or a standard errno code on error.
 */
int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
		   struct flowi6 *fl6)
{
	*dst = NULL;
	return ip6_dst_lookup_tail(net, sk, dst, fl6);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup);

/**
 *	ip6_dst_lookup_flow - perform route lookup on flow with ipsec
 *	@sk: socket which provides route info
 *	@fl6: flow to lookup
 *	@final_dst: final destination address for ipsec lookup
 *
 *	This function performs a route lookup on the given flow.
 *
 *	It returns a valid dst pointer on success, or a pointer encoded
 *	error code.
 */
struct dst_entry *ip6_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
				      const struct in6_addr *final_dst)
{
	struct dst_entry *dst = NULL;
	int err;

	err = ip6_dst_lookup_tail(sock_net(sk), sk, &dst, fl6);
	if (err)
		return ERR_PTR(err);
	if (final_dst)
		fl6->daddr = *final_dst;
	if (!fl6->flowi6_oif)
		fl6->flowi6_oif = dst->dev->ifindex;

	return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
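
/* A minimal caller sketch for ip6_dst_lookup_flow(); illustrative only,
 * the surrounding variables are made up. The return value encodes errors
 * via ERR_PTR(), so it must be tested with IS_ERR() rather than against
 * NULL:
 *
 *	dst = ip6_dst_lookup_flow(sk, &fl6, final_p);
 *	if (IS_ERR(dst)) {
 *		err = PTR_ERR(dst);
 *		goto out;
 *	}
 *	skb_dst_set(skb, dst);
 */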

/**
 *	ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
 *	@sk: socket which provides the dst cache and route info
 *	@fl6: flow to lookup
 *	@final_dst: final destination address for ipsec lookup
 *
 *	This function performs a route lookup on the given flow with the
 *	possibility of using the cached route in the socket if it is valid.
 *	It will take the socket dst lock when operating on the dst cache.
 *	As a result, this function can only be used in process context.
 *
 *	It returns a valid dst pointer on success, or a pointer encoded
 *	error code.
 */
struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
					 const struct in6_addr *final_dst)
{
	struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
	int err;

	dst = ip6_sk_dst_check(sk, dst, fl6);

	err = ip6_dst_lookup_tail(sock_net(sk), sk, &dst, fl6);
	if (err)
		return ERR_PTR(err);
	if (final_dst)
		fl6->daddr = *final_dst;

	return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
}
EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);

static inline int ip6_ufo_append_data(struct sock *sk,
			struct sk_buff_head *queue,
			int getfrag(void *from, char *to, int offset, int len,
			int odd, struct sk_buff *skb),
			void *from, int length, int hh_len, int fragheaderlen,
			int transhdrlen, int mtu, unsigned int flags,
			const struct flowi6 *fl6)

{
	struct sk_buff *skb;
	int err;

	/* There is support for UDP large send offload by the network
	 * device, so create one single skb packet containing the complete
	 * UDP datagram.
	 */
	skb = skb_peek_tail(queue);
	if (!skb) {
		skb = sock_alloc_send_skb(sk,
			hh_len + fragheaderlen + transhdrlen + 20,
			(flags & MSG_DONTWAIT), &err);
		if (!skb)
			return err;

		/* reserve space for the hardware header */
		skb_reserve(skb, hh_len);

		/* create space for the UDP/IP header */
		skb_put(skb, fragheaderlen + transhdrlen);

		/* initialize the network header pointer */
		skb_reset_network_header(skb);

		/* initialize the protocol header pointer */
		skb->transport_header = skb->network_header + fragheaderlen;

		skb->protocol = htons(ETH_P_IPV6);
		skb->csum = 0;

		__skb_queue_tail(queue, skb);
	} else if (skb_is_gso(skb)) {
		goto append;
	}

	skb->ip_summed = CHECKSUM_PARTIAL;
	/* Specify the length of each IPv6 datagram fragment.
	 * It has to be a multiple of 8.
	 */
	skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
				     sizeof(struct frag_hdr)) & ~7;
	skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
	skb_shinfo(skb)->ip6_frag_id = ipv6_select_ident(sock_net(sk),
							 &fl6->daddr,
							 &fl6->saddr);

append:
	return skb_append_datato_frags(sk, skb, getfrag, from,
				       (length - transhdrlen));
}

static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
					       gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
						gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

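/* Recompute the per-fragment MTU and the maximum fragment length once
 * the first fragment exists: only the first fragment reserves
 * dst->header_len, later ones may use that room for data. Dsts flagged
 * DST_XFRM_TUNNEL are left untouched.
 */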
static void ip6_append_data_mtu(unsigned int *mtu,
				int *maxfraglen,
				unsigned int fragheaderlen,
				struct sk_buff *skb,
				struct rt6_info *rt,
				unsigned int orig_mtu)
{
	if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
		if (!skb) {
			/* first fragment, reserve header_len */
			*mtu = orig_mtu - rt->dst.header_len;

		} else {
			/*
			 * this fragment is not the first; the header
			 * space is regarded as data space.
			 */
			*mtu = orig_mtu;
		}
		*maxfraglen = ((*mtu - fragheaderlen) & ~7)
			      + fragheaderlen - sizeof(struct frag_hdr);
	}
}

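/* Initialise the cork for a new corking sequence: duplicate the transmit
 * options so they outlive the caller, take a reference on the route and
 * record the hop limit, traffic class and the MTU that the append path
 * will later fragment against.
 */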
static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
			  struct inet6_cork *v6_cork,
			  int hlimit, int tclass, struct ipv6_txoptions *opt,
			  struct rt6_info *rt, struct flowi6 *fl6)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	unsigned int mtu;

	/*
	 * setup for corking
	 */
	if (opt) {
		if (WARN_ON(v6_cork->opt))
			return -EINVAL;

		v6_cork->opt = kzalloc(opt->tot_len, sk->sk_allocation);
		if (unlikely(!v6_cork->opt))
			return -ENOBUFS;

		v6_cork->opt->tot_len = opt->tot_len;
		v6_cork->opt->opt_flen = opt->opt_flen;
		v6_cork->opt->opt_nflen = opt->opt_nflen;

		v6_cork->opt->dst0opt = ip6_opt_dup(opt->dst0opt,
						    sk->sk_allocation);
		if (opt->dst0opt && !v6_cork->opt->dst0opt)
			return -ENOBUFS;

		v6_cork->opt->dst1opt = ip6_opt_dup(opt->dst1opt,
						    sk->sk_allocation);
		if (opt->dst1opt && !v6_cork->opt->dst1opt)
			return -ENOBUFS;

		v6_cork->opt->hopopt = ip6_opt_dup(opt->hopopt,
						   sk->sk_allocation);
		if (opt->hopopt && !v6_cork->opt->hopopt)
			return -ENOBUFS;

		v6_cork->opt->srcrt = ip6_rthdr_dup(opt->srcrt,
						    sk->sk_allocation);
		if (opt->srcrt && !v6_cork->opt->srcrt)
			return -ENOBUFS;

		/* need source address above --miyazawa */
	}
	dst_hold(&rt->dst);
	cork->base.dst = &rt->dst;
	cork->fl.u.ip6 = *fl6;
	v6_cork->hop_limit = hlimit;
	v6_cork->tclass = tclass;
	if (rt->dst.flags & DST_XFRM_TUNNEL)
		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
		      rt->dst.dev->mtu : dst_mtu(&rt->dst);
	else
		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
		      rt->dst.dev->mtu : dst_mtu(rt->dst.path);
	if (np->frag_size < mtu) {
		if (np->frag_size)
			mtu = np->frag_size;
	}
	cork->base.fragsize = mtu;
	if (dst_allfrag(rt->dst.path))
		cork->base.flags |= IPCORK_ALLFRAG;
	cork->base.length = 0;

	return 0;
}

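/* Core of ip6_append_data(): copy user data into the skbs queued on
 * @queue, filling the last skb up to maxfraglen and allocating new ones
 * as needed so that the finished datagram can be fragmented on 8-byte
 * boundaries. UFO-capable devices instead get one large skb via
 * ip6_ufo_append_data().
 */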
static int __ip6_append_data(struct sock *sk,
			     struct flowi6 *fl6,
			     struct sk_buff_head *queue,
			     struct inet_cork *cork,
			     struct inet6_cork *v6_cork,
			     struct page_frag *pfrag,
			     int getfrag(void *from, char *to, int offset,
					 int len, int odd, struct sk_buff *skb),
			     void *from, int length, int transhdrlen,
			     unsigned int flags, int dontfrag)
{
	struct sk_buff *skb, *skb_prev = NULL;
	unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu;
	int exthdrlen = 0;
	int dst_exthdrlen = 0;
	int hh_len;
	int copy;
	int err;
	int offset = 0;
	__u8 tx_flags = 0;
	u32 tskey = 0;
	struct rt6_info *rt = (struct rt6_info *)cork->dst;
	struct ipv6_txoptions *opt = v6_cork->opt;
	int csummode = CHECKSUM_NONE;

	skb = skb_peek_tail(queue);
	if (!skb) {
		exthdrlen = opt ? opt->opt_flen : 0;
		dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
	}

	mtu = cork->fragsize;
	orig_mtu = mtu;

	hh_len = LL_RESERVED_SPACE(rt->dst.dev);

	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
			(opt ? opt->opt_nflen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
		     sizeof(struct frag_hdr);

	if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
		unsigned int maxnonfragsize, headersize;

		headersize = sizeof(struct ipv6hdr) +
			     (opt ? opt->opt_flen + opt->opt_nflen : 0) +
			     (dst_allfrag(&rt->dst) ?
			      sizeof(struct frag_hdr) : 0) +
			     rt->rt6i_nfheader_len;

		if (ip6_sk_ignore_df(sk))
			maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
		else
			maxnonfragsize = mtu;

		/* dontfrag active */
		if ((cork->length + length > mtu - headersize) && dontfrag &&
		    (sk->sk_protocol == IPPROTO_UDP ||
		     sk->sk_protocol == IPPROTO_RAW)) {
			ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
						   sizeof(struct ipv6hdr));
			goto emsgsize;
		}

		if (cork->length + length > maxnonfragsize - headersize) {
emsgsize:
			ipv6_local_error(sk, EMSGSIZE, fl6,
					 mtu - headersize +
					 sizeof(struct ipv6hdr));
			return -EMSGSIZE;
		}
	}

	if (sk->sk_type == SOCK_DGRAM || sk->sk_type == SOCK_RAW) {
		sock_tx_timestamp(sk, &tx_flags);
		if (tx_flags & SKBTX_ANY_SW_TSTAMP &&
		    sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
			tskey = sk->sk_tskey++;
	}

	/* If this is the first and only packet and the device
	 * supports checksum offloading, let's use it.
	 * Use transhdrlen, same as IPv4, because partial
	 * sums only work when transhdrlen is set.
	 */
	if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
	    length + fragheaderlen < mtu &&
	    rt->dst.dev->features & NETIF_F_V6_CSUM &&
	    !exthdrlen)
		csummode = CHECKSUM_PARTIAL;
	/*
	 * Let's try using as much space as possible.
	 * Use the MTU if the total length of the message fits into the MTU.
	 * Otherwise, we need to reserve the fragment header and
	 * fragment alignment (= 8-15 octets, in total).
	 *
	 * Note that we may need to "move" the data from the tail
	 * of the buffer to the new fragment when we split
	 * the message.
	 *
	 * FIXME: It may be fragmented into multiple chunks
	 *        at once if non-fragmentable extension headers
	 *        are too large.
	 * --yoshfuji
	 */

	cork->length += length;
	if (((length > mtu) ||
	     (skb && skb_is_gso(skb))) &&
	    (sk->sk_protocol == IPPROTO_UDP) &&
	    (rt->dst.dev->features & NETIF_F_UFO) &&
	    (sk->sk_type == SOCK_DGRAM)) {
		err = ip6_ufo_append_data(sk, queue, getfrag, from, length,
					  hh_len, fragheaderlen,
					  transhdrlen, mtu, flags, fl6);
		if (err)
			goto error;
		return 0;
	}

	if (!skb)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into the current packet. */
		copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;

		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen;
alloc_new_skb:
			/* There's no room in the current skb */
			if (skb)
				fraggap = skb->len - maxfraglen;
			else
				fraggap = 0;
			/* update mtu and maxfraglen if necessary */
			if (!skb || !skb_prev)
				ip6_append_data_mtu(&mtu, &maxfraglen,
						    fragheaderlen, skb, rt,
						    orig_mtu);

			skb_prev = skb;

			/*
			 * If the remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;

			if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
				datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
			if ((flags & MSG_MORE) &&
			    !(rt->dst.dev->features&NETIF_F_SG))
				alloclen = mtu;
			else
				alloclen = datalen + fragheaderlen;

			alloclen += dst_exthdrlen;

			if (datalen != length + fraggap) {
				/*
				 * this is not the last fragment; the trailer
				 * space is regarded as data space.
				 */
				datalen += rt->dst.trailer_len;
			}

			alloclen += rt->dst.trailer_len;
			fraglen = datalen + fragheaderlen;

			/*
			 * We just reserve space for the fragment header.
			 * Note: this may be overallocation if the message
			 * (without MSG_MORE) fits into the MTU.
			 */
			alloclen += sizeof(struct frag_hdr);

			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk,
						alloclen + hh_len,
						(flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				if (atomic_read(&sk->sk_wmem_alloc) <=
				    2 * sk->sk_sndbuf)
					skb = sock_wmalloc(sk,
							   alloclen + hh_len, 1,
							   sk->sk_allocation);
				if (unlikely(!skb))
					err = -ENOBUFS;
			}
			if (!skb)
				goto error;
			/*
			 *	Fill in the control structures
			 */
			skb->protocol = htons(ETH_P_IPV6);
			skb->ip_summed = csummode;
			skb->csum = 0;
			/* reserve for fragmentation and ipsec header */
			skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
				    dst_exthdrlen);

			/* Only the initial fragment is time stamped */
			skb_shinfo(skb)->tx_flags = tx_flags;
			tx_flags = 0;
			skb_shinfo(skb)->tskey = tskey;
			tskey = 0;

			/*
			 *	Find where to start putting bytes
			 */
			data = skb_put(skb, fraglen);
			skb_set_network_header(skb, exthdrlen);
			data += fragheaderlen;
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}
			copy = datalen - transhdrlen - fraggap;

			if (copy < 0) {
				err = -EINVAL;
				kfree_skb(skb);
				goto error;
			} else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= datalen - fraggap;
			transhdrlen = 0;
			exthdrlen = 0;
			dst_exthdrlen = 0;

			/*
			 * Put the packet on the pending queue
			 */
			__skb_queue_tail(queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		if (!(rt->dst.dev->features&NETIF_F_SG)) {
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
						offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else {
			int i = skb_shinfo(skb)->nr_frags;

			err = -ENOMEM;
			if (!sk_page_frag_refill(sk, pfrag))
				goto error;

			if (!skb_can_coalesce(skb, i, pfrag->page,
					      pfrag->offset)) {
				err = -EMSGSIZE;
				if (i == MAX_SKB_FRAGS)
					goto error;

				__skb_fill_page_desc(skb, i, pfrag->page,
						     pfrag->offset, 0);
				skb_shinfo(skb)->nr_frags = ++i;
				get_page(pfrag->page);
			}
			copy = min_t(int, copy, pfrag->size - pfrag->offset);
			if (getfrag(from,
				    page_address(pfrag->page) + pfrag->offset,
				    offset, copy, skb->len, skb) < 0)
				goto error_efault;

			pfrag->offset += copy;
			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			atomic_add(copy, &sk->sk_wmem_alloc);
		}
		offset += copy;
		length -= copy;
	}

	return 0;

error_efault:
	err = -EFAULT;
error:
	cork->length -= length;
	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	return err;
}

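/* Append data to an existing or newly started corked datagram on
 * sk->sk_write_queue. On the first call the cork is set up from @opt,
 * @rt and @fl6; later calls reuse the corked flow. The datagram is
 * completed by ip6_finish_skb() or ip6_push_pending_frames().
 *
 * A rough caller sketch, simplified from the UDPv6 sendmsg path (error
 * handling, option lookup and the corking decision are omitted; "corked"
 * stands for the caller's MSG_MORE/UDP_CORK state):
 *
 *	lock_sock(sk);
 *	err = ip6_append_data(sk, getfrag, msg, ulen,
 *			      sizeof(struct udphdr), hlimit, tclass,
 *			      opt, &fl6, rt, msg->msg_flags, dontfrag);
 *	if (err)
 *		ip6_flush_pending_frames(sk);
 *	else if (!corked)
 *		err = ip6_push_pending_frames(sk);
 *	release_sock(sk);
 */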
int ip6_append_data(struct sock *sk,
		    int getfrag(void *from, char *to, int offset, int len,
				int odd, struct sk_buff *skb),
		    void *from, int length, int transhdrlen, int hlimit,
		    int tclass, struct ipv6_txoptions *opt, struct flowi6 *fl6,
		    struct rt6_info *rt, unsigned int flags, int dontfrag)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	int exthdrlen;
	int err;

	if (flags&MSG_PROBE)
		return 0;
	if (skb_queue_empty(&sk->sk_write_queue)) {
		/*
		 * setup for corking
		 */
		err = ip6_setup_cork(sk, &inet->cork, &np->cork, hlimit,
				     tclass, opt, rt, fl6);
		if (err)
			return err;

		exthdrlen = (opt ? opt->opt_flen : 0);
		length += exthdrlen;
		transhdrlen += exthdrlen;
	} else {
		fl6 = &inet->cork.fl.u.ip6;
		transhdrlen = 0;
	}

	return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base,
				 &np->cork, sk_page_frag(sk), getfrag,
				 from, length, transhdrlen, flags, dontfrag);
}
EXPORT_SYMBOL_GPL(ip6_append_data);

static void ip6_cork_release(struct inet_cork_full *cork,
			     struct inet6_cork *v6_cork)
{
	if (v6_cork->opt) {
		kfree(v6_cork->opt->dst0opt);
		kfree(v6_cork->opt->dst1opt);
		kfree(v6_cork->opt->hopopt);
		kfree(v6_cork->opt->srcrt);
		kfree(v6_cork->opt);
		v6_cork->opt = NULL;
	}

	if (cork->base.dst) {
		dst_release(cork->base.dst);
		cork->base.dst = NULL;
		cork->base.flags &= ~IPCORK_ALLFRAG;
	}
	memset(&cork->fl, 0, sizeof(cork->fl));
}

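/* Collapse the queued skbs into a single datagram: chain the tail skbs
 * onto the head's frag_list, push the extension headers and the IPv6
 * header, update the statistics and release the cork. The caller owns
 * the resulting skb and must send or free it.
 */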
struct sk_buff *__ip6_make_skb(struct sock *sk,
			       struct sk_buff_head *queue,
			       struct inet_cork_full *cork,
			       struct inet6_cork *v6_cork)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct net *net = sock_net(sk);
	struct ipv6hdr *hdr;
	struct ipv6_txoptions *opt = v6_cork->opt;
	struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
	struct flowi6 *fl6 = &cork->fl.u.ip6;
	unsigned char proto = fl6->flowi6_proto;

	skb = __skb_dequeue(queue);
	if (!skb)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to the ip header from the ext header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Allow local fragmentation. */
	skb->ignore_df = ip6_sk_ignore_df(sk);

	*final_dst = fl6->daddr;
	__skb_pull(skb, skb_network_header_len(skb));
	if (opt && opt->opt_flen)
		ipv6_push_frag_opts(skb, opt, &proto);
	if (opt && opt->opt_nflen)
		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	ip6_flow_hdr(hdr, v6_cork->tclass,
		     ip6_make_flowlabel(net, skb, fl6->flowlabel,
					np->autoflowlabel, fl6));
	hdr->hop_limit = v6_cork->hop_limit;
	hdr->nexthdr = proto;
	hdr->saddr = fl6->saddr;
	hdr->daddr = *final_dst;

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	skb_dst_set(skb, dst_clone(&rt->dst));
	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
	if (proto == IPPROTO_ICMPV6) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
		ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
	}

	ip6_cork_release(cork, v6_cork);
out:
	return skb;
}

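/* Hand a datagram built by __ip6_make_skb() to ip6_local_out(),
 * translating congestion notifications via net_xmit_errno() and counting
 * real failures as output discards.
 */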
int ip6_send_skb(struct sk_buff *skb)
{
	struct net *net = sock_net(skb->sk);
	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
	int err;

	err = ip6_local_out(skb);
	if (err) {
		if (err > 0)
			err = net_xmit_errno(err);
		if (err)
			IP6_INC_STATS(net, rt->rt6i_idev,
				      IPSTATS_MIB_OUTDISCARDS);
	}

	return err;
}

int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb;

	skb = ip6_finish_skb(sk);
	if (!skb)
		return 0;

	return ip6_send_skb(skb);
}
EXPORT_SYMBOL_GPL(ip6_push_pending_frames);

static void __ip6_flush_pending_frames(struct sock *sk,
				       struct sk_buff_head *queue,
				       struct inet_cork_full *cork,
				       struct inet6_cork *v6_cork)
{
	struct sk_buff *skb;

	while ((skb = __skb_dequeue_tail(queue)) != NULL) {
		if (skb_dst(skb))
			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
	}

	ip6_cork_release(cork, v6_cork);
}

void ip6_flush_pending_frames(struct sock *sk)
{
	__ip6_flush_pending_frames(sk, &sk->sk_write_queue,
				   &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
}
EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);

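/* Build a complete datagram in one shot on a private queue, without
 * touching the socket's corking state: set up a local cork, append the
 * data and finish the skb. Returns NULL for MSG_PROBE, an ERR_PTR() on
 * failure, or an skb ready for ip6_send_skb().
 */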
struct sk_buff *ip6_make_skb(struct sock *sk,
			     int getfrag(void *from, char *to, int offset,
					 int len, int odd, struct sk_buff *skb),
			     void *from, int length, int transhdrlen,
			     int hlimit, int tclass,
			     struct ipv6_txoptions *opt, struct flowi6 *fl6,
			     struct rt6_info *rt, unsigned int flags,
			     int dontfrag)
{
	struct inet_cork_full cork;
	struct inet6_cork v6_cork;
	struct sk_buff_head queue;
	int exthdrlen = (opt ? opt->opt_flen : 0);
	int err;

	if (flags & MSG_PROBE)
		return NULL;

	__skb_queue_head_init(&queue);

	cork.base.flags = 0;
	cork.base.addr = 0;
	cork.base.opt = NULL;
	v6_cork.opt = NULL;
	err = ip6_setup_cork(sk, &cork, &v6_cork, hlimit, tclass, opt, rt, fl6);
	if (err)
		return ERR_PTR(err);

	if (dontfrag < 0)
		dontfrag = inet6_sk(sk)->dontfrag;

	err = __ip6_append_data(sk, fl6, &queue, &cork.base, &v6_cork,
				&current->task_frag, getfrag, from,
				length + exthdrlen, transhdrlen + exthdrlen,
				flags, dontfrag);
	if (err) {
		__ip6_flush_pending_frames(sk, &queue, &cork, &v6_cork);
		return ERR_PTR(err);
	}

	return __ip6_make_skb(sk, &queue, &cork, &v6_cork);
}