xref: /openbmc/linux/net/ipv6/ip6_output.c (revision b96fc2f3)
1 /*
2  *	IPv6 output functions
3  *	Linux INET6 implementation
4  *
5  *	Authors:
6  *	Pedro Roque		<roque@di.fc.ul.pt>
7  *
8  *	Based on linux/net/ipv4/ip_output.c
9  *
10  *	This program is free software; you can redistribute it and/or
11  *      modify it under the terms of the GNU General Public License
12  *      as published by the Free Software Foundation; either version
13  *      2 of the License, or (at your option) any later version.
14  *
15  *	Changes:
16  *	A.N.Kuznetsov	:	arithmetics in fragmentation.
17  *				extension headers are implemented.
18  *				route changes now work.
19  *				ip6_forward does not confuse sniffers.
20  *				etc.
21  *
22  *      H. von Brand    :       Added missing #include <linux/string.h>
23  *	Imran Patel	:	frag id should be in NBO
24  *      Kazunori MIYAZAWA @USAGI
25  *			:       add ip6_append_data and related functions
26  *				for datagram xmit
27  */
28 
29 #include <linux/errno.h>
30 #include <linux/kernel.h>
31 #include <linux/string.h>
32 #include <linux/socket.h>
33 #include <linux/net.h>
34 #include <linux/netdevice.h>
35 #include <linux/if_arp.h>
36 #include <linux/in6.h>
37 #include <linux/tcp.h>
38 #include <linux/route.h>
39 #include <linux/module.h>
40 #include <linux/slab.h>
41 
42 #include <linux/netfilter.h>
43 #include <linux/netfilter_ipv6.h>
44 
45 #include <net/sock.h>
46 #include <net/snmp.h>
47 
48 #include <net/ipv6.h>
49 #include <net/ndisc.h>
50 #include <net/protocol.h>
51 #include <net/ip6_route.h>
52 #include <net/addrconf.h>
53 #include <net/rawv6.h>
54 #include <net/icmp.h>
55 #include <net/xfrm.h>
56 #include <net/checksum.h>
57 #include <linux/mroute6.h>
58 
59 static int ip6_finish_output2(struct sock *sk, struct sk_buff *skb)
60 {
61 	struct dst_entry *dst = skb_dst(skb);
62 	struct net_device *dev = dst->dev;
63 	struct neighbour *neigh;
64 	struct in6_addr *nexthop;
65 	int ret;
66 
67 	skb->protocol = htons(ETH_P_IPV6);
68 	skb->dev = dev;
69 
70 	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
71 		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
72 
73 		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
74 		    ((mroute6_socket(dev_net(dev), skb) &&
75 		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
76 		     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
77 					 &ipv6_hdr(skb)->saddr))) {
78 			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
79 
80 			/* Do not check for IFF_ALLMULTI; multicast routing
81 			   is not supported in any case.
82 			 */
83 			if (newskb)
84 				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
85 					sk, newskb, NULL, newskb->dev,
86 					dev_loopback_xmit);
87 
88 			if (ipv6_hdr(skb)->hop_limit == 0) {
89 				IP6_INC_STATS(dev_net(dev), idev,
90 					      IPSTATS_MIB_OUTDISCARDS);
91 				kfree_skb(skb);
92 				return 0;
93 			}
94 		}
95 
96 		IP6_UPD_PO_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCAST,
97 				skb->len);
98 
99 		if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
100 		    IPV6_ADDR_SCOPE_NODELOCAL &&
101 		    !(dev->flags & IFF_LOOPBACK)) {
102 			kfree_skb(skb);
103 			return 0;
104 		}
105 	}
106 
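	/* Resolve the L2 neighbour for the next hop under
	 * rcu_read_lock_bh(): the _noref lookup does not take a
	 * reference, so the neighbour is only valid inside this
	 * read-side critical section.
	 */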
107 	rcu_read_lock_bh();
108 	nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
109 	neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
110 	if (unlikely(!neigh))
111 		neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
112 	if (!IS_ERR(neigh)) {
113 		ret = dst_neigh_output(dst, neigh, skb);
114 		rcu_read_unlock_bh();
115 		return ret;
116 	}
117 	rcu_read_unlock_bh();
118 
119 	IP6_INC_STATS(dev_net(dst->dev),
120 		      ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
121 	kfree_skb(skb);
122 	return -EINVAL;
123 }
124 
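/* Fragment here when the packet exceeds the path MTU and is not GSO
 * (GSO packets are segmented further down the stack), when the route
 * requires a Fragment header on every packet (dst_allfrag, set when
 * the peer reported a path MTU below the 1280-byte IPv6 minimum), or
 * when conntrack defragmentation recorded a smaller original fragment
 * size in frag_max_size.
 */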
125 static int ip6_finish_output(struct sock *sk, struct sk_buff *skb)
126 {
127 	if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
128 	    dst_allfrag(skb_dst(skb)) ||
129 	    (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
130 		return ip6_fragment(sk, skb, ip6_finish_output2);
131 	else
132 		return ip6_finish_output2(sk, skb);
133 }
134 
135 int ip6_output(struct sock *sk, struct sk_buff *skb)
136 {
137 	struct net_device *dev = skb_dst(skb)->dev;
138 	struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
139 	if (unlikely(idev->cnf.disable_ipv6)) {
140 		IP6_INC_STATS(dev_net(dev), idev,
141 			      IPSTATS_MIB_OUTDISCARDS);
142 		kfree_skb(skb);
143 		return 0;
144 	}
145 
146 	return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, sk, skb,
147 			    NULL, dev,
148 			    ip6_finish_output,
149 			    !(IP6CB(skb)->flags & IP6SKB_REROUTED));
150 }
151 
152 /*
153  *	xmit an sk_buff (used by TCP, SCTP and DCCP)
154  */
155 
156 int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
157 	     struct ipv6_txoptions *opt, int tclass)
158 {
159 	struct net *net = sock_net(sk);
160 	struct ipv6_pinfo *np = inet6_sk(sk);
161 	struct in6_addr *first_hop = &fl6->daddr;
162 	struct dst_entry *dst = skb_dst(skb);
163 	struct ipv6hdr *hdr;
164 	u8  proto = fl6->flowi6_proto;
165 	int seg_len = skb->len;
166 	int hlimit = -1;
167 	u32 mtu;
168 
169 	if (opt) {
170 		unsigned int head_room;
171 
172 		/* First: exthdrs may take lots of space (~8K for now);
173 		   MAX_HEADER is not enough.
174 		 */
175 		head_room = opt->opt_nflen + opt->opt_flen;
176 		seg_len += head_room;
177 		head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
178 
179 		if (skb_headroom(skb) < head_room) {
180 			struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
181 			if (!skb2) {
182 				IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
183 					      IPSTATS_MIB_OUTDISCARDS);
184 				kfree_skb(skb);
185 				return -ENOBUFS;
186 			}
187 			consume_skb(skb);
188 			skb = skb2;
189 			skb_set_owner_w(skb, sk);
190 		}
191 		if (opt->opt_flen)
192 			ipv6_push_frag_opts(skb, opt, &proto);
193 		if (opt->opt_nflen)
194 			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
195 	}
196 
197 	skb_push(skb, sizeof(struct ipv6hdr));
198 	skb_reset_network_header(skb);
199 	hdr = ipv6_hdr(skb);
200 
201 	/*
202 	 *	Fill in the IPv6 header
203 	 */
204 	if (np)
205 		hlimit = np->hop_limit;
206 	if (hlimit < 0)
207 		hlimit = ip6_dst_hoplimit(dst);
208 
209 	ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
210 						     np->autoflowlabel, fl6));
211 
212 	hdr->payload_len = htons(seg_len);
213 	hdr->nexthdr = proto;
214 	hdr->hop_limit = hlimit;
215 
216 	hdr->saddr = fl6->saddr;
217 	hdr->daddr = *first_hop;
218 
219 	skb->protocol = htons(ETH_P_IPV6);
220 	skb->priority = sk->sk_priority;
221 	skb->mark = sk->sk_mark;
222 
223 	mtu = dst_mtu(dst);
224 	if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
225 		IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
226 			      IPSTATS_MIB_OUT, skb->len);
227 		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, sk, skb,
228 			       NULL, dst->dev, dst_output_sk);
229 	}
230 
231 	skb->dev = dst->dev;
232 	ipv6_local_error(sk, EMSGSIZE, fl6, mtu);
233 	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
234 	kfree_skb(skb);
235 	return -EMSGSIZE;
236 }
237 EXPORT_SYMBOL(ip6_xmit);
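
/* Illustrative caller sketch (assumed, not part of this file): a
 * connection-oriented protocol fills in a flowi6 and hands a fully
 * built skb to ip6_xmit(), in the spirit of inet6_csk_xmit():
 *
 *	struct ipv6_pinfo *np = inet6_sk(sk);
 *	struct flowi6 fl6;
 *
 *	memset(&fl6, 0, sizeof(fl6));
 *	fl6.flowi6_proto = sk->sk_protocol;
 *	fl6.daddr = sk->sk_v6_daddr;
 *	fl6.saddr = np->saddr;
 *	fl6.flowi6_oif = sk->sk_bound_dev_if;
 *	...
 *	res = ip6_xmit(sk, skb, &fl6, np->opt, np->tclass);
 */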
238 
239 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
240 {
241 	struct ip6_ra_chain *ra;
242 	struct sock *last = NULL;
243 
244 	read_lock(&ip6_ra_lock);
245 	for (ra = ip6_ra_chain; ra; ra = ra->next) {
246 		struct sock *sk = ra->sk;
247 		if (sk && ra->sel == sel &&
248 		    (!sk->sk_bound_dev_if ||
249 		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
250 			if (last) {
251 				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
252 				if (skb2)
253 					rawv6_rcv(last, skb2);
254 			}
255 			last = sk;
256 		}
257 	}
258 
259 	if (last) {
260 		rawv6_rcv(last, skb);
261 		read_unlock(&ip6_ra_lock);
262 		return 1;
263 	}
264 	read_unlock(&ip6_ra_lock);
265 	return 0;
266 }
267 
268 static int ip6_forward_proxy_check(struct sk_buff *skb)
269 {
270 	struct ipv6hdr *hdr = ipv6_hdr(skb);
271 	u8 nexthdr = hdr->nexthdr;
272 	__be16 frag_off;
273 	int offset;
274 
275 	if (ipv6_ext_hdr(nexthdr)) {
276 		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
277 		if (offset < 0)
278 			return 0;
279 	} else
280 		offset = sizeof(struct ipv6hdr);
281 
282 	if (nexthdr == IPPROTO_ICMPV6) {
283 		struct icmp6hdr *icmp6;
284 
285 		if (!pskb_may_pull(skb, (skb_network_header(skb) +
286 					 offset + 1 - skb->data)))
287 			return 0;
288 
289 		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
290 
291 		switch (icmp6->icmp6_type) {
292 		case NDISC_ROUTER_SOLICITATION:
293 		case NDISC_ROUTER_ADVERTISEMENT:
294 		case NDISC_NEIGHBOUR_SOLICITATION:
295 		case NDISC_NEIGHBOUR_ADVERTISEMENT:
296 		case NDISC_REDIRECT:
297 			/* For unicast neighbor discovery messages destined
298 			 * to the proxied address, pass them to the input
299 			 * function.
300 			 */
301 			return 1;
302 		default:
303 			break;
304 		}
305 	}
306 
307 	/*
308 	 * The proxying router can't forward traffic sent to a link-local
309 	 * address, so signal the sender and discard the packet. This
310 	 * behavior is clarified by the MIPv6 specification.
311 	 */
312 	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
313 		dst_link_failure(skb);
314 		return -1;
315 	}
316 
317 	return 0;
318 }
319 
320 static inline int ip6_forward_finish(struct sock *sk, struct sk_buff *skb)
321 {
322 	skb_sender_cpu_clear(skb);
323 	return dst_output_sk(sk, skb);
324 }
325 
326 static unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst)
327 {
328 	unsigned int mtu;
329 	struct inet6_dev *idev;
330 
331 	if (dst_metric_locked(dst, RTAX_MTU)) {
332 		mtu = dst_metric_raw(dst, RTAX_MTU);
333 		if (mtu)
334 			return mtu;
335 	}
336 
337 	mtu = IPV6_MIN_MTU;
338 	rcu_read_lock();
339 	idev = __in6_dev_get(dst->dev);
340 	if (idev)
341 		mtu = idev->cnf.mtu6;
342 	rcu_read_unlock();
343 
344 	return mtu;
345 }
346 
347 static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
348 {
349 	if (skb->len <= mtu)
350 		return false;
351 
352 	/* ipv6 conntrack defrag sets max_frag_size + ignore_df */
353 	if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
354 		return true;
355 
356 	if (skb->ignore_df)
357 		return false;
358 
359 	if (skb_is_gso(skb) && skb_gso_network_seglen(skb) <= mtu)
360 		return false;
361 
362 	return true;
363 }
364 
365 int ip6_forward(struct sk_buff *skb)
366 {
367 	struct dst_entry *dst = skb_dst(skb);
368 	struct ipv6hdr *hdr = ipv6_hdr(skb);
369 	struct inet6_skb_parm *opt = IP6CB(skb);
370 	struct net *net = dev_net(dst->dev);
371 	u32 mtu;
372 
373 	if (net->ipv6.devconf_all->forwarding == 0)
374 		goto error;
375 
376 	if (skb->pkt_type != PACKET_HOST)
377 		goto drop;
378 
379 	if (skb_warn_if_lro(skb))
380 		goto drop;
381 
382 	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
383 		IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
384 				 IPSTATS_MIB_INDISCARDS);
385 		goto drop;
386 	}
387 
388 	skb_forward_csum(skb);
389 
390 	/*
391 	 *	We DO NOT do any processing on
392 	 *	RA packets; we push them to user level AS IS
393 	 *	without any guarantee that the application will be
394 	 *	able to interpret them. The reason is that we
395 	 *	cannot do anything clever here.
396 	 *
397 	 *	We are not an end node, so if the packet contains
398 	 *	AH/ESP we cannot do anything.
399 	 *	Defragmentation would also be a mistake; RA packets
400 	 *	cannot be fragmented, because there is no guarantee
401 	 *	that different fragments will travel along one path. --ANK
402 	 */
403 	if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
404 		if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
405 			return 0;
406 	}
407 
408 	/*
409 	 *	check and decrement the hop limit
410 	 */
411 	if (hdr->hop_limit <= 1) {
412 		/* Force OUTPUT device used as source address */
413 		skb->dev = dst->dev;
414 		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
415 		IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
416 				 IPSTATS_MIB_INHDRERRORS);
417 
418 		kfree_skb(skb);
419 		return -ETIMEDOUT;
420 	}
421 
422 	/* XXX: idev->cnf.proxy_ndp? */
423 	if (net->ipv6.devconf_all->proxy_ndp &&
424 	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
425 		int proxied = ip6_forward_proxy_check(skb);
426 		if (proxied > 0)
427 			return ip6_input(skb);
428 		else if (proxied < 0) {
429 			IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
430 					 IPSTATS_MIB_INDISCARDS);
431 			goto drop;
432 		}
433 	}
434 
435 	if (!xfrm6_route_forward(skb)) {
436 		IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
437 				 IPSTATS_MIB_INDISCARDS);
438 		goto drop;
439 	}
440 	dst = skb_dst(skb);
441 
442 	/* IPv6 specs say nothing about it, but it is clear that we cannot
443 	   send redirects to source-routed frames.
444 	   We don't send redirects to frames decapsulated from IPsec.
445 	 */
446 	if (skb->dev == dst->dev && opt->srcrt == 0 && !skb_sec_path(skb)) {
447 		struct in6_addr *target = NULL;
448 		struct inet_peer *peer;
449 		struct rt6_info *rt;
450 
451 		/*
452 		 *	incoming and outgoing devices are the same;
453 		 *	send a redirect.
454 		 */
455 
456 		rt = (struct rt6_info *) dst;
457 		if (rt->rt6i_flags & RTF_GATEWAY)
458 			target = &rt->rt6i_gateway;
459 		else
460 			target = &hdr->daddr;
461 
462 		peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);
463 
464 		/* Limit redirects both by destination (here)
465 		   and by source (inside ndisc_send_redirect)
466 		 */
467 		if (inet_peer_xrlim_allow(peer, 1*HZ))
468 			ndisc_send_redirect(skb, target);
469 		if (peer)
470 			inet_putpeer(peer);
471 	} else {
472 		int addrtype = ipv6_addr_type(&hdr->saddr);
473 
474 		/* This check is security critical. */
475 		if (addrtype == IPV6_ADDR_ANY ||
476 		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
477 			goto error;
478 		if (addrtype & IPV6_ADDR_LINKLOCAL) {
479 			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
480 				    ICMPV6_NOT_NEIGHBOUR, 0);
481 			goto error;
482 		}
483 	}
484 
485 	mtu = ip6_dst_mtu_forward(dst);
486 	if (mtu < IPV6_MIN_MTU)
487 		mtu = IPV6_MIN_MTU;
488 
489 	if (ip6_pkt_too_big(skb, mtu)) {
490 		/* Again, force OUTPUT device used as source address */
491 		skb->dev = dst->dev;
492 		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
493 		IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
494 				 IPSTATS_MIB_INTOOBIGERRORS);
495 		IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
496 				 IPSTATS_MIB_FRAGFAILS);
497 		kfree_skb(skb);
498 		return -EMSGSIZE;
499 	}
500 
501 	if (skb_cow(skb, dst->dev->hard_header_len)) {
502 		IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
503 				 IPSTATS_MIB_OUTDISCARDS);
504 		goto drop;
505 	}
506 
507 	hdr = ipv6_hdr(skb);
508 
509 	/* Mangling the hop limit is delayed until after the skb COW */
510 
511 	hdr->hop_limit--;
512 
513 	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
514 	IP6_ADD_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
515 	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, NULL, skb,
516 		       skb->dev, dst->dev,
517 		       ip6_forward_finish);
518 
519 error:
520 	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
521 drop:
522 	kfree_skb(skb);
523 	return -EINVAL;
524 }
525 
526 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
527 {
528 	to->pkt_type = from->pkt_type;
529 	to->priority = from->priority;
530 	to->protocol = from->protocol;
531 	skb_dst_drop(to);
532 	skb_dst_set(to, dst_clone(skb_dst(from)));
533 	to->dev = from->dev;
534 	to->mark = from->mark;
535 
536 #ifdef CONFIG_NET_SCHED
537 	to->tc_index = from->tc_index;
538 #endif
539 	nf_copy(to, from);
540 	skb_copy_secmark(to, from);
541 }
542 
543 int ip6_fragment(struct sock *sk, struct sk_buff *skb,
544 		 int (*output)(struct sock *, struct sk_buff *))
545 {
546 	struct sk_buff *frag;
547 	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
548 	struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
549 				inet6_sk(skb->sk) : NULL;
550 	struct ipv6hdr *tmp_hdr;
551 	struct frag_hdr *fh;
552 	unsigned int mtu, hlen, left, len;
553 	int hroom, troom;
554 	__be32 frag_id;
555 	int ptr, offset = 0, err = 0;
556 	u8 *prevhdr, nexthdr = 0;
557 	struct net *net = dev_net(skb_dst(skb)->dev);
558 
559 	hlen = ip6_find_1stfragopt(skb, &prevhdr);
560 	nexthdr = *prevhdr;
561 
562 	mtu = ip6_skb_dst_mtu(skb);
563 
564 	/* We must not fragment if the socket is set to force MTU discovery
565 	 * or if the skb is not generated by a local socket.
566 	 */
567 	if (unlikely(!skb->ignore_df && skb->len > mtu))
568 		goto fail_toobig;
569 
570 	if (IP6CB(skb)->frag_max_size) {
571 		if (IP6CB(skb)->frag_max_size > mtu)
572 			goto fail_toobig;
573 
574 		/* don't send fragments larger than what we received */
575 		mtu = IP6CB(skb)->frag_max_size;
576 		if (mtu < IPV6_MIN_MTU)
577 			mtu = IPV6_MIN_MTU;
578 	}
579 
580 	if (np && np->frag_size < mtu) {
581 		if (np->frag_size)
582 			mtu = np->frag_size;
583 	}
584 	mtu -= hlen + sizeof(struct frag_hdr);
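	/* Worked example with assumed numbers: for a 1500-byte MTU and
	 * hlen = 40 (bare IPv6 header, no extension headers before the
	 * fragmentable part), mtu is now 1500 - 40 - 8 = 1452 bytes of
	 * fragmentable payload per fragment.
	 */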
585 
586 	frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
587 				    &ipv6_hdr(skb)->saddr);
588 
589 	hroom = LL_RESERVED_SPACE(rt->dst.dev);
590 	if (skb_has_frag_list(skb)) {
591 		int first_len = skb_pagelen(skb);
592 		struct sk_buff *frag2;
593 
594 		if (first_len - hlen > mtu ||
595 		    ((first_len - hlen) & 7) ||
596 		    skb_cloned(skb) ||
597 		    skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
598 			goto slow_path;
599 
600 		skb_walk_frags(skb, frag) {
601 			/* Correct geometry. */
602 			if (frag->len > mtu ||
603 			    ((frag->len & 7) && frag->next) ||
604 			    skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
605 				goto slow_path_clean;
606 
607 			/* Partially cloned skb? */
608 			if (skb_shared(frag))
609 				goto slow_path_clean;
610 
611 			BUG_ON(frag->sk);
612 			if (skb->sk) {
613 				frag->sk = skb->sk;
614 				frag->destructor = sock_wfree;
615 			}
616 			skb->truesize -= frag->truesize;
617 		}
618 
619 		err = 0;
620 		offset = 0;
621 		/* BUILD HEADER */
622 
623 		*prevhdr = NEXTHDR_FRAGMENT;
624 		tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
625 		if (!tmp_hdr) {
626 			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
627 				      IPSTATS_MIB_FRAGFAILS);
628 			err = -ENOMEM;
629 			goto fail;
630 		}
631 		frag = skb_shinfo(skb)->frag_list;
632 		skb_frag_list_init(skb);
633 
634 		__skb_pull(skb, hlen);
635 		fh = (struct frag_hdr *)__skb_push(skb, sizeof(struct frag_hdr));
636 		__skb_push(skb, hlen);
637 		skb_reset_network_header(skb);
638 		memcpy(skb_network_header(skb), tmp_hdr, hlen);
639 
640 		fh->nexthdr = nexthdr;
641 		fh->reserved = 0;
642 		fh->frag_off = htons(IP6_MF);
643 		fh->identification = frag_id;
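		/* Fragment header layout (RFC 2460, section 4.5):
		 *
		 *	| Next Header (8) | Reserved (8) | Fragment Offset (13) | Res (2) | M (1) |
		 *	|                        Identification (32)                              |
		 *
		 * frag_off packs the 13-bit offset (in 8-octet units)
		 * together with the M ("more fragments") bit, which is why
		 * IP6_MF is OR'd into this network-byte-order field.
		 */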
644 
645 		first_len = skb_pagelen(skb);
646 		skb->data_len = first_len - skb_headlen(skb);
647 		skb->len = first_len;
648 		ipv6_hdr(skb)->payload_len = htons(first_len -
649 						   sizeof(struct ipv6hdr));
650 
651 		dst_hold(&rt->dst);
652 
653 		for (;;) {
654 			/* Prepare the header of the next frame
655 			 * before the previous one goes down. */
656 			if (frag) {
657 				frag->ip_summed = CHECKSUM_NONE;
658 				skb_reset_transport_header(frag);
659 				fh = (struct frag_hdr *)__skb_push(frag, sizeof(struct frag_hdr));
660 				__skb_push(frag, hlen);
661 				skb_reset_network_header(frag);
662 				memcpy(skb_network_header(frag), tmp_hdr,
663 				       hlen);
664 				offset += skb->len - hlen - sizeof(struct frag_hdr);
665 				fh->nexthdr = nexthdr;
666 				fh->reserved = 0;
667 				fh->frag_off = htons(offset);
668 				if (frag->next)
669 					fh->frag_off |= htons(IP6_MF);
670 				fh->identification = frag_id;
671 				ipv6_hdr(frag)->payload_len =
672 						htons(frag->len -
673 						      sizeof(struct ipv6hdr));
674 				ip6_copy_metadata(frag, skb);
675 			}
676 
677 			err = output(sk, skb);
678 			if (!err)
679 				IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
680 					      IPSTATS_MIB_FRAGCREATES);
681 
682 			if (err || !frag)
683 				break;
684 
685 			skb = frag;
686 			frag = skb->next;
687 			skb->next = NULL;
688 		}
689 
690 		kfree(tmp_hdr);
691 
692 		if (err == 0) {
693 			IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
694 				      IPSTATS_MIB_FRAGOKS);
695 			ip6_rt_put(rt);
696 			return 0;
697 		}
698 
699 		kfree_skb_list(frag);
700 
701 		IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
702 			      IPSTATS_MIB_FRAGFAILS);
703 		ip6_rt_put(rt);
704 		return err;
705 
706 slow_path_clean:
707 		skb_walk_frags(skb, frag2) {
708 			if (frag2 == frag)
709 				break;
710 			frag2->sk = NULL;
711 			frag2->destructor = NULL;
712 			skb->truesize += frag2->truesize;
713 		}
714 	}
715 
716 slow_path:
717 	if ((skb->ip_summed == CHECKSUM_PARTIAL) &&
718 	    skb_checksum_help(skb))
719 		goto fail;
720 
721 	left = skb->len - hlen;		/* Space per frame */
722 	ptr = hlen;			/* Where to start from */
723 
724 	/*
725 	 *	Fragment the datagram.
726 	 */
727 
728 	*prevhdr = NEXTHDR_FRAGMENT;
729 	troom = rt->dst.dev->needed_tailroom;
730 
731 	/*
732 	 *	Keep copying data until we run out.
733 	 */
734 	while (left > 0)	{
735 		len = left;
736 		/* IF: it doesn't fit, use 'mtu' - the data space left */
737 		if (len > mtu)
738 			len = mtu;
739 		/* IF: we are not sending up to and including the packet end,
740 		   then align the next start on an eight-byte boundary */
741 		if (len < left)	{
742 			len &= ~7;
743 		}
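
		/* Worked example with assumed numbers: continuing the
		 * mtu = 1452 case above, a non-final fragment carries
		 * 1452 & ~7 = 1448 bytes, keeping every fragment offset
		 * a multiple of 8 octets as the Fragment header requires.
		 */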
744 
745 		/* Allocate buffer */
746 		frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
747 				 hroom + troom, GFP_ATOMIC);
748 		if (!frag) {
749 			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
750 				      IPSTATS_MIB_FRAGFAILS);
751 			err = -ENOMEM;
752 			goto fail;
753 		}
754 
755 		/*
756 		 *	Set up data on packet
757 		 */
758 
759 		ip6_copy_metadata(frag, skb);
760 		skb_reserve(frag, hroom);
761 		skb_put(frag, len + hlen + sizeof(struct frag_hdr));
762 		skb_reset_network_header(frag);
763 		fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
764 		frag->transport_header = (frag->network_header + hlen +
765 					  sizeof(struct frag_hdr));
766 
767 		/*
768 		 *	Charge the memory for the fragment to any owner
769 		 *	it might possess
770 		 */
771 		if (skb->sk)
772 			skb_set_owner_w(frag, skb->sk);
773 
774 		/*
775 		 *	Copy the packet header into the new buffer.
776 		 */
777 		skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
778 
779 		/*
780 		 *	Build fragment header.
781 		 */
782 		fh->nexthdr = nexthdr;
783 		fh->reserved = 0;
784 		fh->identification = frag_id;
785 
786 		/*
787 		 *	Copy a block of the IP datagram.
788 		 */
789 		BUG_ON(skb_copy_bits(skb, ptr, skb_transport_header(frag),
790 				     len));
791 		left -= len;
792 
793 		fh->frag_off = htons(offset);
794 		if (left > 0)
795 			fh->frag_off |= htons(IP6_MF);
796 		ipv6_hdr(frag)->payload_len = htons(frag->len -
797 						    sizeof(struct ipv6hdr));
798 
799 		ptr += len;
800 		offset += len;
801 
802 		/*
803 		 *	Put this fragment into the sending queue.
804 		 */
805 		err = output(sk, frag);
806 		if (err)
807 			goto fail;
808 
809 		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
810 			      IPSTATS_MIB_FRAGCREATES);
811 	}
812 	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
813 		      IPSTATS_MIB_FRAGOKS);
814 	consume_skb(skb);
815 	return err;
816 
817 fail_toobig:
818 	if (skb->sk && dst_allfrag(skb_dst(skb)))
819 		sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);
820 
821 	skb->dev = skb_dst(skb)->dev;
822 	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
823 	err = -EMSGSIZE;
824 
825 fail:
826 	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
827 		      IPSTATS_MIB_FRAGFAILS);
828 	kfree_skb(skb);
829 	return err;
830 }
831 
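/* Returns true ("cached dst may be stale") when the route key is not
 * an exact /128 match for the flow address and the cached last-used
 * address (if any) does not match either.
 */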
832 static inline int ip6_rt_check(const struct rt6key *rt_key,
833 			       const struct in6_addr *fl_addr,
834 			       const struct in6_addr *addr_cache)
835 {
836 	return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
837 		(!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
838 }
839 
840 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
841 					  struct dst_entry *dst,
842 					  const struct flowi6 *fl6)
843 {
844 	struct ipv6_pinfo *np = inet6_sk(sk);
845 	struct rt6_info *rt;
846 
847 	if (!dst)
848 		goto out;
849 
850 	if (dst->ops->family != AF_INET6) {
851 		dst_release(dst);
852 		return NULL;
853 	}
854 
855 	rt = (struct rt6_info *)dst;
856 	/* Yes, checking route validity in the unconnected
857 	 * case is not very simple. Take into account
858 	 * that we do not support routing by source, TOS,
859 	 * or MSG_DONTROUTE		--ANK (980726)
860 	 *
861 	 * 1. ip6_rt_check(): If the route was a host route,
862 	 *    check that the cached destination is current.
863 	 *    If it is a network route, we still may
864 	 *    check its validity using a saved pointer
865 	 *    to the last used address: daddr_cache.
866 	 *    We do not want to save the whole address now
867 	 *    (because the main consumer of this service
868 	 *    is TCP, which does not have this problem),
869 	 *    so the last trick works only on connected
870 	 *    sockets.
871 	 * 2. The oif should also be the same.
872 	 */
873 	if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
874 #ifdef CONFIG_IPV6_SUBTREES
875 	    ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
876 #endif
877 	    (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) {
878 		dst_release(dst);
879 		dst = NULL;
880 	}
881 
882 out:
883 	return dst;
884 }
885 
886 static int ip6_dst_lookup_tail(struct net *net, struct sock *sk,
887 			       struct dst_entry **dst, struct flowi6 *fl6)
888 {
889 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
890 	struct neighbour *n;
891 	struct rt6_info *rt;
892 #endif
893 	int err;
894 
895 	/* The correct way to handle this would be to do
896 	 * ip6_route_get_saddr, and then ip6_route_output; however,
897 	 * the route-specific preferred source forces the
898 	 * ip6_route_output call _before_ ip6_route_get_saddr.
899 	 *
900 	 * In source-specific routing (no src=any default route),
901 	 * ip6_route_output will fail given a src=any saddr, though,
902 	 * which is why we try it again later.
903 	 */
904 	if (ipv6_addr_any(&fl6->saddr) && (!*dst || !(*dst)->error)) {
905 		struct rt6_info *rt;
906 		bool had_dst = *dst != NULL;
907 
908 		if (!had_dst)
909 			*dst = ip6_route_output(net, sk, fl6);
910 		rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;
911 		err = ip6_route_get_saddr(net, rt, &fl6->daddr,
912 					  sk ? inet6_sk(sk)->srcprefs : 0,
913 					  &fl6->saddr);
914 		if (err)
915 			goto out_err_release;
916 
917 		/* If we had an erroneous initial result, pretend it
918 		 * never existed and let the SA-enabled version take
919 		 * over.
920 		 */
921 		if (!had_dst && (*dst)->error) {
922 			dst_release(*dst);
923 			*dst = NULL;
924 		}
925 	}
926 
927 	if (!*dst)
928 		*dst = ip6_route_output(net, sk, fl6);
929 
930 	err = (*dst)->error;
931 	if (err)
932 		goto out_err_release;
933 
934 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
935 	/*
936 	 * Here, if the dst entry we've looked up
937 	 * has a neighbour entry that is in the INCOMPLETE
938 	 * state and the src address from the flow is
939 	 * marked as OPTIMISTIC, we release the found
940 	 * dst entry and replace it with the
941 	 * dst entry of the nexthop router.
942 	 */
943 	rt = (struct rt6_info *) *dst;
944 	rcu_read_lock_bh();
945 	n = __ipv6_neigh_lookup_noref(rt->dst.dev,
946 				      rt6_nexthop(rt, &fl6->daddr));
947 	err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
948 	rcu_read_unlock_bh();
949 
950 	if (err) {
951 		struct inet6_ifaddr *ifp;
952 		struct flowi6 fl_gw6;
953 		int redirect;
954 
955 		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
956 				      (*dst)->dev, 1);
957 
958 		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
959 		if (ifp)
960 			in6_ifa_put(ifp);
961 
962 		if (redirect) {
963 			/*
964 			 * We need to get the dst entry for the
965 			 * default router instead
966 			 */
967 			dst_release(*dst);
968 			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
969 			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
970 			*dst = ip6_route_output(net, sk, &fl_gw6);
971 			err = (*dst)->error;
972 			if (err)
973 				goto out_err_release;
974 		}
975 	}
976 #endif
977 
978 	return 0;
979 
980 out_err_release:
981 	if (err == -ENETUNREACH)
982 		IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
983 	dst_release(*dst);
984 	*dst = NULL;
985 	return err;
986 }
987 
988 /**
989  *	ip6_dst_lookup - perform route lookup on flow
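 *	@net: network namespace of the route lookup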
990  *	@sk: socket which provides route info
991  *	@dst: pointer to dst_entry * for result
992  *	@fl6: flow to lookup
993  *
994  *	This function performs a route lookup on the given flow.
995  *
996  *	It returns zero on success, or a standard errno code on error.
997  */
998 int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
999 		   struct flowi6 *fl6)
1000 {
1001 	*dst = NULL;
1002 	return ip6_dst_lookup_tail(net, sk, dst, fl6);
1003 }
1004 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1005 
1006 /**
1007  *	ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1008  *	@sk: socket which provides route info
1009  *	@fl6: flow to lookup
1010  *	@final_dst: final destination address for ipsec lookup
1011  *
1012  *	This function performs a route lookup on the given flow.
1013  *
1014  *	It returns a valid dst pointer on success, or a pointer encoded
1015  *	error code.
1016  */
1017 struct dst_entry *ip6_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1018 				      const struct in6_addr *final_dst)
1019 {
1020 	struct dst_entry *dst = NULL;
1021 	int err;
1022 
1023 	err = ip6_dst_lookup_tail(sock_net(sk), sk, &dst, fl6);
1024 	if (err)
1025 		return ERR_PTR(err);
1026 	if (final_dst)
1027 		fl6->daddr = *final_dst;
1028 	if (!fl6->flowi6_oif)
1029 		fl6->flowi6_oif = dst->dev->ifindex;
1030 
1031 	return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1032 }
1033 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
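
/* Since the result is pointer-encoded, callers typically check it with
 * IS_ERR()/PTR_ERR() (illustrative sketch):
 *
 *	dst = ip6_dst_lookup_flow(sk, &fl6, final_p);
 *	if (IS_ERR(dst)) {
 *		err = PTR_ERR(dst);
 *		goto out;
 *	}
 */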
1034 
1035 /**
1036  *	ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1037  *	@sk: socket which provides the dst cache and route info
1038  *	@fl6: flow to lookup
1039  *	@final_dst: final destination address for ipsec lookup
1040  *
1041  *	This function performs a route lookup on the given flow with the
1042  *	possibility of using the cached route in the socket if it is valid.
1043  *	It will take the socket dst lock when operating on the dst cache.
1044  *	As a result, this function can only be used in process context.
1045  *
1046  *	It returns a valid dst pointer on success, or a pointer encoded
1047  *	error code.
1048  */
1049 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1050 					 const struct in6_addr *final_dst)
1051 {
1052 	struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1053 	int err;
1054 
1055 	dst = ip6_sk_dst_check(sk, dst, fl6);
1056 
1057 	err = ip6_dst_lookup_tail(sock_net(sk), sk, &dst, fl6);
1058 	if (err)
1059 		return ERR_PTR(err);
1060 	if (final_dst)
1061 		fl6->daddr = *final_dst;
1062 
1063 	return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1064 }
1065 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1066 
1067 static inline int ip6_ufo_append_data(struct sock *sk,
1068 			struct sk_buff_head *queue,
1069 			int getfrag(void *from, char *to, int offset, int len,
1070 			int odd, struct sk_buff *skb),
1071 			void *from, int length, int hh_len, int fragheaderlen,
1072 			int transhdrlen, int mtu, unsigned int flags,
1073 			const struct flowi6 *fl6)
1074 
1075 {
1076 	struct sk_buff *skb;
1077 	int err;
1078 
1079 	/* The network device supports UDP large send offload, so
1080 	 * create one single skb containing the complete
1081 	 * UDP datagram.
1082 	 */
1083 	skb = skb_peek_tail(queue);
1084 	if (!skb) {
1085 		skb = sock_alloc_send_skb(sk,
1086 			hh_len + fragheaderlen + transhdrlen + 20,
1087 			(flags & MSG_DONTWAIT), &err);
1088 		if (!skb)
1089 			return err;
1090 
1091 		/* reserve space for Hardware header */
1092 		skb_reserve(skb, hh_len);
1093 
1094 		/* create space for UDP/IP header */
1095 		skb_put(skb, fragheaderlen + transhdrlen);
1096 
1097 		/* initialize network header pointer */
1098 		skb_reset_network_header(skb);
1099 
1100 		/* initialize protocol header pointer */
1101 		skb->transport_header = skb->network_header + fragheaderlen;
1102 
1103 		skb->protocol = htons(ETH_P_IPV6);
1104 		skb->csum = 0;
1105 
1106 		__skb_queue_tail(queue, skb);
1107 	} else if (skb_is_gso(skb)) {
1108 		goto append;
1109 	}
1110 
1111 	skb->ip_summed = CHECKSUM_PARTIAL;
1112 	/* Specify the length of each IPv6 datagram fragment.
1113 	 * It has to be a multiple of 8.
1114 	 */
1115 	skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
1116 				     sizeof(struct frag_hdr)) & ~7;
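	/* Worked example with assumed numbers: mtu = 1500 and
	 * fragheaderlen = 40 give gso_size = (1500 - 40 - 8) & ~7 = 1448,
	 * so each fragment emitted by UFO carries an 8-byte-aligned
	 * payload as required for non-final fragments.
	 */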
1117 	skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
1118 	skb_shinfo(skb)->ip6_frag_id = ipv6_select_ident(sock_net(sk),
1119 							 &fl6->daddr,
1120 							 &fl6->saddr);
1121 
1122 append:
1123 	return skb_append_datato_frags(sk, skb, getfrag, from,
1124 				       (length - transhdrlen));
1125 }
1126 
1127 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1128 					       gfp_t gfp)
1129 {
1130 	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1131 }
1132 
1133 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1134 						gfp_t gfp)
1135 {
1136 	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1137 }
1138 
1139 static void ip6_append_data_mtu(unsigned int *mtu,
1140 				int *maxfraglen,
1141 				unsigned int fragheaderlen,
1142 				struct sk_buff *skb,
1143 				struct rt6_info *rt,
1144 				unsigned int orig_mtu)
1145 {
1146 	if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1147 		if (!skb) {
1148 			/* first fragment, reserve header_len */
1149 			*mtu = orig_mtu - rt->dst.header_len;
1150 
1151 		} else {
1152 			/*
1153 			 * this fragment is not the first; the header
1154 			 * space is regarded as data space.
1155 			 */
1156 			*mtu = orig_mtu;
1157 		}
1158 		*maxfraglen = ((*mtu - fragheaderlen) & ~7)
1159 			      + fragheaderlen - sizeof(struct frag_hdr);
1160 	}
1161 }
1162 
1163 static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
1164 			  struct inet6_cork *v6_cork,
1165 			  int hlimit, int tclass, struct ipv6_txoptions *opt,
1166 			  struct rt6_info *rt, struct flowi6 *fl6)
1167 {
1168 	struct ipv6_pinfo *np = inet6_sk(sk);
1169 	unsigned int mtu;
1170 
1171 	/*
1172 	 * setup for corking
1173 	 */
1174 	if (opt) {
1175 		if (WARN_ON(v6_cork->opt))
1176 			return -EINVAL;
1177 
1178 		v6_cork->opt = kzalloc(opt->tot_len, sk->sk_allocation);
1179 		if (unlikely(!v6_cork->opt))
1180 			return -ENOBUFS;
1181 
1182 		v6_cork->opt->tot_len = opt->tot_len;
1183 		v6_cork->opt->opt_flen = opt->opt_flen;
1184 		v6_cork->opt->opt_nflen = opt->opt_nflen;
1185 
1186 		v6_cork->opt->dst0opt = ip6_opt_dup(opt->dst0opt,
1187 						    sk->sk_allocation);
1188 		if (opt->dst0opt && !v6_cork->opt->dst0opt)
1189 			return -ENOBUFS;
1190 
1191 		v6_cork->opt->dst1opt = ip6_opt_dup(opt->dst1opt,
1192 						    sk->sk_allocation);
1193 		if (opt->dst1opt && !v6_cork->opt->dst1opt)
1194 			return -ENOBUFS;
1195 
1196 		v6_cork->opt->hopopt = ip6_opt_dup(opt->hopopt,
1197 						   sk->sk_allocation);
1198 		if (opt->hopopt && !v6_cork->opt->hopopt)
1199 			return -ENOBUFS;
1200 
1201 		v6_cork->opt->srcrt = ip6_rthdr_dup(opt->srcrt,
1202 						    sk->sk_allocation);
1203 		if (opt->srcrt && !v6_cork->opt->srcrt)
1204 			return -ENOBUFS;
1205 
1206 		/* need source address above  --miyazawa */
1207 	}
1208 	dst_hold(&rt->dst);
1209 	cork->base.dst = &rt->dst;
1210 	cork->fl.u.ip6 = *fl6;
1211 	v6_cork->hop_limit = hlimit;
1212 	v6_cork->tclass = tclass;
1213 	if (rt->dst.flags & DST_XFRM_TUNNEL)
1214 		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1215 		      rt->dst.dev->mtu : dst_mtu(&rt->dst);
1216 	else
1217 		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1218 		      rt->dst.dev->mtu : dst_mtu(rt->dst.path);
1219 	if (np->frag_size < mtu) {
1220 		if (np->frag_size)
1221 			mtu = np->frag_size;
1222 	}
1223 	cork->base.fragsize = mtu;
1224 	if (dst_allfrag(rt->dst.path))
1225 		cork->base.flags |= IPCORK_ALLFRAG;
1226 	cork->base.length = 0;
1227 
1228 	return 0;
1229 }
1230 
1231 static int __ip6_append_data(struct sock *sk,
1232 			     struct flowi6 *fl6,
1233 			     struct sk_buff_head *queue,
1234 			     struct inet_cork *cork,
1235 			     struct inet6_cork *v6_cork,
1236 			     struct page_frag *pfrag,
1237 			     int getfrag(void *from, char *to, int offset,
1238 					 int len, int odd, struct sk_buff *skb),
1239 			     void *from, int length, int transhdrlen,
1240 			     unsigned int flags, int dontfrag)
1241 {
1242 	struct sk_buff *skb, *skb_prev = NULL;
1243 	unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu;
1244 	int exthdrlen = 0;
1245 	int dst_exthdrlen = 0;
1246 	int hh_len;
1247 	int copy;
1248 	int err;
1249 	int offset = 0;
1250 	__u8 tx_flags = 0;
1251 	u32 tskey = 0;
1252 	struct rt6_info *rt = (struct rt6_info *)cork->dst;
1253 	struct ipv6_txoptions *opt = v6_cork->opt;
1254 	int csummode = CHECKSUM_NONE;
1255 
1256 	skb = skb_peek_tail(queue);
1257 	if (!skb) {
1258 		exthdrlen = opt ? opt->opt_flen : 0;
1259 		dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
1260 	}
1261 
1262 	mtu = cork->fragsize;
1263 	orig_mtu = mtu;
1264 
1265 	hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1266 
1267 	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1268 			(opt ? opt->opt_nflen : 0);
1269 	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
1270 		     sizeof(struct frag_hdr);
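	/* Worked example with assumed numbers: mtu = 1500 and
	 * fragheaderlen = 40 (no extension headers) give
	 * ((1500 - 40) & ~7) + 40 - 8 = 1488. A full fragment skb then
	 * holds 1488 - 40 = 1448 payload bytes (a multiple of 8),
	 * leaving room for the 8-byte Fragment header added later:
	 * 40 + 8 + 1448 <= 1500.
	 */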
1271 
1272 	if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
1273 		unsigned int maxnonfragsize, headersize;
1274 
1275 		headersize = sizeof(struct ipv6hdr) +
1276 			     (opt ? opt->opt_flen + opt->opt_nflen : 0) +
1277 			     (dst_allfrag(&rt->dst) ?
1278 			      sizeof(struct frag_hdr) : 0) +
1279 			     rt->rt6i_nfheader_len;
1280 
1281 		if (ip6_sk_ignore_df(sk))
1282 			maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
1283 		else
1284 			maxnonfragsize = mtu;
1285 
1286 		/* dontfrag active */
1287 		if ((cork->length + length > mtu - headersize) && dontfrag &&
1288 		    (sk->sk_protocol == IPPROTO_UDP ||
1289 		     sk->sk_protocol == IPPROTO_RAW)) {
1290 			ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
1291 						   sizeof(struct ipv6hdr));
1292 			goto emsgsize;
1293 		}
1294 
1295 		if (cork->length + length > maxnonfragsize - headersize) {
1296 emsgsize:
1297 			ipv6_local_error(sk, EMSGSIZE, fl6,
1298 					 mtu - headersize +
1299 					 sizeof(struct ipv6hdr));
1300 			return -EMSGSIZE;
1301 		}
1302 	}
1303 
1304 	if (sk->sk_type == SOCK_DGRAM || sk->sk_type == SOCK_RAW) {
1305 		sock_tx_timestamp(sk, &tx_flags);
1306 		if (tx_flags & SKBTX_ANY_SW_TSTAMP &&
1307 		    sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
1308 			tskey = sk->sk_tskey++;
1309 	}
1310 
1311 	/* If this is the first and only packet and the device
1312 	 * supports checksum offloading, let's use it.
1313 	 * Use transhdrlen, same as IPv4, because partial
1314 	 * sums only work when transhdrlen is set.
1315 	 */
1316 	if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
1317 	    length + fragheaderlen < mtu &&
1318 	    rt->dst.dev->features & NETIF_F_V6_CSUM &&
1319 	    !exthdrlen)
1320 		csummode = CHECKSUM_PARTIAL;
1321 	/*
1322 	 * Let's try using as much space as possible.
1323 	 * Use MTU if total length of the message fits into the MTU.
1324 	 * Otherwise, we need to reserve fragment header and
1325 	 * fragment alignment (= 8-15 octets, in total).
1326 	 *
1327 	 * Note that we may need to "move" the data from the tail
1328 	 * of the buffer to the new fragment when we split
1329 	 * the message.
1330 	 *
1331 	 * FIXME: It may be fragmented into multiple chunks
1332 	 *        at once if non-fragmentable extension headers
1333 	 *        are too large.
1334 	 * --yoshfuji
1335 	 */
1336 
1337 	cork->length += length;
1338 	if (((length > mtu) ||
1339 	     (skb && skb_is_gso(skb))) &&
1340 	    (sk->sk_protocol == IPPROTO_UDP) &&
1341 	    (rt->dst.dev->features & NETIF_F_UFO) &&
1342 	    (sk->sk_type == SOCK_DGRAM)) {
1343 		err = ip6_ufo_append_data(sk, queue, getfrag, from, length,
1344 					  hh_len, fragheaderlen,
1345 					  transhdrlen, mtu, flags, fl6);
1346 		if (err)
1347 			goto error;
1348 		return 0;
1349 	}
1350 
1351 	if (!skb)
1352 		goto alloc_new_skb;
1353 
1354 	while (length > 0) {
1355 		/* Check if the remaining data fits into current packet. */
1356 		copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1357 		if (copy < length)
1358 			copy = maxfraglen - skb->len;
1359 
1360 		if (copy <= 0) {
1361 			char *data;
1362 			unsigned int datalen;
1363 			unsigned int fraglen;
1364 			unsigned int fraggap;
1365 			unsigned int alloclen;
1366 alloc_new_skb:
1367 			/* There's no room in the current skb */
1368 			if (skb)
1369 				fraggap = skb->len - maxfraglen;
1370 			else
1371 				fraggap = 0;
1372 			/* update mtu and maxfraglen if necessary */
1373 			if (!skb || !skb_prev)
1374 				ip6_append_data_mtu(&mtu, &maxfraglen,
1375 						    fragheaderlen, skb, rt,
1376 						    orig_mtu);
1377 
1378 			skb_prev = skb;
1379 
1380 			/*
1381 			 * If remaining data exceeds the mtu,
1382 			 * we know we need more fragment(s).
1383 			 */
1384 			datalen = length + fraggap;
1385 
1386 			if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1387 				datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
1388 			if ((flags & MSG_MORE) &&
1389 			    !(rt->dst.dev->features&NETIF_F_SG))
1390 				alloclen = mtu;
1391 			else
1392 				alloclen = datalen + fragheaderlen;
1393 
1394 			alloclen += dst_exthdrlen;
1395 
1396 			if (datalen != length + fraggap) {
1397 				/*
1398 				 * this is not the last fragment; the trailer
1399 				 * space is regarded as data space.
1400 				 */
1401 				datalen += rt->dst.trailer_len;
1402 			}
1403 
1404 			alloclen += rt->dst.trailer_len;
1405 			fraglen = datalen + fragheaderlen;
1406 
1407 			/*
1408 			 * We just reserve space for the fragment header.
1409 			 * Note: this may be an overallocation if the message
1410 			 * (without MSG_MORE) fits into the MTU.
1411 			 */
1412 			alloclen += sizeof(struct frag_hdr);
1413 
1414 			if (transhdrlen) {
1415 				skb = sock_alloc_send_skb(sk,
1416 						alloclen + hh_len,
1417 						(flags & MSG_DONTWAIT), &err);
1418 			} else {
1419 				skb = NULL;
1420 				if (atomic_read(&sk->sk_wmem_alloc) <=
1421 				    2 * sk->sk_sndbuf)
1422 					skb = sock_wmalloc(sk,
1423 							   alloclen + hh_len, 1,
1424 							   sk->sk_allocation);
1425 				if (unlikely(!skb))
1426 					err = -ENOBUFS;
1427 			}
1428 			if (!skb)
1429 				goto error;
1430 			/*
1431 			 *	Fill in the control structures
1432 			 */
1433 			skb->protocol = htons(ETH_P_IPV6);
1434 			skb->ip_summed = csummode;
1435 			skb->csum = 0;
1436 			/* reserve for fragmentation and ipsec header */
1437 			skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1438 				    dst_exthdrlen);
1439 
1440 			/* Only the initial fragment is time stamped */
1441 			skb_shinfo(skb)->tx_flags = tx_flags;
1442 			tx_flags = 0;
1443 			skb_shinfo(skb)->tskey = tskey;
1444 			tskey = 0;
1445 
1446 			/*
1447 			 *	Find where to start putting bytes
1448 			 */
1449 			data = skb_put(skb, fraglen);
1450 			skb_set_network_header(skb, exthdrlen);
1451 			data += fragheaderlen;
1452 			skb->transport_header = (skb->network_header +
1453 						 fragheaderlen);
1454 			if (fraggap) {
1455 				skb->csum = skb_copy_and_csum_bits(
1456 					skb_prev, maxfraglen,
1457 					data + transhdrlen, fraggap, 0);
1458 				skb_prev->csum = csum_sub(skb_prev->csum,
1459 							  skb->csum);
1460 				data += fraggap;
1461 				pskb_trim_unique(skb_prev, maxfraglen);
1462 			}
1463 			copy = datalen - transhdrlen - fraggap;
1464 
1465 			if (copy < 0) {
1466 				err = -EINVAL;
1467 				kfree_skb(skb);
1468 				goto error;
1469 			} else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
1470 				err = -EFAULT;
1471 				kfree_skb(skb);
1472 				goto error;
1473 			}
1474 
1475 			offset += copy;
1476 			length -= datalen - fraggap;
1477 			transhdrlen = 0;
1478 			exthdrlen = 0;
1479 			dst_exthdrlen = 0;
1480 
1481 			/*
1482 			 * Put the packet on the pending queue
1483 			 */
1484 			__skb_queue_tail(queue, skb);
1485 			continue;
1486 		}
1487 
1488 		if (copy > length)
1489 			copy = length;
1490 
1491 		if (!(rt->dst.dev->features&NETIF_F_SG)) {
1492 			unsigned int off;
1493 
1494 			off = skb->len;
1495 			if (getfrag(from, skb_put(skb, copy),
1496 						offset, copy, off, skb) < 0) {
1497 				__skb_trim(skb, off);
1498 				err = -EFAULT;
1499 				goto error;
1500 			}
1501 		} else {
1502 			int i = skb_shinfo(skb)->nr_frags;
1503 
1504 			err = -ENOMEM;
1505 			if (!sk_page_frag_refill(sk, pfrag))
1506 				goto error;
1507 
1508 			if (!skb_can_coalesce(skb, i, pfrag->page,
1509 					      pfrag->offset)) {
1510 				err = -EMSGSIZE;
1511 				if (i == MAX_SKB_FRAGS)
1512 					goto error;
1513 
1514 				__skb_fill_page_desc(skb, i, pfrag->page,
1515 						     pfrag->offset, 0);
1516 				skb_shinfo(skb)->nr_frags = ++i;
1517 				get_page(pfrag->page);
1518 			}
1519 			copy = min_t(int, copy, pfrag->size - pfrag->offset);
1520 			if (getfrag(from,
1521 				    page_address(pfrag->page) + pfrag->offset,
1522 				    offset, copy, skb->len, skb) < 0)
1523 				goto error_efault;
1524 
1525 			pfrag->offset += copy;
1526 			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1527 			skb->len += copy;
1528 			skb->data_len += copy;
1529 			skb->truesize += copy;
1530 			atomic_add(copy, &sk->sk_wmem_alloc);
1531 		}
1532 		offset += copy;
1533 		length -= copy;
1534 	}
1535 
1536 	return 0;
1537 
1538 error_efault:
1539 	err = -EFAULT;
1540 error:
1541 	cork->length -= length;
1542 	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1543 	return err;
1544 }
1545 
1546 int ip6_append_data(struct sock *sk,
1547 		    int getfrag(void *from, char *to, int offset, int len,
1548 				int odd, struct sk_buff *skb),
1549 		    void *from, int length, int transhdrlen, int hlimit,
1550 		    int tclass, struct ipv6_txoptions *opt, struct flowi6 *fl6,
1551 		    struct rt6_info *rt, unsigned int flags, int dontfrag)
1552 {
1553 	struct inet_sock *inet = inet_sk(sk);
1554 	struct ipv6_pinfo *np = inet6_sk(sk);
1555 	int exthdrlen;
1556 	int err;
1557 
1558 	if (flags&MSG_PROBE)
1559 		return 0;
1560 	if (skb_queue_empty(&sk->sk_write_queue)) {
1561 		/*
1562 		 * setup for corking
1563 		 */
1564 		err = ip6_setup_cork(sk, &inet->cork, &np->cork, hlimit,
1565 				     tclass, opt, rt, fl6);
1566 		if (err)
1567 			return err;
1568 
1569 		exthdrlen = (opt ? opt->opt_flen : 0);
1570 		length += exthdrlen;
1571 		transhdrlen += exthdrlen;
1572 	} else {
1573 		fl6 = &inet->cork.fl.u.ip6;
1574 		transhdrlen = 0;
1575 	}
1576 
1577 	return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base,
1578 				 &np->cork, sk_page_frag(sk), getfrag,
1579 				 from, length, transhdrlen, flags, dontfrag);
1580 }
1581 EXPORT_SYMBOL_GPL(ip6_append_data);
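
/* Illustrative datagram-path sketch (assumed, not part of this file):
 * udpv6_sendmsg()-style callers append the payload and then push or
 * flush the queued data:
 *
 *	err = ip6_append_data(sk, getfrag, msg, len,
 *			      sizeof(struct udphdr), hlimit, tclass,
 *			      opt, &fl6, rt, msg->msg_flags, dontfrag);
 *	if (err)
 *		ip6_flush_pending_frames(sk);
 *	else if (!(msg->msg_flags & MSG_MORE))
 *		err = ip6_push_pending_frames(sk);
 */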
1582 
1583 static void ip6_cork_release(struct inet_cork_full *cork,
1584 			     struct inet6_cork *v6_cork)
1585 {
1586 	if (v6_cork->opt) {
1587 		kfree(v6_cork->opt->dst0opt);
1588 		kfree(v6_cork->opt->dst1opt);
1589 		kfree(v6_cork->opt->hopopt);
1590 		kfree(v6_cork->opt->srcrt);
1591 		kfree(v6_cork->opt);
1592 		v6_cork->opt = NULL;
1593 	}
1594 
1595 	if (cork->base.dst) {
1596 		dst_release(cork->base.dst);
1597 		cork->base.dst = NULL;
1598 		cork->base.flags &= ~IPCORK_ALLFRAG;
1599 	}
1600 	memset(&cork->fl, 0, sizeof(cork->fl));
1601 }
1602 
1603 struct sk_buff *__ip6_make_skb(struct sock *sk,
1604 			       struct sk_buff_head *queue,
1605 			       struct inet_cork_full *cork,
1606 			       struct inet6_cork *v6_cork)
1607 {
1608 	struct sk_buff *skb, *tmp_skb;
1609 	struct sk_buff **tail_skb;
1610 	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1611 	struct ipv6_pinfo *np = inet6_sk(sk);
1612 	struct net *net = sock_net(sk);
1613 	struct ipv6hdr *hdr;
1614 	struct ipv6_txoptions *opt = v6_cork->opt;
1615 	struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
1616 	struct flowi6 *fl6 = &cork->fl.u.ip6;
1617 	unsigned char proto = fl6->flowi6_proto;
1618 
1619 	skb = __skb_dequeue(queue);
1620 	if (!skb)
1621 		goto out;
1622 	tail_skb = &(skb_shinfo(skb)->frag_list);
1623 
1624 	/* move skb->data from the ext header to the IP header */
1625 	if (skb->data < skb_network_header(skb))
1626 		__skb_pull(skb, skb_network_offset(skb));
1627 	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
1628 		__skb_pull(tmp_skb, skb_network_header_len(skb));
1629 		*tail_skb = tmp_skb;
1630 		tail_skb = &(tmp_skb->next);
1631 		skb->len += tmp_skb->len;
1632 		skb->data_len += tmp_skb->len;
1633 		skb->truesize += tmp_skb->truesize;
1634 		tmp_skb->destructor = NULL;
1635 		tmp_skb->sk = NULL;
1636 	}
1637 
1638 	/* Allow local fragmentation. */
1639 	skb->ignore_df = ip6_sk_ignore_df(sk);
1640 
1641 	*final_dst = fl6->daddr;
1642 	__skb_pull(skb, skb_network_header_len(skb));
1643 	if (opt && opt->opt_flen)
1644 		ipv6_push_frag_opts(skb, opt, &proto);
1645 	if (opt && opt->opt_nflen)
1646 		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);
1647 
1648 	skb_push(skb, sizeof(struct ipv6hdr));
1649 	skb_reset_network_header(skb);
1650 	hdr = ipv6_hdr(skb);
1651 
1652 	ip6_flow_hdr(hdr, v6_cork->tclass,
1653 		     ip6_make_flowlabel(net, skb, fl6->flowlabel,
1654 					np->autoflowlabel, fl6));
1655 	hdr->hop_limit = v6_cork->hop_limit;
1656 	hdr->nexthdr = proto;
1657 	hdr->saddr = fl6->saddr;
1658 	hdr->daddr = *final_dst;
1659 
1660 	skb->priority = sk->sk_priority;
1661 	skb->mark = sk->sk_mark;
1662 
1663 	skb_dst_set(skb, dst_clone(&rt->dst));
1664 	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1665 	if (proto == IPPROTO_ICMPV6) {
1666 		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1667 
1668 		ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
1669 		ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
1670 	}
1671 
1672 	ip6_cork_release(cork, v6_cork);
1673 out:
1674 	return skb;
1675 }
1676 
1677 int ip6_send_skb(struct sk_buff *skb)
1678 {
1679 	struct net *net = sock_net(skb->sk);
1680 	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
1681 	int err;
1682 
1683 	err = ip6_local_out(skb);
1684 	if (err) {
1685 		if (err > 0)
1686 			err = net_xmit_errno(err);
1687 		if (err)
1688 			IP6_INC_STATS(net, rt->rt6i_idev,
1689 				      IPSTATS_MIB_OUTDISCARDS);
1690 	}
1691 
1692 	return err;
1693 }
1694 
1695 int ip6_push_pending_frames(struct sock *sk)
1696 {
1697 	struct sk_buff *skb;
1698 
1699 	skb = ip6_finish_skb(sk);
1700 	if (!skb)
1701 		return 0;
1702 
1703 	return ip6_send_skb(skb);
1704 }
1705 EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
1706 
1707 static void __ip6_flush_pending_frames(struct sock *sk,
1708 				       struct sk_buff_head *queue,
1709 				       struct inet_cork_full *cork,
1710 				       struct inet6_cork *v6_cork)
1711 {
1712 	struct sk_buff *skb;
1713 
1714 	while ((skb = __skb_dequeue_tail(queue)) != NULL) {
1715 		if (skb_dst(skb))
1716 			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1717 				      IPSTATS_MIB_OUTDISCARDS);
1718 		kfree_skb(skb);
1719 	}
1720 
1721 	ip6_cork_release(cork, v6_cork);
1722 }
1723 
1724 void ip6_flush_pending_frames(struct sock *sk)
1725 {
1726 	__ip6_flush_pending_frames(sk, &sk->sk_write_queue,
1727 				   &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
1728 }
1729 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
1730 
1731 struct sk_buff *ip6_make_skb(struct sock *sk,
1732 			     int getfrag(void *from, char *to, int offset,
1733 					 int len, int odd, struct sk_buff *skb),
1734 			     void *from, int length, int transhdrlen,
1735 			     int hlimit, int tclass,
1736 			     struct ipv6_txoptions *opt, struct flowi6 *fl6,
1737 			     struct rt6_info *rt, unsigned int flags,
1738 			     int dontfrag)
1739 {
1740 	struct inet_cork_full cork;
1741 	struct inet6_cork v6_cork;
1742 	struct sk_buff_head queue;
1743 	int exthdrlen = (opt ? opt->opt_flen : 0);
1744 	int err;
1745 
1746 	if (flags & MSG_PROBE)
1747 		return NULL;
1748 
1749 	__skb_queue_head_init(&queue);
1750 
1751 	cork.base.flags = 0;
1752 	cork.base.addr = 0;
1753 	cork.base.opt = NULL;
1754 	v6_cork.opt = NULL;
1755 	err = ip6_setup_cork(sk, &cork, &v6_cork, hlimit, tclass, opt, rt, fl6);
1756 	if (err)
1757 		return ERR_PTR(err);
1758 
1759 	if (dontfrag < 0)
1760 		dontfrag = inet6_sk(sk)->dontfrag;
1761 
1762 	err = __ip6_append_data(sk, fl6, &queue, &cork.base, &v6_cork,
1763 				&current->task_frag, getfrag, from,
1764 				length + exthdrlen, transhdrlen + exthdrlen,
1765 				flags, dontfrag);
1766 	if (err) {
1767 		__ip6_flush_pending_frames(sk, &queue, &cork, &v6_cork);
1768 		return ERR_PTR(err);
1769 	}
1770 
1771 	return __ip6_make_skb(sk, &queue, &cork, &v6_cork);
1772 }
1773