xref: /openbmc/linux/net/ipv6/ip6_output.c (revision 52fb57e7)
1 /*
2  *	IPv6 output functions
3  *	Linux INET6 implementation
4  *
5  *	Authors:
6  *	Pedro Roque		<roque@di.fc.ul.pt>
7  *
8  *	Based on linux/net/ipv4/ip_output.c
9  *
10  *	This program is free software; you can redistribute it and/or
11  *      modify it under the terms of the GNU General Public License
12  *      as published by the Free Software Foundation; either version
13  *      2 of the License, or (at your option) any later version.
14  *
15  *	Changes:
16  *	A.N.Kuznetsov	:	arithmetic in fragmentation.
17  *				extension headers are implemented.
18  *				route changes now work.
19  *				ip6_forward does not confuse sniffers.
20  *				etc.
21  *
22  *      H. von Brand    :       Added missing #include <linux/string.h>
23  *	Imran Patel	:	frag id should be in NBO
24  *      Kazunori MIYAZAWA @USAGI
25  *			:       add ip6_append_data and related functions
26  *				for datagram xmit
27  */
28 
29 #include <linux/errno.h>
30 #include <linux/kernel.h>
31 #include <linux/string.h>
32 #include <linux/socket.h>
33 #include <linux/net.h>
34 #include <linux/netdevice.h>
35 #include <linux/if_arp.h>
36 #include <linux/in6.h>
37 #include <linux/tcp.h>
38 #include <linux/route.h>
39 #include <linux/module.h>
40 #include <linux/slab.h>
41 
42 #include <linux/netfilter.h>
43 #include <linux/netfilter_ipv6.h>
44 
45 #include <net/sock.h>
46 #include <net/snmp.h>
47 
48 #include <net/ipv6.h>
49 #include <net/ndisc.h>
50 #include <net/protocol.h>
51 #include <net/ip6_route.h>
52 #include <net/addrconf.h>
53 #include <net/rawv6.h>
54 #include <net/icmp.h>
55 #include <net/xfrm.h>
56 #include <net/checksum.h>
57 #include <linux/mroute6.h>
58 
59 static int ip6_finish_output2(struct sock *sk, struct sk_buff *skb)
60 {
61 	struct dst_entry *dst = skb_dst(skb);
62 	struct net_device *dev = dst->dev;
63 	struct neighbour *neigh;
64 	struct in6_addr *nexthop;
65 	int ret;
66 
67 	skb->protocol = htons(ETH_P_IPV6);
68 	skb->dev = dev;
69 
70 	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
71 		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
72 
73 		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
74 		    ((mroute6_socket(dev_net(dev), skb) &&
75 		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
76 		     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
77 					 &ipv6_hdr(skb)->saddr))) {
78 			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
79 
80 			/* Do not check for IFF_ALLMULTI; multicast routing
81 			   is not supported in any case.
82 			 */
83 			if (newskb)
84 				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
85 					sk, newskb, NULL, newskb->dev,
86 					dev_loopback_xmit);
87 
88 			if (ipv6_hdr(skb)->hop_limit == 0) {
89 				IP6_INC_STATS(dev_net(dev), idev,
90 					      IPSTATS_MIB_OUTDISCARDS);
91 				kfree_skb(skb);
92 				return 0;
93 			}
94 		}
95 
96 		IP6_UPD_PO_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCAST,
97 				skb->len);
98 
99 		if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
100 		    IPV6_ADDR_SCOPE_NODELOCAL &&
101 		    !(dev->flags & IFF_LOOPBACK)) {
102 			kfree_skb(skb);
103 			return 0;
104 		}
105 	}
106 
107 	rcu_read_lock_bh();
108 	nexthop = rt6_nexthop((struct rt6_info *)dst);
109 	neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
110 	if (unlikely(!neigh))
111 		neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
112 	if (!IS_ERR(neigh)) {
113 		ret = dst_neigh_output(dst, neigh, skb);
114 		rcu_read_unlock_bh();
115 		return ret;
116 	}
117 	rcu_read_unlock_bh();
118 
119 	IP6_INC_STATS(dev_net(dst->dev),
120 		      ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
121 	kfree_skb(skb);
122 	return -EINVAL;
123 }
124 
125 static int ip6_finish_output(struct sock *sk, struct sk_buff *skb)
126 {
127 	if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
128 	    dst_allfrag(skb_dst(skb)) ||
129 	    (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
130 		return ip6_fragment(sk, skb, ip6_finish_output2);
131 	else
132 		return ip6_finish_output2(sk, skb);
133 }
134 
135 int ip6_output(struct sock *sk, struct sk_buff *skb)
136 {
137 	struct net_device *dev = skb_dst(skb)->dev;
138 	struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
139 	if (unlikely(idev->cnf.disable_ipv6)) {
140 		IP6_INC_STATS(dev_net(dev), idev,
141 			      IPSTATS_MIB_OUTDISCARDS);
142 		kfree_skb(skb);
143 		return 0;
144 	}
145 
146 	return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, sk, skb,
147 			    NULL, dev,
148 			    ip6_finish_output,
149 			    !(IP6CB(skb)->flags & IP6SKB_REROUTED));
150 }
151 
152 /*
153  *	xmit an sk_buff (used by TCP, SCTP and DCCP)
154  */
155 
156 int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
157 	     struct ipv6_txoptions *opt, int tclass)
158 {
159 	struct net *net = sock_net(sk);
160 	struct ipv6_pinfo *np = inet6_sk(sk);
161 	struct in6_addr *first_hop = &fl6->daddr;
162 	struct dst_entry *dst = skb_dst(skb);
163 	struct ipv6hdr *hdr;
164 	u8  proto = fl6->flowi6_proto;
165 	int seg_len = skb->len;
166 	int hlimit = -1;
167 	u32 mtu;
168 
169 	if (opt) {
170 		unsigned int head_room;
171 
172 		/* First: extension headers may take lots of space (~8K for
173 		   now); MAX_HEADER is not enough.
174 		 */
175 		head_room = opt->opt_nflen + opt->opt_flen;
176 		seg_len += head_room;
177 		head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
178 
179 		if (skb_headroom(skb) < head_room) {
180 			struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
181 			if (!skb2) {
182 				IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
183 					      IPSTATS_MIB_OUTDISCARDS);
184 				kfree_skb(skb);
185 				return -ENOBUFS;
186 			}
187 			consume_skb(skb);
188 			skb = skb2;
189 			skb_set_owner_w(skb, sk);
190 		}
191 		if (opt->opt_flen)
192 			ipv6_push_frag_opts(skb, opt, &proto);
193 		if (opt->opt_nflen)
194 			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
195 	}
196 
197 	skb_push(skb, sizeof(struct ipv6hdr));
198 	skb_reset_network_header(skb);
199 	hdr = ipv6_hdr(skb);
200 
201 	/*
202 	 *	Fill in the IPv6 header
203 	 */
204 	if (np)
205 		hlimit = np->hop_limit;
206 	if (hlimit < 0)
207 		hlimit = ip6_dst_hoplimit(dst);
208 
209 	ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
210 						     np->autoflowlabel));
211 
212 	hdr->payload_len = htons(seg_len);
213 	hdr->nexthdr = proto;
214 	hdr->hop_limit = hlimit;
215 
216 	hdr->saddr = fl6->saddr;
217 	hdr->daddr = *first_hop;
218 
219 	skb->protocol = htons(ETH_P_IPV6);
220 	skb->priority = sk->sk_priority;
221 	skb->mark = sk->sk_mark;
222 
223 	mtu = dst_mtu(dst);
224 	if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
225 		IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
226 			      IPSTATS_MIB_OUT, skb->len);
227 		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, sk, skb,
228 			       NULL, dst->dev, dst_output_sk);
229 	}
230 
231 	skb->dev = dst->dev;
232 	ipv6_local_error(sk, EMSGSIZE, fl6, mtu);
233 	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
234 	kfree_skb(skb);
235 	return -EMSGSIZE;
236 }
237 EXPORT_SYMBOL(ip6_xmit);
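
/*
 * A minimal usage sketch, not taken from this file (the example_ name and
 * the flow setup are hypothetical): how a connected transport such as TCP
 * might hand a fully built segment to ip6_xmit().  The dst is assumed to
 * have been resolved earlier, e.g. with ip6_dst_lookup_flow().
 */
static int example_ip6_xmit(struct sock *sk, struct sk_buff *skb,
			    struct dst_entry *dst)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct flowi6 fl6;

	memset(&fl6, 0, sizeof(fl6));
	fl6.flowi6_proto = sk->sk_protocol;
	fl6.daddr = sk->sk_v6_daddr;		/* connected-socket state */
	fl6.saddr = np->saddr;

	skb_dst_set(skb, dst_clone(dst));	/* ip6_xmit() reads skb_dst() */

	/* np->opt/np->tclass carry per-socket options; NULL and 0 also work */
	return ip6_xmit(sk, skb, &fl6, np->opt, np->tclass);
}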
238 
239 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
240 {
241 	struct ip6_ra_chain *ra;
242 	struct sock *last = NULL;
243 
244 	read_lock(&ip6_ra_lock);
245 	for (ra = ip6_ra_chain; ra; ra = ra->next) {
246 		struct sock *sk = ra->sk;
247 		if (sk && ra->sel == sel &&
248 		    (!sk->sk_bound_dev_if ||
249 		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
250 			if (last) {
251 				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
252 				if (skb2)
253 					rawv6_rcv(last, skb2);
254 			}
255 			last = sk;
256 		}
257 	}
258 
259 	if (last) {
260 		rawv6_rcv(last, skb);
261 		read_unlock(&ip6_ra_lock);
262 		return 1;
263 	}
264 	read_unlock(&ip6_ra_lock);
265 	return 0;
266 }
267 
268 static int ip6_forward_proxy_check(struct sk_buff *skb)
269 {
270 	struct ipv6hdr *hdr = ipv6_hdr(skb);
271 	u8 nexthdr = hdr->nexthdr;
272 	__be16 frag_off;
273 	int offset;
274 
275 	if (ipv6_ext_hdr(nexthdr)) {
276 		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
277 		if (offset < 0)
278 			return 0;
279 	} else
280 		offset = sizeof(struct ipv6hdr);
281 
282 	if (nexthdr == IPPROTO_ICMPV6) {
283 		struct icmp6hdr *icmp6;
284 
285 		if (!pskb_may_pull(skb, (skb_network_header(skb) +
286 					 offset + 1 - skb->data)))
287 			return 0;
288 
289 		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
290 
291 		switch (icmp6->icmp6_type) {
292 		case NDISC_ROUTER_SOLICITATION:
293 		case NDISC_ROUTER_ADVERTISEMENT:
294 		case NDISC_NEIGHBOUR_SOLICITATION:
295 		case NDISC_NEIGHBOUR_ADVERTISEMENT:
296 		case NDISC_REDIRECT:
297 			/* Unicast neighbor discovery messages destined to the
298 			 * proxied address must be passed to the input
299 			 * function.
300 			 */
301 			return 1;
302 		default:
303 			break;
304 		}
305 	}
306 
307 	/*
308 	 * The proxying router can't forward traffic sent to a link-local
309 	 * address, so signal the sender and discard the packet. This
310 	 * behavior is clarified by the MIPv6 specification.
311 	 */
312 	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
313 		dst_link_failure(skb);
314 		return -1;
315 	}
316 
317 	return 0;
318 }
319 
320 static inline int ip6_forward_finish(struct sock *sk, struct sk_buff *skb)
321 {
322 	skb_sender_cpu_clear(skb);
323 	return dst_output_sk(sk, skb);
324 }
325 
326 static unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst)
327 {
328 	unsigned int mtu;
329 	struct inet6_dev *idev;
330 
331 	if (dst_metric_locked(dst, RTAX_MTU)) {
332 		mtu = dst_metric_raw(dst, RTAX_MTU);
333 		if (mtu)
334 			return mtu;
335 	}
336 
337 	mtu = IPV6_MIN_MTU;
338 	rcu_read_lock();
339 	idev = __in6_dev_get(dst->dev);
340 	if (idev)
341 		mtu = idev->cnf.mtu6;
342 	rcu_read_unlock();
343 
344 	return mtu;
345 }
346 
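/*
 * Decide whether a forwarded packet exceeds the path MTU.  A conntrack-
 * defragmented skb is judged by its largest original fragment
 * (frag_max_size), even though defrag also sets ignore_df; otherwise
 * ignore_df exempts the skb, and a GSO skb passes when the segments it
 * will be split into fit the MTU.
 */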
347 static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
348 {
349 	if (skb->len <= mtu)
350 		return false;
351 
352 	/* ipv6 conntrack defrag sets max_frag_size + ignore_df */
353 	if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
354 		return true;
355 
356 	if (skb->ignore_df)
357 		return false;
358 
359 	if (skb_is_gso(skb) && skb_gso_network_seglen(skb) <= mtu)
360 		return false;
361 
362 	return true;
363 }
364 
365 int ip6_forward(struct sk_buff *skb)
366 {
367 	struct dst_entry *dst = skb_dst(skb);
368 	struct ipv6hdr *hdr = ipv6_hdr(skb);
369 	struct inet6_skb_parm *opt = IP6CB(skb);
370 	struct net *net = dev_net(dst->dev);
371 	u32 mtu;
372 
373 	if (net->ipv6.devconf_all->forwarding == 0)
374 		goto error;
375 
376 	if (skb->pkt_type != PACKET_HOST)
377 		goto drop;
378 
379 	if (skb_warn_if_lro(skb))
380 		goto drop;
381 
382 	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
383 		IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
384 				 IPSTATS_MIB_INDISCARDS);
385 		goto drop;
386 	}
387 
388 	skb_forward_csum(skb);
389 
390 	/*
391 	 *	We DO NOT do any processing on Router Alert
392 	 *	packets; we push them to user level AS IS,
393 	 *	without any warranty that the application will
394 	 *	be able to interpret them. The reason is that
395 	 *	we cannot do anything clever here.
396 	 *
397 	 *	We are not an end node, so if the packet
398 	 *	contains AH/ESP we cannot do anything with it.
399 	 *	Defragmentation would also be a mistake: RA packets
400 	 *	cannot be fragmented, because there is no guarantee
401 	 *	that different fragments will travel along one path. --ANK
402 	 */
403 	if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
404 		if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
405 			return 0;
406 	}
407 
408 	/*
409 	 *	check and decrement the hop limit
410 	 */
411 	if (hdr->hop_limit <= 1) {
412 		/* Force the OUTPUT device to be used for the source address */
413 		skb->dev = dst->dev;
414 		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
415 		IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
416 				 IPSTATS_MIB_INHDRERRORS);
417 
418 		kfree_skb(skb);
419 		return -ETIMEDOUT;
420 	}
421 
422 	/* XXX: idev->cnf.proxy_ndp? */
423 	if (net->ipv6.devconf_all->proxy_ndp &&
424 	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
425 		int proxied = ip6_forward_proxy_check(skb);
426 		if (proxied > 0)
427 			return ip6_input(skb);
428 		else if (proxied < 0) {
429 			IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
430 					 IPSTATS_MIB_INDISCARDS);
431 			goto drop;
432 		}
433 	}
434 
435 	if (!xfrm6_route_forward(skb)) {
436 		IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
437 				 IPSTATS_MIB_INDISCARDS);
438 		goto drop;
439 	}
440 	dst = skb_dst(skb);
441 
442 	/* The IPv6 specs say nothing about it, but it is clear that we cannot
443 	   send redirects to source-routed frames.
444 	   We also don't send redirects to frames decapsulated from IPsec.
445 	 */
446 	if (skb->dev == dst->dev && opt->srcrt == 0 && !skb_sec_path(skb)) {
447 		struct in6_addr *target = NULL;
448 		struct inet_peer *peer;
449 		struct rt6_info *rt;
450 
451 		/*
452 		 *	incoming and outgoing devices are the same;
453 		 *	send a redirect.
454 		 */
455 
456 		rt = (struct rt6_info *) dst;
457 		if (rt->rt6i_flags & RTF_GATEWAY)
458 			target = &rt->rt6i_gateway;
459 		else
460 			target = &hdr->daddr;
461 
462 		peer = inet_getpeer_v6(net->ipv6.peers, &rt->rt6i_dst.addr, 1);
463 
464 		/* Limit redirects both by destination (here)
465 		   and by source (inside ndisc_send_redirect)
466 		 */
467 		if (inet_peer_xrlim_allow(peer, 1*HZ))
468 			ndisc_send_redirect(skb, target);
469 		if (peer)
470 			inet_putpeer(peer);
471 	} else {
472 		int addrtype = ipv6_addr_type(&hdr->saddr);
473 
474 		/* This check is security critical. */
475 		if (addrtype == IPV6_ADDR_ANY ||
476 		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
477 			goto error;
478 		if (addrtype & IPV6_ADDR_LINKLOCAL) {
479 			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
480 				    ICMPV6_NOT_NEIGHBOUR, 0);
481 			goto error;
482 		}
483 	}
484 
485 	mtu = ip6_dst_mtu_forward(dst);
486 	if (mtu < IPV6_MIN_MTU)
487 		mtu = IPV6_MIN_MTU;
488 
489 	if (ip6_pkt_too_big(skb, mtu)) {
490 		/* Again, force the OUTPUT device to be used for the source address */
491 		skb->dev = dst->dev;
492 		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
493 		IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
494 				 IPSTATS_MIB_INTOOBIGERRORS);
495 		IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
496 				 IPSTATS_MIB_FRAGFAILS);
497 		kfree_skb(skb);
498 		return -EMSGSIZE;
499 	}
500 
501 	if (skb_cow(skb, dst->dev->hard_header_len)) {
502 		IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
503 				 IPSTATS_MIB_OUTDISCARDS);
504 		goto drop;
505 	}
506 
507 	hdr = ipv6_hdr(skb);
508 
509 	/* Decrementing the hop limit is delayed until after the skb COW */
510 
511 	hdr->hop_limit--;
512 
513 	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
514 	IP6_ADD_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
515 	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, NULL, skb,
516 		       skb->dev, dst->dev,
517 		       ip6_forward_finish);
518 
519 error:
520 	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
521 drop:
522 	kfree_skb(skb);
523 	return -EINVAL;
524 }
525 
526 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
527 {
528 	to->pkt_type = from->pkt_type;
529 	to->priority = from->priority;
530 	to->protocol = from->protocol;
531 	skb_dst_drop(to);
532 	skb_dst_set(to, dst_clone(skb_dst(from)));
533 	to->dev = from->dev;
534 	to->mark = from->mark;
535 
536 #ifdef CONFIG_NET_SCHED
537 	to->tc_index = from->tc_index;
538 #endif
539 	nf_copy(to, from);
540 	skb_copy_secmark(to, from);
541 }
542 
543 int ip6_fragment(struct sock *sk, struct sk_buff *skb,
544 		 int (*output)(struct sock *, struct sk_buff *))
545 {
546 	struct sk_buff *frag;
547 	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
548 	struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
549 				inet6_sk(skb->sk) : NULL;
550 	struct ipv6hdr *tmp_hdr;
551 	struct frag_hdr *fh;
552 	unsigned int mtu, hlen, left, len;
553 	int hroom, troom;
554 	__be32 frag_id = 0;
555 	int ptr, offset = 0, err = 0;
556 	u8 *prevhdr, nexthdr = 0;
557 	struct net *net = dev_net(skb_dst(skb)->dev);
558 
559 	hlen = ip6_find_1stfragopt(skb, &prevhdr);
560 	nexthdr = *prevhdr;
561 
562 	mtu = ip6_skb_dst_mtu(skb);
563 
564 	/* We must not fragment if the socket is set to force MTU discovery
565 	 * or if the skb is not generated by a local socket.
566 	 */
567 	if (unlikely(!skb->ignore_df && skb->len > mtu) ||
568 		     (IP6CB(skb)->frag_max_size &&
569 		      IP6CB(skb)->frag_max_size > mtu)) {
570 		if (skb->sk && dst_allfrag(skb_dst(skb)))
571 			sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);
572 
573 		skb->dev = skb_dst(skb)->dev;
574 		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
575 		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
576 			      IPSTATS_MIB_FRAGFAILS);
577 		kfree_skb(skb);
578 		return -EMSGSIZE;
579 	}
580 
581 	if (np && np->frag_size < mtu) {
582 		if (np->frag_size)
583 			mtu = np->frag_size;
584 	}
585 	mtu -= hlen + sizeof(struct frag_hdr);
586 
587 	if (skb_has_frag_list(skb)) {
588 		int first_len = skb_pagelen(skb);
589 		struct sk_buff *frag2;
590 
591 		if (first_len - hlen > mtu ||
592 		    ((first_len - hlen) & 7) ||
593 		    skb_cloned(skb))
594 			goto slow_path;
595 
596 		skb_walk_frags(skb, frag) {
597 			/* Correct geometry. */
598 			if (frag->len > mtu ||
599 			    ((frag->len & 7) && frag->next) ||
600 			    skb_headroom(frag) < hlen)
601 				goto slow_path_clean;
602 
603 			/* Partially cloned skb? */
604 			if (skb_shared(frag))
605 				goto slow_path_clean;
606 
607 			BUG_ON(frag->sk);
608 			if (skb->sk) {
609 				frag->sk = skb->sk;
610 				frag->destructor = sock_wfree;
611 			}
612 			skb->truesize -= frag->truesize;
613 		}
614 
615 		err = 0;
616 		offset = 0;
617 		frag = skb_shinfo(skb)->frag_list;
618 		skb_frag_list_init(skb);
619 		/* BUILD HEADER */
620 
621 		*prevhdr = NEXTHDR_FRAGMENT;
622 		tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
623 		if (!tmp_hdr) {
624 			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
625 				      IPSTATS_MIB_FRAGFAILS);
626 			return -ENOMEM;
627 		}
628 
629 		__skb_pull(skb, hlen);
630 		fh = (struct frag_hdr *)__skb_push(skb, sizeof(struct frag_hdr));
631 		__skb_push(skb, hlen);
632 		skb_reset_network_header(skb);
633 		memcpy(skb_network_header(skb), tmp_hdr, hlen);
634 
635 		ipv6_select_ident(net, fh, rt);
636 		fh->nexthdr = nexthdr;
637 		fh->reserved = 0;
638 		fh->frag_off = htons(IP6_MF);
639 		frag_id = fh->identification;
640 
641 		first_len = skb_pagelen(skb);
642 		skb->data_len = first_len - skb_headlen(skb);
643 		skb->len = first_len;
644 		ipv6_hdr(skb)->payload_len = htons(first_len -
645 						   sizeof(struct ipv6hdr));
646 
647 		dst_hold(&rt->dst);
648 
649 		for (;;) {
650 			/* Prepare the header of the next fragment
651 			 * before the previous one goes down. */
652 			if (frag) {
653 				frag->ip_summed = CHECKSUM_NONE;
654 				skb_reset_transport_header(frag);
655 				fh = (struct frag_hdr *)__skb_push(frag, sizeof(struct frag_hdr));
656 				__skb_push(frag, hlen);
657 				skb_reset_network_header(frag);
658 				memcpy(skb_network_header(frag), tmp_hdr,
659 				       hlen);
660 				offset += skb->len - hlen - sizeof(struct frag_hdr);
661 				fh->nexthdr = nexthdr;
662 				fh->reserved = 0;
663 				fh->frag_off = htons(offset);
664 				if (frag->next)
665 					fh->frag_off |= htons(IP6_MF);
666 				fh->identification = frag_id;
667 				ipv6_hdr(frag)->payload_len =
668 						htons(frag->len -
669 						      sizeof(struct ipv6hdr));
670 				ip6_copy_metadata(frag, skb);
671 			}
672 
673 			err = output(sk, skb);
674 			if (!err)
675 				IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
676 					      IPSTATS_MIB_FRAGCREATES);
677 
678 			if (err || !frag)
679 				break;
680 
681 			skb = frag;
682 			frag = skb->next;
683 			skb->next = NULL;
684 		}
685 
686 		kfree(tmp_hdr);
687 
688 		if (err == 0) {
689 			IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
690 				      IPSTATS_MIB_FRAGOKS);
691 			ip6_rt_put(rt);
692 			return 0;
693 		}
694 
695 		kfree_skb_list(frag);
696 
697 		IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
698 			      IPSTATS_MIB_FRAGFAILS);
699 		ip6_rt_put(rt);
700 		return err;
701 
702 slow_path_clean:
703 		skb_walk_frags(skb, frag2) {
704 			if (frag2 == frag)
705 				break;
706 			frag2->sk = NULL;
707 			frag2->destructor = NULL;
708 			skb->truesize += frag2->truesize;
709 		}
710 	}
711 
712 slow_path:
713 	if ((skb->ip_summed == CHECKSUM_PARTIAL) &&
714 	    skb_checksum_help(skb))
715 		goto fail;
716 
717 	left = skb->len - hlen;		/* Space per frame */
718 	ptr = hlen;			/* Where to start from */
719 
720 	/*
721 	 *	Fragment the datagram.
722 	 */
723 
724 	*prevhdr = NEXTHDR_FRAGMENT;
725 	hroom = LL_RESERVED_SPACE(rt->dst.dev);
726 	troom = rt->dst.dev->needed_tailroom;
727 
728 	/*
729 	 *	Keep copying data until we run out.
730 	 */
731 	while (left > 0)	{
732 		len = left;
733 		/* IF: it doesn't fit, use 'mtu' - the data space left */
734 		if (len > mtu)
735 			len = mtu;
736 		/* IF: we are not sending up to and including the packet end
737 		   then align the next start on an eight byte boundary */
738 		if (len < left)	{
739 			len &= ~7;
740 		}
741 
742 		/* Allocate buffer */
743 		frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
744 				 hroom + troom, GFP_ATOMIC);
745 		if (!frag) {
746 			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
747 				      IPSTATS_MIB_FRAGFAILS);
748 			err = -ENOMEM;
749 			goto fail;
750 		}
751 
752 		/*
753 		 *	Set up data on packet
754 		 */
755 
756 		ip6_copy_metadata(frag, skb);
757 		skb_reserve(frag, hroom);
758 		skb_put(frag, len + hlen + sizeof(struct frag_hdr));
759 		skb_reset_network_header(frag);
760 		fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
761 		frag->transport_header = (frag->network_header + hlen +
762 					  sizeof(struct frag_hdr));
763 
764 		/*
765 		 *	Charge the memory for the fragment to any owner
766 		 *	it might possess
767 		 */
768 		if (skb->sk)
769 			skb_set_owner_w(frag, skb->sk);
770 
771 		/*
772 		 *	Copy the packet header into the new buffer.
773 		 */
774 		skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
775 
776 		/*
777 		 *	Build fragment header.
778 		 */
779 		fh->nexthdr = nexthdr;
780 		fh->reserved = 0;
781 		if (!frag_id) {
782 			ipv6_select_ident(net, fh, rt);
783 			frag_id = fh->identification;
784 		} else
785 			fh->identification = frag_id;
786 
787 		/*
788 		 *	Copy a block of the IP datagram.
789 		 */
790 		BUG_ON(skb_copy_bits(skb, ptr, skb_transport_header(frag),
791 				     len));
792 		left -= len;
793 
794 		fh->frag_off = htons(offset);
795 		if (left > 0)
796 			fh->frag_off |= htons(IP6_MF);
797 		ipv6_hdr(frag)->payload_len = htons(frag->len -
798 						    sizeof(struct ipv6hdr));
799 
800 		ptr += len;
801 		offset += len;
802 
803 		/*
804 		 *	Put this fragment into the sending queue.
805 		 */
806 		err = output(sk, frag);
807 		if (err)
808 			goto fail;
809 
810 		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
811 			      IPSTATS_MIB_FRAGCREATES);
812 	}
813 	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
814 		      IPSTATS_MIB_FRAGOKS);
815 	consume_skb(skb);
816 	return err;
817 
818 fail:
819 	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
820 		      IPSTATS_MIB_FRAGFAILS);
821 	kfree_skb(skb);
822 	return err;
823 }
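
/*
 * Fragment sizing, with illustrative numbers: for a path MTU of 1500 and
 * hlen = 40 (a bare IPv6 header), the code above first reduces mtu to
 * 1500 - 40 - 8 = 1452 bytes of payload space per fragment.  In the slow
 * path, every non-final fragment is then trimmed to an 8-byte multiple,
 * 1452 & ~7 = 1448, which is also the amount frag_off advances per
 * fragment.
 */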
824 
825 static inline int ip6_rt_check(const struct rt6key *rt_key,
826 			       const struct in6_addr *fl_addr,
827 			       const struct in6_addr *addr_cache)
828 {
829 	return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
830 		(!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
831 }
832 
833 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
834 					  struct dst_entry *dst,
835 					  const struct flowi6 *fl6)
836 {
837 	struct ipv6_pinfo *np = inet6_sk(sk);
838 	struct rt6_info *rt;
839 
840 	if (!dst)
841 		goto out;
842 
843 	if (dst->ops->family != AF_INET6) {
844 		dst_release(dst);
845 		return NULL;
846 	}
847 
848 	rt = (struct rt6_info *)dst;
849 	/* Yes, checking route validity in the unconnected
850 	 * case is not very simple. Take into account
851 	 * that we do not support routing by source, TOS,
852 	 * or MSG_DONTROUTE		--ANK (980726)
853 	 *
854 	 * 1. ip6_rt_check(): If the route was a host route,
855 	 *    check that the cached destination is current.
856 	 *    If it is a network route, we still may
857 	 *    check its validity using a saved pointer
858 	 *    to the last used address: daddr_cache.
859 	 *    We do not want to save the whole address now
860 	 *    (because the main consumer of this service
861 	 *    is TCP, which does not have this problem),
862 	 *    so this last trick works only on connected
863 	 *    sockets.
864 	 * 2. oif also should be the same.
865 	 */
866 	if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
867 #ifdef CONFIG_IPV6_SUBTREES
868 	    ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
869 #endif
870 	    (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) {
871 		dst_release(dst);
872 		dst = NULL;
873 	}
874 
875 out:
876 	return dst;
877 }
878 
879 static int ip6_dst_lookup_tail(struct sock *sk,
880 			       struct dst_entry **dst, struct flowi6 *fl6)
881 {
882 	struct net *net = sock_net(sk);
883 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
884 	struct neighbour *n;
885 	struct rt6_info *rt;
886 #endif
887 	int err;
888 
889 	/* The correct way to handle this would be to do
890 	 * ip6_route_get_saddr, and then ip6_route_output; however,
891 	 * the route-specific preferred source forces the
892 	 * ip6_route_output call _before_ ip6_route_get_saddr.
893 	 *
894 	 * In source-specific routing (no src=any default route),
895 	 * ip6_route_output will fail given a src=any saddr, which is
896 	 * why we try it again later.
897 	 */
898 	if (ipv6_addr_any(&fl6->saddr) && (!*dst || !(*dst)->error)) {
899 		struct rt6_info *rt;
900 		bool had_dst = *dst != NULL;
901 
902 		if (!had_dst)
903 			*dst = ip6_route_output(net, sk, fl6);
904 		rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;
905 		err = ip6_route_get_saddr(net, rt, &fl6->daddr,
906 					  sk ? inet6_sk(sk)->srcprefs : 0,
907 					  &fl6->saddr);
908 		if (err)
909 			goto out_err_release;
910 
911 		/* If we had an erroneous initial result, pretend it
912 		 * never existed and let the SA-enabled version take
913 		 * over.
914 		 */
915 		if (!had_dst && (*dst)->error) {
916 			dst_release(*dst);
917 			*dst = NULL;
918 		}
919 	}
920 
921 	if (!*dst)
922 		*dst = ip6_route_output(net, sk, fl6);
923 
924 	err = (*dst)->error;
925 	if (err)
926 		goto out_err_release;
927 
928 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
929 	/*
930 	 * Here, if the dst entry we've looked up has a
931 	 * neighbour entry that is in the INCOMPLETE state
932 	 * and the source address from the flow is marked
933 	 * as OPTIMISTIC, we release the found dst entry
934 	 * and replace it with the dst entry of the
935 	 * nexthop router.
936 	 */
937 	rt = (struct rt6_info *) *dst;
938 	rcu_read_lock_bh();
939 	n = __ipv6_neigh_lookup_noref(rt->dst.dev, rt6_nexthop(rt));
940 	err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
941 	rcu_read_unlock_bh();
942 
943 	if (err) {
944 		struct inet6_ifaddr *ifp;
945 		struct flowi6 fl_gw6;
946 		int redirect;
947 
948 		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
949 				      (*dst)->dev, 1);
950 
951 		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
952 		if (ifp)
953 			in6_ifa_put(ifp);
954 
955 		if (redirect) {
956 			/*
957 			 * We need to get the dst entry for the
958 			 * default router instead
959 			 */
960 			dst_release(*dst);
961 			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
962 			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
963 			*dst = ip6_route_output(net, sk, &fl_gw6);
964 			err = (*dst)->error;
965 			if (err)
966 				goto out_err_release;
967 		}
968 	}
969 #endif
970 
971 	return 0;
972 
973 out_err_release:
974 	if (err == -ENETUNREACH)
975 		IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
976 	dst_release(*dst);
977 	*dst = NULL;
978 	return err;
979 }
980 
981 /**
982  *	ip6_dst_lookup - perform route lookup on flow
983  *	@sk: socket which provides route info
984  *	@dst: pointer to dst_entry * for result
985  *	@fl6: flow to lookup
986  *
987  *	This function performs a route lookup on the given flow.
988  *
989  *	It returns zero on success, or a standard errno code on error.
990  */
991 int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi6 *fl6)
992 {
993 	*dst = NULL;
994 	return ip6_dst_lookup_tail(sk, dst, fl6);
995 }
996 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
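
/*
 * A sketch of a typical ip6_dst_lookup() call (hypothetical helper, not
 * from this file): resolve a route for a destination and release it when
 * done.  On failure the dst has already been released and set to NULL.
 */
static int example_ip6_dst_lookup(struct sock *sk,
				  const struct in6_addr *daddr)
{
	struct dst_entry *dst;
	struct flowi6 fl6;
	int err;

	memset(&fl6, 0, sizeof(fl6));
	fl6.daddr = *daddr;	/* saddr may stay any; the lookup selects one */

	err = ip6_dst_lookup(sk, &dst, &fl6);
	if (err)
		return err;

	/* ... use the route, e.g. read dst_mtu(dst) ... */
	dst_release(dst);
	return 0;
}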
997 
998 /**
999  *	ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1000  *	@sk: socket which provides route info
1001  *	@fl6: flow to lookup
1002  *	@final_dst: final destination address for ipsec lookup
1003  *
1004  *	This function performs a route lookup on the given flow.
1005  *
1006  *	It returns a valid dst pointer on success, or a pointer encoded
1007  *	error code.
1008  */
1009 struct dst_entry *ip6_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1010 				      const struct in6_addr *final_dst)
1011 {
1012 	struct dst_entry *dst = NULL;
1013 	int err;
1014 
1015 	err = ip6_dst_lookup_tail(sk, &dst, fl6);
1016 	if (err)
1017 		return ERR_PTR(err);
1018 	if (final_dst)
1019 		fl6->daddr = *final_dst;
1020 
1021 	return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1022 }
1023 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
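
/*
 * Unlike ip6_dst_lookup(), the *_flow variants return the dst directly and
 * encode failures with ERR_PTR(), so callers check with IS_ERR().  A
 * minimal sketch (hypothetical caller):
 *
 *	dst = ip6_dst_lookup_flow(sk, &fl6, NULL);
 *	if (IS_ERR(dst))
 *		return PTR_ERR(dst);
 */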
1024 
1025 /**
1026  *	ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1027  *	@sk: socket which provides the dst cache and route info
1028  *	@fl6: flow to lookup
1029  *	@final_dst: final destination address for ipsec lookup
1030  *
1031  *	This function performs a route lookup on the given flow with the
1032  *	possibility of using the cached route in the socket if it is valid.
1033  *	It will take the socket dst lock when operating on the dst cache.
1034  *	As a result, this function can only be used in process context.
1035  *
1036  *	It returns a valid dst pointer on success, or a pointer encoded
1037  *	error code.
1038  */
1039 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1040 					 const struct in6_addr *final_dst)
1041 {
1042 	struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1043 	int err;
1044 
1045 	dst = ip6_sk_dst_check(sk, dst, fl6);
1046 
1047 	err = ip6_dst_lookup_tail(sk, &dst, fl6);
1048 	if (err)
1049 		return ERR_PTR(err);
1050 	if (final_dst)
1051 		fl6->daddr = *final_dst;
1052 
1053 	return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1054 }
1055 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1056 
1057 static inline int ip6_ufo_append_data(struct sock *sk,
1058 			struct sk_buff_head *queue,
1059 			int getfrag(void *from, char *to, int offset, int len,
1060 			int odd, struct sk_buff *skb),
1061 			void *from, int length, int hh_len, int fragheaderlen,
1062 			int transhdrlen, int mtu, unsigned int flags,
1063 			struct rt6_info *rt)
1064 
1065 {
1066 	struct sk_buff *skb;
1067 	struct frag_hdr fhdr;
1068 	int err;
1069 
1070 	/* The network device supports UDP large send offload, so
1071 	 * create one single skb containing the complete UDP
1072 	 * datagram.
1073 	 */
1074 	skb = skb_peek_tail(queue);
1075 	if (!skb) {
1076 		skb = sock_alloc_send_skb(sk,
1077 			hh_len + fragheaderlen + transhdrlen + 20,
1078 			(flags & MSG_DONTWAIT), &err);
1079 		if (!skb)
1080 			return err;
1081 
1082 		/* reserve space for the hardware header */
1083 		skb_reserve(skb, hh_len);
1084 
1085 		/* create space for UDP/IP header */
1086 		skb_put(skb, fragheaderlen + transhdrlen);
1087 
1088 		/* initialize network header pointer */
1089 		skb_reset_network_header(skb);
1090 
1091 		/* initialize protocol header pointer */
1092 		skb->transport_header = skb->network_header + fragheaderlen;
1093 
1094 		skb->protocol = htons(ETH_P_IPV6);
1095 		skb->csum = 0;
1096 
1097 		__skb_queue_tail(queue, skb);
1098 	} else if (skb_is_gso(skb)) {
1099 		goto append;
1100 	}
1101 
1102 	skb->ip_summed = CHECKSUM_PARTIAL;
1103 	/* Specify the length of each IPv6 datagram fragment.
1104 	 * It has to be a multiple of 8.
1105 	 */
1106 	skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
1107 				     sizeof(struct frag_hdr)) & ~7;
1108 	skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
1109 	ipv6_select_ident(sock_net(sk), &fhdr, rt);
1110 	skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
1111 
1112 append:
1113 	return skb_append_datato_frags(sk, skb, getfrag, from,
1114 				       (length - transhdrlen));
1115 }
1116 
1117 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1118 					       gfp_t gfp)
1119 {
1120 	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1121 }
1122 
1123 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1124 						gfp_t gfp)
1125 {
1126 	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1127 }
1128 
1129 static void ip6_append_data_mtu(unsigned int *mtu,
1130 				int *maxfraglen,
1131 				unsigned int fragheaderlen,
1132 				struct sk_buff *skb,
1133 				struct rt6_info *rt,
1134 				unsigned int orig_mtu)
1135 {
1136 	if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1137 		if (!skb) {
1138 			/* first fragment, reserve header_len */
1139 			*mtu = orig_mtu - rt->dst.header_len;
1140 
1141 		} else {
1142 			/*
1143 			 * this fragment is not the first; the header
1144 			 * space is regarded as data space.
1145 			 */
1146 			*mtu = orig_mtu;
1147 		}
1148 		*maxfraglen = ((*mtu - fragheaderlen) & ~7)
1149 			      + fragheaderlen - sizeof(struct frag_hdr);
1150 	}
1151 }
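
/*
 * maxfraglen, with illustrative numbers: for mtu = 1500 and a bare IPv6
 * header (fragheaderlen = 40), ((1500 - 40) & ~7) + 40 - 8 = 1488, i.e.
 * each fragment skb may carry at most 1488 bytes before the 8-byte
 * fragment header is accounted for, keeping every fragment's payload
 * 8-byte aligned.
 */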
1152 
1153 static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
1154 			  struct inet6_cork *v6_cork,
1155 			  int hlimit, int tclass, struct ipv6_txoptions *opt,
1156 			  struct rt6_info *rt, struct flowi6 *fl6)
1157 {
1158 	struct ipv6_pinfo *np = inet6_sk(sk);
1159 	unsigned int mtu;
1160 
1161 	/*
1162 	 * setup for corking
1163 	 */
1164 	if (opt) {
1165 		if (WARN_ON(v6_cork->opt))
1166 			return -EINVAL;
1167 
1168 		v6_cork->opt = kzalloc(opt->tot_len, sk->sk_allocation);
1169 		if (unlikely(!v6_cork->opt))
1170 			return -ENOBUFS;
1171 
1172 		v6_cork->opt->tot_len = opt->tot_len;
1173 		v6_cork->opt->opt_flen = opt->opt_flen;
1174 		v6_cork->opt->opt_nflen = opt->opt_nflen;
1175 
1176 		v6_cork->opt->dst0opt = ip6_opt_dup(opt->dst0opt,
1177 						    sk->sk_allocation);
1178 		if (opt->dst0opt && !v6_cork->opt->dst0opt)
1179 			return -ENOBUFS;
1180 
1181 		v6_cork->opt->dst1opt = ip6_opt_dup(opt->dst1opt,
1182 						    sk->sk_allocation);
1183 		if (opt->dst1opt && !v6_cork->opt->dst1opt)
1184 			return -ENOBUFS;
1185 
1186 		v6_cork->opt->hopopt = ip6_opt_dup(opt->hopopt,
1187 						   sk->sk_allocation);
1188 		if (opt->hopopt && !v6_cork->opt->hopopt)
1189 			return -ENOBUFS;
1190 
1191 		v6_cork->opt->srcrt = ip6_rthdr_dup(opt->srcrt,
1192 						    sk->sk_allocation);
1193 		if (opt->srcrt && !v6_cork->opt->srcrt)
1194 			return -ENOBUFS;
1195 
1196 		/* need source address above --miyazawa */
1197 	}
1198 	dst_hold(&rt->dst);
1199 	cork->base.dst = &rt->dst;
1200 	cork->fl.u.ip6 = *fl6;
1201 	v6_cork->hop_limit = hlimit;
1202 	v6_cork->tclass = tclass;
1203 	if (rt->dst.flags & DST_XFRM_TUNNEL)
1204 		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1205 		      rt->dst.dev->mtu : dst_mtu(&rt->dst);
1206 	else
1207 		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1208 		      rt->dst.dev->mtu : dst_mtu(rt->dst.path);
1209 	if (np->frag_size < mtu) {
1210 		if (np->frag_size)
1211 			mtu = np->frag_size;
1212 	}
1213 	cork->base.fragsize = mtu;
1214 	if (dst_allfrag(rt->dst.path))
1215 		cork->base.flags |= IPCORK_ALLFRAG;
1216 	cork->base.length = 0;
1217 
1218 	return 0;
1219 }
1220 
1221 static int __ip6_append_data(struct sock *sk,
1222 			     struct flowi6 *fl6,
1223 			     struct sk_buff_head *queue,
1224 			     struct inet_cork *cork,
1225 			     struct inet6_cork *v6_cork,
1226 			     struct page_frag *pfrag,
1227 			     int getfrag(void *from, char *to, int offset,
1228 					 int len, int odd, struct sk_buff *skb),
1229 			     void *from, int length, int transhdrlen,
1230 			     unsigned int flags, int dontfrag)
1231 {
1232 	struct sk_buff *skb, *skb_prev = NULL;
1233 	unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu;
1234 	int exthdrlen = 0;
1235 	int dst_exthdrlen = 0;
1236 	int hh_len;
1237 	int copy;
1238 	int err;
1239 	int offset = 0;
1240 	__u8 tx_flags = 0;
1241 	u32 tskey = 0;
1242 	struct rt6_info *rt = (struct rt6_info *)cork->dst;
1243 	struct ipv6_txoptions *opt = v6_cork->opt;
1244 	int csummode = CHECKSUM_NONE;
1245 
1246 	skb = skb_peek_tail(queue);
1247 	if (!skb) {
1248 		exthdrlen = opt ? opt->opt_flen : 0;
1249 		dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
1250 	}
1251 
1252 	mtu = cork->fragsize;
1253 	orig_mtu = mtu;
1254 
1255 	hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1256 
1257 	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1258 			(opt ? opt->opt_nflen : 0);
1259 	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
1260 		     sizeof(struct frag_hdr);
1261 
1262 	if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
1263 		unsigned int maxnonfragsize, headersize;
1264 
1265 		headersize = sizeof(struct ipv6hdr) +
1266 			     (opt ? opt->opt_flen + opt->opt_nflen : 0) +
1267 			     (dst_allfrag(&rt->dst) ?
1268 			      sizeof(struct frag_hdr) : 0) +
1269 			     rt->rt6i_nfheader_len;
1270 
1271 		if (ip6_sk_ignore_df(sk))
1272 			maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
1273 		else
1274 			maxnonfragsize = mtu;
1275 
1276 		/* dontfrag active */
1277 		if ((cork->length + length > mtu - headersize) && dontfrag &&
1278 		    (sk->sk_protocol == IPPROTO_UDP ||
1279 		     sk->sk_protocol == IPPROTO_RAW)) {
1280 			ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
1281 						   sizeof(struct ipv6hdr));
1282 			goto emsgsize;
1283 		}
1284 
1285 		if (cork->length + length > maxnonfragsize - headersize) {
1286 emsgsize:
1287 			ipv6_local_error(sk, EMSGSIZE, fl6,
1288 					 mtu - headersize +
1289 					 sizeof(struct ipv6hdr));
1290 			return -EMSGSIZE;
1291 		}
1292 	}
1293 
1294 	if (sk->sk_type == SOCK_DGRAM || sk->sk_type == SOCK_RAW) {
1295 		sock_tx_timestamp(sk, &tx_flags);
1296 		if (tx_flags & SKBTX_ANY_SW_TSTAMP &&
1297 		    sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
1298 			tskey = sk->sk_tskey++;
1299 	}
1300 
1301 	/* If this is the first and only packet and the device
1302 	 * supports checksum offloading, let's use it.
1303 	 * Use transhdrlen, same as IPv4, because partial
1304 	 * sums only work when transhdrlen is set.
1305 	 */
1306 	if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
1307 	    length + fragheaderlen < mtu &&
1308 	    rt->dst.dev->features & NETIF_F_V6_CSUM &&
1309 	    !exthdrlen)
1310 		csummode = CHECKSUM_PARTIAL;
1311 	/*
1312 	 * Let's try using as much space as possible.
1313 	 * Use MTU if total length of the message fits into the MTU.
1314 	 * Otherwise, we need to reserve fragment header and
1315 	 * fragment alignment (= 8-15 octets, in total).
1316 	 *
1317 	 * Note that we may need to "move" the data from the tail
1318 	 * of the buffer to the new fragment when we split
1319 	 * the message.
1320 	 *
1321 	 * FIXME: It may be fragmented into multiple chunks
1322 	 *        at once if non-fragmentable extension headers
1323 	 *        are too large.
1324 	 * --yoshfuji
1325 	 */
1326 
1327 	cork->length += length;
1328 	if (((length > mtu) ||
1329 	     (skb && skb_is_gso(skb))) &&
1330 	    (sk->sk_protocol == IPPROTO_UDP) &&
1331 	    (rt->dst.dev->features & NETIF_F_UFO) &&
1332 	    (sk->sk_type == SOCK_DGRAM)) {
1333 		err = ip6_ufo_append_data(sk, queue, getfrag, from, length,
1334 					  hh_len, fragheaderlen,
1335 					  transhdrlen, mtu, flags, rt);
1336 		if (err)
1337 			goto error;
1338 		return 0;
1339 	}
1340 
1341 	if (!skb)
1342 		goto alloc_new_skb;
1343 
1344 	while (length > 0) {
1345 		/* Check if the remaining data fits into current packet. */
1346 		copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1347 		if (copy < length)
1348 			copy = maxfraglen - skb->len;
1349 
1350 		if (copy <= 0) {
1351 			char *data;
1352 			unsigned int datalen;
1353 			unsigned int fraglen;
1354 			unsigned int fraggap;
1355 			unsigned int alloclen;
1356 alloc_new_skb:
1357 			/* There's no room in the current skb */
1358 			if (skb)
1359 				fraggap = skb->len - maxfraglen;
1360 			else
1361 				fraggap = 0;
1362 			/* update mtu and maxfraglen if necessary */
1363 			if (!skb || !skb_prev)
1364 				ip6_append_data_mtu(&mtu, &maxfraglen,
1365 						    fragheaderlen, skb, rt,
1366 						    orig_mtu);
1367 
1368 			skb_prev = skb;
1369 
1370 			/*
1371 			 * If remaining data exceeds the mtu,
1372 			 * we know we need more fragment(s).
1373 			 */
1374 			datalen = length + fraggap;
1375 
1376 			if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1377 				datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
1378 			if ((flags & MSG_MORE) &&
1379 			    !(rt->dst.dev->features&NETIF_F_SG))
1380 				alloclen = mtu;
1381 			else
1382 				alloclen = datalen + fragheaderlen;
1383 
1384 			alloclen += dst_exthdrlen;
1385 
1386 			if (datalen != length + fraggap) {
1387 				/*
1388 				 * this is not the last fragment; the trailer
1389 				 * space is regarded as data space.
1390 				 */
1391 				datalen += rt->dst.trailer_len;
1392 			}
1393 
1394 			alloclen += rt->dst.trailer_len;
1395 			fraglen = datalen + fragheaderlen;
1396 
1397 			/*
1398 			 * We just reserve space for the fragment header.
1399 			 * Note: this may be an overallocation if the message
1400 			 * (without MSG_MORE) fits into the MTU.
1401 			 */
1402 			alloclen += sizeof(struct frag_hdr);
1403 
1404 			if (transhdrlen) {
1405 				skb = sock_alloc_send_skb(sk,
1406 						alloclen + hh_len,
1407 						(flags & MSG_DONTWAIT), &err);
1408 			} else {
1409 				skb = NULL;
1410 				if (atomic_read(&sk->sk_wmem_alloc) <=
1411 				    2 * sk->sk_sndbuf)
1412 					skb = sock_wmalloc(sk,
1413 							   alloclen + hh_len, 1,
1414 							   sk->sk_allocation);
1415 				if (unlikely(!skb))
1416 					err = -ENOBUFS;
1417 			}
1418 			if (!skb)
1419 				goto error;
1420 			/*
1421 			 *	Fill in the control structures
1422 			 */
1423 			skb->protocol = htons(ETH_P_IPV6);
1424 			skb->ip_summed = csummode;
1425 			skb->csum = 0;
1426 			/* reserve for fragmentation and ipsec header */
1427 			skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1428 				    dst_exthdrlen);
1429 
1430 			/* Only the initial fragment is time stamped */
1431 			skb_shinfo(skb)->tx_flags = tx_flags;
1432 			tx_flags = 0;
1433 			skb_shinfo(skb)->tskey = tskey;
1434 			tskey = 0;
1435 
1436 			/*
1437 			 *	Find where to start putting bytes
1438 			 */
1439 			data = skb_put(skb, fraglen);
1440 			skb_set_network_header(skb, exthdrlen);
1441 			data += fragheaderlen;
1442 			skb->transport_header = (skb->network_header +
1443 						 fragheaderlen);
1444 			if (fraggap) {
1445 				skb->csum = skb_copy_and_csum_bits(
1446 					skb_prev, maxfraglen,
1447 					data + transhdrlen, fraggap, 0);
1448 				skb_prev->csum = csum_sub(skb_prev->csum,
1449 							  skb->csum);
1450 				data += fraggap;
1451 				pskb_trim_unique(skb_prev, maxfraglen);
1452 			}
1453 			copy = datalen - transhdrlen - fraggap;
1454 
1455 			if (copy < 0) {
1456 				err = -EINVAL;
1457 				kfree_skb(skb);
1458 				goto error;
1459 			} else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
1460 				err = -EFAULT;
1461 				kfree_skb(skb);
1462 				goto error;
1463 			}
1464 
1465 			offset += copy;
1466 			length -= datalen - fraggap;
1467 			transhdrlen = 0;
1468 			exthdrlen = 0;
1469 			dst_exthdrlen = 0;
1470 
1471 			/*
1472 			 * Put the packet on the pending queue
1473 			 */
1474 			__skb_queue_tail(queue, skb);
1475 			continue;
1476 		}
1477 
1478 		if (copy > length)
1479 			copy = length;
1480 
1481 		if (!(rt->dst.dev->features&NETIF_F_SG)) {
1482 			unsigned int off;
1483 
1484 			off = skb->len;
1485 			if (getfrag(from, skb_put(skb, copy),
1486 						offset, copy, off, skb) < 0) {
1487 				__skb_trim(skb, off);
1488 				err = -EFAULT;
1489 				goto error;
1490 			}
1491 		} else {
1492 			int i = skb_shinfo(skb)->nr_frags;
1493 
1494 			err = -ENOMEM;
1495 			if (!sk_page_frag_refill(sk, pfrag))
1496 				goto error;
1497 
1498 			if (!skb_can_coalesce(skb, i, pfrag->page,
1499 					      pfrag->offset)) {
1500 				err = -EMSGSIZE;
1501 				if (i == MAX_SKB_FRAGS)
1502 					goto error;
1503 
1504 				__skb_fill_page_desc(skb, i, pfrag->page,
1505 						     pfrag->offset, 0);
1506 				skb_shinfo(skb)->nr_frags = ++i;
1507 				get_page(pfrag->page);
1508 			}
1509 			copy = min_t(int, copy, pfrag->size - pfrag->offset);
1510 			if (getfrag(from,
1511 				    page_address(pfrag->page) + pfrag->offset,
1512 				    offset, copy, skb->len, skb) < 0)
1513 				goto error_efault;
1514 
1515 			pfrag->offset += copy;
1516 			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1517 			skb->len += copy;
1518 			skb->data_len += copy;
1519 			skb->truesize += copy;
1520 			atomic_add(copy, &sk->sk_wmem_alloc);
1521 		}
1522 		offset += copy;
1523 		length -= copy;
1524 	}
1525 
1526 	return 0;
1527 
1528 error_efault:
1529 	err = -EFAULT;
1530 error:
1531 	cork->length -= length;
1532 	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1533 	return err;
1534 }
1535 
1536 int ip6_append_data(struct sock *sk,
1537 		    int getfrag(void *from, char *to, int offset, int len,
1538 				int odd, struct sk_buff *skb),
1539 		    void *from, int length, int transhdrlen, int hlimit,
1540 		    int tclass, struct ipv6_txoptions *opt, struct flowi6 *fl6,
1541 		    struct rt6_info *rt, unsigned int flags, int dontfrag)
1542 {
1543 	struct inet_sock *inet = inet_sk(sk);
1544 	struct ipv6_pinfo *np = inet6_sk(sk);
1545 	int exthdrlen;
1546 	int err;
1547 
1548 	if (flags&MSG_PROBE)
1549 		return 0;
1550 	if (skb_queue_empty(&sk->sk_write_queue)) {
1551 		/*
1552 		 * setup for corking
1553 		 */
1554 		err = ip6_setup_cork(sk, &inet->cork, &np->cork, hlimit,
1555 				     tclass, opt, rt, fl6);
1556 		if (err)
1557 			return err;
1558 
1559 		exthdrlen = (opt ? opt->opt_flen : 0);
1560 		length += exthdrlen;
1561 		transhdrlen += exthdrlen;
1562 	} else {
1563 		fl6 = &inet->cork.fl.u.ip6;
1564 		transhdrlen = 0;
1565 	}
1566 
1567 	return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base,
1568 				 &np->cork, sk_page_frag(sk), getfrag,
1569 				 from, length, transhdrlen, flags, dontfrag);
1570 }
1571 EXPORT_SYMBOL_GPL(ip6_append_data);
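
/*
 * A sketch of the corking pattern built around ip6_append_data()
 * (hypothetical helper; real callers such as udpv6_sendmsg() derive fl6,
 * opt, rt and getfrag from the socket and msghdr): append the payload,
 * then either push the queued fragments out or flush them on error.
 */
static int example_append_and_push(struct sock *sk, struct flowi6 *fl6,
				   struct ipv6_txoptions *opt,
				   struct rt6_info *rt,
				   int getfrag(void *from, char *to,
					       int offset, int len, int odd,
					       struct sk_buff *skb),
				   void *from, int length)
{
	int err;

	lock_sock(sk);
	err = ip6_append_data(sk, getfrag, from, length,
			      0 /* transhdrlen: no transport header here */,
			      ip6_dst_hoplimit(&rt->dst), 0 /* tclass */,
			      opt, fl6, rt, 0 /* flags */,
			      inet6_sk(sk)->dontfrag);
	if (err)
		ip6_flush_pending_frames(sk);
	else
		err = ip6_push_pending_frames(sk);
	release_sock(sk);
	return err;
}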
1572 
1573 static void ip6_cork_release(struct inet_cork_full *cork,
1574 			     struct inet6_cork *v6_cork)
1575 {
1576 	if (v6_cork->opt) {
1577 		kfree(v6_cork->opt->dst0opt);
1578 		kfree(v6_cork->opt->dst1opt);
1579 		kfree(v6_cork->opt->hopopt);
1580 		kfree(v6_cork->opt->srcrt);
1581 		kfree(v6_cork->opt);
1582 		v6_cork->opt = NULL;
1583 	}
1584 
1585 	if (cork->base.dst) {
1586 		dst_release(cork->base.dst);
1587 		cork->base.dst = NULL;
1588 		cork->base.flags &= ~IPCORK_ALLFRAG;
1589 	}
1590 	memset(&cork->fl, 0, sizeof(cork->fl));
1591 }
1592 
1593 struct sk_buff *__ip6_make_skb(struct sock *sk,
1594 			       struct sk_buff_head *queue,
1595 			       struct inet_cork_full *cork,
1596 			       struct inet6_cork *v6_cork)
1597 {
1598 	struct sk_buff *skb, *tmp_skb;
1599 	struct sk_buff **tail_skb;
1600 	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1601 	struct ipv6_pinfo *np = inet6_sk(sk);
1602 	struct net *net = sock_net(sk);
1603 	struct ipv6hdr *hdr;
1604 	struct ipv6_txoptions *opt = v6_cork->opt;
1605 	struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
1606 	struct flowi6 *fl6 = &cork->fl.u.ip6;
1607 	unsigned char proto = fl6->flowi6_proto;
1608 
1609 	skb = __skb_dequeue(queue);
1610 	if (!skb)
1611 		goto out;
1612 	tail_skb = &(skb_shinfo(skb)->frag_list);
1613 
1614 	/* move skb->data to ip header from ext header */
1615 	if (skb->data < skb_network_header(skb))
1616 		__skb_pull(skb, skb_network_offset(skb));
1617 	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
1618 		__skb_pull(tmp_skb, skb_network_header_len(skb));
1619 		*tail_skb = tmp_skb;
1620 		tail_skb = &(tmp_skb->next);
1621 		skb->len += tmp_skb->len;
1622 		skb->data_len += tmp_skb->len;
1623 		skb->truesize += tmp_skb->truesize;
1624 		tmp_skb->destructor = NULL;
1625 		tmp_skb->sk = NULL;
1626 	}
1627 
1628 	/* Allow local fragmentation. */
1629 	skb->ignore_df = ip6_sk_ignore_df(sk);
1630 
1631 	*final_dst = fl6->daddr;
1632 	__skb_pull(skb, skb_network_header_len(skb));
1633 	if (opt && opt->opt_flen)
1634 		ipv6_push_frag_opts(skb, opt, &proto);
1635 	if (opt && opt->opt_nflen)
1636 		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);
1637 
1638 	skb_push(skb, sizeof(struct ipv6hdr));
1639 	skb_reset_network_header(skb);
1640 	hdr = ipv6_hdr(skb);
1641 
1642 	ip6_flow_hdr(hdr, v6_cork->tclass,
1643 		     ip6_make_flowlabel(net, skb, fl6->flowlabel,
1644 					np->autoflowlabel));
1645 	hdr->hop_limit = v6_cork->hop_limit;
1646 	hdr->nexthdr = proto;
1647 	hdr->saddr = fl6->saddr;
1648 	hdr->daddr = *final_dst;
1649 
1650 	skb->priority = sk->sk_priority;
1651 	skb->mark = sk->sk_mark;
1652 
1653 	skb_dst_set(skb, dst_clone(&rt->dst));
1654 	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1655 	if (proto == IPPROTO_ICMPV6) {
1656 		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1657 
1658 		ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
1659 		ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
1660 	}
1661 
1662 	ip6_cork_release(cork, v6_cork);
1663 out:
1664 	return skb;
1665 }
1666 
1667 int ip6_send_skb(struct sk_buff *skb)
1668 {
1669 	struct net *net = sock_net(skb->sk);
1670 	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
1671 	int err;
1672 
1673 	err = ip6_local_out(skb);
1674 	if (err) {
1675 		if (err > 0)
1676 			err = net_xmit_errno(err);
1677 		if (err)
1678 			IP6_INC_STATS(net, rt->rt6i_idev,
1679 				      IPSTATS_MIB_OUTDISCARDS);
1680 	}
1681 
1682 	return err;
1683 }
1684 
1685 int ip6_push_pending_frames(struct sock *sk)
1686 {
1687 	struct sk_buff *skb;
1688 
1689 	skb = ip6_finish_skb(sk);
1690 	if (!skb)
1691 		return 0;
1692 
1693 	return ip6_send_skb(skb);
1694 }
1695 EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
1696 
1697 static void __ip6_flush_pending_frames(struct sock *sk,
1698 				       struct sk_buff_head *queue,
1699 				       struct inet_cork_full *cork,
1700 				       struct inet6_cork *v6_cork)
1701 {
1702 	struct sk_buff *skb;
1703 
1704 	while ((skb = __skb_dequeue_tail(queue)) != NULL) {
1705 		if (skb_dst(skb))
1706 			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1707 				      IPSTATS_MIB_OUTDISCARDS);
1708 		kfree_skb(skb);
1709 	}
1710 
1711 	ip6_cork_release(cork, v6_cork);
1712 }
1713 
1714 void ip6_flush_pending_frames(struct sock *sk)
1715 {
1716 	__ip6_flush_pending_frames(sk, &sk->sk_write_queue,
1717 				   &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
1718 }
1719 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
1720 
1721 struct sk_buff *ip6_make_skb(struct sock *sk,
1722 			     int getfrag(void *from, char *to, int offset,
1723 					 int len, int odd, struct sk_buff *skb),
1724 			     void *from, int length, int transhdrlen,
1725 			     int hlimit, int tclass,
1726 			     struct ipv6_txoptions *opt, struct flowi6 *fl6,
1727 			     struct rt6_info *rt, unsigned int flags,
1728 			     int dontfrag)
1729 {
1730 	struct inet_cork_full cork;
1731 	struct inet6_cork v6_cork;
1732 	struct sk_buff_head queue;
1733 	int exthdrlen = (opt ? opt->opt_flen : 0);
1734 	int err;
1735 
1736 	if (flags & MSG_PROBE)
1737 		return NULL;
1738 
1739 	__skb_queue_head_init(&queue);
1740 
1741 	cork.base.flags = 0;
1742 	cork.base.addr = 0;
1743 	cork.base.opt = NULL;
1744 	v6_cork.opt = NULL;
1745 	err = ip6_setup_cork(sk, &cork, &v6_cork, hlimit, tclass, opt, rt, fl6);
1746 	if (err)
1747 		return ERR_PTR(err);
1748 
1749 	if (dontfrag < 0)
1750 		dontfrag = inet6_sk(sk)->dontfrag;
1751 
1752 	err = __ip6_append_data(sk, fl6, &queue, &cork.base, &v6_cork,
1753 				&current->task_frag, getfrag, from,
1754 				length + exthdrlen, transhdrlen + exthdrlen,
1755 				flags, dontfrag);
1756 	if (err) {
1757 		__ip6_flush_pending_frames(sk, &queue, &cork, &v6_cork);
1758 		return ERR_PTR(err);
1759 	}
1760 
1761 	return __ip6_make_skb(sk, &queue, &cork, &v6_cork);
1762 }
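
/*
 * A sketch of the lockless single-shot path (hypothetical helper): instead
 * of corking on sk->sk_write_queue, ip6_make_skb() assembles the complete
 * datagram on a private queue and ip6_send_skb() transmits it.
 */
static int example_make_and_send(struct sock *sk, struct flowi6 *fl6,
				 struct ipv6_txoptions *opt,
				 struct rt6_info *rt,
				 int getfrag(void *from, char *to,
					     int offset, int len, int odd,
					     struct sk_buff *skb),
				 void *from, int length)
{
	struct sk_buff *skb;

	skb = ip6_make_skb(sk, getfrag, from, length,
			   0 /* transhdrlen */,
			   ip6_dst_hoplimit(&rt->dst), 0 /* tclass */,
			   opt, fl6, rt, 0 /* flags */,
			   -1 /* dontfrag < 0: use the socket default */);
	if (IS_ERR_OR_NULL(skb))
		return skb ? PTR_ERR(skb) : 0;	/* NULL only for MSG_PROBE */

	return ip6_send_skb(skb);
}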
1763