xref: /openbmc/linux/net/ipv6/ip6_output.c (revision 1e0d69a9cc9172d7896c2113f983a74f6e8ff303)
1 /*
2  *	IPv6 output functions
3  *	Linux INET6 implementation
4  *
5  *	Authors:
6  *	Pedro Roque		<roque@di.fc.ul.pt>
7  *
8  *	Based on linux/net/ipv4/ip_output.c
9  *
10  *	This program is free software; you can redistribute it and/or
11  *      modify it under the terms of the GNU General Public License
12  *      as published by the Free Software Foundation; either version
13  *      2 of the License, or (at your option) any later version.
14  *
15  *	Changes:
16  *	A.N.Kuznetsov	:	arithmetic in fragmentation.
17  *				extension headers are implemented.
18  *				route changes now work.
19  *				ip6_forward does not confuse sniffers.
20  *				etc.
21  *
22  *      H. von Brand    :       Added missing #include <linux/string.h>
23  *	Imran Patel	:	frag id should be in NBO
24  *      Kazunori MIYAZAWA @USAGI
25  *			:       add ip6_append_data and related functions
26  *				for datagram xmit
27  */
28 
29 #include <linux/errno.h>
30 #include <linux/kernel.h>
31 #include <linux/string.h>
32 #include <linux/socket.h>
33 #include <linux/net.h>
34 #include <linux/netdevice.h>
35 #include <linux/if_arp.h>
36 #include <linux/in6.h>
37 #include <linux/tcp.h>
38 #include <linux/route.h>
39 #include <linux/module.h>
40 #include <linux/slab.h>
41 
42 #include <linux/netfilter.h>
43 #include <linux/netfilter_ipv6.h>
44 
45 #include <net/sock.h>
46 #include <net/snmp.h>
47 
48 #include <net/ipv6.h>
49 #include <net/ndisc.h>
50 #include <net/protocol.h>
51 #include <net/ip6_route.h>
52 #include <net/addrconf.h>
53 #include <net/rawv6.h>
54 #include <net/icmp.h>
55 #include <net/xfrm.h>
56 #include <net/checksum.h>
57 #include <linux/mroute6.h>
58 
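/*
 * ip6_finish_output2 is the last step on the output path: it loops
 * multicast packets back to local listeners when required, resolves
 * the next hop to a neighbour entry (creating one in nd_tbl if none
 * is cached), and hands the skb to the neighbour output function.
 */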
59 static int ip6_finish_output2(struct sock *sk, struct sk_buff *skb)
60 {
61 	struct dst_entry *dst = skb_dst(skb);
62 	struct net_device *dev = dst->dev;
63 	struct neighbour *neigh;
64 	struct in6_addr *nexthop;
65 	int ret;
66 
67 	skb->protocol = htons(ETH_P_IPV6);
68 	skb->dev = dev;
69 
70 	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
71 		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
72 
73 		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
74 		    ((mroute6_socket(dev_net(dev), skb) &&
75 		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
76 		     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
77 					 &ipv6_hdr(skb)->saddr))) {
78 			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
79 
80 			/* Do not check for IFF_ALLMULTI; multicast routing
81 			   is not supported in any case.
82 			 */
83 			if (newskb)
84 				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
85 					sk, newskb, NULL, newskb->dev,
86 					dev_loopback_xmit);
87 
88 			if (ipv6_hdr(skb)->hop_limit == 0) {
89 				IP6_INC_STATS(dev_net(dev), idev,
90 					      IPSTATS_MIB_OUTDISCARDS);
91 				kfree_skb(skb);
92 				return 0;
93 			}
94 		}
95 
96 		IP6_UPD_PO_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCAST,
97 				skb->len);
98 
99 		if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
100 		    IPV6_ADDR_SCOPE_NODELOCAL &&
101 		    !(dev->flags & IFF_LOOPBACK)) {
102 			kfree_skb(skb);
103 			return 0;
104 		}
105 	}
106 
107 	rcu_read_lock_bh();
108 	nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
109 	neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
110 	if (unlikely(!neigh))
111 		neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
112 	if (!IS_ERR(neigh)) {
113 		ret = dst_neigh_output(dst, neigh, skb);
114 		rcu_read_unlock_bh();
115 		return ret;
116 	}
117 	rcu_read_unlock_bh();
118 
119 	IP6_INC_STATS(dev_net(dst->dev),
120 		      ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
121 	kfree_skb(skb);
122 	return -EINVAL;
123 }
124 
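/*
 * Decide between the direct and the fragmenting path: a packet is
 * fragmented when it exceeds the dst MTU and is not GSO, when the
 * route demands fragmentation of every packet (dst_allfrag), or when
 * it is larger than the original fragment size recorded by the
 * defrag code in IP6CB(skb)->frag_max_size.
 */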
125 static int ip6_finish_output(struct sock *sk, struct sk_buff *skb)
126 {
127 	if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
128 	    dst_allfrag(skb_dst(skb)) ||
129 	    (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
130 		return ip6_fragment(sk, skb, ip6_finish_output2);
131 	else
132 		return ip6_finish_output2(sk, skb);
133 }
134 
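/*
 * ip6_output is the dst output method for IPv6: it discards
 * everything when IPv6 is disabled on the egress device, and
 * otherwise runs the NF_INET_POST_ROUTING netfilter hook before
 * ip6_finish_output, skipping the hook for packets already rerouted
 * once (IP6SKB_REROUTED).
 */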
135 int ip6_output(struct sock *sk, struct sk_buff *skb)
136 {
137 	struct net_device *dev = skb_dst(skb)->dev;
138 	struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
139 	if (unlikely(idev->cnf.disable_ipv6)) {
140 		IP6_INC_STATS(dev_net(dev), idev,
141 			      IPSTATS_MIB_OUTDISCARDS);
142 		kfree_skb(skb);
143 		return 0;
144 	}
145 
146 	return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, sk, skb,
147 			    NULL, dev,
148 			    ip6_finish_output,
149 			    !(IP6CB(skb)->flags & IP6SKB_REROUTED));
150 }
151 
152 /*
153  *	xmit an sk_buff (used by TCP, SCTP and DCCP)
154  */
155 
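/*
 * ip6_xmit builds the IPv6 header for a connection-oriented socket:
 * it makes room for any extension headers in @opt (reallocating the
 * headroom when needed), fills in traffic class, flow label, hop
 * limit and addresses, and sends the packet through the
 * NF_INET_LOCAL_OUT hook. Oversized packets that may not be
 * fragmented locally are bounced with EMSGSIZE instead.
 */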
156 int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
157 	     struct ipv6_txoptions *opt, int tclass)
158 {
159 	struct net *net = sock_net(sk);
160 	struct ipv6_pinfo *np = inet6_sk(sk);
161 	struct in6_addr *first_hop = &fl6->daddr;
162 	struct dst_entry *dst = skb_dst(skb);
163 	struct ipv6hdr *hdr;
164 	u8  proto = fl6->flowi6_proto;
165 	int seg_len = skb->len;
166 	int hlimit = -1;
167 	u32 mtu;
168 
169 	if (opt) {
170 		unsigned int head_room;
171 
172 		/* First: extension headers may take lots of space (~8K for now),
173 		   so MAX_HEADER is not enough.
174 		 */
175 		head_room = opt->opt_nflen + opt->opt_flen;
176 		seg_len += head_room;
177 		head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
178 
179 		if (skb_headroom(skb) < head_room) {
180 			struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
181 			if (!skb2) {
182 				IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
183 					      IPSTATS_MIB_OUTDISCARDS);
184 				kfree_skb(skb);
185 				return -ENOBUFS;
186 			}
187 			consume_skb(skb);
188 			skb = skb2;
189 			skb_set_owner_w(skb, sk);
190 		}
191 		if (opt->opt_flen)
192 			ipv6_push_frag_opts(skb, opt, &proto);
193 		if (opt->opt_nflen)
194 			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
195 	}
196 
197 	skb_push(skb, sizeof(struct ipv6hdr));
198 	skb_reset_network_header(skb);
199 	hdr = ipv6_hdr(skb);
200 
201 	/*
202 	 *	Fill in the IPv6 header
203 	 */
204 	if (np)
205 		hlimit = np->hop_limit;
206 	if (hlimit < 0)
207 		hlimit = ip6_dst_hoplimit(dst);
208 
209 	ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
210 						     np->autoflowlabel, fl6));
211 
212 	hdr->payload_len = htons(seg_len);
213 	hdr->nexthdr = proto;
214 	hdr->hop_limit = hlimit;
215 
216 	hdr->saddr = fl6->saddr;
217 	hdr->daddr = *first_hop;
218 
219 	skb->protocol = htons(ETH_P_IPV6);
220 	skb->priority = sk->sk_priority;
221 	skb->mark = sk->sk_mark;
222 
223 	mtu = dst_mtu(dst);
224 	if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
225 		IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
226 			      IPSTATS_MIB_OUT, skb->len);
227 		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, sk, skb,
228 			       NULL, dst->dev, dst_output_sk);
229 	}
230 
231 	skb->dev = dst->dev;
232 	ipv6_local_error(sk, EMSGSIZE, fl6, mtu);
233 	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
234 	kfree_skb(skb);
235 	return -EMSGSIZE;
236 }
237 EXPORT_SYMBOL(ip6_xmit);
238 
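/*
 * Deliver a forwarded packet that carries a Router Alert option to
 * every raw socket registered on ip6_ra_chain with a matching
 * selector and a compatible bound device. The skb is cloned for all
 * listeners but the last, which consumes the original; the return
 * value tells ip6_forward whether the packet was taken.
 */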
239 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
240 {
241 	struct ip6_ra_chain *ra;
242 	struct sock *last = NULL;
243 
244 	read_lock(&ip6_ra_lock);
245 	for (ra = ip6_ra_chain; ra; ra = ra->next) {
246 		struct sock *sk = ra->sk;
247 		if (sk && ra->sel == sel &&
248 		    (!sk->sk_bound_dev_if ||
249 		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
250 			if (last) {
251 				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
252 				if (skb2)
253 					rawv6_rcv(last, skb2);
254 			}
255 			last = sk;
256 		}
257 	}
258 
259 	if (last) {
260 		rawv6_rcv(last, skb);
261 		read_unlock(&ip6_ra_lock);
262 		return 1;
263 	}
264 	read_unlock(&ip6_ra_lock);
265 	return 0;
266 }
267 
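/*
 * For a destination covered by proxy NDP, decide what to do with an
 * incoming packet: unicast neighbour discovery messages go to the
 * local input path (return 1), packets for link-local destinations
 * are refused (return -1), and anything else may be forwarded
 * (return 0).
 */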
268 static int ip6_forward_proxy_check(struct sk_buff *skb)
269 {
270 	struct ipv6hdr *hdr = ipv6_hdr(skb);
271 	u8 nexthdr = hdr->nexthdr;
272 	__be16 frag_off;
273 	int offset;
274 
275 	if (ipv6_ext_hdr(nexthdr)) {
276 		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
277 		if (offset < 0)
278 			return 0;
279 	} else
280 		offset = sizeof(struct ipv6hdr);
281 
282 	if (nexthdr == IPPROTO_ICMPV6) {
283 		struct icmp6hdr *icmp6;
284 
285 		if (!pskb_may_pull(skb, (skb_network_header(skb) +
286 					 offset + 1 - skb->data)))
287 			return 0;
288 
289 		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
290 
291 		switch (icmp6->icmp6_type) {
292 		case NDISC_ROUTER_SOLICITATION:
293 		case NDISC_ROUTER_ADVERTISEMENT:
294 		case NDISC_NEIGHBOUR_SOLICITATION:
295 		case NDISC_NEIGHBOUR_ADVERTISEMENT:
296 		case NDISC_REDIRECT:
297 			/* For a unicast neighbor discovery message
298 			 * destined to the proxied address, pass it to
299 			 * the input function.
300 			 */
301 			return 1;
302 		default:
303 			break;
304 		}
305 	}
306 
307 	/*
308 	 * The proxying router can't forward traffic sent to a link-local
309 	 * address, so signal the sender and discard the packet. This
310 	 * behavior is clarified by the MIPv6 specification.
311 	 */
312 	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
313 		dst_link_failure(skb);
314 		return -1;
315 	}
316 
317 	return 0;
318 }
319 
320 static inline int ip6_forward_finish(struct sock *sk, struct sk_buff *skb)
321 {
322 	skb_sender_cpu_clear(skb);
323 	return dst_output_sk(sk, skb);
324 }
325 
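/*
 * MTU used for forwarding decisions: a locked RTAX_MTU route metric
 * wins; otherwise use the egress device's IPv6 MTU (idev->cnf.mtu6),
 * falling back to IPV6_MIN_MTU (1280) when no inet6_dev exists.
 */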
326 static unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst)
327 {
328 	unsigned int mtu;
329 	struct inet6_dev *idev;
330 
331 	if (dst_metric_locked(dst, RTAX_MTU)) {
332 		mtu = dst_metric_raw(dst, RTAX_MTU);
333 		if (mtu)
334 			return mtu;
335 	}
336 
337 	mtu = IPV6_MIN_MTU;
338 	rcu_read_lock();
339 	idev = __in6_dev_get(dst->dev);
340 	if (idev)
341 		mtu = idev->cnf.mtu6;
342 	rcu_read_unlock();
343 
344 	return mtu;
345 }
346 
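/*
 * A forwarded packet is "too big" when it exceeds the path MTU. A
 * recorded frag_max_size above the MTU makes it too big even with
 * ignore_df set; otherwise ignore_df, or a GSO packet whose segments
 * individually fit, lets it through.
 */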
347 static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
348 {
349 	if (skb->len <= mtu)
350 		return false;
351 
352 	/* ipv6 conntrack defrag sets max_frag_size + ignore_df */
353 	if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
354 		return true;
355 
356 	if (skb->ignore_df)
357 		return false;
358 
359 	if (skb_is_gso(skb) && skb_gso_network_seglen(skb) <= mtu)
360 		return false;
361 
362 	return true;
363 }
364 
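/*
 * The forwarding path proper: check that forwarding is enabled and
 * the packet is eligible, hand Router Alert packets to interested raw
 * sockets, enforce the hop limit (ICMPv6 Time Exceeded) and the path
 * MTU (ICMPv6 Packet Too Big), honour proxy NDP, emit a redirect when
 * the packet leaves through the interface it arrived on, and finally
 * decrement hop_limit and run the NF_INET_FORWARD hook.
 */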
365 int ip6_forward(struct sk_buff *skb)
366 {
367 	struct dst_entry *dst = skb_dst(skb);
368 	struct ipv6hdr *hdr = ipv6_hdr(skb);
369 	struct inet6_skb_parm *opt = IP6CB(skb);
370 	struct net *net = dev_net(dst->dev);
371 	u32 mtu;
372 
373 	if (net->ipv6.devconf_all->forwarding == 0)
374 		goto error;
375 
376 	if (skb->pkt_type != PACKET_HOST)
377 		goto drop;
378 
379 	if (unlikely(skb->sk))
380 		goto drop;
381 
382 	if (skb_warn_if_lro(skb))
383 		goto drop;
384 
385 	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
386 		IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
387 				 IPSTATS_MIB_INDISCARDS);
388 		goto drop;
389 	}
390 
391 	skb_forward_csum(skb);
392 
393 	/*
394 	 *	We DO NOT do any processing on
395 	 *	RA packets; we push them to user level AS IS,
396 	 *	without any warranty that the application will be able
397 	 *	to interpret them. The reason is that we
398 	 *	cannot do anything clever here.
399 	 *
400 	 *	We are not the end node, so if the packet contains
401 	 *	AH/ESP we cannot do anything with it.
402 	 *	Defragmentation would also be a mistake; RA packets
403 	 *	cannot be fragmented, because there is no warranty
404 	 *	that different fragments will go along one path. --ANK
405 	 */
406 	if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
407 		if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
408 			return 0;
409 	}
410 
411 	/*
412 	 *	check and decrement the hop limit
413 	 */
414 	if (hdr->hop_limit <= 1) {
415 		/* Force the OUTPUT device to be used for source address selection */
416 		skb->dev = dst->dev;
417 		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
418 		IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
419 				 IPSTATS_MIB_INHDRERRORS);
420 
421 		kfree_skb(skb);
422 		return -ETIMEDOUT;
423 	}
424 
425 	/* XXX: idev->cnf.proxy_ndp? */
426 	if (net->ipv6.devconf_all->proxy_ndp &&
427 	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
428 		int proxied = ip6_forward_proxy_check(skb);
429 		if (proxied > 0)
430 			return ip6_input(skb);
431 		else if (proxied < 0) {
432 			IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
433 					 IPSTATS_MIB_INDISCARDS);
434 			goto drop;
435 		}
436 	}
437 
438 	if (!xfrm6_route_forward(skb)) {
439 		IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
440 				 IPSTATS_MIB_INDISCARDS);
441 		goto drop;
442 	}
443 	dst = skb_dst(skb);
444 
445 	/* IPv6 specs say nothing about it, but it is clear that we cannot
446 	   send redirects to source-routed frames.
447 	   We don't send redirects to frames decapsulated from IPsec.
448 	 */
449 	if (skb->dev == dst->dev && opt->srcrt == 0 && !skb_sec_path(skb)) {
450 		struct in6_addr *target = NULL;
451 		struct inet_peer *peer;
452 		struct rt6_info *rt;
453 
454 		/*
455 		 *	The incoming and outgoing devices are the same,
456 		 *	so send a redirect.
457 		 */
458 
459 		rt = (struct rt6_info *) dst;
460 		if (rt->rt6i_flags & RTF_GATEWAY)
461 			target = &rt->rt6i_gateway;
462 		else
463 			target = &hdr->daddr;
464 
465 		peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);
466 
467 		/* Limit redirects both by destination (here)
468 		   and by source (inside ndisc_send_redirect)
469 		 */
470 		if (inet_peer_xrlim_allow(peer, 1*HZ))
471 			ndisc_send_redirect(skb, target);
472 		if (peer)
473 			inet_putpeer(peer);
474 	} else {
475 		int addrtype = ipv6_addr_type(&hdr->saddr);
476 
477 		/* This check is security critical. */
478 		if (addrtype == IPV6_ADDR_ANY ||
479 		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
480 			goto error;
481 		if (addrtype & IPV6_ADDR_LINKLOCAL) {
482 			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
483 				    ICMPV6_NOT_NEIGHBOUR, 0);
484 			goto error;
485 		}
486 	}
487 
488 	mtu = ip6_dst_mtu_forward(dst);
489 	if (mtu < IPV6_MIN_MTU)
490 		mtu = IPV6_MIN_MTU;
491 
492 	if (ip6_pkt_too_big(skb, mtu)) {
493 		/* Again, force the OUTPUT device to be used for source address selection */
494 		skb->dev = dst->dev;
495 		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
496 		IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
497 				 IPSTATS_MIB_INTOOBIGERRORS);
498 		IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
499 				 IPSTATS_MIB_FRAGFAILS);
500 		kfree_skb(skb);
501 		return -EMSGSIZE;
502 	}
503 
504 	if (skb_cow(skb, dst->dev->hard_header_len)) {
505 		IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
506 				 IPSTATS_MIB_OUTDISCARDS);
507 		goto drop;
508 	}
509 
510 	hdr = ipv6_hdr(skb);
511 
512 	/* Decrementing the hop limit is delayed until after the skb COW */
513 
514 	hdr->hop_limit--;
515 
516 	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
517 	IP6_ADD_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
518 	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, NULL, skb,
519 		       skb->dev, dst->dev,
520 		       ip6_forward_finish);
521 
522 error:
523 	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
524 drop:
525 	kfree_skb(skb);
526 	return -EINVAL;
527 }
528 
529 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
530 {
531 	to->pkt_type = from->pkt_type;
532 	to->priority = from->priority;
533 	to->protocol = from->protocol;
534 	skb_dst_drop(to);
535 	skb_dst_set(to, dst_clone(skb_dst(from)));
536 	to->dev = from->dev;
537 	to->mark = from->mark;
538 
539 #ifdef CONFIG_NET_SCHED
540 	to->tc_index = from->tc_index;
541 #endif
542 	nf_copy(to, from);
543 	skb_copy_secmark(to, from);
544 }
545 
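/*
 * ip6_fragment implements RFC 2460 fragmentation. The fast path
 * reuses an existing frag list (each frag already shaped like a
 * fragment) and only prepends headers; the slow path allocates a
 * fresh skb per fragment and copies the payload into it. Every
 * fragment but the last carries a payload that is a multiple of
 * 8 octets, since the fragment offset field counts 8-octet units.
 * For example, with a 1500-byte MTU and a plain 40-byte IPv6 header,
 * 1452 bytes remain after the 8-byte fragment header, rounded down
 * to 1448 usable bytes per non-final fragment.
 */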
546 int ip6_fragment(struct sock *sk, struct sk_buff *skb,
547 		 int (*output)(struct sock *, struct sk_buff *))
548 {
549 	struct sk_buff *frag;
550 	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
551 	struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
552 				inet6_sk(skb->sk) : NULL;
553 	struct ipv6hdr *tmp_hdr;
554 	struct frag_hdr *fh;
555 	unsigned int mtu, hlen, left, len;
556 	int hroom, troom;
557 	__be32 frag_id;
558 	int ptr, offset = 0, err = 0;
559 	u8 *prevhdr, nexthdr = 0;
560 	struct net *net = dev_net(skb_dst(skb)->dev);
561 
562 	hlen = ip6_find_1stfragopt(skb, &prevhdr);
563 	nexthdr = *prevhdr;
564 
565 	mtu = ip6_skb_dst_mtu(skb);
566 
567 	/* We must not fragment if the socket is set to force MTU discovery
568 	 * or if the skb is not generated by a local socket.
569 	 */
570 	if (unlikely(!skb->ignore_df && skb->len > mtu))
571 		goto fail_toobig;
572 
573 	if (IP6CB(skb)->frag_max_size) {
574 		if (IP6CB(skb)->frag_max_size > mtu)
575 			goto fail_toobig;
576 
577 		/* don't send fragments larger than what we received */
578 		mtu = IP6CB(skb)->frag_max_size;
579 		if (mtu < IPV6_MIN_MTU)
580 			mtu = IPV6_MIN_MTU;
581 	}
582 
583 	if (np && np->frag_size < mtu) {
584 		if (np->frag_size)
585 			mtu = np->frag_size;
586 	}
587 	mtu -= hlen + sizeof(struct frag_hdr);
588 
589 	frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
590 				    &ipv6_hdr(skb)->saddr);
591 
592 	hroom = LL_RESERVED_SPACE(rt->dst.dev);
593 	if (skb_has_frag_list(skb)) {
594 		int first_len = skb_pagelen(skb);
595 		struct sk_buff *frag2;
596 
597 		if (first_len - hlen > mtu ||
598 		    ((first_len - hlen) & 7) ||
599 		    skb_cloned(skb) ||
600 		    skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
601 			goto slow_path;
602 
603 		skb_walk_frags(skb, frag) {
604 			/* Correct geometry. */
605 			if (frag->len > mtu ||
606 			    ((frag->len & 7) && frag->next) ||
607 			    skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
608 				goto slow_path_clean;
609 
610 			/* Partially cloned skb? */
611 			if (skb_shared(frag))
612 				goto slow_path_clean;
613 
614 			BUG_ON(frag->sk);
615 			if (skb->sk) {
616 				frag->sk = skb->sk;
617 				frag->destructor = sock_wfree;
618 			}
619 			skb->truesize -= frag->truesize;
620 		}
621 
622 		err = 0;
623 		offset = 0;
624 		/* BUILD HEADER */
625 
626 		*prevhdr = NEXTHDR_FRAGMENT;
627 		tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
628 		if (!tmp_hdr) {
629 			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
630 				      IPSTATS_MIB_FRAGFAILS);
631 			err = -ENOMEM;
632 			goto fail;
633 		}
634 		frag = skb_shinfo(skb)->frag_list;
635 		skb_frag_list_init(skb);
636 
637 		__skb_pull(skb, hlen);
638 		fh = (struct frag_hdr *)__skb_push(skb, sizeof(struct frag_hdr));
639 		__skb_push(skb, hlen);
640 		skb_reset_network_header(skb);
641 		memcpy(skb_network_header(skb), tmp_hdr, hlen);
642 
643 		fh->nexthdr = nexthdr;
644 		fh->reserved = 0;
645 		fh->frag_off = htons(IP6_MF);
646 		fh->identification = frag_id;
647 
648 		first_len = skb_pagelen(skb);
649 		skb->data_len = first_len - skb_headlen(skb);
650 		skb->len = first_len;
651 		ipv6_hdr(skb)->payload_len = htons(first_len -
652 						   sizeof(struct ipv6hdr));
653 
654 		dst_hold(&rt->dst);
655 
656 		for (;;) {
657 			/* Prepare the header of the next fragment
658 			 * before the previous one goes down. */
659 			if (frag) {
660 				frag->ip_summed = CHECKSUM_NONE;
661 				skb_reset_transport_header(frag);
662 				fh = (struct frag_hdr *)__skb_push(frag, sizeof(struct frag_hdr));
663 				__skb_push(frag, hlen);
664 				skb_reset_network_header(frag);
665 				memcpy(skb_network_header(frag), tmp_hdr,
666 				       hlen);
667 				offset += skb->len - hlen - sizeof(struct frag_hdr);
668 				fh->nexthdr = nexthdr;
669 				fh->reserved = 0;
670 				fh->frag_off = htons(offset);
671 				if (frag->next)
672 					fh->frag_off |= htons(IP6_MF);
673 				fh->identification = frag_id;
674 				ipv6_hdr(frag)->payload_len =
675 						htons(frag->len -
676 						      sizeof(struct ipv6hdr));
677 				ip6_copy_metadata(frag, skb);
678 			}
679 
680 			err = output(sk, skb);
681 			if (!err)
682 				IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
683 					      IPSTATS_MIB_FRAGCREATES);
684 
685 			if (err || !frag)
686 				break;
687 
688 			skb = frag;
689 			frag = skb->next;
690 			skb->next = NULL;
691 		}
692 
693 		kfree(tmp_hdr);
694 
695 		if (err == 0) {
696 			IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
697 				      IPSTATS_MIB_FRAGOKS);
698 			ip6_rt_put(rt);
699 			return 0;
700 		}
701 
702 		kfree_skb_list(frag);
703 
704 		IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
705 			      IPSTATS_MIB_FRAGFAILS);
706 		ip6_rt_put(rt);
707 		return err;
708 
709 slow_path_clean:
710 		skb_walk_frags(skb, frag2) {
711 			if (frag2 == frag)
712 				break;
713 			frag2->sk = NULL;
714 			frag2->destructor = NULL;
715 			skb->truesize += frag2->truesize;
716 		}
717 	}
718 
719 slow_path:
720 	if ((skb->ip_summed == CHECKSUM_PARTIAL) &&
721 	    skb_checksum_help(skb))
722 		goto fail;
723 
724 	left = skb->len - hlen;		/* Space per frame */
725 	ptr = hlen;			/* Where to start from */
726 
727 	/*
728 	 *	Fragment the datagram.
729 	 */
730 
731 	*prevhdr = NEXTHDR_FRAGMENT;
732 	troom = rt->dst.dev->needed_tailroom;
733 
734 	/*
735 	 *	Keep copying data until we run out.
736 	 */
737 	while (left > 0)	{
738 		len = left;
739 		/* IF: it doesn't fit, use 'mtu' - the data space left */
740 		if (len > mtu)
741 			len = mtu;
742 		/* IF: we are not sending up to and including the end of the
743 		   packet, then align the next start on an eight-byte boundary */
744 		if (len < left)	{
745 			len &= ~7;
746 		}
747 
748 		/* Allocate buffer */
749 		frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
750 				 hroom + troom, GFP_ATOMIC);
751 		if (!frag) {
752 			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
753 				      IPSTATS_MIB_FRAGFAILS);
754 			err = -ENOMEM;
755 			goto fail;
756 		}
757 
758 		/*
759 		 *	Set up data on packet
760 		 */
761 
762 		ip6_copy_metadata(frag, skb);
763 		skb_reserve(frag, hroom);
764 		skb_put(frag, len + hlen + sizeof(struct frag_hdr));
765 		skb_reset_network_header(frag);
766 		fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
767 		frag->transport_header = (frag->network_header + hlen +
768 					  sizeof(struct frag_hdr));
769 
770 		/*
771 		 *	Charge the memory for the fragment to any owner
772 		 *	it might possess
773 		 */
774 		if (skb->sk)
775 			skb_set_owner_w(frag, skb->sk);
776 
777 		/*
778 		 *	Copy the packet header into the new buffer.
779 		 */
780 		skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
781 
782 		/*
783 		 *	Build fragment header.
784 		 */
785 		fh->nexthdr = nexthdr;
786 		fh->reserved = 0;
787 		fh->identification = frag_id;
788 
789 		/*
790 		 *	Copy a block of the IP datagram.
791 		 */
792 		BUG_ON(skb_copy_bits(skb, ptr, skb_transport_header(frag),
793 				     len));
794 		left -= len;
795 
796 		fh->frag_off = htons(offset);
797 		if (left > 0)
798 			fh->frag_off |= htons(IP6_MF);
799 		ipv6_hdr(frag)->payload_len = htons(frag->len -
800 						    sizeof(struct ipv6hdr));
801 
802 		ptr += len;
803 		offset += len;
804 
805 		/*
806 		 *	Put this fragment into the sending queue.
807 		 */
808 		err = output(sk, frag);
809 		if (err)
810 			goto fail;
811 
812 		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
813 			      IPSTATS_MIB_FRAGCREATES);
814 	}
815 	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
816 		      IPSTATS_MIB_FRAGOKS);
817 	consume_skb(skb);
818 	return err;
819 
820 fail_toobig:
821 	if (skb->sk && dst_allfrag(skb_dst(skb)))
822 		sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);
823 
824 	skb->dev = skb_dst(skb)->dev;
825 	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
826 	err = -EMSGSIZE;
827 
828 fail:
829 	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
830 		      IPSTATS_MIB_FRAGFAILS);
831 	kfree_skb(skb);
832 	return err;
833 }
834 
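/*
 * Nonzero when the cached route can no longer be trusted for
 * @fl_addr: it is not a host route for exactly that address, and the
 * socket's cached peer address (if any) does not match either.
 */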
835 static inline int ip6_rt_check(const struct rt6key *rt_key,
836 			       const struct in6_addr *fl_addr,
837 			       const struct in6_addr *addr_cache)
838 {
839 	return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
840 		(!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
841 }
842 
843 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
844 					  struct dst_entry *dst,
845 					  const struct flowi6 *fl6)
846 {
847 	struct ipv6_pinfo *np = inet6_sk(sk);
848 	struct rt6_info *rt;
849 
850 	if (!dst)
851 		goto out;
852 
853 	if (dst->ops->family != AF_INET6) {
854 		dst_release(dst);
855 		return NULL;
856 	}
857 
858 	rt = (struct rt6_info *)dst;
859 	/* Yes, checking route validity in the not-connected
860 	 * case is not very simple. Take into account
861 	 * that we do not support routing by source, TOS,
862 	 * or MSG_DONTROUTE		--ANK (980726)
863 	 *
864 	 * 1. ip6_rt_check(): If the route was a host route,
865 	 *    check that the cached destination is current.
866 	 *    If it is a network route, we still may
867 	 *    check its validity using a saved pointer
868 	 *    to the last used address: daddr_cache.
869 	 *    We do not want to save the whole address now
870 	 *    (because the main consumer of this service
871 	 *    is TCP, which does not have this problem),
872 	 *    so the last trick works only on connected
873 	 *    sockets.
874 	 * 2. The oif should also be the same.
875 	 */
876 	if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
877 #ifdef CONFIG_IPV6_SUBTREES
878 	    ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
879 #endif
880 	   (!(fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) &&
881 	      (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex))) {
882 		dst_release(dst);
883 		dst = NULL;
884 	}
885 
886 out:
887 	return dst;
888 }
889 
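/*
 * Common tail of the dst lookup helpers below: perform the route
 * lookup, select a source address for flows that still have saddr
 * set to any (retrying the lookup once the source is known, which
 * matters for source-specific routes), and, with optimistic DAD,
 * reroute via the default router while the chosen source address is
 * still OPTIMISTIC toward that nexthop.
 */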
890 static int ip6_dst_lookup_tail(struct net *net, struct sock *sk,
891 			       struct dst_entry **dst, struct flowi6 *fl6)
892 {
893 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
894 	struct neighbour *n;
895 	struct rt6_info *rt;
896 #endif
897 	int err;
898 
899 	/* The correct way to handle this would be to do
900 	 * ip6_route_get_saddr, and then ip6_route_output; however,
901 	 * the route-specific preferred source forces the
902 	 * ip6_route_output call _before_ ip6_route_get_saddr.
903 	 *
904 	 * In source specific routing (no src=any default route),
905 	 * ip6_route_output will fail given a src=any saddr, so
906 	 * we try it again later, once the source is known.
907 	 */
908 	if (ipv6_addr_any(&fl6->saddr) && (!*dst || !(*dst)->error)) {
909 		struct rt6_info *rt;
910 		bool had_dst = *dst != NULL;
911 
912 		if (!had_dst)
913 			*dst = ip6_route_output(net, sk, fl6);
914 		rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;
915 		err = ip6_route_get_saddr(net, rt, &fl6->daddr,
916 					  sk ? inet6_sk(sk)->srcprefs : 0,
917 					  &fl6->saddr);
918 		if (err)
919 			goto out_err_release;
920 
921 		/* If we had an erroneous initial result, pretend it
922 		 * never existed and let the SA-enabled version take
923 		 * over.
924 		 */
925 		if (!had_dst && (*dst)->error) {
926 			dst_release(*dst);
927 			*dst = NULL;
928 		}
929 	}
930 
931 	if (!*dst)
932 		*dst = ip6_route_output(net, sk, fl6);
933 
934 	err = (*dst)->error;
935 	if (err)
936 		goto out_err_release;
937 
938 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
939 	/*
940 	 * Here if the dst entry we've looked up
941 	 * has a neighbour entry that is in the INCOMPLETE
942 	 * state and the src address from the flow is
943 	 * marked as OPTIMISTIC, we release the found
944 	 * dst entry and replace it with the dst
945 	 * entry of the nexthop router.
946 	 */
947 	rt = (struct rt6_info *) *dst;
948 	rcu_read_lock_bh();
949 	n = __ipv6_neigh_lookup_noref(rt->dst.dev,
950 				      rt6_nexthop(rt, &fl6->daddr));
951 	err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
952 	rcu_read_unlock_bh();
953 
954 	if (err) {
955 		struct inet6_ifaddr *ifp;
956 		struct flowi6 fl_gw6;
957 		int redirect;
958 
959 		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
960 				      (*dst)->dev, 1);
961 
962 		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
963 		if (ifp)
964 			in6_ifa_put(ifp);
965 
966 		if (redirect) {
967 			/*
968 			 * We need to get the dst entry for the
969 			 * default router instead
970 			 */
971 			dst_release(*dst);
972 			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
973 			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
974 			*dst = ip6_route_output(net, sk, &fl_gw6);
975 			err = (*dst)->error;
976 			if (err)
977 				goto out_err_release;
978 		}
979 	}
980 #endif
981 
982 	return 0;
983 
984 out_err_release:
985 	if (err == -ENETUNREACH)
986 		IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
987 	dst_release(*dst);
988 	*dst = NULL;
989 	return err;
990 }
991 
992 /**
993  *	ip6_dst_lookup - perform route lookup on flow
994  *	@sk: socket which provides route info
995  *	@dst: pointer to dst_entry * for result
996  *	@fl6: flow to lookup
997  *
998  *	This function performs a route lookup on the given flow.
999  *
1000  *	It returns zero on success, or a standard errno code on error.
1001  */
1002 int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
1003 		   struct flowi6 *fl6)
1004 {
1005 	*dst = NULL;
1006 	return ip6_dst_lookup_tail(net, sk, dst, fl6);
1007 }
1008 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1009 
1010 /**
1011  *	ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1012  *	@sk: socket which provides route info
1013  *	@fl6: flow to lookup
1014  *	@final_dst: final destination address for ipsec lookup
1015  *
1016  *	This function performs a route lookup on the given flow.
1017  *
1018  *	It returns a valid dst pointer on success, or a pointer encoded
1019  *	error code.
1020  */
1021 struct dst_entry *ip6_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1022 				      const struct in6_addr *final_dst)
1023 {
1024 	struct dst_entry *dst = NULL;
1025 	int err;
1026 
1027 	err = ip6_dst_lookup_tail(sock_net(sk), sk, &dst, fl6);
1028 	if (err)
1029 		return ERR_PTR(err);
1030 	if (final_dst)
1031 		fl6->daddr = *final_dst;
1032 	if (!fl6->flowi6_oif)
1033 		fl6->flowi6_oif = dst->dev->ifindex;
1034 
1035 	return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1036 }
1037 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
1038 
1039 /**
1040  *	ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1041  *	@sk: socket which provides the dst cache and route info
1042  *	@fl6: flow to lookup
1043  *	@final_dst: final destination address for ipsec lookup
1044  *
1045  *	This function performs a route lookup on the given flow with the
1046  *	possibility of using the cached route in the socket if it is valid.
1047  *	It will take the socket dst lock when operating on the dst cache.
1048  *	As a result, this function can only be used in process context.
1049  *
1050  *	It returns a valid dst pointer on success, or a pointer encoded
1051  *	error code.
1052  */
1053 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1054 					 const struct in6_addr *final_dst)
1055 {
1056 	struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1057 	int err;
1058 
1059 	dst = ip6_sk_dst_check(sk, dst, fl6);
1060 
1061 	err = ip6_dst_lookup_tail(sock_net(sk), sk, &dst, fl6);
1062 	if (err)
1063 		return ERR_PTR(err);
1064 	if (final_dst)
1065 		fl6->daddr = *final_dst;
1066 
1067 	return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1068 }
1069 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1070 
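/*
 * UFO path of ip6_append_data: rather than fragmenting in software,
 * build (or extend) one large skb and set gso_size to the usable
 * fragment payload, rounded down to a multiple of 8, so the device
 * or the GSO layer later splits it into valid fragments.
 */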
1071 static inline int ip6_ufo_append_data(struct sock *sk,
1072 			struct sk_buff_head *queue,
1073 			int getfrag(void *from, char *to, int offset, int len,
1074 			int odd, struct sk_buff *skb),
1075 			void *from, int length, int hh_len, int fragheaderlen,
1076 			int transhdrlen, int mtu, unsigned int flags,
1077 			const struct flowi6 *fl6)
1078 
1079 {
1080 	struct sk_buff *skb;
1081 	int err;
1082 
1083 	/* The network device supports UDP large send offload, so
1084 	 * create one single skb containing the complete
1085 	 * UDP datagram.
1086 	 */
1087 	skb = skb_peek_tail(queue);
1088 	if (!skb) {
1089 		skb = sock_alloc_send_skb(sk,
1090 			hh_len + fragheaderlen + transhdrlen + 20,
1091 			(flags & MSG_DONTWAIT), &err);
1092 		if (!skb)
1093 			return err;
1094 
1095 		/* reserve space for Hardware header */
1096 		skb_reserve(skb, hh_len);
1097 
1098 		/* create space for UDP/IP header */
1099 		skb_put(skb, fragheaderlen + transhdrlen);
1100 
1101 		/* initialize network header pointer */
1102 		skb_reset_network_header(skb);
1103 
1104 		/* initialize protocol header pointer */
1105 		skb->transport_header = skb->network_header + fragheaderlen;
1106 
1107 		skb->protocol = htons(ETH_P_IPV6);
1108 		skb->csum = 0;
1109 
1110 		__skb_queue_tail(queue, skb);
1111 	} else if (skb_is_gso(skb)) {
1112 		goto append;
1113 	}
1114 
1115 	skb->ip_summed = CHECKSUM_PARTIAL;
1116 	/* Specify the length of each IPv6 datagram fragment.
1117 	 * It has to be a multiple of 8.
1118 	 */
1119 	skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
1120 				     sizeof(struct frag_hdr)) & ~7;
1121 	skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
1122 	skb_shinfo(skb)->ip6_frag_id = ipv6_select_ident(sock_net(sk),
1123 							 &fl6->daddr,
1124 							 &fl6->saddr);
1125 
1126 append:
1127 	return skb_append_datato_frags(sk, skb, getfrag, from,
1128 				       (length - transhdrlen));
1129 }
1130 
1131 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1132 					       gfp_t gfp)
1133 {
1134 	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1135 }
1136 
1137 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1138 						gfp_t gfp)
1139 {
1140 	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1141 }
1142 
1143 static void ip6_append_data_mtu(unsigned int *mtu,
1144 				int *maxfraglen,
1145 				unsigned int fragheaderlen,
1146 				struct sk_buff *skb,
1147 				struct rt6_info *rt,
1148 				unsigned int orig_mtu)
1149 {
1150 	if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1151 		if (!skb) {
1152 			/* first fragment, reserve header_len */
1153 			*mtu = orig_mtu - rt->dst.header_len;
1154 
1155 		} else {
1156 			/*
1157 			 * this fragment is not the first; the header
1158 			 * space is regarded as data space.
1159 			 */
1160 			*mtu = orig_mtu;
1161 		}
1162 		*maxfraglen = ((*mtu - fragheaderlen) & ~7)
1163 			      + fragheaderlen - sizeof(struct frag_hdr);
1164 	}
1165 }
1166 
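/*
 * Prepare the per-socket cork state for ip6_append_data: duplicate
 * the tx options (they must outlive the caller while the socket stays
 * corked), pin the route, and record the hop limit, traffic class and
 * the MTU that later fragmentation decisions are based on, capped by
 * np->frag_size (the IPV6_MTU socket option) when that is smaller.
 */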
1167 static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
1168 			  struct inet6_cork *v6_cork,
1169 			  int hlimit, int tclass, struct ipv6_txoptions *opt,
1170 			  struct rt6_info *rt, struct flowi6 *fl6)
1171 {
1172 	struct ipv6_pinfo *np = inet6_sk(sk);
1173 	unsigned int mtu;
1174 
1175 	/*
1176 	 * setup for corking
1177 	 */
1178 	if (opt) {
1179 		if (WARN_ON(v6_cork->opt))
1180 			return -EINVAL;
1181 
1182 		v6_cork->opt = kzalloc(opt->tot_len, sk->sk_allocation);
1183 		if (unlikely(!v6_cork->opt))
1184 			return -ENOBUFS;
1185 
1186 		v6_cork->opt->tot_len = opt->tot_len;
1187 		v6_cork->opt->opt_flen = opt->opt_flen;
1188 		v6_cork->opt->opt_nflen = opt->opt_nflen;
1189 
1190 		v6_cork->opt->dst0opt = ip6_opt_dup(opt->dst0opt,
1191 						    sk->sk_allocation);
1192 		if (opt->dst0opt && !v6_cork->opt->dst0opt)
1193 			return -ENOBUFS;
1194 
1195 		v6_cork->opt->dst1opt = ip6_opt_dup(opt->dst1opt,
1196 						    sk->sk_allocation);
1197 		if (opt->dst1opt && !v6_cork->opt->dst1opt)
1198 			return -ENOBUFS;
1199 
1200 		v6_cork->opt->hopopt = ip6_opt_dup(opt->hopopt,
1201 						   sk->sk_allocation);
1202 		if (opt->hopopt && !v6_cork->opt->hopopt)
1203 			return -ENOBUFS;
1204 
1205 		v6_cork->opt->srcrt = ip6_rthdr_dup(opt->srcrt,
1206 						    sk->sk_allocation);
1207 		if (opt->srcrt && !v6_cork->opt->srcrt)
1208 			return -ENOBUFS;
1209 
1210 		/* need source address above miyazawa */
1211 	}
1212 	dst_hold(&rt->dst);
1213 	cork->base.dst = &rt->dst;
1214 	cork->fl.u.ip6 = *fl6;
1215 	v6_cork->hop_limit = hlimit;
1216 	v6_cork->tclass = tclass;
1217 	if (rt->dst.flags & DST_XFRM_TUNNEL)
1218 		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1219 		      rt->dst.dev->mtu : dst_mtu(&rt->dst);
1220 	else
1221 		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1222 		      rt->dst.dev->mtu : dst_mtu(rt->dst.path);
1223 	if (np->frag_size < mtu) {
1224 		if (np->frag_size)
1225 			mtu = np->frag_size;
1226 	}
1227 	cork->base.fragsize = mtu;
1228 	if (dst_allfrag(rt->dst.path))
1229 		cork->base.flags |= IPCORK_ALLFRAG;
1230 	cork->base.length = 0;
1231 
1232 	return 0;
1233 }
1234 
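/*
 * The core of ip6_append_data: grow the queue of pending skbs so
 * that each one, once headers are added, fits into a fragment-sized
 * slice. Data is copied either into the skb head or, when the device
 * does scatter/gather (NETIF_F_SG), into shared page fragments. UDP
 * sockets on UFO-capable devices short-circuit into
 * ip6_ufo_append_data and let the device segment one large skb.
 */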
1235 static int __ip6_append_data(struct sock *sk,
1236 			     struct flowi6 *fl6,
1237 			     struct sk_buff_head *queue,
1238 			     struct inet_cork *cork,
1239 			     struct inet6_cork *v6_cork,
1240 			     struct page_frag *pfrag,
1241 			     int getfrag(void *from, char *to, int offset,
1242 					 int len, int odd, struct sk_buff *skb),
1243 			     void *from, int length, int transhdrlen,
1244 			     unsigned int flags, int dontfrag)
1245 {
1246 	struct sk_buff *skb, *skb_prev = NULL;
1247 	unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu;
1248 	int exthdrlen = 0;
1249 	int dst_exthdrlen = 0;
1250 	int hh_len;
1251 	int copy;
1252 	int err;
1253 	int offset = 0;
1254 	__u8 tx_flags = 0;
1255 	u32 tskey = 0;
1256 	struct rt6_info *rt = (struct rt6_info *)cork->dst;
1257 	struct ipv6_txoptions *opt = v6_cork->opt;
1258 	int csummode = CHECKSUM_NONE;
1259 
1260 	skb = skb_peek_tail(queue);
1261 	if (!skb) {
1262 		exthdrlen = opt ? opt->opt_flen : 0;
1263 		dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
1264 	}
1265 
1266 	mtu = cork->fragsize;
1267 	orig_mtu = mtu;
1268 
1269 	hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1270 
1271 	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1272 			(opt ? opt->opt_nflen : 0);
1273 	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
1274 		     sizeof(struct frag_hdr);
1275 
1276 	if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
1277 		unsigned int maxnonfragsize, headersize;
1278 
1279 		headersize = sizeof(struct ipv6hdr) +
1280 			     (opt ? opt->opt_flen + opt->opt_nflen : 0) +
1281 			     (dst_allfrag(&rt->dst) ?
1282 			      sizeof(struct frag_hdr) : 0) +
1283 			     rt->rt6i_nfheader_len;
1284 
1285 		if (ip6_sk_ignore_df(sk))
1286 			maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
1287 		else
1288 			maxnonfragsize = mtu;
1289 
1290 		/* dontfrag active */
1291 		if ((cork->length + length > mtu - headersize) && dontfrag &&
1292 		    (sk->sk_protocol == IPPROTO_UDP ||
1293 		     sk->sk_protocol == IPPROTO_RAW)) {
1294 			ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
1295 						   sizeof(struct ipv6hdr));
1296 			goto emsgsize;
1297 		}
1298 
1299 		if (cork->length + length > maxnonfragsize - headersize) {
1300 emsgsize:
1301 			ipv6_local_error(sk, EMSGSIZE, fl6,
1302 					 mtu - headersize +
1303 					 sizeof(struct ipv6hdr));
1304 			return -EMSGSIZE;
1305 		}
1306 	}
1307 
1308 	if (sk->sk_type == SOCK_DGRAM || sk->sk_type == SOCK_RAW) {
1309 		sock_tx_timestamp(sk, &tx_flags);
1310 		if (tx_flags & SKBTX_ANY_SW_TSTAMP &&
1311 		    sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
1312 			tskey = sk->sk_tskey++;
1313 	}
1314 
1315 	/* If this is the first and only packet and device
1316 	 * supports checksum offloading, let's use it.
1317 	 * Use transhdrlen, same as IPv4, because partial
1318 	 * sums only work when transhdrlen is set.
1319 	 */
1320 	if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
1321 	    length + fragheaderlen < mtu &&
1322 	    rt->dst.dev->features & NETIF_F_V6_CSUM &&
1323 	    !exthdrlen)
1324 		csummode = CHECKSUM_PARTIAL;
1325 	/*
1326 	 * Let's try using as much space as possible.
1327 	 * Use MTU if total length of the message fits into the MTU.
1328 	 * Otherwise, we need to reserve fragment header and
1329 	 * Otherwise, we need to reserve the fragment header and
1330 	 * fragment alignment (= 8-15 octets, in total).
1331 	 *
1332 	 * Note that we may need to "move" the data from the tail
1333 	 * of the buffer to the new fragment when we split
1334 	 *
1335 	 * FIXME: It may be fragmented into multiple chunks
1336 	 *        at once if non-fragmentable extension headers
1337 	 *        are too large.
1338 	 * --yoshfuji
1339 	 */
1340 
1341 	cork->length += length;
1342 	if (((length > mtu) ||
1343 	     (skb && skb_is_gso(skb))) &&
1344 	    (sk->sk_protocol == IPPROTO_UDP) &&
1345 	    (rt->dst.dev->features & NETIF_F_UFO) &&
1346 	    (sk->sk_type == SOCK_DGRAM)) {
1347 		err = ip6_ufo_append_data(sk, queue, getfrag, from, length,
1348 					  hh_len, fragheaderlen,
1349 					  transhdrlen, mtu, flags, fl6);
1350 		if (err)
1351 			goto error;
1352 		return 0;
1353 	}
1354 
1355 	if (!skb)
1356 		goto alloc_new_skb;
1357 
1358 	while (length > 0) {
1359 		/* Check if the remaining data fits into current packet. */
1360 		copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1361 		if (copy < length)
1362 			copy = maxfraglen - skb->len;
1363 
1364 		if (copy <= 0) {
1365 			char *data;
1366 			unsigned int datalen;
1367 			unsigned int fraglen;
1368 			unsigned int fraggap;
1369 			unsigned int alloclen;
1370 alloc_new_skb:
1371 			/* There's no room in the current skb */
1372 			if (skb)
1373 				fraggap = skb->len - maxfraglen;
1374 			else
1375 				fraggap = 0;
1376 			/* update mtu and maxfraglen if necessary */
1377 			if (!skb || !skb_prev)
1378 				ip6_append_data_mtu(&mtu, &maxfraglen,
1379 						    fragheaderlen, skb, rt,
1380 						    orig_mtu);
1381 
1382 			skb_prev = skb;
1383 
1384 			/*
1385 			 * If remaining data exceeds the mtu,
1386 			 * we know we need more fragment(s).
1387 			 */
1388 			datalen = length + fraggap;
1389 
1390 			if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1391 				datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
1392 			if ((flags & MSG_MORE) &&
1393 			    !(rt->dst.dev->features&NETIF_F_SG))
1394 				alloclen = mtu;
1395 			else
1396 				alloclen = datalen + fragheaderlen;
1397 
1398 			alloclen += dst_exthdrlen;
1399 
1400 			if (datalen != length + fraggap) {
1401 				/*
1402 				 * this is not the last fragment; the trailer
1403 				 * space is regarded as data space.
1404 				 */
1405 				datalen += rt->dst.trailer_len;
1406 			}
1407 
1408 			alloclen += rt->dst.trailer_len;
1409 			fraglen = datalen + fragheaderlen;
1410 
1411 			/*
1412 			 * We just reserve space for the fragment header.
1413 			 * Note: this may be an overallocation if the message
1414 			 * (without MSG_MORE) fits into the MTU.
1415 			 */
1416 			alloclen += sizeof(struct frag_hdr);
1417 
1418 			if (transhdrlen) {
1419 				skb = sock_alloc_send_skb(sk,
1420 						alloclen + hh_len,
1421 						(flags & MSG_DONTWAIT), &err);
1422 			} else {
1423 				skb = NULL;
1424 				if (atomic_read(&sk->sk_wmem_alloc) <=
1425 				    2 * sk->sk_sndbuf)
1426 					skb = sock_wmalloc(sk,
1427 							   alloclen + hh_len, 1,
1428 							   sk->sk_allocation);
1429 				if (unlikely(!skb))
1430 					err = -ENOBUFS;
1431 			}
1432 			if (!skb)
1433 				goto error;
1434 			/*
1435 			 *	Fill in the control structures
1436 			 */
1437 			skb->protocol = htons(ETH_P_IPV6);
1438 			skb->ip_summed = csummode;
1439 			skb->csum = 0;
1440 			/* reserve for fragmentation and ipsec header */
1441 			skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1442 				    dst_exthdrlen);
1443 
1444 			/* Only the initial fragment is time stamped */
1445 			skb_shinfo(skb)->tx_flags = tx_flags;
1446 			tx_flags = 0;
1447 			skb_shinfo(skb)->tskey = tskey;
1448 			tskey = 0;
1449 
1450 			/*
1451 			 *	Find where to start putting bytes
1452 			 */
1453 			data = skb_put(skb, fraglen);
1454 			skb_set_network_header(skb, exthdrlen);
1455 			data += fragheaderlen;
1456 			skb->transport_header = (skb->network_header +
1457 						 fragheaderlen);
1458 			if (fraggap) {
1459 				skb->csum = skb_copy_and_csum_bits(
1460 					skb_prev, maxfraglen,
1461 					data + transhdrlen, fraggap, 0);
1462 				skb_prev->csum = csum_sub(skb_prev->csum,
1463 							  skb->csum);
1464 				data += fraggap;
1465 				pskb_trim_unique(skb_prev, maxfraglen);
1466 			}
1467 			copy = datalen - transhdrlen - fraggap;
1468 
1469 			if (copy < 0) {
1470 				err = -EINVAL;
1471 				kfree_skb(skb);
1472 				goto error;
1473 			} else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
1474 				err = -EFAULT;
1475 				kfree_skb(skb);
1476 				goto error;
1477 			}
1478 
1479 			offset += copy;
1480 			length -= datalen - fraggap;
1481 			transhdrlen = 0;
1482 			exthdrlen = 0;
1483 			dst_exthdrlen = 0;
1484 
1485 			/*
1486 			 * Put the packet on the pending queue
1487 			 */
1488 			__skb_queue_tail(queue, skb);
1489 			continue;
1490 		}
1491 
1492 		if (copy > length)
1493 			copy = length;
1494 
1495 		if (!(rt->dst.dev->features&NETIF_F_SG)) {
1496 			unsigned int off;
1497 
1498 			off = skb->len;
1499 			if (getfrag(from, skb_put(skb, copy),
1500 						offset, copy, off, skb) < 0) {
1501 				__skb_trim(skb, off);
1502 				err = -EFAULT;
1503 				goto error;
1504 			}
1505 		} else {
1506 			int i = skb_shinfo(skb)->nr_frags;
1507 
1508 			err = -ENOMEM;
1509 			if (!sk_page_frag_refill(sk, pfrag))
1510 				goto error;
1511 
1512 			if (!skb_can_coalesce(skb, i, pfrag->page,
1513 					      pfrag->offset)) {
1514 				err = -EMSGSIZE;
1515 				if (i == MAX_SKB_FRAGS)
1516 					goto error;
1517 
1518 				__skb_fill_page_desc(skb, i, pfrag->page,
1519 						     pfrag->offset, 0);
1520 				skb_shinfo(skb)->nr_frags = ++i;
1521 				get_page(pfrag->page);
1522 			}
1523 			copy = min_t(int, copy, pfrag->size - pfrag->offset);
1524 			if (getfrag(from,
1525 				    page_address(pfrag->page) + pfrag->offset,
1526 				    offset, copy, skb->len, skb) < 0)
1527 				goto error_efault;
1528 
1529 			pfrag->offset += copy;
1530 			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1531 			skb->len += copy;
1532 			skb->data_len += copy;
1533 			skb->truesize += copy;
1534 			atomic_add(copy, &sk->sk_wmem_alloc);
1535 		}
1536 		offset += copy;
1537 		length -= copy;
1538 	}
1539 
1540 	return 0;
1541 
1542 error_efault:
1543 	err = -EFAULT;
1544 error:
1545 	cork->length -= length;
1546 	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1547 	return err;
1548 }
1549 
1550 int ip6_append_data(struct sock *sk,
1551 		    int getfrag(void *from, char *to, int offset, int len,
1552 				int odd, struct sk_buff *skb),
1553 		    void *from, int length, int transhdrlen, int hlimit,
1554 		    int tclass, struct ipv6_txoptions *opt, struct flowi6 *fl6,
1555 		    struct rt6_info *rt, unsigned int flags, int dontfrag)
1556 {
1557 	struct inet_sock *inet = inet_sk(sk);
1558 	struct ipv6_pinfo *np = inet6_sk(sk);
1559 	int exthdrlen;
1560 	int err;
1561 
1562 	if (flags&MSG_PROBE)
1563 		return 0;
1564 	if (skb_queue_empty(&sk->sk_write_queue)) {
1565 		/*
1566 		 * setup for corking
1567 		 */
1568 		err = ip6_setup_cork(sk, &inet->cork, &np->cork, hlimit,
1569 				     tclass, opt, rt, fl6);
1570 		if (err)
1571 			return err;
1572 
1573 		exthdrlen = (opt ? opt->opt_flen : 0);
1574 		length += exthdrlen;
1575 		transhdrlen += exthdrlen;
1576 	} else {
1577 		fl6 = &inet->cork.fl.u.ip6;
1578 		transhdrlen = 0;
1579 	}
1580 
1581 	return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base,
1582 				 &np->cork, sk_page_frag(sk), getfrag,
1583 				 from, length, transhdrlen, flags, dontfrag);
1584 }
1585 EXPORT_SYMBOL_GPL(ip6_append_data);
1586 
1587 static void ip6_cork_release(struct inet_cork_full *cork,
1588 			     struct inet6_cork *v6_cork)
1589 {
1590 	if (v6_cork->opt) {
1591 		kfree(v6_cork->opt->dst0opt);
1592 		kfree(v6_cork->opt->dst1opt);
1593 		kfree(v6_cork->opt->hopopt);
1594 		kfree(v6_cork->opt->srcrt);
1595 		kfree(v6_cork->opt);
1596 		v6_cork->opt = NULL;
1597 	}
1598 
1599 	if (cork->base.dst) {
1600 		dst_release(cork->base.dst);
1601 		cork->base.dst = NULL;
1602 		cork->base.flags &= ~IPCORK_ALLFRAG;
1603 	}
1604 	memset(&cork->fl, 0, sizeof(cork->fl));
1605 }
1606 
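/*
 * Collapse the queue of pending skbs into one datagram: chain the
 * trailing skbs onto the first one's frag_list, push the extension
 * headers and the IPv6 header, and release the cork. The resulting
 * skb is ready for ip6_send_skb()/ip6_local_out().
 */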
1607 struct sk_buff *__ip6_make_skb(struct sock *sk,
1608 			       struct sk_buff_head *queue,
1609 			       struct inet_cork_full *cork,
1610 			       struct inet6_cork *v6_cork)
1611 {
1612 	struct sk_buff *skb, *tmp_skb;
1613 	struct sk_buff **tail_skb;
1614 	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1615 	struct ipv6_pinfo *np = inet6_sk(sk);
1616 	struct net *net = sock_net(sk);
1617 	struct ipv6hdr *hdr;
1618 	struct ipv6_txoptions *opt = v6_cork->opt;
1619 	struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
1620 	struct flowi6 *fl6 = &cork->fl.u.ip6;
1621 	unsigned char proto = fl6->flowi6_proto;
1622 
1623 	skb = __skb_dequeue(queue);
1624 	if (!skb)
1625 		goto out;
1626 	tail_skb = &(skb_shinfo(skb)->frag_list);
1627 
1628 	/* move skb->data to ip header from ext header */
1629 	if (skb->data < skb_network_header(skb))
1630 		__skb_pull(skb, skb_network_offset(skb));
1631 	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
1632 		__skb_pull(tmp_skb, skb_network_header_len(skb));
1633 		*tail_skb = tmp_skb;
1634 		tail_skb = &(tmp_skb->next);
1635 		skb->len += tmp_skb->len;
1636 		skb->data_len += tmp_skb->len;
1637 		skb->truesize += tmp_skb->truesize;
1638 		tmp_skb->destructor = NULL;
1639 		tmp_skb->sk = NULL;
1640 	}
1641 
1642 	/* Allow local fragmentation. */
1643 	skb->ignore_df = ip6_sk_ignore_df(sk);
1644 
1645 	*final_dst = fl6->daddr;
1646 	__skb_pull(skb, skb_network_header_len(skb));
1647 	if (opt && opt->opt_flen)
1648 		ipv6_push_frag_opts(skb, opt, &proto);
1649 	if (opt && opt->opt_nflen)
1650 		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);
1651 
1652 	skb_push(skb, sizeof(struct ipv6hdr));
1653 	skb_reset_network_header(skb);
1654 	hdr = ipv6_hdr(skb);
1655 
1656 	ip6_flow_hdr(hdr, v6_cork->tclass,
1657 		     ip6_make_flowlabel(net, skb, fl6->flowlabel,
1658 					np->autoflowlabel, fl6));
1659 	hdr->hop_limit = v6_cork->hop_limit;
1660 	hdr->nexthdr = proto;
1661 	hdr->saddr = fl6->saddr;
1662 	hdr->daddr = *final_dst;
1663 
1664 	skb->priority = sk->sk_priority;
1665 	skb->mark = sk->sk_mark;
1666 
1667 	skb_dst_set(skb, dst_clone(&rt->dst));
1668 	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1669 	if (proto == IPPROTO_ICMPV6) {
1670 		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1671 
1672 		ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
1673 		ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
1674 	}
1675 
1676 	ip6_cork_release(cork, v6_cork);
1677 out:
1678 	return skb;
1679 }
1680 
1681 int ip6_send_skb(struct sk_buff *skb)
1682 {
1683 	struct net *net = sock_net(skb->sk);
1684 	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
1685 	int err;
1686 
1687 	err = ip6_local_out(skb);
1688 	if (err) {
1689 		if (err > 0)
1690 			err = net_xmit_errno(err);
1691 		if (err)
1692 			IP6_INC_STATS(net, rt->rt6i_idev,
1693 				      IPSTATS_MIB_OUTDISCARDS);
1694 	}
1695 
1696 	return err;
1697 }
1698 
1699 int ip6_push_pending_frames(struct sock *sk)
1700 {
1701 	struct sk_buff *skb;
1702 
1703 	skb = ip6_finish_skb(sk);
1704 	if (!skb)
1705 		return 0;
1706 
1707 	return ip6_send_skb(skb);
1708 }
1709 EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
1710 
1711 static void __ip6_flush_pending_frames(struct sock *sk,
1712 				       struct sk_buff_head *queue,
1713 				       struct inet_cork_full *cork,
1714 				       struct inet6_cork *v6_cork)
1715 {
1716 	struct sk_buff *skb;
1717 
1718 	while ((skb = __skb_dequeue_tail(queue)) != NULL) {
1719 		if (skb_dst(skb))
1720 			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1721 				      IPSTATS_MIB_OUTDISCARDS);
1722 		kfree_skb(skb);
1723 	}
1724 
1725 	ip6_cork_release(cork, v6_cork);
1726 }
1727 
1728 void ip6_flush_pending_frames(struct sock *sk)
1729 {
1730 	__ip6_flush_pending_frames(sk, &sk->sk_write_queue,
1731 				   &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
1732 }
1733 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
1734 
1735 struct sk_buff *ip6_make_skb(struct sock *sk,
1736 			     int getfrag(void *from, char *to, int offset,
1737 					 int len, int odd, struct sk_buff *skb),
1738 			     void *from, int length, int transhdrlen,
1739 			     int hlimit, int tclass,
1740 			     struct ipv6_txoptions *opt, struct flowi6 *fl6,
1741 			     struct rt6_info *rt, unsigned int flags,
1742 			     int dontfrag)
1743 {
1744 	struct inet_cork_full cork;
1745 	struct inet6_cork v6_cork;
1746 	struct sk_buff_head queue;
1747 	int exthdrlen = (opt ? opt->opt_flen : 0);
1748 	int err;
1749 
1750 	if (flags & MSG_PROBE)
1751 		return NULL;
1752 
1753 	__skb_queue_head_init(&queue);
1754 
1755 	cork.base.flags = 0;
1756 	cork.base.addr = 0;
1757 	cork.base.opt = NULL;
1758 	v6_cork.opt = NULL;
1759 	err = ip6_setup_cork(sk, &cork, &v6_cork, hlimit, tclass, opt, rt, fl6);
1760 	if (err)
1761 		return ERR_PTR(err);
1762 
1763 	if (dontfrag < 0)
1764 		dontfrag = inet6_sk(sk)->dontfrag;
1765 
1766 	err = __ip6_append_data(sk, fl6, &queue, &cork.base, &v6_cork,
1767 				&current->task_frag, getfrag, from,
1768 				length + exthdrlen, transhdrlen + exthdrlen,
1769 				flags, dontfrag);
1770 	if (err) {
1771 		__ip6_flush_pending_frames(sk, &queue, &cork, &v6_cork);
1772 		return ERR_PTR(err);
1773 	}
1774 
1775 	return __ip6_make_skb(sk, &queue, &cork, &v6_cork);
1776 }
1777