xref: /openbmc/linux/net/ipv6/ip6_output.c (revision 8b235f2f)
1 /*
2  *	IPv6 output functions
3  *	Linux INET6 implementation
4  *
5  *	Authors:
6  *	Pedro Roque		<roque@di.fc.ul.pt>
7  *
8  *	Based on linux/net/ipv4/ip_output.c
9  *
10  *	This program is free software; you can redistribute it and/or
11  *      modify it under the terms of the GNU General Public License
12  *      as published by the Free Software Foundation; either version
13  *      2 of the License, or (at your option) any later version.
14  *
15  *	Changes:
16  *	A.N.Kuznetsov	:	arithmetic in fragmentation.
17  *				extension headers are implemented.
18  *				route changes now work.
19  *				ip6_forward does not confuse sniffers.
20  *				etc.
21  *
22  *      H. von Brand    :       Added missing #include <linux/string.h>
23  *	Imran Patel	:	frag id should be in NBO
24  *      Kazunori MIYAZAWA @USAGI
25  *			:       add ip6_append_data and related functions
26  *				for datagram xmit
27  */
28 
29 #include <linux/errno.h>
30 #include <linux/kernel.h>
31 #include <linux/string.h>
32 #include <linux/socket.h>
33 #include <linux/net.h>
34 #include <linux/netdevice.h>
35 #include <linux/if_arp.h>
36 #include <linux/in6.h>
37 #include <linux/tcp.h>
38 #include <linux/route.h>
39 #include <linux/module.h>
40 #include <linux/slab.h>
41 
42 #include <linux/netfilter.h>
43 #include <linux/netfilter_ipv6.h>
44 
45 #include <net/sock.h>
46 #include <net/snmp.h>
47 
48 #include <net/ipv6.h>
49 #include <net/ndisc.h>
50 #include <net/protocol.h>
51 #include <net/ip6_route.h>
52 #include <net/addrconf.h>
53 #include <net/rawv6.h>
54 #include <net/icmp.h>
55 #include <net/xfrm.h>
56 #include <net/checksum.h>
57 #include <linux/mroute6.h>
58 
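/* Final transmission step: resolve the nexthop neighbour and hand the
 * packet to the device. Multicast packets may additionally be cloned
 * and looped back to the local stack via dev_loopback_xmit().
 */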
59 static int ip6_finish_output2(struct sock *sk, struct sk_buff *skb)
60 {
61 	struct dst_entry *dst = skb_dst(skb);
62 	struct net_device *dev = dst->dev;
63 	struct neighbour *neigh;
64 	struct in6_addr *nexthop;
65 	int ret;
66 
67 	skb->protocol = htons(ETH_P_IPV6);
68 	skb->dev = dev;
69 
70 	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
71 		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
72 
73 		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
74 		    ((mroute6_socket(dev_net(dev), skb) &&
75 		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
76 		     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
77 					 &ipv6_hdr(skb)->saddr))) {
78 			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
79 
80 			/* Do not check for IFF_ALLMULTI; multicast routing
81 			   is not supported in any case.
82 			 */
83 			if (newskb)
84 				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
85 					sk, newskb, NULL, newskb->dev,
86 					dev_loopback_xmit);
87 
88 			if (ipv6_hdr(skb)->hop_limit == 0) {
89 				IP6_INC_STATS(dev_net(dev), idev,
90 					      IPSTATS_MIB_OUTDISCARDS);
91 				kfree_skb(skb);
92 				return 0;
93 			}
94 		}
95 
96 		IP6_UPD_PO_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCAST,
97 				skb->len);
98 
99 		if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
100 		    IPV6_ADDR_SCOPE_NODELOCAL &&
101 		    !(dev->flags & IFF_LOOPBACK)) {
102 			kfree_skb(skb);
103 			return 0;
104 		}
105 	}
106 
107 	rcu_read_lock_bh();
108 	nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
109 	neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
110 	if (unlikely(!neigh))
111 		neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
112 	if (!IS_ERR(neigh)) {
113 		ret = dst_neigh_output(dst, neigh, skb);
114 		rcu_read_unlock_bh();
115 		return ret;
116 	}
117 	rcu_read_unlock_bh();
118 
119 	IP6_INC_STATS(dev_net(dst->dev),
120 		      ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
121 	kfree_skb(skb);
122 	return -EINVAL;
123 }
124 
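/* Fragment if the packet exceeds the path MTU and is not GSO, if the
 * route requires fragmentation of all packets (dst_allfrag), or if
 * conntrack recorded a smaller inbound fragment size; otherwise
 * transmit the packet as-is.
 */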
125 static int ip6_finish_output(struct sock *sk, struct sk_buff *skb)
126 {
127 	if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
128 	    dst_allfrag(skb_dst(skb)) ||
129 	    (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
130 		return ip6_fragment(sk, skb, ip6_finish_output2);
131 	else
132 		return ip6_finish_output2(sk, skb);
133 }
134 
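/* NF_INET_POST_ROUTING entry point for locally generated packets.
 * Packets are discarded when IPv6 is administratively disabled on the
 * outgoing interface (idev->cnf.disable_ipv6).
 */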
135 int ip6_output(struct sock *sk, struct sk_buff *skb)
136 {
137 	struct net_device *dev = skb_dst(skb)->dev;
138 	struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
139 	if (unlikely(idev->cnf.disable_ipv6)) {
140 		IP6_INC_STATS(dev_net(dev), idev,
141 			      IPSTATS_MIB_OUTDISCARDS);
142 		kfree_skb(skb);
143 		return 0;
144 	}
145 
146 	return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, sk, skb,
147 			    NULL, dev,
148 			    ip6_finish_output,
149 			    !(IP6CB(skb)->flags & IP6SKB_REROUTED));
150 }
151 
152 /*
153  *	xmit an sk_buff (used by TCP, SCTP and DCCP)
154  */
155 
156 int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
157 	     struct ipv6_txoptions *opt, int tclass)
158 {
159 	struct net *net = sock_net(sk);
160 	struct ipv6_pinfo *np = inet6_sk(sk);
161 	struct in6_addr *first_hop = &fl6->daddr;
162 	struct dst_entry *dst = skb_dst(skb);
163 	struct ipv6hdr *hdr;
164 	u8  proto = fl6->flowi6_proto;
165 	int seg_len = skb->len;
166 	int hlimit = -1;
167 	u32 mtu;
168 
169 	if (opt) {
170 		unsigned int head_room;
171 
172 	/* First: exthdrs may take lots of space (~8K for now);
173 	   MAX_HEADER is not enough.
174 	 */
175 		head_room = opt->opt_nflen + opt->opt_flen;
176 		seg_len += head_room;
177 		head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
178 
179 		if (skb_headroom(skb) < head_room) {
180 			struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
181 			if (!skb2) {
182 				IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
183 					      IPSTATS_MIB_OUTDISCARDS);
184 				kfree_skb(skb);
185 				return -ENOBUFS;
186 			}
187 			consume_skb(skb);
188 			skb = skb2;
189 			skb_set_owner_w(skb, sk);
190 		}
191 		if (opt->opt_flen)
192 			ipv6_push_frag_opts(skb, opt, &proto);
193 		if (opt->opt_nflen)
194 			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
195 	}
196 
197 	skb_push(skb, sizeof(struct ipv6hdr));
198 	skb_reset_network_header(skb);
199 	hdr = ipv6_hdr(skb);
200 
201 	/*
202 	 *	Fill in the IPv6 header
203 	 */
204 	if (np)
205 		hlimit = np->hop_limit;
206 	if (hlimit < 0)
207 		hlimit = ip6_dst_hoplimit(dst);
208 
209 	ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
210 						     np->autoflowlabel, fl6));
211 
212 	hdr->payload_len = htons(seg_len);
213 	hdr->nexthdr = proto;
214 	hdr->hop_limit = hlimit;
215 
216 	hdr->saddr = fl6->saddr;
217 	hdr->daddr = *first_hop;
218 
219 	skb->protocol = htons(ETH_P_IPV6);
220 	skb->priority = sk->sk_priority;
221 	skb->mark = sk->sk_mark;
222 
223 	mtu = dst_mtu(dst);
224 	if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
225 		IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
226 			      IPSTATS_MIB_OUT, skb->len);
227 		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, sk, skb,
228 			       NULL, dst->dev, dst_output_sk);
229 	}
230 
231 	skb->dev = dst->dev;
232 	ipv6_local_error(sk, EMSGSIZE, fl6, mtu);
233 	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
234 	kfree_skb(skb);
235 	return -EMSGSIZE;
236 }
237 EXPORT_SYMBOL(ip6_xmit);
238 
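/* Deliver a Router Alert packet to every raw socket registered in
 * ip6_ra_chain whose selector matches; returns 1 if at least one
 * socket consumed the packet, 0 otherwise.
 */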
239 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
240 {
241 	struct ip6_ra_chain *ra;
242 	struct sock *last = NULL;
243 
244 	read_lock(&ip6_ra_lock);
245 	for (ra = ip6_ra_chain; ra; ra = ra->next) {
246 		struct sock *sk = ra->sk;
247 		if (sk && ra->sel == sel &&
248 		    (!sk->sk_bound_dev_if ||
249 		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
250 			if (last) {
251 				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
252 				if (skb2)
253 					rawv6_rcv(last, skb2);
254 			}
255 			last = sk;
256 		}
257 	}
258 
259 	if (last) {
260 		rawv6_rcv(last, skb);
261 		read_unlock(&ip6_ra_lock);
262 		return 1;
263 	}
264 	read_unlock(&ip6_ra_lock);
265 	return 0;
266 }
267 
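/* Decide what to do with a packet destined to a proxied address:
 * returns 1 to hand NDISC messages to local input, -1 to drop (e.g. a
 * link-local destination we must not forward), and 0 to forward
 * normally.
 */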
268 static int ip6_forward_proxy_check(struct sk_buff *skb)
269 {
270 	struct ipv6hdr *hdr = ipv6_hdr(skb);
271 	u8 nexthdr = hdr->nexthdr;
272 	__be16 frag_off;
273 	int offset;
274 
275 	if (ipv6_ext_hdr(nexthdr)) {
276 		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
277 		if (offset < 0)
278 			return 0;
279 	} else
280 		offset = sizeof(struct ipv6hdr);
281 
282 	if (nexthdr == IPPROTO_ICMPV6) {
283 		struct icmp6hdr *icmp6;
284 
285 		if (!pskb_may_pull(skb, (skb_network_header(skb) +
286 					 offset + 1 - skb->data)))
287 			return 0;
288 
289 		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
290 
291 		switch (icmp6->icmp6_type) {
292 		case NDISC_ROUTER_SOLICITATION:
293 		case NDISC_ROUTER_ADVERTISEMENT:
294 		case NDISC_NEIGHBOUR_SOLICITATION:
295 		case NDISC_NEIGHBOUR_ADVERTISEMENT:
296 		case NDISC_REDIRECT:
297 			/* For a reaction involving a unicast neighbour
298 			 * discovery message destined to the proxied address,
299 			 * pass it to the input function.
300 			 */
301 			return 1;
302 		default:
303 			break;
304 		}
305 	}
306 
307 	/*
308 	 * The proxying router can't forward traffic sent to a link-local
309 	 * address, so signal the sender and discard the packet. This
310 	 * behavior is clarified by the MIPv6 specification.
311 	 */
312 	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
313 		dst_link_failure(skb);
314 		return -1;
315 	}
316 
317 	return 0;
318 }
319 
320 static inline int ip6_forward_finish(struct sock *sk, struct sk_buff *skb)
321 {
322 	skb_sender_cpu_clear(skb);
323 	return dst_output_sk(sk, skb);
324 }
325 
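/* MTU to use when forwarding: a locked RTAX_MTU metric wins, otherwise
 * fall back to the outgoing interface's mtu6 (or IPV6_MIN_MTU if the
 * device has no inet6_dev).
 */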
326 static unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst)
327 {
328 	unsigned int mtu;
329 	struct inet6_dev *idev;
330 
331 	if (dst_metric_locked(dst, RTAX_MTU)) {
332 		mtu = dst_metric_raw(dst, RTAX_MTU);
333 		if (mtu)
334 			return mtu;
335 	}
336 
337 	mtu = IPV6_MIN_MTU;
338 	rcu_read_lock();
339 	idev = __in6_dev_get(dst->dev);
340 	if (idev)
341 		mtu = idev->cnf.mtu6;
342 	rcu_read_unlock();
343 
344 	return mtu;
345 }
346 
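/* Decide whether a forwarded packet needs an ICMPV6_PKT_TOOBIG error
 * instead of being transmitted on this dst.
 */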
347 static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
348 {
349 	if (skb->len <= mtu)
350 		return false;
351 
352 	/* ipv6 conntrack defrag sets max_frag_size + ignore_df */
353 	if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
354 		return true;
355 
356 	if (skb->ignore_df)
357 		return false;
358 
359 	if (skb_is_gso(skb) && skb_gso_network_seglen(skb) <= mtu)
360 		return false;
361 
362 	return true;
363 }
364 
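/* Forwarding path entry point: validates the packet (hop limit, scope,
 * source address class), emits redirects or ICMP errors where
 * required, decrements the hop limit and hands the packet to the
 * NF_INET_FORWARD hook.
 */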
365 int ip6_forward(struct sk_buff *skb)
366 {
367 	struct dst_entry *dst = skb_dst(skb);
368 	struct ipv6hdr *hdr = ipv6_hdr(skb);
369 	struct inet6_skb_parm *opt = IP6CB(skb);
370 	struct net *net = dev_net(dst->dev);
371 	u32 mtu;
372 
373 	if (net->ipv6.devconf_all->forwarding == 0)
374 		goto error;
375 
376 	if (skb->pkt_type != PACKET_HOST)
377 		goto drop;
378 
379 	if (skb_warn_if_lro(skb))
380 		goto drop;
381 
382 	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
383 		IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
384 				 IPSTATS_MIB_INDISCARDS);
385 		goto drop;
386 	}
387 
388 	skb_forward_csum(skb);
389 
390 	/*
391 	 *	We do NOT do any processing on RA packets,
392 	 *	pushing them to user level AS IS, without any
393 	 *	warranty that the application will be able to
394 	 *	interpret them. The reason is that we cannot
395 	 *	make anything clever here.
396 	 *
397 	 *	We are not an end node, so if the packet contains
398 	 *	AH/ESP we cannot do anything with it.
399 	 *	Defragmentation would also be a mistake; RA packets
400 	 *	cannot be fragmented, because there is no guarantee
401 	 *	that different fragments will go along one path. --ANK
402 	 */
403 	if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
404 		if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
405 			return 0;
406 	}
407 
408 	/*
409 	 *	check and decrement ttl
410 	 */
411 	if (hdr->hop_limit <= 1) {
412 		/* Force the OUTPUT device to be used for source address selection */
413 		skb->dev = dst->dev;
414 		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
415 		IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
416 				 IPSTATS_MIB_INHDRERRORS);
417 
418 		kfree_skb(skb);
419 		return -ETIMEDOUT;
420 	}
421 
422 	/* XXX: idev->cnf.proxy_ndp? */
423 	if (net->ipv6.devconf_all->proxy_ndp &&
424 	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
425 		int proxied = ip6_forward_proxy_check(skb);
426 		if (proxied > 0)
427 			return ip6_input(skb);
428 		else if (proxied < 0) {
429 			IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
430 					 IPSTATS_MIB_INDISCARDS);
431 			goto drop;
432 		}
433 	}
434 
435 	if (!xfrm6_route_forward(skb)) {
436 		IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
437 				 IPSTATS_MIB_INDISCARDS);
438 		goto drop;
439 	}
440 	dst = skb_dst(skb);
441 
442 	/* The IPv6 specs say nothing about it, but it is clear that we cannot
443 	   send redirects to source-routed frames.
444 	   We don't send redirects to frames decapsulated from IPsec.
445 	 */
446 	if (skb->dev == dst->dev && opt->srcrt == 0 && !skb_sec_path(skb)) {
447 		struct in6_addr *target = NULL;
448 		struct inet_peer *peer;
449 		struct rt6_info *rt;
450 
451 		/*
452 		 *	The incoming and outgoing devices are the same;
453 		 *	send a redirect.
454 		 */
455 
456 		rt = (struct rt6_info *) dst;
457 		if (rt->rt6i_flags & RTF_GATEWAY)
458 			target = &rt->rt6i_gateway;
459 		else
460 			target = &hdr->daddr;
461 
462 		peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);
463 
464 		/* Limit redirects both by destination (here)
465 		   and by source (inside ndisc_send_redirect)
466 		 */
467 		if (inet_peer_xrlim_allow(peer, 1*HZ))
468 			ndisc_send_redirect(skb, target);
469 		if (peer)
470 			inet_putpeer(peer);
471 	} else {
472 		int addrtype = ipv6_addr_type(&hdr->saddr);
473 
474 		/* This check is security critical. */
475 		if (addrtype == IPV6_ADDR_ANY ||
476 		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
477 			goto error;
478 		if (addrtype & IPV6_ADDR_LINKLOCAL) {
479 			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
480 				    ICMPV6_NOT_NEIGHBOUR, 0);
481 			goto error;
482 		}
483 	}
484 
485 	mtu = ip6_dst_mtu_forward(dst);
486 	if (mtu < IPV6_MIN_MTU)
487 		mtu = IPV6_MIN_MTU;
488 
489 	if (ip6_pkt_too_big(skb, mtu)) {
490 		/* Again, force the OUTPUT device to be used for source address selection */
491 		skb->dev = dst->dev;
492 		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
493 		IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
494 				 IPSTATS_MIB_INTOOBIGERRORS);
495 		IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
496 				 IPSTATS_MIB_FRAGFAILS);
497 		kfree_skb(skb);
498 		return -EMSGSIZE;
499 	}
500 
501 	if (skb_cow(skb, dst->dev->hard_header_len)) {
502 		IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
503 				 IPSTATS_MIB_OUTDISCARDS);
504 		goto drop;
505 	}
506 
507 	hdr = ipv6_hdr(skb);
508 
509 	/* Decrementing the hop limit is delayed until after the skb COW */
510 
511 	hdr->hop_limit--;
512 
513 	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
514 	IP6_ADD_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
515 	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, NULL, skb,
516 		       skb->dev, dst->dev,
517 		       ip6_forward_finish);
518 
519 error:
520 	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
521 drop:
522 	kfree_skb(skb);
523 	return -EINVAL;
524 }
525 
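/* Propagate per-packet metadata (type, priority, dst, netfilter and
 * security state) from the original skb to a freshly built fragment.
 */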
526 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
527 {
528 	to->pkt_type = from->pkt_type;
529 	to->priority = from->priority;
530 	to->protocol = from->protocol;
531 	skb_dst_drop(to);
532 	skb_dst_set(to, dst_clone(skb_dst(from)));
533 	to->dev = from->dev;
534 	to->mark = from->mark;
535 
536 #ifdef CONFIG_NET_SCHED
537 	to->tc_index = from->tc_index;
538 #endif
539 	nf_copy(to, from);
540 	skb_copy_secmark(to, from);
541 }
542 
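/* Fragment an IPv6 packet and feed each fragment to @output. The fast
 * path reuses an existing, well-formed frag_list; otherwise the slow
 * path copies the payload into newly allocated fragment skbs.
 */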
543 int ip6_fragment(struct sock *sk, struct sk_buff *skb,
544 		 int (*output)(struct sock *, struct sk_buff *))
545 {
546 	struct sk_buff *frag;
547 	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
548 	struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
549 				inet6_sk(skb->sk) : NULL;
550 	struct ipv6hdr *tmp_hdr;
551 	struct frag_hdr *fh;
552 	unsigned int mtu, hlen, left, len;
553 	int hroom, troom;
554 	__be32 frag_id;
555 	int ptr, offset = 0, err = 0;
556 	u8 *prevhdr, nexthdr = 0;
557 	struct net *net = dev_net(skb_dst(skb)->dev);
558 
559 	hlen = ip6_find_1stfragopt(skb, &prevhdr);
560 	nexthdr = *prevhdr;
561 
562 	mtu = ip6_skb_dst_mtu(skb);
563 
564 	/* We must not fragment if the socket is set to force MTU discovery
565 	 * or if the skb was not generated by a local socket.
566 	 */
567 	if (unlikely(!skb->ignore_df && skb->len > mtu))
568 		goto fail_toobig;
569 
570 	if (IP6CB(skb)->frag_max_size) {
571 		if (IP6CB(skb)->frag_max_size > mtu)
572 			goto fail_toobig;
573 
574 		/* don't send fragments larger than what we received */
575 		mtu = IP6CB(skb)->frag_max_size;
576 		if (mtu < IPV6_MIN_MTU)
577 			mtu = IPV6_MIN_MTU;
578 	}
579 
580 	if (np && np->frag_size < mtu) {
581 		if (np->frag_size)
582 			mtu = np->frag_size;
583 	}
584 	mtu -= hlen + sizeof(struct frag_hdr);
585 
586 	frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
587 				    &ipv6_hdr(skb)->saddr);
588 
589 	if (skb_has_frag_list(skb)) {
590 		int first_len = skb_pagelen(skb);
591 		struct sk_buff *frag2;
592 
593 		if (first_len - hlen > mtu ||
594 		    ((first_len - hlen) & 7) ||
595 		    skb_cloned(skb))
596 			goto slow_path;
597 
598 		skb_walk_frags(skb, frag) {
599 			/* Correct geometry. */
600 			if (frag->len > mtu ||
601 			    ((frag->len & 7) && frag->next) ||
602 			    skb_headroom(frag) < hlen)
603 				goto slow_path_clean;
604 
605 			/* Partially cloned skb? */
606 			if (skb_shared(frag))
607 				goto slow_path_clean;
608 
609 			BUG_ON(frag->sk);
610 			if (skb->sk) {
611 				frag->sk = skb->sk;
612 				frag->destructor = sock_wfree;
613 			}
614 			skb->truesize -= frag->truesize;
615 		}
616 
617 		err = 0;
618 		offset = 0;
619 		frag = skb_shinfo(skb)->frag_list;
620 		skb_frag_list_init(skb);
621 		/* BUILD HEADER */
622 
623 		*prevhdr = NEXTHDR_FRAGMENT;
624 		tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
625 		if (!tmp_hdr) {
626 			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
627 				      IPSTATS_MIB_FRAGFAILS);
628 			return -ENOMEM;
629 		}
630 
631 		__skb_pull(skb, hlen);
632 		fh = (struct frag_hdr *)__skb_push(skb, sizeof(struct frag_hdr));
633 		__skb_push(skb, hlen);
634 		skb_reset_network_header(skb);
635 		memcpy(skb_network_header(skb), tmp_hdr, hlen);
636 
637 		fh->nexthdr = nexthdr;
638 		fh->reserved = 0;
639 		fh->frag_off = htons(IP6_MF);
640 		fh->identification = frag_id;
641 
642 		first_len = skb_pagelen(skb);
643 		skb->data_len = first_len - skb_headlen(skb);
644 		skb->len = first_len;
645 		ipv6_hdr(skb)->payload_len = htons(first_len -
646 						   sizeof(struct ipv6hdr));
647 
648 		dst_hold(&rt->dst);
649 
650 		for (;;) {
651 			/* Prepare the header of the next frame
652 			 * before the previous one goes down. */
653 			if (frag) {
654 				frag->ip_summed = CHECKSUM_NONE;
655 				skb_reset_transport_header(frag);
656 				fh = (struct frag_hdr *)__skb_push(frag, sizeof(struct frag_hdr));
657 				__skb_push(frag, hlen);
658 				skb_reset_network_header(frag);
659 				memcpy(skb_network_header(frag), tmp_hdr,
660 				       hlen);
661 				offset += skb->len - hlen - sizeof(struct frag_hdr);
662 				fh->nexthdr = nexthdr;
663 				fh->reserved = 0;
664 				fh->frag_off = htons(offset);
665 				if (frag->next)
666 					fh->frag_off |= htons(IP6_MF);
667 				fh->identification = frag_id;
668 				ipv6_hdr(frag)->payload_len =
669 						htons(frag->len -
670 						      sizeof(struct ipv6hdr));
671 				ip6_copy_metadata(frag, skb);
672 			}
673 
674 			err = output(sk, skb);
675 			if (!err)
676 				IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
677 					      IPSTATS_MIB_FRAGCREATES);
678 
679 			if (err || !frag)
680 				break;
681 
682 			skb = frag;
683 			frag = skb->next;
684 			skb->next = NULL;
685 		}
686 
687 		kfree(tmp_hdr);
688 
689 		if (err == 0) {
690 			IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
691 				      IPSTATS_MIB_FRAGOKS);
692 			ip6_rt_put(rt);
693 			return 0;
694 		}
695 
696 		kfree_skb_list(frag);
697 
698 		IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
699 			      IPSTATS_MIB_FRAGFAILS);
700 		ip6_rt_put(rt);
701 		return err;
702 
703 slow_path_clean:
704 		skb_walk_frags(skb, frag2) {
705 			if (frag2 == frag)
706 				break;
707 			frag2->sk = NULL;
708 			frag2->destructor = NULL;
709 			skb->truesize += frag2->truesize;
710 		}
711 	}
712 
713 slow_path:
714 	if ((skb->ip_summed == CHECKSUM_PARTIAL) &&
715 	    skb_checksum_help(skb))
716 		goto fail;
717 
718 	left = skb->len - hlen;		/* Space per frame */
719 	ptr = hlen;			/* Where to start from */
720 
721 	/*
722 	 *	Fragment the datagram.
723 	 */
724 
725 	*prevhdr = NEXTHDR_FRAGMENT;
726 	hroom = LL_RESERVED_SPACE(rt->dst.dev);
727 	troom = rt->dst.dev->needed_tailroom;
728 
729 	/*
730 	 *	Keep copying data until we run out.
731 	 */
732 	while (left > 0)	{
733 		len = left;
734 		/* IF: it doesn't fit, use 'mtu' - the data space left */
735 		if (len > mtu)
736 			len = mtu;
737 		/* IF: we are not sending up to and including the packet end,
738 		   then align the next start on an eight-byte boundary */
739 		if (len < left)	{
740 			len &= ~7;
741 		}
742 
743 		/* Allocate buffer */
744 		frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
745 				 hroom + troom, GFP_ATOMIC);
746 		if (!frag) {
747 			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
748 				      IPSTATS_MIB_FRAGFAILS);
749 			err = -ENOMEM;
750 			goto fail;
751 		}
752 
753 		/*
754 		 *	Set up data on packet
755 		 */
756 
757 		ip6_copy_metadata(frag, skb);
758 		skb_reserve(frag, hroom);
759 		skb_put(frag, len + hlen + sizeof(struct frag_hdr));
760 		skb_reset_network_header(frag);
761 		fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
762 		frag->transport_header = (frag->network_header + hlen +
763 					  sizeof(struct frag_hdr));
764 
765 		/*
766 		 *	Charge the memory for the fragment to any owner
767 		 *	it might possess
768 		 */
769 		if (skb->sk)
770 			skb_set_owner_w(frag, skb->sk);
771 
772 		/*
773 		 *	Copy the packet header into the new buffer.
774 		 */
775 		skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
776 
777 		/*
778 		 *	Build fragment header.
779 		 */
780 		fh->nexthdr = nexthdr;
781 		fh->reserved = 0;
782 		fh->identification = frag_id;
783 
784 		/*
785 		 *	Copy a block of the IP datagram.
786 		 */
787 		BUG_ON(skb_copy_bits(skb, ptr, skb_transport_header(frag),
788 				     len));
789 		left -= len;
790 
791 		fh->frag_off = htons(offset);
792 		if (left > 0)
793 			fh->frag_off |= htons(IP6_MF);
794 		ipv6_hdr(frag)->payload_len = htons(frag->len -
795 						    sizeof(struct ipv6hdr));
796 
797 		ptr += len;
798 		offset += len;
799 
800 		/*
801 		 *	Put this fragment into the sending queue.
802 		 */
803 		err = output(sk, frag);
804 		if (err)
805 			goto fail;
806 
807 		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
808 			      IPSTATS_MIB_FRAGCREATES);
809 	}
810 	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
811 		      IPSTATS_MIB_FRAGOKS);
812 	consume_skb(skb);
813 	return err;
814 
815 fail_toobig:
816 	if (skb->sk && dst_allfrag(skb_dst(skb)))
817 		sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);
818 
819 	skb->dev = skb_dst(skb)->dev;
820 	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
821 	err = -EMSGSIZE;
822 
823 fail:
824 	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
825 		      IPSTATS_MIB_FRAGFAILS);
826 	kfree_skb(skb);
827 	return err;
828 }
829 
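/* Returns non-zero when the cached route cannot be validated for this
 * flow address: the route is not a matching host route and the cached
 * peer address does not match either.
 */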
830 static inline int ip6_rt_check(const struct rt6key *rt_key,
831 			       const struct in6_addr *fl_addr,
832 			       const struct in6_addr *addr_cache)
833 {
834 	return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
835 		(!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
836 }
837 
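/* Validate a socket-cached dst against the current flow; returns the
 * dst if it is still usable, otherwise releases it and returns NULL.
 */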
838 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
839 					  struct dst_entry *dst,
840 					  const struct flowi6 *fl6)
841 {
842 	struct ipv6_pinfo *np = inet6_sk(sk);
843 	struct rt6_info *rt;
844 
845 	if (!dst)
846 		goto out;
847 
848 	if (dst->ops->family != AF_INET6) {
849 		dst_release(dst);
850 		return NULL;
851 	}
852 
853 	rt = (struct rt6_info *)dst;
854 	/* Yes, checking route validity in the unconnected
855 	 * case is not very simple. Take into account
856 	 * that we do not support routing by source, TOS,
857 	 * or MSG_DONTROUTE		--ANK (980726)
858 	 *
859 	 * 1. ip6_rt_check(): If the route was a host route,
860 	 *    check that the cached destination is current.
861 	 *    If it is a network route, we still may
862 	 *    check its validity using a saved pointer
863 	 *    to the last used address: daddr_cache.
864 	 *    We do not want to save the whole address now
865 	 *    (because the main consumer of this service
866 	 *    is TCP, which does not have this problem),
867 	 *    so the last trick works only on connected
868 	 *    sockets.
869 	 * 2. oif should also be the same.
870 	 */
871 	if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
872 #ifdef CONFIG_IPV6_SUBTREES
873 	    ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
874 #endif
875 	    (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) {
876 		dst_release(dst);
877 		dst = NULL;
878 	}
879 
880 out:
881 	return dst;
882 }
883 
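/* Common lookup helper: resolve the route for @fl6, filling in a
 * source address if none was given, with an optimistic-DAD fallback to
 * the default router's dst when the nexthop neighbour is not yet
 * valid.
 */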
884 static int ip6_dst_lookup_tail(struct net *net, struct sock *sk,
885 			       struct dst_entry **dst, struct flowi6 *fl6)
886 {
887 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
888 	struct neighbour *n;
889 	struct rt6_info *rt;
890 #endif
891 	int err;
892 
893 	/* The correct way to handle this would be to do
894 	 * ip6_route_get_saddr, and then ip6_route_output; however,
895 	 * the route-specific preferred source forces the
896 	 * ip6_route_output call _before_ ip6_route_get_saddr.
897 	 *
898 	 * In source-specific routing (no src=any default route),
899 	 * ip6_route_output will fail given src=any saddr, though, so
900 	 * that's why we try it again later.
901 	 */
902 	if (ipv6_addr_any(&fl6->saddr) && (!*dst || !(*dst)->error)) {
903 		struct rt6_info *rt;
904 		bool had_dst = *dst != NULL;
905 
906 		if (!had_dst)
907 			*dst = ip6_route_output(net, sk, fl6);
908 		rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;
909 		err = ip6_route_get_saddr(net, rt, &fl6->daddr,
910 					  sk ? inet6_sk(sk)->srcprefs : 0,
911 					  &fl6->saddr);
912 		if (err)
913 			goto out_err_release;
914 
915 		/* If we had an erroneous initial result, pretend it
916 		 * never existed and let the SA-enabled version take
917 		 * over.
918 		 */
919 		if (!had_dst && (*dst)->error) {
920 			dst_release(*dst);
921 			*dst = NULL;
922 		}
923 	}
924 
925 	if (!*dst)
926 		*dst = ip6_route_output(net, sk, fl6);
927 
928 	err = (*dst)->error;
929 	if (err)
930 		goto out_err_release;
931 
932 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
933 	/*
934 	 * If the dst entry we've looked up
935 	 * has a neighbour entry that is in the INCOMPLETE
936 	 * state and the src address from the flow is
937 	 * marked as OPTIMISTIC, we release the found
938 	 * dst entry and replace it with the
939 	 * dst entry of the nexthop router instead.
940 	 */
941 	rt = (struct rt6_info *) *dst;
942 	rcu_read_lock_bh();
943 	n = __ipv6_neigh_lookup_noref(rt->dst.dev,
944 				      rt6_nexthop(rt, &fl6->daddr));
945 	err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
946 	rcu_read_unlock_bh();
947 
948 	if (err) {
949 		struct inet6_ifaddr *ifp;
950 		struct flowi6 fl_gw6;
951 		int redirect;
952 
953 		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
954 				      (*dst)->dev, 1);
955 
956 		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
957 		if (ifp)
958 			in6_ifa_put(ifp);
959 
960 		if (redirect) {
961 			/*
962 			 * We need to get the dst entry for the
963 			 * default router instead
964 			 */
965 			dst_release(*dst);
966 			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
967 			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
968 			*dst = ip6_route_output(net, sk, &fl_gw6);
969 			err = (*dst)->error;
970 			if (err)
971 				goto out_err_release;
972 		}
973 	}
974 #endif
975 
976 	return 0;
977 
978 out_err_release:
979 	if (err == -ENETUNREACH)
980 		IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
981 	dst_release(*dst);
982 	*dst = NULL;
983 	return err;
984 }
985 
986 /**
987  *	ip6_dst_lookup - perform route lookup on flow
988  *	@sk: socket which provides route info
989  *	@dst: pointer to dst_entry * for result
990  *	@fl6: flow to lookup
991  *
992  *	This function performs a route lookup on the given flow.
993  *
994  *	It returns zero on success, or a standard errno code on error.
995  */
996 int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
997 		   struct flowi6 *fl6)
998 {
999 	*dst = NULL;
1000 	return ip6_dst_lookup_tail(net, sk, dst, fl6);
1001 }
1002 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1003 
1004 /**
1005  *	ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1006  *	@sk: socket which provides route info
1007  *	@fl6: flow to lookup
1008  *	@final_dst: final destination address for ipsec lookup
1009  *
1010  *	This function performs a route lookup on the given flow.
1011  *
1012  *	It returns a valid dst pointer on success, or a pointer encoded
1013  *	error code.
1014  */
1015 struct dst_entry *ip6_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1016 				      const struct in6_addr *final_dst)
1017 {
1018 	struct dst_entry *dst = NULL;
1019 	int err;
1020 
1021 	err = ip6_dst_lookup_tail(sock_net(sk), sk, &dst, fl6);
1022 	if (err)
1023 		return ERR_PTR(err);
1024 	if (final_dst)
1025 		fl6->daddr = *final_dst;
1026 	if (!fl6->flowi6_oif)
1027 		fl6->flowi6_oif = dst->dev->ifindex;
1028 
1029 	return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1030 }
1031 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
1032 
1033 /**
1034  *	ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1035  *	@sk: socket which provides the dst cache and route info
1036  *	@fl6: flow to lookup
1037  *	@final_dst: final destination address for ipsec lookup
1038  *
1039  *	This function performs a route lookup on the given flow with the
1040  *	possibility of using the cached route in the socket if it is valid.
1041  *	It will take the socket dst lock when operating on the dst cache.
1042  *	As a result, this function can only be used in process context.
1043  *
1044  *	It returns a valid dst pointer on success, or a pointer encoded
1045  *	error code.
1046  */
1047 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1048 					 const struct in6_addr *final_dst)
1049 {
1050 	struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1051 	int err;
1052 
1053 	dst = ip6_sk_dst_check(sk, dst, fl6);
1054 
1055 	err = ip6_dst_lookup_tail(sock_net(sk), sk, &dst, fl6);
1056 	if (err)
1057 		return ERR_PTR(err);
1058 	if (final_dst)
1059 		fl6->daddr = *final_dst;
1060 
1061 	return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1062 }
1063 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1064 
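/* UFO path: instead of building each fragment here, queue one large
 * skb with gso_size set so the UDP datagram is later segmented (by the
 * device or the GSO layer) into properly sized fragments.
 */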
1065 static inline int ip6_ufo_append_data(struct sock *sk,
1066 			struct sk_buff_head *queue,
1067 			int getfrag(void *from, char *to, int offset, int len,
1068 			int odd, struct sk_buff *skb),
1069 			void *from, int length, int hh_len, int fragheaderlen,
1070 			int transhdrlen, int mtu, unsigned int flags,
1071 			const struct flowi6 *fl6)
1072 
1073 {
1074 	struct sk_buff *skb;
1075 	int err;
1076 
1077 	/* The network device supports UDP large send offload, so
1078 	 * create one single skb containing the complete
1079 	 * UDP datagram.
1080 	 */
1081 	skb = skb_peek_tail(queue);
1082 	if (!skb) {
1083 		skb = sock_alloc_send_skb(sk,
1084 			hh_len + fragheaderlen + transhdrlen + 20,
1085 			(flags & MSG_DONTWAIT), &err);
1086 		if (!skb)
1087 			return err;
1088 
1089 		/* reserve space for Hardware header */
1090 		skb_reserve(skb, hh_len);
1091 
1092 		/* create space for UDP/IP header */
1093 		skb_put(skb, fragheaderlen + transhdrlen);
1094 
1095 		/* initialize network header pointer */
1096 		skb_reset_network_header(skb);
1097 
1098 		/* initialize protocol header pointer */
1099 		skb->transport_header = skb->network_header + fragheaderlen;
1100 
1101 		skb->protocol = htons(ETH_P_IPV6);
1102 		skb->csum = 0;
1103 
1104 		__skb_queue_tail(queue, skb);
1105 	} else if (skb_is_gso(skb)) {
1106 		goto append;
1107 	}
1108 
1109 	skb->ip_summed = CHECKSUM_PARTIAL;
1110 	/* Specify the length of each IPv6 datagram fragment.
1111 	 * It has to be a multiple of 8.
1112 	 */
1113 	skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
1114 				     sizeof(struct frag_hdr)) & ~7;
1115 	skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
1116 	skb_shinfo(skb)->ip6_frag_id = ipv6_select_ident(sock_net(sk),
1117 							 &fl6->daddr,
1118 							 &fl6->saddr);
1119 
1120 append:
1121 	return skb_append_datato_frags(sk, skb, getfrag, from,
1122 				       (length - transhdrlen));
1123 }
1124 
1125 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1126 					       gfp_t gfp)
1127 {
1128 	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1129 }
1130 
1131 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1132 						gfp_t gfp)
1133 {
1134 	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1135 }
1136 
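/* For dsts that are not XFRM tunnels, recompute mtu/maxfraglen around
 * the first fragment: rt->dst.header_len is only reserved in front of
 * the first fragment; later ones may use that space for data.
 */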
1137 static void ip6_append_data_mtu(unsigned int *mtu,
1138 				int *maxfraglen,
1139 				unsigned int fragheaderlen,
1140 				struct sk_buff *skb,
1141 				struct rt6_info *rt,
1142 				unsigned int orig_mtu)
1143 {
1144 	if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1145 		if (!skb) {
1146 			/* first fragment, reserve header_len */
1147 			*mtu = orig_mtu - rt->dst.header_len;
1148 
1149 		} else {
1150 			/*
1151 			 * this fragment is not the first; the header
1152 			 * space is regarded as data space.
1153 			 */
1154 			*mtu = orig_mtu;
1155 		}
1156 		*maxfraglen = ((*mtu - fragheaderlen) & ~7)
1157 			      + fragheaderlen - sizeof(struct frag_hdr);
1158 	}
1159 }
1160 
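/* Initialize the cork state for a corked send: duplicate the extension
 * headers, pin the route, and record the hop limit, traffic class and
 * fragment size to use for this sequence of appends.
 */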
1161 static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
1162 			  struct inet6_cork *v6_cork,
1163 			  int hlimit, int tclass, struct ipv6_txoptions *opt,
1164 			  struct rt6_info *rt, struct flowi6 *fl6)
1165 {
1166 	struct ipv6_pinfo *np = inet6_sk(sk);
1167 	unsigned int mtu;
1168 
1169 	/*
1170 	 * setup for corking
1171 	 */
1172 	if (opt) {
1173 		if (WARN_ON(v6_cork->opt))
1174 			return -EINVAL;
1175 
1176 		v6_cork->opt = kzalloc(opt->tot_len, sk->sk_allocation);
1177 		if (unlikely(!v6_cork->opt))
1178 			return -ENOBUFS;
1179 
1180 		v6_cork->opt->tot_len = opt->tot_len;
1181 		v6_cork->opt->opt_flen = opt->opt_flen;
1182 		v6_cork->opt->opt_nflen = opt->opt_nflen;
1183 
1184 		v6_cork->opt->dst0opt = ip6_opt_dup(opt->dst0opt,
1185 						    sk->sk_allocation);
1186 		if (opt->dst0opt && !v6_cork->opt->dst0opt)
1187 			return -ENOBUFS;
1188 
1189 		v6_cork->opt->dst1opt = ip6_opt_dup(opt->dst1opt,
1190 						    sk->sk_allocation);
1191 		if (opt->dst1opt && !v6_cork->opt->dst1opt)
1192 			return -ENOBUFS;
1193 
1194 		v6_cork->opt->hopopt = ip6_opt_dup(opt->hopopt,
1195 						   sk->sk_allocation);
1196 		if (opt->hopopt && !v6_cork->opt->hopopt)
1197 			return -ENOBUFS;
1198 
1199 		v6_cork->opt->srcrt = ip6_rthdr_dup(opt->srcrt,
1200 						    sk->sk_allocation);
1201 		if (opt->srcrt && !v6_cork->opt->srcrt)
1202 			return -ENOBUFS;
1203 
1204 		/* need source address above --miyazawa */
1205 	}
1206 	dst_hold(&rt->dst);
1207 	cork->base.dst = &rt->dst;
1208 	cork->fl.u.ip6 = *fl6;
1209 	v6_cork->hop_limit = hlimit;
1210 	v6_cork->tclass = tclass;
1211 	if (rt->dst.flags & DST_XFRM_TUNNEL)
1212 		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1213 		      rt->dst.dev->mtu : dst_mtu(&rt->dst);
1214 	else
1215 		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1216 		      rt->dst.dev->mtu : dst_mtu(rt->dst.path);
1217 	if (np->frag_size < mtu) {
1218 		if (np->frag_size)
1219 			mtu = np->frag_size;
1220 	}
1221 	cork->base.fragsize = mtu;
1222 	if (dst_allfrag(rt->dst.path))
1223 		cork->base.flags |= IPCORK_ALLFRAG;
1224 	cork->base.length = 0;
1225 
1226 	return 0;
1227 }
1228 
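/* Core append engine shared by ip6_append_data() and ip6_make_skb():
 * copies user data onto the queue tail, growing or allocating skbs so
 * that each one can later become a fragment of at most mtu bytes.
 */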
1229 static int __ip6_append_data(struct sock *sk,
1230 			     struct flowi6 *fl6,
1231 			     struct sk_buff_head *queue,
1232 			     struct inet_cork *cork,
1233 			     struct inet6_cork *v6_cork,
1234 			     struct page_frag *pfrag,
1235 			     int getfrag(void *from, char *to, int offset,
1236 					 int len, int odd, struct sk_buff *skb),
1237 			     void *from, int length, int transhdrlen,
1238 			     unsigned int flags, int dontfrag)
1239 {
1240 	struct sk_buff *skb, *skb_prev = NULL;
1241 	unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu;
1242 	int exthdrlen = 0;
1243 	int dst_exthdrlen = 0;
1244 	int hh_len;
1245 	int copy;
1246 	int err;
1247 	int offset = 0;
1248 	__u8 tx_flags = 0;
1249 	u32 tskey = 0;
1250 	struct rt6_info *rt = (struct rt6_info *)cork->dst;
1251 	struct ipv6_txoptions *opt = v6_cork->opt;
1252 	int csummode = CHECKSUM_NONE;
1253 
1254 	skb = skb_peek_tail(queue);
1255 	if (!skb) {
1256 		exthdrlen = opt ? opt->opt_flen : 0;
1257 		dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
1258 	}
1259 
1260 	mtu = cork->fragsize;
1261 	orig_mtu = mtu;
1262 
1263 	hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1264 
1265 	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1266 			(opt ? opt->opt_nflen : 0);
1267 	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
1268 		     sizeof(struct frag_hdr);
1269 
1270 	if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
1271 		unsigned int maxnonfragsize, headersize;
1272 
1273 		headersize = sizeof(struct ipv6hdr) +
1274 			     (opt ? opt->opt_flen + opt->opt_nflen : 0) +
1275 			     (dst_allfrag(&rt->dst) ?
1276 			      sizeof(struct frag_hdr) : 0) +
1277 			     rt->rt6i_nfheader_len;
1278 
1279 		if (ip6_sk_ignore_df(sk))
1280 			maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
1281 		else
1282 			maxnonfragsize = mtu;
1283 
1284 		/* dontfrag active */
1285 		if ((cork->length + length > mtu - headersize) && dontfrag &&
1286 		    (sk->sk_protocol == IPPROTO_UDP ||
1287 		     sk->sk_protocol == IPPROTO_RAW)) {
1288 			ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
1289 						   sizeof(struct ipv6hdr));
1290 			goto emsgsize;
1291 		}
1292 
1293 		if (cork->length + length > maxnonfragsize - headersize) {
1294 emsgsize:
1295 			ipv6_local_error(sk, EMSGSIZE, fl6,
1296 					 mtu - headersize +
1297 					 sizeof(struct ipv6hdr));
1298 			return -EMSGSIZE;
1299 		}
1300 	}
1301 
1302 	if (sk->sk_type == SOCK_DGRAM || sk->sk_type == SOCK_RAW) {
1303 		sock_tx_timestamp(sk, &tx_flags);
1304 		if (tx_flags & SKBTX_ANY_SW_TSTAMP &&
1305 		    sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
1306 			tskey = sk->sk_tskey++;
1307 	}
1308 
1309 	/* If this is the first and only packet and the device
1310 	 * supports checksum offloading, let's use it.
1311 	 * Use transhdrlen, same as IPv4, because partial
1312 	 * sums only work when transhdrlen is set.
1313 	 */
1314 	if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
1315 	    length + fragheaderlen < mtu &&
1316 	    rt->dst.dev->features & NETIF_F_V6_CSUM &&
1317 	    !exthdrlen)
1318 		csummode = CHECKSUM_PARTIAL;
1319 	/*
1320 	 * Let's try using as much space as possible.
1321 	 * Use MTU if total length of the message fits into the MTU.
1322 	 * Otherwise, we need to reserve fragment header and
1323 	 * fragment alignment (= 8-15 octets, in total).
1324 	 *
1325 	 * Note that we may need to "move" the data from the tail
1326 	 * of the buffer to the new fragment when we split
1327 	 * the message.
1328 	 *
1329 	 * FIXME: It may be fragmented into multiple chunks
1330 	 *        at once if non-fragmentable extension headers
1331 	 *        are too large.
1332 	 * --yoshfuji
1333 	 */
1334 
1335 	cork->length += length;
1336 	if (((length > mtu) ||
1337 	     (skb && skb_is_gso(skb))) &&
1338 	    (sk->sk_protocol == IPPROTO_UDP) &&
1339 	    (rt->dst.dev->features & NETIF_F_UFO) &&
1340 	    (sk->sk_type == SOCK_DGRAM)) {
1341 		err = ip6_ufo_append_data(sk, queue, getfrag, from, length,
1342 					  hh_len, fragheaderlen,
1343 					  transhdrlen, mtu, flags, fl6);
1344 		if (err)
1345 			goto error;
1346 		return 0;
1347 	}
1348 
1349 	if (!skb)
1350 		goto alloc_new_skb;
1351 
1352 	while (length > 0) {
1353 		/* Check if the remaining data fits into current packet. */
1354 		copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1355 		if (copy < length)
1356 			copy = maxfraglen - skb->len;
1357 
1358 		if (copy <= 0) {
1359 			char *data;
1360 			unsigned int datalen;
1361 			unsigned int fraglen;
1362 			unsigned int fraggap;
1363 			unsigned int alloclen;
1364 alloc_new_skb:
1365 			/* There's no room in the current skb */
1366 			if (skb)
1367 				fraggap = skb->len - maxfraglen;
1368 			else
1369 				fraggap = 0;
1370 			/* update mtu and maxfraglen if necessary */
1371 			if (!skb || !skb_prev)
1372 				ip6_append_data_mtu(&mtu, &maxfraglen,
1373 						    fragheaderlen, skb, rt,
1374 						    orig_mtu);
1375 
1376 			skb_prev = skb;
1377 
1378 			/*
1379 			 * If remaining data exceeds the mtu,
1380 			 * we know we need more fragment(s).
1381 			 */
1382 			datalen = length + fraggap;
1383 
1384 			if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1385 				datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
1386 			if ((flags & MSG_MORE) &&
1387 			    !(rt->dst.dev->features&NETIF_F_SG))
1388 				alloclen = mtu;
1389 			else
1390 				alloclen = datalen + fragheaderlen;
1391 
1392 			alloclen += dst_exthdrlen;
1393 
1394 			if (datalen != length + fraggap) {
1395 				/*
1396 				 * this is not the last fragment; the trailer
1397 				 * space is regarded as data space.
1398 				 */
1399 				datalen += rt->dst.trailer_len;
1400 			}
1401 
1402 			alloclen += rt->dst.trailer_len;
1403 			fraglen = datalen + fragheaderlen;
1404 
1405 			/*
1406 			 * We just reserve space for the fragment header.
1407 			 * Note: this may be an overallocation if the message
1408 			 * (without MSG_MORE) fits into the MTU.
1409 			 */
1410 			alloclen += sizeof(struct frag_hdr);
1411 
1412 			if (transhdrlen) {
1413 				skb = sock_alloc_send_skb(sk,
1414 						alloclen + hh_len,
1415 						(flags & MSG_DONTWAIT), &err);
1416 			} else {
1417 				skb = NULL;
1418 				if (atomic_read(&sk->sk_wmem_alloc) <=
1419 				    2 * sk->sk_sndbuf)
1420 					skb = sock_wmalloc(sk,
1421 							   alloclen + hh_len, 1,
1422 							   sk->sk_allocation);
1423 				if (unlikely(!skb))
1424 					err = -ENOBUFS;
1425 			}
1426 			if (!skb)
1427 				goto error;
1428 			/*
1429 			 *	Fill in the control structures
1430 			 */
1431 			skb->protocol = htons(ETH_P_IPV6);
1432 			skb->ip_summed = csummode;
1433 			skb->csum = 0;
1434 			/* reserve for fragmentation and ipsec header */
1435 			skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1436 				    dst_exthdrlen);
1437 
1438 			/* Only the initial fragment is time stamped */
1439 			skb_shinfo(skb)->tx_flags = tx_flags;
1440 			tx_flags = 0;
1441 			skb_shinfo(skb)->tskey = tskey;
1442 			tskey = 0;
1443 
1444 			/*
1445 			 *	Find where to start putting bytes
1446 			 */
1447 			data = skb_put(skb, fraglen);
1448 			skb_set_network_header(skb, exthdrlen);
1449 			data += fragheaderlen;
1450 			skb->transport_header = (skb->network_header +
1451 						 fragheaderlen);
1452 			if (fraggap) {
1453 				skb->csum = skb_copy_and_csum_bits(
1454 					skb_prev, maxfraglen,
1455 					data + transhdrlen, fraggap, 0);
1456 				skb_prev->csum = csum_sub(skb_prev->csum,
1457 							  skb->csum);
1458 				data += fraggap;
1459 				pskb_trim_unique(skb_prev, maxfraglen);
1460 			}
1461 			copy = datalen - transhdrlen - fraggap;
1462 
1463 			if (copy < 0) {
1464 				err = -EINVAL;
1465 				kfree_skb(skb);
1466 				goto error;
1467 			} else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
1468 				err = -EFAULT;
1469 				kfree_skb(skb);
1470 				goto error;
1471 			}
1472 
1473 			offset += copy;
1474 			length -= datalen - fraggap;
1475 			transhdrlen = 0;
1476 			exthdrlen = 0;
1477 			dst_exthdrlen = 0;
1478 
1479 			/*
1480 			 * Put the packet on the pending queue
1481 			 */
1482 			__skb_queue_tail(queue, skb);
1483 			continue;
1484 		}
1485 
1486 		if (copy > length)
1487 			copy = length;
1488 
1489 		if (!(rt->dst.dev->features&NETIF_F_SG)) {
1490 			unsigned int off;
1491 
1492 			off = skb->len;
1493 			if (getfrag(from, skb_put(skb, copy),
1494 						offset, copy, off, skb) < 0) {
1495 				__skb_trim(skb, off);
1496 				err = -EFAULT;
1497 				goto error;
1498 			}
1499 		} else {
1500 			int i = skb_shinfo(skb)->nr_frags;
1501 
1502 			err = -ENOMEM;
1503 			if (!sk_page_frag_refill(sk, pfrag))
1504 				goto error;
1505 
1506 			if (!skb_can_coalesce(skb, i, pfrag->page,
1507 					      pfrag->offset)) {
1508 				err = -EMSGSIZE;
1509 				if (i == MAX_SKB_FRAGS)
1510 					goto error;
1511 
1512 				__skb_fill_page_desc(skb, i, pfrag->page,
1513 						     pfrag->offset, 0);
1514 				skb_shinfo(skb)->nr_frags = ++i;
1515 				get_page(pfrag->page);
1516 			}
1517 			copy = min_t(int, copy, pfrag->size - pfrag->offset);
1518 			if (getfrag(from,
1519 				    page_address(pfrag->page) + pfrag->offset,
1520 				    offset, copy, skb->len, skb) < 0)
1521 				goto error_efault;
1522 
1523 			pfrag->offset += copy;
1524 			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1525 			skb->len += copy;
1526 			skb->data_len += copy;
1527 			skb->truesize += copy;
1528 			atomic_add(copy, &sk->sk_wmem_alloc);
1529 		}
1530 		offset += copy;
1531 		length -= copy;
1532 	}
1533 
1534 	return 0;
1535 
1536 error_efault:
1537 	err = -EFAULT;
1538 error:
1539 	cork->length -= length;
1540 	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1541 	return err;
1542 }
1543 
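/*
 *	ip6_append_data - append data to the pending socket write queue
 *
 *	Datagram senders (e.g. the UDPv6 and ICMPv6 sendmsg paths) call
 *	this repeatedly while corked, then either flush the queue with
 *	ip6_push_pending_frames() or discard it with
 *	ip6_flush_pending_frames() on error.
 */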
1544 int ip6_append_data(struct sock *sk,
1545 		    int getfrag(void *from, char *to, int offset, int len,
1546 				int odd, struct sk_buff *skb),
1547 		    void *from, int length, int transhdrlen, int hlimit,
1548 		    int tclass, struct ipv6_txoptions *opt, struct flowi6 *fl6,
1549 		    struct rt6_info *rt, unsigned int flags, int dontfrag)
1550 {
1551 	struct inet_sock *inet = inet_sk(sk);
1552 	struct ipv6_pinfo *np = inet6_sk(sk);
1553 	int exthdrlen;
1554 	int err;
1555 
1556 	if (flags&MSG_PROBE)
1557 		return 0;
1558 	if (skb_queue_empty(&sk->sk_write_queue)) {
1559 		/*
1560 		 * setup for corking
1561 		 */
1562 		err = ip6_setup_cork(sk, &inet->cork, &np->cork, hlimit,
1563 				     tclass, opt, rt, fl6);
1564 		if (err)
1565 			return err;
1566 
1567 		exthdrlen = (opt ? opt->opt_flen : 0);
1568 		length += exthdrlen;
1569 		transhdrlen += exthdrlen;
1570 	} else {
1571 		fl6 = &inet->cork.fl.u.ip6;
1572 		transhdrlen = 0;
1573 	}
1574 
1575 	return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base,
1576 				 &np->cork, sk_page_frag(sk), getfrag,
1577 				 from, length, transhdrlen, flags, dontfrag);
1578 }
1579 EXPORT_SYMBOL_GPL(ip6_append_data);
1580 
1581 static void ip6_cork_release(struct inet_cork_full *cork,
1582 			     struct inet6_cork *v6_cork)
1583 {
1584 	if (v6_cork->opt) {
1585 		kfree(v6_cork->opt->dst0opt);
1586 		kfree(v6_cork->opt->dst1opt);
1587 		kfree(v6_cork->opt->hopopt);
1588 		kfree(v6_cork->opt->srcrt);
1589 		kfree(v6_cork->opt);
1590 		v6_cork->opt = NULL;
1591 	}
1592 
1593 	if (cork->base.dst) {
1594 		dst_release(cork->base.dst);
1595 		cork->base.dst = NULL;
1596 		cork->base.flags &= ~IPCORK_ALLFRAG;
1597 	}
1598 	memset(&cork->fl, 0, sizeof(cork->fl));
1599 }
1600 
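/* Collapse the queued skbs into a single packet (tail skbs become the
 * frag_list), push the extension headers and the IPv6 header, and
 * release the cork. The caller still has to send the resulting skb.
 */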
1601 struct sk_buff *__ip6_make_skb(struct sock *sk,
1602 			       struct sk_buff_head *queue,
1603 			       struct inet_cork_full *cork,
1604 			       struct inet6_cork *v6_cork)
1605 {
1606 	struct sk_buff *skb, *tmp_skb;
1607 	struct sk_buff **tail_skb;
1608 	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1609 	struct ipv6_pinfo *np = inet6_sk(sk);
1610 	struct net *net = sock_net(sk);
1611 	struct ipv6hdr *hdr;
1612 	struct ipv6_txoptions *opt = v6_cork->opt;
1613 	struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
1614 	struct flowi6 *fl6 = &cork->fl.u.ip6;
1615 	unsigned char proto = fl6->flowi6_proto;
1616 
1617 	skb = __skb_dequeue(queue);
1618 	if (!skb)
1619 		goto out;
1620 	tail_skb = &(skb_shinfo(skb)->frag_list);
1621 
1622 	/* move skb->data to ip header from ext header */
1623 	if (skb->data < skb_network_header(skb))
1624 		__skb_pull(skb, skb_network_offset(skb));
1625 	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
1626 		__skb_pull(tmp_skb, skb_network_header_len(skb));
1627 		*tail_skb = tmp_skb;
1628 		tail_skb = &(tmp_skb->next);
1629 		skb->len += tmp_skb->len;
1630 		skb->data_len += tmp_skb->len;
1631 		skb->truesize += tmp_skb->truesize;
1632 		tmp_skb->destructor = NULL;
1633 		tmp_skb->sk = NULL;
1634 	}
1635 
1636 	/* Allow local fragmentation. */
1637 	skb->ignore_df = ip6_sk_ignore_df(sk);
1638 
1639 	*final_dst = fl6->daddr;
1640 	__skb_pull(skb, skb_network_header_len(skb));
1641 	if (opt && opt->opt_flen)
1642 		ipv6_push_frag_opts(skb, opt, &proto);
1643 	if (opt && opt->opt_nflen)
1644 		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);
1645 
1646 	skb_push(skb, sizeof(struct ipv6hdr));
1647 	skb_reset_network_header(skb);
1648 	hdr = ipv6_hdr(skb);
1649 
1650 	ip6_flow_hdr(hdr, v6_cork->tclass,
1651 		     ip6_make_flowlabel(net, skb, fl6->flowlabel,
1652 					np->autoflowlabel, fl6));
1653 	hdr->hop_limit = v6_cork->hop_limit;
1654 	hdr->nexthdr = proto;
1655 	hdr->saddr = fl6->saddr;
1656 	hdr->daddr = *final_dst;
1657 
1658 	skb->priority = sk->sk_priority;
1659 	skb->mark = sk->sk_mark;
1660 
1661 	skb_dst_set(skb, dst_clone(&rt->dst));
1662 	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1663 	if (proto == IPPROTO_ICMPV6) {
1664 		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1665 
1666 		ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
1667 		ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
1668 	}
1669 
1670 	ip6_cork_release(cork, v6_cork);
1671 out:
1672 	return skb;
1673 }
1674 
1675 int ip6_send_skb(struct sk_buff *skb)
1676 {
1677 	struct net *net = sock_net(skb->sk);
1678 	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
1679 	int err;
1680 
1681 	err = ip6_local_out(skb);
1682 	if (err) {
1683 		if (err > 0)
1684 			err = net_xmit_errno(err);
1685 		if (err)
1686 			IP6_INC_STATS(net, rt->rt6i_idev,
1687 				      IPSTATS_MIB_OUTDISCARDS);
1688 	}
1689 
1690 	return err;
1691 }
1692 
1693 int ip6_push_pending_frames(struct sock *sk)
1694 {
1695 	struct sk_buff *skb;
1696 
1697 	skb = ip6_finish_skb(sk);
1698 	if (!skb)
1699 		return 0;
1700 
1701 	return ip6_send_skb(skb);
1702 }
1703 EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
1704 
1705 static void __ip6_flush_pending_frames(struct sock *sk,
1706 				       struct sk_buff_head *queue,
1707 				       struct inet_cork_full *cork,
1708 				       struct inet6_cork *v6_cork)
1709 {
1710 	struct sk_buff *skb;
1711 
1712 	while ((skb = __skb_dequeue_tail(queue)) != NULL) {
1713 		if (skb_dst(skb))
1714 			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1715 				      IPSTATS_MIB_OUTDISCARDS);
1716 		kfree_skb(skb);
1717 	}
1718 
1719 	ip6_cork_release(cork, v6_cork);
1720 }
1721 
1722 void ip6_flush_pending_frames(struct sock *sk)
1723 {
1724 	__ip6_flush_pending_frames(sk, &sk->sk_write_queue,
1725 				   &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
1726 }
1727 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
1728 
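/* Non-corked variant: build the whole datagram on a private queue and
 * return the finished skb (or an ERR_PTR), without touching
 * sk->sk_write_queue.
 */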
1729 struct sk_buff *ip6_make_skb(struct sock *sk,
1730 			     int getfrag(void *from, char *to, int offset,
1731 					 int len, int odd, struct sk_buff *skb),
1732 			     void *from, int length, int transhdrlen,
1733 			     int hlimit, int tclass,
1734 			     struct ipv6_txoptions *opt, struct flowi6 *fl6,
1735 			     struct rt6_info *rt, unsigned int flags,
1736 			     int dontfrag)
1737 {
1738 	struct inet_cork_full cork;
1739 	struct inet6_cork v6_cork;
1740 	struct sk_buff_head queue;
1741 	int exthdrlen = (opt ? opt->opt_flen : 0);
1742 	int err;
1743 
1744 	if (flags & MSG_PROBE)
1745 		return NULL;
1746 
1747 	__skb_queue_head_init(&queue);
1748 
1749 	cork.base.flags = 0;
1750 	cork.base.addr = 0;
1751 	cork.base.opt = NULL;
1752 	v6_cork.opt = NULL;
1753 	err = ip6_setup_cork(sk, &cork, &v6_cork, hlimit, tclass, opt, rt, fl6);
1754 	if (err)
1755 		return ERR_PTR(err);
1756 
1757 	if (dontfrag < 0)
1758 		dontfrag = inet6_sk(sk)->dontfrag;
1759 
1760 	err = __ip6_append_data(sk, fl6, &queue, &cork.base, &v6_cork,
1761 				&current->task_frag, getfrag, from,
1762 				length + exthdrlen, transhdrlen + exthdrlen,
1763 				flags, dontfrag);
1764 	if (err) {
1765 		__ip6_flush_pending_frames(sk, &queue, &cork, &v6_cork);
1766 		return ERR_PTR(err);
1767 	}
1768 
1769 	return __ip6_make_skb(sk, &queue, &cork, &v6_cork);
1770 }
1771