xref: /openbmc/linux/net/ipv6/ip6_output.c (revision 95e9fd10)
1 /*
2  *	IPv6 output functions
3  *	Linux INET6 implementation
4  *
5  *	Authors:
6  *	Pedro Roque		<roque@di.fc.ul.pt>
7  *
8  *	Based on linux/net/ipv4/ip_output.c
9  *
10  *	This program is free software; you can redistribute it and/or
11  *      modify it under the terms of the GNU General Public License
12  *      as published by the Free Software Foundation; either version
13  *      2 of the License, or (at your option) any later version.
14  *
15  *	Changes:
16  *	A.N.Kuznetsov	:	arithmetic in fragmentation.
17  *				extension headers are implemented.
18  *				route changes now work.
19  *				ip6_forward does not confuse sniffers.
20  *				etc.
21  *
22  *      H. von Brand    :       Added missing #include <linux/string.h>
23  *	Imran Patel	: 	frag id should be in NBO
24  *      Kazunori MIYAZAWA @USAGI
25  *			:       add ip6_append_data and related functions
26  *				for datagram xmit
27  */
28 
29 #include <linux/errno.h>
30 #include <linux/kernel.h>
31 #include <linux/string.h>
32 #include <linux/socket.h>
33 #include <linux/net.h>
34 #include <linux/netdevice.h>
35 #include <linux/if_arp.h>
36 #include <linux/in6.h>
37 #include <linux/tcp.h>
38 #include <linux/route.h>
39 #include <linux/module.h>
40 #include <linux/slab.h>
41 
42 #include <linux/netfilter.h>
43 #include <linux/netfilter_ipv6.h>
44 
45 #include <net/sock.h>
46 #include <net/snmp.h>
47 
48 #include <net/ipv6.h>
49 #include <net/ndisc.h>
50 #include <net/protocol.h>
51 #include <net/ip6_route.h>
52 #include <net/addrconf.h>
53 #include <net/rawv6.h>
54 #include <net/icmp.h>
55 #include <net/xfrm.h>
56 #include <net/checksum.h>
57 #include <linux/mroute6.h>
58 
59 int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));
60 
61 int __ip6_local_out(struct sk_buff *skb)
62 {
63 	int len;
64 
65 	len = skb->len - sizeof(struct ipv6hdr);
66 	if (len > IPV6_MAXPLEN)
67 		len = 0;
68 	ipv6_hdr(skb)->payload_len = htons(len);
69 
70 	return nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
71 		       skb_dst(skb)->dev, dst_output);
72 }
73 
74 int ip6_local_out(struct sk_buff *skb)
75 {
76 	int err;
77 
78 	err = __ip6_local_out(skb);
79 	if (likely(err == 1))
80 		err = dst_output(skb);
81 
82 	return err;
83 }
84 EXPORT_SYMBOL_GPL(ip6_local_out);
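
/*
 * Note on the two helpers above: __ip6_local_out() returns the
 * netfilter verdict, and a return value of 1 means "accepted", so
 * ip6_local_out() only invokes dst_output() itself in that case.
 * The payload_len of 0 written for oversized packets follows the
 * jumbogram convention of RFC 2675, where a zero Payload Length tells
 * receivers to look for a Jumbo Payload hop-by-hop option.
 */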
85 
86 static int ip6_finish_output2(struct sk_buff *skb)
87 {
88 	struct dst_entry *dst = skb_dst(skb);
89 	struct net_device *dev = dst->dev;
90 	struct neighbour *neigh;
91 	struct rt6_info *rt;
92 
93 	skb->protocol = htons(ETH_P_IPV6);
94 	skb->dev = dev;
95 
96 	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
97 		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
98 
99 		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(skb->sk) &&
100 		    ((mroute6_socket(dev_net(dev), skb) &&
101 		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
102 		     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
103 					 &ipv6_hdr(skb)->saddr))) {
104 			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
105 
106 			/* Do not check for IFF_ALLMULTI; multicast routing
107 			   is not supported in any case.
108 			 */
109 			if (newskb)
110 				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
111 					newskb, NULL, newskb->dev,
112 					dev_loopback_xmit);
113 
114 			if (ipv6_hdr(skb)->hop_limit == 0) {
115 				IP6_INC_STATS(dev_net(dev), idev,
116 					      IPSTATS_MIB_OUTDISCARDS);
117 				kfree_skb(skb);
118 				return 0;
119 			}
120 		}
121 
122 		IP6_UPD_PO_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCAST,
123 				skb->len);
124 	}
125 
126 	rcu_read_lock();
127 	rt = (struct rt6_info *) dst;
128 	neigh = rt->n;
129 	if (neigh) {
130 		int res = dst_neigh_output(dst, neigh, skb);
131 
132 		rcu_read_unlock();
133 		return res;
134 	}
135 	rcu_read_unlock();
136 	IP6_INC_STATS_BH(dev_net(dst->dev),
137 			 ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
138 	kfree_skb(skb);
139 	return -EINVAL;
140 }
141 
142 static int ip6_finish_output(struct sk_buff *skb)
143 {
144 	if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
145 	    dst_allfrag(skb_dst(skb)))
146 		return ip6_fragment(skb, ip6_finish_output2);
147 	else
148 		return ip6_finish_output2(skb);
149 }
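
/*
 * Fragmentation is needed in two cases: the packet exceeds the path
 * MTU and is not a GSO packet (GSO packets are segmented later, by the
 * device or by software GSO), or the destination requires a fragment
 * header on every packet (dst_allfrag(), used when the peer advertised
 * an MTU below IPV6_MIN_MTU; see RFC 2460, section 5).
 */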
150 
151 int ip6_output(struct sk_buff *skb)
152 {
153 	struct net_device *dev = skb_dst(skb)->dev;
154 	struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
155 	if (unlikely(idev->cnf.disable_ipv6)) {
156 		IP6_INC_STATS(dev_net(dev), idev,
157 			      IPSTATS_MIB_OUTDISCARDS);
158 		kfree_skb(skb);
159 		return 0;
160 	}
161 
162 	return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, skb, NULL, dev,
163 			    ip6_finish_output,
164 			    !(IP6CB(skb)->flags & IP6SKB_REROUTED));
165 }
166 
167 /*
168  *	xmit an sk_buff (used by TCP, SCTP and DCCP)
169  */
170 
171 int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
172 	     struct ipv6_txoptions *opt, int tclass)
173 {
174 	struct net *net = sock_net(sk);
175 	struct ipv6_pinfo *np = inet6_sk(sk);
176 	struct in6_addr *first_hop = &fl6->daddr;
177 	struct dst_entry *dst = skb_dst(skb);
178 	struct ipv6hdr *hdr;
179 	u8  proto = fl6->flowi6_proto;
180 	int seg_len = skb->len;
181 	int hlimit = -1;
182 	u32 mtu;
183 
184 	if (opt) {
185 		unsigned int head_room;
186 
187 		/* First: exthdrs may take lots of space (~8K for now);
188 		   MAX_HEADER is not enough.
189 		 */
190 		head_room = opt->opt_nflen + opt->opt_flen;
191 		seg_len += head_room;
192 		head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
193 
194 		if (skb_headroom(skb) < head_room) {
195 			struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
196 			if (skb2 == NULL) {
197 				IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
198 					      IPSTATS_MIB_OUTDISCARDS);
199 				kfree_skb(skb);
200 				return -ENOBUFS;
201 			}
202 			consume_skb(skb);
203 			skb = skb2;
204 			skb_set_owner_w(skb, sk);
205 		}
206 		if (opt->opt_flen)
207 			ipv6_push_frag_opts(skb, opt, &proto);
208 		if (opt->opt_nflen)
209 			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
210 	}
211 
212 	skb_push(skb, sizeof(struct ipv6hdr));
213 	skb_reset_network_header(skb);
214 	hdr = ipv6_hdr(skb);
215 
216 	/*
217 	 *	Fill in the IPv6 header
218 	 */
219 	if (np)
220 		hlimit = np->hop_limit;
221 	if (hlimit < 0)
222 		hlimit = ip6_dst_hoplimit(dst);
223 
224 	*(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl6->flowlabel;
225 
226 	hdr->payload_len = htons(seg_len);
227 	hdr->nexthdr = proto;
228 	hdr->hop_limit = hlimit;
229 
230 	hdr->saddr = fl6->saddr;
231 	hdr->daddr = *first_hop;
232 
233 	skb->priority = sk->sk_priority;
234 	skb->mark = sk->sk_mark;
235 
236 	mtu = dst_mtu(dst);
237 	if ((skb->len <= mtu) || skb->local_df || skb_is_gso(skb)) {
238 		IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
239 			      IPSTATS_MIB_OUT, skb->len);
240 		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
241 			       dst->dev, dst_output);
242 	}
243 
244 	net_dbg_ratelimited("IPv6: sending pkt_too_big to self\n");
245 	skb->dev = dst->dev;
246 	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
247 	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
248 	kfree_skb(skb);
249 	return -EMSGSIZE;
250 }
251 
252 EXPORT_SYMBOL(ip6_xmit);
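
/*
 * Illustrative sketch (not part of this file): how a connection-
 * oriented caller might drive ip6_xmit().  The flowi6 is filled from
 * the socket, and ip6_xmit() prepends the IPv6 header (reallocating
 * headroom for extension headers if needed) before handing the skb to
 * the NF_INET_LOCAL_OUT hook.  All field choices below are assumptions
 * for illustration only.
 */
#if 0	/* example only, not compiled */
static int example_xmit(struct sock *sk, struct sk_buff *skb)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct flowi6 fl6;

	memset(&fl6, 0, sizeof(fl6));
	fl6.flowi6_proto = sk->sk_protocol;
	fl6.daddr = np->daddr;	/* assumes a connected socket */
	fl6.saddr = np->saddr;
	fl6.flowlabel = np->flow_label;
	fl6.flowi6_oif = sk->sk_bound_dev_if;

	/* returns -ENOBUFS on headroom failure, -EMSGSIZE if too big */
	return ip6_xmit(sk, skb, &fl6, np->opt, np->tclass);
}
#endif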
253 
254 /*
255  *	To avoid extra problems, ND packets are sent through this
256  *	routine. It's code duplication, but I really want to avoid
257  *	extra checks, since ipv6_build_header is used by TCP (which
258  *	is performance critical for us).
259  */
260 
261 int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
262 	       const struct in6_addr *saddr, const struct in6_addr *daddr,
263 	       int proto, int len)
264 {
265 	struct ipv6_pinfo *np = inet6_sk(sk);
266 	struct ipv6hdr *hdr;
267 
268 	skb->protocol = htons(ETH_P_IPV6);
269 	skb->dev = dev;
270 
271 	skb_reset_network_header(skb);
272 	skb_put(skb, sizeof(struct ipv6hdr));
273 	hdr = ipv6_hdr(skb);
274 
275 	*(__be32 *)hdr = htonl(0x60000000);
276 
277 	hdr->payload_len = htons(len);
278 	hdr->nexthdr = proto;
279 	hdr->hop_limit = np->hop_limit;
280 
281 	hdr->saddr = *saddr;
282 	hdr->daddr = *daddr;
283 
284 	return 0;
285 }
286 
287 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
288 {
289 	struct ip6_ra_chain *ra;
290 	struct sock *last = NULL;
291 
292 	read_lock(&ip6_ra_lock);
293 	for (ra = ip6_ra_chain; ra; ra = ra->next) {
294 		struct sock *sk = ra->sk;
295 		if (sk && ra->sel == sel &&
296 		    (!sk->sk_bound_dev_if ||
297 		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
298 			if (last) {
299 				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
300 				if (skb2)
301 					rawv6_rcv(last, skb2);
302 			}
303 			last = sk;
304 		}
305 	}
306 
307 	if (last) {
308 		rawv6_rcv(last, skb);
309 		read_unlock(&ip6_ra_lock);
310 		return 1;
311 	}
312 	read_unlock(&ip6_ra_lock);
313 	return 0;
314 }
315 
316 static int ip6_forward_proxy_check(struct sk_buff *skb)
317 {
318 	struct ipv6hdr *hdr = ipv6_hdr(skb);
319 	u8 nexthdr = hdr->nexthdr;
320 	__be16 frag_off;
321 	int offset;
322 
323 	if (ipv6_ext_hdr(nexthdr)) {
324 		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
325 		if (offset < 0)
326 			return 0;
327 	} else
328 		offset = sizeof(struct ipv6hdr);
329 
330 	if (nexthdr == IPPROTO_ICMPV6) {
331 		struct icmp6hdr *icmp6;
332 
333 		if (!pskb_may_pull(skb, (skb_network_header(skb) +
334 					 offset + 1 - skb->data)))
335 			return 0;
336 
337 		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
338 
339 		switch (icmp6->icmp6_type) {
340 		case NDISC_ROUTER_SOLICITATION:
341 		case NDISC_ROUTER_ADVERTISEMENT:
342 		case NDISC_NEIGHBOUR_SOLICITATION:
343 		case NDISC_NEIGHBOUR_ADVERTISEMENT:
344 		case NDISC_REDIRECT:
345 			/* For reactions involving unicast neighbor discovery
346 			 * messages destined to the proxied address, pass them
347 			 * to the input function.
348 			 */
349 			return 1;
350 		default:
351 			break;
352 		}
353 	}
354 
355 	/*
356 	 * The proxying router can't forward traffic sent to a link-local
357 	 * address, so signal the sender and discard the packet. This
358 	 * behavior is clarified by the MIPv6 specification.
359 	 */
360 	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
361 		dst_link_failure(skb);
362 		return -1;
363 	}
364 
365 	return 0;
366 }
367 
368 static inline int ip6_forward_finish(struct sk_buff *skb)
369 {
370 	return dst_output(skb);
371 }
372 
373 int ip6_forward(struct sk_buff *skb)
374 {
375 	struct dst_entry *dst = skb_dst(skb);
376 	struct ipv6hdr *hdr = ipv6_hdr(skb);
377 	struct inet6_skb_parm *opt = IP6CB(skb);
378 	struct net *net = dev_net(dst->dev);
379 	u32 mtu;
380 
381 	if (net->ipv6.devconf_all->forwarding == 0)
382 		goto error;
383 
384 	if (skb_warn_if_lro(skb))
385 		goto drop;
386 
387 	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
388 		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
389 		goto drop;
390 	}
391 
392 	if (skb->pkt_type != PACKET_HOST)
393 		goto drop;
394 
395 	skb_forward_csum(skb);
396 
397 	/*
398 	 *	We do NOT do any processing on RA packets,
399 	 *	pushing them up to user level AS IS, without any
400 	 *	guarantee that the application will be able to
401 	 *	interpret them. The reason is that we
402 	 *	cannot do anything clever here.
403 	 *
404 	 *	We are not the end node, so if the packet contains
405 	 *	AH/ESP we cannot do anything with it.
406 	 *	Defragmentation would also be a mistake: RA packets
407 	 *	cannot be fragmented, because there is no guarantee
408 	 *	that different fragments will follow one path. --ANK
409 	 */
410 	if (opt->ra) {
411 		u8 *ptr = skb_network_header(skb) + opt->ra;
412 		if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
413 			return 0;
414 	}
415 
416 	/*
417 	 *	check and decrement ttl
418 	 */
419 	if (hdr->hop_limit <= 1) {
420 		/* Force OUTPUT device used as source address */
421 		skb->dev = dst->dev;
422 		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
423 		IP6_INC_STATS_BH(net,
424 				 ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);
425 
426 		kfree_skb(skb);
427 		return -ETIMEDOUT;
428 	}
429 
430 	/* XXX: idev->cnf.proxy_ndp? */
431 	if (net->ipv6.devconf_all->proxy_ndp &&
432 	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
433 		int proxied = ip6_forward_proxy_check(skb);
434 		if (proxied > 0)
435 			return ip6_input(skb);
436 		else if (proxied < 0) {
437 			IP6_INC_STATS(net, ip6_dst_idev(dst),
438 				      IPSTATS_MIB_INDISCARDS);
439 			goto drop;
440 		}
441 	}
442 
443 	if (!xfrm6_route_forward(skb)) {
444 		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
445 		goto drop;
446 	}
447 	dst = skb_dst(skb);
448 
449 	/* The IPv6 specs say nothing about it, but it is clear that we cannot
450 	   send redirects for source-routed frames.
451 	   We also don't send redirects for frames decapsulated from IPsec.
452 	 */
453 	if (skb->dev == dst->dev && opt->srcrt == 0 && !skb_sec_path(skb)) {
454 		struct in6_addr *target = NULL;
455 		struct inet_peer *peer;
456 		struct rt6_info *rt;
457 
458 		/*
459 		 *	incoming and outgoing devices are the same;
460 		 *	send a redirect.
461 		 */
462 
463 		rt = (struct rt6_info *) dst;
464 		if (rt->rt6i_flags & RTF_GATEWAY)
465 			target = &rt->rt6i_gateway;
466 		else
467 			target = &hdr->daddr;
468 
469 		peer = inet_getpeer_v6(net->ipv6.peers, &rt->rt6i_dst.addr, 1);
470 
471 		/* Limit redirects both by destination (here)
472 		   and by source (inside ndisc_send_redirect)
473 		 */
474 		if (inet_peer_xrlim_allow(peer, 1*HZ))
475 			ndisc_send_redirect(skb, target);
476 		if (peer)
477 			inet_putpeer(peer);
478 	} else {
479 		int addrtype = ipv6_addr_type(&hdr->saddr);
480 
481 		/* This check is security critical. */
482 		if (addrtype == IPV6_ADDR_ANY ||
483 		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
484 			goto error;
485 		if (addrtype & IPV6_ADDR_LINKLOCAL) {
486 			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
487 				    ICMPV6_NOT_NEIGHBOUR, 0);
488 			goto error;
489 		}
490 	}
491 
492 	mtu = dst_mtu(dst);
493 	if (mtu < IPV6_MIN_MTU)
494 		mtu = IPV6_MIN_MTU;
495 
496 	if (skb->len > mtu && !skb_is_gso(skb)) {
497 		/* Again, force OUTPUT device used as source address */
498 		skb->dev = dst->dev;
499 		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
500 		IP6_INC_STATS_BH(net,
501 				 ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS);
502 		IP6_INC_STATS_BH(net,
503 				 ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS);
504 		kfree_skb(skb);
505 		return -EMSGSIZE;
506 	}
507 
508 	if (skb_cow(skb, dst->dev->hard_header_len)) {
509 		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
510 		goto drop;
511 	}
512 
513 	hdr = ipv6_hdr(skb);
514 
515 	/* Decrementing the hop limit is delayed until after the skb COW */
516 
517 	hdr->hop_limit--;
518 
519 	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
520 	IP6_ADD_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
521 	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
522 		       ip6_forward_finish);
523 
524 error:
525 	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
526 drop:
527 	kfree_skb(skb);
528 	return -EINVAL;
529 }
530 
531 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
532 {
533 	to->pkt_type = from->pkt_type;
534 	to->priority = from->priority;
535 	to->protocol = from->protocol;
536 	skb_dst_drop(to);
537 	skb_dst_set(to, dst_clone(skb_dst(from)));
538 	to->dev = from->dev;
539 	to->mark = from->mark;
540 
541 #ifdef CONFIG_NET_SCHED
542 	to->tc_index = from->tc_index;
543 #endif
544 	nf_copy(to, from);
545 #if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
546     defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
547 	to->nf_trace = from->nf_trace;
548 #endif
549 	skb_copy_secmark(to, from);
550 }
551 
552 int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
553 {
554 	u16 offset = sizeof(struct ipv6hdr);
555 	struct ipv6_opt_hdr *exthdr =
556 				(struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1);
557 	unsigned int packet_len = skb->tail - skb->network_header;
558 	int found_rhdr = 0;
559 	*nexthdr = &ipv6_hdr(skb)->nexthdr;
560 
561 	while (offset + 1 <= packet_len) {
562 
563 		switch (**nexthdr) {
564 
565 		case NEXTHDR_HOP:
566 			break;
567 		case NEXTHDR_ROUTING:
568 			found_rhdr = 1;
569 			break;
570 		case NEXTHDR_DEST:
571 #if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
572 			if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0)
573 				break;
574 #endif
575 			if (found_rhdr)
576 				return offset;
577 			break;
578 		default:
579 			return offset;
580 		}
581 
582 		offset += ipv6_optlen(exthdr);
583 		*nexthdr = &exthdr->nexthdr;
584 		exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) +
585 						 offset);
586 	}
587 
588 	return offset;
589 }
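
/*
 * Example of the walk above, assuming a header chain of
 *	IPv6 | Hop-by-Hop | Routing | Destination | TCP
 * Hop-by-Hop and Routing belong to the unfragmentable part.  Once a
 * routing header has been seen, the next Destination options header
 * starts the fragmentable part, so its offset is returned and the
 * fragment header will be inserted in front of it.  A Destination
 * options header seen before any routing header is walked past,
 * because it may be the one that precedes (and travels with) a
 * routing header in the unfragmentable part.
 */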
590 
591 void ipv6_select_ident(struct frag_hdr *fhdr, struct rt6_info *rt)
592 {
593 	static atomic_t ipv6_fragmentation_id;
594 	int old, new;
595 
596 	if (rt && !(rt->dst.flags & DST_NOPEER)) {
597 		struct inet_peer *peer;
598 		struct net *net;
599 
600 		net = dev_net(rt->dst.dev);
601 		peer = inet_getpeer_v6(net->ipv6.peers, &rt->rt6i_dst.addr, 1);
602 		if (peer) {
603 			fhdr->identification = htonl(inet_getid(peer, 0));
604 			inet_putpeer(peer);
605 			return;
606 		}
607 	}
608 	do {
609 		old = atomic_read(&ipv6_fragmentation_id);
610 		new = old + 1;
611 		if (!new)
612 			new = 1;
613 	} while (atomic_cmpxchg(&ipv6_fragmentation_id, old, new) != old);
614 	fhdr->identification = htonl(new);
615 }
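
/*
 * The cmpxchg loop above never yields an identification of 0.  That
 * matters for the slow path of ip6_fragment() below, which uses
 * frag_id == 0 to mean "no identification chosen yet" when deciding
 * whether to call ipv6_select_ident() for the first fragment.
 */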
616 
617 int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
618 {
619 	struct sk_buff *frag;
620 	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
621 	struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
622 	struct ipv6hdr *tmp_hdr;
623 	struct frag_hdr *fh;
624 	unsigned int mtu, hlen, left, len;
625 	int hroom, troom;
626 	__be32 frag_id = 0;
627 	int ptr, offset = 0, err = 0;
628 	u8 *prevhdr, nexthdr = 0;
629 	struct net *net = dev_net(skb_dst(skb)->dev);
630 
631 	hlen = ip6_find_1stfragopt(skb, &prevhdr);
632 	nexthdr = *prevhdr;
633 
634 	mtu = ip6_skb_dst_mtu(skb);
635 
636 	/* We must not fragment if the socket is set to force MTU discovery
637 	 * or if the skb is not generated by a local socket.
638 	 */
639 	if (unlikely(!skb->local_df && skb->len > mtu)) {
640 		if (skb->sk && dst_allfrag(skb_dst(skb)))
641 			sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);
642 
643 		skb->dev = skb_dst(skb)->dev;
644 		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
645 		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
646 			      IPSTATS_MIB_FRAGFAILS);
647 		kfree_skb(skb);
648 		return -EMSGSIZE;
649 	}
650 
651 	if (np && np->frag_size < mtu) {
652 		if (np->frag_size)
653 			mtu = np->frag_size;
654 	}
655 	mtu -= hlen + sizeof(struct frag_hdr);
656 
657 	if (skb_has_frag_list(skb)) {
658 		int first_len = skb_pagelen(skb);
659 		struct sk_buff *frag2;
660 
661 		if (first_len - hlen > mtu ||
662 		    ((first_len - hlen) & 7) ||
663 		    skb_cloned(skb))
664 			goto slow_path;
665 
666 		skb_walk_frags(skb, frag) {
667 			/* Correct geometry. */
668 			if (frag->len > mtu ||
669 			    ((frag->len & 7) && frag->next) ||
670 			    skb_headroom(frag) < hlen)
671 				goto slow_path_clean;
672 
673 			/* Partially cloned skb? */
674 			if (skb_shared(frag))
675 				goto slow_path_clean;
676 
677 			BUG_ON(frag->sk);
678 			if (skb->sk) {
679 				frag->sk = skb->sk;
680 				frag->destructor = sock_wfree;
681 			}
682 			skb->truesize -= frag->truesize;
683 		}
684 
685 		err = 0;
686 		offset = 0;
687 		frag = skb_shinfo(skb)->frag_list;
688 		skb_frag_list_init(skb);
689 		/* BUILD HEADER */
690 
691 		*prevhdr = NEXTHDR_FRAGMENT;
692 		tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
693 		if (!tmp_hdr) {
694 			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
695 				      IPSTATS_MIB_FRAGFAILS);
696 			return -ENOMEM;
697 		}
698 
699 		__skb_pull(skb, hlen);
700 		fh = (struct frag_hdr *)__skb_push(skb, sizeof(struct frag_hdr));
701 		__skb_push(skb, hlen);
702 		skb_reset_network_header(skb);
703 		memcpy(skb_network_header(skb), tmp_hdr, hlen);
704 
705 		ipv6_select_ident(fh, rt);
706 		fh->nexthdr = nexthdr;
707 		fh->reserved = 0;
708 		fh->frag_off = htons(IP6_MF);
709 		frag_id = fh->identification;
710 
711 		first_len = skb_pagelen(skb);
712 		skb->data_len = first_len - skb_headlen(skb);
713 		skb->len = first_len;
714 		ipv6_hdr(skb)->payload_len = htons(first_len -
715 						   sizeof(struct ipv6hdr));
716 
717 		dst_hold(&rt->dst);
718 
719 		for (;;) {
720 			/* Prepare header of the next frame,
721 			 * before the previous one goes down. */
722 			if (frag) {
723 				frag->ip_summed = CHECKSUM_NONE;
724 				skb_reset_transport_header(frag);
725 			fh = (struct frag_hdr *)__skb_push(frag, sizeof(struct frag_hdr));
726 				__skb_push(frag, hlen);
727 				skb_reset_network_header(frag);
728 				memcpy(skb_network_header(frag), tmp_hdr,
729 				       hlen);
730 				offset += skb->len - hlen - sizeof(struct frag_hdr);
731 				fh->nexthdr = nexthdr;
732 				fh->reserved = 0;
733 				fh->frag_off = htons(offset);
734 				if (frag->next != NULL)
735 					fh->frag_off |= htons(IP6_MF);
736 				fh->identification = frag_id;
737 				ipv6_hdr(frag)->payload_len =
738 						htons(frag->len -
739 						      sizeof(struct ipv6hdr));
740 				ip6_copy_metadata(frag, skb);
741 			}
742 
743 			err = output(skb);
744 			if (!err)
745 				IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
746 					      IPSTATS_MIB_FRAGCREATES);
747 
748 			if (err || !frag)
749 				break;
750 
751 			skb = frag;
752 			frag = skb->next;
753 			skb->next = NULL;
754 		}
755 
756 		kfree(tmp_hdr);
757 
758 		if (err == 0) {
759 			IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
760 				      IPSTATS_MIB_FRAGOKS);
761 			dst_release(&rt->dst);
762 			return 0;
763 		}
764 
765 		while (frag) {
766 			skb = frag->next;
767 			kfree_skb(frag);
768 			frag = skb;
769 		}
770 
771 		IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
772 			      IPSTATS_MIB_FRAGFAILS);
773 		dst_release(&rt->dst);
774 		return err;
775 
776 slow_path_clean:
777 		skb_walk_frags(skb, frag2) {
778 			if (frag2 == frag)
779 				break;
780 			frag2->sk = NULL;
781 			frag2->destructor = NULL;
782 			skb->truesize += frag2->truesize;
783 		}
784 	}
785 
786 slow_path:
787 	if ((skb->ip_summed == CHECKSUM_PARTIAL) &&
788 	    skb_checksum_help(skb))
789 		goto fail;
790 
791 	left = skb->len - hlen;		/* Space per frame */
792 	ptr = hlen;			/* Where to start from */
793 
794 	/*
795 	 *	Fragment the datagram.
796 	 */
797 
798 	*prevhdr = NEXTHDR_FRAGMENT;
799 	hroom = LL_RESERVED_SPACE(rt->dst.dev);
800 	troom = rt->dst.dev->needed_tailroom;
801 
802 	/*
803 	 *	Keep copying data until we run out.
804 	 */
805 	while (left > 0) {
806 		len = left;
807 		/* IF: it doesn't fit, use 'mtu' - the data space left */
808 		if (len > mtu)
809 			len = mtu;
810 		/* IF: we are not sending up to and including the packet end
811 		   then align the next start on an eight byte boundary */
812 		if (len < left)	{
813 			len &= ~7;
814 		}
815 		/*
816 		 *	Allocate buffer.
817 		 */
818 
819 		if ((frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
820 				      hroom + troom, GFP_ATOMIC)) == NULL) {
821 			NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
822 			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
823 				      IPSTATS_MIB_FRAGFAILS);
824 			err = -ENOMEM;
825 			goto fail;
826 		}
827 
828 		/*
829 		 *	Set up data on packet
830 		 */
831 
832 		ip6_copy_metadata(frag, skb);
833 		skb_reserve(frag, hroom);
834 		skb_put(frag, len + hlen + sizeof(struct frag_hdr));
835 		skb_reset_network_header(frag);
836 		fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
837 		frag->transport_header = (frag->network_header + hlen +
838 					  sizeof(struct frag_hdr));
839 
840 		/*
841 		 *	Charge the memory for the fragment to any owner
842 		 *	it might possess
843 		 */
844 		if (skb->sk)
845 			skb_set_owner_w(frag, skb->sk);
846 
847 		/*
848 		 *	Copy the packet header into the new buffer.
849 		 */
850 		skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
851 
852 		/*
853 		 *	Build fragment header.
854 		 */
855 		fh->nexthdr = nexthdr;
856 		fh->reserved = 0;
857 		if (!frag_id) {
858 			ipv6_select_ident(fh, rt);
859 			frag_id = fh->identification;
860 		} else
861 			fh->identification = frag_id;
862 
863 		/*
864 		 *	Copy a block of the IP datagram.
865 		 */
866 		if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len))
867 			BUG();
868 		left -= len;
869 
870 		fh->frag_off = htons(offset);
871 		if (left > 0)
872 			fh->frag_off |= htons(IP6_MF);
873 		ipv6_hdr(frag)->payload_len = htons(frag->len -
874 						    sizeof(struct ipv6hdr));
875 
876 		ptr += len;
877 		offset += len;
878 
879 		/*
880 		 *	Put this fragment into the sending queue.
881 		 */
882 		err = output(frag);
883 		if (err)
884 			goto fail;
885 
886 		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
887 			      IPSTATS_MIB_FRAGCREATES);
888 	}
889 	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
890 		      IPSTATS_MIB_FRAGOKS);
891 	consume_skb(skb);
892 	return err;
893 
894 fail:
895 	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
896 		      IPSTATS_MIB_FRAGFAILS);
897 	kfree_skb(skb);
898 	return err;
899 }
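
/*
 * Worked example of the MTU arithmetic above, assuming a 1500-byte
 * path MTU and a plain 40-byte IPv6 header (hlen = 40, no extension
 * headers): mtu becomes 1500 - 40 - 8 = 1452 bytes of payload per
 * fragment.  On the slow path every fragment except the last is then
 * rounded down to a multiple of 8 (len &= ~7), i.e. 1448 bytes,
 * because the fragment offset field is expressed in 8-byte units.
 */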
900 
901 static inline int ip6_rt_check(const struct rt6key *rt_key,
902 			       const struct in6_addr *fl_addr,
903 			       const struct in6_addr *addr_cache)
904 {
905 	return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
906 		(addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache));
907 }
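
/*
 * ip6_rt_check() returns true when the cached route can no longer be
 * trusted for this flow: the route is not a host route for exactly
 * fl_addr (plen != 128, or a different address) and the socket's
 * cached last-used address does not match either.  A false result
 * lets ip6_sk_dst_check() keep the cached dst.
 */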
908 
909 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
910 					  struct dst_entry *dst,
911 					  const struct flowi6 *fl6)
912 {
913 	struct ipv6_pinfo *np = inet6_sk(sk);
914 	struct rt6_info *rt = (struct rt6_info *)dst;
915 
916 	if (!dst)
917 		goto out;
918 
919 	/* Yes, checking route validity in the not-connected
920 	 * case is not very simple. Take into account
921 	 * that we do not support routing by source, TOS,
922 	 * or MSG_DONTROUTE		--ANK (980726)
923 	 *
924 	 * 1. ip6_rt_check(): If the route was a host route,
925 	 *    check that the cached destination is current.
926 	 *    If it is a network route, we still may
927 	 *    check its validity using a saved pointer
928 	 *    to the last used address: daddr_cache.
929 	 *    We do not want to save the whole address now
930 	 *    (because the main consumer of this service
931 	 *    is TCP, which does not have this problem),
932 	 *    so the last trick works only on connected
933 	 *    sockets.
934 	 * 2. oif should also be the same.
935 	 */
936 	if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
937 #ifdef CONFIG_IPV6_SUBTREES
938 	    ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
939 #endif
940 	    (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) {
941 		dst_release(dst);
942 		dst = NULL;
943 	}
944 
945 out:
946 	return dst;
947 }
948 
949 static int ip6_dst_lookup_tail(struct sock *sk,
950 			       struct dst_entry **dst, struct flowi6 *fl6)
951 {
952 	struct net *net = sock_net(sk);
953 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
954 	struct neighbour *n;
955 	struct rt6_info *rt;
956 #endif
957 	int err;
958 
959 	if (*dst == NULL)
960 		*dst = ip6_route_output(net, sk, fl6);
961 
962 	if ((err = (*dst)->error))
963 		goto out_err_release;
964 
965 	if (ipv6_addr_any(&fl6->saddr)) {
966 		struct rt6_info *rt = (struct rt6_info *) *dst;
967 		err = ip6_route_get_saddr(net, rt, &fl6->daddr,
968 					  sk ? inet6_sk(sk)->srcprefs : 0,
969 					  &fl6->saddr);
970 		if (err)
971 			goto out_err_release;
972 	}
973 
974 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
975 	/*
976 	 * If the dst entry we've looked up has a neighbour
977 	 * entry that is in the INCOMPLETE state and the
978 	 * source address from the flow is marked as
979 	 * OPTIMISTIC, we release the found dst entry and
980 	 * replace it with the dst entry of the nexthop
981 	 * router instead.
982 	 */
983 	rcu_read_lock();
984 	rt = (struct rt6_info *) *dst;
985 	n = rt->n;
986 	if (n && !(n->nud_state & NUD_VALID)) {
987 		struct inet6_ifaddr *ifp;
988 		struct flowi6 fl_gw6;
989 		int redirect;
990 
991 		rcu_read_unlock();
992 		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
993 				      (*dst)->dev, 1);
994 
995 		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
996 		if (ifp)
997 			in6_ifa_put(ifp);
998 
999 		if (redirect) {
1000 			/*
1001 			 * We need to get the dst entry for the
1002 			 * default router instead
1003 			 */
1004 			dst_release(*dst);
1005 			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
1006 			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
1007 			*dst = ip6_route_output(net, sk, &fl_gw6);
1008 			if ((err = (*dst)->error))
1009 				goto out_err_release;
1010 		}
1011 	} else {
1012 		rcu_read_unlock();
1013 	}
1014 #endif
1015 
1016 	return 0;
1017 
1018 out_err_release:
1019 	if (err == -ENETUNREACH)
1020 		IP6_INC_STATS_BH(net, NULL, IPSTATS_MIB_OUTNOROUTES);
1021 	dst_release(*dst);
1022 	*dst = NULL;
1023 	return err;
1024 }
1025 
1026 /**
1027  *	ip6_dst_lookup - perform route lookup on flow
1028  *	@sk: socket which provides route info
1029  *	@dst: pointer to dst_entry * for result
1030  *	@fl6: flow to lookup
1031  *
1032  *	This function performs a route lookup on the given flow.
1033  *
1034  *	It returns zero on success, or a standard errno code on error.
1035  */
1036 int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi6 *fl6)
1037 {
1038 	*dst = NULL;
1039 	return ip6_dst_lookup_tail(sk, dst, fl6);
1040 }
1041 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1042 
1043 /**
1044  *	ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1045  *	@sk: socket which provides route info
1046  *	@fl6: flow to lookup
1047  *	@final_dst: final destination address for ipsec lookup
1048  *	@can_sleep: we are in a sleepable context
1049  *
1050  *	This function performs a route lookup on the given flow.
1051  *
1052  *	It returns a valid dst pointer on success, or a pointer encoded
1053  *	error code.
1054  */
1055 struct dst_entry *ip6_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1056 				      const struct in6_addr *final_dst,
1057 				      bool can_sleep)
1058 {
1059 	struct dst_entry *dst = NULL;
1060 	int err;
1061 
1062 	err = ip6_dst_lookup_tail(sk, &dst, fl6);
1063 	if (err)
1064 		return ERR_PTR(err);
1065 	if (final_dst)
1066 		fl6->daddr = *final_dst;
1067 	if (can_sleep)
1068 		fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;
1069 
1070 	return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1071 }
1072 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
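
/*
 * Illustrative sketch (assumed caller, not part of this file): a
 * datagram sender resolving a route with ip6_dst_lookup_flow().  The
 * function returns either a valid dst or an ERR_PTR()-encoded error,
 * so the result is checked with IS_ERR()/PTR_ERR(), never against
 * NULL.
 */
#if 0	/* example only, not compiled */
static int example_route(struct sock *sk, const struct in6_addr *daddr)
{
	struct flowi6 fl6;
	struct dst_entry *dst;

	memset(&fl6, 0, sizeof(fl6));
	fl6.flowi6_proto = IPPROTO_UDP;
	fl6.daddr = *daddr;	/* saddr left unspecified: the lookup
				 * selects a source address for us */

	dst = ip6_dst_lookup_flow(sk, &fl6, NULL, false);
	if (IS_ERR(dst))
		return PTR_ERR(dst);

	dst_release(dst);	/* we own the returned reference */
	return 0;
}
#endif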
1073 
1074 /**
1075  *	ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1076  *	@sk: socket which provides the dst cache and route info
1077  *	@fl6: flow to lookup
1078  *	@final_dst: final destination address for ipsec lookup
1079  *	@can_sleep: we are in a sleepable context
1080  *
1081  *	This function performs a route lookup on the given flow with the
1082  *	possibility of using the cached route in the socket if it is valid.
1083  *	It will take the socket dst lock when operating on the dst cache.
1084  *	As a result, this function can only be used in process context.
1085  *
1086  *	It returns a valid dst pointer on success, or a pointer encoded
1087  *	error code.
1088  */
1089 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1090 					 const struct in6_addr *final_dst,
1091 					 bool can_sleep)
1092 {
1093 	struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1094 	int err;
1095 
1096 	dst = ip6_sk_dst_check(sk, dst, fl6);
1097 
1098 	err = ip6_dst_lookup_tail(sk, &dst, fl6);
1099 	if (err)
1100 		return ERR_PTR(err);
1101 	if (final_dst)
1102 		fl6->daddr = *final_dst;
1103 	if (can_sleep)
1104 		fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;
1105 
1106 	return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1107 }
1108 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1109 
1110 static inline int ip6_ufo_append_data(struct sock *sk,
1111 			int getfrag(void *from, char *to, int offset, int len,
1112 			int odd, struct sk_buff *skb),
1113 			void *from, int length, int hh_len, int fragheaderlen,
1114 			int transhdrlen, int mtu, unsigned int flags,
1115 			struct rt6_info *rt)
1116 
1117 {
1118 	struct sk_buff *skb;
1119 	int err;
1120 
1121 	/* The network device supports UDP large send offload,
1122 	 * so create one single skb packet containing the complete
1123 	 * UDP datagram.
1124 	 */
1125 	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
1126 		skb = sock_alloc_send_skb(sk,
1127 			hh_len + fragheaderlen + transhdrlen + 20,
1128 			(flags & MSG_DONTWAIT), &err);
1129 		if (skb == NULL)
1130 			return err;
1131 
1132 		/* reserve space for Hardware header */
1133 		skb_reserve(skb, hh_len);
1134 
1135 		/* create space for UDP/IP header */
1136 		skb_put(skb, fragheaderlen + transhdrlen);
1137 
1138 		/* initialize network header pointer */
1139 		skb_reset_network_header(skb);
1140 
1141 		/* initialize protocol header pointer */
1142 		skb->transport_header = skb->network_header + fragheaderlen;
1143 
1144 		skb->ip_summed = CHECKSUM_PARTIAL;
1145 		skb->csum = 0;
1146 	}
1147 
1148 	err = skb_append_datato_frags(sk, skb, getfrag, from,
1149 				      (length - transhdrlen));
1150 	if (!err) {
1151 		struct frag_hdr fhdr;
1152 
1153 		/* Specify the length of each IPv6 datagram fragment.
1154 		 * It has to be a multiple of 8.
1155 		 */
1156 		skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
1157 					     sizeof(struct frag_hdr)) & ~7;
1158 		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
1159 		ipv6_select_ident(&fhdr, rt);
1160 		skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
1161 		__skb_queue_tail(&sk->sk_write_queue, skb);
1162 
1163 		return 0;
1164 	}
1165 	/* There is not enough support to do UDP LSO,
1166 	 * so follow the normal path.
1167 	 */
1168 	kfree_skb(skb);
1169 
1170 	return err;
1171 }
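
/*
 * The gso_size computation above keeps each fragment's payload a
 * multiple of 8 bytes, as the fragment offset encoding requires.  For
 * example, with mtu = 1500 and fragheaderlen = 40 (plain IPv6 header),
 * gso_size is (1500 - 40 - 8) & ~7 = 1448, so a UFO-capable device
 * emits on-wire fragments of at most 40 + 8 + 1448 = 1496 bytes.
 */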
1172 
1173 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1174 					       gfp_t gfp)
1175 {
1176 	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1177 }
1178 
1179 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1180 						gfp_t gfp)
1181 {
1182 	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1183 }
1184 
1185 static void ip6_append_data_mtu(int *mtu,
1186 				int *maxfraglen,
1187 				unsigned int fragheaderlen,
1188 				struct sk_buff *skb,
1189 				struct rt6_info *rt)
1190 {
1191 	if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1192 		if (skb == NULL) {
1193 			/* first fragment, reserve header_len */
1194 			*mtu = *mtu - rt->dst.header_len;
1195 
1196 		} else {
1197 			/*
1198 			 * this fragment is not the first; the header
1199 			 * space is regarded as data space.
1200 			 */
1201 			*mtu = dst_mtu(rt->dst.path);
1202 		}
1203 		*maxfraglen = ((*mtu - fragheaderlen) & ~7)
1204 			      + fragheaderlen - sizeof(struct frag_hdr);
1205 	}
1206 }
1207 
1208 int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
1209 	int offset, int len, int odd, struct sk_buff *skb),
1210 	void *from, int length, int transhdrlen,
1211 	int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi6 *fl6,
1212 	struct rt6_info *rt, unsigned int flags, int dontfrag)
1213 {
1214 	struct inet_sock *inet = inet_sk(sk);
1215 	struct ipv6_pinfo *np = inet6_sk(sk);
1216 	struct inet_cork *cork;
1217 	struct sk_buff *skb, *skb_prev = NULL;
1218 	unsigned int maxfraglen, fragheaderlen;
1219 	int exthdrlen;
1220 	int dst_exthdrlen;
1221 	int hh_len;
1222 	int mtu;
1223 	int copy;
1224 	int err;
1225 	int offset = 0;
1226 	__u8 tx_flags = 0;
1227 
1228 	if (flags & MSG_PROBE)
1229 		return 0;
1230 	cork = &inet->cork.base;
1231 	if (skb_queue_empty(&sk->sk_write_queue)) {
1232 		/*
1233 		 * setup for corking
1234 		 */
1235 		if (opt) {
1236 			if (WARN_ON(np->cork.opt))
1237 				return -EINVAL;
1238 
1239 			np->cork.opt = kmalloc(opt->tot_len, sk->sk_allocation);
1240 			if (unlikely(np->cork.opt == NULL))
1241 				return -ENOBUFS;
1242 
1243 			np->cork.opt->tot_len = opt->tot_len;
1244 			np->cork.opt->opt_flen = opt->opt_flen;
1245 			np->cork.opt->opt_nflen = opt->opt_nflen;
1246 
1247 			np->cork.opt->dst0opt = ip6_opt_dup(opt->dst0opt,
1248 							    sk->sk_allocation);
1249 			if (opt->dst0opt && !np->cork.opt->dst0opt)
1250 				return -ENOBUFS;
1251 
1252 			np->cork.opt->dst1opt = ip6_opt_dup(opt->dst1opt,
1253 							    sk->sk_allocation);
1254 			if (opt->dst1opt && !np->cork.opt->dst1opt)
1255 				return -ENOBUFS;
1256 
1257 			np->cork.opt->hopopt = ip6_opt_dup(opt->hopopt,
1258 							   sk->sk_allocation);
1259 			if (opt->hopopt && !np->cork.opt->hopopt)
1260 				return -ENOBUFS;
1261 
1262 			np->cork.opt->srcrt = ip6_rthdr_dup(opt->srcrt,
1263 							    sk->sk_allocation);
1264 			if (opt->srcrt && !np->cork.opt->srcrt)
1265 				return -ENOBUFS;
1266 
1267 			/* need source address above --miyazawa */
1268 		}
1269 		dst_hold(&rt->dst);
1270 		cork->dst = &rt->dst;
1271 		inet->cork.fl.u.ip6 = *fl6;
1272 		np->cork.hop_limit = hlimit;
1273 		np->cork.tclass = tclass;
1274 		if (rt->dst.flags & DST_XFRM_TUNNEL)
1275 			mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
1276 			      rt->dst.dev->mtu : dst_mtu(&rt->dst);
1277 		else
1278 			mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
1279 			      rt->dst.dev->mtu : dst_mtu(rt->dst.path);
1280 		if (np->frag_size < mtu) {
1281 			if (np->frag_size)
1282 				mtu = np->frag_size;
1283 		}
1284 		cork->fragsize = mtu;
1285 		if (dst_allfrag(rt->dst.path))
1286 			cork->flags |= IPCORK_ALLFRAG;
1287 		cork->length = 0;
1288 		sk->sk_sndmsg_page = NULL;
1289 		sk->sk_sndmsg_off = 0;
1290 		exthdrlen = (opt ? opt->opt_flen : 0) - rt->rt6i_nfheader_len;
1291 		length += exthdrlen;
1292 		transhdrlen += exthdrlen;
1293 		dst_exthdrlen = rt->dst.header_len;
1294 	} else {
1295 		rt = (struct rt6_info *)cork->dst;
1296 		fl6 = &inet->cork.fl.u.ip6;
1297 		opt = np->cork.opt;
1298 		transhdrlen = 0;
1299 		exthdrlen = 0;
1300 		dst_exthdrlen = 0;
1301 		mtu = cork->fragsize;
1302 	}
1303 
1304 	hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1305 
1306 	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1307 			(opt ? opt->opt_nflen : 0);
1308 	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);
1309 
1310 	if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
1311 		if (cork->length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
1312 			ipv6_local_error(sk, EMSGSIZE, fl6, mtu-exthdrlen);
1313 			return -EMSGSIZE;
1314 		}
1315 	}
1316 
1317 	/* For UDP, check if TX timestamp is enabled */
1318 	if (sk->sk_type == SOCK_DGRAM) {
1319 		err = sock_tx_timestamp(sk, &tx_flags);
1320 		if (err)
1321 			goto error;
1322 	}
1323 
1324 	/*
1325 	 * Let's try using as much space as possible.
1326 	 * Use MTU if total length of the message fits into the MTU.
1327 	 * Otherwise, we need to reserve the fragment header and
1328 	 * fragment alignment (= 8-15 octets, in total).
1329 	 *
1330 	 * Note that we may need to "move" the data from the tail
1331 	 * of the buffer to the new fragment when we split
1332 	 * the message.
1333 	 *
1334 	 * FIXME: It may be fragmented into multiple chunks
1335 	 *        at once if non-fragmentable extension headers
1336 	 *        are too large.
1337 	 * --yoshfuji
1338 	 */
1339 
1340 	cork->length += length;
1341 	if (length > mtu) {
1342 		int proto = sk->sk_protocol;
1343 		if (dontfrag && (proto == IPPROTO_UDP || proto == IPPROTO_RAW)) {
1344 			ipv6_local_rxpmtu(sk, fl6, mtu-exthdrlen);
1345 			return -EMSGSIZE;
1346 		}
1347 
1348 		if (proto == IPPROTO_UDP &&
1349 		    (rt->dst.dev->features & NETIF_F_UFO)) {
1350 
1351 			err = ip6_ufo_append_data(sk, getfrag, from, length,
1352 						  hh_len, fragheaderlen,
1353 						  transhdrlen, mtu, flags, rt);
1354 			if (err)
1355 				goto error;
1356 			return 0;
1357 		}
1358 	}
1359 
1360 	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
1361 		goto alloc_new_skb;
1362 
1363 	while (length > 0) {
1364 		/* Check if the remaining data fits into current packet. */
1365 		copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1366 		if (copy < length)
1367 			copy = maxfraglen - skb->len;
1368 
1369 		if (copy <= 0) {
1370 			char *data;
1371 			unsigned int datalen;
1372 			unsigned int fraglen;
1373 			unsigned int fraggap;
1374 			unsigned int alloclen;
1375 alloc_new_skb:
1376 			/* There's no room in the current skb */
1377 			if (skb)
1378 				fraggap = skb->len - maxfraglen;
1379 			else
1380 				fraggap = 0;
1381 			/* update mtu and maxfraglen if necessary */
1382 			if (skb == NULL || skb_prev == NULL)
1383 				ip6_append_data_mtu(&mtu, &maxfraglen,
1384 						    fragheaderlen, skb, rt);
1385 
1386 			skb_prev = skb;
1387 
1388 			/*
1389 			 * If remaining data exceeds the mtu,
1390 			 * we know we need more fragment(s).
1391 			 */
1392 			datalen = length + fraggap;
1393 
1394 			if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1395 				datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
1396 			if ((flags & MSG_MORE) &&
1397 			    !(rt->dst.dev->features & NETIF_F_SG))
1398 				alloclen = mtu;
1399 			else
1400 				alloclen = datalen + fragheaderlen;
1401 
1402 			alloclen += dst_exthdrlen;
1403 
1404 			if (datalen != length + fraggap) {
1405 				/*
1406 				 * this is not the last fragment; the trailer
1407 				 * space is regarded as data space.
1408 				 */
1409 				datalen += rt->dst.trailer_len;
1410 			}
1411 
1412 			alloclen += rt->dst.trailer_len;
1413 			fraglen = datalen + fragheaderlen;
1414 
1415 			/*
1416 			 * We just reserve space for the fragment header.
1417 			 * Note: this may be overallocation if the message
1418 			 * (without MSG_MORE) fits into the MTU.
1419 			 */
1420 			alloclen += sizeof(struct frag_hdr);
1421 
1422 			if (transhdrlen) {
1423 				skb = sock_alloc_send_skb(sk,
1424 						alloclen + hh_len,
1425 						(flags & MSG_DONTWAIT), &err);
1426 			} else {
1427 				skb = NULL;
1428 				if (atomic_read(&sk->sk_wmem_alloc) <=
1429 				    2 * sk->sk_sndbuf)
1430 					skb = sock_wmalloc(sk,
1431 							   alloclen + hh_len, 1,
1432 							   sk->sk_allocation);
1433 				if (unlikely(skb == NULL))
1434 					err = -ENOBUFS;
1435 				else {
1436 					/* Only the initial fragment
1437 					 * is time stamped.
1438 					 */
1439 					tx_flags = 0;
1440 				}
1441 			}
1442 			if (skb == NULL)
1443 				goto error;
1444 			/*
1445 			 *	Fill in the control structures
1446 			 */
1447 			skb->ip_summed = CHECKSUM_NONE;
1448 			skb->csum = 0;
1449 			/* reserve for fragmentation and ipsec header */
1450 			skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1451 				    dst_exthdrlen);
1452 
1453 			if (sk->sk_type == SOCK_DGRAM)
1454 				skb_shinfo(skb)->tx_flags = tx_flags;
1455 
1456 			/*
1457 			 *	Find where to start putting bytes
1458 			 */
1459 			data = skb_put(skb, fraglen);
1460 			skb_set_network_header(skb, exthdrlen);
1461 			data += fragheaderlen;
1462 			skb->transport_header = (skb->network_header +
1463 						 fragheaderlen);
1464 			if (fraggap) {
1465 				skb->csum = skb_copy_and_csum_bits(
1466 					skb_prev, maxfraglen,
1467 					data + transhdrlen, fraggap, 0);
1468 				skb_prev->csum = csum_sub(skb_prev->csum,
1469 							  skb->csum);
1470 				data += fraggap;
1471 				pskb_trim_unique(skb_prev, maxfraglen);
1472 			}
1473 			copy = datalen - transhdrlen - fraggap;
1474 
1475 			if (copy < 0) {
1476 				err = -EINVAL;
1477 				kfree_skb(skb);
1478 				goto error;
1479 			} else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
1480 				err = -EFAULT;
1481 				kfree_skb(skb);
1482 				goto error;
1483 			}
1484 
1485 			offset += copy;
1486 			length -= datalen - fraggap;
1487 			transhdrlen = 0;
1488 			exthdrlen = 0;
1489 			dst_exthdrlen = 0;
1490 
1491 			/*
1492 			 * Put the packet on the pending queue
1493 			 */
1494 			__skb_queue_tail(&sk->sk_write_queue, skb);
1495 			continue;
1496 		}
1497 
1498 		if (copy > length)
1499 			copy = length;
1500 
1501 		if (!(rt->dst.dev->features & NETIF_F_SG)) {
1502 			unsigned int off;
1503 
1504 			off = skb->len;
1505 			if (getfrag(from, skb_put(skb, copy),
1506 						offset, copy, off, skb) < 0) {
1507 				__skb_trim(skb, off);
1508 				err = -EFAULT;
1509 				goto error;
1510 			}
1511 		} else {
1512 			int i = skb_shinfo(skb)->nr_frags;
1513 			skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
1514 			struct page *page = sk->sk_sndmsg_page;
1515 			int off = sk->sk_sndmsg_off;
1516 			unsigned int left;
1517 
1518 			if (page && (left = PAGE_SIZE - off) > 0) {
1519 				if (copy >= left)
1520 					copy = left;
1521 				if (page != skb_frag_page(frag)) {
1522 					if (i == MAX_SKB_FRAGS) {
1523 						err = -EMSGSIZE;
1524 						goto error;
1525 					}
1526 					skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
1527 					skb_frag_ref(skb, i);
1528 					frag = &skb_shinfo(skb)->frags[i];
1529 				}
1530 			} else if (i < MAX_SKB_FRAGS) {
1531 				if (copy > PAGE_SIZE)
1532 					copy = PAGE_SIZE;
1533 				page = alloc_pages(sk->sk_allocation, 0);
1534 				if (page == NULL) {
1535 					err = -ENOMEM;
1536 					goto error;
1537 				}
1538 				sk->sk_sndmsg_page = page;
1539 				sk->sk_sndmsg_off = 0;
1540 
1541 				skb_fill_page_desc(skb, i, page, 0, 0);
1542 				frag = &skb_shinfo(skb)->frags[i];
1543 			} else {
1544 				err = -EMSGSIZE;
1545 				goto error;
1546 			}
1547 			if (getfrag(from,
1548 				    skb_frag_address(frag) + skb_frag_size(frag),
1549 				    offset, copy, skb->len, skb) < 0) {
1550 				err = -EFAULT;
1551 				goto error;
1552 			}
1553 			sk->sk_sndmsg_off += copy;
1554 			skb_frag_size_add(frag, copy);
1555 			skb->len += copy;
1556 			skb->data_len += copy;
1557 			skb->truesize += copy;
1558 			atomic_add(copy, &sk->sk_wmem_alloc);
1559 		}
1560 		offset += copy;
1561 		length -= copy;
1562 	}
1563 	return 0;
1564 error:
1565 	cork->length -= length;
1566 	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1567 	return err;
1568 }
1569 EXPORT_SYMBOL_GPL(ip6_append_data);
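
/*
 * Illustrative sketch (assumed caller, not part of this file): the
 * usual corking pattern around ip6_append_data(), as a sendmsg()
 * implementation might use it.  Data is queued on sk_write_queue and
 * then either pushed out as one datagram (fragmented above if needed)
 * or discarded on error; identifiers like msg and dontfrag are
 * assumptions standing in for the caller's own state.
 */
#if 0	/* example only, not compiled */
	err = ip6_append_data(sk, getfrag, msg->msg_iov, len,
			      sizeof(struct udphdr), hlimit, tclass, opt,
			      &fl6, rt, msg->msg_flags, dontfrag);
	if (err)
		ip6_flush_pending_frames(sk);
	else if (!(msg->msg_flags & MSG_MORE))
		err = ip6_push_pending_frames(sk);
#endif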
1570 
1571 static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np)
1572 {
1573 	if (np->cork.opt) {
1574 		kfree(np->cork.opt->dst0opt);
1575 		kfree(np->cork.opt->dst1opt);
1576 		kfree(np->cork.opt->hopopt);
1577 		kfree(np->cork.opt->srcrt);
1578 		kfree(np->cork.opt);
1579 		np->cork.opt = NULL;
1580 	}
1581 
1582 	if (inet->cork.base.dst) {
1583 		dst_release(inet->cork.base.dst);
1584 		inet->cork.base.dst = NULL;
1585 		inet->cork.base.flags &= ~IPCORK_ALLFRAG;
1586 	}
1587 	memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
1588 }
1589 
1590 int ip6_push_pending_frames(struct sock *sk)
1591 {
1592 	struct sk_buff *skb, *tmp_skb;
1593 	struct sk_buff **tail_skb;
1594 	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1595 	struct inet_sock *inet = inet_sk(sk);
1596 	struct ipv6_pinfo *np = inet6_sk(sk);
1597 	struct net *net = sock_net(sk);
1598 	struct ipv6hdr *hdr;
1599 	struct ipv6_txoptions *opt = np->cork.opt;
1600 	struct rt6_info *rt = (struct rt6_info *)inet->cork.base.dst;
1601 	struct flowi6 *fl6 = &inet->cork.fl.u.ip6;
1602 	unsigned char proto = fl6->flowi6_proto;
1603 	int err = 0;
1604 
1605 	if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
1606 		goto out;
1607 	tail_skb = &(skb_shinfo(skb)->frag_list);
1608 
1609 	/* move skb->data from the ext header to the IP header */
1610 	if (skb->data < skb_network_header(skb))
1611 		__skb_pull(skb, skb_network_offset(skb));
1612 	while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
1613 		__skb_pull(tmp_skb, skb_network_header_len(skb));
1614 		*tail_skb = tmp_skb;
1615 		tail_skb = &(tmp_skb->next);
1616 		skb->len += tmp_skb->len;
1617 		skb->data_len += tmp_skb->len;
1618 		skb->truesize += tmp_skb->truesize;
1619 		tmp_skb->destructor = NULL;
1620 		tmp_skb->sk = NULL;
1621 	}
1622 
1623 	/* Allow local fragmentation. */
1624 	if (np->pmtudisc < IPV6_PMTUDISC_DO)
1625 		skb->local_df = 1;
1626 
1627 	*final_dst = fl6->daddr;
1628 	__skb_pull(skb, skb_network_header_len(skb));
1629 	if (opt && opt->opt_flen)
1630 		ipv6_push_frag_opts(skb, opt, &proto);
1631 	if (opt && opt->opt_nflen)
1632 		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);
1633 
1634 	skb_push(skb, sizeof(struct ipv6hdr));
1635 	skb_reset_network_header(skb);
1636 	hdr = ipv6_hdr(skb);
1637 
1638 	*(__be32 *)hdr = fl6->flowlabel |
1639 		     htonl(0x60000000 | ((int)np->cork.tclass << 20));
1640 
1641 	hdr->hop_limit = np->cork.hop_limit;
1642 	hdr->nexthdr = proto;
1643 	hdr->saddr = fl6->saddr;
1644 	hdr->daddr = *final_dst;
1645 
1646 	skb->priority = sk->sk_priority;
1647 	skb->mark = sk->sk_mark;
1648 
1649 	skb_dst_set(skb, dst_clone(&rt->dst));
1650 	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1651 	if (proto == IPPROTO_ICMPV6) {
1652 		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1653 
1654 		ICMP6MSGOUT_INC_STATS_BH(net, idev, icmp6_hdr(skb)->icmp6_type);
1655 		ICMP6_INC_STATS_BH(net, idev, ICMP6_MIB_OUTMSGS);
1656 	}
1657 
1658 	err = ip6_local_out(skb);
1659 	if (err) {
1660 		if (err > 0)
1661 			err = net_xmit_errno(err);
1662 		if (err)
1663 			goto error;
1664 	}
1665 
1666 out:
1667 	ip6_cork_release(inet, np);
1668 	return err;
1669 error:
1670 	IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1671 	goto out;
1672 }
1673 EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
1674 
1675 void ip6_flush_pending_frames(struct sock *sk)
1676 {
1677 	struct sk_buff *skb;
1678 
1679 	while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
1680 		if (skb_dst(skb))
1681 			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1682 				      IPSTATS_MIB_OUTDISCARDS);
1683 		kfree_skb(skb);
1684 	}
1685 
1686 	ip6_cork_release(inet_sk(sk), inet6_sk(sk));
1687 }
1688 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
1689