xref: /openbmc/linux/net/ipv4/ip_output.c (revision df2634f43f5106947f3735a0b61a6527a4b278cd)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		The Internet Protocol (IP) output module.
7  *
8  * Authors:	Ross Biro
9  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10  *		Donald Becker, <becker@super.org>
11  *		Alan Cox, <Alan.Cox@linux.org>
12  *		Richard Underwood
13  *		Stefan Becker, <stefanb@yello.ping.de>
14  *		Jorge Cwik, <jorge@laser.satlink.net>
15  *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
16  *		Hirokazu Takahashi, <taka@valinux.co.jp>
17  *
18  *	See ip_input.c for original log
19  *
20  *	Fixes:
21  *		Alan Cox	:	Missing nonblock feature in ip_build_xmit.
22  *		Mike Kilburn	:	htons() missing in ip_build_xmit.
23  *		Bradford Johnson:	Fix faulty handling of some frames when
24  *					no route is found.
25  *		Alexander Demenshin:	Missing sk/skb free in ip_queue_xmit
26  *					(in case a packet is not accepted by
27  *					output firewall rules)
28  *		Mike McLagan	:	Routing by source
29  *		Alexey Kuznetsov:	use new route cache
30  *		Andi Kleen:		Fix broken PMTU recovery and remove
31  *					some redundant tests.
32  *	Vitaly E. Lavrov	:	Transparent proxy revived after a year-long coma.
33  *		Andi Kleen	: 	Replace ip_reply with ip_send_reply.
34  *		Andi Kleen	:	Split fast and slow ip_build_xmit path
35  *					for decreased register pressure on x86
36  *					and more readability.
37  *		Marc Boucher	:	When call_out_firewall returns FW_QUEUE,
38  *					silently drop skb instead of failing with -EPERM.
39  *		Detlev Wengorz	:	Copy protocol for fragments.
40  *		Hirokazu Takahashi:	HW checksumming for outgoing UDP
41  *					datagrams.
42  *		Hirokazu Takahashi:	sendfile() on UDP works now.
43  */
44 
45 #include <asm/uaccess.h>
46 #include <asm/system.h>
47 #include <linux/module.h>
48 #include <linux/types.h>
49 #include <linux/kernel.h>
50 #include <linux/mm.h>
51 #include <linux/string.h>
52 #include <linux/errno.h>
53 #include <linux/highmem.h>
54 #include <linux/slab.h>
55 
56 #include <linux/socket.h>
57 #include <linux/sockios.h>
58 #include <linux/in.h>
59 #include <linux/inet.h>
60 #include <linux/netdevice.h>
61 #include <linux/etherdevice.h>
62 #include <linux/proc_fs.h>
63 #include <linux/stat.h>
64 #include <linux/init.h>
65 
66 #include <net/snmp.h>
67 #include <net/ip.h>
68 #include <net/protocol.h>
69 #include <net/route.h>
70 #include <net/xfrm.h>
71 #include <linux/skbuff.h>
72 #include <net/sock.h>
73 #include <net/arp.h>
74 #include <net/icmp.h>
75 #include <net/checksum.h>
76 #include <net/inetpeer.h>
77 #include <linux/igmp.h>
78 #include <linux/netfilter_ipv4.h>
79 #include <linux/netfilter_bridge.h>
80 #include <linux/mroute.h>
81 #include <linux/netlink.h>
82 #include <linux/tcp.h>
83 
84 int sysctl_ip_default_ttl __read_mostly = IPDEFTTL;
85 EXPORT_SYMBOL(sysctl_ip_default_ttl);
86 
87 /* Generate a checksum for an outgoing IP datagram. */
88 __inline__ void ip_send_check(struct iphdr *iph)
89 {
90 	iph->check = 0;
91 	iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
92 }
93 EXPORT_SYMBOL(ip_send_check);
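/*
 * Usage sketch (illustrative, not from the original file): ip_send_check()
 * zeroes iph->check itself before recomputing, so it can simply be re-run
 * after any header field changes, e.g. in a hypothetical caller:
 *
 *	iph->ttl = new_ttl;
 *	ip_send_check(iph);
 *
 * ip_fragment() below does exactly this after rewriting tot_len and
 * frag_off for each fragment.
 */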
94 
95 int __ip_local_out(struct sk_buff *skb)
96 {
97 	struct iphdr *iph = ip_hdr(skb);
98 
99 	iph->tot_len = htons(skb->len);
100 	ip_send_check(iph);
101 	return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, skb, NULL,
102 		       skb_dst(skb)->dev, dst_output);
103 }
104 
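/*
 * nf_hook() returns 1 when the LOCAL_OUT hook accepted the packet without
 * queueing or stealing it; in that case the caller must transmit it itself,
 * which is why ip_local_out() below goes on to call dst_output().  Other
 * return values mean netfilter queued, stole or dropped the skb.
 */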
105 int ip_local_out(struct sk_buff *skb)
106 {
107 	int err;
108 
109 	err = __ip_local_out(skb);
110 	if (likely(err == 1))
111 		err = dst_output(skb);
112 
113 	return err;
114 }
115 EXPORT_SYMBOL_GPL(ip_local_out);
116 
117 /* dev_loopback_xmit for use with netfilter. */
118 static int ip_dev_loopback_xmit(struct sk_buff *newskb)
119 {
120 	skb_reset_mac_header(newskb);
121 	__skb_pull(newskb, skb_network_offset(newskb));
122 	newskb->pkt_type = PACKET_LOOPBACK;
123 	newskb->ip_summed = CHECKSUM_UNNECESSARY;
124 	WARN_ON(!skb_dst(newskb));
125 	netif_rx_ni(newskb);
126 	return 0;
127 }
128 
129 static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
130 {
131 	int ttl = inet->uc_ttl;
132 
133 	if (ttl < 0)
134 		ttl = ip4_dst_hoplimit(dst);
135 	return ttl;
136 }
137 
138 /*
139  *		Add an ip header to a skbuff and send it out.
140  *
141  */
142 int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
143 			  __be32 saddr, __be32 daddr, struct ip_options *opt)
144 {
145 	struct inet_sock *inet = inet_sk(sk);
146 	struct rtable *rt = skb_rtable(skb);
147 	struct iphdr *iph;
148 
149 	/* Build the IP header. */
150 	skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0));
151 	skb_reset_network_header(skb);
152 	iph = ip_hdr(skb);
153 	iph->version  = 4;
154 	iph->ihl      = 5;
155 	iph->tos      = inet->tos;
156 	if (ip_dont_fragment(sk, &rt->dst))
157 		iph->frag_off = htons(IP_DF);
158 	else
159 		iph->frag_off = 0;
160 	iph->ttl      = ip_select_ttl(inet, &rt->dst);
161 	iph->daddr    = rt->rt_dst;
162 	iph->saddr    = rt->rt_src;
163 	iph->protocol = sk->sk_protocol;
164 	ip_select_ident(iph, &rt->dst, sk);
165 
166 	if (opt && opt->optlen) {
167 		iph->ihl += opt->optlen>>2;
168 		ip_options_build(skb, opt, daddr, rt, 0);
169 	}
170 
171 	skb->priority = sk->sk_priority;
172 	skb->mark = sk->sk_mark;
173 
174 	/* Send it out. */
175 	return ip_local_out(skb);
176 }
177 EXPORT_SYMBOL_GPL(ip_build_and_send_pkt);
178 
179 static inline int ip_finish_output2(struct sk_buff *skb)
180 {
181 	struct dst_entry *dst = skb_dst(skb);
182 	struct rtable *rt = (struct rtable *)dst;
183 	struct net_device *dev = dst->dev;
184 	unsigned int hh_len = LL_RESERVED_SPACE(dev);
185 
186 	if (rt->rt_type == RTN_MULTICAST) {
187 		IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTMCAST, skb->len);
188 	} else if (rt->rt_type == RTN_BROADCAST)
189 		IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTBCAST, skb->len);
190 
191 	/* Be paranoid, rather than too clever. */
192 	if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
193 		struct sk_buff *skb2;
194 
195 		skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
196 		if (skb2 == NULL) {
197 			kfree_skb(skb);
198 			return -ENOMEM;
199 		}
200 		if (skb->sk)
201 			skb_set_owner_w(skb2, skb->sk);
202 		kfree_skb(skb);
203 		skb = skb2;
204 	}
205 
206 	if (dst->hh)
207 		return neigh_hh_output(dst->hh, skb);
208 	else if (dst->neighbour)
209 		return dst->neighbour->output(skb);
210 
211 	if (net_ratelimit())
212 		printk(KERN_DEBUG "ip_finish_output2: No header cache and no neighbour!\n");
213 	kfree_skb(skb);
214 	return -EINVAL;
215 }
216 
217 static inline int ip_skb_dst_mtu(struct sk_buff *skb)
218 {
219 	struct inet_sock *inet = skb->sk ? inet_sk(skb->sk) : NULL;
220 
221 	return (inet && inet->pmtudisc == IP_PMTUDISC_PROBE) ?
222 	       skb_dst(skb)->dev->mtu : dst_mtu(skb_dst(skb));
223 }
224 
225 static int ip_finish_output(struct sk_buff *skb)
226 {
227 #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
228 	/* Policy lookup after SNAT yielded a new policy */
229 	if (skb_dst(skb)->xfrm != NULL) {
230 		IPCB(skb)->flags |= IPSKB_REROUTED;
231 		return dst_output(skb);
232 	}
233 #endif
234 	if (skb->len > ip_skb_dst_mtu(skb) && !skb_is_gso(skb))
235 		return ip_fragment(skb, ip_finish_output2);
236 	else
237 		return ip_finish_output2(skb);
238 }
239 
240 int ip_mc_output(struct sk_buff *skb)
241 {
242 	struct sock *sk = skb->sk;
243 	struct rtable *rt = skb_rtable(skb);
244 	struct net_device *dev = rt->dst.dev;
245 
246 	/*
247 	 *	If the indicated interface is up and running, send the packet.
248 	 */
249 	IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);
250 
251 	skb->dev = dev;
252 	skb->protocol = htons(ETH_P_IP);
253 
254 	/*
255 	 *	Multicasts are looped back for other local users
256 	 */
257 
258 	if (rt->rt_flags&RTCF_MULTICAST) {
259 		if (sk_mc_loop(sk)
260 #ifdef CONFIG_IP_MROUTE
261 		/* Small optimization: do not loop back non-local frames
262 		   that were returned after forwarding; they will be dropped
263 		   by ip_mr_input in any case.
264 		   Note that local frames are looped back to be delivered
265 		   to local recipients.
266 
267 		   This check is duplicated in ip_mr_input at the moment.
268 		 */
269 		    &&
270 		    ((rt->rt_flags & RTCF_LOCAL) ||
271 		     !(IPCB(skb)->flags & IPSKB_FORWARDED))
272 #endif
273 		   ) {
274 			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
275 			if (newskb)
276 				NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING,
277 					newskb, NULL, newskb->dev,
278 					ip_dev_loopback_xmit);
279 		}
280 
281 		/* Multicasts with ttl 0 must not go beyond the host */
282 
283 		if (ip_hdr(skb)->ttl == 0) {
284 			kfree_skb(skb);
285 			return 0;
286 		}
287 	}
288 
289 	if (rt->rt_flags&RTCF_BROADCAST) {
290 		struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
291 		if (newskb)
292 			NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, newskb,
293 				NULL, newskb->dev, ip_dev_loopback_xmit);
294 	}
295 
296 	return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL,
297 			    skb->dev, ip_finish_output,
298 			    !(IPCB(skb)->flags & IPSKB_REROUTED));
299 }
300 
301 int ip_output(struct sk_buff *skb)
302 {
303 	struct net_device *dev = skb_dst(skb)->dev;
304 
305 	IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);
306 
307 	skb->dev = dev;
308 	skb->protocol = htons(ETH_P_IP);
309 
310 	return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL, dev,
311 			    ip_finish_output,
312 			    !(IPCB(skb)->flags & IPSKB_REROUTED));
313 }
314 
315 int ip_queue_xmit(struct sk_buff *skb)
316 {
317 	struct sock *sk = skb->sk;
318 	struct inet_sock *inet = inet_sk(sk);
319 	struct ip_options *opt = inet->opt;
320 	struct rtable *rt;
321 	struct iphdr *iph;
322 	int res;
323 
324 	/* Skip all of this if the packet is already routed,
325 	 * e.g. by something like SCTP.
326 	 */
327 	rcu_read_lock();
328 	rt = skb_rtable(skb);
329 	if (rt != NULL)
330 		goto packet_routed;
331 
332 	/* Make sure we can route this packet. */
333 	rt = (struct rtable *)__sk_dst_check(sk, 0);
334 	if (rt == NULL) {
335 		__be32 daddr;
336 
337 		/* Use correct destination address if we have options. */
338 		daddr = inet->inet_daddr;
339 		if(opt && opt->srr)
340 			daddr = opt->faddr;
341 
342 		{
343 			struct flowi fl = { .oif = sk->sk_bound_dev_if,
344 					    .mark = sk->sk_mark,
345 					    .fl4_dst = daddr,
346 					    .fl4_src = inet->inet_saddr,
347 					    .fl4_tos = RT_CONN_FLAGS(sk),
348 					    .proto = sk->sk_protocol,
349 					    .flags = inet_sk_flowi_flags(sk),
350 					    .fl_ip_sport = inet->inet_sport,
351 					    .fl_ip_dport = inet->inet_dport };
352 
353 			/* If this fails, the retransmit mechanism of the transport
354 			 * layer will keep trying until the route appears or the
355 			 * connection times itself out.
356 			 */
357 			security_sk_classify_flow(sk, &fl);
358 			if (ip_route_output_flow(sock_net(sk), &rt, &fl, sk, 0))
359 				goto no_route;
360 		}
361 		sk_setup_caps(sk, &rt->dst);
362 	}
363 	skb_dst_set_noref(skb, &rt->dst);
364 
365 packet_routed:
366 	if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
367 		goto no_route;
368 
369 	/* OK, we know where to send it, allocate and build IP header. */
370 	skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0));
371 	skb_reset_network_header(skb);
372 	iph = ip_hdr(skb);
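	/*
	 * The 16-bit store below packs version (4) and ihl (5, i.e. a
	 * 20-byte basic header) into the first byte as 0x45 and the TOS
	 * value into the second byte in a single write; IP options, if
	 * any, enlarge ihl further down.
	 */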
373 	*((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));
374 	if (ip_dont_fragment(sk, &rt->dst) && !skb->local_df)
375 		iph->frag_off = htons(IP_DF);
376 	else
377 		iph->frag_off = 0;
378 	iph->ttl      = ip_select_ttl(inet, &rt->dst);
379 	iph->protocol = sk->sk_protocol;
380 	iph->saddr    = rt->rt_src;
381 	iph->daddr    = rt->rt_dst;
382 	/* The transport layer has already set skb->h.foo itself. */
383 
384 	if (opt && opt->optlen) {
385 		iph->ihl += opt->optlen >> 2;
386 		ip_options_build(skb, opt, inet->inet_daddr, rt, 0);
387 	}
388 
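	/*
	 * If this skb is later split by GSO, reserve one IP ID per resulting
	 * segment, i.e. gso_segs - 1 extra IDs beyond this packet's own
	 * (gso_segs is 0 for non-GSO skbs, hence the "?: 1").
	 */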
389 	ip_select_ident_more(iph, &rt->dst, sk,
390 			     (skb_shinfo(skb)->gso_segs ?: 1) - 1);
391 
392 	skb->priority = sk->sk_priority;
393 	skb->mark = sk->sk_mark;
394 
395 	res = ip_local_out(skb);
396 	rcu_read_unlock();
397 	return res;
398 
399 no_route:
400 	rcu_read_unlock();
401 	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
402 	kfree_skb(skb);
403 	return -EHOSTUNREACH;
404 }
405 EXPORT_SYMBOL(ip_queue_xmit);
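/*
 * Illustrative sketch (hypothetical caller, not taken from this file): a
 * connected transport protocol hands ip_queue_xmit() an skb that is owned
 * by its socket, has headroom for the IP header plus options, and whose
 * data pointer sits at the start of the already-built transport header:
 *
 *	skb_set_owner_w(skb, sk);
 *	... build transport header at skb->data ...
 *	err = ip_queue_xmit(skb);
 */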
406 
407 
408 static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
409 {
410 	to->pkt_type = from->pkt_type;
411 	to->priority = from->priority;
412 	to->protocol = from->protocol;
413 	skb_dst_drop(to);
414 	skb_dst_copy(to, from);
415 	to->dev = from->dev;
416 	to->mark = from->mark;
417 
418 	/* Copy the flags to each fragment. */
419 	IPCB(to)->flags = IPCB(from)->flags;
420 
421 #ifdef CONFIG_NET_SCHED
422 	to->tc_index = from->tc_index;
423 #endif
424 	nf_copy(to, from);
425 #if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
426     defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
427 	to->nf_trace = from->nf_trace;
428 #endif
429 #if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
430 	to->ipvs_property = from->ipvs_property;
431 #endif
432 	skb_copy_secmark(to, from);
433 }
434 
435 /*
436  *	This IP datagram is too large to be sent in one piece.  Break it up into
437  *	smaller pieces (each of a size equal to the IP header plus a block of
438  *	the data of the original IP datagram) so that each piece still fits in
439  *	a single device frame, and queue such frames for sending.
440  */
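/*
 * Worked example (illustrative figures only): with a 1500-byte device MTU
 * and a 20-byte IP header, each fragment can carry at most 1480 data bytes.
 * Since frag_off counts 8-byte units, every fragment except the last must
 * carry a multiple of 8 bytes; 1480 already is one, so a 4000-byte payload
 * is split as 1480 + 1480 + 1040 with frag_off values of 0, 185 and 370.
 */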
441 
442 int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
443 {
444 	struct iphdr *iph;
445 	int ptr;
446 	struct net_device *dev;
447 	struct sk_buff *skb2;
448 	unsigned int mtu, hlen, left, len, ll_rs;
449 	int offset;
450 	__be16 not_last_frag;
451 	struct rtable *rt = skb_rtable(skb);
452 	int err = 0;
453 
454 	dev = rt->dst.dev;
455 
456 	/*
457 	 *	Point into the IP datagram header.
458 	 */
459 
460 	iph = ip_hdr(skb);
461 
462 	if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) {
463 		IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
464 		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
465 			  htonl(ip_skb_dst_mtu(skb)));
466 		kfree_skb(skb);
467 		return -EMSGSIZE;
468 	}
469 
470 	/*
471  *	Set up starting values.
472 	 */
473 
474 	hlen = iph->ihl * 4;
475 	mtu = dst_mtu(&rt->dst) - hlen;	/* Size of data space */
476 #ifdef CONFIG_BRIDGE_NETFILTER
477 	if (skb->nf_bridge)
478 		mtu -= nf_bridge_mtu_reduction(skb);
479 #endif
480 	IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE;
481 
482 	/* When a frag_list is given, use it. First, check its validity:
483 	 * some transformers could create a wrong frag_list or break an
484 	 * existing one; that is not prohibited. In this case fall back to copying.
485 	 *
486 	 * LATER: this step can be merged into the real generation of fragments;
487 	 * we can switch to copying when we see the first bad fragment.
488 	 */
489 	if (skb_has_frag_list(skb)) {
490 		struct sk_buff *frag, *frag2;
491 		int first_len = skb_pagelen(skb);
492 
493 		if (first_len - hlen > mtu ||
494 		    ((first_len - hlen) & 7) ||
495 		    (iph->frag_off & htons(IP_MF|IP_OFFSET)) ||
496 		    skb_cloned(skb))
497 			goto slow_path;
498 
499 		skb_walk_frags(skb, frag) {
500 			/* Correct geometry. */
501 			if (frag->len > mtu ||
502 			    ((frag->len & 7) && frag->next) ||
503 			    skb_headroom(frag) < hlen)
504 				goto slow_path_clean;
505 
506 			/* Partially cloned skb? */
507 			if (skb_shared(frag))
508 				goto slow_path_clean;
509 
510 			BUG_ON(frag->sk);
511 			if (skb->sk) {
512 				frag->sk = skb->sk;
513 				frag->destructor = sock_wfree;
514 			}
515 			skb->truesize -= frag->truesize;
516 		}
517 
518 		/* Everything is OK. Generate! */
519 
520 		err = 0;
521 		offset = 0;
522 		frag = skb_shinfo(skb)->frag_list;
523 		skb_frag_list_init(skb);
524 		skb->data_len = first_len - skb_headlen(skb);
525 		skb->len = first_len;
526 		iph->tot_len = htons(first_len);
527 		iph->frag_off = htons(IP_MF);
528 		ip_send_check(iph);
529 
530 		for (;;) {
531 			/* Prepare the header of the next frame
532 			 * before the previous one goes down. */
533 			if (frag) {
534 				frag->ip_summed = CHECKSUM_NONE;
535 				skb_reset_transport_header(frag);
536 				__skb_push(frag, hlen);
537 				skb_reset_network_header(frag);
538 				memcpy(skb_network_header(frag), iph, hlen);
539 				iph = ip_hdr(frag);
540 				iph->tot_len = htons(frag->len);
541 				ip_copy_metadata(frag, skb);
542 				if (offset == 0)
543 					ip_options_fragment(frag);
544 				offset += skb->len - hlen;
545 				iph->frag_off = htons(offset>>3);
546 				if (frag->next != NULL)
547 					iph->frag_off |= htons(IP_MF);
548 				/* Ready, complete checksum */
549 				ip_send_check(iph);
550 			}
551 
552 			err = output(skb);
553 
554 			if (!err)
555 				IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
556 			if (err || !frag)
557 				break;
558 
559 			skb = frag;
560 			frag = skb->next;
561 			skb->next = NULL;
562 		}
563 
564 		if (err == 0) {
565 			IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
566 			return 0;
567 		}
568 
569 		while (frag) {
570 			skb = frag->next;
571 			kfree_skb(frag);
572 			frag = skb;
573 		}
574 		IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
575 		return err;
576 
577 slow_path_clean:
578 		skb_walk_frags(skb, frag2) {
579 			if (frag2 == frag)
580 				break;
581 			frag2->sk = NULL;
582 			frag2->destructor = NULL;
583 			skb->truesize += frag2->truesize;
584 		}
585 	}
586 
587 slow_path:
588 	left = skb->len - hlen;		/* Space per frame */
589 	ptr = hlen;		/* Where to start from */
590 
591 	/* For bridged IP traffic encapsulated inside e.g. a VLAN header,
592 	 * we need to make room for the encapsulating header.
593 	 */
594 	ll_rs = LL_RESERVED_SPACE_EXTRA(rt->dst.dev, nf_bridge_pad(skb));
595 
596 	/*
597 	 *	Fragment the datagram.
598 	 */
599 
600 	offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
601 	not_last_frag = iph->frag_off & htons(IP_MF);
602 
603 	/*
604 	 *	Keep copying data until we run out.
605 	 */
606 
607 	while (left > 0) {
608 		len = left;
609 		/* IF: it doesn't fit, use 'mtu' - the data space left */
610 		if (len > mtu)
611 			len = mtu;
612 		/* IF: we are not sending up to and including the packet end
613 		   then align the next start on an eight byte boundary */
614 		if (len < left)	{
615 			len &= ~7;
616 		}
617 		/*
618 		 *	Allocate buffer.
619 		 */
620 
621 		if ((skb2 = alloc_skb(len+hlen+ll_rs, GFP_ATOMIC)) == NULL) {
622 			NETDEBUG(KERN_INFO "IP: frag: no memory for new fragment!\n");
623 			err = -ENOMEM;
624 			goto fail;
625 		}
626 
627 		/*
628 		 *	Set up data on packet
629 		 */
630 
631 		ip_copy_metadata(skb2, skb);
632 		skb_reserve(skb2, ll_rs);
633 		skb_put(skb2, len + hlen);
634 		skb_reset_network_header(skb2);
635 		skb2->transport_header = skb2->network_header + hlen;
636 
637 		/*
638 		 *	Charge the memory for the fragment to any owner
639 		 *	it might possess
640 		 */
641 
642 		if (skb->sk)
643 			skb_set_owner_w(skb2, skb->sk);
644 
645 		/*
646 		 *	Copy the packet header into the new buffer.
647 		 */
648 
649 		skb_copy_from_linear_data(skb, skb_network_header(skb2), hlen);
650 
651 		/*
652 		 *	Copy a block of the IP datagram.
653 		 */
654 		if (skb_copy_bits(skb, ptr, skb_transport_header(skb2), len))
655 			BUG();
656 		left -= len;
657 
658 		/*
659 		 *	Fill in the new header fields.
660 		 */
661 		iph = ip_hdr(skb2);
662 		iph->frag_off = htons((offset >> 3));
663 
664 		/* ANK: dirty, but effective trick. Upgrade options only if
665 		 * the segment to be fragmented was THE FIRST (otherwise,
666 		 * options are already fixed) and make it ONCE
667 		 * on the initial skb, so that all the following fragments
668 		 * will inherit fixed options.
669 		 */
670 		if (offset == 0)
671 			ip_options_fragment(skb);
672 
673 		/*
674 		 *	Added AC : If we are fragmenting a fragment that's not the
675 		 *		   last fragment, then keep the MF bit set on each piece.
676 		 */
677 		if (left > 0 || not_last_frag)
678 			iph->frag_off |= htons(IP_MF);
679 		ptr += len;
680 		offset += len;
681 
682 		/*
683 		 *	Put this fragment into the sending queue.
684 		 */
685 		iph->tot_len = htons(len + hlen);
686 
687 		ip_send_check(iph);
688 
689 		err = output(skb2);
690 		if (err)
691 			goto fail;
692 
693 		IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
694 	}
695 	kfree_skb(skb);
696 	IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
697 	return err;
698 
699 fail:
700 	kfree_skb(skb);
701 	IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
702 	return err;
703 }
704 EXPORT_SYMBOL(ip_fragment);
705 
706 int
707 ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb)
708 {
709 	struct iovec *iov = from;
710 
711 	if (skb->ip_summed == CHECKSUM_PARTIAL) {
712 		if (memcpy_fromiovecend(to, iov, offset, len) < 0)
713 			return -EFAULT;
714 	} else {
715 		__wsum csum = 0;
716 		if (csum_partial_copy_fromiovecend(to, iov, offset, len, &csum) < 0)
717 			return -EFAULT;
718 		skb->csum = csum_block_add(skb->csum, csum, odd);
719 	}
720 	return 0;
721 }
722 EXPORT_SYMBOL(ip_generic_getfrag);
723 
724 static inline __wsum
725 csum_page(struct page *page, int offset, int copy)
726 {
727 	char *kaddr;
728 	__wsum csum;
729 	kaddr = kmap(page);
730 	csum = csum_partial(kaddr + offset, copy, 0);
731 	kunmap(page);
732 	return csum;
733 }
734 
735 static inline int ip_ufo_append_data(struct sock *sk,
736 			int getfrag(void *from, char *to, int offset, int len,
737 			       int odd, struct sk_buff *skb),
738 			void *from, int length, int hh_len, int fragheaderlen,
739 			int transhdrlen, int mtu, unsigned int flags)
740 {
741 	struct sk_buff *skb;
742 	int err;
743 
744 	/* The network device supports UDP fragmentation offload, so
745 	 * create one single skb containing the complete UDP
746 	 * datagram.
747 	 */
748 	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
749 		skb = sock_alloc_send_skb(sk,
750 			hh_len + fragheaderlen + transhdrlen + 20,
751 			(flags & MSG_DONTWAIT), &err);
752 
753 		if (skb == NULL)
754 			return err;
755 
756 		/* reserve space for Hardware header */
757 		skb_reserve(skb, hh_len);
758 
759 		/* create space for UDP/IP header */
760 		skb_put(skb, fragheaderlen + transhdrlen);
761 
762 		/* initialize network header pointer */
763 		skb_reset_network_header(skb);
764 
765 		/* initialize protocol header pointer */
766 		skb->transport_header = skb->network_header + fragheaderlen;
767 
768 		skb->ip_summed = CHECKSUM_PARTIAL;
769 		skb->csum = 0;
770 		sk->sk_sndmsg_off = 0;
771 
772 		/* specify the length of each IP datagram fragment */
773 		skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
774 		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
775 		__skb_queue_tail(&sk->sk_write_queue, skb);
776 	}
777 
778 	return skb_append_datato_frags(sk, skb, getfrag, from,
779 				       (length - transhdrlen));
780 }
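/*
 * Note on the gso_size chosen above (illustrative numbers): with a
 * 1500-byte MTU and a 20-byte IP header, gso_size is 1480, so the
 * UFO-capable device (or the software GSO fallback) later slices this one
 * large skb into on-the-wire fragments each carrying 1480 bytes of data.
 */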
781 
782 /*
783  *	ip_append_data() and ip_append_page() can make one large IP datagram
784  *	from many pieces of data. Each piece will be held on the socket
785  *	until ip_push_pending_frames() is called. Each piece can be a page
786  *	or non-page data.
787  *
788  *	Not only UDP, but other transport protocols - e.g. raw sockets - can
789  *	potentially use this interface.
790  *
791  *	LATER: length must be adjusted by the pad at the tail, when required.
792  */
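/*
 * Minimal usage sketch (hypothetical caller, error handling elided): a
 * datagram protocol typically pairs this with ip_generic_getfrag() above
 * and flushes with ip_push_pending_frames() once the datagram is complete:
 *
 *	err = ip_append_data(sk, ip_generic_getfrag, msg->msg_iov, len,
 *			     transhdrlen, &ipc, &rt, msg->msg_flags);
 *	if (!err && !(msg->msg_flags & MSG_MORE))
 *		err = ip_push_pending_frames(sk);
 */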
793 int ip_append_data(struct sock *sk,
794 		   int getfrag(void *from, char *to, int offset, int len,
795 			       int odd, struct sk_buff *skb),
796 		   void *from, int length, int transhdrlen,
797 		   struct ipcm_cookie *ipc, struct rtable **rtp,
798 		   unsigned int flags)
799 {
800 	struct inet_sock *inet = inet_sk(sk);
801 	struct sk_buff *skb;
802 
803 	struct ip_options *opt = NULL;
804 	int hh_len;
805 	int exthdrlen;
806 	int mtu;
807 	int copy;
808 	int err;
809 	int offset = 0;
810 	unsigned int maxfraglen, fragheaderlen;
811 	int csummode = CHECKSUM_NONE;
812 	struct rtable *rt;
813 
814 	if (flags&MSG_PROBE)
815 		return 0;
816 
817 	if (skb_queue_empty(&sk->sk_write_queue)) {
818 		/*
819 		 * setup for corking.
820 		 */
821 		opt = ipc->opt;
822 		if (opt) {
823 			if (inet->cork.opt == NULL) {
824 				inet->cork.opt = kmalloc(sizeof(struct ip_options) + 40, sk->sk_allocation);
825 				if (unlikely(inet->cork.opt == NULL))
826 					return -ENOBUFS;
827 			}
828 			memcpy(inet->cork.opt, opt, sizeof(struct ip_options)+opt->optlen);
829 			inet->cork.flags |= IPCORK_OPT;
830 			inet->cork.addr = ipc->addr;
831 		}
832 		rt = *rtp;
833 		if (unlikely(!rt))
834 			return -EFAULT;
835 		/*
836 		 * We steal a reference to this route; the caller should not release it
837 		 */
838 		*rtp = NULL;
839 		inet->cork.fragsize = mtu = inet->pmtudisc == IP_PMTUDISC_PROBE ?
840 					    rt->dst.dev->mtu :
841 					    dst_mtu(rt->dst.path);
842 		inet->cork.dst = &rt->dst;
843 		inet->cork.length = 0;
844 		sk->sk_sndmsg_page = NULL;
845 		sk->sk_sndmsg_off = 0;
846 		exthdrlen = rt->dst.header_len;
847 		length += exthdrlen;
848 		transhdrlen += exthdrlen;
849 	} else {
850 		rt = (struct rtable *)inet->cork.dst;
851 		if (inet->cork.flags & IPCORK_OPT)
852 			opt = inet->cork.opt;
853 
854 		transhdrlen = 0;
855 		exthdrlen = 0;
856 		mtu = inet->cork.fragsize;
857 	}
858 	hh_len = LL_RESERVED_SPACE(rt->dst.dev);
859 
860 	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
861 	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
862 
863 	if (inet->cork.length + length > 0xFFFF - fragheaderlen) {
864 		ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->inet_dport,
865 			       mtu-exthdrlen);
866 		return -EMSGSIZE;
867 	}
868 
869 	/*
870 	 * transhdrlen > 0 means that this is the first fragment and we wish
871 	 * it not to be fragmented later.
872 	 */
873 	if (transhdrlen &&
874 	    length + fragheaderlen <= mtu &&
875 	    rt->dst.dev->features & NETIF_F_V4_CSUM &&
876 	    !exthdrlen)
877 		csummode = CHECKSUM_PARTIAL;
878 
879 	skb = skb_peek_tail(&sk->sk_write_queue);
880 
881 	inet->cork.length += length;
882 	if (((length > mtu) || (skb && skb_is_gso(skb))) &&
883 	    (sk->sk_protocol == IPPROTO_UDP) &&
884 	    (rt->dst.dev->features & NETIF_F_UFO)) {
885 		err = ip_ufo_append_data(sk, getfrag, from, length, hh_len,
886 					 fragheaderlen, transhdrlen, mtu,
887 					 flags);
888 		if (err)
889 			goto error;
890 		return 0;
891 	}
892 
893 	/* So, what's going on in the loop below?
894 	 *
895 	 * We use the calculated fragment length to generate a chain of skbs;
896 	 * each segment is an IP fragment, ready for sending to the network
897 	 * once the appropriate IP header has been added.
898 	 */
899 
900 	if (!skb)
901 		goto alloc_new_skb;
902 
903 	while (length > 0) {
904 		/* Check if the remaining data fits into current packet. */
905 		copy = mtu - skb->len;
906 		if (copy < length)
907 			copy = maxfraglen - skb->len;
908 		if (copy <= 0) {
909 			char *data;
910 			unsigned int datalen;
911 			unsigned int fraglen;
912 			unsigned int fraggap;
913 			unsigned int alloclen;
914 			struct sk_buff *skb_prev;
915 alloc_new_skb:
916 			skb_prev = skb;
917 			if (skb_prev)
918 				fraggap = skb_prev->len - maxfraglen;
919 			else
920 				fraggap = 0;
921 
922 			/*
923 			 * If remaining data exceeds the mtu,
924 			 * we know we need more fragment(s).
925 			 */
926 			datalen = length + fraggap;
927 			if (datalen > mtu - fragheaderlen)
928 				datalen = maxfraglen - fragheaderlen;
929 			fraglen = datalen + fragheaderlen;
930 
931 			if ((flags & MSG_MORE) &&
932 			    !(rt->dst.dev->features&NETIF_F_SG))
933 				alloclen = mtu;
934 			else
935 				alloclen = fraglen;
936 
937 			/* The last fragment gets additional space at tail.
938 			 * Note that with MSG_MORE we overallocate on fragments,
939 			 * because we have no idea which fragment will be
940 			 * the last.
941 			 */
942 			if (datalen == length + fraggap) {
943 				alloclen += rt->dst.trailer_len;
944 				/* make sure mtu is not reached */
945 				if (datalen > mtu - fragheaderlen - rt->dst.trailer_len)
946 					datalen -= ALIGN(rt->dst.trailer_len, 8);
947 			}
948 			if (transhdrlen) {
949 				skb = sock_alloc_send_skb(sk,
950 						alloclen + hh_len + 15,
951 						(flags & MSG_DONTWAIT), &err);
952 			} else {
953 				skb = NULL;
954 				if (atomic_read(&sk->sk_wmem_alloc) <=
955 				    2 * sk->sk_sndbuf)
956 					skb = sock_wmalloc(sk,
957 							   alloclen + hh_len + 15, 1,
958 							   sk->sk_allocation);
959 				if (unlikely(skb == NULL))
960 					err = -ENOBUFS;
961 				else
962 					/* only the initial fragment is
963 					   time stamped */
964 					ipc->tx_flags = 0;
965 			}
966 			if (skb == NULL)
967 				goto error;
968 
969 			/*
970 			 *	Fill in the control structures
971 			 */
972 			skb->ip_summed = csummode;
973 			skb->csum = 0;
974 			skb_reserve(skb, hh_len);
975 			skb_shinfo(skb)->tx_flags = ipc->tx_flags;
976 
977 			/*
978 			 *	Find where to start putting bytes.
979 			 */
980 			data = skb_put(skb, fraglen);
981 			skb_set_network_header(skb, exthdrlen);
982 			skb->transport_header = (skb->network_header +
983 						 fragheaderlen);
984 			data += fragheaderlen;
985 
986 			if (fraggap) {
987 				skb->csum = skb_copy_and_csum_bits(
988 					skb_prev, maxfraglen,
989 					data + transhdrlen, fraggap, 0);
990 				skb_prev->csum = csum_sub(skb_prev->csum,
991 							  skb->csum);
992 				data += fraggap;
993 				pskb_trim_unique(skb_prev, maxfraglen);
994 			}
995 
996 			copy = datalen - transhdrlen - fraggap;
997 			if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
998 				err = -EFAULT;
999 				kfree_skb(skb);
1000 				goto error;
1001 			}
1002 
1003 			offset += copy;
1004 			length -= datalen - fraggap;
1005 			transhdrlen = 0;
1006 			exthdrlen = 0;
1007 			csummode = CHECKSUM_NONE;
1008 
1009 			/*
1010 			 * Put the packet on the pending queue.
1011 			 */
1012 			__skb_queue_tail(&sk->sk_write_queue, skb);
1013 			continue;
1014 		}
1015 
1016 		if (copy > length)
1017 			copy = length;
1018 
1019 		if (!(rt->dst.dev->features&NETIF_F_SG)) {
1020 			unsigned int off;
1021 
1022 			off = skb->len;
1023 			if (getfrag(from, skb_put(skb, copy),
1024 					offset, copy, off, skb) < 0) {
1025 				__skb_trim(skb, off);
1026 				err = -EFAULT;
1027 				goto error;
1028 			}
1029 		} else {
1030 			int i = skb_shinfo(skb)->nr_frags;
1031 			skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
1032 			struct page *page = sk->sk_sndmsg_page;
1033 			int off = sk->sk_sndmsg_off;
1034 			unsigned int left;
1035 
1036 			if (page && (left = PAGE_SIZE - off) > 0) {
1037 				if (copy >= left)
1038 					copy = left;
1039 				if (page != frag->page) {
1040 					if (i == MAX_SKB_FRAGS) {
1041 						err = -EMSGSIZE;
1042 						goto error;
1043 					}
1044 					get_page(page);
1045 					skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
1046 					frag = &skb_shinfo(skb)->frags[i];
1047 				}
1048 			} else if (i < MAX_SKB_FRAGS) {
1049 				if (copy > PAGE_SIZE)
1050 					copy = PAGE_SIZE;
1051 				page = alloc_pages(sk->sk_allocation, 0);
1052 				if (page == NULL)  {
1053 					err = -ENOMEM;
1054 					goto error;
1055 				}
1056 				sk->sk_sndmsg_page = page;
1057 				sk->sk_sndmsg_off = 0;
1058 
1059 				skb_fill_page_desc(skb, i, page, 0, 0);
1060 				frag = &skb_shinfo(skb)->frags[i];
1061 			} else {
1062 				err = -EMSGSIZE;
1063 				goto error;
1064 			}
1065 			if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
1066 				err = -EFAULT;
1067 				goto error;
1068 			}
1069 			sk->sk_sndmsg_off += copy;
1070 			frag->size += copy;
1071 			skb->len += copy;
1072 			skb->data_len += copy;
1073 			skb->truesize += copy;
1074 			atomic_add(copy, &sk->sk_wmem_alloc);
1075 		}
1076 		offset += copy;
1077 		length -= copy;
1078 	}
1079 
1080 	return 0;
1081 
1082 error:
1083 	inet->cork.length -= length;
1084 	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
1085 	return err;
1086 }
1087 
1088 ssize_t	ip_append_page(struct sock *sk, struct page *page,
1089 		       int offset, size_t size, int flags)
1090 {
1091 	struct inet_sock *inet = inet_sk(sk);
1092 	struct sk_buff *skb;
1093 	struct rtable *rt;
1094 	struct ip_options *opt = NULL;
1095 	int hh_len;
1096 	int mtu;
1097 	int len;
1098 	int err;
1099 	unsigned int maxfraglen, fragheaderlen, fraggap;
1100 
1101 	if (inet->hdrincl)
1102 		return -EPERM;
1103 
1104 	if (flags&MSG_PROBE)
1105 		return 0;
1106 
1107 	if (skb_queue_empty(&sk->sk_write_queue))
1108 		return -EINVAL;
1109 
1110 	rt = (struct rtable *)inet->cork.dst;
1111 	if (inet->cork.flags & IPCORK_OPT)
1112 		opt = inet->cork.opt;
1113 
1114 	if (!(rt->dst.dev->features&NETIF_F_SG))
1115 		return -EOPNOTSUPP;
1116 
1117 	hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1118 	mtu = inet->cork.fragsize;
1119 
1120 	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
1121 	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
1122 
1123 	if (inet->cork.length + size > 0xFFFF - fragheaderlen) {
1124 		ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->inet_dport, mtu);
1125 		return -EMSGSIZE;
1126 	}
1127 
1128 	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
1129 		return -EINVAL;
1130 
1131 	inet->cork.length += size;
1132 	if ((size + skb->len > mtu) &&
1133 	    (sk->sk_protocol == IPPROTO_UDP) &&
1134 	    (rt->dst.dev->features & NETIF_F_UFO)) {
1135 		skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
1136 		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
1137 	}
1138 
1139 
1140 	while (size > 0) {
1141 		int i;
1142 
1143 		if (skb_is_gso(skb))
1144 			len = size;
1145 		else {
1146 
1147 			/* Check if the remaining data fits into current packet. */
1148 			len = mtu - skb->len;
1149 			if (len < size)
1150 				len = maxfraglen - skb->len;
1151 		}
1152 		if (len <= 0) {
1153 			struct sk_buff *skb_prev;
1154 			int alloclen;
1155 
1156 			skb_prev = skb;
1157 			fraggap = skb_prev->len - maxfraglen;
1158 
1159 			alloclen = fragheaderlen + hh_len + fraggap + 15;
1160 			skb = sock_wmalloc(sk, alloclen, 1, sk->sk_allocation);
1161 			if (unlikely(!skb)) {
1162 				err = -ENOBUFS;
1163 				goto error;
1164 			}
1165 
1166 			/*
1167 			 *	Fill in the control structures
1168 			 */
1169 			skb->ip_summed = CHECKSUM_NONE;
1170 			skb->csum = 0;
1171 			skb_reserve(skb, hh_len);
1172 
1173 			/*
1174 			 *	Find where to start putting bytes.
1175 			 */
1176 			skb_put(skb, fragheaderlen + fraggap);
1177 			skb_reset_network_header(skb);
1178 			skb->transport_header = (skb->network_header +
1179 						 fragheaderlen);
1180 			if (fraggap) {
1181 				skb->csum = skb_copy_and_csum_bits(skb_prev,
1182 								   maxfraglen,
1183 						    skb_transport_header(skb),
1184 								   fraggap, 0);
1185 				skb_prev->csum = csum_sub(skb_prev->csum,
1186 							  skb->csum);
1187 				pskb_trim_unique(skb_prev, maxfraglen);
1188 			}
1189 
1190 			/*
1191 			 * Put the packet on the pending queue.
1192 			 */
1193 			__skb_queue_tail(&sk->sk_write_queue, skb);
1194 			continue;
1195 		}
1196 
1197 		i = skb_shinfo(skb)->nr_frags;
1198 		if (len > size)
1199 			len = size;
1200 		if (skb_can_coalesce(skb, i, page, offset)) {
1201 			skb_shinfo(skb)->frags[i-1].size += len;
1202 		} else if (i < MAX_SKB_FRAGS) {
1203 			get_page(page);
1204 			skb_fill_page_desc(skb, i, page, offset, len);
1205 		} else {
1206 			err = -EMSGSIZE;
1207 			goto error;
1208 		}
1209 
1210 		if (skb->ip_summed == CHECKSUM_NONE) {
1211 			__wsum csum;
1212 			csum = csum_page(page, offset, len);
1213 			skb->csum = csum_block_add(skb->csum, csum, skb->len);
1214 		}
1215 
1216 		skb->len += len;
1217 		skb->data_len += len;
1218 		skb->truesize += len;
1219 		atomic_add(len, &sk->sk_wmem_alloc);
1220 		offset += len;
1221 		size -= len;
1222 	}
1223 	return 0;
1224 
1225 error:
1226 	inet->cork.length -= size;
1227 	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
1228 	return err;
1229 }
1230 
1231 static void ip_cork_release(struct inet_sock *inet)
1232 {
1233 	inet->cork.flags &= ~IPCORK_OPT;
1234 	kfree(inet->cork.opt);
1235 	inet->cork.opt = NULL;
1236 	dst_release(inet->cork.dst);
1237 	inet->cork.dst = NULL;
1238 }
1239 
1240 /*
1241  *	Combine all pending IP fragments on the socket into one IP datagram
1242  *	and push them out.
1243  */
1244 int ip_push_pending_frames(struct sock *sk)
1245 {
1246 	struct sk_buff *skb, *tmp_skb;
1247 	struct sk_buff **tail_skb;
1248 	struct inet_sock *inet = inet_sk(sk);
1249 	struct net *net = sock_net(sk);
1250 	struct ip_options *opt = NULL;
1251 	struct rtable *rt = (struct rtable *)inet->cork.dst;
1252 	struct iphdr *iph;
1253 	__be16 df = 0;
1254 	__u8 ttl;
1255 	int err = 0;
1256 
1257 	if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
1258 		goto out;
1259 	tail_skb = &(skb_shinfo(skb)->frag_list);
1260 
1261 	/* Move skb->data from the ext header to the IP header. */
1262 	if (skb->data < skb_network_header(skb))
1263 		__skb_pull(skb, skb_network_offset(skb));
1264 	while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
1265 		__skb_pull(tmp_skb, skb_network_header_len(skb));
1266 		*tail_skb = tmp_skb;
1267 		tail_skb = &(tmp_skb->next);
1268 		skb->len += tmp_skb->len;
1269 		skb->data_len += tmp_skb->len;
1270 		skb->truesize += tmp_skb->truesize;
1271 		tmp_skb->destructor = NULL;
1272 		tmp_skb->sk = NULL;
1273 	}
1274 
1275 	/* Unless the user demanded real PMTU discovery (IP_PMTUDISC_DO), we allow
1276 	 * the frame generated here to be fragmented. No matter how transforms
1277 	 * change the size of the packet, it will come out.
1278 	 */
1279 	if (inet->pmtudisc < IP_PMTUDISC_DO)
1280 		skb->local_df = 1;
1281 
1282 	/* DF bit is set when we want to see DF on outgoing frames.
1283 	 * If local_df is set too, we still allow this frame to be fragmented
1284 	 * locally. */
1285 	if (inet->pmtudisc >= IP_PMTUDISC_DO ||
1286 	    (skb->len <= dst_mtu(&rt->dst) &&
1287 	     ip_dont_fragment(sk, &rt->dst)))
1288 		df = htons(IP_DF);
1289 
1290 	if (inet->cork.flags & IPCORK_OPT)
1291 		opt = inet->cork.opt;
1292 
1293 	if (rt->rt_type == RTN_MULTICAST)
1294 		ttl = inet->mc_ttl;
1295 	else
1296 		ttl = ip_select_ttl(inet, &rt->dst);
1297 
1298 	iph = (struct iphdr *)skb->data;
1299 	iph->version = 4;
1300 	iph->ihl = 5;
1301 	if (opt) {
1302 		iph->ihl += opt->optlen>>2;
1303 		ip_options_build(skb, opt, inet->cork.addr, rt, 0);
1304 	}
1305 	iph->tos = inet->tos;
1306 	iph->frag_off = df;
1307 	ip_select_ident(iph, &rt->dst, sk);
1308 	iph->ttl = ttl;
1309 	iph->protocol = sk->sk_protocol;
1310 	iph->saddr = rt->rt_src;
1311 	iph->daddr = rt->rt_dst;
1312 
1313 	skb->priority = sk->sk_priority;
1314 	skb->mark = sk->sk_mark;
1315 	/*
1316 	 * Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec
1317 	 * on dst refcount
1318 	 */
1319 	inet->cork.dst = NULL;
1320 	skb_dst_set(skb, &rt->dst);
1321 
1322 	if (iph->protocol == IPPROTO_ICMP)
1323 		icmp_out_count(net, ((struct icmphdr *)
1324 			skb_transport_header(skb))->type);
1325 
1326 	/* Netfilter gets the whole, not yet fragmented skb. */
1327 	err = ip_local_out(skb);
1328 	if (err) {
1329 		if (err > 0)
1330 			err = net_xmit_errno(err);
1331 		if (err)
1332 			goto error;
1333 	}
1334 
1335 out:
1336 	ip_cork_release(inet);
1337 	return err;
1338 
1339 error:
1340 	IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS);
1341 	goto out;
1342 }
1343 
1344 /*
1345  *	Throw away all pending data on the socket.
1346  */
1347 void ip_flush_pending_frames(struct sock *sk)
1348 {
1349 	struct sk_buff *skb;
1350 
1351 	while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL)
1352 		kfree_skb(skb);
1353 
1354 	ip_cork_release(inet_sk(sk));
1355 }
1356 
1357 
1358 /*
1359  *	Fetch data from kernel space and fill in checksum if needed.
1360  */
1361 static int ip_reply_glue_bits(void *dptr, char *to, int offset,
1362 			      int len, int odd, struct sk_buff *skb)
1363 {
1364 	__wsum csum;
1365 
1366 	csum = csum_partial_copy_nocheck(dptr+offset, to, len, 0);
1367 	skb->csum = csum_block_add(skb->csum, csum, odd);
1368 	return 0;
1369 }
1370 
1371 /*
1372  *	Generic function to send a packet as a reply to another packet.
1373  *	So far it is used to send TCP resets; ICMP should use this function too.
1374  *
1375  *	Should run single-threaded per socket because it uses the sock
1376  *	structure to pass arguments.
1377  */
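/*
 * Illustrative sketch (hypothetical caller, loosely modelled on a reset
 * path): the caller points arg->iov at the prebuilt reply payload, seeds
 * arg->csum with any checksum not covered by that data, sets
 * arg->csumoffset to the 16-bit word offset of the transport checksum
 * field within the transport header, and then calls:
 *
 *	ip_send_reply(ctl_sk, rcv_skb, &arg, reply_len);
 *
 * ctl_sk, rcv_skb, arg and reply_len are placeholder names.
 */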
1378 void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *arg,
1379 		   unsigned int len)
1380 {
1381 	struct inet_sock *inet = inet_sk(sk);
1382 	struct {
1383 		struct ip_options	opt;
1384 		char			data[40];
1385 	} replyopts;
1386 	struct ipcm_cookie ipc;
1387 	__be32 daddr;
1388 	struct rtable *rt = skb_rtable(skb);
1389 
1390 	if (ip_options_echo(&replyopts.opt, skb))
1391 		return;
1392 
1393 	daddr = ipc.addr = rt->rt_src;
1394 	ipc.opt = NULL;
1395 	ipc.tx_flags = 0;
1396 
1397 	if (replyopts.opt.optlen) {
1398 		ipc.opt = &replyopts.opt;
1399 
1400 		if (ipc.opt->srr)
1401 			daddr = replyopts.opt.faddr;
1402 	}
1403 
1404 	{
1405 		struct flowi fl = { .oif = arg->bound_dev_if,
1406 				    .fl4_dst = daddr,
1407 				    .fl4_src = rt->rt_spec_dst,
1408 				    .fl4_tos = RT_TOS(ip_hdr(skb)->tos),
1409 				    .fl_ip_sport = tcp_hdr(skb)->dest,
1410 				    .fl_ip_dport = tcp_hdr(skb)->source,
1411 				    .proto = sk->sk_protocol,
1412 				    .flags = ip_reply_arg_flowi_flags(arg) };
1413 		security_skb_classify_flow(skb, &fl);
1414 		if (ip_route_output_key(sock_net(sk), &rt, &fl))
1415 			return;
1416 	}
1417 
1418 	/* And let IP do all the hard work.
1419 
1420 	   This chunk is not reentrant, hence the spinlock.
1421 	   Note that it relies on the fact that this function is called
1422 	   with BHs locally disabled and that sk cannot already be spinlocked.
1423 	 */
1424 	bh_lock_sock(sk);
1425 	inet->tos = ip_hdr(skb)->tos;
1426 	sk->sk_priority = skb->priority;
1427 	sk->sk_protocol = ip_hdr(skb)->protocol;
1428 	sk->sk_bound_dev_if = arg->bound_dev_if;
1429 	ip_append_data(sk, ip_reply_glue_bits, arg->iov->iov_base, len, 0,
1430 		       &ipc, &rt, MSG_DONTWAIT);
1431 	if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) {
1432 		if (arg->csumoffset >= 0)
1433 			*((__sum16 *)skb_transport_header(skb) +
1434 			  arg->csumoffset) = csum_fold(csum_add(skb->csum,
1435 								arg->csum));
1436 		skb->ip_summed = CHECKSUM_NONE;
1437 		ip_push_pending_frames(sk);
1438 	}
1439 
1440 	bh_unlock_sock(sk);
1441 
1442 	ip_rt_put(rt);
1443 }
1444 
1445 void __init ip_init(void)
1446 {
1447 	ip_rt_init();
1448 	inet_initpeers();
1449 
1450 #if defined(CONFIG_IP_MULTICAST) && defined(CONFIG_PROC_FS)
1451 	igmp_mc_proc_init();
1452 #endif
1453 }
1454