xref: /openbmc/linux/net/ipv4/ip_output.c (revision a09d2831)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		The Internet Protocol (IP) output module.
7  *
8  * Authors:	Ross Biro
9  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10  *		Donald Becker, <becker@super.org>
11  *		Alan Cox, <Alan.Cox@linux.org>
12  *		Richard Underwood
13  *		Stefan Becker, <stefanb@yello.ping.de>
14  *		Jorge Cwik, <jorge@laser.satlink.net>
15  *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
16  *		Hirokazu Takahashi, <taka@valinux.co.jp>
17  *
18  *	See ip_input.c for original log
19  *
20  *	Fixes:
21  *		Alan Cox	:	Missing nonblock feature in ip_build_xmit.
22  *		Mike Kilburn	:	htons() missing in ip_build_xmit.
23  *		Bradford Johnson:	Fix faulty handling of some frames when
24  *					no route is found.
25  *		Alexander Demenshin:	Missing sk/skb free in ip_queue_xmit
26  *					(in case the packet is not accepted
27  *					by output firewall rules)
28  *		Mike McLagan	:	Routing by source
29  *		Alexey Kuznetsov:	use new route cache
30  *		Andi Kleen:		Fix broken PMTU recovery and remove
31  *					some redundant tests.
32  *	Vitaly E. Lavrov	:	Transparent proxy revived after a year-long coma.
33  *		Andi Kleen	: 	Replace ip_reply with ip_send_reply.
34  *		Andi Kleen	:	Split fast and slow ip_build_xmit path
35  *					for decreased register pressure on x86
36  *					and more readability.
37  *		Marc Boucher	:	When call_out_firewall returns FW_QUEUE,
38  *					silently drop skb instead of failing with -EPERM.
39  *		Detlev Wengorz	:	Copy protocol for fragments.
40  *		Hirokazu Takahashi:	HW checksumming for outgoing UDP
41  *					datagrams.
42  *		Hirokazu Takahashi:	sendfile() on UDP works now.
43  */
44 
45 #include <asm/uaccess.h>
46 #include <asm/system.h>
47 #include <linux/module.h>
48 #include <linux/types.h>
49 #include <linux/kernel.h>
50 #include <linux/mm.h>
51 #include <linux/string.h>
52 #include <linux/errno.h>
53 #include <linux/highmem.h>
54 
55 #include <linux/socket.h>
56 #include <linux/sockios.h>
57 #include <linux/in.h>
58 #include <linux/inet.h>
59 #include <linux/netdevice.h>
60 #include <linux/etherdevice.h>
61 #include <linux/proc_fs.h>
62 #include <linux/stat.h>
63 #include <linux/init.h>
64 
65 #include <net/snmp.h>
66 #include <net/ip.h>
67 #include <net/protocol.h>
68 #include <net/route.h>
69 #include <net/xfrm.h>
70 #include <linux/skbuff.h>
71 #include <net/sock.h>
72 #include <net/arp.h>
73 #include <net/icmp.h>
74 #include <net/checksum.h>
75 #include <net/inetpeer.h>
76 #include <linux/igmp.h>
77 #include <linux/netfilter_ipv4.h>
78 #include <linux/netfilter_bridge.h>
79 #include <linux/mroute.h>
80 #include <linux/netlink.h>
81 #include <linux/tcp.h>
82 
83 int sysctl_ip_default_ttl __read_mostly = IPDEFTTL;
84 
85 /* Generate a checksum for an outgoing IP datagram. */
86 __inline__ void ip_send_check(struct iphdr *iph)
87 {
88 	iph->check = 0;
89 	iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
90 }
91 
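/*
 * Fill in the IP header's total length and checksum, then run the packet
 * through the NF_INET_LOCAL_OUT netfilter hook with dst_output() as the okfn.
 */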
92 int __ip_local_out(struct sk_buff *skb)
93 {
94 	struct iphdr *iph = ip_hdr(skb);
95 
96 	iph->tot_len = htons(skb->len);
97 	ip_send_check(iph);
98 	return nf_hook(PF_INET, NF_INET_LOCAL_OUT, skb, NULL, skb_dst(skb)->dev,
99 		       dst_output);
100 }
101 
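/*
 * nf_hook() returns 1 when netfilter did not steal, queue or drop the packet;
 * in that case hand it to dst_output() ourselves.
 */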
102 int ip_local_out(struct sk_buff *skb)
103 {
104 	int err;
105 
106 	err = __ip_local_out(skb);
107 	if (likely(err == 1))
108 		err = dst_output(skb);
109 
110 	return err;
111 }
112 EXPORT_SYMBOL_GPL(ip_local_out);
113 
114 /* dev_loopback_xmit for use with netfilter. */
115 static int ip_dev_loopback_xmit(struct sk_buff *newskb)
116 {
117 	skb_reset_mac_header(newskb);
118 	__skb_pull(newskb, skb_network_offset(newskb));
119 	newskb->pkt_type = PACKET_LOOPBACK;
120 	newskb->ip_summed = CHECKSUM_UNNECESSARY;
121 	WARN_ON(!skb_dst(newskb));
122 	netif_rx(newskb);
123 	return 0;
124 }
125 
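/* Use the socket's unicast TTL if set, else the route's RTAX_HOPLIMIT metric. */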
126 static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
127 {
128 	int ttl = inet->uc_ttl;
129 
130 	if (ttl < 0)
131 		ttl = dst_metric(dst, RTAX_HOPLIMIT);
132 	return ttl;
133 }
134 
135 /*
136  *		Add an IP header to an skbuff and send it out.
137  *
138  */
139 int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
140 			  __be32 saddr, __be32 daddr, struct ip_options *opt)
141 {
142 	struct inet_sock *inet = inet_sk(sk);
143 	struct rtable *rt = skb_rtable(skb);
144 	struct iphdr *iph;
145 
146 	/* Build the IP header. */
147 	skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0));
148 	skb_reset_network_header(skb);
149 	iph = ip_hdr(skb);
150 	iph->version  = 4;
151 	iph->ihl      = 5;
152 	iph->tos      = inet->tos;
153 	if (ip_dont_fragment(sk, &rt->u.dst))
154 		iph->frag_off = htons(IP_DF);
155 	else
156 		iph->frag_off = 0;
157 	iph->ttl      = ip_select_ttl(inet, &rt->u.dst);
158 	iph->daddr    = rt->rt_dst;
159 	iph->saddr    = rt->rt_src;
160 	iph->protocol = sk->sk_protocol;
161 	ip_select_ident(iph, &rt->u.dst, sk);
162 
163 	if (opt && opt->optlen) {
164 		iph->ihl += opt->optlen>>2;
165 		ip_options_build(skb, opt, daddr, rt, 0);
166 	}
167 
168 	skb->priority = sk->sk_priority;
169 	skb->mark = sk->sk_mark;
170 
171 	/* Send it out. */
172 	return ip_local_out(skb);
173 }
174 
175 EXPORT_SYMBOL_GPL(ip_build_and_send_pkt);
176 
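/*
 * Last step of the output path: account multicast/broadcast output bytes,
 * make sure there is enough headroom for the link-layer header, then hand
 * the packet to the cached hardware header (dst->hh) or the neighbour's
 * output function.
 */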
177 static inline int ip_finish_output2(struct sk_buff *skb)
178 {
179 	struct dst_entry *dst = skb_dst(skb);
180 	struct rtable *rt = (struct rtable *)dst;
181 	struct net_device *dev = dst->dev;
182 	unsigned int hh_len = LL_RESERVED_SPACE(dev);
183 
184 	if (rt->rt_type == RTN_MULTICAST) {
185 		IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTMCAST, skb->len);
186 	} else if (rt->rt_type == RTN_BROADCAST)
187 		IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTBCAST, skb->len);
188 
189 	/* Be paranoid, rather than too clever. */
190 	if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
191 		struct sk_buff *skb2;
192 
193 		skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
194 		if (skb2 == NULL) {
195 			kfree_skb(skb);
196 			return -ENOMEM;
197 		}
198 		if (skb->sk)
199 			skb_set_owner_w(skb2, skb->sk);
200 		kfree_skb(skb);
201 		skb = skb2;
202 	}
203 
204 	if (dst->hh)
205 		return neigh_hh_output(dst->hh, skb);
206 	else if (dst->neighbour)
207 		return dst->neighbour->output(skb);
208 
209 	if (net_ratelimit())
210 		printk(KERN_DEBUG "ip_finish_output2: No header cache and no neighbour!\n");
211 	kfree_skb(skb);
212 	return -EINVAL;
213 }
214 
215 static inline int ip_skb_dst_mtu(struct sk_buff *skb)
216 {
217 	struct inet_sock *inet = skb->sk ? inet_sk(skb->sk) : NULL;
218 
219 	return (inet && inet->pmtudisc == IP_PMTUDISC_PROBE) ?
220 	       skb_dst(skb)->dev->mtu : dst_mtu(skb_dst(skb));
221 }
222 
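/*
 * If a policy lookup after SNAT attached an xfrm transformation, restart
 * output via dst_output(); otherwise fragment oversized non-GSO packets
 * and finish transmission in ip_finish_output2().
 */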
223 static int ip_finish_output(struct sk_buff *skb)
224 {
225 #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
226 	/* Policy lookup after SNAT yielded a new policy */
227 	if (skb_dst(skb)->xfrm != NULL) {
228 		IPCB(skb)->flags |= IPSKB_REROUTED;
229 		return dst_output(skb);
230 	}
231 #endif
232 	if (skb->len > ip_skb_dst_mtu(skb) && !skb_is_gso(skb))
233 		return ip_fragment(skb, ip_finish_output2);
234 	else
235 		return ip_finish_output2(skb);
236 }
237 
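/*
 * Output path for multicast/broadcast routes: loop a clone back to local
 * listeners via ip_dev_loopback_xmit() where needed, then send the original
 * through the POST_ROUTING hook and ip_finish_output().
 */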
238 int ip_mc_output(struct sk_buff *skb)
239 {
240 	struct sock *sk = skb->sk;
241 	struct rtable *rt = skb_rtable(skb);
242 	struct net_device *dev = rt->u.dst.dev;
243 
244 	/*
245 	 *	If the indicated interface is up and running, send the packet.
246 	 */
247 	IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);
248 
249 	skb->dev = dev;
250 	skb->protocol = htons(ETH_P_IP);
251 
252 	/*
253 	 *	Multicasts are looped back for other local users
254 	 */
255 
256 	if (rt->rt_flags&RTCF_MULTICAST) {
257 		if ((!sk || inet_sk(sk)->mc_loop)
258 #ifdef CONFIG_IP_MROUTE
259 		/* Small optimization: do not loop back non-local frames
260 		   that returned after forwarding; they will be dropped
261 		   by ip_mr_input in any case.
262 		   Note that local frames are looped back to be delivered
263 		   to local recipients.
264 
265 		   This check is duplicated in ip_mr_input at the moment.
266 		 */
267 		    &&
268 		    ((rt->rt_flags & RTCF_LOCAL) ||
269 		     !(IPCB(skb)->flags & IPSKB_FORWARDED))
270 #endif
271 		   ) {
272 			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
273 			if (newskb)
274 				NF_HOOK(PF_INET, NF_INET_POST_ROUTING, newskb,
275 					NULL, newskb->dev,
276 					ip_dev_loopback_xmit);
277 		}
278 
279 		/* Multicasts with ttl 0 must not go beyond the host */
280 
281 		if (ip_hdr(skb)->ttl == 0) {
282 			kfree_skb(skb);
283 			return 0;
284 		}
285 	}
286 
287 	if (rt->rt_flags&RTCF_BROADCAST) {
288 		struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
289 		if (newskb)
290 			NF_HOOK(PF_INET, NF_INET_POST_ROUTING, newskb, NULL,
291 				newskb->dev, ip_dev_loopback_xmit);
292 	}
293 
294 	return NF_HOOK_COND(PF_INET, NF_INET_POST_ROUTING, skb, NULL, skb->dev,
295 			    ip_finish_output,
296 			    !(IPCB(skb)->flags & IPSKB_REROUTED));
297 }
298 
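/*
 * Standard unicast output routine invoked via dst_output(): update output
 * statistics, set the outgoing device and protocol, then run POST_ROUTING
 * (skipped for packets flagged IPSKB_REROUTED) before ip_finish_output().
 */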
299 int ip_output(struct sk_buff *skb)
300 {
301 	struct net_device *dev = skb_dst(skb)->dev;
302 
303 	IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);
304 
305 	skb->dev = dev;
306 	skb->protocol = htons(ETH_P_IP);
307 
308 	return NF_HOOK_COND(PF_INET, NF_INET_POST_ROUTING, skb, NULL, dev,
309 			    ip_finish_output,
310 			    !(IPCB(skb)->flags & IPSKB_REROUTED));
311 }
312 
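/*
 * Main transmit routine for connection-oriented sockets (e.g. TCP): reuse
 * the route already attached to the skb or cached on the socket, otherwise
 * route the socket's flow, then build the IP header and call ip_local_out().
 */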
313 int ip_queue_xmit(struct sk_buff *skb, int ipfragok)
314 {
315 	struct sock *sk = skb->sk;
316 	struct inet_sock *inet = inet_sk(sk);
317 	struct ip_options *opt = inet->opt;
318 	struct rtable *rt;
319 	struct iphdr *iph;
320 
321 	/* Skip all of this if the packet is already routed,
322 	 * e.g. by something like SCTP.
323 	 */
324 	rt = skb_rtable(skb);
325 	if (rt != NULL)
326 		goto packet_routed;
327 
328 	/* Make sure we can route this packet. */
329 	rt = (struct rtable *)__sk_dst_check(sk, 0);
330 	if (rt == NULL) {
331 		__be32 daddr;
332 
333 		/* Use correct destination address if we have options. */
334 		daddr = inet->inet_daddr;
335 		if(opt && opt->srr)
336 			daddr = opt->faddr;
337 
338 		{
339 			struct flowi fl = { .oif = sk->sk_bound_dev_if,
340 					    .mark = sk->sk_mark,
341 					    .nl_u = { .ip4_u =
342 						      { .daddr = daddr,
343 							.saddr = inet->inet_saddr,
344 							.tos = RT_CONN_FLAGS(sk) } },
345 					    .proto = sk->sk_protocol,
346 					    .flags = inet_sk_flowi_flags(sk),
347 					    .uli_u = { .ports =
348 						       { .sport = inet->inet_sport,
349 							 .dport = inet->inet_dport } } };
350 
351 			/* If this fails, the transport layer's retransmit mechanism
352 			 * will keep trying until a route appears or the connection
353 			 * times out.
354 			 */
355 			security_sk_classify_flow(sk, &fl);
356 			if (ip_route_output_flow(sock_net(sk), &rt, &fl, sk, 0))
357 				goto no_route;
358 		}
359 		sk_setup_caps(sk, &rt->u.dst);
360 	}
361 	skb_dst_set(skb, dst_clone(&rt->u.dst));
362 
363 packet_routed:
364 	if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
365 		goto no_route;
366 
367 	/* OK, we know where to send it, allocate and build IP header. */
368 	skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0));
369 	skb_reset_network_header(skb);
370 	iph = ip_hdr(skb);
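	/* Write version (4), header length (5 words) and TOS in one 16-bit store. */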
371 	*((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));
372 	if (ip_dont_fragment(sk, &rt->u.dst) && !ipfragok)
373 		iph->frag_off = htons(IP_DF);
374 	else
375 		iph->frag_off = 0;
376 	iph->ttl      = ip_select_ttl(inet, &rt->u.dst);
377 	iph->protocol = sk->sk_protocol;
378 	iph->saddr    = rt->rt_src;
379 	iph->daddr    = rt->rt_dst;
380 	/* The transport layer has already set the transport header itself. */
381 
382 	if (opt && opt->optlen) {
383 		iph->ihl += opt->optlen >> 2;
384 		ip_options_build(skb, opt, inet->inet_daddr, rt, 0);
385 	}
386 
387 	ip_select_ident_more(iph, &rt->u.dst, sk,
388 			     (skb_shinfo(skb)->gso_segs ?: 1) - 1);
389 
390 	skb->priority = sk->sk_priority;
391 	skb->mark = sk->sk_mark;
392 
393 	return ip_local_out(skb);
394 
395 no_route:
396 	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
397 	kfree_skb(skb);
398 	return -EHOSTUNREACH;
399 }
400 
401 
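/*
 * Propagate per-packet metadata (destination cache, device, priority, mark,
 * netfilter and IPVS state, security mark) from the original skb to a fragment.
 */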
402 static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
403 {
404 	to->pkt_type = from->pkt_type;
405 	to->priority = from->priority;
406 	to->protocol = from->protocol;
407 	skb_dst_drop(to);
408 	skb_dst_set(to, dst_clone(skb_dst(from)));
409 	to->dev = from->dev;
410 	to->mark = from->mark;
411 
412 	/* Copy the flags to each fragment. */
413 	IPCB(to)->flags = IPCB(from)->flags;
414 
415 #ifdef CONFIG_NET_SCHED
416 	to->tc_index = from->tc_index;
417 #endif
418 	nf_copy(to, from);
419 #if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
420     defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
421 	to->nf_trace = from->nf_trace;
422 #endif
423 #if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
424 	to->ipvs_property = from->ipvs_property;
425 #endif
426 	skb_copy_secmark(to, from);
427 }
428 
429 /*
430  *	This IP datagram is too large to be sent in one piece.  Break it up into
431  *	smaller pieces (each consisting of an IP header plus a block of the
432  *	data of the original datagram) so that each piece fits in a single
433  *	device frame, and queue each such frame for sending.
434  */
435 
436 int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
437 {
438 	struct iphdr *iph;
439 	int raw = 0;
440 	int ptr;
441 	struct net_device *dev;
442 	struct sk_buff *skb2;
443 	unsigned int mtu, hlen, left, len, ll_rs, pad;
444 	int offset;
445 	__be16 not_last_frag;
446 	struct rtable *rt = skb_rtable(skb);
447 	int err = 0;
448 
449 	dev = rt->u.dst.dev;
450 
451 	/*
452 	 *	Point into the IP datagram header.
453 	 */
454 
455 	iph = ip_hdr(skb);
456 
457 	if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) {
458 		IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
459 		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
460 			  htonl(ip_skb_dst_mtu(skb)));
461 		kfree_skb(skb);
462 		return -EMSGSIZE;
463 	}
464 
465 	/*
466 	 *	Setup starting values.
467 	 */
468 
469 	hlen = iph->ihl * 4;
470 	mtu = dst_mtu(&rt->u.dst) - hlen;	/* Size of data space */
471 	IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE;
472 
473 	/* When a frag_list is given, use it. First, check its validity:
474 	 * some transformers could create a wrong frag_list or break an existing
475 	 * one; that is not prohibited. In such a case fall back to copying.
476 	 *
477 	 * LATER: this step can be merged into the real generation of fragments;
478 	 * we can switch to copying when we see the first bad fragment.
479 	 */
480 	if (skb_has_frags(skb)) {
481 		struct sk_buff *frag;
482 		int first_len = skb_pagelen(skb);
483 		int truesizes = 0;
484 
485 		if (first_len - hlen > mtu ||
486 		    ((first_len - hlen) & 7) ||
487 		    (iph->frag_off & htons(IP_MF|IP_OFFSET)) ||
488 		    skb_cloned(skb))
489 			goto slow_path;
490 
491 		skb_walk_frags(skb, frag) {
492 			/* Correct geometry. */
493 			if (frag->len > mtu ||
494 			    ((frag->len & 7) && frag->next) ||
495 			    skb_headroom(frag) < hlen)
496 			    goto slow_path;
497 
498 			/* Partially cloned skb? */
499 			if (skb_shared(frag))
500 				goto slow_path;
501 
502 			BUG_ON(frag->sk);
503 			if (skb->sk) {
504 				frag->sk = skb->sk;
505 				frag->destructor = sock_wfree;
506 			}
507 			truesizes += frag->truesize;
508 		}
509 
510 		/* Everything is OK. Generate! */
511 
512 		err = 0;
513 		offset = 0;
514 		frag = skb_shinfo(skb)->frag_list;
515 		skb_frag_list_init(skb);
516 		skb->data_len = first_len - skb_headlen(skb);
517 		skb->truesize -= truesizes;
518 		skb->len = first_len;
519 		iph->tot_len = htons(first_len);
520 		iph->frag_off = htons(IP_MF);
521 		ip_send_check(iph);
522 
523 		for (;;) {
524 			/* Prepare the header of the next frame
525 			 * before the previous one has gone down. */
526 			if (frag) {
527 				frag->ip_summed = CHECKSUM_NONE;
528 				skb_reset_transport_header(frag);
529 				__skb_push(frag, hlen);
530 				skb_reset_network_header(frag);
531 				memcpy(skb_network_header(frag), iph, hlen);
532 				iph = ip_hdr(frag);
533 				iph->tot_len = htons(frag->len);
534 				ip_copy_metadata(frag, skb);
535 				if (offset == 0)
536 					ip_options_fragment(frag);
537 				offset += skb->len - hlen;
538 				iph->frag_off = htons(offset>>3);
539 				if (frag->next != NULL)
540 					iph->frag_off |= htons(IP_MF);
541 				/* Ready, complete checksum */
542 				ip_send_check(iph);
543 			}
544 
545 			err = output(skb);
546 
547 			if (!err)
548 				IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
549 			if (err || !frag)
550 				break;
551 
552 			skb = frag;
553 			frag = skb->next;
554 			skb->next = NULL;
555 		}
556 
557 		if (err == 0) {
558 			IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
559 			return 0;
560 		}
561 
562 		while (frag) {
563 			skb = frag->next;
564 			kfree_skb(frag);
565 			frag = skb;
566 		}
567 		IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
568 		return err;
569 	}
570 
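/*
 * Slow path: allocate a fresh skb for every fragment and copy the data into
 * it, fixing up the offset and MF bit in each fragment's IP header as we go.
 */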
571 slow_path:
572 	left = skb->len - hlen;		/* Space per frame */
573 	ptr = raw + hlen;		/* Where to start from */
574 
575 	/* For bridged IP traffic encapsulated inside e.g. a VLAN header,
576 	 * we need to make room for the encapsulating header.
577 	 */
578 	pad = nf_bridge_pad(skb);
579 	ll_rs = LL_RESERVED_SPACE_EXTRA(rt->u.dst.dev, pad);
580 	mtu -= pad;
581 
582 	/*
583 	 *	Fragment the datagram.
584 	 */
585 
586 	offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
587 	not_last_frag = iph->frag_off & htons(IP_MF);
588 
589 	/*
590 	 *	Keep copying data until we run out.
591 	 */
592 
593 	while (left > 0) {
594 		len = left;
595 		/* IF: it doesn't fit, use 'mtu' - the data space left */
596 		if (len > mtu)
597 			len = mtu;
598 		/* IF: we are not sending up to and including the packet end
599 		   then align the next start on an eight-byte boundary */
600 		if (len < left)	{
601 			len &= ~7;
602 		}
603 		/*
604 		 *	Allocate buffer.
605 		 */
606 
607 		if ((skb2 = alloc_skb(len+hlen+ll_rs, GFP_ATOMIC)) == NULL) {
608 			NETDEBUG(KERN_INFO "IP: frag: no memory for new fragment!\n");
609 			err = -ENOMEM;
610 			goto fail;
611 		}
612 
613 		/*
614 		 *	Set up data on packet
615 		 */
616 
617 		ip_copy_metadata(skb2, skb);
618 		skb_reserve(skb2, ll_rs);
619 		skb_put(skb2, len + hlen);
620 		skb_reset_network_header(skb2);
621 		skb2->transport_header = skb2->network_header + hlen;
622 
623 		/*
624 		 *	Charge the memory for the fragment to any owner
625 		 *	it might possess
626 		 */
627 
628 		if (skb->sk)
629 			skb_set_owner_w(skb2, skb->sk);
630 
631 		/*
632 		 *	Copy the packet header into the new buffer.
633 		 */
634 
635 		skb_copy_from_linear_data(skb, skb_network_header(skb2), hlen);
636 
637 		/*
638 		 *	Copy a block of the IP datagram.
639 		 */
640 		if (skb_copy_bits(skb, ptr, skb_transport_header(skb2), len))
641 			BUG();
642 		left -= len;
643 
644 		/*
645 		 *	Fill in the new header fields.
646 		 */
647 		iph = ip_hdr(skb2);
648 		iph->frag_off = htons((offset >> 3));
649 
650 		/* ANK: dirty, but effective trick. Upgrade options only if
651 		 * the segment to be fragmented was THE FIRST (otherwise,
652 		 * options are already fixed) and do it ONCE
653 		 * on the initial skb, so that all the following fragments
654 		 * will inherit fixed options.
655 		 */
656 		if (offset == 0)
657 			ip_options_fragment(skb);
658 
659 		/*
660 		 *	Added AC : If we are fragmenting a fragment that's not the
661 		 *		   last fragment then keep the MF bit set on each fragment
662 		 */
663 		if (left > 0 || not_last_frag)
664 			iph->frag_off |= htons(IP_MF);
665 		ptr += len;
666 		offset += len;
667 
668 		/*
669 		 *	Put this fragment into the sending queue.
670 		 */
671 		iph->tot_len = htons(len + hlen);
672 
673 		ip_send_check(iph);
674 
675 		err = output(skb2);
676 		if (err)
677 			goto fail;
678 
679 		IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
680 	}
681 	kfree_skb(skb);
682 	IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
683 	return err;
684 
685 fail:
686 	kfree_skb(skb);
687 	IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
688 	return err;
689 }
690 
691 EXPORT_SYMBOL(ip_fragment);
692 
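/*
 * getfrag() callback for data supplied in user-space iovecs: copy the data
 * into the skb and fold it into skb->csum unless the hardware will checksum
 * the packet (CHECKSUM_PARTIAL).
 */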
693 int
694 ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb)
695 {
696 	struct iovec *iov = from;
697 
698 	if (skb->ip_summed == CHECKSUM_PARTIAL) {
699 		if (memcpy_fromiovecend(to, iov, offset, len) < 0)
700 			return -EFAULT;
701 	} else {
702 		__wsum csum = 0;
703 		if (csum_partial_copy_fromiovecend(to, iov, offset, len, &csum) < 0)
704 			return -EFAULT;
705 		skb->csum = csum_block_add(skb->csum, csum, odd);
706 	}
707 	return 0;
708 }
709 
710 static inline __wsum
711 csum_page(struct page *page, int offset, int copy)
712 {
713 	char *kaddr;
714 	__wsum csum;
715 	kaddr = kmap(page);
716 	csum = csum_partial(kaddr + offset, copy, 0);
717 	kunmap(page);
718 	return csum;
719 }
720 
721 static inline int ip_ufo_append_data(struct sock *sk,
722 			int getfrag(void *from, char *to, int offset, int len,
723 			       int odd, struct sk_buff *skb),
724 			void *from, int length, int hh_len, int fragheaderlen,
725 			int transhdrlen, int mtu, unsigned int flags)
726 {
727 	struct sk_buff *skb;
728 	int err;
729 
730 	/* The network device supports UDP fragmentation offload, so create
731 	 * one single skb containing the complete UDP datagram, which the
732 	 * device will later segment.
733 	 */
734 	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
735 		skb = sock_alloc_send_skb(sk,
736 			hh_len + fragheaderlen + transhdrlen + 20,
737 			(flags & MSG_DONTWAIT), &err);
738 
739 		if (skb == NULL)
740 			return err;
741 
742 		/* reserve space for Hardware header */
743 		skb_reserve(skb, hh_len);
744 
745 		/* create space for UDP/IP header */
746 		skb_put(skb, fragheaderlen + transhdrlen);
747 
748 		/* initialize network header pointer */
749 		skb_reset_network_header(skb);
750 
751 		/* initialize protocol header pointer */
752 		skb->transport_header = skb->network_header + fragheaderlen;
753 
754 		skb->ip_summed = CHECKSUM_PARTIAL;
755 		skb->csum = 0;
756 		sk->sk_sndmsg_off = 0;
757 
758 		/* specify the length of each IP datagram fragment */
759 		skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
760 		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
761 		__skb_queue_tail(&sk->sk_write_queue, skb);
762 	}
763 
764 	return skb_append_datato_frags(sk, skb, getfrag, from,
765 				       (length - transhdrlen));
766 }
767 
768 /*
769  *	ip_append_data() and ip_append_page() can make one large IP datagram
770  *	from many pieces of data. Each piece will be held on the socket
771  *	until ip_push_pending_frames() is called. Each piece can be a page
772  *	or non-page data.
773  *
774  *	Transport protocols other than UDP - e.g. raw sockets - can
775  *	potentially use this interface as well.
776  *
777  *	LATER: length must be adjusted by the tail padding, when it is required.
778  */
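/*
 * A sketch of the typical calling sequence (the ipcm_cookie and route setup
 * are the caller's responsibility and are elided here):
 *
 *	err = ip_append_data(sk, getfrag, from, len, transhdrlen,
 *			     &ipc, &rt, msg->msg_flags);
 *	if (err)
 *		ip_flush_pending_frames(sk);
 *	else if (!(msg->msg_flags & MSG_MORE))
 *		err = ip_push_pending_frames(sk);
 */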
779 int ip_append_data(struct sock *sk,
780 		   int getfrag(void *from, char *to, int offset, int len,
781 			       int odd, struct sk_buff *skb),
782 		   void *from, int length, int transhdrlen,
783 		   struct ipcm_cookie *ipc, struct rtable **rtp,
784 		   unsigned int flags)
785 {
786 	struct inet_sock *inet = inet_sk(sk);
787 	struct sk_buff *skb;
788 
789 	struct ip_options *opt = NULL;
790 	int hh_len;
791 	int exthdrlen;
792 	int mtu;
793 	int copy;
794 	int err;
795 	int offset = 0;
796 	unsigned int maxfraglen, fragheaderlen;
797 	int csummode = CHECKSUM_NONE;
798 	struct rtable *rt;
799 
800 	if (flags&MSG_PROBE)
801 		return 0;
802 
803 	if (skb_queue_empty(&sk->sk_write_queue)) {
804 		/*
805 		 * setup for corking.
806 		 */
807 		opt = ipc->opt;
808 		if (opt) {
809 			if (inet->cork.opt == NULL) {
810 				inet->cork.opt = kmalloc(sizeof(struct ip_options) + 40, sk->sk_allocation);
811 				if (unlikely(inet->cork.opt == NULL))
812 					return -ENOBUFS;
813 			}
814 			memcpy(inet->cork.opt, opt, sizeof(struct ip_options)+opt->optlen);
815 			inet->cork.flags |= IPCORK_OPT;
816 			inet->cork.addr = ipc->addr;
817 		}
818 		rt = *rtp;
819 		if (unlikely(!rt))
820 			return -EFAULT;
821 		/*
822 		 * We steal the reference to this route; the caller must not release it.
823 		 */
824 		*rtp = NULL;
825 		inet->cork.fragsize = mtu = inet->pmtudisc == IP_PMTUDISC_PROBE ?
826 					    rt->u.dst.dev->mtu :
827 					    dst_mtu(rt->u.dst.path);
828 		inet->cork.dst = &rt->u.dst;
829 		inet->cork.length = 0;
830 		sk->sk_sndmsg_page = NULL;
831 		sk->sk_sndmsg_off = 0;
832 		if ((exthdrlen = rt->u.dst.header_len) != 0) {
833 			length += exthdrlen;
834 			transhdrlen += exthdrlen;
835 		}
836 	} else {
837 		rt = (struct rtable *)inet->cork.dst;
838 		if (inet->cork.flags & IPCORK_OPT)
839 			opt = inet->cork.opt;
840 
841 		transhdrlen = 0;
842 		exthdrlen = 0;
843 		mtu = inet->cork.fragsize;
844 	}
845 	hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
846 
847 	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
848 	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
849 
850 	if (inet->cork.length + length > 0xFFFF - fragheaderlen) {
851 		ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->inet_dport,
852 			       mtu-exthdrlen);
853 		return -EMSGSIZE;
854 	}
855 
856 	/*
857 	 * transhdrlen > 0 means that this is the first fragment and we wish
858 	 * it not to be fragmented later.
859 	 */
860 	if (transhdrlen &&
861 	    length + fragheaderlen <= mtu &&
862 	    rt->u.dst.dev->features & NETIF_F_V4_CSUM &&
863 	    !exthdrlen)
864 		csummode = CHECKSUM_PARTIAL;
865 
866 	inet->cork.length += length;
867 	if (((length> mtu) || !skb_queue_empty(&sk->sk_write_queue)) &&
868 	    (sk->sk_protocol == IPPROTO_UDP) &&
869 	    (rt->u.dst.dev->features & NETIF_F_UFO)) {
870 		err = ip_ufo_append_data(sk, getfrag, from, length, hh_len,
871 					 fragheaderlen, transhdrlen, mtu,
872 					 flags);
873 		if (err)
874 			goto error;
875 		return 0;
876 	}
877 
878 	/* So, what's going on in the loop below?
879 	 *
880 	 * We use the calculated fragment length to generate a chain of skbs;
881 	 * each segment is an IP fragment that is ready to be sent to the
882 	 * network once the appropriate IP header has been added.
883 	 */
884 
885 	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
886 		goto alloc_new_skb;
887 
888 	while (length > 0) {
889 		/* Check if the remaining data fits into current packet. */
890 		copy = mtu - skb->len;
891 		if (copy < length)
892 			copy = maxfraglen - skb->len;
893 		if (copy <= 0) {
894 			char *data;
895 			unsigned int datalen;
896 			unsigned int fraglen;
897 			unsigned int fraggap;
898 			unsigned int alloclen;
899 			struct sk_buff *skb_prev;
900 alloc_new_skb:
901 			skb_prev = skb;
902 			if (skb_prev)
903 				fraggap = skb_prev->len - maxfraglen;
904 			else
905 				fraggap = 0;
906 
907 			/*
908 			 * If remaining data exceeds the mtu,
909 			 * we know we need more fragment(s).
910 			 */
911 			datalen = length + fraggap;
912 			if (datalen > mtu - fragheaderlen)
913 				datalen = maxfraglen - fragheaderlen;
914 			fraglen = datalen + fragheaderlen;
915 
916 			if ((flags & MSG_MORE) &&
917 			    !(rt->u.dst.dev->features&NETIF_F_SG))
918 				alloclen = mtu;
919 			else
920 				alloclen = datalen + fragheaderlen;
921 
922 			/* The last fragment gets additional space at tail.
923 			 * Note that with MSG_MORE we overallocate on fragments,
924 			 * because we have no idea which fragment will be
925 			 * the last.
926 			 */
927 			if (datalen == length + fraggap)
928 				alloclen += rt->u.dst.trailer_len;
929 
930 			if (transhdrlen) {
931 				skb = sock_alloc_send_skb(sk,
932 						alloclen + hh_len + 15,
933 						(flags & MSG_DONTWAIT), &err);
934 			} else {
935 				skb = NULL;
936 				if (atomic_read(&sk->sk_wmem_alloc) <=
937 				    2 * sk->sk_sndbuf)
938 					skb = sock_wmalloc(sk,
939 							   alloclen + hh_len + 15, 1,
940 							   sk->sk_allocation);
941 				if (unlikely(skb == NULL))
942 					err = -ENOBUFS;
943 				else
944 					/* only the initial fragment is
945 					   time stamped */
946 					ipc->shtx.flags = 0;
947 			}
948 			if (skb == NULL)
949 				goto error;
950 
951 			/*
952 			 *	Fill in the control structures
953 			 */
954 			skb->ip_summed = csummode;
955 			skb->csum = 0;
956 			skb_reserve(skb, hh_len);
957 			*skb_tx(skb) = ipc->shtx;
958 
959 			/*
960 			 *	Find where to start putting bytes.
961 			 */
962 			data = skb_put(skb, fraglen);
963 			skb_set_network_header(skb, exthdrlen);
964 			skb->transport_header = (skb->network_header +
965 						 fragheaderlen);
966 			data += fragheaderlen;
967 
968 			if (fraggap) {
969 				skb->csum = skb_copy_and_csum_bits(
970 					skb_prev, maxfraglen,
971 					data + transhdrlen, fraggap, 0);
972 				skb_prev->csum = csum_sub(skb_prev->csum,
973 							  skb->csum);
974 				data += fraggap;
975 				pskb_trim_unique(skb_prev, maxfraglen);
976 			}
977 
978 			copy = datalen - transhdrlen - fraggap;
979 			if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
980 				err = -EFAULT;
981 				kfree_skb(skb);
982 				goto error;
983 			}
984 
985 			offset += copy;
986 			length -= datalen - fraggap;
987 			transhdrlen = 0;
988 			exthdrlen = 0;
989 			csummode = CHECKSUM_NONE;
990 
991 			/*
992 			 * Put the packet on the pending queue.
993 			 */
994 			__skb_queue_tail(&sk->sk_write_queue, skb);
995 			continue;
996 		}
997 
998 		if (copy > length)
999 			copy = length;
1000 
1001 		if (!(rt->u.dst.dev->features&NETIF_F_SG)) {
1002 			unsigned int off;
1003 
1004 			off = skb->len;
1005 			if (getfrag(from, skb_put(skb, copy),
1006 					offset, copy, off, skb) < 0) {
1007 				__skb_trim(skb, off);
1008 				err = -EFAULT;
1009 				goto error;
1010 			}
1011 		} else {
1012 			int i = skb_shinfo(skb)->nr_frags;
1013 			skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
1014 			struct page *page = sk->sk_sndmsg_page;
1015 			int off = sk->sk_sndmsg_off;
1016 			unsigned int left;
1017 
1018 			if (page && (left = PAGE_SIZE - off) > 0) {
1019 				if (copy >= left)
1020 					copy = left;
1021 				if (page != frag->page) {
1022 					if (i == MAX_SKB_FRAGS) {
1023 						err = -EMSGSIZE;
1024 						goto error;
1025 					}
1026 					get_page(page);
1027 					skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
1028 					frag = &skb_shinfo(skb)->frags[i];
1029 				}
1030 			} else if (i < MAX_SKB_FRAGS) {
1031 				if (copy > PAGE_SIZE)
1032 					copy = PAGE_SIZE;
1033 				page = alloc_pages(sk->sk_allocation, 0);
1034 				if (page == NULL)  {
1035 					err = -ENOMEM;
1036 					goto error;
1037 				}
1038 				sk->sk_sndmsg_page = page;
1039 				sk->sk_sndmsg_off = 0;
1040 
1041 				skb_fill_page_desc(skb, i, page, 0, 0);
1042 				frag = &skb_shinfo(skb)->frags[i];
1043 			} else {
1044 				err = -EMSGSIZE;
1045 				goto error;
1046 			}
1047 			if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
1048 				err = -EFAULT;
1049 				goto error;
1050 			}
1051 			sk->sk_sndmsg_off += copy;
1052 			frag->size += copy;
1053 			skb->len += copy;
1054 			skb->data_len += copy;
1055 			skb->truesize += copy;
1056 			atomic_add(copy, &sk->sk_wmem_alloc);
1057 		}
1058 		offset += copy;
1059 		length -= copy;
1060 	}
1061 
1062 	return 0;
1063 
1064 error:
1065 	inet->cork.length -= length;
1066 	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
1067 	return err;
1068 }
1069 
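/*
 * Zero-copy companion to ip_append_data(): attach page fragments to the
 * pending skb chain instead of copying the data. Requires a corked socket
 * (pending data on the write queue) and a device with scatter-gather support.
 */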
1070 ssize_t	ip_append_page(struct sock *sk, struct page *page,
1071 		       int offset, size_t size, int flags)
1072 {
1073 	struct inet_sock *inet = inet_sk(sk);
1074 	struct sk_buff *skb;
1075 	struct rtable *rt;
1076 	struct ip_options *opt = NULL;
1077 	int hh_len;
1078 	int mtu;
1079 	int len;
1080 	int err;
1081 	unsigned int maxfraglen, fragheaderlen, fraggap;
1082 
1083 	if (inet->hdrincl)
1084 		return -EPERM;
1085 
1086 	if (flags&MSG_PROBE)
1087 		return 0;
1088 
1089 	if (skb_queue_empty(&sk->sk_write_queue))
1090 		return -EINVAL;
1091 
1092 	rt = (struct rtable *)inet->cork.dst;
1093 	if (inet->cork.flags & IPCORK_OPT)
1094 		opt = inet->cork.opt;
1095 
1096 	if (!(rt->u.dst.dev->features&NETIF_F_SG))
1097 		return -EOPNOTSUPP;
1098 
1099 	hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
1100 	mtu = inet->cork.fragsize;
1101 
1102 	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
1103 	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
1104 
1105 	if (inet->cork.length + size > 0xFFFF - fragheaderlen) {
1106 		ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->inet_dport, mtu);
1107 		return -EMSGSIZE;
1108 	}
1109 
1110 	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
1111 		return -EINVAL;
1112 
1113 	inet->cork.length += size;
1114 	if ((sk->sk_protocol == IPPROTO_UDP) &&
1115 	    (rt->u.dst.dev->features & NETIF_F_UFO)) {
1116 		skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
1117 		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
1118 	}
1119 
1120 
1121 	while (size > 0) {
1122 		int i;
1123 
1124 		if (skb_is_gso(skb))
1125 			len = size;
1126 		else {
1127 
1128 			/* Check if the remaining data fits into current packet. */
1129 			len = mtu - skb->len;
1130 			if (len < size)
1131 				len = maxfraglen - skb->len;
1132 		}
1133 		if (len <= 0) {
1134 			struct sk_buff *skb_prev;
1135 			int alloclen;
1136 
1137 			skb_prev = skb;
1138 			fraggap = skb_prev->len - maxfraglen;
1139 
1140 			alloclen = fragheaderlen + hh_len + fraggap + 15;
1141 			skb = sock_wmalloc(sk, alloclen, 1, sk->sk_allocation);
1142 			if (unlikely(!skb)) {
1143 				err = -ENOBUFS;
1144 				goto error;
1145 			}
1146 
1147 			/*
1148 			 *	Fill in the control structures
1149 			 */
1150 			skb->ip_summed = CHECKSUM_NONE;
1151 			skb->csum = 0;
1152 			skb_reserve(skb, hh_len);
1153 
1154 			/*
1155 			 *	Find where to start putting bytes.
1156 			 */
1157 			skb_put(skb, fragheaderlen + fraggap);
1158 			skb_reset_network_header(skb);
1159 			skb->transport_header = (skb->network_header +
1160 						 fragheaderlen);
1161 			if (fraggap) {
1162 				skb->csum = skb_copy_and_csum_bits(skb_prev,
1163 								   maxfraglen,
1164 						    skb_transport_header(skb),
1165 								   fraggap, 0);
1166 				skb_prev->csum = csum_sub(skb_prev->csum,
1167 							  skb->csum);
1168 				pskb_trim_unique(skb_prev, maxfraglen);
1169 			}
1170 
1171 			/*
1172 			 * Put the packet on the pending queue.
1173 			 */
1174 			__skb_queue_tail(&sk->sk_write_queue, skb);
1175 			continue;
1176 		}
1177 
1178 		i = skb_shinfo(skb)->nr_frags;
1179 		if (len > size)
1180 			len = size;
1181 		if (skb_can_coalesce(skb, i, page, offset)) {
1182 			skb_shinfo(skb)->frags[i-1].size += len;
1183 		} else if (i < MAX_SKB_FRAGS) {
1184 			get_page(page);
1185 			skb_fill_page_desc(skb, i, page, offset, len);
1186 		} else {
1187 			err = -EMSGSIZE;
1188 			goto error;
1189 		}
1190 
1191 		if (skb->ip_summed == CHECKSUM_NONE) {
1192 			__wsum csum;
1193 			csum = csum_page(page, offset, len);
1194 			skb->csum = csum_block_add(skb->csum, csum, skb->len);
1195 		}
1196 
1197 		skb->len += len;
1198 		skb->data_len += len;
1199 		skb->truesize += len;
1200 		atomic_add(len, &sk->sk_wmem_alloc);
1201 		offset += len;
1202 		size -= len;
1203 	}
1204 	return 0;
1205 
1206 error:
1207 	inet->cork.length -= size;
1208 	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
1209 	return err;
1210 }
1211 
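/* Release the copied options and the route held while the socket was corked. */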
1212 static void ip_cork_release(struct inet_sock *inet)
1213 {
1214 	inet->cork.flags &= ~IPCORK_OPT;
1215 	kfree(inet->cork.opt);
1216 	inet->cork.opt = NULL;
1217 	dst_release(inet->cork.dst);
1218 	inet->cork.dst = NULL;
1219 }
1220 
1221 /*
1222  *	Combine all pending IP fragments on the socket into one IP datagram
1223  *	and push it out.
1224  */
1225 int ip_push_pending_frames(struct sock *sk)
1226 {
1227 	struct sk_buff *skb, *tmp_skb;
1228 	struct sk_buff **tail_skb;
1229 	struct inet_sock *inet = inet_sk(sk);
1230 	struct net *net = sock_net(sk);
1231 	struct ip_options *opt = NULL;
1232 	struct rtable *rt = (struct rtable *)inet->cork.dst;
1233 	struct iphdr *iph;
1234 	__be16 df = 0;
1235 	__u8 ttl;
1236 	int err = 0;
1237 
1238 	if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
1239 		goto out;
1240 	tail_skb = &(skb_shinfo(skb)->frag_list);
1241 
1242 	/* Move skb->data forward from the ext header to the IP header. */
1243 	if (skb->data < skb_network_header(skb))
1244 		__skb_pull(skb, skb_network_offset(skb));
1245 	while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
1246 		__skb_pull(tmp_skb, skb_network_header_len(skb));
1247 		*tail_skb = tmp_skb;
1248 		tail_skb = &(tmp_skb->next);
1249 		skb->len += tmp_skb->len;
1250 		skb->data_len += tmp_skb->len;
1251 		skb->truesize += tmp_skb->truesize;
1252 		tmp_skb->destructor = NULL;
1253 		tmp_skb->sk = NULL;
1254 	}
1255 
1256 	/* Unless the user demanded real PMTU discovery (IP_PMTUDISC_DO), we
1257 	 * allow the frame generated here to be fragmented. No matter how
1258 	 * transforms change the size of the packet, it will go out.
1259 	 */
1260 	if (inet->pmtudisc < IP_PMTUDISC_DO)
1261 		skb->local_df = 1;
1262 
1263 	/* DF bit is set when we want to see DF on outgoing frames.
1264 	 * If local_df is set too, we still allow this frame to be fragmented
1265 	 * locally. */
1266 	if (inet->pmtudisc >= IP_PMTUDISC_DO ||
1267 	    (skb->len <= dst_mtu(&rt->u.dst) &&
1268 	     ip_dont_fragment(sk, &rt->u.dst)))
1269 		df = htons(IP_DF);
1270 
1271 	if (inet->cork.flags & IPCORK_OPT)
1272 		opt = inet->cork.opt;
1273 
1274 	if (rt->rt_type == RTN_MULTICAST)
1275 		ttl = inet->mc_ttl;
1276 	else
1277 		ttl = ip_select_ttl(inet, &rt->u.dst);
1278 
1279 	iph = (struct iphdr *)skb->data;
1280 	iph->version = 4;
1281 	iph->ihl = 5;
1282 	if (opt) {
1283 		iph->ihl += opt->optlen>>2;
1284 		ip_options_build(skb, opt, inet->cork.addr, rt, 0);
1285 	}
1286 	iph->tos = inet->tos;
1287 	iph->frag_off = df;
1288 	ip_select_ident(iph, &rt->u.dst, sk);
1289 	iph->ttl = ttl;
1290 	iph->protocol = sk->sk_protocol;
1291 	iph->saddr = rt->rt_src;
1292 	iph->daddr = rt->rt_dst;
1293 
1294 	skb->priority = sk->sk_priority;
1295 	skb->mark = sk->sk_mark;
1296 	/*
1297 	 * Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec
1298 	 * on dst refcount
1299 	 */
1300 	inet->cork.dst = NULL;
1301 	skb_dst_set(skb, &rt->u.dst);
1302 
1303 	if (iph->protocol == IPPROTO_ICMP)
1304 		icmp_out_count(net, ((struct icmphdr *)
1305 			skb_transport_header(skb))->type);
1306 
1307 	/* Netfilter gets the whole, not yet fragmented skb. */
1308 	err = ip_local_out(skb);
1309 	if (err) {
1310 		if (err > 0)
1311 			err = net_xmit_errno(err);
1312 		if (err)
1313 			goto error;
1314 	}
1315 
1316 out:
1317 	ip_cork_release(inet);
1318 	return err;
1319 
1320 error:
1321 	IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS);
1322 	goto out;
1323 }
1324 
1325 /*
1326  *	Throw away all pending data on the socket.
1327  */
1328 void ip_flush_pending_frames(struct sock *sk)
1329 {
1330 	struct sk_buff *skb;
1331 
1332 	while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL)
1333 		kfree_skb(skb);
1334 
1335 	ip_cork_release(inet_sk(sk));
1336 }
1337 
1338 
1339 /*
1340  *	Fetch data from kernel space and fill in checksum if needed.
1341  */
1342 static int ip_reply_glue_bits(void *dptr, char *to, int offset,
1343 			      int len, int odd, struct sk_buff *skb)
1344 {
1345 	__wsum csum;
1346 
1347 	csum = csum_partial_copy_nocheck(dptr+offset, to, len, 0);
1348 	skb->csum = csum_block_add(skb->csum, csum, odd);
1349 	return 0;
1350 }
1351 
1352 /*
1353  *	Generic function to send a packet in reply to another packet.
1354  *	So far it is used to send TCP resets; ICMP should use it too.
1355  *
1356  *	Should run single-threaded per socket because it uses the sock
1357  *	structure to pass arguments.
1358  */
1359 void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *arg,
1360 		   unsigned int len)
1361 {
1362 	struct inet_sock *inet = inet_sk(sk);
1363 	struct {
1364 		struct ip_options	opt;
1365 		char			data[40];
1366 	} replyopts;
1367 	struct ipcm_cookie ipc;
1368 	__be32 daddr;
1369 	struct rtable *rt = skb_rtable(skb);
1370 
1371 	if (ip_options_echo(&replyopts.opt, skb))
1372 		return;
1373 
1374 	daddr = ipc.addr = rt->rt_src;
1375 	ipc.opt = NULL;
1376 	ipc.shtx.flags = 0;
1377 
1378 	if (replyopts.opt.optlen) {
1379 		ipc.opt = &replyopts.opt;
1380 
1381 		if (ipc.opt->srr)
1382 			daddr = replyopts.opt.faddr;
1383 	}
1384 
1385 	{
1386 		struct flowi fl = { .oif = arg->bound_dev_if,
1387 				    .nl_u = { .ip4_u =
1388 					      { .daddr = daddr,
1389 						.saddr = rt->rt_spec_dst,
1390 						.tos = RT_TOS(ip_hdr(skb)->tos) } },
1391 				    /* Not quite clean, but right. */
1392 				    .uli_u = { .ports =
1393 					       { .sport = tcp_hdr(skb)->dest,
1394 						 .dport = tcp_hdr(skb)->source } },
1395 				    .proto = sk->sk_protocol,
1396 				    .flags = ip_reply_arg_flowi_flags(arg) };
1397 		security_skb_classify_flow(skb, &fl);
1398 		if (ip_route_output_key(sock_net(sk), &rt, &fl))
1399 			return;
1400 	}
1401 
1402 	/* And let IP do all the hard work.
1403 
1404 	   This chunk is not reentrant, hence the spinlock.
1405 	   Note that it relies on the fact that this function is called
1406 	   with BHs locally disabled and that sk cannot already be spinlocked.
1407 	 */
1408 	bh_lock_sock(sk);
1409 	inet->tos = ip_hdr(skb)->tos;
1410 	sk->sk_priority = skb->priority;
1411 	sk->sk_protocol = ip_hdr(skb)->protocol;
1412 	sk->sk_bound_dev_if = arg->bound_dev_if;
1413 	ip_append_data(sk, ip_reply_glue_bits, arg->iov->iov_base, len, 0,
1414 		       &ipc, &rt, MSG_DONTWAIT);
1415 	if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) {
1416 		if (arg->csumoffset >= 0)
1417 			*((__sum16 *)skb_transport_header(skb) +
1418 			  arg->csumoffset) = csum_fold(csum_add(skb->csum,
1419 								arg->csum));
1420 		skb->ip_summed = CHECKSUM_NONE;
1421 		ip_push_pending_frames(sk);
1422 	}
1423 
1424 	bh_unlock_sock(sk);
1425 
1426 	ip_rt_put(rt);
1427 }
1428 
1429 void __init ip_init(void)
1430 {
1431 	ip_rt_init();
1432 	inet_initpeers();
1433 
1434 #if defined(CONFIG_IP_MULTICAST) && defined(CONFIG_PROC_FS)
1435 	igmp_mc_proc_init();
1436 #endif
1437 }
1438 
1439 EXPORT_SYMBOL(ip_generic_getfrag);
1440 EXPORT_SYMBOL(ip_queue_xmit);
1441 EXPORT_SYMBOL(ip_send_check);
1442