xref: /openbmc/linux/net/ipv4/ip_output.c (revision f6b72b6217f8c24f2a54988e58af858b4e66024d)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		The Internet Protocol (IP) output module.
7  *
8  * Authors:	Ross Biro
9  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10  *		Donald Becker, <becker@super.org>
11  *		Alan Cox, <Alan.Cox@linux.org>
12  *		Richard Underwood
13  *		Stefan Becker, <stefanb@yello.ping.de>
14  *		Jorge Cwik, <jorge@laser.satlink.net>
15  *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
16  *		Hirokazu Takahashi, <taka@valinux.co.jp>
17  *
18  *	See ip_input.c for original log
19  *
20  *	Fixes:
21  *		Alan Cox	:	Missing nonblock feature in ip_build_xmit.
22  *		Mike Kilburn	:	htons() missing in ip_build_xmit.
23  *		Bradford Johnson:	Fix faulty handling of some frames when
24  *					no route is found.
25  *		Alexander Demenshin:	Missing sk/skb free in ip_queue_xmit
26  *					(in case the packet is not accepted
27  *					by output firewall rules)
28  *		Mike McLagan	:	Routing by source
29  *		Alexey Kuznetsov:	use new route cache
30  *		Andi Kleen:		Fix broken PMTU recovery and remove
31  *					some redundant tests.
32  *		Vitaly E. Lavrov:	Transparent proxy revived after year coma.
33  *		Andi Kleen	:	Replace ip_reply with ip_send_reply.
34  *		Andi Kleen	:	Split fast and slow ip_build_xmit path
35  *					for decreased register pressure on x86
36  *					and more readability.
37  *		Marc Boucher	:	When call_out_firewall returns FW_QUEUE,
38  *					silently drop skb instead of failing with -EPERM.
39  *		Detlev Wengorz	:	Copy protocol for fragments.
40  *		Hirokazu Takahashi:	HW checksumming for outgoing UDP
41  *					datagrams.
42  *		Hirokazu Takahashi:	sendfile() on UDP works now.
43  */
44 
45 #include <asm/uaccess.h>
46 #include <asm/system.h>
47 #include <linux/module.h>
48 #include <linux/types.h>
49 #include <linux/kernel.h>
50 #include <linux/mm.h>
51 #include <linux/string.h>
52 #include <linux/errno.h>
53 #include <linux/highmem.h>
54 #include <linux/slab.h>
55 
56 #include <linux/socket.h>
57 #include <linux/sockios.h>
58 #include <linux/in.h>
59 #include <linux/inet.h>
60 #include <linux/netdevice.h>
61 #include <linux/etherdevice.h>
62 #include <linux/proc_fs.h>
63 #include <linux/stat.h>
64 #include <linux/init.h>
65 
66 #include <net/snmp.h>
67 #include <net/ip.h>
68 #include <net/protocol.h>
69 #include <net/route.h>
70 #include <net/xfrm.h>
71 #include <linux/skbuff.h>
72 #include <net/sock.h>
73 #include <net/arp.h>
74 #include <net/icmp.h>
75 #include <net/checksum.h>
76 #include <net/inetpeer.h>
77 #include <linux/igmp.h>
78 #include <linux/netfilter_ipv4.h>
79 #include <linux/netfilter_bridge.h>
80 #include <linux/mroute.h>
81 #include <linux/netlink.h>
82 #include <linux/tcp.h>
83 
84 int sysctl_ip_default_ttl __read_mostly = IPDEFTTL;
85 EXPORT_SYMBOL(sysctl_ip_default_ttl);
86 
87 /* Generate a checksum for an outgoing IP datagram. */
88 __inline__ void ip_send_check(struct iphdr *iph)
89 {
90 	iph->check = 0;
91 	iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
92 }
93 EXPORT_SYMBOL(ip_send_check);
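/* The checksum is the 16-bit one's-complement sum over the iph->ihl 32-bit
 * words of the header (five words, i.e. 20 bytes, when no options are
 * present), so it must be recomputed whenever any header field changes. */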
94 
95 int __ip_local_out(struct sk_buff *skb)
96 {
97 	struct iphdr *iph = ip_hdr(skb);
98 
99 	iph->tot_len = htons(skb->len);
100 	ip_send_check(iph);
101 	return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, skb, NULL,
102 		       skb_dst(skb)->dev, dst_output);
103 }
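/* nf_hook() returns 1 when the NF_INET_LOCAL_OUT hooks accept the packet
 * without stealing or queueing it; in that case the caller is expected to
 * invoke the okfn (dst_output) itself, which is what ip_local_out() does. */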
104 
105 int ip_local_out(struct sk_buff *skb)
106 {
107 	int err;
108 
109 	err = __ip_local_out(skb);
110 	if (likely(err == 1))
111 		err = dst_output(skb);
112 
113 	return err;
114 }
115 EXPORT_SYMBOL_GPL(ip_local_out);
116 
117 /* dev_loopback_xmit for use with netfilter. */
118 static int ip_dev_loopback_xmit(struct sk_buff *newskb)
119 {
120 	skb_reset_mac_header(newskb);
121 	__skb_pull(newskb, skb_network_offset(newskb));
122 	newskb->pkt_type = PACKET_LOOPBACK;
123 	newskb->ip_summed = CHECKSUM_UNNECESSARY;
124 	WARN_ON(!skb_dst(newskb));
125 	netif_rx_ni(newskb);
126 	return 0;
127 }
128 
129 static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
130 {
131 	int ttl = inet->uc_ttl;
132 
133 	if (ttl < 0)
134 		ttl = ip4_dst_hoplimit(dst);
135 	return ttl;
136 }
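/* uc_ttl is negative unless the application set an explicit TTL with the
 * IP_TTL socket option; when it is negative, the route's default hop limit
 * (normally sysctl_ip_default_ttl) is used instead. */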
137 
138 /*
139  *		Add an ip header to a skbuff and send it out.
140  *
141  */
142 int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
143 			  __be32 saddr, __be32 daddr, struct ip_options_rcu *opt)
144 {
145 	struct inet_sock *inet = inet_sk(sk);
146 	struct rtable *rt = skb_rtable(skb);
147 	struct iphdr *iph;
148 
149 	/* Build the IP header. */
150 	skb_push(skb, sizeof(struct iphdr) + (opt ? opt->opt.optlen : 0));
151 	skb_reset_network_header(skb);
152 	iph = ip_hdr(skb);
153 	iph->version  = 4;
154 	iph->ihl      = 5;
155 	iph->tos      = inet->tos;
156 	if (ip_dont_fragment(sk, &rt->dst))
157 		iph->frag_off = htons(IP_DF);
158 	else
159 		iph->frag_off = 0;
160 	iph->ttl      = ip_select_ttl(inet, &rt->dst);
161 	iph->daddr    = (opt && opt->opt.srr ? opt->opt.faddr : daddr);
162 	iph->saddr    = saddr;
163 	iph->protocol = sk->sk_protocol;
164 	ip_select_ident(iph, &rt->dst, sk);
165 
166 	if (opt && opt->opt.optlen) {
167 		iph->ihl += opt->opt.optlen>>2;
168 		ip_options_build(skb, &opt->opt, daddr, rt, 0);
169 	}
170 
171 	skb->priority = sk->sk_priority;
172 	skb->mark = sk->sk_mark;
173 
174 	/* Send it out. */
175 	return ip_local_out(skb);
176 }
177 EXPORT_SYMBOL_GPL(ip_build_and_send_pkt);
178 
179 static inline int ip_finish_output2(struct sk_buff *skb)
180 {
181 	struct dst_entry *dst = skb_dst(skb);
182 	struct rtable *rt = (struct rtable *)dst;
183 	struct net_device *dev = dst->dev;
184 	unsigned int hh_len = LL_RESERVED_SPACE(dev);
185 	struct neighbour *neigh;
186 
187 	if (rt->rt_type == RTN_MULTICAST) {
188 		IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTMCAST, skb->len);
189 	} else if (rt->rt_type == RTN_BROADCAST)
190 		IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTBCAST, skb->len);
191 
192 	/* Be paranoid, rather than too clever. */
193 	if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
194 		struct sk_buff *skb2;
195 
196 		skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
197 		if (skb2 == NULL) {
198 			kfree_skb(skb);
199 			return -ENOMEM;
200 		}
201 		if (skb->sk)
202 			skb_set_owner_w(skb2, skb->sk);
203 		kfree_skb(skb);
204 		skb = skb2;
205 	}
206 
207 	neigh = dst->neighbour;
208 	if (neigh) {
209 		struct hh_cache *hh = &neigh->hh;
210 		if (hh->hh_len)
211 			return neigh_hh_output(hh, skb);
212 		else
213 			return dst->neighbour->output(skb);
214 	}
215 	if (net_ratelimit())
216 		printk(KERN_DEBUG "ip_finish_output2: No header cache and no neighbour!\n");
217 	kfree_skb(skb);
218 	return -EINVAL;
219 }
220 
221 static inline int ip_skb_dst_mtu(struct sk_buff *skb)
222 {
223 	struct inet_sock *inet = skb->sk ? inet_sk(skb->sk) : NULL;
224 
225 	return (inet && inet->pmtudisc == IP_PMTUDISC_PROBE) ?
226 	       skb_dst(skb)->dev->mtu : dst_mtu(skb_dst(skb));
227 }
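/* With IP_PMTUDISC_PROBE the application does its own path MTU probing, so
 * the interface MTU is used here rather than the (possibly smaller) MTU
 * cached on the route, allowing probes larger than the current estimate. */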
228 
229 static int ip_finish_output(struct sk_buff *skb)
230 {
231 #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
232 	/* Policy lookup after SNAT yielded a new policy */
233 	if (skb_dst(skb)->xfrm != NULL) {
234 		IPCB(skb)->flags |= IPSKB_REROUTED;
235 		return dst_output(skb);
236 	}
237 #endif
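	/* GSO packets may legitimately exceed the MTU here; they are split
	 * into MTU-sized frames later on the transmit path (by hardware
	 * TSO/UFO or the software GSO code), so only non-GSO packets are
	 * fragmented at this point. */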
238 	if (skb->len > ip_skb_dst_mtu(skb) && !skb_is_gso(skb))
239 		return ip_fragment(skb, ip_finish_output2);
240 	else
241 		return ip_finish_output2(skb);
242 }
243 
244 int ip_mc_output(struct sk_buff *skb)
245 {
246 	struct sock *sk = skb->sk;
247 	struct rtable *rt = skb_rtable(skb);
248 	struct net_device *dev = rt->dst.dev;
249 
250 	/*
251 	 *	If the indicated interface is up and running, send the packet.
252 	 */
253 	IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);
254 
255 	skb->dev = dev;
256 	skb->protocol = htons(ETH_P_IP);
257 
258 	/*
259 	 *	Multicasts are looped back for other local users
260 	 */
261 
262 	if (rt->rt_flags&RTCF_MULTICAST) {
263 		if (sk_mc_loop(sk)
264 #ifdef CONFIG_IP_MROUTE
265 		/* Small optimization: do not loop back non-local frames
266 		   that have returned after forwarding; ip_mr_input would
267 		   drop them in any case.
268 		   Note that local frames are still looped back so they are
269 		   delivered to local recipients.
270 
271 		   This check is duplicated in ip_mr_input at the moment.
272 		 */
273 		    &&
274 		    ((rt->rt_flags & RTCF_LOCAL) ||
275 		     !(IPCB(skb)->flags & IPSKB_FORWARDED))
276 #endif
277 		   ) {
278 			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
279 			if (newskb)
280 				NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING,
281 					newskb, NULL, newskb->dev,
282 					ip_dev_loopback_xmit);
283 		}
284 
285 		/* Multicasts with ttl 0 must not go beyond the host */
286 
287 		if (ip_hdr(skb)->ttl == 0) {
288 			kfree_skb(skb);
289 			return 0;
290 		}
291 	}
292 
293 	if (rt->rt_flags&RTCF_BROADCAST) {
294 		struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
295 		if (newskb)
296 			NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, newskb,
297 				NULL, newskb->dev, ip_dev_loopback_xmit);
298 	}
299 
300 	return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL,
301 			    skb->dev, ip_finish_output,
302 			    !(IPCB(skb)->flags & IPSKB_REROUTED));
303 }
304 
305 int ip_output(struct sk_buff *skb)
306 {
307 	struct net_device *dev = skb_dst(skb)->dev;
308 
309 	IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);
310 
311 	skb->dev = dev;
312 	skb->protocol = htons(ETH_P_IP);
313 
314 	return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL, dev,
315 			    ip_finish_output,
316 			    !(IPCB(skb)->flags & IPSKB_REROUTED));
317 }
318 
319 int ip_queue_xmit(struct sk_buff *skb, struct flowi *fl)
320 {
321 	struct sock *sk = skb->sk;
322 	struct inet_sock *inet = inet_sk(sk);
323 	struct ip_options_rcu *inet_opt;
324 	struct flowi4 *fl4;
325 	struct rtable *rt;
326 	struct iphdr *iph;
327 	int res;
328 
329 	/* Skip all of this if the packet is already routed,
330 	 * e.g. by something like SCTP.
331 	 */
332 	rcu_read_lock();
333 	inet_opt = rcu_dereference(inet->inet_opt);
334 	fl4 = &fl->u.ip4;
335 	rt = skb_rtable(skb);
336 	if (rt != NULL)
337 		goto packet_routed;
338 
339 	/* Make sure we can route this packet. */
340 	rt = (struct rtable *)__sk_dst_check(sk, 0);
341 	if (rt == NULL) {
342 		__be32 daddr;
343 
344 		/* Use correct destination address if we have options. */
345 		daddr = inet->inet_daddr;
346 		if (inet_opt && inet_opt->opt.srr)
347 			daddr = inet_opt->opt.faddr;
348 
349 		/* If this fails, the transport layer's retransmission
350 		 * mechanism will keep trying until a route appears or the
351 		 * connection times out.
352 		 */
353 		rt = ip_route_output_ports(sock_net(sk), fl4, sk,
354 					   daddr, inet->inet_saddr,
355 					   inet->inet_dport,
356 					   inet->inet_sport,
357 					   sk->sk_protocol,
358 					   RT_CONN_FLAGS(sk),
359 					   sk->sk_bound_dev_if);
360 		if (IS_ERR(rt))
361 			goto no_route;
362 		sk_setup_caps(sk, &rt->dst);
363 	}
364 	skb_dst_set_noref(skb, &rt->dst);
365 
366 packet_routed:
367 	if (inet_opt && inet_opt->opt.is_strictroute && fl4->daddr != rt->rt_gateway)
368 		goto no_route;
369 
370 	/* OK, we know where to send it, allocate and build IP header. */
371 	skb_push(skb, sizeof(struct iphdr) + (inet_opt ? inet_opt->opt.optlen : 0));
372 	skb_reset_network_header(skb);
373 	iph = ip_hdr(skb);
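	/* Fill the first two header bytes (version = 4, ihl = 5 words, and
	 * the TOS byte) with a single 16-bit store; this is equivalent to
	 * setting iph->version, iph->ihl and iph->tos individually. */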
374 	*((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));
375 	if (ip_dont_fragment(sk, &rt->dst) && !skb->local_df)
376 		iph->frag_off = htons(IP_DF);
377 	else
378 		iph->frag_off = 0;
379 	iph->ttl      = ip_select_ttl(inet, &rt->dst);
380 	iph->protocol = sk->sk_protocol;
381 	iph->saddr    = fl4->saddr;
382 	iph->daddr    = fl4->daddr;
383 	/* The transport layer sets skb->h.foo itself. */
384 
385 	if (inet_opt && inet_opt->opt.optlen) {
386 		iph->ihl += inet_opt->opt.optlen >> 2;
387 		ip_options_build(skb, &inet_opt->opt, inet->inet_daddr, rt, 0);
388 	}
389 
390 	ip_select_ident_more(iph, &rt->dst, sk,
391 			     (skb_shinfo(skb)->gso_segs ?: 1) - 1);
392 
393 	skb->priority = sk->sk_priority;
394 	skb->mark = sk->sk_mark;
395 
396 	res = ip_local_out(skb);
397 	rcu_read_unlock();
398 	return res;
399 
400 no_route:
401 	rcu_read_unlock();
402 	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
403 	kfree_skb(skb);
404 	return -EHOSTUNREACH;
405 }
406 EXPORT_SYMBOL(ip_queue_xmit);
407 
408 
409 static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
410 {
411 	to->pkt_type = from->pkt_type;
412 	to->priority = from->priority;
413 	to->protocol = from->protocol;
414 	skb_dst_drop(to);
415 	skb_dst_copy(to, from);
416 	to->dev = from->dev;
417 	to->mark = from->mark;
418 
419 	/* Copy the flags to each fragment. */
420 	IPCB(to)->flags = IPCB(from)->flags;
421 
422 #ifdef CONFIG_NET_SCHED
423 	to->tc_index = from->tc_index;
424 #endif
425 	nf_copy(to, from);
426 #if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
427     defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
428 	to->nf_trace = from->nf_trace;
429 #endif
430 #if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
431 	to->ipvs_property = from->ipvs_property;
432 #endif
433 	skb_copy_secmark(to, from);
434 }
435 
436 /*
437  *	This IP datagram is too large to be sent in one piece.  Break it up into
438  *	smaller pieces (each consisting of an IP header plus a block of
439  *	the original datagram's data) that still fit in a single device
440  *	frame, and queue each such frame for sending.
441  */
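/*
 *	For example, on a typical Ethernet path with a 1500 byte MTU and a
 *	20 byte header, each non-final fragment carries up to 1480 data
 *	bytes (already a multiple of 8); iph->frag_off stores the payload
 *	offset in 8-byte units, so the second fragment starts at offset
 *	1480 / 8 = 185.
 */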
442 
443 int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
444 {
445 	struct iphdr *iph;
446 	int ptr;
447 	struct net_device *dev;
448 	struct sk_buff *skb2;
449 	unsigned int mtu, hlen, left, len, ll_rs;
450 	int offset;
451 	__be16 not_last_frag;
452 	struct rtable *rt = skb_rtable(skb);
453 	int err = 0;
454 
455 	dev = rt->dst.dev;
456 
457 	/*
458 	 *	Point into the IP datagram header.
459 	 */
460 
461 	iph = ip_hdr(skb);
462 
463 	if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) {
464 		IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
465 		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
466 			  htonl(ip_skb_dst_mtu(skb)));
467 		kfree_skb(skb);
468 		return -EMSGSIZE;
469 	}
470 
471 	/*
472 	 *	Setup starting values.
473 	 */
474 
475 	hlen = iph->ihl * 4;
476 	mtu = dst_mtu(&rt->dst) - hlen;	/* Size of data space */
477 #ifdef CONFIG_BRIDGE_NETFILTER
478 	if (skb->nf_bridge)
479 		mtu -= nf_bridge_mtu_reduction(skb);
480 #endif
481 	IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE;
482 
483 	/* When frag_list is given, use it. First, check its validity:
484 	 * some transformers could create a wrong frag_list or break an
485 	 * existing one; this is not prohibited. In that case, fall back to copying.
486 	 *
487 	 * LATER: this step can be merged into the real generation of fragments;
488 	 * we can switch to copying when we see the first bad fragment.
489 	 */
490 	if (skb_has_frag_list(skb)) {
491 		struct sk_buff *frag, *frag2;
492 		int first_len = skb_pagelen(skb);
493 
494 		if (first_len - hlen > mtu ||
495 		    ((first_len - hlen) & 7) ||
496 		    ip_is_fragment(iph) ||
497 		    skb_cloned(skb))
498 			goto slow_path;
499 
500 		skb_walk_frags(skb, frag) {
501 			/* Correct geometry. */
502 			if (frag->len > mtu ||
503 			    ((frag->len & 7) && frag->next) ||
504 			    skb_headroom(frag) < hlen)
505 				goto slow_path_clean;
506 
507 			/* Partially cloned skb? */
508 			if (skb_shared(frag))
509 				goto slow_path_clean;
510 
511 			BUG_ON(frag->sk);
512 			if (skb->sk) {
513 				frag->sk = skb->sk;
514 				frag->destructor = sock_wfree;
515 			}
516 			skb->truesize -= frag->truesize;
517 		}
518 
519 		/* Everything is OK. Generate! */
520 
521 		err = 0;
522 		offset = 0;
523 		frag = skb_shinfo(skb)->frag_list;
524 		skb_frag_list_init(skb);
525 		skb->data_len = first_len - skb_headlen(skb);
526 		skb->len = first_len;
527 		iph->tot_len = htons(first_len);
528 		iph->frag_off = htons(IP_MF);
529 		ip_send_check(iph);
530 
531 		for (;;) {
532 			/* Prepare header of the next frame,
533 			 * before the previous one is sent down. */
534 			if (frag) {
535 				frag->ip_summed = CHECKSUM_NONE;
536 				skb_reset_transport_header(frag);
537 				__skb_push(frag, hlen);
538 				skb_reset_network_header(frag);
539 				memcpy(skb_network_header(frag), iph, hlen);
540 				iph = ip_hdr(frag);
541 				iph->tot_len = htons(frag->len);
542 				ip_copy_metadata(frag, skb);
543 				if (offset == 0)
544 					ip_options_fragment(frag);
545 				offset += skb->len - hlen;
546 				iph->frag_off = htons(offset>>3);
547 				if (frag->next != NULL)
548 					iph->frag_off |= htons(IP_MF);
549 				/* Ready, complete checksum */
550 				ip_send_check(iph);
551 			}
552 
553 			err = output(skb);
554 
555 			if (!err)
556 				IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
557 			if (err || !frag)
558 				break;
559 
560 			skb = frag;
561 			frag = skb->next;
562 			skb->next = NULL;
563 		}
564 
565 		if (err == 0) {
566 			IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
567 			return 0;
568 		}
569 
570 		while (frag) {
571 			skb = frag->next;
572 			kfree_skb(frag);
573 			frag = skb;
574 		}
575 		IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
576 		return err;
577 
578 slow_path_clean:
579 		skb_walk_frags(skb, frag2) {
580 			if (frag2 == frag)
581 				break;
582 			frag2->sk = NULL;
583 			frag2->destructor = NULL;
584 			skb->truesize += frag2->truesize;
585 		}
586 	}
587 
588 slow_path:
589 	left = skb->len - hlen;		/* Space per frame */
590 	ptr = hlen;		/* Where to start from */
591 
592 	/* for bridged IP traffic encapsulated inside e.g. a vlan header,
593 	 * we need to make room for the encapsulating header
594 	 */
595 	ll_rs = LL_RESERVED_SPACE_EXTRA(rt->dst.dev, nf_bridge_pad(skb));
596 
597 	/*
598 	 *	Fragment the datagram.
599 	 */
600 
601 	offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
602 	not_last_frag = iph->frag_off & htons(IP_MF);
603 
604 	/*
605 	 *	Keep copying data until we run out.
606 	 */
607 
608 	while (left > 0) {
609 		len = left;
610 		/* IF: it doesn't fit, use 'mtu' - the data space left */
611 		if (len > mtu)
612 			len = mtu;
613 		/* IF: we are not sending up to and including the packet end
614 		   then align the next start on an eight byte boundary */
615 		if (len < left)	{
616 			len &= ~7;
617 		}
618 		/*
619 		 *	Allocate buffer.
620 		 */
621 
622 		if ((skb2 = alloc_skb(len+hlen+ll_rs, GFP_ATOMIC)) == NULL) {
623 			NETDEBUG(KERN_INFO "IP: frag: no memory for new fragment!\n");
624 			err = -ENOMEM;
625 			goto fail;
626 		}
627 
628 		/*
629 		 *	Set up data on packet
630 		 */
631 
632 		ip_copy_metadata(skb2, skb);
633 		skb_reserve(skb2, ll_rs);
634 		skb_put(skb2, len + hlen);
635 		skb_reset_network_header(skb2);
636 		skb2->transport_header = skb2->network_header + hlen;
637 
638 		/*
639 		 *	Charge the memory for the fragment to any owner
640 		 *	it might possess
641 		 */
642 
643 		if (skb->sk)
644 			skb_set_owner_w(skb2, skb->sk);
645 
646 		/*
647 		 *	Copy the packet header into the new buffer.
648 		 */
649 
650 		skb_copy_from_linear_data(skb, skb_network_header(skb2), hlen);
651 
652 		/*
653 		 *	Copy a block of the IP datagram.
654 		 */
655 		if (skb_copy_bits(skb, ptr, skb_transport_header(skb2), len))
656 			BUG();
657 		left -= len;
658 
659 		/*
660 		 *	Fill in the new header fields.
661 		 */
662 		iph = ip_hdr(skb2);
663 		iph->frag_off = htons((offset >> 3));
664 
665 		/* ANK: dirty, but effective trick. Upgrade options only if
666 		 * the segment to be fragmented was THE FIRST (otherwise,
667 		 * options are already fixed) and make it ONCE
668 		 * on the initial skb, so that all the following fragments
669 		 * will inherit fixed options.
670 		 */
671 		if (offset == 0)
672 			ip_options_fragment(skb);
673 
674 		/*
675 		 *	Added AC : If we are fragmenting a fragment that's not the
676 		 *		   last fragment then keep MF set on each fragment
677 		 */
678 		if (left > 0 || not_last_frag)
679 			iph->frag_off |= htons(IP_MF);
680 		ptr += len;
681 		offset += len;
682 
683 		/*
684 		 *	Put this fragment into the sending queue.
685 		 */
686 		iph->tot_len = htons(len + hlen);
687 
688 		ip_send_check(iph);
689 
690 		err = output(skb2);
691 		if (err)
692 			goto fail;
693 
694 		IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
695 	}
696 	kfree_skb(skb);
697 	IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
698 	return err;
699 
700 fail:
701 	kfree_skb(skb);
702 	IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
703 	return err;
704 }
705 EXPORT_SYMBOL(ip_fragment);
706 
707 int
708 ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb)
709 {
710 	struct iovec *iov = from;
711 
712 	if (skb->ip_summed == CHECKSUM_PARTIAL) {
713 		if (memcpy_fromiovecend(to, iov, offset, len) < 0)
714 			return -EFAULT;
715 	} else {
716 		__wsum csum = 0;
717 		if (csum_partial_copy_fromiovecend(to, iov, offset, len, &csum) < 0)
718 			return -EFAULT;
719 		skb->csum = csum_block_add(skb->csum, csum, odd);
720 	}
721 	return 0;
722 }
723 EXPORT_SYMBOL(ip_generic_getfrag);
724 
725 static inline __wsum
726 csum_page(struct page *page, int offset, int copy)
727 {
728 	char *kaddr;
729 	__wsum csum;
730 	kaddr = kmap(page);
731 	csum = csum_partial(kaddr + offset, copy, 0);
732 	kunmap(page);
733 	return csum;
734 }
735 
736 static inline int ip_ufo_append_data(struct sock *sk,
737 			struct sk_buff_head *queue,
738 			int getfrag(void *from, char *to, int offset, int len,
739 			       int odd, struct sk_buff *skb),
740 			void *from, int length, int hh_len, int fragheaderlen,
741 			int transhdrlen, int mtu, unsigned int flags)
742 {
743 	struct sk_buff *skb;
744 	int err;
745 
746 	/* The network device supports UDP fragmentation offload, so
747 	 * create one single skb containing the complete
748 	 * UDP datagram.
749 	 */
750 	if ((skb = skb_peek_tail(queue)) == NULL) {
751 		skb = sock_alloc_send_skb(sk,
752 			hh_len + fragheaderlen + transhdrlen + 20,
753 			(flags & MSG_DONTWAIT), &err);
754 
755 		if (skb == NULL)
756 			return err;
757 
758 		/* reserve space for Hardware header */
759 		skb_reserve(skb, hh_len);
760 
761 		/* create space for UDP/IP header */
762 		skb_put(skb, fragheaderlen + transhdrlen);
763 
764 		/* initialize network header pointer */
765 		skb_reset_network_header(skb);
766 
767 		/* initialize protocol header pointer */
768 		skb->transport_header = skb->network_header + fragheaderlen;
769 
770 		skb->ip_summed = CHECKSUM_PARTIAL;
771 		skb->csum = 0;
772 
773 		/* specify the length of each IP datagram fragment */
774 		skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
775 		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
776 		__skb_queue_tail(queue, skb);
777 	}
778 
779 	return skb_append_datato_frags(sk, skb, getfrag, from,
780 				       (length - transhdrlen));
781 }
782 
783 static int __ip_append_data(struct sock *sk,
784 			    struct flowi4 *fl4,
785 			    struct sk_buff_head *queue,
786 			    struct inet_cork *cork,
787 			    int getfrag(void *from, char *to, int offset,
788 					int len, int odd, struct sk_buff *skb),
789 			    void *from, int length, int transhdrlen,
790 			    unsigned int flags)
791 {
792 	struct inet_sock *inet = inet_sk(sk);
793 	struct sk_buff *skb;
794 
795 	struct ip_options *opt = cork->opt;
796 	int hh_len;
797 	int exthdrlen;
798 	int mtu;
799 	int copy;
800 	int err;
801 	int offset = 0;
802 	unsigned int maxfraglen, fragheaderlen;
803 	int csummode = CHECKSUM_NONE;
804 	struct rtable *rt = (struct rtable *)cork->dst;
805 
806 	skb = skb_peek_tail(queue);
807 
808 	exthdrlen = !skb ? rt->dst.header_len : 0;
809 	mtu = cork->fragsize;
810 
811 	hh_len = LL_RESERVED_SPACE(rt->dst.dev);
812 
813 	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
814 	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
815 
816 	if (cork->length + length > 0xFFFF - fragheaderlen) {
817 		ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport,
818 			       mtu-exthdrlen);
819 		return -EMSGSIZE;
820 	}
821 
822 	/*
823 	 * transhdrlen > 0 means that this is the first fragment and we wish
824 	 * it not to be fragmented in the future.
825 	 */
826 	if (transhdrlen &&
827 	    length + fragheaderlen <= mtu &&
828 	    rt->dst.dev->features & NETIF_F_V4_CSUM &&
829 	    !exthdrlen)
830 		csummode = CHECKSUM_PARTIAL;
831 
832 	cork->length += length;
833 	if (((length > mtu) || (skb && skb_is_gso(skb))) &&
834 	    (sk->sk_protocol == IPPROTO_UDP) &&
835 	    (rt->dst.dev->features & NETIF_F_UFO) && !rt->dst.header_len) {
836 		err = ip_ufo_append_data(sk, queue, getfrag, from, length,
837 					 hh_len, fragheaderlen, transhdrlen,
838 					 mtu, flags);
839 		if (err)
840 			goto error;
841 		return 0;
842 	}
843 
844 	/* So, what's going on in the loop below?
845 	 *
846 	 * We use the calculated fragment length to generate a chain of skbs;
847 	 * each segment is an IP fragment, ready to be sent to the network
848 	 * once the appropriate IP header has been added.
849 	 */
850 
851 	if (!skb)
852 		goto alloc_new_skb;
853 
854 	while (length > 0) {
855 		/* Check if the remaining data fits into current packet. */
856 		copy = mtu - skb->len;
857 		if (copy < length)
858 			copy = maxfraglen - skb->len;
859 		if (copy <= 0) {
860 			char *data;
861 			unsigned int datalen;
862 			unsigned int fraglen;
863 			unsigned int fraggap;
864 			unsigned int alloclen;
865 			struct sk_buff *skb_prev;
866 alloc_new_skb:
867 			skb_prev = skb;
868 			if (skb_prev)
869 				fraggap = skb_prev->len - maxfraglen;
870 			else
871 				fraggap = 0;
872 
873 			/*
874 			 * If remaining data exceeds the mtu,
875 			 * we know we need more fragment(s).
876 			 */
877 			datalen = length + fraggap;
878 			if (datalen > mtu - fragheaderlen)
879 				datalen = maxfraglen - fragheaderlen;
880 			fraglen = datalen + fragheaderlen;
881 
882 			if ((flags & MSG_MORE) &&
883 			    !(rt->dst.dev->features&NETIF_F_SG))
884 				alloclen = mtu;
885 			else
886 				alloclen = fraglen;
887 
888 			alloclen += exthdrlen;
889 
890 			/* The last fragment gets additional space at tail.
891 			 * Note, with MSG_MORE we overallocate on fragments,
892 			 * because we have no idea which fragment will be
893 			 * the last.
894 			 */
895 			if (datalen == length + fraggap)
896 				alloclen += rt->dst.trailer_len;
897 
898 			if (transhdrlen) {
899 				skb = sock_alloc_send_skb(sk,
900 						alloclen + hh_len + 15,
901 						(flags & MSG_DONTWAIT), &err);
902 			} else {
903 				skb = NULL;
904 				if (atomic_read(&sk->sk_wmem_alloc) <=
905 				    2 * sk->sk_sndbuf)
906 					skb = sock_wmalloc(sk,
907 							   alloclen + hh_len + 15, 1,
908 							   sk->sk_allocation);
909 				if (unlikely(skb == NULL))
910 					err = -ENOBUFS;
911 				else
912 					/* only the initial fragment is
913 					   time stamped */
914 					cork->tx_flags = 0;
915 			}
916 			if (skb == NULL)
917 				goto error;
918 
919 			/*
920 			 *	Fill in the control structures
921 			 */
922 			skb->ip_summed = csummode;
923 			skb->csum = 0;
924 			skb_reserve(skb, hh_len);
925 			skb_shinfo(skb)->tx_flags = cork->tx_flags;
926 
927 			/*
928 			 *	Find where to start putting bytes.
929 			 */
930 			data = skb_put(skb, fraglen + exthdrlen);
931 			skb_set_network_header(skb, exthdrlen);
932 			skb->transport_header = (skb->network_header +
933 						 fragheaderlen);
934 			data += fragheaderlen + exthdrlen;
935 
936 			if (fraggap) {
937 				skb->csum = skb_copy_and_csum_bits(
938 					skb_prev, maxfraglen,
939 					data + transhdrlen, fraggap, 0);
940 				skb_prev->csum = csum_sub(skb_prev->csum,
941 							  skb->csum);
942 				data += fraggap;
943 				pskb_trim_unique(skb_prev, maxfraglen);
944 			}
945 
946 			copy = datalen - transhdrlen - fraggap;
947 			if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
948 				err = -EFAULT;
949 				kfree_skb(skb);
950 				goto error;
951 			}
952 
953 			offset += copy;
954 			length -= datalen - fraggap;
955 			transhdrlen = 0;
956 			exthdrlen = 0;
957 			csummode = CHECKSUM_NONE;
958 
959 			/*
960 			 * Put the packet on the pending queue.
961 			 */
962 			__skb_queue_tail(queue, skb);
963 			continue;
964 		}
965 
966 		if (copy > length)
967 			copy = length;
968 
969 		if (!(rt->dst.dev->features&NETIF_F_SG)) {
970 			unsigned int off;
971 
972 			off = skb->len;
973 			if (getfrag(from, skb_put(skb, copy),
974 					offset, copy, off, skb) < 0) {
975 				__skb_trim(skb, off);
976 				err = -EFAULT;
977 				goto error;
978 			}
979 		} else {
980 			int i = skb_shinfo(skb)->nr_frags;
981 			skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
982 			struct page *page = cork->page;
983 			int off = cork->off;
984 			unsigned int left;
985 
986 			if (page && (left = PAGE_SIZE - off) > 0) {
987 				if (copy >= left)
988 					copy = left;
989 				if (page != frag->page) {
990 					if (i == MAX_SKB_FRAGS) {
991 						err = -EMSGSIZE;
992 						goto error;
993 					}
994 					get_page(page);
995 					skb_fill_page_desc(skb, i, page, off, 0);
996 					frag = &skb_shinfo(skb)->frags[i];
997 				}
998 			} else if (i < MAX_SKB_FRAGS) {
999 				if (copy > PAGE_SIZE)
1000 					copy = PAGE_SIZE;
1001 				page = alloc_pages(sk->sk_allocation, 0);
1002 				if (page == NULL)  {
1003 					err = -ENOMEM;
1004 					goto error;
1005 				}
1006 				cork->page = page;
1007 				cork->off = 0;
1008 
1009 				skb_fill_page_desc(skb, i, page, 0, 0);
1010 				frag = &skb_shinfo(skb)->frags[i];
1011 			} else {
1012 				err = -EMSGSIZE;
1013 				goto error;
1014 			}
1015 			if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
1016 				err = -EFAULT;
1017 				goto error;
1018 			}
1019 			cork->off += copy;
1020 			frag->size += copy;
1021 			skb->len += copy;
1022 			skb->data_len += copy;
1023 			skb->truesize += copy;
1024 			atomic_add(copy, &sk->sk_wmem_alloc);
1025 		}
1026 		offset += copy;
1027 		length -= copy;
1028 	}
1029 
1030 	return 0;
1031 
1032 error:
1033 	cork->length -= length;
1034 	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
1035 	return err;
1036 }
1037 
1038 static int ip_setup_cork(struct sock *sk, struct inet_cork *cork,
1039 			 struct ipcm_cookie *ipc, struct rtable **rtp)
1040 {
1041 	struct inet_sock *inet = inet_sk(sk);
1042 	struct ip_options_rcu *opt;
1043 	struct rtable *rt;
1044 
1045 	/*
1046 	 * setup for corking.
1047 	 */
1048 	opt = ipc->opt;
1049 	if (opt) {
1050 		if (cork->opt == NULL) {
1051 			cork->opt = kmalloc(sizeof(struct ip_options) + 40,
1052 					    sk->sk_allocation);
1053 			if (unlikely(cork->opt == NULL))
1054 				return -ENOBUFS;
1055 		}
1056 		memcpy(cork->opt, &opt->opt, sizeof(struct ip_options) + opt->opt.optlen);
1057 		cork->flags |= IPCORK_OPT;
1058 		cork->addr = ipc->addr;
1059 	}
1060 	rt = *rtp;
1061 	if (unlikely(!rt))
1062 		return -EFAULT;
1063 	/*
1064 	 * We steal the reference to this route; the caller must not release it.
1065 	 */
1066 	*rtp = NULL;
1067 	cork->fragsize = inet->pmtudisc == IP_PMTUDISC_PROBE ?
1068 			 rt->dst.dev->mtu : dst_mtu(&rt->dst);
1069 	cork->dst = &rt->dst;
1070 	cork->length = 0;
1071 	cork->tx_flags = ipc->tx_flags;
1072 	cork->page = NULL;
1073 	cork->off = 0;
1074 
1075 	return 0;
1076 }
1077 
1078 /*
1079  *	ip_append_data() and ip_append_page() can make one large IP datagram
1080  *	from many pieces of data. Each piece will be held on the socket
1081  *	until ip_push_pending_frames() is called. Each piece can be a page
1082  *	or non-page data.
1083  *
1084  *	Not only UDP: other transport protocols, e.g. raw sockets, can
1085  *	potentially use this interface as well.
1086  *
1087  *	LATER: length must be adjusted for tail padding when it is required.
1088  */
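/*
 *	A hypothetical datagram-socket sequence, sketched for illustration
 *	only (the real callers live in the UDP, ICMP and raw-socket code);
 *	fl4, ipc and rt are assumed to be set up by an earlier route lookup:
 *
 *		err = ip_append_data(sk, &fl4, ip_generic_getfrag,
 *				     msg->msg_iov, len, sizeof(struct udphdr),
 *				     &ipc, &rt, msg->msg_flags);
 *		if (err)
 *			ip_flush_pending_frames(sk);
 *		else if (!(msg->msg_flags & MSG_MORE))
 *			err = ip_push_pending_frames(sk, &fl4);
 */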
1089 int ip_append_data(struct sock *sk, struct flowi4 *fl4,
1090 		   int getfrag(void *from, char *to, int offset, int len,
1091 			       int odd, struct sk_buff *skb),
1092 		   void *from, int length, int transhdrlen,
1093 		   struct ipcm_cookie *ipc, struct rtable **rtp,
1094 		   unsigned int flags)
1095 {
1096 	struct inet_sock *inet = inet_sk(sk);
1097 	int err;
1098 
1099 	if (flags&MSG_PROBE)
1100 		return 0;
1101 
1102 	if (skb_queue_empty(&sk->sk_write_queue)) {
1103 		err = ip_setup_cork(sk, &inet->cork.base, ipc, rtp);
1104 		if (err)
1105 			return err;
1106 	} else {
1107 		transhdrlen = 0;
1108 	}
1109 
1110 	return __ip_append_data(sk, fl4, &sk->sk_write_queue, &inet->cork.base, getfrag,
1111 				from, length, transhdrlen, flags);
1112 }
1113 
1114 ssize_t	ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page,
1115 		       int offset, size_t size, int flags)
1116 {
1117 	struct inet_sock *inet = inet_sk(sk);
1118 	struct sk_buff *skb;
1119 	struct rtable *rt;
1120 	struct ip_options *opt = NULL;
1121 	struct inet_cork *cork;
1122 	int hh_len;
1123 	int mtu;
1124 	int len;
1125 	int err;
1126 	unsigned int maxfraglen, fragheaderlen, fraggap;
1127 
1128 	if (inet->hdrincl)
1129 		return -EPERM;
1130 
1131 	if (flags&MSG_PROBE)
1132 		return 0;
1133 
1134 	if (skb_queue_empty(&sk->sk_write_queue))
1135 		return -EINVAL;
1136 
1137 	cork = &inet->cork.base;
1138 	rt = (struct rtable *)cork->dst;
1139 	if (cork->flags & IPCORK_OPT)
1140 		opt = cork->opt;
1141 
1142 	if (!(rt->dst.dev->features&NETIF_F_SG))
1143 		return -EOPNOTSUPP;
1144 
1145 	hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1146 	mtu = cork->fragsize;
1147 
1148 	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
1149 	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
1150 
1151 	if (cork->length + size > 0xFFFF - fragheaderlen) {
1152 		ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport, mtu);
1153 		return -EMSGSIZE;
1154 	}
1155 
1156 	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
1157 		return -EINVAL;
1158 
1159 	cork->length += size;
1160 	if ((size + skb->len > mtu) &&
1161 	    (sk->sk_protocol == IPPROTO_UDP) &&
1162 	    (rt->dst.dev->features & NETIF_F_UFO)) {
1163 		skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
1164 		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
1165 	}
1166 
1167 
1168 	while (size > 0) {
1169 		int i;
1170 
1171 		if (skb_is_gso(skb))
1172 			len = size;
1173 		else {
1174 
1175 			/* Check if the remaining data fits into current packet. */
1176 			len = mtu - skb->len;
1177 			if (len < size)
1178 				len = maxfraglen - skb->len;
1179 		}
1180 		if (len <= 0) {
1181 			struct sk_buff *skb_prev;
1182 			int alloclen;
1183 
1184 			skb_prev = skb;
1185 			fraggap = skb_prev->len - maxfraglen;
1186 
1187 			alloclen = fragheaderlen + hh_len + fraggap + 15;
1188 			skb = sock_wmalloc(sk, alloclen, 1, sk->sk_allocation);
1189 			if (unlikely(!skb)) {
1190 				err = -ENOBUFS;
1191 				goto error;
1192 			}
1193 
1194 			/*
1195 			 *	Fill in the control structures
1196 			 */
1197 			skb->ip_summed = CHECKSUM_NONE;
1198 			skb->csum = 0;
1199 			skb_reserve(skb, hh_len);
1200 
1201 			/*
1202 			 *	Find where to start putting bytes.
1203 			 */
1204 			skb_put(skb, fragheaderlen + fraggap);
1205 			skb_reset_network_header(skb);
1206 			skb->transport_header = (skb->network_header +
1207 						 fragheaderlen);
1208 			if (fraggap) {
1209 				skb->csum = skb_copy_and_csum_bits(skb_prev,
1210 								   maxfraglen,
1211 						    skb_transport_header(skb),
1212 								   fraggap, 0);
1213 				skb_prev->csum = csum_sub(skb_prev->csum,
1214 							  skb->csum);
1215 				pskb_trim_unique(skb_prev, maxfraglen);
1216 			}
1217 
1218 			/*
1219 			 * Put the packet on the pending queue.
1220 			 */
1221 			__skb_queue_tail(&sk->sk_write_queue, skb);
1222 			continue;
1223 		}
1224 
1225 		i = skb_shinfo(skb)->nr_frags;
1226 		if (len > size)
1227 			len = size;
1228 		if (skb_can_coalesce(skb, i, page, offset)) {
1229 			skb_shinfo(skb)->frags[i-1].size += len;
1230 		} else if (i < MAX_SKB_FRAGS) {
1231 			get_page(page);
1232 			skb_fill_page_desc(skb, i, page, offset, len);
1233 		} else {
1234 			err = -EMSGSIZE;
1235 			goto error;
1236 		}
1237 
1238 		if (skb->ip_summed == CHECKSUM_NONE) {
1239 			__wsum csum;
1240 			csum = csum_page(page, offset, len);
1241 			skb->csum = csum_block_add(skb->csum, csum, skb->len);
1242 		}
1243 
1244 		skb->len += len;
1245 		skb->data_len += len;
1246 		skb->truesize += len;
1247 		atomic_add(len, &sk->sk_wmem_alloc);
1248 		offset += len;
1249 		size -= len;
1250 	}
1251 	return 0;
1252 
1253 error:
1254 	cork->length -= size;
1255 	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
1256 	return err;
1257 }
1258 
1259 static void ip_cork_release(struct inet_cork *cork)
1260 {
1261 	cork->flags &= ~IPCORK_OPT;
1262 	kfree(cork->opt);
1263 	cork->opt = NULL;
1264 	dst_release(cork->dst);
1265 	cork->dst = NULL;
1266 }
1267 
1268 /*
1269  *	Combine all pending IP fragments on the socket into one IP datagram
1270  *	and push them out.
1271  */
1272 struct sk_buff *__ip_make_skb(struct sock *sk,
1273 			      struct flowi4 *fl4,
1274 			      struct sk_buff_head *queue,
1275 			      struct inet_cork *cork)
1276 {
1277 	struct sk_buff *skb, *tmp_skb;
1278 	struct sk_buff **tail_skb;
1279 	struct inet_sock *inet = inet_sk(sk);
1280 	struct net *net = sock_net(sk);
1281 	struct ip_options *opt = NULL;
1282 	struct rtable *rt = (struct rtable *)cork->dst;
1283 	struct iphdr *iph;
1284 	__be16 df = 0;
1285 	__u8 ttl;
1286 
1287 	if ((skb = __skb_dequeue(queue)) == NULL)
1288 		goto out;
1289 	tail_skb = &(skb_shinfo(skb)->frag_list);
1290 
1291 	/* move skb->data to ip header from ext header */
1292 	if (skb->data < skb_network_header(skb))
1293 		__skb_pull(skb, skb_network_offset(skb));
1294 	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
1295 		__skb_pull(tmp_skb, skb_network_header_len(skb));
1296 		*tail_skb = tmp_skb;
1297 		tail_skb = &(tmp_skb->next);
1298 		skb->len += tmp_skb->len;
1299 		skb->data_len += tmp_skb->len;
1300 		skb->truesize += tmp_skb->truesize;
1301 		tmp_skb->destructor = NULL;
1302 		tmp_skb->sk = NULL;
1303 	}
1304 
1305 	/* Unless the user demanded real pmtu discovery (IP_PMTUDISC_DO), we
1306 	 * allow the frame generated here to be fragmented. No matter how
1307 	 * transforms change the size of the packet, it will get out.
1308 	 */
1309 	if (inet->pmtudisc < IP_PMTUDISC_DO)
1310 		skb->local_df = 1;
1311 
1312 	/* DF bit is set when we want to see DF on outgoing frames.
1313 	 * If local_df is set too, we still allow this frame to be
1314 	 * fragmented locally. */
1315 	if (inet->pmtudisc >= IP_PMTUDISC_DO ||
1316 	    (skb->len <= dst_mtu(&rt->dst) &&
1317 	     ip_dont_fragment(sk, &rt->dst)))
1318 		df = htons(IP_DF);
1319 
1320 	if (cork->flags & IPCORK_OPT)
1321 		opt = cork->opt;
1322 
1323 	if (rt->rt_type == RTN_MULTICAST)
1324 		ttl = inet->mc_ttl;
1325 	else
1326 		ttl = ip_select_ttl(inet, &rt->dst);
1327 
1328 	iph = (struct iphdr *)skb->data;
1329 	iph->version = 4;
1330 	iph->ihl = 5;
1331 	iph->tos = inet->tos;
1332 	iph->frag_off = df;
1333 	ip_select_ident(iph, &rt->dst, sk);
1334 	iph->ttl = ttl;
1335 	iph->protocol = sk->sk_protocol;
1336 	iph->saddr = fl4->saddr;
1337 	iph->daddr = fl4->daddr;
1338 
1339 	if (opt) {
1340 		iph->ihl += opt->optlen>>2;
1341 		ip_options_build(skb, opt, cork->addr, rt, 0);
1342 	}
1343 
1344 	skb->priority = sk->sk_priority;
1345 	skb->mark = sk->sk_mark;
1346 	/*
1347 	 * Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec
1348 	 * on dst refcount
1349 	 */
1350 	cork->dst = NULL;
1351 	skb_dst_set(skb, &rt->dst);
1352 
1353 	if (iph->protocol == IPPROTO_ICMP)
1354 		icmp_out_count(net, ((struct icmphdr *)
1355 			skb_transport_header(skb))->type);
1356 
1357 	ip_cork_release(cork);
1358 out:
1359 	return skb;
1360 }
1361 
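/* ip_local_out() may return positive NET_XMIT_* codes from the queueing
 * layer; net_xmit_errno() maps these to an errno, treating the congestion
 * notification code (NET_XMIT_CN) as success and the rest as -ENOBUFS. */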
1362 int ip_send_skb(struct sk_buff *skb)
1363 {
1364 	struct net *net = sock_net(skb->sk);
1365 	int err;
1366 
1367 	err = ip_local_out(skb);
1368 	if (err) {
1369 		if (err > 0)
1370 			err = net_xmit_errno(err);
1371 		if (err)
1372 			IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS);
1373 	}
1374 
1375 	return err;
1376 }
1377 
1378 int ip_push_pending_frames(struct sock *sk, struct flowi4 *fl4)
1379 {
1380 	struct sk_buff *skb;
1381 
1382 	skb = ip_finish_skb(sk, fl4);
1383 	if (!skb)
1384 		return 0;
1385 
1386 	/* Netfilter gets the whole, not yet fragmented skb. */
1387 	return ip_send_skb(skb);
1388 }
1389 
1390 /*
1391  *	Throw away all pending data on the socket.
1392  */
1393 static void __ip_flush_pending_frames(struct sock *sk,
1394 				      struct sk_buff_head *queue,
1395 				      struct inet_cork *cork)
1396 {
1397 	struct sk_buff *skb;
1398 
1399 	while ((skb = __skb_dequeue_tail(queue)) != NULL)
1400 		kfree_skb(skb);
1401 
1402 	ip_cork_release(cork);
1403 }
1404 
1405 void ip_flush_pending_frames(struct sock *sk)
1406 {
1407 	__ip_flush_pending_frames(sk, &sk->sk_write_queue, &inet_sk(sk)->cork.base);
1408 }
1409 
1410 struct sk_buff *ip_make_skb(struct sock *sk,
1411 			    struct flowi4 *fl4,
1412 			    int getfrag(void *from, char *to, int offset,
1413 					int len, int odd, struct sk_buff *skb),
1414 			    void *from, int length, int transhdrlen,
1415 			    struct ipcm_cookie *ipc, struct rtable **rtp,
1416 			    unsigned int flags)
1417 {
1418 	struct inet_cork cork;
1419 	struct sk_buff_head queue;
1420 	int err;
1421 
1422 	if (flags & MSG_PROBE)
1423 		return NULL;
1424 
1425 	__skb_queue_head_init(&queue);
1426 
1427 	cork.flags = 0;
1428 	cork.addr = 0;
1429 	cork.opt = NULL;
1430 	err = ip_setup_cork(sk, &cork, ipc, rtp);
1431 	if (err)
1432 		return ERR_PTR(err);
1433 
1434 	err = __ip_append_data(sk, fl4, &queue, &cork, getfrag,
1435 			       from, length, transhdrlen, flags);
1436 	if (err) {
1437 		__ip_flush_pending_frames(sk, &queue, &cork);
1438 		return ERR_PTR(err);
1439 	}
1440 
1441 	return __ip_make_skb(sk, fl4, &queue, &cork);
1442 }
1443 
1444 /*
1445  *	Fetch data from kernel space and fill in checksum if needed.
1446  */
1447 static int ip_reply_glue_bits(void *dptr, char *to, int offset,
1448 			      int len, int odd, struct sk_buff *skb)
1449 {
1450 	__wsum csum;
1451 
1452 	csum = csum_partial_copy_nocheck(dptr+offset, to, len, 0);
1453 	skb->csum = csum_block_add(skb->csum, csum, odd);
1454 	return 0;
1455 }
1456 
1457 /*
1458  *	Generic function to send a packet as a reply to another packet.
1459  *	Used to send TCP resets so far. ICMP should use this function too.
1460  *
1461  *	Should run single-threaded per socket because it uses the sock
1462  *	structure to pass arguments.
1463  */
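/* At this revision the callers are tcp_v4_send_reset() and tcp_v4_send_ack(),
 * which run in softirq context on a dedicated control socket, hence the
 * bh_lock_sock() below rather than lock_sock(). */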
1464 void ip_send_reply(struct sock *sk, struct sk_buff *skb, __be32 daddr,
1465 		   struct ip_reply_arg *arg, unsigned int len)
1466 {
1467 	struct inet_sock *inet = inet_sk(sk);
1468 	struct ip_options_data replyopts;
1469 	struct ipcm_cookie ipc;
1470 	struct flowi4 fl4;
1471 	struct rtable *rt = skb_rtable(skb);
1472 
1473 	if (ip_options_echo(&replyopts.opt.opt, skb))
1474 		return;
1475 
1476 	ipc.addr = daddr;
1477 	ipc.opt = NULL;
1478 	ipc.tx_flags = 0;
1479 
1480 	if (replyopts.opt.opt.optlen) {
1481 		ipc.opt = &replyopts.opt;
1482 
1483 		if (replyopts.opt.opt.srr)
1484 			daddr = replyopts.opt.opt.faddr;
1485 	}
1486 
1487 	flowi4_init_output(&fl4, arg->bound_dev_if, 0,
1488 			   RT_TOS(ip_hdr(skb)->tos),
1489 			   RT_SCOPE_UNIVERSE, sk->sk_protocol,
1490 			   ip_reply_arg_flowi_flags(arg),
1491 			   daddr, rt->rt_spec_dst,
1492 			   tcp_hdr(skb)->source, tcp_hdr(skb)->dest);
1493 	security_skb_classify_flow(skb, flowi4_to_flowi(&fl4));
1494 	rt = ip_route_output_key(sock_net(sk), &fl4);
1495 	if (IS_ERR(rt))
1496 		return;
1497 
1498 	/* And let IP do all the hard work.
1499 
1500 	   This chunk is not reentrant, hence the spinlock.
1501 	   Note that it relies on the fact that this function is called
1502 	   with BHs locally disabled and that sk cannot already be spinlocked.
1503 	 */
1504 	bh_lock_sock(sk);
1505 	inet->tos = ip_hdr(skb)->tos;
1506 	sk->sk_priority = skb->priority;
1507 	sk->sk_protocol = ip_hdr(skb)->protocol;
1508 	sk->sk_bound_dev_if = arg->bound_dev_if;
1509 	ip_append_data(sk, &fl4, ip_reply_glue_bits, arg->iov->iov_base, len, 0,
1510 		       &ipc, &rt, MSG_DONTWAIT);
1511 	if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) {
1512 		if (arg->csumoffset >= 0)
1513 			*((__sum16 *)skb_transport_header(skb) +
1514 			  arg->csumoffset) = csum_fold(csum_add(skb->csum,
1515 								arg->csum));
1516 		skb->ip_summed = CHECKSUM_NONE;
1517 		ip_push_pending_frames(sk, &fl4);
1518 	}
1519 
1520 	bh_unlock_sock(sk);
1521 
1522 	ip_rt_put(rt);
1523 }
1524 
1525 void __init ip_init(void)
1526 {
1527 	ip_rt_init();
1528 	inet_initpeers();
1529 
1530 #if defined(CONFIG_IP_MULTICAST) && defined(CONFIG_PROC_FS)
1531 	igmp_mc_proc_init();
1532 #endif
1533 }
1534