xref: /openbmc/linux/net/ipv4/ip_output.c (revision 9c1f8594)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		The Internet Protocol (IP) output module.
7  *
8  * Authors:	Ross Biro
9  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10  *		Donald Becker, <becker@super.org>
11  *		Alan Cox, <Alan.Cox@linux.org>
12  *		Richard Underwood
13  *		Stefan Becker, <stefanb@yello.ping.de>
14  *		Jorge Cwik, <jorge@laser.satlink.net>
15  *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
16  *		Hirokazu Takahashi, <taka@valinux.co.jp>
17  *
18  *	See ip_input.c for original log
19  *
20  *	Fixes:
21  *		Alan Cox	:	Missing nonblock feature in ip_build_xmit.
22  *		Mike Kilburn	:	htons() missing in ip_build_xmit.
23  *		Bradford Johnson:	Fix faulty handling of some frames when
24  *					no route is found.
25  *		Alexander Demenshin:	Missing sk/skb free in ip_queue_xmit
26  *					(in case the packet is not accepted by
27  *					output firewall rules)
28  *		Mike McLagan	:	Routing by source
29  *		Alexey Kuznetsov:	use new route cache
30  *		Andi Kleen:		Fix broken PMTU recovery and remove
31  *					some redundant tests.
32  *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
33  *		Andi Kleen	: 	Replace ip_reply with ip_send_reply.
34  *		Andi Kleen	:	Split fast and slow ip_build_xmit path
35  *					for decreased register pressure on x86
36  *					and more readability.
37  *		Marc Boucher	:	When call_out_firewall returns FW_QUEUE,
38  *					silently drop skb instead of failing with -EPERM.
39  *		Detlev Wengorz	:	Copy protocol for fragments.
40  *		Hirokazu Takahashi:	HW checksumming for outgoing UDP
41  *					datagrams.
42  *		Hirokazu Takahashi:	sendfile() on UDP works now.
43  */
44 
45 #include <asm/uaccess.h>
46 #include <asm/system.h>
47 #include <linux/module.h>
48 #include <linux/types.h>
49 #include <linux/kernel.h>
50 #include <linux/mm.h>
51 #include <linux/string.h>
52 #include <linux/errno.h>
53 #include <linux/highmem.h>
54 #include <linux/slab.h>
55 
56 #include <linux/socket.h>
57 #include <linux/sockios.h>
58 #include <linux/in.h>
59 #include <linux/inet.h>
60 #include <linux/netdevice.h>
61 #include <linux/etherdevice.h>
62 #include <linux/proc_fs.h>
63 #include <linux/stat.h>
64 #include <linux/init.h>
65 
66 #include <net/snmp.h>
67 #include <net/ip.h>
68 #include <net/protocol.h>
69 #include <net/route.h>
70 #include <net/xfrm.h>
71 #include <linux/skbuff.h>
72 #include <net/sock.h>
73 #include <net/arp.h>
74 #include <net/icmp.h>
75 #include <net/checksum.h>
76 #include <net/inetpeer.h>
77 #include <linux/igmp.h>
78 #include <linux/netfilter_ipv4.h>
79 #include <linux/netfilter_bridge.h>
80 #include <linux/mroute.h>
81 #include <linux/netlink.h>
82 #include <linux/tcp.h>
83 
84 int sysctl_ip_default_ttl __read_mostly = IPDEFTTL;
85 EXPORT_SYMBOL(sysctl_ip_default_ttl);
86 
87 /* Generate a checksum for an outgoing IP datagram. */
88 __inline__ void ip_send_check(struct iphdr *iph)
89 {
90 	iph->check = 0;
91 	iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
92 }
93 EXPORT_SYMBOL(ip_send_check);
94 
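/*
 * __ip_local_out() finalizes the IPv4 header (total length and checksum)
 * and runs the NF_INET_LOCAL_OUT netfilter hook.  nf_hook() returns 1 when
 * the packet may proceed and the okfn has not been invoked, which is why
 * ip_local_out() below calls dst_output() only when the result is 1.
 */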
95 int __ip_local_out(struct sk_buff *skb)
96 {
97 	struct iphdr *iph = ip_hdr(skb);
98 
99 	iph->tot_len = htons(skb->len);
100 	ip_send_check(iph);
101 	return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, skb, NULL,
102 		       skb_dst(skb)->dev, dst_output);
103 }
104 
105 int ip_local_out(struct sk_buff *skb)
106 {
107 	int err;
108 
109 	err = __ip_local_out(skb);
110 	if (likely(err == 1))
111 		err = dst_output(skb);
112 
113 	return err;
114 }
115 EXPORT_SYMBOL_GPL(ip_local_out);
116 
117 /* dev_loopback_xmit for use with netfilter. */
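/*
 * Re-injects a local copy of an outgoing multicast/broadcast packet into
 * the receive path via netif_rx_ni(), so local listeners can receive it.
 * Used as the okfn of the NF_INET_POST_ROUTING hook in ip_mc_output() below.
 */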
118 static int ip_dev_loopback_xmit(struct sk_buff *newskb)
119 {
120 	skb_reset_mac_header(newskb);
121 	__skb_pull(newskb, skb_network_offset(newskb));
122 	newskb->pkt_type = PACKET_LOOPBACK;
123 	newskb->ip_summed = CHECKSUM_UNNECESSARY;
124 	WARN_ON(!skb_dst(newskb));
125 	skb_dst_force(newskb);
126 	netif_rx_ni(newskb);
127 	return 0;
128 }
129 
130 static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
131 {
132 	int ttl = inet->uc_ttl;
133 
134 	if (ttl < 0)
135 		ttl = ip4_dst_hoplimit(dst);
136 	return ttl;
137 }
138 
139 /*
140  *		Add an ip header to a skbuff and send it out.
141  *
142  */
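/*
 * Note that the header is built from the caller-supplied saddr/daddr rather
 * than from addresses cached on the socket, which suits replies sent before
 * a full socket exists (e.g. the TCP SYN-ACK path).
 */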
143 int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
144 			  __be32 saddr, __be32 daddr, struct ip_options_rcu *opt)
145 {
146 	struct inet_sock *inet = inet_sk(sk);
147 	struct rtable *rt = skb_rtable(skb);
148 	struct iphdr *iph;
149 
150 	/* Build the IP header. */
151 	skb_push(skb, sizeof(struct iphdr) + (opt ? opt->opt.optlen : 0));
152 	skb_reset_network_header(skb);
153 	iph = ip_hdr(skb);
154 	iph->version  = 4;
155 	iph->ihl      = 5;
156 	iph->tos      = inet->tos;
157 	if (ip_dont_fragment(sk, &rt->dst))
158 		iph->frag_off = htons(IP_DF);
159 	else
160 		iph->frag_off = 0;
161 	iph->ttl      = ip_select_ttl(inet, &rt->dst);
162 	iph->daddr    = (opt && opt->opt.srr ? opt->opt.faddr : daddr);
163 	iph->saddr    = saddr;
164 	iph->protocol = sk->sk_protocol;
165 	ip_select_ident(iph, &rt->dst, sk);
166 
167 	if (opt && opt->opt.optlen) {
168 		iph->ihl += opt->opt.optlen>>2;
169 		ip_options_build(skb, &opt->opt, daddr, rt, 0);
170 	}
171 
172 	skb->priority = sk->sk_priority;
173 	skb->mark = sk->sk_mark;
174 
175 	/* Send it out. */
176 	return ip_local_out(skb);
177 }
178 EXPORT_SYMBOL_GPL(ip_build_and_send_pkt);
179 
180 static inline int ip_finish_output2(struct sk_buff *skb)
181 {
182 	struct dst_entry *dst = skb_dst(skb);
183 	struct rtable *rt = (struct rtable *)dst;
184 	struct net_device *dev = dst->dev;
185 	unsigned int hh_len = LL_RESERVED_SPACE(dev);
186 	struct neighbour *neigh;
187 
188 	if (rt->rt_type == RTN_MULTICAST) {
189 		IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTMCAST, skb->len);
190 	} else if (rt->rt_type == RTN_BROADCAST)
191 		IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTBCAST, skb->len);
192 
193 	/* Be paranoid, rather than too clever. */
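	/* If the device needs a link-layer header (header_ops is set) but the
	 * skb lacks LL_RESERVED_SPACE() bytes of headroom, reallocate the skb
	 * with enough headroom and transfer socket ownership to the copy.
	 */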
194 	if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
195 		struct sk_buff *skb2;
196 
197 		skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
198 		if (skb2 == NULL) {
199 			kfree_skb(skb);
200 			return -ENOMEM;
201 		}
202 		if (skb->sk)
203 			skb_set_owner_w(skb2, skb->sk);
204 		kfree_skb(skb);
205 		skb = skb2;
206 	}
207 
208 	rcu_read_lock();
209 	neigh = dst_get_neighbour(dst);
210 	if (neigh) {
211 		int res = neigh_output(neigh, skb);
212 
213 		rcu_read_unlock();
214 		return res;
215 	}
216 	rcu_read_unlock();
217 
218 	if (net_ratelimit())
219 		printk(KERN_DEBUG "ip_finish_output2: No header cache and no neighbour!\n");
220 	kfree_skb(skb);
221 	return -EINVAL;
222 }
223 
224 static inline int ip_skb_dst_mtu(struct sk_buff *skb)
225 {
226 	struct inet_sock *inet = skb->sk ? inet_sk(skb->sk) : NULL;
227 
228 	return (inet && inet->pmtudisc == IP_PMTUDISC_PROBE) ?
229 	       skb_dst(skb)->dev->mtu : dst_mtu(skb_dst(skb));
230 }
231 
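/*
 * Note: GSO packets are not fragmented here; they are segmented into
 * MTU-sized packets further down the stack, so only oversized non-GSO
 * packets go through ip_fragment().
 */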
232 static int ip_finish_output(struct sk_buff *skb)
233 {
234 #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
235 	/* Policy lookup after SNAT yielded a new policy */
236 	if (skb_dst(skb)->xfrm != NULL) {
237 		IPCB(skb)->flags |= IPSKB_REROUTED;
238 		return dst_output(skb);
239 	}
240 #endif
241 	if (skb->len > ip_skb_dst_mtu(skb) && !skb_is_gso(skb))
242 		return ip_fragment(skb, ip_finish_output2);
243 	else
244 		return ip_finish_output2(skb);
245 }
246 
247 int ip_mc_output(struct sk_buff *skb)
248 {
249 	struct sock *sk = skb->sk;
250 	struct rtable *rt = skb_rtable(skb);
251 	struct net_device *dev = rt->dst.dev;
252 
253 	/*
254 	 *	If the indicated interface is up and running, send the packet.
255 	 */
256 	IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);
257 
258 	skb->dev = dev;
259 	skb->protocol = htons(ETH_P_IP);
260 
261 	/*
262 	 *	Multicasts are looped back for other local users
263 	 */
264 
265 	if (rt->rt_flags&RTCF_MULTICAST) {
266 		if (sk_mc_loop(sk)
267 #ifdef CONFIG_IP_MROUTE
268 		/* Small optimization: do not loop back non-local frames
269 		   that were returned after forwarding; they will be dropped
270 		   by ip_mr_input in any case.
271 		   Note that local frames are looped back to be delivered
272 		   to local recipients.
273 
274 		   This check is duplicated in ip_mr_input at the moment.
275 		 */
276 		    &&
277 		    ((rt->rt_flags & RTCF_LOCAL) ||
278 		     !(IPCB(skb)->flags & IPSKB_FORWARDED))
279 #endif
280 		   ) {
281 			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
282 			if (newskb)
283 				NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING,
284 					newskb, NULL, newskb->dev,
285 					ip_dev_loopback_xmit);
286 		}
287 
288 		/* Multicasts with ttl 0 must not go beyond the host */
289 
290 		if (ip_hdr(skb)->ttl == 0) {
291 			kfree_skb(skb);
292 			return 0;
293 		}
294 	}
295 
296 	if (rt->rt_flags&RTCF_BROADCAST) {
297 		struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
298 		if (newskb)
299 			NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, newskb,
300 				NULL, newskb->dev, ip_dev_loopback_xmit);
301 	}
302 
303 	return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL,
304 			    skb->dev, ip_finish_output,
305 			    !(IPCB(skb)->flags & IPSKB_REROUTED));
306 }
307 
308 int ip_output(struct sk_buff *skb)
309 {
310 	struct net_device *dev = skb_dst(skb)->dev;
311 
312 	IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);
313 
314 	skb->dev = dev;
315 	skb->protocol = htons(ETH_P_IP);
316 
317 	return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL, dev,
318 			    ip_finish_output,
319 			    !(IPCB(skb)->flags & IPSKB_REROUTED));
320 }
321 
322 int ip_queue_xmit(struct sk_buff *skb, struct flowi *fl)
323 {
324 	struct sock *sk = skb->sk;
325 	struct inet_sock *inet = inet_sk(sk);
326 	struct ip_options_rcu *inet_opt;
327 	struct flowi4 *fl4;
328 	struct rtable *rt;
329 	struct iphdr *iph;
330 	int res;
331 
332 	/* Skip all of this if the packet is already routed,
333 	 * e.g. by something like SCTP.
334 	 */
335 	rcu_read_lock();
336 	inet_opt = rcu_dereference(inet->inet_opt);
337 	fl4 = &fl->u.ip4;
338 	rt = skb_rtable(skb);
339 	if (rt != NULL)
340 		goto packet_routed;
341 
342 	/* Make sure we can route this packet. */
343 	rt = (struct rtable *)__sk_dst_check(sk, 0);
344 	if (rt == NULL) {
345 		__be32 daddr;
346 
347 		/* Use correct destination address if we have options. */
348 		daddr = inet->inet_daddr;
349 		if (inet_opt && inet_opt->opt.srr)
350 			daddr = inet_opt->opt.faddr;
351 
352 		/* If this fails, the transport layer's retransmit mechanism will
353 		 * keep trying until route appears or the connection times
354 		 * itself out.
355 		 */
356 		rt = ip_route_output_ports(sock_net(sk), fl4, sk,
357 					   daddr, inet->inet_saddr,
358 					   inet->inet_dport,
359 					   inet->inet_sport,
360 					   sk->sk_protocol,
361 					   RT_CONN_FLAGS(sk),
362 					   sk->sk_bound_dev_if);
363 		if (IS_ERR(rt))
364 			goto no_route;
365 		sk_setup_caps(sk, &rt->dst);
366 	}
367 	skb_dst_set_noref(skb, &rt->dst);
368 
369 packet_routed:
370 	if (inet_opt && inet_opt->opt.is_strictroute && fl4->daddr != rt->rt_gateway)
371 		goto no_route;
372 
373 	/* OK, we know where to send it, allocate and build IP header. */
374 	skb_push(skb, sizeof(struct iphdr) + (inet_opt ? inet_opt->opt.optlen : 0));
375 	skb_reset_network_header(skb);
376 	iph = ip_hdr(skb);
377 	*((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));
378 	if (ip_dont_fragment(sk, &rt->dst) && !skb->local_df)
379 		iph->frag_off = htons(IP_DF);
380 	else
381 		iph->frag_off = 0;
382 	iph->ttl      = ip_select_ttl(inet, &rt->dst);
383 	iph->protocol = sk->sk_protocol;
384 	iph->saddr    = fl4->saddr;
385 	iph->daddr    = fl4->daddr;
386 	/* Transport layer has set its header (skb->transport_header) itself. */
387 
388 	if (inet_opt && inet_opt->opt.optlen) {
389 		iph->ihl += inet_opt->opt.optlen >> 2;
390 		ip_options_build(skb, &inet_opt->opt, inet->inet_daddr, rt, 0);
391 	}
392 
393 	ip_select_ident_more(iph, &rt->dst, sk,
394 			     (skb_shinfo(skb)->gso_segs ?: 1) - 1);
395 
396 	skb->priority = sk->sk_priority;
397 	skb->mark = sk->sk_mark;
398 
399 	res = ip_local_out(skb);
400 	rcu_read_unlock();
401 	return res;
402 
403 no_route:
404 	rcu_read_unlock();
405 	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
406 	kfree_skb(skb);
407 	return -EHOSTUNREACH;
408 }
409 EXPORT_SYMBOL(ip_queue_xmit);
410 
411 
412 static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
413 {
414 	to->pkt_type = from->pkt_type;
415 	to->priority = from->priority;
416 	to->protocol = from->protocol;
417 	skb_dst_drop(to);
418 	skb_dst_copy(to, from);
419 	to->dev = from->dev;
420 	to->mark = from->mark;
421 
422 	/* Copy the flags to each fragment. */
423 	IPCB(to)->flags = IPCB(from)->flags;
424 
425 #ifdef CONFIG_NET_SCHED
426 	to->tc_index = from->tc_index;
427 #endif
428 	nf_copy(to, from);
429 #if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
430     defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
431 	to->nf_trace = from->nf_trace;
432 #endif
433 #if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
434 	to->ipvs_property = from->ipvs_property;
435 #endif
436 	skb_copy_secmark(to, from);
437 }
438 
439 /*
440  *	This IP datagram is too large to be sent in one piece.  Break it up into
441  *	smaller pieces (each consisting of an IP header plus a block of
442  *	the data of the original IP datagram) that will still fit in a
443  *	single device frame, and queue each such frame for sending.
444  */
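/*
 * Illustrative example (assuming a 1500-byte MTU and a 20-byte header):
 * each non-final fragment carries 1480 data bytes; since frag_off is
 * expressed in 8-byte units, successive fragments use offsets 0, 185,
 * 370, ... and all but the last have IP_MF set.
 */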
445 
446 int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
447 {
448 	struct iphdr *iph;
449 	int ptr;
450 	struct net_device *dev;
451 	struct sk_buff *skb2;
452 	unsigned int mtu, hlen, left, len, ll_rs;
453 	int offset;
454 	__be16 not_last_frag;
455 	struct rtable *rt = skb_rtable(skb);
456 	int err = 0;
457 
458 	dev = rt->dst.dev;
459 
460 	/*
461 	 *	Point into the IP datagram header.
462 	 */
463 
464 	iph = ip_hdr(skb);
465 
466 	if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) {
467 		IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
468 		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
469 			  htonl(ip_skb_dst_mtu(skb)));
470 		kfree_skb(skb);
471 		return -EMSGSIZE;
472 	}
473 
474 	/*
475 	 *	Setup starting values.
476 	 */
477 
478 	hlen = iph->ihl * 4;
479 	mtu = dst_mtu(&rt->dst) - hlen;	/* Size of data space */
480 #ifdef CONFIG_BRIDGE_NETFILTER
481 	if (skb->nf_bridge)
482 		mtu -= nf_bridge_mtu_reduction(skb);
483 #endif
484 	IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE;
485 
486 	/* When frag_list is given, use it. First, check its validity:
487 	 * some transformers could create a wrong frag_list or break an
488 	 * existing one; it is not prohibited. In this case fall back to copying.
489 	 *
490 	 * LATER: this step can be merged into the real generation of fragments;
491 	 * we can switch to copying when we see the first bad fragment.
492 	 */
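	/* In this fast path, each skb already chained on frag_list becomes a
	 * fragment in place: a copy of the IP header is pushed into its
	 * headroom, so no data needs to be copied.  If the geometry does not
	 * fit, we fall back to the copying slow path below.
	 */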
493 	if (skb_has_frag_list(skb)) {
494 		struct sk_buff *frag, *frag2;
495 		int first_len = skb_pagelen(skb);
496 
497 		if (first_len - hlen > mtu ||
498 		    ((first_len - hlen) & 7) ||
499 		    ip_is_fragment(iph) ||
500 		    skb_cloned(skb))
501 			goto slow_path;
502 
503 		skb_walk_frags(skb, frag) {
504 			/* Correct geometry. */
505 			if (frag->len > mtu ||
506 			    ((frag->len & 7) && frag->next) ||
507 			    skb_headroom(frag) < hlen)
508 				goto slow_path_clean;
509 
510 			/* Partially cloned skb? */
511 			if (skb_shared(frag))
512 				goto slow_path_clean;
513 
514 			BUG_ON(frag->sk);
515 			if (skb->sk) {
516 				frag->sk = skb->sk;
517 				frag->destructor = sock_wfree;
518 			}
519 			skb->truesize -= frag->truesize;
520 		}
521 
522 		/* Everything is OK. Generate! */
523 
524 		err = 0;
525 		offset = 0;
526 		frag = skb_shinfo(skb)->frag_list;
527 		skb_frag_list_init(skb);
528 		skb->data_len = first_len - skb_headlen(skb);
529 		skb->len = first_len;
530 		iph->tot_len = htons(first_len);
531 		iph->frag_off = htons(IP_MF);
532 		ip_send_check(iph);
533 
534 		for (;;) {
535 			/* Prepare the header of the next frame
536 			 * before the previous one goes down. */
537 			if (frag) {
538 				frag->ip_summed = CHECKSUM_NONE;
539 				skb_reset_transport_header(frag);
540 				__skb_push(frag, hlen);
541 				skb_reset_network_header(frag);
542 				memcpy(skb_network_header(frag), iph, hlen);
543 				iph = ip_hdr(frag);
544 				iph->tot_len = htons(frag->len);
545 				ip_copy_metadata(frag, skb);
546 				if (offset == 0)
547 					ip_options_fragment(frag);
548 				offset += skb->len - hlen;
549 				iph->frag_off = htons(offset>>3);
550 				if (frag->next != NULL)
551 					iph->frag_off |= htons(IP_MF);
552 				/* Ready, complete checksum */
553 				ip_send_check(iph);
554 			}
555 
556 			err = output(skb);
557 
558 			if (!err)
559 				IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
560 			if (err || !frag)
561 				break;
562 
563 			skb = frag;
564 			frag = skb->next;
565 			skb->next = NULL;
566 		}
567 
568 		if (err == 0) {
569 			IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
570 			return 0;
571 		}
572 
573 		while (frag) {
574 			skb = frag->next;
575 			kfree_skb(frag);
576 			frag = skb;
577 		}
578 		IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
579 		return err;
580 
581 slow_path_clean:
582 		skb_walk_frags(skb, frag2) {
583 			if (frag2 == frag)
584 				break;
585 			frag2->sk = NULL;
586 			frag2->destructor = NULL;
587 			skb->truesize += frag2->truesize;
588 		}
589 	}
590 
591 slow_path:
592 	left = skb->len - hlen;		/* Space per frame */
593 	ptr = hlen;		/* Where to start from */
594 
595 	/* for bridged IP traffic encapsulated inside e.g. a vlan header,
596 	 * we need to make room for the encapsulating header
597 	 */
598 	ll_rs = LL_RESERVED_SPACE_EXTRA(rt->dst.dev, nf_bridge_pad(skb));
599 
600 	/*
601 	 *	Fragment the datagram.
602 	 */
603 
604 	offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
605 	not_last_frag = iph->frag_off & htons(IP_MF);
606 
607 	/*
608 	 *	Keep copying data until we run out.
609 	 */
610 
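	/* Fragment offsets are carried in units of 8 bytes, so every fragment
	 * except the last must carry a payload that is a multiple of 8 bytes;
	 * that is what the "len &= ~7" below enforces.
	 */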
611 	while (left > 0) {
612 		len = left;
613 		/* IF: it doesn't fit, use 'mtu' - the data space left */
614 		if (len > mtu)
615 			len = mtu;
616 		/* IF: we are not sending up to and including the packet end
617 		   then align the next start on an eight byte boundary */
618 		if (len < left)	{
619 			len &= ~7;
620 		}
621 		/*
622 		 *	Allocate buffer.
623 		 */
624 
625 		if ((skb2 = alloc_skb(len+hlen+ll_rs, GFP_ATOMIC)) == NULL) {
626 			NETDEBUG(KERN_INFO "IP: frag: no memory for new fragment!\n");
627 			err = -ENOMEM;
628 			goto fail;
629 		}
630 
631 		/*
632 		 *	Set up data on packet
633 		 */
634 
635 		ip_copy_metadata(skb2, skb);
636 		skb_reserve(skb2, ll_rs);
637 		skb_put(skb2, len + hlen);
638 		skb_reset_network_header(skb2);
639 		skb2->transport_header = skb2->network_header + hlen;
640 
641 		/*
642 		 *	Charge the memory for the fragment to any owner
643 		 *	it might possess
644 		 */
645 
646 		if (skb->sk)
647 			skb_set_owner_w(skb2, skb->sk);
648 
649 		/*
650 		 *	Copy the packet header into the new buffer.
651 		 */
652 
653 		skb_copy_from_linear_data(skb, skb_network_header(skb2), hlen);
654 
655 		/*
656 		 *	Copy a block of the IP datagram.
657 		 */
658 		if (skb_copy_bits(skb, ptr, skb_transport_header(skb2), len))
659 			BUG();
660 		left -= len;
661 
662 		/*
663 		 *	Fill in the new header fields.
664 		 */
665 		iph = ip_hdr(skb2);
666 		iph->frag_off = htons((offset >> 3));
667 
668 		/* ANK: dirty, but effective trick. Upgrade options only if
669 		 * the segment to be fragmented was THE FIRST (otherwise,
670 		 * options are already fixed) and make it ONCE
671 		 * on the initial skb, so that all the following fragments
672 		 * will inherit fixed options.
673 		 */
674 		if (offset == 0)
675 			ip_options_fragment(skb);
676 
677 		/*
678 		 *	Added AC : If we are fragmenting a fragment that's not the
679 		 *		   last fragment then keep the MF bit set on each
680 		 */
681 		if (left > 0 || not_last_frag)
682 			iph->frag_off |= htons(IP_MF);
683 		ptr += len;
684 		offset += len;
685 
686 		/*
687 		 *	Put this fragment into the sending queue.
688 		 */
689 		iph->tot_len = htons(len + hlen);
690 
691 		ip_send_check(iph);
692 
693 		err = output(skb2);
694 		if (err)
695 			goto fail;
696 
697 		IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
698 	}
699 	kfree_skb(skb);
700 	IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
701 	return err;
702 
703 fail:
704 	kfree_skb(skb);
705 	IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
706 	return err;
707 }
708 EXPORT_SYMBOL(ip_fragment);
709 
710 int
711 ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb)
712 {
713 	struct iovec *iov = from;
714 
715 	if (skb->ip_summed == CHECKSUM_PARTIAL) {
716 		if (memcpy_fromiovecend(to, iov, offset, len) < 0)
717 			return -EFAULT;
718 	} else {
719 		__wsum csum = 0;
720 		if (csum_partial_copy_fromiovecend(to, iov, offset, len, &csum) < 0)
721 			return -EFAULT;
722 		skb->csum = csum_block_add(skb->csum, csum, odd);
723 	}
724 	return 0;
725 }
726 EXPORT_SYMBOL(ip_generic_getfrag);
727 
728 static inline __wsum
729 csum_page(struct page *page, int offset, int copy)
730 {
731 	char *kaddr;
732 	__wsum csum;
733 	kaddr = kmap(page);
734 	csum = csum_partial(kaddr + offset, copy, 0);
735 	kunmap(page);
736 	return csum;
737 }
738 
739 static inline int ip_ufo_append_data(struct sock *sk,
740 			struct sk_buff_head *queue,
741 			int getfrag(void *from, char *to, int offset, int len,
742 			       int odd, struct sk_buff *skb),
743 			void *from, int length, int hh_len, int fragheaderlen,
744 			int transhdrlen, int maxfraglen, unsigned int flags)
745 {
746 	struct sk_buff *skb;
747 	int err;
748 
749 	/* There is support for UDP fragmentation offload by the network
750 	 * device, so create one single skb containing the complete
751 	 * UDP datagram
752 	 */
753 	if ((skb = skb_peek_tail(queue)) == NULL) {
754 		skb = sock_alloc_send_skb(sk,
755 			hh_len + fragheaderlen + transhdrlen + 20,
756 			(flags & MSG_DONTWAIT), &err);
757 
758 		if (skb == NULL)
759 			return err;
760 
761 		/* reserve space for Hardware header */
762 		skb_reserve(skb, hh_len);
763 
764 		/* create space for UDP/IP header */
765 		skb_put(skb, fragheaderlen + transhdrlen);
766 
767 		/* initialize network header pointer */
768 		skb_reset_network_header(skb);
769 
770 		/* initialize protocol header pointer */
771 		skb->transport_header = skb->network_header + fragheaderlen;
772 
773 		skb->ip_summed = CHECKSUM_PARTIAL;
774 		skb->csum = 0;
775 
776 		/* specify the length of each IP datagram fragment */
777 		skb_shinfo(skb)->gso_size = maxfraglen - fragheaderlen;
778 		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
779 		__skb_queue_tail(queue, skb);
780 	}
781 
782 	return skb_append_datato_frags(sk, skb, getfrag, from,
783 				       (length - transhdrlen));
784 }
785 
786 static int __ip_append_data(struct sock *sk,
787 			    struct flowi4 *fl4,
788 			    struct sk_buff_head *queue,
789 			    struct inet_cork *cork,
790 			    int getfrag(void *from, char *to, int offset,
791 					int len, int odd, struct sk_buff *skb),
792 			    void *from, int length, int transhdrlen,
793 			    unsigned int flags)
794 {
795 	struct inet_sock *inet = inet_sk(sk);
796 	struct sk_buff *skb;
797 
798 	struct ip_options *opt = cork->opt;
799 	int hh_len;
800 	int exthdrlen;
801 	int mtu;
802 	int copy;
803 	int err;
804 	int offset = 0;
805 	unsigned int maxfraglen, fragheaderlen;
806 	int csummode = CHECKSUM_NONE;
807 	struct rtable *rt = (struct rtable *)cork->dst;
808 
809 	skb = skb_peek_tail(queue);
810 
811 	exthdrlen = !skb ? rt->dst.header_len : 0;
812 	mtu = cork->fragsize;
813 
814 	hh_len = LL_RESERVED_SPACE(rt->dst.dev);
815 
816 	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
817 	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
818 
819 	if (cork->length + length > 0xFFFF - fragheaderlen) {
820 		ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport,
821 			       mtu-exthdrlen);
822 		return -EMSGSIZE;
823 	}
824 
825 	/*
826 	 * transhdrlen > 0 means that this is the first fragment and we wish
827 	 * it not to be fragmented later.
828 	 */
829 	if (transhdrlen &&
830 	    length + fragheaderlen <= mtu &&
831 	    rt->dst.dev->features & NETIF_F_V4_CSUM &&
832 	    !exthdrlen)
833 		csummode = CHECKSUM_PARTIAL;
834 
835 	cork->length += length;
836 	if (((length > mtu) || (skb && skb_is_gso(skb))) &&
837 	    (sk->sk_protocol == IPPROTO_UDP) &&
838 	    (rt->dst.dev->features & NETIF_F_UFO) && !rt->dst.header_len) {
839 		err = ip_ufo_append_data(sk, queue, getfrag, from, length,
840 					 hh_len, fragheaderlen, transhdrlen,
841 					 maxfraglen, flags);
842 		if (err)
843 			goto error;
844 		return 0;
845 	}
846 
847 	/* So, what's going on in the loop below?
848 	 *
849 	 * We use the calculated fragment length to generate a chain of skbs;
850 	 * each segment is an IP fragment ready to be sent to the network once
851 	 * an appropriate IP header has been added.
852 	 */
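	/* Here mtu is cork->fragsize, i.e. the path (or device) MTU recorded
	 * in ip_setup_cork(), and maxfraglen rounds the per-fragment payload
	 * down to a multiple of 8 bytes as required for fragment offsets.
	 */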
853 
854 	if (!skb)
855 		goto alloc_new_skb;
856 
857 	while (length > 0) {
858 		/* Check if the remaining data fits into current packet. */
859 		copy = mtu - skb->len;
860 		if (copy < length)
861 			copy = maxfraglen - skb->len;
862 		if (copy <= 0) {
863 			char *data;
864 			unsigned int datalen;
865 			unsigned int fraglen;
866 			unsigned int fraggap;
867 			unsigned int alloclen;
868 			struct sk_buff *skb_prev;
869 alloc_new_skb:
870 			skb_prev = skb;
871 			if (skb_prev)
872 				fraggap = skb_prev->len - maxfraglen;
873 			else
874 				fraggap = 0;
875 
876 			/*
877 			 * If remaining data exceeds the mtu,
878 			 * we know we need more fragment(s).
879 			 */
880 			datalen = length + fraggap;
881 			if (datalen > mtu - fragheaderlen)
882 				datalen = maxfraglen - fragheaderlen;
883 			fraglen = datalen + fragheaderlen;
884 
885 			if ((flags & MSG_MORE) &&
886 			    !(rt->dst.dev->features&NETIF_F_SG))
887 				alloclen = mtu;
888 			else
889 				alloclen = fraglen;
890 
891 			alloclen += exthdrlen;
892 
893 			/* The last fragment gets additional space at tail.
894 			 * Note, with MSG_MORE we overallocate on fragments,
895 			 * because we have no idea what fragment will be
896 			 * the last.
897 			 */
898 			if (datalen == length + fraggap)
899 				alloclen += rt->dst.trailer_len;
900 
901 			if (transhdrlen) {
902 				skb = sock_alloc_send_skb(sk,
903 						alloclen + hh_len + 15,
904 						(flags & MSG_DONTWAIT), &err);
905 			} else {
906 				skb = NULL;
907 				if (atomic_read(&sk->sk_wmem_alloc) <=
908 				    2 * sk->sk_sndbuf)
909 					skb = sock_wmalloc(sk,
910 							   alloclen + hh_len + 15, 1,
911 							   sk->sk_allocation);
912 				if (unlikely(skb == NULL))
913 					err = -ENOBUFS;
914 				else
915 					/* only the initial fragment is
916 					   time stamped */
917 					cork->tx_flags = 0;
918 			}
919 			if (skb == NULL)
920 				goto error;
921 
922 			/*
923 			 *	Fill in the control structures
924 			 */
925 			skb->ip_summed = csummode;
926 			skb->csum = 0;
927 			skb_reserve(skb, hh_len);
928 			skb_shinfo(skb)->tx_flags = cork->tx_flags;
929 
930 			/*
931 			 *	Find where to start putting bytes.
932 			 */
933 			data = skb_put(skb, fraglen + exthdrlen);
934 			skb_set_network_header(skb, exthdrlen);
935 			skb->transport_header = (skb->network_header +
936 						 fragheaderlen);
937 			data += fragheaderlen + exthdrlen;
938 
939 			if (fraggap) {
940 				skb->csum = skb_copy_and_csum_bits(
941 					skb_prev, maxfraglen,
942 					data + transhdrlen, fraggap, 0);
943 				skb_prev->csum = csum_sub(skb_prev->csum,
944 							  skb->csum);
945 				data += fraggap;
946 				pskb_trim_unique(skb_prev, maxfraglen);
947 			}
948 
949 			copy = datalen - transhdrlen - fraggap;
950 			if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
951 				err = -EFAULT;
952 				kfree_skb(skb);
953 				goto error;
954 			}
955 
956 			offset += copy;
957 			length -= datalen - fraggap;
958 			transhdrlen = 0;
959 			exthdrlen = 0;
960 			csummode = CHECKSUM_NONE;
961 
962 			/*
963 			 * Put the packet on the pending queue.
964 			 */
965 			__skb_queue_tail(queue, skb);
966 			continue;
967 		}
968 
969 		if (copy > length)
970 			copy = length;
971 
972 		if (!(rt->dst.dev->features&NETIF_F_SG)) {
973 			unsigned int off;
974 
975 			off = skb->len;
976 			if (getfrag(from, skb_put(skb, copy),
977 					offset, copy, off, skb) < 0) {
978 				__skb_trim(skb, off);
979 				err = -EFAULT;
980 				goto error;
981 			}
982 		} else {
983 			int i = skb_shinfo(skb)->nr_frags;
984 			skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
985 			struct page *page = cork->page;
986 			int off = cork->off;
987 			unsigned int left;
988 
989 			if (page && (left = PAGE_SIZE - off) > 0) {
990 				if (copy >= left)
991 					copy = left;
992 				if (page != frag->page) {
993 					if (i == MAX_SKB_FRAGS) {
994 						err = -EMSGSIZE;
995 						goto error;
996 					}
997 					get_page(page);
998 					skb_fill_page_desc(skb, i, page, off, 0);
999 					frag = &skb_shinfo(skb)->frags[i];
1000 				}
1001 			} else if (i < MAX_SKB_FRAGS) {
1002 				if (copy > PAGE_SIZE)
1003 					copy = PAGE_SIZE;
1004 				page = alloc_pages(sk->sk_allocation, 0);
1005 				if (page == NULL)  {
1006 					err = -ENOMEM;
1007 					goto error;
1008 				}
1009 				cork->page = page;
1010 				cork->off = 0;
1011 
1012 				skb_fill_page_desc(skb, i, page, 0, 0);
1013 				frag = &skb_shinfo(skb)->frags[i];
1014 			} else {
1015 				err = -EMSGSIZE;
1016 				goto error;
1017 			}
1018 			if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
1019 				err = -EFAULT;
1020 				goto error;
1021 			}
1022 			cork->off += copy;
1023 			frag->size += copy;
1024 			skb->len += copy;
1025 			skb->data_len += copy;
1026 			skb->truesize += copy;
1027 			atomic_add(copy, &sk->sk_wmem_alloc);
1028 		}
1029 		offset += copy;
1030 		length -= copy;
1031 	}
1032 
1033 	return 0;
1034 
1035 error:
1036 	cork->length -= length;
1037 	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
1038 	return err;
1039 }
1040 
1041 static int ip_setup_cork(struct sock *sk, struct inet_cork *cork,
1042 			 struct ipcm_cookie *ipc, struct rtable **rtp)
1043 {
1044 	struct inet_sock *inet = inet_sk(sk);
1045 	struct ip_options_rcu *opt;
1046 	struct rtable *rt;
1047 
1048 	/*
1049 	 * setup for corking.
1050 	 */
1051 	opt = ipc->opt;
1052 	if (opt) {
1053 		if (cork->opt == NULL) {
1054 			cork->opt = kmalloc(sizeof(struct ip_options) + 40,
1055 					    sk->sk_allocation);
1056 			if (unlikely(cork->opt == NULL))
1057 				return -ENOBUFS;
1058 		}
1059 		memcpy(cork->opt, &opt->opt, sizeof(struct ip_options) + opt->opt.optlen);
1060 		cork->flags |= IPCORK_OPT;
1061 		cork->addr = ipc->addr;
1062 	}
1063 	rt = *rtp;
1064 	if (unlikely(!rt))
1065 		return -EFAULT;
1066 	/*
1067 	 * We steal the reference to this route; the caller should not release it
1068 	 */
1069 	*rtp = NULL;
1070 	cork->fragsize = inet->pmtudisc == IP_PMTUDISC_PROBE ?
1071 			 rt->dst.dev->mtu : dst_mtu(&rt->dst);
1072 	cork->dst = &rt->dst;
1073 	cork->length = 0;
1074 	cork->tx_flags = ipc->tx_flags;
1075 	cork->page = NULL;
1076 	cork->off = 0;
1077 
1078 	return 0;
1079 }
1080 
1081 /*
1082  *	ip_append_data() and ip_append_page() can make one large IP datagram
1083  *	from many pieces of data. Each piece will be held on the socket
1084  *	until ip_push_pending_frames() is called. Each piece can be a page
1085  *	or non-page data.
1086  *
1087  *	Besides UDP, other transport protocols - e.g. raw sockets - can
1088  *	potentially use this interface.
1089  *
1090  *	LATER: length must be adjusted by pad at tail, when it is required.
1091  */
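/*
 * A typical corked send is one or more ip_append_data()/ip_append_page()
 * calls followed by ip_push_pending_frames() to build and transmit the
 * datagram, or ip_flush_pending_frames() to discard the queued data.
 */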
1092 int ip_append_data(struct sock *sk, struct flowi4 *fl4,
1093 		   int getfrag(void *from, char *to, int offset, int len,
1094 			       int odd, struct sk_buff *skb),
1095 		   void *from, int length, int transhdrlen,
1096 		   struct ipcm_cookie *ipc, struct rtable **rtp,
1097 		   unsigned int flags)
1098 {
1099 	struct inet_sock *inet = inet_sk(sk);
1100 	int err;
1101 
1102 	if (flags&MSG_PROBE)
1103 		return 0;
1104 
1105 	if (skb_queue_empty(&sk->sk_write_queue)) {
1106 		err = ip_setup_cork(sk, &inet->cork.base, ipc, rtp);
1107 		if (err)
1108 			return err;
1109 	} else {
1110 		transhdrlen = 0;
1111 	}
1112 
1113 	return __ip_append_data(sk, fl4, &sk->sk_write_queue, &inet->cork.base, getfrag,
1114 				from, length, transhdrlen, flags);
1115 }
1116 
1117 ssize_t	ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page,
1118 		       int offset, size_t size, int flags)
1119 {
1120 	struct inet_sock *inet = inet_sk(sk);
1121 	struct sk_buff *skb;
1122 	struct rtable *rt;
1123 	struct ip_options *opt = NULL;
1124 	struct inet_cork *cork;
1125 	int hh_len;
1126 	int mtu;
1127 	int len;
1128 	int err;
1129 	unsigned int maxfraglen, fragheaderlen, fraggap;
1130 
1131 	if (inet->hdrincl)
1132 		return -EPERM;
1133 
1134 	if (flags&MSG_PROBE)
1135 		return 0;
1136 
1137 	if (skb_queue_empty(&sk->sk_write_queue))
1138 		return -EINVAL;
1139 
1140 	cork = &inet->cork.base;
1141 	rt = (struct rtable *)cork->dst;
1142 	if (cork->flags & IPCORK_OPT)
1143 		opt = cork->opt;
1144 
1145 	if (!(rt->dst.dev->features&NETIF_F_SG))
1146 		return -EOPNOTSUPP;
1147 
1148 	hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1149 	mtu = cork->fragsize;
1150 
1151 	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
1152 	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
1153 
1154 	if (cork->length + size > 0xFFFF - fragheaderlen) {
1155 		ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport, mtu);
1156 		return -EMSGSIZE;
1157 	}
1158 
1159 	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
1160 		return -EINVAL;
1161 
1162 	cork->length += size;
1163 	if ((size + skb->len > mtu) &&
1164 	    (sk->sk_protocol == IPPROTO_UDP) &&
1165 	    (rt->dst.dev->features & NETIF_F_UFO)) {
1166 		skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
1167 		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
1168 	}
1169 
1170 
1171 	while (size > 0) {
1172 		int i;
1173 
1174 		if (skb_is_gso(skb))
1175 			len = size;
1176 		else {
1177 
1178 			/* Check if the remaining data fits into current packet. */
1179 			len = mtu - skb->len;
1180 			if (len < size)
1181 				len = maxfraglen - skb->len;
1182 		}
1183 		if (len <= 0) {
1184 			struct sk_buff *skb_prev;
1185 			int alloclen;
1186 
1187 			skb_prev = skb;
1188 			fraggap = skb_prev->len - maxfraglen;
1189 
1190 			alloclen = fragheaderlen + hh_len + fraggap + 15;
1191 			skb = sock_wmalloc(sk, alloclen, 1, sk->sk_allocation);
1192 			if (unlikely(!skb)) {
1193 				err = -ENOBUFS;
1194 				goto error;
1195 			}
1196 
1197 			/*
1198 			 *	Fill in the control structures
1199 			 */
1200 			skb->ip_summed = CHECKSUM_NONE;
1201 			skb->csum = 0;
1202 			skb_reserve(skb, hh_len);
1203 
1204 			/*
1205 			 *	Find where to start putting bytes.
1206 			 */
1207 			skb_put(skb, fragheaderlen + fraggap);
1208 			skb_reset_network_header(skb);
1209 			skb->transport_header = (skb->network_header +
1210 						 fragheaderlen);
1211 			if (fraggap) {
1212 				skb->csum = skb_copy_and_csum_bits(skb_prev,
1213 								   maxfraglen,
1214 						    skb_transport_header(skb),
1215 								   fraggap, 0);
1216 				skb_prev->csum = csum_sub(skb_prev->csum,
1217 							  skb->csum);
1218 				pskb_trim_unique(skb_prev, maxfraglen);
1219 			}
1220 
1221 			/*
1222 			 * Put the packet on the pending queue.
1223 			 */
1224 			__skb_queue_tail(&sk->sk_write_queue, skb);
1225 			continue;
1226 		}
1227 
1228 		i = skb_shinfo(skb)->nr_frags;
1229 		if (len > size)
1230 			len = size;
1231 		if (skb_can_coalesce(skb, i, page, offset)) {
1232 			skb_shinfo(skb)->frags[i-1].size += len;
1233 		} else if (i < MAX_SKB_FRAGS) {
1234 			get_page(page);
1235 			skb_fill_page_desc(skb, i, page, offset, len);
1236 		} else {
1237 			err = -EMSGSIZE;
1238 			goto error;
1239 		}
1240 
1241 		if (skb->ip_summed == CHECKSUM_NONE) {
1242 			__wsum csum;
1243 			csum = csum_page(page, offset, len);
1244 			skb->csum = csum_block_add(skb->csum, csum, skb->len);
1245 		}
1246 
1247 		skb->len += len;
1248 		skb->data_len += len;
1249 		skb->truesize += len;
1250 		atomic_add(len, &sk->sk_wmem_alloc);
1251 		offset += len;
1252 		size -= len;
1253 	}
1254 	return 0;
1255 
1256 error:
1257 	cork->length -= size;
1258 	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
1259 	return err;
1260 }
1261 
1262 static void ip_cork_release(struct inet_cork *cork)
1263 {
1264 	cork->flags &= ~IPCORK_OPT;
1265 	kfree(cork->opt);
1266 	cork->opt = NULL;
1267 	dst_release(cork->dst);
1268 	cork->dst = NULL;
1269 }
1270 
1271 /*
1272  *	Combine all pending IP fragments on the socket into one IP datagram
1273  *	and push them out.
1274  */
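/*
 * The remaining queued skbs are chained onto the first one via frag_list,
 * and a single IP header is written at the head of that first skb.
 */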
1275 struct sk_buff *__ip_make_skb(struct sock *sk,
1276 			      struct flowi4 *fl4,
1277 			      struct sk_buff_head *queue,
1278 			      struct inet_cork *cork)
1279 {
1280 	struct sk_buff *skb, *tmp_skb;
1281 	struct sk_buff **tail_skb;
1282 	struct inet_sock *inet = inet_sk(sk);
1283 	struct net *net = sock_net(sk);
1284 	struct ip_options *opt = NULL;
1285 	struct rtable *rt = (struct rtable *)cork->dst;
1286 	struct iphdr *iph;
1287 	__be16 df = 0;
1288 	__u8 ttl;
1289 
1290 	if ((skb = __skb_dequeue(queue)) == NULL)
1291 		goto out;
1292 	tail_skb = &(skb_shinfo(skb)->frag_list);
1293 
1294 	/* move skb->data to ip header from ext header */
1295 	if (skb->data < skb_network_header(skb))
1296 		__skb_pull(skb, skb_network_offset(skb));
1297 	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
1298 		__skb_pull(tmp_skb, skb_network_header_len(skb));
1299 		*tail_skb = tmp_skb;
1300 		tail_skb = &(tmp_skb->next);
1301 		skb->len += tmp_skb->len;
1302 		skb->data_len += tmp_skb->len;
1303 		skb->truesize += tmp_skb->truesize;
1304 		tmp_skb->destructor = NULL;
1305 		tmp_skb->sk = NULL;
1306 	}
1307 
1308 	/* Unless the user demanded real pmtu discovery (IP_PMTUDISC_DO), we allow
1309 	 * the frame generated here to be fragmented. No matter how transforms
1310 	 * change the size of the packet, it will get out.
1311 	 */
1312 	if (inet->pmtudisc < IP_PMTUDISC_DO)
1313 		skb->local_df = 1;
1314 
1315 	/* DF bit is set when we want to see DF on outgoing frames.
1316 	 * If local_df is set too, we still allow this frame to be fragmented
1317 	 * locally. */
1318 	if (inet->pmtudisc >= IP_PMTUDISC_DO ||
1319 	    (skb->len <= dst_mtu(&rt->dst) &&
1320 	     ip_dont_fragment(sk, &rt->dst)))
1321 		df = htons(IP_DF);
1322 
1323 	if (cork->flags & IPCORK_OPT)
1324 		opt = cork->opt;
1325 
1326 	if (rt->rt_type == RTN_MULTICAST)
1327 		ttl = inet->mc_ttl;
1328 	else
1329 		ttl = ip_select_ttl(inet, &rt->dst);
1330 
1331 	iph = (struct iphdr *)skb->data;
1332 	iph->version = 4;
1333 	iph->ihl = 5;
1334 	iph->tos = inet->tos;
1335 	iph->frag_off = df;
1336 	ip_select_ident(iph, &rt->dst, sk);
1337 	iph->ttl = ttl;
1338 	iph->protocol = sk->sk_protocol;
1339 	iph->saddr = fl4->saddr;
1340 	iph->daddr = fl4->daddr;
1341 
1342 	if (opt) {
1343 		iph->ihl += opt->optlen>>2;
1344 		ip_options_build(skb, opt, cork->addr, rt, 0);
1345 	}
1346 
1347 	skb->priority = sk->sk_priority;
1348 	skb->mark = sk->sk_mark;
1349 	/*
1350 	 * Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec
1351 	 * on dst refcount
1352 	 */
1353 	cork->dst = NULL;
1354 	skb_dst_set(skb, &rt->dst);
1355 
1356 	if (iph->protocol == IPPROTO_ICMP)
1357 		icmp_out_count(net, ((struct icmphdr *)
1358 			skb_transport_header(skb))->type);
1359 
1360 	ip_cork_release(cork);
1361 out:
1362 	return skb;
1363 }
1364 
1365 int ip_send_skb(struct sk_buff *skb)
1366 {
1367 	struct net *net = sock_net(skb->sk);
1368 	int err;
1369 
1370 	err = ip_local_out(skb);
1371 	if (err) {
1372 		if (err > 0)
1373 			err = net_xmit_errno(err);
1374 		if (err)
1375 			IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS);
1376 	}
1377 
1378 	return err;
1379 }
1380 
1381 int ip_push_pending_frames(struct sock *sk, struct flowi4 *fl4)
1382 {
1383 	struct sk_buff *skb;
1384 
1385 	skb = ip_finish_skb(sk, fl4);
1386 	if (!skb)
1387 		return 0;
1388 
1389 	/* Netfilter gets the whole, not yet fragmented skb. */
1390 	return ip_send_skb(skb);
1391 }
1392 
1393 /*
1394  *	Throw away all pending data on the socket.
1395  */
1396 static void __ip_flush_pending_frames(struct sock *sk,
1397 				      struct sk_buff_head *queue,
1398 				      struct inet_cork *cork)
1399 {
1400 	struct sk_buff *skb;
1401 
1402 	while ((skb = __skb_dequeue_tail(queue)) != NULL)
1403 		kfree_skb(skb);
1404 
1405 	ip_cork_release(cork);
1406 }
1407 
1408 void ip_flush_pending_frames(struct sock *sk)
1409 {
1410 	__ip_flush_pending_frames(sk, &sk->sk_write_queue, &inet_sk(sk)->cork.base);
1411 }
1412 
1413 struct sk_buff *ip_make_skb(struct sock *sk,
1414 			    struct flowi4 *fl4,
1415 			    int getfrag(void *from, char *to, int offset,
1416 					int len, int odd, struct sk_buff *skb),
1417 			    void *from, int length, int transhdrlen,
1418 			    struct ipcm_cookie *ipc, struct rtable **rtp,
1419 			    unsigned int flags)
1420 {
1421 	struct inet_cork cork;
1422 	struct sk_buff_head queue;
1423 	int err;
1424 
1425 	if (flags & MSG_PROBE)
1426 		return NULL;
1427 
1428 	__skb_queue_head_init(&queue);
1429 
1430 	cork.flags = 0;
1431 	cork.addr = 0;
1432 	cork.opt = NULL;
1433 	err = ip_setup_cork(sk, &cork, ipc, rtp);
1434 	if (err)
1435 		return ERR_PTR(err);
1436 
1437 	err = __ip_append_data(sk, fl4, &queue, &cork, getfrag,
1438 			       from, length, transhdrlen, flags);
1439 	if (err) {
1440 		__ip_flush_pending_frames(sk, &queue, &cork);
1441 		return ERR_PTR(err);
1442 	}
1443 
1444 	return __ip_make_skb(sk, fl4, &queue, &cork);
1445 }
1446 
1447 /*
1448  *	Fetch data from kernel space and fill in checksum if needed.
1449  */
1450 static int ip_reply_glue_bits(void *dptr, char *to, int offset,
1451 			      int len, int odd, struct sk_buff *skb)
1452 {
1453 	__wsum csum;
1454 
1455 	csum = csum_partial_copy_nocheck(dptr+offset, to, len, 0);
1456 	skb->csum = csum_block_add(skb->csum, csum, odd);
1457 	return 0;
1458 }
1459 
1460 /*
1461  *	Generic function to send a packet as reply to another packet.
1462  *	Used to send TCP resets so far. ICMP should use this function too.
1463  *
1464  *	Should run single threaded per socket because it uses the sock
1465  *     	structure to pass arguments.
1466  */
1467 void ip_send_reply(struct sock *sk, struct sk_buff *skb, __be32 daddr,
1468 		   struct ip_reply_arg *arg, unsigned int len)
1469 {
1470 	struct inet_sock *inet = inet_sk(sk);
1471 	struct ip_options_data replyopts;
1472 	struct ipcm_cookie ipc;
1473 	struct flowi4 fl4;
1474 	struct rtable *rt = skb_rtable(skb);
1475 
1476 	if (ip_options_echo(&replyopts.opt.opt, skb))
1477 		return;
1478 
1479 	ipc.addr = daddr;
1480 	ipc.opt = NULL;
1481 	ipc.tx_flags = 0;
1482 
1483 	if (replyopts.opt.opt.optlen) {
1484 		ipc.opt = &replyopts.opt;
1485 
1486 		if (replyopts.opt.opt.srr)
1487 			daddr = replyopts.opt.opt.faddr;
1488 	}
1489 
1490 	flowi4_init_output(&fl4, arg->bound_dev_if, 0,
1491 			   RT_TOS(ip_hdr(skb)->tos),
1492 			   RT_SCOPE_UNIVERSE, sk->sk_protocol,
1493 			   ip_reply_arg_flowi_flags(arg),
1494 			   daddr, rt->rt_spec_dst,
1495 			   tcp_hdr(skb)->source, tcp_hdr(skb)->dest);
1496 	security_skb_classify_flow(skb, flowi4_to_flowi(&fl4));
1497 	rt = ip_route_output_key(sock_net(sk), &fl4);
1498 	if (IS_ERR(rt))
1499 		return;
1500 
1501 	/* And let IP do all the hard work.
1502 
1503 	   This chunk is not reentrant, hence the spinlock.
1504 	   Note that it relies on the fact that this function is called
1505 	   with BH disabled locally and that sk cannot already be spinlocked.
1506 	 */
1507 	bh_lock_sock(sk);
1508 	inet->tos = ip_hdr(skb)->tos;
1509 	sk->sk_priority = skb->priority;
1510 	sk->sk_protocol = ip_hdr(skb)->protocol;
1511 	sk->sk_bound_dev_if = arg->bound_dev_if;
1512 	ip_append_data(sk, &fl4, ip_reply_glue_bits, arg->iov->iov_base, len, 0,
1513 		       &ipc, &rt, MSG_DONTWAIT);
1514 	if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) {
1515 		if (arg->csumoffset >= 0)
1516 			*((__sum16 *)skb_transport_header(skb) +
1517 			  arg->csumoffset) = csum_fold(csum_add(skb->csum,
1518 								arg->csum));
1519 		skb->ip_summed = CHECKSUM_NONE;
1520 		ip_push_pending_frames(sk, &fl4);
1521 	}
1522 
1523 	bh_unlock_sock(sk);
1524 
1525 	ip_rt_put(rt);
1526 }
1527 
1528 void __init ip_init(void)
1529 {
1530 	ip_rt_init();
1531 	inet_initpeers();
1532 
1533 #if defined(CONFIG_IP_MULTICAST) && defined(CONFIG_PROC_FS)
1534 	igmp_mc_proc_init();
1535 #endif
1536 }
1537