xref: /openbmc/linux/net/core/sock.c (revision 643d1f7f)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		Generic socket support routines. Memory allocators, socket lock/release
7  *		handler for protocols to use and generic option handler.
8  *
9  *
10  * Version:	$Id: sock.c,v 1.117 2002/02/01 22:01:03 davem Exp $
11  *
12  * Authors:	Ross Biro
13  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
14  *		Florian La Roche, <flla@stud.uni-sb.de>
15  *		Alan Cox, <A.Cox@swansea.ac.uk>
16  *
17  * Fixes:
18  *		Alan Cox	: 	Numerous verify_area() problems
19  *		Alan Cox	:	Connecting on a connecting socket
20  *					now returns an error for tcp.
21  *		Alan Cox	:	sock->protocol is set correctly.
22  *					and is not sometimes left as 0.
23  *		Alan Cox	:	connect handles icmp errors on a
24  *					connect properly. Unfortunately there
25  *					is a restart syscall nasty there. I
26  *					can't match BSD without hacking the C
27  *					library. Ideas urgently sought!
28  *		Alan Cox	:	Disallow bind() to addresses that are
29  *					not ours - especially broadcast ones!!
30  *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
31  *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
32  *					instead they leave that for the DESTROY timer.
33  *		Alan Cox	:	Clean up error flag in accept
34  *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
35  *					was buggy. Put a remove_sock() in the handler
36  *					for memory when we hit 0. Also altered the timer
37  *					code. The ACK stuff can wait and needs major
38  *					TCP layer surgery.
39  *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
40  *					and fixed timer/inet_bh race.
41  *		Alan Cox	:	Added zapped flag for TCP
42  *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
43  *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
44  *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
45  *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
46  *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
47  *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
48  *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
49  *	Pauline Middelink	:	identd support
50  *		Alan Cox	:	Fixed connect() taking signals I think.
51  *		Alan Cox	:	SO_LINGER supported
52  *		Alan Cox	:	Error reporting fixes
53  *		Anonymous	:	inet_create tidied up (sk->reuse setting)
54  *		Alan Cox	:	inet sockets don't set sk->type!
55  *		Alan Cox	:	Split socket option code
56  *		Alan Cox	:	Callbacks
57  *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
58  *		Alex		:	Removed restriction on inet fioctl
59  *		Alan Cox	:	Splitting INET from NET core
60  *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
61  *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
62  *		Alan Cox	:	Split IP from generic code
63  *		Alan Cox	:	New kfree_skbmem()
64  *		Alan Cox	:	Make SO_DEBUG superuser only.
65  *		Alan Cox	:	Allow anyone to clear SO_DEBUG
66  *					(compatibility fix)
67  *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
68  *		Alan Cox	:	Allocator for a socket is settable.
69  *		Alan Cox	:	SO_ERROR includes soft errors.
70  *		Alan Cox	:	Allow NULL arguments on some SO_ opts
71  *		Alan Cox	: 	Generic socket allocation to make hooks
72  *					easier (suggested by Craig Metz).
73  *		Michael Pall	:	SO_ERROR returns positive errno again
74  *              Steve Whitehouse:       Added default destructor to free
75  *                                      protocol private data.
76  *              Steve Whitehouse:       Added various other default routines
77  *                                      common to several socket families.
78  *              Chris Evans     :       Call suser() check last on F_SETOWN
79  *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
80  *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
81  *		Andi Kleen	:	Fix write_space callback
82  *		Chris Evans	:	Security fixes - signedness again
83  *		Arnaldo C. Melo :       cleanups, use skb_queue_purge
84  *
85  * To Fix:
86  *
87  *
88  *		This program is free software; you can redistribute it and/or
89  *		modify it under the terms of the GNU General Public License
90  *		as published by the Free Software Foundation; either version
91  *		2 of the License, or (at your option) any later version.
92  */
93 
94 #include <linux/capability.h>
95 #include <linux/errno.h>
96 #include <linux/types.h>
97 #include <linux/socket.h>
98 #include <linux/in.h>
99 #include <linux/kernel.h>
100 #include <linux/module.h>
101 #include <linux/proc_fs.h>
102 #include <linux/seq_file.h>
103 #include <linux/sched.h>
104 #include <linux/timer.h>
105 #include <linux/string.h>
106 #include <linux/sockios.h>
107 #include <linux/net.h>
108 #include <linux/mm.h>
109 #include <linux/slab.h>
110 #include <linux/interrupt.h>
111 #include <linux/poll.h>
112 #include <linux/tcp.h>
113 #include <linux/init.h>
114 #include <linux/highmem.h>
115 
116 #include <asm/uaccess.h>
117 #include <asm/system.h>
118 
119 #include <linux/netdevice.h>
120 #include <net/protocol.h>
121 #include <linux/skbuff.h>
122 #include <net/net_namespace.h>
123 #include <net/request_sock.h>
124 #include <net/sock.h>
125 #include <net/xfrm.h>
126 #include <linux/ipsec.h>
127 
128 #include <linux/filter.h>
129 
130 #ifdef CONFIG_INET
131 #include <net/tcp.h>
132 #endif
133 
134 /*
135  * Each address family might have different locking rules, so we have
136  * one slock key per address family:
137  */
138 static struct lock_class_key af_family_keys[AF_MAX];
139 static struct lock_class_key af_family_slock_keys[AF_MAX];
140 
141 #ifdef CONFIG_DEBUG_LOCK_ALLOC
142 /*
143  * Make lock validator output more readable. (We pre-construct these
144  * strings at build time, so that runtime initialization of socket
145  * locks is fast):
146  */
147 static const char *af_family_key_strings[AF_MAX+1] = {
148   "sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX"     , "sk_lock-AF_INET"     ,
149   "sk_lock-AF_AX25"  , "sk_lock-AF_IPX"      , "sk_lock-AF_APPLETALK",
150   "sk_lock-AF_NETROM", "sk_lock-AF_BRIDGE"   , "sk_lock-AF_ATMPVC"   ,
151   "sk_lock-AF_X25"   , "sk_lock-AF_INET6"    , "sk_lock-AF_ROSE"     ,
152   "sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI"  , "sk_lock-AF_SECURITY" ,
153   "sk_lock-AF_KEY"   , "sk_lock-AF_NETLINK"  , "sk_lock-AF_PACKET"   ,
154   "sk_lock-AF_ASH"   , "sk_lock-AF_ECONET"   , "sk_lock-AF_ATMSVC"   ,
155   "sk_lock-21"       , "sk_lock-AF_SNA"      , "sk_lock-AF_IRDA"     ,
156   "sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE"  , "sk_lock-AF_LLC"      ,
157   "sk_lock-27"       , "sk_lock-28"          , "sk_lock-AF_CAN"      ,
158   "sk_lock-AF_TIPC"  , "sk_lock-AF_BLUETOOTH", "sk_lock-IUCV"        ,
159   "sk_lock-AF_RXRPC" , "sk_lock-AF_MAX"
160 };
161 static const char *af_family_slock_key_strings[AF_MAX+1] = {
162   "slock-AF_UNSPEC", "slock-AF_UNIX"     , "slock-AF_INET"     ,
163   "slock-AF_AX25"  , "slock-AF_IPX"      , "slock-AF_APPLETALK",
164   "slock-AF_NETROM", "slock-AF_BRIDGE"   , "slock-AF_ATMPVC"   ,
165   "slock-AF_X25"   , "slock-AF_INET6"    , "slock-AF_ROSE"     ,
166   "slock-AF_DECnet", "slock-AF_NETBEUI"  , "slock-AF_SECURITY" ,
167   "slock-AF_KEY"   , "slock-AF_NETLINK"  , "slock-AF_PACKET"   ,
168   "slock-AF_ASH"   , "slock-AF_ECONET"   , "slock-AF_ATMSVC"   ,
169   "slock-21"       , "slock-AF_SNA"      , "slock-AF_IRDA"     ,
170   "slock-AF_PPPOX" , "slock-AF_WANPIPE"  , "slock-AF_LLC"      ,
171   "slock-27"       , "slock-28"          , "slock-AF_CAN"      ,
172   "slock-AF_TIPC"  , "slock-AF_BLUETOOTH", "slock-AF_IUCV"     ,
173   "slock-AF_RXRPC" , "slock-AF_MAX"
174 };
175 static const char *af_family_clock_key_strings[AF_MAX+1] = {
176   "clock-AF_UNSPEC", "clock-AF_UNIX"     , "clock-AF_INET"     ,
177   "clock-AF_AX25"  , "clock-AF_IPX"      , "clock-AF_APPLETALK",
178   "clock-AF_NETROM", "clock-AF_BRIDGE"   , "clock-AF_ATMPVC"   ,
179   "clock-AF_X25"   , "clock-AF_INET6"    , "clock-AF_ROSE"     ,
180   "clock-AF_DECnet", "clock-AF_NETBEUI"  , "clock-AF_SECURITY" ,
181   "clock-AF_KEY"   , "clock-AF_NETLINK"  , "clock-AF_PACKET"   ,
182   "clock-AF_ASH"   , "clock-AF_ECONET"   , "clock-AF_ATMSVC"   ,
183   "clock-21"       , "clock-AF_SNA"      , "clock-AF_IRDA"     ,
184   "clock-AF_PPPOX" , "clock-AF_WANPIPE"  , "clock-AF_LLC"      ,
185   "clock-27"       , "clock-28"          , "clock-29"          ,
186   "clock-AF_TIPC"  , "clock-AF_BLUETOOTH", "clock-AF_IUCV"     ,
187   "clock-AF_RXRPC" , "clock-AF_MAX"
188 };
189 #endif
190 
191 /*
192  * sk_callback_lock locking rules are per-address-family,
193  * so split the lock classes by using a per-AF key:
194  */
195 static struct lock_class_key af_callback_keys[AF_MAX];
196 
197 /* Take into consideration the size of the struct sk_buff overhead in the
198  * determination of these values, since that is non-constant across
199  * platforms.  This makes socket queueing behavior and performance
200  * not depend upon such differences.
201  */
202 #define _SK_MEM_PACKETS		256
203 #define _SK_MEM_OVERHEAD	(sizeof(struct sk_buff) + 256)
204 #define SK_WMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
205 #define SK_RMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
206 
207 /* Run time adjustable parameters. */
208 __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
209 __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
210 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
211 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
212 
213 /* Maximal space eaten by iovec or ancillary data plus some space */
214 int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
215 
216 static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
217 {
218 	struct timeval tv;
219 
220 	if (optlen < sizeof(tv))
221 		return -EINVAL;
222 	if (copy_from_user(&tv, optval, sizeof(tv)))
223 		return -EFAULT;
224 	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
225 		return -EDOM;
226 
227 	if (tv.tv_sec < 0) {
228 		static int warned __read_mostly;
229 
230 		*timeo_p = 0;
231 		if (warned < 10 && net_ratelimit()) {
232 			warned++;
233 			printk(KERN_INFO "sock_set_timeout: `%s' (pid %d) "
234 			       "tries to set negative timeout\n",
235 				current->comm, task_pid_nr(current));
		}
236 		return 0;
237 	}
238 	*timeo_p = MAX_SCHEDULE_TIMEOUT;
239 	if (tv.tv_sec == 0 && tv.tv_usec == 0)
240 		return 0;
241 	if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
242 		*timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
243 	return 0;
244 }
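
/*
 * Illustrative sketch of the conversion above, assuming HZ == 100:
 * a timeout of { .tv_sec = 1, .tv_usec = 500000 } becomes
 * 1 * 100 + (500000 + 9999) / 10000 = 150 jiffies, while a zero
 * timeval leaves *timeo_p at MAX_SCHEDULE_TIMEOUT (wait forever).
 */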
245 
246 static void sock_warn_obsolete_bsdism(const char *name)
247 {
248 	static int warned;
249 	static char warncomm[TASK_COMM_LEN];
250 	if (strcmp(warncomm, current->comm) && warned < 5) {
251 		strcpy(warncomm,  current->comm);
252 		printk(KERN_WARNING "process `%s' is using obsolete "
253 		       "%s SO_BSDCOMPAT\n", warncomm, name);
254 		warned++;
255 	}
256 }
257 
258 static void sock_disable_timestamp(struct sock *sk)
259 {
260 	if (sock_flag(sk, SOCK_TIMESTAMP)) {
261 		sock_reset_flag(sk, SOCK_TIMESTAMP);
262 		net_disable_timestamp();
263 	}
264 }
265 
266 
267 int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
268 {
269 	int err = 0;
270 	int skb_len;
271 
272 	/* Cast sk->sk_rcvbuf to unsigned... It's pointless, but reduces
273 	   number of warnings when compiling with -W --ANK
274 	 */
275 	if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
276 	    (unsigned)sk->sk_rcvbuf) {
277 		err = -ENOMEM;
278 		goto out;
279 	}
280 
281 	err = sk_filter(sk, skb);
282 	if (err)
283 		goto out;
284 
285 	if (!sk_rmem_schedule(sk, skb->truesize)) {
286 		err = -ENOBUFS;
287 		goto out;
288 	}
289 
290 	skb->dev = NULL;
291 	skb_set_owner_r(skb, sk);
292 
293 	/* Cache the SKB length before we tack it onto the receive
294 	 * queue.  Once it is added it no longer belongs to us and
295 	 * may be freed by other threads of control pulling packets
296 	 * from the queue.
297 	 */
298 	skb_len = skb->len;
299 
300 	skb_queue_tail(&sk->sk_receive_queue, skb);
301 
302 	if (!sock_flag(sk, SOCK_DEAD))
303 		sk->sk_data_ready(sk, skb_len);
304 out:
305 	return err;
306 }
307 EXPORT_SYMBOL(sock_queue_rcv_skb);
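
/*
 * Illustrative caller sketch (hypothetical protocol receive handler, not
 * taken from this file): sock_queue_rcv_skb() does not free the skb on
 * failure, so the caller typically does something like
 *
 *	if (sock_queue_rcv_skb(sk, skb) < 0) {
 *		kfree_skb(skb);
 *		return NET_RX_DROP;
 *	}
 */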
308 
309 int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested)
310 {
311 	int rc = NET_RX_SUCCESS;
312 
313 	if (sk_filter(sk, skb))
314 		goto discard_and_relse;
315 
316 	skb->dev = NULL;
317 
318 	if (nested)
319 		bh_lock_sock_nested(sk);
320 	else
321 		bh_lock_sock(sk);
322 	if (!sock_owned_by_user(sk)) {
323 		/*
324 		 * trylock + unlock semantics:
325 		 */
326 		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
327 
328 		rc = sk->sk_backlog_rcv(sk, skb);
329 
330 		mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
331 	} else
332 		sk_add_backlog(sk, skb);
333 	bh_unlock_sock(sk);
334 out:
335 	sock_put(sk);
336 	return rc;
337 discard_and_relse:
338 	kfree_skb(skb);
339 	goto out;
340 }
341 EXPORT_SYMBOL(sk_receive_skb);
342 
343 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
344 {
345 	struct dst_entry *dst = sk->sk_dst_cache;
346 
347 	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
348 		sk->sk_dst_cache = NULL;
349 		dst_release(dst);
350 		return NULL;
351 	}
352 
353 	return dst;
354 }
355 EXPORT_SYMBOL(__sk_dst_check);
356 
357 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
358 {
359 	struct dst_entry *dst = sk_dst_get(sk);
360 
361 	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
362 		sk_dst_reset(sk);
363 		dst_release(dst);
364 		return NULL;
365 	}
366 
367 	return dst;
368 }
369 EXPORT_SYMBOL(sk_dst_check);
370 
371 static int sock_bindtodevice(struct sock *sk, char __user *optval, int optlen)
372 {
373 	int ret = -ENOPROTOOPT;
374 #ifdef CONFIG_NETDEVICES
375 	struct net *net = sk->sk_net;
376 	char devname[IFNAMSIZ];
377 	int index;
378 
379 	/* Sorry... */
380 	ret = -EPERM;
381 	if (!capable(CAP_NET_RAW))
382 		goto out;
383 
384 	ret = -EINVAL;
385 	if (optlen < 0)
386 		goto out;
387 
388 	/* Bind this socket to a particular device like "eth0",
389 	 * as specified in the passed interface name. If the
390 	 * name is "" or the option length is zero the socket
391 	 * is not bound.
392 	 */
393 	if (optlen > IFNAMSIZ - 1)
394 		optlen = IFNAMSIZ - 1;
395 	memset(devname, 0, sizeof(devname));
396 
397 	ret = -EFAULT;
398 	if (copy_from_user(devname, optval, optlen))
399 		goto out;
400 
401 	if (devname[0] == '\0') {
402 		index = 0;
403 	} else {
404 		struct net_device *dev = dev_get_by_name(net, devname);
405 
406 		ret = -ENODEV;
407 		if (!dev)
408 			goto out;
409 
410 		index = dev->ifindex;
411 		dev_put(dev);
412 	}
413 
414 	lock_sock(sk);
415 	sk->sk_bound_dev_if = index;
416 	sk_dst_reset(sk);
417 	release_sock(sk);
418 
419 	ret = 0;
420 
421 out:
422 #endif
423 
424 	return ret;
425 }
426 
427 static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
428 {
429 	if (valbool)
430 		sock_set_flag(sk, bit);
431 	else
432 		sock_reset_flag(sk, bit);
433 }
434 
435 /*
436  *	This is meant for all protocols to use and covers goings on
437  *	at the socket level. Everything here is generic.
438  */
439 
440 int sock_setsockopt(struct socket *sock, int level, int optname,
441 		    char __user *optval, int optlen)
442 {
443 	struct sock *sk=sock->sk;
444 	int val;
445 	int valbool;
446 	struct linger ling;
447 	int ret = 0;
448 
449 	/*
450 	 *	Options without arguments
451 	 */
452 
453 #ifdef SO_DONTLINGER		/* Compatibility item... */
454 	if (optname == SO_DONTLINGER) {
455 		lock_sock(sk);
456 		sock_reset_flag(sk, SOCK_LINGER);
457 		release_sock(sk);
458 		return 0;
459 	}
460 #endif
461 
462 	if (optname == SO_BINDTODEVICE)
463 		return sock_bindtodevice(sk, optval, optlen);
464 
465 	if (optlen < sizeof(int))
466 		return -EINVAL;
467 
468 	if (get_user(val, (int __user *)optval))
469 		return -EFAULT;
470 
471 	valbool = val?1:0;
472 
473 	lock_sock(sk);
474 
475 	switch(optname) {
476 	case SO_DEBUG:
477 		if (val && !capable(CAP_NET_ADMIN)) {
478 			ret = -EACCES;
479 		} else
480 			sock_valbool_flag(sk, SOCK_DBG, valbool);
481 		break;
482 	case SO_REUSEADDR:
483 		sk->sk_reuse = valbool;
484 		break;
485 	case SO_TYPE:
486 	case SO_ERROR:
487 		ret = -ENOPROTOOPT;
488 		break;
489 	case SO_DONTROUTE:
490 		sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
491 		break;
492 	case SO_BROADCAST:
493 		sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
494 		break;
495 	case SO_SNDBUF:
496 		/* Don't error on this; BSD doesn't, and if you think
497 		   about it this is right. Otherwise apps have to
498 		   play 'guess the biggest size' games. RCVBUF/SNDBUF
499 		   are treated in BSD as hints. */
500 
501 		if (val > sysctl_wmem_max)
502 			val = sysctl_wmem_max;
503 set_sndbuf:
504 		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
505 		if ((val * 2) < SOCK_MIN_SNDBUF)
506 			sk->sk_sndbuf = SOCK_MIN_SNDBUF;
507 		else
508 			sk->sk_sndbuf = val * 2;
509 
510 		/*
511 		 *	Wake up sending tasks if we
512 		 *	upped the value.
513 		 */
514 		sk->sk_write_space(sk);
515 		break;
516 
517 	case SO_SNDBUFFORCE:
518 		if (!capable(CAP_NET_ADMIN)) {
519 			ret = -EPERM;
520 			break;
521 		}
522 		goto set_sndbuf;
523 
524 	case SO_RCVBUF:
525 		/* Don't error on this; BSD doesn't, and if you think
526 		   about it this is right. Otherwise apps have to
527 		   play 'guess the biggest size' games. RCVBUF/SNDBUF
528 		   are treated in BSD as hints. */
529 
530 		if (val > sysctl_rmem_max)
531 			val = sysctl_rmem_max;
532 set_rcvbuf:
533 		sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
534 		/*
535 		 * We double it on the way in to account for
536 		 * "struct sk_buff" etc. overhead.   Applications
537 		 * assume that the SO_RCVBUF setting they make will
538 		 * allow that much actual data to be received on that
539 		 * socket.
540 		 *
541 		 * Applications are unaware that "struct sk_buff" and
542 		 * other overheads allocate from the receive buffer
543 		 * during socket buffer allocation.
544 		 *
545 		 * And after considering the possible alternatives,
546 		 * returning the value we actually used in getsockopt
547 		 * is the most desirable behavior.
548 		 */
549 		if ((val * 2) < SOCK_MIN_RCVBUF)
550 			sk->sk_rcvbuf = SOCK_MIN_RCVBUF;
551 		else
552 			sk->sk_rcvbuf = val * 2;
553 		break;
554 
555 	case SO_RCVBUFFORCE:
556 		if (!capable(CAP_NET_ADMIN)) {
557 			ret = -EPERM;
558 			break;
559 		}
560 		goto set_rcvbuf;
561 
562 	case SO_KEEPALIVE:
563 #ifdef CONFIG_INET
564 		if (sk->sk_protocol == IPPROTO_TCP)
565 			tcp_set_keepalive(sk, valbool);
566 #endif
567 		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
568 		break;
569 
570 	case SO_OOBINLINE:
571 		sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
572 		break;
573 
574 	case SO_NO_CHECK:
575 		sk->sk_no_check = valbool;
576 		break;
577 
578 	case SO_PRIORITY:
579 		if ((val >= 0 && val <= 6) || capable(CAP_NET_ADMIN))
580 			sk->sk_priority = val;
581 		else
582 			ret = -EPERM;
583 		break;
584 
585 	case SO_LINGER:
586 		if (optlen < sizeof(ling)) {
587 			ret = -EINVAL;	/* 1003.1g */
588 			break;
589 		}
590 		if (copy_from_user(&ling,optval,sizeof(ling))) {
591 			ret = -EFAULT;
592 			break;
593 		}
594 		if (!ling.l_onoff)
595 			sock_reset_flag(sk, SOCK_LINGER);
596 		else {
597 #if (BITS_PER_LONG == 32)
598 			if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
599 				sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
600 			else
601 #endif
602 				sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
603 			sock_set_flag(sk, SOCK_LINGER);
604 		}
605 		break;
606 
607 	case SO_BSDCOMPAT:
608 		sock_warn_obsolete_bsdism("setsockopt");
609 		break;
610 
611 	case SO_PASSCRED:
612 		if (valbool)
613 			set_bit(SOCK_PASSCRED, &sock->flags);
614 		else
615 			clear_bit(SOCK_PASSCRED, &sock->flags);
616 		break;
617 
618 	case SO_TIMESTAMP:
619 	case SO_TIMESTAMPNS:
620 		if (valbool)  {
621 			if (optname == SO_TIMESTAMP)
622 				sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
623 			else
624 				sock_set_flag(sk, SOCK_RCVTSTAMPNS);
625 			sock_set_flag(sk, SOCK_RCVTSTAMP);
626 			sock_enable_timestamp(sk);
627 		} else {
628 			sock_reset_flag(sk, SOCK_RCVTSTAMP);
629 			sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
630 		}
631 		break;
632 
633 	case SO_RCVLOWAT:
634 		if (val < 0)
635 			val = INT_MAX;
636 		sk->sk_rcvlowat = val ? : 1;
637 		break;
638 
639 	case SO_RCVTIMEO:
640 		ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
641 		break;
642 
643 	case SO_SNDTIMEO:
644 		ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
645 		break;
646 
647 	case SO_ATTACH_FILTER:
648 		ret = -EINVAL;
649 		if (optlen == sizeof(struct sock_fprog)) {
650 			struct sock_fprog fprog;
651 
652 			ret = -EFAULT;
653 			if (copy_from_user(&fprog, optval, sizeof(fprog)))
654 				break;
655 
656 			ret = sk_attach_filter(&fprog, sk);
657 		}
658 		break;
659 
660 	case SO_DETACH_FILTER:
661 		ret = sk_detach_filter(sk);
662 		break;
663 
664 	case SO_PASSSEC:
665 		if (valbool)
666 			set_bit(SOCK_PASSSEC, &sock->flags);
667 		else
668 			clear_bit(SOCK_PASSSEC, &sock->flags);
669 		break;
670 	case SO_MARK:
671 		if (!capable(CAP_NET_ADMIN))
672 			ret = -EPERM;
673 		else {
674 			sk->sk_mark = val;
675 		}
676 		break;
677 
678 		/* We implement SO_SNDLOWAT etc. to
679 		   not be settable (1003.1g 5.3). */
680 	default:
681 		ret = -ENOPROTOOPT;
682 		break;
683 	}
684 	release_sock(sk);
685 	return ret;
686 }
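
/*
 * Illustrative sketch of the SO_RCVBUF/SO_SNDBUF doubling above, as seen
 * from userspace (assuming the requested value stays below sysctl_rmem_max):
 *
 *	int val = 65536;
 *	socklen_t len = sizeof(val);
 *
 *	setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &val, sizeof(val));
 *	getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &val, &len);
 *
 * getsockopt() now reports 131072, i.e. the doubled value that accounts
 * for struct sk_buff and other per-packet overhead.
 */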
687 
688 
689 int sock_getsockopt(struct socket *sock, int level, int optname,
690 		    char __user *optval, int __user *optlen)
691 {
692 	struct sock *sk = sock->sk;
693 
694 	union {
695 		int val;
696 		struct linger ling;
697 		struct timeval tm;
698 	} v;
699 
700 	unsigned int lv = sizeof(int);
701 	int len;
702 
703 	if (get_user(len, optlen))
704 		return -EFAULT;
705 	if (len < 0)
706 		return -EINVAL;
707 
708 	switch(optname) {
709 	case SO_DEBUG:
710 		v.val = sock_flag(sk, SOCK_DBG);
711 		break;
712 
713 	case SO_DONTROUTE:
714 		v.val = sock_flag(sk, SOCK_LOCALROUTE);
715 		break;
716 
717 	case SO_BROADCAST:
718 		v.val = !!sock_flag(sk, SOCK_BROADCAST);
719 		break;
720 
721 	case SO_SNDBUF:
722 		v.val = sk->sk_sndbuf;
723 		break;
724 
725 	case SO_RCVBUF:
726 		v.val = sk->sk_rcvbuf;
727 		break;
728 
729 	case SO_REUSEADDR:
730 		v.val = sk->sk_reuse;
731 		break;
732 
733 	case SO_KEEPALIVE:
734 		v.val = !!sock_flag(sk, SOCK_KEEPOPEN);
735 		break;
736 
737 	case SO_TYPE:
738 		v.val = sk->sk_type;
739 		break;
740 
741 	case SO_ERROR:
742 		v.val = -sock_error(sk);
743 		if (v.val==0)
744 			v.val = xchg(&sk->sk_err_soft, 0);
745 		break;
746 
747 	case SO_OOBINLINE:
748 		v.val = !!sock_flag(sk, SOCK_URGINLINE);
749 		break;
750 
751 	case SO_NO_CHECK:
752 		v.val = sk->sk_no_check;
753 		break;
754 
755 	case SO_PRIORITY:
756 		v.val = sk->sk_priority;
757 		break;
758 
759 	case SO_LINGER:
760 		lv		= sizeof(v.ling);
761 		v.ling.l_onoff	= !!sock_flag(sk, SOCK_LINGER);
762 		v.ling.l_linger	= sk->sk_lingertime / HZ;
763 		break;
764 
765 	case SO_BSDCOMPAT:
766 		sock_warn_obsolete_bsdism("getsockopt");
767 		break;
768 
769 	case SO_TIMESTAMP:
770 		v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
771 				!sock_flag(sk, SOCK_RCVTSTAMPNS);
772 		break;
773 
774 	case SO_TIMESTAMPNS:
775 		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
776 		break;
777 
778 	case SO_RCVTIMEO:
779 		lv=sizeof(struct timeval);
780 		if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
781 			v.tm.tv_sec = 0;
782 			v.tm.tv_usec = 0;
783 		} else {
784 			v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
785 			v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
786 		}
787 		break;
788 
789 	case SO_SNDTIMEO:
790 		lv=sizeof(struct timeval);
791 		if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
792 			v.tm.tv_sec = 0;
793 			v.tm.tv_usec = 0;
794 		} else {
795 			v.tm.tv_sec = sk->sk_sndtimeo / HZ;
796 			v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
797 		}
798 		break;
799 
800 	case SO_RCVLOWAT:
801 		v.val = sk->sk_rcvlowat;
802 		break;
803 
804 	case SO_SNDLOWAT:
805 		v.val=1;
806 		break;
807 
808 	case SO_PASSCRED:
809 		v.val = test_bit(SOCK_PASSCRED, &sock->flags) ? 1 : 0;
810 		break;
811 
812 	case SO_PEERCRED:
813 		if (len > sizeof(sk->sk_peercred))
814 			len = sizeof(sk->sk_peercred);
815 		if (copy_to_user(optval, &sk->sk_peercred, len))
816 			return -EFAULT;
817 		goto lenout;
818 
819 	case SO_PEERNAME:
820 	{
821 		char address[128];
822 
823 		if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
824 			return -ENOTCONN;
825 		if (lv < len)
826 			return -EINVAL;
827 		if (copy_to_user(optval, address, len))
828 			return -EFAULT;
829 		goto lenout;
830 	}
831 
832 	/* Dubious BSD thing... Probably nobody even uses it, but
833 	 * the UNIX standard wants it for whatever reason... -DaveM
834 	 */
835 	case SO_ACCEPTCONN:
836 		v.val = sk->sk_state == TCP_LISTEN;
837 		break;
838 
839 	case SO_PASSSEC:
840 		v.val = test_bit(SOCK_PASSSEC, &sock->flags) ? 1 : 0;
841 		break;
842 
843 	case SO_PEERSEC:
844 		return security_socket_getpeersec_stream(sock, optval, optlen, len);
845 
846 	case SO_MARK:
847 		v.val = sk->sk_mark;
848 		break;
849 
850 	default:
851 		return -ENOPROTOOPT;
852 	}
853 
854 	if (len > lv)
855 		len = lv;
856 	if (copy_to_user(optval, &v, len))
857 		return -EFAULT;
858 lenout:
859 	if (put_user(len, optlen))
860 		return -EFAULT;
861 	return 0;
862 }
863 
864 /*
865  * Initialize an sk_lock.
866  *
867  * (We also register the sk_lock with the lock validator.)
868  */
869 static inline void sock_lock_init(struct sock *sk)
870 {
871 	sock_lock_init_class_and_name(sk,
872 			af_family_slock_key_strings[sk->sk_family],
873 			af_family_slock_keys + sk->sk_family,
874 			af_family_key_strings[sk->sk_family],
875 			af_family_keys + sk->sk_family);
876 }
877 
878 static void sock_copy(struct sock *nsk, const struct sock *osk)
879 {
880 #ifdef CONFIG_SECURITY_NETWORK
881 	void *sptr = nsk->sk_security;
882 #endif
883 
884 	memcpy(nsk, osk, osk->sk_prot->obj_size);
885 #ifdef CONFIG_SECURITY_NETWORK
886 	nsk->sk_security = sptr;
887 	security_sk_clone(osk, nsk);
888 #endif
889 }
890 
891 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
892 		int family)
893 {
894 	struct sock *sk;
895 	struct kmem_cache *slab;
896 
897 	slab = prot->slab;
898 	if (slab != NULL)
899 		sk = kmem_cache_alloc(slab, priority);
900 	else
901 		sk = kmalloc(prot->obj_size, priority);
902 
903 	if (sk != NULL) {
904 		if (security_sk_alloc(sk, family, priority))
905 			goto out_free;
906 
907 		if (!try_module_get(prot->owner))
908 			goto out_free_sec;
909 	}
910 
911 	return sk;
912 
913 out_free_sec:
914 	security_sk_free(sk);
915 out_free:
916 	if (slab != NULL)
917 		kmem_cache_free(slab, sk);
918 	else
919 		kfree(sk);
920 	return NULL;
921 }
922 
923 static void sk_prot_free(struct proto *prot, struct sock *sk)
924 {
925 	struct kmem_cache *slab;
926 	struct module *owner;
927 
928 	owner = prot->owner;
929 	slab = prot->slab;
930 
931 	security_sk_free(sk);
932 	if (slab != NULL)
933 		kmem_cache_free(slab, sk);
934 	else
935 		kfree(sk);
936 	module_put(owner);
937 }
938 
939 /**
940  *	sk_alloc - All socket objects are allocated here
941  *	@net: the applicable net namespace
942  *	@family: protocol family
943  *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
944  *	@prot: struct proto associated with this new sock instance
946  */
947 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
948 		      struct proto *prot)
949 {
950 	struct sock *sk;
951 
952 	sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
953 	if (sk) {
954 		sk->sk_family = family;
955 		/*
956 		 * See comment in struct sock definition to understand
957 		 * why we need sk_prot_creator -acme
958 		 */
959 		sk->sk_prot = sk->sk_prot_creator = prot;
960 		sock_lock_init(sk);
961 		sk->sk_net = get_net(net);
962 	}
963 
964 	return sk;
965 }
966 
967 void sk_free(struct sock *sk)
968 {
969 	struct sk_filter *filter;
970 
971 	if (sk->sk_destruct)
972 		sk->sk_destruct(sk);
973 
974 	filter = rcu_dereference(sk->sk_filter);
975 	if (filter) {
976 		sk_filter_uncharge(sk, filter);
977 		rcu_assign_pointer(sk->sk_filter, NULL);
978 	}
979 
980 	sock_disable_timestamp(sk);
981 
982 	if (atomic_read(&sk->sk_omem_alloc))
983 		printk(KERN_DEBUG "%s: optmem leakage (%d bytes) detected.\n",
984 		       __FUNCTION__, atomic_read(&sk->sk_omem_alloc));
985 
986 	put_net(sk->sk_net);
987 	sk_prot_free(sk->sk_prot_creator, sk);
988 }
989 
990 struct sock *sk_clone(const struct sock *sk, const gfp_t priority)
991 {
992 	struct sock *newsk;
993 
994 	newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
995 	if (newsk != NULL) {
996 		struct sk_filter *filter;
997 
998 		sock_copy(newsk, sk);
999 
1000 		/* SANITY */
1001 		get_net(newsk->sk_net);
1002 		sk_node_init(&newsk->sk_node);
1003 		sock_lock_init(newsk);
1004 		bh_lock_sock(newsk);
1005 		newsk->sk_backlog.head	= newsk->sk_backlog.tail = NULL;
1006 
1007 		atomic_set(&newsk->sk_rmem_alloc, 0);
1008 		atomic_set(&newsk->sk_wmem_alloc, 0);
1009 		atomic_set(&newsk->sk_omem_alloc, 0);
1010 		skb_queue_head_init(&newsk->sk_receive_queue);
1011 		skb_queue_head_init(&newsk->sk_write_queue);
1012 #ifdef CONFIG_NET_DMA
1013 		skb_queue_head_init(&newsk->sk_async_wait_queue);
1014 #endif
1015 
1016 		rwlock_init(&newsk->sk_dst_lock);
1017 		rwlock_init(&newsk->sk_callback_lock);
1018 		lockdep_set_class_and_name(&newsk->sk_callback_lock,
1019 				af_callback_keys + newsk->sk_family,
1020 				af_family_clock_key_strings[newsk->sk_family]);
1021 
1022 		newsk->sk_dst_cache	= NULL;
1023 		newsk->sk_wmem_queued	= 0;
1024 		newsk->sk_forward_alloc = 0;
1025 		newsk->sk_send_head	= NULL;
1026 		newsk->sk_userlocks	= sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
1027 
1028 		sock_reset_flag(newsk, SOCK_DONE);
1029 		skb_queue_head_init(&newsk->sk_error_queue);
1030 
1031 		filter = newsk->sk_filter;
1032 		if (filter != NULL)
1033 			sk_filter_charge(newsk, filter);
1034 
1035 		if (unlikely(xfrm_sk_clone_policy(newsk))) {
1036 			/* It is still a raw copy of the parent, so invalidate
1037 			 * the destructor and do a plain sk_free(). */
1038 			newsk->sk_destruct = NULL;
1039 			sk_free(newsk);
1040 			newsk = NULL;
1041 			goto out;
1042 		}
1043 
1044 		newsk->sk_err	   = 0;
1045 		newsk->sk_priority = 0;
1046 		atomic_set(&newsk->sk_refcnt, 2);
1047 
1048 		/*
1049 		 * Increment the counter in the same struct proto as the master
1050 		 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
1051 		 * is the same as sk->sk_prot->socks, as this field was copied
1052 		 * with memcpy).
1053 		 *
1054 		 * This _changes_ the previous behaviour, where
1055 		 * tcp_create_openreq_child was always incrementing the
1056 		 * equivalent of tcp_prot->socks (inet_sock_nr), so this has
1057 		 * to be taken into account in all callers. -acme
1058 		 */
1059 		sk_refcnt_debug_inc(newsk);
1060 		newsk->sk_socket = NULL;
1061 		newsk->sk_sleep	 = NULL;
1062 
1063 		if (newsk->sk_prot->sockets_allocated)
1064 			atomic_inc(newsk->sk_prot->sockets_allocated);
1065 	}
1066 out:
1067 	return newsk;
1068 }
1069 
1070 EXPORT_SYMBOL_GPL(sk_clone);
1071 
1072 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1073 {
1074 	__sk_dst_set(sk, dst);
1075 	sk->sk_route_caps = dst->dev->features;
1076 	if (sk->sk_route_caps & NETIF_F_GSO)
1077 		sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
1078 	if (sk_can_gso(sk)) {
1079 		if (dst->header_len)
1080 			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1081 		else
1082 			sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
1083 	}
1084 }
1085 EXPORT_SYMBOL_GPL(sk_setup_caps);
1086 
1087 void __init sk_init(void)
1088 {
1089 	if (num_physpages <= 4096) {
1090 		sysctl_wmem_max = 32767;
1091 		sysctl_rmem_max = 32767;
1092 		sysctl_wmem_default = 32767;
1093 		sysctl_rmem_default = 32767;
1094 	} else if (num_physpages >= 131072) {
1095 		sysctl_wmem_max = 131071;
1096 		sysctl_rmem_max = 131071;
1097 	}
1098 }
1099 
1100 /*
1101  *	Simple resource managers for sockets.
1102  */
1103 
1104 
1105 /*
1106  * Write buffer destructor automatically called from kfree_skb.
1107  */
1108 void sock_wfree(struct sk_buff *skb)
1109 {
1110 	struct sock *sk = skb->sk;
1111 
1112 	/* In case it might be waiting for more memory. */
1113 	atomic_sub(skb->truesize, &sk->sk_wmem_alloc);
1114 	if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE))
1115 		sk->sk_write_space(sk);
1116 	sock_put(sk);
1117 }
1118 
1119 /*
1120  * Read buffer destructor automatically called from kfree_skb.
1121  */
1122 void sock_rfree(struct sk_buff *skb)
1123 {
1124 	struct sock *sk = skb->sk;
1125 
1126 	skb_truesize_check(skb);
1127 	atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
1128 	sk_mem_uncharge(skb->sk, skb->truesize);
1129 }
1130 
1131 
1132 int sock_i_uid(struct sock *sk)
1133 {
1134 	int uid;
1135 
1136 	read_lock(&sk->sk_callback_lock);
1137 	uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : 0;
1138 	read_unlock(&sk->sk_callback_lock);
1139 	return uid;
1140 }
1141 
1142 unsigned long sock_i_ino(struct sock *sk)
1143 {
1144 	unsigned long ino;
1145 
1146 	read_lock(&sk->sk_callback_lock);
1147 	ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
1148 	read_unlock(&sk->sk_callback_lock);
1149 	return ino;
1150 }
1151 
1152 /*
1153  * Allocate a skb from the socket's send buffer.
1154  */
1155 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
1156 			     gfp_t priority)
1157 {
1158 	if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1159 		struct sk_buff * skb = alloc_skb(size, priority);
1160 		if (skb) {
1161 			skb_set_owner_w(skb, sk);
1162 			return skb;
1163 		}
1164 	}
1165 	return NULL;
1166 }
1167 
1168 /*
1169  * Allocate a skb from the socket's receive buffer.
1170  */
1171 struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force,
1172 			     gfp_t priority)
1173 {
1174 	if (force || atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
1175 		struct sk_buff *skb = alloc_skb(size, priority);
1176 		if (skb) {
1177 			skb_set_owner_r(skb, sk);
1178 			return skb;
1179 		}
1180 	}
1181 	return NULL;
1182 }
1183 
1184 /*
1185  * Allocate a memory block from the socket's option memory buffer.
1186  */
1187 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
1188 {
1189 	if ((unsigned)size <= sysctl_optmem_max &&
1190 	    atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
1191 		void *mem;
1192 		/* First do the add, to avoid the race if kmalloc
1193 		 * might sleep.
1194 		 */
1195 		atomic_add(size, &sk->sk_omem_alloc);
1196 		mem = kmalloc(size, priority);
1197 		if (mem)
1198 			return mem;
1199 		atomic_sub(size, &sk->sk_omem_alloc);
1200 	}
1201 	return NULL;
1202 }
1203 
1204 /*
1205  * Free an option memory block.
1206  */
1207 void sock_kfree_s(struct sock *sk, void *mem, int size)
1208 {
1209 	kfree(mem);
1210 	atomic_sub(size, &sk->sk_omem_alloc);
1211 }
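
/*
 * Illustrative sketch of pairing the two helpers above (hypothetical
 * protocol code; "opt" and "size" are placeholders):
 *
 *	void *opt = sock_kmalloc(sk, size, GFP_KERNEL);
 *
 *	if (!opt)
 *		return -ENOBUFS;
 *	...
 *	sock_kfree_s(sk, opt, size);
 *
 * Both calls charge/uncharge sk->sk_omem_alloc, so the size passed to
 * sock_kfree_s() must match the size passed to sock_kmalloc().
 */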
1212 
1213 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
1214    I think these locks should be removed for datagram sockets.
1215  */
1216 static long sock_wait_for_wmem(struct sock * sk, long timeo)
1217 {
1218 	DEFINE_WAIT(wait);
1219 
1220 	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1221 	for (;;) {
1222 		if (!timeo)
1223 			break;
1224 		if (signal_pending(current))
1225 			break;
1226 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1227 		prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
1228 		if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
1229 			break;
1230 		if (sk->sk_shutdown & SEND_SHUTDOWN)
1231 			break;
1232 		if (sk->sk_err)
1233 			break;
1234 		timeo = schedule_timeout(timeo);
1235 	}
1236 	finish_wait(sk->sk_sleep, &wait);
1237 	return timeo;
1238 }
1239 
1240 
1241 /*
1242  *	Generic send/receive buffer handlers
1243  */
1244 
1245 static struct sk_buff *sock_alloc_send_pskb(struct sock *sk,
1246 					    unsigned long header_len,
1247 					    unsigned long data_len,
1248 					    int noblock, int *errcode)
1249 {
1250 	struct sk_buff *skb;
1251 	gfp_t gfp_mask;
1252 	long timeo;
1253 	int err;
1254 
1255 	gfp_mask = sk->sk_allocation;
1256 	if (gfp_mask & __GFP_WAIT)
1257 		gfp_mask |= __GFP_REPEAT;
1258 
1259 	timeo = sock_sndtimeo(sk, noblock);
1260 	while (1) {
1261 		err = sock_error(sk);
1262 		if (err != 0)
1263 			goto failure;
1264 
1265 		err = -EPIPE;
1266 		if (sk->sk_shutdown & SEND_SHUTDOWN)
1267 			goto failure;
1268 
1269 		if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1270 			skb = alloc_skb(header_len, gfp_mask);
1271 			if (skb) {
1272 				int npages;
1273 				int i;
1274 
1275 				/* No pages, we're done... */
1276 				if (!data_len)
1277 					break;
1278 
1279 				npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
1280 				skb->truesize += data_len;
1281 				skb_shinfo(skb)->nr_frags = npages;
1282 				for (i = 0; i < npages; i++) {
1283 					struct page *page;
1284 					skb_frag_t *frag;
1285 
1286 					page = alloc_pages(sk->sk_allocation, 0);
1287 					if (!page) {
1288 						err = -ENOBUFS;
1289 						skb_shinfo(skb)->nr_frags = i;
1290 						kfree_skb(skb);
1291 						goto failure;
1292 					}
1293 
1294 					frag = &skb_shinfo(skb)->frags[i];
1295 					frag->page = page;
1296 					frag->page_offset = 0;
1297 					frag->size = (data_len >= PAGE_SIZE ?
1298 						      PAGE_SIZE :
1299 						      data_len);
1300 					data_len -= PAGE_SIZE;
1301 				}
1302 
1303 				/* Full success... */
1304 				break;
1305 			}
1306 			err = -ENOBUFS;
1307 			goto failure;
1308 		}
1309 		set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1310 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1311 		err = -EAGAIN;
1312 		if (!timeo)
1313 			goto failure;
1314 		if (signal_pending(current))
1315 			goto interrupted;
1316 		timeo = sock_wait_for_wmem(sk, timeo);
1317 	}
1318 
1319 	skb_set_owner_w(skb, sk);
1320 	return skb;
1321 
1322 interrupted:
1323 	err = sock_intr_errno(timeo);
1324 failure:
1325 	*errcode = err;
1326 	return NULL;
1327 }
1328 
1329 struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
1330 				    int noblock, int *errcode)
1331 {
1332 	return sock_alloc_send_pskb(sk, size, 0, noblock, errcode);
1333 }
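
/*
 * Illustrative caller sketch (hypothetical sendmsg path; "len", "reserve",
 * "err" and "msg" are placeholders):
 *
 *	skb = sock_alloc_send_skb(sk, len + reserve,
 *				  msg->msg_flags & MSG_DONTWAIT, &err);
 *	if (skb == NULL)
 *		goto out;
 *
 * On failure the reason (e.g. -EAGAIN for a non-blocking socket whose send
 * buffer is full) is returned through the errcode pointer.
 */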
1334 
1335 static void __lock_sock(struct sock *sk)
1336 {
1337 	DEFINE_WAIT(wait);
1338 
1339 	for (;;) {
1340 		prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
1341 					TASK_UNINTERRUPTIBLE);
1342 		spin_unlock_bh(&sk->sk_lock.slock);
1343 		schedule();
1344 		spin_lock_bh(&sk->sk_lock.slock);
1345 		if (!sock_owned_by_user(sk))
1346 			break;
1347 	}
1348 	finish_wait(&sk->sk_lock.wq, &wait);
1349 }
1350 
1351 static void __release_sock(struct sock *sk)
1352 {
1353 	struct sk_buff *skb = sk->sk_backlog.head;
1354 
1355 	do {
1356 		sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
1357 		bh_unlock_sock(sk);
1358 
1359 		do {
1360 			struct sk_buff *next = skb->next;
1361 
1362 			skb->next = NULL;
1363 			sk->sk_backlog_rcv(sk, skb);
1364 
1365 			/*
1366 			 * We are in process context here with softirqs
1367 			 * disabled, use cond_resched_softirq() to preempt.
1368 			 * This is safe to do because we've taken the backlog
1369 			 * queue private:
1370 			 */
1371 			cond_resched_softirq();
1372 
1373 			skb = next;
1374 		} while (skb != NULL);
1375 
1376 		bh_lock_sock(sk);
1377 	} while ((skb = sk->sk_backlog.head) != NULL);
1378 }
1379 
1380 /**
1381  * sk_wait_data - wait for data to arrive at sk_receive_queue
1382  * @sk:    sock to wait on
1383  * @timeo: for how long
1384  *
1385  * Now socket state including sk->sk_err is changed only under lock,
1386  * hence we may omit checks after joining the wait queue.
1387  * We check the receive queue before schedule() only as an optimization;
1388  * it is very likely that release_sock() added new data.
1389  */
1390 int sk_wait_data(struct sock *sk, long *timeo)
1391 {
1392 	int rc;
1393 	DEFINE_WAIT(wait);
1394 
1395 	prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
1396 	set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1397 	rc = sk_wait_event(sk, timeo, !skb_queue_empty(&sk->sk_receive_queue));
1398 	clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1399 	finish_wait(sk->sk_sleep, &wait);
1400 	return rc;
1401 }
1402 
1403 EXPORT_SYMBOL(sk_wait_data);
1404 
1405 /**
1406  *	__sk_mem_schedule - increase sk_forward_alloc and memory_allocated
1407  *	@sk: socket
1408  *	@size: memory size to allocate
1409  *	@kind: allocation type
1410  *
1411  *	If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
1412  *	rmem allocation. This function assumes that protocols which have
1413  *	memory_pressure use sk_wmem_queued as write buffer accounting.
1414  */
1415 int __sk_mem_schedule(struct sock *sk, int size, int kind)
1416 {
1417 	struct proto *prot = sk->sk_prot;
1418 	int amt = sk_mem_pages(size);
1419 	int allocated;
1420 
1421 	sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
1422 	allocated = atomic_add_return(amt, prot->memory_allocated);
1423 
1424 	/* Under limit. */
1425 	if (allocated <= prot->sysctl_mem[0]) {
1426 		if (prot->memory_pressure && *prot->memory_pressure)
1427 			*prot->memory_pressure = 0;
1428 		return 1;
1429 	}
1430 
1431 	/* Under pressure. */
1432 	if (allocated > prot->sysctl_mem[1])
1433 		if (prot->enter_memory_pressure)
1434 			prot->enter_memory_pressure();
1435 
1436 	/* Over hard limit. */
1437 	if (allocated > prot->sysctl_mem[2])
1438 		goto suppress_allocation;
1439 
1440 	/* guarantee minimum buffer size under pressure */
1441 	if (kind == SK_MEM_RECV) {
1442 		if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
1443 			return 1;
1444 	} else { /* SK_MEM_SEND */
1445 		if (sk->sk_type == SOCK_STREAM) {
1446 			if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
1447 				return 1;
1448 		} else if (atomic_read(&sk->sk_wmem_alloc) <
1449 			   prot->sysctl_wmem[0])
1450 				return 1;
1451 	}
1452 
1453 	if (prot->memory_pressure) {
1454 		if (!*prot->memory_pressure ||
1455 		    prot->sysctl_mem[2] > atomic_read(prot->sockets_allocated) *
1456 		    sk_mem_pages(sk->sk_wmem_queued +
1457 				 atomic_read(&sk->sk_rmem_alloc) +
1458 				 sk->sk_forward_alloc))
1459 			return 1;
1460 	}
1461 
1462 suppress_allocation:
1463 
1464 	if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
1465 		sk_stream_moderate_sndbuf(sk);
1466 
1467 		/* Fail only if socket is _under_ its sndbuf.
1468 		 * In this case we cannot block, so that we have to fail.
1469 		 * In this case we cannot block, so we have to fail.
1470 		if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
1471 			return 1;
1472 	}
1473 
1474 	/* Alas. Undo changes. */
1475 	sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;
1476 	atomic_sub(amt, prot->memory_allocated);
1477 	return 0;
1478 }
1479 
1480 EXPORT_SYMBOL(__sk_mem_schedule);
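
/*
 * Illustrative sketch of the accounting above, assuming SK_MEM_QUANTUM is
 * the 4096-byte page size: a request of size 3000 gives
 * amt = sk_mem_pages(3000) = 1, so sk_forward_alloc grows by 4096 and
 * prot->memory_allocated by one quantum; the remaining 1096 bytes stay
 * available in sk_forward_alloc for later charges.
 */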
1481 
1482 /**
1483  *	__sk_mem_reclaim - reclaim memory_allocated
1484  *	@sk: socket
1485  */
1486 void __sk_mem_reclaim(struct sock *sk)
1487 {
1488 	struct proto *prot = sk->sk_prot;
1489 
1490 	atomic_sub(sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT,
1491 		   prot->memory_allocated);
1492 	sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1;
1493 
1494 	if (prot->memory_pressure && *prot->memory_pressure &&
1495 	    (atomic_read(prot->memory_allocated) < prot->sysctl_mem[0]))
1496 		*prot->memory_pressure = 0;
1497 }
1498 
1499 EXPORT_SYMBOL(__sk_mem_reclaim);
1500 
1501 
1502 /*
1503  * Set of default routines for initialising struct proto_ops when
1504  * the protocol does not support a particular function. In certain
1505  * cases where it makes no sense for a protocol to have a "do nothing"
1506  * function, some default processing is provided.
1507  */
1508 
1509 int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
1510 {
1511 	return -EOPNOTSUPP;
1512 }
1513 
1514 int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
1515 		    int len, int flags)
1516 {
1517 	return -EOPNOTSUPP;
1518 }
1519 
1520 int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
1521 {
1522 	return -EOPNOTSUPP;
1523 }
1524 
1525 int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
1526 {
1527 	return -EOPNOTSUPP;
1528 }
1529 
1530 int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
1531 		    int *len, int peer)
1532 {
1533 	return -EOPNOTSUPP;
1534 }
1535 
1536 unsigned int sock_no_poll(struct file * file, struct socket *sock, poll_table *pt)
1537 {
1538 	return 0;
1539 }
1540 
1541 int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
1542 {
1543 	return -EOPNOTSUPP;
1544 }
1545 
1546 int sock_no_listen(struct socket *sock, int backlog)
1547 {
1548 	return -EOPNOTSUPP;
1549 }
1550 
1551 int sock_no_shutdown(struct socket *sock, int how)
1552 {
1553 	return -EOPNOTSUPP;
1554 }
1555 
1556 int sock_no_setsockopt(struct socket *sock, int level, int optname,
1557 		    char __user *optval, int optlen)
1558 {
1559 	return -EOPNOTSUPP;
1560 }
1561 
1562 int sock_no_getsockopt(struct socket *sock, int level, int optname,
1563 		    char __user *optval, int __user *optlen)
1564 {
1565 	return -EOPNOTSUPP;
1566 }
1567 
1568 int sock_no_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
1569 		    size_t len)
1570 {
1571 	return -EOPNOTSUPP;
1572 }
1573 
1574 int sock_no_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
1575 		    size_t len, int flags)
1576 {
1577 	return -EOPNOTSUPP;
1578 }
1579 
1580 int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
1581 {
1582 	/* Mirror missing mmap method error code */
1583 	return -ENODEV;
1584 }
1585 
1586 ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
1587 {
1588 	ssize_t res;
1589 	struct msghdr msg = {.msg_flags = flags};
1590 	struct kvec iov;
1591 	char *kaddr = kmap(page);
1592 	iov.iov_base = kaddr + offset;
1593 	iov.iov_len = size;
1594 	res = kernel_sendmsg(sock, &msg, &iov, 1, size);
1595 	kunmap(page);
1596 	return res;
1597 }
1598 
1599 /*
1600  *	Default Socket Callbacks
1601  */
1602 
1603 static void sock_def_wakeup(struct sock *sk)
1604 {
1605 	read_lock(&sk->sk_callback_lock);
1606 	if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
1607 		wake_up_interruptible_all(sk->sk_sleep);
1608 	read_unlock(&sk->sk_callback_lock);
1609 }
1610 
1611 static void sock_def_error_report(struct sock *sk)
1612 {
1613 	read_lock(&sk->sk_callback_lock);
1614 	if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
1615 		wake_up_interruptible(sk->sk_sleep);
1616 	sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
1617 	read_unlock(&sk->sk_callback_lock);
1618 }
1619 
1620 static void sock_def_readable(struct sock *sk, int len)
1621 {
1622 	read_lock(&sk->sk_callback_lock);
1623 	if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
1624 		wake_up_interruptible(sk->sk_sleep);
1625 	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
1626 	read_unlock(&sk->sk_callback_lock);
1627 }
1628 
1629 static void sock_def_write_space(struct sock *sk)
1630 {
1631 	read_lock(&sk->sk_callback_lock);
1632 
1633 	/* Do not wake up a writer until he can make "significant"
1634 	 * progress.  --DaveM
1635 	 */
1636 	if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
1637 		if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
1638 			wake_up_interruptible(sk->sk_sleep);
1639 
1640 		/* Should agree with poll, otherwise some programs break */
1641 		if (sock_writeable(sk))
1642 			sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
1643 	}
1644 
1645 	read_unlock(&sk->sk_callback_lock);
1646 }
1647 
1648 static void sock_def_destruct(struct sock *sk)
1649 {
1650 	kfree(sk->sk_protinfo);
1651 }
1652 
1653 void sk_send_sigurg(struct sock *sk)
1654 {
1655 	if (sk->sk_socket && sk->sk_socket->file)
1656 		if (send_sigurg(&sk->sk_socket->file->f_owner))
1657 			sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
1658 }
1659 
1660 void sk_reset_timer(struct sock *sk, struct timer_list* timer,
1661 		    unsigned long expires)
1662 {
1663 	if (!mod_timer(timer, expires))
1664 		sock_hold(sk);
1665 }
1666 
1667 EXPORT_SYMBOL(sk_reset_timer);
1668 
1669 void sk_stop_timer(struct sock *sk, struct timer_list* timer)
1670 {
1671 	if (timer_pending(timer) && del_timer(timer))
1672 		__sock_put(sk);
1673 }
1674 
1675 EXPORT_SYMBOL(sk_stop_timer);
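
/*
 * Illustrative sketch of using the two helpers above (hypothetical timeout
 * value): both keep the socket refcount in step with the pending timer, so
 * a protocol typically does
 *
 *	sk_reset_timer(sk, &sk->sk_timer, jiffies + some_timeout);
 *	...
 *	sk_stop_timer(sk, &sk->sk_timer);
 *
 * and drops the extra reference via sock_put() in the timer handler when
 * the timer actually expires.
 */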
1676 
1677 void sock_init_data(struct socket *sock, struct sock *sk)
1678 {
1679 	skb_queue_head_init(&sk->sk_receive_queue);
1680 	skb_queue_head_init(&sk->sk_write_queue);
1681 	skb_queue_head_init(&sk->sk_error_queue);
1682 #ifdef CONFIG_NET_DMA
1683 	skb_queue_head_init(&sk->sk_async_wait_queue);
1684 #endif
1685 
1686 	sk->sk_send_head	=	NULL;
1687 
1688 	init_timer(&sk->sk_timer);
1689 
1690 	sk->sk_allocation	=	GFP_KERNEL;
1691 	sk->sk_rcvbuf		=	sysctl_rmem_default;
1692 	sk->sk_sndbuf		=	sysctl_wmem_default;
1693 	sk->sk_state		=	TCP_CLOSE;
1694 	sk->sk_socket		=	sock;
1695 
1696 	sock_set_flag(sk, SOCK_ZAPPED);
1697 
1698 	if (sock) {
1699 		sk->sk_type	=	sock->type;
1700 		sk->sk_sleep	=	&sock->wait;
1701 		sock->sk	=	sk;
1702 	} else
1703 		sk->sk_sleep	=	NULL;
1704 
1705 	rwlock_init(&sk->sk_dst_lock);
1706 	rwlock_init(&sk->sk_callback_lock);
1707 	lockdep_set_class_and_name(&sk->sk_callback_lock,
1708 			af_callback_keys + sk->sk_family,
1709 			af_family_clock_key_strings[sk->sk_family]);
1710 
1711 	sk->sk_state_change	=	sock_def_wakeup;
1712 	sk->sk_data_ready	=	sock_def_readable;
1713 	sk->sk_write_space	=	sock_def_write_space;
1714 	sk->sk_error_report	=	sock_def_error_report;
1715 	sk->sk_destruct		=	sock_def_destruct;
1716 
1717 	sk->sk_sndmsg_page	=	NULL;
1718 	sk->sk_sndmsg_off	=	0;
1719 
1720 	sk->sk_peercred.pid 	=	0;
1721 	sk->sk_peercred.uid	=	-1;
1722 	sk->sk_peercred.gid	=	-1;
1723 	sk->sk_write_pending	=	0;
1724 	sk->sk_rcvlowat		=	1;
1725 	sk->sk_rcvtimeo		=	MAX_SCHEDULE_TIMEOUT;
1726 	sk->sk_sndtimeo		=	MAX_SCHEDULE_TIMEOUT;
1727 
1728 	sk->sk_stamp = ktime_set(-1L, -1L);
1729 
1730 	atomic_set(&sk->sk_refcnt, 1);
1731 	atomic_set(&sk->sk_drops, 0);
1732 }
1733 
1734 void fastcall lock_sock_nested(struct sock *sk, int subclass)
1735 {
1736 	might_sleep();
1737 	spin_lock_bh(&sk->sk_lock.slock);
1738 	if (sk->sk_lock.owned)
1739 		__lock_sock(sk);
1740 	sk->sk_lock.owned = 1;
1741 	spin_unlock(&sk->sk_lock.slock);
1742 	/*
1743 	 * The sk_lock has mutex_lock() semantics here:
1744 	 */
1745 	mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
1746 	local_bh_enable();
1747 }
1748 
1749 EXPORT_SYMBOL(lock_sock_nested);
1750 
1751 void fastcall release_sock(struct sock *sk)
1752 {
1753 	/*
1754 	 * The sk_lock has mutex_unlock() semantics:
1755 	 */
1756 	mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
1757 
1758 	spin_lock_bh(&sk->sk_lock.slock);
1759 	if (sk->sk_backlog.tail)
1760 		__release_sock(sk);
1761 	sk->sk_lock.owned = 0;
1762 	if (waitqueue_active(&sk->sk_lock.wq))
1763 		wake_up(&sk->sk_lock.wq);
1764 	spin_unlock_bh(&sk->sk_lock.slock);
1765 }
1766 EXPORT_SYMBOL(release_sock);
1767 
1768 int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
1769 {
1770 	struct timeval tv;
1771 	if (!sock_flag(sk, SOCK_TIMESTAMP))
1772 		sock_enable_timestamp(sk);
1773 	tv = ktime_to_timeval(sk->sk_stamp);
1774 	if (tv.tv_sec == -1)
1775 		return -ENOENT;
1776 	if (tv.tv_sec == 0) {
1777 		sk->sk_stamp = ktime_get_real();
1778 		tv = ktime_to_timeval(sk->sk_stamp);
1779 	}
1780 	return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
1781 }
1782 EXPORT_SYMBOL(sock_get_timestamp);
1783 
1784 int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
1785 {
1786 	struct timespec ts;
1787 	if (!sock_flag(sk, SOCK_TIMESTAMP))
1788 		sock_enable_timestamp(sk);
1789 	ts = ktime_to_timespec(sk->sk_stamp);
1790 	if (ts.tv_sec == -1)
1791 		return -ENOENT;
1792 	if (ts.tv_sec == 0) {
1793 		sk->sk_stamp = ktime_get_real();
1794 		ts = ktime_to_timespec(sk->sk_stamp);
1795 	}
1796 	return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
1797 }
1798 EXPORT_SYMBOL(sock_get_timestampns);
1799 
1800 void sock_enable_timestamp(struct sock *sk)
1801 {
1802 	if (!sock_flag(sk, SOCK_TIMESTAMP)) {
1803 		sock_set_flag(sk, SOCK_TIMESTAMP);
1804 		net_enable_timestamp();
1805 	}
1806 }
1807 
1808 /*
1809  *	Get a socket option on a socket.
1810  *
1811  *	FIX: POSIX 1003.1g is very ambiguous here. It states that
1812  *	asynchronous errors should be reported by getsockopt. We assume
1813  *	this means if you specify SO_ERROR (otherwise what's the point of it).
1814  */
1815 int sock_common_getsockopt(struct socket *sock, int level, int optname,
1816 			   char __user *optval, int __user *optlen)
1817 {
1818 	struct sock *sk = sock->sk;
1819 
1820 	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
1821 }
1822 
1823 EXPORT_SYMBOL(sock_common_getsockopt);
1824 
1825 #ifdef CONFIG_COMPAT
1826 int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
1827 				  char __user *optval, int __user *optlen)
1828 {
1829 	struct sock *sk = sock->sk;
1830 
1831 	if (sk->sk_prot->compat_getsockopt != NULL)
1832 		return sk->sk_prot->compat_getsockopt(sk, level, optname,
1833 						      optval, optlen);
1834 	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
1835 }
1836 EXPORT_SYMBOL(compat_sock_common_getsockopt);
1837 #endif
1838 
1839 int sock_common_recvmsg(struct kiocb *iocb, struct socket *sock,
1840 			struct msghdr *msg, size_t size, int flags)
1841 {
1842 	struct sock *sk = sock->sk;
1843 	int addr_len = 0;
1844 	int err;
1845 
1846 	err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT,
1847 				   flags & ~MSG_DONTWAIT, &addr_len);
1848 	if (err >= 0)
1849 		msg->msg_namelen = addr_len;
1850 	return err;
1851 }
1852 
1853 EXPORT_SYMBOL(sock_common_recvmsg);
1854 
1855 /*
1856  *	Set socket options on an inet socket.
1857  */
1858 int sock_common_setsockopt(struct socket *sock, int level, int optname,
1859 			   char __user *optval, int optlen)
1860 {
1861 	struct sock *sk = sock->sk;
1862 
1863 	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
1864 }
1865 
1866 EXPORT_SYMBOL(sock_common_setsockopt);
1867 
1868 #ifdef CONFIG_COMPAT
1869 int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
1870 				  char __user *optval, int optlen)
1871 {
1872 	struct sock *sk = sock->sk;
1873 
1874 	if (sk->sk_prot->compat_setsockopt != NULL)
1875 		return sk->sk_prot->compat_setsockopt(sk, level, optname,
1876 						      optval, optlen);
1877 	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
1878 }
1879 EXPORT_SYMBOL(compat_sock_common_setsockopt);
1880 #endif
1881 
1882 void sk_common_release(struct sock *sk)
1883 {
1884 	if (sk->sk_prot->destroy)
1885 		sk->sk_prot->destroy(sk);
1886 
1887 	/*
1888 	 * Observation: when sk_common_release is called, processes have
1889 	 * no access to the socket, but the network stack still does.
1890 	 * Step one, detach it from networking:
1891 	 *
1892 	 * A. Remove from hash tables.
1893 	 */
1894 
1895 	sk->sk_prot->unhash(sk);
1896 
1897 	/*
1898 	 * At this point the socket cannot receive new packets, but some may
1899 	 * still be in flight because a CPU running the receiver did its hash
1900 	 * table lookup before we unhashed the socket. Those packets will reach
1901 	 * the receive queue and be purged by the socket destructor.
1902 	 *
1903 	 * We also still have packets pending on the receive queue and, probably,
1904 	 * our own packets waiting in device queues. sock_destroy will drain the
1905 	 * receive queue, but transmitted packets will delay socket destruction
1906 	 * until the last reference is released.
1907 	 */
1908 
1909 	sock_orphan(sk);
1910 
1911 	xfrm_sk_free_policy(sk);
1912 
1913 	sk_refcnt_debug_release(sk);
1914 	sock_put(sk);
1915 }
1916 
1917 EXPORT_SYMBOL(sk_common_release);
1918 
1919 static DEFINE_RWLOCK(proto_list_lock);
1920 static LIST_HEAD(proto_list);
1921 
1922 int proto_register(struct proto *prot, int alloc_slab)
1923 {
1924 	char *request_sock_slab_name = NULL;
1925 	char *timewait_sock_slab_name;
1926 
1927 	if (sock_prot_inuse_init(prot) != 0) {
1928 		printk(KERN_CRIT "%s: Can't alloc inuse counters!\n", prot->name);
1929 		goto out;
1930 	}
1931 
1932 	if (alloc_slab) {
1933 		prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
1934 					       SLAB_HWCACHE_ALIGN, NULL);
1935 
1936 		if (prot->slab == NULL) {
1937 			printk(KERN_CRIT "%s: Can't create sock SLAB cache!\n",
1938 			       prot->name);
1939 			goto out_free_inuse;
1940 		}
1941 
1942 		if (prot->rsk_prot != NULL) {
1943 			static const char mask[] = "request_sock_%s";
1944 
1945 			request_sock_slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL);
1946 			if (request_sock_slab_name == NULL)
1947 				goto out_free_sock_slab;
1948 
1949 			sprintf(request_sock_slab_name, mask, prot->name);
1950 			prot->rsk_prot->slab = kmem_cache_create(request_sock_slab_name,
1951 								 prot->rsk_prot->obj_size, 0,
1952 								 SLAB_HWCACHE_ALIGN, NULL);
1953 
1954 			if (prot->rsk_prot->slab == NULL) {
1955 				printk(KERN_CRIT "%s: Can't create request sock SLAB cache!\n",
1956 				       prot->name);
1957 				goto out_free_request_sock_slab_name;
1958 			}
1959 		}
1960 
1961 		if (prot->twsk_prot != NULL) {
1962 			static const char mask[] = "tw_sock_%s";
1963 
1964 			timewait_sock_slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL);
1965 
1966 			if (timewait_sock_slab_name == NULL)
1967 				goto out_free_request_sock_slab;
1968 
1969 			sprintf(timewait_sock_slab_name, mask, prot->name);
1970 			prot->twsk_prot->twsk_slab =
1971 				kmem_cache_create(timewait_sock_slab_name,
1972 						  prot->twsk_prot->twsk_obj_size,
1973 						  0, SLAB_HWCACHE_ALIGN,
1974 						  NULL);
1975 			if (prot->twsk_prot->twsk_slab == NULL)
1976 				goto out_free_timewait_sock_slab_name;
1977 		}
1978 	}
1979 
1980 	write_lock(&proto_list_lock);
1981 	list_add(&prot->node, &proto_list);
1982 	write_unlock(&proto_list_lock);
1983 	return 0;
1984 
1985 out_free_timewait_sock_slab_name:
1986 	kfree(timewait_sock_slab_name);
1987 out_free_request_sock_slab:
1988 	if (prot->rsk_prot && prot->rsk_prot->slab) {
1989 		kmem_cache_destroy(prot->rsk_prot->slab);
1990 		prot->rsk_prot->slab = NULL;
1991 	}
1992 out_free_request_sock_slab_name:
1993 	kfree(request_sock_slab_name);
1994 out_free_sock_slab:
1995 	kmem_cache_destroy(prot->slab);
1996 	prot->slab = NULL;
1997 out_free_inuse:
1998 	sock_prot_inuse_free(prot);
1999 out:
2000 	return -ENOBUFS;
2001 }
2002 
2003 EXPORT_SYMBOL(proto_register);
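/*
 * Illustrative sketch: a protocol registers its struct proto once, usually
 * from its module init routine, before exposing any proto_ops to the
 * socket layer.  All names below are made up for illustration:
 *
 *	static struct proto example_proto = {
 *		.name		= "EXAMPLE",
 *		.owner		= THIS_MODULE,
 *		.obj_size	= sizeof(struct example_sock),
 *	};
 *
 *	static int __init example_init(void)
 *	{
 *		return proto_register(&example_proto, 1);
 *	}
 *
 * Passing alloc_slab == 1 asks proto_register() to create the slab cache
 * (plus request_sock and timewait caches when rsk_prot/twsk_prot are set)
 * from which this protocol's sockets will be allocated.
 */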
2004 
2005 void proto_unregister(struct proto *prot)
2006 {
2007 	write_lock(&proto_list_lock);
2008 	list_del(&prot->node);
2009 	write_unlock(&proto_list_lock);
2010 
2011 	sock_prot_inuse_free(prot);
2012 
2013 	if (prot->slab != NULL) {
2014 		kmem_cache_destroy(prot->slab);
2015 		prot->slab = NULL;
2016 	}
2017 
2018 	if (prot->rsk_prot != NULL && prot->rsk_prot->slab != NULL) {
2019 		const char *name = kmem_cache_name(prot->rsk_prot->slab);
2020 
2021 		kmem_cache_destroy(prot->rsk_prot->slab);
2022 		kfree(name);
2023 		prot->rsk_prot->slab = NULL;
2024 	}
2025 
2026 	if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
2027 		const char *name = kmem_cache_name(prot->twsk_prot->twsk_slab);
2028 
2029 		kmem_cache_destroy(prot->twsk_prot->twsk_slab);
2030 		kfree(name);
2031 		prot->twsk_prot->twsk_slab = NULL;
2032 	}
2033 }
2034 
2035 EXPORT_SYMBOL(proto_unregister);
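/*
 * Illustrative sketch, matching the registration example above: the module
 * exit path simply unregisters the proto, which drops it from proto_list
 * and releases the slab caches and inuse counters set up at registration:
 *
 *	static void __exit example_exit(void)
 *	{
 *		proto_unregister(&example_proto);
 *	}
 */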
2036 
2037 #ifdef CONFIG_PROC_FS
2038 static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
2039 	__acquires(proto_list_lock)
2040 {
2041 	read_lock(&proto_list_lock);
2042 	return seq_list_start_head(&proto_list, *pos);
2043 }
2044 
2045 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2046 {
2047 	return seq_list_next(v, &proto_list, pos);
2048 }
2049 
2050 static void proto_seq_stop(struct seq_file *seq, void *v)
2051 	__releases(proto_list_lock)
2052 {
2053 	read_unlock(&proto_list_lock);
2054 }
2055 
2056 static char proto_method_implemented(const void *method)
2057 {
2058 	return method == NULL ? 'n' : 'y';
2059 }
2060 
2061 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
2062 {
2063 	seq_printf(seq, "%-9s %4u %6d  %6d   %-3s %6u   %-3s  %-10s "
2064 			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
2065 		   proto->name,
2066 		   proto->obj_size,
2067 		   proto->sockets_allocated != NULL ? atomic_read(proto->sockets_allocated) : -1,
2068 		   proto->memory_allocated != NULL ? atomic_read(proto->memory_allocated) : -1,
2069 		   proto->memory_pressure != NULL ? *proto->memory_pressure ? "yes" : "no" : "NI",
2070 		   proto->max_header,
2071 		   proto->slab == NULL ? "no" : "yes",
2072 		   module_name(proto->owner),
2073 		   proto_method_implemented(proto->close),
2074 		   proto_method_implemented(proto->connect),
2075 		   proto_method_implemented(proto->disconnect),
2076 		   proto_method_implemented(proto->accept),
2077 		   proto_method_implemented(proto->ioctl),
2078 		   proto_method_implemented(proto->init),
2079 		   proto_method_implemented(proto->destroy),
2080 		   proto_method_implemented(proto->shutdown),
2081 		   proto_method_implemented(proto->setsockopt),
2082 		   proto_method_implemented(proto->getsockopt),
2083 		   proto_method_implemented(proto->sendmsg),
2084 		   proto_method_implemented(proto->recvmsg),
2085 		   proto_method_implemented(proto->sendpage),
2086 		   proto_method_implemented(proto->bind),
2087 		   proto_method_implemented(proto->backlog_rcv),
2088 		   proto_method_implemented(proto->hash),
2089 		   proto_method_implemented(proto->unhash),
2090 		   proto_method_implemented(proto->get_port),
2091 		   proto_method_implemented(proto->enter_memory_pressure));
2092 }
2093 
2094 static int proto_seq_show(struct seq_file *seq, void *v)
2095 {
2096 	if (v == &proto_list)
2097 		seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
2098 			   "protocol",
2099 			   "size",
2100 			   "sockets",
2101 			   "memory",
2102 			   "press",
2103 			   "maxhdr",
2104 			   "slab",
2105 			   "module",
2106 			   "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
2107 	else
2108 		proto_seq_printf(seq, list_entry(v, struct proto, node));
2109 	return 0;
2110 }
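/*
 * The abbreviated columns in the header above correspond, in order, to the
 * proto_method_implemented() flags printed by proto_seq_printf():
 * cl=close, co=connect, di=disconnect, ac=accept, io=ioctl, in=init,
 * de=destroy, sh=shutdown, ss=setsockopt, gs=getsockopt, se=sendmsg,
 * re=recvmsg, sp=sendpage, bi=bind, br=backlog_rcv, ha=hash, uh=unhash,
 * gp=get_port, em=enter_memory_pressure.
 */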
2111 
2112 static const struct seq_operations proto_seq_ops = {
2113 	.start  = proto_seq_start,
2114 	.next   = proto_seq_next,
2115 	.stop   = proto_seq_stop,
2116 	.show   = proto_seq_show,
2117 };
2118 
2119 static int proto_seq_open(struct inode *inode, struct file *file)
2120 {
2121 	return seq_open(file, &proto_seq_ops);
2122 }
2123 
2124 static const struct file_operations proto_seq_fops = {
2125 	.owner		= THIS_MODULE,
2126 	.open		= proto_seq_open,
2127 	.read		= seq_read,
2128 	.llseek		= seq_lseek,
2129 	.release	= seq_release,
2130 };
2131 
2132 static int __init proto_init(void)
2133 {
2134 	/* register /proc/net/protocols */
2135 	return proc_net_fops_create(&init_net, "protocols", S_IRUGO, &proto_seq_fops) == NULL ? -ENOBUFS : 0;
2136 }
2137 
2138 subsys_initcall(proto_init);
2139 
2140 #endif /* PROC_FS */
2141 
2142 EXPORT_SYMBOL(sk_alloc);
2143 EXPORT_SYMBOL(sk_free);
2144 EXPORT_SYMBOL(sk_send_sigurg);
2145 EXPORT_SYMBOL(sock_alloc_send_skb);
2146 EXPORT_SYMBOL(sock_init_data);
2147 EXPORT_SYMBOL(sock_kfree_s);
2148 EXPORT_SYMBOL(sock_kmalloc);
2149 EXPORT_SYMBOL(sock_no_accept);
2150 EXPORT_SYMBOL(sock_no_bind);
2151 EXPORT_SYMBOL(sock_no_connect);
2152 EXPORT_SYMBOL(sock_no_getname);
2153 EXPORT_SYMBOL(sock_no_getsockopt);
2154 EXPORT_SYMBOL(sock_no_ioctl);
2155 EXPORT_SYMBOL(sock_no_listen);
2156 EXPORT_SYMBOL(sock_no_mmap);
2157 EXPORT_SYMBOL(sock_no_poll);
2158 EXPORT_SYMBOL(sock_no_recvmsg);
2159 EXPORT_SYMBOL(sock_no_sendmsg);
2160 EXPORT_SYMBOL(sock_no_sendpage);
2161 EXPORT_SYMBOL(sock_no_setsockopt);
2162 EXPORT_SYMBOL(sock_no_shutdown);
2163 EXPORT_SYMBOL(sock_no_socketpair);
2164 EXPORT_SYMBOL(sock_rfree);
2165 EXPORT_SYMBOL(sock_setsockopt);
2166 EXPORT_SYMBOL(sock_wfree);
2167 EXPORT_SYMBOL(sock_wmalloc);
2168 EXPORT_SYMBOL(sock_i_uid);
2169 EXPORT_SYMBOL(sock_i_ino);
2170 EXPORT_SYMBOL(sysctl_optmem_max);
2171