xref: /openbmc/linux/net/core/sock.c (revision f42b3800)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		Generic socket support routines. Memory allocators, socket lock/release
7  *		handler for protocols to use and generic option handler.
8  *
9  *
10  * Version:	$Id: sock.c,v 1.117 2002/02/01 22:01:03 davem Exp $
11  *
12  * Authors:	Ross Biro
13  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
14  *		Florian La Roche, <flla@stud.uni-sb.de>
15  *		Alan Cox, <A.Cox@swansea.ac.uk>
16  *
17  * Fixes:
18  *		Alan Cox	: 	Numerous verify_area() problems
19  *		Alan Cox	:	Connecting on a connecting socket
20  *					now returns an error for tcp.
21  *		Alan Cox	:	sock->protocol is set correctly.
22  *					and is not sometimes left as 0.
23  *		Alan Cox	:	connect handles icmp errors on a
24  *					connect properly. Unfortunately there
25  *					is a restart syscall nasty there. I
26  *					can't match BSD without hacking the C
27  *					library. Ideas urgently sought!
28  *		Alan Cox	:	Disallow bind() to addresses that are
29  *					not ours - especially broadcast ones!!
30  *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
31  *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
32  *					instead they leave that for the DESTROY timer.
33  *		Alan Cox	:	Clean up error flag in accept
34  *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
35  *					was buggy. Put a remove_sock() in the handler
36  *					for memory when we hit 0. Also altered the timer
37  *					code. The ACK stuff can wait and needs major
38  *					TCP layer surgery.
39  *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
40  *					and fixed timer/inet_bh race.
41  *		Alan Cox	:	Added zapped flag for TCP
42  *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
43  *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
44  *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
45  *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
46  *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
47  *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
48  *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
49  *	Pauline Middelink	:	identd support
50  *		Alan Cox	:	Fixed connect() taking signals I think.
51  *		Alan Cox	:	SO_LINGER supported
52  *		Alan Cox	:	Error reporting fixes
53  *		Anonymous	:	inet_create tidied up (sk->reuse setting)
54  *		Alan Cox	:	inet sockets don't set sk->type!
55  *		Alan Cox	:	Split socket option code
56  *		Alan Cox	:	Callbacks
57  *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
58  *		Alex		:	Removed restriction on inet fioctl
59  *		Alan Cox	:	Splitting INET from NET core
60  *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
61  *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
62  *		Alan Cox	:	Split IP from generic code
63  *		Alan Cox	:	New kfree_skbmem()
64  *		Alan Cox	:	Make SO_DEBUG superuser only.
65  *		Alan Cox	:	Allow anyone to clear SO_DEBUG
66  *					(compatibility fix)
67  *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
68  *		Alan Cox	:	Allocator for a socket is settable.
69  *		Alan Cox	:	SO_ERROR includes soft errors.
70  *		Alan Cox	:	Allow NULL arguments on some SO_ opts
71  *		Alan Cox	: 	Generic socket allocation to make hooks
72  *					easier (suggested by Craig Metz).
73  *		Michael Pall	:	SO_ERROR returns positive errno again
74  *              Steve Whitehouse:       Added default destructor to free
75  *                                      protocol private data.
76  *              Steve Whitehouse:       Added various other default routines
77  *                                      common to several socket families.
78  *              Chris Evans     :       Call suser() check last on F_SETOWN
79  *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
80  *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
81  *		Andi Kleen	:	Fix write_space callback
82  *		Chris Evans	:	Security fixes - signedness again
83  *		Arnaldo C. Melo :       cleanups, use skb_queue_purge
84  *
85  * To Fix:
86  *
87  *
88  *		This program is free software; you can redistribute it and/or
89  *		modify it under the terms of the GNU General Public License
90  *		as published by the Free Software Foundation; either version
91  *		2 of the License, or (at your option) any later version.
92  */
93 
94 #include <linux/capability.h>
95 #include <linux/errno.h>
96 #include <linux/types.h>
97 #include <linux/socket.h>
98 #include <linux/in.h>
99 #include <linux/kernel.h>
100 #include <linux/module.h>
101 #include <linux/proc_fs.h>
102 #include <linux/seq_file.h>
103 #include <linux/sched.h>
104 #include <linux/timer.h>
105 #include <linux/string.h>
106 #include <linux/sockios.h>
107 #include <linux/net.h>
108 #include <linux/mm.h>
109 #include <linux/slab.h>
110 #include <linux/interrupt.h>
111 #include <linux/poll.h>
112 #include <linux/tcp.h>
113 #include <linux/init.h>
114 #include <linux/highmem.h>
115 
116 #include <asm/uaccess.h>
117 #include <asm/system.h>
118 
119 #include <linux/netdevice.h>
120 #include <net/protocol.h>
121 #include <linux/skbuff.h>
122 #include <net/net_namespace.h>
123 #include <net/request_sock.h>
124 #include <net/sock.h>
125 #include <net/xfrm.h>
126 #include <linux/ipsec.h>
127 
128 #include <linux/filter.h>
129 
130 #ifdef CONFIG_INET
131 #include <net/tcp.h>
132 #endif
133 
134 /*
135  * Each address family might have different locking rules, so we have
136  * one slock key per address family:
137  */
138 static struct lock_class_key af_family_keys[AF_MAX];
139 static struct lock_class_key af_family_slock_keys[AF_MAX];
140 
141 #ifdef CONFIG_DEBUG_LOCK_ALLOC
142 /*
143  * Make lock validator output more readable. (we pre-construct these
144  * strings build-time, so that runtime initialization of socket
145  * locks is fast):
146  */
147 static const char *af_family_key_strings[AF_MAX+1] = {
148   "sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX"     , "sk_lock-AF_INET"     ,
149   "sk_lock-AF_AX25"  , "sk_lock-AF_IPX"      , "sk_lock-AF_APPLETALK",
150   "sk_lock-AF_NETROM", "sk_lock-AF_BRIDGE"   , "sk_lock-AF_ATMPVC"   ,
151   "sk_lock-AF_X25"   , "sk_lock-AF_INET6"    , "sk_lock-AF_ROSE"     ,
152   "sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI"  , "sk_lock-AF_SECURITY" ,
153   "sk_lock-AF_KEY"   , "sk_lock-AF_NETLINK"  , "sk_lock-AF_PACKET"   ,
154   "sk_lock-AF_ASH"   , "sk_lock-AF_ECONET"   , "sk_lock-AF_ATMSVC"   ,
155   "sk_lock-21"       , "sk_lock-AF_SNA"      , "sk_lock-AF_IRDA"     ,
156   "sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE"  , "sk_lock-AF_LLC"      ,
157   "sk_lock-27"       , "sk_lock-28"          , "sk_lock-AF_CAN"      ,
158   "sk_lock-AF_TIPC"  , "sk_lock-AF_BLUETOOTH", "sk_lock-AF_IUCV"     ,
159   "sk_lock-AF_RXRPC" , "sk_lock-AF_MAX"
160 };
161 static const char *af_family_slock_key_strings[AF_MAX+1] = {
162   "slock-AF_UNSPEC", "slock-AF_UNIX"     , "slock-AF_INET"     ,
163   "slock-AF_AX25"  , "slock-AF_IPX"      , "slock-AF_APPLETALK",
164   "slock-AF_NETROM", "slock-AF_BRIDGE"   , "slock-AF_ATMPVC"   ,
165   "slock-AF_X25"   , "slock-AF_INET6"    , "slock-AF_ROSE"     ,
166   "slock-AF_DECnet", "slock-AF_NETBEUI"  , "slock-AF_SECURITY" ,
167   "slock-AF_KEY"   , "slock-AF_NETLINK"  , "slock-AF_PACKET"   ,
168   "slock-AF_ASH"   , "slock-AF_ECONET"   , "slock-AF_ATMSVC"   ,
169   "slock-21"       , "slock-AF_SNA"      , "slock-AF_IRDA"     ,
170   "slock-AF_PPPOX" , "slock-AF_WANPIPE"  , "slock-AF_LLC"      ,
171   "slock-27"       , "slock-28"          , "slock-AF_CAN"      ,
172   "slock-AF_TIPC"  , "slock-AF_BLUETOOTH", "slock-AF_IUCV"     ,
173   "slock-AF_RXRPC" , "slock-AF_MAX"
174 };
175 static const char *af_family_clock_key_strings[AF_MAX+1] = {
176   "clock-AF_UNSPEC", "clock-AF_UNIX"     , "clock-AF_INET"     ,
177   "clock-AF_AX25"  , "clock-AF_IPX"      , "clock-AF_APPLETALK",
178   "clock-AF_NETROM", "clock-AF_BRIDGE"   , "clock-AF_ATMPVC"   ,
179   "clock-AF_X25"   , "clock-AF_INET6"    , "clock-AF_ROSE"     ,
180   "clock-AF_DECnet", "clock-AF_NETBEUI"  , "clock-AF_SECURITY" ,
181   "clock-AF_KEY"   , "clock-AF_NETLINK"  , "clock-AF_PACKET"   ,
182   "clock-AF_ASH"   , "clock-AF_ECONET"   , "clock-AF_ATMSVC"   ,
183   "clock-21"       , "clock-AF_SNA"      , "clock-AF_IRDA"     ,
184   "clock-AF_PPPOX" , "clock-AF_WANPIPE"  , "clock-AF_LLC"      ,
185   "clock-27"       , "clock-28"          , "clock-AF_CAN"      ,
186   "clock-AF_TIPC"  , "clock-AF_BLUETOOTH", "clock-AF_IUCV"     ,
187   "clock-AF_RXRPC" , "clock-AF_MAX"
188 };
189 #endif
190 
191 /*
192  * sk_callback_lock locking rules are per-address-family,
193  * so split the lock classes by using a per-AF key:
194  */
195 static struct lock_class_key af_callback_keys[AF_MAX];
196 
197 /* Take into consideration the size of the struct sk_buff overhead in the
198  * determination of these values, since that is non-constant across
199  * platforms.  This makes socket queueing behavior and performance
200  * not depend upon such differences.
201  */
202 #define _SK_MEM_PACKETS		256
203 #define _SK_MEM_OVERHEAD	(sizeof(struct sk_buff) + 256)
204 #define SK_WMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
205 #define SK_RMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
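
/*
 * Editorial example (illustrative figures only, since sizeof(struct
 * sk_buff) varies by architecture and config): with a ~200 byte sk_buff,
 * _SK_MEM_OVERHEAD comes to ~456 bytes, so SK_WMEM_MAX and SK_RMEM_MAX
 * default to roughly 456 * 256 = ~114 KB per socket.
 */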
206 
207 /* Run time adjustable parameters. */
208 __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
209 __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
210 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
211 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
212 
213 /* Maximal space eaten by iovec or ancillary data plus some space */
214 int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
215 
216 static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
217 {
218 	struct timeval tv;
219 
220 	if (optlen < sizeof(tv))
221 		return -EINVAL;
222 	if (copy_from_user(&tv, optval, sizeof(tv)))
223 		return -EFAULT;
224 	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
225 		return -EDOM;
226 
227 	if (tv.tv_sec < 0) {
228 		static int warned __read_mostly;
229 
230 		*timeo_p = 0;
231 		if (warned < 10 && net_ratelimit()) {
232 			warned++;
233 			printk(KERN_INFO "sock_set_timeout: `%s' (pid %d) "
234 			       "tries to set negative timeout\n", current->comm, task_pid_nr(current));
235 		}
236 		return 0;
237 	}
238 	*timeo_p = MAX_SCHEDULE_TIMEOUT;
239 	if (tv.tv_sec == 0 && tv.tv_usec == 0)
240 		return 0;
241 	if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
242 		*timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
243 	return 0;
244 }
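
/*
 * Illustrative userspace caller (not part of this file) that reaches
 * sock_set_timeout() via SO_RCVTIMEO:
 *
 *	struct timeval tv = { .tv_sec = 5, .tv_usec = 0 };
 *	setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv));
 *
 * A zero timeval selects MAX_SCHEDULE_TIMEOUT, i.e. "block forever".
 */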
245 
246 static void sock_warn_obsolete_bsdism(const char *name)
247 {
248 	static int warned;
249 	static char warncomm[TASK_COMM_LEN];
250 	if (strcmp(warncomm, current->comm) && warned < 5) {
251 		strcpy(warncomm,  current->comm);
252 		printk(KERN_WARNING "process `%s' is using obsolete "
253 		       "%s SO_BSDCOMPAT\n", warncomm, name);
254 		warned++;
255 	}
256 }
257 
258 static void sock_disable_timestamp(struct sock *sk)
259 {
260 	if (sock_flag(sk, SOCK_TIMESTAMP)) {
261 		sock_reset_flag(sk, SOCK_TIMESTAMP);
262 		net_disable_timestamp();
263 	}
264 }
265 
266 
267 int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
268 {
269 	int err = 0;
270 	int skb_len;
271 
272 	/* Cast sk->sk_rcvbuf to unsigned... It's pointless, but reduces
273 	   number of warnings when compiling with -W --ANK
274 	 */
275 	if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
276 	    (unsigned)sk->sk_rcvbuf) {
277 		err = -ENOMEM;
278 		goto out;
279 	}
280 
281 	err = sk_filter(sk, skb);
282 	if (err)
283 		goto out;
284 
285 	if (!sk_rmem_schedule(sk, skb->truesize)) {
286 		err = -ENOBUFS;
287 		goto out;
288 	}
289 
290 	skb->dev = NULL;
291 	skb_set_owner_r(skb, sk);
292 
293 	/* Cache the SKB length before we tack it onto the receive
294 	 * queue.  Once it is added it no longer belongs to us and
295 	 * may be freed by other threads of control pulling packets
296 	 * from the queue.
297 	 */
298 	skb_len = skb->len;
299 
300 	skb_queue_tail(&sk->sk_receive_queue, skb);
301 
302 	if (!sock_flag(sk, SOCK_DEAD))
303 		sk->sk_data_ready(sk, skb_len);
304 out:
305 	return err;
306 }
307 EXPORT_SYMBOL(sock_queue_rcv_skb);
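
/*
 * Editorial sketch of a typical caller (hypothetical protocol code, not
 * from this file): the receive path hands over the skb and must free it
 * itself when queueing fails, since sock_queue_rcv_skb() does not:
 *
 *	if (sock_queue_rcv_skb(sk, skb) < 0) {
 *		(bump protocol drop counters here)
 *		kfree_skb(skb);
 *	}
 */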
308 
309 int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested)
310 {
311 	int rc = NET_RX_SUCCESS;
312 
313 	if (sk_filter(sk, skb))
314 		goto discard_and_relse;
315 
316 	skb->dev = NULL;
317 
318 	if (nested)
319 		bh_lock_sock_nested(sk);
320 	else
321 		bh_lock_sock(sk);
322 	if (!sock_owned_by_user(sk)) {
323 		/*
324 		 * trylock + unlock semantics:
325 		 */
326 		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
327 
328 		rc = sk->sk_backlog_rcv(sk, skb);
329 
330 		mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
331 	} else
332 		sk_add_backlog(sk, skb);
333 	bh_unlock_sock(sk);
334 out:
335 	sock_put(sk);
336 	return rc;
337 discard_and_relse:
338 	kfree_skb(skb);
339 	goto out;
340 }
341 EXPORT_SYMBOL(sk_receive_skb);
342 
343 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
344 {
345 	struct dst_entry *dst = sk->sk_dst_cache;
346 
347 	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
348 		sk->sk_dst_cache = NULL;
349 		dst_release(dst);
350 		return NULL;
351 	}
352 
353 	return dst;
354 }
355 EXPORT_SYMBOL(__sk_dst_check);
356 
357 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
358 {
359 	struct dst_entry *dst = sk_dst_get(sk);
360 
361 	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
362 		sk_dst_reset(sk);
363 		dst_release(dst);
364 		return NULL;
365 	}
366 
367 	return dst;
368 }
369 EXPORT_SYMBOL(sk_dst_check);
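
/*
 * Editorial sketch (assumed usage, not code from this file): output paths
 * revalidate their cached route with sk_dst_check() and fall back to a
 * fresh lookup when it has gone stale:
 *
 *	struct dst_entry *dst = sk_dst_check(sk, 0);
 *
 *	if (dst == NULL) {
 *		(do a new route lookup, then sk_setup_caps(sk, dst))
 *	}
 */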
370 
371 static int sock_bindtodevice(struct sock *sk, char __user *optval, int optlen)
372 {
373 	int ret = -ENOPROTOOPT;
374 #ifdef CONFIG_NETDEVICES
375 	struct net *net = sock_net(sk);
376 	char devname[IFNAMSIZ];
377 	int index;
378 
379 	/* Sorry... */
380 	ret = -EPERM;
381 	if (!capable(CAP_NET_RAW))
382 		goto out;
383 
384 	ret = -EINVAL;
385 	if (optlen < 0)
386 		goto out;
387 
388 	/* Bind this socket to a particular device like "eth0",
389 	 * as specified in the passed interface name. If the
390 	 * name is "" or the option length is zero the socket
391 	 * is not bound.
392 	 */
393 	if (optlen > IFNAMSIZ - 1)
394 		optlen = IFNAMSIZ - 1;
395 	memset(devname, 0, sizeof(devname));
396 
397 	ret = -EFAULT;
398 	if (copy_from_user(devname, optval, optlen))
399 		goto out;
400 
401 	if (devname[0] == '\0') {
402 		index = 0;
403 	} else {
404 		struct net_device *dev = dev_get_by_name(net, devname);
405 
406 		ret = -ENODEV;
407 		if (!dev)
408 			goto out;
409 
410 		index = dev->ifindex;
411 		dev_put(dev);
412 	}
413 
414 	lock_sock(sk);
415 	sk->sk_bound_dev_if = index;
416 	sk_dst_reset(sk);
417 	release_sock(sk);
418 
419 	ret = 0;
420 
421 out:
422 #endif
423 
424 	return ret;
425 }
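
/*
 * Illustrative userspace call (not part of this file); the handler above
 * requires CAP_NET_RAW:
 *
 *	setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, "eth0", sizeof("eth0"));
 *
 * An empty name or zero option length removes the device binding.
 */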
426 
427 static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
428 {
429 	if (valbool)
430 		sock_set_flag(sk, bit);
431 	else
432 		sock_reset_flag(sk, bit);
433 }
434 
435 /*
436  *	This is meant for all protocols to use and covers goings on
437  *	at the socket level. Everything here is generic.
438  */
439 
440 int sock_setsockopt(struct socket *sock, int level, int optname,
441 		    char __user *optval, int optlen)
442 {
443 	struct sock *sk=sock->sk;
444 	int val;
445 	int valbool;
446 	struct linger ling;
447 	int ret = 0;
448 
449 	/*
450 	 *	Options without arguments
451 	 */
452 
453 #ifdef SO_DONTLINGER		/* Compatibility item... */
454 	if (optname == SO_DONTLINGER) {
455 		lock_sock(sk);
456 		sock_reset_flag(sk, SOCK_LINGER);
457 		release_sock(sk);
458 		return 0;
459 	}
460 #endif
461 
462 	if (optname == SO_BINDTODEVICE)
463 		return sock_bindtodevice(sk, optval, optlen);
464 
465 	if (optlen < sizeof(int))
466 		return -EINVAL;
467 
468 	if (get_user(val, (int __user *)optval))
469 		return -EFAULT;
470 
471 	valbool = val?1:0;
472 
473 	lock_sock(sk);
474 
475 	switch(optname) {
476 	case SO_DEBUG:
477 		if (val && !capable(CAP_NET_ADMIN)) {
478 			ret = -EACCES;
479 		} else
480 			sock_valbool_flag(sk, SOCK_DBG, valbool);
481 		break;
482 	case SO_REUSEADDR:
483 		sk->sk_reuse = valbool;
484 		break;
485 	case SO_TYPE:
486 	case SO_ERROR:
487 		ret = -ENOPROTOOPT;
488 		break;
489 	case SO_DONTROUTE:
490 		sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
491 		break;
492 	case SO_BROADCAST:
493 		sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
494 		break;
495 	case SO_SNDBUF:
496 		/* Don't error on this; BSD doesn't, and if you think
497 		   about it, this is right. Otherwise apps would have to
498 		   play 'guess the biggest size' games. RCVBUF/SNDBUF
499 		   are treated in BSD as hints. */
500 
501 		if (val > sysctl_wmem_max)
502 			val = sysctl_wmem_max;
503 set_sndbuf:
504 		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
505 		if ((val * 2) < SOCK_MIN_SNDBUF)
506 			sk->sk_sndbuf = SOCK_MIN_SNDBUF;
507 		else
508 			sk->sk_sndbuf = val * 2;
509 
510 		/*
511 		 *	Wake up sending tasks if we
512 		 *	upped the value.
513 		 */
514 		sk->sk_write_space(sk);
515 		break;
516 
517 	case SO_SNDBUFFORCE:
518 		if (!capable(CAP_NET_ADMIN)) {
519 			ret = -EPERM;
520 			break;
521 		}
522 		goto set_sndbuf;
523 
524 	case SO_RCVBUF:
525 		/* Don't error on this; BSD doesn't, and if you think
526 		   about it, this is right. Otherwise apps would have to
527 		   play 'guess the biggest size' games. RCVBUF/SNDBUF
528 		   are treated in BSD as hints. */
529 
530 		if (val > sysctl_rmem_max)
531 			val = sysctl_rmem_max;
532 set_rcvbuf:
533 		sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
534 		/*
535 		 * We double it on the way in to account for
536 		 * "struct sk_buff" etc. overhead.   Applications
537 		 * assume that the SO_RCVBUF setting they make will
538 		 * allow that much actual data to be received on that
539 		 * socket.
540 		 *
541 		 * Applications are unaware that "struct sk_buff" and
542 		 * other overheads allocate from the receive buffer
543 		 * during socket buffer allocation.
544 		 *
545 		 * And after considering the possible alternatives,
546 		 * returning the value we actually used in getsockopt
547 		 * is the most desirable behavior.
548 		 */
549 		if ((val * 2) < SOCK_MIN_RCVBUF)
550 			sk->sk_rcvbuf = SOCK_MIN_RCVBUF;
551 		else
552 			sk->sk_rcvbuf = val * 2;
553 		break;
554 
555 	case SO_RCVBUFFORCE:
556 		if (!capable(CAP_NET_ADMIN)) {
557 			ret = -EPERM;
558 			break;
559 		}
560 		goto set_rcvbuf;
561 
562 	case SO_KEEPALIVE:
563 #ifdef CONFIG_INET
564 		if (sk->sk_protocol == IPPROTO_TCP)
565 			tcp_set_keepalive(sk, valbool);
566 #endif
567 		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
568 		break;
569 
570 	case SO_OOBINLINE:
571 		sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
572 		break;
573 
574 	case SO_NO_CHECK:
575 		sk->sk_no_check = valbool;
576 		break;
577 
578 	case SO_PRIORITY:
579 		if ((val >= 0 && val <= 6) || capable(CAP_NET_ADMIN))
580 			sk->sk_priority = val;
581 		else
582 			ret = -EPERM;
583 		break;
584 
585 	case SO_LINGER:
586 		if (optlen < sizeof(ling)) {
587 			ret = -EINVAL;	/* 1003.1g */
588 			break;
589 		}
590 		if (copy_from_user(&ling,optval,sizeof(ling))) {
591 			ret = -EFAULT;
592 			break;
593 		}
594 		if (!ling.l_onoff)
595 			sock_reset_flag(sk, SOCK_LINGER);
596 		else {
597 #if (BITS_PER_LONG == 32)
598 			if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
599 				sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
600 			else
601 #endif
602 				sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
603 			sock_set_flag(sk, SOCK_LINGER);
604 		}
605 		break;
606 
607 	case SO_BSDCOMPAT:
608 		sock_warn_obsolete_bsdism("setsockopt");
609 		break;
610 
611 	case SO_PASSCRED:
612 		if (valbool)
613 			set_bit(SOCK_PASSCRED, &sock->flags);
614 		else
615 			clear_bit(SOCK_PASSCRED, &sock->flags);
616 		break;
617 
618 	case SO_TIMESTAMP:
619 	case SO_TIMESTAMPNS:
620 		if (valbool)  {
621 			if (optname == SO_TIMESTAMP)
622 				sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
623 			else
624 				sock_set_flag(sk, SOCK_RCVTSTAMPNS);
625 			sock_set_flag(sk, SOCK_RCVTSTAMP);
626 			sock_enable_timestamp(sk);
627 		} else {
628 			sock_reset_flag(sk, SOCK_RCVTSTAMP);
629 			sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
630 		}
631 		break;
632 
633 	case SO_RCVLOWAT:
634 		if (val < 0)
635 			val = INT_MAX;
636 		sk->sk_rcvlowat = val ? : 1;
637 		break;
638 
639 	case SO_RCVTIMEO:
640 		ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
641 		break;
642 
643 	case SO_SNDTIMEO:
644 		ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
645 		break;
646 
647 	case SO_ATTACH_FILTER:
648 		ret = -EINVAL;
649 		if (optlen == sizeof(struct sock_fprog)) {
650 			struct sock_fprog fprog;
651 
652 			ret = -EFAULT;
653 			if (copy_from_user(&fprog, optval, sizeof(fprog)))
654 				break;
655 
656 			ret = sk_attach_filter(&fprog, sk);
657 		}
658 		break;
659 
660 	case SO_DETACH_FILTER:
661 		ret = sk_detach_filter(sk);
662 		break;
663 
664 	case SO_PASSSEC:
665 		if (valbool)
666 			set_bit(SOCK_PASSSEC, &sock->flags);
667 		else
668 			clear_bit(SOCK_PASSSEC, &sock->flags);
669 		break;
670 	case SO_MARK:
671 		if (!capable(CAP_NET_ADMIN))
672 			ret = -EPERM;
673 		else {
674 			sk->sk_mark = val;
675 		}
676 		break;
677 
678 		/* We implement SO_SNDLOWAT etc. to
679 		   not be settable (1003.1g 5.3) */
680 	default:
681 		ret = -ENOPROTOOPT;
682 		break;
683 	}
684 	release_sock(sk);
685 	return ret;
686 }
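
/*
 * Illustrative userspace use of one of the cases above (not part of this
 * file): enabling SO_LINGER with a 10 second close timeout:
 *
 *	struct linger ling = { .l_onoff = 1, .l_linger = 10 };
 *	setsockopt(fd, SOL_SOCKET, SO_LINGER, &ling, sizeof(ling));
 */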
687 
688 
689 int sock_getsockopt(struct socket *sock, int level, int optname,
690 		    char __user *optval, int __user *optlen)
691 {
692 	struct sock *sk = sock->sk;
693 
694 	union {
695 		int val;
696 		struct linger ling;
697 		struct timeval tm;
698 	} v;
699 
700 	unsigned int lv = sizeof(int);
701 	int len;
702 
703 	if (get_user(len, optlen))
704 		return -EFAULT;
705 	if (len < 0)
706 		return -EINVAL;
707 
708 	switch(optname) {
709 	case SO_DEBUG:
710 		v.val = sock_flag(sk, SOCK_DBG);
711 		break;
712 
713 	case SO_DONTROUTE:
714 		v.val = sock_flag(sk, SOCK_LOCALROUTE);
715 		break;
716 
717 	case SO_BROADCAST:
718 		v.val = !!sock_flag(sk, SOCK_BROADCAST);
719 		break;
720 
721 	case SO_SNDBUF:
722 		v.val = sk->sk_sndbuf;
723 		break;
724 
725 	case SO_RCVBUF:
726 		v.val = sk->sk_rcvbuf;
727 		break;
728 
729 	case SO_REUSEADDR:
730 		v.val = sk->sk_reuse;
731 		break;
732 
733 	case SO_KEEPALIVE:
734 		v.val = !!sock_flag(sk, SOCK_KEEPOPEN);
735 		break;
736 
737 	case SO_TYPE:
738 		v.val = sk->sk_type;
739 		break;
740 
741 	case SO_ERROR:
742 		v.val = -sock_error(sk);
743 		if (v.val==0)
744 			v.val = xchg(&sk->sk_err_soft, 0);
745 		break;
746 
747 	case SO_OOBINLINE:
748 		v.val = !!sock_flag(sk, SOCK_URGINLINE);
749 		break;
750 
751 	case SO_NO_CHECK:
752 		v.val = sk->sk_no_check;
753 		break;
754 
755 	case SO_PRIORITY:
756 		v.val = sk->sk_priority;
757 		break;
758 
759 	case SO_LINGER:
760 		lv		= sizeof(v.ling);
761 		v.ling.l_onoff	= !!sock_flag(sk, SOCK_LINGER);
762 		v.ling.l_linger	= sk->sk_lingertime / HZ;
763 		break;
764 
765 	case SO_BSDCOMPAT:
766 		sock_warn_obsolete_bsdism("getsockopt");
767 		break;
768 
769 	case SO_TIMESTAMP:
770 		v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
771 				!sock_flag(sk, SOCK_RCVTSTAMPNS);
772 		break;
773 
774 	case SO_TIMESTAMPNS:
775 		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
776 		break;
777 
778 	case SO_RCVTIMEO:
779 		lv=sizeof(struct timeval);
780 		if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
781 			v.tm.tv_sec = 0;
782 			v.tm.tv_usec = 0;
783 		} else {
784 			v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
785 			v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
786 		}
787 		break;
788 
789 	case SO_SNDTIMEO:
790 		lv=sizeof(struct timeval);
791 		if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
792 			v.tm.tv_sec = 0;
793 			v.tm.tv_usec = 0;
794 		} else {
795 			v.tm.tv_sec = sk->sk_sndtimeo / HZ;
796 			v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
797 		}
798 		break;
799 
800 	case SO_RCVLOWAT:
801 		v.val = sk->sk_rcvlowat;
802 		break;
803 
804 	case SO_SNDLOWAT:
805 		v.val=1;
806 		break;
807 
808 	case SO_PASSCRED:
809 		v.val = test_bit(SOCK_PASSCRED, &sock->flags) ? 1 : 0;
810 		break;
811 
812 	case SO_PEERCRED:
813 		if (len > sizeof(sk->sk_peercred))
814 			len = sizeof(sk->sk_peercred);
815 		if (copy_to_user(optval, &sk->sk_peercred, len))
816 			return -EFAULT;
817 		goto lenout;
818 
819 	case SO_PEERNAME:
820 	{
821 		char address[128];
822 
823 		if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
824 			return -ENOTCONN;
825 		if (lv < len)
826 			return -EINVAL;
827 		if (copy_to_user(optval, address, len))
828 			return -EFAULT;
829 		goto lenout;
830 	}
831 
832 	/* Dubious BSD thing... Probably nobody even uses it, but
833 	 * the UNIX standard wants it for whatever reason... -DaveM
834 	 */
835 	case SO_ACCEPTCONN:
836 		v.val = sk->sk_state == TCP_LISTEN;
837 		break;
838 
839 	case SO_PASSSEC:
840 		v.val = test_bit(SOCK_PASSSEC, &sock->flags) ? 1 : 0;
841 		break;
842 
843 	case SO_PEERSEC:
844 		return security_socket_getpeersec_stream(sock, optval, optlen, len);
845 
846 	case SO_MARK:
847 		v.val = sk->sk_mark;
848 		break;
849 
850 	default:
851 		return -ENOPROTOOPT;
852 	}
853 
854 	if (len > lv)
855 		len = lv;
856 	if (copy_to_user(optval, &v, len))
857 		return -EFAULT;
858 lenout:
859 	if (put_user(len, optlen))
860 		return -EFAULT;
861 	return 0;
862 }
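
/*
 * Illustrative userspace read of the SO_ERROR case above (not part of
 * this file), e.g. after a non-blocking connect() completes:
 *
 *	int err = 0;
 *	socklen_t len = sizeof(err);
 *	getsockopt(fd, SOL_SOCKET, SO_ERROR, &err, &len);
 *
 * The pending error is returned as a positive errno value and cleared.
 */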
863 
864 /*
865  * Initialize an sk_lock.
866  *
867  * (We also register the sk_lock with the lock validator.)
868  */
869 static inline void sock_lock_init(struct sock *sk)
870 {
871 	sock_lock_init_class_and_name(sk,
872 			af_family_slock_key_strings[sk->sk_family],
873 			af_family_slock_keys + sk->sk_family,
874 			af_family_key_strings[sk->sk_family],
875 			af_family_keys + sk->sk_family);
876 }
877 
878 static void sock_copy(struct sock *nsk, const struct sock *osk)
879 {
880 #ifdef CONFIG_SECURITY_NETWORK
881 	void *sptr = nsk->sk_security;
882 #endif
883 
884 	memcpy(nsk, osk, osk->sk_prot->obj_size);
885 #ifdef CONFIG_SECURITY_NETWORK
886 	nsk->sk_security = sptr;
887 	security_sk_clone(osk, nsk);
888 #endif
889 }
890 
891 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
892 		int family)
893 {
894 	struct sock *sk;
895 	struct kmem_cache *slab;
896 
897 	slab = prot->slab;
898 	if (slab != NULL)
899 		sk = kmem_cache_alloc(slab, priority);
900 	else
901 		sk = kmalloc(prot->obj_size, priority);
902 
903 	if (sk != NULL) {
904 		if (security_sk_alloc(sk, family, priority))
905 			goto out_free;
906 
907 		if (!try_module_get(prot->owner))
908 			goto out_free_sec;
909 	}
910 
911 	return sk;
912 
913 out_free_sec:
914 	security_sk_free(sk);
915 out_free:
916 	if (slab != NULL)
917 		kmem_cache_free(slab, sk);
918 	else
919 		kfree(sk);
920 	return NULL;
921 }
922 
923 static void sk_prot_free(struct proto *prot, struct sock *sk)
924 {
925 	struct kmem_cache *slab;
926 	struct module *owner;
927 
928 	owner = prot->owner;
929 	slab = prot->slab;
930 
931 	security_sk_free(sk);
932 	if (slab != NULL)
933 		kmem_cache_free(slab, sk);
934 	else
935 		kfree(sk);
936 	module_put(owner);
937 }
938 
939 /**
940  *	sk_alloc - All socket objects are allocated here
941  *	@net: the applicable net namespace
942  *	@family: protocol family
943  *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
944  *	@prot: struct proto associated with this new sock instance
946  */
947 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
948 		      struct proto *prot)
949 {
950 	struct sock *sk;
951 
952 	sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
953 	if (sk) {
954 		sk->sk_family = family;
955 		/*
956 		 * See comment in struct sock definition to understand
957 		 * why we need sk_prot_creator -acme
958 		 */
959 		sk->sk_prot = sk->sk_prot_creator = prot;
960 		sock_lock_init(sk);
961 		sock_net_set(sk, get_net(net));
962 	}
963 
964 	return sk;
965 }
966 
967 void sk_free(struct sock *sk)
968 {
969 	struct sk_filter *filter;
970 
971 	if (sk->sk_destruct)
972 		sk->sk_destruct(sk);
973 
974 	filter = rcu_dereference(sk->sk_filter);
975 	if (filter) {
976 		sk_filter_uncharge(sk, filter);
977 		rcu_assign_pointer(sk->sk_filter, NULL);
978 	}
979 
980 	sock_disable_timestamp(sk);
981 
982 	if (atomic_read(&sk->sk_omem_alloc))
983 		printk(KERN_DEBUG "%s: optmem leakage (%d bytes) detected.\n",
984 		       __func__, atomic_read(&sk->sk_omem_alloc));
985 
986 	put_net(sock_net(sk));
987 	sk_prot_free(sk->sk_prot_creator, sk);
988 }
989 
990 /*
991  * The last sock_put should drop the reference to sk->sk_net. It has
992  * already been dropped in sk_change_net. Taking a reference to a stopping
993  * namespace is not an option.
994  * Take a reference to the socket to remove it from the hash _alive_ and
995  * after that destroy it in the context of init_net.
996  */
997 void sk_release_kernel(struct sock *sk)
998 {
999 	if (sk == NULL || sk->sk_socket == NULL)
1000 		return;
1001 
1002 	sock_hold(sk);
1003 	sock_release(sk->sk_socket);
1004 	release_net(sock_net(sk));
1005 	sock_net_set(sk, get_net(&init_net));
1006 	sock_put(sk);
1007 }
1008 EXPORT_SYMBOL(sk_release_kernel);
1009 
1010 struct sock *sk_clone(const struct sock *sk, const gfp_t priority)
1011 {
1012 	struct sock *newsk;
1013 
1014 	newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
1015 	if (newsk != NULL) {
1016 		struct sk_filter *filter;
1017 
1018 		sock_copy(newsk, sk);
1019 
1020 		/* SANITY */
1021 		get_net(sock_net(newsk));
1022 		sk_node_init(&newsk->sk_node);
1023 		sock_lock_init(newsk);
1024 		bh_lock_sock(newsk);
1025 		newsk->sk_backlog.head	= newsk->sk_backlog.tail = NULL;
1026 
1027 		atomic_set(&newsk->sk_rmem_alloc, 0);
1028 		atomic_set(&newsk->sk_wmem_alloc, 0);
1029 		atomic_set(&newsk->sk_omem_alloc, 0);
1030 		skb_queue_head_init(&newsk->sk_receive_queue);
1031 		skb_queue_head_init(&newsk->sk_write_queue);
1032 #ifdef CONFIG_NET_DMA
1033 		skb_queue_head_init(&newsk->sk_async_wait_queue);
1034 #endif
1035 
1036 		rwlock_init(&newsk->sk_dst_lock);
1037 		rwlock_init(&newsk->sk_callback_lock);
1038 		lockdep_set_class_and_name(&newsk->sk_callback_lock,
1039 				af_callback_keys + newsk->sk_family,
1040 				af_family_clock_key_strings[newsk->sk_family]);
1041 
1042 		newsk->sk_dst_cache	= NULL;
1043 		newsk->sk_wmem_queued	= 0;
1044 		newsk->sk_forward_alloc = 0;
1045 		newsk->sk_send_head	= NULL;
1046 		newsk->sk_userlocks	= sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
1047 
1048 		sock_reset_flag(newsk, SOCK_DONE);
1049 		skb_queue_head_init(&newsk->sk_error_queue);
1050 
1051 		filter = newsk->sk_filter;
1052 		if (filter != NULL)
1053 			sk_filter_charge(newsk, filter);
1054 
1055 		if (unlikely(xfrm_sk_clone_policy(newsk))) {
1056 			/* It is still a raw copy of the parent, so invalidate
1057 			 * the destructor and do a plain sk_free(). */
1058 			newsk->sk_destruct = NULL;
1059 			sk_free(newsk);
1060 			newsk = NULL;
1061 			goto out;
1062 		}
1063 
1064 		newsk->sk_err	   = 0;
1065 		newsk->sk_priority = 0;
1066 		atomic_set(&newsk->sk_refcnt, 2);
1067 
1068 		/*
1069 		 * Increment the counter in the same struct proto as the master
1070 		 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
1071 		 * is the same as sk->sk_prot->socks, as this field was copied
1072 		 * with memcpy).
1073 		 *
1074 		 * This _changes_ the previous behaviour, where
1075 		 * tcp_create_openreq_child always incremented the
1076 		 * equivalent of tcp_prot->socks (inet_sock_nr), so this has
1077 		 * to be taken into account in all callers. -acme
1078 		 */
1079 		sk_refcnt_debug_inc(newsk);
1080 		newsk->sk_socket = NULL;
1081 		newsk->sk_sleep	 = NULL;
1082 
1083 		if (newsk->sk_prot->sockets_allocated)
1084 			atomic_inc(newsk->sk_prot->sockets_allocated);
1085 	}
1086 out:
1087 	return newsk;
1088 }
1089 
1090 EXPORT_SYMBOL_GPL(sk_clone);
1091 
1092 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1093 {
1094 	__sk_dst_set(sk, dst);
1095 	sk->sk_route_caps = dst->dev->features;
1096 	if (sk->sk_route_caps & NETIF_F_GSO)
1097 		sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
1098 	if (sk_can_gso(sk)) {
1099 		if (dst->header_len) {
1100 			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1101 		} else {
1102 			sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
1103 			sk->sk_gso_max_size = dst->dev->gso_max_size;
1104 		}
1105 	}
1106 }
1107 EXPORT_SYMBOL_GPL(sk_setup_caps);
1108 
1109 void __init sk_init(void)
1110 {
1111 	if (num_physpages <= 4096) {
1112 		sysctl_wmem_max = 32767;
1113 		sysctl_rmem_max = 32767;
1114 		sysctl_wmem_default = 32767;
1115 		sysctl_rmem_default = 32767;
1116 	} else if (num_physpages >= 131072) {
1117 		sysctl_wmem_max = 131071;
1118 		sysctl_rmem_max = 131071;
1119 	}
1120 }
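
/*
 * Editorial note (assuming 4 KB pages): the two thresholds above
 * correspond to roughly 16 MB and 512 MB of RAM, so small machines get
 * ~32 KB socket buffer limits while large ones raise the hard caps to
 * ~128 KB and keep the compile-time defaults.
 */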
1121 
1122 /*
1123  *	Simple resource managers for sockets.
1124  */
1125 
1126 
1127 /*
1128  * Write buffer destructor automatically called from kfree_skb.
1129  */
1130 void sock_wfree(struct sk_buff *skb)
1131 {
1132 	struct sock *sk = skb->sk;
1133 
1134 	/* In case it might be waiting for more memory. */
1135 	atomic_sub(skb->truesize, &sk->sk_wmem_alloc);
1136 	if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE))
1137 		sk->sk_write_space(sk);
1138 	sock_put(sk);
1139 }
1140 
1141 /*
1142  * Read buffer destructor automatically called from kfree_skb.
1143  */
1144 void sock_rfree(struct sk_buff *skb)
1145 {
1146 	struct sock *sk = skb->sk;
1147 
1148 	skb_truesize_check(skb);
1149 	atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
1150 	sk_mem_uncharge(skb->sk, skb->truesize);
1151 }
1152 
1153 
1154 int sock_i_uid(struct sock *sk)
1155 {
1156 	int uid;
1157 
1158 	read_lock(&sk->sk_callback_lock);
1159 	uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : 0;
1160 	read_unlock(&sk->sk_callback_lock);
1161 	return uid;
1162 }
1163 
1164 unsigned long sock_i_ino(struct sock *sk)
1165 {
1166 	unsigned long ino;
1167 
1168 	read_lock(&sk->sk_callback_lock);
1169 	ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
1170 	read_unlock(&sk->sk_callback_lock);
1171 	return ino;
1172 }
1173 
1174 /*
1175  * Allocate a skb from the socket's send buffer.
1176  */
1177 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
1178 			     gfp_t priority)
1179 {
1180 	if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1181 		struct sk_buff * skb = alloc_skb(size, priority);
1182 		if (skb) {
1183 			skb_set_owner_w(skb, sk);
1184 			return skb;
1185 		}
1186 	}
1187 	return NULL;
1188 }
1189 
1190 /*
1191  * Allocate a skb from the socket's receive buffer.
1192  */
1193 struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force,
1194 			     gfp_t priority)
1195 {
1196 	if (force || atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
1197 		struct sk_buff *skb = alloc_skb(size, priority);
1198 		if (skb) {
1199 			skb_set_owner_r(skb, sk);
1200 			return skb;
1201 		}
1202 	}
1203 	return NULL;
1204 }
1205 
1206 /*
1207  * Allocate a memory block from the socket's option memory buffer.
1208  */
1209 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
1210 {
1211 	if ((unsigned)size <= sysctl_optmem_max &&
1212 	    atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
1213 		void *mem;
1214 		/* First do the add, to avoid the race if kmalloc
1215 		 * might sleep.
1216 		 */
1217 		atomic_add(size, &sk->sk_omem_alloc);
1218 		mem = kmalloc(size, priority);
1219 		if (mem)
1220 			return mem;
1221 		atomic_sub(size, &sk->sk_omem_alloc);
1222 	}
1223 	return NULL;
1224 }
1225 
1226 /*
1227  * Free an option memory block.
1228  */
1229 void sock_kfree_s(struct sock *sk, void *mem, int size)
1230 {
1231 	kfree(mem);
1232 	atomic_sub(size, &sk->sk_omem_alloc);
1233 }
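
/*
 * Editorial sketch (hypothetical caller, not from this file): memory from
 * sock_kmalloc() is charged to sk_omem_alloc and must be released with
 * the same size so the charge balances:
 *
 *	void *buf = sock_kmalloc(sk, len, GFP_KERNEL);
 *
 *	if (buf != NULL) {
 *		(use buf)
 *		sock_kfree_s(sk, buf, len);
 *	}
 */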
1234 
1235 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
1236    I think these locks should be removed for datagram sockets.
1237  */
1238 static long sock_wait_for_wmem(struct sock * sk, long timeo)
1239 {
1240 	DEFINE_WAIT(wait);
1241 
1242 	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1243 	for (;;) {
1244 		if (!timeo)
1245 			break;
1246 		if (signal_pending(current))
1247 			break;
1248 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1249 		prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
1250 		if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
1251 			break;
1252 		if (sk->sk_shutdown & SEND_SHUTDOWN)
1253 			break;
1254 		if (sk->sk_err)
1255 			break;
1256 		timeo = schedule_timeout(timeo);
1257 	}
1258 	finish_wait(sk->sk_sleep, &wait);
1259 	return timeo;
1260 }
1261 
1262 
1263 /*
1264  *	Generic send/receive buffer handlers
1265  */
1266 
1267 static struct sk_buff *sock_alloc_send_pskb(struct sock *sk,
1268 					    unsigned long header_len,
1269 					    unsigned long data_len,
1270 					    int noblock, int *errcode)
1271 {
1272 	struct sk_buff *skb;
1273 	gfp_t gfp_mask;
1274 	long timeo;
1275 	int err;
1276 
1277 	gfp_mask = sk->sk_allocation;
1278 	if (gfp_mask & __GFP_WAIT)
1279 		gfp_mask |= __GFP_REPEAT;
1280 
1281 	timeo = sock_sndtimeo(sk, noblock);
1282 	while (1) {
1283 		err = sock_error(sk);
1284 		if (err != 0)
1285 			goto failure;
1286 
1287 		err = -EPIPE;
1288 		if (sk->sk_shutdown & SEND_SHUTDOWN)
1289 			goto failure;
1290 
1291 		if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1292 			skb = alloc_skb(header_len, gfp_mask);
1293 			if (skb) {
1294 				int npages;
1295 				int i;
1296 
1297 				/* No pages, we're done... */
1298 				if (!data_len)
1299 					break;
1300 
1301 				npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
1302 				skb->truesize += data_len;
1303 				skb_shinfo(skb)->nr_frags = npages;
1304 				for (i = 0; i < npages; i++) {
1305 					struct page *page;
1306 					skb_frag_t *frag;
1307 
1308 					page = alloc_pages(sk->sk_allocation, 0);
1309 					if (!page) {
1310 						err = -ENOBUFS;
1311 						skb_shinfo(skb)->nr_frags = i;
1312 						kfree_skb(skb);
1313 						goto failure;
1314 					}
1315 
1316 					frag = &skb_shinfo(skb)->frags[i];
1317 					frag->page = page;
1318 					frag->page_offset = 0;
1319 					frag->size = (data_len >= PAGE_SIZE ?
1320 						      PAGE_SIZE :
1321 						      data_len);
1322 					data_len -= PAGE_SIZE;
1323 				}
1324 
1325 				/* Full success... */
1326 				break;
1327 			}
1328 			err = -ENOBUFS;
1329 			goto failure;
1330 		}
1331 		set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1332 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1333 		err = -EAGAIN;
1334 		if (!timeo)
1335 			goto failure;
1336 		if (signal_pending(current))
1337 			goto interrupted;
1338 		timeo = sock_wait_for_wmem(sk, timeo);
1339 	}
1340 
1341 	skb_set_owner_w(skb, sk);
1342 	return skb;
1343 
1344 interrupted:
1345 	err = sock_intr_errno(timeo);
1346 failure:
1347 	*errcode = err;
1348 	return NULL;
1349 }
1350 
1351 struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
1352 				    int noblock, int *errcode)
1353 {
1354 	return sock_alloc_send_pskb(sk, size, 0, noblock, errcode);
1355 }
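
/*
 * Editorial sketch of a typical sendmsg() path (hypothetical, not from
 * this file): block for write memory unless the caller passed
 * MSG_DONTWAIT:
 *
 *	skb = sock_alloc_send_skb(sk, size, flags & MSG_DONTWAIT, &err);
 *	if (skb == NULL)
 *		goto out_err;
 */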
1356 
1357 static void __lock_sock(struct sock *sk)
1358 {
1359 	DEFINE_WAIT(wait);
1360 
1361 	for (;;) {
1362 		prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
1363 					TASK_UNINTERRUPTIBLE);
1364 		spin_unlock_bh(&sk->sk_lock.slock);
1365 		schedule();
1366 		spin_lock_bh(&sk->sk_lock.slock);
1367 		if (!sock_owned_by_user(sk))
1368 			break;
1369 	}
1370 	finish_wait(&sk->sk_lock.wq, &wait);
1371 }
1372 
1373 static void __release_sock(struct sock *sk)
1374 {
1375 	struct sk_buff *skb = sk->sk_backlog.head;
1376 
1377 	do {
1378 		sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
1379 		bh_unlock_sock(sk);
1380 
1381 		do {
1382 			struct sk_buff *next = skb->next;
1383 
1384 			skb->next = NULL;
1385 			sk->sk_backlog_rcv(sk, skb);
1386 
1387 			/*
1388 			 * We are in process context here with softirqs
1389 			 * disabled, use cond_resched_softirq() to preempt.
1390 			 * This is safe to do because we've taken the backlog
1391 			 * queue private:
1392 			 */
1393 			cond_resched_softirq();
1394 
1395 			skb = next;
1396 		} while (skb != NULL);
1397 
1398 		bh_lock_sock(sk);
1399 	} while ((skb = sk->sk_backlog.head) != NULL);
1400 }
1401 
1402 /**
1403  * sk_wait_data - wait for data to arrive at sk_receive_queue
1404  * @sk:    sock to wait on
1405  * @timeo: for how long
1406  *
1407  * Now socket state including sk->sk_err is changed only under lock,
1408  * hence we may omit checks after joining wait queue.
1409  * We check receive queue before schedule() only as optimization;
1410  * it is very likely that release_sock() added new data.
1411  */
1412 int sk_wait_data(struct sock *sk, long *timeo)
1413 {
1414 	int rc;
1415 	DEFINE_WAIT(wait);
1416 
1417 	prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
1418 	set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1419 	rc = sk_wait_event(sk, timeo, !skb_queue_empty(&sk->sk_receive_queue));
1420 	clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1421 	finish_wait(sk->sk_sleep, &wait);
1422 	return rc;
1423 }
1424 
1425 EXPORT_SYMBOL(sk_wait_data);
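
/*
 * Editorial sketch (hypothetical caller, not from this file): a blocking
 * receive loop built on sk_wait_data(), run under lock_sock():
 *
 *	long timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
 *
 *	while (skb_queue_empty(&sk->sk_receive_queue)) {
 *		if (!timeo || signal_pending(current))
 *			break;
 *		sk_wait_data(sk, &timeo);
 *	}
 */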
1426 
1427 /**
1428  *	__sk_mem_schedule - increase sk_forward_alloc and memory_allocated
1429  *	@sk: socket
1430  *	@size: memory size to allocate
1431  *	@kind: allocation type
1432  *
1433  *	If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
1434  *	rmem allocation. This function assumes that protocols which have
1435  *	memory_pressure use sk_wmem_queued as write buffer accounting.
1436  */
1437 int __sk_mem_schedule(struct sock *sk, int size, int kind)
1438 {
1439 	struct proto *prot = sk->sk_prot;
1440 	int amt = sk_mem_pages(size);
1441 	int allocated;
1442 
1443 	sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
1444 	allocated = atomic_add_return(amt, prot->memory_allocated);
1445 
1446 	/* Under limit. */
1447 	if (allocated <= prot->sysctl_mem[0]) {
1448 		if (prot->memory_pressure && *prot->memory_pressure)
1449 			*prot->memory_pressure = 0;
1450 		return 1;
1451 	}
1452 
1453 	/* Under pressure. */
1454 	if (allocated > prot->sysctl_mem[1])
1455 		if (prot->enter_memory_pressure)
1456 			prot->enter_memory_pressure();
1457 
1458 	/* Over hard limit. */
1459 	if (allocated > prot->sysctl_mem[2])
1460 		goto suppress_allocation;
1461 
1462 	/* guarantee minimum buffer size under pressure */
1463 	if (kind == SK_MEM_RECV) {
1464 		if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
1465 			return 1;
1466 	} else { /* SK_MEM_SEND */
1467 		if (sk->sk_type == SOCK_STREAM) {
1468 			if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
1469 				return 1;
1470 		} else if (atomic_read(&sk->sk_wmem_alloc) <
1471 			   prot->sysctl_wmem[0])
1472 				return 1;
1473 	}
1474 
1475 	if (prot->memory_pressure) {
1476 		if (!*prot->memory_pressure ||
1477 		    prot->sysctl_mem[2] > atomic_read(prot->sockets_allocated) *
1478 		    sk_mem_pages(sk->sk_wmem_queued +
1479 				 atomic_read(&sk->sk_rmem_alloc) +
1480 				 sk->sk_forward_alloc))
1481 			return 1;
1482 	}
1483 
1484 suppress_allocation:
1485 
1486 	if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
1487 		sk_stream_moderate_sndbuf(sk);
1488 
1489 		/* Fail only if socket is _under_ its sndbuf.
1490 		 * In this case we cannot block, so that we have to fail.
1491 		 */
1492 		if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
1493 			return 1;
1494 	}
1495 
1496 	/* Alas. Undo changes. */
1497 	sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;
1498 	atomic_sub(amt, prot->memory_allocated);
1499 	return 0;
1500 }
1501 
1502 EXPORT_SYMBOL(__sk_mem_schedule);
1503 
1504 /**
1505  *	__sk_mem_reclaim - reclaim memory_allocated
1506  *	@sk: socket
1507  */
1508 void __sk_mem_reclaim(struct sock *sk)
1509 {
1510 	struct proto *prot = sk->sk_prot;
1511 
1512 	atomic_sub(sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT,
1513 		   prot->memory_allocated);
1514 	sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1;
1515 
1516 	if (prot->memory_pressure && *prot->memory_pressure &&
1517 	    (atomic_read(prot->memory_allocated) < prot->sysctl_mem[0]))
1518 		*prot->memory_pressure = 0;
1519 }
1520 
1521 EXPORT_SYMBOL(__sk_mem_reclaim);
1522 
1523 
1524 /*
1525  * Set of default routines for initialising struct proto_ops when
1526  * the protocol does not support a particular function. In certain
1527  * cases where it makes no sense for a protocol to have a "do nothing"
1528  * function, some default processing is provided.
1529  */
1530 
1531 int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
1532 {
1533 	return -EOPNOTSUPP;
1534 }
1535 
1536 int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
1537 		    int len, int flags)
1538 {
1539 	return -EOPNOTSUPP;
1540 }
1541 
1542 int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
1543 {
1544 	return -EOPNOTSUPP;
1545 }
1546 
1547 int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
1548 {
1549 	return -EOPNOTSUPP;
1550 }
1551 
1552 int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
1553 		    int *len, int peer)
1554 {
1555 	return -EOPNOTSUPP;
1556 }
1557 
1558 unsigned int sock_no_poll(struct file * file, struct socket *sock, poll_table *pt)
1559 {
1560 	return 0;
1561 }
1562 
1563 int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
1564 {
1565 	return -EOPNOTSUPP;
1566 }
1567 
1568 int sock_no_listen(struct socket *sock, int backlog)
1569 {
1570 	return -EOPNOTSUPP;
1571 }
1572 
1573 int sock_no_shutdown(struct socket *sock, int how)
1574 {
1575 	return -EOPNOTSUPP;
1576 }
1577 
1578 int sock_no_setsockopt(struct socket *sock, int level, int optname,
1579 		    char __user *optval, int optlen)
1580 {
1581 	return -EOPNOTSUPP;
1582 }
1583 
1584 int sock_no_getsockopt(struct socket *sock, int level, int optname,
1585 		    char __user *optval, int __user *optlen)
1586 {
1587 	return -EOPNOTSUPP;
1588 }
1589 
1590 int sock_no_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
1591 		    size_t len)
1592 {
1593 	return -EOPNOTSUPP;
1594 }
1595 
1596 int sock_no_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
1597 		    size_t len, int flags)
1598 {
1599 	return -EOPNOTSUPP;
1600 }
1601 
1602 int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
1603 {
1604 	/* Mirror missing mmap method error code */
1605 	return -ENODEV;
1606 }
1607 
1608 ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
1609 {
1610 	ssize_t res;
1611 	struct msghdr msg = {.msg_flags = flags};
1612 	struct kvec iov;
1613 	char *kaddr = kmap(page);
1614 	iov.iov_base = kaddr + offset;
1615 	iov.iov_len = size;
1616 	res = kernel_sendmsg(sock, &msg, &iov, 1, size);
1617 	kunmap(page);
1618 	return res;
1619 }
1620 
1621 /*
1622  *	Default Socket Callbacks
1623  */
1624 
1625 static void sock_def_wakeup(struct sock *sk)
1626 {
1627 	read_lock(&sk->sk_callback_lock);
1628 	if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
1629 		wake_up_interruptible_all(sk->sk_sleep);
1630 	read_unlock(&sk->sk_callback_lock);
1631 }
1632 
1633 static void sock_def_error_report(struct sock *sk)
1634 {
1635 	read_lock(&sk->sk_callback_lock);
1636 	if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
1637 		wake_up_interruptible(sk->sk_sleep);
1638 	sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
1639 	read_unlock(&sk->sk_callback_lock);
1640 }
1641 
1642 static void sock_def_readable(struct sock *sk, int len)
1643 {
1644 	read_lock(&sk->sk_callback_lock);
1645 	if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
1646 		wake_up_interruptible_sync(sk->sk_sleep);
1647 	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
1648 	read_unlock(&sk->sk_callback_lock);
1649 }
1650 
1651 static void sock_def_write_space(struct sock *sk)
1652 {
1653 	read_lock(&sk->sk_callback_lock);
1654 
1655 	/* Do not wake up a writer until he can make "significant"
1656 	 * progress.  --DaveM
1657 	 */
1658 	if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
1659 		if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
1660 			wake_up_interruptible_sync(sk->sk_sleep);
1661 
1662 		/* Should agree with poll, otherwise some programs break */
1663 		if (sock_writeable(sk))
1664 			sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
1665 	}
1666 
1667 	read_unlock(&sk->sk_callback_lock);
1668 }
1669 
1670 static void sock_def_destruct(struct sock *sk)
1671 {
1672 	kfree(sk->sk_protinfo);
1673 }
1674 
1675 void sk_send_sigurg(struct sock *sk)
1676 {
1677 	if (sk->sk_socket && sk->sk_socket->file)
1678 		if (send_sigurg(&sk->sk_socket->file->f_owner))
1679 			sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
1680 }
1681 
1682 void sk_reset_timer(struct sock *sk, struct timer_list* timer,
1683 		    unsigned long expires)
1684 {
1685 	if (!mod_timer(timer, expires))
1686 		sock_hold(sk);
1687 }
1688 
1689 EXPORT_SYMBOL(sk_reset_timer);
1690 
1691 void sk_stop_timer(struct sock *sk, struct timer_list* timer)
1692 {
1693 	if (timer_pending(timer) && del_timer(timer))
1694 		__sock_put(sk);
1695 }
1696 
1697 EXPORT_SYMBOL(sk_stop_timer);
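
/*
 * Editorial note with a sketched caller (not from this file): the pair
 * above holds a socket reference while the timer is pending:
 *
 *	sk_reset_timer(sk, &sk->sk_timer, jiffies + delay);
 *		(takes a reference if the timer was not already pending)
 *	sk_stop_timer(sk, &sk->sk_timer);
 *		(drops it again if a pending timer was cancelled)
 */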
1698 
1699 void sock_init_data(struct socket *sock, struct sock *sk)
1700 {
1701 	skb_queue_head_init(&sk->sk_receive_queue);
1702 	skb_queue_head_init(&sk->sk_write_queue);
1703 	skb_queue_head_init(&sk->sk_error_queue);
1704 #ifdef CONFIG_NET_DMA
1705 	skb_queue_head_init(&sk->sk_async_wait_queue);
1706 #endif
1707 
1708 	sk->sk_send_head	=	NULL;
1709 
1710 	init_timer(&sk->sk_timer);
1711 
1712 	sk->sk_allocation	=	GFP_KERNEL;
1713 	sk->sk_rcvbuf		=	sysctl_rmem_default;
1714 	sk->sk_sndbuf		=	sysctl_wmem_default;
1715 	sk->sk_state		=	TCP_CLOSE;
1716 	sk->sk_socket		=	sock;
1717 
1718 	sock_set_flag(sk, SOCK_ZAPPED);
1719 
1720 	if (sock) {
1721 		sk->sk_type	=	sock->type;
1722 		sk->sk_sleep	=	&sock->wait;
1723 		sock->sk	=	sk;
1724 	} else
1725 		sk->sk_sleep	=	NULL;
1726 
1727 	rwlock_init(&sk->sk_dst_lock);
1728 	rwlock_init(&sk->sk_callback_lock);
1729 	lockdep_set_class_and_name(&sk->sk_callback_lock,
1730 			af_callback_keys + sk->sk_family,
1731 			af_family_clock_key_strings[sk->sk_family]);
1732 
1733 	sk->sk_state_change	=	sock_def_wakeup;
1734 	sk->sk_data_ready	=	sock_def_readable;
1735 	sk->sk_write_space	=	sock_def_write_space;
1736 	sk->sk_error_report	=	sock_def_error_report;
1737 	sk->sk_destruct		=	sock_def_destruct;
1738 
1739 	sk->sk_sndmsg_page	=	NULL;
1740 	sk->sk_sndmsg_off	=	0;
1741 
1742 	sk->sk_peercred.pid 	=	0;
1743 	sk->sk_peercred.uid	=	-1;
1744 	sk->sk_peercred.gid	=	-1;
1745 	sk->sk_write_pending	=	0;
1746 	sk->sk_rcvlowat		=	1;
1747 	sk->sk_rcvtimeo		=	MAX_SCHEDULE_TIMEOUT;
1748 	sk->sk_sndtimeo		=	MAX_SCHEDULE_TIMEOUT;
1749 
1750 	sk->sk_stamp = ktime_set(-1L, 0);
1751 
1752 	atomic_set(&sk->sk_refcnt, 1);
1753 	atomic_set(&sk->sk_drops, 0);
1754 }
1755 
1756 void lock_sock_nested(struct sock *sk, int subclass)
1757 {
1758 	might_sleep();
1759 	spin_lock_bh(&sk->sk_lock.slock);
1760 	if (sk->sk_lock.owned)
1761 		__lock_sock(sk);
1762 	sk->sk_lock.owned = 1;
1763 	spin_unlock(&sk->sk_lock.slock);
1764 	/*
1765 	 * The sk_lock has mutex_lock() semantics here:
1766 	 */
1767 	mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
1768 	local_bh_enable();
1769 }
1770 
1771 EXPORT_SYMBOL(lock_sock_nested);
1772 
1773 void release_sock(struct sock *sk)
1774 {
1775 	/*
1776 	 * The sk_lock has mutex_unlock() semantics:
1777 	 */
1778 	mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
1779 
1780 	spin_lock_bh(&sk->sk_lock.slock);
1781 	if (sk->sk_backlog.tail)
1782 		__release_sock(sk);
1783 	sk->sk_lock.owned = 0;
1784 	if (waitqueue_active(&sk->sk_lock.wq))
1785 		wake_up(&sk->sk_lock.wq);
1786 	spin_unlock_bh(&sk->sk_lock.slock);
1787 }
1788 EXPORT_SYMBOL(release_sock);
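
/*
 * Editorial sketch of the canonical process-context pattern served by the
 * two functions above (hypothetical caller):
 *
 *	lock_sock(sk);		(socket now "owned"; softirq input is backlogged)
 *	(modify socket state)
 *	release_sock(sk);	(replays the backlog via __release_sock())
 */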
1789 
1790 int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
1791 {
1792 	struct timeval tv;
1793 	if (!sock_flag(sk, SOCK_TIMESTAMP))
1794 		sock_enable_timestamp(sk);
1795 	tv = ktime_to_timeval(sk->sk_stamp);
1796 	if (tv.tv_sec == -1)
1797 		return -ENOENT;
1798 	if (tv.tv_sec == 0) {
1799 		sk->sk_stamp = ktime_get_real();
1800 		tv = ktime_to_timeval(sk->sk_stamp);
1801 	}
1802 	return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
1803 }
1804 EXPORT_SYMBOL(sock_get_timestamp);
1805 
1806 int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
1807 {
1808 	struct timespec ts;
1809 	if (!sock_flag(sk, SOCK_TIMESTAMP))
1810 		sock_enable_timestamp(sk);
1811 	ts = ktime_to_timespec(sk->sk_stamp);
1812 	if (ts.tv_sec == -1)
1813 		return -ENOENT;
1814 	if (ts.tv_sec == 0) {
1815 		sk->sk_stamp = ktime_get_real();
1816 		ts = ktime_to_timespec(sk->sk_stamp);
1817 	}
1818 	return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
1819 }
1820 EXPORT_SYMBOL(sock_get_timestampns);
1821 
1822 void sock_enable_timestamp(struct sock *sk)
1823 {
1824 	if (!sock_flag(sk, SOCK_TIMESTAMP)) {
1825 		sock_set_flag(sk, SOCK_TIMESTAMP);
1826 		net_enable_timestamp();
1827 	}
1828 }
1829 
1830 /*
1831  *	Get a socket option on a socket.
1832  *
1833  *	FIX: POSIX 1003.1g is very ambiguous here. It states that
1834  *	asynchronous errors should be reported by getsockopt. We assume
1835  *	this means if you specify SO_ERROR (otherwise what's the point of it).
1836  */
1837 int sock_common_getsockopt(struct socket *sock, int level, int optname,
1838 			   char __user *optval, int __user *optlen)
1839 {
1840 	struct sock *sk = sock->sk;
1841 
1842 	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
1843 }
1844 
1845 EXPORT_SYMBOL(sock_common_getsockopt);
1846 
1847 #ifdef CONFIG_COMPAT
1848 int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
1849 				  char __user *optval, int __user *optlen)
1850 {
1851 	struct sock *sk = sock->sk;
1852 
1853 	if (sk->sk_prot->compat_getsockopt != NULL)
1854 		return sk->sk_prot->compat_getsockopt(sk, level, optname,
1855 						      optval, optlen);
1856 	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
1857 }
1858 EXPORT_SYMBOL(compat_sock_common_getsockopt);
1859 #endif
1860 
1861 int sock_common_recvmsg(struct kiocb *iocb, struct socket *sock,
1862 			struct msghdr *msg, size_t size, int flags)
1863 {
1864 	struct sock *sk = sock->sk;
1865 	int addr_len = 0;
1866 	int err;
1867 
1868 	err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT,
1869 				   flags & ~MSG_DONTWAIT, &addr_len);
1870 	if (err >= 0)
1871 		msg->msg_namelen = addr_len;
1872 	return err;
1873 }
1874 
1875 EXPORT_SYMBOL(sock_common_recvmsg);
1876 
1877 /*
1878  *	Set socket options on an inet socket.
1879  */
1880 int sock_common_setsockopt(struct socket *sock, int level, int optname,
1881 			   char __user *optval, int optlen)
1882 {
1883 	struct sock *sk = sock->sk;
1884 
1885 	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
1886 }
1887 
1888 EXPORT_SYMBOL(sock_common_setsockopt);
1889 
1890 #ifdef CONFIG_COMPAT
1891 int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
1892 				  char __user *optval, int optlen)
1893 {
1894 	struct sock *sk = sock->sk;
1895 
1896 	if (sk->sk_prot->compat_setsockopt != NULL)
1897 		return sk->sk_prot->compat_setsockopt(sk, level, optname,
1898 						      optval, optlen);
1899 	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
1900 }
1901 EXPORT_SYMBOL(compat_sock_common_setsockopt);
1902 #endif
1903 
1904 void sk_common_release(struct sock *sk)
1905 {
1906 	if (sk->sk_prot->destroy)
1907 		sk->sk_prot->destroy(sk);
1908 
1909 	/*
1910 	 * Observation: when sock_common_release is called, processes have
1911 	 * no access to the socket, but the network still does.
1912 	 * Step one, detach it from networking:
1913 	 *
1914 	 * A. Remove from hash tables.
1915 	 */
1916 
1917 	sk->sk_prot->unhash(sk);
1918 
1919 	/*
1920 	 * At this point the socket cannot receive new packets, but it is
1921 	 * possible that some packets are in flight, because some CPU ran the
1922 	 * receiver and did a hash table lookup before we unhashed the socket.
1923 	 * They will reach the receive queue and be purged by the socket destructor.
1924 	 *
1925 	 * Also, we still have packets pending on the receive queue and probably
1926 	 * our own packets waiting in device queues. sock_destroy will drain the
1927 	 * receive queue, but transmitted packets will delay socket destruction
1928 	 * until the last reference is released.
1929 	 */
1930 
1931 	sock_orphan(sk);
1932 
1933 	xfrm_sk_free_policy(sk);
1934 
1935 	sk_refcnt_debug_release(sk);
1936 	sock_put(sk);
1937 }
1938 
1939 EXPORT_SYMBOL(sk_common_release);
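
/*
 * Usage sketch (hypothetical): protocols without more elaborate teardown
 * can point their close handler straight at sk_common_release().
 */
#if 0
static void example_close(struct sock *sk, long timeout)
{
	sk_common_release(sk);	/* destroy, unhash, orphan, drop the ref */
}
#endif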
1940 
1941 static DEFINE_RWLOCK(proto_list_lock);
1942 static LIST_HEAD(proto_list);
1943 
1944 #ifdef CONFIG_PROC_FS
1945 #define PROTO_INUSE_NR	64	/* should be enough for now */
1946 struct prot_inuse {
1947 	int val[PROTO_INUSE_NR];
1948 };
1949 
1950 static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
1951 
1952 #ifdef CONFIG_NET_NS
1953 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
1954 {
1955 	int cpu = smp_processor_id();
1956 	per_cpu_ptr(net->core.inuse, cpu)->val[prot->inuse_idx] += val;
1957 }
1958 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
1959 
1960 int sock_prot_inuse_get(struct net *net, struct proto *prot)
1961 {
1962 	int cpu, idx = prot->inuse_idx;
1963 	int res = 0;
1964 
1965 	for_each_possible_cpu(cpu)
1966 		res += per_cpu_ptr(net->core.inuse, cpu)->val[idx];
1967 
1968 	return res >= 0 ? res : 0;
1969 }
1970 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
1971 
1972 static int sock_inuse_init_net(struct net *net)
1973 {
1974 	net->core.inuse = alloc_percpu(struct prot_inuse);
1975 	return net->core.inuse ? 0 : -ENOMEM;
1976 }
1977 
1978 static void sock_inuse_exit_net(struct net *net)
1979 {
1980 	free_percpu(net->core.inuse);
1981 }
1982 
1983 static struct pernet_operations net_inuse_ops = {
1984 	.init = sock_inuse_init_net,
1985 	.exit = sock_inuse_exit_net,
1986 };
1987 
1988 static __init int net_inuse_init(void)
1989 {
1990 	if (register_pernet_subsys(&net_inuse_ops))
1991 		panic("Cannot initialize net inuse counters");
1992 
1993 	return 0;
1994 }
1995 
1996 core_initcall(net_inuse_init);
1997 #else
1998 static DEFINE_PER_CPU(struct prot_inuse, prot_inuse);
1999 
2000 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2001 {
2002 	__get_cpu_var(prot_inuse).val[prot->inuse_idx] += val;
2003 }
2004 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2005 
2006 int sock_prot_inuse_get(struct net *net, struct proto *prot)
2007 {
2008 	int cpu, idx = prot->inuse_idx;
2009 	int res = 0;
2010 
2011 	for_each_possible_cpu(cpu)
2012 		res += per_cpu(prot_inuse, cpu).val[idx];
2013 
2014 	return res >= 0 ? res : 0;
2015 }
2016 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2017 #endif
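
/*
 * Usage sketch (hypothetical): a protocol's hash/unhash callbacks bump
 * the counter as sockets enter and leave its lookup tables. Callers are
 * expected to run with preemption (or BHs) disabled, since
 * sock_prot_inuse_add() updates the current CPU's slot. sock_net() is
 * assumed here as the accessor for the socket's namespace.
 */
#if 0
static void example_hash(struct sock *sk)
{
	/* ... insert sk into the protocol's lookup tables ... */
	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
}

static void example_unhash(struct sock *sk)
{
	/* ... remove sk from the lookup tables ... */
	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
}
#endif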
2018 
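/*
 * Note on index allocation: the last bit of proto_inuse_idx is never
 * set, so it acts as a shared overflow slot. Once all other indices are
 * taken, find_first_zero_bit() returns PROTO_INUSE_NR - 1; the protocol
 * keeps that index (counters of such protocols alias each other) and
 * release_proto_idx() below knows not to clear the shared bit.
 */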
2019 static void assign_proto_idx(struct proto *prot)
2020 {
2021 	prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
2022 
2023 	if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
2024 		printk(KERN_ERR "PROTO_INUSE_NR exhausted\n");
2025 		return;
2026 	}
2027 
2028 	set_bit(prot->inuse_idx, proto_inuse_idx);
2029 }
2030 
2031 static void release_proto_idx(struct proto *prot)
2032 {
2033 	if (prot->inuse_idx != PROTO_INUSE_NR - 1)
2034 		clear_bit(prot->inuse_idx, proto_inuse_idx);
2035 }
2036 #else
2037 static inline void assign_proto_idx(struct proto *prot)
2038 {
2039 }
2040 
2041 static inline void release_proto_idx(struct proto *prot)
2042 {
2043 }
2044 #endif
2045 
2046 int proto_register(struct proto *prot, int alloc_slab)
2047 {
2048 	char *request_sock_slab_name = NULL;
2049 	char *timewait_sock_slab_name;
2050 
2051 	if (alloc_slab) {
2052 		prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
2053 					       SLAB_HWCACHE_ALIGN, NULL);
2054 
2055 		if (prot->slab == NULL) {
2056 			printk(KERN_CRIT "%s: Can't create sock SLAB cache!\n",
2057 			       prot->name);
2058 			goto out;
2059 		}
2060 
2061 		if (prot->rsk_prot != NULL) {
2062 			static const char mask[] = "request_sock_%s";
2063 
2064 			request_sock_slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL);
2065 			if (request_sock_slab_name == NULL)
2066 				goto out_free_sock_slab;
2067 
2068 			sprintf(request_sock_slab_name, mask, prot->name);
2069 			prot->rsk_prot->slab = kmem_cache_create(request_sock_slab_name,
2070 								 prot->rsk_prot->obj_size, 0,
2071 								 SLAB_HWCACHE_ALIGN, NULL);
2072 
2073 			if (prot->rsk_prot->slab == NULL) {
2074 				printk(KERN_CRIT "%s: Can't create request sock SLAB cache!\n",
2075 				       prot->name);
2076 				goto out_free_request_sock_slab_name;
2077 			}
2078 		}
2079 
2080 		if (prot->twsk_prot != NULL) {
2081 			static const char mask[] = "tw_sock_%s";
2082 
2083 			timewait_sock_slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL);
2084 
2085 			if (timewait_sock_slab_name == NULL)
2086 				goto out_free_request_sock_slab;
2087 
2088 			sprintf(timewait_sock_slab_name, mask, prot->name);
2089 			prot->twsk_prot->twsk_slab =
2090 				kmem_cache_create(timewait_sock_slab_name,
2091 						  prot->twsk_prot->twsk_obj_size,
2092 						  0, SLAB_HWCACHE_ALIGN,
2093 						  NULL);
2094 			if (prot->twsk_prot->twsk_slab == NULL)
2095 				goto out_free_timewait_sock_slab_name;
2096 		}
2097 	}
2098 
2099 	write_lock(&proto_list_lock);
2100 	list_add(&prot->node, &proto_list);
2101 	assign_proto_idx(prot);
2102 	write_unlock(&proto_list_lock);
2103 	return 0;
2104 
2105 out_free_timewait_sock_slab_name:
2106 	kfree(timewait_sock_slab_name);
2107 out_free_request_sock_slab:
2108 	if (prot->rsk_prot && prot->rsk_prot->slab) {
2109 		kmem_cache_destroy(prot->rsk_prot->slab);
2110 		prot->rsk_prot->slab = NULL;
2111 	}
2112 out_free_request_sock_slab_name:
2113 	kfree(request_sock_slab_name);
2114 out_free_sock_slab:
2115 	kmem_cache_destroy(prot->slab);
2116 	prot->slab = NULL;
2117 out:
2118 	return -ENOBUFS;
2119 }
2120 
2121 EXPORT_SYMBOL(proto_register);
2122 
2123 void proto_unregister(struct proto *prot)
2124 {
2125 	write_lock(&proto_list_lock);
2126 	release_proto_idx(prot);
2127 	list_del(&prot->node);
2128 	write_unlock(&proto_list_lock);
2129 
2130 	if (prot->slab != NULL) {
2131 		kmem_cache_destroy(prot->slab);
2132 		prot->slab = NULL;
2133 	}
2134 
2135 	if (prot->rsk_prot != NULL && prot->rsk_prot->slab != NULL) {
2136 		const char *name = kmem_cache_name(prot->rsk_prot->slab);
2137 
2138 		kmem_cache_destroy(prot->rsk_prot->slab);
2139 		kfree(name);
2140 		prot->rsk_prot->slab = NULL;
2141 	}
2142 
2143 	if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
2144 		const char *name = kmem_cache_name(prot->twsk_prot->twsk_slab);
2145 
2146 		kmem_cache_destroy(prot->twsk_prot->twsk_slab);
2147 		kfree(name);
2148 		prot->twsk_prot->twsk_slab = NULL;
2149 	}
2150 }
2151 
2152 EXPORT_SYMBOL(proto_unregister);
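
/*
 * Usage sketch (hypothetical names/values): a protocol module registers
 * its struct proto once at init time, asking for a backing slab, and
 * unregisters on exit.
 */
#if 0
static struct proto example_proto = {
	.name	  = "EXAMPLE",
	.owner	  = THIS_MODULE,
	.obj_size = sizeof(struct sock),	/* per-socket object size */
};

static int __init example_init(void)
{
	return proto_register(&example_proto, 1);	/* 1 => alloc_slab */
}

static void __exit example_exit(void)
{
	proto_unregister(&example_proto);
}
#endif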
2153 
2154 #ifdef CONFIG_PROC_FS
2155 static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
2156 	__acquires(proto_list_lock)
2157 {
2158 	read_lock(&proto_list_lock);
2159 	return seq_list_start_head(&proto_list, *pos);
2160 }
2161 
2162 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2163 {
2164 	return seq_list_next(v, &proto_list, pos);
2165 }
2166 
2167 static void proto_seq_stop(struct seq_file *seq, void *v)
2168 	__releases(proto_list_lock)
2169 {
2170 	read_unlock(&proto_list_lock);
2171 }
2172 
2173 static char proto_method_implemented(const void *method)
2174 {
2175 	return method == NULL ? 'n' : 'y';
2176 }
2177 
2178 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
2179 {
2180 	seq_printf(seq, "%-9s %4u %6d  %6d   %-3s %6u   %-3s  %-10s "
2181 			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
2182 		   proto->name,
2183 		   proto->obj_size,
2184 		   proto->sockets_allocated != NULL ? atomic_read(proto->sockets_allocated) : -1,
2185 		   proto->memory_allocated != NULL ? atomic_read(proto->memory_allocated) : -1,
2186 		   proto->memory_pressure != NULL ? *proto->memory_pressure ? "yes" : "no" : "NI",
2187 		   proto->max_header,
2188 		   proto->slab == NULL ? "no" : "yes",
2189 		   module_name(proto->owner),
2190 		   proto_method_implemented(proto->close),
2191 		   proto_method_implemented(proto->connect),
2192 		   proto_method_implemented(proto->disconnect),
2193 		   proto_method_implemented(proto->accept),
2194 		   proto_method_implemented(proto->ioctl),
2195 		   proto_method_implemented(proto->init),
2196 		   proto_method_implemented(proto->destroy),
2197 		   proto_method_implemented(proto->shutdown),
2198 		   proto_method_implemented(proto->setsockopt),
2199 		   proto_method_implemented(proto->getsockopt),
2200 		   proto_method_implemented(proto->sendmsg),
2201 		   proto_method_implemented(proto->recvmsg),
2202 		   proto_method_implemented(proto->sendpage),
2203 		   proto_method_implemented(proto->bind),
2204 		   proto_method_implemented(proto->backlog_rcv),
2205 		   proto_method_implemented(proto->hash),
2206 		   proto_method_implemented(proto->unhash),
2207 		   proto_method_implemented(proto->get_port),
2208 		   proto_method_implemented(proto->enter_memory_pressure));
2209 }
2210 
2211 static int proto_seq_show(struct seq_file *seq, void *v)
2212 {
2213 	if (v == &proto_list)
2214 		seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
2215 			   "protocol",
2216 			   "size",
2217 			   "sockets",
2218 			   "memory",
2219 			   "press",
2220 			   "maxhdr",
2221 			   "slab",
2222 			   "module",
2223 			   "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
2224 	else
2225 		proto_seq_printf(seq, list_entry(v, struct proto, node));
2226 	return 0;
2227 }
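
/*
 * Reading guide for /proc/net/protocols (derived from the format used in
 * proto_seq_printf() above): "size" is obj_size, "sockets" and "memory"
 * show sockets_allocated/memory_allocated or -1 when not tracked,
 * "press" is the memory-pressure flag (yes/no, or NI when not
 * instrumented), then "maxhdr", "slab", "module", followed by one y/n
 * flag per proto method in the order of the header abbreviations
 * (cl=close, co=connect, ..., em=enter_memory_pressure).
 */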
2228 
2229 static const struct seq_operations proto_seq_ops = {
2230 	.start  = proto_seq_start,
2231 	.next   = proto_seq_next,
2232 	.stop   = proto_seq_stop,
2233 	.show   = proto_seq_show,
2234 };
2235 
2236 static int proto_seq_open(struct inode *inode, struct file *file)
2237 {
2238 	return seq_open(file, &proto_seq_ops);
2239 }
2240 
2241 static const struct file_operations proto_seq_fops = {
2242 	.owner		= THIS_MODULE,
2243 	.open		= proto_seq_open,
2244 	.read		= seq_read,
2245 	.llseek		= seq_lseek,
2246 	.release	= seq_release,
2247 };
2248 
2249 static int __init proto_init(void)
2250 {
2251 	/* register /proc/net/protocols */
2252 	return proc_net_fops_create(&init_net, "protocols", S_IRUGO, &proto_seq_fops) == NULL ? -ENOBUFS : 0;
2253 }
2254 
2255 subsys_initcall(proto_init);
2256 
2257 #endif /* PROC_FS */
2258 
2259 EXPORT_SYMBOL(sk_alloc);
2260 EXPORT_SYMBOL(sk_free);
2261 EXPORT_SYMBOL(sk_send_sigurg);
2262 EXPORT_SYMBOL(sock_alloc_send_skb);
2263 EXPORT_SYMBOL(sock_init_data);
2264 EXPORT_SYMBOL(sock_kfree_s);
2265 EXPORT_SYMBOL(sock_kmalloc);
2266 EXPORT_SYMBOL(sock_no_accept);
2267 EXPORT_SYMBOL(sock_no_bind);
2268 EXPORT_SYMBOL(sock_no_connect);
2269 EXPORT_SYMBOL(sock_no_getname);
2270 EXPORT_SYMBOL(sock_no_getsockopt);
2271 EXPORT_SYMBOL(sock_no_ioctl);
2272 EXPORT_SYMBOL(sock_no_listen);
2273 EXPORT_SYMBOL(sock_no_mmap);
2274 EXPORT_SYMBOL(sock_no_poll);
2275 EXPORT_SYMBOL(sock_no_recvmsg);
2276 EXPORT_SYMBOL(sock_no_sendmsg);
2277 EXPORT_SYMBOL(sock_no_sendpage);
2278 EXPORT_SYMBOL(sock_no_setsockopt);
2279 EXPORT_SYMBOL(sock_no_shutdown);
2280 EXPORT_SYMBOL(sock_no_socketpair);
2281 EXPORT_SYMBOL(sock_rfree);
2282 EXPORT_SYMBOL(sock_setsockopt);
2283 EXPORT_SYMBOL(sock_wfree);
2284 EXPORT_SYMBOL(sock_wmalloc);
2285 EXPORT_SYMBOL(sock_i_uid);
2286 EXPORT_SYMBOL(sock_i_ino);
2287 EXPORT_SYMBOL(sysctl_optmem_max);
2288