xref: /openbmc/linux/net/core/sock.c (revision bec36eca)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		Generic socket support routines. Memory allocators, socket lock/release
7  *		handler for protocols to use and generic option handler.
8  *
9  *
10  * Authors:	Ross Biro
11  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *		Florian La Roche, <flla@stud.uni-sb.de>
13  *		Alan Cox, <A.Cox@swansea.ac.uk>
14  *
15  * Fixes:
16  *		Alan Cox	: 	Numerous verify_area() problems
17  *		Alan Cox	:	Connecting on a connecting socket
18  *					now returns an error for tcp.
19  *		Alan Cox	:	sock->protocol is set correctly.
20  *					and is not sometimes left as 0.
21  *		Alan Cox	:	connect handles icmp errors on a
22  *					connect properly. Unfortunately there
23  *					is a restart syscall nasty there. I
24  *					can't match BSD without hacking the C
25  *					library. Ideas urgently sought!
26  *		Alan Cox	:	Disallow bind() to addresses that are
27  *					not ours - especially broadcast ones!!
28  *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
29  *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
30  *					instead they leave that for the DESTROY timer.
31  *		Alan Cox	:	Clean up error flag in accept
32  *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
33  *					was buggy. Put a remove_sock() in the handler
34  *					for memory when we hit 0. Also altered the timer
35  *					code. The ACK stuff can wait and needs major
36  *					TCP layer surgery.
37  *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
38  *					and fixed timer/inet_bh race.
39  *		Alan Cox	:	Added zapped flag for TCP
40  *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
41  *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
42  *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
43  *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
44  *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
45  *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
46  *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
47  *	Pauline Middelink	:	identd support
48  *		Alan Cox	:	Fixed connect() taking signals I think.
49  *		Alan Cox	:	SO_LINGER supported
50  *		Alan Cox	:	Error reporting fixes
51  *		Anonymous	:	inet_create tidied up (sk->reuse setting)
52  *		Alan Cox	:	inet sockets don't set sk->type!
53  *		Alan Cox	:	Split socket option code
54  *		Alan Cox	:	Callbacks
55  *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
56  *		Alex		:	Removed restriction on inet fioctl
57  *		Alan Cox	:	Splitting INET from NET core
58  *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
59  *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
60  *		Alan Cox	:	Split IP from generic code
61  *		Alan Cox	:	New kfree_skbmem()
62  *		Alan Cox	:	Make SO_DEBUG superuser only.
63  *		Alan Cox	:	Allow anyone to clear SO_DEBUG
64  *					(compatibility fix)
65  *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
66  *		Alan Cox	:	Allocator for a socket is settable.
67  *		Alan Cox	:	SO_ERROR includes soft errors.
68  *		Alan Cox	:	Allow NULL arguments on some SO_ opts
69  *		Alan Cox	: 	Generic socket allocation to make hooks
70  *					easier (suggested by Craig Metz).
71  *		Michael Pall	:	SO_ERROR returns positive errno again
72  *              Steve Whitehouse:       Added default destructor to free
73  *                                      protocol private data.
74  *              Steve Whitehouse:       Added various other default routines
75  *                                      common to several socket families.
76  *              Chris Evans     :       Call suser() check last on F_SETOWN
77  *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
78  *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
79  *		Andi Kleen	:	Fix write_space callback
80  *		Chris Evans	:	Security fixes - signedness again
81  *		Arnaldo C. Melo :       cleanups, use skb_queue_purge
82  *
83  * To Fix:
84  *
85  *
86  *		This program is free software; you can redistribute it and/or
87  *		modify it under the terms of the GNU General Public License
88  *		as published by the Free Software Foundation; either version
89  *		2 of the License, or (at your option) any later version.
90  */
91 
92 #include <linux/capability.h>
93 #include <linux/errno.h>
94 #include <linux/types.h>
95 #include <linux/socket.h>
96 #include <linux/in.h>
97 #include <linux/kernel.h>
98 #include <linux/module.h>
99 #include <linux/proc_fs.h>
100 #include <linux/seq_file.h>
101 #include <linux/sched.h>
102 #include <linux/timer.h>
103 #include <linux/string.h>
104 #include <linux/sockios.h>
105 #include <linux/net.h>
106 #include <linux/mm.h>
107 #include <linux/slab.h>
108 #include <linux/interrupt.h>
109 #include <linux/poll.h>
110 #include <linux/tcp.h>
111 #include <linux/init.h>
112 #include <linux/highmem.h>
113 
114 #include <asm/uaccess.h>
115 #include <asm/system.h>
116 
117 #include <linux/netdevice.h>
118 #include <net/protocol.h>
119 #include <linux/skbuff.h>
120 #include <net/net_namespace.h>
121 #include <net/request_sock.h>
122 #include <net/sock.h>
123 #include <linux/net_tstamp.h>
124 #include <net/xfrm.h>
125 #include <linux/ipsec.h>
126 
127 #include <linux/filter.h>
128 
129 #ifdef CONFIG_INET
130 #include <net/tcp.h>
131 #endif
132 
133 /*
134  * Each address family might have different locking rules, so we have
135  * one slock key per address family:
136  */
137 static struct lock_class_key af_family_keys[AF_MAX];
138 static struct lock_class_key af_family_slock_keys[AF_MAX];
139 
140 /*
141  * Make lock validator output more readable. (we pre-construct these
142  * strings build-time, so that runtime initialization of socket
143  * locks is fast):
144  */
145 static const char *af_family_key_strings[AF_MAX+1] = {
146   "sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX"     , "sk_lock-AF_INET"     ,
147   "sk_lock-AF_AX25"  , "sk_lock-AF_IPX"      , "sk_lock-AF_APPLETALK",
148   "sk_lock-AF_NETROM", "sk_lock-AF_BRIDGE"   , "sk_lock-AF_ATMPVC"   ,
149   "sk_lock-AF_X25"   , "sk_lock-AF_INET6"    , "sk_lock-AF_ROSE"     ,
150   "sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI"  , "sk_lock-AF_SECURITY" ,
151   "sk_lock-AF_KEY"   , "sk_lock-AF_NETLINK"  , "sk_lock-AF_PACKET"   ,
152   "sk_lock-AF_ASH"   , "sk_lock-AF_ECONET"   , "sk_lock-AF_ATMSVC"   ,
153   "sk_lock-AF_RDS"   , "sk_lock-AF_SNA"      , "sk_lock-AF_IRDA"     ,
154   "sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE"  , "sk_lock-AF_LLC"      ,
155   "sk_lock-27"       , "sk_lock-28"          , "sk_lock-AF_CAN"      ,
156   "sk_lock-AF_TIPC"  , "sk_lock-AF_BLUETOOTH", "sk_lock-IUCV"        ,
157   "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN"     , "sk_lock-AF_PHONET"   ,
158   "sk_lock-AF_MAX"
159 };
160 static const char *af_family_slock_key_strings[AF_MAX+1] = {
161   "slock-AF_UNSPEC", "slock-AF_UNIX"     , "slock-AF_INET"     ,
162   "slock-AF_AX25"  , "slock-AF_IPX"      , "slock-AF_APPLETALK",
163   "slock-AF_NETROM", "slock-AF_BRIDGE"   , "slock-AF_ATMPVC"   ,
164   "slock-AF_X25"   , "slock-AF_INET6"    , "slock-AF_ROSE"     ,
165   "slock-AF_DECnet", "slock-AF_NETBEUI"  , "slock-AF_SECURITY" ,
166   "slock-AF_KEY"   , "slock-AF_NETLINK"  , "slock-AF_PACKET"   ,
167   "slock-AF_ASH"   , "slock-AF_ECONET"   , "slock-AF_ATMSVC"   ,
168   "slock-AF_RDS"   , "slock-AF_SNA"      , "slock-AF_IRDA"     ,
169   "slock-AF_PPPOX" , "slock-AF_WANPIPE"  , "slock-AF_LLC"      ,
170   "slock-27"       , "slock-28"          , "slock-AF_CAN"      ,
171   "slock-AF_TIPC"  , "slock-AF_BLUETOOTH", "slock-AF_IUCV"     ,
172   "slock-AF_RXRPC" , "slock-AF_ISDN"     , "slock-AF_PHONET"   ,
173   "slock-AF_MAX"
174 };
175 static const char *af_family_clock_key_strings[AF_MAX+1] = {
176   "clock-AF_UNSPEC", "clock-AF_UNIX"     , "clock-AF_INET"     ,
177   "clock-AF_AX25"  , "clock-AF_IPX"      , "clock-AF_APPLETALK",
178   "clock-AF_NETROM", "clock-AF_BRIDGE"   , "clock-AF_ATMPVC"   ,
179   "clock-AF_X25"   , "clock-AF_INET6"    , "clock-AF_ROSE"     ,
180   "clock-AF_DECnet", "clock-AF_NETBEUI"  , "clock-AF_SECURITY" ,
181   "clock-AF_KEY"   , "clock-AF_NETLINK"  , "clock-AF_PACKET"   ,
182   "clock-AF_ASH"   , "clock-AF_ECONET"   , "clock-AF_ATMSVC"   ,
183   "clock-AF_RDS"   , "clock-AF_SNA"      , "clock-AF_IRDA"     ,
184   "clock-AF_PPPOX" , "clock-AF_WANPIPE"  , "clock-AF_LLC"      ,
185   "clock-27"       , "clock-28"          , "clock-AF_CAN"      ,
186   "clock-AF_TIPC"  , "clock-AF_BLUETOOTH", "clock-AF_IUCV"     ,
187   "clock-AF_RXRPC" , "clock-AF_ISDN"     , "clock-AF_PHONET"   ,
188   "clock-AF_MAX"
189 };
190 
191 /*
192  * sk_callback_lock locking rules are per-address-family,
193  * so split the lock classes by using a per-AF key:
194  */
195 static struct lock_class_key af_callback_keys[AF_MAX];
196 
197 /* Take into consideration the size of the struct sk_buff overhead in the
198  * determination of these values, since that is non-constant across
199  * platforms.  This makes socket queueing behavior and performance
200  * not depend upon such differences.
201  */
202 #define _SK_MEM_PACKETS		256
203 #define _SK_MEM_OVERHEAD	(sizeof(struct sk_buff) + 256)
204 #define SK_WMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
205 #define SK_RMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
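
/*
 * Worked example (illustrative only; sizeof(struct sk_buff) differs by
 * architecture and config, ~240 bytes is an assumed figure):
 *
 *	_SK_MEM_OVERHEAD = 240 + 256         = 496 bytes per packet
 *	SK_WMEM_MAX      = 496 * 256 packets = 126976 bytes (~124 KiB)
 *
 * so the default buffer limits below track the real per-skb overhead of
 * the platform instead of being fixed constants.
 */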
206 
207 /* Run time adjustable parameters. */
208 __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
209 __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
210 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
211 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
212 
213 /* Maximal space eaten by iovec or ancillary data plus some space */
214 int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
215 
216 static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
217 {
218 	struct timeval tv;
219 
220 	if (optlen < sizeof(tv))
221 		return -EINVAL;
222 	if (copy_from_user(&tv, optval, sizeof(tv)))
223 		return -EFAULT;
224 	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
225 		return -EDOM;
226 
227 	if (tv.tv_sec < 0) {
228 		static int warned __read_mostly;
229 
230 		*timeo_p = 0;
231 		if (warned < 10 && net_ratelimit()) {
232 			warned++;
233 			printk(KERN_INFO "sock_set_timeout: `%s' (pid %d) "
234 			       "tries to set negative timeout\n",
235 				current->comm, task_pid_nr(current));
236 		}
237 		return 0;
238 	}
239 	*timeo_p = MAX_SCHEDULE_TIMEOUT;
240 	if (tv.tv_sec == 0 && tv.tv_usec == 0)
241 		return 0;
242 	if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
243 		*timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
244 	return 0;
245 }
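
/*
 * Illustrative userspace sketch (not part of this file): setting a
 * 2.5 second receive timeout, which lands in sock_set_timeout() above.
 * The fd is a hypothetical connected socket.
 *
 *	struct timeval tv = { .tv_sec = 2, .tv_usec = 500000 };
 *
 *	if (setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv)) < 0)
 *		perror("setsockopt(SO_RCVTIMEO)");
 *
 * With an assumed HZ of 250 (4000 usec per jiffy) the conversion above
 * gives *timeo_p = 2*250 + (500000 + 3999)/4000 = 500 + 125 = 625 jiffies.
 */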
246 
247 static void sock_warn_obsolete_bsdism(const char *name)
248 {
249 	static int warned;
250 	static char warncomm[TASK_COMM_LEN];
251 	if (strcmp(warncomm, current->comm) && warned < 5) {
252 		strcpy(warncomm,  current->comm);
253 		printk(KERN_WARNING "process `%s' is using obsolete "
254 		       "%s SO_BSDCOMPAT\n", warncomm, name);
255 		warned++;
256 	}
257 }
258 
259 static void sock_disable_timestamp(struct sock *sk, int flag)
260 {
261 	if (sock_flag(sk, flag)) {
262 		sock_reset_flag(sk, flag);
263 		if (!sock_flag(sk, SOCK_TIMESTAMP) &&
264 		    !sock_flag(sk, SOCK_TIMESTAMPING_RX_SOFTWARE)) {
265 			net_disable_timestamp();
266 		}
267 	}
268 }
269 
270 
271 int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
272 {
273 	int err = 0;
274 	int skb_len;
275 
276 	/* Cast sk->rcvbuf to unsigned... It's pointless, but reduces
277 	   number of warnings when compiling with -W --ANK
278 	 */
279 	if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
280 	    (unsigned)sk->sk_rcvbuf) {
281 		err = -ENOMEM;
282 		goto out;
283 	}
284 
285 	err = sk_filter(sk, skb);
286 	if (err)
287 		goto out;
288 
289 	if (!sk_rmem_schedule(sk, skb->truesize)) {
290 		err = -ENOBUFS;
291 		goto out;
292 	}
293 
294 	skb->dev = NULL;
295 	skb_set_owner_r(skb, sk);
296 
297 	/* Cache the SKB length before we tack it onto the receive
298 	 * queue.  Once it is added it no longer belongs to us and
299 	 * may be freed by other threads of control pulling packets
300 	 * from the queue.
301 	 */
302 	skb_len = skb->len;
303 
304 	skb_queue_tail(&sk->sk_receive_queue, skb);
305 
306 	if (!sock_flag(sk, SOCK_DEAD))
307 		sk->sk_data_ready(sk, skb_len);
308 out:
309 	return err;
310 }
311 EXPORT_SYMBOL(sock_queue_rcv_skb);
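
/*
 * Sketch of how a protocol's receive path typically uses this helper
 * (hypothetical handler, modelled loosely on the UDP/raw pattern):
 *
 *	static int myproto_rcv_skb(struct sock *sk, struct sk_buff *skb)
 *	{
 *		if (sock_queue_rcv_skb(sk, skb) < 0) {
 *			kfree_skb(skb);		skb was not consumed, drop it
 *			return -1;
 *		}
 *		return 0;	queued; sk_data_ready notified any reader
 *	}
 */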
312 
313 int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested)
314 {
315 	int rc = NET_RX_SUCCESS;
316 
317 	if (sk_filter(sk, skb))
318 		goto discard_and_relse;
319 
320 	skb->dev = NULL;
321 
322 	if (nested)
323 		bh_lock_sock_nested(sk);
324 	else
325 		bh_lock_sock(sk);
326 	if (!sock_owned_by_user(sk)) {
327 		/*
328 		 * trylock + unlock semantics:
329 		 */
330 		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
331 
332 		rc = sk_backlog_rcv(sk, skb);
333 
334 		mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
335 	} else
336 		sk_add_backlog(sk, skb);
337 	bh_unlock_sock(sk);
338 out:
339 	sock_put(sk);
340 	return rc;
341 discard_and_relse:
342 	kfree_skb(skb);
343 	goto out;
344 }
345 EXPORT_SYMBOL(sk_receive_skb);
346 
347 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
348 {
349 	struct dst_entry *dst = sk->sk_dst_cache;
350 
351 	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
352 		sk->sk_dst_cache = NULL;
353 		dst_release(dst);
354 		return NULL;
355 	}
356 
357 	return dst;
358 }
359 EXPORT_SYMBOL(__sk_dst_check);
360 
361 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
362 {
363 	struct dst_entry *dst = sk_dst_get(sk);
364 
365 	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
366 		sk_dst_reset(sk);
367 		dst_release(dst);
368 		return NULL;
369 	}
370 
371 	return dst;
372 }
373 EXPORT_SYMBOL(sk_dst_check);
374 
375 static int sock_bindtodevice(struct sock *sk, char __user *optval, int optlen)
376 {
377 	int ret = -ENOPROTOOPT;
378 #ifdef CONFIG_NETDEVICES
379 	struct net *net = sock_net(sk);
380 	char devname[IFNAMSIZ];
381 	int index;
382 
383 	/* Sorry... */
384 	ret = -EPERM;
385 	if (!capable(CAP_NET_RAW))
386 		goto out;
387 
388 	ret = -EINVAL;
389 	if (optlen < 0)
390 		goto out;
391 
392 	/* Bind this socket to a particular device like "eth0",
393 	 * as specified in the passed interface name. If the
394 	 * name is "" or the option length is zero the socket
395 	 * is not bound.
396 	 */
397 	if (optlen > IFNAMSIZ - 1)
398 		optlen = IFNAMSIZ - 1;
399 	memset(devname, 0, sizeof(devname));
400 
401 	ret = -EFAULT;
402 	if (copy_from_user(devname, optval, optlen))
403 		goto out;
404 
405 	if (devname[0] == '\0') {
406 		index = 0;
407 	} else {
408 		struct net_device *dev = dev_get_by_name(net, devname);
409 
410 		ret = -ENODEV;
411 		if (!dev)
412 			goto out;
413 
414 		index = dev->ifindex;
415 		dev_put(dev);
416 	}
417 
418 	lock_sock(sk);
419 	sk->sk_bound_dev_if = index;
420 	sk_dst_reset(sk);
421 	release_sock(sk);
422 
423 	ret = 0;
424 
425 out:
426 #endif
427 
428 	return ret;
429 }
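
/*
 * Illustrative userspace sketch (not part of this file): binding a
 * socket to "eth0".  This needs CAP_NET_RAW, matching the capable()
 * check above; the fd and interface name are hypothetical.
 *
 *	const char *ifname = "eth0";
 *
 *	if (setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE,
 *		       ifname, strlen(ifname) + 1) < 0)
 *		perror("setsockopt(SO_BINDTODEVICE)");
 *
 * Passing an empty name (or a zero option length) unbinds the socket.
 */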
430 
431 static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
432 {
433 	if (valbool)
434 		sock_set_flag(sk, bit);
435 	else
436 		sock_reset_flag(sk, bit);
437 }
438 
439 /*
440  *	This is meant for all protocols to use and covers goings on
441  *	at the socket level. Everything here is generic.
442  */
443 
444 int sock_setsockopt(struct socket *sock, int level, int optname,
445 		    char __user *optval, int optlen)
446 {
447 	struct sock *sk = sock->sk;
448 	int val;
449 	int valbool;
450 	struct linger ling;
451 	int ret = 0;
452 
453 	/*
454 	 *	Options without arguments
455 	 */
456 
457 	if (optname == SO_BINDTODEVICE)
458 		return sock_bindtodevice(sk, optval, optlen);
459 
460 	if (optlen < sizeof(int))
461 		return -EINVAL;
462 
463 	if (get_user(val, (int __user *)optval))
464 		return -EFAULT;
465 
466 	valbool = val ? 1 : 0;
467 
468 	lock_sock(sk);
469 
470 	switch (optname) {
471 	case SO_DEBUG:
472 		if (val && !capable(CAP_NET_ADMIN)) {
473 			ret = -EACCES;
474 		} else
475 			sock_valbool_flag(sk, SOCK_DBG, valbool);
476 		break;
477 	case SO_REUSEADDR:
478 		sk->sk_reuse = valbool;
479 		break;
480 	case SO_TYPE:
481 	case SO_ERROR:
482 		ret = -ENOPROTOOPT;
483 		break;
484 	case SO_DONTROUTE:
485 		sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
486 		break;
487 	case SO_BROADCAST:
488 		sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
489 		break;
490 	case SO_SNDBUF:
491 		/* Don't return an error; BSD doesn't, and if you think
492 		   about it this is right. Otherwise applications have to
493 		   play 'guess the biggest size' games. RCVBUF/SNDBUF
494 		   are treated as hints in BSD. */
495 
496 		if (val > sysctl_wmem_max)
497 			val = sysctl_wmem_max;
498 set_sndbuf:
499 		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
500 		if ((val * 2) < SOCK_MIN_SNDBUF)
501 			sk->sk_sndbuf = SOCK_MIN_SNDBUF;
502 		else
503 			sk->sk_sndbuf = val * 2;
504 
505 		/*
506 		 *	Wake up sending tasks if we
507 		 *	upped the value.
508 		 */
509 		sk->sk_write_space(sk);
510 		break;
511 
512 	case SO_SNDBUFFORCE:
513 		if (!capable(CAP_NET_ADMIN)) {
514 			ret = -EPERM;
515 			break;
516 		}
517 		goto set_sndbuf;
518 
519 	case SO_RCVBUF:
520 		/* Don't return an error; BSD doesn't, and if you think
521 		   about it this is right. Otherwise applications have to
522 		   play 'guess the biggest size' games. RCVBUF/SNDBUF
523 		   are treated as hints in BSD. */
524 
525 		if (val > sysctl_rmem_max)
526 			val = sysctl_rmem_max;
527 set_rcvbuf:
528 		sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
529 		/*
530 		 * We double it on the way in to account for
531 		 * "struct sk_buff" etc. overhead.   Applications
532 		 * assume that the SO_RCVBUF setting they make will
533 		 * allow that much actual data to be received on that
534 		 * socket.
535 		 *
536 		 * Applications are unaware that "struct sk_buff" and
537 		 * other overheads allocate from the receive buffer
538 		 * during socket buffer allocation.
539 		 *
540 		 * And after considering the possible alternatives,
541 		 * returning the value we actually used in getsockopt
542 		 * is the most desirable behavior.
543 		 */
544 		if ((val * 2) < SOCK_MIN_RCVBUF)
545 			sk->sk_rcvbuf = SOCK_MIN_RCVBUF;
546 		else
547 			sk->sk_rcvbuf = val * 2;
548 		break;
549 
550 	case SO_RCVBUFFORCE:
551 		if (!capable(CAP_NET_ADMIN)) {
552 			ret = -EPERM;
553 			break;
554 		}
555 		goto set_rcvbuf;
556 
557 	case SO_KEEPALIVE:
558 #ifdef CONFIG_INET
559 		if (sk->sk_protocol == IPPROTO_TCP)
560 			tcp_set_keepalive(sk, valbool);
561 #endif
562 		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
563 		break;
564 
565 	case SO_OOBINLINE:
566 		sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
567 		break;
568 
569 	case SO_NO_CHECK:
570 		sk->sk_no_check = valbool;
571 		break;
572 
573 	case SO_PRIORITY:
574 		if ((val >= 0 && val <= 6) || capable(CAP_NET_ADMIN))
575 			sk->sk_priority = val;
576 		else
577 			ret = -EPERM;
578 		break;
579 
580 	case SO_LINGER:
581 		if (optlen < sizeof(ling)) {
582 			ret = -EINVAL;	/* 1003.1g */
583 			break;
584 		}
585 		if (copy_from_user(&ling, optval, sizeof(ling))) {
586 			ret = -EFAULT;
587 			break;
588 		}
589 		if (!ling.l_onoff)
590 			sock_reset_flag(sk, SOCK_LINGER);
591 		else {
592 #if (BITS_PER_LONG == 32)
593 			if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
594 				sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
595 			else
596 #endif
597 				sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
598 			sock_set_flag(sk, SOCK_LINGER);
599 		}
600 		break;
601 
602 	case SO_BSDCOMPAT:
603 		sock_warn_obsolete_bsdism("setsockopt");
604 		break;
605 
606 	case SO_PASSCRED:
607 		if (valbool)
608 			set_bit(SOCK_PASSCRED, &sock->flags);
609 		else
610 			clear_bit(SOCK_PASSCRED, &sock->flags);
611 		break;
612 
613 	case SO_TIMESTAMP:
614 	case SO_TIMESTAMPNS:
615 		if (valbool)  {
616 			if (optname == SO_TIMESTAMP)
617 				sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
618 			else
619 				sock_set_flag(sk, SOCK_RCVTSTAMPNS);
620 			sock_set_flag(sk, SOCK_RCVTSTAMP);
621 			sock_enable_timestamp(sk, SOCK_TIMESTAMP);
622 		} else {
623 			sock_reset_flag(sk, SOCK_RCVTSTAMP);
624 			sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
625 		}
626 		break;
627 
628 	case SO_TIMESTAMPING:
629 		if (val & ~SOF_TIMESTAMPING_MASK) {
630 			ret = -EINVAL;
631 			break;
632 		}
633 		sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE,
634 				  val & SOF_TIMESTAMPING_TX_HARDWARE);
635 		sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE,
636 				  val & SOF_TIMESTAMPING_TX_SOFTWARE);
637 		sock_valbool_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE,
638 				  val & SOF_TIMESTAMPING_RX_HARDWARE);
639 		if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
640 			sock_enable_timestamp(sk,
641 					      SOCK_TIMESTAMPING_RX_SOFTWARE);
642 		else
643 			sock_disable_timestamp(sk,
644 					       SOCK_TIMESTAMPING_RX_SOFTWARE);
645 		sock_valbool_flag(sk, SOCK_TIMESTAMPING_SOFTWARE,
646 				  val & SOF_TIMESTAMPING_SOFTWARE);
647 		sock_valbool_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE,
648 				  val & SOF_TIMESTAMPING_SYS_HARDWARE);
649 		sock_valbool_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE,
650 				  val & SOF_TIMESTAMPING_RAW_HARDWARE);
651 		break;
652 
653 	case SO_RCVLOWAT:
654 		if (val < 0)
655 			val = INT_MAX;
656 		sk->sk_rcvlowat = val ? : 1;
657 		break;
658 
659 	case SO_RCVTIMEO:
660 		ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
661 		break;
662 
663 	case SO_SNDTIMEO:
664 		ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
665 		break;
666 
667 	case SO_ATTACH_FILTER:
668 		ret = -EINVAL;
669 		if (optlen == sizeof(struct sock_fprog)) {
670 			struct sock_fprog fprog;
671 
672 			ret = -EFAULT;
673 			if (copy_from_user(&fprog, optval, sizeof(fprog)))
674 				break;
675 
676 			ret = sk_attach_filter(&fprog, sk);
677 		}
678 		break;
679 
680 	case SO_DETACH_FILTER:
681 		ret = sk_detach_filter(sk);
682 		break;
683 
684 	case SO_PASSSEC:
685 		if (valbool)
686 			set_bit(SOCK_PASSSEC, &sock->flags);
687 		else
688 			clear_bit(SOCK_PASSSEC, &sock->flags);
689 		break;
690 	case SO_MARK:
691 		if (!capable(CAP_NET_ADMIN))
692 			ret = -EPERM;
693 		else {
694 			sk->sk_mark = val;
695 		}
696 		break;
697 
698 		/* We implement the SO_SNDLOWAT etc to
699 		   not be settable (1003.1g 5.3) */
700 	default:
701 		ret = -ENOPROTOOPT;
702 		break;
703 	}
704 	release_sock(sk);
705 	return ret;
706 }
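
/*
 * Illustrative userspace sketch (not part of this file): the buffer
 * size options are hints and are doubled internally, so reading the
 * value back does not return what was written.  fd is hypothetical.
 *
 *	int req = 65536, got = 0;
 *	socklen_t len = sizeof(got);
 *
 *	setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &req, sizeof(req));
 *	getsockopt(fd, SOL_SOCKET, SO_SNDBUF, &got, &len);
 *
 * got is now 131072 (2 * 65536), assuming 65536 <= sysctl_wmem_max.
 */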
707 
708 
709 int sock_getsockopt(struct socket *sock, int level, int optname,
710 		    char __user *optval, int __user *optlen)
711 {
712 	struct sock *sk = sock->sk;
713 
714 	union {
715 		int val;
716 		struct linger ling;
717 		struct timeval tm;
718 	} v;
719 
720 	unsigned int lv = sizeof(int);
721 	int len;
722 
723 	if (get_user(len, optlen))
724 		return -EFAULT;
725 	if (len < 0)
726 		return -EINVAL;
727 
728 	memset(&v, 0, sizeof(v));
729 
730 	switch (optname) {
731 	case SO_DEBUG:
732 		v.val = sock_flag(sk, SOCK_DBG);
733 		break;
734 
735 	case SO_DONTROUTE:
736 		v.val = sock_flag(sk, SOCK_LOCALROUTE);
737 		break;
738 
739 	case SO_BROADCAST:
740 		v.val = !!sock_flag(sk, SOCK_BROADCAST);
741 		break;
742 
743 	case SO_SNDBUF:
744 		v.val = sk->sk_sndbuf;
745 		break;
746 
747 	case SO_RCVBUF:
748 		v.val = sk->sk_rcvbuf;
749 		break;
750 
751 	case SO_REUSEADDR:
752 		v.val = sk->sk_reuse;
753 		break;
754 
755 	case SO_KEEPALIVE:
756 		v.val = !!sock_flag(sk, SOCK_KEEPOPEN);
757 		break;
758 
759 	case SO_TYPE:
760 		v.val = sk->sk_type;
761 		break;
762 
763 	case SO_ERROR:
764 		v.val = -sock_error(sk);
765 		if (v.val == 0)
766 			v.val = xchg(&sk->sk_err_soft, 0);
767 		break;
768 
769 	case SO_OOBINLINE:
770 		v.val = !!sock_flag(sk, SOCK_URGINLINE);
771 		break;
772 
773 	case SO_NO_CHECK:
774 		v.val = sk->sk_no_check;
775 		break;
776 
777 	case SO_PRIORITY:
778 		v.val = sk->sk_priority;
779 		break;
780 
781 	case SO_LINGER:
782 		lv		= sizeof(v.ling);
783 		v.ling.l_onoff	= !!sock_flag(sk, SOCK_LINGER);
784 		v.ling.l_linger	= sk->sk_lingertime / HZ;
785 		break;
786 
787 	case SO_BSDCOMPAT:
788 		sock_warn_obsolete_bsdism("getsockopt");
789 		break;
790 
791 	case SO_TIMESTAMP:
792 		v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
793 				!sock_flag(sk, SOCK_RCVTSTAMPNS);
794 		break;
795 
796 	case SO_TIMESTAMPNS:
797 		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
798 		break;
799 
800 	case SO_TIMESTAMPING:
801 		v.val = 0;
802 		if (sock_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE))
803 			v.val |= SOF_TIMESTAMPING_TX_HARDWARE;
804 		if (sock_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE))
805 			v.val |= SOF_TIMESTAMPING_TX_SOFTWARE;
806 		if (sock_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE))
807 			v.val |= SOF_TIMESTAMPING_RX_HARDWARE;
808 		if (sock_flag(sk, SOCK_TIMESTAMPING_RX_SOFTWARE))
809 			v.val |= SOF_TIMESTAMPING_RX_SOFTWARE;
810 		if (sock_flag(sk, SOCK_TIMESTAMPING_SOFTWARE))
811 			v.val |= SOF_TIMESTAMPING_SOFTWARE;
812 		if (sock_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE))
813 			v.val |= SOF_TIMESTAMPING_SYS_HARDWARE;
814 		if (sock_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE))
815 			v.val |= SOF_TIMESTAMPING_RAW_HARDWARE;
816 		break;
817 
818 	case SO_RCVTIMEO:
819 		lv = sizeof(struct timeval);
820 		if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
821 			v.tm.tv_sec = 0;
822 			v.tm.tv_usec = 0;
823 		} else {
824 			v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
825 			v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
826 		}
827 		break;
828 
829 	case SO_SNDTIMEO:
830 		lv = sizeof(struct timeval);
831 		if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
832 			v.tm.tv_sec = 0;
833 			v.tm.tv_usec = 0;
834 		} else {
835 			v.tm.tv_sec = sk->sk_sndtimeo / HZ;
836 			v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
837 		}
838 		break;
839 
840 	case SO_RCVLOWAT:
841 		v.val = sk->sk_rcvlowat;
842 		break;
843 
844 	case SO_SNDLOWAT:
845 		v.val = 1;
846 		break;
847 
848 	case SO_PASSCRED:
849 		v.val = test_bit(SOCK_PASSCRED, &sock->flags) ? 1 : 0;
850 		break;
851 
852 	case SO_PEERCRED:
853 		if (len > sizeof(sk->sk_peercred))
854 			len = sizeof(sk->sk_peercred);
855 		if (copy_to_user(optval, &sk->sk_peercred, len))
856 			return -EFAULT;
857 		goto lenout;
858 
859 	case SO_PEERNAME:
860 	{
861 		char address[128];
862 
863 		if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
864 			return -ENOTCONN;
865 		if (lv < len)
866 			return -EINVAL;
867 		if (copy_to_user(optval, address, len))
868 			return -EFAULT;
869 		goto lenout;
870 	}
871 
872 	/* Dubious BSD thing... Probably nobody even uses it, but
873 	 * the UNIX standard wants it for whatever reason... -DaveM
874 	 */
875 	case SO_ACCEPTCONN:
876 		v.val = sk->sk_state == TCP_LISTEN;
877 		break;
878 
879 	case SO_PASSSEC:
880 		v.val = test_bit(SOCK_PASSSEC, &sock->flags) ? 1 : 0;
881 		break;
882 
883 	case SO_PEERSEC:
884 		return security_socket_getpeersec_stream(sock, optval, optlen, len);
885 
886 	case SO_MARK:
887 		v.val = sk->sk_mark;
888 		break;
889 
890 	default:
891 		return -ENOPROTOOPT;
892 	}
893 
894 	if (len > lv)
895 		len = lv;
896 	if (copy_to_user(optval, &v, len))
897 		return -EFAULT;
898 lenout:
899 	if (put_user(len, optlen))
900 		return -EFAULT;
901 	return 0;
902 }
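
/*
 * Illustrative userspace sketch (not part of this file): SO_ERROR
 * returns and clears the pending socket error, which is the usual way
 * to collect the result of a non-blocking connect().  fd is hypothetical.
 *
 *	int err = 0;
 *	socklen_t len = sizeof(err);
 *
 *	if (getsockopt(fd, SOL_SOCKET, SO_ERROR, &err, &len) == 0 && err)
 *		fprintf(stderr, "connect failed: %s\n", strerror(err));
 */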
903 
904 /*
905  * Initialize an sk_lock.
906  *
907  * (We also register the sk_lock with the lock validator.)
908  */
909 static inline void sock_lock_init(struct sock *sk)
910 {
911 	sock_lock_init_class_and_name(sk,
912 			af_family_slock_key_strings[sk->sk_family],
913 			af_family_slock_keys + sk->sk_family,
914 			af_family_key_strings[sk->sk_family],
915 			af_family_keys + sk->sk_family);
916 }
917 
918 static void sock_copy(struct sock *nsk, const struct sock *osk)
919 {
920 #ifdef CONFIG_SECURITY_NETWORK
921 	void *sptr = nsk->sk_security;
922 #endif
923 
924 	memcpy(nsk, osk, osk->sk_prot->obj_size);
925 #ifdef CONFIG_SECURITY_NETWORK
926 	nsk->sk_security = sptr;
927 	security_sk_clone(osk, nsk);
928 #endif
929 }
930 
931 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
932 		int family)
933 {
934 	struct sock *sk;
935 	struct kmem_cache *slab;
936 
937 	slab = prot->slab;
938 	if (slab != NULL)
939 		sk = kmem_cache_alloc(slab, priority);
940 	else
941 		sk = kmalloc(prot->obj_size, priority);
942 
943 	if (sk != NULL) {
944 		if (security_sk_alloc(sk, family, priority))
945 			goto out_free;
946 
947 		if (!try_module_get(prot->owner))
948 			goto out_free_sec;
949 	}
950 
951 	return sk;
952 
953 out_free_sec:
954 	security_sk_free(sk);
955 out_free:
956 	if (slab != NULL)
957 		kmem_cache_free(slab, sk);
958 	else
959 		kfree(sk);
960 	return NULL;
961 }
962 
963 static void sk_prot_free(struct proto *prot, struct sock *sk)
964 {
965 	struct kmem_cache *slab;
966 	struct module *owner;
967 
968 	owner = prot->owner;
969 	slab = prot->slab;
970 
971 	security_sk_free(sk);
972 	if (slab != NULL)
973 		kmem_cache_free(slab, sk);
974 	else
975 		kfree(sk);
976 	module_put(owner);
977 }
978 
979 /**
980  *	sk_alloc - All socket objects are allocated here
981  *	@net: the applicable net namespace
982  *	@family: protocol family
983  *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
984  *	@prot: struct proto associated with this new sock instance
985  */
986 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
987 		      struct proto *prot)
988 {
989 	struct sock *sk;
990 
991 	sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
992 	if (sk) {
993 		sk->sk_family = family;
994 		/*
995 		 * See comment in struct sock definition to understand
996 		 * why we need sk_prot_creator -acme
997 		 */
998 		sk->sk_prot = sk->sk_prot_creator = prot;
999 		sock_lock_init(sk);
1000 		sock_net_set(sk, get_net(net));
1001 	}
1002 
1003 	return sk;
1004 }
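
/*
 * Sketch of the usual allocation sequence in an address family's
 * create() routine (hypothetical family and proto; compare with
 * inet_create() for a real user):
 *
 *	sk = sk_alloc(net, PF_MYPROTO, GFP_KERNEL, &myproto_prot);
 *	if (!sk)
 *		return -ENOBUFS;
 *	sock_init_data(sock, sk);	sets up queues, buffers, callbacks
 *	sk->sk_destruct = myproto_destruct;
 */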
1005 
1006 void sk_free(struct sock *sk)
1007 {
1008 	struct sk_filter *filter;
1009 
1010 	if (sk->sk_destruct)
1011 		sk->sk_destruct(sk);
1012 
1013 	filter = rcu_dereference(sk->sk_filter);
1014 	if (filter) {
1015 		sk_filter_uncharge(sk, filter);
1016 		rcu_assign_pointer(sk->sk_filter, NULL);
1017 	}
1018 
1019 	sock_disable_timestamp(sk, SOCK_TIMESTAMP);
1020 	sock_disable_timestamp(sk, SOCK_TIMESTAMPING_RX_SOFTWARE);
1021 
1022 	if (atomic_read(&sk->sk_omem_alloc))
1023 		printk(KERN_DEBUG "%s: optmem leakage (%d bytes) detected.\n",
1024 		       __func__, atomic_read(&sk->sk_omem_alloc));
1025 
1026 	put_net(sock_net(sk));
1027 	sk_prot_free(sk->sk_prot_creator, sk);
1028 }
1029 
1030 /*
1031  * The last sock_put should drop the reference to sk->sk_net. It has already
1032  * been dropped in sk_change_net. Taking a reference to the stopping namespace
1033  * is not an option.
1034  * Take a reference to the socket to remove it from the hash while still _alive_,
1035  * and after that destroy it in the context of init_net.
1036  */
1037 void sk_release_kernel(struct sock *sk)
1038 {
1039 	if (sk == NULL || sk->sk_socket == NULL)
1040 		return;
1041 
1042 	sock_hold(sk);
1043 	sock_release(sk->sk_socket);
1044 	release_net(sock_net(sk));
1045 	sock_net_set(sk, get_net(&init_net));
1046 	sock_put(sk);
1047 }
1048 EXPORT_SYMBOL(sk_release_kernel);
1049 
1050 struct sock *sk_clone(const struct sock *sk, const gfp_t priority)
1051 {
1052 	struct sock *newsk;
1053 
1054 	newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
1055 	if (newsk != NULL) {
1056 		struct sk_filter *filter;
1057 
1058 		sock_copy(newsk, sk);
1059 
1060 		/* SANITY */
1061 		get_net(sock_net(newsk));
1062 		sk_node_init(&newsk->sk_node);
1063 		sock_lock_init(newsk);
1064 		bh_lock_sock(newsk);
1065 		newsk->sk_backlog.head	= newsk->sk_backlog.tail = NULL;
1066 
1067 		atomic_set(&newsk->sk_rmem_alloc, 0);
1068 		atomic_set(&newsk->sk_wmem_alloc, 0);
1069 		atomic_set(&newsk->sk_omem_alloc, 0);
1070 		skb_queue_head_init(&newsk->sk_receive_queue);
1071 		skb_queue_head_init(&newsk->sk_write_queue);
1072 #ifdef CONFIG_NET_DMA
1073 		skb_queue_head_init(&newsk->sk_async_wait_queue);
1074 #endif
1075 
1076 		rwlock_init(&newsk->sk_dst_lock);
1077 		rwlock_init(&newsk->sk_callback_lock);
1078 		lockdep_set_class_and_name(&newsk->sk_callback_lock,
1079 				af_callback_keys + newsk->sk_family,
1080 				af_family_clock_key_strings[newsk->sk_family]);
1081 
1082 		newsk->sk_dst_cache	= NULL;
1083 		newsk->sk_wmem_queued	= 0;
1084 		newsk->sk_forward_alloc = 0;
1085 		newsk->sk_send_head	= NULL;
1086 		newsk->sk_userlocks	= sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
1087 
1088 		sock_reset_flag(newsk, SOCK_DONE);
1089 		skb_queue_head_init(&newsk->sk_error_queue);
1090 
1091 		filter = newsk->sk_filter;
1092 		if (filter != NULL)
1093 			sk_filter_charge(newsk, filter);
1094 
1095 		if (unlikely(xfrm_sk_clone_policy(newsk))) {
1096 			/* It is still a raw copy of the parent, so invalidate
1097 			 * the destructor and do a plain sk_free() */
1098 			newsk->sk_destruct = NULL;
1099 			sk_free(newsk);
1100 			newsk = NULL;
1101 			goto out;
1102 		}
1103 
1104 		newsk->sk_err	   = 0;
1105 		newsk->sk_priority = 0;
1106 		atomic_set(&newsk->sk_refcnt, 2);
1107 
1108 		/*
1109 		 * Increment the counter in the same struct proto as the master
1110 		 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
1111 		 * is the same as sk->sk_prot->socks, as this field was copied
1112 		 * with memcpy).
1113 		 *
1114 		 * This _changes_ the previous behaviour, where
1115 		 * tcp_create_openreq_child was always incrementing the
1116 		 * equivalent of tcp_prot->socks (inet_sock_nr), so this has
1117 		 * to be taken into account in all callers. -acme
1118 		 */
1119 		sk_refcnt_debug_inc(newsk);
1120 		sk_set_socket(newsk, NULL);
1121 		newsk->sk_sleep	 = NULL;
1122 
1123 		if (newsk->sk_prot->sockets_allocated)
1124 			percpu_counter_inc(newsk->sk_prot->sockets_allocated);
1125 	}
1126 out:
1127 	return newsk;
1128 }
1129 
1130 EXPORT_SYMBOL_GPL(sk_clone);
1131 
1132 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1133 {
1134 	__sk_dst_set(sk, dst);
1135 	sk->sk_route_caps = dst->dev->features;
1136 	if (sk->sk_route_caps & NETIF_F_GSO)
1137 		sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
1138 	if (sk_can_gso(sk)) {
1139 		if (dst->header_len) {
1140 			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1141 		} else {
1142 			sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
1143 			sk->sk_gso_max_size = dst->dev->gso_max_size;
1144 		}
1145 	}
1146 }
1147 EXPORT_SYMBOL_GPL(sk_setup_caps);
1148 
1149 void __init sk_init(void)
1150 {
1151 	if (num_physpages <= 4096) {
1152 		sysctl_wmem_max = 32767;
1153 		sysctl_rmem_max = 32767;
1154 		sysctl_wmem_default = 32767;
1155 		sysctl_rmem_default = 32767;
1156 	} else if (num_physpages >= 131072) {
1157 		sysctl_wmem_max = 131071;
1158 		sysctl_rmem_max = 131071;
1159 	}
1160 }
1161 
1162 /*
1163  *	Simple resource managers for sockets.
1164  */
1165 
1166 
1167 /*
1168  * Write buffer destructor automatically called from kfree_skb.
1169  */
1170 void sock_wfree(struct sk_buff *skb)
1171 {
1172 	struct sock *sk = skb->sk;
1173 
1174 	/* In case it might be waiting for more memory. */
1175 	atomic_sub(skb->truesize, &sk->sk_wmem_alloc);
1176 	if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE))
1177 		sk->sk_write_space(sk);
1178 	sock_put(sk);
1179 }
1180 
1181 /*
1182  * Read buffer destructor automatically called from kfree_skb.
1183  */
1184 void sock_rfree(struct sk_buff *skb)
1185 {
1186 	struct sock *sk = skb->sk;
1187 
1188 	atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
1189 	sk_mem_uncharge(skb->sk, skb->truesize);
1190 }
1191 
1192 
1193 int sock_i_uid(struct sock *sk)
1194 {
1195 	int uid;
1196 
1197 	read_lock(&sk->sk_callback_lock);
1198 	uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : 0;
1199 	read_unlock(&sk->sk_callback_lock);
1200 	return uid;
1201 }
1202 
1203 unsigned long sock_i_ino(struct sock *sk)
1204 {
1205 	unsigned long ino;
1206 
1207 	read_lock(&sk->sk_callback_lock);
1208 	ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
1209 	read_unlock(&sk->sk_callback_lock);
1210 	return ino;
1211 }
1212 
1213 /*
1214  * Allocate a skb from the socket's send buffer.
1215  */
1216 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
1217 			     gfp_t priority)
1218 {
1219 	if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1220 		struct sk_buff *skb = alloc_skb(size, priority);
1221 		if (skb) {
1222 			skb_set_owner_w(skb, sk);
1223 			return skb;
1224 		}
1225 	}
1226 	return NULL;
1227 }
1228 
1229 /*
1230  * Allocate a skb from the socket's receive buffer.
1231  */
1232 struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force,
1233 			     gfp_t priority)
1234 {
1235 	if (force || atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
1236 		struct sk_buff *skb = alloc_skb(size, priority);
1237 		if (skb) {
1238 			skb_set_owner_r(skb, sk);
1239 			return skb;
1240 		}
1241 	}
1242 	return NULL;
1243 }
1244 
1245 /*
1246  * Allocate a memory block from the socket's option memory buffer.
1247  */
1248 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
1249 {
1250 	if ((unsigned)size <= sysctl_optmem_max &&
1251 	    atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
1252 		void *mem;
1253 		/* First do the add, to avoid the race if kmalloc
1254 		 * might sleep.
1255 		 */
1256 		atomic_add(size, &sk->sk_omem_alloc);
1257 		mem = kmalloc(size, priority);
1258 		if (mem)
1259 			return mem;
1260 		atomic_sub(size, &sk->sk_omem_alloc);
1261 	}
1262 	return NULL;
1263 }
1264 
1265 /*
1266  * Free an option memory block.
1267  */
1268 void sock_kfree_s(struct sock *sk, void *mem, int size)
1269 {
1270 	kfree(mem);
1271 	atomic_sub(size, &sk->sk_omem_alloc);
1272 }
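
/*
 * Sketch of the intended pairing (hypothetical caller): option memory
 * is charged to sk_omem_alloc on allocation and must be released with
 * sock_kfree_s() using the same size so the accounting balances.
 *
 *	struct my_opt *opt = sock_kmalloc(sk, sizeof(*opt), GFP_KERNEL);
 *
 *	if (!opt)
 *		return -ENOBUFS;
 *	...
 *	sock_kfree_s(sk, opt, sizeof(*opt));
 */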
1273 
1274 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
1275    I think these locks should be removed for datagram sockets.
1276  */
1277 static long sock_wait_for_wmem(struct sock *sk, long timeo)
1278 {
1279 	DEFINE_WAIT(wait);
1280 
1281 	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1282 	for (;;) {
1283 		if (!timeo)
1284 			break;
1285 		if (signal_pending(current))
1286 			break;
1287 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1288 		prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
1289 		if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
1290 			break;
1291 		if (sk->sk_shutdown & SEND_SHUTDOWN)
1292 			break;
1293 		if (sk->sk_err)
1294 			break;
1295 		timeo = schedule_timeout(timeo);
1296 	}
1297 	finish_wait(sk->sk_sleep, &wait);
1298 	return timeo;
1299 }
1300 
1301 
1302 /*
1303  *	Generic send/receive buffer handlers
1304  */
1305 
1306 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
1307 				     unsigned long data_len, int noblock,
1308 				     int *errcode)
1309 {
1310 	struct sk_buff *skb;
1311 	gfp_t gfp_mask;
1312 	long timeo;
1313 	int err;
1314 
1315 	gfp_mask = sk->sk_allocation;
1316 	if (gfp_mask & __GFP_WAIT)
1317 		gfp_mask |= __GFP_REPEAT;
1318 
1319 	timeo = sock_sndtimeo(sk, noblock);
1320 	while (1) {
1321 		err = sock_error(sk);
1322 		if (err != 0)
1323 			goto failure;
1324 
1325 		err = -EPIPE;
1326 		if (sk->sk_shutdown & SEND_SHUTDOWN)
1327 			goto failure;
1328 
1329 		if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1330 			skb = alloc_skb(header_len, gfp_mask);
1331 			if (skb) {
1332 				int npages;
1333 				int i;
1334 
1335 				/* No pages, we're done... */
1336 				if (!data_len)
1337 					break;
1338 
1339 				npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
1340 				skb->truesize += data_len;
1341 				skb_shinfo(skb)->nr_frags = npages;
1342 				for (i = 0; i < npages; i++) {
1343 					struct page *page;
1344 					skb_frag_t *frag;
1345 
1346 					page = alloc_pages(sk->sk_allocation, 0);
1347 					if (!page) {
1348 						err = -ENOBUFS;
1349 						skb_shinfo(skb)->nr_frags = i;
1350 						kfree_skb(skb);
1351 						goto failure;
1352 					}
1353 
1354 					frag = &skb_shinfo(skb)->frags[i];
1355 					frag->page = page;
1356 					frag->page_offset = 0;
1357 					frag->size = (data_len >= PAGE_SIZE ?
1358 						      PAGE_SIZE :
1359 						      data_len);
1360 					data_len -= PAGE_SIZE;
1361 				}
1362 
1363 				/* Full success... */
1364 				break;
1365 			}
1366 			err = -ENOBUFS;
1367 			goto failure;
1368 		}
1369 		set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1370 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1371 		err = -EAGAIN;
1372 		if (!timeo)
1373 			goto failure;
1374 		if (signal_pending(current))
1375 			goto interrupted;
1376 		timeo = sock_wait_for_wmem(sk, timeo);
1377 	}
1378 
1379 	skb_set_owner_w(skb, sk);
1380 	return skb;
1381 
1382 interrupted:
1383 	err = sock_intr_errno(timeo);
1384 failure:
1385 	*errcode = err;
1386 	return NULL;
1387 }
1388 EXPORT_SYMBOL(sock_alloc_send_pskb);
1389 
1390 struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
1391 				    int noblock, int *errcode)
1392 {
1393 	return sock_alloc_send_pskb(sk, size, 0, noblock, errcode);
1394 }
1395 
1396 static void __lock_sock(struct sock *sk)
1397 {
1398 	DEFINE_WAIT(wait);
1399 
1400 	for (;;) {
1401 		prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
1402 					TASK_UNINTERRUPTIBLE);
1403 		spin_unlock_bh(&sk->sk_lock.slock);
1404 		schedule();
1405 		spin_lock_bh(&sk->sk_lock.slock);
1406 		if (!sock_owned_by_user(sk))
1407 			break;
1408 	}
1409 	finish_wait(&sk->sk_lock.wq, &wait);
1410 }
1411 
1412 static void __release_sock(struct sock *sk)
1413 {
1414 	struct sk_buff *skb = sk->sk_backlog.head;
1415 
1416 	do {
1417 		sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
1418 		bh_unlock_sock(sk);
1419 
1420 		do {
1421 			struct sk_buff *next = skb->next;
1422 
1423 			skb->next = NULL;
1424 			sk_backlog_rcv(sk, skb);
1425 
1426 			/*
1427 			 * We are in process context here with softirqs
1428 			 * disabled, use cond_resched_softirq() to preempt.
1429 			 * This is safe to do because we've taken the backlog
1430 			 * queue private:
1431 			 */
1432 			cond_resched_softirq();
1433 
1434 			skb = next;
1435 		} while (skb != NULL);
1436 
1437 		bh_lock_sock(sk);
1438 	} while ((skb = sk->sk_backlog.head) != NULL);
1439 }
1440 
1441 /**
1442  * sk_wait_data - wait for data to arrive at sk_receive_queue
1443  * @sk:    sock to wait on
1444  * @timeo: for how long
1445  *
1446  * Now the socket state, including sk->sk_err, is changed only under the lock;
1447  * hence we may omit checks after joining the wait queue.
1448  * We check the receive queue before schedule() only as an optimization;
1449  * it is very likely that release_sock() added new data.
1450  */
1451 int sk_wait_data(struct sock *sk, long *timeo)
1452 {
1453 	int rc;
1454 	DEFINE_WAIT(wait);
1455 
1456 	prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
1457 	set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1458 	rc = sk_wait_event(sk, timeo, !skb_queue_empty(&sk->sk_receive_queue));
1459 	clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1460 	finish_wait(sk->sk_sleep, &wait);
1461 	return rc;
1462 }
1463 
1464 EXPORT_SYMBOL(sk_wait_data);
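
/*
 * Sketch of how a blocking receive path typically drives sk_wait_data()
 * (hypothetical and simplified; compare with the stream receive loops):
 *
 *	long timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
 *
 *	while (skb_queue_empty(&sk->sk_receive_queue)) {
 *		if (!timeo)
 *			return -EAGAIN;
 *		if (signal_pending(current))
 *			return sock_intr_errno(timeo);
 *		sk_wait_data(sk, &timeo);
 *	}
 *
 * sk_wait_event() inside releases and re-takes the socket lock around
 * the sleep, so the caller must already hold it via lock_sock().
 */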
1465 
1466 /**
1467  *	__sk_mem_schedule - increase sk_forward_alloc and memory_allocated
1468  *	@sk: socket
1469  *	@size: memory size to allocate
1470  *	@kind: allocation type
1471  *
1472  *	If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
1473  *	rmem allocation. This function assumes that protocols which have
1474  *	memory_pressure use sk_wmem_queued as write buffer accounting.
1475  */
1476 int __sk_mem_schedule(struct sock *sk, int size, int kind)
1477 {
1478 	struct proto *prot = sk->sk_prot;
1479 	int amt = sk_mem_pages(size);
1480 	int allocated;
1481 
1482 	sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
1483 	allocated = atomic_add_return(amt, prot->memory_allocated);
1484 
1485 	/* Under limit. */
1486 	if (allocated <= prot->sysctl_mem[0]) {
1487 		if (prot->memory_pressure && *prot->memory_pressure)
1488 			*prot->memory_pressure = 0;
1489 		return 1;
1490 	}
1491 
1492 	/* Under pressure. */
1493 	if (allocated > prot->sysctl_mem[1])
1494 		if (prot->enter_memory_pressure)
1495 			prot->enter_memory_pressure(sk);
1496 
1497 	/* Over hard limit. */
1498 	if (allocated > prot->sysctl_mem[2])
1499 		goto suppress_allocation;
1500 
1501 	/* guarantee minimum buffer size under pressure */
1502 	if (kind == SK_MEM_RECV) {
1503 		if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
1504 			return 1;
1505 	} else { /* SK_MEM_SEND */
1506 		if (sk->sk_type == SOCK_STREAM) {
1507 			if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
1508 				return 1;
1509 		} else if (atomic_read(&sk->sk_wmem_alloc) <
1510 			   prot->sysctl_wmem[0])
1511 				return 1;
1512 	}
1513 
1514 	if (prot->memory_pressure) {
1515 		int alloc;
1516 
1517 		if (!*prot->memory_pressure)
1518 			return 1;
1519 		alloc = percpu_counter_read_positive(prot->sockets_allocated);
1520 		if (prot->sysctl_mem[2] > alloc *
1521 		    sk_mem_pages(sk->sk_wmem_queued +
1522 				 atomic_read(&sk->sk_rmem_alloc) +
1523 				 sk->sk_forward_alloc))
1524 			return 1;
1525 	}
1526 
1527 suppress_allocation:
1528 
1529 	if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
1530 		sk_stream_moderate_sndbuf(sk);
1531 
1532 		/* Fail only if socket is _under_ its sndbuf.
1533 		 * In this case we cannot block, so that we have to fail.
1534 		 */
1535 		if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
1536 			return 1;
1537 	}
1538 
1539 	/* Alas. Undo changes. */
1540 	sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;
1541 	atomic_sub(amt, prot->memory_allocated);
1542 	return 0;
1543 }
1544 
1545 EXPORT_SYMBOL(__sk_mem_schedule);
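
/*
 * Worked example of the accounting above, assuming PAGE_SIZE == 4096
 * (so SK_MEM_QUANTUM == 4096): charging size = 3000 bytes gives
 * amt = sk_mem_pages(3000) = 1, sk_forward_alloc grows by 4096 and
 * prot->memory_allocated grows by one quantum; the unused 1096 bytes
 * stay in sk_forward_alloc for later charges until __sk_mem_reclaim()
 * returns the whole unused quanta to memory_allocated.
 */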
1546 
1547 /**
1548  *	__sk_mem_reclaim - reclaim memory_allocated
1549  *	@sk: socket
1550  */
1551 void __sk_mem_reclaim(struct sock *sk)
1552 {
1553 	struct proto *prot = sk->sk_prot;
1554 
1555 	atomic_sub(sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT,
1556 		   prot->memory_allocated);
1557 	sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1;
1558 
1559 	if (prot->memory_pressure && *prot->memory_pressure &&
1560 	    (atomic_read(prot->memory_allocated) < prot->sysctl_mem[0]))
1561 		*prot->memory_pressure = 0;
1562 }
1563 
1564 EXPORT_SYMBOL(__sk_mem_reclaim);
1565 
1566 
1567 /*
1568  * Set of default routines for initialising struct proto_ops when
1569  * the protocol does not support a particular function. In certain
1570  * cases where it makes no sense for a protocol to have a "do nothing"
1571  * function, some default processing is provided.
1572  */
1573 
1574 int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
1575 {
1576 	return -EOPNOTSUPP;
1577 }
1578 
1579 int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
1580 		    int len, int flags)
1581 {
1582 	return -EOPNOTSUPP;
1583 }
1584 
1585 int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
1586 {
1587 	return -EOPNOTSUPP;
1588 }
1589 
1590 int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
1591 {
1592 	return -EOPNOTSUPP;
1593 }
1594 
1595 int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
1596 		    int *len, int peer)
1597 {
1598 	return -EOPNOTSUPP;
1599 }
1600 
1601 unsigned int sock_no_poll(struct file *file, struct socket *sock, poll_table *pt)
1602 {
1603 	return 0;
1604 }
1605 
1606 int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
1607 {
1608 	return -EOPNOTSUPP;
1609 }
1610 
1611 int sock_no_listen(struct socket *sock, int backlog)
1612 {
1613 	return -EOPNOTSUPP;
1614 }
1615 
1616 int sock_no_shutdown(struct socket *sock, int how)
1617 {
1618 	return -EOPNOTSUPP;
1619 }
1620 
1621 int sock_no_setsockopt(struct socket *sock, int level, int optname,
1622 		    char __user *optval, int optlen)
1623 {
1624 	return -EOPNOTSUPP;
1625 }
1626 
1627 int sock_no_getsockopt(struct socket *sock, int level, int optname,
1628 		    char __user *optval, int __user *optlen)
1629 {
1630 	return -EOPNOTSUPP;
1631 }
1632 
1633 int sock_no_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
1634 		    size_t len)
1635 {
1636 	return -EOPNOTSUPP;
1637 }
1638 
1639 int sock_no_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
1640 		    size_t len, int flags)
1641 {
1642 	return -EOPNOTSUPP;
1643 }
1644 
1645 int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
1646 {
1647 	/* Mirror missing mmap method error code */
1648 	return -ENODEV;
1649 }
1650 
1651 ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
1652 {
1653 	ssize_t res;
1654 	struct msghdr msg = {.msg_flags = flags};
1655 	struct kvec iov;
1656 	char *kaddr = kmap(page);
1657 	iov.iov_base = kaddr + offset;
1658 	iov.iov_len = size;
1659 	res = kernel_sendmsg(sock, &msg, &iov, 1, size);
1660 	kunmap(page);
1661 	return res;
1662 }
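
/*
 * Sketch of how a protocol wires these stubs into its proto_ops for
 * operations it does not implement (hypothetical table; see the packet
 * or netlink families for real examples of the same pattern):
 *
 *	static const struct proto_ops myproto_ops = {
 *		.family		= PF_MYPROTO,
 *		.owner		= THIS_MODULE,
 *		.release	= myproto_release,
 *		.bind		= sock_no_bind,
 *		.accept		= sock_no_accept,
 *		.mmap		= sock_no_mmap,
 *		.sendpage	= sock_no_sendpage,
 *	};
 */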
1663 
1664 /*
1665  *	Default Socket Callbacks
1666  */
1667 
1668 static void sock_def_wakeup(struct sock *sk)
1669 {
1670 	read_lock(&sk->sk_callback_lock);
1671 	if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
1672 		wake_up_interruptible_all(sk->sk_sleep);
1673 	read_unlock(&sk->sk_callback_lock);
1674 }
1675 
1676 static void sock_def_error_report(struct sock *sk)
1677 {
1678 	read_lock(&sk->sk_callback_lock);
1679 	if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
1680 		wake_up_interruptible_poll(sk->sk_sleep, POLLERR);
1681 	sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
1682 	read_unlock(&sk->sk_callback_lock);
1683 }
1684 
1685 static void sock_def_readable(struct sock *sk, int len)
1686 {
1687 	read_lock(&sk->sk_callback_lock);
1688 	if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
1689 		wake_up_interruptible_sync_poll(sk->sk_sleep, POLLIN |
1690 						POLLRDNORM | POLLRDBAND);
1691 	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
1692 	read_unlock(&sk->sk_callback_lock);
1693 }
1694 
1695 static void sock_def_write_space(struct sock *sk)
1696 {
1697 	read_lock(&sk->sk_callback_lock);
1698 
1699 	/* Do not wake up a writer until he can make "significant"
1700 	 * progress.  --DaveM
1701 	 */
1702 	if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
1703 		if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
1704 			wake_up_interruptible_sync_poll(sk->sk_sleep, POLLOUT |
1705 						POLLWRNORM | POLLWRBAND);
1706 
1707 		/* Should agree with poll, otherwise some programs break */
1708 		if (sock_writeable(sk))
1709 			sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
1710 	}
1711 
1712 	read_unlock(&sk->sk_callback_lock);
1713 }
1714 
1715 static void sock_def_destruct(struct sock *sk)
1716 {
1717 	kfree(sk->sk_protinfo);
1718 }
1719 
1720 void sk_send_sigurg(struct sock *sk)
1721 {
1722 	if (sk->sk_socket && sk->sk_socket->file)
1723 		if (send_sigurg(&sk->sk_socket->file->f_owner))
1724 			sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
1725 }
1726 
1727 void sk_reset_timer(struct sock *sk, struct timer_list *timer,
1728 		    unsigned long expires)
1729 {
1730 	if (!mod_timer(timer, expires))
1731 		sock_hold(sk);
1732 }
1733 
1734 EXPORT_SYMBOL(sk_reset_timer);
1735 
1736 void sk_stop_timer(struct sock *sk, struct timer_list *timer)
1737 {
1738 	if (timer_pending(timer) && del_timer(timer))
1739 		__sock_put(sk);
1740 }
1741 
1742 EXPORT_SYMBOL(sk_stop_timer);
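
/*
 * Sketch of the intended usage (hypothetical timer): sk_reset_timer()
 * takes a socket reference only when the timer was not already pending,
 * and sk_stop_timer() drops it only when a pending timer is actually
 * deleted, so arming and stopping stay balanced against sk_refcnt.
 *
 *	sk_reset_timer(sk, &sk->sk_timer, jiffies + msecs_to_jiffies(200));
 *	...
 *	sk_stop_timer(sk, &sk->sk_timer);
 */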
1743 
1744 void sock_init_data(struct socket *sock, struct sock *sk)
1745 {
1746 	skb_queue_head_init(&sk->sk_receive_queue);
1747 	skb_queue_head_init(&sk->sk_write_queue);
1748 	skb_queue_head_init(&sk->sk_error_queue);
1749 #ifdef CONFIG_NET_DMA
1750 	skb_queue_head_init(&sk->sk_async_wait_queue);
1751 #endif
1752 
1753 	sk->sk_send_head	=	NULL;
1754 
1755 	init_timer(&sk->sk_timer);
1756 
1757 	sk->sk_allocation	=	GFP_KERNEL;
1758 	sk->sk_rcvbuf		=	sysctl_rmem_default;
1759 	sk->sk_sndbuf		=	sysctl_wmem_default;
1760 	sk->sk_state		=	TCP_CLOSE;
1761 	sk_set_socket(sk, sock);
1762 
1763 	sock_set_flag(sk, SOCK_ZAPPED);
1764 
1765 	if (sock) {
1766 		sk->sk_type	=	sock->type;
1767 		sk->sk_sleep	=	&sock->wait;
1768 		sock->sk	=	sk;
1769 	} else
1770 		sk->sk_sleep	=	NULL;
1771 
1772 	rwlock_init(&sk->sk_dst_lock);
1773 	rwlock_init(&sk->sk_callback_lock);
1774 	lockdep_set_class_and_name(&sk->sk_callback_lock,
1775 			af_callback_keys + sk->sk_family,
1776 			af_family_clock_key_strings[sk->sk_family]);
1777 
1778 	sk->sk_state_change	=	sock_def_wakeup;
1779 	sk->sk_data_ready	=	sock_def_readable;
1780 	sk->sk_write_space	=	sock_def_write_space;
1781 	sk->sk_error_report	=	sock_def_error_report;
1782 	sk->sk_destruct		=	sock_def_destruct;
1783 
1784 	sk->sk_sndmsg_page	=	NULL;
1785 	sk->sk_sndmsg_off	=	0;
1786 
1787 	sk->sk_peercred.pid 	=	0;
1788 	sk->sk_peercred.uid	=	-1;
1789 	sk->sk_peercred.gid	=	-1;
1790 	sk->sk_write_pending	=	0;
1791 	sk->sk_rcvlowat		=	1;
1792 	sk->sk_rcvtimeo		=	MAX_SCHEDULE_TIMEOUT;
1793 	sk->sk_sndtimeo		=	MAX_SCHEDULE_TIMEOUT;
1794 
1795 	sk->sk_stamp = ktime_set(-1L, 0);
1796 
1797 	atomic_set(&sk->sk_refcnt, 1);
1798 	atomic_set(&sk->sk_drops, 0);
1799 }
1800 
1801 void lock_sock_nested(struct sock *sk, int subclass)
1802 {
1803 	might_sleep();
1804 	spin_lock_bh(&sk->sk_lock.slock);
1805 	if (sk->sk_lock.owned)
1806 		__lock_sock(sk);
1807 	sk->sk_lock.owned = 1;
1808 	spin_unlock(&sk->sk_lock.slock);
1809 	/*
1810 	 * The sk_lock has mutex_lock() semantics here:
1811 	 */
1812 	mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
1813 	local_bh_enable();
1814 }
1815 
1816 EXPORT_SYMBOL(lock_sock_nested);
1817 
1818 void release_sock(struct sock *sk)
1819 {
1820 	/*
1821 	 * The sk_lock has mutex_unlock() semantics:
1822 	 */
1823 	mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
1824 
1825 	spin_lock_bh(&sk->sk_lock.slock);
1826 	if (sk->sk_backlog.tail)
1827 		__release_sock(sk);
1828 	sk->sk_lock.owned = 0;
1829 	if (waitqueue_active(&sk->sk_lock.wq))
1830 		wake_up(&sk->sk_lock.wq);
1831 	spin_unlock_bh(&sk->sk_lock.slock);
1832 }
1833 EXPORT_SYMBOL(release_sock);
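
/*
 * Sketch of the canonical process-context locking pattern (hypothetical
 * caller): while the socket is owned, softirq input is parked on the
 * backlog and release_sock() replays it via __release_sock() above.
 *
 *	lock_sock(sk);
 *	...	modify protocol state, may sleep
 *	release_sock(sk);	also processes any backlogged packets
 */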
1834 
1835 int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
1836 {
1837 	struct timeval tv;
1838 	if (!sock_flag(sk, SOCK_TIMESTAMP))
1839 		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
1840 	tv = ktime_to_timeval(sk->sk_stamp);
1841 	if (tv.tv_sec == -1)
1842 		return -ENOENT;
1843 	if (tv.tv_sec == 0) {
1844 		sk->sk_stamp = ktime_get_real();
1845 		tv = ktime_to_timeval(sk->sk_stamp);
1846 	}
1847 	return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
1848 }
1849 EXPORT_SYMBOL(sock_get_timestamp);
1850 
1851 int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
1852 {
1853 	struct timespec ts;
1854 	if (!sock_flag(sk, SOCK_TIMESTAMP))
1855 		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
1856 	ts = ktime_to_timespec(sk->sk_stamp);
1857 	if (ts.tv_sec == -1)
1858 		return -ENOENT;
1859 	if (ts.tv_sec == 0) {
1860 		sk->sk_stamp = ktime_get_real();
1861 		ts = ktime_to_timespec(sk->sk_stamp);
1862 	}
1863 	return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
1864 }
1865 EXPORT_SYMBOL(sock_get_timestampns);
1866 
1867 void sock_enable_timestamp(struct sock *sk, int flag)
1868 {
1869 	if (!sock_flag(sk, flag)) {
1870 		sock_set_flag(sk, flag);
1871 		/*
1872 		 * we just set one of the two flags which require net
1873 		 * time stamping, but time stamping might have been on
1874 		 * already because of the other one
1875 		 */
1876 		if (!sock_flag(sk,
1877 				flag == SOCK_TIMESTAMP ?
1878 				SOCK_TIMESTAMPING_RX_SOFTWARE :
1879 				SOCK_TIMESTAMP))
1880 			net_enable_timestamp();
1881 	}
1882 }
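
/*
 * Illustrative userspace sketch (not part of this file): once
 * SO_TIMESTAMP is enabled, the receive path attaches an SCM_TIMESTAMP
 * control message carrying a struct timeval.  fd and msg are hypothetical.
 *
 *	int on = 1;
 *	struct cmsghdr *cmsg;
 *	struct timeval tv;
 *
 *	setsockopt(fd, SOL_SOCKET, SO_TIMESTAMP, &on, sizeof(on));
 *	recvmsg(fd, &msg, 0);	msg.msg_control must have room for cmsgs
 *	for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg))
 *		if (cmsg->cmsg_level == SOL_SOCKET &&
 *		    cmsg->cmsg_type == SCM_TIMESTAMP)
 *			memcpy(&tv, CMSG_DATA(cmsg), sizeof(tv));
 */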
1883 
1884 /*
1885  *	Get a socket option on a socket.
1886  *
1887  *	FIX: POSIX 1003.1g is very ambiguous here. It states that
1888  *	asynchronous errors should be reported by getsockopt. We assume
1889  *	this means only if you specify SO_ERROR (otherwise what's the point of it).
1890  */
1891 int sock_common_getsockopt(struct socket *sock, int level, int optname,
1892 			   char __user *optval, int __user *optlen)
1893 {
1894 	struct sock *sk = sock->sk;
1895 
1896 	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
1897 }
1898 
1899 EXPORT_SYMBOL(sock_common_getsockopt);
1900 
1901 #ifdef CONFIG_COMPAT
1902 int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
1903 				  char __user *optval, int __user *optlen)
1904 {
1905 	struct sock *sk = sock->sk;
1906 
1907 	if (sk->sk_prot->compat_getsockopt != NULL)
1908 		return sk->sk_prot->compat_getsockopt(sk, level, optname,
1909 						      optval, optlen);
1910 	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
1911 }
1912 EXPORT_SYMBOL(compat_sock_common_getsockopt);
1913 #endif
1914 
1915 int sock_common_recvmsg(struct kiocb *iocb, struct socket *sock,
1916 			struct msghdr *msg, size_t size, int flags)
1917 {
1918 	struct sock *sk = sock->sk;
1919 	int addr_len = 0;
1920 	int err;
1921 
1922 	err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT,
1923 				   flags & ~MSG_DONTWAIT, &addr_len);
1924 	if (err >= 0)
1925 		msg->msg_namelen = addr_len;
1926 	return err;
1927 }
1928 
1929 EXPORT_SYMBOL(sock_common_recvmsg);
1930 
1931 /*
1932  *	Set socket options on an inet socket.
1933  */
1934 int sock_common_setsockopt(struct socket *sock, int level, int optname,
1935 			   char __user *optval, int optlen)
1936 {
1937 	struct sock *sk = sock->sk;
1938 
1939 	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
1940 }
1941 
1942 EXPORT_SYMBOL(sock_common_setsockopt);
1943 
1944 #ifdef CONFIG_COMPAT
1945 int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
1946 				  char __user *optval, int optlen)
1947 {
1948 	struct sock *sk = sock->sk;
1949 
1950 	if (sk->sk_prot->compat_setsockopt != NULL)
1951 		return sk->sk_prot->compat_setsockopt(sk, level, optname,
1952 						      optval, optlen);
1953 	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
1954 }
1955 EXPORT_SYMBOL(compat_sock_common_setsockopt);
1956 #endif
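
/*
 * The sock_common_*() helpers above let protocols whose option and
 * receive handling lives entirely in struct proto plug them straight
 * into their struct proto_ops.  Illustrative sketch only (the ops table
 * below is hypothetical and shows just the relevant fields):
 *
 *	static const struct proto_ops my_stream_ops = {
 *		.family	    = PF_INET,
 *		.owner	    = THIS_MODULE,
 *		.setsockopt = sock_common_setsockopt,
 *		.getsockopt = sock_common_getsockopt,
 *		.recvmsg    = sock_common_recvmsg,
 *		... remaining ops omitted ...
 *	};
 */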
1957 
1958 void sk_common_release(struct sock *sk)
1959 {
1960 	if (sk->sk_prot->destroy)
1961 		sk->sk_prot->destroy(sk);
1962 
1963 	/*
1964 	 * Observation: when sk_common_release() is called, user processes no
1965 	 * longer have access to the socket, but the network stack still does.
1966 	 * Step one, detach it from networking:
1967 	 *
1968 	 * A. Remove it from the hash tables.
1969 	 */
1970 
1971 	sk->sk_prot->unhash(sk);
1972 
1973 	/*
1974 	 * At this point the socket cannot receive new packets, but some may
1975 	 * still be in flight: a CPU running the receive path may have done its
1976 	 * hash table lookup before we unhashed the socket.  Such packets will
1977 	 * reach the receive queue and be purged by the socket destructor.
1978 	 *
1979 	 * We may also still have packets pending on the receive queue and,
1980 	 * probably, our own packets waiting in device queues.  The socket
1981 	 * destructor will drain the receive queue, but transmitted packets
1982 	 * will delay socket destruction until the last reference is released.
1983 	 */
1984 
1985 	sock_orphan(sk);
1986 
1987 	xfrm_sk_free_policy(sk);
1988 
1989 	sk_refcnt_debug_release(sk);
1990 	sock_put(sk);
1991 }
1992 
1993 EXPORT_SYMBOL(sk_common_release);
1994 
1995 static DEFINE_RWLOCK(proto_list_lock);
1996 static LIST_HEAD(proto_list);
1997 
1998 #ifdef CONFIG_PROC_FS
1999 #define PROTO_INUSE_NR	64	/* should be enough for now */
2000 struct prot_inuse {
2001 	int val[PROTO_INUSE_NR];
2002 };
2003 
2004 static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
2005 
2006 #ifdef CONFIG_NET_NS
2007 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2008 {
2009 	int cpu = smp_processor_id();
2010 	per_cpu_ptr(net->core.inuse, cpu)->val[prot->inuse_idx] += val;
2011 }
2012 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2013 
2014 int sock_prot_inuse_get(struct net *net, struct proto *prot)
2015 {
2016 	int cpu, idx = prot->inuse_idx;
2017 	int res = 0;
2018 
2019 	for_each_possible_cpu(cpu)
2020 		res += per_cpu_ptr(net->core.inuse, cpu)->val[idx];
2021 
2022 	return res >= 0 ? res : 0;
2023 }
2024 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2025 
2026 static int sock_inuse_init_net(struct net *net)
2027 {
2028 	net->core.inuse = alloc_percpu(struct prot_inuse);
2029 	return net->core.inuse ? 0 : -ENOMEM;
2030 }
2031 
2032 static void sock_inuse_exit_net(struct net *net)
2033 {
2034 	free_percpu(net->core.inuse);
2035 }
2036 
2037 static struct pernet_operations net_inuse_ops = {
2038 	.init = sock_inuse_init_net,
2039 	.exit = sock_inuse_exit_net,
2040 };
2041 
2042 static __init int net_inuse_init(void)
2043 {
2044 	if (register_pernet_subsys(&net_inuse_ops))
2045 		panic("Cannot initialize net inuse counters");
2046 
2047 	return 0;
2048 }
2049 
2050 core_initcall(net_inuse_init);
2051 #else
2052 static DEFINE_PER_CPU(struct prot_inuse, prot_inuse);
2053 
2054 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2055 {
2056 	__get_cpu_var(prot_inuse).val[prot->inuse_idx] += val;
2057 }
2058 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2059 
2060 int sock_prot_inuse_get(struct net *net, struct proto *prot)
2061 {
2062 	int cpu, idx = prot->inuse_idx;
2063 	int res = 0;
2064 
2065 	for_each_possible_cpu(cpu)
2066 		res += per_cpu(prot_inuse, cpu).val[idx];
2067 
2068 	return res >= 0 ? res : 0;
2069 }
2070 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2071 #endif
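
/*
 * Protocols are expected to bump these counters from their hash/unhash
 * paths, e.g. sock_prot_inuse_add(net, prot, 1) when a socket is hashed
 * and sock_prot_inuse_add(net, prot, -1) when it is unhashed.  The
 * per-cpu values are summed by sock_prot_inuse_get() and show up in the
 * "sockets" column of /proc/net/protocols below.
 */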
2072 
2073 static void assign_proto_idx(struct proto *prot)
2074 {
2075 	prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
2076 
2077 	if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
2078 		printk(KERN_ERR "PROTO_INUSE_NR exhausted\n");
2079 		return;
2080 	}
2081 
2082 	set_bit(prot->inuse_idx, proto_inuse_idx);
2083 }
2084 
2085 static void release_proto_idx(struct proto *prot)
2086 {
2087 	if (prot->inuse_idx != PROTO_INUSE_NR - 1)
2088 		clear_bit(prot->inuse_idx, proto_inuse_idx);
2089 }
2090 #else
2091 static inline void assign_proto_idx(struct proto *prot)
2092 {
2093 }
2094 
2095 static inline void release_proto_idx(struct proto *prot)
2096 {
2097 }
2098 #endif
2099 
2100 int proto_register(struct proto *prot, int alloc_slab)
2101 {
2102 	if (alloc_slab) {
2103 		prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
2104 					SLAB_HWCACHE_ALIGN | prot->slab_flags,
2105 					NULL);
2106 
2107 		if (prot->slab == NULL) {
2108 			printk(KERN_CRIT "%s: Can't create sock SLAB cache!\n",
2109 			       prot->name);
2110 			goto out;
2111 		}
2112 
2113 		if (prot->rsk_prot != NULL) {
2114 			static const char mask[] = "request_sock_%s";
2115 
2116 			prot->rsk_prot->slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL);
2117 			if (prot->rsk_prot->slab_name == NULL)
2118 				goto out_free_sock_slab;
2119 
2120 			sprintf(prot->rsk_prot->slab_name, mask, prot->name);
2121 			prot->rsk_prot->slab = kmem_cache_create(prot->rsk_prot->slab_name,
2122 								 prot->rsk_prot->obj_size, 0,
2123 								 SLAB_HWCACHE_ALIGN, NULL);
2124 
2125 			if (prot->rsk_prot->slab == NULL) {
2126 				printk(KERN_CRIT "%s: Can't create request sock SLAB cache!\n",
2127 				       prot->name);
2128 				goto out_free_request_sock_slab_name;
2129 			}
2130 		}
2131 
2132 		if (prot->twsk_prot != NULL) {
2133 			static const char mask[] = "tw_sock_%s";
2134 
2135 			prot->twsk_prot->twsk_slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL);
2136 
2137 			if (prot->twsk_prot->twsk_slab_name == NULL)
2138 				goto out_free_request_sock_slab;
2139 
2140 			sprintf(prot->twsk_prot->twsk_slab_name, mask, prot->name);
2141 			prot->twsk_prot->twsk_slab =
2142 				kmem_cache_create(prot->twsk_prot->twsk_slab_name,
2143 						  prot->twsk_prot->twsk_obj_size,
2144 						  0,
2145 						  SLAB_HWCACHE_ALIGN |
2146 							prot->slab_flags,
2147 						  NULL);
2148 			if (prot->twsk_prot->twsk_slab == NULL)
2149 				goto out_free_timewait_sock_slab_name;
2150 		}
2151 	}
2152 
2153 	write_lock(&proto_list_lock);
2154 	list_add(&prot->node, &proto_list);
2155 	assign_proto_idx(prot);
2156 	write_unlock(&proto_list_lock);
2157 	return 0;
2158 
2159 out_free_timewait_sock_slab_name:
2160 	kfree(prot->twsk_prot->twsk_slab_name);
2161 out_free_request_sock_slab:
2162 	if (prot->rsk_prot && prot->rsk_prot->slab) {
2163 		kmem_cache_destroy(prot->rsk_prot->slab);
2164 		prot->rsk_prot->slab = NULL;
2165 	}
2166 out_free_request_sock_slab_name:
2167 	kfree(prot->rsk_prot->slab_name);
2168 out_free_sock_slab:
2169 	kmem_cache_destroy(prot->slab);
2170 	prot->slab = NULL;
2171 out:
2172 	return -ENOBUFS;
2173 }
2174 
2175 EXPORT_SYMBOL(proto_register);
2176 
2177 void proto_unregister(struct proto *prot)
2178 {
2179 	write_lock(&proto_list_lock);
2180 	release_proto_idx(prot);
2181 	list_del(&prot->node);
2182 	write_unlock(&proto_list_lock);
2183 
2184 	if (prot->slab != NULL) {
2185 		kmem_cache_destroy(prot->slab);
2186 		prot->slab = NULL;
2187 	}
2188 
2189 	if (prot->rsk_prot != NULL && prot->rsk_prot->slab != NULL) {
2190 		kmem_cache_destroy(prot->rsk_prot->slab);
2191 		kfree(prot->rsk_prot->slab_name);
2192 		prot->rsk_prot->slab = NULL;
2193 	}
2194 
2195 	if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
2196 		kmem_cache_destroy(prot->twsk_prot->twsk_slab);
2197 		kfree(prot->twsk_prot->twsk_slab_name);
2198 		prot->twsk_prot->twsk_slab = NULL;
2199 	}
2200 }
2201 
2202 EXPORT_SYMBOL(proto_unregister);
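
/*
 * Illustrative registration sketch (the protocol below is hypothetical):
 * a protocol fills in a struct proto and registers it once at init time,
 * passing alloc_slab != 0 so that its sockets come from a dedicated
 * kmem cache, and unregisters it on teardown:
 *
 *	static struct proto my_proto = {
 *		.name	  = "MYPROTO",
 *		.owner	  = THIS_MODULE,
 *		.obj_size = sizeof(struct my_sock),
 *	};
 *
 *	err = proto_register(&my_proto, 1);
 *	...
 *	proto_unregister(&my_proto);
 */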
2203 
2204 #ifdef CONFIG_PROC_FS
2205 static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
2206 	__acquires(proto_list_lock)
2207 {
2208 	read_lock(&proto_list_lock);
2209 	return seq_list_start_head(&proto_list, *pos);
2210 }
2211 
2212 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2213 {
2214 	return seq_list_next(v, &proto_list, pos);
2215 }
2216 
2217 static void proto_seq_stop(struct seq_file *seq, void *v)
2218 	__releases(proto_list_lock)
2219 {
2220 	read_unlock(&proto_list_lock);
2221 }
2222 
2223 static char proto_method_implemented(const void *method)
2224 {
2225 	return method == NULL ? 'n' : 'y';
2226 }
2227 
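/*
 * One line of /proc/net/protocols per registered protocol: the numeric
 * fields mirror the header printed in proto_seq_show() below, and the
 * trailing y/n columns report which optional struct proto methods the
 * protocol implements.
 */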
2228 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
2229 {
2230 	seq_printf(seq, "%-9s %4u %6d  %6d   %-3s %6u   %-3s  %-10s "
2231 			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
2232 		   proto->name,
2233 		   proto->obj_size,
2234 		   sock_prot_inuse_get(seq_file_net(seq), proto),
2235 		   proto->memory_allocated != NULL ? atomic_read(proto->memory_allocated) : -1,
2236 		   proto->memory_pressure != NULL ? *proto->memory_pressure ? "yes" : "no" : "NI",
2237 		   proto->max_header,
2238 		   proto->slab == NULL ? "no" : "yes",
2239 		   module_name(proto->owner),
2240 		   proto_method_implemented(proto->close),
2241 		   proto_method_implemented(proto->connect),
2242 		   proto_method_implemented(proto->disconnect),
2243 		   proto_method_implemented(proto->accept),
2244 		   proto_method_implemented(proto->ioctl),
2245 		   proto_method_implemented(proto->init),
2246 		   proto_method_implemented(proto->destroy),
2247 		   proto_method_implemented(proto->shutdown),
2248 		   proto_method_implemented(proto->setsockopt),
2249 		   proto_method_implemented(proto->getsockopt),
2250 		   proto_method_implemented(proto->sendmsg),
2251 		   proto_method_implemented(proto->recvmsg),
2252 		   proto_method_implemented(proto->sendpage),
2253 		   proto_method_implemented(proto->bind),
2254 		   proto_method_implemented(proto->backlog_rcv),
2255 		   proto_method_implemented(proto->hash),
2256 		   proto_method_implemented(proto->unhash),
2257 		   proto_method_implemented(proto->get_port),
2258 		   proto_method_implemented(proto->enter_memory_pressure));
2259 }
2260 
2261 static int proto_seq_show(struct seq_file *seq, void *v)
2262 {
2263 	if (v == &proto_list)
2264 		seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
2265 			   "protocol",
2266 			   "size",
2267 			   "sockets",
2268 			   "memory",
2269 			   "press",
2270 			   "maxhdr",
2271 			   "slab",
2272 			   "module",
2273 			   "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
2274 	else
2275 		proto_seq_printf(seq, list_entry(v, struct proto, node));
2276 	return 0;
2277 }
2278 
2279 static const struct seq_operations proto_seq_ops = {
2280 	.start  = proto_seq_start,
2281 	.next   = proto_seq_next,
2282 	.stop   = proto_seq_stop,
2283 	.show   = proto_seq_show,
2284 };
2285 
2286 static int proto_seq_open(struct inode *inode, struct file *file)
2287 {
2288 	return seq_open_net(inode, file, &proto_seq_ops,
2289 			    sizeof(struct seq_net_private));
2290 }
2291 
2292 static const struct file_operations proto_seq_fops = {
2293 	.owner		= THIS_MODULE,
2294 	.open		= proto_seq_open,
2295 	.read		= seq_read,
2296 	.llseek		= seq_lseek,
2297 	.release	= seq_release_net,
2298 };
2299 
2300 static __net_init int proto_init_net(struct net *net)
2301 {
2302 	if (!proc_net_fops_create(net, "protocols", S_IRUGO, &proto_seq_fops))
2303 		return -ENOMEM;
2304 
2305 	return 0;
2306 }
2307 
2308 static __net_exit void proto_exit_net(struct net *net)
2309 {
2310 	proc_net_remove(net, "protocols");
2311 }
2312 
2313 
2314 static __net_initdata struct pernet_operations proto_net_ops = {
2315 	.init = proto_init_net,
2316 	.exit = proto_exit_net,
2317 };
2318 
2319 static int __init proto_init(void)
2320 {
2321 	return register_pernet_subsys(&proto_net_ops);
2322 }
2323 
2324 subsys_initcall(proto_init);
2325 
2326 #endif /* PROC_FS */
2327 
2328 EXPORT_SYMBOL(sk_alloc);
2329 EXPORT_SYMBOL(sk_free);
2330 EXPORT_SYMBOL(sk_send_sigurg);
2331 EXPORT_SYMBOL(sock_alloc_send_skb);
2332 EXPORT_SYMBOL(sock_init_data);
2333 EXPORT_SYMBOL(sock_kfree_s);
2334 EXPORT_SYMBOL(sock_kmalloc);
2335 EXPORT_SYMBOL(sock_no_accept);
2336 EXPORT_SYMBOL(sock_no_bind);
2337 EXPORT_SYMBOL(sock_no_connect);
2338 EXPORT_SYMBOL(sock_no_getname);
2339 EXPORT_SYMBOL(sock_no_getsockopt);
2340 EXPORT_SYMBOL(sock_no_ioctl);
2341 EXPORT_SYMBOL(sock_no_listen);
2342 EXPORT_SYMBOL(sock_no_mmap);
2343 EXPORT_SYMBOL(sock_no_poll);
2344 EXPORT_SYMBOL(sock_no_recvmsg);
2345 EXPORT_SYMBOL(sock_no_sendmsg);
2346 EXPORT_SYMBOL(sock_no_sendpage);
2347 EXPORT_SYMBOL(sock_no_setsockopt);
2348 EXPORT_SYMBOL(sock_no_shutdown);
2349 EXPORT_SYMBOL(sock_no_socketpair);
2350 EXPORT_SYMBOL(sock_rfree);
2351 EXPORT_SYMBOL(sock_setsockopt);
2352 EXPORT_SYMBOL(sock_wfree);
2353 EXPORT_SYMBOL(sock_wmalloc);
2354 EXPORT_SYMBOL(sock_i_uid);
2355 EXPORT_SYMBOL(sock_i_ino);
2356 EXPORT_SYMBOL(sysctl_optmem_max);
2357