xref: /openbmc/linux/net/core/sock.c (revision c40d04df)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		Generic socket support routines. Memory allocators, socket lock/release
7  *		handler for protocols to use and generic option handler.
8  *
9  *
10  * Authors:	Ross Biro
11  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *		Florian La Roche, <flla@stud.uni-sb.de>
13  *		Alan Cox, <A.Cox@swansea.ac.uk>
14  *
15  * Fixes:
16  *		Alan Cox	: 	Numerous verify_area() problems
17  *		Alan Cox	:	Connecting on a connecting socket
18  *					now returns an error for tcp.
19  *		Alan Cox	:	sock->protocol is set correctly.
20  *					and is not sometimes left as 0.
21  *		Alan Cox	:	connect handles icmp errors on a
22  *					connect properly. Unfortunately there
23  *					is a restart syscall nasty there. I
24  *					can't match BSD without hacking the C
25  *					library. Ideas urgently sought!
26  *		Alan Cox	:	Disallow bind() to addresses that are
27  *					not ours - especially broadcast ones!!
28  *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
29  *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
30  *					instead they leave that for the DESTROY timer.
31  *		Alan Cox	:	Clean up error flag in accept
32  *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
33  *					was buggy. Put a remove_sock() in the handler
34  *					for memory when we hit 0. Also altered the timer
35  *					code. The ACK stuff can wait and needs major
36  *					TCP layer surgery.
37  *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
38  *					and fixed timer/inet_bh race.
39  *		Alan Cox	:	Added zapped flag for TCP
40  *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
41  *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
42  *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
43  *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
44  *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
45  *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
46  *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
47  *	Pauline Middelink	:	identd support
48  *		Alan Cox	:	Fixed connect() taking signals I think.
49  *		Alan Cox	:	SO_LINGER supported
50  *		Alan Cox	:	Error reporting fixes
51  *		Anonymous	:	inet_create tidied up (sk->reuse setting)
52  *		Alan Cox	:	inet sockets don't set sk->type!
53  *		Alan Cox	:	Split socket option code
54  *		Alan Cox	:	Callbacks
55  *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
56  *		Alex		:	Removed restriction on inet fioctl
57  *		Alan Cox	:	Splitting INET from NET core
58  *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
59  *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
60  *		Alan Cox	:	Split IP from generic code
61  *		Alan Cox	:	New kfree_skbmem()
62  *		Alan Cox	:	Make SO_DEBUG superuser only.
63  *		Alan Cox	:	Allow anyone to clear SO_DEBUG
64  *					(compatibility fix)
65  *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
66  *		Alan Cox	:	Allocator for a socket is settable.
67  *		Alan Cox	:	SO_ERROR includes soft errors.
68  *		Alan Cox	:	Allow NULL arguments on some SO_ opts
69  *		Alan Cox	: 	Generic socket allocation to make hooks
70  *					easier (suggested by Craig Metz).
71  *		Michael Pall	:	SO_ERROR returns positive errno again
72  *              Steve Whitehouse:       Added default destructor to free
73  *                                      protocol private data.
74  *              Steve Whitehouse:       Added various other default routines
75  *                                      common to several socket families.
76  *              Chris Evans     :       Call suser() check last on F_SETOWN
77  *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
78  *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
79  *		Andi Kleen	:	Fix write_space callback
80  *		Chris Evans	:	Security fixes - signedness again
81  *		Arnaldo C. Melo :       cleanups, use skb_queue_purge
82  *
83  * To Fix:
84  *
85  *
86  *		This program is free software; you can redistribute it and/or
87  *		modify it under the terms of the GNU General Public License
88  *		as published by the Free Software Foundation; either version
89  *		2 of the License, or (at your option) any later version.
90  */
91 
92 #include <linux/capability.h>
93 #include <linux/errno.h>
94 #include <linux/types.h>
95 #include <linux/socket.h>
96 #include <linux/in.h>
97 #include <linux/kernel.h>
98 #include <linux/module.h>
99 #include <linux/proc_fs.h>
100 #include <linux/seq_file.h>
101 #include <linux/sched.h>
102 #include <linux/timer.h>
103 #include <linux/string.h>
104 #include <linux/sockios.h>
105 #include <linux/net.h>
106 #include <linux/mm.h>
107 #include <linux/slab.h>
108 #include <linux/interrupt.h>
109 #include <linux/poll.h>
110 #include <linux/tcp.h>
111 #include <linux/init.h>
112 #include <linux/highmem.h>
113 #include <linux/user_namespace.h>
114 #include <linux/static_key.h>
115 #include <linux/memcontrol.h>
116 
117 #include <asm/uaccess.h>
118 #include <asm/system.h>
119 
120 #include <linux/netdevice.h>
121 #include <net/protocol.h>
122 #include <linux/skbuff.h>
123 #include <net/net_namespace.h>
124 #include <net/request_sock.h>
125 #include <net/sock.h>
126 #include <linux/net_tstamp.h>
127 #include <net/xfrm.h>
128 #include <linux/ipsec.h>
129 #include <net/cls_cgroup.h>
130 #include <net/netprio_cgroup.h>
131 
132 #include <linux/filter.h>
133 
134 #include <trace/events/sock.h>
135 
136 #ifdef CONFIG_INET
137 #include <net/tcp.h>
138 #endif
139 
140 static DEFINE_MUTEX(proto_list_mutex);
141 static LIST_HEAD(proto_list);
142 
143 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM
144 int mem_cgroup_sockets_init(struct cgroup *cgrp, struct cgroup_subsys *ss)
145 {
146 	struct proto *proto;
147 	int ret = 0;
148 
149 	mutex_lock(&proto_list_mutex);
150 	list_for_each_entry(proto, &proto_list, node) {
151 		if (proto->init_cgroup) {
152 			ret = proto->init_cgroup(cgrp, ss);
153 			if (ret)
154 				goto out;
155 		}
156 	}
157 
158 	mutex_unlock(&proto_list_mutex);
159 	return ret;
160 out:
161 	list_for_each_entry_continue_reverse(proto, &proto_list, node)
162 		if (proto->destroy_cgroup)
163 			proto->destroy_cgroup(cgrp);
164 	mutex_unlock(&proto_list_mutex);
165 	return ret;
166 }
167 
168 void mem_cgroup_sockets_destroy(struct cgroup *cgrp)
169 {
170 	struct proto *proto;
171 
172 	mutex_lock(&proto_list_mutex);
173 	list_for_each_entry_reverse(proto, &proto_list, node)
174 		if (proto->destroy_cgroup)
175 			proto->destroy_cgroup(cgrp);
176 	mutex_unlock(&proto_list_mutex);
177 }
178 #endif
179 
180 /*
181  * Each address family might have different locking rules, so we have
182  * one slock key per address family:
183  */
184 static struct lock_class_key af_family_keys[AF_MAX];
185 static struct lock_class_key af_family_slock_keys[AF_MAX];
186 
187 struct static_key memcg_socket_limit_enabled;
188 EXPORT_SYMBOL(memcg_socket_limit_enabled);
189 
190 /*
191  * Make lock validator output more readable. (We pre-construct these
192  * strings at build time, so that runtime initialization of socket
193  * locks is fast):
194  */
195 static const char *const af_family_key_strings[AF_MAX+1] = {
196   "sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX"     , "sk_lock-AF_INET"     ,
197   "sk_lock-AF_AX25"  , "sk_lock-AF_IPX"      , "sk_lock-AF_APPLETALK",
198   "sk_lock-AF_NETROM", "sk_lock-AF_BRIDGE"   , "sk_lock-AF_ATMPVC"   ,
199   "sk_lock-AF_X25"   , "sk_lock-AF_INET6"    , "sk_lock-AF_ROSE"     ,
200   "sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI"  , "sk_lock-AF_SECURITY" ,
201   "sk_lock-AF_KEY"   , "sk_lock-AF_NETLINK"  , "sk_lock-AF_PACKET"   ,
202   "sk_lock-AF_ASH"   , "sk_lock-AF_ECONET"   , "sk_lock-AF_ATMSVC"   ,
203   "sk_lock-AF_RDS"   , "sk_lock-AF_SNA"      , "sk_lock-AF_IRDA"     ,
204   "sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE"  , "sk_lock-AF_LLC"      ,
205   "sk_lock-27"       , "sk_lock-28"          , "sk_lock-AF_CAN"      ,
206   "sk_lock-AF_TIPC"  , "sk_lock-AF_BLUETOOTH", "sk_lock-AF_IUCV"     ,
207   "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN"     , "sk_lock-AF_PHONET"   ,
208   "sk_lock-AF_IEEE802154", "sk_lock-AF_CAIF" , "sk_lock-AF_ALG"      ,
209   "sk_lock-AF_NFC"   , "sk_lock-AF_MAX"
210 };
211 static const char *const af_family_slock_key_strings[AF_MAX+1] = {
212   "slock-AF_UNSPEC", "slock-AF_UNIX"     , "slock-AF_INET"     ,
213   "slock-AF_AX25"  , "slock-AF_IPX"      , "slock-AF_APPLETALK",
214   "slock-AF_NETROM", "slock-AF_BRIDGE"   , "slock-AF_ATMPVC"   ,
215   "slock-AF_X25"   , "slock-AF_INET6"    , "slock-AF_ROSE"     ,
216   "slock-AF_DECnet", "slock-AF_NETBEUI"  , "slock-AF_SECURITY" ,
217   "slock-AF_KEY"   , "slock-AF_NETLINK"  , "slock-AF_PACKET"   ,
218   "slock-AF_ASH"   , "slock-AF_ECONET"   , "slock-AF_ATMSVC"   ,
219   "slock-AF_RDS"   , "slock-AF_SNA"      , "slock-AF_IRDA"     ,
220   "slock-AF_PPPOX" , "slock-AF_WANPIPE"  , "slock-AF_LLC"      ,
221   "slock-27"       , "slock-28"          , "slock-AF_CAN"      ,
222   "slock-AF_TIPC"  , "slock-AF_BLUETOOTH", "slock-AF_IUCV"     ,
223   "slock-AF_RXRPC" , "slock-AF_ISDN"     , "slock-AF_PHONET"   ,
224   "slock-AF_IEEE802154", "slock-AF_CAIF" , "slock-AF_ALG"      ,
225   "slock-AF_NFC"   , "slock-AF_MAX"
226 };
227 static const char *const af_family_clock_key_strings[AF_MAX+1] = {
228   "clock-AF_UNSPEC", "clock-AF_UNIX"     , "clock-AF_INET"     ,
229   "clock-AF_AX25"  , "clock-AF_IPX"      , "clock-AF_APPLETALK",
230   "clock-AF_NETROM", "clock-AF_BRIDGE"   , "clock-AF_ATMPVC"   ,
231   "clock-AF_X25"   , "clock-AF_INET6"    , "clock-AF_ROSE"     ,
232   "clock-AF_DECnet", "clock-AF_NETBEUI"  , "clock-AF_SECURITY" ,
233   "clock-AF_KEY"   , "clock-AF_NETLINK"  , "clock-AF_PACKET"   ,
234   "clock-AF_ASH"   , "clock-AF_ECONET"   , "clock-AF_ATMSVC"   ,
235   "clock-AF_RDS"   , "clock-AF_SNA"      , "clock-AF_IRDA"     ,
236   "clock-AF_PPPOX" , "clock-AF_WANPIPE"  , "clock-AF_LLC"      ,
237   "clock-27"       , "clock-28"          , "clock-AF_CAN"      ,
238   "clock-AF_TIPC"  , "clock-AF_BLUETOOTH", "clock-AF_IUCV"     ,
239   "clock-AF_RXRPC" , "clock-AF_ISDN"     , "clock-AF_PHONET"   ,
240   "clock-AF_IEEE802154", "clock-AF_CAIF" , "clock-AF_ALG"      ,
241   "clock-AF_NFC"   , "clock-AF_MAX"
242 };
243 
244 /*
245  * sk_callback_lock locking rules are per-address-family,
246  * so split the lock classes by using a per-AF key:
247  */
248 static struct lock_class_key af_callback_keys[AF_MAX];
249 
250 /* Take the size of the struct sk_buff overhead into account when
251  * determining these values, since that overhead is non-constant across
252  * platforms.  This keeps socket queueing behavior and performance
253  * independent of such differences.
254  */
255 #define _SK_MEM_PACKETS		256
256 #define _SK_MEM_OVERHEAD	SKB_TRUESIZE(256)
257 #define SK_WMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
258 #define SK_RMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
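/*
 * Illustrative arithmetic (not authoritative; the exact figure depends on
 * SKB_TRUESIZE() and hence on per-arch struct sizes): with 256 queued
 * packets of 256 payload bytes each,
 *
 *	SK_RMEM_MAX = SK_WMEM_MAX = 256 * SKB_TRUESIZE(256)
 *
 * i.e. each packet is charged for its payload plus the (aligned) sk_buff
 * and skb_shared_info metadata, so the queueing limits behave the same way
 * on platforms with different struct layouts.
 */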
259 
260 /* Run time adjustable parameters. */
261 __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
262 __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
263 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
264 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
265 
266 /* Maximal space eaten by iovec or ancillary data plus some space */
267 int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
268 EXPORT_SYMBOL(sysctl_optmem_max);
269 
270 #if defined(CONFIG_CGROUPS)
271 #if !defined(CONFIG_NET_CLS_CGROUP)
272 int net_cls_subsys_id = -1;
273 EXPORT_SYMBOL_GPL(net_cls_subsys_id);
274 #endif
275 #if !defined(CONFIG_NETPRIO_CGROUP)
276 int net_prio_subsys_id = -1;
277 EXPORT_SYMBOL_GPL(net_prio_subsys_id);
278 #endif
279 #endif
280 
281 static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
282 {
283 	struct timeval tv;
284 
285 	if (optlen < sizeof(tv))
286 		return -EINVAL;
287 	if (copy_from_user(&tv, optval, sizeof(tv)))
288 		return -EFAULT;
289 	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
290 		return -EDOM;
291 
292 	if (tv.tv_sec < 0) {
293 		static int warned __read_mostly;
294 
295 		*timeo_p = 0;
296 		if (warned < 10 && net_ratelimit()) {
297 			warned++;
298 			printk(KERN_INFO "sock_set_timeout: `%s' (pid %d) "
299 			       "tries to set negative timeout\n",
300 				current->comm, task_pid_nr(current));
301 		}
302 		return 0;
303 	}
304 	*timeo_p = MAX_SCHEDULE_TIMEOUT;
305 	if (tv.tv_sec == 0 && tv.tv_usec == 0)
306 		return 0;
307 	if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
308 		*timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
309 	return 0;
310 }
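/*
 * Worked example of the conversion above (illustrative only, assuming
 * HZ == 100): a user timeout of { .tv_sec = 2, .tv_usec = 500000 } becomes
 *
 *	*timeo_p = 2 * 100 + (500000 + 9999) / 10000 = 250 jiffies
 *
 * and the rounding term means a non-zero sub-tick remainder such as
 * tv_usec == 5 still counts as one full jiffy rather than being dropped.
 */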
311 
312 static void sock_warn_obsolete_bsdism(const char *name)
313 {
314 	static int warned;
315 	static char warncomm[TASK_COMM_LEN];
316 	if (strcmp(warncomm, current->comm) && warned < 5) {
317 		strcpy(warncomm,  current->comm);
318 		printk(KERN_WARNING "process `%s' is using obsolete "
319 		       "%s SO_BSDCOMPAT\n", warncomm, name);
320 		warned++;
321 	}
322 }
323 
324 #define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE))
325 
326 static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
327 {
328 	if (sk->sk_flags & flags) {
329 		sk->sk_flags &= ~flags;
330 		if (!(sk->sk_flags & SK_FLAGS_TIMESTAMP))
331 			net_disable_timestamp();
332 	}
333 }
334 
335 
336 int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
337 {
338 	int err;
339 	int skb_len;
340 	unsigned long flags;
341 	struct sk_buff_head *list = &sk->sk_receive_queue;
342 
343 	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
344 		atomic_inc(&sk->sk_drops);
345 		trace_sock_rcvqueue_full(sk, skb);
346 		return -ENOMEM;
347 	}
348 
349 	err = sk_filter(sk, skb);
350 	if (err)
351 		return err;
352 
353 	if (!sk_rmem_schedule(sk, skb->truesize)) {
354 		atomic_inc(&sk->sk_drops);
355 		return -ENOBUFS;
356 	}
357 
358 	skb->dev = NULL;
359 	skb_set_owner_r(skb, sk);
360 
361 	/* Cache the SKB length before we tack it onto the receive
362 	 * queue.  Once it is added it no longer belongs to us and
363 	 * may be freed by other threads of control pulling packets
364 	 * from the queue.
365 	 */
366 	skb_len = skb->len;
367 
368 	/* we escape from rcu protected region, make sure we dont leak
369 	 * a norefcounted dst
370 	 */
371 	skb_dst_force(skb);
372 
373 	spin_lock_irqsave(&list->lock, flags);
374 	skb->dropcount = atomic_read(&sk->sk_drops);
375 	__skb_queue_tail(list, skb);
376 	spin_unlock_irqrestore(&list->lock, flags);
377 
378 	if (!sock_flag(sk, SOCK_DEAD))
379 		sk->sk_data_ready(sk, skb_len);
380 	return 0;
381 }
382 EXPORT_SYMBOL(sock_queue_rcv_skb);
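/*
 * Sketch of a typical caller (illustrative, not part of this file): a
 * protocol's delivery path hands the skb to the owning socket and must free
 * it itself if queueing fails, since sock_queue_rcv_skb() does not, e.g.
 *
 *	if (sock_queue_rcv_skb(sk, skb) < 0) {
 *		kfree_skb(skb);
 *		return NET_RX_DROP;
 *	}
 *	return NET_RX_SUCCESS;
 */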
383 
384 int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested)
385 {
386 	int rc = NET_RX_SUCCESS;
387 
388 	if (sk_filter(sk, skb))
389 		goto discard_and_relse;
390 
391 	skb->dev = NULL;
392 
393 	if (sk_rcvqueues_full(sk, skb)) {
394 		atomic_inc(&sk->sk_drops);
395 		goto discard_and_relse;
396 	}
397 	if (nested)
398 		bh_lock_sock_nested(sk);
399 	else
400 		bh_lock_sock(sk);
401 	if (!sock_owned_by_user(sk)) {
402 		/*
403 		 * trylock + unlock semantics:
404 		 */
405 		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
406 
407 		rc = sk_backlog_rcv(sk, skb);
408 
409 		mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
410 	} else if (sk_add_backlog(sk, skb)) {
411 		bh_unlock_sock(sk);
412 		atomic_inc(&sk->sk_drops);
413 		goto discard_and_relse;
414 	}
415 
416 	bh_unlock_sock(sk);
417 out:
418 	sock_put(sk);
419 	return rc;
420 discard_and_relse:
421 	kfree_skb(skb);
422 	goto out;
423 }
424 EXPORT_SYMBOL(sk_receive_skb);
425 
426 void sk_reset_txq(struct sock *sk)
427 {
428 	sk_tx_queue_clear(sk);
429 }
430 EXPORT_SYMBOL(sk_reset_txq);
431 
432 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
433 {
434 	struct dst_entry *dst = __sk_dst_get(sk);
435 
436 	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
437 		sk_tx_queue_clear(sk);
438 		RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
439 		dst_release(dst);
440 		return NULL;
441 	}
442 
443 	return dst;
444 }
445 EXPORT_SYMBOL(__sk_dst_check);
446 
447 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
448 {
449 	struct dst_entry *dst = sk_dst_get(sk);
450 
451 	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
452 		sk_dst_reset(sk);
453 		dst_release(dst);
454 		return NULL;
455 	}
456 
457 	return dst;
458 }
459 EXPORT_SYMBOL(sk_dst_check);
460 
461 static int sock_bindtodevice(struct sock *sk, char __user *optval, int optlen)
462 {
463 	int ret = -ENOPROTOOPT;
464 #ifdef CONFIG_NETDEVICES
465 	struct net *net = sock_net(sk);
466 	char devname[IFNAMSIZ];
467 	int index;
468 
469 	/* Sorry... */
470 	ret = -EPERM;
471 	if (!capable(CAP_NET_RAW))
472 		goto out;
473 
474 	ret = -EINVAL;
475 	if (optlen < 0)
476 		goto out;
477 
478 	/* Bind this socket to a particular device like "eth0",
479 	 * as specified in the passed interface name. If the
480 	 * name is "" or the option length is zero the socket
481 	 * is not bound.
482 	 */
483 	if (optlen > IFNAMSIZ - 1)
484 		optlen = IFNAMSIZ - 1;
485 	memset(devname, 0, sizeof(devname));
486 
487 	ret = -EFAULT;
488 	if (copy_from_user(devname, optval, optlen))
489 		goto out;
490 
491 	index = 0;
492 	if (devname[0] != '\0') {
493 		struct net_device *dev;
494 
495 		rcu_read_lock();
496 		dev = dev_get_by_name_rcu(net, devname);
497 		if (dev)
498 			index = dev->ifindex;
499 		rcu_read_unlock();
500 		ret = -ENODEV;
501 		if (!dev)
502 			goto out;
503 	}
504 
505 	lock_sock(sk);
506 	sk->sk_bound_dev_if = index;
507 	sk_dst_reset(sk);
508 	release_sock(sk);
509 
510 	ret = 0;
511 
512 out:
513 #endif
514 
515 	return ret;
516 }
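/*
 * User-space view of SO_BINDTODEVICE (illustrative only): the option value
 * is simply the interface name, and an empty name clears the binding, e.g.
 *
 *	setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, "eth0", strlen("eth0") + 1);
 *	setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, "", 0);	/@ unbind @/
 *
 * Both calls require CAP_NET_RAW, as enforced above.
 */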
517 
518 static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
519 {
520 	if (valbool)
521 		sock_set_flag(sk, bit);
522 	else
523 		sock_reset_flag(sk, bit);
524 }
525 
526 /*
527  *	This is meant for all protocols to use and covers goings on
528  *	at the socket level. Everything here is generic.
529  */
530 
531 int sock_setsockopt(struct socket *sock, int level, int optname,
532 		    char __user *optval, unsigned int optlen)
533 {
534 	struct sock *sk = sock->sk;
535 	int val;
536 	int valbool;
537 	struct linger ling;
538 	int ret = 0;
539 
540 	/*
541 	 *	Options without arguments
542 	 */
543 
544 	if (optname == SO_BINDTODEVICE)
545 		return sock_bindtodevice(sk, optval, optlen);
546 
547 	if (optlen < sizeof(int))
548 		return -EINVAL;
549 
550 	if (get_user(val, (int __user *)optval))
551 		return -EFAULT;
552 
553 	valbool = val ? 1 : 0;
554 
555 	lock_sock(sk);
556 
557 	switch (optname) {
558 	case SO_DEBUG:
559 		if (val && !capable(CAP_NET_ADMIN))
560 			ret = -EACCES;
561 		else
562 			sock_valbool_flag(sk, SOCK_DBG, valbool);
563 		break;
564 	case SO_REUSEADDR:
565 		sk->sk_reuse = valbool;
566 		break;
567 	case SO_TYPE:
568 	case SO_PROTOCOL:
569 	case SO_DOMAIN:
570 	case SO_ERROR:
571 		ret = -ENOPROTOOPT;
572 		break;
573 	case SO_DONTROUTE:
574 		sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
575 		break;
576 	case SO_BROADCAST:
577 		sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
578 		break;
579 	case SO_SNDBUF:
580 		/* Don't return an error on this; BSD doesn't, and if you
581 		   think about it, this is right. Otherwise apps have to
582 		   play 'guess the biggest size' games. RCVBUF/SNDBUF
583 		   are treated in BSD as hints */
584 
585 		if (val > sysctl_wmem_max)
586 			val = sysctl_wmem_max;
587 set_sndbuf:
588 		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
589 		if ((val * 2) < SOCK_MIN_SNDBUF)
590 			sk->sk_sndbuf = SOCK_MIN_SNDBUF;
591 		else
592 			sk->sk_sndbuf = val * 2;
593 
594 		/*
595 		 *	Wake up sending tasks if we
596 		 *	upped the value.
597 		 */
598 		sk->sk_write_space(sk);
599 		break;
600 
601 	case SO_SNDBUFFORCE:
602 		if (!capable(CAP_NET_ADMIN)) {
603 			ret = -EPERM;
604 			break;
605 		}
606 		goto set_sndbuf;
607 
608 	case SO_RCVBUF:
609 		/* Don't return an error on this; BSD doesn't, and if you
610 		   think about it, this is right. Otherwise apps have to
611 		   play 'guess the biggest size' games. RCVBUF/SNDBUF
612 		   are treated in BSD as hints */
613 
614 		if (val > sysctl_rmem_max)
615 			val = sysctl_rmem_max;
616 set_rcvbuf:
617 		sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
618 		/*
619 		 * We double it on the way in to account for
620 		 * "struct sk_buff" etc. overhead.   Applications
621 		 * assume that the SO_RCVBUF setting they make will
622 		 * allow that much actual data to be received on that
623 		 * socket.
624 		 *
625 		 * Applications are unaware that "struct sk_buff" and
626 		 * other overheads allocate from the receive buffer
627 		 * during socket buffer allocation.
628 		 *
629 		 * And after considering the possible alternatives,
630 		 * returning the value we actually used in getsockopt
631 		 * is the most desirable behavior.
632 		 */
633 		if ((val * 2) < SOCK_MIN_RCVBUF)
634 			sk->sk_rcvbuf = SOCK_MIN_RCVBUF;
635 		else
636 			sk->sk_rcvbuf = val * 2;
637 		break;
638 
639 	case SO_RCVBUFFORCE:
640 		if (!capable(CAP_NET_ADMIN)) {
641 			ret = -EPERM;
642 			break;
643 		}
644 		goto set_rcvbuf;
645 
646 	case SO_KEEPALIVE:
647 #ifdef CONFIG_INET
648 		if (sk->sk_protocol == IPPROTO_TCP)
649 			tcp_set_keepalive(sk, valbool);
650 #endif
651 		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
652 		break;
653 
654 	case SO_OOBINLINE:
655 		sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
656 		break;
657 
658 	case SO_NO_CHECK:
659 		sk->sk_no_check = valbool;
660 		break;
661 
662 	case SO_PRIORITY:
663 		if ((val >= 0 && val <= 6) || capable(CAP_NET_ADMIN))
664 			sk->sk_priority = val;
665 		else
666 			ret = -EPERM;
667 		break;
668 
669 	case SO_LINGER:
670 		if (optlen < sizeof(ling)) {
671 			ret = -EINVAL;	/* 1003.1g */
672 			break;
673 		}
674 		if (copy_from_user(&ling, optval, sizeof(ling))) {
675 			ret = -EFAULT;
676 			break;
677 		}
678 		if (!ling.l_onoff)
679 			sock_reset_flag(sk, SOCK_LINGER);
680 		else {
681 #if (BITS_PER_LONG == 32)
682 			if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
683 				sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
684 			else
685 #endif
686 				sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
687 			sock_set_flag(sk, SOCK_LINGER);
688 		}
689 		break;
690 
691 	case SO_BSDCOMPAT:
692 		sock_warn_obsolete_bsdism("setsockopt");
693 		break;
694 
695 	case SO_PASSCRED:
696 		if (valbool)
697 			set_bit(SOCK_PASSCRED, &sock->flags);
698 		else
699 			clear_bit(SOCK_PASSCRED, &sock->flags);
700 		break;
701 
702 	case SO_TIMESTAMP:
703 	case SO_TIMESTAMPNS:
704 		if (valbool)  {
705 			if (optname == SO_TIMESTAMP)
706 				sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
707 			else
708 				sock_set_flag(sk, SOCK_RCVTSTAMPNS);
709 			sock_set_flag(sk, SOCK_RCVTSTAMP);
710 			sock_enable_timestamp(sk, SOCK_TIMESTAMP);
711 		} else {
712 			sock_reset_flag(sk, SOCK_RCVTSTAMP);
713 			sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
714 		}
715 		break;
716 
717 	case SO_TIMESTAMPING:
718 		if (val & ~SOF_TIMESTAMPING_MASK) {
719 			ret = -EINVAL;
720 			break;
721 		}
722 		sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE,
723 				  val & SOF_TIMESTAMPING_TX_HARDWARE);
724 		sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE,
725 				  val & SOF_TIMESTAMPING_TX_SOFTWARE);
726 		sock_valbool_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE,
727 				  val & SOF_TIMESTAMPING_RX_HARDWARE);
728 		if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
729 			sock_enable_timestamp(sk,
730 					      SOCK_TIMESTAMPING_RX_SOFTWARE);
731 		else
732 			sock_disable_timestamp(sk,
733 					       (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
734 		sock_valbool_flag(sk, SOCK_TIMESTAMPING_SOFTWARE,
735 				  val & SOF_TIMESTAMPING_SOFTWARE);
736 		sock_valbool_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE,
737 				  val & SOF_TIMESTAMPING_SYS_HARDWARE);
738 		sock_valbool_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE,
739 				  val & SOF_TIMESTAMPING_RAW_HARDWARE);
740 		break;
741 
742 	case SO_RCVLOWAT:
743 		if (val < 0)
744 			val = INT_MAX;
745 		sk->sk_rcvlowat = val ? : 1;
746 		break;
747 
748 	case SO_RCVTIMEO:
749 		ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
750 		break;
751 
752 	case SO_SNDTIMEO:
753 		ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
754 		break;
755 
756 	case SO_ATTACH_FILTER:
757 		ret = -EINVAL;
758 		if (optlen == sizeof(struct sock_fprog)) {
759 			struct sock_fprog fprog;
760 
761 			ret = -EFAULT;
762 			if (copy_from_user(&fprog, optval, sizeof(fprog)))
763 				break;
764 
765 			ret = sk_attach_filter(&fprog, sk);
766 		}
767 		break;
768 
769 	case SO_DETACH_FILTER:
770 		ret = sk_detach_filter(sk);
771 		break;
772 
773 	case SO_PASSSEC:
774 		if (valbool)
775 			set_bit(SOCK_PASSSEC, &sock->flags);
776 		else
777 			clear_bit(SOCK_PASSSEC, &sock->flags);
778 		break;
779 	case SO_MARK:
780 		if (!capable(CAP_NET_ADMIN))
781 			ret = -EPERM;
782 		else
783 			sk->sk_mark = val;
784 		break;
785 
786 		/* We implement SO_SNDLOWAT etc. as not settable
787 		   (1003.1g 5.3) */
788 	case SO_RXQ_OVFL:
789 		sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
790 		break;
791 
792 	case SO_WIFI_STATUS:
793 		sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
794 		break;
795 
796 	case SO_PEEK_OFF:
797 		if (sock->ops->set_peek_off)
798 			sock->ops->set_peek_off(sk, val);
799 		else
800 			ret = -EOPNOTSUPP;
801 		break;
802 
803 	case SO_NOFCS:
804 		sock_valbool_flag(sk, SOCK_NOFCS, valbool);
805 		break;
806 
807 	default:
808 		ret = -ENOPROTOOPT;
809 		break;
810 	}
811 	release_sock(sk);
812 	return ret;
813 }
814 EXPORT_SYMBOL(sock_setsockopt);
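/*
 * Behaviour worth illustrating (user-space sketch, not part of this file):
 * because SO_SNDBUF/SO_RCVBUF values are doubled on the way in to cover
 * sk_buff overhead, a getsockopt() readback returns twice what was set,
 * subject to sysctl_wmem_max/sysctl_rmem_max clamping:
 *
 *	int val = 65536;
 *	socklen_t len = sizeof(val);
 *
 *	setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &val, sizeof(val));
 *	getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &val, &len);
 *	(val now reads back as 131072, assuming rmem_max allows 65536)
 */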
815 
816 
817 void cred_to_ucred(struct pid *pid, const struct cred *cred,
818 		   struct ucred *ucred)
819 {
820 	ucred->pid = pid_vnr(pid);
821 	ucred->uid = ucred->gid = -1;
822 	if (cred) {
823 		struct user_namespace *current_ns = current_user_ns();
824 
825 		ucred->uid = user_ns_map_uid(current_ns, cred, cred->euid);
826 		ucred->gid = user_ns_map_gid(current_ns, cred, cred->egid);
827 	}
828 }
829 EXPORT_SYMBOL_GPL(cred_to_ucred);
830 
831 int sock_getsockopt(struct socket *sock, int level, int optname,
832 		    char __user *optval, int __user *optlen)
833 {
834 	struct sock *sk = sock->sk;
835 
836 	union {
837 		int val;
838 		struct linger ling;
839 		struct timeval tm;
840 	} v;
841 
842 	int lv = sizeof(int);
843 	int len;
844 
845 	if (get_user(len, optlen))
846 		return -EFAULT;
847 	if (len < 0)
848 		return -EINVAL;
849 
850 	memset(&v, 0, sizeof(v));
851 
852 	switch (optname) {
853 	case SO_DEBUG:
854 		v.val = sock_flag(sk, SOCK_DBG);
855 		break;
856 
857 	case SO_DONTROUTE:
858 		v.val = sock_flag(sk, SOCK_LOCALROUTE);
859 		break;
860 
861 	case SO_BROADCAST:
862 		v.val = !!sock_flag(sk, SOCK_BROADCAST);
863 		break;
864 
865 	case SO_SNDBUF:
866 		v.val = sk->sk_sndbuf;
867 		break;
868 
869 	case SO_RCVBUF:
870 		v.val = sk->sk_rcvbuf;
871 		break;
872 
873 	case SO_REUSEADDR:
874 		v.val = sk->sk_reuse;
875 		break;
876 
877 	case SO_KEEPALIVE:
878 		v.val = !!sock_flag(sk, SOCK_KEEPOPEN);
879 		break;
880 
881 	case SO_TYPE:
882 		v.val = sk->sk_type;
883 		break;
884 
885 	case SO_PROTOCOL:
886 		v.val = sk->sk_protocol;
887 		break;
888 
889 	case SO_DOMAIN:
890 		v.val = sk->sk_family;
891 		break;
892 
893 	case SO_ERROR:
894 		v.val = -sock_error(sk);
895 		if (v.val == 0)
896 			v.val = xchg(&sk->sk_err_soft, 0);
897 		break;
898 
899 	case SO_OOBINLINE:
900 		v.val = !!sock_flag(sk, SOCK_URGINLINE);
901 		break;
902 
903 	case SO_NO_CHECK:
904 		v.val = sk->sk_no_check;
905 		break;
906 
907 	case SO_PRIORITY:
908 		v.val = sk->sk_priority;
909 		break;
910 
911 	case SO_LINGER:
912 		lv		= sizeof(v.ling);
913 		v.ling.l_onoff	= !!sock_flag(sk, SOCK_LINGER);
914 		v.ling.l_linger	= sk->sk_lingertime / HZ;
915 		break;
916 
917 	case SO_BSDCOMPAT:
918 		sock_warn_obsolete_bsdism("getsockopt");
919 		break;
920 
921 	case SO_TIMESTAMP:
922 		v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
923 				!sock_flag(sk, SOCK_RCVTSTAMPNS);
924 		break;
925 
926 	case SO_TIMESTAMPNS:
927 		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
928 		break;
929 
930 	case SO_TIMESTAMPING:
931 		v.val = 0;
932 		if (sock_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE))
933 			v.val |= SOF_TIMESTAMPING_TX_HARDWARE;
934 		if (sock_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE))
935 			v.val |= SOF_TIMESTAMPING_TX_SOFTWARE;
936 		if (sock_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE))
937 			v.val |= SOF_TIMESTAMPING_RX_HARDWARE;
938 		if (sock_flag(sk, SOCK_TIMESTAMPING_RX_SOFTWARE))
939 			v.val |= SOF_TIMESTAMPING_RX_SOFTWARE;
940 		if (sock_flag(sk, SOCK_TIMESTAMPING_SOFTWARE))
941 			v.val |= SOF_TIMESTAMPING_SOFTWARE;
942 		if (sock_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE))
943 			v.val |= SOF_TIMESTAMPING_SYS_HARDWARE;
944 		if (sock_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE))
945 			v.val |= SOF_TIMESTAMPING_RAW_HARDWARE;
946 		break;
947 
948 	case SO_RCVTIMEO:
949 		lv = sizeof(struct timeval);
950 		if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
951 			v.tm.tv_sec = 0;
952 			v.tm.tv_usec = 0;
953 		} else {
954 			v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
955 			v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
956 		}
957 		break;
958 
959 	case SO_SNDTIMEO:
960 		lv = sizeof(struct timeval);
961 		if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
962 			v.tm.tv_sec = 0;
963 			v.tm.tv_usec = 0;
964 		} else {
965 			v.tm.tv_sec = sk->sk_sndtimeo / HZ;
966 			v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
967 		}
968 		break;
969 
970 	case SO_RCVLOWAT:
971 		v.val = sk->sk_rcvlowat;
972 		break;
973 
974 	case SO_SNDLOWAT:
975 		v.val = 1;
976 		break;
977 
978 	case SO_PASSCRED:
979 		v.val = test_bit(SOCK_PASSCRED, &sock->flags) ? 1 : 0;
980 		break;
981 
982 	case SO_PEERCRED:
983 	{
984 		struct ucred peercred;
985 		if (len > sizeof(peercred))
986 			len = sizeof(peercred);
987 		cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
988 		if (copy_to_user(optval, &peercred, len))
989 			return -EFAULT;
990 		goto lenout;
991 	}
992 
993 	case SO_PEERNAME:
994 	{
995 		char address[128];
996 
997 		if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
998 			return -ENOTCONN;
999 		if (lv < len)
1000 			return -EINVAL;
1001 		if (copy_to_user(optval, address, len))
1002 			return -EFAULT;
1003 		goto lenout;
1004 	}
1005 
1006 	/* Dubious BSD thing... Probably nobody even uses it, but
1007 	 * the UNIX standard wants it for whatever reason... -DaveM
1008 	 */
1009 	case SO_ACCEPTCONN:
1010 		v.val = sk->sk_state == TCP_LISTEN;
1011 		break;
1012 
1013 	case SO_PASSSEC:
1014 		v.val = test_bit(SOCK_PASSSEC, &sock->flags) ? 1 : 0;
1015 		break;
1016 
1017 	case SO_PEERSEC:
1018 		return security_socket_getpeersec_stream(sock, optval, optlen, len);
1019 
1020 	case SO_MARK:
1021 		v.val = sk->sk_mark;
1022 		break;
1023 
1024 	case SO_RXQ_OVFL:
1025 		v.val = !!sock_flag(sk, SOCK_RXQ_OVFL);
1026 		break;
1027 
1028 	case SO_WIFI_STATUS:
1029 		v.val = !!sock_flag(sk, SOCK_WIFI_STATUS);
1030 		break;
1031 
1032 	case SO_PEEK_OFF:
1033 		if (!sock->ops->set_peek_off)
1034 			return -EOPNOTSUPP;
1035 
1036 		v.val = sk->sk_peek_off;
1037 		break;
1038 	case SO_NOFCS:
1039 		v.val = !!sock_flag(sk, SOCK_NOFCS);
1040 		break;
1041 	default:
1042 		return -ENOPROTOOPT;
1043 	}
1044 
1045 	if (len > lv)
1046 		len = lv;
1047 	if (copy_to_user(optval, &v, len))
1048 		return -EFAULT;
1049 lenout:
1050 	if (put_user(len, optlen))
1051 		return -EFAULT;
1052 	return 0;
1053 }
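/*
 * Illustrative user-space use of SO_PEERCRED on a connected AF_UNIX socket
 * (sketch only):
 *
 *	struct ucred cr;
 *	socklen_t len = sizeof(cr);
 *
 *	if (getsockopt(fd, SOL_SOCKET, SO_PEERCRED, &cr, &len) == 0)
 *		printf("peer pid=%d uid=%d gid=%d\n", cr.pid, cr.uid, cr.gid);
 *
 * The pid/uid/gid are translated by cred_to_ucred() above into the caller's
 * pid and user namespaces.
 */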
1054 
1055 /*
1056  * Initialize an sk_lock.
1057  *
1058  * (We also register the sk_lock with the lock validator.)
1059  */
1060 static inline void sock_lock_init(struct sock *sk)
1061 {
1062 	sock_lock_init_class_and_name(sk,
1063 			af_family_slock_key_strings[sk->sk_family],
1064 			af_family_slock_keys + sk->sk_family,
1065 			af_family_key_strings[sk->sk_family],
1066 			af_family_keys + sk->sk_family);
1067 }
1068 
1069 /*
1070  * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
1071  * even temporarily, because of RCU lookups. sk_node should also be left as is.
1072  * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
1073  */
1074 static void sock_copy(struct sock *nsk, const struct sock *osk)
1075 {
1076 #ifdef CONFIG_SECURITY_NETWORK
1077 	void *sptr = nsk->sk_security;
1078 #endif
1079 	memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1080 
1081 	memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1082 	       osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1083 
1084 #ifdef CONFIG_SECURITY_NETWORK
1085 	nsk->sk_security = sptr;
1086 	security_sk_clone(osk, nsk);
1087 #endif
1088 }
1089 
1090 /*
1091  * caches using SLAB_DESTROY_BY_RCU should leave the .next pointer of nulls
1092  * nodes unmodified. Special care is taken when initializing the object to zero.
1093  */
1094 static inline void sk_prot_clear_nulls(struct sock *sk, int size)
1095 {
1096 	if (offsetof(struct sock, sk_node.next) != 0)
1097 		memset(sk, 0, offsetof(struct sock, sk_node.next));
1098 	memset(&sk->sk_node.pprev, 0,
1099 	       size - offsetof(struct sock, sk_node.pprev));
1100 }
1101 
1102 void sk_prot_clear_portaddr_nulls(struct sock *sk, int size)
1103 {
1104 	unsigned long nulls1, nulls2;
1105 
1106 	nulls1 = offsetof(struct sock, __sk_common.skc_node.next);
1107 	nulls2 = offsetof(struct sock, __sk_common.skc_portaddr_node.next);
1108 	if (nulls1 > nulls2)
1109 		swap(nulls1, nulls2);
1110 
1111 	if (nulls1 != 0)
1112 		memset((char *)sk, 0, nulls1);
1113 	memset((char *)sk + nulls1 + sizeof(void *), 0,
1114 	       nulls2 - nulls1 - sizeof(void *));
1115 	memset((char *)sk + nulls2 + sizeof(void *), 0,
1116 	       size - nulls2 - sizeof(void *));
1117 }
1118 EXPORT_SYMBOL(sk_prot_clear_portaddr_nulls);
1119 
1120 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1121 		int family)
1122 {
1123 	struct sock *sk;
1124 	struct kmem_cache *slab;
1125 
1126 	slab = prot->slab;
1127 	if (slab != NULL) {
1128 		sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1129 		if (!sk)
1130 			return sk;
1131 		if (priority & __GFP_ZERO) {
1132 			if (prot->clear_sk)
1133 				prot->clear_sk(sk, prot->obj_size);
1134 			else
1135 				sk_prot_clear_nulls(sk, prot->obj_size);
1136 		}
1137 	} else
1138 		sk = kmalloc(prot->obj_size, priority);
1139 
1140 	if (sk != NULL) {
1141 		kmemcheck_annotate_bitfield(sk, flags);
1142 
1143 		if (security_sk_alloc(sk, family, priority))
1144 			goto out_free;
1145 
1146 		if (!try_module_get(prot->owner))
1147 			goto out_free_sec;
1148 		sk_tx_queue_clear(sk);
1149 	}
1150 
1151 	return sk;
1152 
1153 out_free_sec:
1154 	security_sk_free(sk);
1155 out_free:
1156 	if (slab != NULL)
1157 		kmem_cache_free(slab, sk);
1158 	else
1159 		kfree(sk);
1160 	return NULL;
1161 }
1162 
1163 static void sk_prot_free(struct proto *prot, struct sock *sk)
1164 {
1165 	struct kmem_cache *slab;
1166 	struct module *owner;
1167 
1168 	owner = prot->owner;
1169 	slab = prot->slab;
1170 
1171 	security_sk_free(sk);
1172 	if (slab != NULL)
1173 		kmem_cache_free(slab, sk);
1174 	else
1175 		kfree(sk);
1176 	module_put(owner);
1177 }
1178 
1179 #ifdef CONFIG_CGROUPS
1180 void sock_update_classid(struct sock *sk)
1181 {
1182 	u32 classid;
1183 
1184 	rcu_read_lock();  /* doing current task, which cannot vanish. */
1185 	classid = task_cls_classid(current);
1186 	rcu_read_unlock();
1187 	if (classid && classid != sk->sk_classid)
1188 		sk->sk_classid = classid;
1189 }
1190 EXPORT_SYMBOL(sock_update_classid);
1191 
1192 void sock_update_netprioidx(struct sock *sk)
1193 {
1194 	if (in_interrupt())
1195 		return;
1196 
1197 	sk->sk_cgrp_prioidx = task_netprioidx(current);
1198 }
1199 EXPORT_SYMBOL_GPL(sock_update_netprioidx);
1200 #endif
1201 
1202 /**
1203  *	sk_alloc - All socket objects are allocated here
1204  *	@net: the applicable net namespace
1205  *	@family: protocol family
1206  *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1207  *	@prot: struct proto associated with this new sock instance
1208  */
1209 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1210 		      struct proto *prot)
1211 {
1212 	struct sock *sk;
1213 
1214 	sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1215 	if (sk) {
1216 		sk->sk_family = family;
1217 		/*
1218 		 * See comment in struct sock definition to understand
1219 		 * why we need sk_prot_creator -acme
1220 		 */
1221 		sk->sk_prot = sk->sk_prot_creator = prot;
1222 		sock_lock_init(sk);
1223 		sock_net_set(sk, get_net(net));
1224 		atomic_set(&sk->sk_wmem_alloc, 1);
1225 
1226 		sock_update_classid(sk);
1227 		sock_update_netprioidx(sk);
1228 	}
1229 
1230 	return sk;
1231 }
1232 EXPORT_SYMBOL(sk_alloc);
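/*
 * Typical use (illustrative sketch with hypothetical names): an address
 * family's ->create() handler allocates its sock with sk_alloc() and then
 * finishes generic initialisation, roughly
 *
 *	sk = sk_alloc(net, PF_EXAMPLE, GFP_KERNEL, &example_proto);
 *	if (!sk)
 *		return -ENOMEM;
 *	sock_init_data(sock, sk);
 *	sk->sk_protocol = protocol;
 */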
1233 
1234 static void __sk_free(struct sock *sk)
1235 {
1236 	struct sk_filter *filter;
1237 
1238 	if (sk->sk_destruct)
1239 		sk->sk_destruct(sk);
1240 
1241 	filter = rcu_dereference_check(sk->sk_filter,
1242 				       atomic_read(&sk->sk_wmem_alloc) == 0);
1243 	if (filter) {
1244 		sk_filter_uncharge(sk, filter);
1245 		RCU_INIT_POINTER(sk->sk_filter, NULL);
1246 	}
1247 
1248 	sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
1249 
1250 	if (atomic_read(&sk->sk_omem_alloc))
1251 		printk(KERN_DEBUG "%s: optmem leakage (%d bytes) detected.\n",
1252 		       __func__, atomic_read(&sk->sk_omem_alloc));
1253 
1254 	if (sk->sk_peer_cred)
1255 		put_cred(sk->sk_peer_cred);
1256 	put_pid(sk->sk_peer_pid);
1257 	put_net(sock_net(sk));
1258 	sk_prot_free(sk->sk_prot_creator, sk);
1259 }
1260 
1261 void sk_free(struct sock *sk)
1262 {
1263 	/*
1264 	 * We subtract one from sk_wmem_alloc so we can tell whether
1265 	 * some packets are still in some tx queue.
1266 	 * If not zero, sock_wfree() will call __sk_free(sk) later.
1267 	 */
1268 	if (atomic_dec_and_test(&sk->sk_wmem_alloc))
1269 		__sk_free(sk);
1270 }
1271 EXPORT_SYMBOL(sk_free);
1272 
1273 /*
1274  * The last sock_put should drop the reference to sk->sk_net. It has already
1275  * been dropped in sk_change_net. Taking a reference to the stopping namespace
1276  * is not an option.
1277  * Take a reference to the socket to remove it from the hash while it is still
1278  * _alive_, and after that destroy it in the context of init_net.
1279  */
1280 void sk_release_kernel(struct sock *sk)
1281 {
1282 	if (sk == NULL || sk->sk_socket == NULL)
1283 		return;
1284 
1285 	sock_hold(sk);
1286 	sock_release(sk->sk_socket);
1287 	release_net(sock_net(sk));
1288 	sock_net_set(sk, get_net(&init_net));
1289 	sock_put(sk);
1290 }
1291 EXPORT_SYMBOL(sk_release_kernel);
1292 
1293 static void sk_update_clone(const struct sock *sk, struct sock *newsk)
1294 {
1295 	if (mem_cgroup_sockets_enabled && sk->sk_cgrp)
1296 		sock_update_memcg(newsk);
1297 }
1298 
1299 /**
1300  *	sk_clone_lock - clone a socket, and lock its clone
1301  *	@sk: the socket to clone
1302  *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1303  *
1304  *	Caller must unlock socket even in error path (bh_unlock_sock(newsk))
1305  */
1306 struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
1307 {
1308 	struct sock *newsk;
1309 
1310 	newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
1311 	if (newsk != NULL) {
1312 		struct sk_filter *filter;
1313 
1314 		sock_copy(newsk, sk);
1315 
1316 		/* SANITY */
1317 		get_net(sock_net(newsk));
1318 		sk_node_init(&newsk->sk_node);
1319 		sock_lock_init(newsk);
1320 		bh_lock_sock(newsk);
1321 		newsk->sk_backlog.head	= newsk->sk_backlog.tail = NULL;
1322 		newsk->sk_backlog.len = 0;
1323 
1324 		atomic_set(&newsk->sk_rmem_alloc, 0);
1325 		/*
1326 		 * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
1327 		 */
1328 		atomic_set(&newsk->sk_wmem_alloc, 1);
1329 		atomic_set(&newsk->sk_omem_alloc, 0);
1330 		skb_queue_head_init(&newsk->sk_receive_queue);
1331 		skb_queue_head_init(&newsk->sk_write_queue);
1332 #ifdef CONFIG_NET_DMA
1333 		skb_queue_head_init(&newsk->sk_async_wait_queue);
1334 #endif
1335 
1336 		spin_lock_init(&newsk->sk_dst_lock);
1337 		rwlock_init(&newsk->sk_callback_lock);
1338 		lockdep_set_class_and_name(&newsk->sk_callback_lock,
1339 				af_callback_keys + newsk->sk_family,
1340 				af_family_clock_key_strings[newsk->sk_family]);
1341 
1342 		newsk->sk_dst_cache	= NULL;
1343 		newsk->sk_wmem_queued	= 0;
1344 		newsk->sk_forward_alloc = 0;
1345 		newsk->sk_send_head	= NULL;
1346 		newsk->sk_userlocks	= sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
1347 
1348 		sock_reset_flag(newsk, SOCK_DONE);
1349 		skb_queue_head_init(&newsk->sk_error_queue);
1350 
1351 		filter = rcu_dereference_protected(newsk->sk_filter, 1);
1352 		if (filter != NULL)
1353 			sk_filter_charge(newsk, filter);
1354 
1355 		if (unlikely(xfrm_sk_clone_policy(newsk))) {
1356 			/* It is still a raw copy of the parent, so invalidate
1357 			 * its destructor and do a plain sk_free() */
1358 			newsk->sk_destruct = NULL;
1359 			bh_unlock_sock(newsk);
1360 			sk_free(newsk);
1361 			newsk = NULL;
1362 			goto out;
1363 		}
1364 
1365 		newsk->sk_err	   = 0;
1366 		newsk->sk_priority = 0;
1367 		/*
1368 		 * Before updating sk_refcnt, we must commit prior changes to memory
1369 		 * (Documentation/RCU/rculist_nulls.txt for details)
1370 		 */
1371 		smp_wmb();
1372 		atomic_set(&newsk->sk_refcnt, 2);
1373 
1374 		/*
1375 		 * Increment the counter in the same struct proto as the master
1376 		 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
1377 		 * is the same as sk->sk_prot->socks, as this field was copied
1378 		 * with memcpy).
1379 		 *
1380 		 * This _changes_ the previous behaviour, where
1381 		 * tcp_create_openreq_child always was incrementing the
1382 		 * equivalent to tcp_prot->socks (inet_sock_nr), so this has
1383 		 * to be taken into account in all callers. -acme
1384 		 */
1385 		sk_refcnt_debug_inc(newsk);
1386 		sk_set_socket(newsk, NULL);
1387 		newsk->sk_wq = NULL;
1388 
1389 		sk_update_clone(sk, newsk);
1390 
1391 		if (newsk->sk_prot->sockets_allocated)
1392 			sk_sockets_allocated_inc(newsk);
1393 
1394 		if (newsk->sk_flags & SK_FLAGS_TIMESTAMP)
1395 			net_enable_timestamp();
1396 	}
1397 out:
1398 	return newsk;
1399 }
1400 EXPORT_SYMBOL_GPL(sk_clone_lock);
1401 
1402 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1403 {
1404 	__sk_dst_set(sk, dst);
1405 	sk->sk_route_caps = dst->dev->features;
1406 	if (sk->sk_route_caps & NETIF_F_GSO)
1407 		sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
1408 	sk->sk_route_caps &= ~sk->sk_route_nocaps;
1409 	if (sk_can_gso(sk)) {
1410 		if (dst->header_len) {
1411 			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1412 		} else {
1413 			sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
1414 			sk->sk_gso_max_size = dst->dev->gso_max_size;
1415 		}
1416 	}
1417 }
1418 EXPORT_SYMBOL_GPL(sk_setup_caps);
1419 
1420 void __init sk_init(void)
1421 {
1422 	if (totalram_pages <= 4096) {
1423 		sysctl_wmem_max = 32767;
1424 		sysctl_rmem_max = 32767;
1425 		sysctl_wmem_default = 32767;
1426 		sysctl_rmem_default = 32767;
1427 	} else if (totalram_pages >= 131072) {
1428 		sysctl_wmem_max = 131071;
1429 		sysctl_rmem_max = 131071;
1430 	}
1431 }
1432 
1433 /*
1434  *	Simple resource managers for sockets.
1435  */
1436 
1437 
1438 /*
1439  * Write buffer destructor automatically called from kfree_skb.
1440  */
1441 void sock_wfree(struct sk_buff *skb)
1442 {
1443 	struct sock *sk = skb->sk;
1444 	unsigned int len = skb->truesize;
1445 
1446 	if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
1447 		/*
1448 		 * Keep a reference on sk_wmem_alloc, this will be released
1449 		 * after sk_write_space() call
1450 		 */
1451 		atomic_sub(len - 1, &sk->sk_wmem_alloc);
1452 		sk->sk_write_space(sk);
1453 		len = 1;
1454 	}
1455 	/*
1456 	 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
1457 	 * could not do because of in-flight packets
1458 	 */
1459 	if (atomic_sub_and_test(len, &sk->sk_wmem_alloc))
1460 		__sk_free(sk);
1461 }
1462 EXPORT_SYMBOL(sock_wfree);
1463 
1464 /*
1465  * Read buffer destructor automatically called from kfree_skb.
1466  */
1467 void sock_rfree(struct sk_buff *skb)
1468 {
1469 	struct sock *sk = skb->sk;
1470 	unsigned int len = skb->truesize;
1471 
1472 	atomic_sub(len, &sk->sk_rmem_alloc);
1473 	sk_mem_uncharge(sk, len);
1474 }
1475 EXPORT_SYMBOL(sock_rfree);
1476 
1477 
1478 int sock_i_uid(struct sock *sk)
1479 {
1480 	int uid;
1481 
1482 	read_lock_bh(&sk->sk_callback_lock);
1483 	uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : 0;
1484 	read_unlock_bh(&sk->sk_callback_lock);
1485 	return uid;
1486 }
1487 EXPORT_SYMBOL(sock_i_uid);
1488 
1489 unsigned long sock_i_ino(struct sock *sk)
1490 {
1491 	unsigned long ino;
1492 
1493 	read_lock_bh(&sk->sk_callback_lock);
1494 	ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
1495 	read_unlock_bh(&sk->sk_callback_lock);
1496 	return ino;
1497 }
1498 EXPORT_SYMBOL(sock_i_ino);
1499 
1500 /*
1501  * Allocate a skb from the socket's send buffer.
1502  */
1503 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
1504 			     gfp_t priority)
1505 {
1506 	if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1507 		struct sk_buff *skb = alloc_skb(size, priority);
1508 		if (skb) {
1509 			skb_set_owner_w(skb, sk);
1510 			return skb;
1511 		}
1512 	}
1513 	return NULL;
1514 }
1515 EXPORT_SYMBOL(sock_wmalloc);
1516 
1517 /*
1518  * Allocate a skb from the socket's receive buffer.
1519  */
1520 struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force,
1521 			     gfp_t priority)
1522 {
1523 	if (force || atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
1524 		struct sk_buff *skb = alloc_skb(size, priority);
1525 		if (skb) {
1526 			skb_set_owner_r(skb, sk);
1527 			return skb;
1528 		}
1529 	}
1530 	return NULL;
1531 }
1532 
1533 /*
1534  * Allocate a memory block from the socket's option memory buffer.
1535  */
1536 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
1537 {
1538 	if ((unsigned)size <= sysctl_optmem_max &&
1539 	    atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
1540 		void *mem;
1541 		/* First do the add, to avoid the race if kmalloc
1542 		 * might sleep.
1543 		 */
1544 		atomic_add(size, &sk->sk_omem_alloc);
1545 		mem = kmalloc(size, priority);
1546 		if (mem)
1547 			return mem;
1548 		atomic_sub(size, &sk->sk_omem_alloc);
1549 	}
1550 	return NULL;
1551 }
1552 EXPORT_SYMBOL(sock_kmalloc);
1553 
1554 /*
1555  * Free an option memory block.
1556  */
1557 void sock_kfree_s(struct sock *sk, void *mem, int size)
1558 {
1559 	kfree(mem);
1560 	atomic_sub(size, &sk->sk_omem_alloc);
1561 }
1562 EXPORT_SYMBOL(sock_kfree_s);
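/*
 * Illustrative sketch (not part of this file): a protocol keeping a small
 * per-socket option blob charged against sk_omem_alloc would typically pair
 * these helpers as follows; struct and field names are hypothetical.
 *
 *	struct my_opt *opt = sock_kmalloc(sk, sizeof(*opt), GFP_KERNEL);
 *	if (!opt)
 *		return -ENOBUFS;
 *	...
 *	sock_kfree_s(sk, opt, sizeof(*opt));	(size must match the alloc)
 */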
1563 
1564 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
1565    I think these locks should be removed for datagram sockets.
1566  */
1567 static long sock_wait_for_wmem(struct sock *sk, long timeo)
1568 {
1569 	DEFINE_WAIT(wait);
1570 
1571 	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1572 	for (;;) {
1573 		if (!timeo)
1574 			break;
1575 		if (signal_pending(current))
1576 			break;
1577 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1578 		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1579 		if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
1580 			break;
1581 		if (sk->sk_shutdown & SEND_SHUTDOWN)
1582 			break;
1583 		if (sk->sk_err)
1584 			break;
1585 		timeo = schedule_timeout(timeo);
1586 	}
1587 	finish_wait(sk_sleep(sk), &wait);
1588 	return timeo;
1589 }
1590 
1591 
1592 /*
1593  *	Generic send/receive buffer handlers
1594  */
1595 
1596 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
1597 				     unsigned long data_len, int noblock,
1598 				     int *errcode)
1599 {
1600 	struct sk_buff *skb;
1601 	gfp_t gfp_mask;
1602 	long timeo;
1603 	int err;
1604 
1605 	gfp_mask = sk->sk_allocation;
1606 	if (gfp_mask & __GFP_WAIT)
1607 		gfp_mask |= __GFP_REPEAT;
1608 
1609 	timeo = sock_sndtimeo(sk, noblock);
1610 	while (1) {
1611 		err = sock_error(sk);
1612 		if (err != 0)
1613 			goto failure;
1614 
1615 		err = -EPIPE;
1616 		if (sk->sk_shutdown & SEND_SHUTDOWN)
1617 			goto failure;
1618 
1619 		if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1620 			skb = alloc_skb(header_len, gfp_mask);
1621 			if (skb) {
1622 				int npages;
1623 				int i;
1624 
1625 				/* No pages, we're done... */
1626 				if (!data_len)
1627 					break;
1628 
1629 				npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
1630 				skb->truesize += data_len;
1631 				skb_shinfo(skb)->nr_frags = npages;
1632 				for (i = 0; i < npages; i++) {
1633 					struct page *page;
1634 
1635 					page = alloc_pages(sk->sk_allocation, 0);
1636 					if (!page) {
1637 						err = -ENOBUFS;
1638 						skb_shinfo(skb)->nr_frags = i;
1639 						kfree_skb(skb);
1640 						goto failure;
1641 					}
1642 
1643 					__skb_fill_page_desc(skb, i,
1644 							page, 0,
1645 							(data_len >= PAGE_SIZE ?
1646 							 PAGE_SIZE :
1647 							 data_len));
1648 					data_len -= PAGE_SIZE;
1649 				}
1650 
1651 				/* Full success... */
1652 				break;
1653 			}
1654 			err = -ENOBUFS;
1655 			goto failure;
1656 		}
1657 		set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1658 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1659 		err = -EAGAIN;
1660 		if (!timeo)
1661 			goto failure;
1662 		if (signal_pending(current))
1663 			goto interrupted;
1664 		timeo = sock_wait_for_wmem(sk, timeo);
1665 	}
1666 
1667 	skb_set_owner_w(skb, sk);
1668 	return skb;
1669 
1670 interrupted:
1671 	err = sock_intr_errno(timeo);
1672 failure:
1673 	*errcode = err;
1674 	return NULL;
1675 }
1676 EXPORT_SYMBOL(sock_alloc_send_pskb);
1677 
1678 struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
1679 				    int noblock, int *errcode)
1680 {
1681 	return sock_alloc_send_pskb(sk, size, 0, noblock, errcode);
1682 }
1683 EXPORT_SYMBOL(sock_alloc_send_skb);
1684 
1685 static void __lock_sock(struct sock *sk)
1686 	__releases(&sk->sk_lock.slock)
1687 	__acquires(&sk->sk_lock.slock)
1688 {
1689 	DEFINE_WAIT(wait);
1690 
1691 	for (;;) {
1692 		prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
1693 					TASK_UNINTERRUPTIBLE);
1694 		spin_unlock_bh(&sk->sk_lock.slock);
1695 		schedule();
1696 		spin_lock_bh(&sk->sk_lock.slock);
1697 		if (!sock_owned_by_user(sk))
1698 			break;
1699 	}
1700 	finish_wait(&sk->sk_lock.wq, &wait);
1701 }
1702 
1703 static void __release_sock(struct sock *sk)
1704 	__releases(&sk->sk_lock.slock)
1705 	__acquires(&sk->sk_lock.slock)
1706 {
1707 	struct sk_buff *skb = sk->sk_backlog.head;
1708 
1709 	do {
1710 		sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
1711 		bh_unlock_sock(sk);
1712 
1713 		do {
1714 			struct sk_buff *next = skb->next;
1715 
1716 			WARN_ON_ONCE(skb_dst_is_noref(skb));
1717 			skb->next = NULL;
1718 			sk_backlog_rcv(sk, skb);
1719 
1720 			/*
1721 			 * We are in process context here with softirqs
1722 			 * disabled, use cond_resched_softirq() to preempt.
1723 			 * This is safe to do because we've taken the backlog
1724 			 * queue private:
1725 			 */
1726 			cond_resched_softirq();
1727 
1728 			skb = next;
1729 		} while (skb != NULL);
1730 
1731 		bh_lock_sock(sk);
1732 	} while ((skb = sk->sk_backlog.head) != NULL);
1733 
1734 	/*
1735 	 * Doing the zeroing here guarantees we cannot loop forever
1736 	 * while a wild producer attempts to flood us.
1737 	 */
1738 	sk->sk_backlog.len = 0;
1739 }
1740 
1741 /**
1742  * sk_wait_data - wait for data to arrive at sk_receive_queue
1743  * @sk:    sock to wait on
1744  * @timeo: for how long
1745  *
1746  * Now socket state including sk->sk_err is changed only under the lock,
1747  * hence we may omit checks after joining the wait queue.
1748  * We check the receive queue before schedule() only as an optimization;
1749  * it is very likely that release_sock() added new data.
1750  */
1751 int sk_wait_data(struct sock *sk, long *timeo)
1752 {
1753 	int rc;
1754 	DEFINE_WAIT(wait);
1755 
1756 	prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1757 	set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1758 	rc = sk_wait_event(sk, timeo, !skb_queue_empty(&sk->sk_receive_queue));
1759 	clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1760 	finish_wait(sk_sleep(sk), &wait);
1761 	return rc;
1762 }
1763 EXPORT_SYMBOL(sk_wait_data);
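/*
 * Sketch of the usual calling pattern (illustrative only): a protocol's
 * recvmsg path waits for data with the socket lock held, rechecking its own
 * exit conditions around the wait; sk_wait_data() drops and re-takes the
 * lock internally while sleeping.
 *
 *	lock_sock(sk);
 *	while (skb_queue_empty(&sk->sk_receive_queue)) {
 *		if (sk->sk_err || (sk->sk_shutdown & RCV_SHUTDOWN) || !timeo)
 *			break;
 *		sk_wait_data(sk, &timeo);
 *	}
 *	...dequeue and copy to user...
 *	release_sock(sk);
 */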
1764 
1765 /**
1766  *	__sk_mem_schedule - increase sk_forward_alloc and memory_allocated
1767  *	@sk: socket
1768  *	@size: memory size to allocate
1769  *	@kind: allocation type
1770  *
1771  *	If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
1772  *	rmem allocation. This function assumes that protocols which have
1773  *	memory_pressure use sk_wmem_queued as write buffer accounting.
1774  */
1775 int __sk_mem_schedule(struct sock *sk, int size, int kind)
1776 {
1777 	struct proto *prot = sk->sk_prot;
1778 	int amt = sk_mem_pages(size);
1779 	long allocated;
1780 	int parent_status = UNDER_LIMIT;
1781 
1782 	sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
1783 
1784 	allocated = sk_memory_allocated_add(sk, amt, &parent_status);
1785 
1786 	/* Under limit. */
1787 	if (parent_status == UNDER_LIMIT &&
1788 			allocated <= sk_prot_mem_limits(sk, 0)) {
1789 		sk_leave_memory_pressure(sk);
1790 		return 1;
1791 	}
1792 
1793 	/* Under pressure. (we or our parents) */
1794 	if ((parent_status > SOFT_LIMIT) ||
1795 			allocated > sk_prot_mem_limits(sk, 1))
1796 		sk_enter_memory_pressure(sk);
1797 
1798 	/* Over hard limit (we or our parents) */
1799 	if ((parent_status == OVER_LIMIT) ||
1800 			(allocated > sk_prot_mem_limits(sk, 2)))
1801 		goto suppress_allocation;
1802 
1803 	/* guarantee minimum buffer size under pressure */
1804 	if (kind == SK_MEM_RECV) {
1805 		if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
1806 			return 1;
1807 
1808 	} else { /* SK_MEM_SEND */
1809 		if (sk->sk_type == SOCK_STREAM) {
1810 			if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
1811 				return 1;
1812 		} else if (atomic_read(&sk->sk_wmem_alloc) <
1813 			   prot->sysctl_wmem[0])
1814 				return 1;
1815 	}
1816 
1817 	if (sk_has_memory_pressure(sk)) {
1818 		int alloc;
1819 
1820 		if (!sk_under_memory_pressure(sk))
1821 			return 1;
1822 		alloc = sk_sockets_allocated_read_positive(sk);
1823 		if (sk_prot_mem_limits(sk, 2) > alloc *
1824 		    sk_mem_pages(sk->sk_wmem_queued +
1825 				 atomic_read(&sk->sk_rmem_alloc) +
1826 				 sk->sk_forward_alloc))
1827 			return 1;
1828 	}
1829 
1830 suppress_allocation:
1831 
1832 	if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
1833 		sk_stream_moderate_sndbuf(sk);
1834 
1835 		/* Fail only if socket is _under_ its sndbuf.
1836 		 * In this case we cannot block, so we have to fail.
1837 		 */
1838 		if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
1839 			return 1;
1840 	}
1841 
1842 	trace_sock_exceed_buf_limit(sk, prot, allocated);
1843 
1844 	/* Alas. Undo changes. */
1845 	sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;
1846 
1847 	sk_memory_allocated_sub(sk, amt);
1848 
1849 	return 0;
1850 }
1851 EXPORT_SYMBOL(__sk_mem_schedule);
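/*
 * Callers normally reach __sk_mem_schedule() through the sk_rmem_schedule()/
 * sk_wmem_schedule() wrappers (see sock_queue_rcv_skb() above, which charges
 * skb->truesize on the receive side). A rough receive-side sketch,
 * illustrative only:
 *
 *	if (!sk_rmem_schedule(sk, skb->truesize))
 *		return -ENOBUFS;
 *	skb_set_owner_r(skb, sk);
 *	__skb_queue_tail(&sk->sk_receive_queue, skb);
 */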
1852 
1853 /**
1854  *	__sk_mem_reclaim - reclaim memory_allocated
1855  *	@sk: socket
1856  */
1857 void __sk_mem_reclaim(struct sock *sk)
1858 {
1859 	sk_memory_allocated_sub(sk,
1860 				sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT);
1861 	sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1;
1862 
1863 	if (sk_under_memory_pressure(sk) &&
1864 	    (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
1865 		sk_leave_memory_pressure(sk);
1866 }
1867 EXPORT_SYMBOL(__sk_mem_reclaim);
1868 
1869 
1870 /*
1871  * Set of default routines for initialising struct proto_ops when
1872  * the protocol does not support a particular function. In certain
1873  * cases where it makes no sense for a protocol to have a "do nothing"
1874  * function, some default processing is provided.
1875  */
1876 
1877 int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
1878 {
1879 	return -EOPNOTSUPP;
1880 }
1881 EXPORT_SYMBOL(sock_no_bind);
1882 
1883 int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
1884 		    int len, int flags)
1885 {
1886 	return -EOPNOTSUPP;
1887 }
1888 EXPORT_SYMBOL(sock_no_connect);
1889 
1890 int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
1891 {
1892 	return -EOPNOTSUPP;
1893 }
1894 EXPORT_SYMBOL(sock_no_socketpair);
1895 
1896 int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
1897 {
1898 	return -EOPNOTSUPP;
1899 }
1900 EXPORT_SYMBOL(sock_no_accept);
1901 
1902 int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
1903 		    int *len, int peer)
1904 {
1905 	return -EOPNOTSUPP;
1906 }
1907 EXPORT_SYMBOL(sock_no_getname);
1908 
1909 unsigned int sock_no_poll(struct file *file, struct socket *sock, poll_table *pt)
1910 {
1911 	return 0;
1912 }
1913 EXPORT_SYMBOL(sock_no_poll);
1914 
1915 int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
1916 {
1917 	return -EOPNOTSUPP;
1918 }
1919 EXPORT_SYMBOL(sock_no_ioctl);
1920 
1921 int sock_no_listen(struct socket *sock, int backlog)
1922 {
1923 	return -EOPNOTSUPP;
1924 }
1925 EXPORT_SYMBOL(sock_no_listen);
1926 
1927 int sock_no_shutdown(struct socket *sock, int how)
1928 {
1929 	return -EOPNOTSUPP;
1930 }
1931 EXPORT_SYMBOL(sock_no_shutdown);
1932 
1933 int sock_no_setsockopt(struct socket *sock, int level, int optname,
1934 		    char __user *optval, unsigned int optlen)
1935 {
1936 	return -EOPNOTSUPP;
1937 }
1938 EXPORT_SYMBOL(sock_no_setsockopt);
1939 
1940 int sock_no_getsockopt(struct socket *sock, int level, int optname,
1941 		    char __user *optval, int __user *optlen)
1942 {
1943 	return -EOPNOTSUPP;
1944 }
1945 EXPORT_SYMBOL(sock_no_getsockopt);
1946 
1947 int sock_no_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
1948 		    size_t len)
1949 {
1950 	return -EOPNOTSUPP;
1951 }
1952 EXPORT_SYMBOL(sock_no_sendmsg);
1953 
1954 int sock_no_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
1955 		    size_t len, int flags)
1956 {
1957 	return -EOPNOTSUPP;
1958 }
1959 EXPORT_SYMBOL(sock_no_recvmsg);
1960 
1961 int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
1962 {
1963 	/* Mirror missing mmap method error code */
1964 	return -ENODEV;
1965 }
1966 EXPORT_SYMBOL(sock_no_mmap);
1967 
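/*
 * Default ->sendpage(): no zero-copy support here, so map the page and
 * fall back to an ordinary kernel_sendmsg() of its contents.
 */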
1968 ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
1969 {
1970 	ssize_t res;
1971 	struct msghdr msg = {.msg_flags = flags};
1972 	struct kvec iov;
1973 	char *kaddr = kmap(page);
1974 	iov.iov_base = kaddr + offset;
1975 	iov.iov_len = size;
1976 	res = kernel_sendmsg(sock, &msg, &iov, 1, size);
1977 	kunmap(page);
1978 	return res;
1979 }
1980 EXPORT_SYMBOL(sock_no_sendpage);
1981 
1982 /*
1983  *	Default Socket Callbacks
1984  */
1985 
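/*
 * These are the callbacks installed by sock_init_data() below for
 * sk_state_change, sk_data_ready, sk_write_space and sk_error_report.
 * sk->sk_wq is dereferenced under RCU because the wait queue belongs
 * to the struct socket, which may be released while softirq code is
 * still using the struct sock.
 */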
1986 static void sock_def_wakeup(struct sock *sk)
1987 {
1988 	struct socket_wq *wq;
1989 
1990 	rcu_read_lock();
1991 	wq = rcu_dereference(sk->sk_wq);
1992 	if (wq_has_sleeper(wq))
1993 		wake_up_interruptible_all(&wq->wait);
1994 	rcu_read_unlock();
1995 }
1996 
1997 static void sock_def_error_report(struct sock *sk)
1998 {
1999 	struct socket_wq *wq;
2000 
2001 	rcu_read_lock();
2002 	wq = rcu_dereference(sk->sk_wq);
2003 	if (wq_has_sleeper(wq))
2004 		wake_up_interruptible_poll(&wq->wait, POLLERR);
2005 	sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
2006 	rcu_read_unlock();
2007 }
2008 
2009 static void sock_def_readable(struct sock *sk, int len)
2010 {
2011 	struct socket_wq *wq;
2012 
2013 	rcu_read_lock();
2014 	wq = rcu_dereference(sk->sk_wq);
2015 	if (wq_has_sleeper(wq))
2016 		wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI |
2017 						POLLRDNORM | POLLRDBAND);
2018 	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
2019 	rcu_read_unlock();
2020 }
2021 
2022 static void sock_def_write_space(struct sock *sk)
2023 {
2024 	struct socket_wq *wq;
2025 
2026 	rcu_read_lock();
2027 
2028 	/* Do not wake up a writer until he can make "significant"
2029 	 * progress.  --DaveM
2030 	 */
2031 	if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
2032 		wq = rcu_dereference(sk->sk_wq);
2033 		if (wq_has_sleeper(wq))
2034 			wake_up_interruptible_sync_poll(&wq->wait, POLLOUT |
2035 						POLLWRNORM | POLLWRBAND);
2036 
2037 		/* Should agree with poll, otherwise some programs break */
2038 		if (sock_writeable(sk))
2039 			sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
2040 	}
2041 
2042 	rcu_read_unlock();
2043 }
2044 
2045 static void sock_def_destruct(struct sock *sk)
2046 {
2047 	kfree(sk->sk_protinfo);
2048 }
2049 
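/*
 * Send SIGURG to the owner of the socket's file (if any) and notify
 * async waiters with POLL_PRI; used e.g. when TCP urgent data arrives.
 */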
2050 void sk_send_sigurg(struct sock *sk)
2051 {
2052 	if (sk->sk_socket && sk->sk_socket->file)
2053 		if (send_sigurg(&sk->sk_socket->file->f_owner))
2054 			sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
2055 }
2056 EXPORT_SYMBOL(sk_send_sigurg);
2057 
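/*
 * Socket timers hold a reference on the socket while they are pending:
 * mod_timer() returning 0 means the timer was not already queued, so
 * sk_reset_timer() takes an extra reference; if sk_stop_timer() manages
 * to cancel a pending timer, the handler will never run, so it drops
 * that reference again with __sock_put().
 */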
2058 void sk_reset_timer(struct sock *sk, struct timer_list* timer,
2059 		    unsigned long expires)
2060 {
2061 	if (!mod_timer(timer, expires))
2062 		sock_hold(sk);
2063 }
2064 EXPORT_SYMBOL(sk_reset_timer);
2065 
2066 void sk_stop_timer(struct sock *sk, struct timer_list* timer)
2067 {
2068 	if (timer_pending(timer) && del_timer(timer))
2069 		__sock_put(sk);
2070 }
2071 EXPORT_SYMBOL(sk_stop_timer);
2072 
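/*
 * Initialise the generic part of a newly allocated struct sock: the
 * queues, the timer, default buffer sizes from the *mem_default
 * sysctls and the default callbacks above.  Protocol create routines
 * typically call this right after sk_alloc(), before filling in any
 * protocol-private state.
 */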
2073 void sock_init_data(struct socket *sock, struct sock *sk)
2074 {
2075 	skb_queue_head_init(&sk->sk_receive_queue);
2076 	skb_queue_head_init(&sk->sk_write_queue);
2077 	skb_queue_head_init(&sk->sk_error_queue);
2078 #ifdef CONFIG_NET_DMA
2079 	skb_queue_head_init(&sk->sk_async_wait_queue);
2080 #endif
2081 
2082 	sk->sk_send_head	=	NULL;
2083 
2084 	init_timer(&sk->sk_timer);
2085 
2086 	sk->sk_allocation	=	GFP_KERNEL;
2087 	sk->sk_rcvbuf		=	sysctl_rmem_default;
2088 	sk->sk_sndbuf		=	sysctl_wmem_default;
2089 	sk->sk_state		=	TCP_CLOSE;
2090 	sk_set_socket(sk, sock);
2091 
2092 	sock_set_flag(sk, SOCK_ZAPPED);
2093 
2094 	if (sock) {
2095 		sk->sk_type	=	sock->type;
2096 		sk->sk_wq	=	sock->wq;
2097 		sock->sk	=	sk;
2098 	} else
2099 		sk->sk_wq	=	NULL;
2100 
2101 	spin_lock_init(&sk->sk_dst_lock);
2102 	rwlock_init(&sk->sk_callback_lock);
2103 	lockdep_set_class_and_name(&sk->sk_callback_lock,
2104 			af_callback_keys + sk->sk_family,
2105 			af_family_clock_key_strings[sk->sk_family]);
2106 
2107 	sk->sk_state_change	=	sock_def_wakeup;
2108 	sk->sk_data_ready	=	sock_def_readable;
2109 	sk->sk_write_space	=	sock_def_write_space;
2110 	sk->sk_error_report	=	sock_def_error_report;
2111 	sk->sk_destruct		=	sock_def_destruct;
2112 
2113 	sk->sk_sndmsg_page	=	NULL;
2114 	sk->sk_sndmsg_off	=	0;
2115 	sk->sk_peek_off		=	-1;
2116 
2117 	sk->sk_peer_pid 	=	NULL;
2118 	sk->sk_peer_cred	=	NULL;
2119 	sk->sk_write_pending	=	0;
2120 	sk->sk_rcvlowat		=	1;
2121 	sk->sk_rcvtimeo		=	MAX_SCHEDULE_TIMEOUT;
2122 	sk->sk_sndtimeo		=	MAX_SCHEDULE_TIMEOUT;
2123 
2124 	sk->sk_stamp = ktime_set(-1L, 0);
2125 
2126 	/*
2127 	 * Before updating sk_refcnt, we must commit prior changes to memory
2128 	 * (Documentation/RCU/rculist_nulls.txt for details)
2129 	 */
2130 	smp_wmb();
2131 	atomic_set(&sk->sk_refcnt, 1);
2132 	atomic_set(&sk->sk_drops, 0);
2133 }
2134 EXPORT_SYMBOL(sock_init_data);
2135 
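/*
 * Process-context socket lock: grab the spinlock, wait for any current
 * owner, mark the lock owned so softirq handlers queue packets to the
 * backlog instead of processing them, then drop the spinlock again.
 * Lockdep-wise this behaves like taking a mutex.
 */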
2136 void lock_sock_nested(struct sock *sk, int subclass)
2137 {
2138 	might_sleep();
2139 	spin_lock_bh(&sk->sk_lock.slock);
2140 	if (sk->sk_lock.owned)
2141 		__lock_sock(sk);
2142 	sk->sk_lock.owned = 1;
2143 	spin_unlock(&sk->sk_lock.slock);
2144 	/*
2145 	 * The sk_lock has mutex_lock() semantics here:
2146 	 */
2147 	mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
2148 	local_bh_enable();
2149 }
2150 EXPORT_SYMBOL(lock_sock_nested);
2151 
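/*
 * Counterpart of lock_sock(): run any packets that softirqs queued on
 * the backlog while we owned the lock, clear ownership and wake up
 * other tasks sleeping in __lock_sock().
 */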
2152 void release_sock(struct sock *sk)
2153 {
2154 	/*
2155 	 * The sk_lock has mutex_unlock() semantics:
2156 	 */
2157 	mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
2158 
2159 	spin_lock_bh(&sk->sk_lock.slock);
2160 	if (sk->sk_backlog.tail)
2161 		__release_sock(sk);
2162 	sk->sk_lock.owned = 0;
2163 	if (waitqueue_active(&sk->sk_lock.wq))
2164 		wake_up(&sk->sk_lock.wq);
2165 	spin_unlock_bh(&sk->sk_lock.slock);
2166 }
2167 EXPORT_SYMBOL(release_sock);
2168 
2169 /**
2170  * lock_sock_fast - fast version of lock_sock
2171  * @sk: socket
2172  *
2173  * This version should be used for very small sections, where the process won't block.
2174  * Returns false if the fast path is taken:
2175  *   sk_lock.slock locked, owned = 0, BH disabled
2176  * Returns true if the slow path is taken:
2177  *   sk_lock.slock unlocked, owned = 1, BH enabled
2178  */
2179 bool lock_sock_fast(struct sock *sk)
2180 {
2181 	might_sleep();
2182 	spin_lock_bh(&sk->sk_lock.slock);
2183 
2184 	if (!sk->sk_lock.owned)
2185 		/*
2186 		 * Fast path: return with slock held and BH still disabled.
2187 		 */
2188 		return false;
2189 
2190 	__lock_sock(sk);
2191 	sk->sk_lock.owned = 1;
2192 	spin_unlock(&sk->sk_lock.slock);
2193 	/*
2194 	 * The sk_lock has mutex_lock() semantics here:
2195 	 */
2196 	mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
2197 	local_bh_enable();
2198 	return true;
2199 }
2200 EXPORT_SYMBOL(lock_sock_fast);
2201 
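/*
 * Helper for the SIOCGSTAMP ioctl: copy the receive timestamp of the
 * last packet to userspace, enabling SOCK_TIMESTAMP on first use and
 * substituting the current time for a zero stamp.
 */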
2202 int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
2203 {
2204 	struct timeval tv;
2205 	if (!sock_flag(sk, SOCK_TIMESTAMP))
2206 		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2207 	tv = ktime_to_timeval(sk->sk_stamp);
2208 	if (tv.tv_sec == -1)
2209 		return -ENOENT;
2210 	if (tv.tv_sec == 0) {
2211 		sk->sk_stamp = ktime_get_real();
2212 		tv = ktime_to_timeval(sk->sk_stamp);
2213 	}
2214 	return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
2215 }
2216 EXPORT_SYMBOL(sock_get_timestamp);
2217 
2218 int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
2219 {
2220 	struct timespec ts;
2221 	if (!sock_flag(sk, SOCK_TIMESTAMP))
2222 		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2223 	ts = ktime_to_timespec(sk->sk_stamp);
2224 	if (ts.tv_sec == -1)
2225 		return -ENOENT;
2226 	if (ts.tv_sec == 0) {
2227 		sk->sk_stamp = ktime_get_real();
2228 		ts = ktime_to_timespec(sk->sk_stamp);
2229 	}
2230 	return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
2231 }
2232 EXPORT_SYMBOL(sock_get_timestampns);
2233 
2234 void sock_enable_timestamp(struct sock *sk, int flag)
2235 {
2236 	if (!sock_flag(sk, flag)) {
2237 		unsigned long previous_flags = sk->sk_flags;
2238 
2239 		sock_set_flag(sk, flag);
2240 		/*
2241 		 * We just set one of the two flags that require net
2242 		 * time stamping, but time stamping might already have
2243 		 * been enabled because of the other one.
2244 		 */
2245 		if (!(previous_flags & SK_FLAGS_TIMESTAMP))
2246 			net_enable_timestamp();
2247 	}
2248 }
2249 
2250 /*
2251  *	Get a socket option on a socket.
2252  *
2253  *	FIX: POSIX 1003.1g is very ambiguous here. It states that
2254  *	asynchronous errors should be reported by getsockopt. We assume
2255  *	this means if you specify SO_ERROR (otherwise what's the point of it?).
2256  */
2257 int sock_common_getsockopt(struct socket *sock, int level, int optname,
2258 			   char __user *optval, int __user *optlen)
2259 {
2260 	struct sock *sk = sock->sk;
2261 
2262 	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2263 }
2264 EXPORT_SYMBOL(sock_common_getsockopt);
2265 
2266 #ifdef CONFIG_COMPAT
2267 int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
2268 				  char __user *optval, int __user *optlen)
2269 {
2270 	struct sock *sk = sock->sk;
2271 
2272 	if (sk->sk_prot->compat_getsockopt != NULL)
2273 		return sk->sk_prot->compat_getsockopt(sk, level, optname,
2274 						      optval, optlen);
2275 	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2276 }
2277 EXPORT_SYMBOL(compat_sock_common_getsockopt);
2278 #endif
2279 
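/*
 * Generic ->recvmsg() wrapper: call the protocol's recvmsg and, on
 * success, propagate the source address length back into msg_namelen.
 */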
2280 int sock_common_recvmsg(struct kiocb *iocb, struct socket *sock,
2281 			struct msghdr *msg, size_t size, int flags)
2282 {
2283 	struct sock *sk = sock->sk;
2284 	int addr_len = 0;
2285 	int err;
2286 
2287 	err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT,
2288 				   flags & ~MSG_DONTWAIT, &addr_len);
2289 	if (err >= 0)
2290 		msg->msg_namelen = addr_len;
2291 	return err;
2292 }
2293 EXPORT_SYMBOL(sock_common_recvmsg);
2294 
2295 /*
2296  *	Set socket options on an inet socket.
2297  */
2298 int sock_common_setsockopt(struct socket *sock, int level, int optname,
2299 			   char __user *optval, unsigned int optlen)
2300 {
2301 	struct sock *sk = sock->sk;
2302 
2303 	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2304 }
2305 EXPORT_SYMBOL(sock_common_setsockopt);
2306 
2307 #ifdef CONFIG_COMPAT
2308 int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
2309 				  char __user *optval, unsigned int optlen)
2310 {
2311 	struct sock *sk = sock->sk;
2312 
2313 	if (sk->sk_prot->compat_setsockopt != NULL)
2314 		return sk->sk_prot->compat_setsockopt(sk, level, optname,
2315 						      optval, optlen);
2316 	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2317 }
2318 EXPORT_SYMBOL(compat_sock_common_setsockopt);
2319 #endif
2320 
2321 void sk_common_release(struct sock *sk)
2322 {
2323 	if (sk->sk_prot->destroy)
2324 		sk->sk_prot->destroy(sk);
2325 
2326 	/*
2327 	 * Observation: when sk_common_release is called, processes have
2328 	 * no access to the socket, but the network stack still does.
2329 	 * Step one, detach it from networking:
2330 	 *
2331 	 * A. Remove it from the hash tables.
2332 	 */
2333 
2334 	sk->sk_prot->unhash(sk);
2335 
2336 	/*
2337 	 * At this point the socket cannot receive new packets, but some may
2338 	 * still be in flight because another CPU did its hash table lookup
2339 	 * before we unhashed the socket. Those packets will reach the
2340 	 * receive queue and be purged by the socket destructor.
2341 	 *
2342 	 * We may also still have packets pending on the receive queue and,
2343 	 * probably, our own packets waiting in device queues. The destructor
2344 	 * will drain the receive queue, but transmitted packets will delay
2345 	 * socket destruction until the last reference is released.
2346 	 */
2347 
2348 	sock_orphan(sk);
2349 
2350 	xfrm_sk_free_policy(sk);
2351 
2352 	sk_refcnt_debug_release(sk);
2353 	sock_put(sk);
2354 }
2355 EXPORT_SYMBOL(sk_common_release);
2356 
2357 #ifdef CONFIG_PROC_FS
2358 #define PROTO_INUSE_NR	64	/* should be enough for now */
2359 struct prot_inuse {
2360 	int val[PROTO_INUSE_NR];
2361 };
2362 
2363 static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
2364 
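/*
 * With CONFIG_NET_NS the inuse counters are per network namespace
 * (net->core.inuse, allocated below); without it a single static
 * per-cpu array is used and the net argument is ignored.
 */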
2365 #ifdef CONFIG_NET_NS
2366 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2367 {
2368 	__this_cpu_add(net->core.inuse->val[prot->inuse_idx], val);
2369 }
2370 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2371 
2372 int sock_prot_inuse_get(struct net *net, struct proto *prot)
2373 {
2374 	int cpu, idx = prot->inuse_idx;
2375 	int res = 0;
2376 
2377 	for_each_possible_cpu(cpu)
2378 		res += per_cpu_ptr(net->core.inuse, cpu)->val[idx];
2379 
2380 	return res >= 0 ? res : 0;
2381 }
2382 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2383 
2384 static int __net_init sock_inuse_init_net(struct net *net)
2385 {
2386 	net->core.inuse = alloc_percpu(struct prot_inuse);
2387 	return net->core.inuse ? 0 : -ENOMEM;
2388 }
2389 
2390 static void __net_exit sock_inuse_exit_net(struct net *net)
2391 {
2392 	free_percpu(net->core.inuse);
2393 }
2394 
2395 static struct pernet_operations net_inuse_ops = {
2396 	.init = sock_inuse_init_net,
2397 	.exit = sock_inuse_exit_net,
2398 };
2399 
2400 static __init int net_inuse_init(void)
2401 {
2402 	if (register_pernet_subsys(&net_inuse_ops))
2403 		panic("Cannot initialize net inuse counters");
2404 
2405 	return 0;
2406 }
2407 
2408 core_initcall(net_inuse_init);
2409 #else
2410 static DEFINE_PER_CPU(struct prot_inuse, prot_inuse);
2411 
2412 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2413 {
2414 	__this_cpu_add(prot_inuse.val[prot->inuse_idx], val);
2415 }
2416 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2417 
2418 int sock_prot_inuse_get(struct net *net, struct proto *prot)
2419 {
2420 	int cpu, idx = prot->inuse_idx;
2421 	int res = 0;
2422 
2423 	for_each_possible_cpu(cpu)
2424 		res += per_cpu(prot_inuse, cpu).val[idx];
2425 
2426 	return res >= 0 ? res : 0;
2427 }
2428 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2429 #endif
2430 
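/*
 * Every registered protocol gets a slot in the per-cpu inuse counters.
 * Slot PROTO_INUSE_NR - 1 doubles as an overflow marker: a protocol
 * that cannot get a real slot keeps that index and is simply not
 * tracked (or released) individually.
 */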
2431 static void assign_proto_idx(struct proto *prot)
2432 {
2433 	prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
2434 
2435 	if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
2436 		printk(KERN_ERR "PROTO_INUSE_NR exhausted\n");
2437 		return;
2438 	}
2439 
2440 	set_bit(prot->inuse_idx, proto_inuse_idx);
2441 }
2442 
2443 static void release_proto_idx(struct proto *prot)
2444 {
2445 	if (prot->inuse_idx != PROTO_INUSE_NR - 1)
2446 		clear_bit(prot->inuse_idx, proto_inuse_idx);
2447 }
2448 #else
2449 static inline void assign_proto_idx(struct proto *prot)
2450 {
2451 }
2452 
2453 static inline void release_proto_idx(struct proto *prot)
2454 {
2455 }
2456 #endif
2457 
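/*
 * Register a protocol with the socket layer; with @alloc_slab set this
 * also creates the kmem caches for the sock, request_sock and
 * timewait_sock objects described by @prot.  Returns 0 or -ENOBUFS.
 * Typical (illustrative) module usage:
 *
 *	err = proto_register(&example_prot, 1);
 *	if (err)
 *		return err;
 *	...
 *	proto_unregister(&example_prot);
 *
 * where example_prot is a placeholder name.
 */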
2458 int proto_register(struct proto *prot, int alloc_slab)
2459 {
2460 	if (alloc_slab) {
2461 		prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
2462 					SLAB_HWCACHE_ALIGN | prot->slab_flags,
2463 					NULL);
2464 
2465 		if (prot->slab == NULL) {
2466 			printk(KERN_CRIT "%s: Can't create sock SLAB cache!\n",
2467 			       prot->name);
2468 			goto out;
2469 		}
2470 
2471 		if (prot->rsk_prot != NULL) {
2472 			prot->rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s", prot->name);
2473 			if (prot->rsk_prot->slab_name == NULL)
2474 				goto out_free_sock_slab;
2475 
2476 			prot->rsk_prot->slab = kmem_cache_create(prot->rsk_prot->slab_name,
2477 								 prot->rsk_prot->obj_size, 0,
2478 								 SLAB_HWCACHE_ALIGN, NULL);
2479 
2480 			if (prot->rsk_prot->slab == NULL) {
2481 				printk(KERN_CRIT "%s: Can't create request sock SLAB cache!\n",
2482 				       prot->name);
2483 				goto out_free_request_sock_slab_name;
2484 			}
2485 		}
2486 
2487 		if (prot->twsk_prot != NULL) {
2488 			prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);
2489 
2490 			if (prot->twsk_prot->twsk_slab_name == NULL)
2491 				goto out_free_request_sock_slab;
2492 
2493 			prot->twsk_prot->twsk_slab =
2494 				kmem_cache_create(prot->twsk_prot->twsk_slab_name,
2495 						  prot->twsk_prot->twsk_obj_size,
2496 						  0,
2497 						  SLAB_HWCACHE_ALIGN |
2498 							prot->slab_flags,
2499 						  NULL);
2500 			if (prot->twsk_prot->twsk_slab == NULL)
2501 				goto out_free_timewait_sock_slab_name;
2502 		}
2503 	}
2504 
2505 	mutex_lock(&proto_list_mutex);
2506 	list_add(&prot->node, &proto_list);
2507 	assign_proto_idx(prot);
2508 	mutex_unlock(&proto_list_mutex);
2509 	return 0;
2510 
2511 out_free_timewait_sock_slab_name:
2512 	kfree(prot->twsk_prot->twsk_slab_name);
2513 out_free_request_sock_slab:
2514 	if (prot->rsk_prot && prot->rsk_prot->slab) {
2515 		kmem_cache_destroy(prot->rsk_prot->slab);
2516 		prot->rsk_prot->slab = NULL;
2517 	}
2518 out_free_request_sock_slab_name:
2519 	if (prot->rsk_prot)
2520 		kfree(prot->rsk_prot->slab_name);
2521 out_free_sock_slab:
2522 	kmem_cache_destroy(prot->slab);
2523 	prot->slab = NULL;
2524 out:
2525 	return -ENOBUFS;
2526 }
2527 EXPORT_SYMBOL(proto_register);
2528 
2529 void proto_unregister(struct proto *prot)
2530 {
2531 	mutex_lock(&proto_list_mutex);
2532 	release_proto_idx(prot);
2533 	list_del(&prot->node);
2534 	mutex_unlock(&proto_list_mutex);
2535 
2536 	if (prot->slab != NULL) {
2537 		kmem_cache_destroy(prot->slab);
2538 		prot->slab = NULL;
2539 	}
2540 
2541 	if (prot->rsk_prot != NULL && prot->rsk_prot->slab != NULL) {
2542 		kmem_cache_destroy(prot->rsk_prot->slab);
2543 		kfree(prot->rsk_prot->slab_name);
2544 		prot->rsk_prot->slab = NULL;
2545 	}
2546 
2547 	if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
2548 		kmem_cache_destroy(prot->twsk_prot->twsk_slab);
2549 		kfree(prot->twsk_prot->twsk_slab_name);
2550 		prot->twsk_prot->twsk_slab = NULL;
2551 	}
2552 }
2553 EXPORT_SYMBOL(proto_unregister);
2554 
2555 #ifdef CONFIG_PROC_FS
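/*
 * /proc/net/protocols: one line per registered protocol, walked under
 * proto_list_mutex, showing object sizes, usage counters and which
 * methods each protocol implements (the 'y'/'n' columns).
 */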
2556 static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
2557 	__acquires(proto_list_mutex)
2558 {
2559 	mutex_lock(&proto_list_mutex);
2560 	return seq_list_start_head(&proto_list, *pos);
2561 }
2562 
2563 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2564 {
2565 	return seq_list_next(v, &proto_list, pos);
2566 }
2567 
2568 static void proto_seq_stop(struct seq_file *seq, void *v)
2569 	__releases(proto_list_mutex)
2570 {
2571 	mutex_unlock(&proto_list_mutex);
2572 }
2573 
2574 static char proto_method_implemented(const void *method)
2575 {
2576 	return method == NULL ? 'n' : 'y';
2577 }
2578 static long sock_prot_memory_allocated(struct proto *proto)
2579 {
2580 	return proto->memory_allocated != NULL ? proto_memory_allocated(proto): -1L;
2581 }
2582 
2583 static char *sock_prot_memory_pressure(struct proto *proto)
2584 {
2585 	return proto->memory_pressure != NULL ?
2586 	proto_memory_pressure(proto) ? "yes" : "no" : "NI";
2587 }
2588 
2589 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
2590 {
2591 
2592 	seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
2593 			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
2594 		   proto->name,
2595 		   proto->obj_size,
2596 		   sock_prot_inuse_get(seq_file_net(seq), proto),
2597 		   sock_prot_memory_allocated(proto),
2598 		   sock_prot_memory_pressure(proto),
2599 		   proto->max_header,
2600 		   proto->slab == NULL ? "no" : "yes",
2601 		   module_name(proto->owner),
2602 		   proto_method_implemented(proto->close),
2603 		   proto_method_implemented(proto->connect),
2604 		   proto_method_implemented(proto->disconnect),
2605 		   proto_method_implemented(proto->accept),
2606 		   proto_method_implemented(proto->ioctl),
2607 		   proto_method_implemented(proto->init),
2608 		   proto_method_implemented(proto->destroy),
2609 		   proto_method_implemented(proto->shutdown),
2610 		   proto_method_implemented(proto->setsockopt),
2611 		   proto_method_implemented(proto->getsockopt),
2612 		   proto_method_implemented(proto->sendmsg),
2613 		   proto_method_implemented(proto->recvmsg),
2614 		   proto_method_implemented(proto->sendpage),
2615 		   proto_method_implemented(proto->bind),
2616 		   proto_method_implemented(proto->backlog_rcv),
2617 		   proto_method_implemented(proto->hash),
2618 		   proto_method_implemented(proto->unhash),
2619 		   proto_method_implemented(proto->get_port),
2620 		   proto_method_implemented(proto->enter_memory_pressure));
2621 }
2622 
2623 static int proto_seq_show(struct seq_file *seq, void *v)
2624 {
2625 	if (v == &proto_list)
2626 		seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
2627 			   "protocol",
2628 			   "size",
2629 			   "sockets",
2630 			   "memory",
2631 			   "press",
2632 			   "maxhdr",
2633 			   "slab",
2634 			   "module",
2635 			   "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
2636 	else
2637 		proto_seq_printf(seq, list_entry(v, struct proto, node));
2638 	return 0;
2639 }
2640 
2641 static const struct seq_operations proto_seq_ops = {
2642 	.start  = proto_seq_start,
2643 	.next   = proto_seq_next,
2644 	.stop   = proto_seq_stop,
2645 	.show   = proto_seq_show,
2646 };
2647 
2648 static int proto_seq_open(struct inode *inode, struct file *file)
2649 {
2650 	return seq_open_net(inode, file, &proto_seq_ops,
2651 			    sizeof(struct seq_net_private));
2652 }
2653 
2654 static const struct file_operations proto_seq_fops = {
2655 	.owner		= THIS_MODULE,
2656 	.open		= proto_seq_open,
2657 	.read		= seq_read,
2658 	.llseek		= seq_lseek,
2659 	.release	= seq_release_net,
2660 };
2661 
2662 static __net_init int proto_init_net(struct net *net)
2663 {
2664 	if (!proc_net_fops_create(net, "protocols", S_IRUGO, &proto_seq_fops))
2665 		return -ENOMEM;
2666 
2667 	return 0;
2668 }
2669 
2670 static __net_exit void proto_exit_net(struct net *net)
2671 {
2672 	proc_net_remove(net, "protocols");
2673 }
2674 
2675 
2676 static __net_initdata struct pernet_operations proto_net_ops = {
2677 	.init = proto_init_net,
2678 	.exit = proto_exit_net,
2679 };
2680 
2681 static int __init proto_init(void)
2682 {
2683 	return register_pernet_subsys(&proto_net_ops);
2684 }
2685 
2686 subsys_initcall(proto_init);
2687 
2688 #endif /* CONFIG_PROC_FS */
2689