xref: /openbmc/linux/net/core/sock.c (revision 8a8e04df4747661daaee77e98e102d99c9e09b98)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		Generic socket support routines. Memory allocators, socket lock/release
7  *		handler for protocols to use and generic option handler.
8  *
9  *
10  * Authors:	Ross Biro
11  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *		Florian La Roche, <flla@stud.uni-sb.de>
13  *		Alan Cox, <A.Cox@swansea.ac.uk>
14  *
15  * Fixes:
16  *		Alan Cox	: 	Numerous verify_area() problems
17  *		Alan Cox	:	Connecting on a connecting socket
18  *					now returns an error for tcp.
19  *		Alan Cox	:	sock->protocol is set correctly.
20  *					and is not sometimes left as 0.
21  *		Alan Cox	:	connect handles icmp errors on a
22  *					connect properly. Unfortunately there
23  *					is a restart syscall nasty there. I
24  *					can't match BSD without hacking the C
25  *					library. Ideas urgently sought!
26  *		Alan Cox	:	Disallow bind() to addresses that are
27  *					not ours - especially broadcast ones!!
28  *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
29  *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
30  *					instead they leave that for the DESTROY timer.
31  *		Alan Cox	:	Clean up error flag in accept
32  *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
33  *					was buggy. Put a remove_sock() in the handler
34  *					for memory when we hit 0. Also altered the timer
35  *					code. The ACK stuff can wait and needs major
36  *					TCP layer surgery.
37  *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
38  *					and fixed timer/inet_bh race.
39  *		Alan Cox	:	Added zapped flag for TCP
40  *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
41  *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
42  *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
43  *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
44  *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
45  *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
46  *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
47  *	Pauline Middelink	:	identd support
48  *		Alan Cox	:	Fixed connect() taking signals I think.
49  *		Alan Cox	:	SO_LINGER supported
50  *		Alan Cox	:	Error reporting fixes
51  *		Anonymous	:	inet_create tidied up (sk->reuse setting)
52  *		Alan Cox	:	inet sockets don't set sk->type!
53  *		Alan Cox	:	Split socket option code
54  *		Alan Cox	:	Callbacks
55  *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
56  *		Alex		:	Removed restriction on inet fioctl
57  *		Alan Cox	:	Splitting INET from NET core
58  *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
59  *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
60  *		Alan Cox	:	Split IP from generic code
61  *		Alan Cox	:	New kfree_skbmem()
62  *		Alan Cox	:	Make SO_DEBUG superuser only.
63  *		Alan Cox	:	Allow anyone to clear SO_DEBUG
64  *					(compatibility fix)
65  *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
66  *		Alan Cox	:	Allocator for a socket is settable.
67  *		Alan Cox	:	SO_ERROR includes soft errors.
68  *		Alan Cox	:	Allow NULL arguments on some SO_ opts
69  *		Alan Cox	: 	Generic socket allocation to make hooks
70  *					easier (suggested by Craig Metz).
71  *		Michael Pall	:	SO_ERROR returns positive errno again
72  *              Steve Whitehouse:       Added default destructor to free
73  *                                      protocol private data.
74  *              Steve Whitehouse:       Added various other default routines
75  *                                      common to several socket families.
76  *              Chris Evans     :       Call suser() check last on F_SETOWN
77  *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
78  *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
79  *		Andi Kleen	:	Fix write_space callback
80  *		Chris Evans	:	Security fixes - signedness again
81  *		Arnaldo C. Melo :       cleanups, use skb_queue_purge
82  *
83  * To Fix:
84  *
85  *
86  *		This program is free software; you can redistribute it and/or
87  *		modify it under the terms of the GNU General Public License
88  *		as published by the Free Software Foundation; either version
89  *		2 of the License, or (at your option) any later version.
90  */
91 
92 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
93 
94 #include <linux/capability.h>
95 #include <linux/errno.h>
96 #include <linux/types.h>
97 #include <linux/socket.h>
98 #include <linux/in.h>
99 #include <linux/kernel.h>
100 #include <linux/module.h>
101 #include <linux/proc_fs.h>
102 #include <linux/seq_file.h>
103 #include <linux/sched.h>
104 #include <linux/timer.h>
105 #include <linux/string.h>
106 #include <linux/sockios.h>
107 #include <linux/net.h>
108 #include <linux/mm.h>
109 #include <linux/slab.h>
110 #include <linux/interrupt.h>
111 #include <linux/poll.h>
112 #include <linux/tcp.h>
113 #include <linux/init.h>
114 #include <linux/highmem.h>
115 #include <linux/user_namespace.h>
116 #include <linux/static_key.h>
117 #include <linux/memcontrol.h>
118 #include <linux/prefetch.h>
119 
120 #include <asm/uaccess.h>
121 
122 #include <linux/netdevice.h>
123 #include <net/protocol.h>
124 #include <linux/skbuff.h>
125 #include <net/net_namespace.h>
126 #include <net/request_sock.h>
127 #include <net/sock.h>
128 #include <linux/net_tstamp.h>
129 #include <net/xfrm.h>
130 #include <linux/ipsec.h>
131 #include <net/cls_cgroup.h>
132 #include <net/netprio_cgroup.h>
133 
134 #include <linux/filter.h>
135 
136 #include <trace/events/sock.h>
137 
138 #ifdef CONFIG_INET
139 #include <net/tcp.h>
140 #endif
141 
142 static DEFINE_MUTEX(proto_list_mutex);
143 static LIST_HEAD(proto_list);
144 
145 #ifdef CONFIG_MEMCG_KMEM
146 int mem_cgroup_sockets_init(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
147 {
148 	struct proto *proto;
149 	int ret = 0;
150 
151 	mutex_lock(&proto_list_mutex);
152 	list_for_each_entry(proto, &proto_list, node) {
153 		if (proto->init_cgroup) {
154 			ret = proto->init_cgroup(memcg, ss);
155 			if (ret)
156 				goto out;
157 		}
158 	}
159 
160 	mutex_unlock(&proto_list_mutex);
161 	return ret;
162 out:
163 	list_for_each_entry_continue_reverse(proto, &proto_list, node)
164 		if (proto->destroy_cgroup)
165 			proto->destroy_cgroup(memcg);
166 	mutex_unlock(&proto_list_mutex);
167 	return ret;
168 }
169 
170 void mem_cgroup_sockets_destroy(struct mem_cgroup *memcg)
171 {
172 	struct proto *proto;
173 
174 	mutex_lock(&proto_list_mutex);
175 	list_for_each_entry_reverse(proto, &proto_list, node)
176 		if (proto->destroy_cgroup)
177 			proto->destroy_cgroup(memcg);
178 	mutex_unlock(&proto_list_mutex);
179 }
180 #endif
181 
182 /*
183  * Each address family might have different locking rules, so we have
184  * one slock key per address family:
185  */
186 static struct lock_class_key af_family_keys[AF_MAX];
187 static struct lock_class_key af_family_slock_keys[AF_MAX];
188 
189 struct static_key memcg_socket_limit_enabled;
190 EXPORT_SYMBOL(memcg_socket_limit_enabled);
191 
192 /*
193  * Make lock validator output more readable. (we pre-construct these
194  * strings at build time, so that runtime initialization of socket
195  * locks is fast):
196  */
197 static const char *const af_family_key_strings[AF_MAX+1] = {
198   "sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX"     , "sk_lock-AF_INET"     ,
199   "sk_lock-AF_AX25"  , "sk_lock-AF_IPX"      , "sk_lock-AF_APPLETALK",
200   "sk_lock-AF_NETROM", "sk_lock-AF_BRIDGE"   , "sk_lock-AF_ATMPVC"   ,
201   "sk_lock-AF_X25"   , "sk_lock-AF_INET6"    , "sk_lock-AF_ROSE"     ,
202   "sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI"  , "sk_lock-AF_SECURITY" ,
203   "sk_lock-AF_KEY"   , "sk_lock-AF_NETLINK"  , "sk_lock-AF_PACKET"   ,
204   "sk_lock-AF_ASH"   , "sk_lock-AF_ECONET"   , "sk_lock-AF_ATMSVC"   ,
205   "sk_lock-AF_RDS"   , "sk_lock-AF_SNA"      , "sk_lock-AF_IRDA"     ,
206   "sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE"  , "sk_lock-AF_LLC"      ,
207   "sk_lock-27"       , "sk_lock-28"          , "sk_lock-AF_CAN"      ,
208   "sk_lock-AF_TIPC"  , "sk_lock-AF_BLUETOOTH", "sk_lock-AF_IUCV"     ,
209   "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN"     , "sk_lock-AF_PHONET"   ,
210   "sk_lock-AF_IEEE802154", "sk_lock-AF_CAIF" , "sk_lock-AF_ALG"      ,
211   "sk_lock-AF_NFC"   , "sk_lock-AF_MAX"
212 };
213 static const char *const af_family_slock_key_strings[AF_MAX+1] = {
214   "slock-AF_UNSPEC", "slock-AF_UNIX"     , "slock-AF_INET"     ,
215   "slock-AF_AX25"  , "slock-AF_IPX"      , "slock-AF_APPLETALK",
216   "slock-AF_NETROM", "slock-AF_BRIDGE"   , "slock-AF_ATMPVC"   ,
217   "slock-AF_X25"   , "slock-AF_INET6"    , "slock-AF_ROSE"     ,
218   "slock-AF_DECnet", "slock-AF_NETBEUI"  , "slock-AF_SECURITY" ,
219   "slock-AF_KEY"   , "slock-AF_NETLINK"  , "slock-AF_PACKET"   ,
220   "slock-AF_ASH"   , "slock-AF_ECONET"   , "slock-AF_ATMSVC"   ,
221   "slock-AF_RDS"   , "slock-AF_SNA"      , "slock-AF_IRDA"     ,
222   "slock-AF_PPPOX" , "slock-AF_WANPIPE"  , "slock-AF_LLC"      ,
223   "slock-27"       , "slock-28"          , "slock-AF_CAN"      ,
224   "slock-AF_TIPC"  , "slock-AF_BLUETOOTH", "slock-AF_IUCV"     ,
225   "slock-AF_RXRPC" , "slock-AF_ISDN"     , "slock-AF_PHONET"   ,
226   "slock-AF_IEEE802154", "slock-AF_CAIF" , "slock-AF_ALG"      ,
227   "slock-AF_NFC"   , "slock-AF_MAX"
228 };
229 static const char *const af_family_clock_key_strings[AF_MAX+1] = {
230   "clock-AF_UNSPEC", "clock-AF_UNIX"     , "clock-AF_INET"     ,
231   "clock-AF_AX25"  , "clock-AF_IPX"      , "clock-AF_APPLETALK",
232   "clock-AF_NETROM", "clock-AF_BRIDGE"   , "clock-AF_ATMPVC"   ,
233   "clock-AF_X25"   , "clock-AF_INET6"    , "clock-AF_ROSE"     ,
234   "clock-AF_DECnet", "clock-AF_NETBEUI"  , "clock-AF_SECURITY" ,
235   "clock-AF_KEY"   , "clock-AF_NETLINK"  , "clock-AF_PACKET"   ,
236   "clock-AF_ASH"   , "clock-AF_ECONET"   , "clock-AF_ATMSVC"   ,
237   "clock-AF_RDS"   , "clock-AF_SNA"      , "clock-AF_IRDA"     ,
238   "clock-AF_PPPOX" , "clock-AF_WANPIPE"  , "clock-AF_LLC"      ,
239   "clock-27"       , "clock-28"          , "clock-AF_CAN"      ,
240   "clock-AF_TIPC"  , "clock-AF_BLUETOOTH", "clock-AF_IUCV"     ,
241   "clock-AF_RXRPC" , "clock-AF_ISDN"     , "clock-AF_PHONET"   ,
242   "clock-AF_IEEE802154", "clock-AF_CAIF" , "clock-AF_ALG"      ,
243   "clock-AF_NFC"   , "clock-AF_MAX"
244 };
245 
246 /*
247  * sk_callback_lock locking rules are per-address-family,
248  * so split the lock classes by using a per-AF key:
249  */
250 static struct lock_class_key af_callback_keys[AF_MAX];
251 
252 /* Take into consideration the size of the struct sk_buff overhead in the
253  * determination of these values, since that is non-constant across
254  * platforms.  This makes socket queueing behavior and performance
255  * not depend upon such differences.
256  */
257 #define _SK_MEM_PACKETS		256
258 #define _SK_MEM_OVERHEAD	SKB_TRUESIZE(256)
259 #define SK_WMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
260 #define SK_RMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
261 
262 /* Run time adjustable parameters. */
263 __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
264 EXPORT_SYMBOL(sysctl_wmem_max);
265 __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
266 EXPORT_SYMBOL(sysctl_rmem_max);
267 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
268 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
269 
270 /* Maximal space eaten by iovec or ancillary data plus some space */
271 int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
272 EXPORT_SYMBOL(sysctl_optmem_max);
273 
274 struct static_key memalloc_socks = STATIC_KEY_INIT_FALSE;
275 EXPORT_SYMBOL_GPL(memalloc_socks);
276 
277 /**
278  * sk_set_memalloc - sets %SOCK_MEMALLOC
279  * @sk: socket to set it on
280  *
281  * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
282  * It's the responsibility of the admin to adjust min_free_kbytes
283  * to meet the requirements.
284  */
285 void sk_set_memalloc(struct sock *sk)
286 {
287 	sock_set_flag(sk, SOCK_MEMALLOC);
288 	sk->sk_allocation |= __GFP_MEMALLOC;
289 	static_key_slow_inc(&memalloc_socks);
290 }
291 EXPORT_SYMBOL_GPL(sk_set_memalloc);
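
/*
 * Example (kernel side, illustrative): a transport that carries swap I/O
 * over the network might mark its socket so allocations on its behalf may
 * dip into the memalloc reserves. my_swap_transport_start() is an assumed
 * helper name; callers are expected to pair it with sk_clear_memalloc() on
 * teardown.
 *
 *	static void my_swap_transport_start(struct socket *sock)
 *	{
 *		sk_set_memalloc(sock->sk);
 *	}
 */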
292 
293 void sk_clear_memalloc(struct sock *sk)
294 {
295 	sock_reset_flag(sk, SOCK_MEMALLOC);
296 	sk->sk_allocation &= ~__GFP_MEMALLOC;
297 	static_key_slow_dec(&memalloc_socks);
298 
299 	/*
300 	 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
301 	 * progress of swapping. However, if SOCK_MEMALLOC is cleared while
302 	 * it has rmem allocations there is a risk that the user of the
303 	 * socket cannot make forward progress due to exceeding the rmem
304 	 * limits. By rights, sk_clear_memalloc() should only be called
305 	 * on sockets being torn down but warn and reset the accounting if
306 	 * that assumption breaks.
307 	 */
308 	if (WARN_ON(sk->sk_forward_alloc))
309 		sk_mem_reclaim(sk);
310 }
311 EXPORT_SYMBOL_GPL(sk_clear_memalloc);
312 
313 int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
314 {
315 	int ret;
316 	unsigned long pflags = current->flags;
317 
318 	/* these should have been dropped before queueing */
319 	BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
320 
321 	current->flags |= PF_MEMALLOC;
322 	ret = sk->sk_backlog_rcv(sk, skb);
323 	tsk_restore_flags(current, pflags, PF_MEMALLOC);
324 
325 	return ret;
326 }
327 EXPORT_SYMBOL(__sk_backlog_rcv);
328 
329 static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
330 {
331 	struct timeval tv;
332 
333 	if (optlen < sizeof(tv))
334 		return -EINVAL;
335 	if (copy_from_user(&tv, optval, sizeof(tv)))
336 		return -EFAULT;
337 	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
338 		return -EDOM;
339 
340 	if (tv.tv_sec < 0) {
341 		static int warned __read_mostly;
342 
343 		*timeo_p = 0;
344 		if (warned < 10 && net_ratelimit()) {
345 			warned++;
346 			pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
347 				__func__, current->comm, task_pid_nr(current));
348 		}
349 		return 0;
350 	}
351 	*timeo_p = MAX_SCHEDULE_TIMEOUT;
352 	if (tv.tv_sec == 0 && tv.tv_usec == 0)
353 		return 0;
354 	if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
355 		*timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
356 	return 0;
357 }
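
/*
 * Example (userspace, illustrative): the timeval parsed above normally
 * arrives via setsockopt(SO_RCVTIMEO) or setsockopt(SO_SNDTIMEO), e.g. a
 * 2.5 second receive timeout:
 *
 *	struct timeval tv = { .tv_sec = 2, .tv_usec = 500000 };
 *
 *	setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv));
 *
 * A zero timeval means "wait forever" (MAX_SCHEDULE_TIMEOUT above), and a
 * tv_usec outside [0, USEC_PER_SEC) is rejected with -EDOM.
 */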
358 
359 static void sock_warn_obsolete_bsdism(const char *name)
360 {
361 	static int warned;
362 	static char warncomm[TASK_COMM_LEN];
363 	if (strcmp(warncomm, current->comm) && warned < 5) {
364 		strcpy(warncomm,  current->comm);
365 		pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n",
366 			warncomm, name);
367 		warned++;
368 	}
369 }
370 
371 #define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE))
372 
373 static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
374 {
375 	if (sk->sk_flags & flags) {
376 		sk->sk_flags &= ~flags;
377 		if (!(sk->sk_flags & SK_FLAGS_TIMESTAMP))
378 			net_disable_timestamp();
379 	}
380 }
381 
382 
383 int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
384 {
385 	int err;
386 	int skb_len;
387 	unsigned long flags;
388 	struct sk_buff_head *list = &sk->sk_receive_queue;
389 
390 	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
391 		atomic_inc(&sk->sk_drops);
392 		trace_sock_rcvqueue_full(sk, skb);
393 		return -ENOMEM;
394 	}
395 
396 	err = sk_filter(sk, skb);
397 	if (err)
398 		return err;
399 
400 	if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
401 		atomic_inc(&sk->sk_drops);
402 		return -ENOBUFS;
403 	}
404 
405 	skb->dev = NULL;
406 	skb_set_owner_r(skb, sk);
407 
408 	/* Cache the SKB length before we tack it onto the receive
409 	 * queue.  Once it is added it no longer belongs to us and
410 	 * may be freed by other threads of control pulling packets
411 	 * from the queue.
412 	 */
413 	skb_len = skb->len;
414 
415 	/* we escape from the RCU-protected region; make sure we don't leak
416 	 * a non-refcounted dst
417 	 */
418 	skb_dst_force(skb);
419 
420 	spin_lock_irqsave(&list->lock, flags);
421 	skb->dropcount = atomic_read(&sk->sk_drops);
422 	__skb_queue_tail(list, skb);
423 	spin_unlock_irqrestore(&list->lock, flags);
424 
425 	if (!sock_flag(sk, SOCK_DEAD))
426 		sk->sk_data_ready(sk, skb_len);
427 	return 0;
428 }
429 EXPORT_SYMBOL(sock_queue_rcv_skb);
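
/*
 * Example (protocol side, illustrative): a datagram protocol's delivery
 * path typically hands the skb over like this and must free it itself when
 * queueing fails, since ownership only transfers on success:
 *
 *	err = sock_queue_rcv_skb(sk, skb);
 *	if (err < 0) {
 *		kfree_skb(skb);
 *		return err;
 *	}
 */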
430 
431 int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested)
432 {
433 	int rc = NET_RX_SUCCESS;
434 
435 	if (sk_filter(sk, skb))
436 		goto discard_and_relse;
437 
438 	skb->dev = NULL;
439 
440 	if (sk_rcvqueues_full(sk, skb, sk->sk_rcvbuf)) {
441 		atomic_inc(&sk->sk_drops);
442 		goto discard_and_relse;
443 	}
444 	if (nested)
445 		bh_lock_sock_nested(sk);
446 	else
447 		bh_lock_sock(sk);
448 	if (!sock_owned_by_user(sk)) {
449 		/*
450 		 * trylock + unlock semantics:
451 		 */
452 		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
453 
454 		rc = sk_backlog_rcv(sk, skb);
455 
456 		mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
457 	} else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
458 		bh_unlock_sock(sk);
459 		atomic_inc(&sk->sk_drops);
460 		goto discard_and_relse;
461 	}
462 
463 	bh_unlock_sock(sk);
464 out:
465 	sock_put(sk);
466 	return rc;
467 discard_and_relse:
468 	kfree_skb(skb);
469 	goto out;
470 }
471 EXPORT_SYMBOL(sk_receive_skb);
472 
473 void sk_reset_txq(struct sock *sk)
474 {
475 	sk_tx_queue_clear(sk);
476 }
477 EXPORT_SYMBOL(sk_reset_txq);
478 
479 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
480 {
481 	struct dst_entry *dst = __sk_dst_get(sk);
482 
483 	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
484 		sk_tx_queue_clear(sk);
485 		RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
486 		dst_release(dst);
487 		return NULL;
488 	}
489 
490 	return dst;
491 }
492 EXPORT_SYMBOL(__sk_dst_check);
493 
494 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
495 {
496 	struct dst_entry *dst = sk_dst_get(sk);
497 
498 	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
499 		sk_dst_reset(sk);
500 		dst_release(dst);
501 		return NULL;
502 	}
503 
504 	return dst;
505 }
506 EXPORT_SYMBOL(sk_dst_check);
507 
508 static int sock_bindtodevice(struct sock *sk, char __user *optval, int optlen)
509 {
510 	int ret = -ENOPROTOOPT;
511 #ifdef CONFIG_NETDEVICES
512 	struct net *net = sock_net(sk);
513 	char devname[IFNAMSIZ];
514 	int index;
515 
516 	/* Sorry... */
517 	ret = -EPERM;
518 	if (!capable(CAP_NET_RAW))
519 		goto out;
520 
521 	ret = -EINVAL;
522 	if (optlen < 0)
523 		goto out;
524 
525 	/* Bind this socket to a particular device like "eth0",
526 	 * as specified in the passed interface name. If the
527 	 * name is "" or the option length is zero the socket
528 	 * is not bound.
529 	 */
530 	if (optlen > IFNAMSIZ - 1)
531 		optlen = IFNAMSIZ - 1;
532 	memset(devname, 0, sizeof(devname));
533 
534 	ret = -EFAULT;
535 	if (copy_from_user(devname, optval, optlen))
536 		goto out;
537 
538 	index = 0;
539 	if (devname[0] != '\0') {
540 		struct net_device *dev;
541 
542 		rcu_read_lock();
543 		dev = dev_get_by_name_rcu(net, devname);
544 		if (dev)
545 			index = dev->ifindex;
546 		rcu_read_unlock();
547 		ret = -ENODEV;
548 		if (!dev)
549 			goto out;
550 	}
551 
552 	lock_sock(sk);
553 	sk->sk_bound_dev_if = index;
554 	sk_dst_reset(sk);
555 	release_sock(sk);
556 
557 	ret = 0;
558 
559 out:
560 #endif
561 
562 	return ret;
563 }
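
/*
 * Example (userspace, illustrative): binding a socket to one interface.
 * The caller needs CAP_NET_RAW, and an empty name clears the binding:
 *
 *	setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, "eth0", sizeof("eth0"));
 */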
564 
565 static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
566 {
567 	if (valbool)
568 		sock_set_flag(sk, bit);
569 	else
570 		sock_reset_flag(sk, bit);
571 }
572 
573 /*
574  *	This is meant for all protocols to use and covers goings on
575  *	at the socket level. Everything here is generic.
576  */
577 
578 int sock_setsockopt(struct socket *sock, int level, int optname,
579 		    char __user *optval, unsigned int optlen)
580 {
581 	struct sock *sk = sock->sk;
582 	int val;
583 	int valbool;
584 	struct linger ling;
585 	int ret = 0;
586 
587 	/*
588 	 *	Options without arguments
589 	 */
590 
591 	if (optname == SO_BINDTODEVICE)
592 		return sock_bindtodevice(sk, optval, optlen);
593 
594 	if (optlen < sizeof(int))
595 		return -EINVAL;
596 
597 	if (get_user(val, (int __user *)optval))
598 		return -EFAULT;
599 
600 	valbool = val ? 1 : 0;
601 
602 	lock_sock(sk);
603 
604 	switch (optname) {
605 	case SO_DEBUG:
606 		if (val && !capable(CAP_NET_ADMIN))
607 			ret = -EACCES;
608 		else
609 			sock_valbool_flag(sk, SOCK_DBG, valbool);
610 		break;
611 	case SO_REUSEADDR:
612 		sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
613 		break;
614 	case SO_TYPE:
615 	case SO_PROTOCOL:
616 	case SO_DOMAIN:
617 	case SO_ERROR:
618 		ret = -ENOPROTOOPT;
619 		break;
620 	case SO_DONTROUTE:
621 		sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
622 		break;
623 	case SO_BROADCAST:
624 		sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
625 		break;
626 	case SO_SNDBUF:
627 		/* Don't error on this; BSD doesn't, and if you think
628 		 * about it this is right. Otherwise apps have to
629 		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
630 		 * are treated in BSD as hints.
631 		 */
632 		val = min_t(u32, val, sysctl_wmem_max);
633 set_sndbuf:
634 		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
635 		sk->sk_sndbuf = max_t(u32, val * 2, SOCK_MIN_SNDBUF);
636 		/* Wake up sending tasks if we upped the value. */
637 		sk->sk_write_space(sk);
638 		break;
639 
640 	case SO_SNDBUFFORCE:
641 		if (!capable(CAP_NET_ADMIN)) {
642 			ret = -EPERM;
643 			break;
644 		}
645 		goto set_sndbuf;
646 
647 	case SO_RCVBUF:
648 		/* Don't error on this; BSD doesn't, and if you think
649 		 * about it this is right. Otherwise apps have to
650 		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
651 		 * are treated in BSD as hints.
652 		 */
653 		val = min_t(u32, val, sysctl_rmem_max);
654 set_rcvbuf:
655 		sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
656 		/*
657 		 * We double it on the way in to account for
658 		 * "struct sk_buff" etc. overhead.   Applications
659 		 * assume that the SO_RCVBUF setting they make will
660 		 * allow that much actual data to be received on that
661 		 * socket.
662 		 *
663 		 * Applications are unaware that "struct sk_buff" and
664 		 * other overheads allocate from the receive buffer
665 		 * during socket buffer allocation.
666 		 *
667 		 * And after considering the possible alternatives,
668 		 * returning the value we actually used in getsockopt
669 		 * is the most desirable behavior.
670 		 */
671 		sk->sk_rcvbuf = max_t(u32, val * 2, SOCK_MIN_RCVBUF);
672 		break;
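		/*
		 * Example (userspace, illustrative) of the doubling described
		 * above: asking for 64 KiB reports roughly twice that back,
		 * capped by net.core.rmem_max unless SO_RCVBUFFORCE is used.
		 *
		 *	int val = 65536, out;
		 *	socklen_t len = sizeof(out);
		 *
		 *	setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &val, sizeof(val));
		 *	getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &out, &len);
		 */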
673 
674 	case SO_RCVBUFFORCE:
675 		if (!capable(CAP_NET_ADMIN)) {
676 			ret = -EPERM;
677 			break;
678 		}
679 		goto set_rcvbuf;
680 
681 	case SO_KEEPALIVE:
682 #ifdef CONFIG_INET
683 		if (sk->sk_protocol == IPPROTO_TCP)
684 			tcp_set_keepalive(sk, valbool);
685 #endif
686 		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
687 		break;
688 
689 	case SO_OOBINLINE:
690 		sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
691 		break;
692 
693 	case SO_NO_CHECK:
694 		sk->sk_no_check = valbool;
695 		break;
696 
697 	case SO_PRIORITY:
698 		if ((val >= 0 && val <= 6) || capable(CAP_NET_ADMIN))
699 			sk->sk_priority = val;
700 		else
701 			ret = -EPERM;
702 		break;
703 
704 	case SO_LINGER:
705 		if (optlen < sizeof(ling)) {
706 			ret = -EINVAL;	/* 1003.1g */
707 			break;
708 		}
709 		if (copy_from_user(&ling, optval, sizeof(ling))) {
710 			ret = -EFAULT;
711 			break;
712 		}
713 		if (!ling.l_onoff)
714 			sock_reset_flag(sk, SOCK_LINGER);
715 		else {
716 #if (BITS_PER_LONG == 32)
717 			if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
718 				sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
719 			else
720 #endif
721 				sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
722 			sock_set_flag(sk, SOCK_LINGER);
723 		}
724 		break;
725 
726 	case SO_BSDCOMPAT:
727 		sock_warn_obsolete_bsdism("setsockopt");
728 		break;
729 
730 	case SO_PASSCRED:
731 		if (valbool)
732 			set_bit(SOCK_PASSCRED, &sock->flags);
733 		else
734 			clear_bit(SOCK_PASSCRED, &sock->flags);
735 		break;
736 
737 	case SO_TIMESTAMP:
738 	case SO_TIMESTAMPNS:
739 		if (valbool)  {
740 			if (optname == SO_TIMESTAMP)
741 				sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
742 			else
743 				sock_set_flag(sk, SOCK_RCVTSTAMPNS);
744 			sock_set_flag(sk, SOCK_RCVTSTAMP);
745 			sock_enable_timestamp(sk, SOCK_TIMESTAMP);
746 		} else {
747 			sock_reset_flag(sk, SOCK_RCVTSTAMP);
748 			sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
749 		}
750 		break;
751 
752 	case SO_TIMESTAMPING:
753 		if (val & ~SOF_TIMESTAMPING_MASK) {
754 			ret = -EINVAL;
755 			break;
756 		}
757 		sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE,
758 				  val & SOF_TIMESTAMPING_TX_HARDWARE);
759 		sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE,
760 				  val & SOF_TIMESTAMPING_TX_SOFTWARE);
761 		sock_valbool_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE,
762 				  val & SOF_TIMESTAMPING_RX_HARDWARE);
763 		if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
764 			sock_enable_timestamp(sk,
765 					      SOCK_TIMESTAMPING_RX_SOFTWARE);
766 		else
767 			sock_disable_timestamp(sk,
768 					       (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
769 		sock_valbool_flag(sk, SOCK_TIMESTAMPING_SOFTWARE,
770 				  val & SOF_TIMESTAMPING_SOFTWARE);
771 		sock_valbool_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE,
772 				  val & SOF_TIMESTAMPING_SYS_HARDWARE);
773 		sock_valbool_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE,
774 				  val & SOF_TIMESTAMPING_RAW_HARDWARE);
775 		break;
776 
777 	case SO_RCVLOWAT:
778 		if (val < 0)
779 			val = INT_MAX;
780 		sk->sk_rcvlowat = val ? : 1;
781 		break;
782 
783 	case SO_RCVTIMEO:
784 		ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
785 		break;
786 
787 	case SO_SNDTIMEO:
788 		ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
789 		break;
790 
791 	case SO_ATTACH_FILTER:
792 		ret = -EINVAL;
793 		if (optlen == sizeof(struct sock_fprog)) {
794 			struct sock_fprog fprog;
795 
796 			ret = -EFAULT;
797 			if (copy_from_user(&fprog, optval, sizeof(fprog)))
798 				break;
799 
800 			ret = sk_attach_filter(&fprog, sk);
801 		}
802 		break;
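		/*
		 * Example (userspace, illustrative): attaching a minimal
		 * classic BPF program that accepts every packet; the
		 * sock_fprog layout matches the copy_from_user() above.
		 *
		 *	struct sock_filter code[] = {
		 *		BPF_STMT(BPF_RET | BPF_K, 0xffffffff),
		 *	};
		 *	struct sock_fprog prog = { .len = 1, .filter = code };
		 *
		 *	setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER,
		 *		   &prog, sizeof(prog));
		 */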
803 
804 	case SO_DETACH_FILTER:
805 		ret = sk_detach_filter(sk);
806 		break;
807 
808 	case SO_PASSSEC:
809 		if (valbool)
810 			set_bit(SOCK_PASSSEC, &sock->flags);
811 		else
812 			clear_bit(SOCK_PASSSEC, &sock->flags);
813 		break;
814 	case SO_MARK:
815 		if (!capable(CAP_NET_ADMIN))
816 			ret = -EPERM;
817 		else
818 			sk->sk_mark = val;
819 		break;
820 
821 		/* We implement the SO_SNDLOWAT etc. options as
822 		   not settable (1003.1g 5.3) */
823 	case SO_RXQ_OVFL:
824 		sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
825 		break;
826 
827 	case SO_WIFI_STATUS:
828 		sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
829 		break;
830 
831 	case SO_PEEK_OFF:
832 		if (sock->ops->set_peek_off)
833 			sock->ops->set_peek_off(sk, val);
834 		else
835 			ret = -EOPNOTSUPP;
836 		break;
837 
838 	case SO_NOFCS:
839 		sock_valbool_flag(sk, SOCK_NOFCS, valbool);
840 		break;
841 
842 	default:
843 		ret = -ENOPROTOOPT;
844 		break;
845 	}
846 	release_sock(sk);
847 	return ret;
848 }
849 EXPORT_SYMBOL(sock_setsockopt);
850 
851 
852 void cred_to_ucred(struct pid *pid, const struct cred *cred,
853 		   struct ucred *ucred)
854 {
855 	ucred->pid = pid_vnr(pid);
856 	ucred->uid = ucred->gid = -1;
857 	if (cred) {
858 		struct user_namespace *current_ns = current_user_ns();
859 
860 		ucred->uid = from_kuid(current_ns, cred->euid);
861 		ucred->gid = from_kgid(current_ns, cred->egid);
862 	}
863 }
864 EXPORT_SYMBOL_GPL(cred_to_ucred);
865 
866 int sock_getsockopt(struct socket *sock, int level, int optname,
867 		    char __user *optval, int __user *optlen)
868 {
869 	struct sock *sk = sock->sk;
870 
871 	union {
872 		int val;
873 		struct linger ling;
874 		struct timeval tm;
875 	} v;
876 
877 	int lv = sizeof(int);
878 	int len;
879 
880 	if (get_user(len, optlen))
881 		return -EFAULT;
882 	if (len < 0)
883 		return -EINVAL;
884 
885 	memset(&v, 0, sizeof(v));
886 
887 	switch (optname) {
888 	case SO_DEBUG:
889 		v.val = sock_flag(sk, SOCK_DBG);
890 		break;
891 
892 	case SO_DONTROUTE:
893 		v.val = sock_flag(sk, SOCK_LOCALROUTE);
894 		break;
895 
896 	case SO_BROADCAST:
897 		v.val = sock_flag(sk, SOCK_BROADCAST);
898 		break;
899 
900 	case SO_SNDBUF:
901 		v.val = sk->sk_sndbuf;
902 		break;
903 
904 	case SO_RCVBUF:
905 		v.val = sk->sk_rcvbuf;
906 		break;
907 
908 	case SO_REUSEADDR:
909 		v.val = sk->sk_reuse;
910 		break;
911 
912 	case SO_KEEPALIVE:
913 		v.val = sock_flag(sk, SOCK_KEEPOPEN);
914 		break;
915 
916 	case SO_TYPE:
917 		v.val = sk->sk_type;
918 		break;
919 
920 	case SO_PROTOCOL:
921 		v.val = sk->sk_protocol;
922 		break;
923 
924 	case SO_DOMAIN:
925 		v.val = sk->sk_family;
926 		break;
927 
928 	case SO_ERROR:
929 		v.val = -sock_error(sk);
930 		if (v.val == 0)
931 			v.val = xchg(&sk->sk_err_soft, 0);
932 		break;
933 
934 	case SO_OOBINLINE:
935 		v.val = sock_flag(sk, SOCK_URGINLINE);
936 		break;
937 
938 	case SO_NO_CHECK:
939 		v.val = sk->sk_no_check;
940 		break;
941 
942 	case SO_PRIORITY:
943 		v.val = sk->sk_priority;
944 		break;
945 
946 	case SO_LINGER:
947 		lv		= sizeof(v.ling);
948 		v.ling.l_onoff	= sock_flag(sk, SOCK_LINGER);
949 		v.ling.l_linger	= sk->sk_lingertime / HZ;
950 		break;
951 
952 	case SO_BSDCOMPAT:
953 		sock_warn_obsolete_bsdism("getsockopt");
954 		break;
955 
956 	case SO_TIMESTAMP:
957 		v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
958 				!sock_flag(sk, SOCK_RCVTSTAMPNS);
959 		break;
960 
961 	case SO_TIMESTAMPNS:
962 		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
963 		break;
964 
965 	case SO_TIMESTAMPING:
966 		v.val = 0;
967 		if (sock_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE))
968 			v.val |= SOF_TIMESTAMPING_TX_HARDWARE;
969 		if (sock_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE))
970 			v.val |= SOF_TIMESTAMPING_TX_SOFTWARE;
971 		if (sock_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE))
972 			v.val |= SOF_TIMESTAMPING_RX_HARDWARE;
973 		if (sock_flag(sk, SOCK_TIMESTAMPING_RX_SOFTWARE))
974 			v.val |= SOF_TIMESTAMPING_RX_SOFTWARE;
975 		if (sock_flag(sk, SOCK_TIMESTAMPING_SOFTWARE))
976 			v.val |= SOF_TIMESTAMPING_SOFTWARE;
977 		if (sock_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE))
978 			v.val |= SOF_TIMESTAMPING_SYS_HARDWARE;
979 		if (sock_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE))
980 			v.val |= SOF_TIMESTAMPING_RAW_HARDWARE;
981 		break;
982 
983 	case SO_RCVTIMEO:
984 		lv = sizeof(struct timeval);
985 		if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
986 			v.tm.tv_sec = 0;
987 			v.tm.tv_usec = 0;
988 		} else {
989 			v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
990 			v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
991 		}
992 		break;
993 
994 	case SO_SNDTIMEO:
995 		lv = sizeof(struct timeval);
996 		if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
997 			v.tm.tv_sec = 0;
998 			v.tm.tv_usec = 0;
999 		} else {
1000 			v.tm.tv_sec = sk->sk_sndtimeo / HZ;
1001 			v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
1002 		}
1003 		break;
1004 
1005 	case SO_RCVLOWAT:
1006 		v.val = sk->sk_rcvlowat;
1007 		break;
1008 
1009 	case SO_SNDLOWAT:
1010 		v.val = 1;
1011 		break;
1012 
1013 	case SO_PASSCRED:
1014 		v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1015 		break;
1016 
1017 	case SO_PEERCRED:
1018 	{
1019 		struct ucred peercred;
1020 		if (len > sizeof(peercred))
1021 			len = sizeof(peercred);
1022 		cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1023 		if (copy_to_user(optval, &peercred, len))
1024 			return -EFAULT;
1025 		goto lenout;
1026 	}
1027 
1028 	case SO_PEERNAME:
1029 	{
1030 		char address[128];
1031 
1032 		if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
1033 			return -ENOTCONN;
1034 		if (lv < len)
1035 			return -EINVAL;
1036 		if (copy_to_user(optval, address, len))
1037 			return -EFAULT;
1038 		goto lenout;
1039 	}
1040 
1041 	/* Dubious BSD thing... Probably nobody even uses it, but
1042 	 * the UNIX standard wants it for whatever reason... -DaveM
1043 	 */
1044 	case SO_ACCEPTCONN:
1045 		v.val = sk->sk_state == TCP_LISTEN;
1046 		break;
1047 
1048 	case SO_PASSSEC:
1049 		v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1050 		break;
1051 
1052 	case SO_PEERSEC:
1053 		return security_socket_getpeersec_stream(sock, optval, optlen, len);
1054 
1055 	case SO_MARK:
1056 		v.val = sk->sk_mark;
1057 		break;
1058 
1059 	case SO_RXQ_OVFL:
1060 		v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1061 		break;
1062 
1063 	case SO_WIFI_STATUS:
1064 		v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1065 		break;
1066 
1067 	case SO_PEEK_OFF:
1068 		if (!sock->ops->set_peek_off)
1069 			return -EOPNOTSUPP;
1070 
1071 		v.val = sk->sk_peek_off;
1072 		break;
1073 	case SO_NOFCS:
1074 		v.val = sock_flag(sk, SOCK_NOFCS);
1075 		break;
1076 	default:
1077 		return -ENOPROTOOPT;
1078 	}
1079 
1080 	if (len > lv)
1081 		len = lv;
1082 	if (copy_to_user(optval, &v, len))
1083 		return -EFAULT;
1084 lenout:
1085 	if (put_user(len, optlen))
1086 		return -EFAULT;
1087 	return 0;
1088 }
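
/*
 * Example (userspace, illustrative): reading the peer's credentials on an
 * AF_UNIX connection via the SO_PEERCRED case above:
 *
 *	struct ucred peer;
 *	socklen_t len = sizeof(peer);
 *
 *	getsockopt(fd, SOL_SOCKET, SO_PEERCRED, &peer, &len);
 *	printf("pid=%d uid=%d gid=%d\n", peer.pid, peer.uid, peer.gid);
 */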
1089 
1090 /*
1091  * Initialize an sk_lock.
1092  *
1093  * (We also register the sk_lock with the lock validator.)
1094  */
1095 static inline void sock_lock_init(struct sock *sk)
1096 {
1097 	sock_lock_init_class_and_name(sk,
1098 			af_family_slock_key_strings[sk->sk_family],
1099 			af_family_slock_keys + sk->sk_family,
1100 			af_family_key_strings[sk->sk_family],
1101 			af_family_keys + sk->sk_family);
1102 }
1103 
1104 /*
1105  * Copy all fields from osk to nsk, but nsk->sk_refcnt must not change yet,
1106  * even temporarily, because of RCU lookups. sk_node should also be left as is.
1107  * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end.
1108  */
1109 static void sock_copy(struct sock *nsk, const struct sock *osk)
1110 {
1111 #ifdef CONFIG_SECURITY_NETWORK
1112 	void *sptr = nsk->sk_security;
1113 #endif
1114 	memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1115 
1116 	memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1117 	       osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1118 
1119 #ifdef CONFIG_SECURITY_NETWORK
1120 	nsk->sk_security = sptr;
1121 	security_sk_clone(osk, nsk);
1122 #endif
1123 }
1124 
1125 /*
1126  * caches using SLAB_DESTROY_BY_RCU should leave the .next pointer of nulls
1127  * nodes unmodified. Special care is taken when initializing the object to zero.
1128  */
1129 static inline void sk_prot_clear_nulls(struct sock *sk, int size)
1130 {
1131 	if (offsetof(struct sock, sk_node.next) != 0)
1132 		memset(sk, 0, offsetof(struct sock, sk_node.next));
1133 	memset(&sk->sk_node.pprev, 0,
1134 	       size - offsetof(struct sock, sk_node.pprev));
1135 }
1136 
1137 void sk_prot_clear_portaddr_nulls(struct sock *sk, int size)
1138 {
1139 	unsigned long nulls1, nulls2;
1140 
1141 	nulls1 = offsetof(struct sock, __sk_common.skc_node.next);
1142 	nulls2 = offsetof(struct sock, __sk_common.skc_portaddr_node.next);
1143 	if (nulls1 > nulls2)
1144 		swap(nulls1, nulls2);
1145 
1146 	if (nulls1 != 0)
1147 		memset((char *)sk, 0, nulls1);
1148 	memset((char *)sk + nulls1 + sizeof(void *), 0,
1149 	       nulls2 - nulls1 - sizeof(void *));
1150 	memset((char *)sk + nulls2 + sizeof(void *), 0,
1151 	       size - nulls2 - sizeof(void *));
1152 }
1153 EXPORT_SYMBOL(sk_prot_clear_portaddr_nulls);
1154 
1155 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1156 		int family)
1157 {
1158 	struct sock *sk;
1159 	struct kmem_cache *slab;
1160 
1161 	slab = prot->slab;
1162 	if (slab != NULL) {
1163 		sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1164 		if (!sk)
1165 			return sk;
1166 		if (priority & __GFP_ZERO) {
1167 			if (prot->clear_sk)
1168 				prot->clear_sk(sk, prot->obj_size);
1169 			else
1170 				sk_prot_clear_nulls(sk, prot->obj_size);
1171 		}
1172 	} else
1173 		sk = kmalloc(prot->obj_size, priority);
1174 
1175 	if (sk != NULL) {
1176 		kmemcheck_annotate_bitfield(sk, flags);
1177 
1178 		if (security_sk_alloc(sk, family, priority))
1179 			goto out_free;
1180 
1181 		if (!try_module_get(prot->owner))
1182 			goto out_free_sec;
1183 		sk_tx_queue_clear(sk);
1184 	}
1185 
1186 	return sk;
1187 
1188 out_free_sec:
1189 	security_sk_free(sk);
1190 out_free:
1191 	if (slab != NULL)
1192 		kmem_cache_free(slab, sk);
1193 	else
1194 		kfree(sk);
1195 	return NULL;
1196 }
1197 
1198 static void sk_prot_free(struct proto *prot, struct sock *sk)
1199 {
1200 	struct kmem_cache *slab;
1201 	struct module *owner;
1202 
1203 	owner = prot->owner;
1204 	slab = prot->slab;
1205 
1206 	security_sk_free(sk);
1207 	if (slab != NULL)
1208 		kmem_cache_free(slab, sk);
1209 	else
1210 		kfree(sk);
1211 	module_put(owner);
1212 }
1213 
1214 #ifdef CONFIG_CGROUPS
1215 #if IS_ENABLED(CONFIG_NET_CLS_CGROUP)
1216 void sock_update_classid(struct sock *sk)
1217 {
1218 	u32 classid;
1219 
1220 	rcu_read_lock();  /* doing current task, which cannot vanish. */
1221 	classid = task_cls_classid(current);
1222 	rcu_read_unlock();
1223 	if (classid && classid != sk->sk_classid)
1224 		sk->sk_classid = classid;
1225 }
1226 EXPORT_SYMBOL(sock_update_classid);
1227 #endif
1228 
1229 #if IS_ENABLED(CONFIG_NETPRIO_CGROUP)
1230 void sock_update_netprioidx(struct sock *sk, struct task_struct *task)
1231 {
1232 	if (in_interrupt())
1233 		return;
1234 
1235 	sk->sk_cgrp_prioidx = task_netprioidx(task);
1236 }
1237 EXPORT_SYMBOL_GPL(sock_update_netprioidx);
1238 #endif
1239 #endif
1240 
1241 /**
1242  *	sk_alloc - All socket objects are allocated here
1243  *	@net: the applicable net namespace
1244  *	@family: protocol family
1245  *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1246  *	@prot: struct proto associated with this new sock instance
1247  */
1248 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1249 		      struct proto *prot)
1250 {
1251 	struct sock *sk;
1252 
1253 	sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1254 	if (sk) {
1255 		sk->sk_family = family;
1256 		/*
1257 		 * See comment in struct sock definition to understand
1258 		 * why we need sk_prot_creator -acme
1259 		 */
1260 		sk->sk_prot = sk->sk_prot_creator = prot;
1261 		sock_lock_init(sk);
1262 		sock_net_set(sk, get_net(net));
1263 		atomic_set(&sk->sk_wmem_alloc, 1);
1264 
1265 		sock_update_classid(sk);
1266 		sock_update_netprioidx(sk, current);
1267 	}
1268 
1269 	return sk;
1270 }
1271 EXPORT_SYMBOL(sk_alloc);
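
/*
 * Example (protocol family side, illustrative): a family's ->create() hook
 * commonly pairs sk_alloc() with sock_init_data() for the generic setup.
 * my_proto is an assumed struct proto and AF_UNIX is only a stand-in family:
 *
 *	sk = sk_alloc(net, AF_UNIX, GFP_KERNEL, &my_proto);
 *	if (!sk)
 *		return -ENOBUFS;
 *	sock_init_data(sock, sk);
 */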
1272 
1273 static void __sk_free(struct sock *sk)
1274 {
1275 	struct sk_filter *filter;
1276 
1277 	if (sk->sk_destruct)
1278 		sk->sk_destruct(sk);
1279 
1280 	filter = rcu_dereference_check(sk->sk_filter,
1281 				       atomic_read(&sk->sk_wmem_alloc) == 0);
1282 	if (filter) {
1283 		sk_filter_uncharge(sk, filter);
1284 		RCU_INIT_POINTER(sk->sk_filter, NULL);
1285 	}
1286 
1287 	sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
1288 
1289 	if (atomic_read(&sk->sk_omem_alloc))
1290 		pr_debug("%s: optmem leakage (%d bytes) detected\n",
1291 			 __func__, atomic_read(&sk->sk_omem_alloc));
1292 
1293 	if (sk->sk_peer_cred)
1294 		put_cred(sk->sk_peer_cred);
1295 	put_pid(sk->sk_peer_pid);
1296 	put_net(sock_net(sk));
1297 	sk_prot_free(sk->sk_prot_creator, sk);
1298 }
1299 
1300 void sk_free(struct sock *sk)
1301 {
1302 	/*
1303 	 * We subtract one from sk_wmem_alloc so we can tell whether
1304 	 * some packets are still in some tx queue.
1305 	 * If the result is not zero, sock_wfree() will call __sk_free(sk) later.
1306 	 */
1307 	if (atomic_dec_and_test(&sk->sk_wmem_alloc))
1308 		__sk_free(sk);
1309 }
1310 EXPORT_SYMBOL(sk_free);
1311 
1312 /*
1313  * The last sock_put should drop the reference to sk->sk_net. It has already
1314  * been dropped in sk_change_net. Taking a reference to the stopping namespace
1315  * is not an option.
1316  * Take a reference to the socket to remove it from the hash while still
1317  * _alive_, and after that destroy it in the context of init_net.
1318  */
1319 void sk_release_kernel(struct sock *sk)
1320 {
1321 	if (sk == NULL || sk->sk_socket == NULL)
1322 		return;
1323 
1324 	sock_hold(sk);
1325 	sock_release(sk->sk_socket);
1326 	release_net(sock_net(sk));
1327 	sock_net_set(sk, get_net(&init_net));
1328 	sock_put(sk);
1329 }
1330 EXPORT_SYMBOL(sk_release_kernel);
1331 
1332 static void sk_update_clone(const struct sock *sk, struct sock *newsk)
1333 {
1334 	if (mem_cgroup_sockets_enabled && sk->sk_cgrp)
1335 		sock_update_memcg(newsk);
1336 }
1337 
1338 /**
1339  *	sk_clone_lock - clone a socket, and lock its clone
1340  *	@sk: the socket to clone
1341  *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1342  *
1343  *	Caller must unlock socket even in error path (bh_unlock_sock(newsk))
1344  */
1345 struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
1346 {
1347 	struct sock *newsk;
1348 
1349 	newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
1350 	if (newsk != NULL) {
1351 		struct sk_filter *filter;
1352 
1353 		sock_copy(newsk, sk);
1354 
1355 		/* SANITY */
1356 		get_net(sock_net(newsk));
1357 		sk_node_init(&newsk->sk_node);
1358 		sock_lock_init(newsk);
1359 		bh_lock_sock(newsk);
1360 		newsk->sk_backlog.head	= newsk->sk_backlog.tail = NULL;
1361 		newsk->sk_backlog.len = 0;
1362 
1363 		atomic_set(&newsk->sk_rmem_alloc, 0);
1364 		/*
1365 		 * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
1366 		 */
1367 		atomic_set(&newsk->sk_wmem_alloc, 1);
1368 		atomic_set(&newsk->sk_omem_alloc, 0);
1369 		skb_queue_head_init(&newsk->sk_receive_queue);
1370 		skb_queue_head_init(&newsk->sk_write_queue);
1371 #ifdef CONFIG_NET_DMA
1372 		skb_queue_head_init(&newsk->sk_async_wait_queue);
1373 #endif
1374 
1375 		spin_lock_init(&newsk->sk_dst_lock);
1376 		rwlock_init(&newsk->sk_callback_lock);
1377 		lockdep_set_class_and_name(&newsk->sk_callback_lock,
1378 				af_callback_keys + newsk->sk_family,
1379 				af_family_clock_key_strings[newsk->sk_family]);
1380 
1381 		newsk->sk_dst_cache	= NULL;
1382 		newsk->sk_wmem_queued	= 0;
1383 		newsk->sk_forward_alloc = 0;
1384 		newsk->sk_send_head	= NULL;
1385 		newsk->sk_userlocks	= sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
1386 
1387 		sock_reset_flag(newsk, SOCK_DONE);
1388 		skb_queue_head_init(&newsk->sk_error_queue);
1389 
1390 		filter = rcu_dereference_protected(newsk->sk_filter, 1);
1391 		if (filter != NULL)
1392 			sk_filter_charge(newsk, filter);
1393 
1394 		if (unlikely(xfrm_sk_clone_policy(newsk))) {
1395 			/* It is still a raw copy of the parent, so invalidate
1396 			 * the destructor and do a plain sk_free() */
1397 			newsk->sk_destruct = NULL;
1398 			bh_unlock_sock(newsk);
1399 			sk_free(newsk);
1400 			newsk = NULL;
1401 			goto out;
1402 		}
1403 
1404 		newsk->sk_err	   = 0;
1405 		newsk->sk_priority = 0;
1406 		/*
1407 		 * Before updating sk_refcnt, we must commit prior changes to memory
1408 		 * (Documentation/RCU/rculist_nulls.txt for details)
1409 		 */
1410 		smp_wmb();
1411 		atomic_set(&newsk->sk_refcnt, 2);
1412 
1413 		/*
1414 		 * Increment the counter in the same struct proto as the master
1415 		 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
1416 		 * is the same as sk->sk_prot->socks, as this field was copied
1417 		 * with memcpy).
1418 		 *
1419 		 * This _changes_ the previous behaviour, where
1420 		 * tcp_create_openreq_child always was incrementing the
1421 		 * equivalent to tcp_prot->socks (inet_sock_nr), so this has
1422 		 * to be taken into account in all callers. -acme
1423 		 */
1424 		sk_refcnt_debug_inc(newsk);
1425 		sk_set_socket(newsk, NULL);
1426 		newsk->sk_wq = NULL;
1427 
1428 		sk_update_clone(sk, newsk);
1429 
1430 		if (newsk->sk_prot->sockets_allocated)
1431 			sk_sockets_allocated_inc(newsk);
1432 
1433 		if (newsk->sk_flags & SK_FLAGS_TIMESTAMP)
1434 			net_enable_timestamp();
1435 	}
1436 out:
1437 	return newsk;
1438 }
1439 EXPORT_SYMBOL_GPL(sk_clone_lock);
1440 
1441 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1442 {
1443 	__sk_dst_set(sk, dst);
1444 	sk->sk_route_caps = dst->dev->features;
1445 	if (sk->sk_route_caps & NETIF_F_GSO)
1446 		sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
1447 	sk->sk_route_caps &= ~sk->sk_route_nocaps;
1448 	if (sk_can_gso(sk)) {
1449 		if (dst->header_len) {
1450 			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1451 		} else {
1452 			sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
1453 			sk->sk_gso_max_size = dst->dev->gso_max_size;
1454 			sk->sk_gso_max_segs = dst->dev->gso_max_segs;
1455 		}
1456 	}
1457 }
1458 EXPORT_SYMBOL_GPL(sk_setup_caps);
1459 
1460 void __init sk_init(void)
1461 {
1462 	if (totalram_pages <= 4096) {
1463 		sysctl_wmem_max = 32767;
1464 		sysctl_rmem_max = 32767;
1465 		sysctl_wmem_default = 32767;
1466 		sysctl_rmem_default = 32767;
1467 	} else if (totalram_pages >= 131072) {
1468 		sysctl_wmem_max = 131071;
1469 		sysctl_rmem_max = 131071;
1470 	}
1471 }
1472 
1473 /*
1474  *	Simple resource managers for sockets.
1475  */
1476 
1477 
1478 /*
1479  * Write buffer destructor automatically called from kfree_skb.
1480  */
1481 void sock_wfree(struct sk_buff *skb)
1482 {
1483 	struct sock *sk = skb->sk;
1484 	unsigned int len = skb->truesize;
1485 
1486 	if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
1487 		/*
1488 		 * Keep a reference on sk_wmem_alloc; it will be released
1489 		 * after the sk_write_space() call
1490 		 */
1491 		atomic_sub(len - 1, &sk->sk_wmem_alloc);
1492 		sk->sk_write_space(sk);
1493 		len = 1;
1494 	}
1495 	/*
1496 	 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
1497 	 * could not do because of in-flight packets
1498 	 */
1499 	if (atomic_sub_and_test(len, &sk->sk_wmem_alloc))
1500 		__sk_free(sk);
1501 }
1502 EXPORT_SYMBOL(sock_wfree);
1503 
1504 /*
1505  * Read buffer destructor automatically called from kfree_skb.
1506  */
1507 void sock_rfree(struct sk_buff *skb)
1508 {
1509 	struct sock *sk = skb->sk;
1510 	unsigned int len = skb->truesize;
1511 
1512 	atomic_sub(len, &sk->sk_rmem_alloc);
1513 	sk_mem_uncharge(sk, len);
1514 }
1515 EXPORT_SYMBOL(sock_rfree);
1516 
1517 void sock_edemux(struct sk_buff *skb)
1518 {
1519 	sock_put(skb->sk);
1520 }
1521 EXPORT_SYMBOL(sock_edemux);
1522 
1523 int sock_i_uid(struct sock *sk)
1524 {
1525 	int uid;
1526 
1527 	read_lock_bh(&sk->sk_callback_lock);
1528 	uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : 0;
1529 	read_unlock_bh(&sk->sk_callback_lock);
1530 	return uid;
1531 }
1532 EXPORT_SYMBOL(sock_i_uid);
1533 
1534 unsigned long sock_i_ino(struct sock *sk)
1535 {
1536 	unsigned long ino;
1537 
1538 	read_lock_bh(&sk->sk_callback_lock);
1539 	ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
1540 	read_unlock_bh(&sk->sk_callback_lock);
1541 	return ino;
1542 }
1543 EXPORT_SYMBOL(sock_i_ino);
1544 
1545 /*
1546  * Allocate a skb from the socket's send buffer.
1547  */
1548 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
1549 			     gfp_t priority)
1550 {
1551 	if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1552 		struct sk_buff *skb = alloc_skb(size, priority);
1553 		if (skb) {
1554 			skb_set_owner_w(skb, sk);
1555 			return skb;
1556 		}
1557 	}
1558 	return NULL;
1559 }
1560 EXPORT_SYMBOL(sock_wmalloc);
1561 
1562 /*
1563  * Allocate a skb from the socket's receive buffer.
1564  */
1565 struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force,
1566 			     gfp_t priority)
1567 {
1568 	if (force || atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
1569 		struct sk_buff *skb = alloc_skb(size, priority);
1570 		if (skb) {
1571 			skb_set_owner_r(skb, sk);
1572 			return skb;
1573 		}
1574 	}
1575 	return NULL;
1576 }
1577 
1578 /*
1579  * Allocate a memory block from the socket's option memory buffer.
1580  */
1581 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
1582 {
1583 	if ((unsigned int)size <= sysctl_optmem_max &&
1584 	    atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
1585 		void *mem;
1586 		/* First do the add, to avoid the race if kmalloc
1587 		 * might sleep.
1588 		 */
1589 		atomic_add(size, &sk->sk_omem_alloc);
1590 		mem = kmalloc(size, priority);
1591 		if (mem)
1592 			return mem;
1593 		atomic_sub(size, &sk->sk_omem_alloc);
1594 	}
1595 	return NULL;
1596 }
1597 EXPORT_SYMBOL(sock_kmalloc);
1598 
1599 /*
1600  * Free an option memory block.
1601  */
1602 void sock_kfree_s(struct sock *sk, void *mem, int size)
1603 {
1604 	kfree(mem);
1605 	atomic_sub(size, &sk->sk_omem_alloc);
1606 }
1607 EXPORT_SYMBOL(sock_kfree_s);
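
/*
 * Example (kernel side, illustrative): option-handling code typically pairs
 * the two helpers above so the allocation stays charged to sk_omem_alloc for
 * its whole lifetime:
 *
 *	buf = sock_kmalloc(sk, len, GFP_KERNEL);
 *	if (!buf)
 *		return -ENOBUFS;
 *	...
 *	sock_kfree_s(sk, buf, len);
 */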
1608 
1609 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
1610    I think these locks should be removed for datagram sockets.
1611  */
1612 static long sock_wait_for_wmem(struct sock *sk, long timeo)
1613 {
1614 	DEFINE_WAIT(wait);
1615 
1616 	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1617 	for (;;) {
1618 		if (!timeo)
1619 			break;
1620 		if (signal_pending(current))
1621 			break;
1622 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1623 		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1624 		if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
1625 			break;
1626 		if (sk->sk_shutdown & SEND_SHUTDOWN)
1627 			break;
1628 		if (sk->sk_err)
1629 			break;
1630 		timeo = schedule_timeout(timeo);
1631 	}
1632 	finish_wait(sk_sleep(sk), &wait);
1633 	return timeo;
1634 }
1635 
1636 
1637 /*
1638  *	Generic send/receive buffer handlers
1639  */
1640 
1641 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
1642 				     unsigned long data_len, int noblock,
1643 				     int *errcode)
1644 {
1645 	struct sk_buff *skb;
1646 	gfp_t gfp_mask;
1647 	long timeo;
1648 	int err;
1649 	int npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
1650 
1651 	err = -EMSGSIZE;
1652 	if (npages > MAX_SKB_FRAGS)
1653 		goto failure;
1654 
1655 	gfp_mask = sk->sk_allocation;
1656 	if (gfp_mask & __GFP_WAIT)
1657 		gfp_mask |= __GFP_REPEAT;
1658 
1659 	timeo = sock_sndtimeo(sk, noblock);
1660 	while (1) {
1661 		err = sock_error(sk);
1662 		if (err != 0)
1663 			goto failure;
1664 
1665 		err = -EPIPE;
1666 		if (sk->sk_shutdown & SEND_SHUTDOWN)
1667 			goto failure;
1668 
1669 		if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1670 			skb = alloc_skb(header_len, gfp_mask);
1671 			if (skb) {
1672 				int i;
1673 
1674 				/* No pages, we're done... */
1675 				if (!data_len)
1676 					break;
1677 
1678 				skb->truesize += data_len;
1679 				skb_shinfo(skb)->nr_frags = npages;
1680 				for (i = 0; i < npages; i++) {
1681 					struct page *page;
1682 
1683 					page = alloc_pages(sk->sk_allocation, 0);
1684 					if (!page) {
1685 						err = -ENOBUFS;
1686 						skb_shinfo(skb)->nr_frags = i;
1687 						kfree_skb(skb);
1688 						goto failure;
1689 					}
1690 
1691 					__skb_fill_page_desc(skb, i,
1692 							page, 0,
1693 							(data_len >= PAGE_SIZE ?
1694 							 PAGE_SIZE :
1695 							 data_len));
1696 					data_len -= PAGE_SIZE;
1697 				}
1698 
1699 				/* Full success... */
1700 				break;
1701 			}
1702 			err = -ENOBUFS;
1703 			goto failure;
1704 		}
1705 		set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1706 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1707 		err = -EAGAIN;
1708 		if (!timeo)
1709 			goto failure;
1710 		if (signal_pending(current))
1711 			goto interrupted;
1712 		timeo = sock_wait_for_wmem(sk, timeo);
1713 	}
1714 
1715 	skb_set_owner_w(skb, sk);
1716 	return skb;
1717 
1718 interrupted:
1719 	err = sock_intr_errno(timeo);
1720 failure:
1721 	*errcode = err;
1722 	return NULL;
1723 }
1724 EXPORT_SYMBOL(sock_alloc_send_pskb);
1725 
1726 struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
1727 				    int noblock, int *errcode)
1728 {
1729 	return sock_alloc_send_pskb(sk, size, 0, noblock, errcode);
1730 }
1731 EXPORT_SYMBOL(sock_alloc_send_skb);
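
/*
 * Example (protocol side, illustrative): a sendmsg() implementation commonly
 * uses this helper so blocking, SO_SNDTIMEO and MSG_DONTWAIT are all handled
 * in one place. hlen stands for whatever headroom the protocol reserves:
 *
 *	skb = sock_alloc_send_skb(sk, len + hlen,
 *				  msg->msg_flags & MSG_DONTWAIT, &err);
 *	if (!skb)
 *		return err;
 *
 * On failure err holds -EAGAIN, -EPIPE, -ERESTARTSYS, etc. as set in the
 * failure paths above.
 */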
1732 
1733 static void __lock_sock(struct sock *sk)
1734 	__releases(&sk->sk_lock.slock)
1735 	__acquires(&sk->sk_lock.slock)
1736 {
1737 	DEFINE_WAIT(wait);
1738 
1739 	for (;;) {
1740 		prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
1741 					TASK_UNINTERRUPTIBLE);
1742 		spin_unlock_bh(&sk->sk_lock.slock);
1743 		schedule();
1744 		spin_lock_bh(&sk->sk_lock.slock);
1745 		if (!sock_owned_by_user(sk))
1746 			break;
1747 	}
1748 	finish_wait(&sk->sk_lock.wq, &wait);
1749 }
1750 
1751 static void __release_sock(struct sock *sk)
1752 	__releases(&sk->sk_lock.slock)
1753 	__acquires(&sk->sk_lock.slock)
1754 {
1755 	struct sk_buff *skb = sk->sk_backlog.head;
1756 
1757 	do {
1758 		sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
1759 		bh_unlock_sock(sk);
1760 
1761 		do {
1762 			struct sk_buff *next = skb->next;
1763 
1764 			prefetch(next);
1765 			WARN_ON_ONCE(skb_dst_is_noref(skb));
1766 			skb->next = NULL;
1767 			sk_backlog_rcv(sk, skb);
1768 
1769 			/*
1770 			 * We are in process context here with softirqs
1771 			 * disabled, use cond_resched_softirq() to preempt.
1772 			 * This is safe to do because we've taken the backlog
1773 			 * queue private:
1774 			 */
1775 			cond_resched_softirq();
1776 
1777 			skb = next;
1778 		} while (skb != NULL);
1779 
1780 		bh_lock_sock(sk);
1781 	} while ((skb = sk->sk_backlog.head) != NULL);
1782 
1783 	/*
1784 	 * Doing the zeroing here guarantees we cannot loop forever
1785 	 * while a wild producer attempts to flood us.
1786 	 */
1787 	sk->sk_backlog.len = 0;
1788 }
1789 
1790 /**
1791  * sk_wait_data - wait for data to arrive at sk_receive_queue
1792  * @sk:    sock to wait on
1793  * @timeo: for how long
1794  *
1795  * Now socket state including sk->sk_err is changed only under the lock,
1796  * hence we may omit checks after joining the wait queue.
1797  * We check the receive queue before schedule() only as an optimization;
1798  * it is very likely that release_sock() added new data.
1799  */
1800 int sk_wait_data(struct sock *sk, long *timeo)
1801 {
1802 	int rc;
1803 	DEFINE_WAIT(wait);
1804 
1805 	prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1806 	set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1807 	rc = sk_wait_event(sk, timeo, !skb_queue_empty(&sk->sk_receive_queue));
1808 	clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1809 	finish_wait(sk_sleep(sk), &wait);
1810 	return rc;
1811 }
1812 EXPORT_SYMBOL(sk_wait_data);
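
/*
 * Example (protocol side, illustrative): a blocking recvmsg() commonly loops
 * on this helper, with the socket locked, while its receive queue is empty:
 *
 *	timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
 *	while (skb_queue_empty(&sk->sk_receive_queue)) {
 *		if (!timeo)
 *			return -EAGAIN;
 *		if (signal_pending(current))
 *			return sock_intr_errno(timeo);
 *		sk_wait_data(sk, &timeo);
 *	}
 */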
1813 
1814 /**
1815  *	__sk_mem_schedule - increase sk_forward_alloc and memory_allocated
1816  *	@sk: socket
1817  *	@size: memory size to allocate
1818  *	@kind: allocation type
1819  *
1820  *	If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
1821  *	rmem allocation. This function assumes that protocols which have
1822  *	memory_pressure use sk_wmem_queued as write buffer accounting.
1823  */
1824 int __sk_mem_schedule(struct sock *sk, int size, int kind)
1825 {
1826 	struct proto *prot = sk->sk_prot;
1827 	int amt = sk_mem_pages(size);
1828 	long allocated;
1829 	int parent_status = UNDER_LIMIT;
1830 
1831 	sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
1832 
1833 	allocated = sk_memory_allocated_add(sk, amt, &parent_status);
1834 
1835 	/* Under limit. */
1836 	if (parent_status == UNDER_LIMIT &&
1837 			allocated <= sk_prot_mem_limits(sk, 0)) {
1838 		sk_leave_memory_pressure(sk);
1839 		return 1;
1840 	}
1841 
1842 	/* Under pressure. (we or our parents) */
1843 	if ((parent_status > SOFT_LIMIT) ||
1844 			allocated > sk_prot_mem_limits(sk, 1))
1845 		sk_enter_memory_pressure(sk);
1846 
1847 	/* Over hard limit (we or our parents) */
1848 	if ((parent_status == OVER_LIMIT) ||
1849 			(allocated > sk_prot_mem_limits(sk, 2)))
1850 		goto suppress_allocation;
1851 
1852 	/* guarantee minimum buffer size under pressure */
1853 	if (kind == SK_MEM_RECV) {
1854 		if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
1855 			return 1;
1856 
1857 	} else { /* SK_MEM_SEND */
1858 		if (sk->sk_type == SOCK_STREAM) {
1859 			if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
1860 				return 1;
1861 		} else if (atomic_read(&sk->sk_wmem_alloc) <
1862 			   prot->sysctl_wmem[0])
1863 				return 1;
1864 	}
1865 
1866 	if (sk_has_memory_pressure(sk)) {
1867 		int alloc;
1868 
1869 		if (!sk_under_memory_pressure(sk))
1870 			return 1;
1871 		alloc = sk_sockets_allocated_read_positive(sk);
1872 		if (sk_prot_mem_limits(sk, 2) > alloc *
1873 		    sk_mem_pages(sk->sk_wmem_queued +
1874 				 atomic_read(&sk->sk_rmem_alloc) +
1875 				 sk->sk_forward_alloc))
1876 			return 1;
1877 	}
1878 
1879 suppress_allocation:
1880 
1881 	if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
1882 		sk_stream_moderate_sndbuf(sk);
1883 
1884 		/* Fail only if socket is _under_ its sndbuf.
1885 		 * In this case we cannot block, so we have to fail.
1886 		 */
1887 		if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
1888 			return 1;
1889 	}
1890 
1891 	trace_sock_exceed_buf_limit(sk, prot, allocated);
1892 
1893 	/* Alas. Undo changes. */
1894 	sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;
1895 
1896 	sk_memory_allocated_sub(sk, amt);
1897 
1898 	return 0;
1899 }
1900 EXPORT_SYMBOL(__sk_mem_schedule);
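
/*
 * Illustrative sketch, not part of the original file: how a stream
 * protocol typically charges send-buffer memory before queueing an skb.
 * sk_wmem_schedule() only calls __sk_mem_schedule(sk, size, SK_MEM_SEND)
 * when sk_forward_alloc cannot already cover the request. The helper
 * name my_proto_queue_for_send() is hypothetical; the socket lock is
 * assumed to be held.
 */
static int my_proto_queue_for_send(struct sock *sk, struct sk_buff *skb)
{
	if (!sk_wmem_schedule(sk, skb->truesize))
		return -ENOBUFS;		/* over the protocol memory limits */

	skb_queue_tail(&sk->sk_write_queue, skb);
	sk->sk_wmem_queued += skb->truesize;	/* write buffer accounting */
	sk_mem_charge(sk, skb->truesize);	/* consume forward_alloc quota */
	return 0;
}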
1901 
1902 /**
1903  *	__sk_mem_reclaim - reclaim memory_allocated
1904  *	@sk: socket
1905  */
1906 void __sk_mem_reclaim(struct sock *sk)
1907 {
1908 	sk_memory_allocated_sub(sk,
1909 				sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT);
1910 	sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1;
1911 
1912 	if (sk_under_memory_pressure(sk) &&
1913 	    (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
1914 		sk_leave_memory_pressure(sk);
1915 }
1916 EXPORT_SYMBOL(__sk_mem_reclaim);
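
/*
 * Illustrative counterpart to the sketch above, not part of the original
 * file: when a queued skb is freed, the quota is returned with
 * sk_mem_uncharge() and sk_mem_reclaim() gives whole SK_MEM_QUANTUM units
 * back to memory_allocated through __sk_mem_reclaim(). The helper name
 * my_proto_unlink_skb() is hypothetical.
 */
static void my_proto_unlink_skb(struct sock *sk, struct sk_buff *skb)
{
	__skb_unlink(skb, &sk->sk_write_queue);
	sk->sk_wmem_queued -= skb->truesize;
	sk_mem_uncharge(sk, skb->truesize);
	__kfree_skb(skb);
	sk_mem_reclaim(sk);		/* may call __sk_mem_reclaim() */
}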
1917 
1918 
1919 /*
1920  * Set of default routines for initialising struct proto_ops when
1921  * the protocol does not support a particular function. In certain
1922  * cases where it makes no sense for a protocol to have a "do nothing"
1923  * function, some default processing is provided.
1924  */
1925 
1926 int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
1927 {
1928 	return -EOPNOTSUPP;
1929 }
1930 EXPORT_SYMBOL(sock_no_bind);
1931 
1932 int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
1933 		    int len, int flags)
1934 {
1935 	return -EOPNOTSUPP;
1936 }
1937 EXPORT_SYMBOL(sock_no_connect);
1938 
1939 int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
1940 {
1941 	return -EOPNOTSUPP;
1942 }
1943 EXPORT_SYMBOL(sock_no_socketpair);
1944 
1945 int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
1946 {
1947 	return -EOPNOTSUPP;
1948 }
1949 EXPORT_SYMBOL(sock_no_accept);
1950 
1951 int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
1952 		    int *len, int peer)
1953 {
1954 	return -EOPNOTSUPP;
1955 }
1956 EXPORT_SYMBOL(sock_no_getname);
1957 
1958 unsigned int sock_no_poll(struct file *file, struct socket *sock, poll_table *pt)
1959 {
1960 	return 0;
1961 }
1962 EXPORT_SYMBOL(sock_no_poll);
1963 
1964 int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
1965 {
1966 	return -EOPNOTSUPP;
1967 }
1968 EXPORT_SYMBOL(sock_no_ioctl);
1969 
1970 int sock_no_listen(struct socket *sock, int backlog)
1971 {
1972 	return -EOPNOTSUPP;
1973 }
1974 EXPORT_SYMBOL(sock_no_listen);
1975 
1976 int sock_no_shutdown(struct socket *sock, int how)
1977 {
1978 	return -EOPNOTSUPP;
1979 }
1980 EXPORT_SYMBOL(sock_no_shutdown);
1981 
1982 int sock_no_setsockopt(struct socket *sock, int level, int optname,
1983 		    char __user *optval, unsigned int optlen)
1984 {
1985 	return -EOPNOTSUPP;
1986 }
1987 EXPORT_SYMBOL(sock_no_setsockopt);
1988 
1989 int sock_no_getsockopt(struct socket *sock, int level, int optname,
1990 		    char __user *optval, int __user *optlen)
1991 {
1992 	return -EOPNOTSUPP;
1993 }
1994 EXPORT_SYMBOL(sock_no_getsockopt);
1995 
1996 int sock_no_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
1997 		    size_t len)
1998 {
1999 	return -EOPNOTSUPP;
2000 }
2001 EXPORT_SYMBOL(sock_no_sendmsg);
2002 
2003 int sock_no_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
2004 		    size_t len, int flags)
2005 {
2006 	return -EOPNOTSUPP;
2007 }
2008 EXPORT_SYMBOL(sock_no_recvmsg);
2009 
2010 int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
2011 {
2012 	/* Mirror missing mmap method error code */
2013 	return -ENODEV;
2014 }
2015 EXPORT_SYMBOL(sock_no_mmap);
2016 
2017 ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
2018 {
2019 	ssize_t res;
2020 	struct msghdr msg = {.msg_flags = flags};
2021 	struct kvec iov;
2022 	char *kaddr = kmap(page);
2023 	iov.iov_base = kaddr + offset;
2024 	iov.iov_len = size;
2025 	res = kernel_sendmsg(sock, &msg, &iov, 1, size);
2026 	kunmap(page);
2027 	return res;
2028 }
2029 EXPORT_SYMBOL(sock_no_sendpage);
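
/*
 * Illustrative sketch, not part of the original file: a minimal datagram
 * family typically points every operation it does not support at the
 * matching sock_no_*() stub, so userspace gets a consistent -EOPNOTSUPP
 * (or -ENODEV for mmap) instead of a NULL method call. The my_*()
 * handlers and the ops name are hypothetical.
 */
static const struct proto_ops my_dgram_ops = {
	.family		= PF_UNSPEC,		/* placeholder family */
	.owner		= THIS_MODULE,
	.release	= my_release,		/* hypothetical */
	.bind		= my_bind,		/* hypothetical */
	.connect	= sock_no_connect,
	.socketpair	= sock_no_socketpair,
	.accept		= sock_no_accept,
	.getname	= my_getname,		/* hypothetical */
	.poll		= datagram_poll,
	.ioctl		= sock_no_ioctl,
	.listen		= sock_no_listen,
	.shutdown	= sock_no_shutdown,
	.setsockopt	= sock_no_setsockopt,
	.getsockopt	= sock_no_getsockopt,
	.sendmsg	= my_sendmsg,		/* hypothetical */
	.recvmsg	= my_recvmsg,		/* hypothetical */
	.mmap		= sock_no_mmap,
	.sendpage	= sock_no_sendpage,
};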
2030 
2031 /*
2032  *	Default Socket Callbacks
2033  */
2034 
2035 static void sock_def_wakeup(struct sock *sk)
2036 {
2037 	struct socket_wq *wq;
2038 
2039 	rcu_read_lock();
2040 	wq = rcu_dereference(sk->sk_wq);
2041 	if (wq_has_sleeper(wq))
2042 		wake_up_interruptible_all(&wq->wait);
2043 	rcu_read_unlock();
2044 }
2045 
2046 static void sock_def_error_report(struct sock *sk)
2047 {
2048 	struct socket_wq *wq;
2049 
2050 	rcu_read_lock();
2051 	wq = rcu_dereference(sk->sk_wq);
2052 	if (wq_has_sleeper(wq))
2053 		wake_up_interruptible_poll(&wq->wait, POLLERR);
2054 	sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
2055 	rcu_read_unlock();
2056 }
2057 
2058 static void sock_def_readable(struct sock *sk, int len)
2059 {
2060 	struct socket_wq *wq;
2061 
2062 	rcu_read_lock();
2063 	wq = rcu_dereference(sk->sk_wq);
2064 	if (wq_has_sleeper(wq))
2065 		wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI |
2066 						POLLRDNORM | POLLRDBAND);
2067 	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
2068 	rcu_read_unlock();
2069 }
2070 
2071 static void sock_def_write_space(struct sock *sk)
2072 {
2073 	struct socket_wq *wq;
2074 
2075 	rcu_read_lock();
2076 
2077 	/* Do not wake up a writer until he can make "significant"
2078 	 * progress.  --DaveM
2079 	 */
2080 	if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
2081 		wq = rcu_dereference(sk->sk_wq);
2082 		if (wq_has_sleeper(wq))
2083 			wake_up_interruptible_sync_poll(&wq->wait, POLLOUT |
2084 						POLLWRNORM | POLLWRBAND);
2085 
2086 		/* Should agree with poll, otherwise some programs break */
2087 		if (sock_writeable(sk))
2088 			sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
2089 	}
2090 
2091 	rcu_read_unlock();
2092 }
2093 
2094 static void sock_def_destruct(struct sock *sk)
2095 {
2096 	kfree(sk->sk_protinfo);
2097 }
2098 
2099 void sk_send_sigurg(struct sock *sk)
2100 {
2101 	if (sk->sk_socket && sk->sk_socket->file)
2102 		if (send_sigurg(&sk->sk_socket->file->f_owner))
2103 			sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
2104 }
2105 EXPORT_SYMBOL(sk_send_sigurg);
2106 
2107 void sk_reset_timer(struct sock *sk, struct timer_list* timer,
2108 		    unsigned long expires)
2109 {
2110 	if (!mod_timer(timer, expires))
2111 		sock_hold(sk);
2112 }
2113 EXPORT_SYMBOL(sk_reset_timer);
2114 
2115 void sk_stop_timer(struct sock *sk, struct timer_list* timer)
2116 {
2117 	if (timer_pending(timer) && del_timer(timer))
2118 		__sock_put(sk);
2119 }
2120 EXPORT_SYMBOL(sk_stop_timer);
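
/*
 * Illustrative sketch, not part of the original file: a protocol timer
 * armed with sk_reset_timer() and cancelled with sk_stop_timer(). The
 * reference taken by sk_reset_timer() is dropped with sock_put() when the
 * handler runs; sk_stop_timer() drops it if the timer is cancelled first.
 * The handler name my_retrans_timer() is hypothetical and would be set up
 * with setup_timer(&sk->sk_timer, my_retrans_timer, (unsigned long)sk).
 */
static void my_retrans_timer(unsigned long data)
{
	struct sock *sk = (struct sock *)data;

	bh_lock_sock(sk);
	if (!sock_owned_by_user(sk)) {
		/* ... retransmit, possibly re-arm with sk_reset_timer() ... */
	}
	bh_unlock_sock(sk);
	sock_put(sk);		/* release the hold taken by sk_reset_timer() */
}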
2121 
2122 void sock_init_data(struct socket *sock, struct sock *sk)
2123 {
2124 	skb_queue_head_init(&sk->sk_receive_queue);
2125 	skb_queue_head_init(&sk->sk_write_queue);
2126 	skb_queue_head_init(&sk->sk_error_queue);
2127 #ifdef CONFIG_NET_DMA
2128 	skb_queue_head_init(&sk->sk_async_wait_queue);
2129 #endif
2130 
2131 	sk->sk_send_head	=	NULL;
2132 
2133 	init_timer(&sk->sk_timer);
2134 
2135 	sk->sk_allocation	=	GFP_KERNEL;
2136 	sk->sk_rcvbuf		=	sysctl_rmem_default;
2137 	sk->sk_sndbuf		=	sysctl_wmem_default;
2138 	sk->sk_state		=	TCP_CLOSE;
2139 	sk_set_socket(sk, sock);
2140 
2141 	sock_set_flag(sk, SOCK_ZAPPED);
2142 
2143 	if (sock) {
2144 		sk->sk_type	=	sock->type;
2145 		sk->sk_wq	=	sock->wq;
2146 		sock->sk	=	sk;
2147 	} else
2148 		sk->sk_wq	=	NULL;
2149 
2150 	spin_lock_init(&sk->sk_dst_lock);
2151 	rwlock_init(&sk->sk_callback_lock);
2152 	lockdep_set_class_and_name(&sk->sk_callback_lock,
2153 			af_callback_keys + sk->sk_family,
2154 			af_family_clock_key_strings[sk->sk_family]);
2155 
2156 	sk->sk_state_change	=	sock_def_wakeup;
2157 	sk->sk_data_ready	=	sock_def_readable;
2158 	sk->sk_write_space	=	sock_def_write_space;
2159 	sk->sk_error_report	=	sock_def_error_report;
2160 	sk->sk_destruct		=	sock_def_destruct;
2161 
2162 	sk->sk_sndmsg_page	=	NULL;
2163 	sk->sk_sndmsg_off	=	0;
2164 	sk->sk_peek_off		=	-1;
2165 
2166 	sk->sk_peer_pid 	=	NULL;
2167 	sk->sk_peer_cred	=	NULL;
2168 	sk->sk_write_pending	=	0;
2169 	sk->sk_rcvlowat		=	1;
2170 	sk->sk_rcvtimeo		=	MAX_SCHEDULE_TIMEOUT;
2171 	sk->sk_sndtimeo		=	MAX_SCHEDULE_TIMEOUT;
2172 
2173 	sk->sk_stamp = ktime_set(-1L, 0);
2174 
2175 	/*
2176 	 * Before updating sk_refcnt, we must commit prior changes to memory
2177 	 * (Documentation/RCU/rculist_nulls.txt for details)
2178 	 */
2179 	smp_wmb();
2180 	atomic_set(&sk->sk_refcnt, 1);
2181 	atomic_set(&sk->sk_drops, 0);
2182 }
2183 EXPORT_SYMBOL(sock_init_data);
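
/*
 * Illustrative sketch, not part of the original file: the usual shape of
 * a net_proto_family ->create() handler, which allocates the sock with
 * sk_alloc() and then hands it to sock_init_data() to set up the queues,
 * the default callbacks and the sock<->socket back-pointers. The names
 * my_family_create(), my_proto and my_dgram_ops are hypothetical.
 */
static int my_family_create(struct net *net, struct socket *sock,
			    int protocol, int kern)
{
	struct sock *sk;

	if (sock->type != SOCK_DGRAM)
		return -ESOCKTNOSUPPORT;

	sk = sk_alloc(net, PF_UNSPEC /* placeholder */, GFP_KERNEL, &my_proto);
	if (!sk)
		return -ENOMEM;

	sock->ops = &my_dgram_ops;
	sock_init_data(sock, sk);	/* sets sk->sk_type, sk->sk_wq, sock->sk */
	sk->sk_protocol = protocol;
	return 0;
}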
2184 
2185 void lock_sock_nested(struct sock *sk, int subclass)
2186 {
2187 	might_sleep();
2188 	spin_lock_bh(&sk->sk_lock.slock);
2189 	if (sk->sk_lock.owned)
2190 		__lock_sock(sk);
2191 	sk->sk_lock.owned = 1;
2192 	spin_unlock(&sk->sk_lock.slock);
2193 	/*
2194 	 * The sk_lock has mutex_lock() semantics here:
2195 	 */
2196 	mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
2197 	local_bh_enable();
2198 }
2199 EXPORT_SYMBOL(lock_sock_nested);
2200 
2201 void release_sock(struct sock *sk)
2202 {
2203 	/*
2204 	 * The sk_lock has mutex_unlock() semantics:
2205 	 */
2206 	mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
2207 
2208 	spin_lock_bh(&sk->sk_lock.slock);
2209 	if (sk->sk_backlog.tail)
2210 		__release_sock(sk);
2211 
2212 	if (sk->sk_prot->release_cb)
2213 		sk->sk_prot->release_cb(sk);
2214 
2215 	sk->sk_lock.owned = 0;
2216 	if (waitqueue_active(&sk->sk_lock.wq))
2217 		wake_up(&sk->sk_lock.wq);
2218 	spin_unlock_bh(&sk->sk_lock.slock);
2219 }
2220 EXPORT_SYMBOL(release_sock);
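
/*
 * Illustrative sketch, not part of the original file: user-context paths
 * bracket socket state changes with lock_sock()/release_sock(); any
 * packets that softirq queued on sk_backlog while the lock was owned are
 * fed to sk_backlog_rcv() by __release_sock() on release. The helper
 * name my_proto_shrink_sndbuf() is hypothetical.
 */
static void my_proto_shrink_sndbuf(struct sock *sk, int val)
{
	lock_sock(sk);
	sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
	sk->sk_sndbuf = max_t(int, val, SOCK_MIN_SNDBUF);
	release_sock(sk);	/* flushes the backlog and wakes lock waiters */
}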
2221 
2222 /**
2223  * lock_sock_fast - fast version of lock_sock
2224  * @sk: socket
2225  *
2226  * This version should be used for very small sections, where the process won't block.
2227  * Returns false if the fast path is taken:
2228  *   sk_lock.slock locked, owned = 0, BH disabled
2229  * Returns true if the slow path is taken:
2230  *   sk_lock.slock unlocked, owned = 1, BH enabled
2231  */
2232 bool lock_sock_fast(struct sock *sk)
2233 {
2234 	might_sleep();
2235 	spin_lock_bh(&sk->sk_lock.slock);
2236 
2237 	if (!sk->sk_lock.owned)
2238 		/*
2239 		 * Note: we return with BH still disabled and sk_lock.slock held
2240 		 */
2241 		return false;
2242 
2243 	__lock_sock(sk);
2244 	sk->sk_lock.owned = 1;
2245 	spin_unlock(&sk->sk_lock.slock);
2246 	/*
2247 	 * The sk_lock has mutex_lock() semantics here:
2248 	 */
2249 	mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
2250 	local_bh_enable();
2251 	return true;
2252 }
2253 EXPORT_SYMBOL(lock_sock_fast);
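
/*
 * Illustrative sketch, not part of the original file: lock_sock_fast() is
 * paired with unlock_sock_fast(), which releases whichever lock variant
 * was actually taken. The helper name my_reset_drops() is hypothetical.
 */
static void my_reset_drops(struct sock *sk)
{
	bool slow = lock_sock_fast(sk);

	atomic_set(&sk->sk_drops, 0);	/* short, non-blocking critical section */

	unlock_sock_fast(sk, slow);	/* release_sock() or spin_unlock_bh() */
}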
2254 
2255 int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
2256 {
2257 	struct timeval tv;
2258 	if (!sock_flag(sk, SOCK_TIMESTAMP))
2259 		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2260 	tv = ktime_to_timeval(sk->sk_stamp);
2261 	if (tv.tv_sec == -1)
2262 		return -ENOENT;
2263 	if (tv.tv_sec == 0) {
2264 		sk->sk_stamp = ktime_get_real();
2265 		tv = ktime_to_timeval(sk->sk_stamp);
2266 	}
2267 	return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
2268 }
2269 EXPORT_SYMBOL(sock_get_timestamp);
2270 
2271 int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
2272 {
2273 	struct timespec ts;
2274 	if (!sock_flag(sk, SOCK_TIMESTAMP))
2275 		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2276 	ts = ktime_to_timespec(sk->sk_stamp);
2277 	if (ts.tv_sec == -1)
2278 		return -ENOENT;
2279 	if (ts.tv_sec == 0) {
2280 		sk->sk_stamp = ktime_get_real();
2281 		ts = ktime_to_timespec(sk->sk_stamp);
2282 	}
2283 	return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
2284 }
2285 EXPORT_SYMBOL(sock_get_timestampns);
2286 
2287 void sock_enable_timestamp(struct sock *sk, int flag)
2288 {
2289 	if (!sock_flag(sk, flag)) {
2290 		unsigned long previous_flags = sk->sk_flags;
2291 
2292 		sock_set_flag(sk, flag);
2293 		/*
2294 		 * We just set one of the two flags that require net
2295 		 * time stamping, but time stamping might already have
2296 		 * been enabled because of the other one.
2297 		 */
2298 		if (!(previous_flags & SK_FLAGS_TIMESTAMP))
2299 			net_enable_timestamp();
2300 	}
2301 }
2302 
2303 /*
2304  *	Get a socket option on a socket.
2305  *
2306  *	FIX: POSIX 1003.1g is very ambiguous here. It states that
2307  *	asynchronous errors should be reported by getsockopt. We assume
2308  *	this means if you specify SO_ERROR (otherwise what's the point of it).
2309  */
2310 int sock_common_getsockopt(struct socket *sock, int level, int optname,
2311 			   char __user *optval, int __user *optlen)
2312 {
2313 	struct sock *sk = sock->sk;
2314 
2315 	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2316 }
2317 EXPORT_SYMBOL(sock_common_getsockopt);
2318 
2319 #ifdef CONFIG_COMPAT
2320 int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
2321 				  char __user *optval, int __user *optlen)
2322 {
2323 	struct sock *sk = sock->sk;
2324 
2325 	if (sk->sk_prot->compat_getsockopt != NULL)
2326 		return sk->sk_prot->compat_getsockopt(sk, level, optname,
2327 						      optval, optlen);
2328 	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2329 }
2330 EXPORT_SYMBOL(compat_sock_common_getsockopt);
2331 #endif
2332 
2333 int sock_common_recvmsg(struct kiocb *iocb, struct socket *sock,
2334 			struct msghdr *msg, size_t size, int flags)
2335 {
2336 	struct sock *sk = sock->sk;
2337 	int addr_len = 0;
2338 	int err;
2339 
2340 	err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT,
2341 				   flags & ~MSG_DONTWAIT, &addr_len);
2342 	if (err >= 0)
2343 		msg->msg_namelen = addr_len;
2344 	return err;
2345 }
2346 EXPORT_SYMBOL(sock_common_recvmsg);
2347 
2348 /*
2349  *	Set socket options on an inet socket.
2350  */
2351 int sock_common_setsockopt(struct socket *sock, int level, int optname,
2352 			   char __user *optval, unsigned int optlen)
2353 {
2354 	struct sock *sk = sock->sk;
2355 
2356 	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2357 }
2358 EXPORT_SYMBOL(sock_common_setsockopt);
2359 
2360 #ifdef CONFIG_COMPAT
2361 int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
2362 				  char __user *optval, unsigned int optlen)
2363 {
2364 	struct sock *sk = sock->sk;
2365 
2366 	if (sk->sk_prot->compat_setsockopt != NULL)
2367 		return sk->sk_prot->compat_setsockopt(sk, level, optname,
2368 						      optval, optlen);
2369 	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2370 }
2371 EXPORT_SYMBOL(compat_sock_common_setsockopt);
2372 #endif
2373 
2374 void sk_common_release(struct sock *sk)
2375 {
2376 	if (sk->sk_prot->destroy)
2377 		sk->sk_prot->destroy(sk);
2378 
2379 	/*
2380 	 * Observation: when sk_common_release() is called, processes have
2381 	 * no access to the socket any more, but the network stack still does.
2382 	 * Step one, detach it from networking:
2383 	 *
2384 	 * A. Remove from hash tables.
2385 	 */
2386 
2387 	sk->sk_prot->unhash(sk);
2388 
2389 	/*
2390 	 * At this point the socket cannot receive new packets, but it is
2391 	 * possible that some packets are in flight because some CPU ran the
2392 	 * receiver and did the hash table lookup before we unhashed the socket.
2393 	 * They will reach the receive queue and be purged by the socket destructor.
2394 	 *
2395 	 * Also, we still have packets pending on the receive queue and probably
2396 	 * our own packets waiting in device queues. sock_destroy() will drain
2397 	 * the receive queue, but transmitted packets will delay socket
2398 	 * destruction until the last reference is released.
2399 	 */
2400 
2401 	sock_orphan(sk);
2402 
2403 	xfrm_sk_free_policy(sk);
2404 
2405 	sk_refcnt_debug_release(sk);
2406 	sock_put(sk);
2407 }
2408 EXPORT_SYMBOL(sk_common_release);
2409 
2410 #ifdef CONFIG_PROC_FS
2411 #define PROTO_INUSE_NR	64	/* should be enough for the first time */
2412 struct prot_inuse {
2413 	int val[PROTO_INUSE_NR];
2414 };
2415 
2416 static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
2417 
2418 #ifdef CONFIG_NET_NS
2419 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2420 {
2421 	__this_cpu_add(net->core.inuse->val[prot->inuse_idx], val);
2422 }
2423 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2424 
2425 int sock_prot_inuse_get(struct net *net, struct proto *prot)
2426 {
2427 	int cpu, idx = prot->inuse_idx;
2428 	int res = 0;
2429 
2430 	for_each_possible_cpu(cpu)
2431 		res += per_cpu_ptr(net->core.inuse, cpu)->val[idx];
2432 
2433 	return res >= 0 ? res : 0;
2434 }
2435 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2436 
2437 static int __net_init sock_inuse_init_net(struct net *net)
2438 {
2439 	net->core.inuse = alloc_percpu(struct prot_inuse);
2440 	return net->core.inuse ? 0 : -ENOMEM;
2441 }
2442 
2443 static void __net_exit sock_inuse_exit_net(struct net *net)
2444 {
2445 	free_percpu(net->core.inuse);
2446 }
2447 
2448 static struct pernet_operations net_inuse_ops = {
2449 	.init = sock_inuse_init_net,
2450 	.exit = sock_inuse_exit_net,
2451 };
2452 
2453 static __init int net_inuse_init(void)
2454 {
2455 	if (register_pernet_subsys(&net_inuse_ops))
2456 		panic("Cannot initialize net inuse counters");
2457 
2458 	return 0;
2459 }
2460 
2461 core_initcall(net_inuse_init);
2462 #else
2463 static DEFINE_PER_CPU(struct prot_inuse, prot_inuse);
2464 
2465 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2466 {
2467 	__this_cpu_add(prot_inuse.val[prot->inuse_idx], val);
2468 }
2469 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2470 
2471 int sock_prot_inuse_get(struct net *net, struct proto *prot)
2472 {
2473 	int cpu, idx = prot->inuse_idx;
2474 	int res = 0;
2475 
2476 	for_each_possible_cpu(cpu)
2477 		res += per_cpu(prot_inuse, cpu).val[idx];
2478 
2479 	return res >= 0 ? res : 0;
2480 }
2481 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2482 #endif
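
/*
 * Illustrative sketch, not part of the original file: protocols bump the
 * per-cpu "inuse" counter from their hash/unhash callbacks, normally with
 * the relevant lookup-table lock held. The callback name my_proto_unhash()
 * is hypothetical.
 */
static void my_proto_unhash(struct sock *sk)
{
	if (sk_hashed(sk)) {
		/* ... remove sk from the protocol's lookup table ... */
		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
	}
}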
2483 
2484 static void assign_proto_idx(struct proto *prot)
2485 {
2486 	prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
2487 
2488 	if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
2489 		pr_err("PROTO_INUSE_NR exhausted\n");
2490 		return;
2491 	}
2492 
2493 	set_bit(prot->inuse_idx, proto_inuse_idx);
2494 }
2495 
2496 static void release_proto_idx(struct proto *prot)
2497 {
2498 	if (prot->inuse_idx != PROTO_INUSE_NR - 1)
2499 		clear_bit(prot->inuse_idx, proto_inuse_idx);
2500 }
2501 #else
2502 static inline void assign_proto_idx(struct proto *prot)
2503 {
2504 }
2505 
2506 static inline void release_proto_idx(struct proto *prot)
2507 {
2508 }
2509 #endif
2510 
2511 int proto_register(struct proto *prot, int alloc_slab)
2512 {
2513 	if (alloc_slab) {
2514 		prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
2515 					SLAB_HWCACHE_ALIGN | prot->slab_flags,
2516 					NULL);
2517 
2518 		if (prot->slab == NULL) {
2519 			pr_crit("%s: Can't create sock SLAB cache!\n",
2520 				prot->name);
2521 			goto out;
2522 		}
2523 
2524 		if (prot->rsk_prot != NULL) {
2525 			prot->rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s", prot->name);
2526 			if (prot->rsk_prot->slab_name == NULL)
2527 				goto out_free_sock_slab;
2528 
2529 			prot->rsk_prot->slab = kmem_cache_create(prot->rsk_prot->slab_name,
2530 								 prot->rsk_prot->obj_size, 0,
2531 								 SLAB_HWCACHE_ALIGN, NULL);
2532 
2533 			if (prot->rsk_prot->slab == NULL) {
2534 				pr_crit("%s: Can't create request sock SLAB cache!\n",
2535 					prot->name);
2536 				goto out_free_request_sock_slab_name;
2537 			}
2538 		}
2539 
2540 		if (prot->twsk_prot != NULL) {
2541 			prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);
2542 
2543 			if (prot->twsk_prot->twsk_slab_name == NULL)
2544 				goto out_free_request_sock_slab;
2545 
2546 			prot->twsk_prot->twsk_slab =
2547 				kmem_cache_create(prot->twsk_prot->twsk_slab_name,
2548 						  prot->twsk_prot->twsk_obj_size,
2549 						  0,
2550 						  SLAB_HWCACHE_ALIGN |
2551 							prot->slab_flags,
2552 						  NULL);
2553 			if (prot->twsk_prot->twsk_slab == NULL)
2554 				goto out_free_timewait_sock_slab_name;
2555 		}
2556 	}
2557 
2558 	mutex_lock(&proto_list_mutex);
2559 	list_add(&prot->node, &proto_list);
2560 	assign_proto_idx(prot);
2561 	mutex_unlock(&proto_list_mutex);
2562 	return 0;
2563 
2564 out_free_timewait_sock_slab_name:
2565 	kfree(prot->twsk_prot->twsk_slab_name);
2566 out_free_request_sock_slab:
2567 	if (prot->rsk_prot && prot->rsk_prot->slab) {
2568 		kmem_cache_destroy(prot->rsk_prot->slab);
2569 		prot->rsk_prot->slab = NULL;
2570 	}
2571 out_free_request_sock_slab_name:
2572 	if (prot->rsk_prot)
2573 		kfree(prot->rsk_prot->slab_name);
2574 out_free_sock_slab:
2575 	kmem_cache_destroy(prot->slab);
2576 	prot->slab = NULL;
2577 out:
2578 	return -ENOBUFS;
2579 }
2580 EXPORT_SYMBOL(proto_register);
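
/*
 * Illustrative sketch, not part of the original file: the usual
 * registration order in a protocol module, with proto_register()
 * creating the slab caches before the address family is exposed via
 * sock_register(). my_proto and my_family_ops are hypothetical.
 */
static int __init my_proto_module_init(void)
{
	int rc;

	rc = proto_register(&my_proto, 1);	/* 1: allocate a sock slab */
	if (rc)
		return rc;

	rc = sock_register(&my_family_ops);
	if (rc)
		proto_unregister(&my_proto);
	return rc;
}

static void __exit my_proto_module_exit(void)
{
	sock_unregister(PF_UNSPEC /* placeholder family */);
	proto_unregister(&my_proto);
}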
2581 
2582 void proto_unregister(struct proto *prot)
2583 {
2584 	mutex_lock(&proto_list_mutex);
2585 	release_proto_idx(prot);
2586 	list_del(&prot->node);
2587 	mutex_unlock(&proto_list_mutex);
2588 
2589 	if (prot->slab != NULL) {
2590 		kmem_cache_destroy(prot->slab);
2591 		prot->slab = NULL;
2592 	}
2593 
2594 	if (prot->rsk_prot != NULL && prot->rsk_prot->slab != NULL) {
2595 		kmem_cache_destroy(prot->rsk_prot->slab);
2596 		kfree(prot->rsk_prot->slab_name);
2597 		prot->rsk_prot->slab = NULL;
2598 	}
2599 
2600 	if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
2601 		kmem_cache_destroy(prot->twsk_prot->twsk_slab);
2602 		kfree(prot->twsk_prot->twsk_slab_name);
2603 		prot->twsk_prot->twsk_slab = NULL;
2604 	}
2605 }
2606 EXPORT_SYMBOL(proto_unregister);
2607 
2608 #ifdef CONFIG_PROC_FS
2609 static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
2610 	__acquires(proto_list_mutex)
2611 {
2612 	mutex_lock(&proto_list_mutex);
2613 	return seq_list_start_head(&proto_list, *pos);
2614 }
2615 
2616 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2617 {
2618 	return seq_list_next(v, &proto_list, pos);
2619 }
2620 
2621 static void proto_seq_stop(struct seq_file *seq, void *v)
2622 	__releases(proto_list_mutex)
2623 {
2624 	mutex_unlock(&proto_list_mutex);
2625 }
2626 
2627 static char proto_method_implemented(const void *method)
2628 {
2629 	return method == NULL ? 'n' : 'y';
2630 }
2631 static long sock_prot_memory_allocated(struct proto *proto)
2632 {
2633 	return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
2634 }
2635 
2636 static char *sock_prot_memory_pressure(struct proto *proto)
2637 {
2638 	return proto->memory_pressure != NULL ?
2639 	proto_memory_pressure(proto) ? "yes" : "no" : "NI";
2640 }
2641 
2642 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
2643 {
2644 
2645 	seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
2646 			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
2647 		   proto->name,
2648 		   proto->obj_size,
2649 		   sock_prot_inuse_get(seq_file_net(seq), proto),
2650 		   sock_prot_memory_allocated(proto),
2651 		   sock_prot_memory_pressure(proto),
2652 		   proto->max_header,
2653 		   proto->slab == NULL ? "no" : "yes",
2654 		   module_name(proto->owner),
2655 		   proto_method_implemented(proto->close),
2656 		   proto_method_implemented(proto->connect),
2657 		   proto_method_implemented(proto->disconnect),
2658 		   proto_method_implemented(proto->accept),
2659 		   proto_method_implemented(proto->ioctl),
2660 		   proto_method_implemented(proto->init),
2661 		   proto_method_implemented(proto->destroy),
2662 		   proto_method_implemented(proto->shutdown),
2663 		   proto_method_implemented(proto->setsockopt),
2664 		   proto_method_implemented(proto->getsockopt),
2665 		   proto_method_implemented(proto->sendmsg),
2666 		   proto_method_implemented(proto->recvmsg),
2667 		   proto_method_implemented(proto->sendpage),
2668 		   proto_method_implemented(proto->bind),
2669 		   proto_method_implemented(proto->backlog_rcv),
2670 		   proto_method_implemented(proto->hash),
2671 		   proto_method_implemented(proto->unhash),
2672 		   proto_method_implemented(proto->get_port),
2673 		   proto_method_implemented(proto->enter_memory_pressure));
2674 }
2675 
2676 static int proto_seq_show(struct seq_file *seq, void *v)
2677 {
2678 	if (v == &proto_list)
2679 		seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
2680 			   "protocol",
2681 			   "size",
2682 			   "sockets",
2683 			   "memory",
2684 			   "press",
2685 			   "maxhdr",
2686 			   "slab",
2687 			   "module",
2688 			   "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
2689 	else
2690 		proto_seq_printf(seq, list_entry(v, struct proto, node));
2691 	return 0;
2692 }
2693 
2694 static const struct seq_operations proto_seq_ops = {
2695 	.start  = proto_seq_start,
2696 	.next   = proto_seq_next,
2697 	.stop   = proto_seq_stop,
2698 	.show   = proto_seq_show,
2699 };
2700 
2701 static int proto_seq_open(struct inode *inode, struct file *file)
2702 {
2703 	return seq_open_net(inode, file, &proto_seq_ops,
2704 			    sizeof(struct seq_net_private));
2705 }
2706 
2707 static const struct file_operations proto_seq_fops = {
2708 	.owner		= THIS_MODULE,
2709 	.open		= proto_seq_open,
2710 	.read		= seq_read,
2711 	.llseek		= seq_lseek,
2712 	.release	= seq_release_net,
2713 };
2714 
2715 static __net_init int proto_init_net(struct net *net)
2716 {
2717 	if (!proc_net_fops_create(net, "protocols", S_IRUGO, &proto_seq_fops))
2718 		return -ENOMEM;
2719 
2720 	return 0;
2721 }
2722 
2723 static __net_exit void proto_exit_net(struct net *net)
2724 {
2725 	proc_net_remove(net, "protocols");
2726 }
2727 
2728 
2729 static __net_initdata struct pernet_operations proto_net_ops = {
2730 	.init = proto_init_net,
2731 	.exit = proto_exit_net,
2732 };
2733 
2734 static int __init proto_init(void)
2735 {
2736 	return register_pernet_subsys(&proto_net_ops);
2737 }
2738 
2739 subsys_initcall(proto_init);
2740 
2741 #endif /* PROC_FS */
2742