xref: /openbmc/linux/net/core/sock.c (revision 4da722ca19f30f7db250db808d1ab1703607a932)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		Generic socket support routines. Memory allocators, socket lock/release
7  *		handler for protocols to use and generic option handler.
8  *
9  *
10  * Authors:	Ross Biro
11  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *		Florian La Roche, <flla@stud.uni-sb.de>
13  *		Alan Cox, <A.Cox@swansea.ac.uk>
14  *
15  * Fixes:
16  *		Alan Cox	: 	Numerous verify_area() problems
17  *		Alan Cox	:	Connecting on a connecting socket
18  *					now returns an error for tcp.
19  *		Alan Cox	:	sock->protocol is set correctly.
20  *					and is not sometimes left as 0.
21  *		Alan Cox	:	connect handles icmp errors on a
22  *					connect properly. Unfortunately there
23  *					is a restart syscall nasty there. I
24  *					can't match BSD without hacking the C
25  *					library. Ideas urgently sought!
26  *		Alan Cox	:	Disallow bind() to addresses that are
27  *					not ours - especially broadcast ones!!
28  *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
29  *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
30  *					instead they leave that for the DESTROY timer.
31  *		Alan Cox	:	Clean up error flag in accept
32  *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
33  *					was buggy. Put a remove_sock() in the handler
34  *					for memory when we hit 0. Also altered the timer
35  *					code. The ACK stuff can wait and needs major
36  *					TCP layer surgery.
37  *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
38  *					and fixed timer/inet_bh race.
39  *		Alan Cox	:	Added zapped flag for TCP
40  *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
41  *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
42  *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
43  *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
44  *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
45  *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
46  *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
47  *	Pauline Middelink	:	identd support
48  *		Alan Cox	:	Fixed connect() taking signals I think.
49  *		Alan Cox	:	SO_LINGER supported
50  *		Alan Cox	:	Error reporting fixes
51  *		Anonymous	:	inet_create tidied up (sk->reuse setting)
52  *		Alan Cox	:	inet sockets don't set sk->type!
53  *		Alan Cox	:	Split socket option code
54  *		Alan Cox	:	Callbacks
55  *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
56  *		Alex		:	Removed restriction on inet fioctl
57  *		Alan Cox	:	Splitting INET from NET core
58  *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
59  *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
60  *		Alan Cox	:	Split IP from generic code
61  *		Alan Cox	:	New kfree_skbmem()
62  *		Alan Cox	:	Make SO_DEBUG superuser only.
63  *		Alan Cox	:	Allow anyone to clear SO_DEBUG
64  *					(compatibility fix)
65  *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
66  *		Alan Cox	:	Allocator for a socket is settable.
67  *		Alan Cox	:	SO_ERROR includes soft errors.
68  *		Alan Cox	:	Allow NULL arguments on some SO_ opts
69  *		Alan Cox	: 	Generic socket allocation to make hooks
70  *					easier (suggested by Craig Metz).
71  *		Michael Pall	:	SO_ERROR returns positive errno again
72  *              Steve Whitehouse:       Added default destructor to free
73  *                                      protocol private data.
74  *              Steve Whitehouse:       Added various other default routines
75  *                                      common to several socket families.
76  *              Chris Evans     :       Call suser() check last on F_SETOWN
77  *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
78  *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
79  *		Andi Kleen	:	Fix write_space callback
80  *		Chris Evans	:	Security fixes - signedness again
81  *		Arnaldo C. Melo :       cleanups, use skb_queue_purge
82  *
83  * To Fix:
84  *
85  *
86  *		This program is free software; you can redistribute it and/or
87  *		modify it under the terms of the GNU General Public License
88  *		as published by the Free Software Foundation; either version
89  *		2 of the License, or (at your option) any later version.
90  */
91 
92 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
93 
94 #include <linux/capability.h>
95 #include <linux/errno.h>
96 #include <linux/errqueue.h>
97 #include <linux/types.h>
98 #include <linux/socket.h>
99 #include <linux/in.h>
100 #include <linux/kernel.h>
101 #include <linux/module.h>
102 #include <linux/proc_fs.h>
103 #include <linux/seq_file.h>
104 #include <linux/sched.h>
105 #include <linux/sched/mm.h>
106 #include <linux/timer.h>
107 #include <linux/string.h>
108 #include <linux/sockios.h>
109 #include <linux/net.h>
110 #include <linux/mm.h>
111 #include <linux/slab.h>
112 #include <linux/interrupt.h>
113 #include <linux/poll.h>
114 #include <linux/tcp.h>
115 #include <linux/init.h>
116 #include <linux/highmem.h>
117 #include <linux/user_namespace.h>
118 #include <linux/static_key.h>
119 #include <linux/memcontrol.h>
120 #include <linux/prefetch.h>
121 
122 #include <linux/uaccess.h>
123 
124 #include <linux/netdevice.h>
125 #include <net/protocol.h>
126 #include <linux/skbuff.h>
127 #include <net/net_namespace.h>
128 #include <net/request_sock.h>
129 #include <net/sock.h>
130 #include <linux/net_tstamp.h>
131 #include <net/xfrm.h>
132 #include <linux/ipsec.h>
133 #include <net/cls_cgroup.h>
134 #include <net/netprio_cgroup.h>
135 #include <linux/sock_diag.h>
136 
137 #include <linux/filter.h>
138 #include <net/sock_reuseport.h>
139 
140 #include <trace/events/sock.h>
141 
142 #include <net/tcp.h>
143 #include <net/busy_poll.h>
144 
/*
 * File-local list head together with the mutex declared next to it.
 * NOTE(review): going by the names, the mutex presumably serializes
 * access to proto_list; the users are outside this part of the file.
 */
static LIST_HEAD(proto_list);
static DEFINE_MUTEX(proto_list_mutex);
147 
148 /**
149  * sk_ns_capable - General socket capability test
150  * @sk: Socket to use a capability on or through
151  * @user_ns: The user namespace of the capability to use
152  * @cap: The capability to use
153  *
154  * Test to see if the opener of the socket had when the socket was
155  * created and the current process has the capability @cap in the user
156  * namespace @user_ns.
157  */
158 bool sk_ns_capable(const struct sock *sk,
159 		   struct user_namespace *user_ns, int cap)
160 {
161 	return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
162 		ns_capable(user_ns, cap);
163 }
164 EXPORT_SYMBOL(sk_ns_capable);
165 
166 /**
167  * sk_capable - Socket global capability test
168  * @sk: Socket to use a capability on or through
169  * @cap: The global capability to use
170  *
171  * Test to see if the opener of the socket had when the socket was
172  * created and the current process has the capability @cap in all user
173  * namespaces.
174  */
175 bool sk_capable(const struct sock *sk, int cap)
176 {
177 	return sk_ns_capable(sk, &init_user_ns, cap);
178 }
179 EXPORT_SYMBOL(sk_capable);
180 
181 /**
182  * sk_net_capable - Network namespace socket capability test
183  * @sk: Socket to use a capability on or through
184  * @cap: The capability to use
185  *
186  * Test to see if the opener of the socket had when the socket was created
187  * and the current process has the capability @cap over the network namespace
188  * the socket is a member of.
189  */
190 bool sk_net_capable(const struct sock *sk, int cap)
191 {
192 	return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
193 }
194 EXPORT_SYMBOL(sk_net_capable);
195 
196 /*
197  * Each address family might have different locking rules, so we have
198  * one slock key per address family and separate keys for internal and
199  * userspace sockets.
200  */
201 static struct lock_class_key af_family_keys[AF_MAX];
202 static struct lock_class_key af_family_kern_keys[AF_MAX];
203 static struct lock_class_key af_family_slock_keys[AF_MAX];
204 static struct lock_class_key af_family_kern_slock_keys[AF_MAX];
205 
/*
 * Make lock validator output more readable. (we pre-construct these
 * strings build-time, so that runtime initialization of socket
 * locks is fast):
 *
 * _sock_locks(x) expands to one string literal per address family,
 * indexed by the AF_* value, each prefixed with @x through string-literal
 * concatenation.  Every name table below is generated from this single
 * list so that the tables cannot drift apart; a new address family only
 * has to be added here.
 */

#define _sock_locks(x)						  \
  x "AF_UNSPEC",	x "AF_UNIX"     ,	x "AF_INET"     , \
  x "AF_AX25"  ,	x "AF_IPX"      ,	x "AF_APPLETALK", \
  x "AF_NETROM",	x "AF_BRIDGE"   ,	x "AF_ATMPVC"   , \
  x "AF_X25"   ,	x "AF_INET6"    ,	x "AF_ROSE"     , \
  x "AF_DECnet",	x "AF_NETBEUI"  ,	x "AF_SECURITY" , \
  x "AF_KEY"   ,	x "AF_NETLINK"  ,	x "AF_PACKET"   , \
  x "AF_ASH"   ,	x "AF_ECONET"   ,	x "AF_ATMSVC"   , \
  x "AF_RDS"   ,	x "AF_SNA"      ,	x "AF_IRDA"     , \
  x "AF_PPPOX" ,	x "AF_WANPIPE"  ,	x "AF_LLC"      , \
  x "27"       ,	x "28"          ,	x "AF_CAN"      , \
  x "AF_TIPC"  ,	x "AF_BLUETOOTH",	x "AF_IUCV"     , \
  x "AF_RXRPC" ,	x "AF_ISDN"     ,	x "AF_PHONET"   , \
  x "AF_IEEE802154",	x "AF_CAIF"	,	x "AF_ALG"      , \
  x "AF_NFC"   ,	x "AF_VSOCK"    ,	x "AF_KCM"      , \
  x "AF_QIPCRTR",	x "AF_SMC"	,	x "AF_MAX"

/* Names for userspace sockets. */
static const char *const af_family_key_strings[AF_MAX+1] = {
	_sock_locks("sk_lock-")
};
static const char *const af_family_slock_key_strings[AF_MAX+1] = {
	_sock_locks("slock-")
};
static const char *const af_family_clock_key_strings[AF_MAX+1] = {
	_sock_locks("clock-")
};

/* Names for kernel-internal sockets ("k-" prefix). */
static const char *const af_family_kern_key_strings[AF_MAX+1] = {
	_sock_locks("k-sk_lock-")
};
static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
	_sock_locks("k-slock-")
};
static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
	_sock_locks("k-clock-")
};

/* Remaining per-family names ("rlock-", "wlock-", "elock-" prefixes). */
static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
	_sock_locks("rlock-")
};
static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
	_sock_locks("wlock-")
};
static const char *const af_family_elock_key_strings[AF_MAX+1] = {
	_sock_locks("elock-")
};
299 
300 /*
301  * sk_callback_lock and sk queues locking rules are per-address-family,
302  * so split the lock classes by using a per-AF key:
303  */
304 static struct lock_class_key af_callback_keys[AF_MAX];
305 static struct lock_class_key af_rlock_keys[AF_MAX];
306 static struct lock_class_key af_wlock_keys[AF_MAX];
307 static struct lock_class_key af_elock_keys[AF_MAX];
308 static struct lock_class_key af_kern_callback_keys[AF_MAX];
309 
310 /* Take into consideration the size of the struct sk_buff overhead in the
311  * determination of these values, since that is non-constant across
312  * platforms.  This makes socket queueing behavior and performance
313  * not depend upon such differences.
314  */
315 #define _SK_MEM_PACKETS		256
316 #define _SK_MEM_OVERHEAD	SKB_TRUESIZE(256)
317 #define SK_WMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
318 #define SK_RMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
319 
320 /* Run time adjustable parameters. */
321 __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
322 EXPORT_SYMBOL(sysctl_wmem_max);
323 __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
324 EXPORT_SYMBOL(sysctl_rmem_max);
325 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
326 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
327 
328 /* Maximal space eaten by iovec or ancillary data plus some space */
329 int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
330 EXPORT_SYMBOL(sysctl_optmem_max);
331 
332 int sysctl_tstamp_allow_data __read_mostly = 1;
333 
334 struct static_key memalloc_socks = STATIC_KEY_INIT_FALSE;
335 EXPORT_SYMBOL_GPL(memalloc_socks);
336 
337 /**
338  * sk_set_memalloc - sets %SOCK_MEMALLOC
339  * @sk: socket to set it on
340  *
341  * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
342  * It's the responsibility of the admin to adjust min_free_kbytes
343  * to meet the requirements
344  */
345 void sk_set_memalloc(struct sock *sk)
346 {
347 	sock_set_flag(sk, SOCK_MEMALLOC);
348 	sk->sk_allocation |= __GFP_MEMALLOC;
349 	static_key_slow_inc(&memalloc_socks);
350 }
351 EXPORT_SYMBOL_GPL(sk_set_memalloc);
352 
353 void sk_clear_memalloc(struct sock *sk)
354 {
355 	sock_reset_flag(sk, SOCK_MEMALLOC);
356 	sk->sk_allocation &= ~__GFP_MEMALLOC;
357 	static_key_slow_dec(&memalloc_socks);
358 
359 	/*
360 	 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
361 	 * progress of swapping. SOCK_MEMALLOC may be cleared while
362 	 * it has rmem allocations due to the last swapfile being deactivated
363 	 * but there is a risk that the socket is unusable due to exceeding
364 	 * the rmem limits. Reclaim the reserves and obey rmem limits again.
365 	 */
366 	sk_mem_reclaim(sk);
367 }
368 EXPORT_SYMBOL_GPL(sk_clear_memalloc);
369 
370 int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
371 {
372 	int ret;
373 	unsigned int noreclaim_flag;
374 
375 	/* these should have been dropped before queueing */
376 	BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
377 
378 	noreclaim_flag = memalloc_noreclaim_save();
379 	ret = sk->sk_backlog_rcv(sk, skb);
380 	memalloc_noreclaim_restore(noreclaim_flag);
381 
382 	return ret;
383 }
384 EXPORT_SYMBOL(__sk_backlog_rcv);
385 
386 static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
387 {
388 	struct timeval tv;
389 
390 	if (optlen < sizeof(tv))
391 		return -EINVAL;
392 	if (copy_from_user(&tv, optval, sizeof(tv)))
393 		return -EFAULT;
394 	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
395 		return -EDOM;
396 
397 	if (tv.tv_sec < 0) {
398 		static int warned __read_mostly;
399 
400 		*timeo_p = 0;
401 		if (warned < 10 && net_ratelimit()) {
402 			warned++;
403 			pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
404 				__func__, current->comm, task_pid_nr(current));
405 		}
406 		return 0;
407 	}
408 	*timeo_p = MAX_SCHEDULE_TIMEOUT;
409 	if (tv.tv_sec == 0 && tv.tv_usec == 0)
410 		return 0;
411 	if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
412 		*timeo_p = tv.tv_sec * HZ + DIV_ROUND_UP(tv.tv_usec, USEC_PER_SEC / HZ);
413 	return 0;
414 }
415 
416 static void sock_warn_obsolete_bsdism(const char *name)
417 {
418 	static int warned;
419 	static char warncomm[TASK_COMM_LEN];
420 	if (strcmp(warncomm, current->comm) && warned < 5) {
421 		strcpy(warncomm,  current->comm);
422 		pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n",
423 			warncomm, name);
424 		warned++;
425 	}
426 }
427 
428 static bool sock_needs_netstamp(const struct sock *sk)
429 {
430 	switch (sk->sk_family) {
431 	case AF_UNSPEC:
432 	case AF_UNIX:
433 		return false;
434 	default:
435 		return true;
436 	}
437 }
438 
439 static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
440 {
441 	if (sk->sk_flags & flags) {
442 		sk->sk_flags &= ~flags;
443 		if (sock_needs_netstamp(sk) &&
444 		    !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
445 			net_disable_timestamp();
446 	}
447 }
448 
449 
450 int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
451 {
452 	unsigned long flags;
453 	struct sk_buff_head *list = &sk->sk_receive_queue;
454 
455 	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
456 		atomic_inc(&sk->sk_drops);
457 		trace_sock_rcvqueue_full(sk, skb);
458 		return -ENOMEM;
459 	}
460 
461 	if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
462 		atomic_inc(&sk->sk_drops);
463 		return -ENOBUFS;
464 	}
465 
466 	skb->dev = NULL;
467 	skb_set_owner_r(skb, sk);
468 
469 	/* we escape from rcu protected region, make sure we dont leak
470 	 * a norefcounted dst
471 	 */
472 	skb_dst_force(skb);
473 
474 	spin_lock_irqsave(&list->lock, flags);
475 	sock_skb_set_dropcount(sk, skb);
476 	__skb_queue_tail(list, skb);
477 	spin_unlock_irqrestore(&list->lock, flags);
478 
479 	if (!sock_flag(sk, SOCK_DEAD))
480 		sk->sk_data_ready(sk);
481 	return 0;
482 }
483 EXPORT_SYMBOL(__sock_queue_rcv_skb);
484 
/*
 * Run @skb through sk_filter() and, if that returns 0, queue it with
 * __sock_queue_rcv_skb().
 *
 * Returns the non-zero sk_filter() result, or otherwise the result of
 * __sock_queue_rcv_skb().
 */
int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
	int filter_rc = sk_filter(sk, skb);

	return filter_rc ? filter_rc : __sock_queue_rcv_skb(sk, skb);
}
EXPORT_SYMBOL(sock_queue_rcv_skb);
496 
497 int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
498 		     const int nested, unsigned int trim_cap, bool refcounted)
499 {
500 	int rc = NET_RX_SUCCESS;
501 
502 	if (sk_filter_trim_cap(sk, skb, trim_cap))
503 		goto discard_and_relse;
504 
505 	skb->dev = NULL;
506 
507 	if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
508 		atomic_inc(&sk->sk_drops);
509 		goto discard_and_relse;
510 	}
511 	if (nested)
512 		bh_lock_sock_nested(sk);
513 	else
514 		bh_lock_sock(sk);
515 	if (!sock_owned_by_user(sk)) {
516 		/*
517 		 * trylock + unlock semantics:
518 		 */
519 		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
520 
521 		rc = sk_backlog_rcv(sk, skb);
522 
523 		mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
524 	} else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
525 		bh_unlock_sock(sk);
526 		atomic_inc(&sk->sk_drops);
527 		goto discard_and_relse;
528 	}
529 
530 	bh_unlock_sock(sk);
531 out:
532 	if (refcounted)
533 		sock_put(sk);
534 	return rc;
535 discard_and_relse:
536 	kfree_skb(skb);
537 	goto out;
538 }
539 EXPORT_SYMBOL(__sk_receive_skb);
540 
541 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
542 {
543 	struct dst_entry *dst = __sk_dst_get(sk);
544 
545 	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
546 		sk_tx_queue_clear(sk);
547 		sk->sk_dst_pending_confirm = 0;
548 		RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
549 		dst_release(dst);
550 		return NULL;
551 	}
552 
553 	return dst;
554 }
555 EXPORT_SYMBOL(__sk_dst_check);
556 
557 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
558 {
559 	struct dst_entry *dst = sk_dst_get(sk);
560 
561 	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
562 		sk_dst_reset(sk);
563 		dst_release(dst);
564 		return NULL;
565 	}
566 
567 	return dst;
568 }
569 EXPORT_SYMBOL(sk_dst_check);
570 
571 static int sock_setbindtodevice(struct sock *sk, char __user *optval,
572 				int optlen)
573 {
574 	int ret = -ENOPROTOOPT;
575 #ifdef CONFIG_NETDEVICES
576 	struct net *net = sock_net(sk);
577 	char devname[IFNAMSIZ];
578 	int index;
579 
580 	/* Sorry... */
581 	ret = -EPERM;
582 	if (!ns_capable(net->user_ns, CAP_NET_RAW))
583 		goto out;
584 
585 	ret = -EINVAL;
586 	if (optlen < 0)
587 		goto out;
588 
589 	/* Bind this socket to a particular device like "eth0",
590 	 * as specified in the passed interface name. If the
591 	 * name is "" or the option length is zero the socket
592 	 * is not bound.
593 	 */
594 	if (optlen > IFNAMSIZ - 1)
595 		optlen = IFNAMSIZ - 1;
596 	memset(devname, 0, sizeof(devname));
597 
598 	ret = -EFAULT;
599 	if (copy_from_user(devname, optval, optlen))
600 		goto out;
601 
602 	index = 0;
603 	if (devname[0] != '\0') {
604 		struct net_device *dev;
605 
606 		rcu_read_lock();
607 		dev = dev_get_by_name_rcu(net, devname);
608 		if (dev)
609 			index = dev->ifindex;
610 		rcu_read_unlock();
611 		ret = -ENODEV;
612 		if (!dev)
613 			goto out;
614 	}
615 
616 	lock_sock(sk);
617 	sk->sk_bound_dev_if = index;
618 	sk_dst_reset(sk);
619 	release_sock(sk);
620 
621 	ret = 0;
622 
623 out:
624 #endif
625 
626 	return ret;
627 }
628 
629 static int sock_getbindtodevice(struct sock *sk, char __user *optval,
630 				int __user *optlen, int len)
631 {
632 	int ret = -ENOPROTOOPT;
633 #ifdef CONFIG_NETDEVICES
634 	struct net *net = sock_net(sk);
635 	char devname[IFNAMSIZ];
636 
637 	if (sk->sk_bound_dev_if == 0) {
638 		len = 0;
639 		goto zero;
640 	}
641 
642 	ret = -EINVAL;
643 	if (len < IFNAMSIZ)
644 		goto out;
645 
646 	ret = netdev_get_name(net, devname, sk->sk_bound_dev_if);
647 	if (ret)
648 		goto out;
649 
650 	len = strlen(devname) + 1;
651 
652 	ret = -EFAULT;
653 	if (copy_to_user(optval, devname, len))
654 		goto out;
655 
656 zero:
657 	ret = -EFAULT;
658 	if (put_user(len, optlen))
659 		goto out;
660 
661 	ret = 0;
662 
663 out:
664 #endif
665 
666 	return ret;
667 }
668 
/* Set socket flag @bit on @sk when @valbool is non-zero, clear it otherwise. */
static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
{
	if (!valbool) {
		sock_reset_flag(sk, bit);
		return;
	}

	sock_set_flag(sk, bit);
}
676 
677 bool sk_mc_loop(struct sock *sk)
678 {
679 	if (dev_recursion_level())
680 		return false;
681 	if (!sk)
682 		return true;
683 	switch (sk->sk_family) {
684 	case AF_INET:
685 		return inet_sk(sk)->mc_loop;
686 #if IS_ENABLED(CONFIG_IPV6)
687 	case AF_INET6:
688 		return inet6_sk(sk)->mc_loop;
689 #endif
690 	}
691 	WARN_ON(1);
692 	return true;
693 }
694 EXPORT_SYMBOL(sk_mc_loop);
695 
696 /*
697  *	This is meant for all protocols to use and covers goings on
698  *	at the socket level. Everything here is generic.
699  */
700 
701 int sock_setsockopt(struct socket *sock, int level, int optname,
702 		    char __user *optval, unsigned int optlen)
703 {
704 	struct sock *sk = sock->sk;
705 	int val;
706 	int valbool;
707 	struct linger ling;
708 	int ret = 0;
709 
710 	/*
711 	 *	Options without arguments
712 	 */
713 
714 	if (optname == SO_BINDTODEVICE)
715 		return sock_setbindtodevice(sk, optval, optlen);
716 
717 	if (optlen < sizeof(int))
718 		return -EINVAL;
719 
720 	if (get_user(val, (int __user *)optval))
721 		return -EFAULT;
722 
723 	valbool = val ? 1 : 0;
724 
725 	lock_sock(sk);
726 
727 	switch (optname) {
728 	case SO_DEBUG:
729 		if (val && !capable(CAP_NET_ADMIN))
730 			ret = -EACCES;
731 		else
732 			sock_valbool_flag(sk, SOCK_DBG, valbool);
733 		break;
734 	case SO_REUSEADDR:
735 		sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
736 		break;
737 	case SO_REUSEPORT:
738 		sk->sk_reuseport = valbool;
739 		break;
740 	case SO_TYPE:
741 	case SO_PROTOCOL:
742 	case SO_DOMAIN:
743 	case SO_ERROR:
744 		ret = -ENOPROTOOPT;
745 		break;
746 	case SO_DONTROUTE:
747 		sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
748 		break;
749 	case SO_BROADCAST:
750 		sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
751 		break;
752 	case SO_SNDBUF:
753 		/* Don't error on this BSD doesn't and if you think
754 		 * about it this is right. Otherwise apps have to
755 		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
756 		 * are treated in BSD as hints
757 		 */
758 		val = min_t(u32, val, sysctl_wmem_max);
759 set_sndbuf:
760 		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
761 		sk->sk_sndbuf = max_t(int, val * 2, SOCK_MIN_SNDBUF);
762 		/* Wake up sending tasks if we upped the value. */
763 		sk->sk_write_space(sk);
764 		break;
765 
766 	case SO_SNDBUFFORCE:
767 		if (!capable(CAP_NET_ADMIN)) {
768 			ret = -EPERM;
769 			break;
770 		}
771 		goto set_sndbuf;
772 
773 	case SO_RCVBUF:
774 		/* Don't error on this BSD doesn't and if you think
775 		 * about it this is right. Otherwise apps have to
776 		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
777 		 * are treated in BSD as hints
778 		 */
779 		val = min_t(u32, val, sysctl_rmem_max);
780 set_rcvbuf:
781 		sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
782 		/*
783 		 * We double it on the way in to account for
784 		 * "struct sk_buff" etc. overhead.   Applications
785 		 * assume that the SO_RCVBUF setting they make will
786 		 * allow that much actual data to be received on that
787 		 * socket.
788 		 *
789 		 * Applications are unaware that "struct sk_buff" and
790 		 * other overheads allocate from the receive buffer
791 		 * during socket buffer allocation.
792 		 *
793 		 * And after considering the possible alternatives,
794 		 * returning the value we actually used in getsockopt
795 		 * is the most desirable behavior.
796 		 */
797 		sk->sk_rcvbuf = max_t(int, val * 2, SOCK_MIN_RCVBUF);
798 		break;
799 
800 	case SO_RCVBUFFORCE:
801 		if (!capable(CAP_NET_ADMIN)) {
802 			ret = -EPERM;
803 			break;
804 		}
805 		goto set_rcvbuf;
806 
807 	case SO_KEEPALIVE:
808 		if (sk->sk_prot->keepalive)
809 			sk->sk_prot->keepalive(sk, valbool);
810 		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
811 		break;
812 
813 	case SO_OOBINLINE:
814 		sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
815 		break;
816 
817 	case SO_NO_CHECK:
818 		sk->sk_no_check_tx = valbool;
819 		break;
820 
821 	case SO_PRIORITY:
822 		if ((val >= 0 && val <= 6) ||
823 		    ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
824 			sk->sk_priority = val;
825 		else
826 			ret = -EPERM;
827 		break;
828 
829 	case SO_LINGER:
830 		if (optlen < sizeof(ling)) {
831 			ret = -EINVAL;	/* 1003.1g */
832 			break;
833 		}
834 		if (copy_from_user(&ling, optval, sizeof(ling))) {
835 			ret = -EFAULT;
836 			break;
837 		}
838 		if (!ling.l_onoff)
839 			sock_reset_flag(sk, SOCK_LINGER);
840 		else {
841 #if (BITS_PER_LONG == 32)
842 			if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
843 				sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
844 			else
845 #endif
846 				sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
847 			sock_set_flag(sk, SOCK_LINGER);
848 		}
849 		break;
850 
851 	case SO_BSDCOMPAT:
852 		sock_warn_obsolete_bsdism("setsockopt");
853 		break;
854 
855 	case SO_PASSCRED:
856 		if (valbool)
857 			set_bit(SOCK_PASSCRED, &sock->flags);
858 		else
859 			clear_bit(SOCK_PASSCRED, &sock->flags);
860 		break;
861 
862 	case SO_TIMESTAMP:
863 	case SO_TIMESTAMPNS:
864 		if (valbool)  {
865 			if (optname == SO_TIMESTAMP)
866 				sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
867 			else
868 				sock_set_flag(sk, SOCK_RCVTSTAMPNS);
869 			sock_set_flag(sk, SOCK_RCVTSTAMP);
870 			sock_enable_timestamp(sk, SOCK_TIMESTAMP);
871 		} else {
872 			sock_reset_flag(sk, SOCK_RCVTSTAMP);
873 			sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
874 		}
875 		break;
876 
877 	case SO_TIMESTAMPING:
878 		if (val & ~SOF_TIMESTAMPING_MASK) {
879 			ret = -EINVAL;
880 			break;
881 		}
882 
883 		if (val & SOF_TIMESTAMPING_OPT_ID &&
884 		    !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
885 			if (sk->sk_protocol == IPPROTO_TCP &&
886 			    sk->sk_type == SOCK_STREAM) {
887 				if ((1 << sk->sk_state) &
888 				    (TCPF_CLOSE | TCPF_LISTEN)) {
889 					ret = -EINVAL;
890 					break;
891 				}
892 				sk->sk_tskey = tcp_sk(sk)->snd_una;
893 			} else {
894 				sk->sk_tskey = 0;
895 			}
896 		}
897 
898 		if (val & SOF_TIMESTAMPING_OPT_STATS &&
899 		    !(val & SOF_TIMESTAMPING_OPT_TSONLY)) {
900 			ret = -EINVAL;
901 			break;
902 		}
903 
904 		sk->sk_tsflags = val;
905 		if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
906 			sock_enable_timestamp(sk,
907 					      SOCK_TIMESTAMPING_RX_SOFTWARE);
908 		else
909 			sock_disable_timestamp(sk,
910 					       (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
911 		break;
912 
913 	case SO_RCVLOWAT:
914 		if (val < 0)
915 			val = INT_MAX;
916 		sk->sk_rcvlowat = val ? : 1;
917 		break;
918 
919 	case SO_RCVTIMEO:
920 		ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
921 		break;
922 
923 	case SO_SNDTIMEO:
924 		ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
925 		break;
926 
927 	case SO_ATTACH_FILTER:
928 		ret = -EINVAL;
929 		if (optlen == sizeof(struct sock_fprog)) {
930 			struct sock_fprog fprog;
931 
932 			ret = -EFAULT;
933 			if (copy_from_user(&fprog, optval, sizeof(fprog)))
934 				break;
935 
936 			ret = sk_attach_filter(&fprog, sk);
937 		}
938 		break;
939 
940 	case SO_ATTACH_BPF:
941 		ret = -EINVAL;
942 		if (optlen == sizeof(u32)) {
943 			u32 ufd;
944 
945 			ret = -EFAULT;
946 			if (copy_from_user(&ufd, optval, sizeof(ufd)))
947 				break;
948 
949 			ret = sk_attach_bpf(ufd, sk);
950 		}
951 		break;
952 
953 	case SO_ATTACH_REUSEPORT_CBPF:
954 		ret = -EINVAL;
955 		if (optlen == sizeof(struct sock_fprog)) {
956 			struct sock_fprog fprog;
957 
958 			ret = -EFAULT;
959 			if (copy_from_user(&fprog, optval, sizeof(fprog)))
960 				break;
961 
962 			ret = sk_reuseport_attach_filter(&fprog, sk);
963 		}
964 		break;
965 
966 	case SO_ATTACH_REUSEPORT_EBPF:
967 		ret = -EINVAL;
968 		if (optlen == sizeof(u32)) {
969 			u32 ufd;
970 
971 			ret = -EFAULT;
972 			if (copy_from_user(&ufd, optval, sizeof(ufd)))
973 				break;
974 
975 			ret = sk_reuseport_attach_bpf(ufd, sk);
976 		}
977 		break;
978 
979 	case SO_DETACH_FILTER:
980 		ret = sk_detach_filter(sk);
981 		break;
982 
983 	case SO_LOCK_FILTER:
984 		if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
985 			ret = -EPERM;
986 		else
987 			sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
988 		break;
989 
990 	case SO_PASSSEC:
991 		if (valbool)
992 			set_bit(SOCK_PASSSEC, &sock->flags);
993 		else
994 			clear_bit(SOCK_PASSSEC, &sock->flags);
995 		break;
996 	case SO_MARK:
997 		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
998 			ret = -EPERM;
999 		else
1000 			sk->sk_mark = val;
1001 		break;
1002 
1003 	case SO_RXQ_OVFL:
1004 		sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
1005 		break;
1006 
1007 	case SO_WIFI_STATUS:
1008 		sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
1009 		break;
1010 
1011 	case SO_PEEK_OFF:
1012 		if (sock->ops->set_peek_off)
1013 			ret = sock->ops->set_peek_off(sk, val);
1014 		else
1015 			ret = -EOPNOTSUPP;
1016 		break;
1017 
1018 	case SO_NOFCS:
1019 		sock_valbool_flag(sk, SOCK_NOFCS, valbool);
1020 		break;
1021 
1022 	case SO_SELECT_ERR_QUEUE:
1023 		sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
1024 		break;
1025 
1026 #ifdef CONFIG_NET_RX_BUSY_POLL
1027 	case SO_BUSY_POLL:
1028 		/* allow unprivileged users to decrease the value */
1029 		if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
1030 			ret = -EPERM;
1031 		else {
1032 			if (val < 0)
1033 				ret = -EINVAL;
1034 			else
1035 				sk->sk_ll_usec = val;
1036 		}
1037 		break;
1038 #endif
1039 
1040 	case SO_MAX_PACING_RATE:
1041 		if (val != ~0U)
1042 			cmpxchg(&sk->sk_pacing_status,
1043 				SK_PACING_NONE,
1044 				SK_PACING_NEEDED);
1045 		sk->sk_max_pacing_rate = val;
1046 		sk->sk_pacing_rate = min(sk->sk_pacing_rate,
1047 					 sk->sk_max_pacing_rate);
1048 		break;
1049 
1050 	case SO_INCOMING_CPU:
1051 		sk->sk_incoming_cpu = val;
1052 		break;
1053 
1054 	case SO_CNX_ADVICE:
1055 		if (val == 1)
1056 			dst_negative_advice(sk);
1057 		break;
1058 	default:
1059 		ret = -ENOPROTOOPT;
1060 		break;
1061 	}
1062 	release_sock(sk);
1063 	return ret;
1064 }
1065 EXPORT_SYMBOL(sock_setsockopt);
1066 
1067 
1068 static void cred_to_ucred(struct pid *pid, const struct cred *cred,
1069 			  struct ucred *ucred)
1070 {
1071 	ucred->pid = pid_vnr(pid);
1072 	ucred->uid = ucred->gid = -1;
1073 	if (cred) {
1074 		struct user_namespace *current_ns = current_user_ns();
1075 
1076 		ucred->uid = from_kuid_munged(current_ns, cred->euid);
1077 		ucred->gid = from_kgid_munged(current_ns, cred->egid);
1078 	}
1079 }
1080 
1081 static int groups_to_user(gid_t __user *dst, const struct group_info *src)
1082 {
1083 	struct user_namespace *user_ns = current_user_ns();
1084 	int i;
1085 
1086 	for (i = 0; i < src->ngroups; i++)
1087 		if (put_user(from_kgid_munged(user_ns, src->gid[i]), dst + i))
1088 			return -EFAULT;
1089 
1090 	return 0;
1091 }
1092 
1093 int sock_getsockopt(struct socket *sock, int level, int optname,
1094 		    char __user *optval, int __user *optlen)
1095 {
1096 	struct sock *sk = sock->sk;
1097 
1098 	union {
1099 		int val;
1100 		u64 val64;
1101 		struct linger ling;
1102 		struct timeval tm;
1103 	} v;
1104 
1105 	int lv = sizeof(int);
1106 	int len;
1107 
1108 	if (get_user(len, optlen))
1109 		return -EFAULT;
1110 	if (len < 0)
1111 		return -EINVAL;
1112 
1113 	memset(&v, 0, sizeof(v));
1114 
1115 	switch (optname) {
1116 	case SO_DEBUG:
1117 		v.val = sock_flag(sk, SOCK_DBG);
1118 		break;
1119 
1120 	case SO_DONTROUTE:
1121 		v.val = sock_flag(sk, SOCK_LOCALROUTE);
1122 		break;
1123 
1124 	case SO_BROADCAST:
1125 		v.val = sock_flag(sk, SOCK_BROADCAST);
1126 		break;
1127 
1128 	case SO_SNDBUF:
1129 		v.val = sk->sk_sndbuf;
1130 		break;
1131 
1132 	case SO_RCVBUF:
1133 		v.val = sk->sk_rcvbuf;
1134 		break;
1135 
1136 	case SO_REUSEADDR:
1137 		v.val = sk->sk_reuse;
1138 		break;
1139 
1140 	case SO_REUSEPORT:
1141 		v.val = sk->sk_reuseport;
1142 		break;
1143 
1144 	case SO_KEEPALIVE:
1145 		v.val = sock_flag(sk, SOCK_KEEPOPEN);
1146 		break;
1147 
1148 	case SO_TYPE:
1149 		v.val = sk->sk_type;
1150 		break;
1151 
1152 	case SO_PROTOCOL:
1153 		v.val = sk->sk_protocol;
1154 		break;
1155 
1156 	case SO_DOMAIN:
1157 		v.val = sk->sk_family;
1158 		break;
1159 
1160 	case SO_ERROR:
1161 		v.val = -sock_error(sk);
1162 		if (v.val == 0)
1163 			v.val = xchg(&sk->sk_err_soft, 0);
1164 		break;
1165 
1166 	case SO_OOBINLINE:
1167 		v.val = sock_flag(sk, SOCK_URGINLINE);
1168 		break;
1169 
1170 	case SO_NO_CHECK:
1171 		v.val = sk->sk_no_check_tx;
1172 		break;
1173 
1174 	case SO_PRIORITY:
1175 		v.val = sk->sk_priority;
1176 		break;
1177 
1178 	case SO_LINGER:
1179 		lv		= sizeof(v.ling);
1180 		v.ling.l_onoff	= sock_flag(sk, SOCK_LINGER);
1181 		v.ling.l_linger	= sk->sk_lingertime / HZ;
1182 		break;
1183 
1184 	case SO_BSDCOMPAT:
1185 		sock_warn_obsolete_bsdism("getsockopt");
1186 		break;
1187 
1188 	case SO_TIMESTAMP:
1189 		v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1190 				!sock_flag(sk, SOCK_RCVTSTAMPNS);
1191 		break;
1192 
1193 	case SO_TIMESTAMPNS:
1194 		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
1195 		break;
1196 
1197 	case SO_TIMESTAMPING:
1198 		v.val = sk->sk_tsflags;
1199 		break;
1200 
1201 	case SO_RCVTIMEO:
1202 		lv = sizeof(struct timeval);
1203 		if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
1204 			v.tm.tv_sec = 0;
1205 			v.tm.tv_usec = 0;
1206 		} else {
1207 			v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
1208 			v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * USEC_PER_SEC) / HZ;
1209 		}
1210 		break;
1211 
1212 	case SO_SNDTIMEO:
1213 		lv = sizeof(struct timeval);
1214 		if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
1215 			v.tm.tv_sec = 0;
1216 			v.tm.tv_usec = 0;
1217 		} else {
1218 			v.tm.tv_sec = sk->sk_sndtimeo / HZ;
1219 			v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * USEC_PER_SEC) / HZ;
1220 		}
1221 		break;
1222 
1223 	case SO_RCVLOWAT:
1224 		v.val = sk->sk_rcvlowat;
1225 		break;
1226 
1227 	case SO_SNDLOWAT:
1228 		v.val = 1;
1229 		break;
1230 
1231 	case SO_PASSCRED:
1232 		v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1233 		break;
1234 
1235 	case SO_PEERCRED:
1236 	{
1237 		struct ucred peercred;
1238 		if (len > sizeof(peercred))
1239 			len = sizeof(peercred);
1240 		cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1241 		if (copy_to_user(optval, &peercred, len))
1242 			return -EFAULT;
1243 		goto lenout;
1244 	}
1245 
1246 	case SO_PEERGROUPS:
1247 	{
1248 		int ret, n;
1249 
1250 		if (!sk->sk_peer_cred)
1251 			return -ENODATA;
1252 
1253 		n = sk->sk_peer_cred->group_info->ngroups;
1254 		if (len < n * sizeof(gid_t)) {
1255 			len = n * sizeof(gid_t);
1256 			return put_user(len, optlen) ? -EFAULT : -ERANGE;
1257 		}
1258 		len = n * sizeof(gid_t);
1259 
1260 		ret = groups_to_user((gid_t __user *)optval,
1261 				     sk->sk_peer_cred->group_info);
1262 		if (ret)
1263 			return ret;
1264 		goto lenout;
1265 	}
1266 
1267 	case SO_PEERNAME:
1268 	{
1269 		char address[128];
1270 
1271 		if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
1272 			return -ENOTCONN;
1273 		if (lv < len)
1274 			return -EINVAL;
1275 		if (copy_to_user(optval, address, len))
1276 			return -EFAULT;
1277 		goto lenout;
1278 	}
1279 
1280 	/* Dubious BSD thing... Probably nobody even uses it, but
1281 	 * the UNIX standard wants it for whatever reason... -DaveM
1282 	 */
1283 	case SO_ACCEPTCONN:
1284 		v.val = sk->sk_state == TCP_LISTEN;
1285 		break;
1286 
1287 	case SO_PASSSEC:
1288 		v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1289 		break;
1290 
1291 	case SO_PEERSEC:
1292 		return security_socket_getpeersec_stream(sock, optval, optlen, len);
1293 
1294 	case SO_MARK:
1295 		v.val = sk->sk_mark;
1296 		break;
1297 
1298 	case SO_RXQ_OVFL:
1299 		v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1300 		break;
1301 
1302 	case SO_WIFI_STATUS:
1303 		v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1304 		break;
1305 
1306 	case SO_PEEK_OFF:
1307 		if (!sock->ops->set_peek_off)
1308 			return -EOPNOTSUPP;
1309 
1310 		v.val = sk->sk_peek_off;
1311 		break;
1312 	case SO_NOFCS:
1313 		v.val = sock_flag(sk, SOCK_NOFCS);
1314 		break;
1315 
1316 	case SO_BINDTODEVICE:
1317 		return sock_getbindtodevice(sk, optval, optlen, len);
1318 
1319 	case SO_GET_FILTER:
1320 		len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
1321 		if (len < 0)
1322 			return len;
1323 
1324 		goto lenout;
1325 
1326 	case SO_LOCK_FILTER:
1327 		v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1328 		break;
1329 
1330 	case SO_BPF_EXTENSIONS:
1331 		v.val = bpf_tell_extensions();
1332 		break;
1333 
1334 	case SO_SELECT_ERR_QUEUE:
1335 		v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1336 		break;
1337 
1338 #ifdef CONFIG_NET_RX_BUSY_POLL
1339 	case SO_BUSY_POLL:
1340 		v.val = sk->sk_ll_usec;
1341 		break;
1342 #endif
1343 
1344 	case SO_MAX_PACING_RATE:
1345 		v.val = sk->sk_max_pacing_rate;
1346 		break;
1347 
1348 	case SO_INCOMING_CPU:
1349 		v.val = sk->sk_incoming_cpu;
1350 		break;
1351 
1352 	case SO_MEMINFO:
1353 	{
1354 		u32 meminfo[SK_MEMINFO_VARS];
1355 
1356 		if (get_user(len, optlen))
1357 			return -EFAULT;
1358 
1359 		sk_get_meminfo(sk, meminfo);
1360 
1361 		len = min_t(unsigned int, len, sizeof(meminfo));
1362 		if (copy_to_user(optval, &meminfo, len))
1363 			return -EFAULT;
1364 
1365 		goto lenout;
1366 	}
1367 
1368 #ifdef CONFIG_NET_RX_BUSY_POLL
1369 	case SO_INCOMING_NAPI_ID:
1370 		v.val = READ_ONCE(sk->sk_napi_id);
1371 
1372 		/* aggregate non-NAPI IDs down to 0 */
1373 		if (v.val < MIN_NAPI_ID)
1374 			v.val = 0;
1375 
1376 		break;
1377 #endif
1378 
1379 	case SO_COOKIE:
1380 		lv = sizeof(u64);
1381 		if (len < lv)
1382 			return -EINVAL;
1383 		v.val64 = sock_gen_cookie(sk);
1384 		break;
1385 
1386 	default:
1387 		/* We implement the SO_SNDLOWAT etc to not be settable
1388 		 * (1003.1g 7).
1389 		 */
1390 		return -ENOPROTOOPT;
1391 	}
1392 
1393 	if (len > lv)
1394 		len = lv;
1395 	if (copy_to_user(optval, &v, len))
1396 		return -EFAULT;
1397 lenout:
1398 	if (put_user(len, optlen))
1399 		return -EFAULT;
1400 	return 0;
1401 }
1402 
1403 /*
1404  * Initialize an sk_lock.
1405  *
1406  * (We also register the sk_lock with the lock validator.)
1407  */
1408 static inline void sock_lock_init(struct sock *sk)
1409 {
1410 	if (sk->sk_kern_sock)
1411 		sock_lock_init_class_and_name(
1412 			sk,
1413 			af_family_kern_slock_key_strings[sk->sk_family],
1414 			af_family_kern_slock_keys + sk->sk_family,
1415 			af_family_kern_key_strings[sk->sk_family],
1416 			af_family_kern_keys + sk->sk_family);
1417 	else
1418 		sock_lock_init_class_and_name(
1419 			sk,
1420 			af_family_slock_key_strings[sk->sk_family],
1421 			af_family_slock_keys + sk->sk_family,
1422 			af_family_key_strings[sk->sk_family],
1423 			af_family_keys + sk->sk_family);
1424 }
1425 
1426 /*
1427  * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
1428  * even temporarly, because of RCU lookups. sk_node should also be left as is.
1429  * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
1430  */
1431 static void sock_copy(struct sock *nsk, const struct sock *osk)
1432 {
1433 #ifdef CONFIG_SECURITY_NETWORK
1434 	void *sptr = nsk->sk_security;
1435 #endif
1436 	memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1437 
1438 	memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1439 	       osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1440 
1441 #ifdef CONFIG_SECURITY_NETWORK
1442 	nsk->sk_security = sptr;
1443 	security_sk_clone(osk, nsk);
1444 #endif
1445 }
1446 
1447 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1448 		int family)
1449 {
1450 	struct sock *sk;
1451 	struct kmem_cache *slab;
1452 
1453 	slab = prot->slab;
1454 	if (slab != NULL) {
1455 		sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1456 		if (!sk)
1457 			return sk;
1458 		if (priority & __GFP_ZERO)
1459 			sk_prot_clear_nulls(sk, prot->obj_size);
1460 	} else
1461 		sk = kmalloc(prot->obj_size, priority);
1462 
1463 	if (sk != NULL) {
1464 		kmemcheck_annotate_bitfield(sk, flags);
1465 
1466 		if (security_sk_alloc(sk, family, priority))
1467 			goto out_free;
1468 
1469 		if (!try_module_get(prot->owner))
1470 			goto out_free_sec;
1471 		sk_tx_queue_clear(sk);
1472 	}
1473 
1474 	return sk;
1475 
1476 out_free_sec:
1477 	security_sk_free(sk);
1478 out_free:
1479 	if (slab != NULL)
1480 		kmem_cache_free(slab, sk);
1481 	else
1482 		kfree(sk);
1483 	return NULL;
1484 }
1485 
1486 static void sk_prot_free(struct proto *prot, struct sock *sk)
1487 {
1488 	struct kmem_cache *slab;
1489 	struct module *owner;
1490 
1491 	owner = prot->owner;
1492 	slab = prot->slab;
1493 
1494 	cgroup_sk_free(&sk->sk_cgrp_data);
1495 	mem_cgroup_sk_free(sk);
1496 	security_sk_free(sk);
1497 	if (slab != NULL)
1498 		kmem_cache_free(slab, sk);
1499 	else
1500 		kfree(sk);
1501 	module_put(owner);
1502 }
1503 
1504 /**
1505  *	sk_alloc - All socket objects are allocated here
1506  *	@net: the applicable net namespace
1507  *	@family: protocol family
1508  *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1509  *	@prot: struct proto associated with this new sock instance
1510  *	@kern: is this to be a kernel socket?
1511  */
1512 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1513 		      struct proto *prot, int kern)
1514 {
1515 	struct sock *sk;
1516 
1517 	sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1518 	if (sk) {
1519 		sk->sk_family = family;
1520 		/*
1521 		 * See comment in struct sock definition to understand
1522 		 * why we need sk_prot_creator -acme
1523 		 */
1524 		sk->sk_prot = sk->sk_prot_creator = prot;
1525 		sk->sk_kern_sock = kern;
1526 		sock_lock_init(sk);
1527 		sk->sk_net_refcnt = kern ? 0 : 1;
1528 		if (likely(sk->sk_net_refcnt))
1529 			get_net(net);
1530 		sock_net_set(sk, net);
1531 		refcount_set(&sk->sk_wmem_alloc, 1);
1532 
1533 		mem_cgroup_sk_alloc(sk);
1534 		cgroup_sk_alloc(&sk->sk_cgrp_data);
1535 		sock_update_classid(&sk->sk_cgrp_data);
1536 		sock_update_netprioidx(&sk->sk_cgrp_data);
1537 	}
1538 
1539 	return sk;
1540 }
1541 EXPORT_SYMBOL(sk_alloc);
1542 
1543 /* Sockets having SOCK_RCU_FREE will call this function after one RCU
1544  * grace period. This is the case for UDP sockets and TCP listeners.
1545  */
1546 static void __sk_destruct(struct rcu_head *head)
1547 {
1548 	struct sock *sk = container_of(head, struct sock, sk_rcu);
1549 	struct sk_filter *filter;
1550 
1551 	if (sk->sk_destruct)
1552 		sk->sk_destruct(sk);
1553 
1554 	filter = rcu_dereference_check(sk->sk_filter,
1555 				       refcount_read(&sk->sk_wmem_alloc) == 0);
1556 	if (filter) {
1557 		sk_filter_uncharge(sk, filter);
1558 		RCU_INIT_POINTER(sk->sk_filter, NULL);
1559 	}
1560 	if (rcu_access_pointer(sk->sk_reuseport_cb))
1561 		reuseport_detach_sock(sk);
1562 
1563 	sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
1564 
1565 	if (atomic_read(&sk->sk_omem_alloc))
1566 		pr_debug("%s: optmem leakage (%d bytes) detected\n",
1567 			 __func__, atomic_read(&sk->sk_omem_alloc));
1568 
1569 	if (sk->sk_frag.page) {
1570 		put_page(sk->sk_frag.page);
1571 		sk->sk_frag.page = NULL;
1572 	}
1573 
1574 	if (sk->sk_peer_cred)
1575 		put_cred(sk->sk_peer_cred);
1576 	put_pid(sk->sk_peer_pid);
1577 	if (likely(sk->sk_net_refcnt))
1578 		put_net(sock_net(sk));
1579 	sk_prot_free(sk->sk_prot_creator, sk);
1580 }
1581 
1582 void sk_destruct(struct sock *sk)
1583 {
1584 	if (sock_flag(sk, SOCK_RCU_FREE))
1585 		call_rcu(&sk->sk_rcu, __sk_destruct);
1586 	else
1587 		__sk_destruct(&sk->sk_rcu);
1588 }
1589 
1590 static void __sk_free(struct sock *sk)
1591 {
1592 	if (unlikely(sock_diag_has_destroy_listeners(sk) && sk->sk_net_refcnt))
1593 		sock_diag_broadcast_destroy(sk);
1594 	else
1595 		sk_destruct(sk);
1596 }
1597 
1598 void sk_free(struct sock *sk)
1599 {
1600 	/*
1601 	 * We subtract one from sk_wmem_alloc and can know if
1602 	 * some packets are still in some tx queue.
1603 	 * If not null, sock_wfree() will call __sk_free(sk) later
1604 	 */
1605 	if (refcount_dec_and_test(&sk->sk_wmem_alloc))
1606 		__sk_free(sk);
1607 }
1608 EXPORT_SYMBOL(sk_free);
1609 
1610 static void sk_init_common(struct sock *sk)
1611 {
1612 	skb_queue_head_init(&sk->sk_receive_queue);
1613 	skb_queue_head_init(&sk->sk_write_queue);
1614 	skb_queue_head_init(&sk->sk_error_queue);
1615 
1616 	rwlock_init(&sk->sk_callback_lock);
1617 	lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
1618 			af_rlock_keys + sk->sk_family,
1619 			af_family_rlock_key_strings[sk->sk_family]);
1620 	lockdep_set_class_and_name(&sk->sk_write_queue.lock,
1621 			af_wlock_keys + sk->sk_family,
1622 			af_family_wlock_key_strings[sk->sk_family]);
1623 	lockdep_set_class_and_name(&sk->sk_error_queue.lock,
1624 			af_elock_keys + sk->sk_family,
1625 			af_family_elock_key_strings[sk->sk_family]);
1626 	lockdep_set_class_and_name(&sk->sk_callback_lock,
1627 			af_callback_keys + sk->sk_family,
1628 			af_family_clock_key_strings[sk->sk_family]);
1629 }
1630 
1631 /**
1632  *	sk_clone_lock - clone a socket, and lock its clone
1633  *	@sk: the socket to clone
1634  *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1635  *
1636  *	Caller must unlock socket even in error path (bh_unlock_sock(newsk))
1637  */
1638 struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
1639 {
1640 	struct sock *newsk;
1641 	bool is_charged = true;
1642 
1643 	newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
1644 	if (newsk != NULL) {
1645 		struct sk_filter *filter;
1646 
1647 		sock_copy(newsk, sk);
1648 
1649 		/* SANITY */
1650 		if (likely(newsk->sk_net_refcnt))
1651 			get_net(sock_net(newsk));
1652 		sk_node_init(&newsk->sk_node);
1653 		sock_lock_init(newsk);
1654 		bh_lock_sock(newsk);
1655 		newsk->sk_backlog.head	= newsk->sk_backlog.tail = NULL;
1656 		newsk->sk_backlog.len = 0;
1657 
1658 		atomic_set(&newsk->sk_rmem_alloc, 0);
1659 		/*
1660 		 * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
1661 		 */
1662 		refcount_set(&newsk->sk_wmem_alloc, 1);
1663 		atomic_set(&newsk->sk_omem_alloc, 0);
1664 		sk_init_common(newsk);
1665 
1666 		newsk->sk_dst_cache	= NULL;
1667 		newsk->sk_dst_pending_confirm = 0;
1668 		newsk->sk_wmem_queued	= 0;
1669 		newsk->sk_forward_alloc = 0;
1670 		atomic_set(&newsk->sk_drops, 0);
1671 		newsk->sk_send_head	= NULL;
1672 		newsk->sk_userlocks	= sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
1673 
1674 		sock_reset_flag(newsk, SOCK_DONE);
1675 
1676 		filter = rcu_dereference_protected(newsk->sk_filter, 1);
1677 		if (filter != NULL)
1678 			/* though it's an empty new sock, the charging may fail
1679 			 * if sysctl_optmem_max was changed between creation of
1680 			 * original socket and cloning
1681 			 */
1682 			is_charged = sk_filter_charge(newsk, filter);
1683 
1684 		if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
1685 			/* We need to make sure that we don't uncharge the new
1686 			 * socket if we couldn't charge it in the first place
1687 			 * as otherwise we uncharge the parent's filter.
1688 			 */
1689 			if (!is_charged)
1690 				RCU_INIT_POINTER(newsk->sk_filter, NULL);
1691 			sk_free_unlock_clone(newsk);
1692 			newsk = NULL;
1693 			goto out;
1694 		}
1695 		RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
1696 
1697 		newsk->sk_err	   = 0;
1698 		newsk->sk_err_soft = 0;
1699 		newsk->sk_priority = 0;
1700 		newsk->sk_incoming_cpu = raw_smp_processor_id();
1701 		atomic64_set(&newsk->sk_cookie, 0);
1702 
1703 		mem_cgroup_sk_alloc(newsk);
1704 		cgroup_sk_alloc(&newsk->sk_cgrp_data);
1705 
1706 		/*
1707 		 * Before updating sk_refcnt, we must commit prior changes to memory
1708 		 * (Documentation/RCU/rculist_nulls.txt for details)
1709 		 */
1710 		smp_wmb();
1711 		refcount_set(&newsk->sk_refcnt, 2);
1712 
1713 		/*
1714 		 * Increment the counter in the same struct proto as the master
1715 		 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
1716 		 * is the same as sk->sk_prot->socks, as this field was copied
1717 		 * with memcpy).
1718 		 *
1719 		 * This _changes_ the previous behaviour, where
1720 		 * tcp_create_openreq_child always was incrementing the
1721 		 * equivalent to tcp_prot->socks (inet_sock_nr), so this have
1722 		 * to be taken into account in all callers. -acme
1723 		 */
1724 		sk_refcnt_debug_inc(newsk);
1725 		sk_set_socket(newsk, NULL);
1726 		newsk->sk_wq = NULL;
1727 
1728 		if (newsk->sk_prot->sockets_allocated)
1729 			sk_sockets_allocated_inc(newsk);
1730 
1731 		if (sock_needs_netstamp(sk) &&
1732 		    newsk->sk_flags & SK_FLAGS_TIMESTAMP)
1733 			net_enable_timestamp();
1734 	}
1735 out:
1736 	return newsk;
1737 }
1738 EXPORT_SYMBOL_GPL(sk_clone_lock);
1739 
1740 void sk_free_unlock_clone(struct sock *sk)
1741 {
1742 	/* It is still raw copy of parent, so invalidate
1743 	 * destructor and make plain sk_free() */
1744 	sk->sk_destruct = NULL;
1745 	bh_unlock_sock(sk);
1746 	sk_free(sk);
1747 }
1748 EXPORT_SYMBOL_GPL(sk_free_unlock_clone);
1749 
1750 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1751 {
1752 	u32 max_segs = 1;
1753 
1754 	sk_dst_set(sk, dst);
1755 	sk->sk_route_caps = dst->dev->features;
1756 	if (sk->sk_route_caps & NETIF_F_GSO)
1757 		sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
1758 	sk->sk_route_caps &= ~sk->sk_route_nocaps;
1759 	if (sk_can_gso(sk)) {
1760 		if (dst->header_len) {
1761 			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1762 		} else {
1763 			sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
1764 			sk->sk_gso_max_size = dst->dev->gso_max_size;
1765 			max_segs = max_t(u32, dst->dev->gso_max_segs, 1);
1766 		}
1767 	}
1768 	sk->sk_gso_max_segs = max_segs;
1769 }
1770 EXPORT_SYMBOL_GPL(sk_setup_caps);
1771 
1772 /*
1773  *	Simple resource managers for sockets.
1774  */
1775 
1776 
1777 /*
1778  * Write buffer destructor automatically called from kfree_skb.
1779  */
1780 void sock_wfree(struct sk_buff *skb)
1781 {
1782 	struct sock *sk = skb->sk;
1783 	unsigned int len = skb->truesize;
1784 
1785 	if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
1786 		/*
1787 		 * Keep a reference on sk_wmem_alloc, this will be released
1788 		 * after sk_write_space() call
1789 		 */
1790 		WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
1791 		sk->sk_write_space(sk);
1792 		len = 1;
1793 	}
1794 	/*
1795 	 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
1796 	 * could not do because of in-flight packets
1797 	 */
1798 	if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
1799 		__sk_free(sk);
1800 }
1801 EXPORT_SYMBOL(sock_wfree);
1802 
1803 /* This variant of sock_wfree() is used by TCP,
1804  * since it sets SOCK_USE_WRITE_QUEUE.
1805  */
1806 void __sock_wfree(struct sk_buff *skb)
1807 {
1808 	struct sock *sk = skb->sk;
1809 
1810 	if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
1811 		__sk_free(sk);
1812 }
1813 
1814 void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
1815 {
1816 	skb_orphan(skb);
1817 	skb->sk = sk;
1818 #ifdef CONFIG_INET
1819 	if (unlikely(!sk_fullsock(sk))) {
1820 		skb->destructor = sock_edemux;
1821 		sock_hold(sk);
1822 		return;
1823 	}
1824 #endif
1825 	skb->destructor = sock_wfree;
1826 	skb_set_hash_from_sk(skb, sk);
1827 	/*
1828 	 * We used to take a refcount on sk, but following operation
1829 	 * is enough to guarantee sk_free() wont free this sock until
1830 	 * all in-flight packets are completed
1831 	 */
1832 	refcount_add(skb->truesize, &sk->sk_wmem_alloc);
1833 }
1834 EXPORT_SYMBOL(skb_set_owner_w);
1835 
1836 /* This helper is used by netem, as it can hold packets in its
1837  * delay queue. We want to allow the owner socket to send more
1838  * packets, as if they were already TX completed by a typical driver.
1839  * But we also want to keep skb->sk set because some packet schedulers
1840  * rely on it (sch_fq for example).
1841  */
1842 void skb_orphan_partial(struct sk_buff *skb)
1843 {
1844 	if (skb_is_tcp_pure_ack(skb))
1845 		return;
1846 
1847 	if (skb->destructor == sock_wfree
1848 #ifdef CONFIG_INET
1849 	    || skb->destructor == tcp_wfree
1850 #endif
1851 		) {
1852 		struct sock *sk = skb->sk;
1853 
1854 		if (refcount_inc_not_zero(&sk->sk_refcnt)) {
1855 			WARN_ON(refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc));
1856 			skb->destructor = sock_efree;
1857 		}
1858 	} else {
1859 		skb_orphan(skb);
1860 	}
1861 }
1862 EXPORT_SYMBOL(skb_orphan_partial);
1863 
1864 /*
1865  * Read buffer destructor automatically called from kfree_skb.
1866  */
1867 void sock_rfree(struct sk_buff *skb)
1868 {
1869 	struct sock *sk = skb->sk;
1870 	unsigned int len = skb->truesize;
1871 
1872 	atomic_sub(len, &sk->sk_rmem_alloc);
1873 	sk_mem_uncharge(sk, len);
1874 }
1875 EXPORT_SYMBOL(sock_rfree);
1876 
1877 /*
1878  * Buffer destructor for skbs that are not used directly in read or write
1879  * path, e.g. for error handler skbs. Automatically called from kfree_skb.
1880  */
1881 void sock_efree(struct sk_buff *skb)
1882 {
1883 	sock_put(skb->sk);
1884 }
1885 EXPORT_SYMBOL(sock_efree);
1886 
1887 kuid_t sock_i_uid(struct sock *sk)
1888 {
1889 	kuid_t uid;
1890 
1891 	read_lock_bh(&sk->sk_callback_lock);
1892 	uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
1893 	read_unlock_bh(&sk->sk_callback_lock);
1894 	return uid;
1895 }
1896 EXPORT_SYMBOL(sock_i_uid);
1897 
1898 unsigned long sock_i_ino(struct sock *sk)
1899 {
1900 	unsigned long ino;
1901 
1902 	read_lock_bh(&sk->sk_callback_lock);
1903 	ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
1904 	read_unlock_bh(&sk->sk_callback_lock);
1905 	return ino;
1906 }
1907 EXPORT_SYMBOL(sock_i_ino);
1908 
1909 /*
1910  * Allocate a skb from the socket's send buffer.
1911  */
1912 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
1913 			     gfp_t priority)
1914 {
1915 	if (force || refcount_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1916 		struct sk_buff *skb = alloc_skb(size, priority);
1917 		if (skb) {
1918 			skb_set_owner_w(skb, sk);
1919 			return skb;
1920 		}
1921 	}
1922 	return NULL;
1923 }
1924 EXPORT_SYMBOL(sock_wmalloc);
1925 
1926 /*
1927  * Allocate a memory block from the socket's option memory buffer.
1928  */
1929 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
1930 {
1931 	if ((unsigned int)size <= sysctl_optmem_max &&
1932 	    atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
1933 		void *mem;
1934 		/* First do the add, to avoid the race if kmalloc
1935 		 * might sleep.
1936 		 */
1937 		atomic_add(size, &sk->sk_omem_alloc);
1938 		mem = kmalloc(size, priority);
1939 		if (mem)
1940 			return mem;
1941 		atomic_sub(size, &sk->sk_omem_alloc);
1942 	}
1943 	return NULL;
1944 }
1945 EXPORT_SYMBOL(sock_kmalloc);
1946 
1947 /* Free an option memory block. Note, we actually want the inline
1948  * here as this allows gcc to detect the nullify and fold away the
1949  * condition entirely.
1950  */
1951 static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
1952 				  const bool nullify)
1953 {
1954 	if (WARN_ON_ONCE(!mem))
1955 		return;
1956 	if (nullify)
1957 		kzfree(mem);
1958 	else
1959 		kfree(mem);
1960 	atomic_sub(size, &sk->sk_omem_alloc);
1961 }
1962 
1963 void sock_kfree_s(struct sock *sk, void *mem, int size)
1964 {
1965 	__sock_kfree_s(sk, mem, size, false);
1966 }
1967 EXPORT_SYMBOL(sock_kfree_s);
1968 
1969 void sock_kzfree_s(struct sock *sk, void *mem, int size)
1970 {
1971 	__sock_kfree_s(sk, mem, size, true);
1972 }
1973 EXPORT_SYMBOL(sock_kzfree_s);
1974 
1975 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
1976    I think, these locks should be removed for datagram sockets.
1977  */
1978 static long sock_wait_for_wmem(struct sock *sk, long timeo)
1979 {
1980 	DEFINE_WAIT(wait);
1981 
1982 	sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
1983 	for (;;) {
1984 		if (!timeo)
1985 			break;
1986 		if (signal_pending(current))
1987 			break;
1988 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1989 		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1990 		if (refcount_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
1991 			break;
1992 		if (sk->sk_shutdown & SEND_SHUTDOWN)
1993 			break;
1994 		if (sk->sk_err)
1995 			break;
1996 		timeo = schedule_timeout(timeo);
1997 	}
1998 	finish_wait(sk_sleep(sk), &wait);
1999 	return timeo;
2000 }
2001 
2002 
2003 /*
2004  *	Generic send/receive buffer handlers
2005  */
2006 
2007 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
2008 				     unsigned long data_len, int noblock,
2009 				     int *errcode, int max_page_order)
2010 {
2011 	struct sk_buff *skb;
2012 	long timeo;
2013 	int err;
2014 
2015 	timeo = sock_sndtimeo(sk, noblock);
2016 	for (;;) {
2017 		err = sock_error(sk);
2018 		if (err != 0)
2019 			goto failure;
2020 
2021 		err = -EPIPE;
2022 		if (sk->sk_shutdown & SEND_SHUTDOWN)
2023 			goto failure;
2024 
2025 		if (sk_wmem_alloc_get(sk) < sk->sk_sndbuf)
2026 			break;
2027 
2028 		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2029 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2030 		err = -EAGAIN;
2031 		if (!timeo)
2032 			goto failure;
2033 		if (signal_pending(current))
2034 			goto interrupted;
2035 		timeo = sock_wait_for_wmem(sk, timeo);
2036 	}
2037 	skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
2038 				   errcode, sk->sk_allocation);
2039 	if (skb)
2040 		skb_set_owner_w(skb, sk);
2041 	return skb;
2042 
2043 interrupted:
2044 	err = sock_intr_errno(timeo);
2045 failure:
2046 	*errcode = err;
2047 	return NULL;
2048 }
2049 EXPORT_SYMBOL(sock_alloc_send_pskb);
2050 
/*
 * Linear-only wrapper around sock_alloc_send_pskb(): @size bytes of header
 * room, no paged data and a maximum page order of 0.
 */
struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
				    int noblock, int *errcode)
{
	const unsigned long paged_len = 0;
	const int page_order = 0;

	return sock_alloc_send_pskb(sk, size, paged_len, noblock, errcode,
				    page_order);
}
2056 EXPORT_SYMBOL(sock_alloc_send_skb);
2057 
2058 int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg,
2059 		     struct sockcm_cookie *sockc)
2060 {
2061 	u32 tsflags;
2062 
2063 	switch (cmsg->cmsg_type) {
2064 	case SO_MARK:
2065 		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2066 			return -EPERM;
2067 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2068 			return -EINVAL;
2069 		sockc->mark = *(u32 *)CMSG_DATA(cmsg);
2070 		break;
2071 	case SO_TIMESTAMPING:
2072 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2073 			return -EINVAL;
2074 
2075 		tsflags = *(u32 *)CMSG_DATA(cmsg);
2076 		if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
2077 			return -EINVAL;
2078 
2079 		sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
2080 		sockc->tsflags |= tsflags;
2081 		break;
2082 	/* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
2083 	case SCM_RIGHTS:
2084 	case SCM_CREDENTIALS:
2085 		break;
2086 	default:
2087 		return -EINVAL;
2088 	}
2089 	return 0;
2090 }
2091 EXPORT_SYMBOL(__sock_cmsg_send);
2092 
2093 int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
2094 		   struct sockcm_cookie *sockc)
2095 {
2096 	struct cmsghdr *cmsg;
2097 	int ret;
2098 
2099 	for_each_cmsghdr(cmsg, msg) {
2100 		if (!CMSG_OK(msg, cmsg))
2101 			return -EINVAL;
2102 		if (cmsg->cmsg_level != SOL_SOCKET)
2103 			continue;
2104 		ret = __sock_cmsg_send(sk, msg, cmsg, sockc);
2105 		if (ret)
2106 			return ret;
2107 	}
2108 	return 0;
2109 }
2110 EXPORT_SYMBOL(sock_cmsg_send);
2111 
2112 static void sk_enter_memory_pressure(struct sock *sk)
2113 {
2114 	if (!sk->sk_prot->enter_memory_pressure)
2115 		return;
2116 
2117 	sk->sk_prot->enter_memory_pressure(sk);
2118 }
2119 
2120 static void sk_leave_memory_pressure(struct sock *sk)
2121 {
2122 	if (sk->sk_prot->leave_memory_pressure) {
2123 		sk->sk_prot->leave_memory_pressure(sk);
2124 	} else {
2125 		unsigned long *memory_pressure = sk->sk_prot->memory_pressure;
2126 
2127 		if (memory_pressure && *memory_pressure)
2128 			*memory_pressure = 0;
2129 	}
2130 }
2131 
2132 /* On 32bit arches, an skb frag is limited to 2^15 */
2133 #define SKB_FRAG_PAGE_ORDER	get_order(32768)
2134 
2135 /**
2136  * skb_page_frag_refill - check that a page_frag contains enough room
2137  * @sz: minimum size of the fragment we want to get
2138  * @pfrag: pointer to page_frag
2139  * @gfp: priority for memory allocation
2140  *
2141  * Note: While this allocator tries to use high order pages, there is
2142  * no guarantee that allocations succeed. Therefore, @sz MUST be
2143  * less or equal than PAGE_SIZE.
2144  */
2145 bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
2146 {
2147 	if (pfrag->page) {
2148 		if (page_ref_count(pfrag->page) == 1) {
2149 			pfrag->offset = 0;
2150 			return true;
2151 		}
2152 		if (pfrag->offset + sz <= pfrag->size)
2153 			return true;
2154 		put_page(pfrag->page);
2155 	}
2156 
2157 	pfrag->offset = 0;
2158 	if (SKB_FRAG_PAGE_ORDER) {
2159 		/* Avoid direct reclaim but allow kswapd to wake */
2160 		pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
2161 					  __GFP_COMP | __GFP_NOWARN |
2162 					  __GFP_NORETRY,
2163 					  SKB_FRAG_PAGE_ORDER);
2164 		if (likely(pfrag->page)) {
2165 			pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
2166 			return true;
2167 		}
2168 	}
2169 	pfrag->page = alloc_page(gfp);
2170 	if (likely(pfrag->page)) {
2171 		pfrag->size = PAGE_SIZE;
2172 		return true;
2173 	}
2174 	return false;
2175 }
2176 EXPORT_SYMBOL(skb_page_frag_refill);
2177 
2178 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
2179 {
2180 	if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
2181 		return true;
2182 
2183 	sk_enter_memory_pressure(sk);
2184 	sk_stream_moderate_sndbuf(sk);
2185 	return false;
2186 }
2187 EXPORT_SYMBOL(sk_page_frag_refill);
2188 
2189 static void __lock_sock(struct sock *sk)
2190 	__releases(&sk->sk_lock.slock)
2191 	__acquires(&sk->sk_lock.slock)
2192 {
2193 	DEFINE_WAIT(wait);
2194 
2195 	for (;;) {
2196 		prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
2197 					TASK_UNINTERRUPTIBLE);
2198 		spin_unlock_bh(&sk->sk_lock.slock);
2199 		schedule();
2200 		spin_lock_bh(&sk->sk_lock.slock);
2201 		if (!sock_owned_by_user(sk))
2202 			break;
2203 	}
2204 	finish_wait(&sk->sk_lock.wq, &wait);
2205 }
2206 
2207 static void __release_sock(struct sock *sk)
2208 	__releases(&sk->sk_lock.slock)
2209 	__acquires(&sk->sk_lock.slock)
2210 {
2211 	struct sk_buff *skb, *next;
2212 
2213 	while ((skb = sk->sk_backlog.head) != NULL) {
2214 		sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
2215 
2216 		spin_unlock_bh(&sk->sk_lock.slock);
2217 
2218 		do {
2219 			next = skb->next;
2220 			prefetch(next);
2221 			WARN_ON_ONCE(skb_dst_is_noref(skb));
2222 			skb->next = NULL;
2223 			sk_backlog_rcv(sk, skb);
2224 
2225 			cond_resched();
2226 
2227 			skb = next;
2228 		} while (skb != NULL);
2229 
2230 		spin_lock_bh(&sk->sk_lock.slock);
2231 	}
2232 
2233 	/*
2234 	 * Doing the zeroing here guarantee we can not loop forever
2235 	 * while a wild producer attempts to flood us.
2236 	 */
2237 	sk->sk_backlog.len = 0;
2238 }
2239 
2240 void __sk_flush_backlog(struct sock *sk)
2241 {
2242 	spin_lock_bh(&sk->sk_lock.slock);
2243 	__release_sock(sk);
2244 	spin_unlock_bh(&sk->sk_lock.slock);
2245 }
2246 
2247 /**
2248  * sk_wait_data - wait for data to arrive at sk_receive_queue
2249  * @sk:    sock to wait on
2250  * @timeo: for how long
2251  * @skb:   last skb seen on sk_receive_queue
2252  *
2253  * Now socket state including sk->sk_err is changed only under lock,
2254  * hence we may omit checks after joining wait queue.
2255  * We check receive queue before schedule() only as optimization;
2256  * it is very likely that release_sock() added new data.
2257  */
2258 int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
2259 {
2260 	DEFINE_WAIT_FUNC(wait, woken_wake_function);
2261 	int rc;
2262 
2263 	add_wait_queue(sk_sleep(sk), &wait);
2264 	sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2265 	rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
2266 	sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2267 	remove_wait_queue(sk_sleep(sk), &wait);
2268 	return rc;
2269 }
2270 EXPORT_SYMBOL(sk_wait_data);
2271 
2272 /**
2273  *	__sk_mem_raise_allocated - increase memory_allocated
2274  *	@sk: socket
2275  *	@size: memory size to allocate
2276  *	@amt: pages to allocate
2277  *	@kind: allocation type
2278  *
2279  *	Similar to __sk_mem_schedule(), but does not update sk_forward_alloc
2280  */
2281 int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
2282 {
2283 	struct proto *prot = sk->sk_prot;
2284 	long allocated = sk_memory_allocated_add(sk, amt);
2285 
2286 	if (mem_cgroup_sockets_enabled && sk->sk_memcg &&
2287 	    !mem_cgroup_charge_skmem(sk->sk_memcg, amt))
2288 		goto suppress_allocation;
2289 
2290 	/* Under limit. */
2291 	if (allocated <= sk_prot_mem_limits(sk, 0)) {
2292 		sk_leave_memory_pressure(sk);
2293 		return 1;
2294 	}
2295 
2296 	/* Under pressure. */
2297 	if (allocated > sk_prot_mem_limits(sk, 1))
2298 		sk_enter_memory_pressure(sk);
2299 
2300 	/* Over hard limit. */
2301 	if (allocated > sk_prot_mem_limits(sk, 2))
2302 		goto suppress_allocation;
2303 
2304 	/* guarantee minimum buffer size under pressure */
2305 	if (kind == SK_MEM_RECV) {
2306 		if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
2307 			return 1;
2308 
2309 	} else { /* SK_MEM_SEND */
2310 		if (sk->sk_type == SOCK_STREAM) {
2311 			if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
2312 				return 1;
2313 		} else if (refcount_read(&sk->sk_wmem_alloc) <
2314 			   prot->sysctl_wmem[0])
2315 				return 1;
2316 	}
2317 
2318 	if (sk_has_memory_pressure(sk)) {
2319 		int alloc;
2320 
2321 		if (!sk_under_memory_pressure(sk))
2322 			return 1;
2323 		alloc = sk_sockets_allocated_read_positive(sk);
2324 		if (sk_prot_mem_limits(sk, 2) > alloc *
2325 		    sk_mem_pages(sk->sk_wmem_queued +
2326 				 atomic_read(&sk->sk_rmem_alloc) +
2327 				 sk->sk_forward_alloc))
2328 			return 1;
2329 	}
2330 
2331 suppress_allocation:
2332 
2333 	if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
2334 		sk_stream_moderate_sndbuf(sk);
2335 
2336 		/* Fail only if socket is _under_ its sndbuf.
2337 		 * In this case we cannot block, so that we have to fail.
2338 		 */
2339 		if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
2340 			return 1;
2341 	}
2342 
2343 	trace_sock_exceed_buf_limit(sk, prot, allocated);
2344 
2345 	sk_memory_allocated_sub(sk, amt);
2346 
2347 	if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2348 		mem_cgroup_uncharge_skmem(sk->sk_memcg, amt);
2349 
2350 	return 0;
2351 }
2352 EXPORT_SYMBOL(__sk_mem_raise_allocated);
2353 
2354 /**
2355  *	__sk_mem_schedule - increase sk_forward_alloc and memory_allocated
2356  *	@sk: socket
2357  *	@size: memory size to allocate
2358  *	@kind: allocation type
2359  *
2360  *	If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
2361  *	rmem allocation. This function assumes that protocols which have
2362  *	memory_pressure use sk_wmem_queued as write buffer accounting.
2363  */
2364 int __sk_mem_schedule(struct sock *sk, int size, int kind)
2365 {
2366 	int ret, amt = sk_mem_pages(size);
2367 
2368 	sk->sk_forward_alloc += amt << SK_MEM_QUANTUM_SHIFT;
2369 	ret = __sk_mem_raise_allocated(sk, size, amt, kind);
2370 	if (!ret)
2371 		sk->sk_forward_alloc -= amt << SK_MEM_QUANTUM_SHIFT;
2372 	return ret;
2373 }
2374 EXPORT_SYMBOL(__sk_mem_schedule);
2375 
2376 /**
2377  *	__sk_mem_reduce_allocated - reclaim memory_allocated
2378  *	@sk: socket
2379  *	@amount: number of quanta
2380  *
2381  *	Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
2382  */
2383 void __sk_mem_reduce_allocated(struct sock *sk, int amount)
2384 {
2385 	sk_memory_allocated_sub(sk, amount);
2386 
2387 	if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2388 		mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
2389 
2390 	if (sk_under_memory_pressure(sk) &&
2391 	    (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
2392 		sk_leave_memory_pressure(sk);
2393 }
2394 EXPORT_SYMBOL(__sk_mem_reduce_allocated);
2395 
2396 /**
2397  *	__sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
2398  *	@sk: socket
2399  *	@amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple)
2400  */
2401 void __sk_mem_reclaim(struct sock *sk, int amount)
2402 {
2403 	amount >>= SK_MEM_QUANTUM_SHIFT;
2404 	sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
2405 	__sk_mem_reduce_allocated(sk, amount);
2406 }
2407 EXPORT_SYMBOL(__sk_mem_reclaim);
2408 
2409 int sk_set_peek_off(struct sock *sk, int val)
2410 {
2411 	if (val < 0)
2412 		return -EINVAL;
2413 
2414 	sk->sk_peek_off = val;
2415 	return 0;
2416 }
2417 EXPORT_SYMBOL_GPL(sk_set_peek_off);
2418 
2419 /*
2420  * Set of default routines for initialising struct proto_ops when
2421  * the protocol does not support a particular function. In certain
2422  * cases where it makes no sense for a protocol to have a "do nothing"
2423  * function, some default processing is provided.
2424  */
2425 
2426 int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
2427 {
2428 	return -EOPNOTSUPP;
2429 }
2430 EXPORT_SYMBOL(sock_no_bind);
2431 
2432 int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
2433 		    int len, int flags)
2434 {
2435 	return -EOPNOTSUPP;
2436 }
2437 EXPORT_SYMBOL(sock_no_connect);
2438 
2439 int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
2440 {
2441 	return -EOPNOTSUPP;
2442 }
2443 EXPORT_SYMBOL(sock_no_socketpair);
2444 
2445 int sock_no_accept(struct socket *sock, struct socket *newsock, int flags,
2446 		   bool kern)
2447 {
2448 	return -EOPNOTSUPP;
2449 }
2450 EXPORT_SYMBOL(sock_no_accept);
2451 
2452 int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
2453 		    int *len, int peer)
2454 {
2455 	return -EOPNOTSUPP;
2456 }
2457 EXPORT_SYMBOL(sock_no_getname);
2458 
2459 unsigned int sock_no_poll(struct file *file, struct socket *sock, poll_table *pt)
2460 {
2461 	return 0;
2462 }
2463 EXPORT_SYMBOL(sock_no_poll);
2464 
2465 int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2466 {
2467 	return -EOPNOTSUPP;
2468 }
2469 EXPORT_SYMBOL(sock_no_ioctl);
2470 
2471 int sock_no_listen(struct socket *sock, int backlog)
2472 {
2473 	return -EOPNOTSUPP;
2474 }
2475 EXPORT_SYMBOL(sock_no_listen);
2476 
2477 int sock_no_shutdown(struct socket *sock, int how)
2478 {
2479 	return -EOPNOTSUPP;
2480 }
2481 EXPORT_SYMBOL(sock_no_shutdown);
2482 
2483 int sock_no_setsockopt(struct socket *sock, int level, int optname,
2484 		    char __user *optval, unsigned int optlen)
2485 {
2486 	return -EOPNOTSUPP;
2487 }
2488 EXPORT_SYMBOL(sock_no_setsockopt);
2489 
2490 int sock_no_getsockopt(struct socket *sock, int level, int optname,
2491 		    char __user *optval, int __user *optlen)
2492 {
2493 	return -EOPNOTSUPP;
2494 }
2495 EXPORT_SYMBOL(sock_no_getsockopt);
2496 
2497 int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
2498 {
2499 	return -EOPNOTSUPP;
2500 }
2501 EXPORT_SYMBOL(sock_no_sendmsg);
2502 
2503 int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
2504 		    int flags)
2505 {
2506 	return -EOPNOTSUPP;
2507 }
2508 EXPORT_SYMBOL(sock_no_recvmsg);
2509 
2510 int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
2511 {
2512 	/* Mirror missing mmap method error code */
2513 	return -ENODEV;
2514 }
2515 EXPORT_SYMBOL(sock_no_mmap);
2516 
2517 ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
2518 {
2519 	ssize_t res;
2520 	struct msghdr msg = {.msg_flags = flags};
2521 	struct kvec iov;
2522 	char *kaddr = kmap(page);
2523 	iov.iov_base = kaddr + offset;
2524 	iov.iov_len = size;
2525 	res = kernel_sendmsg(sock, &msg, &iov, 1, size);
2526 	kunmap(page);
2527 	return res;
2528 }
2529 EXPORT_SYMBOL(sock_no_sendpage);
2530 
2531 /*
2532  *	Default Socket Callbacks
2533  */
2534 
2535 static void sock_def_wakeup(struct sock *sk)
2536 {
2537 	struct socket_wq *wq;
2538 
2539 	rcu_read_lock();
2540 	wq = rcu_dereference(sk->sk_wq);
2541 	if (skwq_has_sleeper(wq))
2542 		wake_up_interruptible_all(&wq->wait);
2543 	rcu_read_unlock();
2544 }
2545 
2546 static void sock_def_error_report(struct sock *sk)
2547 {
2548 	struct socket_wq *wq;
2549 
2550 	rcu_read_lock();
2551 	wq = rcu_dereference(sk->sk_wq);
2552 	if (skwq_has_sleeper(wq))
2553 		wake_up_interruptible_poll(&wq->wait, POLLERR);
2554 	sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
2555 	rcu_read_unlock();
2556 }
2557 
2558 static void sock_def_readable(struct sock *sk)
2559 {
2560 	struct socket_wq *wq;
2561 
2562 	rcu_read_lock();
2563 	wq = rcu_dereference(sk->sk_wq);
2564 	if (skwq_has_sleeper(wq))
2565 		wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI |
2566 						POLLRDNORM | POLLRDBAND);
2567 	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
2568 	rcu_read_unlock();
2569 }
2570 
2571 static void sock_def_write_space(struct sock *sk)
2572 {
2573 	struct socket_wq *wq;
2574 
2575 	rcu_read_lock();
2576 
2577 	/* Do not wake up a writer until he can make "significant"
2578 	 * progress.  --DaveM
2579 	 */
2580 	if ((refcount_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
2581 		wq = rcu_dereference(sk->sk_wq);
2582 		if (skwq_has_sleeper(wq))
2583 			wake_up_interruptible_sync_poll(&wq->wait, POLLOUT |
2584 						POLLWRNORM | POLLWRBAND);
2585 
2586 		/* Should agree with poll, otherwise some programs break */
2587 		if (sock_writeable(sk))
2588 			sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
2589 	}
2590 
2591 	rcu_read_unlock();
2592 }
2593 
/* Default sk_destruct callback (installed by sock_init_data()): no-op. */
static void sock_def_destruct(struct sock *sk)
{
}
2597 
2598 void sk_send_sigurg(struct sock *sk)
2599 {
2600 	if (sk->sk_socket && sk->sk_socket->file)
2601 		if (send_sigurg(&sk->sk_socket->file->f_owner))
2602 			sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
2603 }
2604 EXPORT_SYMBOL(sk_send_sigurg);
2605 
/*
 * (Re)arm @timer to fire at @expires. A reference on @sk is taken when
 * mod_timer() returns 0 (presumably: the timer was not pending before,
 * so it did not yet hold a reference - confirm against the timer API).
 */
void sk_reset_timer(struct sock *sk, struct timer_list* timer,
		    unsigned long expires)
{
	if (mod_timer(timer, expires) == 0)
		sock_hold(sk);
}
EXPORT_SYMBOL(sk_reset_timer);
2613 
/*
 * Cancel @timer. When del_timer() returns non-zero the reference on @sk
 * is dropped with __sock_put(), balancing sk_reset_timer().
 */
void sk_stop_timer(struct sock *sk, struct timer_list* timer)
{
	if (del_timer(timer) != 0)
		__sock_put(sk);
}
EXPORT_SYMBOL(sk_stop_timer);
2620 
2621 void sock_init_data(struct socket *sock, struct sock *sk)
2622 {
2623 	sk_init_common(sk);
2624 	sk->sk_send_head	=	NULL;
2625 
2626 	init_timer(&sk->sk_timer);
2627 
2628 	sk->sk_allocation	=	GFP_KERNEL;
2629 	sk->sk_rcvbuf		=	sysctl_rmem_default;
2630 	sk->sk_sndbuf		=	sysctl_wmem_default;
2631 	sk->sk_state		=	TCP_CLOSE;
2632 	sk_set_socket(sk, sock);
2633 
2634 	sock_set_flag(sk, SOCK_ZAPPED);
2635 
2636 	if (sock) {
2637 		sk->sk_type	=	sock->type;
2638 		sk->sk_wq	=	sock->wq;
2639 		sock->sk	=	sk;
2640 		sk->sk_uid	=	SOCK_INODE(sock)->i_uid;
2641 	} else {
2642 		sk->sk_wq	=	NULL;
2643 		sk->sk_uid	=	make_kuid(sock_net(sk)->user_ns, 0);
2644 	}
2645 
2646 	rwlock_init(&sk->sk_callback_lock);
2647 	if (sk->sk_kern_sock)
2648 		lockdep_set_class_and_name(
2649 			&sk->sk_callback_lock,
2650 			af_kern_callback_keys + sk->sk_family,
2651 			af_family_kern_clock_key_strings[sk->sk_family]);
2652 	else
2653 		lockdep_set_class_and_name(
2654 			&sk->sk_callback_lock,
2655 			af_callback_keys + sk->sk_family,
2656 			af_family_clock_key_strings[sk->sk_family]);
2657 
2658 	sk->sk_state_change	=	sock_def_wakeup;
2659 	sk->sk_data_ready	=	sock_def_readable;
2660 	sk->sk_write_space	=	sock_def_write_space;
2661 	sk->sk_error_report	=	sock_def_error_report;
2662 	sk->sk_destruct		=	sock_def_destruct;
2663 
2664 	sk->sk_frag.page	=	NULL;
2665 	sk->sk_frag.offset	=	0;
2666 	sk->sk_peek_off		=	-1;
2667 
2668 	sk->sk_peer_pid 	=	NULL;
2669 	sk->sk_peer_cred	=	NULL;
2670 	sk->sk_write_pending	=	0;
2671 	sk->sk_rcvlowat		=	1;
2672 	sk->sk_rcvtimeo		=	MAX_SCHEDULE_TIMEOUT;
2673 	sk->sk_sndtimeo		=	MAX_SCHEDULE_TIMEOUT;
2674 
2675 	sk->sk_stamp = SK_DEFAULT_STAMP;
2676 
2677 #ifdef CONFIG_NET_RX_BUSY_POLL
2678 	sk->sk_napi_id		=	0;
2679 	sk->sk_ll_usec		=	sysctl_net_busy_read;
2680 #endif
2681 
2682 	sk->sk_max_pacing_rate = ~0U;
2683 	sk->sk_pacing_rate = ~0U;
2684 	sk->sk_incoming_cpu = -1;
2685 	/*
2686 	 * Before updating sk_refcnt, we must commit prior changes to memory
2687 	 * (Documentation/RCU/rculist_nulls.txt for details)
2688 	 */
2689 	smp_wmb();
2690 	refcount_set(&sk->sk_refcnt, 1);
2691 	atomic_set(&sk->sk_drops, 0);
2692 }
2693 EXPORT_SYMBOL(sock_init_data);
2694 
2695 void lock_sock_nested(struct sock *sk, int subclass)
2696 {
2697 	might_sleep();
2698 	spin_lock_bh(&sk->sk_lock.slock);
2699 	if (sk->sk_lock.owned)
2700 		__lock_sock(sk);
2701 	sk->sk_lock.owned = 1;
2702 	spin_unlock(&sk->sk_lock.slock);
2703 	/*
2704 	 * The sk_lock has mutex_lock() semantics here:
2705 	 */
2706 	mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
2707 	local_bh_enable();
2708 }
2709 EXPORT_SYMBOL(lock_sock_nested);
2710 
2711 void release_sock(struct sock *sk)
2712 {
2713 	spin_lock_bh(&sk->sk_lock.slock);
2714 	if (sk->sk_backlog.tail)
2715 		__release_sock(sk);
2716 
2717 	/* Warning : release_cb() might need to release sk ownership,
2718 	 * ie call sock_release_ownership(sk) before us.
2719 	 */
2720 	if (sk->sk_prot->release_cb)
2721 		sk->sk_prot->release_cb(sk);
2722 
2723 	sock_release_ownership(sk);
2724 	if (waitqueue_active(&sk->sk_lock.wq))
2725 		wake_up(&sk->sk_lock.wq);
2726 	spin_unlock_bh(&sk->sk_lock.slock);
2727 }
2728 EXPORT_SYMBOL(release_sock);
2729 
2730 /**
2731  * lock_sock_fast - fast version of lock_sock
2732  * @sk: socket
2733  *
2734  * This version should be used for very small section, where process wont block
2735  * return false if fast path is taken:
2736  *
2737  *   sk_lock.slock locked, owned = 0, BH disabled
2738  *
2739  * return true if slow path is taken:
2740  *
2741  *   sk_lock.slock unlocked, owned = 1, BH enabled
2742  */
2743 bool lock_sock_fast(struct sock *sk)
2744 {
2745 	might_sleep();
2746 	spin_lock_bh(&sk->sk_lock.slock);
2747 
2748 	if (!sk->sk_lock.owned)
2749 		/*
2750 		 * Note : We must disable BH
2751 		 */
2752 		return false;
2753 
2754 	__lock_sock(sk);
2755 	sk->sk_lock.owned = 1;
2756 	spin_unlock(&sk->sk_lock.slock);
2757 	/*
2758 	 * The sk_lock has mutex_lock() semantics here:
2759 	 */
2760 	mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
2761 	local_bh_enable();
2762 	return true;
2763 }
2764 EXPORT_SYMBOL(lock_sock_fast);
2765 
2766 int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
2767 {
2768 	struct timeval tv;
2769 	if (!sock_flag(sk, SOCK_TIMESTAMP))
2770 		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2771 	tv = ktime_to_timeval(sk->sk_stamp);
2772 	if (tv.tv_sec == -1)
2773 		return -ENOENT;
2774 	if (tv.tv_sec == 0) {
2775 		sk->sk_stamp = ktime_get_real();
2776 		tv = ktime_to_timeval(sk->sk_stamp);
2777 	}
2778 	return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
2779 }
2780 EXPORT_SYMBOL(sock_get_timestamp);
2781 
2782 int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
2783 {
2784 	struct timespec ts;
2785 	if (!sock_flag(sk, SOCK_TIMESTAMP))
2786 		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2787 	ts = ktime_to_timespec(sk->sk_stamp);
2788 	if (ts.tv_sec == -1)
2789 		return -ENOENT;
2790 	if (ts.tv_sec == 0) {
2791 		sk->sk_stamp = ktime_get_real();
2792 		ts = ktime_to_timespec(sk->sk_stamp);
2793 	}
2794 	return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
2795 }
2796 EXPORT_SYMBOL(sock_get_timestampns);
2797 
2798 void sock_enable_timestamp(struct sock *sk, int flag)
2799 {
2800 	if (!sock_flag(sk, flag)) {
2801 		unsigned long previous_flags = sk->sk_flags;
2802 
2803 		sock_set_flag(sk, flag);
2804 		/*
2805 		 * we just set one of the two flags which require net
2806 		 * time stamping, but time stamping might have been on
2807 		 * already because of the other one
2808 		 */
2809 		if (sock_needs_netstamp(sk) &&
2810 		    !(previous_flags & SK_FLAGS_TIMESTAMP))
2811 			net_enable_timestamp();
2812 	}
2813 }
2814 
2815 int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
2816 		       int level, int type)
2817 {
2818 	struct sock_exterr_skb *serr;
2819 	struct sk_buff *skb;
2820 	int copied, err;
2821 
2822 	err = -EAGAIN;
2823 	skb = sock_dequeue_err_skb(sk);
2824 	if (skb == NULL)
2825 		goto out;
2826 
2827 	copied = skb->len;
2828 	if (copied > len) {
2829 		msg->msg_flags |= MSG_TRUNC;
2830 		copied = len;
2831 	}
2832 	err = skb_copy_datagram_msg(skb, 0, msg, copied);
2833 	if (err)
2834 		goto out_free_skb;
2835 
2836 	sock_recv_timestamp(msg, sk, skb);
2837 
2838 	serr = SKB_EXT_ERR(skb);
2839 	put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
2840 
2841 	msg->msg_flags |= MSG_ERRQUEUE;
2842 	err = copied;
2843 
2844 out_free_skb:
2845 	kfree_skb(skb);
2846 out:
2847 	return err;
2848 }
2849 EXPORT_SYMBOL(sock_recv_errqueue);
2850 
2851 /*
2852  *	Get a socket option on an socket.
2853  *
2854  *	FIX: POSIX 1003.1g is very ambiguous here. It states that
2855  *	asynchronous errors should be reported by getsockopt. We assume
2856  *	this means if you specify SO_ERROR (otherwise whats the point of it).
2857  */
2858 int sock_common_getsockopt(struct socket *sock, int level, int optname,
2859 			   char __user *optval, int __user *optlen)
2860 {
2861 	struct sock *sk = sock->sk;
2862 
2863 	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2864 }
2865 EXPORT_SYMBOL(sock_common_getsockopt);
2866 
#ifdef CONFIG_COMPAT
/*
 * Compat variant of sock_common_getsockopt(): use the protocol's
 * compat_getsockopt method when it has one, else the regular getsockopt.
 */
int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
				  char __user *optval, int __user *optlen)
{
	struct sock *sk = sock->sk;

	if (sk->sk_prot->compat_getsockopt == NULL)
		return sk->sk_prot->getsockopt(sk, level, optname,
					       optval, optlen);

	return sk->sk_prot->compat_getsockopt(sk, level, optname,
					      optval, optlen);
}
EXPORT_SYMBOL(compat_sock_common_getsockopt);
#endif
2880 
2881 int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
2882 			int flags)
2883 {
2884 	struct sock *sk = sock->sk;
2885 	int addr_len = 0;
2886 	int err;
2887 
2888 	err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
2889 				   flags & ~MSG_DONTWAIT, &addr_len);
2890 	if (err >= 0)
2891 		msg->msg_namelen = addr_len;
2892 	return err;
2893 }
2894 EXPORT_SYMBOL(sock_common_recvmsg);
2895 
2896 /*
2897  *	Set socket options on an inet socket.
2898  */
2899 int sock_common_setsockopt(struct socket *sock, int level, int optname,
2900 			   char __user *optval, unsigned int optlen)
2901 {
2902 	struct sock *sk = sock->sk;
2903 
2904 	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2905 }
2906 EXPORT_SYMBOL(sock_common_setsockopt);
2907 
#ifdef CONFIG_COMPAT
/*
 * Compat variant of sock_common_setsockopt(): use the protocol's
 * compat_setsockopt method when it has one, else the regular setsockopt.
 */
int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
				  char __user *optval, unsigned int optlen)
{
	struct sock *sk = sock->sk;

	if (sk->sk_prot->compat_setsockopt == NULL)
		return sk->sk_prot->setsockopt(sk, level, optname,
					       optval, optlen);

	return sk->sk_prot->compat_setsockopt(sk, level, optname,
					      optval, optlen);
}
EXPORT_SYMBOL(compat_sock_common_setsockopt);
#endif
2921 
2922 void sk_common_release(struct sock *sk)
2923 {
2924 	if (sk->sk_prot->destroy)
2925 		sk->sk_prot->destroy(sk);
2926 
2927 	/*
2928 	 * Observation: when sock_common_release is called, processes have
2929 	 * no access to socket. But net still has.
2930 	 * Step one, detach it from networking:
2931 	 *
2932 	 * A. Remove from hash tables.
2933 	 */
2934 
2935 	sk->sk_prot->unhash(sk);
2936 
2937 	/*
2938 	 * In this point socket cannot receive new packets, but it is possible
2939 	 * that some packets are in flight because some CPU runs receiver and
2940 	 * did hash table lookup before we unhashed socket. They will achieve
2941 	 * receive queue and will be purged by socket destructor.
2942 	 *
2943 	 * Also we still have packets pending on receive queue and probably,
2944 	 * our own packets waiting in device queues. sock_destroy will drain
2945 	 * receive queue, but transmitted packets will delay socket destruction
2946 	 * until the last reference will be released.
2947 	 */
2948 
2949 	sock_orphan(sk);
2950 
2951 	xfrm_sk_free_policy(sk);
2952 
2953 	sk_refcnt_debug_release(sk);
2954 
2955 	sock_put(sk);
2956 }
2957 EXPORT_SYMBOL(sk_common_release);
2958 
2959 void sk_get_meminfo(const struct sock *sk, u32 *mem)
2960 {
2961 	memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);
2962 
2963 	mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
2964 	mem[SK_MEMINFO_RCVBUF] = sk->sk_rcvbuf;
2965 	mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
2966 	mem[SK_MEMINFO_SNDBUF] = sk->sk_sndbuf;
2967 	mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc;
2968 	mem[SK_MEMINFO_WMEM_QUEUED] = sk->sk_wmem_queued;
2969 	mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
2970 	mem[SK_MEMINFO_BACKLOG] = sk->sk_backlog.len;
2971 	mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
2972 }
2973 
2974 #ifdef CONFIG_PROC_FS
2975 #define PROTO_INUSE_NR	64	/* should be enough for the first time */
2976 struct prot_inuse {
2977 	int val[PROTO_INUSE_NR];
2978 };
2979 
2980 static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
2981 
2982 #ifdef CONFIG_NET_NS
2983 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2984 {
2985 	__this_cpu_add(net->core.inuse->val[prot->inuse_idx], val);
2986 }
2987 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2988 
2989 int sock_prot_inuse_get(struct net *net, struct proto *prot)
2990 {
2991 	int cpu, idx = prot->inuse_idx;
2992 	int res = 0;
2993 
2994 	for_each_possible_cpu(cpu)
2995 		res += per_cpu_ptr(net->core.inuse, cpu)->val[idx];
2996 
2997 	return res >= 0 ? res : 0;
2998 }
2999 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
3000 
3001 static int __net_init sock_inuse_init_net(struct net *net)
3002 {
3003 	net->core.inuse = alloc_percpu(struct prot_inuse);
3004 	return net->core.inuse ? 0 : -ENOMEM;
3005 }
3006 
3007 static void __net_exit sock_inuse_exit_net(struct net *net)
3008 {
3009 	free_percpu(net->core.inuse);
3010 }
3011 
3012 static struct pernet_operations net_inuse_ops = {
3013 	.init = sock_inuse_init_net,
3014 	.exit = sock_inuse_exit_net,
3015 };
3016 
3017 static __init int net_inuse_init(void)
3018 {
3019 	if (register_pernet_subsys(&net_inuse_ops))
3020 		panic("Cannot initialize net inuse counters");
3021 
3022 	return 0;
3023 }
3024 
3025 core_initcall(net_inuse_init);
3026 #else
3027 static DEFINE_PER_CPU(struct prot_inuse, prot_inuse);
3028 
3029 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
3030 {
3031 	__this_cpu_add(prot_inuse.val[prot->inuse_idx], val);
3032 }
3033 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
3034 
3035 int sock_prot_inuse_get(struct net *net, struct proto *prot)
3036 {
3037 	int cpu, idx = prot->inuse_idx;
3038 	int res = 0;
3039 
3040 	for_each_possible_cpu(cpu)
3041 		res += per_cpu(prot_inuse, cpu).val[idx];
3042 
3043 	return res >= 0 ? res : 0;
3044 }
3045 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
3046 #endif
3047 
3048 static void assign_proto_idx(struct proto *prot)
3049 {
3050 	prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
3051 
3052 	if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
3053 		pr_err("PROTO_INUSE_NR exhausted\n");
3054 		return;
3055 	}
3056 
3057 	set_bit(prot->inuse_idx, proto_inuse_idx);
3058 }
3059 
3060 static void release_proto_idx(struct proto *prot)
3061 {
3062 	if (prot->inuse_idx != PROTO_INUSE_NR - 1)
3063 		clear_bit(prot->inuse_idx, proto_inuse_idx);
3064 }
3065 #else
/* !CONFIG_PROC_FS: no in-use accounting, nothing to assign. */
static inline void assign_proto_idx(struct proto *prot)
{
}
3069 
/* !CONFIG_PROC_FS: no in-use accounting, nothing to release. */
static inline void release_proto_idx(struct proto *prot)
{
}
3073 #endif
3074 
3075 static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
3076 {
3077 	if (!rsk_prot)
3078 		return;
3079 	kfree(rsk_prot->slab_name);
3080 	rsk_prot->slab_name = NULL;
3081 	kmem_cache_destroy(rsk_prot->slab);
3082 	rsk_prot->slab = NULL;
3083 }
3084 
3085 static int req_prot_init(const struct proto *prot)
3086 {
3087 	struct request_sock_ops *rsk_prot = prot->rsk_prot;
3088 
3089 	if (!rsk_prot)
3090 		return 0;
3091 
3092 	rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
3093 					prot->name);
3094 	if (!rsk_prot->slab_name)
3095 		return -ENOMEM;
3096 
3097 	rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
3098 					   rsk_prot->obj_size, 0,
3099 					   prot->slab_flags, NULL);
3100 
3101 	if (!rsk_prot->slab) {
3102 		pr_crit("%s: Can't create request sock SLAB cache!\n",
3103 			prot->name);
3104 		return -ENOMEM;
3105 	}
3106 	return 0;
3107 }
3108 
3109 int proto_register(struct proto *prot, int alloc_slab)
3110 {
3111 	if (alloc_slab) {
3112 		prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
3113 					SLAB_HWCACHE_ALIGN | prot->slab_flags,
3114 					NULL);
3115 
3116 		if (prot->slab == NULL) {
3117 			pr_crit("%s: Can't create sock SLAB cache!\n",
3118 				prot->name);
3119 			goto out;
3120 		}
3121 
3122 		if (req_prot_init(prot))
3123 			goto out_free_request_sock_slab;
3124 
3125 		if (prot->twsk_prot != NULL) {
3126 			prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);
3127 
3128 			if (prot->twsk_prot->twsk_slab_name == NULL)
3129 				goto out_free_request_sock_slab;
3130 
3131 			prot->twsk_prot->twsk_slab =
3132 				kmem_cache_create(prot->twsk_prot->twsk_slab_name,
3133 						  prot->twsk_prot->twsk_obj_size,
3134 						  0,
3135 						  prot->slab_flags,
3136 						  NULL);
3137 			if (prot->twsk_prot->twsk_slab == NULL)
3138 				goto out_free_timewait_sock_slab_name;
3139 		}
3140 	}
3141 
3142 	mutex_lock(&proto_list_mutex);
3143 	list_add(&prot->node, &proto_list);
3144 	assign_proto_idx(prot);
3145 	mutex_unlock(&proto_list_mutex);
3146 	return 0;
3147 
3148 out_free_timewait_sock_slab_name:
3149 	kfree(prot->twsk_prot->twsk_slab_name);
3150 out_free_request_sock_slab:
3151 	req_prot_cleanup(prot->rsk_prot);
3152 
3153 	kmem_cache_destroy(prot->slab);
3154 	prot->slab = NULL;
3155 out:
3156 	return -ENOBUFS;
3157 }
3158 EXPORT_SYMBOL(proto_register);
3159 
3160 void proto_unregister(struct proto *prot)
3161 {
3162 	mutex_lock(&proto_list_mutex);
3163 	release_proto_idx(prot);
3164 	list_del(&prot->node);
3165 	mutex_unlock(&proto_list_mutex);
3166 
3167 	kmem_cache_destroy(prot->slab);
3168 	prot->slab = NULL;
3169 
3170 	req_prot_cleanup(prot->rsk_prot);
3171 
3172 	if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
3173 		kmem_cache_destroy(prot->twsk_prot->twsk_slab);
3174 		kfree(prot->twsk_prot->twsk_slab_name);
3175 		prot->twsk_prot->twsk_slab = NULL;
3176 	}
3177 }
3178 EXPORT_SYMBOL(proto_unregister);
3179 
3180 #ifdef CONFIG_PROC_FS
3181 static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
3182 	__acquires(proto_list_mutex)
3183 {
3184 	mutex_lock(&proto_list_mutex);
3185 	return seq_list_start_head(&proto_list, *pos);
3186 }
3187 
3188 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3189 {
3190 	return seq_list_next(v, &proto_list, pos);
3191 }
3192 
3193 static void proto_seq_stop(struct seq_file *seq, void *v)
3194 	__releases(proto_list_mutex)
3195 {
3196 	mutex_unlock(&proto_list_mutex);
3197 }
3198 
/* Map a method pointer to 'y' (set) or 'n' (NULL) for /proc output. */
static char proto_method_implemented(const void *method)
{
	if (method)
		return 'y';
	return 'n';
}
3203 static long sock_prot_memory_allocated(struct proto *proto)
3204 {
3205 	return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
3206 }
3207 
3208 static char *sock_prot_memory_pressure(struct proto *proto)
3209 {
3210 	return proto->memory_pressure != NULL ?
3211 	proto_memory_pressure(proto) ? "yes" : "no" : "NI";
3212 }
3213 
3214 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
3215 {
3216 
3217 	seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
3218 			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
3219 		   proto->name,
3220 		   proto->obj_size,
3221 		   sock_prot_inuse_get(seq_file_net(seq), proto),
3222 		   sock_prot_memory_allocated(proto),
3223 		   sock_prot_memory_pressure(proto),
3224 		   proto->max_header,
3225 		   proto->slab == NULL ? "no" : "yes",
3226 		   module_name(proto->owner),
3227 		   proto_method_implemented(proto->close),
3228 		   proto_method_implemented(proto->connect),
3229 		   proto_method_implemented(proto->disconnect),
3230 		   proto_method_implemented(proto->accept),
3231 		   proto_method_implemented(proto->ioctl),
3232 		   proto_method_implemented(proto->init),
3233 		   proto_method_implemented(proto->destroy),
3234 		   proto_method_implemented(proto->shutdown),
3235 		   proto_method_implemented(proto->setsockopt),
3236 		   proto_method_implemented(proto->getsockopt),
3237 		   proto_method_implemented(proto->sendmsg),
3238 		   proto_method_implemented(proto->recvmsg),
3239 		   proto_method_implemented(proto->sendpage),
3240 		   proto_method_implemented(proto->bind),
3241 		   proto_method_implemented(proto->backlog_rcv),
3242 		   proto_method_implemented(proto->hash),
3243 		   proto_method_implemented(proto->unhash),
3244 		   proto_method_implemented(proto->get_port),
3245 		   proto_method_implemented(proto->enter_memory_pressure));
3246 }
3247 
3248 static int proto_seq_show(struct seq_file *seq, void *v)
3249 {
3250 	if (v == &proto_list)
3251 		seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
3252 			   "protocol",
3253 			   "size",
3254 			   "sockets",
3255 			   "memory",
3256 			   "press",
3257 			   "maxhdr",
3258 			   "slab",
3259 			   "module",
3260 			   "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
3261 	else
3262 		proto_seq_printf(seq, list_entry(v, struct proto, node));
3263 	return 0;
3264 }
3265 
3266 static const struct seq_operations proto_seq_ops = {
3267 	.start  = proto_seq_start,
3268 	.next   = proto_seq_next,
3269 	.stop   = proto_seq_stop,
3270 	.show   = proto_seq_show,
3271 };
3272 
3273 static int proto_seq_open(struct inode *inode, struct file *file)
3274 {
3275 	return seq_open_net(inode, file, &proto_seq_ops,
3276 			    sizeof(struct seq_net_private));
3277 }
3278 
3279 static const struct file_operations proto_seq_fops = {
3280 	.owner		= THIS_MODULE,
3281 	.open		= proto_seq_open,
3282 	.read		= seq_read,
3283 	.llseek		= seq_lseek,
3284 	.release	= seq_release_net,
3285 };
3286 
3287 static __net_init int proto_init_net(struct net *net)
3288 {
3289 	if (!proc_create("protocols", S_IRUGO, net->proc_net, &proto_seq_fops))
3290 		return -ENOMEM;
3291 
3292 	return 0;
3293 }
3294 
3295 static __net_exit void proto_exit_net(struct net *net)
3296 {
3297 	remove_proc_entry("protocols", net->proc_net);
3298 }
3299 
3300 
3301 static __net_initdata struct pernet_operations proto_net_ops = {
3302 	.init = proto_init_net,
3303 	.exit = proto_exit_net,
3304 };
3305 
3306 static int __init proto_init(void)
3307 {
3308 	return register_pernet_subsys(&proto_net_ops);
3309 }
3310 
3311 subsys_initcall(proto_init);
3312 
3313 #endif /* PROC_FS */
3314 
3315 #ifdef CONFIG_NET_RX_BUSY_POLL
3316 bool sk_busy_loop_end(void *p, unsigned long start_time)
3317 {
3318 	struct sock *sk = p;
3319 
3320 	return !skb_queue_empty(&sk->sk_receive_queue) ||
3321 	       sk_busy_loop_timeout(sk, start_time);
3322 }
3323 EXPORT_SYMBOL(sk_busy_loop_end);
3324 #endif /* CONFIG_NET_RX_BUSY_POLL */
3325