xref: /openbmc/linux/net/core/sock.c (revision 7f2e85840871f199057e65232ebde846192ed989)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		Generic socket support routines. Memory allocators, socket lock/release
7  *		handler for protocols to use and generic option handler.
8  *
9  *
10  * Authors:	Ross Biro
11  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *		Florian La Roche, <flla@stud.uni-sb.de>
13  *		Alan Cox, <A.Cox@swansea.ac.uk>
14  *
15  * Fixes:
16  *		Alan Cox	: 	Numerous verify_area() problems
17  *		Alan Cox	:	Connecting on a connecting socket
18  *					now returns an error for tcp.
19  *		Alan Cox	:	sock->protocol is set correctly.
20  *					and is not sometimes left as 0.
21  *		Alan Cox	:	connect handles icmp errors on a
22  *					connect properly. Unfortunately there
23  *					is a restart syscall nasty there. I
24  *					can't match BSD without hacking the C
25  *					library. Ideas urgently sought!
26  *		Alan Cox	:	Disallow bind() to addresses that are
27  *					not ours - especially broadcast ones!!
28  *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
29  *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
30  *					instead they leave that for the DESTROY timer.
31  *		Alan Cox	:	Clean up error flag in accept
32  *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
33  *					was buggy. Put a remove_sock() in the handler
34  *					for memory when we hit 0. Also altered the timer
35  *					code. The ACK stuff can wait and needs major
36  *					TCP layer surgery.
37  *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
38  *					and fixed timer/inet_bh race.
39  *		Alan Cox	:	Added zapped flag for TCP
40  *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
41  *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
42  *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
43  *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
44  *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
45  *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
46  *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
47  *	Pauline Middelink	:	identd support
48  *		Alan Cox	:	Fixed connect() taking signals I think.
49  *		Alan Cox	:	SO_LINGER supported
50  *		Alan Cox	:	Error reporting fixes
51  *		Anonymous	:	inet_create tidied up (sk->reuse setting)
52  *		Alan Cox	:	inet sockets don't set sk->type!
53  *		Alan Cox	:	Split socket option code
54  *		Alan Cox	:	Callbacks
55  *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
56  *		Alex		:	Removed restriction on inet fioctl
57  *		Alan Cox	:	Splitting INET from NET core
58  *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
59  *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
60  *		Alan Cox	:	Split IP from generic code
61  *		Alan Cox	:	New kfree_skbmem()
62  *		Alan Cox	:	Make SO_DEBUG superuser only.
63  *		Alan Cox	:	Allow anyone to clear SO_DEBUG
64  *					(compatibility fix)
65  *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
66  *		Alan Cox	:	Allocator for a socket is settable.
67  *		Alan Cox	:	SO_ERROR includes soft errors.
68  *		Alan Cox	:	Allow NULL arguments on some SO_ opts
69  *		Alan Cox	: 	Generic socket allocation to make hooks
70  *					easier (suggested by Craig Metz).
71  *		Michael Pall	:	SO_ERROR returns positive errno again
72  *              Steve Whitehouse:       Added default destructor to free
73  *                                      protocol private data.
74  *              Steve Whitehouse:       Added various other default routines
75  *                                      common to several socket families.
76  *              Chris Evans     :       Call suser() check last on F_SETOWN
77  *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
78  *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
79  *		Andi Kleen	:	Fix write_space callback
80  *		Chris Evans	:	Security fixes - signedness again
81  *		Arnaldo C. Melo :       cleanups, use skb_queue_purge
82  *
83  * To Fix:
84  *
85  *
86  *		This program is free software; you can redistribute it and/or
87  *		modify it under the terms of the GNU General Public License
88  *		as published by the Free Software Foundation; either version
89  *		2 of the License, or (at your option) any later version.
90  */
91 
92 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
93 
94 #include <linux/capability.h>
95 #include <linux/errno.h>
96 #include <linux/errqueue.h>
97 #include <linux/types.h>
98 #include <linux/socket.h>
99 #include <linux/in.h>
100 #include <linux/kernel.h>
101 #include <linux/module.h>
102 #include <linux/proc_fs.h>
103 #include <linux/seq_file.h>
104 #include <linux/sched.h>
105 #include <linux/sched/mm.h>
106 #include <linux/timer.h>
107 #include <linux/string.h>
108 #include <linux/sockios.h>
109 #include <linux/net.h>
110 #include <linux/mm.h>
111 #include <linux/slab.h>
112 #include <linux/interrupt.h>
113 #include <linux/poll.h>
114 #include <linux/tcp.h>
115 #include <linux/init.h>
116 #include <linux/highmem.h>
117 #include <linux/user_namespace.h>
118 #include <linux/static_key.h>
119 #include <linux/memcontrol.h>
120 #include <linux/prefetch.h>
121 
122 #include <linux/uaccess.h>
123 
124 #include <linux/netdevice.h>
125 #include <net/protocol.h>
126 #include <linux/skbuff.h>
127 #include <net/net_namespace.h>
128 #include <net/request_sock.h>
129 #include <net/sock.h>
130 #include <linux/net_tstamp.h>
131 #include <net/xfrm.h>
132 #include <linux/ipsec.h>
133 #include <net/cls_cgroup.h>
134 #include <net/netprio_cgroup.h>
135 #include <linux/sock_diag.h>
136 
137 #include <linux/filter.h>
138 #include <net/sock_reuseport.h>
139 
140 #include <trace/events/sock.h>
141 
142 #include <net/tcp.h>
143 #include <net/busy_poll.h>
144 
145 static DEFINE_MUTEX(proto_list_mutex);
146 static LIST_HEAD(proto_list);
147 
148 static void sock_inuse_add(struct net *net, int val);
149 
150 /**
151  * sk_ns_capable - General socket capability test
152  * @sk: Socket to use a capability on or through
153  * @user_ns: The user namespace of the capability to use
154  * @cap: The capability to use
155  *
156  * Test to see if the opener of the socket had when the socket was
157  * created and the current process has the capability @cap in the user
158  * namespace @user_ns.
159  */
160 bool sk_ns_capable(const struct sock *sk,
161 		   struct user_namespace *user_ns, int cap)
162 {
163 	return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
164 		ns_capable(user_ns, cap);
165 }
166 EXPORT_SYMBOL(sk_ns_capable);
167 
168 /**
169  * sk_capable - Socket global capability test
170  * @sk: Socket to use a capability on or through
171  * @cap: The global capability to use
172  *
173  * Test to see if the opener of the socket had when the socket was
174  * created and the current process has the capability @cap in all user
175  * namespaces.
176  */
177 bool sk_capable(const struct sock *sk, int cap)
178 {
179 	return sk_ns_capable(sk, &init_user_ns, cap);
180 }
181 EXPORT_SYMBOL(sk_capable);
182 
183 /**
184  * sk_net_capable - Network namespace socket capability test
185  * @sk: Socket to use a capability on or through
186  * @cap: The capability to use
187  *
188  * Test to see if the opener of the socket had when the socket was created
189  * and the current process has the capability @cap over the network namespace
190  * the socket is a member of.
191  */
192 bool sk_net_capable(const struct sock *sk, int cap)
193 {
194 	return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
195 }
196 EXPORT_SYMBOL(sk_net_capable);
197 
198 /*
199  * Each address family might have different locking rules, so we have
200  * one slock key per address family and separate keys for internal and
201  * userspace sockets.
202  */
203 static struct lock_class_key af_family_keys[AF_MAX];
204 static struct lock_class_key af_family_kern_keys[AF_MAX];
205 static struct lock_class_key af_family_slock_keys[AF_MAX];
206 static struct lock_class_key af_family_kern_slock_keys[AF_MAX];
207 
/*
 * Make lock validator output more readable. (we pre-construct these
 * strings build-time, so that runtime initialization of socket
 * locks is fast):
 *
 * _sock_locks(x) expands to one string literal per address family,
 * each prefixed with @x, in address-family number order and ending
 * with the AF_MAX slot.  Every name table below is generated from
 * this single list so that the per-family names cannot drift apart.
 */

#define _sock_locks(x)						  \
  x "AF_UNSPEC",	x "AF_UNIX"     ,	x "AF_INET"     , \
  x "AF_AX25"  ,	x "AF_IPX"      ,	x "AF_APPLETALK", \
  x "AF_NETROM",	x "AF_BRIDGE"   ,	x "AF_ATMPVC"   , \
  x "AF_X25"   ,	x "AF_INET6"    ,	x "AF_ROSE"     , \
  x "AF_DECnet",	x "AF_NETBEUI"  ,	x "AF_SECURITY" , \
  x "AF_KEY"   ,	x "AF_NETLINK"  ,	x "AF_PACKET"   , \
  x "AF_ASH"   ,	x "AF_ECONET"   ,	x "AF_ATMSVC"   , \
  x "AF_RDS"   ,	x "AF_SNA"      ,	x "AF_IRDA"     , \
  x "AF_PPPOX" ,	x "AF_WANPIPE"  ,	x "AF_LLC"      , \
  x "27"       ,	x "28"          ,	x "AF_CAN"      , \
  x "AF_TIPC"  ,	x "AF_BLUETOOTH",	x "AF_IUCV"     , \
  x "AF_RXRPC" ,	x "AF_ISDN"     ,	x "AF_PHONET"   , \
  x "AF_IEEE802154",	x "AF_CAIF"	,	x "AF_ALG"      , \
  x "AF_NFC"   ,	x "AF_VSOCK"    ,	x "AF_KCM"      , \
  x "AF_QIPCRTR",	x "AF_SMC"	,	x "AF_MAX"

/* Names for sockets created on behalf of userspace. */
static const char *const af_family_key_strings[AF_MAX+1] = {
	_sock_locks("sk_lock-")
};
static const char *const af_family_slock_key_strings[AF_MAX+1] = {
	_sock_locks("slock-")
};
static const char *const af_family_clock_key_strings[AF_MAX+1] = {
	_sock_locks("clock-")
};

/* Names for kernel-internal sockets ("k-" prefix). */
static const char *const af_family_kern_key_strings[AF_MAX+1] = {
	_sock_locks("k-sk_lock-")
};
static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
	_sock_locks("k-slock-")
};
static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
	_sock_locks("k-clock-")
};

/* rlock-/wlock-/elock- names, one table per prefix. */
static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
	_sock_locks("rlock-")
};
static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
	_sock_locks("wlock-")
};
static const char *const af_family_elock_key_strings[AF_MAX+1] = {
	_sock_locks("elock-")
};
301 
302 /*
303  * sk_callback_lock and sk queues locking rules are per-address-family,
304  * so split the lock classes by using a per-AF key:
305  */
306 static struct lock_class_key af_callback_keys[AF_MAX];
307 static struct lock_class_key af_rlock_keys[AF_MAX];
308 static struct lock_class_key af_wlock_keys[AF_MAX];
309 static struct lock_class_key af_elock_keys[AF_MAX];
310 static struct lock_class_key af_kern_callback_keys[AF_MAX];
311 
312 /* Run time adjustable parameters. */
313 __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
314 EXPORT_SYMBOL(sysctl_wmem_max);
315 __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
316 EXPORT_SYMBOL(sysctl_rmem_max);
317 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
318 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
319 
320 /* Maximal space eaten by iovec or ancillary data plus some space */
321 int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
322 EXPORT_SYMBOL(sysctl_optmem_max);
323 
324 int sysctl_tstamp_allow_data __read_mostly = 1;
325 
326 struct static_key memalloc_socks = STATIC_KEY_INIT_FALSE;
327 EXPORT_SYMBOL_GPL(memalloc_socks);
328 
329 /**
330  * sk_set_memalloc - sets %SOCK_MEMALLOC
331  * @sk: socket to set it on
332  *
333  * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
334  * It's the responsibility of the admin to adjust min_free_kbytes
335  * to meet the requirements
336  */
337 void sk_set_memalloc(struct sock *sk)
338 {
339 	sock_set_flag(sk, SOCK_MEMALLOC);
340 	sk->sk_allocation |= __GFP_MEMALLOC;
341 	static_key_slow_inc(&memalloc_socks);
342 }
343 EXPORT_SYMBOL_GPL(sk_set_memalloc);
344 
345 void sk_clear_memalloc(struct sock *sk)
346 {
347 	sock_reset_flag(sk, SOCK_MEMALLOC);
348 	sk->sk_allocation &= ~__GFP_MEMALLOC;
349 	static_key_slow_dec(&memalloc_socks);
350 
351 	/*
352 	 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
353 	 * progress of swapping. SOCK_MEMALLOC may be cleared while
354 	 * it has rmem allocations due to the last swapfile being deactivated
355 	 * but there is a risk that the socket is unusable due to exceeding
356 	 * the rmem limits. Reclaim the reserves and obey rmem limits again.
357 	 */
358 	sk_mem_reclaim(sk);
359 }
360 EXPORT_SYMBOL_GPL(sk_clear_memalloc);
361 
362 int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
363 {
364 	int ret;
365 	unsigned int noreclaim_flag;
366 
367 	/* these should have been dropped before queueing */
368 	BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
369 
370 	noreclaim_flag = memalloc_noreclaim_save();
371 	ret = sk->sk_backlog_rcv(sk, skb);
372 	memalloc_noreclaim_restore(noreclaim_flag);
373 
374 	return ret;
375 }
376 EXPORT_SYMBOL(__sk_backlog_rcv);
377 
378 static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
379 {
380 	struct timeval tv;
381 
382 	if (optlen < sizeof(tv))
383 		return -EINVAL;
384 	if (copy_from_user(&tv, optval, sizeof(tv)))
385 		return -EFAULT;
386 	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
387 		return -EDOM;
388 
389 	if (tv.tv_sec < 0) {
390 		static int warned __read_mostly;
391 
392 		*timeo_p = 0;
393 		if (warned < 10 && net_ratelimit()) {
394 			warned++;
395 			pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
396 				__func__, current->comm, task_pid_nr(current));
397 		}
398 		return 0;
399 	}
400 	*timeo_p = MAX_SCHEDULE_TIMEOUT;
401 	if (tv.tv_sec == 0 && tv.tv_usec == 0)
402 		return 0;
403 	if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
404 		*timeo_p = tv.tv_sec * HZ + DIV_ROUND_UP(tv.tv_usec, USEC_PER_SEC / HZ);
405 	return 0;
406 }
407 
408 static void sock_warn_obsolete_bsdism(const char *name)
409 {
410 	static int warned;
411 	static char warncomm[TASK_COMM_LEN];
412 	if (strcmp(warncomm, current->comm) && warned < 5) {
413 		strcpy(warncomm,  current->comm);
414 		pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n",
415 			warncomm, name);
416 		warned++;
417 	}
418 }
419 
420 static bool sock_needs_netstamp(const struct sock *sk)
421 {
422 	switch (sk->sk_family) {
423 	case AF_UNSPEC:
424 	case AF_UNIX:
425 		return false;
426 	default:
427 		return true;
428 	}
429 }
430 
431 static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
432 {
433 	if (sk->sk_flags & flags) {
434 		sk->sk_flags &= ~flags;
435 		if (sock_needs_netstamp(sk) &&
436 		    !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
437 			net_disable_timestamp();
438 	}
439 }
440 
441 
442 int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
443 {
444 	unsigned long flags;
445 	struct sk_buff_head *list = &sk->sk_receive_queue;
446 
447 	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
448 		atomic_inc(&sk->sk_drops);
449 		trace_sock_rcvqueue_full(sk, skb);
450 		return -ENOMEM;
451 	}
452 
453 	if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
454 		atomic_inc(&sk->sk_drops);
455 		return -ENOBUFS;
456 	}
457 
458 	skb->dev = NULL;
459 	skb_set_owner_r(skb, sk);
460 
461 	/* we escape from rcu protected region, make sure we dont leak
462 	 * a norefcounted dst
463 	 */
464 	skb_dst_force(skb);
465 
466 	spin_lock_irqsave(&list->lock, flags);
467 	sock_skb_set_dropcount(sk, skb);
468 	__skb_queue_tail(list, skb);
469 	spin_unlock_irqrestore(&list->lock, flags);
470 
471 	if (!sock_flag(sk, SOCK_DEAD))
472 		sk->sk_data_ready(sk);
473 	return 0;
474 }
475 EXPORT_SYMBOL(__sock_queue_rcv_skb);
476 
/*
 * Run the socket filter on @skb and, if it passes, queue it with
 * __sock_queue_rcv_skb().  A non-zero result from sk_filter() is
 * returned unchanged.
 */
int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
	int err = sk_filter(sk, skb);

	return err ? err : __sock_queue_rcv_skb(sk, skb);
}
EXPORT_SYMBOL(sock_queue_rcv_skb);
488 
489 int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
490 		     const int nested, unsigned int trim_cap, bool refcounted)
491 {
492 	int rc = NET_RX_SUCCESS;
493 
494 	if (sk_filter_trim_cap(sk, skb, trim_cap))
495 		goto discard_and_relse;
496 
497 	skb->dev = NULL;
498 
499 	if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
500 		atomic_inc(&sk->sk_drops);
501 		goto discard_and_relse;
502 	}
503 	if (nested)
504 		bh_lock_sock_nested(sk);
505 	else
506 		bh_lock_sock(sk);
507 	if (!sock_owned_by_user(sk)) {
508 		/*
509 		 * trylock + unlock semantics:
510 		 */
511 		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
512 
513 		rc = sk_backlog_rcv(sk, skb);
514 
515 		mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
516 	} else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
517 		bh_unlock_sock(sk);
518 		atomic_inc(&sk->sk_drops);
519 		goto discard_and_relse;
520 	}
521 
522 	bh_unlock_sock(sk);
523 out:
524 	if (refcounted)
525 		sock_put(sk);
526 	return rc;
527 discard_and_relse:
528 	kfree_skb(skb);
529 	goto out;
530 }
531 EXPORT_SYMBOL(__sk_receive_skb);
532 
533 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
534 {
535 	struct dst_entry *dst = __sk_dst_get(sk);
536 
537 	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
538 		sk_tx_queue_clear(sk);
539 		sk->sk_dst_pending_confirm = 0;
540 		RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
541 		dst_release(dst);
542 		return NULL;
543 	}
544 
545 	return dst;
546 }
547 EXPORT_SYMBOL(__sk_dst_check);
548 
549 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
550 {
551 	struct dst_entry *dst = sk_dst_get(sk);
552 
553 	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
554 		sk_dst_reset(sk);
555 		dst_release(dst);
556 		return NULL;
557 	}
558 
559 	return dst;
560 }
561 EXPORT_SYMBOL(sk_dst_check);
562 
563 static int sock_setbindtodevice(struct sock *sk, char __user *optval,
564 				int optlen)
565 {
566 	int ret = -ENOPROTOOPT;
567 #ifdef CONFIG_NETDEVICES
568 	struct net *net = sock_net(sk);
569 	char devname[IFNAMSIZ];
570 	int index;
571 
572 	/* Sorry... */
573 	ret = -EPERM;
574 	if (!ns_capable(net->user_ns, CAP_NET_RAW))
575 		goto out;
576 
577 	ret = -EINVAL;
578 	if (optlen < 0)
579 		goto out;
580 
581 	/* Bind this socket to a particular device like "eth0",
582 	 * as specified in the passed interface name. If the
583 	 * name is "" or the option length is zero the socket
584 	 * is not bound.
585 	 */
586 	if (optlen > IFNAMSIZ - 1)
587 		optlen = IFNAMSIZ - 1;
588 	memset(devname, 0, sizeof(devname));
589 
590 	ret = -EFAULT;
591 	if (copy_from_user(devname, optval, optlen))
592 		goto out;
593 
594 	index = 0;
595 	if (devname[0] != '\0') {
596 		struct net_device *dev;
597 
598 		rcu_read_lock();
599 		dev = dev_get_by_name_rcu(net, devname);
600 		if (dev)
601 			index = dev->ifindex;
602 		rcu_read_unlock();
603 		ret = -ENODEV;
604 		if (!dev)
605 			goto out;
606 	}
607 
608 	lock_sock(sk);
609 	sk->sk_bound_dev_if = index;
610 	sk_dst_reset(sk);
611 	release_sock(sk);
612 
613 	ret = 0;
614 
615 out:
616 #endif
617 
618 	return ret;
619 }
620 
621 static int sock_getbindtodevice(struct sock *sk, char __user *optval,
622 				int __user *optlen, int len)
623 {
624 	int ret = -ENOPROTOOPT;
625 #ifdef CONFIG_NETDEVICES
626 	struct net *net = sock_net(sk);
627 	char devname[IFNAMSIZ];
628 
629 	if (sk->sk_bound_dev_if == 0) {
630 		len = 0;
631 		goto zero;
632 	}
633 
634 	ret = -EINVAL;
635 	if (len < IFNAMSIZ)
636 		goto out;
637 
638 	ret = netdev_get_name(net, devname, sk->sk_bound_dev_if);
639 	if (ret)
640 		goto out;
641 
642 	len = strlen(devname) + 1;
643 
644 	ret = -EFAULT;
645 	if (copy_to_user(optval, devname, len))
646 		goto out;
647 
648 zero:
649 	ret = -EFAULT;
650 	if (put_user(len, optlen))
651 		goto out;
652 
653 	ret = 0;
654 
655 out:
656 #endif
657 
658 	return ret;
659 }
660 
/* Set socket flag @bit when @valbool is non-zero, clear it otherwise. */
static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
{
	if (!valbool) {
		sock_reset_flag(sk, bit);
		return;
	}

	sock_set_flag(sk, bit);
}
668 
669 bool sk_mc_loop(struct sock *sk)
670 {
671 	if (dev_recursion_level())
672 		return false;
673 	if (!sk)
674 		return true;
675 	switch (sk->sk_family) {
676 	case AF_INET:
677 		return inet_sk(sk)->mc_loop;
678 #if IS_ENABLED(CONFIG_IPV6)
679 	case AF_INET6:
680 		return inet6_sk(sk)->mc_loop;
681 #endif
682 	}
683 	WARN_ON(1);
684 	return true;
685 }
686 EXPORT_SYMBOL(sk_mc_loop);
687 
688 /*
689  *	This is meant for all protocols to use and covers goings on
690  *	at the socket level. Everything here is generic.
691  */
692 
693 int sock_setsockopt(struct socket *sock, int level, int optname,
694 		    char __user *optval, unsigned int optlen)
695 {
696 	struct sock *sk = sock->sk;
697 	int val;
698 	int valbool;
699 	struct linger ling;
700 	int ret = 0;
701 
702 	/*
703 	 *	Options without arguments
704 	 */
705 
706 	if (optname == SO_BINDTODEVICE)
707 		return sock_setbindtodevice(sk, optval, optlen);
708 
709 	if (optlen < sizeof(int))
710 		return -EINVAL;
711 
712 	if (get_user(val, (int __user *)optval))
713 		return -EFAULT;
714 
715 	valbool = val ? 1 : 0;
716 
717 	lock_sock(sk);
718 
719 	switch (optname) {
720 	case SO_DEBUG:
721 		if (val && !capable(CAP_NET_ADMIN))
722 			ret = -EACCES;
723 		else
724 			sock_valbool_flag(sk, SOCK_DBG, valbool);
725 		break;
726 	case SO_REUSEADDR:
727 		sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
728 		break;
729 	case SO_REUSEPORT:
730 		sk->sk_reuseport = valbool;
731 		break;
732 	case SO_TYPE:
733 	case SO_PROTOCOL:
734 	case SO_DOMAIN:
735 	case SO_ERROR:
736 		ret = -ENOPROTOOPT;
737 		break;
738 	case SO_DONTROUTE:
739 		sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
740 		break;
741 	case SO_BROADCAST:
742 		sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
743 		break;
744 	case SO_SNDBUF:
745 		/* Don't error on this BSD doesn't and if you think
746 		 * about it this is right. Otherwise apps have to
747 		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
748 		 * are treated in BSD as hints
749 		 */
750 		val = min_t(u32, val, sysctl_wmem_max);
751 set_sndbuf:
752 		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
753 		sk->sk_sndbuf = max_t(int, val * 2, SOCK_MIN_SNDBUF);
754 		/* Wake up sending tasks if we upped the value. */
755 		sk->sk_write_space(sk);
756 		break;
757 
758 	case SO_SNDBUFFORCE:
759 		if (!capable(CAP_NET_ADMIN)) {
760 			ret = -EPERM;
761 			break;
762 		}
763 		goto set_sndbuf;
764 
765 	case SO_RCVBUF:
766 		/* Don't error on this BSD doesn't and if you think
767 		 * about it this is right. Otherwise apps have to
768 		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
769 		 * are treated in BSD as hints
770 		 */
771 		val = min_t(u32, val, sysctl_rmem_max);
772 set_rcvbuf:
773 		sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
774 		/*
775 		 * We double it on the way in to account for
776 		 * "struct sk_buff" etc. overhead.   Applications
777 		 * assume that the SO_RCVBUF setting they make will
778 		 * allow that much actual data to be received on that
779 		 * socket.
780 		 *
781 		 * Applications are unaware that "struct sk_buff" and
782 		 * other overheads allocate from the receive buffer
783 		 * during socket buffer allocation.
784 		 *
785 		 * And after considering the possible alternatives,
786 		 * returning the value we actually used in getsockopt
787 		 * is the most desirable behavior.
788 		 */
789 		sk->sk_rcvbuf = max_t(int, val * 2, SOCK_MIN_RCVBUF);
790 		break;
791 
792 	case SO_RCVBUFFORCE:
793 		if (!capable(CAP_NET_ADMIN)) {
794 			ret = -EPERM;
795 			break;
796 		}
797 		goto set_rcvbuf;
798 
799 	case SO_KEEPALIVE:
800 		if (sk->sk_prot->keepalive)
801 			sk->sk_prot->keepalive(sk, valbool);
802 		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
803 		break;
804 
805 	case SO_OOBINLINE:
806 		sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
807 		break;
808 
809 	case SO_NO_CHECK:
810 		sk->sk_no_check_tx = valbool;
811 		break;
812 
813 	case SO_PRIORITY:
814 		if ((val >= 0 && val <= 6) ||
815 		    ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
816 			sk->sk_priority = val;
817 		else
818 			ret = -EPERM;
819 		break;
820 
821 	case SO_LINGER:
822 		if (optlen < sizeof(ling)) {
823 			ret = -EINVAL;	/* 1003.1g */
824 			break;
825 		}
826 		if (copy_from_user(&ling, optval, sizeof(ling))) {
827 			ret = -EFAULT;
828 			break;
829 		}
830 		if (!ling.l_onoff)
831 			sock_reset_flag(sk, SOCK_LINGER);
832 		else {
833 #if (BITS_PER_LONG == 32)
834 			if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
835 				sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
836 			else
837 #endif
838 				sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
839 			sock_set_flag(sk, SOCK_LINGER);
840 		}
841 		break;
842 
843 	case SO_BSDCOMPAT:
844 		sock_warn_obsolete_bsdism("setsockopt");
845 		break;
846 
847 	case SO_PASSCRED:
848 		if (valbool)
849 			set_bit(SOCK_PASSCRED, &sock->flags);
850 		else
851 			clear_bit(SOCK_PASSCRED, &sock->flags);
852 		break;
853 
854 	case SO_TIMESTAMP:
855 	case SO_TIMESTAMPNS:
856 		if (valbool)  {
857 			if (optname == SO_TIMESTAMP)
858 				sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
859 			else
860 				sock_set_flag(sk, SOCK_RCVTSTAMPNS);
861 			sock_set_flag(sk, SOCK_RCVTSTAMP);
862 			sock_enable_timestamp(sk, SOCK_TIMESTAMP);
863 		} else {
864 			sock_reset_flag(sk, SOCK_RCVTSTAMP);
865 			sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
866 		}
867 		break;
868 
869 	case SO_TIMESTAMPING:
870 		if (val & ~SOF_TIMESTAMPING_MASK) {
871 			ret = -EINVAL;
872 			break;
873 		}
874 
875 		if (val & SOF_TIMESTAMPING_OPT_ID &&
876 		    !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
877 			if (sk->sk_protocol == IPPROTO_TCP &&
878 			    sk->sk_type == SOCK_STREAM) {
879 				if ((1 << sk->sk_state) &
880 				    (TCPF_CLOSE | TCPF_LISTEN)) {
881 					ret = -EINVAL;
882 					break;
883 				}
884 				sk->sk_tskey = tcp_sk(sk)->snd_una;
885 			} else {
886 				sk->sk_tskey = 0;
887 			}
888 		}
889 
890 		if (val & SOF_TIMESTAMPING_OPT_STATS &&
891 		    !(val & SOF_TIMESTAMPING_OPT_TSONLY)) {
892 			ret = -EINVAL;
893 			break;
894 		}
895 
896 		sk->sk_tsflags = val;
897 		if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
898 			sock_enable_timestamp(sk,
899 					      SOCK_TIMESTAMPING_RX_SOFTWARE);
900 		else
901 			sock_disable_timestamp(sk,
902 					       (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
903 		break;
904 
905 	case SO_RCVLOWAT:
906 		if (val < 0)
907 			val = INT_MAX;
908 		sk->sk_rcvlowat = val ? : 1;
909 		break;
910 
911 	case SO_RCVTIMEO:
912 		ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
913 		break;
914 
915 	case SO_SNDTIMEO:
916 		ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
917 		break;
918 
919 	case SO_ATTACH_FILTER:
920 		ret = -EINVAL;
921 		if (optlen == sizeof(struct sock_fprog)) {
922 			struct sock_fprog fprog;
923 
924 			ret = -EFAULT;
925 			if (copy_from_user(&fprog, optval, sizeof(fprog)))
926 				break;
927 
928 			ret = sk_attach_filter(&fprog, sk);
929 		}
930 		break;
931 
932 	case SO_ATTACH_BPF:
933 		ret = -EINVAL;
934 		if (optlen == sizeof(u32)) {
935 			u32 ufd;
936 
937 			ret = -EFAULT;
938 			if (copy_from_user(&ufd, optval, sizeof(ufd)))
939 				break;
940 
941 			ret = sk_attach_bpf(ufd, sk);
942 		}
943 		break;
944 
945 	case SO_ATTACH_REUSEPORT_CBPF:
946 		ret = -EINVAL;
947 		if (optlen == sizeof(struct sock_fprog)) {
948 			struct sock_fprog fprog;
949 
950 			ret = -EFAULT;
951 			if (copy_from_user(&fprog, optval, sizeof(fprog)))
952 				break;
953 
954 			ret = sk_reuseport_attach_filter(&fprog, sk);
955 		}
956 		break;
957 
958 	case SO_ATTACH_REUSEPORT_EBPF:
959 		ret = -EINVAL;
960 		if (optlen == sizeof(u32)) {
961 			u32 ufd;
962 
963 			ret = -EFAULT;
964 			if (copy_from_user(&ufd, optval, sizeof(ufd)))
965 				break;
966 
967 			ret = sk_reuseport_attach_bpf(ufd, sk);
968 		}
969 		break;
970 
971 	case SO_DETACH_FILTER:
972 		ret = sk_detach_filter(sk);
973 		break;
974 
975 	case SO_LOCK_FILTER:
976 		if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
977 			ret = -EPERM;
978 		else
979 			sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
980 		break;
981 
982 	case SO_PASSSEC:
983 		if (valbool)
984 			set_bit(SOCK_PASSSEC, &sock->flags);
985 		else
986 			clear_bit(SOCK_PASSSEC, &sock->flags);
987 		break;
988 	case SO_MARK:
989 		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
990 			ret = -EPERM;
991 		else
992 			sk->sk_mark = val;
993 		break;
994 
995 	case SO_RXQ_OVFL:
996 		sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
997 		break;
998 
999 	case SO_WIFI_STATUS:
1000 		sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
1001 		break;
1002 
1003 	case SO_PEEK_OFF:
1004 		if (sock->ops->set_peek_off)
1005 			ret = sock->ops->set_peek_off(sk, val);
1006 		else
1007 			ret = -EOPNOTSUPP;
1008 		break;
1009 
1010 	case SO_NOFCS:
1011 		sock_valbool_flag(sk, SOCK_NOFCS, valbool);
1012 		break;
1013 
1014 	case SO_SELECT_ERR_QUEUE:
1015 		sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
1016 		break;
1017 
1018 #ifdef CONFIG_NET_RX_BUSY_POLL
1019 	case SO_BUSY_POLL:
1020 		/* allow unprivileged users to decrease the value */
1021 		if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
1022 			ret = -EPERM;
1023 		else {
1024 			if (val < 0)
1025 				ret = -EINVAL;
1026 			else
1027 				sk->sk_ll_usec = val;
1028 		}
1029 		break;
1030 #endif
1031 
1032 	case SO_MAX_PACING_RATE:
1033 		if (val != ~0U)
1034 			cmpxchg(&sk->sk_pacing_status,
1035 				SK_PACING_NONE,
1036 				SK_PACING_NEEDED);
1037 		sk->sk_max_pacing_rate = val;
1038 		sk->sk_pacing_rate = min(sk->sk_pacing_rate,
1039 					 sk->sk_max_pacing_rate);
1040 		break;
1041 
1042 	case SO_INCOMING_CPU:
1043 		sk->sk_incoming_cpu = val;
1044 		break;
1045 
1046 	case SO_CNX_ADVICE:
1047 		if (val == 1)
1048 			dst_negative_advice(sk);
1049 		break;
1050 
1051 	case SO_ZEROCOPY:
1052 		if (sk->sk_family != PF_INET && sk->sk_family != PF_INET6)
1053 			ret = -ENOTSUPP;
1054 		else if (sk->sk_protocol != IPPROTO_TCP)
1055 			ret = -ENOTSUPP;
1056 		else if (sk->sk_state != TCP_CLOSE)
1057 			ret = -EBUSY;
1058 		else if (val < 0 || val > 1)
1059 			ret = -EINVAL;
1060 		else
1061 			sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
1062 		break;
1063 
1064 	default:
1065 		ret = -ENOPROTOOPT;
1066 		break;
1067 	}
1068 	release_sock(sk);
1069 	return ret;
1070 }
1071 EXPORT_SYMBOL(sock_setsockopt);
1072 
1073 
1074 static void cred_to_ucred(struct pid *pid, const struct cred *cred,
1075 			  struct ucred *ucred)
1076 {
1077 	ucred->pid = pid_vnr(pid);
1078 	ucred->uid = ucred->gid = -1;
1079 	if (cred) {
1080 		struct user_namespace *current_ns = current_user_ns();
1081 
1082 		ucred->uid = from_kuid_munged(current_ns, cred->euid);
1083 		ucred->gid = from_kgid_munged(current_ns, cred->egid);
1084 	}
1085 }
1086 
1087 static int groups_to_user(gid_t __user *dst, const struct group_info *src)
1088 {
1089 	struct user_namespace *user_ns = current_user_ns();
1090 	int i;
1091 
1092 	for (i = 0; i < src->ngroups; i++)
1093 		if (put_user(from_kgid_munged(user_ns, src->gid[i]), dst + i))
1094 			return -EFAULT;
1095 
1096 	return 0;
1097 }
1098 
/*
 * Generic getsockopt() handler for the option names handled in the switch
 * below.
 *
 * The caller-supplied buffer length is read from @optlen and rejected with
 * -EINVAL when negative.  Most options store their result in the zeroed
 * union 'v' and set 'lv' to the size of that result (sizeof(int) unless a
 * case overrides it); the common tail then copies min(len, lv) bytes to
 * @optval and writes the copied length back through @optlen.
 *
 * A few options copy their own data to user space and jump to 'lenout' to
 * report the length only, or return directly from a helper.
 *
 * @level is not examined here.
 *
 * Returns 0 on success, -EFAULT on a failed user access, -ENOPROTOOPT for
 * an option name not listed, or the option-specific error noted below.
 */
int sock_getsockopt(struct socket *sock, int level, int optname,
		    char __user *optval, int __user *optlen)
{
	struct sock *sk = sock->sk;

	/* Scratch area for every option whose value fits one of these types. */
	union {
		int val;
		u64 val64;
		struct linger ling;
		struct timeval tm;
	} v;

	/* lv: size of the value produced; len: size the caller asked for. */
	int lv = sizeof(int);
	int len;

	if (get_user(len, optlen))
		return -EFAULT;
	if (len < 0)
		return -EINVAL;

	/* Zero the whole union so bytes a case does not set are copied as 0. */
	memset(&v, 0, sizeof(v));

	switch (optname) {
	case SO_DEBUG:
		v.val = sock_flag(sk, SOCK_DBG);
		break;

	case SO_DONTROUTE:
		v.val = sock_flag(sk, SOCK_LOCALROUTE);
		break;

	case SO_BROADCAST:
		v.val = sock_flag(sk, SOCK_BROADCAST);
		break;

	case SO_SNDBUF:
		v.val = sk->sk_sndbuf;
		break;

	case SO_RCVBUF:
		v.val = sk->sk_rcvbuf;
		break;

	case SO_REUSEADDR:
		v.val = sk->sk_reuse;
		break;

	case SO_REUSEPORT:
		v.val = sk->sk_reuseport;
		break;

	case SO_KEEPALIVE:
		v.val = sock_flag(sk, SOCK_KEEPOPEN);
		break;

	case SO_TYPE:
		v.val = sk->sk_type;
		break;

	case SO_PROTOCOL:
		v.val = sk->sk_protocol;
		break;

	case SO_DOMAIN:
		v.val = sk->sk_family;
		break;

	case SO_ERROR:
		/*
		 * Report the negated result of sock_error(); when that is 0,
		 * fall back to sk_err_soft, which is reset to 0 by the xchg().
		 */
		v.val = -sock_error(sk);
		if (v.val == 0)
			v.val = xchg(&sk->sk_err_soft, 0);
		break;

	case SO_OOBINLINE:
		v.val = sock_flag(sk, SOCK_URGINLINE);
		break;

	case SO_NO_CHECK:
		v.val = sk->sk_no_check_tx;
		break;

	case SO_PRIORITY:
		v.val = sk->sk_priority;
		break;

	case SO_LINGER:
		/* sk_lingertime is divided by HZ before being reported. */
		lv		= sizeof(v.ling);
		v.ling.l_onoff	= sock_flag(sk, SOCK_LINGER);
		v.ling.l_linger	= sk->sk_lingertime / HZ;
		break;

	case SO_BSDCOMPAT:
		/* Only warns; the zeroed int in 'v' is what gets returned. */
		sock_warn_obsolete_bsdism("getsockopt");
		break;

	case SO_TIMESTAMP:
		/* True only when RCVTSTAMP is set and RCVTSTAMPNS is not. */
		v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
				!sock_flag(sk, SOCK_RCVTSTAMPNS);
		break;

	case SO_TIMESTAMPNS:
		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
		break;

	case SO_TIMESTAMPING:
		v.val = sk->sk_tsflags;
		break;

	case SO_RCVTIMEO:
		/* MAX_SCHEDULE_TIMEOUT is reported as a zero timeval. */
		lv = sizeof(struct timeval);
		if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
			v.tm.tv_sec = 0;
			v.tm.tv_usec = 0;
		} else {
			v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
			v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * USEC_PER_SEC) / HZ;
		}
		break;

	case SO_SNDTIMEO:
		/* Same conversion as SO_RCVTIMEO, applied to sk_sndtimeo. */
		lv = sizeof(struct timeval);
		if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
			v.tm.tv_sec = 0;
			v.tm.tv_usec = 0;
		} else {
			v.tm.tv_sec = sk->sk_sndtimeo / HZ;
			v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * USEC_PER_SEC) / HZ;
		}
		break;

	case SO_RCVLOWAT:
		v.val = sk->sk_rcvlowat;
		break;

	case SO_SNDLOWAT:
		/* Always reported as 1. */
		v.val = 1;
		break;

	case SO_PASSCRED:
		v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
		break;

	case SO_PEERCRED:
	{
		/* Copies at most sizeof(struct ucred) bytes itself. */
		struct ucred peercred;
		if (len > sizeof(peercred))
			len = sizeof(peercred);
		cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
		if (copy_to_user(optval, &peercred, len))
			return -EFAULT;
		goto lenout;
	}

	case SO_PEERGROUPS:
	{
		int ret, n;

		if (!sk->sk_peer_cred)
			return -ENODATA;

		/*
		 * If the buffer cannot hold all n gids, store the required
		 * size in *optlen and fail with -ERANGE.
		 */
		n = sk->sk_peer_cred->group_info->ngroups;
		if (len < n * sizeof(gid_t)) {
			len = n * sizeof(gid_t);
			return put_user(len, optlen) ? -EFAULT : -ERANGE;
		}
		len = n * sizeof(gid_t);

		ret = groups_to_user((gid_t __user *)optval,
				     sk->sk_peer_cred->group_info);
		if (ret)
			return ret;
		goto lenout;
	}

	case SO_PEERNAME:
	{
		char address[128];

		/*
		 * getname() fills 'address' and stores its length in lv; the
		 * last argument (2) presumably selects the peer address --
		 * confirm against the proto_ops->getname implementations.
		 */
		if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
			return -ENOTCONN;
		/* Refuse requests for more bytes than the address holds. */
		if (lv < len)
			return -EINVAL;
		if (copy_to_user(optval, address, len))
			return -EFAULT;
		goto lenout;
	}

	/* Dubious BSD thing... Probably nobody even uses it, but
	 * the UNIX standard wants it for whatever reason... -DaveM
	 */
	case SO_ACCEPTCONN:
		v.val = sk->sk_state == TCP_LISTEN;
		break;

	case SO_PASSSEC:
		v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
		break;

	case SO_PEERSEC:
		/* Handled entirely by the security hook, including optlen. */
		return security_socket_getpeersec_stream(sock, optval, optlen, len);

	case SO_MARK:
		v.val = sk->sk_mark;
		break;

	case SO_RXQ_OVFL:
		v.val = sock_flag(sk, SOCK_RXQ_OVFL);
		break;

	case SO_WIFI_STATUS:
		v.val = sock_flag(sk, SOCK_WIFI_STATUS);
		break;

	case SO_PEEK_OFF:
		/* Only readable when the protocol provides set_peek_off. */
		if (!sock->ops->set_peek_off)
			return -EOPNOTSUPP;

		v.val = sk->sk_peek_off;
		break;
	case SO_NOFCS:
		v.val = sock_flag(sk, SOCK_NOFCS);
		break;

	case SO_BINDTODEVICE:
		return sock_getbindtodevice(sk, optval, optlen, len);

	case SO_GET_FILTER:
		/* A negative result is an error; otherwise it becomes len. */
		len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
		if (len < 0)
			return len;

		goto lenout;

	case SO_LOCK_FILTER:
		v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
		break;

	case SO_BPF_EXTENSIONS:
		v.val = bpf_tell_extensions();
		break;

	case SO_SELECT_ERR_QUEUE:
		v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
		break;

#ifdef CONFIG_NET_RX_BUSY_POLL
	case SO_BUSY_POLL:
		v.val = sk->sk_ll_usec;
		break;
#endif

	case SO_MAX_PACING_RATE:
		v.val = sk->sk_max_pacing_rate;
		break;

	case SO_INCOMING_CPU:
		v.val = sk->sk_incoming_cpu;
		break;

	case SO_MEMINFO:
	{
		u32 meminfo[SK_MEMINFO_VARS];

		/*
		 * NOTE(review): len is fetched from user space a second time
		 * here, so the "len < 0" check at the top no longer covers
		 * it; the unsigned min_t() below bounds the copy regardless.
		 */
		if (get_user(len, optlen))
			return -EFAULT;

		sk_get_meminfo(sk, meminfo);

		len = min_t(unsigned int, len, sizeof(meminfo));
		if (copy_to_user(optval, &meminfo, len))
			return -EFAULT;

		goto lenout;
	}

#ifdef CONFIG_NET_RX_BUSY_POLL
	case SO_INCOMING_NAPI_ID:
		v.val = READ_ONCE(sk->sk_napi_id);

		/* aggregate non-NAPI IDs down to 0 */
		if (v.val < MIN_NAPI_ID)
			v.val = 0;

		break;
#endif

	case SO_COOKIE:
		/* 64-bit result: the buffer must hold the full value. */
		lv = sizeof(u64);
		if (len < lv)
			return -EINVAL;
		v.val64 = sock_gen_cookie(sk);
		break;

	case SO_ZEROCOPY:
		v.val = sock_flag(sk, SOCK_ZEROCOPY);
		break;

	default:
		/* We implement the SO_SNDLOWAT etc to not be settable
		 * (1003.1g 7).
		 */
		return -ENOPROTOOPT;
	}

	/* Common tail: never copy more than the option actually produced. */
	if (len > lv)
		len = lv;
	if (copy_to_user(optval, &v, len))
		return -EFAULT;
lenout:
	/* Tell the caller how many bytes were written to optval. */
	if (put_user(len, optlen))
		return -EFAULT;
	return 0;
}
1412 
1413 /*
1414  * Initialize an sk_lock.
1415  *
1416  * (We also register the sk_lock with the lock validator.)
1417  */
1418 static inline void sock_lock_init(struct sock *sk)
1419 {
1420 	if (sk->sk_kern_sock)
1421 		sock_lock_init_class_and_name(
1422 			sk,
1423 			af_family_kern_slock_key_strings[sk->sk_family],
1424 			af_family_kern_slock_keys + sk->sk_family,
1425 			af_family_kern_key_strings[sk->sk_family],
1426 			af_family_kern_keys + sk->sk_family);
1427 	else
1428 		sock_lock_init_class_and_name(
1429 			sk,
1430 			af_family_slock_key_strings[sk->sk_family],
1431 			af_family_slock_keys + sk->sk_family,
1432 			af_family_key_strings[sk->sk_family],
1433 			af_family_keys + sk->sk_family);
1434 }
1435 
1436 /*
1437  * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
1438  * even temporarly, because of RCU lookups. sk_node should also be left as is.
1439  * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
1440  */
1441 static void sock_copy(struct sock *nsk, const struct sock *osk)
1442 {
1443 #ifdef CONFIG_SECURITY_NETWORK
1444 	void *sptr = nsk->sk_security;
1445 #endif
1446 	memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1447 
1448 	memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1449 	       osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1450 
1451 #ifdef CONFIG_SECURITY_NETWORK
1452 	nsk->sk_security = sptr;
1453 	security_sk_clone(osk, nsk);
1454 #endif
1455 }
1456 
1457 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1458 		int family)
1459 {
1460 	struct sock *sk;
1461 	struct kmem_cache *slab;
1462 
1463 	slab = prot->slab;
1464 	if (slab != NULL) {
1465 		sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1466 		if (!sk)
1467 			return sk;
1468 		if (priority & __GFP_ZERO)
1469 			sk_prot_clear_nulls(sk, prot->obj_size);
1470 	} else
1471 		sk = kmalloc(prot->obj_size, priority);
1472 
1473 	if (sk != NULL) {
1474 		if (security_sk_alloc(sk, family, priority))
1475 			goto out_free;
1476 
1477 		if (!try_module_get(prot->owner))
1478 			goto out_free_sec;
1479 		sk_tx_queue_clear(sk);
1480 	}
1481 
1482 	return sk;
1483 
1484 out_free_sec:
1485 	security_sk_free(sk);
1486 out_free:
1487 	if (slab != NULL)
1488 		kmem_cache_free(slab, sk);
1489 	else
1490 		kfree(sk);
1491 	return NULL;
1492 }
1493 
1494 static void sk_prot_free(struct proto *prot, struct sock *sk)
1495 {
1496 	struct kmem_cache *slab;
1497 	struct module *owner;
1498 
1499 	owner = prot->owner;
1500 	slab = prot->slab;
1501 
1502 	cgroup_sk_free(&sk->sk_cgrp_data);
1503 	mem_cgroup_sk_free(sk);
1504 	security_sk_free(sk);
1505 	if (slab != NULL)
1506 		kmem_cache_free(slab, sk);
1507 	else
1508 		kfree(sk);
1509 	module_put(owner);
1510 }
1511 
1512 /**
1513  *	sk_alloc - All socket objects are allocated here
1514  *	@net: the applicable net namespace
1515  *	@family: protocol family
1516  *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1517  *	@prot: struct proto associated with this new sock instance
1518  *	@kern: is this to be a kernel socket?
1519  */
1520 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1521 		      struct proto *prot, int kern)
1522 {
1523 	struct sock *sk;
1524 
1525 	sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1526 	if (sk) {
1527 		sk->sk_family = family;
1528 		/*
1529 		 * See comment in struct sock definition to understand
1530 		 * why we need sk_prot_creator -acme
1531 		 */
1532 		sk->sk_prot = sk->sk_prot_creator = prot;
1533 		sk->sk_kern_sock = kern;
1534 		sock_lock_init(sk);
1535 		sk->sk_net_refcnt = kern ? 0 : 1;
1536 		if (likely(sk->sk_net_refcnt)) {
1537 			get_net(net);
1538 			sock_inuse_add(net, 1);
1539 		}
1540 
1541 		sock_net_set(sk, net);
1542 		refcount_set(&sk->sk_wmem_alloc, 1);
1543 
1544 		mem_cgroup_sk_alloc(sk);
1545 		cgroup_sk_alloc(&sk->sk_cgrp_data);
1546 		sock_update_classid(&sk->sk_cgrp_data);
1547 		sock_update_netprioidx(&sk->sk_cgrp_data);
1548 	}
1549 
1550 	return sk;
1551 }
1552 EXPORT_SYMBOL(sk_alloc);
1553 
1554 /* Sockets having SOCK_RCU_FREE will call this function after one RCU
1555  * grace period. This is the case for UDP sockets and TCP listeners.
1556  */
1557 static void __sk_destruct(struct rcu_head *head)
1558 {
1559 	struct sock *sk = container_of(head, struct sock, sk_rcu);
1560 	struct sk_filter *filter;
1561 
1562 	if (sk->sk_destruct)
1563 		sk->sk_destruct(sk);
1564 
1565 	filter = rcu_dereference_check(sk->sk_filter,
1566 				       refcount_read(&sk->sk_wmem_alloc) == 0);
1567 	if (filter) {
1568 		sk_filter_uncharge(sk, filter);
1569 		RCU_INIT_POINTER(sk->sk_filter, NULL);
1570 	}
1571 	if (rcu_access_pointer(sk->sk_reuseport_cb))
1572 		reuseport_detach_sock(sk);
1573 
1574 	sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
1575 
1576 	if (atomic_read(&sk->sk_omem_alloc))
1577 		pr_debug("%s: optmem leakage (%d bytes) detected\n",
1578 			 __func__, atomic_read(&sk->sk_omem_alloc));
1579 
1580 	if (sk->sk_frag.page) {
1581 		put_page(sk->sk_frag.page);
1582 		sk->sk_frag.page = NULL;
1583 	}
1584 
1585 	if (sk->sk_peer_cred)
1586 		put_cred(sk->sk_peer_cred);
1587 	put_pid(sk->sk_peer_pid);
1588 	if (likely(sk->sk_net_refcnt))
1589 		put_net(sock_net(sk));
1590 	sk_prot_free(sk->sk_prot_creator, sk);
1591 }
1592 
1593 void sk_destruct(struct sock *sk)
1594 {
1595 	if (sock_flag(sk, SOCK_RCU_FREE))
1596 		call_rcu(&sk->sk_rcu, __sk_destruct);
1597 	else
1598 		__sk_destruct(&sk->sk_rcu);
1599 }
1600 
1601 static void __sk_free(struct sock *sk)
1602 {
1603 	if (likely(sk->sk_net_refcnt))
1604 		sock_inuse_add(sock_net(sk), -1);
1605 
1606 	if (unlikely(sock_diag_has_destroy_listeners(sk) && sk->sk_net_refcnt))
1607 		sock_diag_broadcast_destroy(sk);
1608 	else
1609 		sk_destruct(sk);
1610 }
1611 
1612 void sk_free(struct sock *sk)
1613 {
1614 	/*
1615 	 * We subtract one from sk_wmem_alloc and can know if
1616 	 * some packets are still in some tx queue.
1617 	 * If not null, sock_wfree() will call __sk_free(sk) later
1618 	 */
1619 	if (refcount_dec_and_test(&sk->sk_wmem_alloc))
1620 		__sk_free(sk);
1621 }
1622 EXPORT_SYMBOL(sk_free);
1623 
1624 static void sk_init_common(struct sock *sk)
1625 {
1626 	skb_queue_head_init(&sk->sk_receive_queue);
1627 	skb_queue_head_init(&sk->sk_write_queue);
1628 	skb_queue_head_init(&sk->sk_error_queue);
1629 
1630 	rwlock_init(&sk->sk_callback_lock);
1631 	lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
1632 			af_rlock_keys + sk->sk_family,
1633 			af_family_rlock_key_strings[sk->sk_family]);
1634 	lockdep_set_class_and_name(&sk->sk_write_queue.lock,
1635 			af_wlock_keys + sk->sk_family,
1636 			af_family_wlock_key_strings[sk->sk_family]);
1637 	lockdep_set_class_and_name(&sk->sk_error_queue.lock,
1638 			af_elock_keys + sk->sk_family,
1639 			af_family_elock_key_strings[sk->sk_family]);
1640 	lockdep_set_class_and_name(&sk->sk_callback_lock,
1641 			af_callback_keys + sk->sk_family,
1642 			af_family_clock_key_strings[sk->sk_family]);
1643 }
1644 
1645 /**
1646  *	sk_clone_lock - clone a socket, and lock its clone
1647  *	@sk: the socket to clone
1648  *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1649  *
1650  *	Caller must unlock socket even in error path (bh_unlock_sock(newsk))
1651  */
1652 struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
1653 {
1654 	struct sock *newsk;
1655 	bool is_charged = true;
1656 
1657 	newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
1658 	if (newsk != NULL) {
1659 		struct sk_filter *filter;
1660 
1661 		sock_copy(newsk, sk);
1662 
1663 		newsk->sk_prot_creator = sk->sk_prot;
1664 
1665 		/* SANITY */
1666 		if (likely(newsk->sk_net_refcnt))
1667 			get_net(sock_net(newsk));
1668 		sk_node_init(&newsk->sk_node);
1669 		sock_lock_init(newsk);
1670 		bh_lock_sock(newsk);
1671 		newsk->sk_backlog.head	= newsk->sk_backlog.tail = NULL;
1672 		newsk->sk_backlog.len = 0;
1673 
1674 		atomic_set(&newsk->sk_rmem_alloc, 0);
1675 		/*
1676 		 * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
1677 		 */
1678 		refcount_set(&newsk->sk_wmem_alloc, 1);
1679 		atomic_set(&newsk->sk_omem_alloc, 0);
1680 		sk_init_common(newsk);
1681 
1682 		newsk->sk_dst_cache	= NULL;
1683 		newsk->sk_dst_pending_confirm = 0;
1684 		newsk->sk_wmem_queued	= 0;
1685 		newsk->sk_forward_alloc = 0;
1686 		atomic_set(&newsk->sk_drops, 0);
1687 		newsk->sk_send_head	= NULL;
1688 		newsk->sk_userlocks	= sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
1689 		atomic_set(&newsk->sk_zckey, 0);
1690 
1691 		sock_reset_flag(newsk, SOCK_DONE);
1692 		mem_cgroup_sk_alloc(newsk);
1693 		cgroup_sk_alloc(&newsk->sk_cgrp_data);
1694 
1695 		rcu_read_lock();
1696 		filter = rcu_dereference(sk->sk_filter);
1697 		if (filter != NULL)
1698 			/* though it's an empty new sock, the charging may fail
1699 			 * if sysctl_optmem_max was changed between creation of
1700 			 * original socket and cloning
1701 			 */
1702 			is_charged = sk_filter_charge(newsk, filter);
1703 		RCU_INIT_POINTER(newsk->sk_filter, filter);
1704 		rcu_read_unlock();
1705 
1706 		if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
1707 			/* We need to make sure that we don't uncharge the new
1708 			 * socket if we couldn't charge it in the first place
1709 			 * as otherwise we uncharge the parent's filter.
1710 			 */
1711 			if (!is_charged)
1712 				RCU_INIT_POINTER(newsk->sk_filter, NULL);
1713 			sk_free_unlock_clone(newsk);
1714 			newsk = NULL;
1715 			goto out;
1716 		}
1717 		RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
1718 
1719 		newsk->sk_err	   = 0;
1720 		newsk->sk_err_soft = 0;
1721 		newsk->sk_priority = 0;
1722 		newsk->sk_incoming_cpu = raw_smp_processor_id();
1723 		atomic64_set(&newsk->sk_cookie, 0);
1724 		if (likely(newsk->sk_net_refcnt))
1725 			sock_inuse_add(sock_net(newsk), 1);
1726 
1727 		/*
1728 		 * Before updating sk_refcnt, we must commit prior changes to memory
1729 		 * (Documentation/RCU/rculist_nulls.txt for details)
1730 		 */
1731 		smp_wmb();
1732 		refcount_set(&newsk->sk_refcnt, 2);
1733 
1734 		/*
1735 		 * Increment the counter in the same struct proto as the master
1736 		 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
1737 		 * is the same as sk->sk_prot->socks, as this field was copied
1738 		 * with memcpy).
1739 		 *
1740 		 * This _changes_ the previous behaviour, where
1741 		 * tcp_create_openreq_child always was incrementing the
1742 		 * equivalent to tcp_prot->socks (inet_sock_nr), so this have
1743 		 * to be taken into account in all callers. -acme
1744 		 */
1745 		sk_refcnt_debug_inc(newsk);
1746 		sk_set_socket(newsk, NULL);
1747 		newsk->sk_wq = NULL;
1748 
1749 		if (newsk->sk_prot->sockets_allocated)
1750 			sk_sockets_allocated_inc(newsk);
1751 
1752 		if (sock_needs_netstamp(sk) &&
1753 		    newsk->sk_flags & SK_FLAGS_TIMESTAMP)
1754 			net_enable_timestamp();
1755 	}
1756 out:
1757 	return newsk;
1758 }
1759 EXPORT_SYMBOL_GPL(sk_clone_lock);
1760 
/*
 * Dispose of a clone that could not be fully set up: clear its destructor,
 * release the bh lock taken on it and drop it through sk_free().
 */
void sk_free_unlock_clone(struct sock *sk)
{
	/* It is still a raw copy of the parent, so invalidate the
	 * destructor and do a plain sk_free() */
	sk->sk_destruct = NULL;
	bh_unlock_sock(sk);
	sk_free(sk);
}
EXPORT_SYMBOL_GPL(sk_free_unlock_clone);
1770 
1771 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1772 {
1773 	u32 max_segs = 1;
1774 
1775 	sk_dst_set(sk, dst);
1776 	sk->sk_route_caps = dst->dev->features;
1777 	if (sk->sk_route_caps & NETIF_F_GSO)
1778 		sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
1779 	sk->sk_route_caps &= ~sk->sk_route_nocaps;
1780 	if (sk_can_gso(sk)) {
1781 		if (dst->header_len && !xfrm_dst_offload_ok(dst)) {
1782 			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1783 		} else {
1784 			sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
1785 			sk->sk_gso_max_size = dst->dev->gso_max_size;
1786 			max_segs = max_t(u32, dst->dev->gso_max_segs, 1);
1787 		}
1788 	}
1789 	sk->sk_gso_max_segs = max_segs;
1790 }
1791 EXPORT_SYMBOL_GPL(sk_setup_caps);
1792 
1793 /*
1794  *	Simple resource managers for sockets.
1795  */
1796 
1797 
1798 /*
1799  * Write buffer destructor automatically called from kfree_skb.
1800  */
1801 void sock_wfree(struct sk_buff *skb)
1802 {
1803 	struct sock *sk = skb->sk;
1804 	unsigned int len = skb->truesize;
1805 
1806 	if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
1807 		/*
1808 		 * Keep a reference on sk_wmem_alloc, this will be released
1809 		 * after sk_write_space() call
1810 		 */
1811 		WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
1812 		sk->sk_write_space(sk);
1813 		len = 1;
1814 	}
1815 	/*
1816 	 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
1817 	 * could not do because of in-flight packets
1818 	 */
1819 	if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
1820 		__sk_free(sk);
1821 }
1822 EXPORT_SYMBOL(sock_wfree);
1823 
1824 /* This variant of sock_wfree() is used by TCP,
1825  * since it sets SOCK_USE_WRITE_QUEUE.
1826  */
1827 void __sock_wfree(struct sk_buff *skb)
1828 {
1829 	struct sock *sk = skb->sk;
1830 
1831 	if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
1832 		__sk_free(sk);
1833 }
1834 
/*
 * Make @sk the owner of @skb on the write side.
 *
 * Any previous owner is dropped with skb_orphan() first.  For a full
 * socket the skb's truesize is added to sk_wmem_alloc and sock_wfree()
 * is installed as destructor; with CONFIG_INET, a non-full socket is
 * instead pinned with sock_hold() and uses sock_edemux() as destructor.
 */
void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
{
	skb_orphan(skb);
	skb->sk = sk;
#ifdef CONFIG_INET
	if (unlikely(!sk_fullsock(sk))) {
		skb->destructor = sock_edemux;
		sock_hold(sk);
		return;
	}
#endif
	skb->destructor = sock_wfree;
	skb_set_hash_from_sk(skb, sk);
	/*
	 * We used to take a refcount on sk, but the following operation
	 * is enough to guarantee sk_free() won't free this sock until
	 * all in-flight packets are completed
	 */
	refcount_add(skb->truesize, &sk->sk_wmem_alloc);
}
EXPORT_SYMBOL(skb_set_owner_w);
1856 
/* This helper is used by netem, as it can hold packets in its
 * delay queue. We want to allow the owner socket to send more
 * packets, as if they were already TX completed by a typical driver.
 * But we also want to keep skb->sk set because some packet schedulers
 * rely on it (sch_fq for example).
 *
 * TCP pure ACKs are left untouched.  For skbs whose destructor is
 * sock_wfree (or tcp_wfree with CONFIG_INET), the write-memory charge is
 * swapped for a plain socket reference; any other skb is fully orphaned.
 */
void skb_orphan_partial(struct sk_buff *skb)
{
	if (skb_is_tcp_pure_ack(skb))
		return;

	if (skb->destructor == sock_wfree
#ifdef CONFIG_INET
	    || skb->destructor == tcp_wfree
#endif
		) {
		struct sock *sk = skb->sk;

		/*
		 * Only convert when a reference on sk can still be taken;
		 * if sk_refcnt is already zero the skb is left as it is.
		 * The new reference is released by sock_efree().
		 */
		if (refcount_inc_not_zero(&sk->sk_refcnt)) {
			WARN_ON(refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc));
			skb->destructor = sock_efree;
		}
	} else {
		skb_orphan(skb);
	}
}
EXPORT_SYMBOL(skb_orphan_partial);
1884 
1885 /*
1886  * Read buffer destructor automatically called from kfree_skb.
1887  */
1888 void sock_rfree(struct sk_buff *skb)
1889 {
1890 	struct sock *sk = skb->sk;
1891 	unsigned int len = skb->truesize;
1892 
1893 	atomic_sub(len, &sk->sk_rmem_alloc);
1894 	sk_mem_uncharge(sk, len);
1895 }
1896 EXPORT_SYMBOL(sock_rfree);
1897 
/*
 * Buffer destructor for skbs that are not used directly in read or write
 * path, e.g. for error handler skbs. Automatically called from kfree_skb.
 *
 * It only drops the socket reference the skb was holding via sock_put().
 */
void sock_efree(struct sk_buff *skb)
{
	sock_put(skb->sk);
}
EXPORT_SYMBOL(sock_efree);
1907 
1908 kuid_t sock_i_uid(struct sock *sk)
1909 {
1910 	kuid_t uid;
1911 
1912 	read_lock_bh(&sk->sk_callback_lock);
1913 	uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
1914 	read_unlock_bh(&sk->sk_callback_lock);
1915 	return uid;
1916 }
1917 EXPORT_SYMBOL(sock_i_uid);
1918 
1919 unsigned long sock_i_ino(struct sock *sk)
1920 {
1921 	unsigned long ino;
1922 
1923 	read_lock_bh(&sk->sk_callback_lock);
1924 	ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
1925 	read_unlock_bh(&sk->sk_callback_lock);
1926 	return ino;
1927 }
1928 EXPORT_SYMBOL(sock_i_ino);
1929 
1930 /*
1931  * Allocate a skb from the socket's send buffer.
1932  */
1933 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
1934 			     gfp_t priority)
1935 {
1936 	if (force || refcount_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1937 		struct sk_buff *skb = alloc_skb(size, priority);
1938 		if (skb) {
1939 			skb_set_owner_w(skb, sk);
1940 			return skb;
1941 		}
1942 	}
1943 	return NULL;
1944 }
1945 EXPORT_SYMBOL(sock_wmalloc);
1946 
1947 static void sock_ofree(struct sk_buff *skb)
1948 {
1949 	struct sock *sk = skb->sk;
1950 
1951 	atomic_sub(skb->truesize, &sk->sk_omem_alloc);
1952 }
1953 
1954 struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
1955 			     gfp_t priority)
1956 {
1957 	struct sk_buff *skb;
1958 
1959 	/* small safe race: SKB_TRUESIZE may differ from final skb->truesize */
1960 	if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) >
1961 	    sysctl_optmem_max)
1962 		return NULL;
1963 
1964 	skb = alloc_skb(size, priority);
1965 	if (!skb)
1966 		return NULL;
1967 
1968 	atomic_add(skb->truesize, &sk->sk_omem_alloc);
1969 	skb->sk = sk;
1970 	skb->destructor = sock_ofree;
1971 	return skb;
1972 }
1973 
1974 /*
1975  * Allocate a memory block from the socket's option memory buffer.
1976  */
1977 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
1978 {
1979 	if ((unsigned int)size <= sysctl_optmem_max &&
1980 	    atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
1981 		void *mem;
1982 		/* First do the add, to avoid the race if kmalloc
1983 		 * might sleep.
1984 		 */
1985 		atomic_add(size, &sk->sk_omem_alloc);
1986 		mem = kmalloc(size, priority);
1987 		if (mem)
1988 			return mem;
1989 		atomic_sub(size, &sk->sk_omem_alloc);
1990 	}
1991 	return NULL;
1992 }
1993 EXPORT_SYMBOL(sock_kmalloc);
1994 
1995 /* Free an option memory block. Note, we actually want the inline
1996  * here as this allows gcc to detect the nullify and fold away the
1997  * condition entirely.
1998  */
1999 static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
2000 				  const bool nullify)
2001 {
2002 	if (WARN_ON_ONCE(!mem))
2003 		return;
2004 	if (nullify)
2005 		kzfree(mem);
2006 	else
2007 		kfree(mem);
2008 	atomic_sub(size, &sk->sk_omem_alloc);
2009 }
2010 
/*
 * Free an option memory block of @size bytes with kfree() and give the
 * size back to the socket's option memory counter.
 */
void sock_kfree_s(struct sock *sk, void *mem, int size)
{
	__sock_kfree_s(sk, mem, size, false);
}
EXPORT_SYMBOL(sock_kfree_s);
2016 
/*
 * Same as sock_kfree_s(), but the block is released with kzfree()
 * instead of kfree().
 */
void sock_kzfree_s(struct sock *sk, void *mem, int size)
{
	__sock_kfree_s(sk, mem, size, true);
}
EXPORT_SYMBOL(sock_kzfree_s);
2022 
2023 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
2024    I think, these locks should be removed for datagram sockets.
2025  */
2026 static long sock_wait_for_wmem(struct sock *sk, long timeo)
2027 {
2028 	DEFINE_WAIT(wait);
2029 
2030 	sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2031 	for (;;) {
2032 		if (!timeo)
2033 			break;
2034 		if (signal_pending(current))
2035 			break;
2036 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2037 		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
2038 		if (refcount_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
2039 			break;
2040 		if (sk->sk_shutdown & SEND_SHUTDOWN)
2041 			break;
2042 		if (sk->sk_err)
2043 			break;
2044 		timeo = schedule_timeout(timeo);
2045 	}
2046 	finish_wait(sk_sleep(sk), &wait);
2047 	return timeo;
2048 }
2049 
2050 
2051 /*
2052  *	Generic send/receive buffer handlers
2053  */
2054 
2055 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
2056 				     unsigned long data_len, int noblock,
2057 				     int *errcode, int max_page_order)
2058 {
2059 	struct sk_buff *skb;
2060 	long timeo;
2061 	int err;
2062 
2063 	timeo = sock_sndtimeo(sk, noblock);
2064 	for (;;) {
2065 		err = sock_error(sk);
2066 		if (err != 0)
2067 			goto failure;
2068 
2069 		err = -EPIPE;
2070 		if (sk->sk_shutdown & SEND_SHUTDOWN)
2071 			goto failure;
2072 
2073 		if (sk_wmem_alloc_get(sk) < sk->sk_sndbuf)
2074 			break;
2075 
2076 		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2077 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2078 		err = -EAGAIN;
2079 		if (!timeo)
2080 			goto failure;
2081 		if (signal_pending(current))
2082 			goto interrupted;
2083 		timeo = sock_wait_for_wmem(sk, timeo);
2084 	}
2085 	skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
2086 				   errcode, sk->sk_allocation);
2087 	if (skb)
2088 		skb_set_owner_w(skb, sk);
2089 	return skb;
2090 
2091 interrupted:
2092 	err = sock_intr_errno(timeo);
2093 failure:
2094 	*errcode = err;
2095 	return NULL;
2096 }
2097 EXPORT_SYMBOL(sock_alloc_send_pskb);
2098 
/*
 * Convenience wrapper around sock_alloc_send_pskb() for a purely linear
 * skb: @size is passed as the header length, with a data length and
 * maximum page order of 0.
 */
struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
				    int noblock, int *errcode)
{
	return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0);
}
EXPORT_SYMBOL(sock_alloc_send_skb);
2105 
2106 int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg,
2107 		     struct sockcm_cookie *sockc)
2108 {
2109 	u32 tsflags;
2110 
2111 	switch (cmsg->cmsg_type) {
2112 	case SO_MARK:
2113 		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2114 			return -EPERM;
2115 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2116 			return -EINVAL;
2117 		sockc->mark = *(u32 *)CMSG_DATA(cmsg);
2118 		break;
2119 	case SO_TIMESTAMPING:
2120 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2121 			return -EINVAL;
2122 
2123 		tsflags = *(u32 *)CMSG_DATA(cmsg);
2124 		if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
2125 			return -EINVAL;
2126 
2127 		sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
2128 		sockc->tsflags |= tsflags;
2129 		break;
2130 	/* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
2131 	case SCM_RIGHTS:
2132 	case SCM_CREDENTIALS:
2133 		break;
2134 	default:
2135 		return -EINVAL;
2136 	}
2137 	return 0;
2138 }
2139 EXPORT_SYMBOL(__sock_cmsg_send);
2140 
2141 int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
2142 		   struct sockcm_cookie *sockc)
2143 {
2144 	struct cmsghdr *cmsg;
2145 	int ret;
2146 
2147 	for_each_cmsghdr(cmsg, msg) {
2148 		if (!CMSG_OK(msg, cmsg))
2149 			return -EINVAL;
2150 		if (cmsg->cmsg_level != SOL_SOCKET)
2151 			continue;
2152 		ret = __sock_cmsg_send(sk, msg, cmsg, sockc);
2153 		if (ret)
2154 			return ret;
2155 	}
2156 	return 0;
2157 }
2158 EXPORT_SYMBOL(sock_cmsg_send);
2159 
2160 static void sk_enter_memory_pressure(struct sock *sk)
2161 {
2162 	if (!sk->sk_prot->enter_memory_pressure)
2163 		return;
2164 
2165 	sk->sk_prot->enter_memory_pressure(sk);
2166 }
2167 
2168 static void sk_leave_memory_pressure(struct sock *sk)
2169 {
2170 	if (sk->sk_prot->leave_memory_pressure) {
2171 		sk->sk_prot->leave_memory_pressure(sk);
2172 	} else {
2173 		unsigned long *memory_pressure = sk->sk_prot->memory_pressure;
2174 
2175 		if (memory_pressure && *memory_pressure)
2176 			*memory_pressure = 0;
2177 	}
2178 }
2179 
2180 /* On 32bit arches, an skb frag is limited to 2^15 */
2181 #define SKB_FRAG_PAGE_ORDER	get_order(32768)
2182 
2183 /**
2184  * skb_page_frag_refill - check that a page_frag contains enough room
2185  * @sz: minimum size of the fragment we want to get
2186  * @pfrag: pointer to page_frag
2187  * @gfp: priority for memory allocation
2188  *
2189  * Note: While this allocator tries to use high order pages, there is
2190  * no guarantee that allocations succeed. Therefore, @sz MUST be
2191  * less or equal than PAGE_SIZE.
2192  */
2193 bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
2194 {
2195 	if (pfrag->page) {
2196 		if (page_ref_count(pfrag->page) == 1) {
2197 			pfrag->offset = 0;
2198 			return true;
2199 		}
2200 		if (pfrag->offset + sz <= pfrag->size)
2201 			return true;
2202 		put_page(pfrag->page);
2203 	}
2204 
2205 	pfrag->offset = 0;
2206 	if (SKB_FRAG_PAGE_ORDER) {
2207 		/* Avoid direct reclaim but allow kswapd to wake */
2208 		pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
2209 					  __GFP_COMP | __GFP_NOWARN |
2210 					  __GFP_NORETRY,
2211 					  SKB_FRAG_PAGE_ORDER);
2212 		if (likely(pfrag->page)) {
2213 			pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
2214 			return true;
2215 		}
2216 	}
2217 	pfrag->page = alloc_page(gfp);
2218 	if (likely(pfrag->page)) {
2219 		pfrag->size = PAGE_SIZE;
2220 		return true;
2221 	}
2222 	return false;
2223 }
2224 EXPORT_SYMBOL(skb_page_frag_refill);
2225 
2226 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
2227 {
2228 	if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
2229 		return true;
2230 
2231 	sk_enter_memory_pressure(sk);
2232 	sk_stream_moderate_sndbuf(sk);
2233 	return false;
2234 }
2235 EXPORT_SYMBOL(sk_page_frag_refill);
2236 
2237 static void __lock_sock(struct sock *sk)
2238 	__releases(&sk->sk_lock.slock)
2239 	__acquires(&sk->sk_lock.slock)
2240 {
2241 	DEFINE_WAIT(wait);
2242 
2243 	for (;;) {
2244 		prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
2245 					TASK_UNINTERRUPTIBLE);
2246 		spin_unlock_bh(&sk->sk_lock.slock);
2247 		schedule();
2248 		spin_lock_bh(&sk->sk_lock.slock);
2249 		if (!sock_owned_by_user(sk))
2250 			break;
2251 	}
2252 	finish_wait(&sk->sk_lock.wq, &wait);
2253 }
2254 
2255 static void __release_sock(struct sock *sk)
2256 	__releases(&sk->sk_lock.slock)
2257 	__acquires(&sk->sk_lock.slock)
2258 {
2259 	struct sk_buff *skb, *next;
2260 
2261 	while ((skb = sk->sk_backlog.head) != NULL) {
2262 		sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
2263 
2264 		spin_unlock_bh(&sk->sk_lock.slock);
2265 
2266 		do {
2267 			next = skb->next;
2268 			prefetch(next);
2269 			WARN_ON_ONCE(skb_dst_is_noref(skb));
2270 			skb->next = NULL;
2271 			sk_backlog_rcv(sk, skb);
2272 
2273 			cond_resched();
2274 
2275 			skb = next;
2276 		} while (skb != NULL);
2277 
2278 		spin_lock_bh(&sk->sk_lock.slock);
2279 	}
2280 
2281 	/*
2282 	 * Doing the zeroing here guarantee we can not loop forever
2283 	 * while a wild producer attempts to flood us.
2284 	 */
2285 	sk->sk_backlog.len = 0;
2286 }
2287 
2288 void __sk_flush_backlog(struct sock *sk)
2289 {
2290 	spin_lock_bh(&sk->sk_lock.slock);
2291 	__release_sock(sk);
2292 	spin_unlock_bh(&sk->sk_lock.slock);
2293 }
2294 
2295 /**
2296  * sk_wait_data - wait for data to arrive at sk_receive_queue
2297  * @sk:    sock to wait on
2298  * @timeo: for how long
2299  * @skb:   last skb seen on sk_receive_queue
2300  *
2301  * Now socket state including sk->sk_err is changed only under lock,
2302  * hence we may omit checks after joining wait queue.
2303  * We check receive queue before schedule() only as optimization;
2304  * it is very likely that release_sock() added new data.
2305  */
2306 int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
2307 {
2308 	DEFINE_WAIT_FUNC(wait, woken_wake_function);
2309 	int rc;
2310 
2311 	add_wait_queue(sk_sleep(sk), &wait);
2312 	sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2313 	rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
2314 	sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2315 	remove_wait_queue(sk_sleep(sk), &wait);
2316 	return rc;
2317 }
2318 EXPORT_SYMBOL(sk_wait_data);
2319 
2320 /**
2321  *	__sk_mem_raise_allocated - increase memory_allocated
2322  *	@sk: socket
2323  *	@size: memory size to allocate
2324  *	@amt: pages to allocate
2325  *	@kind: allocation type
2326  *
2327  *	Similar to __sk_mem_schedule(), but does not update sk_forward_alloc
2328  */
2329 int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
2330 {
2331 	struct proto *prot = sk->sk_prot;
2332 	long allocated = sk_memory_allocated_add(sk, amt);
2333 
2334 	if (mem_cgroup_sockets_enabled && sk->sk_memcg &&
2335 	    !mem_cgroup_charge_skmem(sk->sk_memcg, amt))
2336 		goto suppress_allocation;
2337 
2338 	/* Under limit. */
2339 	if (allocated <= sk_prot_mem_limits(sk, 0)) {
2340 		sk_leave_memory_pressure(sk);
2341 		return 1;
2342 	}
2343 
2344 	/* Under pressure. */
2345 	if (allocated > sk_prot_mem_limits(sk, 1))
2346 		sk_enter_memory_pressure(sk);
2347 
2348 	/* Over hard limit. */
2349 	if (allocated > sk_prot_mem_limits(sk, 2))
2350 		goto suppress_allocation;
2351 
2352 	/* guarantee minimum buffer size under pressure */
2353 	if (kind == SK_MEM_RECV) {
2354 		if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot))
2355 			return 1;
2356 
2357 	} else { /* SK_MEM_SEND */
2358 		int wmem0 = sk_get_wmem0(sk, prot);
2359 
2360 		if (sk->sk_type == SOCK_STREAM) {
2361 			if (sk->sk_wmem_queued < wmem0)
2362 				return 1;
2363 		} else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) {
2364 				return 1;
2365 		}
2366 	}
2367 
2368 	if (sk_has_memory_pressure(sk)) {
2369 		int alloc;
2370 
2371 		if (!sk_under_memory_pressure(sk))
2372 			return 1;
2373 		alloc = sk_sockets_allocated_read_positive(sk);
2374 		if (sk_prot_mem_limits(sk, 2) > alloc *
2375 		    sk_mem_pages(sk->sk_wmem_queued +
2376 				 atomic_read(&sk->sk_rmem_alloc) +
2377 				 sk->sk_forward_alloc))
2378 			return 1;
2379 	}
2380 
2381 suppress_allocation:
2382 
2383 	if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
2384 		sk_stream_moderate_sndbuf(sk);
2385 
2386 		/* Fail only if socket is _under_ its sndbuf.
2387 		 * In this case we cannot block, so that we have to fail.
2388 		 */
2389 		if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
2390 			return 1;
2391 	}
2392 
2393 	trace_sock_exceed_buf_limit(sk, prot, allocated);
2394 
2395 	sk_memory_allocated_sub(sk, amt);
2396 
2397 	if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2398 		mem_cgroup_uncharge_skmem(sk->sk_memcg, amt);
2399 
2400 	return 0;
2401 }
2402 EXPORT_SYMBOL(__sk_mem_raise_allocated);
2403 
2404 /**
2405  *	__sk_mem_schedule - increase sk_forward_alloc and memory_allocated
2406  *	@sk: socket
2407  *	@size: memory size to allocate
2408  *	@kind: allocation type
2409  *
2410  *	If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
2411  *	rmem allocation. This function assumes that protocols which have
2412  *	memory_pressure use sk_wmem_queued as write buffer accounting.
2413  */
2414 int __sk_mem_schedule(struct sock *sk, int size, int kind)
2415 {
2416 	int ret, amt = sk_mem_pages(size);
2417 
2418 	sk->sk_forward_alloc += amt << SK_MEM_QUANTUM_SHIFT;
2419 	ret = __sk_mem_raise_allocated(sk, size, amt, kind);
2420 	if (!ret)
2421 		sk->sk_forward_alloc -= amt << SK_MEM_QUANTUM_SHIFT;
2422 	return ret;
2423 }
2424 EXPORT_SYMBOL(__sk_mem_schedule);
2425 
2426 /**
2427  *	__sk_mem_reduce_allocated - reclaim memory_allocated
2428  *	@sk: socket
2429  *	@amount: number of quanta
2430  *
2431  *	Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
2432  */
2433 void __sk_mem_reduce_allocated(struct sock *sk, int amount)
2434 {
2435 	sk_memory_allocated_sub(sk, amount);
2436 
2437 	if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2438 		mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
2439 
2440 	if (sk_under_memory_pressure(sk) &&
2441 	    (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
2442 		sk_leave_memory_pressure(sk);
2443 }
2444 EXPORT_SYMBOL(__sk_mem_reduce_allocated);
2445 
2446 /**
2447  *	__sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
2448  *	@sk: socket
2449  *	@amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple)
2450  */
2451 void __sk_mem_reclaim(struct sock *sk, int amount)
2452 {
2453 	amount >>= SK_MEM_QUANTUM_SHIFT;
2454 	sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
2455 	__sk_mem_reduce_allocated(sk, amount);
2456 }
2457 EXPORT_SYMBOL(__sk_mem_reclaim);
2458 
2459 int sk_set_peek_off(struct sock *sk, int val)
2460 {
2461 	sk->sk_peek_off = val;
2462 	return 0;
2463 }
2464 EXPORT_SYMBOL_GPL(sk_set_peek_off);
2465 
2466 /*
2467  * Set of default routines for initialising struct proto_ops when
2468  * the protocol does not support a particular function. In certain
2469  * cases where it makes no sense for a protocol to have a "do nothing"
2470  * function, some default processing is provided.
2471  */
2472 
2473 int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
2474 {
2475 	return -EOPNOTSUPP;
2476 }
2477 EXPORT_SYMBOL(sock_no_bind);
2478 
2479 int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
2480 		    int len, int flags)
2481 {
2482 	return -EOPNOTSUPP;
2483 }
2484 EXPORT_SYMBOL(sock_no_connect);
2485 
2486 int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
2487 {
2488 	return -EOPNOTSUPP;
2489 }
2490 EXPORT_SYMBOL(sock_no_socketpair);
2491 
2492 int sock_no_accept(struct socket *sock, struct socket *newsock, int flags,
2493 		   bool kern)
2494 {
2495 	return -EOPNOTSUPP;
2496 }
2497 EXPORT_SYMBOL(sock_no_accept);
2498 
2499 int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
2500 		    int *len, int peer)
2501 {
2502 	return -EOPNOTSUPP;
2503 }
2504 EXPORT_SYMBOL(sock_no_getname);
2505 
2506 __poll_t sock_no_poll(struct file *file, struct socket *sock, poll_table *pt)
2507 {
2508 	return 0;
2509 }
2510 EXPORT_SYMBOL(sock_no_poll);
2511 
2512 int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2513 {
2514 	return -EOPNOTSUPP;
2515 }
2516 EXPORT_SYMBOL(sock_no_ioctl);
2517 
2518 int sock_no_listen(struct socket *sock, int backlog)
2519 {
2520 	return -EOPNOTSUPP;
2521 }
2522 EXPORT_SYMBOL(sock_no_listen);
2523 
2524 int sock_no_shutdown(struct socket *sock, int how)
2525 {
2526 	return -EOPNOTSUPP;
2527 }
2528 EXPORT_SYMBOL(sock_no_shutdown);
2529 
2530 int sock_no_setsockopt(struct socket *sock, int level, int optname,
2531 		    char __user *optval, unsigned int optlen)
2532 {
2533 	return -EOPNOTSUPP;
2534 }
2535 EXPORT_SYMBOL(sock_no_setsockopt);
2536 
2537 int sock_no_getsockopt(struct socket *sock, int level, int optname,
2538 		    char __user *optval, int __user *optlen)
2539 {
2540 	return -EOPNOTSUPP;
2541 }
2542 EXPORT_SYMBOL(sock_no_getsockopt);
2543 
2544 int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
2545 {
2546 	return -EOPNOTSUPP;
2547 }
2548 EXPORT_SYMBOL(sock_no_sendmsg);
2549 
2550 int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len)
2551 {
2552 	return -EOPNOTSUPP;
2553 }
2554 EXPORT_SYMBOL(sock_no_sendmsg_locked);
2555 
2556 int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
2557 		    int flags)
2558 {
2559 	return -EOPNOTSUPP;
2560 }
2561 EXPORT_SYMBOL(sock_no_recvmsg);
2562 
2563 int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
2564 {
2565 	/* Mirror missing mmap method error code */
2566 	return -ENODEV;
2567 }
2568 EXPORT_SYMBOL(sock_no_mmap);
2569 
2570 ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
2571 {
2572 	ssize_t res;
2573 	struct msghdr msg = {.msg_flags = flags};
2574 	struct kvec iov;
2575 	char *kaddr = kmap(page);
2576 	iov.iov_base = kaddr + offset;
2577 	iov.iov_len = size;
2578 	res = kernel_sendmsg(sock, &msg, &iov, 1, size);
2579 	kunmap(page);
2580 	return res;
2581 }
2582 EXPORT_SYMBOL(sock_no_sendpage);
2583 
2584 ssize_t sock_no_sendpage_locked(struct sock *sk, struct page *page,
2585 				int offset, size_t size, int flags)
2586 {
2587 	ssize_t res;
2588 	struct msghdr msg = {.msg_flags = flags};
2589 	struct kvec iov;
2590 	char *kaddr = kmap(page);
2591 
2592 	iov.iov_base = kaddr + offset;
2593 	iov.iov_len = size;
2594 	res = kernel_sendmsg_locked(sk, &msg, &iov, 1, size);
2595 	kunmap(page);
2596 	return res;
2597 }
2598 EXPORT_SYMBOL(sock_no_sendpage_locked);
2599 
2600 /*
2601  *	Default Socket Callbacks
2602  */
2603 
2604 static void sock_def_wakeup(struct sock *sk)
2605 {
2606 	struct socket_wq *wq;
2607 
2608 	rcu_read_lock();
2609 	wq = rcu_dereference(sk->sk_wq);
2610 	if (skwq_has_sleeper(wq))
2611 		wake_up_interruptible_all(&wq->wait);
2612 	rcu_read_unlock();
2613 }
2614 
2615 static void sock_def_error_report(struct sock *sk)
2616 {
2617 	struct socket_wq *wq;
2618 
2619 	rcu_read_lock();
2620 	wq = rcu_dereference(sk->sk_wq);
2621 	if (skwq_has_sleeper(wq))
2622 		wake_up_interruptible_poll(&wq->wait, EPOLLERR);
2623 	sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
2624 	rcu_read_unlock();
2625 }
2626 
2627 static void sock_def_readable(struct sock *sk)
2628 {
2629 	struct socket_wq *wq;
2630 
2631 	rcu_read_lock();
2632 	wq = rcu_dereference(sk->sk_wq);
2633 	if (skwq_has_sleeper(wq))
2634 		wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
2635 						EPOLLRDNORM | EPOLLRDBAND);
2636 	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
2637 	rcu_read_unlock();
2638 }
2639 
2640 static void sock_def_write_space(struct sock *sk)
2641 {
2642 	struct socket_wq *wq;
2643 
2644 	rcu_read_lock();
2645 
2646 	/* Do not wake up a writer until he can make "significant"
2647 	 * progress.  --DaveM
2648 	 */
2649 	if ((refcount_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
2650 		wq = rcu_dereference(sk->sk_wq);
2651 		if (skwq_has_sleeper(wq))
2652 			wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
2653 						EPOLLWRNORM | EPOLLWRBAND);
2654 
2655 		/* Should agree with poll, otherwise some programs break */
2656 		if (sock_writeable(sk))
2657 			sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
2658 	}
2659 
2660 	rcu_read_unlock();
2661 }
2662 
/* Default sk_destruct callback: intentionally does nothing. */
static void sock_def_destruct(struct sock *sk)
{
}
2666 
2667 void sk_send_sigurg(struct sock *sk)
2668 {
2669 	if (sk->sk_socket && sk->sk_socket->file)
2670 		if (send_sigurg(&sk->sk_socket->file->f_owner))
2671 			sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
2672 }
2673 EXPORT_SYMBOL(sk_send_sigurg);
2674 
/*
 * (Re)arm @timer to fire at @expires.  A socket reference is taken
 * only when mod_timer() returns 0.
 */
void sk_reset_timer(struct sock *sk, struct timer_list* timer,
		    unsigned long expires)
{
	if (mod_timer(timer, expires))
		return;

	sock_hold(sk);
}
EXPORT_SYMBOL(sk_reset_timer);
2682 
/*
 * Cancel @timer.  A socket reference is dropped with __sock_put() only
 * when del_timer() returns non-zero.
 */
void sk_stop_timer(struct sock *sk, struct timer_list* timer)
{
	if (!del_timer(timer))
		return;

	__sock_put(sk);
}
EXPORT_SYMBOL(sk_stop_timer);
2689 
2690 void sock_init_data(struct socket *sock, struct sock *sk)
2691 {
2692 	sk_init_common(sk);
2693 	sk->sk_send_head	=	NULL;
2694 
2695 	timer_setup(&sk->sk_timer, NULL, 0);
2696 
2697 	sk->sk_allocation	=	GFP_KERNEL;
2698 	sk->sk_rcvbuf		=	sysctl_rmem_default;
2699 	sk->sk_sndbuf		=	sysctl_wmem_default;
2700 	sk->sk_state		=	TCP_CLOSE;
2701 	sk_set_socket(sk, sock);
2702 
2703 	sock_set_flag(sk, SOCK_ZAPPED);
2704 
2705 	if (sock) {
2706 		sk->sk_type	=	sock->type;
2707 		sk->sk_wq	=	sock->wq;
2708 		sock->sk	=	sk;
2709 		sk->sk_uid	=	SOCK_INODE(sock)->i_uid;
2710 	} else {
2711 		sk->sk_wq	=	NULL;
2712 		sk->sk_uid	=	make_kuid(sock_net(sk)->user_ns, 0);
2713 	}
2714 
2715 	rwlock_init(&sk->sk_callback_lock);
2716 	if (sk->sk_kern_sock)
2717 		lockdep_set_class_and_name(
2718 			&sk->sk_callback_lock,
2719 			af_kern_callback_keys + sk->sk_family,
2720 			af_family_kern_clock_key_strings[sk->sk_family]);
2721 	else
2722 		lockdep_set_class_and_name(
2723 			&sk->sk_callback_lock,
2724 			af_callback_keys + sk->sk_family,
2725 			af_family_clock_key_strings[sk->sk_family]);
2726 
2727 	sk->sk_state_change	=	sock_def_wakeup;
2728 	sk->sk_data_ready	=	sock_def_readable;
2729 	sk->sk_write_space	=	sock_def_write_space;
2730 	sk->sk_error_report	=	sock_def_error_report;
2731 	sk->sk_destruct		=	sock_def_destruct;
2732 
2733 	sk->sk_frag.page	=	NULL;
2734 	sk->sk_frag.offset	=	0;
2735 	sk->sk_peek_off		=	-1;
2736 
2737 	sk->sk_peer_pid 	=	NULL;
2738 	sk->sk_peer_cred	=	NULL;
2739 	sk->sk_write_pending	=	0;
2740 	sk->sk_rcvlowat		=	1;
2741 	sk->sk_rcvtimeo		=	MAX_SCHEDULE_TIMEOUT;
2742 	sk->sk_sndtimeo		=	MAX_SCHEDULE_TIMEOUT;
2743 
2744 	sk->sk_stamp = SK_DEFAULT_STAMP;
2745 	atomic_set(&sk->sk_zckey, 0);
2746 
2747 #ifdef CONFIG_NET_RX_BUSY_POLL
2748 	sk->sk_napi_id		=	0;
2749 	sk->sk_ll_usec		=	sysctl_net_busy_read;
2750 #endif
2751 
2752 	sk->sk_max_pacing_rate = ~0U;
2753 	sk->sk_pacing_rate = ~0U;
2754 	sk->sk_pacing_shift = 10;
2755 	sk->sk_incoming_cpu = -1;
2756 	/*
2757 	 * Before updating sk_refcnt, we must commit prior changes to memory
2758 	 * (Documentation/RCU/rculist_nulls.txt for details)
2759 	 */
2760 	smp_wmb();
2761 	refcount_set(&sk->sk_refcnt, 1);
2762 	atomic_set(&sk->sk_drops, 0);
2763 }
2764 EXPORT_SYMBOL(sock_init_data);
2765 
2766 void lock_sock_nested(struct sock *sk, int subclass)
2767 {
2768 	might_sleep();
2769 	spin_lock_bh(&sk->sk_lock.slock);
2770 	if (sk->sk_lock.owned)
2771 		__lock_sock(sk);
2772 	sk->sk_lock.owned = 1;
2773 	spin_unlock(&sk->sk_lock.slock);
2774 	/*
2775 	 * The sk_lock has mutex_lock() semantics here:
2776 	 */
2777 	mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
2778 	local_bh_enable();
2779 }
2780 EXPORT_SYMBOL(lock_sock_nested);
2781 
2782 void release_sock(struct sock *sk)
2783 {
2784 	spin_lock_bh(&sk->sk_lock.slock);
2785 	if (sk->sk_backlog.tail)
2786 		__release_sock(sk);
2787 
2788 	/* Warning : release_cb() might need to release sk ownership,
2789 	 * ie call sock_release_ownership(sk) before us.
2790 	 */
2791 	if (sk->sk_prot->release_cb)
2792 		sk->sk_prot->release_cb(sk);
2793 
2794 	sock_release_ownership(sk);
2795 	if (waitqueue_active(&sk->sk_lock.wq))
2796 		wake_up(&sk->sk_lock.wq);
2797 	spin_unlock_bh(&sk->sk_lock.slock);
2798 }
2799 EXPORT_SYMBOL(release_sock);
2800 
2801 /**
2802  * lock_sock_fast - fast version of lock_sock
2803  * @sk: socket
2804  *
2805  * This version should be used for very small section, where process wont block
2806  * return false if fast path is taken:
2807  *
2808  *   sk_lock.slock locked, owned = 0, BH disabled
2809  *
2810  * return true if slow path is taken:
2811  *
2812  *   sk_lock.slock unlocked, owned = 1, BH enabled
2813  */
2814 bool lock_sock_fast(struct sock *sk)
2815 {
2816 	might_sleep();
2817 	spin_lock_bh(&sk->sk_lock.slock);
2818 
2819 	if (!sk->sk_lock.owned)
2820 		/*
2821 		 * Note : We must disable BH
2822 		 */
2823 		return false;
2824 
2825 	__lock_sock(sk);
2826 	sk->sk_lock.owned = 1;
2827 	spin_unlock(&sk->sk_lock.slock);
2828 	/*
2829 	 * The sk_lock has mutex_lock() semantics here:
2830 	 */
2831 	mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
2832 	local_bh_enable();
2833 	return true;
2834 }
2835 EXPORT_SYMBOL(lock_sock_fast);
2836 
2837 int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
2838 {
2839 	struct timeval tv;
2840 	if (!sock_flag(sk, SOCK_TIMESTAMP))
2841 		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2842 	tv = ktime_to_timeval(sk->sk_stamp);
2843 	if (tv.tv_sec == -1)
2844 		return -ENOENT;
2845 	if (tv.tv_sec == 0) {
2846 		sk->sk_stamp = ktime_get_real();
2847 		tv = ktime_to_timeval(sk->sk_stamp);
2848 	}
2849 	return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
2850 }
2851 EXPORT_SYMBOL(sock_get_timestamp);
2852 
2853 int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
2854 {
2855 	struct timespec ts;
2856 	if (!sock_flag(sk, SOCK_TIMESTAMP))
2857 		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2858 	ts = ktime_to_timespec(sk->sk_stamp);
2859 	if (ts.tv_sec == -1)
2860 		return -ENOENT;
2861 	if (ts.tv_sec == 0) {
2862 		sk->sk_stamp = ktime_get_real();
2863 		ts = ktime_to_timespec(sk->sk_stamp);
2864 	}
2865 	return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
2866 }
2867 EXPORT_SYMBOL(sock_get_timestampns);
2868 
2869 void sock_enable_timestamp(struct sock *sk, int flag)
2870 {
2871 	if (!sock_flag(sk, flag)) {
2872 		unsigned long previous_flags = sk->sk_flags;
2873 
2874 		sock_set_flag(sk, flag);
2875 		/*
2876 		 * we just set one of the two flags which require net
2877 		 * time stamping, but time stamping might have been on
2878 		 * already because of the other one
2879 		 */
2880 		if (sock_needs_netstamp(sk) &&
2881 		    !(previous_flags & SK_FLAGS_TIMESTAMP))
2882 			net_enable_timestamp();
2883 	}
2884 }
2885 
2886 int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
2887 		       int level, int type)
2888 {
2889 	struct sock_exterr_skb *serr;
2890 	struct sk_buff *skb;
2891 	int copied, err;
2892 
2893 	err = -EAGAIN;
2894 	skb = sock_dequeue_err_skb(sk);
2895 	if (skb == NULL)
2896 		goto out;
2897 
2898 	copied = skb->len;
2899 	if (copied > len) {
2900 		msg->msg_flags |= MSG_TRUNC;
2901 		copied = len;
2902 	}
2903 	err = skb_copy_datagram_msg(skb, 0, msg, copied);
2904 	if (err)
2905 		goto out_free_skb;
2906 
2907 	sock_recv_timestamp(msg, sk, skb);
2908 
2909 	serr = SKB_EXT_ERR(skb);
2910 	put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
2911 
2912 	msg->msg_flags |= MSG_ERRQUEUE;
2913 	err = copied;
2914 
2915 out_free_skb:
2916 	kfree_skb(skb);
2917 out:
2918 	return err;
2919 }
2920 EXPORT_SYMBOL(sock_recv_errqueue);
2921 
2922 /*
2923  *	Get a socket option on an socket.
2924  *
2925  *	FIX: POSIX 1003.1g is very ambiguous here. It states that
2926  *	asynchronous errors should be reported by getsockopt. We assume
2927  *	this means if you specify SO_ERROR (otherwise whats the point of it).
2928  */
2929 int sock_common_getsockopt(struct socket *sock, int level, int optname,
2930 			   char __user *optval, int __user *optlen)
2931 {
2932 	struct sock *sk = sock->sk;
2933 
2934 	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2935 }
2936 EXPORT_SYMBOL(sock_common_getsockopt);
2937 
2938 #ifdef CONFIG_COMPAT
2939 int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
2940 				  char __user *optval, int __user *optlen)
2941 {
2942 	struct sock *sk = sock->sk;
2943 
2944 	if (sk->sk_prot->compat_getsockopt != NULL)
2945 		return sk->sk_prot->compat_getsockopt(sk, level, optname,
2946 						      optval, optlen);
2947 	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2948 }
2949 EXPORT_SYMBOL(compat_sock_common_getsockopt);
2950 #endif
2951 
2952 int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
2953 			int flags)
2954 {
2955 	struct sock *sk = sock->sk;
2956 	int addr_len = 0;
2957 	int err;
2958 
2959 	err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
2960 				   flags & ~MSG_DONTWAIT, &addr_len);
2961 	if (err >= 0)
2962 		msg->msg_namelen = addr_len;
2963 	return err;
2964 }
2965 EXPORT_SYMBOL(sock_common_recvmsg);
2966 
2967 /*
2968  *	Set socket options on an inet socket.
2969  */
2970 int sock_common_setsockopt(struct socket *sock, int level, int optname,
2971 			   char __user *optval, unsigned int optlen)
2972 {
2973 	struct sock *sk = sock->sk;
2974 
2975 	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2976 }
2977 EXPORT_SYMBOL(sock_common_setsockopt);
2978 
2979 #ifdef CONFIG_COMPAT
2980 int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
2981 				  char __user *optval, unsigned int optlen)
2982 {
2983 	struct sock *sk = sock->sk;
2984 
2985 	if (sk->sk_prot->compat_setsockopt != NULL)
2986 		return sk->sk_prot->compat_setsockopt(sk, level, optname,
2987 						      optval, optlen);
2988 	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2989 }
2990 EXPORT_SYMBOL(compat_sock_common_setsockopt);
2991 #endif
2992 
2993 void sk_common_release(struct sock *sk)
2994 {
2995 	if (sk->sk_prot->destroy)
2996 		sk->sk_prot->destroy(sk);
2997 
2998 	/*
2999 	 * Observation: when sock_common_release is called, processes have
3000 	 * no access to socket. But net still has.
3001 	 * Step one, detach it from networking:
3002 	 *
3003 	 * A. Remove from hash tables.
3004 	 */
3005 
3006 	sk->sk_prot->unhash(sk);
3007 
3008 	/*
3009 	 * In this point socket cannot receive new packets, but it is possible
3010 	 * that some packets are in flight because some CPU runs receiver and
3011 	 * did hash table lookup before we unhashed socket. They will achieve
3012 	 * receive queue and will be purged by socket destructor.
3013 	 *
3014 	 * Also we still have packets pending on receive queue and probably,
3015 	 * our own packets waiting in device queues. sock_destroy will drain
3016 	 * receive queue, but transmitted packets will delay socket destruction
3017 	 * until the last reference will be released.
3018 	 */
3019 
3020 	sock_orphan(sk);
3021 
3022 	xfrm_sk_free_policy(sk);
3023 
3024 	sk_refcnt_debug_release(sk);
3025 
3026 	sock_put(sk);
3027 }
3028 EXPORT_SYMBOL(sk_common_release);
3029 
3030 void sk_get_meminfo(const struct sock *sk, u32 *mem)
3031 {
3032 	memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);
3033 
3034 	mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
3035 	mem[SK_MEMINFO_RCVBUF] = sk->sk_rcvbuf;
3036 	mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
3037 	mem[SK_MEMINFO_SNDBUF] = sk->sk_sndbuf;
3038 	mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc;
3039 	mem[SK_MEMINFO_WMEM_QUEUED] = sk->sk_wmem_queued;
3040 	mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
3041 	mem[SK_MEMINFO_BACKLOG] = sk->sk_backlog.len;
3042 	mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
3043 }
3044 
3045 #ifdef CONFIG_PROC_FS
3046 #define PROTO_INUSE_NR	64	/* should be enough for the first time */
3047 struct prot_inuse {
3048 	int val[PROTO_INUSE_NR];
3049 };
3050 
3051 static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
3052 
3053 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
3054 {
3055 	__this_cpu_add(net->core.prot_inuse->val[prot->inuse_idx], val);
3056 }
3057 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
3058 
3059 int sock_prot_inuse_get(struct net *net, struct proto *prot)
3060 {
3061 	int cpu, idx = prot->inuse_idx;
3062 	int res = 0;
3063 
3064 	for_each_possible_cpu(cpu)
3065 		res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx];
3066 
3067 	return res >= 0 ? res : 0;
3068 }
3069 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
3070 
3071 static void sock_inuse_add(struct net *net, int val)
3072 {
3073 	this_cpu_add(*net->core.sock_inuse, val);
3074 }
3075 
3076 int sock_inuse_get(struct net *net)
3077 {
3078 	int cpu, res = 0;
3079 
3080 	for_each_possible_cpu(cpu)
3081 		res += *per_cpu_ptr(net->core.sock_inuse, cpu);
3082 
3083 	return res;
3084 }
3085 
3086 EXPORT_SYMBOL_GPL(sock_inuse_get);
3087 
3088 static int __net_init sock_inuse_init_net(struct net *net)
3089 {
3090 	net->core.prot_inuse = alloc_percpu(struct prot_inuse);
3091 	if (net->core.prot_inuse == NULL)
3092 		return -ENOMEM;
3093 
3094 	net->core.sock_inuse = alloc_percpu(int);
3095 	if (net->core.sock_inuse == NULL)
3096 		goto out;
3097 
3098 	return 0;
3099 
3100 out:
3101 	free_percpu(net->core.prot_inuse);
3102 	return -ENOMEM;
3103 }
3104 
3105 static void __net_exit sock_inuse_exit_net(struct net *net)
3106 {
3107 	free_percpu(net->core.prot_inuse);
3108 	free_percpu(net->core.sock_inuse);
3109 }
3110 
3111 static struct pernet_operations net_inuse_ops = {
3112 	.init = sock_inuse_init_net,
3113 	.exit = sock_inuse_exit_net,
3114 };
3115 
3116 static __init int net_inuse_init(void)
3117 {
3118 	if (register_pernet_subsys(&net_inuse_ops))
3119 		panic("Cannot initialize net inuse counters");
3120 
3121 	return 0;
3122 }
3123 
3124 core_initcall(net_inuse_init);
3125 
3126 static void assign_proto_idx(struct proto *prot)
3127 {
3128 	prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
3129 
3130 	if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
3131 		pr_err("PROTO_INUSE_NR exhausted\n");
3132 		return;
3133 	}
3134 
3135 	set_bit(prot->inuse_idx, proto_inuse_idx);
3136 }
3137 
3138 static void release_proto_idx(struct proto *prot)
3139 {
3140 	if (prot->inuse_idx != PROTO_INUSE_NR - 1)
3141 		clear_bit(prot->inuse_idx, proto_inuse_idx);
3142 }
3143 #else
/* !CONFIG_PROC_FS: no in-use accounting, nothing to assign. */
static inline void assign_proto_idx(struct proto *prot)
{
}
3147 
/* !CONFIG_PROC_FS: no in-use accounting, nothing to release. */
static inline void release_proto_idx(struct proto *prot)
{
}
3151 
/* !CONFIG_PROC_FS: sockets in use are not counted. */
static void sock_inuse_add(struct net *net, int val)
{
}
3155 #endif
3156 
3157 static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
3158 {
3159 	if (!rsk_prot)
3160 		return;
3161 	kfree(rsk_prot->slab_name);
3162 	rsk_prot->slab_name = NULL;
3163 	kmem_cache_destroy(rsk_prot->slab);
3164 	rsk_prot->slab = NULL;
3165 }
3166 
3167 static int req_prot_init(const struct proto *prot)
3168 {
3169 	struct request_sock_ops *rsk_prot = prot->rsk_prot;
3170 
3171 	if (!rsk_prot)
3172 		return 0;
3173 
3174 	rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
3175 					prot->name);
3176 	if (!rsk_prot->slab_name)
3177 		return -ENOMEM;
3178 
3179 	rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
3180 					   rsk_prot->obj_size, 0,
3181 					   prot->slab_flags, NULL);
3182 
3183 	if (!rsk_prot->slab) {
3184 		pr_crit("%s: Can't create request sock SLAB cache!\n",
3185 			prot->name);
3186 		return -ENOMEM;
3187 	}
3188 	return 0;
3189 }
3190 
3191 int proto_register(struct proto *prot, int alloc_slab)
3192 {
3193 	if (alloc_slab) {
3194 		prot->slab = kmem_cache_create_usercopy(prot->name,
3195 					prot->obj_size, 0,
3196 					SLAB_HWCACHE_ALIGN | prot->slab_flags,
3197 					prot->useroffset, prot->usersize,
3198 					NULL);
3199 
3200 		if (prot->slab == NULL) {
3201 			pr_crit("%s: Can't create sock SLAB cache!\n",
3202 				prot->name);
3203 			goto out;
3204 		}
3205 
3206 		if (req_prot_init(prot))
3207 			goto out_free_request_sock_slab;
3208 
3209 		if (prot->twsk_prot != NULL) {
3210 			prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);
3211 
3212 			if (prot->twsk_prot->twsk_slab_name == NULL)
3213 				goto out_free_request_sock_slab;
3214 
3215 			prot->twsk_prot->twsk_slab =
3216 				kmem_cache_create(prot->twsk_prot->twsk_slab_name,
3217 						  prot->twsk_prot->twsk_obj_size,
3218 						  0,
3219 						  prot->slab_flags,
3220 						  NULL);
3221 			if (prot->twsk_prot->twsk_slab == NULL)
3222 				goto out_free_timewait_sock_slab_name;
3223 		}
3224 	}
3225 
3226 	mutex_lock(&proto_list_mutex);
3227 	list_add(&prot->node, &proto_list);
3228 	assign_proto_idx(prot);
3229 	mutex_unlock(&proto_list_mutex);
3230 	return 0;
3231 
3232 out_free_timewait_sock_slab_name:
3233 	kfree(prot->twsk_prot->twsk_slab_name);
3234 out_free_request_sock_slab:
3235 	req_prot_cleanup(prot->rsk_prot);
3236 
3237 	kmem_cache_destroy(prot->slab);
3238 	prot->slab = NULL;
3239 out:
3240 	return -ENOBUFS;
3241 }
3242 EXPORT_SYMBOL(proto_register);
3243 
3244 void proto_unregister(struct proto *prot)
3245 {
3246 	mutex_lock(&proto_list_mutex);
3247 	release_proto_idx(prot);
3248 	list_del(&prot->node);
3249 	mutex_unlock(&proto_list_mutex);
3250 
3251 	kmem_cache_destroy(prot->slab);
3252 	prot->slab = NULL;
3253 
3254 	req_prot_cleanup(prot->rsk_prot);
3255 
3256 	if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
3257 		kmem_cache_destroy(prot->twsk_prot->twsk_slab);
3258 		kfree(prot->twsk_prot->twsk_slab_name);
3259 		prot->twsk_prot->twsk_slab = NULL;
3260 	}
3261 }
3262 EXPORT_SYMBOL(proto_unregister);
3263 
3264 #ifdef CONFIG_PROC_FS
3265 static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
3266 	__acquires(proto_list_mutex)
3267 {
3268 	mutex_lock(&proto_list_mutex);
3269 	return seq_list_start_head(&proto_list, *pos);
3270 }
3271 
3272 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3273 {
3274 	return seq_list_next(v, &proto_list, pos);
3275 }
3276 
3277 static void proto_seq_stop(struct seq_file *seq, void *v)
3278 	__releases(proto_list_mutex)
3279 {
3280 	mutex_unlock(&proto_list_mutex);
3281 }
3282 
/* Map a method pointer to 'y' (set) or 'n' (NULL) for the /proc listing. */
static char proto_method_implemented(const void *method)
{
	if (method)
		return 'y';

	return 'n';
}
3287 static long sock_prot_memory_allocated(struct proto *proto)
3288 {
3289 	return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
3290 }
3291 
3292 static char *sock_prot_memory_pressure(struct proto *proto)
3293 {
3294 	return proto->memory_pressure != NULL ?
3295 	proto_memory_pressure(proto) ? "yes" : "no" : "NI";
3296 }
3297 
3298 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
3299 {
3300 
3301 	seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
3302 			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
3303 		   proto->name,
3304 		   proto->obj_size,
3305 		   sock_prot_inuse_get(seq_file_net(seq), proto),
3306 		   sock_prot_memory_allocated(proto),
3307 		   sock_prot_memory_pressure(proto),
3308 		   proto->max_header,
3309 		   proto->slab == NULL ? "no" : "yes",
3310 		   module_name(proto->owner),
3311 		   proto_method_implemented(proto->close),
3312 		   proto_method_implemented(proto->connect),
3313 		   proto_method_implemented(proto->disconnect),
3314 		   proto_method_implemented(proto->accept),
3315 		   proto_method_implemented(proto->ioctl),
3316 		   proto_method_implemented(proto->init),
3317 		   proto_method_implemented(proto->destroy),
3318 		   proto_method_implemented(proto->shutdown),
3319 		   proto_method_implemented(proto->setsockopt),
3320 		   proto_method_implemented(proto->getsockopt),
3321 		   proto_method_implemented(proto->sendmsg),
3322 		   proto_method_implemented(proto->recvmsg),
3323 		   proto_method_implemented(proto->sendpage),
3324 		   proto_method_implemented(proto->bind),
3325 		   proto_method_implemented(proto->backlog_rcv),
3326 		   proto_method_implemented(proto->hash),
3327 		   proto_method_implemented(proto->unhash),
3328 		   proto_method_implemented(proto->get_port),
3329 		   proto_method_implemented(proto->enter_memory_pressure));
3330 }
3331 
3332 static int proto_seq_show(struct seq_file *seq, void *v)
3333 {
3334 	if (v == &proto_list)
3335 		seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
3336 			   "protocol",
3337 			   "size",
3338 			   "sockets",
3339 			   "memory",
3340 			   "press",
3341 			   "maxhdr",
3342 			   "slab",
3343 			   "module",
3344 			   "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
3345 	else
3346 		proto_seq_printf(seq, list_entry(v, struct proto, node));
3347 	return 0;
3348 }
3349 
3350 static const struct seq_operations proto_seq_ops = {
3351 	.start  = proto_seq_start,
3352 	.next   = proto_seq_next,
3353 	.stop   = proto_seq_stop,
3354 	.show   = proto_seq_show,
3355 };
3356 
3357 static int proto_seq_open(struct inode *inode, struct file *file)
3358 {
3359 	return seq_open_net(inode, file, &proto_seq_ops,
3360 			    sizeof(struct seq_net_private));
3361 }
3362 
3363 static const struct file_operations proto_seq_fops = {
3364 	.open		= proto_seq_open,
3365 	.read		= seq_read,
3366 	.llseek		= seq_lseek,
3367 	.release	= seq_release_net,
3368 };
3369 
3370 static __net_init int proto_init_net(struct net *net)
3371 {
3372 	if (!proc_create("protocols", S_IRUGO, net->proc_net, &proto_seq_fops))
3373 		return -ENOMEM;
3374 
3375 	return 0;
3376 }
3377 
3378 static __net_exit void proto_exit_net(struct net *net)
3379 {
3380 	remove_proc_entry("protocols", net->proc_net);
3381 }
3382 
3383 
3384 static __net_initdata struct pernet_operations proto_net_ops = {
3385 	.init = proto_init_net,
3386 	.exit = proto_exit_net,
3387 };
3388 
3389 static int __init proto_init(void)
3390 {
3391 	return register_pernet_subsys(&proto_net_ops);
3392 }
3393 
3394 subsys_initcall(proto_init);
3395 
#endif /* CONFIG_PROC_FS */
3397 
3398 #ifdef CONFIG_NET_RX_BUSY_POLL
3399 bool sk_busy_loop_end(void *p, unsigned long start_time)
3400 {
3401 	struct sock *sk = p;
3402 
3403 	return !skb_queue_empty(&sk->sk_receive_queue) ||
3404 	       sk_busy_loop_timeout(sk, start_time);
3405 }
3406 EXPORT_SYMBOL(sk_busy_loop_end);
3407 #endif /* CONFIG_NET_RX_BUSY_POLL */
3408