xref: /openbmc/linux/net/core/sock.c (revision d2ba09c1)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		Generic socket support routines: memory allocators, socket lock/release
7  *		handlers for protocols to use, and a generic option handler.
8  *
9  *
10  * Authors:	Ross Biro
11  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *		Florian La Roche, <flla@stud.uni-sb.de>
13  *		Alan Cox, <A.Cox@swansea.ac.uk>
14  *
15  * Fixes:
16  *		Alan Cox	: 	Numerous verify_area() problems
17  *		Alan Cox	:	Connecting on a connecting socket
18  *					now returns an error for tcp.
19  *		Alan Cox	:	sock->protocol is set correctly.
20  *					and is not sometimes left as 0.
21  *		Alan Cox	:	connect handles icmp errors on a
22  *					connect properly. Unfortunately there
23  *					is a restart syscall nasty there. I
24  *					can't match BSD without hacking the C
25  *					library. Ideas urgently sought!
26  *		Alan Cox	:	Disallow bind() to addresses that are
27  *					not ours - especially broadcast ones!!
28  *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
29  *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
30  *					instead they leave that for the DESTROY timer.
31  *		Alan Cox	:	Clean up error flag in accept
32  *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
33  *					was buggy. Put a remove_sock() in the handler
34  *					for memory when we hit 0. Also altered the timer
35  *					code. The ACK stuff can wait and needs major
36  *					TCP layer surgery.
37  *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
38  *					and fixed timer/inet_bh race.
39  *		Alan Cox	:	Added zapped flag for TCP
40  *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
41  *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
42  *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
43  *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
44  *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
45  *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
46  *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
47  *	Pauline Middelink	:	identd support
48  *		Alan Cox	:	Fixed connect() taking signals I think.
49  *		Alan Cox	:	SO_LINGER supported
50  *		Alan Cox	:	Error reporting fixes
51  *		Anonymous	:	inet_create tidied up (sk->reuse setting)
52  *		Alan Cox	:	inet sockets don't set sk->type!
53  *		Alan Cox	:	Split socket option code
54  *		Alan Cox	:	Callbacks
55  *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
56  *		Alex		:	Removed restriction on inet fioctl
57  *		Alan Cox	:	Splitting INET from NET core
58  *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
59  *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
60  *		Alan Cox	:	Split IP from generic code
61  *		Alan Cox	:	New kfree_skbmem()
62  *		Alan Cox	:	Make SO_DEBUG superuser only.
63  *		Alan Cox	:	Allow anyone to clear SO_DEBUG
64  *					(compatibility fix)
65  *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
66  *		Alan Cox	:	Allocator for a socket is settable.
67  *		Alan Cox	:	SO_ERROR includes soft errors.
68  *		Alan Cox	:	Allow NULL arguments on some SO_ opts
69  *		Alan Cox	: 	Generic socket allocation to make hooks
70  *					easier (suggested by Craig Metz).
71  *		Michael Pall	:	SO_ERROR returns positive errno again
72  *              Steve Whitehouse:       Added default destructor to free
73  *                                      protocol private data.
74  *              Steve Whitehouse:       Added various other default routines
75  *                                      common to several socket families.
76  *              Chris Evans     :       Call suser() check last on F_SETOWN
77  *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
78  *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
79  *		Andi Kleen	:	Fix write_space callback
80  *		Chris Evans	:	Security fixes - signedness again
81  *		Arnaldo C. Melo :       cleanups, use skb_queue_purge
82  *
83  * To Fix:
84  *
85  *
86  *		This program is free software; you can redistribute it and/or
87  *		modify it under the terms of the GNU General Public License
88  *		as published by the Free Software Foundation; either version
89  *		2 of the License, or (at your option) any later version.
90  */
91 
92 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
93 
94 #include <linux/capability.h>
95 #include <linux/errno.h>
96 #include <linux/errqueue.h>
97 #include <linux/types.h>
98 #include <linux/socket.h>
99 #include <linux/in.h>
100 #include <linux/kernel.h>
101 #include <linux/module.h>
102 #include <linux/proc_fs.h>
103 #include <linux/seq_file.h>
104 #include <linux/sched.h>
105 #include <linux/sched/mm.h>
106 #include <linux/timer.h>
107 #include <linux/string.h>
108 #include <linux/sockios.h>
109 #include <linux/net.h>
110 #include <linux/mm.h>
111 #include <linux/slab.h>
112 #include <linux/interrupt.h>
113 #include <linux/poll.h>
114 #include <linux/tcp.h>
115 #include <linux/init.h>
116 #include <linux/highmem.h>
117 #include <linux/user_namespace.h>
118 #include <linux/static_key.h>
119 #include <linux/memcontrol.h>
120 #include <linux/prefetch.h>
121 
122 #include <linux/uaccess.h>
123 
124 #include <linux/netdevice.h>
125 #include <net/protocol.h>
126 #include <linux/skbuff.h>
127 #include <net/net_namespace.h>
128 #include <net/request_sock.h>
129 #include <net/sock.h>
130 #include <linux/net_tstamp.h>
131 #include <net/xfrm.h>
132 #include <linux/ipsec.h>
133 #include <net/cls_cgroup.h>
134 #include <net/netprio_cgroup.h>
135 #include <linux/sock_diag.h>
136 
137 #include <linux/filter.h>
138 #include <net/sock_reuseport.h>
139 
140 #include <trace/events/sock.h>
141 
142 #include <net/tcp.h>
143 #include <net/busy_poll.h>
144 
145 static DEFINE_MUTEX(proto_list_mutex);
146 static LIST_HEAD(proto_list);
147 
148 static void sock_inuse_add(struct net *net, int val);
149 
150 /**
151  * sk_ns_capable - General socket capability test
152  * @sk: Socket to use a capability on or through
153  * @user_ns: The user namespace of the capability to use
154  * @cap: The capability to use
155  *
156  * Test to see if the opener of the socket had the capability @cap when
157  * the socket was created and if the current process has the capability
158  * @cap in the user namespace @user_ns.
159  */
160 bool sk_ns_capable(const struct sock *sk,
161 		   struct user_namespace *user_ns, int cap)
162 {
163 	return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
164 		ns_capable(user_ns, cap);
165 }
166 EXPORT_SYMBOL(sk_ns_capable);
167 
168 /**
169  * sk_capable - Socket global capability test
170  * @sk: Socket to use a capability on or through
171  * @cap: The global capability to use
172  *
173  * Test to see if the opener of the socket had the capability @cap when
174  * the socket was created and if the current process has the capability
175  * @cap in all user namespaces.
176  */
177 bool sk_capable(const struct sock *sk, int cap)
178 {
179 	return sk_ns_capable(sk, &init_user_ns, cap);
180 }
181 EXPORT_SYMBOL(sk_capable);
182 
183 /**
184  * sk_net_capable - Network namespace socket capability test
185  * @sk: Socket to use a capability on or through
186  * @cap: The capability to use
187  *
188  * Test to see if the opener of the socket had the capability @cap when the
189  * socket was created and if the current process has the capability @cap over
190  * the network namespace the socket is a member of.
191  */
192 bool sk_net_capable(const struct sock *sk, int cap)
193 {
194 	return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
195 }
196 EXPORT_SYMBOL(sk_net_capable);
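/*
 * Usage sketch (hypothetical caller, not taken from this file): a protocol
 * could gate a privileged, namespace-aware socket option on both the opener
 * of the socket and the current task with sk_net_capable():
 *
 *	static int example_set_priv_mark(struct sock *sk, u32 val)
 *	{
 *		if (!sk_net_capable(sk, CAP_NET_ADMIN))
 *			return -EPERM;
 *		sk->sk_mark = val;
 *		return 0;
 *	}
 */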
197 
198 /*
199  * Each address family might have different locking rules, so we have
200  * one slock key per address family and separate keys for internal and
201  * userspace sockets.
202  */
203 static struct lock_class_key af_family_keys[AF_MAX];
204 static struct lock_class_key af_family_kern_keys[AF_MAX];
205 static struct lock_class_key af_family_slock_keys[AF_MAX];
206 static struct lock_class_key af_family_kern_slock_keys[AF_MAX];
207 
208 /*
209  * Make the lock validator output more readable (we pre-construct these
210  * strings at build time, so that runtime initialization of socket
211  * locks is fast):
212  */
213 
214 #define _sock_locks(x)						  \
215   x "AF_UNSPEC",	x "AF_UNIX"     ,	x "AF_INET"     , \
216   x "AF_AX25"  ,	x "AF_IPX"      ,	x "AF_APPLETALK", \
217   x "AF_NETROM",	x "AF_BRIDGE"   ,	x "AF_ATMPVC"   , \
218   x "AF_X25"   ,	x "AF_INET6"    ,	x "AF_ROSE"     , \
219   x "AF_DECnet",	x "AF_NETBEUI"  ,	x "AF_SECURITY" , \
220   x "AF_KEY"   ,	x "AF_NETLINK"  ,	x "AF_PACKET"   , \
221   x "AF_ASH"   ,	x "AF_ECONET"   ,	x "AF_ATMSVC"   , \
222   x "AF_RDS"   ,	x "AF_SNA"      ,	x "AF_IRDA"     , \
223   x "AF_PPPOX" ,	x "AF_WANPIPE"  ,	x "AF_LLC"      , \
224   x "27"       ,	x "28"          ,	x "AF_CAN"      , \
225   x "AF_TIPC"  ,	x "AF_BLUETOOTH",	x "IUCV"        , \
226   x "AF_RXRPC" ,	x "AF_ISDN"     ,	x "AF_PHONET"   , \
227   x "AF_IEEE802154",	x "AF_CAIF"	,	x "AF_ALG"      , \
228   x "AF_NFC"   ,	x "AF_VSOCK"    ,	x "AF_KCM"      , \
229   x "AF_QIPCRTR",	x "AF_SMC"	,	x "AF_XDP"	, \
230   x "AF_MAX"
231 
232 static const char *const af_family_key_strings[AF_MAX+1] = {
233 	_sock_locks("sk_lock-")
234 };
235 static const char *const af_family_slock_key_strings[AF_MAX+1] = {
236 	_sock_locks("slock-")
237 };
238 static const char *const af_family_clock_key_strings[AF_MAX+1] = {
239 	_sock_locks("clock-")
240 };
241 
242 static const char *const af_family_kern_key_strings[AF_MAX+1] = {
243 	_sock_locks("k-sk_lock-")
244 };
245 static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
246 	_sock_locks("k-slock-")
247 };
248 static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
249 	_sock_locks("k-clock-")
250 };
251 static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
252   "rlock-AF_UNSPEC", "rlock-AF_UNIX"     , "rlock-AF_INET"     ,
253   "rlock-AF_AX25"  , "rlock-AF_IPX"      , "rlock-AF_APPLETALK",
254   "rlock-AF_NETROM", "rlock-AF_BRIDGE"   , "rlock-AF_ATMPVC"   ,
255   "rlock-AF_X25"   , "rlock-AF_INET6"    , "rlock-AF_ROSE"     ,
256   "rlock-AF_DECnet", "rlock-AF_NETBEUI"  , "rlock-AF_SECURITY" ,
257   "rlock-AF_KEY"   , "rlock-AF_NETLINK"  , "rlock-AF_PACKET"   ,
258   "rlock-AF_ASH"   , "rlock-AF_ECONET"   , "rlock-AF_ATMSVC"   ,
259   "rlock-AF_RDS"   , "rlock-AF_SNA"      , "rlock-AF_IRDA"     ,
260   "rlock-AF_PPPOX" , "rlock-AF_WANPIPE"  , "rlock-AF_LLC"      ,
261   "rlock-27"       , "rlock-28"          , "rlock-AF_CAN"      ,
262   "rlock-AF_TIPC"  , "rlock-AF_BLUETOOTH", "rlock-AF_IUCV"     ,
263   "rlock-AF_RXRPC" , "rlock-AF_ISDN"     , "rlock-AF_PHONET"   ,
264   "rlock-AF_IEEE802154", "rlock-AF_CAIF" , "rlock-AF_ALG"      ,
265   "rlock-AF_NFC"   , "rlock-AF_VSOCK"    , "rlock-AF_KCM"      ,
266   "rlock-AF_QIPCRTR", "rlock-AF_SMC"     , "rlock-AF_XDP"      ,
267   "rlock-AF_MAX"
268 };
269 static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
270   "wlock-AF_UNSPEC", "wlock-AF_UNIX"     , "wlock-AF_INET"     ,
271   "wlock-AF_AX25"  , "wlock-AF_IPX"      , "wlock-AF_APPLETALK",
272   "wlock-AF_NETROM", "wlock-AF_BRIDGE"   , "wlock-AF_ATMPVC"   ,
273   "wlock-AF_X25"   , "wlock-AF_INET6"    , "wlock-AF_ROSE"     ,
274   "wlock-AF_DECnet", "wlock-AF_NETBEUI"  , "wlock-AF_SECURITY" ,
275   "wlock-AF_KEY"   , "wlock-AF_NETLINK"  , "wlock-AF_PACKET"   ,
276   "wlock-AF_ASH"   , "wlock-AF_ECONET"   , "wlock-AF_ATMSVC"   ,
277   "wlock-AF_RDS"   , "wlock-AF_SNA"      , "wlock-AF_IRDA"     ,
278   "wlock-AF_PPPOX" , "wlock-AF_WANPIPE"  , "wlock-AF_LLC"      ,
279   "wlock-27"       , "wlock-28"          , "wlock-AF_CAN"      ,
280   "wlock-AF_TIPC"  , "wlock-AF_BLUETOOTH", "wlock-AF_IUCV"     ,
281   "wlock-AF_RXRPC" , "wlock-AF_ISDN"     , "wlock-AF_PHONET"   ,
282   "wlock-AF_IEEE802154", "wlock-AF_CAIF" , "wlock-AF_ALG"      ,
283   "wlock-AF_NFC"   , "wlock-AF_VSOCK"    , "wlock-AF_KCM"      ,
284   "wlock-AF_QIPCRTR", "wlock-AF_SMC"     , "wlock-AF_XDP"      ,
285   "wlock-AF_MAX"
286 };
287 static const char *const af_family_elock_key_strings[AF_MAX+1] = {
288   "elock-AF_UNSPEC", "elock-AF_UNIX"     , "elock-AF_INET"     ,
289   "elock-AF_AX25"  , "elock-AF_IPX"      , "elock-AF_APPLETALK",
290   "elock-AF_NETROM", "elock-AF_BRIDGE"   , "elock-AF_ATMPVC"   ,
291   "elock-AF_X25"   , "elock-AF_INET6"    , "elock-AF_ROSE"     ,
292   "elock-AF_DECnet", "elock-AF_NETBEUI"  , "elock-AF_SECURITY" ,
293   "elock-AF_KEY"   , "elock-AF_NETLINK"  , "elock-AF_PACKET"   ,
294   "elock-AF_ASH"   , "elock-AF_ECONET"   , "elock-AF_ATMSVC"   ,
295   "elock-AF_RDS"   , "elock-AF_SNA"      , "elock-AF_IRDA"     ,
296   "elock-AF_PPPOX" , "elock-AF_WANPIPE"  , "elock-AF_LLC"      ,
297   "elock-27"       , "elock-28"          , "elock-AF_CAN"      ,
298   "elock-AF_TIPC"  , "elock-AF_BLUETOOTH", "elock-AF_IUCV"     ,
299   "elock-AF_RXRPC" , "elock-AF_ISDN"     , "elock-AF_PHONET"   ,
300   "elock-AF_IEEE802154", "elock-AF_CAIF" , "elock-AF_ALG"      ,
301   "elock-AF_NFC"   , "elock-AF_VSOCK"    , "elock-AF_KCM"      ,
302   "elock-AF_QIPCRTR", "elock-AF_SMC"     , "elock-AF_XDP"      ,
303   "elock-AF_MAX"
304 };
305 
306 /*
307  * sk_callback_lock and sk queues locking rules are per-address-family,
308  * so split the lock classes by using a per-AF key:
309  */
310 static struct lock_class_key af_callback_keys[AF_MAX];
311 static struct lock_class_key af_rlock_keys[AF_MAX];
312 static struct lock_class_key af_wlock_keys[AF_MAX];
313 static struct lock_class_key af_elock_keys[AF_MAX];
314 static struct lock_class_key af_kern_callback_keys[AF_MAX];
315 
316 /* Run time adjustable parameters. */
317 __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
318 EXPORT_SYMBOL(sysctl_wmem_max);
319 __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
320 EXPORT_SYMBOL(sysctl_rmem_max);
321 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
322 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
323 
324 /* Maximal space eaten by iovec or ancillary data plus some space */
325 int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
326 EXPORT_SYMBOL(sysctl_optmem_max);
327 
328 int sysctl_tstamp_allow_data __read_mostly = 1;
329 
330 DEFINE_STATIC_KEY_FALSE(memalloc_socks_key);
331 EXPORT_SYMBOL_GPL(memalloc_socks_key);
332 
333 /**
334  * sk_set_memalloc - sets %SOCK_MEMALLOC
335  * @sk: socket to set it on
336  *
337  * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
338  * It's the responsibility of the admin to adjust min_free_kbytes
339  * to meet the requirements.
340  */
341 void sk_set_memalloc(struct sock *sk)
342 {
343 	sock_set_flag(sk, SOCK_MEMALLOC);
344 	sk->sk_allocation |= __GFP_MEMALLOC;
345 	static_branch_inc(&memalloc_socks_key);
346 }
347 EXPORT_SYMBOL_GPL(sk_set_memalloc);
348 
349 void sk_clear_memalloc(struct sock *sk)
350 {
351 	sock_reset_flag(sk, SOCK_MEMALLOC);
352 	sk->sk_allocation &= ~__GFP_MEMALLOC;
353 	static_branch_dec(&memalloc_socks_key);
354 
355 	/*
356 	 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
357 	 * progress of swapping. SOCK_MEMALLOC may be cleared while
358 	 * it has rmem allocations due to the last swapfile being deactivated
359 	 * but there is a risk that the socket is unusable due to exceeding
360 	 * the rmem limits. Reclaim the reserves and obey rmem limits again.
361 	 */
362 	sk_mem_reclaim(sk);
363 }
364 EXPORT_SYMBOL_GPL(sk_clear_memalloc);
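/*
 * Usage sketch (hypothetical, in the spirit of swap-over-network users of
 * this API): a subsystem that transmits swap pages marks its transport
 * socket while the swap device is active and clears the flag on
 * deactivation, at which point rmem limits are obeyed again:
 *
 *	static void example_swap_activate(struct sock *sk)
 *	{
 *		sk_set_memalloc(sk);
 *	}
 *
 *	static void example_swap_deactivate(struct sock *sk)
 *	{
 *		sk_clear_memalloc(sk);
 *	}
 */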
365 
366 int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
367 {
368 	int ret;
369 	unsigned int noreclaim_flag;
370 
371 	/* these should have been dropped before queueing */
372 	BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
373 
374 	noreclaim_flag = memalloc_noreclaim_save();
375 	ret = sk->sk_backlog_rcv(sk, skb);
376 	memalloc_noreclaim_restore(noreclaim_flag);
377 
378 	return ret;
379 }
380 EXPORT_SYMBOL(__sk_backlog_rcv);
381 
382 static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
383 {
384 	struct timeval tv;
385 
386 	if (optlen < sizeof(tv))
387 		return -EINVAL;
388 	if (copy_from_user(&tv, optval, sizeof(tv)))
389 		return -EFAULT;
390 	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
391 		return -EDOM;
392 
393 	if (tv.tv_sec < 0) {
394 		static int warned __read_mostly;
395 
396 		*timeo_p = 0;
397 		if (warned < 10 && net_ratelimit()) {
398 			warned++;
399 			pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
400 				__func__, current->comm, task_pid_nr(current));
401 		}
402 		return 0;
403 	}
404 	*timeo_p = MAX_SCHEDULE_TIMEOUT;
405 	if (tv.tv_sec == 0 && tv.tv_usec == 0)
406 		return 0;
407 	if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
408 		*timeo_p = tv.tv_sec * HZ + DIV_ROUND_UP(tv.tv_usec, USEC_PER_SEC / HZ);
409 	return 0;
410 }
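/*
 * Userspace view (illustrative only): sock_set_timeout() is what ultimately
 * handles SO_RCVTIMEO/SO_SNDTIMEO set from an application, e.g. a 2.5 second
 * receive timeout. A tv_usec outside [0, 1s) yields -EDOM, and {0, 0} means
 * "wait forever":
 *
 *	struct timeval tv = { .tv_sec = 2, .tv_usec = 500000 };
 *
 *	if (setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv)) < 0)
 *		perror("setsockopt(SO_RCVTIMEO)");
 */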
411 
412 static void sock_warn_obsolete_bsdism(const char *name)
413 {
414 	static int warned;
415 	static char warncomm[TASK_COMM_LEN];
416 	if (strcmp(warncomm, current->comm) && warned < 5) {
417 		strcpy(warncomm,  current->comm);
418 		pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n",
419 			warncomm, name);
420 		warned++;
421 	}
422 }
423 
424 static bool sock_needs_netstamp(const struct sock *sk)
425 {
426 	switch (sk->sk_family) {
427 	case AF_UNSPEC:
428 	case AF_UNIX:
429 		return false;
430 	default:
431 		return true;
432 	}
433 }
434 
435 static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
436 {
437 	if (sk->sk_flags & flags) {
438 		sk->sk_flags &= ~flags;
439 		if (sock_needs_netstamp(sk) &&
440 		    !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
441 			net_disable_timestamp();
442 	}
443 }
444 
445 
446 int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
447 {
448 	unsigned long flags;
449 	struct sk_buff_head *list = &sk->sk_receive_queue;
450 
451 	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
452 		atomic_inc(&sk->sk_drops);
453 		trace_sock_rcvqueue_full(sk, skb);
454 		return -ENOMEM;
455 	}
456 
457 	if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
458 		atomic_inc(&sk->sk_drops);
459 		return -ENOBUFS;
460 	}
461 
462 	skb->dev = NULL;
463 	skb_set_owner_r(skb, sk);
464 
465 	/* We escape from the RCU protected region, so make sure we don't leak
466 	 * a non-refcounted dst
467 	 */
468 	skb_dst_force(skb);
469 
470 	spin_lock_irqsave(&list->lock, flags);
471 	sock_skb_set_dropcount(sk, skb);
472 	__skb_queue_tail(list, skb);
473 	spin_unlock_irqrestore(&list->lock, flags);
474 
475 	if (!sock_flag(sk, SOCK_DEAD))
476 		sk->sk_data_ready(sk);
477 	return 0;
478 }
479 EXPORT_SYMBOL(__sock_queue_rcv_skb);
480 
481 int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
482 {
483 	int err;
484 
485 	err = sk_filter(sk, skb);
486 	if (err)
487 		return err;
488 
489 	return __sock_queue_rcv_skb(sk, skb);
490 }
491 EXPORT_SYMBOL(sock_queue_rcv_skb);
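/*
 * Usage sketch (hypothetical protocol receive path): callers still own the
 * skb when queueing fails and are expected to drop it themselves, e.g. on
 * -ENOMEM (receive queue full) or -ENOBUFS (rmem could not be scheduled):
 *
 *	static int example_proto_rcv(struct sock *sk, struct sk_buff *skb)
 *	{
 *		int err = sock_queue_rcv_skb(sk, skb);
 *
 *		if (err < 0)
 *			kfree_skb(skb);
 *		return err;
 *	}
 */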
492 
493 int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
494 		     const int nested, unsigned int trim_cap, bool refcounted)
495 {
496 	int rc = NET_RX_SUCCESS;
497 
498 	if (sk_filter_trim_cap(sk, skb, trim_cap))
499 		goto discard_and_relse;
500 
501 	skb->dev = NULL;
502 
503 	if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
504 		atomic_inc(&sk->sk_drops);
505 		goto discard_and_relse;
506 	}
507 	if (nested)
508 		bh_lock_sock_nested(sk);
509 	else
510 		bh_lock_sock(sk);
511 	if (!sock_owned_by_user(sk)) {
512 		/*
513 		 * trylock + unlock semantics:
514 		 */
515 		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
516 
517 		rc = sk_backlog_rcv(sk, skb);
518 
519 		mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
520 	} else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
521 		bh_unlock_sock(sk);
522 		atomic_inc(&sk->sk_drops);
523 		goto discard_and_relse;
524 	}
525 
526 	bh_unlock_sock(sk);
527 out:
528 	if (refcounted)
529 		sock_put(sk);
530 	return rc;
531 discard_and_relse:
532 	kfree_skb(skb);
533 	goto out;
534 }
535 EXPORT_SYMBOL(__sk_receive_skb);
536 
537 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
538 {
539 	struct dst_entry *dst = __sk_dst_get(sk);
540 
541 	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
542 		sk_tx_queue_clear(sk);
543 		sk->sk_dst_pending_confirm = 0;
544 		RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
545 		dst_release(dst);
546 		return NULL;
547 	}
548 
549 	return dst;
550 }
551 EXPORT_SYMBOL(__sk_dst_check);
552 
553 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
554 {
555 	struct dst_entry *dst = sk_dst_get(sk);
556 
557 	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
558 		sk_dst_reset(sk);
559 		dst_release(dst);
560 		return NULL;
561 	}
562 
563 	return dst;
564 }
565 EXPORT_SYMBOL(sk_dst_check);
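/*
 * Usage sketch (simplified output-path pattern; example_route_output() is a
 * placeholder for the protocol's own route lookup): a cached route is
 * validated before use and rebuilt when the check reports it stale. IPv4
 * callers typically pass 0 as the cookie:
 *
 *	dst = sk_dst_check(sk, 0);
 *	if (!dst) {
 *		dst = example_route_output(sk);
 *		if (IS_ERR(dst))
 *			return PTR_ERR(dst);
 *		sk_dst_set(sk, dst);
 *	}
 */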
566 
567 static int sock_setbindtodevice(struct sock *sk, char __user *optval,
568 				int optlen)
569 {
570 	int ret = -ENOPROTOOPT;
571 #ifdef CONFIG_NETDEVICES
572 	struct net *net = sock_net(sk);
573 	char devname[IFNAMSIZ];
574 	int index;
575 
576 	/* Sorry... */
577 	ret = -EPERM;
578 	if (!ns_capable(net->user_ns, CAP_NET_RAW))
579 		goto out;
580 
581 	ret = -EINVAL;
582 	if (optlen < 0)
583 		goto out;
584 
585 	/* Bind this socket to a particular device like "eth0",
586 	 * as specified in the passed interface name. If the
587 	 * name is "" or the option length is zero the socket
588 	 * is not bound.
589 	 */
590 	if (optlen > IFNAMSIZ - 1)
591 		optlen = IFNAMSIZ - 1;
592 	memset(devname, 0, sizeof(devname));
593 
594 	ret = -EFAULT;
595 	if (copy_from_user(devname, optval, optlen))
596 		goto out;
597 
598 	index = 0;
599 	if (devname[0] != '\0') {
600 		struct net_device *dev;
601 
602 		rcu_read_lock();
603 		dev = dev_get_by_name_rcu(net, devname);
604 		if (dev)
605 			index = dev->ifindex;
606 		rcu_read_unlock();
607 		ret = -ENODEV;
608 		if (!dev)
609 			goto out;
610 	}
611 
612 	lock_sock(sk);
613 	sk->sk_bound_dev_if = index;
614 	sk_dst_reset(sk);
615 	release_sock(sk);
616 
617 	ret = 0;
618 
619 out:
620 #endif
621 
622 	return ret;
623 }
624 
625 static int sock_getbindtodevice(struct sock *sk, char __user *optval,
626 				int __user *optlen, int len)
627 {
628 	int ret = -ENOPROTOOPT;
629 #ifdef CONFIG_NETDEVICES
630 	struct net *net = sock_net(sk);
631 	char devname[IFNAMSIZ];
632 
633 	if (sk->sk_bound_dev_if == 0) {
634 		len = 0;
635 		goto zero;
636 	}
637 
638 	ret = -EINVAL;
639 	if (len < IFNAMSIZ)
640 		goto out;
641 
642 	ret = netdev_get_name(net, devname, sk->sk_bound_dev_if);
643 	if (ret)
644 		goto out;
645 
646 	len = strlen(devname) + 1;
647 
648 	ret = -EFAULT;
649 	if (copy_to_user(optval, devname, len))
650 		goto out;
651 
652 zero:
653 	ret = -EFAULT;
654 	if (put_user(len, optlen))
655 		goto out;
656 
657 	ret = 0;
658 
659 out:
660 #endif
661 
662 	return ret;
663 }
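/*
 * Userspace view (illustrative only; requires CAP_NET_RAW in the socket's
 * network namespace): SO_BINDTODEVICE takes an interface name and lands in
 * sock_setbindtodevice() above. An empty name or zero option length unbinds
 * the socket, and reading the name back needs a buffer of at least IFNAMSIZ
 * bytes:
 *
 *	if (setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, "eth0", 5) < 0)
 *		perror("setsockopt(SO_BINDTODEVICE)");
 */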
664 
665 static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
666 {
667 	if (valbool)
668 		sock_set_flag(sk, bit);
669 	else
670 		sock_reset_flag(sk, bit);
671 }
672 
673 bool sk_mc_loop(struct sock *sk)
674 {
675 	if (dev_recursion_level())
676 		return false;
677 	if (!sk)
678 		return true;
679 	switch (sk->sk_family) {
680 	case AF_INET:
681 		return inet_sk(sk)->mc_loop;
682 #if IS_ENABLED(CONFIG_IPV6)
683 	case AF_INET6:
684 		return inet6_sk(sk)->mc_loop;
685 #endif
686 	}
687 	WARN_ON(1);
688 	return true;
689 }
690 EXPORT_SYMBOL(sk_mc_loop);
691 
692 /*
693  *	This is meant for all protocols to use and covers goings on
694  *	at the socket level. Everything here is generic.
695  */
696 
697 int sock_setsockopt(struct socket *sock, int level, int optname,
698 		    char __user *optval, unsigned int optlen)
699 {
700 	struct sock *sk = sock->sk;
701 	int val;
702 	int valbool;
703 	struct linger ling;
704 	int ret = 0;
705 
706 	/*
707 	 *	Options without arguments
708 	 */
709 
710 	if (optname == SO_BINDTODEVICE)
711 		return sock_setbindtodevice(sk, optval, optlen);
712 
713 	if (optlen < sizeof(int))
714 		return -EINVAL;
715 
716 	if (get_user(val, (int __user *)optval))
717 		return -EFAULT;
718 
719 	valbool = val ? 1 : 0;
720 
721 	lock_sock(sk);
722 
723 	switch (optname) {
724 	case SO_DEBUG:
725 		if (val && !capable(CAP_NET_ADMIN))
726 			ret = -EACCES;
727 		else
728 			sock_valbool_flag(sk, SOCK_DBG, valbool);
729 		break;
730 	case SO_REUSEADDR:
731 		sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
732 		break;
733 	case SO_REUSEPORT:
734 		sk->sk_reuseport = valbool;
735 		break;
736 	case SO_TYPE:
737 	case SO_PROTOCOL:
738 	case SO_DOMAIN:
739 	case SO_ERROR:
740 		ret = -ENOPROTOOPT;
741 		break;
742 	case SO_DONTROUTE:
743 		sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
744 		break;
745 	case SO_BROADCAST:
746 		sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
747 		break;
748 	case SO_SNDBUF:
749 		/* Don't error on this; BSD doesn't, and if you think
750 		 * about it, this is right. Otherwise apps have to
751 		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
752 		 * are treated in BSD as hints.
753 		 */
754 		val = min_t(u32, val, sysctl_wmem_max);
755 set_sndbuf:
756 		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
757 		sk->sk_sndbuf = max_t(int, val * 2, SOCK_MIN_SNDBUF);
758 		/* Wake up sending tasks if we upped the value. */
759 		sk->sk_write_space(sk);
760 		break;
761 
762 	case SO_SNDBUFFORCE:
763 		if (!capable(CAP_NET_ADMIN)) {
764 			ret = -EPERM;
765 			break;
766 		}
767 		goto set_sndbuf;
768 
769 	case SO_RCVBUF:
770 		/* Don't error on this; BSD doesn't, and if you think
771 		 * about it, this is right. Otherwise apps have to
772 		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
773 		 * are treated in BSD as hints.
774 		 */
775 		val = min_t(u32, val, sysctl_rmem_max);
776 set_rcvbuf:
777 		sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
778 		/*
779 		 * We double it on the way in to account for
780 		 * "struct sk_buff" etc. overhead.   Applications
781 		 * assume that the SO_RCVBUF setting they make will
782 		 * allow that much actual data to be received on that
783 		 * socket.
784 		 *
785 		 * Applications are unaware that "struct sk_buff" and
786 		 * other overheads allocate from the receive buffer
787 		 * during socket buffer allocation.
788 		 *
789 		 * And after considering the possible alternatives,
790 		 * returning the value we actually used in getsockopt
791 		 * is the most desirable behavior.
792 		 */
793 		sk->sk_rcvbuf = max_t(int, val * 2, SOCK_MIN_RCVBUF);
794 		break;
795 
796 	case SO_RCVBUFFORCE:
797 		if (!capable(CAP_NET_ADMIN)) {
798 			ret = -EPERM;
799 			break;
800 		}
801 		goto set_rcvbuf;
802 
803 	case SO_KEEPALIVE:
804 		if (sk->sk_prot->keepalive)
805 			sk->sk_prot->keepalive(sk, valbool);
806 		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
807 		break;
808 
809 	case SO_OOBINLINE:
810 		sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
811 		break;
812 
813 	case SO_NO_CHECK:
814 		sk->sk_no_check_tx = valbool;
815 		break;
816 
817 	case SO_PRIORITY:
818 		if ((val >= 0 && val <= 6) ||
819 		    ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
820 			sk->sk_priority = val;
821 		else
822 			ret = -EPERM;
823 		break;
824 
825 	case SO_LINGER:
826 		if (optlen < sizeof(ling)) {
827 			ret = -EINVAL;	/* 1003.1g */
828 			break;
829 		}
830 		if (copy_from_user(&ling, optval, sizeof(ling))) {
831 			ret = -EFAULT;
832 			break;
833 		}
834 		if (!ling.l_onoff)
835 			sock_reset_flag(sk, SOCK_LINGER);
836 		else {
837 #if (BITS_PER_LONG == 32)
838 			if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
839 				sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
840 			else
841 #endif
842 				sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
843 			sock_set_flag(sk, SOCK_LINGER);
844 		}
845 		break;
846 
847 	case SO_BSDCOMPAT:
848 		sock_warn_obsolete_bsdism("setsockopt");
849 		break;
850 
851 	case SO_PASSCRED:
852 		if (valbool)
853 			set_bit(SOCK_PASSCRED, &sock->flags);
854 		else
855 			clear_bit(SOCK_PASSCRED, &sock->flags);
856 		break;
857 
858 	case SO_TIMESTAMP:
859 	case SO_TIMESTAMPNS:
860 		if (valbool)  {
861 			if (optname == SO_TIMESTAMP)
862 				sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
863 			else
864 				sock_set_flag(sk, SOCK_RCVTSTAMPNS);
865 			sock_set_flag(sk, SOCK_RCVTSTAMP);
866 			sock_enable_timestamp(sk, SOCK_TIMESTAMP);
867 		} else {
868 			sock_reset_flag(sk, SOCK_RCVTSTAMP);
869 			sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
870 		}
871 		break;
872 
873 	case SO_TIMESTAMPING:
874 		if (val & ~SOF_TIMESTAMPING_MASK) {
875 			ret = -EINVAL;
876 			break;
877 		}
878 
879 		if (val & SOF_TIMESTAMPING_OPT_ID &&
880 		    !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
881 			if (sk->sk_protocol == IPPROTO_TCP &&
882 			    sk->sk_type == SOCK_STREAM) {
883 				if ((1 << sk->sk_state) &
884 				    (TCPF_CLOSE | TCPF_LISTEN)) {
885 					ret = -EINVAL;
886 					break;
887 				}
888 				sk->sk_tskey = tcp_sk(sk)->snd_una;
889 			} else {
890 				sk->sk_tskey = 0;
891 			}
892 		}
893 
894 		if (val & SOF_TIMESTAMPING_OPT_STATS &&
895 		    !(val & SOF_TIMESTAMPING_OPT_TSONLY)) {
896 			ret = -EINVAL;
897 			break;
898 		}
899 
900 		sk->sk_tsflags = val;
901 		if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
902 			sock_enable_timestamp(sk,
903 					      SOCK_TIMESTAMPING_RX_SOFTWARE);
904 		else
905 			sock_disable_timestamp(sk,
906 					       (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
907 		break;
908 
909 	case SO_RCVLOWAT:
910 		if (val < 0)
911 			val = INT_MAX;
912 		if (sock->ops->set_rcvlowat)
913 			ret = sock->ops->set_rcvlowat(sk, val);
914 		else
915 			sk->sk_rcvlowat = val ? : 1;
916 		break;
917 
918 	case SO_RCVTIMEO:
919 		ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
920 		break;
921 
922 	case SO_SNDTIMEO:
923 		ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
924 		break;
925 
926 	case SO_ATTACH_FILTER:
927 		ret = -EINVAL;
928 		if (optlen == sizeof(struct sock_fprog)) {
929 			struct sock_fprog fprog;
930 
931 			ret = -EFAULT;
932 			if (copy_from_user(&fprog, optval, sizeof(fprog)))
933 				break;
934 
935 			ret = sk_attach_filter(&fprog, sk);
936 		}
937 		break;
938 
939 	case SO_ATTACH_BPF:
940 		ret = -EINVAL;
941 		if (optlen == sizeof(u32)) {
942 			u32 ufd;
943 
944 			ret = -EFAULT;
945 			if (copy_from_user(&ufd, optval, sizeof(ufd)))
946 				break;
947 
948 			ret = sk_attach_bpf(ufd, sk);
949 		}
950 		break;
951 
952 	case SO_ATTACH_REUSEPORT_CBPF:
953 		ret = -EINVAL;
954 		if (optlen == sizeof(struct sock_fprog)) {
955 			struct sock_fprog fprog;
956 
957 			ret = -EFAULT;
958 			if (copy_from_user(&fprog, optval, sizeof(fprog)))
959 				break;
960 
961 			ret = sk_reuseport_attach_filter(&fprog, sk);
962 		}
963 		break;
964 
965 	case SO_ATTACH_REUSEPORT_EBPF:
966 		ret = -EINVAL;
967 		if (optlen == sizeof(u32)) {
968 			u32 ufd;
969 
970 			ret = -EFAULT;
971 			if (copy_from_user(&ufd, optval, sizeof(ufd)))
972 				break;
973 
974 			ret = sk_reuseport_attach_bpf(ufd, sk);
975 		}
976 		break;
977 
978 	case SO_DETACH_FILTER:
979 		ret = sk_detach_filter(sk);
980 		break;
981 
982 	case SO_LOCK_FILTER:
983 		if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
984 			ret = -EPERM;
985 		else
986 			sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
987 		break;
988 
989 	case SO_PASSSEC:
990 		if (valbool)
991 			set_bit(SOCK_PASSSEC, &sock->flags);
992 		else
993 			clear_bit(SOCK_PASSSEC, &sock->flags);
994 		break;
995 	case SO_MARK:
996 		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
997 			ret = -EPERM;
998 		else
999 			sk->sk_mark = val;
1000 		break;
1001 
1002 	case SO_RXQ_OVFL:
1003 		sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
1004 		break;
1005 
1006 	case SO_WIFI_STATUS:
1007 		sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
1008 		break;
1009 
1010 	case SO_PEEK_OFF:
1011 		if (sock->ops->set_peek_off)
1012 			ret = sock->ops->set_peek_off(sk, val);
1013 		else
1014 			ret = -EOPNOTSUPP;
1015 		break;
1016 
1017 	case SO_NOFCS:
1018 		sock_valbool_flag(sk, SOCK_NOFCS, valbool);
1019 		break;
1020 
1021 	case SO_SELECT_ERR_QUEUE:
1022 		sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
1023 		break;
1024 
1025 #ifdef CONFIG_NET_RX_BUSY_POLL
1026 	case SO_BUSY_POLL:
1027 		/* allow unprivileged users to decrease the value */
1028 		if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
1029 			ret = -EPERM;
1030 		else {
1031 			if (val < 0)
1032 				ret = -EINVAL;
1033 			else
1034 				sk->sk_ll_usec = val;
1035 		}
1036 		break;
1037 #endif
1038 
1039 	case SO_MAX_PACING_RATE:
1040 		if (val != ~0U)
1041 			cmpxchg(&sk->sk_pacing_status,
1042 				SK_PACING_NONE,
1043 				SK_PACING_NEEDED);
1044 		sk->sk_max_pacing_rate = val;
1045 		sk->sk_pacing_rate = min(sk->sk_pacing_rate,
1046 					 sk->sk_max_pacing_rate);
1047 		break;
1048 
1049 	case SO_INCOMING_CPU:
1050 		sk->sk_incoming_cpu = val;
1051 		break;
1052 
1053 	case SO_CNX_ADVICE:
1054 		if (val == 1)
1055 			dst_negative_advice(sk);
1056 		break;
1057 
1058 	case SO_ZEROCOPY:
1059 		if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
1060 			if (sk->sk_protocol != IPPROTO_TCP)
1061 				ret = -ENOTSUPP;
1062 		} else if (sk->sk_family != PF_RDS) {
1063 			ret = -ENOTSUPP;
1064 		}
1065 		if (!ret) {
1066 			if (val < 0 || val > 1)
1067 				ret = -EINVAL;
1068 			else
1069 				sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
1070 		}
1071 		break;
1072 
1073 	default:
1074 		ret = -ENOPROTOOPT;
1075 		break;
1076 	}
1077 	release_sock(sk);
1078 	return ret;
1079 }
1080 EXPORT_SYMBOL(sock_setsockopt);
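/*
 * Userspace view of the SO_RCVBUF doubling described above (illustrative
 * only; the result is also clamped by sysctl_rmem_max and SOCK_MIN_RCVBUF):
 * asking for 64 KB reads back as roughly 128 KB, since the kernel doubles
 * the value to cover struct sk_buff and other bookkeeping overhead:
 *
 *	int val = 65536, out;
 *	socklen_t len = sizeof(out);
 *
 *	setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &val, sizeof(val));
 *	getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &out, &len);
 *
 * out is then about 2 * val (131072 here), assuming rmem_max allows it.
 */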
1081 
1082 
1083 static void cred_to_ucred(struct pid *pid, const struct cred *cred,
1084 			  struct ucred *ucred)
1085 {
1086 	ucred->pid = pid_vnr(pid);
1087 	ucred->uid = ucred->gid = -1;
1088 	if (cred) {
1089 		struct user_namespace *current_ns = current_user_ns();
1090 
1091 		ucred->uid = from_kuid_munged(current_ns, cred->euid);
1092 		ucred->gid = from_kgid_munged(current_ns, cred->egid);
1093 	}
1094 }
1095 
1096 static int groups_to_user(gid_t __user *dst, const struct group_info *src)
1097 {
1098 	struct user_namespace *user_ns = current_user_ns();
1099 	int i;
1100 
1101 	for (i = 0; i < src->ngroups; i++)
1102 		if (put_user(from_kgid_munged(user_ns, src->gid[i]), dst + i))
1103 			return -EFAULT;
1104 
1105 	return 0;
1106 }
1107 
1108 int sock_getsockopt(struct socket *sock, int level, int optname,
1109 		    char __user *optval, int __user *optlen)
1110 {
1111 	struct sock *sk = sock->sk;
1112 
1113 	union {
1114 		int val;
1115 		u64 val64;
1116 		struct linger ling;
1117 		struct timeval tm;
1118 	} v;
1119 
1120 	int lv = sizeof(int);
1121 	int len;
1122 
1123 	if (get_user(len, optlen))
1124 		return -EFAULT;
1125 	if (len < 0)
1126 		return -EINVAL;
1127 
1128 	memset(&v, 0, sizeof(v));
1129 
1130 	switch (optname) {
1131 	case SO_DEBUG:
1132 		v.val = sock_flag(sk, SOCK_DBG);
1133 		break;
1134 
1135 	case SO_DONTROUTE:
1136 		v.val = sock_flag(sk, SOCK_LOCALROUTE);
1137 		break;
1138 
1139 	case SO_BROADCAST:
1140 		v.val = sock_flag(sk, SOCK_BROADCAST);
1141 		break;
1142 
1143 	case SO_SNDBUF:
1144 		v.val = sk->sk_sndbuf;
1145 		break;
1146 
1147 	case SO_RCVBUF:
1148 		v.val = sk->sk_rcvbuf;
1149 		break;
1150 
1151 	case SO_REUSEADDR:
1152 		v.val = sk->sk_reuse;
1153 		break;
1154 
1155 	case SO_REUSEPORT:
1156 		v.val = sk->sk_reuseport;
1157 		break;
1158 
1159 	case SO_KEEPALIVE:
1160 		v.val = sock_flag(sk, SOCK_KEEPOPEN);
1161 		break;
1162 
1163 	case SO_TYPE:
1164 		v.val = sk->sk_type;
1165 		break;
1166 
1167 	case SO_PROTOCOL:
1168 		v.val = sk->sk_protocol;
1169 		break;
1170 
1171 	case SO_DOMAIN:
1172 		v.val = sk->sk_family;
1173 		break;
1174 
1175 	case SO_ERROR:
1176 		v.val = -sock_error(sk);
1177 		if (v.val == 0)
1178 			v.val = xchg(&sk->sk_err_soft, 0);
1179 		break;
1180 
1181 	case SO_OOBINLINE:
1182 		v.val = sock_flag(sk, SOCK_URGINLINE);
1183 		break;
1184 
1185 	case SO_NO_CHECK:
1186 		v.val = sk->sk_no_check_tx;
1187 		break;
1188 
1189 	case SO_PRIORITY:
1190 		v.val = sk->sk_priority;
1191 		break;
1192 
1193 	case SO_LINGER:
1194 		lv		= sizeof(v.ling);
1195 		v.ling.l_onoff	= sock_flag(sk, SOCK_LINGER);
1196 		v.ling.l_linger	= sk->sk_lingertime / HZ;
1197 		break;
1198 
1199 	case SO_BSDCOMPAT:
1200 		sock_warn_obsolete_bsdism("getsockopt");
1201 		break;
1202 
1203 	case SO_TIMESTAMP:
1204 		v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1205 				!sock_flag(sk, SOCK_RCVTSTAMPNS);
1206 		break;
1207 
1208 	case SO_TIMESTAMPNS:
1209 		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
1210 		break;
1211 
1212 	case SO_TIMESTAMPING:
1213 		v.val = sk->sk_tsflags;
1214 		break;
1215 
1216 	case SO_RCVTIMEO:
1217 		lv = sizeof(struct timeval);
1218 		if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
1219 			v.tm.tv_sec = 0;
1220 			v.tm.tv_usec = 0;
1221 		} else {
1222 			v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
1223 			v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * USEC_PER_SEC) / HZ;
1224 		}
1225 		break;
1226 
1227 	case SO_SNDTIMEO:
1228 		lv = sizeof(struct timeval);
1229 		if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
1230 			v.tm.tv_sec = 0;
1231 			v.tm.tv_usec = 0;
1232 		} else {
1233 			v.tm.tv_sec = sk->sk_sndtimeo / HZ;
1234 			v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * USEC_PER_SEC) / HZ;
1235 		}
1236 		break;
1237 
1238 	case SO_RCVLOWAT:
1239 		v.val = sk->sk_rcvlowat;
1240 		break;
1241 
1242 	case SO_SNDLOWAT:
1243 		v.val = 1;
1244 		break;
1245 
1246 	case SO_PASSCRED:
1247 		v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1248 		break;
1249 
1250 	case SO_PEERCRED:
1251 	{
1252 		struct ucred peercred;
1253 		if (len > sizeof(peercred))
1254 			len = sizeof(peercred);
1255 		cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1256 		if (copy_to_user(optval, &peercred, len))
1257 			return -EFAULT;
1258 		goto lenout;
1259 	}
1260 
1261 	case SO_PEERGROUPS:
1262 	{
1263 		int ret, n;
1264 
1265 		if (!sk->sk_peer_cred)
1266 			return -ENODATA;
1267 
1268 		n = sk->sk_peer_cred->group_info->ngroups;
1269 		if (len < n * sizeof(gid_t)) {
1270 			len = n * sizeof(gid_t);
1271 			return put_user(len, optlen) ? -EFAULT : -ERANGE;
1272 		}
1273 		len = n * sizeof(gid_t);
1274 
1275 		ret = groups_to_user((gid_t __user *)optval,
1276 				     sk->sk_peer_cred->group_info);
1277 		if (ret)
1278 			return ret;
1279 		goto lenout;
1280 	}
1281 
1282 	case SO_PEERNAME:
1283 	{
1284 		char address[128];
1285 
1286 		lv = sock->ops->getname(sock, (struct sockaddr *)address, 2);
1287 		if (lv < 0)
1288 			return -ENOTCONN;
1289 		if (lv < len)
1290 			return -EINVAL;
1291 		if (copy_to_user(optval, address, len))
1292 			return -EFAULT;
1293 		goto lenout;
1294 	}
1295 
1296 	/* Dubious BSD thing... Probably nobody even uses it, but
1297 	 * the UNIX standard wants it for whatever reason... -DaveM
1298 	 */
1299 	case SO_ACCEPTCONN:
1300 		v.val = sk->sk_state == TCP_LISTEN;
1301 		break;
1302 
1303 	case SO_PASSSEC:
1304 		v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1305 		break;
1306 
1307 	case SO_PEERSEC:
1308 		return security_socket_getpeersec_stream(sock, optval, optlen, len);
1309 
1310 	case SO_MARK:
1311 		v.val = sk->sk_mark;
1312 		break;
1313 
1314 	case SO_RXQ_OVFL:
1315 		v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1316 		break;
1317 
1318 	case SO_WIFI_STATUS:
1319 		v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1320 		break;
1321 
1322 	case SO_PEEK_OFF:
1323 		if (!sock->ops->set_peek_off)
1324 			return -EOPNOTSUPP;
1325 
1326 		v.val = sk->sk_peek_off;
1327 		break;
1328 	case SO_NOFCS:
1329 		v.val = sock_flag(sk, SOCK_NOFCS);
1330 		break;
1331 
1332 	case SO_BINDTODEVICE:
1333 		return sock_getbindtodevice(sk, optval, optlen, len);
1334 
1335 	case SO_GET_FILTER:
1336 		len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
1337 		if (len < 0)
1338 			return len;
1339 
1340 		goto lenout;
1341 
1342 	case SO_LOCK_FILTER:
1343 		v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1344 		break;
1345 
1346 	case SO_BPF_EXTENSIONS:
1347 		v.val = bpf_tell_extensions();
1348 		break;
1349 
1350 	case SO_SELECT_ERR_QUEUE:
1351 		v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1352 		break;
1353 
1354 #ifdef CONFIG_NET_RX_BUSY_POLL
1355 	case SO_BUSY_POLL:
1356 		v.val = sk->sk_ll_usec;
1357 		break;
1358 #endif
1359 
1360 	case SO_MAX_PACING_RATE:
1361 		v.val = sk->sk_max_pacing_rate;
1362 		break;
1363 
1364 	case SO_INCOMING_CPU:
1365 		v.val = sk->sk_incoming_cpu;
1366 		break;
1367 
1368 	case SO_MEMINFO:
1369 	{
1370 		u32 meminfo[SK_MEMINFO_VARS];
1371 
1372 		if (get_user(len, optlen))
1373 			return -EFAULT;
1374 
1375 		sk_get_meminfo(sk, meminfo);
1376 
1377 		len = min_t(unsigned int, len, sizeof(meminfo));
1378 		if (copy_to_user(optval, &meminfo, len))
1379 			return -EFAULT;
1380 
1381 		goto lenout;
1382 	}
1383 
1384 #ifdef CONFIG_NET_RX_BUSY_POLL
1385 	case SO_INCOMING_NAPI_ID:
1386 		v.val = READ_ONCE(sk->sk_napi_id);
1387 
1388 		/* aggregate non-NAPI IDs down to 0 */
1389 		if (v.val < MIN_NAPI_ID)
1390 			v.val = 0;
1391 
1392 		break;
1393 #endif
1394 
1395 	case SO_COOKIE:
1396 		lv = sizeof(u64);
1397 		if (len < lv)
1398 			return -EINVAL;
1399 		v.val64 = sock_gen_cookie(sk);
1400 		break;
1401 
1402 	case SO_ZEROCOPY:
1403 		v.val = sock_flag(sk, SOCK_ZEROCOPY);
1404 		break;
1405 
1406 	default:
1407 		/* We implement the SO_SNDLOWAT etc to not be settable
1408 		 * (1003.1g 7).
1409 		 */
1410 		return -ENOPROTOOPT;
1411 	}
1412 
1413 	if (len > lv)
1414 		len = lv;
1415 	if (copy_to_user(optval, &v, len))
1416 		return -EFAULT;
1417 lenout:
1418 	if (put_user(len, optlen))
1419 		return -EFAULT;
1420 	return 0;
1421 }
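/*
 * Userspace view (illustrative only): the SO_PEERCRED case above fills a
 * struct ucred with the peer's pid/uid/gid, translated into the caller's
 * namespaces; it is mainly useful on AF_UNIX sockets:
 *
 *	struct ucred uc;
 *	socklen_t len = sizeof(uc);
 *
 *	if (getsockopt(fd, SOL_SOCKET, SO_PEERCRED, &uc, &len) == 0)
 *		printf("peer pid=%d uid=%d gid=%d\n", uc.pid, uc.uid, uc.gid);
 */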
1422 
1423 /*
1424  * Initialize an sk_lock.
1425  *
1426  * (We also register the sk_lock with the lock validator.)
1427  */
1428 static inline void sock_lock_init(struct sock *sk)
1429 {
1430 	if (sk->sk_kern_sock)
1431 		sock_lock_init_class_and_name(
1432 			sk,
1433 			af_family_kern_slock_key_strings[sk->sk_family],
1434 			af_family_kern_slock_keys + sk->sk_family,
1435 			af_family_kern_key_strings[sk->sk_family],
1436 			af_family_kern_keys + sk->sk_family);
1437 	else
1438 		sock_lock_init_class_and_name(
1439 			sk,
1440 			af_family_slock_key_strings[sk->sk_family],
1441 			af_family_slock_keys + sk->sk_family,
1442 			af_family_key_strings[sk->sk_family],
1443 			af_family_keys + sk->sk_family);
1444 }
1445 
1446 /*
1447  * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
1448  * even temporarily, because of RCU lookups. sk_node should also be left as is.
1449  * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end.
1450  */
1451 static void sock_copy(struct sock *nsk, const struct sock *osk)
1452 {
1453 #ifdef CONFIG_SECURITY_NETWORK
1454 	void *sptr = nsk->sk_security;
1455 #endif
1456 	memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1457 
1458 	memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1459 	       osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1460 
1461 #ifdef CONFIG_SECURITY_NETWORK
1462 	nsk->sk_security = sptr;
1463 	security_sk_clone(osk, nsk);
1464 #endif
1465 }
1466 
1467 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1468 		int family)
1469 {
1470 	struct sock *sk;
1471 	struct kmem_cache *slab;
1472 
1473 	slab = prot->slab;
1474 	if (slab != NULL) {
1475 		sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1476 		if (!sk)
1477 			return sk;
1478 		if (priority & __GFP_ZERO)
1479 			sk_prot_clear_nulls(sk, prot->obj_size);
1480 	} else
1481 		sk = kmalloc(prot->obj_size, priority);
1482 
1483 	if (sk != NULL) {
1484 		if (security_sk_alloc(sk, family, priority))
1485 			goto out_free;
1486 
1487 		if (!try_module_get(prot->owner))
1488 			goto out_free_sec;
1489 		sk_tx_queue_clear(sk);
1490 	}
1491 
1492 	return sk;
1493 
1494 out_free_sec:
1495 	security_sk_free(sk);
1496 out_free:
1497 	if (slab != NULL)
1498 		kmem_cache_free(slab, sk);
1499 	else
1500 		kfree(sk);
1501 	return NULL;
1502 }
1503 
1504 static void sk_prot_free(struct proto *prot, struct sock *sk)
1505 {
1506 	struct kmem_cache *slab;
1507 	struct module *owner;
1508 
1509 	owner = prot->owner;
1510 	slab = prot->slab;
1511 
1512 	cgroup_sk_free(&sk->sk_cgrp_data);
1513 	mem_cgroup_sk_free(sk);
1514 	security_sk_free(sk);
1515 	if (slab != NULL)
1516 		kmem_cache_free(slab, sk);
1517 	else
1518 		kfree(sk);
1519 	module_put(owner);
1520 }
1521 
1522 /**
1523  *	sk_alloc - All socket objects are allocated here
1524  *	@net: the applicable net namespace
1525  *	@family: protocol family
1526  *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1527  *	@prot: struct proto associated with this new sock instance
1528  *	@kern: is this to be a kernel socket?
1529  */
1530 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1531 		      struct proto *prot, int kern)
1532 {
1533 	struct sock *sk;
1534 
1535 	sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1536 	if (sk) {
1537 		sk->sk_family = family;
1538 		/*
1539 		 * See comment in struct sock definition to understand
1540 		 * why we need sk_prot_creator -acme
1541 		 */
1542 		sk->sk_prot = sk->sk_prot_creator = prot;
1543 		sk->sk_kern_sock = kern;
1544 		sock_lock_init(sk);
1545 		sk->sk_net_refcnt = kern ? 0 : 1;
1546 		if (likely(sk->sk_net_refcnt)) {
1547 			get_net(net);
1548 			sock_inuse_add(net, 1);
1549 		}
1550 
1551 		sock_net_set(sk, net);
1552 		refcount_set(&sk->sk_wmem_alloc, 1);
1553 
1554 		mem_cgroup_sk_alloc(sk);
1555 		cgroup_sk_alloc(&sk->sk_cgrp_data);
1556 		sock_update_classid(&sk->sk_cgrp_data);
1557 		sock_update_netprioidx(&sk->sk_cgrp_data);
1558 	}
1559 
1560 	return sk;
1561 }
1562 EXPORT_SYMBOL(sk_alloc);
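/*
 * Usage sketch (hypothetical address family ->create() handler; PF_EXAMPLE
 * and example_proto are placeholders, not real definitions): protocols
 * allocate their sock here and then initialise the generic fields:
 *
 *	static int example_create(struct net *net, struct socket *sock,
 *				  int protocol, int kern)
 *	{
 *		struct sock *sk;
 *
 *		sk = sk_alloc(net, PF_EXAMPLE, GFP_KERNEL, &example_proto, kern);
 *		if (!sk)
 *			return -ENOMEM;
 *		sock_init_data(sock, sk);
 *		return 0;
 *	}
 */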
1563 
1564 /* Sockets having SOCK_RCU_FREE will call this function after one RCU
1565  * grace period. This is the case for UDP sockets and TCP listeners.
1566  */
1567 static void __sk_destruct(struct rcu_head *head)
1568 {
1569 	struct sock *sk = container_of(head, struct sock, sk_rcu);
1570 	struct sk_filter *filter;
1571 
1572 	if (sk->sk_destruct)
1573 		sk->sk_destruct(sk);
1574 
1575 	filter = rcu_dereference_check(sk->sk_filter,
1576 				       refcount_read(&sk->sk_wmem_alloc) == 0);
1577 	if (filter) {
1578 		sk_filter_uncharge(sk, filter);
1579 		RCU_INIT_POINTER(sk->sk_filter, NULL);
1580 	}
1581 	if (rcu_access_pointer(sk->sk_reuseport_cb))
1582 		reuseport_detach_sock(sk);
1583 
1584 	sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
1585 
1586 	if (atomic_read(&sk->sk_omem_alloc))
1587 		pr_debug("%s: optmem leakage (%d bytes) detected\n",
1588 			 __func__, atomic_read(&sk->sk_omem_alloc));
1589 
1590 	if (sk->sk_frag.page) {
1591 		put_page(sk->sk_frag.page);
1592 		sk->sk_frag.page = NULL;
1593 	}
1594 
1595 	if (sk->sk_peer_cred)
1596 		put_cred(sk->sk_peer_cred);
1597 	put_pid(sk->sk_peer_pid);
1598 	if (likely(sk->sk_net_refcnt))
1599 		put_net(sock_net(sk));
1600 	sk_prot_free(sk->sk_prot_creator, sk);
1601 }
1602 
1603 void sk_destruct(struct sock *sk)
1604 {
1605 	if (sock_flag(sk, SOCK_RCU_FREE))
1606 		call_rcu(&sk->sk_rcu, __sk_destruct);
1607 	else
1608 		__sk_destruct(&sk->sk_rcu);
1609 }
1610 
1611 static void __sk_free(struct sock *sk)
1612 {
1613 	if (likely(sk->sk_net_refcnt))
1614 		sock_inuse_add(sock_net(sk), -1);
1615 
1616 	if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk)))
1617 		sock_diag_broadcast_destroy(sk);
1618 	else
1619 		sk_destruct(sk);
1620 }
1621 
1622 void sk_free(struct sock *sk)
1623 {
1624 	/*
1625 	 * We subtract one from sk_wmem_alloc so we can tell whether
1626 	 * some packets are still in some tx queue.
1627 	 * If the result is not zero, sock_wfree() will call __sk_free(sk) later.
1628 	 */
1629 	if (refcount_dec_and_test(&sk->sk_wmem_alloc))
1630 		__sk_free(sk);
1631 }
1632 EXPORT_SYMBOL(sk_free);
1633 
1634 static void sk_init_common(struct sock *sk)
1635 {
1636 	skb_queue_head_init(&sk->sk_receive_queue);
1637 	skb_queue_head_init(&sk->sk_write_queue);
1638 	skb_queue_head_init(&sk->sk_error_queue);
1639 
1640 	rwlock_init(&sk->sk_callback_lock);
1641 	lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
1642 			af_rlock_keys + sk->sk_family,
1643 			af_family_rlock_key_strings[sk->sk_family]);
1644 	lockdep_set_class_and_name(&sk->sk_write_queue.lock,
1645 			af_wlock_keys + sk->sk_family,
1646 			af_family_wlock_key_strings[sk->sk_family]);
1647 	lockdep_set_class_and_name(&sk->sk_error_queue.lock,
1648 			af_elock_keys + sk->sk_family,
1649 			af_family_elock_key_strings[sk->sk_family]);
1650 	lockdep_set_class_and_name(&sk->sk_callback_lock,
1651 			af_callback_keys + sk->sk_family,
1652 			af_family_clock_key_strings[sk->sk_family]);
1653 }
1654 
1655 /**
1656  *	sk_clone_lock - clone a socket, and lock its clone
1657  *	@sk: the socket to clone
1658  *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1659  *
1660  *	Caller must unlock socket even in error path (bh_unlock_sock(newsk))
1661  */
1662 struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
1663 {
1664 	struct sock *newsk;
1665 	bool is_charged = true;
1666 
1667 	newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
1668 	if (newsk != NULL) {
1669 		struct sk_filter *filter;
1670 
1671 		sock_copy(newsk, sk);
1672 
1673 		newsk->sk_prot_creator = sk->sk_prot;
1674 
1675 		/* SANITY */
1676 		if (likely(newsk->sk_net_refcnt))
1677 			get_net(sock_net(newsk));
1678 		sk_node_init(&newsk->sk_node);
1679 		sock_lock_init(newsk);
1680 		bh_lock_sock(newsk);
1681 		newsk->sk_backlog.head	= newsk->sk_backlog.tail = NULL;
1682 		newsk->sk_backlog.len = 0;
1683 
1684 		atomic_set(&newsk->sk_rmem_alloc, 0);
1685 		/*
1686 		 * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
1687 		 */
1688 		refcount_set(&newsk->sk_wmem_alloc, 1);
1689 		atomic_set(&newsk->sk_omem_alloc, 0);
1690 		sk_init_common(newsk);
1691 
1692 		newsk->sk_dst_cache	= NULL;
1693 		newsk->sk_dst_pending_confirm = 0;
1694 		newsk->sk_wmem_queued	= 0;
1695 		newsk->sk_forward_alloc = 0;
1696 		atomic_set(&newsk->sk_drops, 0);
1697 		newsk->sk_send_head	= NULL;
1698 		newsk->sk_userlocks	= sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
1699 		atomic_set(&newsk->sk_zckey, 0);
1700 
1701 		sock_reset_flag(newsk, SOCK_DONE);
1702 		mem_cgroup_sk_alloc(newsk);
1703 		cgroup_sk_alloc(&newsk->sk_cgrp_data);
1704 
1705 		rcu_read_lock();
1706 		filter = rcu_dereference(sk->sk_filter);
1707 		if (filter != NULL)
1708 			/* Though it's an empty new sock, the charging may fail
1709 			 * if sysctl_optmem_max was changed between creation of
1710 			 * the original socket and cloning.
1711 			 */
1712 			is_charged = sk_filter_charge(newsk, filter);
1713 		RCU_INIT_POINTER(newsk->sk_filter, filter);
1714 		rcu_read_unlock();
1715 
1716 		if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
1717 			/* We need to make sure that we don't uncharge the new
1718 			 * socket if we couldn't charge it in the first place
1719 			 * as otherwise we uncharge the parent's filter.
1720 			 */
1721 			if (!is_charged)
1722 				RCU_INIT_POINTER(newsk->sk_filter, NULL);
1723 			sk_free_unlock_clone(newsk);
1724 			newsk = NULL;
1725 			goto out;
1726 		}
1727 		RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
1728 
1729 		newsk->sk_err	   = 0;
1730 		newsk->sk_err_soft = 0;
1731 		newsk->sk_priority = 0;
1732 		newsk->sk_incoming_cpu = raw_smp_processor_id();
1733 		atomic64_set(&newsk->sk_cookie, 0);
1734 		if (likely(newsk->sk_net_refcnt))
1735 			sock_inuse_add(sock_net(newsk), 1);
1736 
1737 		/*
1738 		 * Before updating sk_refcnt, we must commit prior changes to memory
1739 		 * (Documentation/RCU/rculist_nulls.txt for details)
1740 		 */
1741 		smp_wmb();
1742 		refcount_set(&newsk->sk_refcnt, 2);
1743 
1744 		/*
1745 		 * Increment the counter in the same struct proto as the master
1746 		 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
1747 		 * is the same as sk->sk_prot->socks, as this field was copied
1748 		 * with memcpy).
1749 		 *
1750 		 * This _changes_ the previous behaviour, where
1751 		 * tcp_create_openreq_child always was incrementing the
1752 		 * equivalent to tcp_prot->socks (inet_sock_nr), so this has
1753 		 * to be taken into account in all callers. -acme
1754 		 */
1755 		sk_refcnt_debug_inc(newsk);
1756 		sk_set_socket(newsk, NULL);
1757 		newsk->sk_wq = NULL;
1758 
1759 		if (newsk->sk_prot->sockets_allocated)
1760 			sk_sockets_allocated_inc(newsk);
1761 
1762 		if (sock_needs_netstamp(sk) &&
1763 		    newsk->sk_flags & SK_FLAGS_TIMESTAMP)
1764 			net_enable_timestamp();
1765 	}
1766 out:
1767 	return newsk;
1768 }
1769 EXPORT_SYMBOL_GPL(sk_clone_lock);
1770 
1771 void sk_free_unlock_clone(struct sock *sk)
1772 {
1773 	/* It is still a raw copy of the parent, so invalidate
1774 	 * the destructor and do a plain sk_free(). */
1775 	sk->sk_destruct = NULL;
1776 	bh_unlock_sock(sk);
1777 	sk_free(sk);
1778 }
1779 EXPORT_SYMBOL_GPL(sk_free_unlock_clone);
1780 
1781 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1782 {
1783 	u32 max_segs = 1;
1784 
1785 	sk_dst_set(sk, dst);
1786 	sk->sk_route_caps = dst->dev->features | sk->sk_route_forced_caps;
1787 	if (sk->sk_route_caps & NETIF_F_GSO)
1788 		sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
1789 	sk->sk_route_caps &= ~sk->sk_route_nocaps;
1790 	if (sk_can_gso(sk)) {
1791 		if (dst->header_len && !xfrm_dst_offload_ok(dst)) {
1792 			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1793 		} else {
1794 			sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
1795 			sk->sk_gso_max_size = dst->dev->gso_max_size;
1796 			max_segs = max_t(u32, dst->dev->gso_max_segs, 1);
1797 		}
1798 	}
1799 	sk->sk_gso_max_segs = max_segs;
1800 }
1801 EXPORT_SYMBOL_GPL(sk_setup_caps);
1802 
1803 /*
1804  *	Simple resource managers for sockets.
1805  */
1806 
1807 
1808 /*
1809  * Write buffer destructor automatically called from kfree_skb.
1810  */
1811 void sock_wfree(struct sk_buff *skb)
1812 {
1813 	struct sock *sk = skb->sk;
1814 	unsigned int len = skb->truesize;
1815 
1816 	if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
1817 		/*
1818 		 * Keep a reference on sk_wmem_alloc; it will be released
1819 		 * after the sk_write_space() call.
1820 		 */
1821 		WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
1822 		sk->sk_write_space(sk);
1823 		len = 1;
1824 	}
1825 	/*
1826 	 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
1827 	 * could not do because of in-flight packets
1828 	 */
1829 	if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
1830 		__sk_free(sk);
1831 }
1832 EXPORT_SYMBOL(sock_wfree);
1833 
1834 /* This variant of sock_wfree() is used by TCP,
1835  * since it sets SOCK_USE_WRITE_QUEUE.
1836  */
1837 void __sock_wfree(struct sk_buff *skb)
1838 {
1839 	struct sock *sk = skb->sk;
1840 
1841 	if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
1842 		__sk_free(sk);
1843 }
1844 
1845 void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
1846 {
1847 	skb_orphan(skb);
1848 	skb->sk = sk;
1849 #ifdef CONFIG_INET
1850 	if (unlikely(!sk_fullsock(sk))) {
1851 		skb->destructor = sock_edemux;
1852 		sock_hold(sk);
1853 		return;
1854 	}
1855 #endif
1856 	skb->destructor = sock_wfree;
1857 	skb_set_hash_from_sk(skb, sk);
1858 	/*
1859 	 * We used to take a refcount on sk, but the following operation
1860 	 * is enough to guarantee sk_free() won't free this sock until
1861 	 * all in-flight packets are completed.
1862 	 */
1863 	refcount_add(skb->truesize, &sk->sk_wmem_alloc);
1864 }
1865 EXPORT_SYMBOL(skb_set_owner_w);
1866 
1867 /* This helper is used by netem, as it can hold packets in its
1868  * delay queue. We want to allow the owner socket to send more
1869  * packets, as if they were already TX completed by a typical driver.
1870  * But we also want to keep skb->sk set because some packet schedulers
1871  * rely on it (sch_fq for example).
1872  */
1873 void skb_orphan_partial(struct sk_buff *skb)
1874 {
1875 	if (skb_is_tcp_pure_ack(skb))
1876 		return;
1877 
1878 	if (skb->destructor == sock_wfree
1879 #ifdef CONFIG_INET
1880 	    || skb->destructor == tcp_wfree
1881 #endif
1882 		) {
1883 		struct sock *sk = skb->sk;
1884 
1885 		if (refcount_inc_not_zero(&sk->sk_refcnt)) {
1886 			WARN_ON(refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc));
1887 			skb->destructor = sock_efree;
1888 		}
1889 	} else {
1890 		skb_orphan(skb);
1891 	}
1892 }
1893 EXPORT_SYMBOL(skb_orphan_partial);
1894 
1895 /*
1896  * Read buffer destructor automatically called from kfree_skb.
1897  */
1898 void sock_rfree(struct sk_buff *skb)
1899 {
1900 	struct sock *sk = skb->sk;
1901 	unsigned int len = skb->truesize;
1902 
1903 	atomic_sub(len, &sk->sk_rmem_alloc);
1904 	sk_mem_uncharge(sk, len);
1905 }
1906 EXPORT_SYMBOL(sock_rfree);
1907 
1908 /*
1909  * Buffer destructor for skbs that are not used directly in read or write
1910  * path, e.g. for error handler skbs. Automatically called from kfree_skb.
1911  */
1912 void sock_efree(struct sk_buff *skb)
1913 {
1914 	sock_put(skb->sk);
1915 }
1916 EXPORT_SYMBOL(sock_efree);
1917 
1918 kuid_t sock_i_uid(struct sock *sk)
1919 {
1920 	kuid_t uid;
1921 
1922 	read_lock_bh(&sk->sk_callback_lock);
1923 	uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
1924 	read_unlock_bh(&sk->sk_callback_lock);
1925 	return uid;
1926 }
1927 EXPORT_SYMBOL(sock_i_uid);
1928 
1929 unsigned long sock_i_ino(struct sock *sk)
1930 {
1931 	unsigned long ino;
1932 
1933 	read_lock_bh(&sk->sk_callback_lock);
1934 	ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
1935 	read_unlock_bh(&sk->sk_callback_lock);
1936 	return ino;
1937 }
1938 EXPORT_SYMBOL(sock_i_ino);
1939 
1940 /*
1941  * Allocate a skb from the socket's send buffer.
1942  */
1943 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
1944 			     gfp_t priority)
1945 {
1946 	if (force || refcount_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1947 		struct sk_buff *skb = alloc_skb(size, priority);
1948 		if (skb) {
1949 			skb_set_owner_w(skb, sk);
1950 			return skb;
1951 		}
1952 	}
1953 	return NULL;
1954 }
1955 EXPORT_SYMBOL(sock_wmalloc);
1956 
1957 static void sock_ofree(struct sk_buff *skb)
1958 {
1959 	struct sock *sk = skb->sk;
1960 
1961 	atomic_sub(skb->truesize, &sk->sk_omem_alloc);
1962 }
1963 
1964 struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
1965 			     gfp_t priority)
1966 {
1967 	struct sk_buff *skb;
1968 
1969 	/* small safe race: SKB_TRUESIZE may differ from final skb->truesize */
1970 	if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) >
1971 	    sysctl_optmem_max)
1972 		return NULL;
1973 
1974 	skb = alloc_skb(size, priority);
1975 	if (!skb)
1976 		return NULL;
1977 
1978 	atomic_add(skb->truesize, &sk->sk_omem_alloc);
1979 	skb->sk = sk;
1980 	skb->destructor = sock_ofree;
1981 	return skb;
1982 }
1983 
1984 /*
1985  * Allocate a memory block from the socket's option memory buffer.
1986  */
1987 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
1988 {
1989 	if ((unsigned int)size <= sysctl_optmem_max &&
1990 	    atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
1991 		void *mem;
1992 		/* Do the add first, to avoid a race in case kmalloc
1993 		 * sleeps.
1994 		 */
1995 		atomic_add(size, &sk->sk_omem_alloc);
1996 		mem = kmalloc(size, priority);
1997 		if (mem)
1998 			return mem;
1999 		atomic_sub(size, &sk->sk_omem_alloc);
2000 	}
2001 	return NULL;
2002 }
2003 EXPORT_SYMBOL(sock_kmalloc);
2004 
2005 /* Free an option memory block. Note that we actually want the inline
2006  * here, as it allows gcc to detect the nullify and fold away the
2007  * condition entirely.
2008  */
2009 static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
2010 				  const bool nullify)
2011 {
2012 	if (WARN_ON_ONCE(!mem))
2013 		return;
2014 	if (nullify)
2015 		kzfree(mem);
2016 	else
2017 		kfree(mem);
2018 	atomic_sub(size, &sk->sk_omem_alloc);
2019 }
2020 
2021 void sock_kfree_s(struct sock *sk, void *mem, int size)
2022 {
2023 	__sock_kfree_s(sk, mem, size, false);
2024 }
2025 EXPORT_SYMBOL(sock_kfree_s);
2026 
2027 void sock_kzfree_s(struct sock *sk, void *mem, int size)
2028 {
2029 	__sock_kfree_s(sk, mem, size, true);
2030 }
2031 EXPORT_SYMBOL(sock_kzfree_s);
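/*
 * Illustrative sketch (assumed caller, not part of this file): option memory
 * obtained from sock_kmalloc() must be returned through sock_kfree_s() (or
 * sock_kzfree_s() for sensitive data) with the same size, so sk_omem_alloc
 * stays balanced:
 *
 *	void *mem = sock_kmalloc(sk, optlen, GFP_KERNEL);
 *
 *	if (!mem)
 *		return -ENOBUFS;
 *	... use mem while processing the option ...
 *	sock_kfree_s(sk, mem, optlen);
 */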
2032 
2033 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
2034  * I think these locks should be removed for datagram sockets.
2035  */
2036 static long sock_wait_for_wmem(struct sock *sk, long timeo)
2037 {
2038 	DEFINE_WAIT(wait);
2039 
2040 	sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2041 	for (;;) {
2042 		if (!timeo)
2043 			break;
2044 		if (signal_pending(current))
2045 			break;
2046 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2047 		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
2048 		if (refcount_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
2049 			break;
2050 		if (sk->sk_shutdown & SEND_SHUTDOWN)
2051 			break;
2052 		if (sk->sk_err)
2053 			break;
2054 		timeo = schedule_timeout(timeo);
2055 	}
2056 	finish_wait(sk_sleep(sk), &wait);
2057 	return timeo;
2058 }
2059 
2060 
2061 /*
2062  *	Generic send/receive buffer handlers
2063  */
2064 
2065 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
2066 				     unsigned long data_len, int noblock,
2067 				     int *errcode, int max_page_order)
2068 {
2069 	struct sk_buff *skb;
2070 	long timeo;
2071 	int err;
2072 
2073 	timeo = sock_sndtimeo(sk, noblock);
2074 	for (;;) {
2075 		err = sock_error(sk);
2076 		if (err != 0)
2077 			goto failure;
2078 
2079 		err = -EPIPE;
2080 		if (sk->sk_shutdown & SEND_SHUTDOWN)
2081 			goto failure;
2082 
2083 		if (sk_wmem_alloc_get(sk) < sk->sk_sndbuf)
2084 			break;
2085 
2086 		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2087 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2088 		err = -EAGAIN;
2089 		if (!timeo)
2090 			goto failure;
2091 		if (signal_pending(current))
2092 			goto interrupted;
2093 		timeo = sock_wait_for_wmem(sk, timeo);
2094 	}
2095 	skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
2096 				   errcode, sk->sk_allocation);
2097 	if (skb)
2098 		skb_set_owner_w(skb, sk);
2099 	return skb;
2100 
2101 interrupted:
2102 	err = sock_intr_errno(timeo);
2103 failure:
2104 	*errcode = err;
2105 	return NULL;
2106 }
2107 EXPORT_SYMBOL(sock_alloc_send_pskb);
2108 
2109 struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
2110 				    int noblock, int *errcode)
2111 {
2112 	return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0);
2113 }
2114 EXPORT_SYMBOL(sock_alloc_send_skb);
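/*
 * Illustrative sketch (assumed caller, not part of this file): datagram
 * protocols typically build their output skbs with the helper above, which
 * blocks (subject to sk_sndtimeo) until write memory is available:
 *
 *	skb = sock_alloc_send_skb(sk, hlen + dlen,
 *				  msg->msg_flags & MSG_DONTWAIT, &err);
 *	if (!skb)
 *		goto out_err;
 *	skb_reserve(skb, hlen);
 *
 * hlen and dlen are hypothetical header/data lengths.
 */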
2115 
2116 int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg,
2117 		     struct sockcm_cookie *sockc)
2118 {
2119 	u32 tsflags;
2120 
2121 	switch (cmsg->cmsg_type) {
2122 	case SO_MARK:
2123 		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2124 			return -EPERM;
2125 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2126 			return -EINVAL;
2127 		sockc->mark = *(u32 *)CMSG_DATA(cmsg);
2128 		break;
2129 	case SO_TIMESTAMPING:
2130 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2131 			return -EINVAL;
2132 
2133 		tsflags = *(u32 *)CMSG_DATA(cmsg);
2134 		if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
2135 			return -EINVAL;
2136 
2137 		sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
2138 		sockc->tsflags |= tsflags;
2139 		break;
2140 	/* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
2141 	case SCM_RIGHTS:
2142 	case SCM_CREDENTIALS:
2143 		break;
2144 	default:
2145 		return -EINVAL;
2146 	}
2147 	return 0;
2148 }
2149 EXPORT_SYMBOL(__sock_cmsg_send);
2150 
2151 int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
2152 		   struct sockcm_cookie *sockc)
2153 {
2154 	struct cmsghdr *cmsg;
2155 	int ret;
2156 
2157 	for_each_cmsghdr(cmsg, msg) {
2158 		if (!CMSG_OK(msg, cmsg))
2159 			return -EINVAL;
2160 		if (cmsg->cmsg_level != SOL_SOCKET)
2161 			continue;
2162 		ret = __sock_cmsg_send(sk, msg, cmsg, sockc);
2163 		if (ret)
2164 			return ret;
2165 	}
2166 	return 0;
2167 }
2168 EXPORT_SYMBOL(sock_cmsg_send);
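/*
 * Illustrative sketch (assumed caller, not part of this file): sendmsg()
 * implementations seed a sockcm_cookie from the socket defaults and then
 * let sock_cmsg_send() override it from SOL_SOCKET control messages:
 *
 *	struct sockcm_cookie sockc = { .tsflags = sk->sk_tsflags };
 *
 *	if (msg->msg_controllen) {
 *		err = sock_cmsg_send(sk, msg, &sockc);
 *		if (unlikely(err))
 *			goto out;
 *	}
 */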
2169 
2170 static void sk_enter_memory_pressure(struct sock *sk)
2171 {
2172 	if (!sk->sk_prot->enter_memory_pressure)
2173 		return;
2174 
2175 	sk->sk_prot->enter_memory_pressure(sk);
2176 }
2177 
2178 static void sk_leave_memory_pressure(struct sock *sk)
2179 {
2180 	if (sk->sk_prot->leave_memory_pressure) {
2181 		sk->sk_prot->leave_memory_pressure(sk);
2182 	} else {
2183 		unsigned long *memory_pressure = sk->sk_prot->memory_pressure;
2184 
2185 		if (memory_pressure && *memory_pressure)
2186 			*memory_pressure = 0;
2187 	}
2188 }
2189 
2190 /* On 32bit arches, an skb frag is limited to 2^15 */
2191 #define SKB_FRAG_PAGE_ORDER	get_order(32768)
2192 
2193 /**
2194  * skb_page_frag_refill - check that a page_frag contains enough room
2195  * @sz: minimum size of the fragment we want to get
2196  * @pfrag: pointer to page_frag
2197  * @gfp: priority for memory allocation
2198  *
2199  * Note: While this allocator tries to use high order pages, there is
2200  * no guarantee that allocations succeed. Therefore, @sz MUST be
2201  * less than or equal to PAGE_SIZE.
2202  */
2203 bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
2204 {
2205 	if (pfrag->page) {
2206 		if (page_ref_count(pfrag->page) == 1) {
2207 			pfrag->offset = 0;
2208 			return true;
2209 		}
2210 		if (pfrag->offset + sz <= pfrag->size)
2211 			return true;
2212 		put_page(pfrag->page);
2213 	}
2214 
2215 	pfrag->offset = 0;
2216 	if (SKB_FRAG_PAGE_ORDER) {
2217 		/* Avoid direct reclaim but allow kswapd to wake */
2218 		pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
2219 					  __GFP_COMP | __GFP_NOWARN |
2220 					  __GFP_NORETRY,
2221 					  SKB_FRAG_PAGE_ORDER);
2222 		if (likely(pfrag->page)) {
2223 			pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
2224 			return true;
2225 		}
2226 	}
2227 	pfrag->page = alloc_page(gfp);
2228 	if (likely(pfrag->page)) {
2229 		pfrag->size = PAGE_SIZE;
2230 		return true;
2231 	}
2232 	return false;
2233 }
2234 EXPORT_SYMBOL(skb_page_frag_refill);
2235 
2236 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
2237 {
2238 	if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
2239 		return true;
2240 
2241 	sk_enter_memory_pressure(sk);
2242 	sk_stream_moderate_sndbuf(sk);
2243 	return false;
2244 }
2245 EXPORT_SYMBOL(sk_page_frag_refill);
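/*
 * Illustrative sketch (assumed caller, not part of this file): stream
 * protocols copy user data into the per-socket page fragment, refilling it
 * with the helper above whenever the current page is exhausted:
 *
 *	struct page_frag *pfrag = sk_page_frag(sk);
 *
 *	if (!sk_page_frag_refill(sk, pfrag))
 *		goto wait_for_memory;
 *	copy = min_t(int, len, pfrag->size - pfrag->offset);
 *	... copy into pfrag->page at pfrag->offset, then advance the offset ...
 */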
2246 
2247 int sk_alloc_sg(struct sock *sk, int len, struct scatterlist *sg,
2248 		int sg_start, int *sg_curr_index, unsigned int *sg_curr_size,
2249 		int first_coalesce)
2250 {
2251 	int sg_curr = *sg_curr_index, use = 0, rc = 0;
2252 	unsigned int size = *sg_curr_size;
2253 	struct page_frag *pfrag;
2254 	struct scatterlist *sge;
2255 
2256 	len -= size;
2257 	pfrag = sk_page_frag(sk);
2258 
2259 	while (len > 0) {
2260 		unsigned int orig_offset;
2261 
2262 		if (!sk_page_frag_refill(sk, pfrag)) {
2263 			rc = -ENOMEM;
2264 			goto out;
2265 		}
2266 
2267 		use = min_t(int, len, pfrag->size - pfrag->offset);
2268 
2269 		if (!sk_wmem_schedule(sk, use)) {
2270 			rc = -ENOMEM;
2271 			goto out;
2272 		}
2273 
2274 		sk_mem_charge(sk, use);
2275 		size += use;
2276 		orig_offset = pfrag->offset;
2277 		pfrag->offset += use;
2278 
2279 		sge = sg + sg_curr - 1;
2280 		if (sg_curr > first_coalesce && sg_page(sge) == pfrag->page &&
2281 		    sge->offset + sge->length == orig_offset) {
2282 			sge->length += use;
2283 		} else {
2284 			sge = sg + sg_curr;
2285 			sg_unmark_end(sge);
2286 			sg_set_page(sge, pfrag->page, use, orig_offset);
2287 			get_page(pfrag->page);
2288 			sg_curr++;
2289 
2290 			if (sg_curr == MAX_SKB_FRAGS)
2291 				sg_curr = 0;
2292 
2293 			if (sg_curr == sg_start) {
2294 				rc = -ENOSPC;
2295 				break;
2296 			}
2297 		}
2298 
2299 		len -= use;
2300 	}
2301 out:
2302 	*sg_curr_size = size;
2303 	*sg_curr_index = sg_curr;
2304 	return rc;
2305 }
2306 EXPORT_SYMBOL(sk_alloc_sg);
2307 
2308 static void __lock_sock(struct sock *sk)
2309 	__releases(&sk->sk_lock.slock)
2310 	__acquires(&sk->sk_lock.slock)
2311 {
2312 	DEFINE_WAIT(wait);
2313 
2314 	for (;;) {
2315 		prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
2316 					TASK_UNINTERRUPTIBLE);
2317 		spin_unlock_bh(&sk->sk_lock.slock);
2318 		schedule();
2319 		spin_lock_bh(&sk->sk_lock.slock);
2320 		if (!sock_owned_by_user(sk))
2321 			break;
2322 	}
2323 	finish_wait(&sk->sk_lock.wq, &wait);
2324 }
2325 
2326 static void __release_sock(struct sock *sk)
2327 	__releases(&sk->sk_lock.slock)
2328 	__acquires(&sk->sk_lock.slock)
2329 {
2330 	struct sk_buff *skb, *next;
2331 
2332 	while ((skb = sk->sk_backlog.head) != NULL) {
2333 		sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
2334 
2335 		spin_unlock_bh(&sk->sk_lock.slock);
2336 
2337 		do {
2338 			next = skb->next;
2339 			prefetch(next);
2340 			WARN_ON_ONCE(skb_dst_is_noref(skb));
2341 			skb->next = NULL;
2342 			sk_backlog_rcv(sk, skb);
2343 
2344 			cond_resched();
2345 
2346 			skb = next;
2347 		} while (skb != NULL);
2348 
2349 		spin_lock_bh(&sk->sk_lock.slock);
2350 	}
2351 
2352 	/*
2353 	 * Doing the zeroing here guarantees we cannot loop forever
2354 	 * while a wild producer attempts to flood us.
2355 	 */
2356 	sk->sk_backlog.len = 0;
2357 }
2358 
2359 void __sk_flush_backlog(struct sock *sk)
2360 {
2361 	spin_lock_bh(&sk->sk_lock.slock);
2362 	__release_sock(sk);
2363 	spin_unlock_bh(&sk->sk_lock.slock);
2364 }
2365 
2366 /**
2367  * sk_wait_data - wait for data to arrive at sk_receive_queue
2368  * @sk:    sock to wait on
2369  * @timeo: for how long
2370  * @skb:   last skb seen on sk_receive_queue
2371  *
2372  * Socket state, including sk->sk_err, is now changed only under the socket
2373  * lock, hence we may omit checks after joining the wait queue.
2374  * We check the receive queue before schedule() only as an optimization;
2375  * it is very likely that release_sock() added new data.
2376  */
2377 int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
2378 {
2379 	DEFINE_WAIT_FUNC(wait, woken_wake_function);
2380 	int rc;
2381 
2382 	add_wait_queue(sk_sleep(sk), &wait);
2383 	sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2384 	rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
2385 	sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2386 	remove_wait_queue(sk_sleep(sk), &wait);
2387 	return rc;
2388 }
2389 EXPORT_SYMBOL(sk_wait_data);
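/*
 * Illustrative sketch (assumed caller, not part of this file): a blocking
 * recvmsg() implementation, with the socket lock held, waits for new data
 * roughly like this:
 *
 *	timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
 *	while (!(skb = skb_peek(&sk->sk_receive_queue))) {
 *		if (!timeo || signal_pending(current))
 *			break;
 *		sk_wait_data(sk, &timeo, NULL);
 *	}
 */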
2390 
2391 /**
2392  *	__sk_mem_raise_allocated - increase memory_allocated
2393  *	@sk: socket
2394  *	@size: memory size to allocate
2395  *	@amt: pages to allocate
2396  *	@kind: allocation type
2397  *
2398  *	Similar to __sk_mem_schedule(), but does not update sk_forward_alloc
2399  */
2400 int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
2401 {
2402 	struct proto *prot = sk->sk_prot;
2403 	long allocated = sk_memory_allocated_add(sk, amt);
2404 
2405 	if (mem_cgroup_sockets_enabled && sk->sk_memcg &&
2406 	    !mem_cgroup_charge_skmem(sk->sk_memcg, amt))
2407 		goto suppress_allocation;
2408 
2409 	/* Under limit. */
2410 	if (allocated <= sk_prot_mem_limits(sk, 0)) {
2411 		sk_leave_memory_pressure(sk);
2412 		return 1;
2413 	}
2414 
2415 	/* Under pressure. */
2416 	if (allocated > sk_prot_mem_limits(sk, 1))
2417 		sk_enter_memory_pressure(sk);
2418 
2419 	/* Over hard limit. */
2420 	if (allocated > sk_prot_mem_limits(sk, 2))
2421 		goto suppress_allocation;
2422 
2423 	/* guarantee minimum buffer size under pressure */
2424 	if (kind == SK_MEM_RECV) {
2425 		if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot))
2426 			return 1;
2427 
2428 	} else { /* SK_MEM_SEND */
2429 		int wmem0 = sk_get_wmem0(sk, prot);
2430 
2431 		if (sk->sk_type == SOCK_STREAM) {
2432 			if (sk->sk_wmem_queued < wmem0)
2433 				return 1;
2434 		} else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) {
2435 			return 1;
2436 		}
2437 	}
2438 
2439 	if (sk_has_memory_pressure(sk)) {
2440 		int alloc;
2441 
2442 		if (!sk_under_memory_pressure(sk))
2443 			return 1;
2444 		alloc = sk_sockets_allocated_read_positive(sk);
2445 		if (sk_prot_mem_limits(sk, 2) > alloc *
2446 		    sk_mem_pages(sk->sk_wmem_queued +
2447 				 atomic_read(&sk->sk_rmem_alloc) +
2448 				 sk->sk_forward_alloc))
2449 			return 1;
2450 	}
2451 
2452 suppress_allocation:
2453 
2454 	if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
2455 		sk_stream_moderate_sndbuf(sk);
2456 
2457 		/* Fail only if socket is _under_ its sndbuf.
2458 		 * In this case we cannot block, so we have to fail.
2459 		 */
2460 		if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
2461 			return 1;
2462 	}
2463 
2464 	trace_sock_exceed_buf_limit(sk, prot, allocated);
2465 
2466 	sk_memory_allocated_sub(sk, amt);
2467 
2468 	if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2469 		mem_cgroup_uncharge_skmem(sk->sk_memcg, amt);
2470 
2471 	return 0;
2472 }
2473 EXPORT_SYMBOL(__sk_mem_raise_allocated);
2474 
2475 /**
2476  *	__sk_mem_schedule - increase sk_forward_alloc and memory_allocated
2477  *	@sk: socket
2478  *	@size: memory size to allocate
2479  *	@kind: allocation type
2480  *
2481  *	If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
2482  *	rmem allocation. This function assumes that protocols which have
2483  *	memory_pressure use sk_wmem_queued for write buffer accounting.
2484  */
2485 int __sk_mem_schedule(struct sock *sk, int size, int kind)
2486 {
2487 	int ret, amt = sk_mem_pages(size);
2488 
2489 	sk->sk_forward_alloc += amt << SK_MEM_QUANTUM_SHIFT;
2490 	ret = __sk_mem_raise_allocated(sk, size, amt, kind);
2491 	if (!ret)
2492 		sk->sk_forward_alloc -= amt << SK_MEM_QUANTUM_SHIFT;
2493 	return ret;
2494 }
2495 EXPORT_SYMBOL(__sk_mem_schedule);
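/*
 * Illustrative sketch (assumed caller, not part of this file): protocols
 * normally reach __sk_mem_schedule() through the sk_rmem_schedule() /
 * sk_wmem_schedule() wrappers, e.g. when charging a freshly queued skb on
 * the receive side:
 *
 *	if (!sk_rmem_schedule(sk, skb, skb->truesize))
 *		goto drop;
 *	skb_set_owner_r(skb, sk);
 *	__skb_queue_tail(&sk->sk_receive_queue, skb);
 */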
2496 
2497 /**
2498  *	__sk_mem_reduce_allocated - reclaim memory_allocated
2499  *	@sk: socket
2500  *	@amount: number of quanta
2501  *
2502  *	Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
2503  */
2504 void __sk_mem_reduce_allocated(struct sock *sk, int amount)
2505 {
2506 	sk_memory_allocated_sub(sk, amount);
2507 
2508 	if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2509 		mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
2510 
2511 	if (sk_under_memory_pressure(sk) &&
2512 	    (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
2513 		sk_leave_memory_pressure(sk);
2514 }
2515 EXPORT_SYMBOL(__sk_mem_reduce_allocated);
2516 
2517 /**
2518  *	__sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
2519  *	@sk: socket
2520  *	@amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple)
2521  */
2522 void __sk_mem_reclaim(struct sock *sk, int amount)
2523 {
2524 	amount >>= SK_MEM_QUANTUM_SHIFT;
2525 	sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
2526 	__sk_mem_reduce_allocated(sk, amount);
2527 }
2528 EXPORT_SYMBOL(__sk_mem_reclaim);
2529 
2530 int sk_set_peek_off(struct sock *sk, int val)
2531 {
2532 	sk->sk_peek_off = val;
2533 	return 0;
2534 }
2535 EXPORT_SYMBOL_GPL(sk_set_peek_off);
2536 
2537 /*
2538  * Set of default routines for initialising struct proto_ops when
2539  * the protocol does not support a particular function. In certain
2540  * cases where it makes no sense for a protocol to have a "do nothing"
2541  * function, some default processing is provided.
2542  */
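
/*
 * Illustrative sketch (assumed protocol, not part of this file): a datagram
 * protocol that only implements its own send/receive can wire the remaining
 * proto_ops slots to the stubs below:
 *
 *	static const struct proto_ops example_dgram_ops = {
 *		.family		= PF_EXAMPLE,
 *		.owner		= THIS_MODULE,
 *		.release	= example_release,
 *		.sendmsg	= example_sendmsg,
 *		.recvmsg	= example_recvmsg,
 *		.bind		= sock_no_bind,
 *		.connect	= sock_no_connect,
 *		.socketpair	= sock_no_socketpair,
 *		.accept		= sock_no_accept,
 *		.getname	= sock_no_getname,
 *		.poll		= sock_no_poll,
 *		.ioctl		= sock_no_ioctl,
 *		.listen		= sock_no_listen,
 *		.shutdown	= sock_no_shutdown,
 *		.setsockopt	= sock_no_setsockopt,
 *		.getsockopt	= sock_no_getsockopt,
 *		.mmap		= sock_no_mmap,
 *		.sendpage	= sock_no_sendpage,
 *	};
 *
 * PF_EXAMPLE and the example_*() handlers are hypothetical placeholders.
 */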
2543 
2544 int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
2545 {
2546 	return -EOPNOTSUPP;
2547 }
2548 EXPORT_SYMBOL(sock_no_bind);
2549 
2550 int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
2551 		    int len, int flags)
2552 {
2553 	return -EOPNOTSUPP;
2554 }
2555 EXPORT_SYMBOL(sock_no_connect);
2556 
2557 int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
2558 {
2559 	return -EOPNOTSUPP;
2560 }
2561 EXPORT_SYMBOL(sock_no_socketpair);
2562 
2563 int sock_no_accept(struct socket *sock, struct socket *newsock, int flags,
2564 		   bool kern)
2565 {
2566 	return -EOPNOTSUPP;
2567 }
2568 EXPORT_SYMBOL(sock_no_accept);
2569 
2570 int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
2571 		    int peer)
2572 {
2573 	return -EOPNOTSUPP;
2574 }
2575 EXPORT_SYMBOL(sock_no_getname);
2576 
2577 __poll_t sock_no_poll(struct file *file, struct socket *sock, poll_table *pt)
2578 {
2579 	return 0;
2580 }
2581 EXPORT_SYMBOL(sock_no_poll);
2582 
2583 int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2584 {
2585 	return -EOPNOTSUPP;
2586 }
2587 EXPORT_SYMBOL(sock_no_ioctl);
2588 
2589 int sock_no_listen(struct socket *sock, int backlog)
2590 {
2591 	return -EOPNOTSUPP;
2592 }
2593 EXPORT_SYMBOL(sock_no_listen);
2594 
2595 int sock_no_shutdown(struct socket *sock, int how)
2596 {
2597 	return -EOPNOTSUPP;
2598 }
2599 EXPORT_SYMBOL(sock_no_shutdown);
2600 
2601 int sock_no_setsockopt(struct socket *sock, int level, int optname,
2602 		    char __user *optval, unsigned int optlen)
2603 {
2604 	return -EOPNOTSUPP;
2605 }
2606 EXPORT_SYMBOL(sock_no_setsockopt);
2607 
2608 int sock_no_getsockopt(struct socket *sock, int level, int optname,
2609 		    char __user *optval, int __user *optlen)
2610 {
2611 	return -EOPNOTSUPP;
2612 }
2613 EXPORT_SYMBOL(sock_no_getsockopt);
2614 
2615 int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
2616 {
2617 	return -EOPNOTSUPP;
2618 }
2619 EXPORT_SYMBOL(sock_no_sendmsg);
2620 
2621 int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len)
2622 {
2623 	return -EOPNOTSUPP;
2624 }
2625 EXPORT_SYMBOL(sock_no_sendmsg_locked);
2626 
2627 int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
2628 		    int flags)
2629 {
2630 	return -EOPNOTSUPP;
2631 }
2632 EXPORT_SYMBOL(sock_no_recvmsg);
2633 
2634 int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
2635 {
2636 	/* Mirror missing mmap method error code */
2637 	return -ENODEV;
2638 }
2639 EXPORT_SYMBOL(sock_no_mmap);
2640 
2641 ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
2642 {
2643 	ssize_t res;
2644 	struct msghdr msg = {.msg_flags = flags};
2645 	struct kvec iov;
2646 	char *kaddr = kmap(page);
2647 	iov.iov_base = kaddr + offset;
2648 	iov.iov_len = size;
2649 	res = kernel_sendmsg(sock, &msg, &iov, 1, size);
2650 	kunmap(page);
2651 	return res;
2652 }
2653 EXPORT_SYMBOL(sock_no_sendpage);
2654 
2655 ssize_t sock_no_sendpage_locked(struct sock *sk, struct page *page,
2656 				int offset, size_t size, int flags)
2657 {
2658 	ssize_t res;
2659 	struct msghdr msg = {.msg_flags = flags};
2660 	struct kvec iov;
2661 	char *kaddr = kmap(page);
2662 
2663 	iov.iov_base = kaddr + offset;
2664 	iov.iov_len = size;
2665 	res = kernel_sendmsg_locked(sk, &msg, &iov, 1, size);
2666 	kunmap(page);
2667 	return res;
2668 }
2669 EXPORT_SYMBOL(sock_no_sendpage_locked);
2670 
2671 /*
2672  *	Default Socket Callbacks
2673  */
2674 
2675 static void sock_def_wakeup(struct sock *sk)
2676 {
2677 	struct socket_wq *wq;
2678 
2679 	rcu_read_lock();
2680 	wq = rcu_dereference(sk->sk_wq);
2681 	if (skwq_has_sleeper(wq))
2682 		wake_up_interruptible_all(&wq->wait);
2683 	rcu_read_unlock();
2684 }
2685 
2686 static void sock_def_error_report(struct sock *sk)
2687 {
2688 	struct socket_wq *wq;
2689 
2690 	rcu_read_lock();
2691 	wq = rcu_dereference(sk->sk_wq);
2692 	if (skwq_has_sleeper(wq))
2693 		wake_up_interruptible_poll(&wq->wait, EPOLLERR);
2694 	sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
2695 	rcu_read_unlock();
2696 }
2697 
2698 static void sock_def_readable(struct sock *sk)
2699 {
2700 	struct socket_wq *wq;
2701 
2702 	rcu_read_lock();
2703 	wq = rcu_dereference(sk->sk_wq);
2704 	if (skwq_has_sleeper(wq))
2705 		wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
2706 						EPOLLRDNORM | EPOLLRDBAND);
2707 	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
2708 	rcu_read_unlock();
2709 }
2710 
2711 static void sock_def_write_space(struct sock *sk)
2712 {
2713 	struct socket_wq *wq;
2714 
2715 	rcu_read_lock();
2716 
2717 	/* Do not wake up a writer until he can make "significant"
2718 	 * progress.  --DaveM
2719 	 */
2720 	if ((refcount_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
2721 		wq = rcu_dereference(sk->sk_wq);
2722 		if (skwq_has_sleeper(wq))
2723 			wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
2724 						EPOLLWRNORM | EPOLLWRBAND);
2725 
2726 		/* Should agree with poll, otherwise some programs break */
2727 		if (sock_writeable(sk))
2728 			sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
2729 	}
2730 
2731 	rcu_read_unlock();
2732 }
2733 
2734 static void sock_def_destruct(struct sock *sk)
2735 {
2736 }
2737 
2738 void sk_send_sigurg(struct sock *sk)
2739 {
2740 	if (sk->sk_socket && sk->sk_socket->file)
2741 		if (send_sigurg(&sk->sk_socket->file->f_owner))
2742 			sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
2743 }
2744 EXPORT_SYMBOL(sk_send_sigurg);
2745 
2746 void sk_reset_timer(struct sock *sk, struct timer_list* timer,
2747 		    unsigned long expires)
2748 {
2749 	if (!mod_timer(timer, expires))
2750 		sock_hold(sk);
2751 }
2752 EXPORT_SYMBOL(sk_reset_timer);
2753 
2754 void sk_stop_timer(struct sock *sk, struct timer_list* timer)
2755 {
2756 	if (del_timer(timer))
2757 		__sock_put(sk);
2758 }
2759 EXPORT_SYMBOL(sk_stop_timer);
2760 
2761 void sock_init_data(struct socket *sock, struct sock *sk)
2762 {
2763 	sk_init_common(sk);
2764 	sk->sk_send_head	=	NULL;
2765 
2766 	timer_setup(&sk->sk_timer, NULL, 0);
2767 
2768 	sk->sk_allocation	=	GFP_KERNEL;
2769 	sk->sk_rcvbuf		=	sysctl_rmem_default;
2770 	sk->sk_sndbuf		=	sysctl_wmem_default;
2771 	sk->sk_state		=	TCP_CLOSE;
2772 	sk_set_socket(sk, sock);
2773 
2774 	sock_set_flag(sk, SOCK_ZAPPED);
2775 
2776 	if (sock) {
2777 		sk->sk_type	=	sock->type;
2778 		sk->sk_wq	=	sock->wq;
2779 		sock->sk	=	sk;
2780 		sk->sk_uid	=	SOCK_INODE(sock)->i_uid;
2781 	} else {
2782 		sk->sk_wq	=	NULL;
2783 		sk->sk_uid	=	make_kuid(sock_net(sk)->user_ns, 0);
2784 	}
2785 
2786 	rwlock_init(&sk->sk_callback_lock);
2787 	if (sk->sk_kern_sock)
2788 		lockdep_set_class_and_name(
2789 			&sk->sk_callback_lock,
2790 			af_kern_callback_keys + sk->sk_family,
2791 			af_family_kern_clock_key_strings[sk->sk_family]);
2792 	else
2793 		lockdep_set_class_and_name(
2794 			&sk->sk_callback_lock,
2795 			af_callback_keys + sk->sk_family,
2796 			af_family_clock_key_strings[sk->sk_family]);
2797 
2798 	sk->sk_state_change	=	sock_def_wakeup;
2799 	sk->sk_data_ready	=	sock_def_readable;
2800 	sk->sk_write_space	=	sock_def_write_space;
2801 	sk->sk_error_report	=	sock_def_error_report;
2802 	sk->sk_destruct		=	sock_def_destruct;
2803 
2804 	sk->sk_frag.page	=	NULL;
2805 	sk->sk_frag.offset	=	0;
2806 	sk->sk_peek_off		=	-1;
2807 
2808 	sk->sk_peer_pid 	=	NULL;
2809 	sk->sk_peer_cred	=	NULL;
2810 	sk->sk_write_pending	=	0;
2811 	sk->sk_rcvlowat		=	1;
2812 	sk->sk_rcvtimeo		=	MAX_SCHEDULE_TIMEOUT;
2813 	sk->sk_sndtimeo		=	MAX_SCHEDULE_TIMEOUT;
2814 
2815 	sk->sk_stamp = SK_DEFAULT_STAMP;
2816 	atomic_set(&sk->sk_zckey, 0);
2817 
2818 #ifdef CONFIG_NET_RX_BUSY_POLL
2819 	sk->sk_napi_id		=	0;
2820 	sk->sk_ll_usec		=	sysctl_net_busy_read;
2821 #endif
2822 
2823 	sk->sk_max_pacing_rate = ~0U;
2824 	sk->sk_pacing_rate = ~0U;
2825 	sk->sk_pacing_shift = 10;
2826 	sk->sk_incoming_cpu = -1;
2827 	/*
2828 	 * Before updating sk_refcnt, we must commit prior changes to memory
2829 	 * (Documentation/RCU/rculist_nulls.txt for details)
2830 	 */
2831 	smp_wmb();
2832 	refcount_set(&sk->sk_refcnt, 1);
2833 	atomic_set(&sk->sk_drops, 0);
2834 }
2835 EXPORT_SYMBOL(sock_init_data);
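/*
 * Illustrative sketch (assumed caller, not part of this file): an address
 * family's create() hook typically allocates the sock and then initializes
 * it with sock_init_data() before doing protocol-specific setup:
 *
 *	sk = sk_alloc(net, PF_EXAMPLE, GFP_KERNEL, &example_proto, kern);
 *	if (!sk)
 *		return -ENOBUFS;
 *	sock_init_data(sock, sk);
 *	sk->sk_protocol = protocol;
 *
 * PF_EXAMPLE and example_proto are hypothetical placeholders.
 */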
2836 
2837 void lock_sock_nested(struct sock *sk, int subclass)
2838 {
2839 	might_sleep();
2840 	spin_lock_bh(&sk->sk_lock.slock);
2841 	if (sk->sk_lock.owned)
2842 		__lock_sock(sk);
2843 	sk->sk_lock.owned = 1;
2844 	spin_unlock(&sk->sk_lock.slock);
2845 	/*
2846 	 * The sk_lock has mutex_lock() semantics here:
2847 	 */
2848 	mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
2849 	local_bh_enable();
2850 }
2851 EXPORT_SYMBOL(lock_sock_nested);
2852 
2853 void release_sock(struct sock *sk)
2854 {
2855 	spin_lock_bh(&sk->sk_lock.slock);
2856 	if (sk->sk_backlog.tail)
2857 		__release_sock(sk);
2858 
2859 	/* Warning: release_cb() might need to release sk ownership,
2860 	 * i.e. call sock_release_ownership(sk) before us.
2861 	 */
2862 	if (sk->sk_prot->release_cb)
2863 		sk->sk_prot->release_cb(sk);
2864 
2865 	sock_release_ownership(sk);
2866 	if (waitqueue_active(&sk->sk_lock.wq))
2867 		wake_up(&sk->sk_lock.wq);
2868 	spin_unlock_bh(&sk->sk_lock.slock);
2869 }
2870 EXPORT_SYMBOL(release_sock);
2871 
2872 /**
2873  * lock_sock_fast - fast version of lock_sock
2874  * @sk: socket
2875  *
2876  * This version should be used for very small sections, where the process
2877  * won't block. Returns false if the fast path is taken:
2878  *
2879  *   sk_lock.slock locked, owned = 0, BH disabled
2880  *
2881  * Returns true if the slow path is taken:
2882  *
2883  *   sk_lock.slock unlocked, owned = 1, BH enabled
2884  */
2885 bool lock_sock_fast(struct sock *sk)
2886 {
2887 	might_sleep();
2888 	spin_lock_bh(&sk->sk_lock.slock);
2889 
2890 	if (!sk->sk_lock.owned)
2891 		/*
2892 		 * Fast path: return with sk_lock.slock held and BH disabled.
2893 		 */
2894 		return false;
2895 
2896 	__lock_sock(sk);
2897 	sk->sk_lock.owned = 1;
2898 	spin_unlock(&sk->sk_lock.slock);
2899 	/*
2900 	 * The sk_lock has mutex_lock() semantics here:
2901 	 */
2902 	mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
2903 	local_bh_enable();
2904 	return true;
2905 }
2906 EXPORT_SYMBOL(lock_sock_fast);
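/*
 * Illustrative sketch (assumed caller, not part of this file): callers pair
 * lock_sock_fast() with unlock_sock_fast(), passing back the slow-path
 * indication so the matching unlock variant is used:
 *
 *	bool slow = lock_sock_fast(sk);
 *
 *	... short, non-blocking work on sk ...
 *	unlock_sock_fast(sk, slow);
 */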
2907 
2908 int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
2909 {
2910 	struct timeval tv;
2911 	if (!sock_flag(sk, SOCK_TIMESTAMP))
2912 		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2913 	tv = ktime_to_timeval(sk->sk_stamp);
2914 	if (tv.tv_sec == -1)
2915 		return -ENOENT;
2916 	if (tv.tv_sec == 0) {
2917 		sk->sk_stamp = ktime_get_real();
2918 		tv = ktime_to_timeval(sk->sk_stamp);
2919 	}
2920 	return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
2921 }
2922 EXPORT_SYMBOL(sock_get_timestamp);
2923 
2924 int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
2925 {
2926 	struct timespec ts;
2927 	if (!sock_flag(sk, SOCK_TIMESTAMP))
2928 		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2929 	ts = ktime_to_timespec(sk->sk_stamp);
2930 	if (ts.tv_sec == -1)
2931 		return -ENOENT;
2932 	if (ts.tv_sec == 0) {
2933 		sk->sk_stamp = ktime_get_real();
2934 		ts = ktime_to_timespec(sk->sk_stamp);
2935 	}
2936 	return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
2937 }
2938 EXPORT_SYMBOL(sock_get_timestampns);
2939 
2940 void sock_enable_timestamp(struct sock *sk, int flag)
2941 {
2942 	if (!sock_flag(sk, flag)) {
2943 		unsigned long previous_flags = sk->sk_flags;
2944 
2945 		sock_set_flag(sk, flag);
2946 		/*
2947 		 * we just set one of the two flags which require net
2948 		 * time stamping, but time stamping might have been on
2949 		 * already because of the other one
2950 		 */
2951 		if (sock_needs_netstamp(sk) &&
2952 		    !(previous_flags & SK_FLAGS_TIMESTAMP))
2953 			net_enable_timestamp();
2954 	}
2955 }
2956 
2957 int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
2958 		       int level, int type)
2959 {
2960 	struct sock_exterr_skb *serr;
2961 	struct sk_buff *skb;
2962 	int copied, err;
2963 
2964 	err = -EAGAIN;
2965 	skb = sock_dequeue_err_skb(sk);
2966 	if (skb == NULL)
2967 		goto out;
2968 
2969 	copied = skb->len;
2970 	if (copied > len) {
2971 		msg->msg_flags |= MSG_TRUNC;
2972 		copied = len;
2973 	}
2974 	err = skb_copy_datagram_msg(skb, 0, msg, copied);
2975 	if (err)
2976 		goto out_free_skb;
2977 
2978 	sock_recv_timestamp(msg, sk, skb);
2979 
2980 	serr = SKB_EXT_ERR(skb);
2981 	put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
2982 
2983 	msg->msg_flags |= MSG_ERRQUEUE;
2984 	err = copied;
2985 
2986 out_free_skb:
2987 	kfree_skb(skb);
2988 out:
2989 	return err;
2990 }
2991 EXPORT_SYMBOL(sock_recv_errqueue);
2992 
2993 /*
2994  *	Get a socket option on a socket.
2995  *
2996  *	FIX: POSIX 1003.1g is very ambiguous here. It states that
2997  *	asynchronous errors should be reported by getsockopt. We assume
2998  *	this means if you specify SO_ERROR (otherwise what's the point of it).
2999  */
3000 int sock_common_getsockopt(struct socket *sock, int level, int optname,
3001 			   char __user *optval, int __user *optlen)
3002 {
3003 	struct sock *sk = sock->sk;
3004 
3005 	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
3006 }
3007 EXPORT_SYMBOL(sock_common_getsockopt);
3008 
3009 #ifdef CONFIG_COMPAT
3010 int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
3011 				  char __user *optval, int __user *optlen)
3012 {
3013 	struct sock *sk = sock->sk;
3014 
3015 	if (sk->sk_prot->compat_getsockopt != NULL)
3016 		return sk->sk_prot->compat_getsockopt(sk, level, optname,
3017 						      optval, optlen);
3018 	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
3019 }
3020 EXPORT_SYMBOL(compat_sock_common_getsockopt);
3021 #endif
3022 
3023 int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
3024 			int flags)
3025 {
3026 	struct sock *sk = sock->sk;
3027 	int addr_len = 0;
3028 	int err;
3029 
3030 	err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
3031 				   flags & ~MSG_DONTWAIT, &addr_len);
3032 	if (err >= 0)
3033 		msg->msg_namelen = addr_len;
3034 	return err;
3035 }
3036 EXPORT_SYMBOL(sock_common_recvmsg);
3037 
3038 /*
3039  *	Set socket options on a socket.
3040  */
3041 int sock_common_setsockopt(struct socket *sock, int level, int optname,
3042 			   char __user *optval, unsigned int optlen)
3043 {
3044 	struct sock *sk = sock->sk;
3045 
3046 	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
3047 }
3048 EXPORT_SYMBOL(sock_common_setsockopt);
3049 
3050 #ifdef CONFIG_COMPAT
3051 int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
3052 				  char __user *optval, unsigned int optlen)
3053 {
3054 	struct sock *sk = sock->sk;
3055 
3056 	if (sk->sk_prot->compat_setsockopt != NULL)
3057 		return sk->sk_prot->compat_setsockopt(sk, level, optname,
3058 						      optval, optlen);
3059 	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
3060 }
3061 EXPORT_SYMBOL(compat_sock_common_setsockopt);
3062 #endif
3063 
3064 void sk_common_release(struct sock *sk)
3065 {
3066 	if (sk->sk_prot->destroy)
3067 		sk->sk_prot->destroy(sk);
3068 
3069 	/*
3070 	 * Observation: when sk_common_release() is called, processes have
3071 	 * no access to the socket, but the network stack still does.
3072 	 * Step one, detach it from networking:
3073 	 *
3074 	 * A. Remove from hash tables.
3075 	 */
3076 
3077 	sk->sk_prot->unhash(sk);
3078 
3079 	/*
3080 	 * At this point the socket cannot receive new packets, but it is possible
3081 	 * that some packets are in flight because some CPU runs the receiver and
3082 	 * did the hash table lookup before we unhashed the socket. They will reach
3083 	 * the receive queue and will be purged by the socket destructor.
3084 	 *
3085 	 * Also we still have packets pending on the receive queue and probably
3086 	 * our own packets waiting in device queues. sock_destroy will drain the
3087 	 * receive queue, but transmitted packets will delay socket destruction
3088 	 * until the last reference is released.
3089 	 */
3090 
3091 	sock_orphan(sk);
3092 
3093 	xfrm_sk_free_policy(sk);
3094 
3095 	sk_refcnt_debug_release(sk);
3096 
3097 	sock_put(sk);
3098 }
3099 EXPORT_SYMBOL(sk_common_release);
3100 
3101 void sk_get_meminfo(const struct sock *sk, u32 *mem)
3102 {
3103 	memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);
3104 
3105 	mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
3106 	mem[SK_MEMINFO_RCVBUF] = sk->sk_rcvbuf;
3107 	mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
3108 	mem[SK_MEMINFO_SNDBUF] = sk->sk_sndbuf;
3109 	mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc;
3110 	mem[SK_MEMINFO_WMEM_QUEUED] = sk->sk_wmem_queued;
3111 	mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
3112 	mem[SK_MEMINFO_BACKLOG] = sk->sk_backlog.len;
3113 	mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
3114 }
3115 
3116 #ifdef CONFIG_PROC_FS
3117 #define PROTO_INUSE_NR	64	/* should be enough for the first time */
3118 struct prot_inuse {
3119 	int val[PROTO_INUSE_NR];
3120 };
3121 
3122 static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
3123 
3124 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
3125 {
3126 	__this_cpu_add(net->core.prot_inuse->val[prot->inuse_idx], val);
3127 }
3128 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
3129 
3130 int sock_prot_inuse_get(struct net *net, struct proto *prot)
3131 {
3132 	int cpu, idx = prot->inuse_idx;
3133 	int res = 0;
3134 
3135 	for_each_possible_cpu(cpu)
3136 		res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx];
3137 
3138 	return res >= 0 ? res : 0;
3139 }
3140 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
3141 
3142 static void sock_inuse_add(struct net *net, int val)
3143 {
3144 	this_cpu_add(*net->core.sock_inuse, val);
3145 }
3146 
3147 int sock_inuse_get(struct net *net)
3148 {
3149 	int cpu, res = 0;
3150 
3151 	for_each_possible_cpu(cpu)
3152 		res += *per_cpu_ptr(net->core.sock_inuse, cpu);
3153 
3154 	return res;
3155 }
3157 EXPORT_SYMBOL_GPL(sock_inuse_get);
3158 
3159 static int __net_init sock_inuse_init_net(struct net *net)
3160 {
3161 	net->core.prot_inuse = alloc_percpu(struct prot_inuse);
3162 	if (net->core.prot_inuse == NULL)
3163 		return -ENOMEM;
3164 
3165 	net->core.sock_inuse = alloc_percpu(int);
3166 	if (net->core.sock_inuse == NULL)
3167 		goto out;
3168 
3169 	return 0;
3170 
3171 out:
3172 	free_percpu(net->core.prot_inuse);
3173 	return -ENOMEM;
3174 }
3175 
3176 static void __net_exit sock_inuse_exit_net(struct net *net)
3177 {
3178 	free_percpu(net->core.prot_inuse);
3179 	free_percpu(net->core.sock_inuse);
3180 }
3181 
3182 static struct pernet_operations net_inuse_ops = {
3183 	.init = sock_inuse_init_net,
3184 	.exit = sock_inuse_exit_net,
3185 };
3186 
3187 static __init int net_inuse_init(void)
3188 {
3189 	if (register_pernet_subsys(&net_inuse_ops))
3190 		panic("Cannot initialize net inuse counters");
3191 
3192 	return 0;
3193 }
3194 
3195 core_initcall(net_inuse_init);
3196 
3197 static void assign_proto_idx(struct proto *prot)
3198 {
3199 	prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
3200 
3201 	if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
3202 		pr_err("PROTO_INUSE_NR exhausted\n");
3203 		return;
3204 	}
3205 
3206 	set_bit(prot->inuse_idx, proto_inuse_idx);
3207 }
3208 
3209 static void release_proto_idx(struct proto *prot)
3210 {
3211 	if (prot->inuse_idx != PROTO_INUSE_NR - 1)
3212 		clear_bit(prot->inuse_idx, proto_inuse_idx);
3213 }
3214 #else
3215 static inline void assign_proto_idx(struct proto *prot)
3216 {
3217 }
3218 
3219 static inline void release_proto_idx(struct proto *prot)
3220 {
3221 }
3222 
3223 static void sock_inuse_add(struct net *net, int val)
3224 {
3225 }
3226 #endif
3227 
3228 static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
3229 {
3230 	if (!rsk_prot)
3231 		return;
3232 	kfree(rsk_prot->slab_name);
3233 	rsk_prot->slab_name = NULL;
3234 	kmem_cache_destroy(rsk_prot->slab);
3235 	rsk_prot->slab = NULL;
3236 }
3237 
3238 static int req_prot_init(const struct proto *prot)
3239 {
3240 	struct request_sock_ops *rsk_prot = prot->rsk_prot;
3241 
3242 	if (!rsk_prot)
3243 		return 0;
3244 
3245 	rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
3246 					prot->name);
3247 	if (!rsk_prot->slab_name)
3248 		return -ENOMEM;
3249 
3250 	rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
3251 					   rsk_prot->obj_size, 0,
3252 					   prot->slab_flags, NULL);
3253 
3254 	if (!rsk_prot->slab) {
3255 		pr_crit("%s: Can't create request sock SLAB cache!\n",
3256 			prot->name);
3257 		return -ENOMEM;
3258 	}
3259 	return 0;
3260 }
3261 
3262 int proto_register(struct proto *prot, int alloc_slab)
3263 {
3264 	if (alloc_slab) {
3265 		prot->slab = kmem_cache_create_usercopy(prot->name,
3266 					prot->obj_size, 0,
3267 					SLAB_HWCACHE_ALIGN | prot->slab_flags,
3268 					prot->useroffset, prot->usersize,
3269 					NULL);
3270 
3271 		if (prot->slab == NULL) {
3272 			pr_crit("%s: Can't create sock SLAB cache!\n",
3273 				prot->name);
3274 			goto out;
3275 		}
3276 
3277 		if (req_prot_init(prot))
3278 			goto out_free_request_sock_slab;
3279 
3280 		if (prot->twsk_prot != NULL) {
3281 			prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);
3282 
3283 			if (prot->twsk_prot->twsk_slab_name == NULL)
3284 				goto out_free_request_sock_slab;
3285 
3286 			prot->twsk_prot->twsk_slab =
3287 				kmem_cache_create(prot->twsk_prot->twsk_slab_name,
3288 						  prot->twsk_prot->twsk_obj_size,
3289 						  0,
3290 						  prot->slab_flags,
3291 						  NULL);
3292 			if (prot->twsk_prot->twsk_slab == NULL)
3293 				goto out_free_timewait_sock_slab_name;
3294 		}
3295 	}
3296 
3297 	mutex_lock(&proto_list_mutex);
3298 	list_add(&prot->node, &proto_list);
3299 	assign_proto_idx(prot);
3300 	mutex_unlock(&proto_list_mutex);
3301 	return 0;
3302 
3303 out_free_timewait_sock_slab_name:
3304 	kfree(prot->twsk_prot->twsk_slab_name);
3305 out_free_request_sock_slab:
3306 	req_prot_cleanup(prot->rsk_prot);
3307 
3308 	kmem_cache_destroy(prot->slab);
3309 	prot->slab = NULL;
3310 out:
3311 	return -ENOBUFS;
3312 }
3313 EXPORT_SYMBOL(proto_register);
3314 
3315 void proto_unregister(struct proto *prot)
3316 {
3317 	mutex_lock(&proto_list_mutex);
3318 	release_proto_idx(prot);
3319 	list_del(&prot->node);
3320 	mutex_unlock(&proto_list_mutex);
3321 
3322 	kmem_cache_destroy(prot->slab);
3323 	prot->slab = NULL;
3324 
3325 	req_prot_cleanup(prot->rsk_prot);
3326 
3327 	if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
3328 		kmem_cache_destroy(prot->twsk_prot->twsk_slab);
3329 		kfree(prot->twsk_prot->twsk_slab_name);
3330 		prot->twsk_prot->twsk_slab = NULL;
3331 	}
3332 }
3333 EXPORT_SYMBOL(proto_unregister);
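/*
 * Illustrative sketch (assumed module, not part of this file): a protocol
 * module registers its struct proto on load and unregisters it on exit:
 *
 *	static struct proto example_proto = {
 *		.name	  = "EXAMPLE",
 *		.owner	  = THIS_MODULE,
 *		.obj_size = sizeof(struct example_sock),
 *	};
 *
 *	err = proto_register(&example_proto, 1);
 *	if (err)
 *		return err;
 *	...
 *	proto_unregister(&example_proto);
 *
 * example_proto and struct example_sock are hypothetical placeholders.
 */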
3334 
3335 int sock_load_diag_module(int family, int protocol)
3336 {
3337 	if (!protocol) {
3338 		if (!sock_is_registered(family))
3339 			return -ENOENT;
3340 
3341 		return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
3342 				      NETLINK_SOCK_DIAG, family);
3343 	}
3344 
3345 #ifdef CONFIG_INET
3346 	if (family == AF_INET &&
3347 	    !rcu_access_pointer(inet_protos[protocol]))
3348 		return -ENOENT;
3349 #endif
3350 
3351 	return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK,
3352 			      NETLINK_SOCK_DIAG, family, protocol);
3353 }
3354 EXPORT_SYMBOL(sock_load_diag_module);
3355 
3356 #ifdef CONFIG_PROC_FS
3357 static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
3358 	__acquires(proto_list_mutex)
3359 {
3360 	mutex_lock(&proto_list_mutex);
3361 	return seq_list_start_head(&proto_list, *pos);
3362 }
3363 
3364 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3365 {
3366 	return seq_list_next(v, &proto_list, pos);
3367 }
3368 
3369 static void proto_seq_stop(struct seq_file *seq, void *v)
3370 	__releases(proto_list_mutex)
3371 {
3372 	mutex_unlock(&proto_list_mutex);
3373 }
3374 
3375 static char proto_method_implemented(const void *method)
3376 {
3377 	return method == NULL ? 'n' : 'y';
3378 }
3379 static long sock_prot_memory_allocated(struct proto *proto)
3380 {
3381 	return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
3382 }
3383 
3384 static char *sock_prot_memory_pressure(struct proto *proto)
3385 {
3386 	return proto->memory_pressure != NULL ?
3387 	proto_memory_pressure(proto) ? "yes" : "no" : "NI";
3388 }
3389 
3390 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
3391 {
3392 
3393 	seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
3394 			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
3395 		   proto->name,
3396 		   proto->obj_size,
3397 		   sock_prot_inuse_get(seq_file_net(seq), proto),
3398 		   sock_prot_memory_allocated(proto),
3399 		   sock_prot_memory_pressure(proto),
3400 		   proto->max_header,
3401 		   proto->slab == NULL ? "no" : "yes",
3402 		   module_name(proto->owner),
3403 		   proto_method_implemented(proto->close),
3404 		   proto_method_implemented(proto->connect),
3405 		   proto_method_implemented(proto->disconnect),
3406 		   proto_method_implemented(proto->accept),
3407 		   proto_method_implemented(proto->ioctl),
3408 		   proto_method_implemented(proto->init),
3409 		   proto_method_implemented(proto->destroy),
3410 		   proto_method_implemented(proto->shutdown),
3411 		   proto_method_implemented(proto->setsockopt),
3412 		   proto_method_implemented(proto->getsockopt),
3413 		   proto_method_implemented(proto->sendmsg),
3414 		   proto_method_implemented(proto->recvmsg),
3415 		   proto_method_implemented(proto->sendpage),
3416 		   proto_method_implemented(proto->bind),
3417 		   proto_method_implemented(proto->backlog_rcv),
3418 		   proto_method_implemented(proto->hash),
3419 		   proto_method_implemented(proto->unhash),
3420 		   proto_method_implemented(proto->get_port),
3421 		   proto_method_implemented(proto->enter_memory_pressure));
3422 }
3423 
3424 static int proto_seq_show(struct seq_file *seq, void *v)
3425 {
3426 	if (v == &proto_list)
3427 		seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
3428 			   "protocol",
3429 			   "size",
3430 			   "sockets",
3431 			   "memory",
3432 			   "press",
3433 			   "maxhdr",
3434 			   "slab",
3435 			   "module",
3436 			   "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
3437 	else
3438 		proto_seq_printf(seq, list_entry(v, struct proto, node));
3439 	return 0;
3440 }
3441 
3442 static const struct seq_operations proto_seq_ops = {
3443 	.start  = proto_seq_start,
3444 	.next   = proto_seq_next,
3445 	.stop   = proto_seq_stop,
3446 	.show   = proto_seq_show,
3447 };
3448 
3449 static int proto_seq_open(struct inode *inode, struct file *file)
3450 {
3451 	return seq_open_net(inode, file, &proto_seq_ops,
3452 			    sizeof(struct seq_net_private));
3453 }
3454 
3455 static const struct file_operations proto_seq_fops = {
3456 	.open		= proto_seq_open,
3457 	.read		= seq_read,
3458 	.llseek		= seq_lseek,
3459 	.release	= seq_release_net,
3460 };
3461 
3462 static __net_init int proto_init_net(struct net *net)
3463 {
3464 	if (!proc_create("protocols", 0444, net->proc_net, &proto_seq_fops))
3465 		return -ENOMEM;
3466 
3467 	return 0;
3468 }
3469 
3470 static __net_exit void proto_exit_net(struct net *net)
3471 {
3472 	remove_proc_entry("protocols", net->proc_net);
3473 }
3474 
3475 
3476 static __net_initdata struct pernet_operations proto_net_ops = {
3477 	.init = proto_init_net,
3478 	.exit = proto_exit_net,
3479 };
3480 
3481 static int __init proto_init(void)
3482 {
3483 	return register_pernet_subsys(&proto_net_ops);
3484 }
3485 
3486 subsys_initcall(proto_init);
3487 
3488 #endif /* PROC_FS */
3489 
3490 #ifdef CONFIG_NET_RX_BUSY_POLL
3491 bool sk_busy_loop_end(void *p, unsigned long start_time)
3492 {
3493 	struct sock *sk = p;
3494 
3495 	return !skb_queue_empty(&sk->sk_receive_queue) ||
3496 	       sk_busy_loop_timeout(sk, start_time);
3497 }
3498 EXPORT_SYMBOL(sk_busy_loop_end);
3499 #endif /* CONFIG_NET_RX_BUSY_POLL */
3500