xref: /openbmc/linux/net/core/sock.c (revision 2eb0f624b709e78ec8e2f4c3412947703db99301)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		Generic socket support routines. Memory allocators, socket lock/release
7  *		handler for protocols to use and generic option handler.
8  *
9  *
10  * Authors:	Ross Biro
11  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *		Florian La Roche, <flla@stud.uni-sb.de>
13  *		Alan Cox, <A.Cox@swansea.ac.uk>
14  *
15  * Fixes:
16  *		Alan Cox	: 	Numerous verify_area() problems
17  *		Alan Cox	:	Connecting on a connecting socket
18  *					now returns an error for tcp.
19  *		Alan Cox	:	sock->protocol is set correctly.
20  *					and is not sometimes left as 0.
21  *		Alan Cox	:	connect handles icmp errors on a
22  *					connect properly. Unfortunately there
23  *					is a restart syscall nasty there. I
24  *					can't match BSD without hacking the C
25  *					library. Ideas urgently sought!
26  *		Alan Cox	:	Disallow bind() to addresses that are
27  *					not ours - especially broadcast ones!!
28  *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
29  *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
30  *					instead they leave that for the DESTROY timer.
31  *		Alan Cox	:	Clean up error flag in accept
32  *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
33  *					was buggy. Put a remove_sock() in the handler
34  *					for memory when we hit 0. Also altered the timer
35  *					code. The ACK stuff can wait and needs major
36  *					TCP layer surgery.
37  *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
38  *					and fixed timer/inet_bh race.
39  *		Alan Cox	:	Added zapped flag for TCP
40  *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
41  *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
42  *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
43  *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
44  *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
45  *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
46  *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
47  *	Pauline Middelink	:	identd support
48  *		Alan Cox	:	Fixed connect() taking signals I think.
49  *		Alan Cox	:	SO_LINGER supported
50  *		Alan Cox	:	Error reporting fixes
51  *		Anonymous	:	inet_create tidied up (sk->reuse setting)
52  *		Alan Cox	:	inet sockets don't set sk->type!
53  *		Alan Cox	:	Split socket option code
54  *		Alan Cox	:	Callbacks
55  *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
56  *		Alex		:	Removed restriction on inet fioctl
57  *		Alan Cox	:	Splitting INET from NET core
58  *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
59  *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
60  *		Alan Cox	:	Split IP from generic code
61  *		Alan Cox	:	New kfree_skbmem()
62  *		Alan Cox	:	Make SO_DEBUG superuser only.
63  *		Alan Cox	:	Allow anyone to clear SO_DEBUG
64  *					(compatibility fix)
65  *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
66  *		Alan Cox	:	Allocator for a socket is settable.
67  *		Alan Cox	:	SO_ERROR includes soft errors.
68  *		Alan Cox	:	Allow NULL arguments on some SO_ opts
69  *		Alan Cox	: 	Generic socket allocation to make hooks
70  *					easier (suggested by Craig Metz).
71  *		Michael Pall	:	SO_ERROR returns positive errno again
72  *              Steve Whitehouse:       Added default destructor to free
73  *                                      protocol private data.
74  *              Steve Whitehouse:       Added various other default routines
75  *                                      common to several socket families.
76  *              Chris Evans     :       Call suser() check last on F_SETOWN
77  *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
78  *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
79  *		Andi Kleen	:	Fix write_space callback
80  *		Chris Evans	:	Security fixes - signedness again
81  *		Arnaldo C. Melo :       cleanups, use skb_queue_purge
82  *
83  * To Fix:
84  *
85  *
86  *		This program is free software; you can redistribute it and/or
87  *		modify it under the terms of the GNU General Public License
88  *		as published by the Free Software Foundation; either version
89  *		2 of the License, or (at your option) any later version.
90  */
91 
92 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
93 
94 #include <linux/capability.h>
95 #include <linux/errno.h>
96 #include <linux/errqueue.h>
97 #include <linux/types.h>
98 #include <linux/socket.h>
99 #include <linux/in.h>
100 #include <linux/kernel.h>
101 #include <linux/module.h>
102 #include <linux/proc_fs.h>
103 #include <linux/seq_file.h>
104 #include <linux/sched.h>
105 #include <linux/sched/mm.h>
106 #include <linux/timer.h>
107 #include <linux/string.h>
108 #include <linux/sockios.h>
109 #include <linux/net.h>
110 #include <linux/mm.h>
111 #include <linux/slab.h>
112 #include <linux/interrupt.h>
113 #include <linux/poll.h>
114 #include <linux/tcp.h>
115 #include <linux/init.h>
116 #include <linux/highmem.h>
117 #include <linux/user_namespace.h>
118 #include <linux/static_key.h>
119 #include <linux/memcontrol.h>
120 #include <linux/prefetch.h>
121 
122 #include <linux/uaccess.h>
123 
124 #include <linux/netdevice.h>
125 #include <net/protocol.h>
126 #include <linux/skbuff.h>
127 #include <net/net_namespace.h>
128 #include <net/request_sock.h>
129 #include <net/sock.h>
130 #include <linux/net_tstamp.h>
131 #include <net/xfrm.h>
132 #include <linux/ipsec.h>
133 #include <net/cls_cgroup.h>
134 #include <net/netprio_cgroup.h>
135 #include <linux/sock_diag.h>
136 
137 #include <linux/filter.h>
138 #include <net/sock_reuseport.h>
139 
140 #include <trace/events/sock.h>
141 
142 #include <net/tcp.h>
143 #include <net/busy_poll.h>
144 
145 static DEFINE_MUTEX(proto_list_mutex);
146 static LIST_HEAD(proto_list);
147 
148 static void sock_inuse_add(struct net *net, int val);
149 
150 /**
151  * sk_ns_capable - General socket capability test
152  * @sk: Socket to use a capability on or through
153  * @user_ns: The user namespace of the capability to use
154  * @cap: The capability to use
155  *
156  * Test to see if the opener of the socket had the capability @cap when
157  * the socket was created and the current process has the capability
158  * @cap in the user namespace @user_ns.
159  */
160 bool sk_ns_capable(const struct sock *sk,
161 		   struct user_namespace *user_ns, int cap)
162 {
163 	return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
164 		ns_capable(user_ns, cap);
165 }
166 EXPORT_SYMBOL(sk_ns_capable);
167 
168 /**
169  * sk_capable - Socket global capability test
170  * @sk: Socket to use a capability on or through
171  * @cap: The global capability to use
172  *
173  * Test to see if the opener of the socket had the capability @cap when
174  * the socket was created and the current process has the capability
175  * @cap in all user namespaces.
176  */
177 bool sk_capable(const struct sock *sk, int cap)
178 {
179 	return sk_ns_capable(sk, &init_user_ns, cap);
180 }
181 EXPORT_SYMBOL(sk_capable);
182 
183 /**
184  * sk_net_capable - Network namespace socket capability test
185  * @sk: Socket to use a capability on or through
186  * @cap: The capability to use
187  *
188  * Test to see if the opener of the socket had the capability @cap when the
189  * socket was created and the current process has the capability @cap over
190  * the network namespace the socket is a member of.
191  */
192 bool sk_net_capable(const struct sock *sk, int cap)
193 {
194 	return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
195 }
196 EXPORT_SYMBOL(sk_net_capable);
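
/*
 * Example (illustrative sketch, not part of this file): a protocol that wants
 * to gate a privileged operation on the credentials of whoever opened the
 * socket, checked against the socket's own network namespace, could use
 * sk_net_capable() like this ("my_proto_do_priv" is a hypothetical name):
 *
 *	static int my_proto_do_priv(struct sock *sk)
 *	{
 *		if (!sk_net_capable(sk, CAP_NET_ADMIN))
 *			return -EPERM;
 *		return 0;
 *	}
 */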
197 
198 /*
199  * Each address family might have different locking rules, so we have
200  * one slock key per address family and separate keys for internal and
201  * userspace sockets.
202  */
203 static struct lock_class_key af_family_keys[AF_MAX];
204 static struct lock_class_key af_family_kern_keys[AF_MAX];
205 static struct lock_class_key af_family_slock_keys[AF_MAX];
206 static struct lock_class_key af_family_kern_slock_keys[AF_MAX];
207 
208 /*
209  * Make lock validator output more readable. (we pre-construct these
210  * strings at build time, so that runtime initialization of socket
211  * locks is fast):
212  */
213 
214 #define _sock_locks(x)						  \
215   x "AF_UNSPEC",	x "AF_UNIX"     ,	x "AF_INET"     , \
216   x "AF_AX25"  ,	x "AF_IPX"      ,	x "AF_APPLETALK", \
217   x "AF_NETROM",	x "AF_BRIDGE"   ,	x "AF_ATMPVC"   , \
218   x "AF_X25"   ,	x "AF_INET6"    ,	x "AF_ROSE"     , \
219   x "AF_DECnet",	x "AF_NETBEUI"  ,	x "AF_SECURITY" , \
220   x "AF_KEY"   ,	x "AF_NETLINK"  ,	x "AF_PACKET"   , \
221   x "AF_ASH"   ,	x "AF_ECONET"   ,	x "AF_ATMSVC"   , \
222   x "AF_RDS"   ,	x "AF_SNA"      ,	x "AF_IRDA"     , \
223   x "AF_PPPOX" ,	x "AF_WANPIPE"  ,	x "AF_LLC"      , \
224   x "27"       ,	x "28"          ,	x "AF_CAN"      , \
225   x "AF_TIPC"  ,	x "AF_BLUETOOTH",	x "IUCV"        , \
226   x "AF_RXRPC" ,	x "AF_ISDN"     ,	x "AF_PHONET"   , \
227   x "AF_IEEE802154",	x "AF_CAIF"	,	x "AF_ALG"      , \
228   x "AF_NFC"   ,	x "AF_VSOCK"    ,	x "AF_KCM"      , \
229   x "AF_QIPCRTR",	x "AF_SMC"	,	x "AF_MAX"
230 
231 static const char *const af_family_key_strings[AF_MAX+1] = {
232 	_sock_locks("sk_lock-")
233 };
234 static const char *const af_family_slock_key_strings[AF_MAX+1] = {
235 	_sock_locks("slock-")
236 };
237 static const char *const af_family_clock_key_strings[AF_MAX+1] = {
238 	_sock_locks("clock-")
239 };
240 
241 static const char *const af_family_kern_key_strings[AF_MAX+1] = {
242 	_sock_locks("k-sk_lock-")
243 };
244 static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
245 	_sock_locks("k-slock-")
246 };
247 static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
248 	_sock_locks("k-clock-")
249 };
250 static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
251   "rlock-AF_UNSPEC", "rlock-AF_UNIX"     , "rlock-AF_INET"     ,
252   "rlock-AF_AX25"  , "rlock-AF_IPX"      , "rlock-AF_APPLETALK",
253   "rlock-AF_NETROM", "rlock-AF_BRIDGE"   , "rlock-AF_ATMPVC"   ,
254   "rlock-AF_X25"   , "rlock-AF_INET6"    , "rlock-AF_ROSE"     ,
255   "rlock-AF_DECnet", "rlock-AF_NETBEUI"  , "rlock-AF_SECURITY" ,
256   "rlock-AF_KEY"   , "rlock-AF_NETLINK"  , "rlock-AF_PACKET"   ,
257   "rlock-AF_ASH"   , "rlock-AF_ECONET"   , "rlock-AF_ATMSVC"   ,
258   "rlock-AF_RDS"   , "rlock-AF_SNA"      , "rlock-AF_IRDA"     ,
259   "rlock-AF_PPPOX" , "rlock-AF_WANPIPE"  , "rlock-AF_LLC"      ,
260   "rlock-27"       , "rlock-28"          , "rlock-AF_CAN"      ,
261   "rlock-AF_TIPC"  , "rlock-AF_BLUETOOTH", "rlock-AF_IUCV"     ,
262   "rlock-AF_RXRPC" , "rlock-AF_ISDN"     , "rlock-AF_PHONET"   ,
263   "rlock-AF_IEEE802154", "rlock-AF_CAIF" , "rlock-AF_ALG"      ,
264   "rlock-AF_NFC"   , "rlock-AF_VSOCK"    , "rlock-AF_KCM"      ,
265   "rlock-AF_QIPCRTR", "rlock-AF_SMC"     , "rlock-AF_MAX"
266 };
267 static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
268   "wlock-AF_UNSPEC", "wlock-AF_UNIX"     , "wlock-AF_INET"     ,
269   "wlock-AF_AX25"  , "wlock-AF_IPX"      , "wlock-AF_APPLETALK",
270   "wlock-AF_NETROM", "wlock-AF_BRIDGE"   , "wlock-AF_ATMPVC"   ,
271   "wlock-AF_X25"   , "wlock-AF_INET6"    , "wlock-AF_ROSE"     ,
272   "wlock-AF_DECnet", "wlock-AF_NETBEUI"  , "wlock-AF_SECURITY" ,
273   "wlock-AF_KEY"   , "wlock-AF_NETLINK"  , "wlock-AF_PACKET"   ,
274   "wlock-AF_ASH"   , "wlock-AF_ECONET"   , "wlock-AF_ATMSVC"   ,
275   "wlock-AF_RDS"   , "wlock-AF_SNA"      , "wlock-AF_IRDA"     ,
276   "wlock-AF_PPPOX" , "wlock-AF_WANPIPE"  , "wlock-AF_LLC"      ,
277   "wlock-27"       , "wlock-28"          , "wlock-AF_CAN"      ,
278   "wlock-AF_TIPC"  , "wlock-AF_BLUETOOTH", "wlock-AF_IUCV"     ,
279   "wlock-AF_RXRPC" , "wlock-AF_ISDN"     , "wlock-AF_PHONET"   ,
280   "wlock-AF_IEEE802154", "wlock-AF_CAIF" , "wlock-AF_ALG"      ,
281   "wlock-AF_NFC"   , "wlock-AF_VSOCK"    , "wlock-AF_KCM"      ,
282   "wlock-AF_QIPCRTR", "wlock-AF_SMC"     , "wlock-AF_MAX"
283 };
284 static const char *const af_family_elock_key_strings[AF_MAX+1] = {
285   "elock-AF_UNSPEC", "elock-AF_UNIX"     , "elock-AF_INET"     ,
286   "elock-AF_AX25"  , "elock-AF_IPX"      , "elock-AF_APPLETALK",
287   "elock-AF_NETROM", "elock-AF_BRIDGE"   , "elock-AF_ATMPVC"   ,
288   "elock-AF_X25"   , "elock-AF_INET6"    , "elock-AF_ROSE"     ,
289   "elock-AF_DECnet", "elock-AF_NETBEUI"  , "elock-AF_SECURITY" ,
290   "elock-AF_KEY"   , "elock-AF_NETLINK"  , "elock-AF_PACKET"   ,
291   "elock-AF_ASH"   , "elock-AF_ECONET"   , "elock-AF_ATMSVC"   ,
292   "elock-AF_RDS"   , "elock-AF_SNA"      , "elock-AF_IRDA"     ,
293   "elock-AF_PPPOX" , "elock-AF_WANPIPE"  , "elock-AF_LLC"      ,
294   "elock-27"       , "elock-28"          , "elock-AF_CAN"      ,
295   "elock-AF_TIPC"  , "elock-AF_BLUETOOTH", "elock-AF_IUCV"     ,
296   "elock-AF_RXRPC" , "elock-AF_ISDN"     , "elock-AF_PHONET"   ,
297   "elock-AF_IEEE802154", "elock-AF_CAIF" , "elock-AF_ALG"      ,
298   "elock-AF_NFC"   , "elock-AF_VSOCK"    , "elock-AF_KCM"      ,
299   "elock-AF_QIPCRTR", "elock-AF_SMC"     , "elock-AF_MAX"
300 };
301 
302 /*
303  * sk_callback_lock and sk queues locking rules are per-address-family,
304  * so split the lock classes by using a per-AF key:
305  */
306 static struct lock_class_key af_callback_keys[AF_MAX];
307 static struct lock_class_key af_rlock_keys[AF_MAX];
308 static struct lock_class_key af_wlock_keys[AF_MAX];
309 static struct lock_class_key af_elock_keys[AF_MAX];
310 static struct lock_class_key af_kern_callback_keys[AF_MAX];
311 
312 /* Run time adjustable parameters. */
313 __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
314 EXPORT_SYMBOL(sysctl_wmem_max);
315 __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
316 EXPORT_SYMBOL(sysctl_rmem_max);
317 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
318 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
319 
320 /* Maximal space eaten by iovec or ancillary data plus some space */
321 int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
322 EXPORT_SYMBOL(sysctl_optmem_max);
323 
324 int sysctl_tstamp_allow_data __read_mostly = 1;
325 
326 struct static_key memalloc_socks = STATIC_KEY_INIT_FALSE;
327 EXPORT_SYMBOL_GPL(memalloc_socks);
328 
329 /**
330  * sk_set_memalloc - sets %SOCK_MEMALLOC
331  * @sk: socket to set it on
332  *
333  * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
334  * It's the responsibility of the admin to adjust min_free_kbytes
335  * to meet the requirements.
336  */
337 void sk_set_memalloc(struct sock *sk)
338 {
339 	sock_set_flag(sk, SOCK_MEMALLOC);
340 	sk->sk_allocation |= __GFP_MEMALLOC;
341 	static_key_slow_inc(&memalloc_socks);
342 }
343 EXPORT_SYMBOL_GPL(sk_set_memalloc);
344 
345 void sk_clear_memalloc(struct sock *sk)
346 {
347 	sock_reset_flag(sk, SOCK_MEMALLOC);
348 	sk->sk_allocation &= ~__GFP_MEMALLOC;
349 	static_key_slow_dec(&memalloc_socks);
350 
351 	/*
352 	 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
353 	 * progress of swapping. SOCK_MEMALLOC may be cleared while
354 	 * it has rmem allocations due to the last swapfile being deactivated
355 	 * but there is a risk that the socket is unusable due to exceeding
356 	 * the rmem limits. Reclaim the reserves and obey rmem limits again.
357 	 */
358 	sk_mem_reclaim(sk);
359 }
360 EXPORT_SYMBOL_GPL(sk_clear_memalloc);
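
/*
 * Example (illustrative sketch): the intended users of these helpers are
 * subsystems that swap over the network; such a transport would mark its
 * socket while a swapfile is active and clear the flag when it goes away.
 * The wrapper names below are hypothetical:
 *
 *	static void my_swap_enable(struct sock *sk)
 *	{
 *		lock_sock(sk);
 *		sk_set_memalloc(sk);
 *		release_sock(sk);
 *	}
 *
 *	static void my_swap_disable(struct sock *sk)
 *	{
 *		lock_sock(sk);
 *		sk_clear_memalloc(sk);
 *		release_sock(sk);
 *	}
 */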
361 
362 int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
363 {
364 	int ret;
365 	unsigned int noreclaim_flag;
366 
367 	/* these should have been dropped before queueing */
368 	BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
369 
370 	noreclaim_flag = memalloc_noreclaim_save();
371 	ret = sk->sk_backlog_rcv(sk, skb);
372 	memalloc_noreclaim_restore(noreclaim_flag);
373 
374 	return ret;
375 }
376 EXPORT_SYMBOL(__sk_backlog_rcv);
377 
378 static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
379 {
380 	struct timeval tv;
381 
382 	if (optlen < sizeof(tv))
383 		return -EINVAL;
384 	if (copy_from_user(&tv, optval, sizeof(tv)))
385 		return -EFAULT;
386 	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
387 		return -EDOM;
388 
389 	if (tv.tv_sec < 0) {
390 		static int warned __read_mostly;
391 
392 		*timeo_p = 0;
393 		if (warned < 10 && net_ratelimit()) {
394 			warned++;
395 			pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
396 				__func__, current->comm, task_pid_nr(current));
397 		}
398 		return 0;
399 	}
400 	*timeo_p = MAX_SCHEDULE_TIMEOUT;
401 	if (tv.tv_sec == 0 && tv.tv_usec == 0)
402 		return 0;
403 	if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
404 		*timeo_p = tv.tv_sec * HZ + DIV_ROUND_UP(tv.tv_usec, USEC_PER_SEC / HZ);
405 	return 0;
406 }
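
/*
 * Example (user-space sketch, not kernel code): sock_set_timeout() is what
 * backs SO_RCVTIMEO/SO_SNDTIMEO, so a 2.5 second receive timeout set from
 * user space ends up in sk->sk_rcvtimeo converted to jiffies, and an
 * all-zero timeval means "wait forever" (MAX_SCHEDULE_TIMEOUT):
 *
 *	struct timeval tv = { .tv_sec = 2, .tv_usec = 500000 };
 *
 *	setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv));
 */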
407 
408 static void sock_warn_obsolete_bsdism(const char *name)
409 {
410 	static int warned;
411 	static char warncomm[TASK_COMM_LEN];
412 	if (strcmp(warncomm, current->comm) && warned < 5) {
413 		strcpy(warncomm,  current->comm);
414 		pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n",
415 			warncomm, name);
416 		warned++;
417 	}
418 }
419 
420 static bool sock_needs_netstamp(const struct sock *sk)
421 {
422 	switch (sk->sk_family) {
423 	case AF_UNSPEC:
424 	case AF_UNIX:
425 		return false;
426 	default:
427 		return true;
428 	}
429 }
430 
431 static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
432 {
433 	if (sk->sk_flags & flags) {
434 		sk->sk_flags &= ~flags;
435 		if (sock_needs_netstamp(sk) &&
436 		    !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
437 			net_disable_timestamp();
438 	}
439 }
440 
441 
442 int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
443 {
444 	unsigned long flags;
445 	struct sk_buff_head *list = &sk->sk_receive_queue;
446 
447 	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
448 		atomic_inc(&sk->sk_drops);
449 		trace_sock_rcvqueue_full(sk, skb);
450 		return -ENOMEM;
451 	}
452 
453 	if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
454 		atomic_inc(&sk->sk_drops);
455 		return -ENOBUFS;
456 	}
457 
458 	skb->dev = NULL;
459 	skb_set_owner_r(skb, sk);
460 
461 	/* we escape from the RCU-protected region, make sure we don't leak
462 	 * a non-refcounted dst
463 	 */
464 	skb_dst_force(skb);
465 
466 	spin_lock_irqsave(&list->lock, flags);
467 	sock_skb_set_dropcount(sk, skb);
468 	__skb_queue_tail(list, skb);
469 	spin_unlock_irqrestore(&list->lock, flags);
470 
471 	if (!sock_flag(sk, SOCK_DEAD))
472 		sk->sk_data_ready(sk);
473 	return 0;
474 }
475 EXPORT_SYMBOL(__sock_queue_rcv_skb);
476 
477 int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
478 {
479 	int err;
480 
481 	err = sk_filter(sk, skb);
482 	if (err)
483 		return err;
484 
485 	return __sock_queue_rcv_skb(sk, skb);
486 }
487 EXPORT_SYMBOL(sock_queue_rcv_skb);
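
/*
 * Example (illustrative sketch): a datagram protocol's input path typically
 * hands each skb to sock_queue_rcv_skb() and frees it on failure; a negative
 * return is -ENOMEM/-ENOBUFS from the queueing above or an error from
 * sk_filter(). "my_proto_rcv" is a hypothetical name:
 *
 *	static int my_proto_rcv(struct sock *sk, struct sk_buff *skb)
 *	{
 *		int err = sock_queue_rcv_skb(sk, skb);
 *
 *		if (err < 0)
 *			kfree_skb(skb);
 *		return err;
 *	}
 */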
488 
489 int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
490 		     const int nested, unsigned int trim_cap, bool refcounted)
491 {
492 	int rc = NET_RX_SUCCESS;
493 
494 	if (sk_filter_trim_cap(sk, skb, trim_cap))
495 		goto discard_and_relse;
496 
497 	skb->dev = NULL;
498 
499 	if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
500 		atomic_inc(&sk->sk_drops);
501 		goto discard_and_relse;
502 	}
503 	if (nested)
504 		bh_lock_sock_nested(sk);
505 	else
506 		bh_lock_sock(sk);
507 	if (!sock_owned_by_user(sk)) {
508 		/*
509 		 * trylock + unlock semantics:
510 		 */
511 		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
512 
513 		rc = sk_backlog_rcv(sk, skb);
514 
515 		mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
516 	} else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
517 		bh_unlock_sock(sk);
518 		atomic_inc(&sk->sk_drops);
519 		goto discard_and_relse;
520 	}
521 
522 	bh_unlock_sock(sk);
523 out:
524 	if (refcounted)
525 		sock_put(sk);
526 	return rc;
527 discard_and_relse:
528 	kfree_skb(skb);
529 	goto out;
530 }
531 EXPORT_SYMBOL(__sk_receive_skb);
532 
533 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
534 {
535 	struct dst_entry *dst = __sk_dst_get(sk);
536 
537 	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
538 		sk_tx_queue_clear(sk);
539 		sk->sk_dst_pending_confirm = 0;
540 		RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
541 		dst_release(dst);
542 		return NULL;
543 	}
544 
545 	return dst;
546 }
547 EXPORT_SYMBOL(__sk_dst_check);
548 
549 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
550 {
551 	struct dst_entry *dst = sk_dst_get(sk);
552 
553 	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
554 		sk_dst_reset(sk);
555 		dst_release(dst);
556 		return NULL;
557 	}
558 
559 	return dst;
560 }
561 EXPORT_SYMBOL(sk_dst_check);
562 
563 static int sock_setbindtodevice(struct sock *sk, char __user *optval,
564 				int optlen)
565 {
566 	int ret = -ENOPROTOOPT;
567 #ifdef CONFIG_NETDEVICES
568 	struct net *net = sock_net(sk);
569 	char devname[IFNAMSIZ];
570 	int index;
571 
572 	/* Sorry... */
573 	ret = -EPERM;
574 	if (!ns_capable(net->user_ns, CAP_NET_RAW))
575 		goto out;
576 
577 	ret = -EINVAL;
578 	if (optlen < 0)
579 		goto out;
580 
581 	/* Bind this socket to a particular device like "eth0",
582 	 * as specified in the passed interface name. If the
583 	 * name is "" or the option length is zero the socket
584 	 * is not bound.
585 	 */
586 	if (optlen > IFNAMSIZ - 1)
587 		optlen = IFNAMSIZ - 1;
588 	memset(devname, 0, sizeof(devname));
589 
590 	ret = -EFAULT;
591 	if (copy_from_user(devname, optval, optlen))
592 		goto out;
593 
594 	index = 0;
595 	if (devname[0] != '\0') {
596 		struct net_device *dev;
597 
598 		rcu_read_lock();
599 		dev = dev_get_by_name_rcu(net, devname);
600 		if (dev)
601 			index = dev->ifindex;
602 		rcu_read_unlock();
603 		ret = -ENODEV;
604 		if (!dev)
605 			goto out;
606 	}
607 
608 	lock_sock(sk);
609 	sk->sk_bound_dev_if = index;
610 	sk_dst_reset(sk);
611 	release_sock(sk);
612 
613 	ret = 0;
614 
615 out:
616 #endif
617 
618 	return ret;
619 }
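
/*
 * Example (user-space sketch, not kernel code): binding a socket to "eth0"
 * goes through sock_setbindtodevice() above and needs CAP_NET_RAW in the
 * socket's network namespace; an empty name (or zero length) removes the
 * binding again:
 *
 *	setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, "eth0", strlen("eth0"));
 *	setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, "", 0);
 */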
620 
621 static int sock_getbindtodevice(struct sock *sk, char __user *optval,
622 				int __user *optlen, int len)
623 {
624 	int ret = -ENOPROTOOPT;
625 #ifdef CONFIG_NETDEVICES
626 	struct net *net = sock_net(sk);
627 	char devname[IFNAMSIZ];
628 
629 	if (sk->sk_bound_dev_if == 0) {
630 		len = 0;
631 		goto zero;
632 	}
633 
634 	ret = -EINVAL;
635 	if (len < IFNAMSIZ)
636 		goto out;
637 
638 	ret = netdev_get_name(net, devname, sk->sk_bound_dev_if);
639 	if (ret)
640 		goto out;
641 
642 	len = strlen(devname) + 1;
643 
644 	ret = -EFAULT;
645 	if (copy_to_user(optval, devname, len))
646 		goto out;
647 
648 zero:
649 	ret = -EFAULT;
650 	if (put_user(len, optlen))
651 		goto out;
652 
653 	ret = 0;
654 
655 out:
656 #endif
657 
658 	return ret;
659 }
660 
661 static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
662 {
663 	if (valbool)
664 		sock_set_flag(sk, bit);
665 	else
666 		sock_reset_flag(sk, bit);
667 }
668 
669 bool sk_mc_loop(struct sock *sk)
670 {
671 	if (dev_recursion_level())
672 		return false;
673 	if (!sk)
674 		return true;
675 	switch (sk->sk_family) {
676 	case AF_INET:
677 		return inet_sk(sk)->mc_loop;
678 #if IS_ENABLED(CONFIG_IPV6)
679 	case AF_INET6:
680 		return inet6_sk(sk)->mc_loop;
681 #endif
682 	}
683 	WARN_ON(1);
684 	return true;
685 }
686 EXPORT_SYMBOL(sk_mc_loop);
687 
688 /*
689  *	This is meant for all protocols to use and covers goings on
690  *	at the socket level. Everything here is generic.
691  */
692 
693 int sock_setsockopt(struct socket *sock, int level, int optname,
694 		    char __user *optval, unsigned int optlen)
695 {
696 	struct sock *sk = sock->sk;
697 	int val;
698 	int valbool;
699 	struct linger ling;
700 	int ret = 0;
701 
702 	/*
703 	 *	Options without arguments
704 	 */
705 
706 	if (optname == SO_BINDTODEVICE)
707 		return sock_setbindtodevice(sk, optval, optlen);
708 
709 	if (optlen < sizeof(int))
710 		return -EINVAL;
711 
712 	if (get_user(val, (int __user *)optval))
713 		return -EFAULT;
714 
715 	valbool = val ? 1 : 0;
716 
717 	lock_sock(sk);
718 
719 	switch (optname) {
720 	case SO_DEBUG:
721 		if (val && !capable(CAP_NET_ADMIN))
722 			ret = -EACCES;
723 		else
724 			sock_valbool_flag(sk, SOCK_DBG, valbool);
725 		break;
726 	case SO_REUSEADDR:
727 		sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
728 		break;
729 	case SO_REUSEPORT:
730 		sk->sk_reuseport = valbool;
731 		break;
732 	case SO_TYPE:
733 	case SO_PROTOCOL:
734 	case SO_DOMAIN:
735 	case SO_ERROR:
736 		ret = -ENOPROTOOPT;
737 		break;
738 	case SO_DONTROUTE:
739 		sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
740 		break;
741 	case SO_BROADCAST:
742 		sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
743 		break;
744 	case SO_SNDBUF:
745 		/* Don't error on this; BSD doesn't, and if you think
746 		 * about it this is right. Otherwise apps have to
747 		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
748 		 * are treated in BSD as hints.
749 		 */
750 		val = min_t(u32, val, sysctl_wmem_max);
751 set_sndbuf:
752 		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
753 		sk->sk_sndbuf = max_t(int, val * 2, SOCK_MIN_SNDBUF);
754 		/* Wake up sending tasks if we upped the value. */
755 		sk->sk_write_space(sk);
756 		break;
757 
758 	case SO_SNDBUFFORCE:
759 		if (!capable(CAP_NET_ADMIN)) {
760 			ret = -EPERM;
761 			break;
762 		}
763 		goto set_sndbuf;
764 
765 	case SO_RCVBUF:
766 		/* Don't error on this; BSD doesn't, and if you think
767 		 * about it this is right. Otherwise apps have to
768 		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
769 		 * are treated in BSD as hints.
770 		 */
771 		val = min_t(u32, val, sysctl_rmem_max);
772 set_rcvbuf:
773 		sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
774 		/*
775 		 * We double it on the way in to account for
776 		 * "struct sk_buff" etc. overhead.   Applications
777 		 * assume that the SO_RCVBUF setting they make will
778 		 * allow that much actual data to be received on that
779 		 * socket.
780 		 *
781 		 * Applications are unaware that "struct sk_buff" and
782 		 * other overheads allocate from the receive buffer
783 		 * during socket buffer allocation.
784 		 *
785 		 * And after considering the possible alternatives,
786 		 * returning the value we actually used in getsockopt
787 		 * is the most desirable behavior.
788 		 */
789 		sk->sk_rcvbuf = max_t(int, val * 2, SOCK_MIN_RCVBUF);
790 		break;
791 
792 	case SO_RCVBUFFORCE:
793 		if (!capable(CAP_NET_ADMIN)) {
794 			ret = -EPERM;
795 			break;
796 		}
797 		goto set_rcvbuf;
798 
799 	case SO_KEEPALIVE:
800 		if (sk->sk_prot->keepalive)
801 			sk->sk_prot->keepalive(sk, valbool);
802 		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
803 		break;
804 
805 	case SO_OOBINLINE:
806 		sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
807 		break;
808 
809 	case SO_NO_CHECK:
810 		sk->sk_no_check_tx = valbool;
811 		break;
812 
813 	case SO_PRIORITY:
814 		if ((val >= 0 && val <= 6) ||
815 		    ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
816 			sk->sk_priority = val;
817 		else
818 			ret = -EPERM;
819 		break;
820 
821 	case SO_LINGER:
822 		if (optlen < sizeof(ling)) {
823 			ret = -EINVAL;	/* 1003.1g */
824 			break;
825 		}
826 		if (copy_from_user(&ling, optval, sizeof(ling))) {
827 			ret = -EFAULT;
828 			break;
829 		}
830 		if (!ling.l_onoff)
831 			sock_reset_flag(sk, SOCK_LINGER);
832 		else {
833 #if (BITS_PER_LONG == 32)
834 			if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
835 				sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
836 			else
837 #endif
838 				sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
839 			sock_set_flag(sk, SOCK_LINGER);
840 		}
841 		break;
842 
843 	case SO_BSDCOMPAT:
844 		sock_warn_obsolete_bsdism("setsockopt");
845 		break;
846 
847 	case SO_PASSCRED:
848 		if (valbool)
849 			set_bit(SOCK_PASSCRED, &sock->flags);
850 		else
851 			clear_bit(SOCK_PASSCRED, &sock->flags);
852 		break;
853 
854 	case SO_TIMESTAMP:
855 	case SO_TIMESTAMPNS:
856 		if (valbool)  {
857 			if (optname == SO_TIMESTAMP)
858 				sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
859 			else
860 				sock_set_flag(sk, SOCK_RCVTSTAMPNS);
861 			sock_set_flag(sk, SOCK_RCVTSTAMP);
862 			sock_enable_timestamp(sk, SOCK_TIMESTAMP);
863 		} else {
864 			sock_reset_flag(sk, SOCK_RCVTSTAMP);
865 			sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
866 		}
867 		break;
868 
869 	case SO_TIMESTAMPING:
870 		if (val & ~SOF_TIMESTAMPING_MASK) {
871 			ret = -EINVAL;
872 			break;
873 		}
874 
875 		if (val & SOF_TIMESTAMPING_OPT_ID &&
876 		    !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
877 			if (sk->sk_protocol == IPPROTO_TCP &&
878 			    sk->sk_type == SOCK_STREAM) {
879 				if ((1 << sk->sk_state) &
880 				    (TCPF_CLOSE | TCPF_LISTEN)) {
881 					ret = -EINVAL;
882 					break;
883 				}
884 				sk->sk_tskey = tcp_sk(sk)->snd_una;
885 			} else {
886 				sk->sk_tskey = 0;
887 			}
888 		}
889 
890 		if (val & SOF_TIMESTAMPING_OPT_STATS &&
891 		    !(val & SOF_TIMESTAMPING_OPT_TSONLY)) {
892 			ret = -EINVAL;
893 			break;
894 		}
895 
896 		sk->sk_tsflags = val;
897 		if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
898 			sock_enable_timestamp(sk,
899 					      SOCK_TIMESTAMPING_RX_SOFTWARE);
900 		else
901 			sock_disable_timestamp(sk,
902 					       (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
903 		break;
904 
905 	case SO_RCVLOWAT:
906 		if (val < 0)
907 			val = INT_MAX;
908 		if (sock->ops->set_rcvlowat)
909 			ret = sock->ops->set_rcvlowat(sk, val);
910 		else
911 			sk->sk_rcvlowat = val ? : 1;
912 		break;
913 
914 	case SO_RCVTIMEO:
915 		ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
916 		break;
917 
918 	case SO_SNDTIMEO:
919 		ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
920 		break;
921 
922 	case SO_ATTACH_FILTER:
923 		ret = -EINVAL;
924 		if (optlen == sizeof(struct sock_fprog)) {
925 			struct sock_fprog fprog;
926 
927 			ret = -EFAULT;
928 			if (copy_from_user(&fprog, optval, sizeof(fprog)))
929 				break;
930 
931 			ret = sk_attach_filter(&fprog, sk);
932 		}
933 		break;
934 
935 	case SO_ATTACH_BPF:
936 		ret = -EINVAL;
937 		if (optlen == sizeof(u32)) {
938 			u32 ufd;
939 
940 			ret = -EFAULT;
941 			if (copy_from_user(&ufd, optval, sizeof(ufd)))
942 				break;
943 
944 			ret = sk_attach_bpf(ufd, sk);
945 		}
946 		break;
947 
948 	case SO_ATTACH_REUSEPORT_CBPF:
949 		ret = -EINVAL;
950 		if (optlen == sizeof(struct sock_fprog)) {
951 			struct sock_fprog fprog;
952 
953 			ret = -EFAULT;
954 			if (copy_from_user(&fprog, optval, sizeof(fprog)))
955 				break;
956 
957 			ret = sk_reuseport_attach_filter(&fprog, sk);
958 		}
959 		break;
960 
961 	case SO_ATTACH_REUSEPORT_EBPF:
962 		ret = -EINVAL;
963 		if (optlen == sizeof(u32)) {
964 			u32 ufd;
965 
966 			ret = -EFAULT;
967 			if (copy_from_user(&ufd, optval, sizeof(ufd)))
968 				break;
969 
970 			ret = sk_reuseport_attach_bpf(ufd, sk);
971 		}
972 		break;
973 
974 	case SO_DETACH_FILTER:
975 		ret = sk_detach_filter(sk);
976 		break;
977 
978 	case SO_LOCK_FILTER:
979 		if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
980 			ret = -EPERM;
981 		else
982 			sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
983 		break;
984 
985 	case SO_PASSSEC:
986 		if (valbool)
987 			set_bit(SOCK_PASSSEC, &sock->flags);
988 		else
989 			clear_bit(SOCK_PASSSEC, &sock->flags);
990 		break;
991 	case SO_MARK:
992 		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
993 			ret = -EPERM;
994 		else
995 			sk->sk_mark = val;
996 		break;
997 
998 	case SO_RXQ_OVFL:
999 		sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
1000 		break;
1001 
1002 	case SO_WIFI_STATUS:
1003 		sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
1004 		break;
1005 
1006 	case SO_PEEK_OFF:
1007 		if (sock->ops->set_peek_off)
1008 			ret = sock->ops->set_peek_off(sk, val);
1009 		else
1010 			ret = -EOPNOTSUPP;
1011 		break;
1012 
1013 	case SO_NOFCS:
1014 		sock_valbool_flag(sk, SOCK_NOFCS, valbool);
1015 		break;
1016 
1017 	case SO_SELECT_ERR_QUEUE:
1018 		sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
1019 		break;
1020 
1021 #ifdef CONFIG_NET_RX_BUSY_POLL
1022 	case SO_BUSY_POLL:
1023 		/* allow unprivileged users to decrease the value */
1024 		if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
1025 			ret = -EPERM;
1026 		else {
1027 			if (val < 0)
1028 				ret = -EINVAL;
1029 			else
1030 				sk->sk_ll_usec = val;
1031 		}
1032 		break;
1033 #endif
1034 
1035 	case SO_MAX_PACING_RATE:
1036 		if (val != ~0U)
1037 			cmpxchg(&sk->sk_pacing_status,
1038 				SK_PACING_NONE,
1039 				SK_PACING_NEEDED);
1040 		sk->sk_max_pacing_rate = val;
1041 		sk->sk_pacing_rate = min(sk->sk_pacing_rate,
1042 					 sk->sk_max_pacing_rate);
1043 		break;
1044 
1045 	case SO_INCOMING_CPU:
1046 		sk->sk_incoming_cpu = val;
1047 		break;
1048 
1049 	case SO_CNX_ADVICE:
1050 		if (val == 1)
1051 			dst_negative_advice(sk);
1052 		break;
1053 
1054 	case SO_ZEROCOPY:
1055 		if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
1056 			if (sk->sk_protocol != IPPROTO_TCP)
1057 				ret = -ENOTSUPP;
1058 		} else if (sk->sk_family != PF_RDS) {
1059 			ret = -ENOTSUPP;
1060 		}
1061 		if (!ret) {
1062 			if (val < 0 || val > 1)
1063 				ret = -EINVAL;
1064 			else
1065 				sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
1066 		}
1067 		break;
1068 
1069 	default:
1070 		ret = -ENOPROTOOPT;
1071 		break;
1072 	}
1073 	release_sock(sk);
1074 	return ret;
1075 }
1076 EXPORT_SYMBOL(sock_setsockopt);
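
/*
 * Example (user-space sketch, not kernel code): because SO_RCVBUF/SO_SNDBUF
 * values are doubled on the way in (see the comments above), reading the
 * option back returns roughly twice the requested value, clamped by
 * sysctl_rmem_max/sysctl_wmem_max and the SOCK_MIN_* floors:
 *
 *	int req = 65536, eff;
 *	socklen_t len = sizeof(eff);
 *
 *	setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &req, sizeof(req));
 *	getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &eff, &len);
 *
 * after which "eff" is typically 2 * req.
 */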
1077 
1078 
1079 static void cred_to_ucred(struct pid *pid, const struct cred *cred,
1080 			  struct ucred *ucred)
1081 {
1082 	ucred->pid = pid_vnr(pid);
1083 	ucred->uid = ucred->gid = -1;
1084 	if (cred) {
1085 		struct user_namespace *current_ns = current_user_ns();
1086 
1087 		ucred->uid = from_kuid_munged(current_ns, cred->euid);
1088 		ucred->gid = from_kgid_munged(current_ns, cred->egid);
1089 	}
1090 }
1091 
1092 static int groups_to_user(gid_t __user *dst, const struct group_info *src)
1093 {
1094 	struct user_namespace *user_ns = current_user_ns();
1095 	int i;
1096 
1097 	for (i = 0; i < src->ngroups; i++)
1098 		if (put_user(from_kgid_munged(user_ns, src->gid[i]), dst + i))
1099 			return -EFAULT;
1100 
1101 	return 0;
1102 }
1103 
1104 int sock_getsockopt(struct socket *sock, int level, int optname,
1105 		    char __user *optval, int __user *optlen)
1106 {
1107 	struct sock *sk = sock->sk;
1108 
1109 	union {
1110 		int val;
1111 		u64 val64;
1112 		struct linger ling;
1113 		struct timeval tm;
1114 	} v;
1115 
1116 	int lv = sizeof(int);
1117 	int len;
1118 
1119 	if (get_user(len, optlen))
1120 		return -EFAULT;
1121 	if (len < 0)
1122 		return -EINVAL;
1123 
1124 	memset(&v, 0, sizeof(v));
1125 
1126 	switch (optname) {
1127 	case SO_DEBUG:
1128 		v.val = sock_flag(sk, SOCK_DBG);
1129 		break;
1130 
1131 	case SO_DONTROUTE:
1132 		v.val = sock_flag(sk, SOCK_LOCALROUTE);
1133 		break;
1134 
1135 	case SO_BROADCAST:
1136 		v.val = sock_flag(sk, SOCK_BROADCAST);
1137 		break;
1138 
1139 	case SO_SNDBUF:
1140 		v.val = sk->sk_sndbuf;
1141 		break;
1142 
1143 	case SO_RCVBUF:
1144 		v.val = sk->sk_rcvbuf;
1145 		break;
1146 
1147 	case SO_REUSEADDR:
1148 		v.val = sk->sk_reuse;
1149 		break;
1150 
1151 	case SO_REUSEPORT:
1152 		v.val = sk->sk_reuseport;
1153 		break;
1154 
1155 	case SO_KEEPALIVE:
1156 		v.val = sock_flag(sk, SOCK_KEEPOPEN);
1157 		break;
1158 
1159 	case SO_TYPE:
1160 		v.val = sk->sk_type;
1161 		break;
1162 
1163 	case SO_PROTOCOL:
1164 		v.val = sk->sk_protocol;
1165 		break;
1166 
1167 	case SO_DOMAIN:
1168 		v.val = sk->sk_family;
1169 		break;
1170 
1171 	case SO_ERROR:
1172 		v.val = -sock_error(sk);
1173 		if (v.val == 0)
1174 			v.val = xchg(&sk->sk_err_soft, 0);
1175 		break;
1176 
1177 	case SO_OOBINLINE:
1178 		v.val = sock_flag(sk, SOCK_URGINLINE);
1179 		break;
1180 
1181 	case SO_NO_CHECK:
1182 		v.val = sk->sk_no_check_tx;
1183 		break;
1184 
1185 	case SO_PRIORITY:
1186 		v.val = sk->sk_priority;
1187 		break;
1188 
1189 	case SO_LINGER:
1190 		lv		= sizeof(v.ling);
1191 		v.ling.l_onoff	= sock_flag(sk, SOCK_LINGER);
1192 		v.ling.l_linger	= sk->sk_lingertime / HZ;
1193 		break;
1194 
1195 	case SO_BSDCOMPAT:
1196 		sock_warn_obsolete_bsdism("getsockopt");
1197 		break;
1198 
1199 	case SO_TIMESTAMP:
1200 		v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1201 				!sock_flag(sk, SOCK_RCVTSTAMPNS);
1202 		break;
1203 
1204 	case SO_TIMESTAMPNS:
1205 		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
1206 		break;
1207 
1208 	case SO_TIMESTAMPING:
1209 		v.val = sk->sk_tsflags;
1210 		break;
1211 
1212 	case SO_RCVTIMEO:
1213 		lv = sizeof(struct timeval);
1214 		if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
1215 			v.tm.tv_sec = 0;
1216 			v.tm.tv_usec = 0;
1217 		} else {
1218 			v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
1219 			v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * USEC_PER_SEC) / HZ;
1220 		}
1221 		break;
1222 
1223 	case SO_SNDTIMEO:
1224 		lv = sizeof(struct timeval);
1225 		if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
1226 			v.tm.tv_sec = 0;
1227 			v.tm.tv_usec = 0;
1228 		} else {
1229 			v.tm.tv_sec = sk->sk_sndtimeo / HZ;
1230 			v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * USEC_PER_SEC) / HZ;
1231 		}
1232 		break;
1233 
1234 	case SO_RCVLOWAT:
1235 		v.val = sk->sk_rcvlowat;
1236 		break;
1237 
1238 	case SO_SNDLOWAT:
1239 		v.val = 1;
1240 		break;
1241 
1242 	case SO_PASSCRED:
1243 		v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1244 		break;
1245 
1246 	case SO_PEERCRED:
1247 	{
1248 		struct ucred peercred;
1249 		if (len > sizeof(peercred))
1250 			len = sizeof(peercred);
1251 		cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1252 		if (copy_to_user(optval, &peercred, len))
1253 			return -EFAULT;
1254 		goto lenout;
1255 	}
1256 
1257 	case SO_PEERGROUPS:
1258 	{
1259 		int ret, n;
1260 
1261 		if (!sk->sk_peer_cred)
1262 			return -ENODATA;
1263 
1264 		n = sk->sk_peer_cred->group_info->ngroups;
1265 		if (len < n * sizeof(gid_t)) {
1266 			len = n * sizeof(gid_t);
1267 			return put_user(len, optlen) ? -EFAULT : -ERANGE;
1268 		}
1269 		len = n * sizeof(gid_t);
1270 
1271 		ret = groups_to_user((gid_t __user *)optval,
1272 				     sk->sk_peer_cred->group_info);
1273 		if (ret)
1274 			return ret;
1275 		goto lenout;
1276 	}
1277 
1278 	case SO_PEERNAME:
1279 	{
1280 		char address[128];
1281 
1282 		lv = sock->ops->getname(sock, (struct sockaddr *)address, 2);
1283 		if (lv < 0)
1284 			return -ENOTCONN;
1285 		if (lv < len)
1286 			return -EINVAL;
1287 		if (copy_to_user(optval, address, len))
1288 			return -EFAULT;
1289 		goto lenout;
1290 	}
1291 
1292 	/* Dubious BSD thing... Probably nobody even uses it, but
1293 	 * the UNIX standard wants it for whatever reason... -DaveM
1294 	 */
1295 	case SO_ACCEPTCONN:
1296 		v.val = sk->sk_state == TCP_LISTEN;
1297 		break;
1298 
1299 	case SO_PASSSEC:
1300 		v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1301 		break;
1302 
1303 	case SO_PEERSEC:
1304 		return security_socket_getpeersec_stream(sock, optval, optlen, len);
1305 
1306 	case SO_MARK:
1307 		v.val = sk->sk_mark;
1308 		break;
1309 
1310 	case SO_RXQ_OVFL:
1311 		v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1312 		break;
1313 
1314 	case SO_WIFI_STATUS:
1315 		v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1316 		break;
1317 
1318 	case SO_PEEK_OFF:
1319 		if (!sock->ops->set_peek_off)
1320 			return -EOPNOTSUPP;
1321 
1322 		v.val = sk->sk_peek_off;
1323 		break;
1324 	case SO_NOFCS:
1325 		v.val = sock_flag(sk, SOCK_NOFCS);
1326 		break;
1327 
1328 	case SO_BINDTODEVICE:
1329 		return sock_getbindtodevice(sk, optval, optlen, len);
1330 
1331 	case SO_GET_FILTER:
1332 		len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
1333 		if (len < 0)
1334 			return len;
1335 
1336 		goto lenout;
1337 
1338 	case SO_LOCK_FILTER:
1339 		v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1340 		break;
1341 
1342 	case SO_BPF_EXTENSIONS:
1343 		v.val = bpf_tell_extensions();
1344 		break;
1345 
1346 	case SO_SELECT_ERR_QUEUE:
1347 		v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1348 		break;
1349 
1350 #ifdef CONFIG_NET_RX_BUSY_POLL
1351 	case SO_BUSY_POLL:
1352 		v.val = sk->sk_ll_usec;
1353 		break;
1354 #endif
1355 
1356 	case SO_MAX_PACING_RATE:
1357 		v.val = sk->sk_max_pacing_rate;
1358 		break;
1359 
1360 	case SO_INCOMING_CPU:
1361 		v.val = sk->sk_incoming_cpu;
1362 		break;
1363 
1364 	case SO_MEMINFO:
1365 	{
1366 		u32 meminfo[SK_MEMINFO_VARS];
1367 
1368 		if (get_user(len, optlen))
1369 			return -EFAULT;
1370 
1371 		sk_get_meminfo(sk, meminfo);
1372 
1373 		len = min_t(unsigned int, len, sizeof(meminfo));
1374 		if (copy_to_user(optval, &meminfo, len))
1375 			return -EFAULT;
1376 
1377 		goto lenout;
1378 	}
1379 
1380 #ifdef CONFIG_NET_RX_BUSY_POLL
1381 	case SO_INCOMING_NAPI_ID:
1382 		v.val = READ_ONCE(sk->sk_napi_id);
1383 
1384 		/* aggregate non-NAPI IDs down to 0 */
1385 		if (v.val < MIN_NAPI_ID)
1386 			v.val = 0;
1387 
1388 		break;
1389 #endif
1390 
1391 	case SO_COOKIE:
1392 		lv = sizeof(u64);
1393 		if (len < lv)
1394 			return -EINVAL;
1395 		v.val64 = sock_gen_cookie(sk);
1396 		break;
1397 
1398 	case SO_ZEROCOPY:
1399 		v.val = sock_flag(sk, SOCK_ZEROCOPY);
1400 		break;
1401 
1402 	default:
1403 		/* We implement SO_SNDLOWAT etc. to not be settable
1404 		 * (1003.1g 7).
1405 		 */
1406 		return -ENOPROTOOPT;
1407 	}
1408 
1409 	if (len > lv)
1410 		len = lv;
1411 	if (copy_to_user(optval, &v, len))
1412 		return -EFAULT;
1413 lenout:
1414 	if (put_user(len, optlen))
1415 		return -EFAULT;
1416 	return 0;
1417 }
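
/*
 * Example (user-space sketch, not kernel code): SO_COOKIE above reports a
 * 64-bit per-socket identifier that stays stable for the socket's lifetime
 * (the same cookie used by sock_diag). SO_COOKIE may need to be defined by
 * hand with older libc headers:
 *
 *	uint64_t cookie;
 *	socklen_t len = sizeof(cookie);
 *
 *	if (getsockopt(fd, SOL_SOCKET, SO_COOKIE, &cookie, &len) == 0)
 *		printf("socket cookie: %llu\n", (unsigned long long)cookie);
 */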
1418 
1419 /*
1420  * Initialize an sk_lock.
1421  *
1422  * (We also register the sk_lock with the lock validator.)
1423  */
1424 static inline void sock_lock_init(struct sock *sk)
1425 {
1426 	if (sk->sk_kern_sock)
1427 		sock_lock_init_class_and_name(
1428 			sk,
1429 			af_family_kern_slock_key_strings[sk->sk_family],
1430 			af_family_kern_slock_keys + sk->sk_family,
1431 			af_family_kern_key_strings[sk->sk_family],
1432 			af_family_kern_keys + sk->sk_family);
1433 	else
1434 		sock_lock_init_class_and_name(
1435 			sk,
1436 			af_family_slock_key_strings[sk->sk_family],
1437 			af_family_slock_keys + sk->sk_family,
1438 			af_family_key_strings[sk->sk_family],
1439 			af_family_keys + sk->sk_family);
1440 }
1441 
1442 /*
1443  * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
1444  * even temporarily, because of RCU lookups. sk_node should also be left as is.
1445  * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end.
1446  */
1447 static void sock_copy(struct sock *nsk, const struct sock *osk)
1448 {
1449 #ifdef CONFIG_SECURITY_NETWORK
1450 	void *sptr = nsk->sk_security;
1451 #endif
1452 	memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1453 
1454 	memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1455 	       osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1456 
1457 #ifdef CONFIG_SECURITY_NETWORK
1458 	nsk->sk_security = sptr;
1459 	security_sk_clone(osk, nsk);
1460 #endif
1461 }
1462 
1463 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1464 		int family)
1465 {
1466 	struct sock *sk;
1467 	struct kmem_cache *slab;
1468 
1469 	slab = prot->slab;
1470 	if (slab != NULL) {
1471 		sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1472 		if (!sk)
1473 			return sk;
1474 		if (priority & __GFP_ZERO)
1475 			sk_prot_clear_nulls(sk, prot->obj_size);
1476 	} else
1477 		sk = kmalloc(prot->obj_size, priority);
1478 
1479 	if (sk != NULL) {
1480 		if (security_sk_alloc(sk, family, priority))
1481 			goto out_free;
1482 
1483 		if (!try_module_get(prot->owner))
1484 			goto out_free_sec;
1485 		sk_tx_queue_clear(sk);
1486 	}
1487 
1488 	return sk;
1489 
1490 out_free_sec:
1491 	security_sk_free(sk);
1492 out_free:
1493 	if (slab != NULL)
1494 		kmem_cache_free(slab, sk);
1495 	else
1496 		kfree(sk);
1497 	return NULL;
1498 }
1499 
1500 static void sk_prot_free(struct proto *prot, struct sock *sk)
1501 {
1502 	struct kmem_cache *slab;
1503 	struct module *owner;
1504 
1505 	owner = prot->owner;
1506 	slab = prot->slab;
1507 
1508 	cgroup_sk_free(&sk->sk_cgrp_data);
1509 	mem_cgroup_sk_free(sk);
1510 	security_sk_free(sk);
1511 	if (slab != NULL)
1512 		kmem_cache_free(slab, sk);
1513 	else
1514 		kfree(sk);
1515 	module_put(owner);
1516 }
1517 
1518 /**
1519  *	sk_alloc - All socket objects are allocated here
1520  *	@net: the applicable net namespace
1521  *	@family: protocol family
1522  *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1523  *	@prot: struct proto associated with this new sock instance
1524  *	@kern: is this to be a kernel socket?
1525  */
1526 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1527 		      struct proto *prot, int kern)
1528 {
1529 	struct sock *sk;
1530 
1531 	sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1532 	if (sk) {
1533 		sk->sk_family = family;
1534 		/*
1535 		 * See comment in struct sock definition to understand
1536 		 * why we need sk_prot_creator -acme
1537 		 */
1538 		sk->sk_prot = sk->sk_prot_creator = prot;
1539 		sk->sk_kern_sock = kern;
1540 		sock_lock_init(sk);
1541 		sk->sk_net_refcnt = kern ? 0 : 1;
1542 		if (likely(sk->sk_net_refcnt)) {
1543 			get_net(net);
1544 			sock_inuse_add(net, 1);
1545 		}
1546 
1547 		sock_net_set(sk, net);
1548 		refcount_set(&sk->sk_wmem_alloc, 1);
1549 
1550 		mem_cgroup_sk_alloc(sk);
1551 		cgroup_sk_alloc(&sk->sk_cgrp_data);
1552 		sock_update_classid(&sk->sk_cgrp_data);
1553 		sock_update_netprioidx(&sk->sk_cgrp_data);
1554 	}
1555 
1556 	return sk;
1557 }
1558 EXPORT_SYMBOL(sk_alloc);
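
/*
 * Example (illustrative sketch): a protocol family's ->create() handler is
 * the typical caller of sk_alloc(). "my_proto", "my_family_create" and
 * AF_MYFAM are hypothetical names:
 *
 *	static int my_family_create(struct net *net, struct socket *sock,
 *				    int protocol, int kern)
 *	{
 *		struct sock *sk;
 *
 *		sk = sk_alloc(net, AF_MYFAM, GFP_KERNEL, &my_proto, kern);
 *		if (!sk)
 *			return -ENOBUFS;
 *		sock_init_data(sock, sk);
 *		return 0;
 *	}
 */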
1559 
1560 /* Sockets having SOCK_RCU_FREE will call this function after one RCU
1561  * grace period. This is the case for UDP sockets and TCP listeners.
1562  */
1563 static void __sk_destruct(struct rcu_head *head)
1564 {
1565 	struct sock *sk = container_of(head, struct sock, sk_rcu);
1566 	struct sk_filter *filter;
1567 
1568 	if (sk->sk_destruct)
1569 		sk->sk_destruct(sk);
1570 
1571 	filter = rcu_dereference_check(sk->sk_filter,
1572 				       refcount_read(&sk->sk_wmem_alloc) == 0);
1573 	if (filter) {
1574 		sk_filter_uncharge(sk, filter);
1575 		RCU_INIT_POINTER(sk->sk_filter, NULL);
1576 	}
1577 	if (rcu_access_pointer(sk->sk_reuseport_cb))
1578 		reuseport_detach_sock(sk);
1579 
1580 	sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
1581 
1582 	if (atomic_read(&sk->sk_omem_alloc))
1583 		pr_debug("%s: optmem leakage (%d bytes) detected\n",
1584 			 __func__, atomic_read(&sk->sk_omem_alloc));
1585 
1586 	if (sk->sk_frag.page) {
1587 		put_page(sk->sk_frag.page);
1588 		sk->sk_frag.page = NULL;
1589 	}
1590 
1591 	if (sk->sk_peer_cred)
1592 		put_cred(sk->sk_peer_cred);
1593 	put_pid(sk->sk_peer_pid);
1594 	if (likely(sk->sk_net_refcnt))
1595 		put_net(sock_net(sk));
1596 	sk_prot_free(sk->sk_prot_creator, sk);
1597 }
1598 
1599 void sk_destruct(struct sock *sk)
1600 {
1601 	if (sock_flag(sk, SOCK_RCU_FREE))
1602 		call_rcu(&sk->sk_rcu, __sk_destruct);
1603 	else
1604 		__sk_destruct(&sk->sk_rcu);
1605 }
1606 
1607 static void __sk_free(struct sock *sk)
1608 {
1609 	if (likely(sk->sk_net_refcnt))
1610 		sock_inuse_add(sock_net(sk), -1);
1611 
1612 	if (unlikely(sock_diag_has_destroy_listeners(sk) && sk->sk_net_refcnt))
1613 		sock_diag_broadcast_destroy(sk);
1614 	else
1615 		sk_destruct(sk);
1616 }
1617 
1618 void sk_free(struct sock *sk)
1619 {
1620 	/*
1621 	 * We subtract one from sk_wmem_alloc so we can tell whether
1622 	 * some packets are still in some tx queue.
1623 	 * If the count is not zero, sock_wfree() will call __sk_free(sk) later.
1624 	 */
1625 	if (refcount_dec_and_test(&sk->sk_wmem_alloc))
1626 		__sk_free(sk);
1627 }
1628 EXPORT_SYMBOL(sk_free);
1629 
1630 static void sk_init_common(struct sock *sk)
1631 {
1632 	skb_queue_head_init(&sk->sk_receive_queue);
1633 	skb_queue_head_init(&sk->sk_write_queue);
1634 	skb_queue_head_init(&sk->sk_error_queue);
1635 
1636 	rwlock_init(&sk->sk_callback_lock);
1637 	lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
1638 			af_rlock_keys + sk->sk_family,
1639 			af_family_rlock_key_strings[sk->sk_family]);
1640 	lockdep_set_class_and_name(&sk->sk_write_queue.lock,
1641 			af_wlock_keys + sk->sk_family,
1642 			af_family_wlock_key_strings[sk->sk_family]);
1643 	lockdep_set_class_and_name(&sk->sk_error_queue.lock,
1644 			af_elock_keys + sk->sk_family,
1645 			af_family_elock_key_strings[sk->sk_family]);
1646 	lockdep_set_class_and_name(&sk->sk_callback_lock,
1647 			af_callback_keys + sk->sk_family,
1648 			af_family_clock_key_strings[sk->sk_family]);
1649 }
1650 
1651 /**
1652  *	sk_clone_lock - clone a socket, and lock its clone
1653  *	@sk: the socket to clone
1654  *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1655  *
1656  *	Caller must unlock socket even in error path (bh_unlock_sock(newsk))
1657  */
1658 struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
1659 {
1660 	struct sock *newsk;
1661 	bool is_charged = true;
1662 
1663 	newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
1664 	if (newsk != NULL) {
1665 		struct sk_filter *filter;
1666 
1667 		sock_copy(newsk, sk);
1668 
1669 		newsk->sk_prot_creator = sk->sk_prot;
1670 
1671 		/* SANITY */
1672 		if (likely(newsk->sk_net_refcnt))
1673 			get_net(sock_net(newsk));
1674 		sk_node_init(&newsk->sk_node);
1675 		sock_lock_init(newsk);
1676 		bh_lock_sock(newsk);
1677 		newsk->sk_backlog.head	= newsk->sk_backlog.tail = NULL;
1678 		newsk->sk_backlog.len = 0;
1679 
1680 		atomic_set(&newsk->sk_rmem_alloc, 0);
1681 		/*
1682 		 * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
1683 		 */
1684 		refcount_set(&newsk->sk_wmem_alloc, 1);
1685 		atomic_set(&newsk->sk_omem_alloc, 0);
1686 		sk_init_common(newsk);
1687 
1688 		newsk->sk_dst_cache	= NULL;
1689 		newsk->sk_dst_pending_confirm = 0;
1690 		newsk->sk_wmem_queued	= 0;
1691 		newsk->sk_forward_alloc = 0;
1692 		atomic_set(&newsk->sk_drops, 0);
1693 		newsk->sk_send_head	= NULL;
1694 		newsk->sk_userlocks	= sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
1695 		atomic_set(&newsk->sk_zckey, 0);
1696 
1697 		sock_reset_flag(newsk, SOCK_DONE);
1698 		mem_cgroup_sk_alloc(newsk);
1699 		cgroup_sk_alloc(&newsk->sk_cgrp_data);
1700 
1701 		rcu_read_lock();
1702 		filter = rcu_dereference(sk->sk_filter);
1703 		if (filter != NULL)
1704 			/* though it's an empty new sock, the charging may fail
1705 			 * if sysctl_optmem_max was changed between creation of the
1706 			 * original socket and cloning.
1707 			 */
1708 			is_charged = sk_filter_charge(newsk, filter);
1709 		RCU_INIT_POINTER(newsk->sk_filter, filter);
1710 		rcu_read_unlock();
1711 
1712 		if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
1713 			/* We need to make sure that we don't uncharge the new
1714 			 * socket if we couldn't charge it in the first place
1715 			 * as otherwise we uncharge the parent's filter.
1716 			 */
1717 			if (!is_charged)
1718 				RCU_INIT_POINTER(newsk->sk_filter, NULL);
1719 			sk_free_unlock_clone(newsk);
1720 			newsk = NULL;
1721 			goto out;
1722 		}
1723 		RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
1724 
1725 		newsk->sk_err	   = 0;
1726 		newsk->sk_err_soft = 0;
1727 		newsk->sk_priority = 0;
1728 		newsk->sk_incoming_cpu = raw_smp_processor_id();
1729 		atomic64_set(&newsk->sk_cookie, 0);
1730 		if (likely(newsk->sk_net_refcnt))
1731 			sock_inuse_add(sock_net(newsk), 1);
1732 
1733 		/*
1734 		 * Before updating sk_refcnt, we must commit prior changes to memory
1735 		 * (Documentation/RCU/rculist_nulls.txt for details)
1736 		 */
1737 		smp_wmb();
1738 		refcount_set(&newsk->sk_refcnt, 2);
1739 
1740 		/*
1741 		 * Increment the counter in the same struct proto as the master
1742 		 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
1743 		 * is the same as sk->sk_prot->socks, as this field was copied
1744 		 * with memcpy).
1745 		 *
1746 		 * This _changes_ the previous behaviour, where
1747 		 * tcp_create_openreq_child always was incrementing the
1748 		 * equivalent to tcp_prot->socks (inet_sock_nr), so this has
1749 		 * to be taken into account in all callers. -acme
1750 		 */
1751 		sk_refcnt_debug_inc(newsk);
1752 		sk_set_socket(newsk, NULL);
1753 		newsk->sk_wq = NULL;
1754 
1755 		if (newsk->sk_prot->sockets_allocated)
1756 			sk_sockets_allocated_inc(newsk);
1757 
1758 		if (sock_needs_netstamp(sk) &&
1759 		    newsk->sk_flags & SK_FLAGS_TIMESTAMP)
1760 			net_enable_timestamp();
1761 	}
1762 out:
1763 	return newsk;
1764 }
1765 EXPORT_SYMBOL_GPL(sk_clone_lock);
1766 
1767 void sk_free_unlock_clone(struct sock *sk)
1768 {
1769 	/* It is still a raw copy of the parent, so invalidate
1770 	 * the destructor and do a plain sk_free() */
1771 	sk->sk_destruct = NULL;
1772 	bh_unlock_sock(sk);
1773 	sk_free(sk);
1774 }
1775 EXPORT_SYMBOL_GPL(sk_free_unlock_clone);
1776 
1777 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1778 {
1779 	u32 max_segs = 1;
1780 
1781 	sk_dst_set(sk, dst);
1782 	sk->sk_route_caps = dst->dev->features | sk->sk_route_forced_caps;
1783 	if (sk->sk_route_caps & NETIF_F_GSO)
1784 		sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
1785 	sk->sk_route_caps &= ~sk->sk_route_nocaps;
1786 	if (sk_can_gso(sk)) {
1787 		if (dst->header_len && !xfrm_dst_offload_ok(dst)) {
1788 			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1789 		} else {
1790 			sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
1791 			sk->sk_gso_max_size = dst->dev->gso_max_size;
1792 			max_segs = max_t(u32, dst->dev->gso_max_segs, 1);
1793 		}
1794 	}
1795 	sk->sk_gso_max_segs = max_segs;
1796 }
1797 EXPORT_SYMBOL_GPL(sk_setup_caps);
1798 
1799 /*
1800  *	Simple resource managers for sockets.
1801  */
1802 
1803 
1804 /*
1805  * Write buffer destructor automatically called from kfree_skb.
1806  */
1807 void sock_wfree(struct sk_buff *skb)
1808 {
1809 	struct sock *sk = skb->sk;
1810 	unsigned int len = skb->truesize;
1811 
1812 	if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
1813 		/*
1814 		 * Keep a reference on sk_wmem_alloc; it will be released
1815 		 * after the sk_write_space() call
1816 		 */
1817 		WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
1818 		sk->sk_write_space(sk);
1819 		len = 1;
1820 	}
1821 	/*
1822 	 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
1823 	 * could not do because of in-flight packets
1824 	 */
1825 	if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
1826 		__sk_free(sk);
1827 }
1828 EXPORT_SYMBOL(sock_wfree);
1829 
1830 /* This variant of sock_wfree() is used by TCP,
1831  * since it sets SOCK_USE_WRITE_QUEUE.
1832  */
1833 void __sock_wfree(struct sk_buff *skb)
1834 {
1835 	struct sock *sk = skb->sk;
1836 
1837 	if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
1838 		__sk_free(sk);
1839 }
1840 
1841 void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
1842 {
1843 	skb_orphan(skb);
1844 	skb->sk = sk;
1845 #ifdef CONFIG_INET
1846 	if (unlikely(!sk_fullsock(sk))) {
1847 		skb->destructor = sock_edemux;
1848 		sock_hold(sk);
1849 		return;
1850 	}
1851 #endif
1852 	skb->destructor = sock_wfree;
1853 	skb_set_hash_from_sk(skb, sk);
1854 	/*
1855 	 * We used to take a refcount on sk, but the following operation
1856 	 * is enough to guarantee sk_free() won't free this sock until
1857 	 * all in-flight packets are completed
1858 	 */
1859 	refcount_add(skb->truesize, &sk->sk_wmem_alloc);
1860 }
1861 EXPORT_SYMBOL(skb_set_owner_w);
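
/*
 * Example (illustrative sketch): a protocol that builds its own skbs charges
 * them to the sending socket with skb_set_owner_w(); the eventual
 * kfree_skb()/consume_skb() runs sock_wfree() and returns skb->truesize to
 * sk_wmem_alloc. Most callers use sock_alloc_send_skb() or sock_wmalloc()
 * below, which do the charging for them:
 *
 *	struct sk_buff *skb = alloc_skb(len, sk->sk_allocation);
 *
 *	if (skb)
 *		skb_set_owner_w(skb, sk);
 */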
1862 
1863 /* This helper is used by netem, as it can hold packets in its
1864  * delay queue. We want to allow the owner socket to send more
1865  * packets, as if they were already TX completed by a typical driver.
1866  * But we also want to keep skb->sk set because some packet schedulers
1867  * rely on it (sch_fq for example).
1868  */
1869 void skb_orphan_partial(struct sk_buff *skb)
1870 {
1871 	if (skb_is_tcp_pure_ack(skb))
1872 		return;
1873 
1874 	if (skb->destructor == sock_wfree
1875 #ifdef CONFIG_INET
1876 	    || skb->destructor == tcp_wfree
1877 #endif
1878 		) {
1879 		struct sock *sk = skb->sk;
1880 
1881 		if (refcount_inc_not_zero(&sk->sk_refcnt)) {
1882 			WARN_ON(refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc));
1883 			skb->destructor = sock_efree;
1884 		}
1885 	} else {
1886 		skb_orphan(skb);
1887 	}
1888 }
1889 EXPORT_SYMBOL(skb_orphan_partial);
1890 
1891 /*
1892  * Read buffer destructor automatically called from kfree_skb.
1893  */
1894 void sock_rfree(struct sk_buff *skb)
1895 {
1896 	struct sock *sk = skb->sk;
1897 	unsigned int len = skb->truesize;
1898 
1899 	atomic_sub(len, &sk->sk_rmem_alloc);
1900 	sk_mem_uncharge(sk, len);
1901 }
1902 EXPORT_SYMBOL(sock_rfree);
1903 
1904 /*
1905  * Buffer destructor for skbs that are not used directly in read or write
1906  * path, e.g. for error handler skbs. Automatically called from kfree_skb.
1907  */
1908 void sock_efree(struct sk_buff *skb)
1909 {
1910 	sock_put(skb->sk);
1911 }
1912 EXPORT_SYMBOL(sock_efree);
1913 
1914 kuid_t sock_i_uid(struct sock *sk)
1915 {
1916 	kuid_t uid;
1917 
1918 	read_lock_bh(&sk->sk_callback_lock);
1919 	uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
1920 	read_unlock_bh(&sk->sk_callback_lock);
1921 	return uid;
1922 }
1923 EXPORT_SYMBOL(sock_i_uid);
1924 
1925 unsigned long sock_i_ino(struct sock *sk)
1926 {
1927 	unsigned long ino;
1928 
1929 	read_lock_bh(&sk->sk_callback_lock);
1930 	ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
1931 	read_unlock_bh(&sk->sk_callback_lock);
1932 	return ino;
1933 }
1934 EXPORT_SYMBOL(sock_i_ino);
1935 
1936 /*
1937  * Allocate an skb from the socket's send buffer.
1938  */
1939 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
1940 			     gfp_t priority)
1941 {
1942 	if (force || refcount_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1943 		struct sk_buff *skb = alloc_skb(size, priority);
1944 		if (skb) {
1945 			skb_set_owner_w(skb, sk);
1946 			return skb;
1947 		}
1948 	}
1949 	return NULL;
1950 }
1951 EXPORT_SYMBOL(sock_wmalloc);
1952 
1953 static void sock_ofree(struct sk_buff *skb)
1954 {
1955 	struct sock *sk = skb->sk;
1956 
1957 	atomic_sub(skb->truesize, &sk->sk_omem_alloc);
1958 }
1959 
1960 struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
1961 			     gfp_t priority)
1962 {
1963 	struct sk_buff *skb;
1964 
1965 	/* small safe race: SKB_TRUESIZE may differ from final skb->truesize */
1966 	if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) >
1967 	    sysctl_optmem_max)
1968 		return NULL;
1969 
1970 	skb = alloc_skb(size, priority);
1971 	if (!skb)
1972 		return NULL;
1973 
1974 	atomic_add(skb->truesize, &sk->sk_omem_alloc);
1975 	skb->sk = sk;
1976 	skb->destructor = sock_ofree;
1977 	return skb;
1978 }
1979 
1980 /*
1981  * Allocate a memory block from the socket's option memory buffer.
1982  */
1983 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
1984 {
1985 	if ((unsigned int)size <= sysctl_optmem_max &&
1986 	    atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
1987 		void *mem;
1988 		/* Do the add first, to avoid a race in case kmalloc
1989 		 * sleeps.
1990 		 */
1991 		atomic_add(size, &sk->sk_omem_alloc);
1992 		mem = kmalloc(size, priority);
1993 		if (mem)
1994 			return mem;
1995 		atomic_sub(size, &sk->sk_omem_alloc);
1996 	}
1997 	return NULL;
1998 }
1999 EXPORT_SYMBOL(sock_kmalloc);
2000 
2001 /* Free an option memory block. Note, we actually want the inline
2002  * here as this allows gcc to detect the nullify and fold away the
2003  * condition entirely.
2004  */
2005 static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
2006 				  const bool nullify)
2007 {
2008 	if (WARN_ON_ONCE(!mem))
2009 		return;
2010 	if (nullify)
2011 		kzfree(mem);
2012 	else
2013 		kfree(mem);
2014 	atomic_sub(size, &sk->sk_omem_alloc);
2015 }
2016 
2017 void sock_kfree_s(struct sock *sk, void *mem, int size)
2018 {
2019 	__sock_kfree_s(sk, mem, size, false);
2020 }
2021 EXPORT_SYMBOL(sock_kfree_s);
2022 
2023 void sock_kzfree_s(struct sock *sk, void *mem, int size)
2024 {
2025 	__sock_kfree_s(sk, mem, size, true);
2026 }
2027 EXPORT_SYMBOL(sock_kzfree_s);
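
/*
 * Illustrative sketch (not part of this file): option handlers usually pair
 * sock_kmalloc() with sock_kfree_s() so the allocation stays accounted in
 * sk_omem_alloc; "opt" and "optlen" are made-up names here:
 *
 *	opt = sock_kmalloc(sk, optlen, GFP_KERNEL);
 *	if (!opt)
 *		return -ENOBUFS;
 *	...
 *	sock_kfree_s(sk, opt, optlen);
 */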
2028 
2029 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
2030    I think these locks should be removed for datagram sockets.
2031  */
2032 static long sock_wait_for_wmem(struct sock *sk, long timeo)
2033 {
2034 	DEFINE_WAIT(wait);
2035 
2036 	sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2037 	for (;;) {
2038 		if (!timeo)
2039 			break;
2040 		if (signal_pending(current))
2041 			break;
2042 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2043 		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
2044 		if (refcount_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
2045 			break;
2046 		if (sk->sk_shutdown & SEND_SHUTDOWN)
2047 			break;
2048 		if (sk->sk_err)
2049 			break;
2050 		timeo = schedule_timeout(timeo);
2051 	}
2052 	finish_wait(sk_sleep(sk), &wait);
2053 	return timeo;
2054 }
2055 
2056 
2057 /*
2058  *	Generic send/receive buffer handlers
2059  */
2060 
2061 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
2062 				     unsigned long data_len, int noblock,
2063 				     int *errcode, int max_page_order)
2064 {
2065 	struct sk_buff *skb;
2066 	long timeo;
2067 	int err;
2068 
2069 	timeo = sock_sndtimeo(sk, noblock);
2070 	for (;;) {
2071 		err = sock_error(sk);
2072 		if (err != 0)
2073 			goto failure;
2074 
2075 		err = -EPIPE;
2076 		if (sk->sk_shutdown & SEND_SHUTDOWN)
2077 			goto failure;
2078 
2079 		if (sk_wmem_alloc_get(sk) < sk->sk_sndbuf)
2080 			break;
2081 
2082 		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2083 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2084 		err = -EAGAIN;
2085 		if (!timeo)
2086 			goto failure;
2087 		if (signal_pending(current))
2088 			goto interrupted;
2089 		timeo = sock_wait_for_wmem(sk, timeo);
2090 	}
2091 	skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
2092 				   errcode, sk->sk_allocation);
2093 	if (skb)
2094 		skb_set_owner_w(skb, sk);
2095 	return skb;
2096 
2097 interrupted:
2098 	err = sock_intr_errno(timeo);
2099 failure:
2100 	*errcode = err;
2101 	return NULL;
2102 }
2103 EXPORT_SYMBOL(sock_alloc_send_pskb);
2104 
2105 struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
2106 				    int noblock, int *errcode)
2107 {
2108 	return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0);
2109 }
2110 EXPORT_SYMBOL(sock_alloc_send_skb);
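
/*
 * Illustrative sketch (not part of this file): a datagram sendmsg()
 * implementation typically lets this helper enforce sk_sndbuf and the send
 * timeout; "hlen" and the "out" label are assumptions:
 *
 *	skb = sock_alloc_send_skb(sk, hlen + len,
 *				  msg->msg_flags & MSG_DONTWAIT, &err);
 *	if (!skb)
 *		goto out;
 */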
2111 
2112 int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg,
2113 		     struct sockcm_cookie *sockc)
2114 {
2115 	u32 tsflags;
2116 
2117 	switch (cmsg->cmsg_type) {
2118 	case SO_MARK:
2119 		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2120 			return -EPERM;
2121 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2122 			return -EINVAL;
2123 		sockc->mark = *(u32 *)CMSG_DATA(cmsg);
2124 		break;
2125 	case SO_TIMESTAMPING:
2126 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2127 			return -EINVAL;
2128 
2129 		tsflags = *(u32 *)CMSG_DATA(cmsg);
2130 		if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
2131 			return -EINVAL;
2132 
2133 		sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
2134 		sockc->tsflags |= tsflags;
2135 		break;
2136 	/* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
2137 	case SCM_RIGHTS:
2138 	case SCM_CREDENTIALS:
2139 		break;
2140 	default:
2141 		return -EINVAL;
2142 	}
2143 	return 0;
2144 }
2145 EXPORT_SYMBOL(__sock_cmsg_send);
2146 
2147 int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
2148 		   struct sockcm_cookie *sockc)
2149 {
2150 	struct cmsghdr *cmsg;
2151 	int ret;
2152 
2153 	for_each_cmsghdr(cmsg, msg) {
2154 		if (!CMSG_OK(msg, cmsg))
2155 			return -EINVAL;
2156 		if (cmsg->cmsg_level != SOL_SOCKET)
2157 			continue;
2158 		ret = __sock_cmsg_send(sk, msg, cmsg, sockc);
2159 		if (ret)
2160 			return ret;
2161 	}
2162 	return 0;
2163 }
2164 EXPORT_SYMBOL(sock_cmsg_send);
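
/*
 * Illustrative sketch (not part of this file): a sendmsg() path that honours
 * SOL_SOCKET control messages (SO_MARK, SO_TIMESTAMPING) can seed the cookie
 * from the socket and let sock_cmsg_send() override it:
 *
 *	struct sockcm_cookie sockc = { .tsflags = sk->sk_tsflags };
 *
 *	if (msg->msg_controllen) {
 *		err = sock_cmsg_send(sk, msg, &sockc);
 *		if (unlikely(err))
 *			return err;
 *	}
 */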
2165 
2166 static void sk_enter_memory_pressure(struct sock *sk)
2167 {
2168 	if (!sk->sk_prot->enter_memory_pressure)
2169 		return;
2170 
2171 	sk->sk_prot->enter_memory_pressure(sk);
2172 }
2173 
2174 static void sk_leave_memory_pressure(struct sock *sk)
2175 {
2176 	if (sk->sk_prot->leave_memory_pressure) {
2177 		sk->sk_prot->leave_memory_pressure(sk);
2178 	} else {
2179 		unsigned long *memory_pressure = sk->sk_prot->memory_pressure;
2180 
2181 		if (memory_pressure && *memory_pressure)
2182 			*memory_pressure = 0;
2183 	}
2184 }
2185 
2186 /* On 32bit arches, an skb frag is limited to 2^15 */
2187 #define SKB_FRAG_PAGE_ORDER	get_order(32768)
2188 
2189 /**
2190  * skb_page_frag_refill - check that a page_frag contains enough room
2191  * @sz: minimum size of the fragment we want to get
2192  * @pfrag: pointer to page_frag
2193  * @gfp: priority for memory allocation
2194  *
2195  * Note: While this allocator tries to use high order pages, there is
2196  * no guarantee that allocations succeed. Therefore, @sz MUST be
2197  * less than or equal to PAGE_SIZE.
2198  */
2199 bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
2200 {
2201 	if (pfrag->page) {
2202 		if (page_ref_count(pfrag->page) == 1) {
2203 			pfrag->offset = 0;
2204 			return true;
2205 		}
2206 		if (pfrag->offset + sz <= pfrag->size)
2207 			return true;
2208 		put_page(pfrag->page);
2209 	}
2210 
2211 	pfrag->offset = 0;
2212 	if (SKB_FRAG_PAGE_ORDER) {
2213 		/* Avoid direct reclaim but allow kswapd to wake */
2214 		pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
2215 					  __GFP_COMP | __GFP_NOWARN |
2216 					  __GFP_NORETRY,
2217 					  SKB_FRAG_PAGE_ORDER);
2218 		if (likely(pfrag->page)) {
2219 			pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
2220 			return true;
2221 		}
2222 	}
2223 	pfrag->page = alloc_page(gfp);
2224 	if (likely(pfrag->page)) {
2225 		pfrag->size = PAGE_SIZE;
2226 		return true;
2227 	}
2228 	return false;
2229 }
2230 EXPORT_SYMBOL(skb_page_frag_refill);
2231 
2232 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
2233 {
2234 	if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
2235 		return true;
2236 
2237 	sk_enter_memory_pressure(sk);
2238 	sk_stream_moderate_sndbuf(sk);
2239 	return false;
2240 }
2241 EXPORT_SYMBOL(sk_page_frag_refill);
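
/*
 * Illustrative sketch (not part of this file): a typical copy path refills
 * the per-socket page_frag and appends data at pfrag->offset; "copy", "from"
 * and the "wait_for_memory" label are assumptions:
 *
 *	struct page_frag *pfrag = sk_page_frag(sk);
 *
 *	if (!sk_page_frag_refill(sk, pfrag))
 *		goto wait_for_memory;
 *	copy = min_t(int, len, pfrag->size - pfrag->offset);
 *	memcpy(page_address(pfrag->page) + pfrag->offset, from, copy);
 *	pfrag->offset += copy;
 */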
2242 
2243 int sk_alloc_sg(struct sock *sk, int len, struct scatterlist *sg,
2244 		int sg_start, int *sg_curr_index, unsigned int *sg_curr_size,
2245 		int first_coalesce)
2246 {
2247 	int sg_curr = *sg_curr_index, use = 0, rc = 0;
2248 	unsigned int size = *sg_curr_size;
2249 	struct page_frag *pfrag;
2250 	struct scatterlist *sge;
2251 
2252 	len -= size;
2253 	pfrag = sk_page_frag(sk);
2254 
2255 	while (len > 0) {
2256 		unsigned int orig_offset;
2257 
2258 		if (!sk_page_frag_refill(sk, pfrag)) {
2259 			rc = -ENOMEM;
2260 			goto out;
2261 		}
2262 
2263 		use = min_t(int, len, pfrag->size - pfrag->offset);
2264 
2265 		if (!sk_wmem_schedule(sk, use)) {
2266 			rc = -ENOMEM;
2267 			goto out;
2268 		}
2269 
2270 		sk_mem_charge(sk, use);
2271 		size += use;
2272 		orig_offset = pfrag->offset;
2273 		pfrag->offset += use;
2274 
2275 		sge = sg + sg_curr - 1;
2276 		if (sg_curr > first_coalesce && sg_page(sge) == pfrag->page &&
2277 		    sge->offset + sge->length == orig_offset) {
2278 			sge->length += use;
2279 		} else {
2280 			sge = sg + sg_curr;
2281 			sg_unmark_end(sge);
2282 			sg_set_page(sge, pfrag->page, use, orig_offset);
2283 			get_page(pfrag->page);
2284 			sg_curr++;
2285 
2286 			if (sg_curr == MAX_SKB_FRAGS)
2287 				sg_curr = 0;
2288 
2289 			if (sg_curr == sg_start) {
2290 				rc = -ENOSPC;
2291 				break;
2292 			}
2293 		}
2294 
2295 		len -= use;
2296 	}
2297 out:
2298 	*sg_curr_size = size;
2299 	*sg_curr_index = sg_curr;
2300 	return rc;
2301 }
2302 EXPORT_SYMBOL(sk_alloc_sg);
2303 
2304 static void __lock_sock(struct sock *sk)
2305 	__releases(&sk->sk_lock.slock)
2306 	__acquires(&sk->sk_lock.slock)
2307 {
2308 	DEFINE_WAIT(wait);
2309 
2310 	for (;;) {
2311 		prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
2312 					TASK_UNINTERRUPTIBLE);
2313 		spin_unlock_bh(&sk->sk_lock.slock);
2314 		schedule();
2315 		spin_lock_bh(&sk->sk_lock.slock);
2316 		if (!sock_owned_by_user(sk))
2317 			break;
2318 	}
2319 	finish_wait(&sk->sk_lock.wq, &wait);
2320 }
2321 
2322 static void __release_sock(struct sock *sk)
2323 	__releases(&sk->sk_lock.slock)
2324 	__acquires(&sk->sk_lock.slock)
2325 {
2326 	struct sk_buff *skb, *next;
2327 
2328 	while ((skb = sk->sk_backlog.head) != NULL) {
2329 		sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
2330 
2331 		spin_unlock_bh(&sk->sk_lock.slock);
2332 
2333 		do {
2334 			next = skb->next;
2335 			prefetch(next);
2336 			WARN_ON_ONCE(skb_dst_is_noref(skb));
2337 			skb->next = NULL;
2338 			sk_backlog_rcv(sk, skb);
2339 
2340 			cond_resched();
2341 
2342 			skb = next;
2343 		} while (skb != NULL);
2344 
2345 		spin_lock_bh(&sk->sk_lock.slock);
2346 	}
2347 
2348 	/*
2349 	 * Doing the zeroing here guarantees we cannot loop forever
2350 	 * while a wild producer attempts to flood us.
2351 	 */
2352 	sk->sk_backlog.len = 0;
2353 }
2354 
2355 void __sk_flush_backlog(struct sock *sk)
2356 {
2357 	spin_lock_bh(&sk->sk_lock.slock);
2358 	__release_sock(sk);
2359 	spin_unlock_bh(&sk->sk_lock.slock);
2360 }
2361 
2362 /**
2363  * sk_wait_data - wait for data to arrive at sk_receive_queue
2364  * @sk:    sock to wait on
2365  * @timeo: for how long
2366  * @skb:   last skb seen on sk_receive_queue
2367  *
2368  * Socket state, including sk->sk_err, is now changed only under the
2369  * socket lock, hence we may omit checks after joining the wait queue.
2370  * We check the receive queue before schedule() only as an optimization;
2371  * it is very likely that release_sock() added new data.
2372  */
2373 int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
2374 {
2375 	DEFINE_WAIT_FUNC(wait, woken_wake_function);
2376 	int rc;
2377 
2378 	add_wait_queue(sk_sleep(sk), &wait);
2379 	sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2380 	rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
2381 	sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2382 	remove_wait_queue(sk_sleep(sk), &wait);
2383 	return rc;
2384 }
2385 EXPORT_SYMBOL(sk_wait_data);
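
/*
 * Illustrative sketch (not part of this file): a blocking recvmsg() commonly
 * loops on the receive queue with the socket lock held; the loop structure
 * is an assumption:
 *
 *	timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
 *	while (!(skb = skb_peek(&sk->sk_receive_queue))) {
 *		if (!timeo || signal_pending(current))
 *			break;
 *		sk_wait_data(sk, &timeo, NULL);
 *	}
 */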
2386 
2387 /**
2388  *	__sk_mem_raise_allocated - increase memory_allocated
2389  *	@sk: socket
2390  *	@size: memory size to allocate
2391  *	@amt: pages to allocate
2392  *	@kind: allocation type
2393  *
2394  *	Similar to __sk_mem_schedule(), but does not update sk_forward_alloc
2395  */
2396 int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
2397 {
2398 	struct proto *prot = sk->sk_prot;
2399 	long allocated = sk_memory_allocated_add(sk, amt);
2400 
2401 	if (mem_cgroup_sockets_enabled && sk->sk_memcg &&
2402 	    !mem_cgroup_charge_skmem(sk->sk_memcg, amt))
2403 		goto suppress_allocation;
2404 
2405 	/* Under limit. */
2406 	if (allocated <= sk_prot_mem_limits(sk, 0)) {
2407 		sk_leave_memory_pressure(sk);
2408 		return 1;
2409 	}
2410 
2411 	/* Under pressure. */
2412 	if (allocated > sk_prot_mem_limits(sk, 1))
2413 		sk_enter_memory_pressure(sk);
2414 
2415 	/* Over hard limit. */
2416 	if (allocated > sk_prot_mem_limits(sk, 2))
2417 		goto suppress_allocation;
2418 
2419 	/* guarantee minimum buffer size under pressure */
2420 	if (kind == SK_MEM_RECV) {
2421 		if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot))
2422 			return 1;
2423 
2424 	} else { /* SK_MEM_SEND */
2425 		int wmem0 = sk_get_wmem0(sk, prot);
2426 
2427 		if (sk->sk_type == SOCK_STREAM) {
2428 			if (sk->sk_wmem_queued < wmem0)
2429 				return 1;
2430 		} else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) {
2431 			return 1;
2432 		}
2433 	}
2434 
2435 	if (sk_has_memory_pressure(sk)) {
2436 		int alloc;
2437 
2438 		if (!sk_under_memory_pressure(sk))
2439 			return 1;
2440 		alloc = sk_sockets_allocated_read_positive(sk);
2441 		if (sk_prot_mem_limits(sk, 2) > alloc *
2442 		    sk_mem_pages(sk->sk_wmem_queued +
2443 				 atomic_read(&sk->sk_rmem_alloc) +
2444 				 sk->sk_forward_alloc))
2445 			return 1;
2446 	}
2447 
2448 suppress_allocation:
2449 
2450 	if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
2451 		sk_stream_moderate_sndbuf(sk);
2452 
2453 		/* Fail only if socket is _under_ its sndbuf.
2454 		 * In this case we cannot block, so we have to fail.
2455 		 */
2456 		if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
2457 			return 1;
2458 	}
2459 
2460 	trace_sock_exceed_buf_limit(sk, prot, allocated);
2461 
2462 	sk_memory_allocated_sub(sk, amt);
2463 
2464 	if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2465 		mem_cgroup_uncharge_skmem(sk->sk_memcg, amt);
2466 
2467 	return 0;
2468 }
2469 EXPORT_SYMBOL(__sk_mem_raise_allocated);
2470 
2471 /**
2472  *	__sk_mem_schedule - increase sk_forward_alloc and memory_allocated
2473  *	@sk: socket
2474  *	@size: memory size to allocate
2475  *	@kind: allocation type
2476  *
2477  *	If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
2478  *	rmem allocation. This function assumes that protocols which have
2479  *	memory_pressure use sk_wmem_queued as write buffer accounting.
2480  */
2481 int __sk_mem_schedule(struct sock *sk, int size, int kind)
2482 {
2483 	int ret, amt = sk_mem_pages(size);
2484 
2485 	sk->sk_forward_alloc += amt << SK_MEM_QUANTUM_SHIFT;
2486 	ret = __sk_mem_raise_allocated(sk, size, amt, kind);
2487 	if (!ret)
2488 		sk->sk_forward_alloc -= amt << SK_MEM_QUANTUM_SHIFT;
2489 	return ret;
2490 }
2491 EXPORT_SYMBOL(__sk_mem_schedule);
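
/*
 * Illustrative sketch (not part of this file): callers normally go through
 * the sk_wmem_schedule()/sk_rmem_schedule() wrappers and pair the charge
 * with sk_mem_charge()/sk_mem_uncharge():
 *
 *	if (!sk_wmem_schedule(sk, skb->truesize))
 *		return -ENOMEM;
 *	sk_mem_charge(sk, skb->truesize);
 *	...
 *	sk_mem_uncharge(sk, skb->truesize);
 */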
2492 
2493 /**
2494  *	__sk_mem_reduce_allocated - reclaim memory_allocated
2495  *	@sk: socket
2496  *	@amount: number of quanta
2497  *
2498  *	Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
2499  */
2500 void __sk_mem_reduce_allocated(struct sock *sk, int amount)
2501 {
2502 	sk_memory_allocated_sub(sk, amount);
2503 
2504 	if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2505 		mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
2506 
2507 	if (sk_under_memory_pressure(sk) &&
2508 	    (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
2509 		sk_leave_memory_pressure(sk);
2510 }
2511 EXPORT_SYMBOL(__sk_mem_reduce_allocated);
2512 
2513 /**
2514  *	__sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
2515  *	@sk: socket
2516  *	@amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple)
2517  */
2518 void __sk_mem_reclaim(struct sock *sk, int amount)
2519 {
2520 	amount >>= SK_MEM_QUANTUM_SHIFT;
2521 	sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
2522 	__sk_mem_reduce_allocated(sk, amount);
2523 }
2524 EXPORT_SYMBOL(__sk_mem_reclaim);
2525 
2526 int sk_set_peek_off(struct sock *sk, int val)
2527 {
2528 	sk->sk_peek_off = val;
2529 	return 0;
2530 }
2531 EXPORT_SYMBOL_GPL(sk_set_peek_off);
2532 
2533 /*
2534  * Set of default routines for initialising struct proto_ops when
2535  * the protocol does not support a particular function. In certain
2536  * cases where it makes no sense for a protocol to have a "do nothing"
2537  * function, some default processing is provided.
2538  */
2539 
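/*
 * Illustrative sketch (not part of this file): a protocol without connection
 * semantics can plug these stubs straight into its proto_ops; "PF_FOO" and
 * "foo_ops" are made-up names:
 *
 *	static const struct proto_ops foo_ops = {
 *		.family		= PF_FOO,
 *		.owner		= THIS_MODULE,
 *		.listen		= sock_no_listen,
 *		.accept		= sock_no_accept,
 *		.socketpair	= sock_no_socketpair,
 *		.mmap		= sock_no_mmap,
 *	};
 */
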
2540 int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
2541 {
2542 	return -EOPNOTSUPP;
2543 }
2544 EXPORT_SYMBOL(sock_no_bind);
2545 
2546 int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
2547 		    int len, int flags)
2548 {
2549 	return -EOPNOTSUPP;
2550 }
2551 EXPORT_SYMBOL(sock_no_connect);
2552 
2553 int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
2554 {
2555 	return -EOPNOTSUPP;
2556 }
2557 EXPORT_SYMBOL(sock_no_socketpair);
2558 
2559 int sock_no_accept(struct socket *sock, struct socket *newsock, int flags,
2560 		   bool kern)
2561 {
2562 	return -EOPNOTSUPP;
2563 }
2564 EXPORT_SYMBOL(sock_no_accept);
2565 
2566 int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
2567 		    int peer)
2568 {
2569 	return -EOPNOTSUPP;
2570 }
2571 EXPORT_SYMBOL(sock_no_getname);
2572 
2573 __poll_t sock_no_poll(struct file *file, struct socket *sock, poll_table *pt)
2574 {
2575 	return 0;
2576 }
2577 EXPORT_SYMBOL(sock_no_poll);
2578 
2579 int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2580 {
2581 	return -EOPNOTSUPP;
2582 }
2583 EXPORT_SYMBOL(sock_no_ioctl);
2584 
2585 int sock_no_listen(struct socket *sock, int backlog)
2586 {
2587 	return -EOPNOTSUPP;
2588 }
2589 EXPORT_SYMBOL(sock_no_listen);
2590 
2591 int sock_no_shutdown(struct socket *sock, int how)
2592 {
2593 	return -EOPNOTSUPP;
2594 }
2595 EXPORT_SYMBOL(sock_no_shutdown);
2596 
2597 int sock_no_setsockopt(struct socket *sock, int level, int optname,
2598 		    char __user *optval, unsigned int optlen)
2599 {
2600 	return -EOPNOTSUPP;
2601 }
2602 EXPORT_SYMBOL(sock_no_setsockopt);
2603 
2604 int sock_no_getsockopt(struct socket *sock, int level, int optname,
2605 		    char __user *optval, int __user *optlen)
2606 {
2607 	return -EOPNOTSUPP;
2608 }
2609 EXPORT_SYMBOL(sock_no_getsockopt);
2610 
2611 int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
2612 {
2613 	return -EOPNOTSUPP;
2614 }
2615 EXPORT_SYMBOL(sock_no_sendmsg);
2616 
2617 int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len)
2618 {
2619 	return -EOPNOTSUPP;
2620 }
2621 EXPORT_SYMBOL(sock_no_sendmsg_locked);
2622 
2623 int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
2624 		    int flags)
2625 {
2626 	return -EOPNOTSUPP;
2627 }
2628 EXPORT_SYMBOL(sock_no_recvmsg);
2629 
2630 int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
2631 {
2632 	/* Mirror missing mmap method error code */
2633 	return -ENODEV;
2634 }
2635 EXPORT_SYMBOL(sock_no_mmap);
2636 
2637 ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
2638 {
2639 	ssize_t res;
2640 	struct msghdr msg = {.msg_flags = flags};
2641 	struct kvec iov;
2642 	char *kaddr = kmap(page);
2643 	iov.iov_base = kaddr + offset;
2644 	iov.iov_len = size;
2645 	res = kernel_sendmsg(sock, &msg, &iov, 1, size);
2646 	kunmap(page);
2647 	return res;
2648 }
2649 EXPORT_SYMBOL(sock_no_sendpage);
2650 
2651 ssize_t sock_no_sendpage_locked(struct sock *sk, struct page *page,
2652 				int offset, size_t size, int flags)
2653 {
2654 	ssize_t res;
2655 	struct msghdr msg = {.msg_flags = flags};
2656 	struct kvec iov;
2657 	char *kaddr = kmap(page);
2658 
2659 	iov.iov_base = kaddr + offset;
2660 	iov.iov_len = size;
2661 	res = kernel_sendmsg_locked(sk, &msg, &iov, 1, size);
2662 	kunmap(page);
2663 	return res;
2664 }
2665 EXPORT_SYMBOL(sock_no_sendpage_locked);
2666 
2667 /*
2668  *	Default Socket Callbacks
2669  */
2670 
2671 static void sock_def_wakeup(struct sock *sk)
2672 {
2673 	struct socket_wq *wq;
2674 
2675 	rcu_read_lock();
2676 	wq = rcu_dereference(sk->sk_wq);
2677 	if (skwq_has_sleeper(wq))
2678 		wake_up_interruptible_all(&wq->wait);
2679 	rcu_read_unlock();
2680 }
2681 
2682 static void sock_def_error_report(struct sock *sk)
2683 {
2684 	struct socket_wq *wq;
2685 
2686 	rcu_read_lock();
2687 	wq = rcu_dereference(sk->sk_wq);
2688 	if (skwq_has_sleeper(wq))
2689 		wake_up_interruptible_poll(&wq->wait, EPOLLERR);
2690 	sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
2691 	rcu_read_unlock();
2692 }
2693 
2694 static void sock_def_readable(struct sock *sk)
2695 {
2696 	struct socket_wq *wq;
2697 
2698 	rcu_read_lock();
2699 	wq = rcu_dereference(sk->sk_wq);
2700 	if (skwq_has_sleeper(wq))
2701 		wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
2702 						EPOLLRDNORM | EPOLLRDBAND);
2703 	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
2704 	rcu_read_unlock();
2705 }
2706 
2707 static void sock_def_write_space(struct sock *sk)
2708 {
2709 	struct socket_wq *wq;
2710 
2711 	rcu_read_lock();
2712 
2713 	/* Do not wake up a writer until he can make "significant"
2714 	 * progress.  --DaveM
2715 	 */
2716 	if ((refcount_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
2717 		wq = rcu_dereference(sk->sk_wq);
2718 		if (skwq_has_sleeper(wq))
2719 			wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
2720 						EPOLLWRNORM | EPOLLWRBAND);
2721 
2722 		/* Should agree with poll, otherwise some programs break */
2723 		if (sock_writeable(sk))
2724 			sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
2725 	}
2726 
2727 	rcu_read_unlock();
2728 }
2729 
2730 static void sock_def_destruct(struct sock *sk)
2731 {
2732 }
2733 
2734 void sk_send_sigurg(struct sock *sk)
2735 {
2736 	if (sk->sk_socket && sk->sk_socket->file)
2737 		if (send_sigurg(&sk->sk_socket->file->f_owner))
2738 			sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
2739 }
2740 EXPORT_SYMBOL(sk_send_sigurg);
2741 
2742 void sk_reset_timer(struct sock *sk, struct timer_list* timer,
2743 		    unsigned long expires)
2744 {
2745 	if (!mod_timer(timer, expires))
2746 		sock_hold(sk);
2747 }
2748 EXPORT_SYMBOL(sk_reset_timer);
2749 
2750 void sk_stop_timer(struct sock *sk, struct timer_list* timer)
2751 {
2752 	if (del_timer(timer))
2753 		__sock_put(sk);
2754 }
2755 EXPORT_SYMBOL(sk_stop_timer);
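
/*
 * Illustrative sketch (not part of this file): these helpers keep a socket
 * reference for as long as the timer is pending, so arming and stopping must
 * be paired; "TIMEOUT" is an assumption:
 *
 *	sk_reset_timer(sk, &sk->sk_timer, jiffies + TIMEOUT);
 *	...
 *	sk_stop_timer(sk, &sk->sk_timer);
 */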
2756 
2757 void sock_init_data(struct socket *sock, struct sock *sk)
2758 {
2759 	sk_init_common(sk);
2760 	sk->sk_send_head	=	NULL;
2761 
2762 	timer_setup(&sk->sk_timer, NULL, 0);
2763 
2764 	sk->sk_allocation	=	GFP_KERNEL;
2765 	sk->sk_rcvbuf		=	sysctl_rmem_default;
2766 	sk->sk_sndbuf		=	sysctl_wmem_default;
2767 	sk->sk_state		=	TCP_CLOSE;
2768 	sk_set_socket(sk, sock);
2769 
2770 	sock_set_flag(sk, SOCK_ZAPPED);
2771 
2772 	if (sock) {
2773 		sk->sk_type	=	sock->type;
2774 		sk->sk_wq	=	sock->wq;
2775 		sock->sk	=	sk;
2776 		sk->sk_uid	=	SOCK_INODE(sock)->i_uid;
2777 	} else {
2778 		sk->sk_wq	=	NULL;
2779 		sk->sk_uid	=	make_kuid(sock_net(sk)->user_ns, 0);
2780 	}
2781 
2782 	rwlock_init(&sk->sk_callback_lock);
2783 	if (sk->sk_kern_sock)
2784 		lockdep_set_class_and_name(
2785 			&sk->sk_callback_lock,
2786 			af_kern_callback_keys + sk->sk_family,
2787 			af_family_kern_clock_key_strings[sk->sk_family]);
2788 	else
2789 		lockdep_set_class_and_name(
2790 			&sk->sk_callback_lock,
2791 			af_callback_keys + sk->sk_family,
2792 			af_family_clock_key_strings[sk->sk_family]);
2793 
2794 	sk->sk_state_change	=	sock_def_wakeup;
2795 	sk->sk_data_ready	=	sock_def_readable;
2796 	sk->sk_write_space	=	sock_def_write_space;
2797 	sk->sk_error_report	=	sock_def_error_report;
2798 	sk->sk_destruct		=	sock_def_destruct;
2799 
2800 	sk->sk_frag.page	=	NULL;
2801 	sk->sk_frag.offset	=	0;
2802 	sk->sk_peek_off		=	-1;
2803 
2804 	sk->sk_peer_pid 	=	NULL;
2805 	sk->sk_peer_cred	=	NULL;
2806 	sk->sk_write_pending	=	0;
2807 	sk->sk_rcvlowat		=	1;
2808 	sk->sk_rcvtimeo		=	MAX_SCHEDULE_TIMEOUT;
2809 	sk->sk_sndtimeo		=	MAX_SCHEDULE_TIMEOUT;
2810 
2811 	sk->sk_stamp = SK_DEFAULT_STAMP;
2812 	atomic_set(&sk->sk_zckey, 0);
2813 
2814 #ifdef CONFIG_NET_RX_BUSY_POLL
2815 	sk->sk_napi_id		=	0;
2816 	sk->sk_ll_usec		=	sysctl_net_busy_read;
2817 #endif
2818 
2819 	sk->sk_max_pacing_rate = ~0U;
2820 	sk->sk_pacing_rate = ~0U;
2821 	sk->sk_pacing_shift = 10;
2822 	sk->sk_incoming_cpu = -1;
2823 	/*
2824 	 * Before updating sk_refcnt, we must commit prior changes to memory
2825 	 * (Documentation/RCU/rculist_nulls.txt for details)
2826 	 */
2827 	smp_wmb();
2828 	refcount_set(&sk->sk_refcnt, 1);
2829 	atomic_set(&sk->sk_drops, 0);
2830 }
2831 EXPORT_SYMBOL(sock_init_data);
2832 
2833 void lock_sock_nested(struct sock *sk, int subclass)
2834 {
2835 	might_sleep();
2836 	spin_lock_bh(&sk->sk_lock.slock);
2837 	if (sk->sk_lock.owned)
2838 		__lock_sock(sk);
2839 	sk->sk_lock.owned = 1;
2840 	spin_unlock(&sk->sk_lock.slock);
2841 	/*
2842 	 * The sk_lock has mutex_lock() semantics here:
2843 	 */
2844 	mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
2845 	local_bh_enable();
2846 }
2847 EXPORT_SYMBOL(lock_sock_nested);
2848 
2849 void release_sock(struct sock *sk)
2850 {
2851 	spin_lock_bh(&sk->sk_lock.slock);
2852 	if (sk->sk_backlog.tail)
2853 		__release_sock(sk);
2854 
2855 	/* Warning: release_cb() might need to release sk ownership,
2856 	 * i.e. call sock_release_ownership(sk) before us.
2857 	 */
2858 	if (sk->sk_prot->release_cb)
2859 		sk->sk_prot->release_cb(sk);
2860 
2861 	sock_release_ownership(sk);
2862 	if (waitqueue_active(&sk->sk_lock.wq))
2863 		wake_up(&sk->sk_lock.wq);
2864 	spin_unlock_bh(&sk->sk_lock.slock);
2865 }
2866 EXPORT_SYMBOL(release_sock);
2867 
2868 /**
2869  * lock_sock_fast - fast version of lock_sock
2870  * @sk: socket
2871  *
2872  * This version should be used for very small sections, where the process
2873  * won't block. Return false if the fast path is taken:
2874  *
2875  *   sk_lock.slock locked, owned = 0, BH disabled
2876  *
2877  * Return true if the slow path is taken:
2878  *
2879  *   sk_lock.slock unlocked, owned = 1, BH enabled
2880  */
2881 bool lock_sock_fast(struct sock *sk)
2882 {
2883 	might_sleep();
2884 	spin_lock_bh(&sk->sk_lock.slock);
2885 
2886 	if (!sk->sk_lock.owned)
2887 		/*
2888 		 * Note : We must disable BH
2889 		 */
2890 		return false;
2891 
2892 	__lock_sock(sk);
2893 	sk->sk_lock.owned = 1;
2894 	spin_unlock(&sk->sk_lock.slock);
2895 	/*
2896 	 * The sk_lock has mutex_lock() semantics here:
2897 	 */
2898 	mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
2899 	local_bh_enable();
2900 	return true;
2901 }
2902 EXPORT_SYMBOL(lock_sock_fast);
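
/*
 * Illustrative sketch (not part of this file): the return value must be
 * handed back to unlock_sock_fast() so the matching unlock path runs:
 *
 *	bool slow = lock_sock_fast(sk);
 *	... short critical section ...
 *	unlock_sock_fast(sk, slow);
 */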
2903 
2904 int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
2905 {
2906 	struct timeval tv;
2907 	if (!sock_flag(sk, SOCK_TIMESTAMP))
2908 		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2909 	tv = ktime_to_timeval(sk->sk_stamp);
2910 	if (tv.tv_sec == -1)
2911 		return -ENOENT;
2912 	if (tv.tv_sec == 0) {
2913 		sk->sk_stamp = ktime_get_real();
2914 		tv = ktime_to_timeval(sk->sk_stamp);
2915 	}
2916 	return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
2917 }
2918 EXPORT_SYMBOL(sock_get_timestamp);
2919 
2920 int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
2921 {
2922 	struct timespec ts;
2923 	if (!sock_flag(sk, SOCK_TIMESTAMP))
2924 		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2925 	ts = ktime_to_timespec(sk->sk_stamp);
2926 	if (ts.tv_sec == -1)
2927 		return -ENOENT;
2928 	if (ts.tv_sec == 0) {
2929 		sk->sk_stamp = ktime_get_real();
2930 		ts = ktime_to_timespec(sk->sk_stamp);
2931 	}
2932 	return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
2933 }
2934 EXPORT_SYMBOL(sock_get_timestampns);
2935 
2936 void sock_enable_timestamp(struct sock *sk, int flag)
2937 {
2938 	if (!sock_flag(sk, flag)) {
2939 		unsigned long previous_flags = sk->sk_flags;
2940 
2941 		sock_set_flag(sk, flag);
2942 		/*
2943 		 * we just set one of the two flags which require net
2944 		 * time stamping, but time stamping might have been on
2945 		 * already because of the other one
2946 		 */
2947 		if (sock_needs_netstamp(sk) &&
2948 		    !(previous_flags & SK_FLAGS_TIMESTAMP))
2949 			net_enable_timestamp();
2950 	}
2951 }
2952 
2953 int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
2954 		       int level, int type)
2955 {
2956 	struct sock_exterr_skb *serr;
2957 	struct sk_buff *skb;
2958 	int copied, err;
2959 
2960 	err = -EAGAIN;
2961 	skb = sock_dequeue_err_skb(sk);
2962 	if (skb == NULL)
2963 		goto out;
2964 
2965 	copied = skb->len;
2966 	if (copied > len) {
2967 		msg->msg_flags |= MSG_TRUNC;
2968 		copied = len;
2969 	}
2970 	err = skb_copy_datagram_msg(skb, 0, msg, copied);
2971 	if (err)
2972 		goto out_free_skb;
2973 
2974 	sock_recv_timestamp(msg, sk, skb);
2975 
2976 	serr = SKB_EXT_ERR(skb);
2977 	put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
2978 
2979 	msg->msg_flags |= MSG_ERRQUEUE;
2980 	err = copied;
2981 
2982 out_free_skb:
2983 	kfree_skb(skb);
2984 out:
2985 	return err;
2986 }
2987 EXPORT_SYMBOL(sock_recv_errqueue);
2988 
2989 /*
2990  *	Get a socket option on a socket.
2991  *
2992  *	FIX: POSIX 1003.1g is very ambiguous here. It states that
2993  *	asynchronous errors should be reported by getsockopt. We assume
2994  *	this means if you specify SO_ERROR (otherwise what's the point of it).
2995  */
2996 int sock_common_getsockopt(struct socket *sock, int level, int optname,
2997 			   char __user *optval, int __user *optlen)
2998 {
2999 	struct sock *sk = sock->sk;
3000 
3001 	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
3002 }
3003 EXPORT_SYMBOL(sock_common_getsockopt);
3004 
3005 #ifdef CONFIG_COMPAT
3006 int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
3007 				  char __user *optval, int __user *optlen)
3008 {
3009 	struct sock *sk = sock->sk;
3010 
3011 	if (sk->sk_prot->compat_getsockopt != NULL)
3012 		return sk->sk_prot->compat_getsockopt(sk, level, optname,
3013 						      optval, optlen);
3014 	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
3015 }
3016 EXPORT_SYMBOL(compat_sock_common_getsockopt);
3017 #endif
3018 
3019 int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
3020 			int flags)
3021 {
3022 	struct sock *sk = sock->sk;
3023 	int addr_len = 0;
3024 	int err;
3025 
3026 	err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
3027 				   flags & ~MSG_DONTWAIT, &addr_len);
3028 	if (err >= 0)
3029 		msg->msg_namelen = addr_len;
3030 	return err;
3031 }
3032 EXPORT_SYMBOL(sock_common_recvmsg);
3033 
3034 /*
3035  *	Set socket options on an inet socket.
3036  */
3037 int sock_common_setsockopt(struct socket *sock, int level, int optname,
3038 			   char __user *optval, unsigned int optlen)
3039 {
3040 	struct sock *sk = sock->sk;
3041 
3042 	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
3043 }
3044 EXPORT_SYMBOL(sock_common_setsockopt);
3045 
3046 #ifdef CONFIG_COMPAT
3047 int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
3048 				  char __user *optval, unsigned int optlen)
3049 {
3050 	struct sock *sk = sock->sk;
3051 
3052 	if (sk->sk_prot->compat_setsockopt != NULL)
3053 		return sk->sk_prot->compat_setsockopt(sk, level, optname,
3054 						      optval, optlen);
3055 	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
3056 }
3057 EXPORT_SYMBOL(compat_sock_common_setsockopt);
3058 #endif
3059 
3060 void sk_common_release(struct sock *sk)
3061 {
3062 	if (sk->sk_prot->destroy)
3063 		sk->sk_prot->destroy(sk);
3064 
3065 	/*
3066 	 * Observation: when sk_common_release() is called, processes no longer
3067 	 * have access to the socket, but the network stack still does.
3068 	 * Step one, detach it from networking:
3069 	 *
3070 	 * A. Remove from hash tables.
3071 	 */
3072 
3073 	sk->sk_prot->unhash(sk);
3074 
3075 	/*
3076 	 * At this point the socket cannot receive new packets, but some may
3077 	 * still be in flight because a CPU running the receive path did its
3078 	 * hash table lookup before we unhashed the socket. They will reach the
3079 	 * receive queue and be purged by the socket destructor.
3080 	 *
3081 	 * Also, we still have packets pending on the receive queue and probably
3082 	 * our own packets waiting in device queues. The socket destructor will
3083 	 * drain the receive queue, but transmitted packets will delay socket
3084 	 * destruction until the last reference is released.
3085 	 */
3086 
3087 	sock_orphan(sk);
3088 
3089 	xfrm_sk_free_policy(sk);
3090 
3091 	sk_refcnt_debug_release(sk);
3092 
3093 	sock_put(sk);
3094 }
3095 EXPORT_SYMBOL(sk_common_release);
3096 
3097 void sk_get_meminfo(const struct sock *sk, u32 *mem)
3098 {
3099 	memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);
3100 
3101 	mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
3102 	mem[SK_MEMINFO_RCVBUF] = sk->sk_rcvbuf;
3103 	mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
3104 	mem[SK_MEMINFO_SNDBUF] = sk->sk_sndbuf;
3105 	mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc;
3106 	mem[SK_MEMINFO_WMEM_QUEUED] = sk->sk_wmem_queued;
3107 	mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
3108 	mem[SK_MEMINFO_BACKLOG] = sk->sk_backlog.len;
3109 	mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
3110 }
3111 
3112 #ifdef CONFIG_PROC_FS
3113 #define PROTO_INUSE_NR	64	/* should be enough for the first time */
3114 struct prot_inuse {
3115 	int val[PROTO_INUSE_NR];
3116 };
3117 
3118 static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
3119 
3120 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
3121 {
3122 	__this_cpu_add(net->core.prot_inuse->val[prot->inuse_idx], val);
3123 }
3124 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
3125 
3126 int sock_prot_inuse_get(struct net *net, struct proto *prot)
3127 {
3128 	int cpu, idx = prot->inuse_idx;
3129 	int res = 0;
3130 
3131 	for_each_possible_cpu(cpu)
3132 		res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx];
3133 
3134 	return res >= 0 ? res : 0;
3135 }
3136 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
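
/*
 * Illustrative sketch (not part of this file): protocols typically bump this
 * counter from their hash/unhash callbacks:
 *
 *	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);   (on hash)
 *	...
 *	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);  (on unhash)
 */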
3137 
3138 static void sock_inuse_add(struct net *net, int val)
3139 {
3140 	this_cpu_add(*net->core.sock_inuse, val);
3141 }
3142 
3143 int sock_inuse_get(struct net *net)
3144 {
3145 	int cpu, res = 0;
3146 
3147 	for_each_possible_cpu(cpu)
3148 		res += *per_cpu_ptr(net->core.sock_inuse, cpu);
3149 
3150 	return res;
3151 }
3152 
3153 EXPORT_SYMBOL_GPL(sock_inuse_get);
3154 
3155 static int __net_init sock_inuse_init_net(struct net *net)
3156 {
3157 	net->core.prot_inuse = alloc_percpu(struct prot_inuse);
3158 	if (net->core.prot_inuse == NULL)
3159 		return -ENOMEM;
3160 
3161 	net->core.sock_inuse = alloc_percpu(int);
3162 	if (net->core.sock_inuse == NULL)
3163 		goto out;
3164 
3165 	return 0;
3166 
3167 out:
3168 	free_percpu(net->core.prot_inuse);
3169 	return -ENOMEM;
3170 }
3171 
3172 static void __net_exit sock_inuse_exit_net(struct net *net)
3173 {
3174 	free_percpu(net->core.prot_inuse);
3175 	free_percpu(net->core.sock_inuse);
3176 }
3177 
3178 static struct pernet_operations net_inuse_ops = {
3179 	.init = sock_inuse_init_net,
3180 	.exit = sock_inuse_exit_net,
3181 };
3182 
3183 static __init int net_inuse_init(void)
3184 {
3185 	if (register_pernet_subsys(&net_inuse_ops))
3186 		panic("Cannot initialize net inuse counters");
3187 
3188 	return 0;
3189 }
3190 
3191 core_initcall(net_inuse_init);
3192 
3193 static void assign_proto_idx(struct proto *prot)
3194 {
3195 	prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
3196 
3197 	if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
3198 		pr_err("PROTO_INUSE_NR exhausted\n");
3199 		return;
3200 	}
3201 
3202 	set_bit(prot->inuse_idx, proto_inuse_idx);
3203 }
3204 
3205 static void release_proto_idx(struct proto *prot)
3206 {
3207 	if (prot->inuse_idx != PROTO_INUSE_NR - 1)
3208 		clear_bit(prot->inuse_idx, proto_inuse_idx);
3209 }
3210 #else
3211 static inline void assign_proto_idx(struct proto *prot)
3212 {
3213 }
3214 
3215 static inline void release_proto_idx(struct proto *prot)
3216 {
3217 }
3218 
3219 static void sock_inuse_add(struct net *net, int val)
3220 {
3221 }
3222 #endif
3223 
3224 static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
3225 {
3226 	if (!rsk_prot)
3227 		return;
3228 	kfree(rsk_prot->slab_name);
3229 	rsk_prot->slab_name = NULL;
3230 	kmem_cache_destroy(rsk_prot->slab);
3231 	rsk_prot->slab = NULL;
3232 }
3233 
3234 static int req_prot_init(const struct proto *prot)
3235 {
3236 	struct request_sock_ops *rsk_prot = prot->rsk_prot;
3237 
3238 	if (!rsk_prot)
3239 		return 0;
3240 
3241 	rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
3242 					prot->name);
3243 	if (!rsk_prot->slab_name)
3244 		return -ENOMEM;
3245 
3246 	rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
3247 					   rsk_prot->obj_size, 0,
3248 					   prot->slab_flags, NULL);
3249 
3250 	if (!rsk_prot->slab) {
3251 		pr_crit("%s: Can't create request sock SLAB cache!\n",
3252 			prot->name);
3253 		return -ENOMEM;
3254 	}
3255 	return 0;
3256 }
3257 
3258 int proto_register(struct proto *prot, int alloc_slab)
3259 {
3260 	if (alloc_slab) {
3261 		prot->slab = kmem_cache_create_usercopy(prot->name,
3262 					prot->obj_size, 0,
3263 					SLAB_HWCACHE_ALIGN | prot->slab_flags,
3264 					prot->useroffset, prot->usersize,
3265 					NULL);
3266 
3267 		if (prot->slab == NULL) {
3268 			pr_crit("%s: Can't create sock SLAB cache!\n",
3269 				prot->name);
3270 			goto out;
3271 		}
3272 
3273 		if (req_prot_init(prot))
3274 			goto out_free_request_sock_slab;
3275 
3276 		if (prot->twsk_prot != NULL) {
3277 			prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);
3278 
3279 			if (prot->twsk_prot->twsk_slab_name == NULL)
3280 				goto out_free_request_sock_slab;
3281 
3282 			prot->twsk_prot->twsk_slab =
3283 				kmem_cache_create(prot->twsk_prot->twsk_slab_name,
3284 						  prot->twsk_prot->twsk_obj_size,
3285 						  0,
3286 						  prot->slab_flags,
3287 						  NULL);
3288 			if (prot->twsk_prot->twsk_slab == NULL)
3289 				goto out_free_timewait_sock_slab_name;
3290 		}
3291 	}
3292 
3293 	mutex_lock(&proto_list_mutex);
3294 	list_add(&prot->node, &proto_list);
3295 	assign_proto_idx(prot);
3296 	mutex_unlock(&proto_list_mutex);
3297 	return 0;
3298 
3299 out_free_timewait_sock_slab_name:
3300 	kfree(prot->twsk_prot->twsk_slab_name);
3301 out_free_request_sock_slab:
3302 	req_prot_cleanup(prot->rsk_prot);
3303 
3304 	kmem_cache_destroy(prot->slab);
3305 	prot->slab = NULL;
3306 out:
3307 	return -ENOBUFS;
3308 }
3309 EXPORT_SYMBOL(proto_register);
3310 
3311 void proto_unregister(struct proto *prot)
3312 {
3313 	mutex_lock(&proto_list_mutex);
3314 	release_proto_idx(prot);
3315 	list_del(&prot->node);
3316 	mutex_unlock(&proto_list_mutex);
3317 
3318 	kmem_cache_destroy(prot->slab);
3319 	prot->slab = NULL;
3320 
3321 	req_prot_cleanup(prot->rsk_prot);
3322 
3323 	if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
3324 		kmem_cache_destroy(prot->twsk_prot->twsk_slab);
3325 		kfree(prot->twsk_prot->twsk_slab_name);
3326 		prot->twsk_prot->twsk_slab = NULL;
3327 	}
3328 }
3329 EXPORT_SYMBOL(proto_unregister);
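
/*
 * Illustrative sketch (not part of this file): module init/exit code pairs
 * these calls; "foo_prot" and struct foo_sock are made-up names:
 *
 *	static struct proto foo_prot = {
 *		.name	  = "FOO",
 *		.owner	  = THIS_MODULE,
 *		.obj_size = sizeof(struct foo_sock),
 *	};
 *
 *	err = proto_register(&foo_prot, 1);
 *	...
 *	proto_unregister(&foo_prot);
 */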
3330 
3331 int sock_load_diag_module(int family, int protocol)
3332 {
3333 	if (!protocol) {
3334 		if (!sock_is_registered(family))
3335 			return -ENOENT;
3336 
3337 		return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
3338 				      NETLINK_SOCK_DIAG, family);
3339 	}
3340 
3341 #ifdef CONFIG_INET
3342 	if (family == AF_INET &&
3343 	    !rcu_access_pointer(inet_protos[protocol]))
3344 		return -ENOENT;
3345 #endif
3346 
3347 	return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK,
3348 			      NETLINK_SOCK_DIAG, family, protocol);
3349 }
3350 EXPORT_SYMBOL(sock_load_diag_module);
3351 
3352 #ifdef CONFIG_PROC_FS
3353 static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
3354 	__acquires(proto_list_mutex)
3355 {
3356 	mutex_lock(&proto_list_mutex);
3357 	return seq_list_start_head(&proto_list, *pos);
3358 }
3359 
3360 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3361 {
3362 	return seq_list_next(v, &proto_list, pos);
3363 }
3364 
3365 static void proto_seq_stop(struct seq_file *seq, void *v)
3366 	__releases(proto_list_mutex)
3367 {
3368 	mutex_unlock(&proto_list_mutex);
3369 }
3370 
3371 static char proto_method_implemented(const void *method)
3372 {
3373 	return method == NULL ? 'n' : 'y';
3374 }
3375 static long sock_prot_memory_allocated(struct proto *proto)
3376 {
3377 	return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
3378 }
3379 
3380 static char *sock_prot_memory_pressure(struct proto *proto)
3381 {
3382 	return proto->memory_pressure != NULL ?
3383 	proto_memory_pressure(proto) ? "yes" : "no" : "NI";
3384 }
3385 
3386 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
3387 {
3388 
3389 	seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
3390 			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
3391 		   proto->name,
3392 		   proto->obj_size,
3393 		   sock_prot_inuse_get(seq_file_net(seq), proto),
3394 		   sock_prot_memory_allocated(proto),
3395 		   sock_prot_memory_pressure(proto),
3396 		   proto->max_header,
3397 		   proto->slab == NULL ? "no" : "yes",
3398 		   module_name(proto->owner),
3399 		   proto_method_implemented(proto->close),
3400 		   proto_method_implemented(proto->connect),
3401 		   proto_method_implemented(proto->disconnect),
3402 		   proto_method_implemented(proto->accept),
3403 		   proto_method_implemented(proto->ioctl),
3404 		   proto_method_implemented(proto->init),
3405 		   proto_method_implemented(proto->destroy),
3406 		   proto_method_implemented(proto->shutdown),
3407 		   proto_method_implemented(proto->setsockopt),
3408 		   proto_method_implemented(proto->getsockopt),
3409 		   proto_method_implemented(proto->sendmsg),
3410 		   proto_method_implemented(proto->recvmsg),
3411 		   proto_method_implemented(proto->sendpage),
3412 		   proto_method_implemented(proto->bind),
3413 		   proto_method_implemented(proto->backlog_rcv),
3414 		   proto_method_implemented(proto->hash),
3415 		   proto_method_implemented(proto->unhash),
3416 		   proto_method_implemented(proto->get_port),
3417 		   proto_method_implemented(proto->enter_memory_pressure));
3418 }
3419 
3420 static int proto_seq_show(struct seq_file *seq, void *v)
3421 {
3422 	if (v == &proto_list)
3423 		seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
3424 			   "protocol",
3425 			   "size",
3426 			   "sockets",
3427 			   "memory",
3428 			   "press",
3429 			   "maxhdr",
3430 			   "slab",
3431 			   "module",
3432 			   "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
3433 	else
3434 		proto_seq_printf(seq, list_entry(v, struct proto, node));
3435 	return 0;
3436 }
3437 
3438 static const struct seq_operations proto_seq_ops = {
3439 	.start  = proto_seq_start,
3440 	.next   = proto_seq_next,
3441 	.stop   = proto_seq_stop,
3442 	.show   = proto_seq_show,
3443 };
3444 
3445 static int proto_seq_open(struct inode *inode, struct file *file)
3446 {
3447 	return seq_open_net(inode, file, &proto_seq_ops,
3448 			    sizeof(struct seq_net_private));
3449 }
3450 
3451 static const struct file_operations proto_seq_fops = {
3452 	.open		= proto_seq_open,
3453 	.read		= seq_read,
3454 	.llseek		= seq_lseek,
3455 	.release	= seq_release_net,
3456 };
3457 
3458 static __net_init int proto_init_net(struct net *net)
3459 {
3460 	if (!proc_create("protocols", 0444, net->proc_net, &proto_seq_fops))
3461 		return -ENOMEM;
3462 
3463 	return 0;
3464 }
3465 
3466 static __net_exit void proto_exit_net(struct net *net)
3467 {
3468 	remove_proc_entry("protocols", net->proc_net);
3469 }
3470 
3471 
3472 static __net_initdata struct pernet_operations proto_net_ops = {
3473 	.init = proto_init_net,
3474 	.exit = proto_exit_net,
3475 };
3476 
3477 static int __init proto_init(void)
3478 {
3479 	return register_pernet_subsys(&proto_net_ops);
3480 }
3481 
3482 subsys_initcall(proto_init);
3483 
3484 #endif /* PROC_FS */
3485 
3486 #ifdef CONFIG_NET_RX_BUSY_POLL
3487 bool sk_busy_loop_end(void *p, unsigned long start_time)
3488 {
3489 	struct sock *sk = p;
3490 
3491 	return !skb_queue_empty(&sk->sk_receive_queue) ||
3492 	       sk_busy_loop_timeout(sk, start_time);
3493 }
3494 EXPORT_SYMBOL(sk_busy_loop_end);
3495 #endif /* CONFIG_NET_RX_BUSY_POLL */
3496