xref: /openbmc/linux/net/core/sock.c (revision a17922de)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		Generic socket support routines: memory allocators, socket lock/release
7  *		handlers for protocols to use and a generic option handler.
8  *
9  *
10  * Authors:	Ross Biro
11  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *		Florian La Roche, <flla@stud.uni-sb.de>
13  *		Alan Cox, <A.Cox@swansea.ac.uk>
14  *
15  * Fixes:
16  *		Alan Cox	: 	Numerous verify_area() problems
17  *		Alan Cox	:	Connecting on a connecting socket
18  *					now returns an error for tcp.
19  *		Alan Cox	:	sock->protocol is set correctly.
20  *					and is not sometimes left as 0.
21  *		Alan Cox	:	connect handles icmp errors on a
22  *					connect properly. Unfortunately there
23  *					is a restart syscall nasty there. I
24  *					can't match BSD without hacking the C
25  *					library. Ideas urgently sought!
26  *		Alan Cox	:	Disallow bind() to addresses that are
27  *					not ours - especially broadcast ones!!
28  *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
29  *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
30  *					instead they leave that for the DESTROY timer.
31  *		Alan Cox	:	Clean up error flag in accept
32  *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
33  *					was buggy. Put a remove_sock() in the handler
34  *					for memory when we hit 0. Also altered the timer
35  *					code. The ACK stuff can wait and needs major
36  *					TCP layer surgery.
37  *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
38  *					and fixed timer/inet_bh race.
39  *		Alan Cox	:	Added zapped flag for TCP
40  *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
41  *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
42  *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
43  *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
44  *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
45  *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
46  *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
47  *	Pauline Middelink	:	identd support
48  *		Alan Cox	:	Fixed connect() taking signals I think.
49  *		Alan Cox	:	SO_LINGER supported
50  *		Alan Cox	:	Error reporting fixes
51  *		Anonymous	:	inet_create tidied up (sk->reuse setting)
52  *		Alan Cox	:	inet sockets don't set sk->type!
53  *		Alan Cox	:	Split socket option code
54  *		Alan Cox	:	Callbacks
55  *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
56  *		Alex		:	Removed restriction on inet fioctl
57  *		Alan Cox	:	Splitting INET from NET core
58  *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
59  *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
60  *		Alan Cox	:	Split IP from generic code
61  *		Alan Cox	:	New kfree_skbmem()
62  *		Alan Cox	:	Make SO_DEBUG superuser only.
63  *		Alan Cox	:	Allow anyone to clear SO_DEBUG
64  *					(compatibility fix)
65  *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
66  *		Alan Cox	:	Allocator for a socket is settable.
67  *		Alan Cox	:	SO_ERROR includes soft errors.
68  *		Alan Cox	:	Allow NULL arguments on some SO_ opts
69  *		Alan Cox	: 	Generic socket allocation to make hooks
70  *					easier (suggested by Craig Metz).
71  *		Michael Pall	:	SO_ERROR returns positive errno again
72  *              Steve Whitehouse:       Added default destructor to free
73  *                                      protocol private data.
74  *              Steve Whitehouse:       Added various other default routines
75  *                                      common to several socket families.
76  *              Chris Evans     :       Call suser() check last on F_SETOWN
77  *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
78  *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
79  *		Andi Kleen	:	Fix write_space callback
80  *		Chris Evans	:	Security fixes - signedness again
81  *		Arnaldo C. Melo :       cleanups, use skb_queue_purge
82  *
83  * To Fix:
84  *
85  *
86  *		This program is free software; you can redistribute it and/or
87  *		modify it under the terms of the GNU General Public License
88  *		as published by the Free Software Foundation; either version
89  *		2 of the License, or (at your option) any later version.
90  */
91 
92 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
93 
94 #include <asm/unaligned.h>
95 #include <linux/capability.h>
96 #include <linux/errno.h>
97 #include <linux/errqueue.h>
98 #include <linux/types.h>
99 #include <linux/socket.h>
100 #include <linux/in.h>
101 #include <linux/kernel.h>
102 #include <linux/module.h>
103 #include <linux/proc_fs.h>
104 #include <linux/seq_file.h>
105 #include <linux/sched.h>
106 #include <linux/sched/mm.h>
107 #include <linux/timer.h>
108 #include <linux/string.h>
109 #include <linux/sockios.h>
110 #include <linux/net.h>
111 #include <linux/mm.h>
112 #include <linux/slab.h>
113 #include <linux/interrupt.h>
114 #include <linux/poll.h>
115 #include <linux/tcp.h>
116 #include <linux/init.h>
117 #include <linux/highmem.h>
118 #include <linux/user_namespace.h>
119 #include <linux/static_key.h>
120 #include <linux/memcontrol.h>
121 #include <linux/prefetch.h>
122 
123 #include <linux/uaccess.h>
124 
125 #include <linux/netdevice.h>
126 #include <net/protocol.h>
127 #include <linux/skbuff.h>
128 #include <net/net_namespace.h>
129 #include <net/request_sock.h>
130 #include <net/sock.h>
131 #include <linux/net_tstamp.h>
132 #include <net/xfrm.h>
133 #include <linux/ipsec.h>
134 #include <net/cls_cgroup.h>
135 #include <net/netprio_cgroup.h>
136 #include <linux/sock_diag.h>
137 
138 #include <linux/filter.h>
139 #include <net/sock_reuseport.h>
140 
141 #include <trace/events/sock.h>
142 
143 #include <net/tcp.h>
144 #include <net/busy_poll.h>
145 
146 static DEFINE_MUTEX(proto_list_mutex);
147 static LIST_HEAD(proto_list);
148 
149 static void sock_inuse_add(struct net *net, int val);
150 
151 /**
152  * sk_ns_capable - General socket capability test
153  * @sk: Socket to use a capability on or through
154  * @user_ns: The user namespace of the capability to use
155  * @cap: The capability to use
156  *
157  * Test to see if the opener of the socket had the capability @cap when
158  * the socket was created and if the current process has the capability
159  * @cap in the user namespace @user_ns.
160  */
161 bool sk_ns_capable(const struct sock *sk,
162 		   struct user_namespace *user_ns, int cap)
163 {
164 	return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
165 		ns_capable(user_ns, cap);
166 }
167 EXPORT_SYMBOL(sk_ns_capable);
168 
169 /**
170  * sk_capable - Socket global capability test
171  * @sk: Socket to use a capability on or through
172  * @cap: The global capability to use
173  *
174  * Test to see if the opener of the socket had the capability @cap when
175  * the socket was created and if the current process has the capability
176  * @cap in all user namespaces.
177  */
178 bool sk_capable(const struct sock *sk, int cap)
179 {
180 	return sk_ns_capable(sk, &init_user_ns, cap);
181 }
182 EXPORT_SYMBOL(sk_capable);
183 
184 /**
185  * sk_net_capable - Network namespace socket capability test
186  * @sk: Socket to use a capability on or through
187  * @cap: The capability to use
188  *
189  * Test to see if the opener of the socket had the capability @cap when the
190  * socket was created and if the current process has the capability @cap over
191  * the network namespace the socket is a member of.
192  */
193 bool sk_net_capable(const struct sock *sk, int cap)
194 {
195 	return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
196 }
197 EXPORT_SYMBOL(sk_net_capable);
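
/* Illustrative sketch (editorial addition, not part of the original file):
 * a protocol that wants to gate a privileged operation on both the socket
 * opener's privileges and the current caller's would typically use one of
 * the helpers above, e.g.:
 *
 *	if (!sk_net_capable(sk, CAP_NET_ADMIN))
 *		return -EPERM;
 *
 * sk_capable() checks against &init_user_ns instead, and sk_ns_capable()
 * lets the caller pick the user namespace explicitly.
 */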
198 
199 /*
200  * Each address family might have different locking rules, so we have
201  * one slock key per address family and separate keys for internal and
202  * userspace sockets.
203  */
204 static struct lock_class_key af_family_keys[AF_MAX];
205 static struct lock_class_key af_family_kern_keys[AF_MAX];
206 static struct lock_class_key af_family_slock_keys[AF_MAX];
207 static struct lock_class_key af_family_kern_slock_keys[AF_MAX];
208 
209 /*
210  * Make lock validator output more readable. (we pre-construct these
211  * strings at build time, so that runtime initialization of socket
212  * locks is fast):
213  */
214 
215 #define _sock_locks(x)						  \
216   x "AF_UNSPEC",	x "AF_UNIX"     ,	x "AF_INET"     , \
217   x "AF_AX25"  ,	x "AF_IPX"      ,	x "AF_APPLETALK", \
218   x "AF_NETROM",	x "AF_BRIDGE"   ,	x "AF_ATMPVC"   , \
219   x "AF_X25"   ,	x "AF_INET6"    ,	x "AF_ROSE"     , \
220   x "AF_DECnet",	x "AF_NETBEUI"  ,	x "AF_SECURITY" , \
221   x "AF_KEY"   ,	x "AF_NETLINK"  ,	x "AF_PACKET"   , \
222   x "AF_ASH"   ,	x "AF_ECONET"   ,	x "AF_ATMSVC"   , \
223   x "AF_RDS"   ,	x "AF_SNA"      ,	x "AF_IRDA"     , \
224   x "AF_PPPOX" ,	x "AF_WANPIPE"  ,	x "AF_LLC"      , \
225   x "27"       ,	x "28"          ,	x "AF_CAN"      , \
226   x "AF_TIPC"  ,	x "AF_BLUETOOTH",	x "IUCV"        , \
227   x "AF_RXRPC" ,	x "AF_ISDN"     ,	x "AF_PHONET"   , \
228   x "AF_IEEE802154",	x "AF_CAIF"	,	x "AF_ALG"      , \
229   x "AF_NFC"   ,	x "AF_VSOCK"    ,	x "AF_KCM"      , \
230   x "AF_QIPCRTR",	x "AF_SMC"	,	x "AF_XDP"	, \
231   x "AF_MAX"
232 
233 static const char *const af_family_key_strings[AF_MAX+1] = {
234 	_sock_locks("sk_lock-")
235 };
236 static const char *const af_family_slock_key_strings[AF_MAX+1] = {
237 	_sock_locks("slock-")
238 };
239 static const char *const af_family_clock_key_strings[AF_MAX+1] = {
240 	_sock_locks("clock-")
241 };
242 
243 static const char *const af_family_kern_key_strings[AF_MAX+1] = {
244 	_sock_locks("k-sk_lock-")
245 };
246 static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
247 	_sock_locks("k-slock-")
248 };
249 static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
250 	_sock_locks("k-clock-")
251 };
252 static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
253   "rlock-AF_UNSPEC", "rlock-AF_UNIX"     , "rlock-AF_INET"     ,
254   "rlock-AF_AX25"  , "rlock-AF_IPX"      , "rlock-AF_APPLETALK",
255   "rlock-AF_NETROM", "rlock-AF_BRIDGE"   , "rlock-AF_ATMPVC"   ,
256   "rlock-AF_X25"   , "rlock-AF_INET6"    , "rlock-AF_ROSE"     ,
257   "rlock-AF_DECnet", "rlock-AF_NETBEUI"  , "rlock-AF_SECURITY" ,
258   "rlock-AF_KEY"   , "rlock-AF_NETLINK"  , "rlock-AF_PACKET"   ,
259   "rlock-AF_ASH"   , "rlock-AF_ECONET"   , "rlock-AF_ATMSVC"   ,
260   "rlock-AF_RDS"   , "rlock-AF_SNA"      , "rlock-AF_IRDA"     ,
261   "rlock-AF_PPPOX" , "rlock-AF_WANPIPE"  , "rlock-AF_LLC"      ,
262   "rlock-27"       , "rlock-28"          , "rlock-AF_CAN"      ,
263   "rlock-AF_TIPC"  , "rlock-AF_BLUETOOTH", "rlock-AF_IUCV"     ,
264   "rlock-AF_RXRPC" , "rlock-AF_ISDN"     , "rlock-AF_PHONET"   ,
265   "rlock-AF_IEEE802154", "rlock-AF_CAIF" , "rlock-AF_ALG"      ,
266   "rlock-AF_NFC"   , "rlock-AF_VSOCK"    , "rlock-AF_KCM"      ,
267   "rlock-AF_QIPCRTR", "rlock-AF_SMC"     , "rlock-AF_XDP"      ,
268   "rlock-AF_MAX"
269 };
270 static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
271   "wlock-AF_UNSPEC", "wlock-AF_UNIX"     , "wlock-AF_INET"     ,
272   "wlock-AF_AX25"  , "wlock-AF_IPX"      , "wlock-AF_APPLETALK",
273   "wlock-AF_NETROM", "wlock-AF_BRIDGE"   , "wlock-AF_ATMPVC"   ,
274   "wlock-AF_X25"   , "wlock-AF_INET6"    , "wlock-AF_ROSE"     ,
275   "wlock-AF_DECnet", "wlock-AF_NETBEUI"  , "wlock-AF_SECURITY" ,
276   "wlock-AF_KEY"   , "wlock-AF_NETLINK"  , "wlock-AF_PACKET"   ,
277   "wlock-AF_ASH"   , "wlock-AF_ECONET"   , "wlock-AF_ATMSVC"   ,
278   "wlock-AF_RDS"   , "wlock-AF_SNA"      , "wlock-AF_IRDA"     ,
279   "wlock-AF_PPPOX" , "wlock-AF_WANPIPE"  , "wlock-AF_LLC"      ,
280   "wlock-27"       , "wlock-28"          , "wlock-AF_CAN"      ,
281   "wlock-AF_TIPC"  , "wlock-AF_BLUETOOTH", "wlock-AF_IUCV"     ,
282   "wlock-AF_RXRPC" , "wlock-AF_ISDN"     , "wlock-AF_PHONET"   ,
283   "wlock-AF_IEEE802154", "wlock-AF_CAIF" , "wlock-AF_ALG"      ,
284   "wlock-AF_NFC"   , "wlock-AF_VSOCK"    , "wlock-AF_KCM"      ,
285   "wlock-AF_QIPCRTR", "wlock-AF_SMC"     , "wlock-AF_XDP"      ,
286   "wlock-AF_MAX"
287 };
288 static const char *const af_family_elock_key_strings[AF_MAX+1] = {
289   "elock-AF_UNSPEC", "elock-AF_UNIX"     , "elock-AF_INET"     ,
290   "elock-AF_AX25"  , "elock-AF_IPX"      , "elock-AF_APPLETALK",
291   "elock-AF_NETROM", "elock-AF_BRIDGE"   , "elock-AF_ATMPVC"   ,
292   "elock-AF_X25"   , "elock-AF_INET6"    , "elock-AF_ROSE"     ,
293   "elock-AF_DECnet", "elock-AF_NETBEUI"  , "elock-AF_SECURITY" ,
294   "elock-AF_KEY"   , "elock-AF_NETLINK"  , "elock-AF_PACKET"   ,
295   "elock-AF_ASH"   , "elock-AF_ECONET"   , "elock-AF_ATMSVC"   ,
296   "elock-AF_RDS"   , "elock-AF_SNA"      , "elock-AF_IRDA"     ,
297   "elock-AF_PPPOX" , "elock-AF_WANPIPE"  , "elock-AF_LLC"      ,
298   "elock-27"       , "elock-28"          , "elock-AF_CAN"      ,
299   "elock-AF_TIPC"  , "elock-AF_BLUETOOTH", "elock-AF_IUCV"     ,
300   "elock-AF_RXRPC" , "elock-AF_ISDN"     , "elock-AF_PHONET"   ,
301   "elock-AF_IEEE802154", "elock-AF_CAIF" , "elock-AF_ALG"      ,
302   "elock-AF_NFC"   , "elock-AF_VSOCK"    , "elock-AF_KCM"      ,
303   "elock-AF_QIPCRTR", "elock-AF_SMC"     , "elock-AF_XDP"      ,
304   "elock-AF_MAX"
305 };
306 
307 /*
308  * sk_callback_lock and sk queues locking rules are per-address-family,
309  * so split the lock classes by using a per-AF key:
310  */
311 static struct lock_class_key af_callback_keys[AF_MAX];
312 static struct lock_class_key af_rlock_keys[AF_MAX];
313 static struct lock_class_key af_wlock_keys[AF_MAX];
314 static struct lock_class_key af_elock_keys[AF_MAX];
315 static struct lock_class_key af_kern_callback_keys[AF_MAX];
316 
317 /* Run time adjustable parameters. */
318 __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
319 EXPORT_SYMBOL(sysctl_wmem_max);
320 __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
321 EXPORT_SYMBOL(sysctl_rmem_max);
322 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
323 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
324 
325 /* Maximal space eaten by iovec or ancillary data plus some space */
326 int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
327 EXPORT_SYMBOL(sysctl_optmem_max);
328 
329 int sysctl_tstamp_allow_data __read_mostly = 1;
330 
331 DEFINE_STATIC_KEY_FALSE(memalloc_socks_key);
332 EXPORT_SYMBOL_GPL(memalloc_socks_key);
333 
334 /**
335  * sk_set_memalloc - sets %SOCK_MEMALLOC
336  * @sk: socket to set it on
337  *
338  * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
339  * It's the responsibility of the admin to adjust min_free_kbytes
340  * to meet the requirements.
341  */
342 void sk_set_memalloc(struct sock *sk)
343 {
344 	sock_set_flag(sk, SOCK_MEMALLOC);
345 	sk->sk_allocation |= __GFP_MEMALLOC;
346 	static_branch_inc(&memalloc_socks_key);
347 }
348 EXPORT_SYMBOL_GPL(sk_set_memalloc);
349 
350 void sk_clear_memalloc(struct sock *sk)
351 {
352 	sock_reset_flag(sk, SOCK_MEMALLOC);
353 	sk->sk_allocation &= ~__GFP_MEMALLOC;
354 	static_branch_dec(&memalloc_socks_key);
355 
356 	/*
357 	 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
358 	 * progress of swapping. SOCK_MEMALLOC may be cleared while
359 	 * it has rmem allocations due to the last swapfile being deactivated
360 	 * but there is a risk that the socket is unusable due to exceeding
361 	 * the rmem limits. Reclaim the reserves and obey rmem limits again.
362 	 */
363 	sk_mem_reclaim(sk);
364 }
365 EXPORT_SYMBOL_GPL(sk_clear_memalloc);
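
/* Illustrative sketch (editorial addition, not part of the original file):
 * a subsystem that must keep a socket usable under memory pressure, such as
 * a swap-over-network transport, pairs these helpers around the lifetime of
 * that use:
 *
 *	sk_set_memalloc(sk);	// socket may now dip into emergency reserves
 *	...			// swap traffic flows over sk
 *	sk_clear_memalloc(sk);	// give the reserves back; reclaim and obey
 *				// rmem limits again
 */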
366 
367 int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
368 {
369 	int ret;
370 	unsigned int noreclaim_flag;
371 
372 	/* these should have been dropped before queueing */
373 	BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
374 
375 	noreclaim_flag = memalloc_noreclaim_save();
376 	ret = sk->sk_backlog_rcv(sk, skb);
377 	memalloc_noreclaim_restore(noreclaim_flag);
378 
379 	return ret;
380 }
381 EXPORT_SYMBOL(__sk_backlog_rcv);
382 
383 static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
384 {
385 	struct timeval tv;
386 
387 	if (optlen < sizeof(tv))
388 		return -EINVAL;
389 	if (copy_from_user(&tv, optval, sizeof(tv)))
390 		return -EFAULT;
391 	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
392 		return -EDOM;
393 
394 	if (tv.tv_sec < 0) {
395 		static int warned __read_mostly;
396 
397 		*timeo_p = 0;
398 		if (warned < 10 && net_ratelimit()) {
399 			warned++;
400 			pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
401 				__func__, current->comm, task_pid_nr(current));
402 		}
403 		return 0;
404 	}
405 	*timeo_p = MAX_SCHEDULE_TIMEOUT;
406 	if (tv.tv_sec == 0 && tv.tv_usec == 0)
407 		return 0;
408 	if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
409 		*timeo_p = tv.tv_sec * HZ + DIV_ROUND_UP(tv.tv_usec, USEC_PER_SEC / HZ);
410 	return 0;
411 }
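
/* Illustrative sketch (editorial addition, not part of the original file):
 * from user space the timeout parsed above is supplied as a struct timeval,
 * e.g. a 2.5 second receive timeout. A tv_usec outside [0, USEC_PER_SEC) is
 * rejected with -EDOM, and an all-zero timeval means "wait forever":
 *
 *	struct timeval tv = { .tv_sec = 2, .tv_usec = 500000 };
 *
 *	if (setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv)) < 0)
 *		perror("setsockopt(SO_RCVTIMEO)");
 */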
412 
413 static void sock_warn_obsolete_bsdism(const char *name)
414 {
415 	static int warned;
416 	static char warncomm[TASK_COMM_LEN];
417 	if (strcmp(warncomm, current->comm) && warned < 5) {
418 		strcpy(warncomm,  current->comm);
419 		pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n",
420 			warncomm, name);
421 		warned++;
422 	}
423 }
424 
425 static bool sock_needs_netstamp(const struct sock *sk)
426 {
427 	switch (sk->sk_family) {
428 	case AF_UNSPEC:
429 	case AF_UNIX:
430 		return false;
431 	default:
432 		return true;
433 	}
434 }
435 
436 static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
437 {
438 	if (sk->sk_flags & flags) {
439 		sk->sk_flags &= ~flags;
440 		if (sock_needs_netstamp(sk) &&
441 		    !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
442 			net_disable_timestamp();
443 	}
444 }
445 
446 
447 int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
448 {
449 	unsigned long flags;
450 	struct sk_buff_head *list = &sk->sk_receive_queue;
451 
452 	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
453 		atomic_inc(&sk->sk_drops);
454 		trace_sock_rcvqueue_full(sk, skb);
455 		return -ENOMEM;
456 	}
457 
458 	if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
459 		atomic_inc(&sk->sk_drops);
460 		return -ENOBUFS;
461 	}
462 
463 	skb->dev = NULL;
464 	skb_set_owner_r(skb, sk);
465 
466 	/* we escape from the RCU protected region, make sure we don't leak
467 	 * a non-refcounted dst
468 	 */
469 	skb_dst_force(skb);
470 
471 	spin_lock_irqsave(&list->lock, flags);
472 	sock_skb_set_dropcount(sk, skb);
473 	__skb_queue_tail(list, skb);
474 	spin_unlock_irqrestore(&list->lock, flags);
475 
476 	if (!sock_flag(sk, SOCK_DEAD))
477 		sk->sk_data_ready(sk);
478 	return 0;
479 }
480 EXPORT_SYMBOL(__sock_queue_rcv_skb);
481 
482 int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
483 {
484 	int err;
485 
486 	err = sk_filter(sk, skb);
487 	if (err)
488 		return err;
489 
490 	return __sock_queue_rcv_skb(sk, skb);
491 }
492 EXPORT_SYMBOL(sock_queue_rcv_skb);
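
/* Illustrative sketch (editorial addition, not part of the original file):
 * a datagram protocol's input path typically hands a looked-up socket its
 * skb through the helper above and frees the skb itself when queueing
 * fails, roughly:
 *
 *	err = sock_queue_rcv_skb(sk, skb);
 *	if (err < 0) {
 *		kfree_skb(skb);
 *		return err;
 *	}
 *
 * -ENOMEM here means the receive buffer is full; -ENOBUFS means the memory
 * accounting (sk_rmem_schedule) refused the charge.
 */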
493 
494 int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
495 		     const int nested, unsigned int trim_cap, bool refcounted)
496 {
497 	int rc = NET_RX_SUCCESS;
498 
499 	if (sk_filter_trim_cap(sk, skb, trim_cap))
500 		goto discard_and_relse;
501 
502 	skb->dev = NULL;
503 
504 	if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
505 		atomic_inc(&sk->sk_drops);
506 		goto discard_and_relse;
507 	}
508 	if (nested)
509 		bh_lock_sock_nested(sk);
510 	else
511 		bh_lock_sock(sk);
512 	if (!sock_owned_by_user(sk)) {
513 		/*
514 		 * trylock + unlock semantics:
515 		 */
516 		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
517 
518 		rc = sk_backlog_rcv(sk, skb);
519 
520 		mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
521 	} else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
522 		bh_unlock_sock(sk);
523 		atomic_inc(&sk->sk_drops);
524 		goto discard_and_relse;
525 	}
526 
527 	bh_unlock_sock(sk);
528 out:
529 	if (refcounted)
530 		sock_put(sk);
531 	return rc;
532 discard_and_relse:
533 	kfree_skb(skb);
534 	goto out;
535 }
536 EXPORT_SYMBOL(__sk_receive_skb);
537 
538 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
539 {
540 	struct dst_entry *dst = __sk_dst_get(sk);
541 
542 	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
543 		sk_tx_queue_clear(sk);
544 		sk->sk_dst_pending_confirm = 0;
545 		RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
546 		dst_release(dst);
547 		return NULL;
548 	}
549 
550 	return dst;
551 }
552 EXPORT_SYMBOL(__sk_dst_check);
553 
554 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
555 {
556 	struct dst_entry *dst = sk_dst_get(sk);
557 
558 	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
559 		sk_dst_reset(sk);
560 		dst_release(dst);
561 		return NULL;
562 	}
563 
564 	return dst;
565 }
566 EXPORT_SYMBOL(sk_dst_check);
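
/* Illustrative sketch (editorial addition, not part of the original file):
 * a protocol's output path revalidates its cached route before using it;
 * "cookie" is whatever the protocol stored when the dst was installed
 * (0 for protocols that do not use route cookies):
 *
 *	dst = sk_dst_check(sk, 0);
 *	if (!dst)
 *		dst = ...;	// re-route and install a fresh entry
 */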
567 
568 static int sock_setbindtodevice(struct sock *sk, char __user *optval,
569 				int optlen)
570 {
571 	int ret = -ENOPROTOOPT;
572 #ifdef CONFIG_NETDEVICES
573 	struct net *net = sock_net(sk);
574 	char devname[IFNAMSIZ];
575 	int index;
576 
577 	/* Sorry... */
578 	ret = -EPERM;
579 	if (!ns_capable(net->user_ns, CAP_NET_RAW))
580 		goto out;
581 
582 	ret = -EINVAL;
583 	if (optlen < 0)
584 		goto out;
585 
586 	/* Bind this socket to a particular device like "eth0",
587 	 * as specified in the passed interface name. If the
588 	 * name is "" or the option length is zero the socket
589 	 * is not bound.
590 	 */
591 	if (optlen > IFNAMSIZ - 1)
592 		optlen = IFNAMSIZ - 1;
593 	memset(devname, 0, sizeof(devname));
594 
595 	ret = -EFAULT;
596 	if (copy_from_user(devname, optval, optlen))
597 		goto out;
598 
599 	index = 0;
600 	if (devname[0] != '\0') {
601 		struct net_device *dev;
602 
603 		rcu_read_lock();
604 		dev = dev_get_by_name_rcu(net, devname);
605 		if (dev)
606 			index = dev->ifindex;
607 		rcu_read_unlock();
608 		ret = -ENODEV;
609 		if (!dev)
610 			goto out;
611 	}
612 
613 	lock_sock(sk);
614 	sk->sk_bound_dev_if = index;
615 	sk_dst_reset(sk);
616 	release_sock(sk);
617 
618 	ret = 0;
619 
620 out:
621 #endif
622 
623 	return ret;
624 }
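
/* Illustrative sketch (editorial addition, not part of the original file):
 * from user space this corresponds to SO_BINDTODEVICE with an interface
 * name (CAP_NET_RAW is required, and an empty name unbinds the socket):
 *
 *	const char ifname[] = "eth0";
 *
 *	if (setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE,
 *		       ifname, sizeof(ifname)) < 0)
 *		perror("setsockopt(SO_BINDTODEVICE)");
 */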
625 
626 static int sock_getbindtodevice(struct sock *sk, char __user *optval,
627 				int __user *optlen, int len)
628 {
629 	int ret = -ENOPROTOOPT;
630 #ifdef CONFIG_NETDEVICES
631 	struct net *net = sock_net(sk);
632 	char devname[IFNAMSIZ];
633 
634 	if (sk->sk_bound_dev_if == 0) {
635 		len = 0;
636 		goto zero;
637 	}
638 
639 	ret = -EINVAL;
640 	if (len < IFNAMSIZ)
641 		goto out;
642 
643 	ret = netdev_get_name(net, devname, sk->sk_bound_dev_if);
644 	if (ret)
645 		goto out;
646 
647 	len = strlen(devname) + 1;
648 
649 	ret = -EFAULT;
650 	if (copy_to_user(optval, devname, len))
651 		goto out;
652 
653 zero:
654 	ret = -EFAULT;
655 	if (put_user(len, optlen))
656 		goto out;
657 
658 	ret = 0;
659 
660 out:
661 #endif
662 
663 	return ret;
664 }
665 
666 static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
667 {
668 	if (valbool)
669 		sock_set_flag(sk, bit);
670 	else
671 		sock_reset_flag(sk, bit);
672 }
673 
674 bool sk_mc_loop(struct sock *sk)
675 {
676 	if (dev_recursion_level())
677 		return false;
678 	if (!sk)
679 		return true;
680 	switch (sk->sk_family) {
681 	case AF_INET:
682 		return inet_sk(sk)->mc_loop;
683 #if IS_ENABLED(CONFIG_IPV6)
684 	case AF_INET6:
685 		return inet6_sk(sk)->mc_loop;
686 #endif
687 	}
688 	WARN_ON(1);
689 	return true;
690 }
691 EXPORT_SYMBOL(sk_mc_loop);
692 
693 /*
694  *	This is meant for all protocols to use and covers goings on
695  *	at the socket level. Everything here is generic.
696  */
697 
698 int sock_setsockopt(struct socket *sock, int level, int optname,
699 		    char __user *optval, unsigned int optlen)
700 {
701 	struct sock_txtime sk_txtime;
702 	struct sock *sk = sock->sk;
703 	int val;
704 	int valbool;
705 	struct linger ling;
706 	int ret = 0;
707 
708 	/*
709 	 *	Options without arguments
710 	 */
711 
712 	if (optname == SO_BINDTODEVICE)
713 		return sock_setbindtodevice(sk, optval, optlen);
714 
715 	if (optlen < sizeof(int))
716 		return -EINVAL;
717 
718 	if (get_user(val, (int __user *)optval))
719 		return -EFAULT;
720 
721 	valbool = val ? 1 : 0;
722 
723 	lock_sock(sk);
724 
725 	switch (optname) {
726 	case SO_DEBUG:
727 		if (val && !capable(CAP_NET_ADMIN))
728 			ret = -EACCES;
729 		else
730 			sock_valbool_flag(sk, SOCK_DBG, valbool);
731 		break;
732 	case SO_REUSEADDR:
733 		sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
734 		break;
735 	case SO_REUSEPORT:
736 		sk->sk_reuseport = valbool;
737 		break;
738 	case SO_TYPE:
739 	case SO_PROTOCOL:
740 	case SO_DOMAIN:
741 	case SO_ERROR:
742 		ret = -ENOPROTOOPT;
743 		break;
744 	case SO_DONTROUTE:
745 		sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
746 		break;
747 	case SO_BROADCAST:
748 		sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
749 		break;
750 	case SO_SNDBUF:
751 		/* Don't return an error on this; BSD doesn't, and if you
752 		 * think about it, this is right. Otherwise apps have to
753 		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
754 		 * are treated in BSD as hints.
755 		 */
756 		val = min_t(u32, val, sysctl_wmem_max);
757 set_sndbuf:
758 		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
759 		sk->sk_sndbuf = max_t(int, val * 2, SOCK_MIN_SNDBUF);
760 		/* Wake up sending tasks if we upped the value. */
761 		sk->sk_write_space(sk);
762 		break;
763 
764 	case SO_SNDBUFFORCE:
765 		if (!capable(CAP_NET_ADMIN)) {
766 			ret = -EPERM;
767 			break;
768 		}
769 		goto set_sndbuf;
770 
771 	case SO_RCVBUF:
772 		/* Don't return an error on this; BSD doesn't, and if you
773 		 * think about it, this is right. Otherwise apps have to
774 		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
775 		 * are treated in BSD as hints.
776 		 */
777 		val = min_t(u32, val, sysctl_rmem_max);
778 set_rcvbuf:
779 		sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
780 		/*
781 		 * We double it on the way in to account for
782 		 * "struct sk_buff" etc. overhead.   Applications
783 		 * assume that the SO_RCVBUF setting they make will
784 		 * allow that much actual data to be received on that
785 		 * socket.
786 		 *
787 		 * Applications are unaware that "struct sk_buff" and
788 		 * other overheads allocate from the receive buffer
789 		 * during socket buffer allocation.
790 		 *
791 		 * And after considering the possible alternatives,
792 		 * returning the value we actually used in getsockopt
793 		 * is the most desirable behavior.
794 		 */
795 		sk->sk_rcvbuf = max_t(int, val * 2, SOCK_MIN_RCVBUF);
796 		break;
797 
798 	case SO_RCVBUFFORCE:
799 		if (!capable(CAP_NET_ADMIN)) {
800 			ret = -EPERM;
801 			break;
802 		}
803 		goto set_rcvbuf;
804 
805 	case SO_KEEPALIVE:
806 		if (sk->sk_prot->keepalive)
807 			sk->sk_prot->keepalive(sk, valbool);
808 		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
809 		break;
810 
811 	case SO_OOBINLINE:
812 		sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
813 		break;
814 
815 	case SO_NO_CHECK:
816 		sk->sk_no_check_tx = valbool;
817 		break;
818 
819 	case SO_PRIORITY:
820 		if ((val >= 0 && val <= 6) ||
821 		    ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
822 			sk->sk_priority = val;
823 		else
824 			ret = -EPERM;
825 		break;
826 
827 	case SO_LINGER:
828 		if (optlen < sizeof(ling)) {
829 			ret = -EINVAL;	/* 1003.1g */
830 			break;
831 		}
832 		if (copy_from_user(&ling, optval, sizeof(ling))) {
833 			ret = -EFAULT;
834 			break;
835 		}
836 		if (!ling.l_onoff)
837 			sock_reset_flag(sk, SOCK_LINGER);
838 		else {
839 #if (BITS_PER_LONG == 32)
840 			if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
841 				sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
842 			else
843 #endif
844 				sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
845 			sock_set_flag(sk, SOCK_LINGER);
846 		}
847 		break;
848 
849 	case SO_BSDCOMPAT:
850 		sock_warn_obsolete_bsdism("setsockopt");
851 		break;
852 
853 	case SO_PASSCRED:
854 		if (valbool)
855 			set_bit(SOCK_PASSCRED, &sock->flags);
856 		else
857 			clear_bit(SOCK_PASSCRED, &sock->flags);
858 		break;
859 
860 	case SO_TIMESTAMP:
861 	case SO_TIMESTAMPNS:
862 		if (valbool)  {
863 			if (optname == SO_TIMESTAMP)
864 				sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
865 			else
866 				sock_set_flag(sk, SOCK_RCVTSTAMPNS);
867 			sock_set_flag(sk, SOCK_RCVTSTAMP);
868 			sock_enable_timestamp(sk, SOCK_TIMESTAMP);
869 		} else {
870 			sock_reset_flag(sk, SOCK_RCVTSTAMP);
871 			sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
872 		}
873 		break;
874 
875 	case SO_TIMESTAMPING:
876 		if (val & ~SOF_TIMESTAMPING_MASK) {
877 			ret = -EINVAL;
878 			break;
879 		}
880 
881 		if (val & SOF_TIMESTAMPING_OPT_ID &&
882 		    !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
883 			if (sk->sk_protocol == IPPROTO_TCP &&
884 			    sk->sk_type == SOCK_STREAM) {
885 				if ((1 << sk->sk_state) &
886 				    (TCPF_CLOSE | TCPF_LISTEN)) {
887 					ret = -EINVAL;
888 					break;
889 				}
890 				sk->sk_tskey = tcp_sk(sk)->snd_una;
891 			} else {
892 				sk->sk_tskey = 0;
893 			}
894 		}
895 
896 		if (val & SOF_TIMESTAMPING_OPT_STATS &&
897 		    !(val & SOF_TIMESTAMPING_OPT_TSONLY)) {
898 			ret = -EINVAL;
899 			break;
900 		}
901 
902 		sk->sk_tsflags = val;
903 		if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
904 			sock_enable_timestamp(sk,
905 					      SOCK_TIMESTAMPING_RX_SOFTWARE);
906 		else
907 			sock_disable_timestamp(sk,
908 					       (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
909 		break;
910 
911 	case SO_RCVLOWAT:
912 		if (val < 0)
913 			val = INT_MAX;
914 		if (sock->ops->set_rcvlowat)
915 			ret = sock->ops->set_rcvlowat(sk, val);
916 		else
917 			sk->sk_rcvlowat = val ? : 1;
918 		break;
919 
920 	case SO_RCVTIMEO:
921 		ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
922 		break;
923 
924 	case SO_SNDTIMEO:
925 		ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
926 		break;
927 
928 	case SO_ATTACH_FILTER:
929 		ret = -EINVAL;
930 		if (optlen == sizeof(struct sock_fprog)) {
931 			struct sock_fprog fprog;
932 
933 			ret = -EFAULT;
934 			if (copy_from_user(&fprog, optval, sizeof(fprog)))
935 				break;
936 
937 			ret = sk_attach_filter(&fprog, sk);
938 		}
939 		break;
940 
941 	case SO_ATTACH_BPF:
942 		ret = -EINVAL;
943 		if (optlen == sizeof(u32)) {
944 			u32 ufd;
945 
946 			ret = -EFAULT;
947 			if (copy_from_user(&ufd, optval, sizeof(ufd)))
948 				break;
949 
950 			ret = sk_attach_bpf(ufd, sk);
951 		}
952 		break;
953 
954 	case SO_ATTACH_REUSEPORT_CBPF:
955 		ret = -EINVAL;
956 		if (optlen == sizeof(struct sock_fprog)) {
957 			struct sock_fprog fprog;
958 
959 			ret = -EFAULT;
960 			if (copy_from_user(&fprog, optval, sizeof(fprog)))
961 				break;
962 
963 			ret = sk_reuseport_attach_filter(&fprog, sk);
964 		}
965 		break;
966 
967 	case SO_ATTACH_REUSEPORT_EBPF:
968 		ret = -EINVAL;
969 		if (optlen == sizeof(u32)) {
970 			u32 ufd;
971 
972 			ret = -EFAULT;
973 			if (copy_from_user(&ufd, optval, sizeof(ufd)))
974 				break;
975 
976 			ret = sk_reuseport_attach_bpf(ufd, sk);
977 		}
978 		break;
979 
980 	case SO_DETACH_FILTER:
981 		ret = sk_detach_filter(sk);
982 		break;
983 
984 	case SO_LOCK_FILTER:
985 		if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
986 			ret = -EPERM;
987 		else
988 			sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
989 		break;
990 
991 	case SO_PASSSEC:
992 		if (valbool)
993 			set_bit(SOCK_PASSSEC, &sock->flags);
994 		else
995 			clear_bit(SOCK_PASSSEC, &sock->flags);
996 		break;
997 	case SO_MARK:
998 		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
999 			ret = -EPERM;
1000 		else
1001 			sk->sk_mark = val;
1002 		break;
1003 
1004 	case SO_RXQ_OVFL:
1005 		sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
1006 		break;
1007 
1008 	case SO_WIFI_STATUS:
1009 		sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
1010 		break;
1011 
1012 	case SO_PEEK_OFF:
1013 		if (sock->ops->set_peek_off)
1014 			ret = sock->ops->set_peek_off(sk, val);
1015 		else
1016 			ret = -EOPNOTSUPP;
1017 		break;
1018 
1019 	case SO_NOFCS:
1020 		sock_valbool_flag(sk, SOCK_NOFCS, valbool);
1021 		break;
1022 
1023 	case SO_SELECT_ERR_QUEUE:
1024 		sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
1025 		break;
1026 
1027 #ifdef CONFIG_NET_RX_BUSY_POLL
1028 	case SO_BUSY_POLL:
1029 		/* allow unprivileged users to decrease the value */
1030 		if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
1031 			ret = -EPERM;
1032 		else {
1033 			if (val < 0)
1034 				ret = -EINVAL;
1035 			else
1036 				sk->sk_ll_usec = val;
1037 		}
1038 		break;
1039 #endif
1040 
1041 	case SO_MAX_PACING_RATE:
1042 		if (val != ~0U)
1043 			cmpxchg(&sk->sk_pacing_status,
1044 				SK_PACING_NONE,
1045 				SK_PACING_NEEDED);
1046 		sk->sk_max_pacing_rate = val;
1047 		sk->sk_pacing_rate = min(sk->sk_pacing_rate,
1048 					 sk->sk_max_pacing_rate);
1049 		break;
1050 
1051 	case SO_INCOMING_CPU:
1052 		sk->sk_incoming_cpu = val;
1053 		break;
1054 
1055 	case SO_CNX_ADVICE:
1056 		if (val == 1)
1057 			dst_negative_advice(sk);
1058 		break;
1059 
1060 	case SO_ZEROCOPY:
1061 		if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
1062 			if (sk->sk_protocol != IPPROTO_TCP)
1063 				ret = -ENOTSUPP;
1064 		} else if (sk->sk_family != PF_RDS) {
1065 			ret = -ENOTSUPP;
1066 		}
1067 		if (!ret) {
1068 			if (val < 0 || val > 1)
1069 				ret = -EINVAL;
1070 			else
1071 				sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
1072 		}
1073 		break;
1074 
1075 	case SO_TXTIME:
1076 		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1077 			ret = -EPERM;
1078 		} else if (optlen != sizeof(struct sock_txtime)) {
1079 			ret = -EINVAL;
1080 		} else if (copy_from_user(&sk_txtime, optval,
1081 			   sizeof(struct sock_txtime))) {
1082 			ret = -EFAULT;
1083 		} else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) {
1084 			ret = -EINVAL;
1085 		} else {
1086 			sock_valbool_flag(sk, SOCK_TXTIME, true);
1087 			sk->sk_clockid = sk_txtime.clockid;
1088 			sk->sk_txtime_deadline_mode =
1089 				!!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE);
1090 			sk->sk_txtime_report_errors =
1091 				!!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS);
1092 		}
1093 		break;
1094 
1095 	default:
1096 		ret = -ENOPROTOOPT;
1097 		break;
1098 	}
1099 	release_sock(sk);
1100 	return ret;
1101 }
1102 EXPORT_SYMBOL(sock_setsockopt);
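
/* Illustrative sketch (editorial addition, not part of the original file):
 * the SO_RCVBUF/SO_SNDBUF doubling described above is visible from user
 * space; the value read back is twice what was requested (clamped by
 * sysctl_rmem_max and SOCK_MIN_RCVBUF):
 *
 *	int req = 64 * 1024, got;
 *	socklen_t len = sizeof(got);
 *
 *	setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &req, sizeof(req));
 *	getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &got, &len);
 *	// got is now typically 2 * req
 */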
1103 
1104 
1105 static void cred_to_ucred(struct pid *pid, const struct cred *cred,
1106 			  struct ucred *ucred)
1107 {
1108 	ucred->pid = pid_vnr(pid);
1109 	ucred->uid = ucred->gid = -1;
1110 	if (cred) {
1111 		struct user_namespace *current_ns = current_user_ns();
1112 
1113 		ucred->uid = from_kuid_munged(current_ns, cred->euid);
1114 		ucred->gid = from_kgid_munged(current_ns, cred->egid);
1115 	}
1116 }
1117 
1118 static int groups_to_user(gid_t __user *dst, const struct group_info *src)
1119 {
1120 	struct user_namespace *user_ns = current_user_ns();
1121 	int i;
1122 
1123 	for (i = 0; i < src->ngroups; i++)
1124 		if (put_user(from_kgid_munged(user_ns, src->gid[i]), dst + i))
1125 			return -EFAULT;
1126 
1127 	return 0;
1128 }
1129 
1130 int sock_getsockopt(struct socket *sock, int level, int optname,
1131 		    char __user *optval, int __user *optlen)
1132 {
1133 	struct sock *sk = sock->sk;
1134 
1135 	union {
1136 		int val;
1137 		u64 val64;
1138 		struct linger ling;
1139 		struct timeval tm;
1140 		struct sock_txtime txtime;
1141 	} v;
1142 
1143 	int lv = sizeof(int);
1144 	int len;
1145 
1146 	if (get_user(len, optlen))
1147 		return -EFAULT;
1148 	if (len < 0)
1149 		return -EINVAL;
1150 
1151 	memset(&v, 0, sizeof(v));
1152 
1153 	switch (optname) {
1154 	case SO_DEBUG:
1155 		v.val = sock_flag(sk, SOCK_DBG);
1156 		break;
1157 
1158 	case SO_DONTROUTE:
1159 		v.val = sock_flag(sk, SOCK_LOCALROUTE);
1160 		break;
1161 
1162 	case SO_BROADCAST:
1163 		v.val = sock_flag(sk, SOCK_BROADCAST);
1164 		break;
1165 
1166 	case SO_SNDBUF:
1167 		v.val = sk->sk_sndbuf;
1168 		break;
1169 
1170 	case SO_RCVBUF:
1171 		v.val = sk->sk_rcvbuf;
1172 		break;
1173 
1174 	case SO_REUSEADDR:
1175 		v.val = sk->sk_reuse;
1176 		break;
1177 
1178 	case SO_REUSEPORT:
1179 		v.val = sk->sk_reuseport;
1180 		break;
1181 
1182 	case SO_KEEPALIVE:
1183 		v.val = sock_flag(sk, SOCK_KEEPOPEN);
1184 		break;
1185 
1186 	case SO_TYPE:
1187 		v.val = sk->sk_type;
1188 		break;
1189 
1190 	case SO_PROTOCOL:
1191 		v.val = sk->sk_protocol;
1192 		break;
1193 
1194 	case SO_DOMAIN:
1195 		v.val = sk->sk_family;
1196 		break;
1197 
1198 	case SO_ERROR:
1199 		v.val = -sock_error(sk);
1200 		if (v.val == 0)
1201 			v.val = xchg(&sk->sk_err_soft, 0);
1202 		break;
1203 
1204 	case SO_OOBINLINE:
1205 		v.val = sock_flag(sk, SOCK_URGINLINE);
1206 		break;
1207 
1208 	case SO_NO_CHECK:
1209 		v.val = sk->sk_no_check_tx;
1210 		break;
1211 
1212 	case SO_PRIORITY:
1213 		v.val = sk->sk_priority;
1214 		break;
1215 
1216 	case SO_LINGER:
1217 		lv		= sizeof(v.ling);
1218 		v.ling.l_onoff	= sock_flag(sk, SOCK_LINGER);
1219 		v.ling.l_linger	= sk->sk_lingertime / HZ;
1220 		break;
1221 
1222 	case SO_BSDCOMPAT:
1223 		sock_warn_obsolete_bsdism("getsockopt");
1224 		break;
1225 
1226 	case SO_TIMESTAMP:
1227 		v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1228 				!sock_flag(sk, SOCK_RCVTSTAMPNS);
1229 		break;
1230 
1231 	case SO_TIMESTAMPNS:
1232 		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
1233 		break;
1234 
1235 	case SO_TIMESTAMPING:
1236 		v.val = sk->sk_tsflags;
1237 		break;
1238 
1239 	case SO_RCVTIMEO:
1240 		lv = sizeof(struct timeval);
1241 		if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
1242 			v.tm.tv_sec = 0;
1243 			v.tm.tv_usec = 0;
1244 		} else {
1245 			v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
1246 			v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * USEC_PER_SEC) / HZ;
1247 		}
1248 		break;
1249 
1250 	case SO_SNDTIMEO:
1251 		lv = sizeof(struct timeval);
1252 		if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
1253 			v.tm.tv_sec = 0;
1254 			v.tm.tv_usec = 0;
1255 		} else {
1256 			v.tm.tv_sec = sk->sk_sndtimeo / HZ;
1257 			v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * USEC_PER_SEC) / HZ;
1258 		}
1259 		break;
1260 
1261 	case SO_RCVLOWAT:
1262 		v.val = sk->sk_rcvlowat;
1263 		break;
1264 
1265 	case SO_SNDLOWAT:
1266 		v.val = 1;
1267 		break;
1268 
1269 	case SO_PASSCRED:
1270 		v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1271 		break;
1272 
1273 	case SO_PEERCRED:
1274 	{
1275 		struct ucred peercred;
1276 		if (len > sizeof(peercred))
1277 			len = sizeof(peercred);
1278 		cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1279 		if (copy_to_user(optval, &peercred, len))
1280 			return -EFAULT;
1281 		goto lenout;
1282 	}
1283 
1284 	case SO_PEERGROUPS:
1285 	{
1286 		int ret, n;
1287 
1288 		if (!sk->sk_peer_cred)
1289 			return -ENODATA;
1290 
1291 		n = sk->sk_peer_cred->group_info->ngroups;
1292 		if (len < n * sizeof(gid_t)) {
1293 			len = n * sizeof(gid_t);
1294 			return put_user(len, optlen) ? -EFAULT : -ERANGE;
1295 		}
1296 		len = n * sizeof(gid_t);
1297 
1298 		ret = groups_to_user((gid_t __user *)optval,
1299 				     sk->sk_peer_cred->group_info);
1300 		if (ret)
1301 			return ret;
1302 		goto lenout;
1303 	}
1304 
1305 	case SO_PEERNAME:
1306 	{
1307 		char address[128];
1308 
1309 		lv = sock->ops->getname(sock, (struct sockaddr *)address, 2);
1310 		if (lv < 0)
1311 			return -ENOTCONN;
1312 		if (lv < len)
1313 			return -EINVAL;
1314 		if (copy_to_user(optval, address, len))
1315 			return -EFAULT;
1316 		goto lenout;
1317 	}
1318 
1319 	/* Dubious BSD thing... Probably nobody even uses it, but
1320 	 * the UNIX standard wants it for whatever reason... -DaveM
1321 	 */
1322 	case SO_ACCEPTCONN:
1323 		v.val = sk->sk_state == TCP_LISTEN;
1324 		break;
1325 
1326 	case SO_PASSSEC:
1327 		v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1328 		break;
1329 
1330 	case SO_PEERSEC:
1331 		return security_socket_getpeersec_stream(sock, optval, optlen, len);
1332 
1333 	case SO_MARK:
1334 		v.val = sk->sk_mark;
1335 		break;
1336 
1337 	case SO_RXQ_OVFL:
1338 		v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1339 		break;
1340 
1341 	case SO_WIFI_STATUS:
1342 		v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1343 		break;
1344 
1345 	case SO_PEEK_OFF:
1346 		if (!sock->ops->set_peek_off)
1347 			return -EOPNOTSUPP;
1348 
1349 		v.val = sk->sk_peek_off;
1350 		break;
1351 	case SO_NOFCS:
1352 		v.val = sock_flag(sk, SOCK_NOFCS);
1353 		break;
1354 
1355 	case SO_BINDTODEVICE:
1356 		return sock_getbindtodevice(sk, optval, optlen, len);
1357 
1358 	case SO_GET_FILTER:
1359 		len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
1360 		if (len < 0)
1361 			return len;
1362 
1363 		goto lenout;
1364 
1365 	case SO_LOCK_FILTER:
1366 		v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1367 		break;
1368 
1369 	case SO_BPF_EXTENSIONS:
1370 		v.val = bpf_tell_extensions();
1371 		break;
1372 
1373 	case SO_SELECT_ERR_QUEUE:
1374 		v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1375 		break;
1376 
1377 #ifdef CONFIG_NET_RX_BUSY_POLL
1378 	case SO_BUSY_POLL:
1379 		v.val = sk->sk_ll_usec;
1380 		break;
1381 #endif
1382 
1383 	case SO_MAX_PACING_RATE:
1384 		v.val = sk->sk_max_pacing_rate;
1385 		break;
1386 
1387 	case SO_INCOMING_CPU:
1388 		v.val = sk->sk_incoming_cpu;
1389 		break;
1390 
1391 	case SO_MEMINFO:
1392 	{
1393 		u32 meminfo[SK_MEMINFO_VARS];
1394 
1395 		if (get_user(len, optlen))
1396 			return -EFAULT;
1397 
1398 		sk_get_meminfo(sk, meminfo);
1399 
1400 		len = min_t(unsigned int, len, sizeof(meminfo));
1401 		if (copy_to_user(optval, &meminfo, len))
1402 			return -EFAULT;
1403 
1404 		goto lenout;
1405 	}
1406 
1407 #ifdef CONFIG_NET_RX_BUSY_POLL
1408 	case SO_INCOMING_NAPI_ID:
1409 		v.val = READ_ONCE(sk->sk_napi_id);
1410 
1411 		/* aggregate non-NAPI IDs down to 0 */
1412 		if (v.val < MIN_NAPI_ID)
1413 			v.val = 0;
1414 
1415 		break;
1416 #endif
1417 
1418 	case SO_COOKIE:
1419 		lv = sizeof(u64);
1420 		if (len < lv)
1421 			return -EINVAL;
1422 		v.val64 = sock_gen_cookie(sk);
1423 		break;
1424 
1425 	case SO_ZEROCOPY:
1426 		v.val = sock_flag(sk, SOCK_ZEROCOPY);
1427 		break;
1428 
1429 	case SO_TXTIME:
1430 		lv = sizeof(v.txtime);
1431 		v.txtime.clockid = sk->sk_clockid;
1432 		v.txtime.flags |= sk->sk_txtime_deadline_mode ?
1433 				  SOF_TXTIME_DEADLINE_MODE : 0;
1434 		v.txtime.flags |= sk->sk_txtime_report_errors ?
1435 				  SOF_TXTIME_REPORT_ERRORS : 0;
1436 		break;
1437 
1438 	default:
1439 		/* We implement the SO_SNDLOWAT etc to not be settable
1440 		 * (1003.1g 7).
1441 		 */
1442 		return -ENOPROTOOPT;
1443 	}
1444 
1445 	if (len > lv)
1446 		len = lv;
1447 	if (copy_to_user(optval, &v, len))
1448 		return -EFAULT;
1449 lenout:
1450 	if (put_user(len, optlen))
1451 		return -EFAULT;
1452 	return 0;
1453 }
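
/* Illustrative sketch (editorial addition, not part of the original file):
 * a user-space peer-credential query against the SO_PEERCRED branch above
 * (meaningful for AF_UNIX sockets) looks like:
 *
 *	struct ucred cred;
 *	socklen_t len = sizeof(cred);
 *
 *	if (getsockopt(fd, SOL_SOCKET, SO_PEERCRED, &cred, &len) == 0)
 *		printf("peer pid=%d uid=%u gid=%u\n",
 *		       cred.pid, cred.uid, cred.gid);
 */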
1454 
1455 /*
1456  * Initialize an sk_lock.
1457  *
1458  * (We also register the sk_lock with the lock validator.)
1459  */
1460 static inline void sock_lock_init(struct sock *sk)
1461 {
1462 	if (sk->sk_kern_sock)
1463 		sock_lock_init_class_and_name(
1464 			sk,
1465 			af_family_kern_slock_key_strings[sk->sk_family],
1466 			af_family_kern_slock_keys + sk->sk_family,
1467 			af_family_kern_key_strings[sk->sk_family],
1468 			af_family_kern_keys + sk->sk_family);
1469 	else
1470 		sock_lock_init_class_and_name(
1471 			sk,
1472 			af_family_slock_key_strings[sk->sk_family],
1473 			af_family_slock_keys + sk->sk_family,
1474 			af_family_key_strings[sk->sk_family],
1475 			af_family_keys + sk->sk_family);
1476 }
1477 
1478 /*
1479  * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
1480  * even temporarily, because of RCU lookups. sk_node should also be left as is.
1481  * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end.
1482  */
1483 static void sock_copy(struct sock *nsk, const struct sock *osk)
1484 {
1485 #ifdef CONFIG_SECURITY_NETWORK
1486 	void *sptr = nsk->sk_security;
1487 #endif
1488 	memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1489 
1490 	memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1491 	       osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1492 
1493 #ifdef CONFIG_SECURITY_NETWORK
1494 	nsk->sk_security = sptr;
1495 	security_sk_clone(osk, nsk);
1496 #endif
1497 }
1498 
1499 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1500 		int family)
1501 {
1502 	struct sock *sk;
1503 	struct kmem_cache *slab;
1504 
1505 	slab = prot->slab;
1506 	if (slab != NULL) {
1507 		sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1508 		if (!sk)
1509 			return sk;
1510 		if (priority & __GFP_ZERO)
1511 			sk_prot_clear_nulls(sk, prot->obj_size);
1512 	} else
1513 		sk = kmalloc(prot->obj_size, priority);
1514 
1515 	if (sk != NULL) {
1516 		if (security_sk_alloc(sk, family, priority))
1517 			goto out_free;
1518 
1519 		if (!try_module_get(prot->owner))
1520 			goto out_free_sec;
1521 		sk_tx_queue_clear(sk);
1522 	}
1523 
1524 	return sk;
1525 
1526 out_free_sec:
1527 	security_sk_free(sk);
1528 out_free:
1529 	if (slab != NULL)
1530 		kmem_cache_free(slab, sk);
1531 	else
1532 		kfree(sk);
1533 	return NULL;
1534 }
1535 
1536 static void sk_prot_free(struct proto *prot, struct sock *sk)
1537 {
1538 	struct kmem_cache *slab;
1539 	struct module *owner;
1540 
1541 	owner = prot->owner;
1542 	slab = prot->slab;
1543 
1544 	cgroup_sk_free(&sk->sk_cgrp_data);
1545 	mem_cgroup_sk_free(sk);
1546 	security_sk_free(sk);
1547 	if (slab != NULL)
1548 		kmem_cache_free(slab, sk);
1549 	else
1550 		kfree(sk);
1551 	module_put(owner);
1552 }
1553 
1554 /**
1555  *	sk_alloc - All socket objects are allocated here
1556  *	@net: the applicable net namespace
1557  *	@family: protocol family
1558  *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1559  *	@prot: struct proto associated with this new sock instance
1560  *	@kern: is this to be a kernel socket?
1561  */
1562 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1563 		      struct proto *prot, int kern)
1564 {
1565 	struct sock *sk;
1566 
1567 	sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1568 	if (sk) {
1569 		sk->sk_family = family;
1570 		/*
1571 		 * See comment in struct sock definition to understand
1572 		 * why we need sk_prot_creator -acme
1573 		 */
1574 		sk->sk_prot = sk->sk_prot_creator = prot;
1575 		sk->sk_kern_sock = kern;
1576 		sock_lock_init(sk);
1577 		sk->sk_net_refcnt = kern ? 0 : 1;
1578 		if (likely(sk->sk_net_refcnt)) {
1579 			get_net(net);
1580 			sock_inuse_add(net, 1);
1581 		}
1582 
1583 		sock_net_set(sk, net);
1584 		refcount_set(&sk->sk_wmem_alloc, 1);
1585 
1586 		mem_cgroup_sk_alloc(sk);
1587 		cgroup_sk_alloc(&sk->sk_cgrp_data);
1588 		sock_update_classid(&sk->sk_cgrp_data);
1589 		sock_update_netprioidx(&sk->sk_cgrp_data);
1590 	}
1591 
1592 	return sk;
1593 }
1594 EXPORT_SYMBOL(sk_alloc);
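
/* Illustrative sketch (editorial addition, not part of the original file):
 * a protocol family's ->create() handler is the usual caller of sk_alloc().
 * "my_proto" below is a hypothetical struct proto, and net/sock/kern are the
 * ->create() arguments:
 *
 *	sk = sk_alloc(net, PF_INET, GFP_KERNEL, &my_proto, kern);
 *	if (!sk)
 *		return -ENOBUFS;
 *	sock_init_data(sock, sk);
 */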
1595 
1596 /* Sockets having SOCK_RCU_FREE will call this function after one RCU
1597  * grace period. This is the case for UDP sockets and TCP listeners.
1598  */
1599 static void __sk_destruct(struct rcu_head *head)
1600 {
1601 	struct sock *sk = container_of(head, struct sock, sk_rcu);
1602 	struct sk_filter *filter;
1603 
1604 	if (sk->sk_destruct)
1605 		sk->sk_destruct(sk);
1606 
1607 	filter = rcu_dereference_check(sk->sk_filter,
1608 				       refcount_read(&sk->sk_wmem_alloc) == 0);
1609 	if (filter) {
1610 		sk_filter_uncharge(sk, filter);
1611 		RCU_INIT_POINTER(sk->sk_filter, NULL);
1612 	}
1613 	if (rcu_access_pointer(sk->sk_reuseport_cb))
1614 		reuseport_detach_sock(sk);
1615 
1616 	sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
1617 
1618 	if (atomic_read(&sk->sk_omem_alloc))
1619 		pr_debug("%s: optmem leakage (%d bytes) detected\n",
1620 			 __func__, atomic_read(&sk->sk_omem_alloc));
1621 
1622 	if (sk->sk_frag.page) {
1623 		put_page(sk->sk_frag.page);
1624 		sk->sk_frag.page = NULL;
1625 	}
1626 
1627 	if (sk->sk_peer_cred)
1628 		put_cred(sk->sk_peer_cred);
1629 	put_pid(sk->sk_peer_pid);
1630 	if (likely(sk->sk_net_refcnt))
1631 		put_net(sock_net(sk));
1632 	sk_prot_free(sk->sk_prot_creator, sk);
1633 }
1634 
1635 void sk_destruct(struct sock *sk)
1636 {
1637 	if (sock_flag(sk, SOCK_RCU_FREE))
1638 		call_rcu(&sk->sk_rcu, __sk_destruct);
1639 	else
1640 		__sk_destruct(&sk->sk_rcu);
1641 }
1642 
1643 static void __sk_free(struct sock *sk)
1644 {
1645 	if (likely(sk->sk_net_refcnt))
1646 		sock_inuse_add(sock_net(sk), -1);
1647 
1648 	if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk)))
1649 		sock_diag_broadcast_destroy(sk);
1650 	else
1651 		sk_destruct(sk);
1652 }
1653 
1654 void sk_free(struct sock *sk)
1655 {
1656 	/*
1657 	 * We subtract one from sk_wmem_alloc so we can tell whether
1658 	 * some packets are still in some tx queue.
1659 	 * If it is not zero, sock_wfree() will call __sk_free(sk) later.
1660 	 */
1661 	if (refcount_dec_and_test(&sk->sk_wmem_alloc))
1662 		__sk_free(sk);
1663 }
1664 EXPORT_SYMBOL(sk_free);
1665 
1666 static void sk_init_common(struct sock *sk)
1667 {
1668 	skb_queue_head_init(&sk->sk_receive_queue);
1669 	skb_queue_head_init(&sk->sk_write_queue);
1670 	skb_queue_head_init(&sk->sk_error_queue);
1671 
1672 	rwlock_init(&sk->sk_callback_lock);
1673 	lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
1674 			af_rlock_keys + sk->sk_family,
1675 			af_family_rlock_key_strings[sk->sk_family]);
1676 	lockdep_set_class_and_name(&sk->sk_write_queue.lock,
1677 			af_wlock_keys + sk->sk_family,
1678 			af_family_wlock_key_strings[sk->sk_family]);
1679 	lockdep_set_class_and_name(&sk->sk_error_queue.lock,
1680 			af_elock_keys + sk->sk_family,
1681 			af_family_elock_key_strings[sk->sk_family]);
1682 	lockdep_set_class_and_name(&sk->sk_callback_lock,
1683 			af_callback_keys + sk->sk_family,
1684 			af_family_clock_key_strings[sk->sk_family]);
1685 }
1686 
1687 /**
1688  *	sk_clone_lock - clone a socket, and lock its clone
1689  *	@sk: the socket to clone
1690  *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1691  *
1692  *	Caller must unlock socket even in error path (bh_unlock_sock(newsk))
1693  */
1694 struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
1695 {
1696 	struct sock *newsk;
1697 	bool is_charged = true;
1698 
1699 	newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
1700 	if (newsk != NULL) {
1701 		struct sk_filter *filter;
1702 
1703 		sock_copy(newsk, sk);
1704 
1705 		newsk->sk_prot_creator = sk->sk_prot;
1706 
1707 		/* SANITY */
1708 		if (likely(newsk->sk_net_refcnt))
1709 			get_net(sock_net(newsk));
1710 		sk_node_init(&newsk->sk_node);
1711 		sock_lock_init(newsk);
1712 		bh_lock_sock(newsk);
1713 		newsk->sk_backlog.head	= newsk->sk_backlog.tail = NULL;
1714 		newsk->sk_backlog.len = 0;
1715 
1716 		atomic_set(&newsk->sk_rmem_alloc, 0);
1717 		/*
1718 		 * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
1719 		 */
1720 		refcount_set(&newsk->sk_wmem_alloc, 1);
1721 		atomic_set(&newsk->sk_omem_alloc, 0);
1722 		sk_init_common(newsk);
1723 
1724 		newsk->sk_dst_cache	= NULL;
1725 		newsk->sk_dst_pending_confirm = 0;
1726 		newsk->sk_wmem_queued	= 0;
1727 		newsk->sk_forward_alloc = 0;
1728 		atomic_set(&newsk->sk_drops, 0);
1729 		newsk->sk_send_head	= NULL;
1730 		newsk->sk_userlocks	= sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
1731 		atomic_set(&newsk->sk_zckey, 0);
1732 
1733 		sock_reset_flag(newsk, SOCK_DONE);
1734 		mem_cgroup_sk_alloc(newsk);
1735 		cgroup_sk_alloc(&newsk->sk_cgrp_data);
1736 
1737 		rcu_read_lock();
1738 		filter = rcu_dereference(sk->sk_filter);
1739 		if (filter != NULL)
1740 			/* though it's an empty new sock, the charging may fail
1741 			 * if sysctl_optmem_max was changed between creation of
1742 			 * original socket and cloning
1743 			 */
1744 			is_charged = sk_filter_charge(newsk, filter);
1745 		RCU_INIT_POINTER(newsk->sk_filter, filter);
1746 		rcu_read_unlock();
1747 
1748 		if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
1749 			/* We need to make sure that we don't uncharge the new
1750 			 * socket if we couldn't charge it in the first place
1751 			 * as otherwise we uncharge the parent's filter.
1752 			 */
1753 			if (!is_charged)
1754 				RCU_INIT_POINTER(newsk->sk_filter, NULL);
1755 			sk_free_unlock_clone(newsk);
1756 			newsk = NULL;
1757 			goto out;
1758 		}
1759 		RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
1760 
1761 		newsk->sk_err	   = 0;
1762 		newsk->sk_err_soft = 0;
1763 		newsk->sk_priority = 0;
1764 		newsk->sk_incoming_cpu = raw_smp_processor_id();
1765 		atomic64_set(&newsk->sk_cookie, 0);
1766 		if (likely(newsk->sk_net_refcnt))
1767 			sock_inuse_add(sock_net(newsk), 1);
1768 
1769 		/*
1770 		 * Before updating sk_refcnt, we must commit prior changes to memory
1771 		 * (Documentation/RCU/rculist_nulls.txt for details)
1772 		 */
1773 		smp_wmb();
1774 		refcount_set(&newsk->sk_refcnt, 2);
1775 
1776 		/*
1777 		 * Increment the counter in the same struct proto as the master
1778 		 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
1779 		 * is the same as sk->sk_prot->socks, as this field was copied
1780 		 * with memcpy).
1781 		 *
1782 		 * This _changes_ the previous behaviour, where
1783 		 * tcp_create_openreq_child was always incrementing the
1784 		 * equivalent of tcp_prot->socks (inet_sock_nr), so this has
1785 		 * to be taken into account in all callers. -acme
1786 		 */
1787 		sk_refcnt_debug_inc(newsk);
1788 		sk_set_socket(newsk, NULL);
1789 		newsk->sk_wq = NULL;
1790 
1791 		if (newsk->sk_prot->sockets_allocated)
1792 			sk_sockets_allocated_inc(newsk);
1793 
1794 		if (sock_needs_netstamp(sk) &&
1795 		    newsk->sk_flags & SK_FLAGS_TIMESTAMP)
1796 			net_enable_timestamp();
1797 	}
1798 out:
1799 	return newsk;
1800 }
1801 EXPORT_SYMBOL_GPL(sk_clone_lock);
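
/* Illustrative sketch (editorial addition, not part of the original file):
 * the clone is returned bh-locked, so callers finish their own setup and
 * then drop the lock, roughly:
 *
 *	newsk = sk_clone_lock(sk, GFP_ATOMIC);
 *	if (!newsk)
 *		return NULL;
 *	...			// protocol-specific setup of newsk
 *	bh_unlock_sock(newsk);
 *	return newsk;
 */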
1802 
1803 void sk_free_unlock_clone(struct sock *sk)
1804 {
1805 	/* It is still a raw copy of the parent, so invalidate
1806 	 * its destructor and do a plain sk_free() */
1807 	sk->sk_destruct = NULL;
1808 	bh_unlock_sock(sk);
1809 	sk_free(sk);
1810 }
1811 EXPORT_SYMBOL_GPL(sk_free_unlock_clone);
1812 
1813 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1814 {
1815 	u32 max_segs = 1;
1816 
1817 	sk_dst_set(sk, dst);
1818 	sk->sk_route_caps = dst->dev->features | sk->sk_route_forced_caps;
1819 	if (sk->sk_route_caps & NETIF_F_GSO)
1820 		sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
1821 	sk->sk_route_caps &= ~sk->sk_route_nocaps;
1822 	if (sk_can_gso(sk)) {
1823 		if (dst->header_len && !xfrm_dst_offload_ok(dst)) {
1824 			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1825 		} else {
1826 			sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
1827 			sk->sk_gso_max_size = dst->dev->gso_max_size;
1828 			max_segs = max_t(u32, dst->dev->gso_max_segs, 1);
1829 		}
1830 	}
1831 	sk->sk_gso_max_segs = max_segs;
1832 }
1833 EXPORT_SYMBOL_GPL(sk_setup_caps);
1834 
1835 /*
1836  *	Simple resource managers for sockets.
1837  */
1838 
1839 
1840 /*
1841  * Write buffer destructor automatically called from kfree_skb.
1842  */
1843 void sock_wfree(struct sk_buff *skb)
1844 {
1845 	struct sock *sk = skb->sk;
1846 	unsigned int len = skb->truesize;
1847 
1848 	if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
1849 		/*
1850 		 * Keep a reference on sk_wmem_alloc; it will be released
1851 		 * after the sk_write_space() call
1852 		 */
1853 		WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
1854 		sk->sk_write_space(sk);
1855 		len = 1;
1856 	}
1857 	/*
1858 	 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
1859 	 * could not do because of in-flight packets
1860 	 */
1861 	if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
1862 		__sk_free(sk);
1863 }
1864 EXPORT_SYMBOL(sock_wfree);
1865 
1866 /* This variant of sock_wfree() is used by TCP,
1867  * since it sets SOCK_USE_WRITE_QUEUE.
1868  */
1869 void __sock_wfree(struct sk_buff *skb)
1870 {
1871 	struct sock *sk = skb->sk;
1872 
1873 	if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
1874 		__sk_free(sk);
1875 }
1876 
1877 void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
1878 {
1879 	skb_orphan(skb);
1880 	skb->sk = sk;
1881 #ifdef CONFIG_INET
1882 	if (unlikely(!sk_fullsock(sk))) {
1883 		skb->destructor = sock_edemux;
1884 		sock_hold(sk);
1885 		return;
1886 	}
1887 #endif
1888 	skb->destructor = sock_wfree;
1889 	skb_set_hash_from_sk(skb, sk);
1890 	/*
1891 	 * We used to take a refcount on sk, but the following operation
1892 	 * is enough to guarantee sk_free() won't free this sock until
1893 	 * all in-flight packets are completed
1894 	 */
1895 	refcount_add(skb->truesize, &sk->sk_wmem_alloc);
1896 }
1897 EXPORT_SYMBOL(skb_set_owner_w);
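
/* Illustrative sketch (editorial addition, not part of the original file):
 * skb_set_owner_w() pairs with sock_wfree(); once an skb is charged this
 * way, kfree_skb() (or TX completion) releases the charge and may wake the
 * writer:
 *
 *	skb = alloc_skb(128, sk->sk_allocation);
 *	if (!skb)
 *		return -ENOMEM;
 *	skb_set_owner_w(skb, sk);	// adds skb->truesize to sk_wmem_alloc
 */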
1898 
1899 /* This helper is used by netem, as it can hold packets in its
1900  * delay queue. We want to allow the owner socket to send more
1901  * packets, as if they were already TX completed by a typical driver.
1902  * But we also want to keep skb->sk set because some packet schedulers
1903  * rely on it (sch_fq for example).
1904  */
1905 void skb_orphan_partial(struct sk_buff *skb)
1906 {
1907 	if (skb_is_tcp_pure_ack(skb))
1908 		return;
1909 
1910 	if (skb->destructor == sock_wfree
1911 #ifdef CONFIG_INET
1912 	    || skb->destructor == tcp_wfree
1913 #endif
1914 		) {
1915 		struct sock *sk = skb->sk;
1916 
1917 		if (refcount_inc_not_zero(&sk->sk_refcnt)) {
1918 			WARN_ON(refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc));
1919 			skb->destructor = sock_efree;
1920 		}
1921 	} else {
1922 		skb_orphan(skb);
1923 	}
1924 }
1925 EXPORT_SYMBOL(skb_orphan_partial);
1926 
1927 /*
1928  * Read buffer destructor automatically called from kfree_skb.
1929  */
1930 void sock_rfree(struct sk_buff *skb)
1931 {
1932 	struct sock *sk = skb->sk;
1933 	unsigned int len = skb->truesize;
1934 
1935 	atomic_sub(len, &sk->sk_rmem_alloc);
1936 	sk_mem_uncharge(sk, len);
1937 }
1938 EXPORT_SYMBOL(sock_rfree);
1939 
1940 /*
1941  * Buffer destructor for skbs that are not used directly in read or write
1942  * path, e.g. for error handler skbs. Automatically called from kfree_skb.
1943  */
1944 void sock_efree(struct sk_buff *skb)
1945 {
1946 	sock_put(skb->sk);
1947 }
1948 EXPORT_SYMBOL(sock_efree);
1949 
1950 kuid_t sock_i_uid(struct sock *sk)
1951 {
1952 	kuid_t uid;
1953 
1954 	read_lock_bh(&sk->sk_callback_lock);
1955 	uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
1956 	read_unlock_bh(&sk->sk_callback_lock);
1957 	return uid;
1958 }
1959 EXPORT_SYMBOL(sock_i_uid);
1960 
1961 unsigned long sock_i_ino(struct sock *sk)
1962 {
1963 	unsigned long ino;
1964 
1965 	read_lock_bh(&sk->sk_callback_lock);
1966 	ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
1967 	read_unlock_bh(&sk->sk_callback_lock);
1968 	return ino;
1969 }
1970 EXPORT_SYMBOL(sock_i_ino);
1971 
1972 /*
1973  * Allocate a skb from the socket's send buffer.
1974  */
1975 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
1976 			     gfp_t priority)
1977 {
1978 	if (force || refcount_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1979 		struct sk_buff *skb = alloc_skb(size, priority);
1980 		if (skb) {
1981 			skb_set_owner_w(skb, sk);
1982 			return skb;
1983 		}
1984 	}
1985 	return NULL;
1986 }
1987 EXPORT_SYMBOL(sock_wmalloc);
1988 
1989 static void sock_ofree(struct sk_buff *skb)
1990 {
1991 	struct sock *sk = skb->sk;
1992 
1993 	atomic_sub(skb->truesize, &sk->sk_omem_alloc);
1994 }
1995 
1996 struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
1997 			     gfp_t priority)
1998 {
1999 	struct sk_buff *skb;
2000 
2001 	/* small safe race: SKB_TRUESIZE may differ from final skb->truesize */
2002 	if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) >
2003 	    sysctl_optmem_max)
2004 		return NULL;
2005 
2006 	skb = alloc_skb(size, priority);
2007 	if (!skb)
2008 		return NULL;
2009 
2010 	atomic_add(skb->truesize, &sk->sk_omem_alloc);
2011 	skb->sk = sk;
2012 	skb->destructor = sock_ofree;
2013 	return skb;
2014 }
2015 
2016 /*
2017  * Allocate a memory block from the socket's option memory buffer.
2018  */
2019 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
2020 {
2021 	if ((unsigned int)size <= sysctl_optmem_max &&
2022 	    atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
2023 		void *mem;
2024 		/* First do the add, to avoid the race if kmalloc
2025 		 * might sleep.
2026 		 */
2027 		atomic_add(size, &sk->sk_omem_alloc);
2028 		mem = kmalloc(size, priority);
2029 		if (mem)
2030 			return mem;
2031 		atomic_sub(size, &sk->sk_omem_alloc);
2032 	}
2033 	return NULL;
2034 }
2035 EXPORT_SYMBOL(sock_kmalloc);
2036 
2037 /* Free an option memory block. Note: we actually want the inline
2038  * here as this allows gcc to detect the nullify and fold away the
2039  * condition entirely.
2040  */
2041 static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
2042 				  const bool nullify)
2043 {
2044 	if (WARN_ON_ONCE(!mem))
2045 		return;
2046 	if (nullify)
2047 		kzfree(mem);
2048 	else
2049 		kfree(mem);
2050 	atomic_sub(size, &sk->sk_omem_alloc);
2051 }
2052 
2053 void sock_kfree_s(struct sock *sk, void *mem, int size)
2054 {
2055 	__sock_kfree_s(sk, mem, size, false);
2056 }
2057 EXPORT_SYMBOL(sock_kfree_s);
2058 
2059 void sock_kzfree_s(struct sock *sk, void *mem, int size)
2060 {
2061 	__sock_kfree_s(sk, mem, size, true);
2062 }
2063 EXPORT_SYMBOL(sock_kzfree_s);
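/* A minimal usage sketch (assumed caller): option memory is charged against
 * sk_omem_alloc and bounded by sysctl_optmem_max, so every sock_kmalloc()
 * must be paired with sock_kfree_s() (or sock_kzfree_s() for key material)
 * using the same size:
 *
 *	void *buf = sock_kmalloc(sk, optlen, GFP_KERNEL);
 *
 *	if (!buf)
 *		return -ENOBUFS;
 *	...copy in and use the option data...
 *	sock_kfree_s(sk, buf, optlen);
 */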
2064 
2065 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
2066    I think these locks should be removed for datagram sockets.
2067  */
2068 static long sock_wait_for_wmem(struct sock *sk, long timeo)
2069 {
2070 	DEFINE_WAIT(wait);
2071 
2072 	sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2073 	for (;;) {
2074 		if (!timeo)
2075 			break;
2076 		if (signal_pending(current))
2077 			break;
2078 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2079 		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
2080 		if (refcount_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
2081 			break;
2082 		if (sk->sk_shutdown & SEND_SHUTDOWN)
2083 			break;
2084 		if (sk->sk_err)
2085 			break;
2086 		timeo = schedule_timeout(timeo);
2087 	}
2088 	finish_wait(sk_sleep(sk), &wait);
2089 	return timeo;
2090 }
2091 
2092 
2093 /*
2094  *	Generic send/receive buffer handlers
2095  */
2096 
2097 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
2098 				     unsigned long data_len, int noblock,
2099 				     int *errcode, int max_page_order)
2100 {
2101 	struct sk_buff *skb;
2102 	long timeo;
2103 	int err;
2104 
2105 	timeo = sock_sndtimeo(sk, noblock);
2106 	for (;;) {
2107 		err = sock_error(sk);
2108 		if (err != 0)
2109 			goto failure;
2110 
2111 		err = -EPIPE;
2112 		if (sk->sk_shutdown & SEND_SHUTDOWN)
2113 			goto failure;
2114 
2115 		if (sk_wmem_alloc_get(sk) < sk->sk_sndbuf)
2116 			break;
2117 
2118 		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2119 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2120 		err = -EAGAIN;
2121 		if (!timeo)
2122 			goto failure;
2123 		if (signal_pending(current))
2124 			goto interrupted;
2125 		timeo = sock_wait_for_wmem(sk, timeo);
2126 	}
2127 	skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
2128 				   errcode, sk->sk_allocation);
2129 	if (skb)
2130 		skb_set_owner_w(skb, sk);
2131 	return skb;
2132 
2133 interrupted:
2134 	err = sock_intr_errno(timeo);
2135 failure:
2136 	*errcode = err;
2137 	return NULL;
2138 }
2139 EXPORT_SYMBOL(sock_alloc_send_pskb);
2140 
2141 struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
2142 				    int noblock, int *errcode)
2143 {
2144 	return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0);
2145 }
2146 EXPORT_SYMBOL(sock_alloc_send_skb);
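/* A minimal sendmsg-side sketch (assumed caller): datagram protocols
 * typically allocate their transmit skb here so that sndbuf limits,
 * blocking, signals and shutdown/error checks are all handled in one place:
 *
 *	skb = sock_alloc_send_skb(sk, hlen + len,
 *				  msg->msg_flags & MSG_DONTWAIT, &err);
 *	if (!skb)
 *		return err;
 *	skb_reserve(skb, hlen);
 */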
2147 
2148 int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg,
2149 		     struct sockcm_cookie *sockc)
2150 {
2151 	u32 tsflags;
2152 
2153 	switch (cmsg->cmsg_type) {
2154 	case SO_MARK:
2155 		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2156 			return -EPERM;
2157 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2158 			return -EINVAL;
2159 		sockc->mark = *(u32 *)CMSG_DATA(cmsg);
2160 		break;
2161 	case SO_TIMESTAMPING:
2162 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2163 			return -EINVAL;
2164 
2165 		tsflags = *(u32 *)CMSG_DATA(cmsg);
2166 		if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
2167 			return -EINVAL;
2168 
2169 		sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
2170 		sockc->tsflags |= tsflags;
2171 		break;
2172 	case SCM_TXTIME:
2173 		if (!sock_flag(sk, SOCK_TXTIME))
2174 			return -EINVAL;
2175 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64)))
2176 			return -EINVAL;
2177 		sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg));
2178 		break;
2179 	/* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
2180 	case SCM_RIGHTS:
2181 	case SCM_CREDENTIALS:
2182 		break;
2183 	default:
2184 		return -EINVAL;
2185 	}
2186 	return 0;
2187 }
2188 EXPORT_SYMBOL(__sock_cmsg_send);
2189 
2190 int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
2191 		   struct sockcm_cookie *sockc)
2192 {
2193 	struct cmsghdr *cmsg;
2194 	int ret;
2195 
2196 	for_each_cmsghdr(cmsg, msg) {
2197 		if (!CMSG_OK(msg, cmsg))
2198 			return -EINVAL;
2199 		if (cmsg->cmsg_level != SOL_SOCKET)
2200 			continue;
2201 		ret = __sock_cmsg_send(sk, msg, cmsg, sockc);
2202 		if (ret)
2203 			return ret;
2204 	}
2205 	return 0;
2206 }
2207 EXPORT_SYMBOL(sock_cmsg_send);
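/* A minimal caller sketch (mirrors what datagram sendmsg implementations of
 * this era do; the initializer shown is an assumption, not mandated here):
 *
 *	struct sockcm_cookie sockc = { .tsflags = sk->sk_tsflags };
 *
 *	if (msg->msg_controllen) {
 *		err = sock_cmsg_send(sk, msg, &sockc);
 *		if (unlikely(err))
 *			return err;
 *	}
 *	...then honour sockc.mark, sockc.tsflags and sockc.transmit_time...
 */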
2208 
2209 static void sk_enter_memory_pressure(struct sock *sk)
2210 {
2211 	if (!sk->sk_prot->enter_memory_pressure)
2212 		return;
2213 
2214 	sk->sk_prot->enter_memory_pressure(sk);
2215 }
2216 
2217 static void sk_leave_memory_pressure(struct sock *sk)
2218 {
2219 	if (sk->sk_prot->leave_memory_pressure) {
2220 		sk->sk_prot->leave_memory_pressure(sk);
2221 	} else {
2222 		unsigned long *memory_pressure = sk->sk_prot->memory_pressure;
2223 
2224 		if (memory_pressure && *memory_pressure)
2225 			*memory_pressure = 0;
2226 	}
2227 }
2228 
2229 /* On 32bit arches, an skb frag is limited to 2^15 */
2230 #define SKB_FRAG_PAGE_ORDER	get_order(32768)
2231 
2232 /**
2233  * skb_page_frag_refill - check that a page_frag contains enough room
2234  * @sz: minimum size of the fragment we want to get
2235  * @pfrag: pointer to page_frag
2236  * @gfp: priority for memory allocation
2237  *
2238  * Note: While this allocator tries to use high order pages, there is
2239  * no guarantee that allocations succeed. Therefore, @sz MUST be
2240  * less than or equal to PAGE_SIZE.
2241  */
2242 bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
2243 {
2244 	if (pfrag->page) {
2245 		if (page_ref_count(pfrag->page) == 1) {
2246 			pfrag->offset = 0;
2247 			return true;
2248 		}
2249 		if (pfrag->offset + sz <= pfrag->size)
2250 			return true;
2251 		put_page(pfrag->page);
2252 	}
2253 
2254 	pfrag->offset = 0;
2255 	if (SKB_FRAG_PAGE_ORDER) {
2256 		/* Avoid direct reclaim but allow kswapd to wake */
2257 		pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
2258 					  __GFP_COMP | __GFP_NOWARN |
2259 					  __GFP_NORETRY,
2260 					  SKB_FRAG_PAGE_ORDER);
2261 		if (likely(pfrag->page)) {
2262 			pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
2263 			return true;
2264 		}
2265 	}
2266 	pfrag->page = alloc_page(gfp);
2267 	if (likely(pfrag->page)) {
2268 		pfrag->size = PAGE_SIZE;
2269 		return true;
2270 	}
2271 	return false;
2272 }
2273 EXPORT_SYMBOL(skb_page_frag_refill);
2274 
2275 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
2276 {
2277 	if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
2278 		return true;
2279 
2280 	sk_enter_memory_pressure(sk);
2281 	sk_stream_moderate_sndbuf(sk);
2282 	return false;
2283 }
2284 EXPORT_SYMBOL(sk_page_frag_refill);
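/* A minimal usage sketch (assumed caller): senders that copy payload into
 * page fragments refill the per-socket (or per-task) page_frag first and
 * then consume the range [pfrag->offset, pfrag->offset + copy):
 *
 *	struct page_frag *pfrag = sk_page_frag(sk);
 *
 *	if (!sk_page_frag_refill(sk, pfrag))
 *		goto wait_for_memory;
 *	copy = min_t(int, copy, pfrag->size - pfrag->offset);
 *	...copy data into pfrag->page at pfrag->offset, take a page ref...
 *	pfrag->offset += copy;
 */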
2285 
2286 int sk_alloc_sg(struct sock *sk, int len, struct scatterlist *sg,
2287 		int sg_start, int *sg_curr_index, unsigned int *sg_curr_size,
2288 		int first_coalesce)
2289 {
2290 	int sg_curr = *sg_curr_index, use = 0, rc = 0;
2291 	unsigned int size = *sg_curr_size;
2292 	struct page_frag *pfrag;
2293 	struct scatterlist *sge;
2294 
2295 	len -= size;
2296 	pfrag = sk_page_frag(sk);
2297 
2298 	while (len > 0) {
2299 		unsigned int orig_offset;
2300 
2301 		if (!sk_page_frag_refill(sk, pfrag)) {
2302 			rc = -ENOMEM;
2303 			goto out;
2304 		}
2305 
2306 		use = min_t(int, len, pfrag->size - pfrag->offset);
2307 
2308 		if (!sk_wmem_schedule(sk, use)) {
2309 			rc = -ENOMEM;
2310 			goto out;
2311 		}
2312 
2313 		sk_mem_charge(sk, use);
2314 		size += use;
2315 		orig_offset = pfrag->offset;
2316 		pfrag->offset += use;
2317 
2318 		sge = sg + sg_curr - 1;
2319 		if (sg_curr > first_coalesce && sg_page(sg) == pfrag->page &&
2320 		    sg->offset + sg->length == orig_offset) {
2321 			sg->length += use;
2322 		} else {
2323 			sge = sg + sg_curr;
2324 			sg_unmark_end(sge);
2325 			sg_set_page(sge, pfrag->page, use, orig_offset);
2326 			get_page(pfrag->page);
2327 			sg_curr++;
2328 
2329 			if (sg_curr == MAX_SKB_FRAGS)
2330 				sg_curr = 0;
2331 
2332 			if (sg_curr == sg_start) {
2333 				rc = -ENOSPC;
2334 				break;
2335 			}
2336 		}
2337 
2338 		len -= use;
2339 	}
2340 out:
2341 	*sg_curr_size = size;
2342 	*sg_curr_index = sg_curr;
2343 	return rc;
2344 }
2345 EXPORT_SYMBOL(sk_alloc_sg);
2346 
2347 static void __lock_sock(struct sock *sk)
2348 	__releases(&sk->sk_lock.slock)
2349 	__acquires(&sk->sk_lock.slock)
2350 {
2351 	DEFINE_WAIT(wait);
2352 
2353 	for (;;) {
2354 		prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
2355 					TASK_UNINTERRUPTIBLE);
2356 		spin_unlock_bh(&sk->sk_lock.slock);
2357 		schedule();
2358 		spin_lock_bh(&sk->sk_lock.slock);
2359 		if (!sock_owned_by_user(sk))
2360 			break;
2361 	}
2362 	finish_wait(&sk->sk_lock.wq, &wait);
2363 }
2364 
2365 static void __release_sock(struct sock *sk)
2366 	__releases(&sk->sk_lock.slock)
2367 	__acquires(&sk->sk_lock.slock)
2368 {
2369 	struct sk_buff *skb, *next;
2370 
2371 	while ((skb = sk->sk_backlog.head) != NULL) {
2372 		sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
2373 
2374 		spin_unlock_bh(&sk->sk_lock.slock);
2375 
2376 		do {
2377 			next = skb->next;
2378 			prefetch(next);
2379 			WARN_ON_ONCE(skb_dst_is_noref(skb));
2380 			skb->next = NULL;
2381 			sk_backlog_rcv(sk, skb);
2382 
2383 			cond_resched();
2384 
2385 			skb = next;
2386 		} while (skb != NULL);
2387 
2388 		spin_lock_bh(&sk->sk_lock.slock);
2389 	}
2390 
2391 	/*
2392 	 * Doing the zeroing here guarantees we cannot loop forever
2393 	 * while a wild producer attempts to flood us.
2394 	 */
2395 	sk->sk_backlog.len = 0;
2396 }
2397 
2398 void __sk_flush_backlog(struct sock *sk)
2399 {
2400 	spin_lock_bh(&sk->sk_lock.slock);
2401 	__release_sock(sk);
2402 	spin_unlock_bh(&sk->sk_lock.slock);
2403 }
2404 
2405 /**
2406  * sk_wait_data - wait for data to arrive at sk_receive_queue
2407  * @sk:    sock to wait on
2408  * @timeo: for how long
2409  * @skb:   last skb seen on sk_receive_queue
2410  *
2411  * Socket state, including sk->sk_err, is now changed only under the lock,
2412  * hence we may omit checks after joining the wait queue.
2413  * We check the receive queue before schedule() only as an optimization;
2414  * it is very likely that release_sock() added new data.
2415  */
2416 int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
2417 {
2418 	DEFINE_WAIT_FUNC(wait, woken_wake_function);
2419 	int rc;
2420 
2421 	add_wait_queue(sk_sleep(sk), &wait);
2422 	sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2423 	rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
2424 	sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2425 	remove_wait_queue(sk_sleep(sk), &wait);
2426 	return rc;
2427 }
2428 EXPORT_SYMBOL(sk_wait_data);
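/* A minimal receive-side sketch (assumed caller): recvmsg implementations
 * loop under lock_sock(), sleeping here while the receive queue is empty:
 *
 *	timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
 *	while (!(skb = skb_peek(&sk->sk_receive_queue))) {
 *		if (sk->sk_err || !timeo || signal_pending(current))
 *			break;
 *		sk_wait_data(sk, &timeo, NULL);
 *	}
 */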
2429 
2430 /**
2431  *	__sk_mem_raise_allocated - increase memory_allocated
2432  *	@sk: socket
2433  *	@size: memory size to allocate
2434  *	@amt: pages to allocate
2435  *	@kind: allocation type
2436  *
2437  *	Similar to __sk_mem_schedule(), but does not update sk_forward_alloc
2438  */
2439 int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
2440 {
2441 	struct proto *prot = sk->sk_prot;
2442 	long allocated = sk_memory_allocated_add(sk, amt);
2443 	bool charged = true;
2444 
2445 	if (mem_cgroup_sockets_enabled && sk->sk_memcg &&
2446 	    !(charged = mem_cgroup_charge_skmem(sk->sk_memcg, amt)))
2447 		goto suppress_allocation;
2448 
2449 	/* Under limit. */
2450 	if (allocated <= sk_prot_mem_limits(sk, 0)) {
2451 		sk_leave_memory_pressure(sk);
2452 		return 1;
2453 	}
2454 
2455 	/* Under pressure. */
2456 	if (allocated > sk_prot_mem_limits(sk, 1))
2457 		sk_enter_memory_pressure(sk);
2458 
2459 	/* Over hard limit. */
2460 	if (allocated > sk_prot_mem_limits(sk, 2))
2461 		goto suppress_allocation;
2462 
2463 	/* guarantee minimum buffer size under pressure */
2464 	if (kind == SK_MEM_RECV) {
2465 		if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot))
2466 			return 1;
2467 
2468 	} else { /* SK_MEM_SEND */
2469 		int wmem0 = sk_get_wmem0(sk, prot);
2470 
2471 		if (sk->sk_type == SOCK_STREAM) {
2472 			if (sk->sk_wmem_queued < wmem0)
2473 				return 1;
2474 		} else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) {
2475 				return 1;
2476 		}
2477 	}
2478 
2479 	if (sk_has_memory_pressure(sk)) {
2480 		int alloc;
2481 
2482 		if (!sk_under_memory_pressure(sk))
2483 			return 1;
2484 		alloc = sk_sockets_allocated_read_positive(sk);
2485 		if (sk_prot_mem_limits(sk, 2) > alloc *
2486 		    sk_mem_pages(sk->sk_wmem_queued +
2487 				 atomic_read(&sk->sk_rmem_alloc) +
2488 				 sk->sk_forward_alloc))
2489 			return 1;
2490 	}
2491 
2492 suppress_allocation:
2493 
2494 	if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
2495 		sk_stream_moderate_sndbuf(sk);
2496 
2497 		/* Fail only if socket is _under_ its sndbuf.
2498 		 * In this case we cannot block, so we have to fail.
2499 		 */
2500 		if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
2501 			return 1;
2502 	}
2503 
2504 	if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged))
2505 		trace_sock_exceed_buf_limit(sk, prot, allocated, kind);
2506 
2507 	sk_memory_allocated_sub(sk, amt);
2508 
2509 	if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2510 		mem_cgroup_uncharge_skmem(sk->sk_memcg, amt);
2511 
2512 	return 0;
2513 }
2514 EXPORT_SYMBOL(__sk_mem_raise_allocated);
2515 
2516 /**
2517  *	__sk_mem_schedule - increase sk_forward_alloc and memory_allocated
2518  *	@sk: socket
2519  *	@size: memory size to allocate
2520  *	@kind: allocation type
2521  *
2522  *	If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
2523  *	rmem allocation. This function assumes that protocols which have
2524  *	memory_pressure use sk_wmem_queued as write buffer accounting.
2525  */
2526 int __sk_mem_schedule(struct sock *sk, int size, int kind)
2527 {
2528 	int ret, amt = sk_mem_pages(size);
2529 
2530 	sk->sk_forward_alloc += amt << SK_MEM_QUANTUM_SHIFT;
2531 	ret = __sk_mem_raise_allocated(sk, size, amt, kind);
2532 	if (!ret)
2533 		sk->sk_forward_alloc -= amt << SK_MEM_QUANTUM_SHIFT;
2534 	return ret;
2535 }
2536 EXPORT_SYMBOL(__sk_mem_schedule);
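/* A short worked example (assumptions: SK_MEM_QUANTUM is PAGE_SIZE and pages
 * are 4 KiB): charging a 3000 byte skb makes sk_mem_pages() return 1, so
 * sk_forward_alloc grows by 4096 and the 3000 bytes are then consumed from
 * it.  Protocols normally use the wrappers rather than calling this directly:
 *
 *	if (!sk_wmem_schedule(sk, skb->truesize))
 *		goto drop;
 *	sk_mem_charge(sk, skb->truesize);
 */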
2537 
2538 /**
2539  *	__sk_mem_reduce_allocated - reclaim memory_allocated
2540  *	@sk: socket
2541  *	@amount: number of quanta
2542  *
2543  *	Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
2544  */
2545 void __sk_mem_reduce_allocated(struct sock *sk, int amount)
2546 {
2547 	sk_memory_allocated_sub(sk, amount);
2548 
2549 	if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2550 		mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
2551 
2552 	if (sk_under_memory_pressure(sk) &&
2553 	    (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
2554 		sk_leave_memory_pressure(sk);
2555 }
2556 EXPORT_SYMBOL(__sk_mem_reduce_allocated);
2557 
2558 /**
2559  *	__sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
2560  *	@sk: socket
2561  *	@amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple)
2562  */
2563 void __sk_mem_reclaim(struct sock *sk, int amount)
2564 {
2565 	amount >>= SK_MEM_QUANTUM_SHIFT;
2566 	sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
2567 	__sk_mem_reduce_allocated(sk, amount);
2568 }
2569 EXPORT_SYMBOL(__sk_mem_reclaim);
2570 
2571 int sk_set_peek_off(struct sock *sk, int val)
2572 {
2573 	sk->sk_peek_off = val;
2574 	return 0;
2575 }
2576 EXPORT_SYMBOL_GPL(sk_set_peek_off);
2577 
2578 /*
2579  * Set of default routines for initialising struct proto_ops when
2580  * the protocol does not support a particular function. In certain
2581  * cases where it makes no sense for a protocol to have a "do nothing"
2582  * function, some default processing is provided.
2583  */
2584 
2585 int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
2586 {
2587 	return -EOPNOTSUPP;
2588 }
2589 EXPORT_SYMBOL(sock_no_bind);
2590 
2591 int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
2592 		    int len, int flags)
2593 {
2594 	return -EOPNOTSUPP;
2595 }
2596 EXPORT_SYMBOL(sock_no_connect);
2597 
2598 int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
2599 {
2600 	return -EOPNOTSUPP;
2601 }
2602 EXPORT_SYMBOL(sock_no_socketpair);
2603 
2604 int sock_no_accept(struct socket *sock, struct socket *newsock, int flags,
2605 		   bool kern)
2606 {
2607 	return -EOPNOTSUPP;
2608 }
2609 EXPORT_SYMBOL(sock_no_accept);
2610 
2611 int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
2612 		    int peer)
2613 {
2614 	return -EOPNOTSUPP;
2615 }
2616 EXPORT_SYMBOL(sock_no_getname);
2617 
2618 int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2619 {
2620 	return -EOPNOTSUPP;
2621 }
2622 EXPORT_SYMBOL(sock_no_ioctl);
2623 
2624 int sock_no_listen(struct socket *sock, int backlog)
2625 {
2626 	return -EOPNOTSUPP;
2627 }
2628 EXPORT_SYMBOL(sock_no_listen);
2629 
2630 int sock_no_shutdown(struct socket *sock, int how)
2631 {
2632 	return -EOPNOTSUPP;
2633 }
2634 EXPORT_SYMBOL(sock_no_shutdown);
2635 
2636 int sock_no_setsockopt(struct socket *sock, int level, int optname,
2637 		    char __user *optval, unsigned int optlen)
2638 {
2639 	return -EOPNOTSUPP;
2640 }
2641 EXPORT_SYMBOL(sock_no_setsockopt);
2642 
2643 int sock_no_getsockopt(struct socket *sock, int level, int optname,
2644 		    char __user *optval, int __user *optlen)
2645 {
2646 	return -EOPNOTSUPP;
2647 }
2648 EXPORT_SYMBOL(sock_no_getsockopt);
2649 
2650 int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
2651 {
2652 	return -EOPNOTSUPP;
2653 }
2654 EXPORT_SYMBOL(sock_no_sendmsg);
2655 
2656 int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len)
2657 {
2658 	return -EOPNOTSUPP;
2659 }
2660 EXPORT_SYMBOL(sock_no_sendmsg_locked);
2661 
2662 int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
2663 		    int flags)
2664 {
2665 	return -EOPNOTSUPP;
2666 }
2667 EXPORT_SYMBOL(sock_no_recvmsg);
2668 
2669 int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
2670 {
2671 	/* Mirror missing mmap method error code */
2672 	return -ENODEV;
2673 }
2674 EXPORT_SYMBOL(sock_no_mmap);
2675 
2676 ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
2677 {
2678 	ssize_t res;
2679 	struct msghdr msg = {.msg_flags = flags};
2680 	struct kvec iov;
2681 	char *kaddr = kmap(page);
2682 	iov.iov_base = kaddr + offset;
2683 	iov.iov_len = size;
2684 	res = kernel_sendmsg(sock, &msg, &iov, 1, size);
2685 	kunmap(page);
2686 	return res;
2687 }
2688 EXPORT_SYMBOL(sock_no_sendpage);
2689 
2690 ssize_t sock_no_sendpage_locked(struct sock *sk, struct page *page,
2691 				int offset, size_t size, int flags)
2692 {
2693 	ssize_t res;
2694 	struct msghdr msg = {.msg_flags = flags};
2695 	struct kvec iov;
2696 	char *kaddr = kmap(page);
2697 
2698 	iov.iov_base = kaddr + offset;
2699 	iov.iov_len = size;
2700 	res = kernel_sendmsg_locked(sk, &msg, &iov, 1, size);
2701 	kunmap(page);
2702 	return res;
2703 }
2704 EXPORT_SYMBOL(sock_no_sendpage_locked);
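/* A minimal sketch (assumed, modeled on how existing address families use
 * these stubs): operations a protocol does not support simply point at the
 * matching sock_no_*() helper in its proto_ops; PF_EXAMPLE is a placeholder:
 *
 *	static const struct proto_ops example_ops = {
 *		.family		= PF_EXAMPLE,
 *		.owner		= THIS_MODULE,
 *		.bind		= sock_no_bind,
 *		.accept		= sock_no_accept,
 *		.mmap		= sock_no_mmap,
 *		.sendpage	= sock_no_sendpage,
 *	};
 */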
2705 
2706 /*
2707  *	Default Socket Callbacks
2708  */
2709 
2710 static void sock_def_wakeup(struct sock *sk)
2711 {
2712 	struct socket_wq *wq;
2713 
2714 	rcu_read_lock();
2715 	wq = rcu_dereference(sk->sk_wq);
2716 	if (skwq_has_sleeper(wq))
2717 		wake_up_interruptible_all(&wq->wait);
2718 	rcu_read_unlock();
2719 }
2720 
2721 static void sock_def_error_report(struct sock *sk)
2722 {
2723 	struct socket_wq *wq;
2724 
2725 	rcu_read_lock();
2726 	wq = rcu_dereference(sk->sk_wq);
2727 	if (skwq_has_sleeper(wq))
2728 		wake_up_interruptible_poll(&wq->wait, EPOLLERR);
2729 	sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
2730 	rcu_read_unlock();
2731 }
2732 
2733 static void sock_def_readable(struct sock *sk)
2734 {
2735 	struct socket_wq *wq;
2736 
2737 	rcu_read_lock();
2738 	wq = rcu_dereference(sk->sk_wq);
2739 	if (skwq_has_sleeper(wq))
2740 		wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
2741 						EPOLLRDNORM | EPOLLRDBAND);
2742 	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
2743 	rcu_read_unlock();
2744 }
2745 
2746 static void sock_def_write_space(struct sock *sk)
2747 {
2748 	struct socket_wq *wq;
2749 
2750 	rcu_read_lock();
2751 
2752 	/* Do not wake up a writer until he can make "significant"
2753 	 * progress.  --DaveM
2754 	 */
2755 	if ((refcount_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
2756 		wq = rcu_dereference(sk->sk_wq);
2757 		if (skwq_has_sleeper(wq))
2758 			wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
2759 						EPOLLWRNORM | EPOLLWRBAND);
2760 
2761 		/* Should agree with poll, otherwise some programs break */
2762 		if (sock_writeable(sk))
2763 			sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
2764 	}
2765 
2766 	rcu_read_unlock();
2767 }
2768 
2769 static void sock_def_destruct(struct sock *sk)
2770 {
2771 }
2772 
2773 void sk_send_sigurg(struct sock *sk)
2774 {
2775 	if (sk->sk_socket && sk->sk_socket->file)
2776 		if (send_sigurg(&sk->sk_socket->file->f_owner))
2777 			sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
2778 }
2779 EXPORT_SYMBOL(sk_send_sigurg);
2780 
2781 void sk_reset_timer(struct sock *sk, struct timer_list* timer,
2782 		    unsigned long expires)
2783 {
2784 	if (!mod_timer(timer, expires))
2785 		sock_hold(sk);
2786 }
2787 EXPORT_SYMBOL(sk_reset_timer);
2788 
2789 void sk_stop_timer(struct sock *sk, struct timer_list* timer)
2790 {
2791 	if (del_timer(timer))
2792 		__sock_put(sk);
2793 }
2794 EXPORT_SYMBOL(sk_stop_timer);
2795 
2796 void sock_init_data(struct socket *sock, struct sock *sk)
2797 {
2798 	sk_init_common(sk);
2799 	sk->sk_send_head	=	NULL;
2800 
2801 	timer_setup(&sk->sk_timer, NULL, 0);
2802 
2803 	sk->sk_allocation	=	GFP_KERNEL;
2804 	sk->sk_rcvbuf		=	sysctl_rmem_default;
2805 	sk->sk_sndbuf		=	sysctl_wmem_default;
2806 	sk->sk_state		=	TCP_CLOSE;
2807 	sk_set_socket(sk, sock);
2808 
2809 	sock_set_flag(sk, SOCK_ZAPPED);
2810 
2811 	if (sock) {
2812 		sk->sk_type	=	sock->type;
2813 		sk->sk_wq	=	sock->wq;
2814 		sock->sk	=	sk;
2815 		sk->sk_uid	=	SOCK_INODE(sock)->i_uid;
2816 	} else {
2817 		sk->sk_wq	=	NULL;
2818 		sk->sk_uid	=	make_kuid(sock_net(sk)->user_ns, 0);
2819 	}
2820 
2821 	rwlock_init(&sk->sk_callback_lock);
2822 	if (sk->sk_kern_sock)
2823 		lockdep_set_class_and_name(
2824 			&sk->sk_callback_lock,
2825 			af_kern_callback_keys + sk->sk_family,
2826 			af_family_kern_clock_key_strings[sk->sk_family]);
2827 	else
2828 		lockdep_set_class_and_name(
2829 			&sk->sk_callback_lock,
2830 			af_callback_keys + sk->sk_family,
2831 			af_family_clock_key_strings[sk->sk_family]);
2832 
2833 	sk->sk_state_change	=	sock_def_wakeup;
2834 	sk->sk_data_ready	=	sock_def_readable;
2835 	sk->sk_write_space	=	sock_def_write_space;
2836 	sk->sk_error_report	=	sock_def_error_report;
2837 	sk->sk_destruct		=	sock_def_destruct;
2838 
2839 	sk->sk_frag.page	=	NULL;
2840 	sk->sk_frag.offset	=	0;
2841 	sk->sk_peek_off		=	-1;
2842 
2843 	sk->sk_peer_pid 	=	NULL;
2844 	sk->sk_peer_cred	=	NULL;
2845 	sk->sk_write_pending	=	0;
2846 	sk->sk_rcvlowat		=	1;
2847 	sk->sk_rcvtimeo		=	MAX_SCHEDULE_TIMEOUT;
2848 	sk->sk_sndtimeo		=	MAX_SCHEDULE_TIMEOUT;
2849 
2850 	sk->sk_stamp = SK_DEFAULT_STAMP;
2851 	atomic_set(&sk->sk_zckey, 0);
2852 
2853 #ifdef CONFIG_NET_RX_BUSY_POLL
2854 	sk->sk_napi_id		=	0;
2855 	sk->sk_ll_usec		=	sysctl_net_busy_read;
2856 #endif
2857 
2858 	sk->sk_max_pacing_rate = ~0U;
2859 	sk->sk_pacing_rate = ~0U;
2860 	sk->sk_pacing_shift = 10;
2861 	sk->sk_incoming_cpu = -1;
2862 
2863 	sk_rx_queue_clear(sk);
2864 	/*
2865 	 * Before updating sk_refcnt, we must commit prior changes to memory
2866 	 * (Documentation/RCU/rculist_nulls.txt for details)
2867 	 */
2868 	smp_wmb();
2869 	refcount_set(&sk->sk_refcnt, 1);
2870 	atomic_set(&sk->sk_drops, 0);
2871 }
2872 EXPORT_SYMBOL(sock_init_data);
2873 
2874 void lock_sock_nested(struct sock *sk, int subclass)
2875 {
2876 	might_sleep();
2877 	spin_lock_bh(&sk->sk_lock.slock);
2878 	if (sk->sk_lock.owned)
2879 		__lock_sock(sk);
2880 	sk->sk_lock.owned = 1;
2881 	spin_unlock(&sk->sk_lock.slock);
2882 	/*
2883 	 * The sk_lock has mutex_lock() semantics here:
2884 	 */
2885 	mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
2886 	local_bh_enable();
2887 }
2888 EXPORT_SYMBOL(lock_sock_nested);
2889 
2890 void release_sock(struct sock *sk)
2891 {
2892 	spin_lock_bh(&sk->sk_lock.slock);
2893 	if (sk->sk_backlog.tail)
2894 		__release_sock(sk);
2895 
2896 	/* Warning: release_cb() might need to release sk ownership,
2897 	 * i.e. call sock_release_ownership(sk) before us.
2898 	 */
2899 	if (sk->sk_prot->release_cb)
2900 		sk->sk_prot->release_cb(sk);
2901 
2902 	sock_release_ownership(sk);
2903 	if (waitqueue_active(&sk->sk_lock.wq))
2904 		wake_up(&sk->sk_lock.wq);
2905 	spin_unlock_bh(&sk->sk_lock.slock);
2906 }
2907 EXPORT_SYMBOL(release_sock);
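/* A minimal usage sketch (assumed caller): process context takes ownership
 * with lock_sock(); packets delivered from BH context in the meantime land
 * on the backlog and are replayed by __release_sock() when the lock is
 * released:
 *
 *	lock_sock(sk);
 *	...update socket state, walk queues, call sk->sk_prot methods...
 *	release_sock(sk);
 */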
2908 
2909 /**
2910  * lock_sock_fast - fast version of lock_sock
2911  * @sk: socket
2912  *
2913  * This version should be used for very small sections, where the process
2914  * won't block.  Returns false if the fast path is taken:
2915  *
2916  *   sk_lock.slock locked, owned = 0, BH disabled
2917  *
2918  * Returns true if the slow path is taken:
2919  *
2920  *   sk_lock.slock unlocked, owned = 1, BH enabled
2921  */
2922 bool lock_sock_fast(struct sock *sk)
2923 {
2924 	might_sleep();
2925 	spin_lock_bh(&sk->sk_lock.slock);
2926 
2927 	if (!sk->sk_lock.owned)
2928 		/*
2929 		 * Note: fast path, we return with sk_lock.slock held and BH disabled
2930 		 */
2931 		return false;
2932 
2933 	__lock_sock(sk);
2934 	sk->sk_lock.owned = 1;
2935 	spin_unlock(&sk->sk_lock.slock);
2936 	/*
2937 	 * The sk_lock has mutex_lock() semantics here:
2938 	 */
2939 	mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
2940 	local_bh_enable();
2941 	return true;
2942 }
2943 EXPORT_SYMBOL(lock_sock_fast);
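/* A minimal usage sketch (assumed caller): the return value must be passed
 * to unlock_sock_fast(), which either just drops the spinlock (fast path)
 * or performs a full release_sock() (slow path):
 *
 *	bool slow = lock_sock_fast(sk);
 *
 *	...short, non-blocking critical section...
 *	unlock_sock_fast(sk, slow);
 */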
2944 
2945 int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
2946 {
2947 	struct timeval tv;
2948 	if (!sock_flag(sk, SOCK_TIMESTAMP))
2949 		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2950 	tv = ktime_to_timeval(sk->sk_stamp);
2951 	if (tv.tv_sec == -1)
2952 		return -ENOENT;
2953 	if (tv.tv_sec == 0) {
2954 		sk->sk_stamp = ktime_get_real();
2955 		tv = ktime_to_timeval(sk->sk_stamp);
2956 	}
2957 	return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
2958 }
2959 EXPORT_SYMBOL(sock_get_timestamp);
2960 
2961 int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
2962 {
2963 	struct timespec ts;
2964 	if (!sock_flag(sk, SOCK_TIMESTAMP))
2965 		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2966 	ts = ktime_to_timespec(sk->sk_stamp);
2967 	if (ts.tv_sec == -1)
2968 		return -ENOENT;
2969 	if (ts.tv_sec == 0) {
2970 		sk->sk_stamp = ktime_get_real();
2971 		ts = ktime_to_timespec(sk->sk_stamp);
2972 	}
2973 	return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
2974 }
2975 EXPORT_SYMBOL(sock_get_timestampns);
2976 
2977 void sock_enable_timestamp(struct sock *sk, int flag)
2978 {
2979 	if (!sock_flag(sk, flag)) {
2980 		unsigned long previous_flags = sk->sk_flags;
2981 
2982 		sock_set_flag(sk, flag);
2983 		/*
2984 		 * we just set one of the two flags which require net
2985 		 * time stamping, but time stamping might have been on
2986 		 * already because of the other one
2987 		 */
2988 		if (sock_needs_netstamp(sk) &&
2989 		    !(previous_flags & SK_FLAGS_TIMESTAMP))
2990 			net_enable_timestamp();
2991 	}
2992 }
2993 
2994 int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
2995 		       int level, int type)
2996 {
2997 	struct sock_exterr_skb *serr;
2998 	struct sk_buff *skb;
2999 	int copied, err;
3000 
3001 	err = -EAGAIN;
3002 	skb = sock_dequeue_err_skb(sk);
3003 	if (skb == NULL)
3004 		goto out;
3005 
3006 	copied = skb->len;
3007 	if (copied > len) {
3008 		msg->msg_flags |= MSG_TRUNC;
3009 		copied = len;
3010 	}
3011 	err = skb_copy_datagram_msg(skb, 0, msg, copied);
3012 	if (err)
3013 		goto out_free_skb;
3014 
3015 	sock_recv_timestamp(msg, sk, skb);
3016 
3017 	serr = SKB_EXT_ERR(skb);
3018 	put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
3019 
3020 	msg->msg_flags |= MSG_ERRQUEUE;
3021 	err = copied;
3022 
3023 out_free_skb:
3024 	kfree_skb(skb);
3025 out:
3026 	return err;
3027 }
3028 EXPORT_SYMBOL(sock_recv_errqueue);
3029 
3030 /*
3031  *	Get a socket option on a socket.
3032  *
3033  *	FIX: POSIX 1003.1g is very ambiguous here. It states that
3034  *	asynchronous errors should be reported by getsockopt. We assume
3035  *	this means if you specify SO_ERROR (otherwise what's the point of it).
3036  */
3037 int sock_common_getsockopt(struct socket *sock, int level, int optname,
3038 			   char __user *optval, int __user *optlen)
3039 {
3040 	struct sock *sk = sock->sk;
3041 
3042 	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
3043 }
3044 EXPORT_SYMBOL(sock_common_getsockopt);
3045 
3046 #ifdef CONFIG_COMPAT
3047 int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
3048 				  char __user *optval, int __user *optlen)
3049 {
3050 	struct sock *sk = sock->sk;
3051 
3052 	if (sk->sk_prot->compat_getsockopt != NULL)
3053 		return sk->sk_prot->compat_getsockopt(sk, level, optname,
3054 						      optval, optlen);
3055 	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
3056 }
3057 EXPORT_SYMBOL(compat_sock_common_getsockopt);
3058 #endif
3059 
3060 int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
3061 			int flags)
3062 {
3063 	struct sock *sk = sock->sk;
3064 	int addr_len = 0;
3065 	int err;
3066 
3067 	err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
3068 				   flags & ~MSG_DONTWAIT, &addr_len);
3069 	if (err >= 0)
3070 		msg->msg_namelen = addr_len;
3071 	return err;
3072 }
3073 EXPORT_SYMBOL(sock_common_recvmsg);
3074 
3075 /*
3076  *	Set socket options on an inet socket.
3077  */
3078 int sock_common_setsockopt(struct socket *sock, int level, int optname,
3079 			   char __user *optval, unsigned int optlen)
3080 {
3081 	struct sock *sk = sock->sk;
3082 
3083 	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
3084 }
3085 EXPORT_SYMBOL(sock_common_setsockopt);
3086 
3087 #ifdef CONFIG_COMPAT
3088 int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
3089 				  char __user *optval, unsigned int optlen)
3090 {
3091 	struct sock *sk = sock->sk;
3092 
3093 	if (sk->sk_prot->compat_setsockopt != NULL)
3094 		return sk->sk_prot->compat_setsockopt(sk, level, optname,
3095 						      optval, optlen);
3096 	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
3097 }
3098 EXPORT_SYMBOL(compat_sock_common_setsockopt);
3099 #endif
3100 
3101 void sk_common_release(struct sock *sk)
3102 {
3103 	if (sk->sk_prot->destroy)
3104 		sk->sk_prot->destroy(sk);
3105 
3106 	/*
3107 	 * Observation: when sk_common_release is called, processes have
3108 	 * no access to the socket, but the network stack still does.
3109 	 * Step one: detach it from networking:
3110 	 *
3111 	 * A. Remove from hash tables.
3112 	 */
3113 
3114 	sk->sk_prot->unhash(sk);
3115 
3116 	/*
3117 	 * At this point the socket cannot receive new packets, but it is possible
3118 	 * that some packets are still in flight because some CPU ran the receiver
3119 	 * and did the hash table lookup before we unhashed the socket. They will
3120 	 * reach the receive queue and be purged by the socket destructor.
3121 	 *
3122 	 * Also, we may still have packets pending on the receive queue and,
3123 	 * probably, our own packets waiting in device queues. sock_destroy will
3124 	 * drain the receive queue, but transmitted packets will delay socket
3125 	 * destruction until the last reference is released.
3126 	 */
3127 
3128 	sock_orphan(sk);
3129 
3130 	xfrm_sk_free_policy(sk);
3131 
3132 	sk_refcnt_debug_release(sk);
3133 
3134 	sock_put(sk);
3135 }
3136 EXPORT_SYMBOL(sk_common_release);
3137 
3138 void sk_get_meminfo(const struct sock *sk, u32 *mem)
3139 {
3140 	memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);
3141 
3142 	mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
3143 	mem[SK_MEMINFO_RCVBUF] = sk->sk_rcvbuf;
3144 	mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
3145 	mem[SK_MEMINFO_SNDBUF] = sk->sk_sndbuf;
3146 	mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc;
3147 	mem[SK_MEMINFO_WMEM_QUEUED] = sk->sk_wmem_queued;
3148 	mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
3149 	mem[SK_MEMINFO_BACKLOG] = sk->sk_backlog.len;
3150 	mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
3151 }
3152 
3153 #ifdef CONFIG_PROC_FS
3154 #define PROTO_INUSE_NR	64	/* should be enough for the first time */
3155 struct prot_inuse {
3156 	int val[PROTO_INUSE_NR];
3157 };
3158 
3159 static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
3160 
3161 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
3162 {
3163 	__this_cpu_add(net->core.prot_inuse->val[prot->inuse_idx], val);
3164 }
3165 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
3166 
3167 int sock_prot_inuse_get(struct net *net, struct proto *prot)
3168 {
3169 	int cpu, idx = prot->inuse_idx;
3170 	int res = 0;
3171 
3172 	for_each_possible_cpu(cpu)
3173 		res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx];
3174 
3175 	return res >= 0 ? res : 0;
3176 }
3177 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
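/* A minimal sketch (assumed caller): protocols update this per-cpu counter
 * from their hash/unhash handlers so that /proc/net/protocols and sock_diag
 * can report per-protocol socket counts:
 *
 *	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);	on hash
 *	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);	on unhash
 */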
3178 
3179 static void sock_inuse_add(struct net *net, int val)
3180 {
3181 	this_cpu_add(*net->core.sock_inuse, val);
3182 }
3183 
3184 int sock_inuse_get(struct net *net)
3185 {
3186 	int cpu, res = 0;
3187 
3188 	for_each_possible_cpu(cpu)
3189 		res += *per_cpu_ptr(net->core.sock_inuse, cpu);
3190 
3191 	return res;
3192 }
3193 
3194 EXPORT_SYMBOL_GPL(sock_inuse_get);
3195 
3196 static int __net_init sock_inuse_init_net(struct net *net)
3197 {
3198 	net->core.prot_inuse = alloc_percpu(struct prot_inuse);
3199 	if (net->core.prot_inuse == NULL)
3200 		return -ENOMEM;
3201 
3202 	net->core.sock_inuse = alloc_percpu(int);
3203 	if (net->core.sock_inuse == NULL)
3204 		goto out;
3205 
3206 	return 0;
3207 
3208 out:
3209 	free_percpu(net->core.prot_inuse);
3210 	return -ENOMEM;
3211 }
3212 
3213 static void __net_exit sock_inuse_exit_net(struct net *net)
3214 {
3215 	free_percpu(net->core.prot_inuse);
3216 	free_percpu(net->core.sock_inuse);
3217 }
3218 
3219 static struct pernet_operations net_inuse_ops = {
3220 	.init = sock_inuse_init_net,
3221 	.exit = sock_inuse_exit_net,
3222 };
3223 
3224 static __init int net_inuse_init(void)
3225 {
3226 	if (register_pernet_subsys(&net_inuse_ops))
3227 		panic("Cannot initialize net inuse counters");
3228 
3229 	return 0;
3230 }
3231 
3232 core_initcall(net_inuse_init);
3233 
3234 static void assign_proto_idx(struct proto *prot)
3235 {
3236 	prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
3237 
3238 	if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
3239 		pr_err("PROTO_INUSE_NR exhausted\n");
3240 		return;
3241 	}
3242 
3243 	set_bit(prot->inuse_idx, proto_inuse_idx);
3244 }
3245 
3246 static void release_proto_idx(struct proto *prot)
3247 {
3248 	if (prot->inuse_idx != PROTO_INUSE_NR - 1)
3249 		clear_bit(prot->inuse_idx, proto_inuse_idx);
3250 }
3251 #else
3252 static inline void assign_proto_idx(struct proto *prot)
3253 {
3254 }
3255 
3256 static inline void release_proto_idx(struct proto *prot)
3257 {
3258 }
3259 
3260 static void sock_inuse_add(struct net *net, int val)
3261 {
3262 }
3263 #endif
3264 
3265 static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
3266 {
3267 	if (!rsk_prot)
3268 		return;
3269 	kfree(rsk_prot->slab_name);
3270 	rsk_prot->slab_name = NULL;
3271 	kmem_cache_destroy(rsk_prot->slab);
3272 	rsk_prot->slab = NULL;
3273 }
3274 
3275 static int req_prot_init(const struct proto *prot)
3276 {
3277 	struct request_sock_ops *rsk_prot = prot->rsk_prot;
3278 
3279 	if (!rsk_prot)
3280 		return 0;
3281 
3282 	rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
3283 					prot->name);
3284 	if (!rsk_prot->slab_name)
3285 		return -ENOMEM;
3286 
3287 	rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
3288 					   rsk_prot->obj_size, 0,
3289 					   SLAB_ACCOUNT | prot->slab_flags,
3290 					   NULL);
3291 
3292 	if (!rsk_prot->slab) {
3293 		pr_crit("%s: Can't create request sock SLAB cache!\n",
3294 			prot->name);
3295 		return -ENOMEM;
3296 	}
3297 	return 0;
3298 }
3299 
3300 int proto_register(struct proto *prot, int alloc_slab)
3301 {
3302 	if (alloc_slab) {
3303 		prot->slab = kmem_cache_create_usercopy(prot->name,
3304 					prot->obj_size, 0,
3305 					SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT |
3306 					prot->slab_flags,
3307 					prot->useroffset, prot->usersize,
3308 					NULL);
3309 
3310 		if (prot->slab == NULL) {
3311 			pr_crit("%s: Can't create sock SLAB cache!\n",
3312 				prot->name);
3313 			goto out;
3314 		}
3315 
3316 		if (req_prot_init(prot))
3317 			goto out_free_request_sock_slab;
3318 
3319 		if (prot->twsk_prot != NULL) {
3320 			prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);
3321 
3322 			if (prot->twsk_prot->twsk_slab_name == NULL)
3323 				goto out_free_request_sock_slab;
3324 
3325 			prot->twsk_prot->twsk_slab =
3326 				kmem_cache_create(prot->twsk_prot->twsk_slab_name,
3327 						  prot->twsk_prot->twsk_obj_size,
3328 						  0,
3329 						  SLAB_ACCOUNT |
3330 						  prot->slab_flags,
3331 						  NULL);
3332 			if (prot->twsk_prot->twsk_slab == NULL)
3333 				goto out_free_timewait_sock_slab_name;
3334 		}
3335 	}
3336 
3337 	mutex_lock(&proto_list_mutex);
3338 	list_add(&prot->node, &proto_list);
3339 	assign_proto_idx(prot);
3340 	mutex_unlock(&proto_list_mutex);
3341 	return 0;
3342 
3343 out_free_timewait_sock_slab_name:
3344 	kfree(prot->twsk_prot->twsk_slab_name);
3345 out_free_request_sock_slab:
3346 	req_prot_cleanup(prot->rsk_prot);
3347 
3348 	kmem_cache_destroy(prot->slab);
3349 	prot->slab = NULL;
3350 out:
3351 	return -ENOBUFS;
3352 }
3353 EXPORT_SYMBOL(proto_register);
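/* A minimal registration sketch (assumed module; only the usual minimum of
 * struct proto fields is shown, and the names are placeholders):
 *
 *	static struct proto example_proto = {
 *		.name		= "EXAMPLE",
 *		.owner		= THIS_MODULE,
 *		.obj_size	= sizeof(struct example_sock),
 *	};
 *
 *	err = proto_register(&example_proto, 1);	1 => also create a slab
 *	...
 *	proto_unregister(&example_proto);		on module unload
 */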
3354 
3355 void proto_unregister(struct proto *prot)
3356 {
3357 	mutex_lock(&proto_list_mutex);
3358 	release_proto_idx(prot);
3359 	list_del(&prot->node);
3360 	mutex_unlock(&proto_list_mutex);
3361 
3362 	kmem_cache_destroy(prot->slab);
3363 	prot->slab = NULL;
3364 
3365 	req_prot_cleanup(prot->rsk_prot);
3366 
3367 	if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
3368 		kmem_cache_destroy(prot->twsk_prot->twsk_slab);
3369 		kfree(prot->twsk_prot->twsk_slab_name);
3370 		prot->twsk_prot->twsk_slab = NULL;
3371 	}
3372 }
3373 EXPORT_SYMBOL(proto_unregister);
3374 
3375 int sock_load_diag_module(int family, int protocol)
3376 {
3377 	if (!protocol) {
3378 		if (!sock_is_registered(family))
3379 			return -ENOENT;
3380 
3381 		return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
3382 				      NETLINK_SOCK_DIAG, family);
3383 	}
3384 
3385 #ifdef CONFIG_INET
3386 	if (family == AF_INET &&
3387 	    !rcu_access_pointer(inet_protos[protocol]))
3388 		return -ENOENT;
3389 #endif
3390 
3391 	return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK,
3392 			      NETLINK_SOCK_DIAG, family, protocol);
3393 }
3394 EXPORT_SYMBOL(sock_load_diag_module);
3395 
3396 #ifdef CONFIG_PROC_FS
3397 static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
3398 	__acquires(proto_list_mutex)
3399 {
3400 	mutex_lock(&proto_list_mutex);
3401 	return seq_list_start_head(&proto_list, *pos);
3402 }
3403 
3404 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3405 {
3406 	return seq_list_next(v, &proto_list, pos);
3407 }
3408 
3409 static void proto_seq_stop(struct seq_file *seq, void *v)
3410 	__releases(proto_list_mutex)
3411 {
3412 	mutex_unlock(&proto_list_mutex);
3413 }
3414 
3415 static char proto_method_implemented(const void *method)
3416 {
3417 	return method == NULL ? 'n' : 'y';
3418 }
3419 static long sock_prot_memory_allocated(struct proto *proto)
3420 {
3421 	return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
3422 }
3423 
3424 static char *sock_prot_memory_pressure(struct proto *proto)
3425 {
3426 	return proto->memory_pressure != NULL ?
3427 	proto_memory_pressure(proto) ? "yes" : "no" : "NI";
3428 }
3429 
3430 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
3431 {
3432 
3433 	seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
3434 			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
3435 		   proto->name,
3436 		   proto->obj_size,
3437 		   sock_prot_inuse_get(seq_file_net(seq), proto),
3438 		   sock_prot_memory_allocated(proto),
3439 		   sock_prot_memory_pressure(proto),
3440 		   proto->max_header,
3441 		   proto->slab == NULL ? "no" : "yes",
3442 		   module_name(proto->owner),
3443 		   proto_method_implemented(proto->close),
3444 		   proto_method_implemented(proto->connect),
3445 		   proto_method_implemented(proto->disconnect),
3446 		   proto_method_implemented(proto->accept),
3447 		   proto_method_implemented(proto->ioctl),
3448 		   proto_method_implemented(proto->init),
3449 		   proto_method_implemented(proto->destroy),
3450 		   proto_method_implemented(proto->shutdown),
3451 		   proto_method_implemented(proto->setsockopt),
3452 		   proto_method_implemented(proto->getsockopt),
3453 		   proto_method_implemented(proto->sendmsg),
3454 		   proto_method_implemented(proto->recvmsg),
3455 		   proto_method_implemented(proto->sendpage),
3456 		   proto_method_implemented(proto->bind),
3457 		   proto_method_implemented(proto->backlog_rcv),
3458 		   proto_method_implemented(proto->hash),
3459 		   proto_method_implemented(proto->unhash),
3460 		   proto_method_implemented(proto->get_port),
3461 		   proto_method_implemented(proto->enter_memory_pressure));
3462 }
3463 
3464 static int proto_seq_show(struct seq_file *seq, void *v)
3465 {
3466 	if (v == &proto_list)
3467 		seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
3468 			   "protocol",
3469 			   "size",
3470 			   "sockets",
3471 			   "memory",
3472 			   "press",
3473 			   "maxhdr",
3474 			   "slab",
3475 			   "module",
3476 			   "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
3477 	else
3478 		proto_seq_printf(seq, list_entry(v, struct proto, node));
3479 	return 0;
3480 }
3481 
3482 static const struct seq_operations proto_seq_ops = {
3483 	.start  = proto_seq_start,
3484 	.next   = proto_seq_next,
3485 	.stop   = proto_seq_stop,
3486 	.show   = proto_seq_show,
3487 };
3488 
3489 static __net_init int proto_init_net(struct net *net)
3490 {
3491 	if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops,
3492 			sizeof(struct seq_net_private)))
3493 		return -ENOMEM;
3494 
3495 	return 0;
3496 }
3497 
3498 static __net_exit void proto_exit_net(struct net *net)
3499 {
3500 	remove_proc_entry("protocols", net->proc_net);
3501 }
3502 
3503 
3504 static __net_initdata struct pernet_operations proto_net_ops = {
3505 	.init = proto_init_net,
3506 	.exit = proto_exit_net,
3507 };
3508 
3509 static int __init proto_init(void)
3510 {
3511 	return register_pernet_subsys(&proto_net_ops);
3512 }
3513 
3514 subsys_initcall(proto_init);
3515 
3516 #endif /* PROC_FS */
3517 
3518 #ifdef CONFIG_NET_RX_BUSY_POLL
3519 bool sk_busy_loop_end(void *p, unsigned long start_time)
3520 {
3521 	struct sock *sk = p;
3522 
3523 	return !skb_queue_empty(&sk->sk_receive_queue) ||
3524 	       sk_busy_loop_timeout(sk, start_time);
3525 }
3526 EXPORT_SYMBOL(sk_busy_loop_end);
3527 #endif /* CONFIG_NET_RX_BUSY_POLL */
3528