xref: /openbmc/linux/net/core/sock.c (revision 9144f784f852f9a125cabe9927b986d909bfa439)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * INET		An implementation of the TCP/IP protocol suite for the LINUX
4  *		operating system.  INET is implemented using the  BSD Socket
5  *		interface as the means of communication with the user level.
6  *
7  *		Generic socket support routines. Memory allocators, socket lock/release
8  *		handler for protocols to use and generic option handler.
9  *
10  * Authors:	Ross Biro
11  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *		Florian La Roche, <flla@stud.uni-sb.de>
13  *		Alan Cox, <A.Cox@swansea.ac.uk>
14  *
15  * Fixes:
16  *		Alan Cox	: 	Numerous verify_area() problems
17  *		Alan Cox	:	Connecting on a connecting socket
18  *					now returns an error for tcp.
19  *		Alan Cox	:	sock->protocol is set correctly.
20  *					and is not sometimes left as 0.
21  *		Alan Cox	:	connect handles icmp errors on a
22  *					connect properly. Unfortunately there
23  *					is a restart syscall nasty there. I
24  *					can't match BSD without hacking the C
25  *					library. Ideas urgently sought!
26  *		Alan Cox	:	Disallow bind() to addresses that are
27  *					not ours - especially broadcast ones!!
28  *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
29  *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
30  *					instead they leave that for the DESTROY timer.
31  *		Alan Cox	:	Clean up error flag in accept
32  *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
33  *					was buggy. Put a remove_sock() in the handler
34  *					for memory when we hit 0. Also altered the timer
35  *					code. The ACK stuff can wait and needs major
36  *					TCP layer surgery.
37  *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
38  *					and fixed timer/inet_bh race.
39  *		Alan Cox	:	Added zapped flag for TCP
40  *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
41  *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
42  *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
43  *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
44  *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
45  *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
46  *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
47  *	Pauline Middelink	:	identd support
48  *		Alan Cox	:	Fixed connect() taking signals I think.
49  *		Alan Cox	:	SO_LINGER supported
50  *		Alan Cox	:	Error reporting fixes
51  *		Anonymous	:	inet_create tidied up (sk->reuse setting)
52  *		Alan Cox	:	inet sockets don't set sk->type!
53  *		Alan Cox	:	Split socket option code
54  *		Alan Cox	:	Callbacks
55  *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
56  *		Alex		:	Removed restriction on inet fioctl
57  *		Alan Cox	:	Splitting INET from NET core
58  *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
59  *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
60  *		Alan Cox	:	Split IP from generic code
61  *		Alan Cox	:	New kfree_skbmem()
62  *		Alan Cox	:	Make SO_DEBUG superuser only.
63  *		Alan Cox	:	Allow anyone to clear SO_DEBUG
64  *					(compatibility fix)
65  *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
66  *		Alan Cox	:	Allocator for a socket is settable.
67  *		Alan Cox	:	SO_ERROR includes soft errors.
68  *		Alan Cox	:	Allow NULL arguments on some SO_ opts
69  *		Alan Cox	: 	Generic socket allocation to make hooks
70  *					easier (suggested by Craig Metz).
71  *		Michael Pall	:	SO_ERROR returns positive errno again
72  *              Steve Whitehouse:       Added default destructor to free
73  *                                      protocol private data.
74  *              Steve Whitehouse:       Added various other default routines
75  *                                      common to several socket families.
76  *              Chris Evans     :       Call suser() check last on F_SETOWN
77  *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
78  *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
79  *		Andi Kleen	:	Fix write_space callback
80  *		Chris Evans	:	Security fixes - signedness again
81  *		Arnaldo C. Melo :       cleanups, use skb_queue_purge
82  *
83  * To Fix:
84  */
85 
86 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
87 
88 #include <asm/unaligned.h>
89 #include <linux/capability.h>
90 #include <linux/errno.h>
91 #include <linux/errqueue.h>
92 #include <linux/types.h>
93 #include <linux/socket.h>
94 #include <linux/in.h>
95 #include <linux/kernel.h>
96 #include <linux/module.h>
97 #include <linux/proc_fs.h>
98 #include <linux/seq_file.h>
99 #include <linux/sched.h>
100 #include <linux/sched/mm.h>
101 #include <linux/timer.h>
102 #include <linux/string.h>
103 #include <linux/sockios.h>
104 #include <linux/net.h>
105 #include <linux/mm.h>
106 #include <linux/slab.h>
107 #include <linux/interrupt.h>
108 #include <linux/poll.h>
109 #include <linux/tcp.h>
110 #include <linux/udp.h>
111 #include <linux/init.h>
112 #include <linux/highmem.h>
113 #include <linux/user_namespace.h>
114 #include <linux/static_key.h>
115 #include <linux/memcontrol.h>
116 #include <linux/prefetch.h>
117 #include <linux/compat.h>
118 #include <linux/mroute.h>
119 #include <linux/mroute6.h>
120 #include <linux/icmpv6.h>
121 
122 #include <linux/uaccess.h>
123 
124 #include <linux/netdevice.h>
125 #include <net/protocol.h>
126 #include <linux/skbuff.h>
127 #include <net/net_namespace.h>
128 #include <net/request_sock.h>
129 #include <net/sock.h>
130 #include <linux/net_tstamp.h>
131 #include <net/xfrm.h>
132 #include <linux/ipsec.h>
133 #include <net/cls_cgroup.h>
134 #include <net/netprio_cgroup.h>
135 #include <linux/sock_diag.h>
136 
137 #include <linux/filter.h>
138 #include <net/sock_reuseport.h>
139 #include <net/bpf_sk_storage.h>
140 
141 #include <trace/events/sock.h>
142 
143 #include <net/tcp.h>
144 #include <net/busy_poll.h>
145 #include <net/phonet/phonet.h>
146 
147 #include <linux/ethtool.h>
148 
149 #include "dev.h"
150 
151 static DEFINE_MUTEX(proto_list_mutex);
152 static LIST_HEAD(proto_list);
153 
154 static void sock_def_write_space_wfree(struct sock *sk);
155 static void sock_def_write_space(struct sock *sk);
156 
157 /**
158  * sk_ns_capable - General socket capability test
159  * @sk: Socket to use a capability on or through
160  * @user_ns: The user namespace of the capability to use
161  * @cap: The capability to use
162  *
163  * Test to see if the opener of the socket had the capability @cap in
164  * the user namespace @user_ns when the socket was created, and whether
165  * the current process has it as well.
166  */
167 bool sk_ns_capable(const struct sock *sk,
168 		   struct user_namespace *user_ns, int cap)
169 {
170 	return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
171 		ns_capable(user_ns, cap);
172 }
173 EXPORT_SYMBOL(sk_ns_capable);
174 
175 /**
176  * sk_capable - Socket global capability test
177  * @sk: Socket to use a capability on or through
178  * @cap: The global capability to use
179  *
180  * Test to see if the opener of the socket had the capability @cap in all
181  * user namespaces when the socket was created, and whether the current
182  * process has it as well.
183  */
184 bool sk_capable(const struct sock *sk, int cap)
185 {
186 	return sk_ns_capable(sk, &init_user_ns, cap);
187 }
188 EXPORT_SYMBOL(sk_capable);
189 
190 /**
191  * sk_net_capable - Network namespace socket capability test
192  * @sk: Socket to use a capability on or through
193  * @cap: The capability to use
194  *
195  * Test to see if the opener of the socket had the capability @cap over the
196  * network namespace the socket is a member of when the socket was created,
197  * and whether the current process has it as well.
198  */
199 bool sk_net_capable(const struct sock *sk, int cap)
200 {
201 	return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
202 }
203 EXPORT_SYMBOL(sk_net_capable);
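/*
 * Editor's note: illustrative sketch, not part of sock.c. It shows how a
 * protocol-level handler might use the helpers above to gate a privileged
 * operation; example_set_privileged_opt() is a hypothetical name, only
 * sk_net_capable() and CAP_NET_ADMIN come from the kernel API.
 */
static int example_set_privileged_opt(struct sock *sk, int val)
{
	/* require CAP_NET_ADMIN from both the socket opener (at creation
	 * time) and the current task, in the user namespace that owns the
	 * socket's network namespace
	 */
	if (!sk_net_capable(sk, CAP_NET_ADMIN))
		return -EPERM;

	/* ... apply the privileged setting to @sk using @val ... */
	return 0;
}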
204 
205 /*
206  * Each address family might have different locking rules, so we have
207  * one slock key per address family and separate keys for internal and
208  * userspace sockets.
209  */
210 static struct lock_class_key af_family_keys[AF_MAX];
211 static struct lock_class_key af_family_kern_keys[AF_MAX];
212 static struct lock_class_key af_family_slock_keys[AF_MAX];
213 static struct lock_class_key af_family_kern_slock_keys[AF_MAX];
214 
215 /*
216  * Make lock validator output more readable. (we pre-construct these
217  * strings build-time, so that runtime initialization of socket
218  * locks is fast):
219  */
220 
221 #define _sock_locks(x)						  \
222   x "AF_UNSPEC",	x "AF_UNIX"     ,	x "AF_INET"     , \
223   x "AF_AX25"  ,	x "AF_IPX"      ,	x "AF_APPLETALK", \
224   x "AF_NETROM",	x "AF_BRIDGE"   ,	x "AF_ATMPVC"   , \
225   x "AF_X25"   ,	x "AF_INET6"    ,	x "AF_ROSE"     , \
226   x "AF_DECnet",	x "AF_NETBEUI"  ,	x "AF_SECURITY" , \
227   x "AF_KEY"   ,	x "AF_NETLINK"  ,	x "AF_PACKET"   , \
228   x "AF_ASH"   ,	x "AF_ECONET"   ,	x "AF_ATMSVC"   , \
229   x "AF_RDS"   ,	x "AF_SNA"      ,	x "AF_IRDA"     , \
230   x "AF_PPPOX" ,	x "AF_WANPIPE"  ,	x "AF_LLC"      , \
231   x "27"       ,	x "28"          ,	x "AF_CAN"      , \
232   x "AF_TIPC"  ,	x "AF_BLUETOOTH",	x "IUCV"        , \
233   x "AF_RXRPC" ,	x "AF_ISDN"     ,	x "AF_PHONET"   , \
234   x "AF_IEEE802154",	x "AF_CAIF"	,	x "AF_ALG"      , \
235   x "AF_NFC"   ,	x "AF_VSOCK"    ,	x "AF_KCM"      , \
236   x "AF_QIPCRTR",	x "AF_SMC"	,	x "AF_XDP"	, \
237   x "AF_MCTP"  , \
238   x "AF_MAX"
239 
240 static const char *const af_family_key_strings[AF_MAX+1] = {
241 	_sock_locks("sk_lock-")
242 };
243 static const char *const af_family_slock_key_strings[AF_MAX+1] = {
244 	_sock_locks("slock-")
245 };
246 static const char *const af_family_clock_key_strings[AF_MAX+1] = {
247 	_sock_locks("clock-")
248 };
249 
250 static const char *const af_family_kern_key_strings[AF_MAX+1] = {
251 	_sock_locks("k-sk_lock-")
252 };
253 static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
254 	_sock_locks("k-slock-")
255 };
256 static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
257 	_sock_locks("k-clock-")
258 };
259 static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
260 	_sock_locks("rlock-")
261 };
262 static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
263 	_sock_locks("wlock-")
264 };
265 static const char *const af_family_elock_key_strings[AF_MAX+1] = {
266 	_sock_locks("elock-")
267 };
268 
269 /*
270  * sk_callback_lock and sk queues locking rules are per-address-family,
271  * so split the lock classes by using a per-AF key:
272  */
273 static struct lock_class_key af_callback_keys[AF_MAX];
274 static struct lock_class_key af_rlock_keys[AF_MAX];
275 static struct lock_class_key af_wlock_keys[AF_MAX];
276 static struct lock_class_key af_elock_keys[AF_MAX];
277 static struct lock_class_key af_kern_callback_keys[AF_MAX];
278 
279 /* Run time adjustable parameters. */
280 __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
281 EXPORT_SYMBOL(sysctl_wmem_max);
282 __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
283 EXPORT_SYMBOL(sysctl_rmem_max);
284 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
285 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
286 int sysctl_mem_pcpu_rsv __read_mostly = SK_MEMORY_PCPU_RESERVE;
287 
288 /* Maximal space eaten by iovec or ancillary data plus some space */
289 int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
290 EXPORT_SYMBOL(sysctl_optmem_max);
291 
292 int sysctl_tstamp_allow_data __read_mostly = 1;
293 
294 DEFINE_STATIC_KEY_FALSE(memalloc_socks_key);
295 EXPORT_SYMBOL_GPL(memalloc_socks_key);
296 
297 /**
298  * sk_set_memalloc - sets %SOCK_MEMALLOC
299  * @sk: socket to set it on
300  *
301  * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
302  * It's the responsibility of the admin to adjust min_free_kbytes
303  * to meet the requirements
304  */
305 void sk_set_memalloc(struct sock *sk)
306 {
307 	sock_set_flag(sk, SOCK_MEMALLOC);
308 	sk->sk_allocation |= __GFP_MEMALLOC;
309 	static_branch_inc(&memalloc_socks_key);
310 }
311 EXPORT_SYMBOL_GPL(sk_set_memalloc);
312 
313 void sk_clear_memalloc(struct sock *sk)
314 {
315 	sock_reset_flag(sk, SOCK_MEMALLOC);
316 	sk->sk_allocation &= ~__GFP_MEMALLOC;
317 	static_branch_dec(&memalloc_socks_key);
318 
319 	/*
320 	 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
321 	 * progress of swapping. SOCK_MEMALLOC may be cleared while
322 	 * it has rmem allocations due to the last swapfile being deactivated
323 	 * but there is a risk that the socket is unusable due to exceeding
324 	 * the rmem limits. Reclaim the reserves and obey rmem limits again.
325 	 */
326 	sk_mem_reclaim(sk);
327 }
328 EXPORT_SYMBOL_GPL(sk_clear_memalloc);
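/*
 * Editor's note: illustrative sketch, not part of sock.c. The intended
 * users of sk_set_memalloc()/sk_clear_memalloc() are transports that carry
 * swap traffic (network block device style drivers, for example). The
 * example_* function names are hypothetical.
 */
static void example_swap_transport_attach(struct socket *sock)
{
	/* let this socket dip into the emergency reserves so swapping
	 * over the network can make forward progress
	 */
	sk_set_memalloc(sock->sk);
}

static void example_swap_transport_detach(struct socket *sock)
{
	/* drop the privilege; rmem limits apply again and any excess
	 * is reclaimed by sk_clear_memalloc()
	 */
	sk_clear_memalloc(sock->sk);
}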
329 
330 int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
331 {
332 	int ret;
333 	unsigned int noreclaim_flag;
334 
335 	/* these should have been dropped before queueing */
336 	BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
337 
338 	noreclaim_flag = memalloc_noreclaim_save();
339 	ret = INDIRECT_CALL_INET(sk->sk_backlog_rcv,
340 				 tcp_v6_do_rcv,
341 				 tcp_v4_do_rcv,
342 				 sk, skb);
343 	memalloc_noreclaim_restore(noreclaim_flag);
344 
345 	return ret;
346 }
347 EXPORT_SYMBOL(__sk_backlog_rcv);
348 
349 void sk_error_report(struct sock *sk)
350 {
351 	sk->sk_error_report(sk);
352 
353 	switch (sk->sk_family) {
354 	case AF_INET:
355 		fallthrough;
356 	case AF_INET6:
357 		trace_inet_sk_error_report(sk);
358 		break;
359 	default:
360 		break;
361 	}
362 }
363 EXPORT_SYMBOL(sk_error_report);
364 
365 int sock_get_timeout(long timeo, void *optval, bool old_timeval)
366 {
367 	struct __kernel_sock_timeval tv;
368 
369 	if (timeo == MAX_SCHEDULE_TIMEOUT) {
370 		tv.tv_sec = 0;
371 		tv.tv_usec = 0;
372 	} else {
373 		tv.tv_sec = timeo / HZ;
374 		tv.tv_usec = ((timeo % HZ) * USEC_PER_SEC) / HZ;
375 	}
376 
377 	if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
378 		struct old_timeval32 tv32 = { tv.tv_sec, tv.tv_usec };
379 		*(struct old_timeval32 *)optval = tv32;
380 		return sizeof(tv32);
381 	}
382 
383 	if (old_timeval) {
384 		struct __kernel_old_timeval old_tv;
385 		old_tv.tv_sec = tv.tv_sec;
386 		old_tv.tv_usec = tv.tv_usec;
387 		*(struct __kernel_old_timeval *)optval = old_tv;
388 		return sizeof(old_tv);
389 	}
390 
391 	*(struct __kernel_sock_timeval *)optval = tv;
392 	return sizeof(tv);
393 }
394 EXPORT_SYMBOL(sock_get_timeout);
395 
396 int sock_copy_user_timeval(struct __kernel_sock_timeval *tv,
397 			   sockptr_t optval, int optlen, bool old_timeval)
398 {
399 	if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
400 		struct old_timeval32 tv32;
401 
402 		if (optlen < sizeof(tv32))
403 			return -EINVAL;
404 
405 		if (copy_from_sockptr(&tv32, optval, sizeof(tv32)))
406 			return -EFAULT;
407 		tv->tv_sec = tv32.tv_sec;
408 		tv->tv_usec = tv32.tv_usec;
409 	} else if (old_timeval) {
410 		struct __kernel_old_timeval old_tv;
411 
412 		if (optlen < sizeof(old_tv))
413 			return -EINVAL;
414 		if (copy_from_sockptr(&old_tv, optval, sizeof(old_tv)))
415 			return -EFAULT;
416 		tv->tv_sec = old_tv.tv_sec;
417 		tv->tv_usec = old_tv.tv_usec;
418 	} else {
419 		if (optlen < sizeof(*tv))
420 			return -EINVAL;
421 		if (copy_from_sockptr(tv, optval, sizeof(*tv)))
422 			return -EFAULT;
423 	}
424 
425 	return 0;
426 }
427 EXPORT_SYMBOL(sock_copy_user_timeval);
428 
429 static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen,
430 			    bool old_timeval)
431 {
432 	struct __kernel_sock_timeval tv;
433 	int err = sock_copy_user_timeval(&tv, optval, optlen, old_timeval);
434 	long val;
435 
436 	if (err)
437 		return err;
438 
439 	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
440 		return -EDOM;
441 
442 	if (tv.tv_sec < 0) {
443 		static int warned __read_mostly;
444 
445 		WRITE_ONCE(*timeo_p, 0);
446 		if (warned < 10 && net_ratelimit()) {
447 			warned++;
448 			pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
449 				__func__, current->comm, task_pid_nr(current));
450 		}
451 		return 0;
452 	}
453 	val = MAX_SCHEDULE_TIMEOUT;
454 	if ((tv.tv_sec || tv.tv_usec) &&
455 	    (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1)))
456 		val = tv.tv_sec * HZ + DIV_ROUND_UP((unsigned long)tv.tv_usec,
457 						    USEC_PER_SEC / HZ);
458 	WRITE_ONCE(*timeo_p, val);
459 	return 0;
460 }
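/*
 * Editor's note: illustrative userspace sketch, not part of sock.c.
 * SO_RCVTIMEO/SO_SNDTIMEO take a struct timeval which sock_set_timeout()
 * above converts to jiffies; an all-zero timeval means "block forever".
 * example_set_rcv_timeout() is a hypothetical helper.
 */
#include <sys/socket.h>
#include <sys/time.h>

static int example_set_rcv_timeout(int fd)
{
	struct timeval tv = { .tv_sec = 2, .tv_usec = 500000 };	/* 2.5 s */

	return setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv));
}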
461 
462 static bool sock_needs_netstamp(const struct sock *sk)
463 {
464 	switch (sk->sk_family) {
465 	case AF_UNSPEC:
466 	case AF_UNIX:
467 		return false;
468 	default:
469 		return true;
470 	}
471 }
472 
473 static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
474 {
475 	if (sk->sk_flags & flags) {
476 		sk->sk_flags &= ~flags;
477 		if (sock_needs_netstamp(sk) &&
478 		    !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
479 			net_disable_timestamp();
480 	}
481 }
482 
483 
484 int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
485 {
486 	unsigned long flags;
487 	struct sk_buff_head *list = &sk->sk_receive_queue;
488 
489 	if (atomic_read(&sk->sk_rmem_alloc) >= READ_ONCE(sk->sk_rcvbuf)) {
490 		atomic_inc(&sk->sk_drops);
491 		trace_sock_rcvqueue_full(sk, skb);
492 		return -ENOMEM;
493 	}
494 
495 	if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
496 		atomic_inc(&sk->sk_drops);
497 		return -ENOBUFS;
498 	}
499 
500 	skb->dev = NULL;
501 	skb_set_owner_r(skb, sk);
502 
503 	/* we escape from the RCU-protected region, make sure we don't leak
504 	 * a non-refcounted dst
505 	 */
506 	skb_dst_force(skb);
507 
508 	spin_lock_irqsave(&list->lock, flags);
509 	sock_skb_set_dropcount(sk, skb);
510 	__skb_queue_tail(list, skb);
511 	spin_unlock_irqrestore(&list->lock, flags);
512 
513 	if (!sock_flag(sk, SOCK_DEAD))
514 		sk->sk_data_ready(sk);
515 	return 0;
516 }
517 EXPORT_SYMBOL(__sock_queue_rcv_skb);
518 
519 int sock_queue_rcv_skb_reason(struct sock *sk, struct sk_buff *skb,
520 			      enum skb_drop_reason *reason)
521 {
522 	enum skb_drop_reason drop_reason;
523 	int err;
524 
525 	err = sk_filter(sk, skb);
526 	if (err) {
527 		drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
528 		goto out;
529 	}
530 	err = __sock_queue_rcv_skb(sk, skb);
531 	switch (err) {
532 	case -ENOMEM:
533 		drop_reason = SKB_DROP_REASON_SOCKET_RCVBUFF;
534 		break;
535 	case -ENOBUFS:
536 		drop_reason = SKB_DROP_REASON_PROTO_MEM;
537 		break;
538 	default:
539 		drop_reason = SKB_NOT_DROPPED_YET;
540 		break;
541 	}
542 out:
543 	if (reason)
544 		*reason = drop_reason;
545 	return err;
546 }
547 EXPORT_SYMBOL(sock_queue_rcv_skb_reason);
548 
549 int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
550 		     const int nested, unsigned int trim_cap, bool refcounted)
551 {
552 	int rc = NET_RX_SUCCESS;
553 
554 	if (sk_filter_trim_cap(sk, skb, trim_cap))
555 		goto discard_and_relse;
556 
557 	skb->dev = NULL;
558 
559 	if (sk_rcvqueues_full(sk, READ_ONCE(sk->sk_rcvbuf))) {
560 		atomic_inc(&sk->sk_drops);
561 		goto discard_and_relse;
562 	}
563 	if (nested)
564 		bh_lock_sock_nested(sk);
565 	else
566 		bh_lock_sock(sk);
567 	if (!sock_owned_by_user(sk)) {
568 		/*
569 		 * trylock + unlock semantics:
570 		 */
571 		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
572 
573 		rc = sk_backlog_rcv(sk, skb);
574 
575 		mutex_release(&sk->sk_lock.dep_map, _RET_IP_);
576 	} else if (sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf))) {
577 		bh_unlock_sock(sk);
578 		atomic_inc(&sk->sk_drops);
579 		goto discard_and_relse;
580 	}
581 
582 	bh_unlock_sock(sk);
583 out:
584 	if (refcounted)
585 		sock_put(sk);
586 	return rc;
587 discard_and_relse:
588 	kfree_skb(skb);
589 	goto out;
590 }
591 EXPORT_SYMBOL(__sk_receive_skb);
592 
593 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ip6_dst_check(struct dst_entry *,
594 							  u32));
595 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
596 							   u32));
597 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
598 {
599 	struct dst_entry *dst = __sk_dst_get(sk);
600 
601 	if (dst && dst->obsolete &&
602 	    INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
603 			       dst, cookie) == NULL) {
604 		sk_tx_queue_clear(sk);
605 		WRITE_ONCE(sk->sk_dst_pending_confirm, 0);
606 		RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
607 		dst_release(dst);
608 		return NULL;
609 	}
610 
611 	return dst;
612 }
613 EXPORT_SYMBOL(__sk_dst_check);
614 
615 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
616 {
617 	struct dst_entry *dst = sk_dst_get(sk);
618 
619 	if (dst && dst->obsolete &&
620 	    INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
621 			       dst, cookie) == NULL) {
622 		sk_dst_reset(sk);
623 		dst_release(dst);
624 		return NULL;
625 	}
626 
627 	return dst;
628 }
629 EXPORT_SYMBOL(sk_dst_check);
630 
631 static int sock_bindtoindex_locked(struct sock *sk, int ifindex)
632 {
633 	int ret = -ENOPROTOOPT;
634 #ifdef CONFIG_NETDEVICES
635 	struct net *net = sock_net(sk);
636 
637 	/* Sorry... */
638 	ret = -EPERM;
639 	if (sk->sk_bound_dev_if && !ns_capable(net->user_ns, CAP_NET_RAW))
640 		goto out;
641 
642 	ret = -EINVAL;
643 	if (ifindex < 0)
644 		goto out;
645 
646 	/* Paired with all READ_ONCE() done locklessly. */
647 	WRITE_ONCE(sk->sk_bound_dev_if, ifindex);
648 
649 	if (sk->sk_prot->rehash)
650 		sk->sk_prot->rehash(sk);
651 	sk_dst_reset(sk);
652 
653 	ret = 0;
654 
655 out:
656 #endif
657 
658 	return ret;
659 }
660 
661 int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk)
662 {
663 	int ret;
664 
665 	if (lock_sk)
666 		lock_sock(sk);
667 	ret = sock_bindtoindex_locked(sk, ifindex);
668 	if (lock_sk)
669 		release_sock(sk);
670 
671 	return ret;
672 }
673 EXPORT_SYMBOL(sock_bindtoindex);
674 
675 static int sock_setbindtodevice(struct sock *sk, sockptr_t optval, int optlen)
676 {
677 	int ret = -ENOPROTOOPT;
678 #ifdef CONFIG_NETDEVICES
679 	struct net *net = sock_net(sk);
680 	char devname[IFNAMSIZ];
681 	int index;
682 
683 	ret = -EINVAL;
684 	if (optlen < 0)
685 		goto out;
686 
687 	/* Bind this socket to a particular device like "eth0",
688 	 * as specified in the passed interface name. If the
689 	 * name is "" or the option length is zero the socket
690 	 * is not bound.
691 	 */
692 	if (optlen > IFNAMSIZ - 1)
693 		optlen = IFNAMSIZ - 1;
694 	memset(devname, 0, sizeof(devname));
695 
696 	ret = -EFAULT;
697 	if (copy_from_sockptr(devname, optval, optlen))
698 		goto out;
699 
700 	index = 0;
701 	if (devname[0] != '\0') {
702 		struct net_device *dev;
703 
704 		rcu_read_lock();
705 		dev = dev_get_by_name_rcu(net, devname);
706 		if (dev)
707 			index = dev->ifindex;
708 		rcu_read_unlock();
709 		ret = -ENODEV;
710 		if (!dev)
711 			goto out;
712 	}
713 
714 	sockopt_lock_sock(sk);
715 	ret = sock_bindtoindex_locked(sk, index);
716 	sockopt_release_sock(sk);
717 out:
718 #endif
719 
720 	return ret;
721 }
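/*
 * Editor's note: illustrative userspace sketch, not part of sock.c.
 * SO_BINDTODEVICE passes an interface name to sock_setbindtodevice()
 * above; a zero-length name removes the binding.
 * example_bind_to_interface() is a hypothetical helper.
 */
#include <string.h>
#include <sys/socket.h>

static int example_bind_to_interface(int fd, const char *ifname)
{
	return setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE,
			  ifname, strlen(ifname));
}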
722 
723 static int sock_getbindtodevice(struct sock *sk, sockptr_t optval,
724 				sockptr_t optlen, int len)
725 {
726 	int ret = -ENOPROTOOPT;
727 #ifdef CONFIG_NETDEVICES
728 	int bound_dev_if = READ_ONCE(sk->sk_bound_dev_if);
729 	struct net *net = sock_net(sk);
730 	char devname[IFNAMSIZ];
731 
732 	if (bound_dev_if == 0) {
733 		len = 0;
734 		goto zero;
735 	}
736 
737 	ret = -EINVAL;
738 	if (len < IFNAMSIZ)
739 		goto out;
740 
741 	ret = netdev_get_name(net, devname, bound_dev_if);
742 	if (ret)
743 		goto out;
744 
745 	len = strlen(devname) + 1;
746 
747 	ret = -EFAULT;
748 	if (copy_to_sockptr(optval, devname, len))
749 		goto out;
750 
751 zero:
752 	ret = -EFAULT;
753 	if (copy_to_sockptr(optlen, &len, sizeof(int)))
754 		goto out;
755 
756 	ret = 0;
757 
758 out:
759 #endif
760 
761 	return ret;
762 }
763 
764 bool sk_mc_loop(struct sock *sk)
765 {
766 	if (dev_recursion_level())
767 		return false;
768 	if (!sk)
769 		return true;
770 	/* IPV6_ADDRFORM can change sk->sk_family under us. */
771 	switch (READ_ONCE(sk->sk_family)) {
772 	case AF_INET:
773 		return inet_test_bit(MC_LOOP, sk);
774 #if IS_ENABLED(CONFIG_IPV6)
775 	case AF_INET6:
776 		return inet6_sk(sk)->mc_loop;
777 #endif
778 	}
779 	WARN_ON_ONCE(1);
780 	return true;
781 }
782 EXPORT_SYMBOL(sk_mc_loop);
783 
784 void sock_set_reuseaddr(struct sock *sk)
785 {
786 	lock_sock(sk);
787 	sk->sk_reuse = SK_CAN_REUSE;
788 	release_sock(sk);
789 }
790 EXPORT_SYMBOL(sock_set_reuseaddr);
791 
792 void sock_set_reuseport(struct sock *sk)
793 {
794 	lock_sock(sk);
795 	sk->sk_reuseport = true;
796 	release_sock(sk);
797 }
798 EXPORT_SYMBOL(sock_set_reuseport);
799 
800 void sock_no_linger(struct sock *sk)
801 {
802 	lock_sock(sk);
803 	WRITE_ONCE(sk->sk_lingertime, 0);
804 	sock_set_flag(sk, SOCK_LINGER);
805 	release_sock(sk);
806 }
807 EXPORT_SYMBOL(sock_no_linger);
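/*
 * Editor's note: illustrative userspace sketch, not part of sock.c.
 * The kernel-internal sock_no_linger() above corresponds to SO_LINGER
 * with l_onoff = 1 and l_linger = 0, i.e. close() aborts the connection
 * instead of lingering. example_abort_on_close() is hypothetical.
 */
#include <sys/socket.h>

static int example_abort_on_close(int fd)
{
	struct linger ling = { .l_onoff = 1, .l_linger = 0 };

	return setsockopt(fd, SOL_SOCKET, SO_LINGER, &ling, sizeof(ling));
}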
808 
809 void sock_set_priority(struct sock *sk, u32 priority)
810 {
811 	lock_sock(sk);
812 	WRITE_ONCE(sk->sk_priority, priority);
813 	release_sock(sk);
814 }
815 EXPORT_SYMBOL(sock_set_priority);
816 
817 void sock_set_sndtimeo(struct sock *sk, s64 secs)
818 {
819 	lock_sock(sk);
820 	if (secs && secs < MAX_SCHEDULE_TIMEOUT / HZ - 1)
821 		WRITE_ONCE(sk->sk_sndtimeo, secs * HZ);
822 	else
823 		WRITE_ONCE(sk->sk_sndtimeo, MAX_SCHEDULE_TIMEOUT);
824 	release_sock(sk);
825 }
826 EXPORT_SYMBOL(sock_set_sndtimeo);
827 
828 static void __sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns)
829 {
830 	if (val)  {
831 		sock_valbool_flag(sk, SOCK_TSTAMP_NEW, new);
832 		sock_valbool_flag(sk, SOCK_RCVTSTAMPNS, ns);
833 		sock_set_flag(sk, SOCK_RCVTSTAMP);
834 		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
835 	} else {
836 		sock_reset_flag(sk, SOCK_RCVTSTAMP);
837 		sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
838 	}
839 }
840 
841 void sock_enable_timestamps(struct sock *sk)
842 {
843 	lock_sock(sk);
844 	__sock_set_timestamps(sk, true, false, true);
845 	release_sock(sk);
846 }
847 EXPORT_SYMBOL(sock_enable_timestamps);
848 
849 void sock_set_timestamp(struct sock *sk, int optname, bool valbool)
850 {
851 	switch (optname) {
852 	case SO_TIMESTAMP_OLD:
853 		__sock_set_timestamps(sk, valbool, false, false);
854 		break;
855 	case SO_TIMESTAMP_NEW:
856 		__sock_set_timestamps(sk, valbool, true, false);
857 		break;
858 	case SO_TIMESTAMPNS_OLD:
859 		__sock_set_timestamps(sk, valbool, false, true);
860 		break;
861 	case SO_TIMESTAMPNS_NEW:
862 		__sock_set_timestamps(sk, valbool, true, true);
863 		break;
864 	}
865 }
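/*
 * Editor's note: illustrative userspace sketch, not part of sock.c.
 * Enabling SO_TIMESTAMPNS (routed through sock_set_timestamp() above)
 * makes recvmsg() attach an SCM_TIMESTAMPNS control message carrying a
 * struct timespec for each received packet.
 * example_enable_rx_timestamps() is hypothetical.
 */
#include <sys/socket.h>

static int example_enable_rx_timestamps(int fd)
{
	int on = 1;

	return setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPNS, &on, sizeof(on));
}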
866 
867 static int sock_timestamping_bind_phc(struct sock *sk, int phc_index)
868 {
869 	struct net *net = sock_net(sk);
870 	struct net_device *dev = NULL;
871 	bool match = false;
872 	int *vclock_index;
873 	int i, num;
874 
875 	if (sk->sk_bound_dev_if)
876 		dev = dev_get_by_index(net, sk->sk_bound_dev_if);
877 
878 	if (!dev) {
879 		pr_err("%s: socket not bound to a device\n", __func__);
880 		return -EOPNOTSUPP;
881 	}
882 
883 	num = ethtool_get_phc_vclocks(dev, &vclock_index);
884 	dev_put(dev);
885 
886 	for (i = 0; i < num; i++) {
887 		if (*(vclock_index + i) == phc_index) {
888 			match = true;
889 			break;
890 		}
891 	}
892 
893 	if (num > 0)
894 		kfree(vclock_index);
895 
896 	if (!match)
897 		return -EINVAL;
898 
899 	WRITE_ONCE(sk->sk_bind_phc, phc_index);
900 
901 	return 0;
902 }
903 
904 int sock_set_timestamping(struct sock *sk, int optname,
905 			  struct so_timestamping timestamping)
906 {
907 	int val = timestamping.flags;
908 	int ret;
909 
910 	if (val & ~SOF_TIMESTAMPING_MASK)
911 		return -EINVAL;
912 
913 	if (val & SOF_TIMESTAMPING_OPT_ID_TCP &&
914 	    !(val & SOF_TIMESTAMPING_OPT_ID))
915 		return -EINVAL;
916 
917 	if (val & SOF_TIMESTAMPING_OPT_ID &&
918 	    !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
919 		if (sk_is_tcp(sk)) {
920 			if ((1 << sk->sk_state) &
921 			    (TCPF_CLOSE | TCPF_LISTEN))
922 				return -EINVAL;
923 			if (val & SOF_TIMESTAMPING_OPT_ID_TCP)
924 				atomic_set(&sk->sk_tskey, tcp_sk(sk)->write_seq);
925 			else
926 				atomic_set(&sk->sk_tskey, tcp_sk(sk)->snd_una);
927 		} else {
928 			atomic_set(&sk->sk_tskey, 0);
929 		}
930 	}
931 
932 	if (val & SOF_TIMESTAMPING_OPT_STATS &&
933 	    !(val & SOF_TIMESTAMPING_OPT_TSONLY))
934 		return -EINVAL;
935 
936 	if (val & SOF_TIMESTAMPING_BIND_PHC) {
937 		ret = sock_timestamping_bind_phc(sk, timestamping.bind_phc);
938 		if (ret)
939 			return ret;
940 	}
941 
942 	WRITE_ONCE(sk->sk_tsflags, val);
943 	sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW);
944 
945 	if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
946 		sock_enable_timestamp(sk,
947 				      SOCK_TIMESTAMPING_RX_SOFTWARE);
948 	else
949 		sock_disable_timestamp(sk,
950 				       (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
951 	return 0;
952 }
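/*
 * Editor's note: illustrative userspace sketch, not part of sock.c.
 * SO_TIMESTAMPING takes a SOF_TIMESTAMPING_* flags word (or a struct
 * so_timestamping that also carries a PHC index), which is validated by
 * sock_set_timestamping() above. This minimal example requests software
 * receive timestamps only. example_enable_sw_rx_tstamp() is hypothetical.
 */
#include <sys/socket.h>
#include <linux/net_tstamp.h>

static int example_enable_sw_rx_tstamp(int fd)
{
	int flags = SOF_TIMESTAMPING_RX_SOFTWARE | SOF_TIMESTAMPING_SOFTWARE;

	return setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING,
			  &flags, sizeof(flags));
}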
953 
954 void sock_set_keepalive(struct sock *sk)
955 {
956 	lock_sock(sk);
957 	if (sk->sk_prot->keepalive)
958 		sk->sk_prot->keepalive(sk, true);
959 	sock_valbool_flag(sk, SOCK_KEEPOPEN, true);
960 	release_sock(sk);
961 }
962 EXPORT_SYMBOL(sock_set_keepalive);
963 
964 static void __sock_set_rcvbuf(struct sock *sk, int val)
965 {
966 	/* Ensure val * 2 fits into an int, to prevent max_t() from treating it
967 	 * as a negative value.
968 	 */
969 	val = min_t(int, val, INT_MAX / 2);
970 	sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
971 
972 	/* We double it on the way in to account for "struct sk_buff" etc.
973 	 * overhead.   Applications assume that the SO_RCVBUF setting they make
974 	 * will allow that much actual data to be received on that socket.
975 	 *
976 	 * Applications are unaware that "struct sk_buff" and other overheads
977 	 * allocate from the receive buffer during socket buffer allocation.
978 	 *
979 	 * And after considering the possible alternatives, returning the value
980 	 * we actually used in getsockopt is the most desirable behavior.
981 	 */
982 	WRITE_ONCE(sk->sk_rcvbuf, max_t(int, val * 2, SOCK_MIN_RCVBUF));
983 }
984 
985 void sock_set_rcvbuf(struct sock *sk, int val)
986 {
987 	lock_sock(sk);
988 	__sock_set_rcvbuf(sk, val);
989 	release_sock(sk);
990 }
991 EXPORT_SYMBOL(sock_set_rcvbuf);
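/*
 * Editor's note: illustrative userspace sketch, not part of sock.c. As the
 * comment in __sock_set_rcvbuf() explains, the kernel doubles the requested
 * size to cover struct sk_buff overhead, so getsockopt(SO_RCVBUF) typically
 * reports twice the value that was set (bounded below by SOCK_MIN_RCVBUF
 * and capped by sysctl_rmem_max unless SO_RCVBUFFORCE is used).
 * example_rcvbuf_roundtrip() is hypothetical.
 */
#include <sys/socket.h>

static int example_rcvbuf_roundtrip(int fd)
{
	int requested = 128 * 1024, effective = 0;
	socklen_t len = sizeof(effective);

	if (setsockopt(fd, SOL_SOCKET, SO_RCVBUF,
		       &requested, sizeof(requested)))
		return -1;
	if (getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &effective, &len))
		return -1;

	return effective;	/* usually 2 * requested */
}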
992 
993 static void __sock_set_mark(struct sock *sk, u32 val)
994 {
995 	if (val != sk->sk_mark) {
996 		WRITE_ONCE(sk->sk_mark, val);
997 		sk_dst_reset(sk);
998 	}
999 }
1000 
1001 void sock_set_mark(struct sock *sk, u32 val)
1002 {
1003 	lock_sock(sk);
1004 	__sock_set_mark(sk, val);
1005 	release_sock(sk);
1006 }
1007 EXPORT_SYMBOL(sock_set_mark);
1008 
1009 static void sock_release_reserved_memory(struct sock *sk, int bytes)
1010 {
1011 	/* Round down bytes to multiple of pages */
1012 	bytes = round_down(bytes, PAGE_SIZE);
1013 
1014 	WARN_ON(bytes > sk->sk_reserved_mem);
1015 	WRITE_ONCE(sk->sk_reserved_mem, sk->sk_reserved_mem - bytes);
1016 	sk_mem_reclaim(sk);
1017 }
1018 
1019 static int sock_reserve_memory(struct sock *sk, int bytes)
1020 {
1021 	long allocated;
1022 	bool charged;
1023 	int pages;
1024 
1025 	if (!mem_cgroup_sockets_enabled || !sk->sk_memcg || !sk_has_account(sk))
1026 		return -EOPNOTSUPP;
1027 
1028 	if (!bytes)
1029 		return 0;
1030 
1031 	pages = sk_mem_pages(bytes);
1032 
1033 	/* pre-charge to memcg */
1034 	charged = mem_cgroup_charge_skmem(sk->sk_memcg, pages,
1035 					  GFP_KERNEL | __GFP_RETRY_MAYFAIL);
1036 	if (!charged)
1037 		return -ENOMEM;
1038 
1039 	/* pre-charge to forward_alloc */
1040 	sk_memory_allocated_add(sk, pages);
1041 	allocated = sk_memory_allocated(sk);
1042 	/* If the system goes into memory pressure with this
1043 	 * precharge, give up and return error.
1044 	 */
1045 	if (allocated > sk_prot_mem_limits(sk, 1)) {
1046 		sk_memory_allocated_sub(sk, pages);
1047 		mem_cgroup_uncharge_skmem(sk->sk_memcg, pages);
1048 		return -ENOMEM;
1049 	}
1050 	sk_forward_alloc_add(sk, pages << PAGE_SHIFT);
1051 
1052 	WRITE_ONCE(sk->sk_reserved_mem,
1053 		   sk->sk_reserved_mem + (pages << PAGE_SHIFT));
1054 
1055 	return 0;
1056 }
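/*
 * Editor's note: illustrative userspace sketch, not part of sock.c.
 * SO_RESERVE_MEM asks sock_reserve_memory() above to pre-charge socket
 * memory (rounded to whole pages and memcg-accounted); it fails with
 * EOPNOTSUPP when memcg socket accounting is not active. The fallback
 * #define and example_reserve_socket_memory() are assumptions made for
 * illustration.
 */
#include <sys/socket.h>

#ifndef SO_RESERVE_MEM
#define SO_RESERVE_MEM	73	/* value from include/uapi/asm-generic/socket.h */
#endif

static int example_reserve_socket_memory(int fd, int bytes)
{
	return setsockopt(fd, SOL_SOCKET, SO_RESERVE_MEM,
			  &bytes, sizeof(bytes));
}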
1057 
1058 void sockopt_lock_sock(struct sock *sk)
1059 {
1060 	/* When current->bpf_ctx is set, the setsockopt is called from
1061 	 * a bpf prog.  bpf has ensured the sk lock has been
1062 	 * acquired before calling setsockopt().
1063 	 */
1064 	if (has_current_bpf_ctx())
1065 		return;
1066 
1067 	lock_sock(sk);
1068 }
1069 EXPORT_SYMBOL(sockopt_lock_sock);
1070 
1071 void sockopt_release_sock(struct sock *sk)
1072 {
1073 	if (has_current_bpf_ctx())
1074 		return;
1075 
1076 	release_sock(sk);
1077 }
1078 EXPORT_SYMBOL(sockopt_release_sock);
1079 
1080 bool sockopt_ns_capable(struct user_namespace *ns, int cap)
1081 {
1082 	return has_current_bpf_ctx() || ns_capable(ns, cap);
1083 }
1084 EXPORT_SYMBOL(sockopt_ns_capable);
1085 
1086 bool sockopt_capable(int cap)
1087 {
1088 	return has_current_bpf_ctx() || capable(cap);
1089 }
1090 EXPORT_SYMBOL(sockopt_capable);
1091 
1092 /*
1093  *	This is meant for all protocols to use and covers goings on
1094  *	at the socket level. Everything here is generic.
1095  */
1096 
1097 int sk_setsockopt(struct sock *sk, int level, int optname,
1098 		  sockptr_t optval, unsigned int optlen)
1099 {
1100 	struct so_timestamping timestamping;
1101 	struct socket *sock = sk->sk_socket;
1102 	struct sock_txtime sk_txtime;
1103 	int val;
1104 	int valbool;
1105 	struct linger ling;
1106 	int ret = 0;
1107 
1108 	/*
1109 	 *	Options without arguments
1110 	 */
1111 
1112 	if (optname == SO_BINDTODEVICE)
1113 		return sock_setbindtodevice(sk, optval, optlen);
1114 
1115 	if (optlen < sizeof(int))
1116 		return -EINVAL;
1117 
1118 	if (copy_from_sockptr(&val, optval, sizeof(val)))
1119 		return -EFAULT;
1120 
1121 	valbool = val ? 1 : 0;
1122 
1123 	sockopt_lock_sock(sk);
1124 
1125 	switch (optname) {
1126 	case SO_DEBUG:
1127 		if (val && !sockopt_capable(CAP_NET_ADMIN))
1128 			ret = -EACCES;
1129 		else
1130 			sock_valbool_flag(sk, SOCK_DBG, valbool);
1131 		break;
1132 	case SO_REUSEADDR:
1133 		sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
1134 		break;
1135 	case SO_REUSEPORT:
1136 		if (valbool && !sk_is_inet(sk))
1137 			ret = -EOPNOTSUPP;
1138 		else
1139 			sk->sk_reuseport = valbool;
1140 		break;
1141 	case SO_TYPE:
1142 	case SO_PROTOCOL:
1143 	case SO_DOMAIN:
1144 	case SO_ERROR:
1145 		ret = -ENOPROTOOPT;
1146 		break;
1147 	case SO_DONTROUTE:
1148 		sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
1149 		sk_dst_reset(sk);
1150 		break;
1151 	case SO_BROADCAST:
1152 		sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
1153 		break;
1154 	case SO_SNDBUF:
1155 		/* Don't error on this; BSD doesn't, and if you think
1156 		 * about it this is right. Otherwise apps have to
1157 		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
1158 		 * are treated in BSD as hints.
1159 		 */
1160 		val = min_t(u32, val, READ_ONCE(sysctl_wmem_max));
1161 set_sndbuf:
1162 		/* Ensure val * 2 fits into an int, to prevent max_t()
1163 		 * from treating it as a negative value.
1164 		 */
1165 		val = min_t(int, val, INT_MAX / 2);
1166 		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
1167 		WRITE_ONCE(sk->sk_sndbuf,
1168 			   max_t(int, val * 2, SOCK_MIN_SNDBUF));
1169 		/* Wake up sending tasks if we upped the value. */
1170 		sk->sk_write_space(sk);
1171 		break;
1172 
1173 	case SO_SNDBUFFORCE:
1174 		if (!sockopt_capable(CAP_NET_ADMIN)) {
1175 			ret = -EPERM;
1176 			break;
1177 		}
1178 
1179 		/* No negative values (to prevent underflow, as val will be
1180 		 * multiplied by 2).
1181 		 */
1182 		if (val < 0)
1183 			val = 0;
1184 		goto set_sndbuf;
1185 
1186 	case SO_RCVBUF:
1187 		/* Don't error on this; BSD doesn't, and if you think
1188 		 * about it this is right. Otherwise apps have to
1189 		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
1190 		 * are treated in BSD as hints.
1191 		 */
1192 		__sock_set_rcvbuf(sk, min_t(u32, val, READ_ONCE(sysctl_rmem_max)));
1193 		break;
1194 
1195 	case SO_RCVBUFFORCE:
1196 		if (!sockopt_capable(CAP_NET_ADMIN)) {
1197 			ret = -EPERM;
1198 			break;
1199 		}
1200 
1201 		/* No negative values (to prevent underflow, as val will be
1202 		 * multiplied by 2).
1203 		 */
1204 		__sock_set_rcvbuf(sk, max(val, 0));
1205 		break;
1206 
1207 	case SO_KEEPALIVE:
1208 		if (sk->sk_prot->keepalive)
1209 			sk->sk_prot->keepalive(sk, valbool);
1210 		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
1211 		break;
1212 
1213 	case SO_OOBINLINE:
1214 		sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
1215 		break;
1216 
1217 	case SO_NO_CHECK:
1218 		sk->sk_no_check_tx = valbool;
1219 		break;
1220 
1221 	case SO_PRIORITY:
1222 		if ((val >= 0 && val <= 6) ||
1223 		    sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) ||
1224 		    sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
1225 			WRITE_ONCE(sk->sk_priority, val);
1226 		else
1227 			ret = -EPERM;
1228 		break;
1229 
1230 	case SO_LINGER:
1231 		if (optlen < sizeof(ling)) {
1232 			ret = -EINVAL;	/* 1003.1g */
1233 			break;
1234 		}
1235 		if (copy_from_sockptr(&ling, optval, sizeof(ling))) {
1236 			ret = -EFAULT;
1237 			break;
1238 		}
1239 		if (!ling.l_onoff) {
1240 			sock_reset_flag(sk, SOCK_LINGER);
1241 		} else {
1242 			unsigned long t_sec = ling.l_linger;
1243 
1244 			if (t_sec >= MAX_SCHEDULE_TIMEOUT / HZ)
1245 				WRITE_ONCE(sk->sk_lingertime, MAX_SCHEDULE_TIMEOUT);
1246 			else
1247 				WRITE_ONCE(sk->sk_lingertime, t_sec * HZ);
1248 			sock_set_flag(sk, SOCK_LINGER);
1249 		}
1250 		break;
1251 
1252 	case SO_BSDCOMPAT:
1253 		break;
1254 
1255 	case SO_PASSCRED:
1256 		assign_bit(SOCK_PASSCRED, &sock->flags, valbool);
1257 		break;
1258 
1259 	case SO_PASSPIDFD:
1260 		assign_bit(SOCK_PASSPIDFD, &sock->flags, valbool);
1261 		break;
1262 
1263 	case SO_TIMESTAMP_OLD:
1264 	case SO_TIMESTAMP_NEW:
1265 	case SO_TIMESTAMPNS_OLD:
1266 	case SO_TIMESTAMPNS_NEW:
1267 		sock_set_timestamp(sk, optname, valbool);
1268 		break;
1269 
1270 	case SO_TIMESTAMPING_NEW:
1271 	case SO_TIMESTAMPING_OLD:
1272 		if (optlen == sizeof(timestamping)) {
1273 			if (copy_from_sockptr(&timestamping, optval,
1274 					      sizeof(timestamping))) {
1275 				ret = -EFAULT;
1276 				break;
1277 			}
1278 		} else {
1279 			memset(&timestamping, 0, sizeof(timestamping));
1280 			timestamping.flags = val;
1281 		}
1282 		ret = sock_set_timestamping(sk, optname, timestamping);
1283 		break;
1284 
1285 	case SO_RCVLOWAT:
1286 		{
1287 		int (*set_rcvlowat)(struct sock *sk, int val) = NULL;
1288 
1289 		if (val < 0)
1290 			val = INT_MAX;
1291 		if (sock)
1292 			set_rcvlowat = READ_ONCE(sock->ops)->set_rcvlowat;
1293 		if (set_rcvlowat)
1294 			ret = set_rcvlowat(sk, val);
1295 		else
1296 			WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
1297 		break;
1298 		}
1299 	case SO_RCVTIMEO_OLD:
1300 	case SO_RCVTIMEO_NEW:
1301 		ret = sock_set_timeout(&sk->sk_rcvtimeo, optval,
1302 				       optlen, optname == SO_RCVTIMEO_OLD);
1303 		break;
1304 
1305 	case SO_SNDTIMEO_OLD:
1306 	case SO_SNDTIMEO_NEW:
1307 		ret = sock_set_timeout(&sk->sk_sndtimeo, optval,
1308 				       optlen, optname == SO_SNDTIMEO_OLD);
1309 		break;
1310 
1311 	case SO_ATTACH_FILTER: {
1312 		struct sock_fprog fprog;
1313 
1314 		ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1315 		if (!ret)
1316 			ret = sk_attach_filter(&fprog, sk);
1317 		break;
1318 	}
1319 	case SO_ATTACH_BPF:
1320 		ret = -EINVAL;
1321 		if (optlen == sizeof(u32)) {
1322 			u32 ufd;
1323 
1324 			ret = -EFAULT;
1325 			if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1326 				break;
1327 
1328 			ret = sk_attach_bpf(ufd, sk);
1329 		}
1330 		break;
1331 
1332 	case SO_ATTACH_REUSEPORT_CBPF: {
1333 		struct sock_fprog fprog;
1334 
1335 		ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1336 		if (!ret)
1337 			ret = sk_reuseport_attach_filter(&fprog, sk);
1338 		break;
1339 	}
1340 	case SO_ATTACH_REUSEPORT_EBPF:
1341 		ret = -EINVAL;
1342 		if (optlen == sizeof(u32)) {
1343 			u32 ufd;
1344 
1345 			ret = -EFAULT;
1346 			if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1347 				break;
1348 
1349 			ret = sk_reuseport_attach_bpf(ufd, sk);
1350 		}
1351 		break;
1352 
1353 	case SO_DETACH_REUSEPORT_BPF:
1354 		ret = reuseport_detach_prog(sk);
1355 		break;
1356 
1357 	case SO_DETACH_FILTER:
1358 		ret = sk_detach_filter(sk);
1359 		break;
1360 
1361 	case SO_LOCK_FILTER:
1362 		if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
1363 			ret = -EPERM;
1364 		else
1365 			sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
1366 		break;
1367 
1368 	case SO_PASSSEC:
1369 		assign_bit(SOCK_PASSSEC, &sock->flags, valbool);
1370 		break;
1371 	case SO_MARK:
1372 		if (!sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
1373 		    !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1374 			ret = -EPERM;
1375 			break;
1376 		}
1377 
1378 		__sock_set_mark(sk, val);
1379 		break;
1380 	case SO_RCVMARK:
1381 		sock_valbool_flag(sk, SOCK_RCVMARK, valbool);
1382 		break;
1383 
1384 	case SO_RXQ_OVFL:
1385 		sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
1386 		break;
1387 
1388 	case SO_WIFI_STATUS:
1389 		sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
1390 		break;
1391 
1392 	case SO_PEEK_OFF:
1393 		{
1394 		int (*set_peek_off)(struct sock *sk, int val);
1395 
1396 		set_peek_off = READ_ONCE(sock->ops)->set_peek_off;
1397 		if (set_peek_off)
1398 			ret = set_peek_off(sk, val);
1399 		else
1400 			ret = -EOPNOTSUPP;
1401 		break;
1402 		}
1403 
1404 	case SO_NOFCS:
1405 		sock_valbool_flag(sk, SOCK_NOFCS, valbool);
1406 		break;
1407 
1408 	case SO_SELECT_ERR_QUEUE:
1409 		sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
1410 		break;
1411 
1412 #ifdef CONFIG_NET_RX_BUSY_POLL
1413 	case SO_BUSY_POLL:
1414 		if (val < 0)
1415 			ret = -EINVAL;
1416 		else
1417 			WRITE_ONCE(sk->sk_ll_usec, val);
1418 		break;
1419 	case SO_PREFER_BUSY_POLL:
1420 		if (valbool && !sockopt_capable(CAP_NET_ADMIN))
1421 			ret = -EPERM;
1422 		else
1423 			WRITE_ONCE(sk->sk_prefer_busy_poll, valbool);
1424 		break;
1425 	case SO_BUSY_POLL_BUDGET:
1426 		if (val > READ_ONCE(sk->sk_busy_poll_budget) && !sockopt_capable(CAP_NET_ADMIN)) {
1427 			ret = -EPERM;
1428 		} else {
1429 			if (val < 0 || val > U16_MAX)
1430 				ret = -EINVAL;
1431 			else
1432 				WRITE_ONCE(sk->sk_busy_poll_budget, val);
1433 		}
1434 		break;
1435 #endif
1436 
1437 	case SO_MAX_PACING_RATE:
1438 		{
1439 		unsigned long ulval = (val == ~0U) ? ~0UL : (unsigned int)val;
1440 
1441 		if (sizeof(ulval) != sizeof(val) &&
1442 		    optlen >= sizeof(ulval) &&
1443 		    copy_from_sockptr(&ulval, optval, sizeof(ulval))) {
1444 			ret = -EFAULT;
1445 			break;
1446 		}
1447 		if (ulval != ~0UL)
1448 			cmpxchg(&sk->sk_pacing_status,
1449 				SK_PACING_NONE,
1450 				SK_PACING_NEEDED);
1451 		/* Pairs with READ_ONCE() from sk_getsockopt() */
1452 		WRITE_ONCE(sk->sk_max_pacing_rate, ulval);
1453 		sk->sk_pacing_rate = min(sk->sk_pacing_rate, ulval);
1454 		break;
1455 		}
1456 	case SO_INCOMING_CPU:
1457 		reuseport_update_incoming_cpu(sk, val);
1458 		break;
1459 
1460 	case SO_CNX_ADVICE:
1461 		if (val == 1)
1462 			dst_negative_advice(sk);
1463 		break;
1464 
1465 	case SO_ZEROCOPY:
1466 		if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
1467 			if (!(sk_is_tcp(sk) ||
1468 			      (sk->sk_type == SOCK_DGRAM &&
1469 			       sk->sk_protocol == IPPROTO_UDP)))
1470 				ret = -EOPNOTSUPP;
1471 		} else if (sk->sk_family != PF_RDS) {
1472 			ret = -EOPNOTSUPP;
1473 		}
1474 		if (!ret) {
1475 			if (val < 0 || val > 1)
1476 				ret = -EINVAL;
1477 			else
1478 				sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
1479 		}
1480 		break;
1481 
1482 	case SO_TXTIME:
1483 		if (optlen != sizeof(struct sock_txtime)) {
1484 			ret = -EINVAL;
1485 			break;
1486 		} else if (copy_from_sockptr(&sk_txtime, optval,
1487 			   sizeof(struct sock_txtime))) {
1488 			ret = -EFAULT;
1489 			break;
1490 		} else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) {
1491 			ret = -EINVAL;
1492 			break;
1493 		}
1494 		/* CLOCK_MONOTONIC is only used by sch_fq, and this packet
1495 		 * scheduler has enough safeguards.
1496 		 */
1497 		if (sk_txtime.clockid != CLOCK_MONOTONIC &&
1498 		    !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1499 			ret = -EPERM;
1500 			break;
1501 		}
1502 		sock_valbool_flag(sk, SOCK_TXTIME, true);
1503 		sk->sk_clockid = sk_txtime.clockid;
1504 		sk->sk_txtime_deadline_mode =
1505 			!!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE);
1506 		sk->sk_txtime_report_errors =
1507 			!!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS);
1508 		break;
1509 
1510 	case SO_BINDTOIFINDEX:
1511 		ret = sock_bindtoindex_locked(sk, val);
1512 		break;
1513 
1514 	case SO_BUF_LOCK:
1515 		if (val & ~SOCK_BUF_LOCK_MASK) {
1516 			ret = -EINVAL;
1517 			break;
1518 		}
1519 		sk->sk_userlocks = val | (sk->sk_userlocks &
1520 					  ~SOCK_BUF_LOCK_MASK);
1521 		break;
1522 
1523 	case SO_RESERVE_MEM:
1524 	{
1525 		int delta;
1526 
1527 		if (val < 0) {
1528 			ret = -EINVAL;
1529 			break;
1530 		}
1531 
1532 		delta = val - sk->sk_reserved_mem;
1533 		if (delta < 0)
1534 			sock_release_reserved_memory(sk, -delta);
1535 		else
1536 			ret = sock_reserve_memory(sk, delta);
1537 		break;
1538 	}
1539 
1540 	case SO_TXREHASH:
1541 		if (val < -1 || val > 1) {
1542 			ret = -EINVAL;
1543 			break;
1544 		}
1545 		if ((u8)val == SOCK_TXREHASH_DEFAULT)
1546 			val = READ_ONCE(sock_net(sk)->core.sysctl_txrehash);
1547 		/* Paired with READ_ONCE() in tcp_rtx_synack()
1548 		 * and sk_getsockopt().
1549 		 */
1550 		WRITE_ONCE(sk->sk_txrehash, (u8)val);
1551 		break;
1552 
1553 	default:
1554 		ret = -ENOPROTOOPT;
1555 		break;
1556 	}
1557 	sockopt_release_sock(sk);
1558 	return ret;
1559 }
1560 
1561 int sock_setsockopt(struct socket *sock, int level, int optname,
1562 		    sockptr_t optval, unsigned int optlen)
1563 {
1564 	return sk_setsockopt(sock->sk, level, optname,
1565 			     optval, optlen);
1566 }
1567 EXPORT_SYMBOL(sock_setsockopt);
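/*
 * Editor's note: illustrative userspace sketch, not part of sock.c.
 * Per the SO_ZEROCOPY branch in sk_setsockopt() above, the option is only
 * accepted on TCP/UDP inet sockets (and PF_RDS); once enabled, sendmsg()
 * may be called with MSG_ZEROCOPY and completions are read back from the
 * socket error queue. The fallback #define and example_enable_zerocopy()
 * are assumptions made for illustration.
 */
#include <sys/socket.h>

#ifndef SO_ZEROCOPY
#define SO_ZEROCOPY	60	/* value from include/uapi/asm-generic/socket.h */
#endif

static int example_enable_zerocopy(int tcp_fd)
{
	int one = 1;

	return setsockopt(tcp_fd, SOL_SOCKET, SO_ZEROCOPY, &one, sizeof(one));
}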
1568 
1569 static const struct cred *sk_get_peer_cred(struct sock *sk)
1570 {
1571 	const struct cred *cred;
1572 
1573 	spin_lock(&sk->sk_peer_lock);
1574 	cred = get_cred(sk->sk_peer_cred);
1575 	spin_unlock(&sk->sk_peer_lock);
1576 
1577 	return cred;
1578 }
1579 
1580 static void cred_to_ucred(struct pid *pid, const struct cred *cred,
1581 			  struct ucred *ucred)
1582 {
1583 	ucred->pid = pid_vnr(pid);
1584 	ucred->uid = ucred->gid = -1;
1585 	if (cred) {
1586 		struct user_namespace *current_ns = current_user_ns();
1587 
1588 		ucred->uid = from_kuid_munged(current_ns, cred->euid);
1589 		ucred->gid = from_kgid_munged(current_ns, cred->egid);
1590 	}
1591 }
1592 
1593 static int groups_to_user(sockptr_t dst, const struct group_info *src)
1594 {
1595 	struct user_namespace *user_ns = current_user_ns();
1596 	int i;
1597 
1598 	for (i = 0; i < src->ngroups; i++) {
1599 		gid_t gid = from_kgid_munged(user_ns, src->gid[i]);
1600 
1601 		if (copy_to_sockptr_offset(dst, i * sizeof(gid), &gid, sizeof(gid)))
1602 			return -EFAULT;
1603 	}
1604 
1605 	return 0;
1606 }
1607 
1608 int sk_getsockopt(struct sock *sk, int level, int optname,
1609 		  sockptr_t optval, sockptr_t optlen)
1610 {
1611 	struct socket *sock = sk->sk_socket;
1612 
1613 	union {
1614 		int val;
1615 		u64 val64;
1616 		unsigned long ulval;
1617 		struct linger ling;
1618 		struct old_timeval32 tm32;
1619 		struct __kernel_old_timeval tm;
1620 		struct  __kernel_sock_timeval stm;
1621 		struct sock_txtime txtime;
1622 		struct so_timestamping timestamping;
1623 	} v;
1624 
1625 	int lv = sizeof(int);
1626 	int len;
1627 
1628 	if (copy_from_sockptr(&len, optlen, sizeof(int)))
1629 		return -EFAULT;
1630 	if (len < 0)
1631 		return -EINVAL;
1632 
1633 	memset(&v, 0, sizeof(v));
1634 
1635 	switch (optname) {
1636 	case SO_DEBUG:
1637 		v.val = sock_flag(sk, SOCK_DBG);
1638 		break;
1639 
1640 	case SO_DONTROUTE:
1641 		v.val = sock_flag(sk, SOCK_LOCALROUTE);
1642 		break;
1643 
1644 	case SO_BROADCAST:
1645 		v.val = sock_flag(sk, SOCK_BROADCAST);
1646 		break;
1647 
1648 	case SO_SNDBUF:
1649 		v.val = READ_ONCE(sk->sk_sndbuf);
1650 		break;
1651 
1652 	case SO_RCVBUF:
1653 		v.val = READ_ONCE(sk->sk_rcvbuf);
1654 		break;
1655 
1656 	case SO_REUSEADDR:
1657 		v.val = sk->sk_reuse;
1658 		break;
1659 
1660 	case SO_REUSEPORT:
1661 		v.val = sk->sk_reuseport;
1662 		break;
1663 
1664 	case SO_KEEPALIVE:
1665 		v.val = sock_flag(sk, SOCK_KEEPOPEN);
1666 		break;
1667 
1668 	case SO_TYPE:
1669 		v.val = sk->sk_type;
1670 		break;
1671 
1672 	case SO_PROTOCOL:
1673 		v.val = sk->sk_protocol;
1674 		break;
1675 
1676 	case SO_DOMAIN:
1677 		v.val = sk->sk_family;
1678 		break;
1679 
1680 	case SO_ERROR:
1681 		v.val = -sock_error(sk);
1682 		if (v.val == 0)
1683 			v.val = xchg(&sk->sk_err_soft, 0);
1684 		break;
1685 
1686 	case SO_OOBINLINE:
1687 		v.val = sock_flag(sk, SOCK_URGINLINE);
1688 		break;
1689 
1690 	case SO_NO_CHECK:
1691 		v.val = sk->sk_no_check_tx;
1692 		break;
1693 
1694 	case SO_PRIORITY:
1695 		v.val = READ_ONCE(sk->sk_priority);
1696 		break;
1697 
1698 	case SO_LINGER:
1699 		lv		= sizeof(v.ling);
1700 		v.ling.l_onoff	= sock_flag(sk, SOCK_LINGER);
1701 		v.ling.l_linger	= READ_ONCE(sk->sk_lingertime) / HZ;
1702 		break;
1703 
1704 	case SO_BSDCOMPAT:
1705 		break;
1706 
1707 	case SO_TIMESTAMP_OLD:
1708 		v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1709 				!sock_flag(sk, SOCK_TSTAMP_NEW) &&
1710 				!sock_flag(sk, SOCK_RCVTSTAMPNS);
1711 		break;
1712 
1713 	case SO_TIMESTAMPNS_OLD:
1714 		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && !sock_flag(sk, SOCK_TSTAMP_NEW);
1715 		break;
1716 
1717 	case SO_TIMESTAMP_NEW:
1718 		v.val = sock_flag(sk, SOCK_RCVTSTAMP) && sock_flag(sk, SOCK_TSTAMP_NEW);
1719 		break;
1720 
1721 	case SO_TIMESTAMPNS_NEW:
1722 		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && sock_flag(sk, SOCK_TSTAMP_NEW);
1723 		break;
1724 
1725 	case SO_TIMESTAMPING_OLD:
1726 	case SO_TIMESTAMPING_NEW:
1727 		lv = sizeof(v.timestamping);
1728 		/* For the later-added case SO_TIMESTAMPING_NEW: Be strict about only
1729 		 * returning the flags when they were set through the same option.
1730 		 * Don't change the behaviour for the old case SO_TIMESTAMPING_OLD.
1731 		 */
1732 		if (optname == SO_TIMESTAMPING_OLD || sock_flag(sk, SOCK_TSTAMP_NEW)) {
1733 			v.timestamping.flags = READ_ONCE(sk->sk_tsflags);
1734 			v.timestamping.bind_phc = READ_ONCE(sk->sk_bind_phc);
1735 		}
1736 		break;
1737 
1738 	case SO_RCVTIMEO_OLD:
1739 	case SO_RCVTIMEO_NEW:
1740 		lv = sock_get_timeout(READ_ONCE(sk->sk_rcvtimeo), &v,
1741 				      SO_RCVTIMEO_OLD == optname);
1742 		break;
1743 
1744 	case SO_SNDTIMEO_OLD:
1745 	case SO_SNDTIMEO_NEW:
1746 		lv = sock_get_timeout(READ_ONCE(sk->sk_sndtimeo), &v,
1747 				      SO_SNDTIMEO_OLD == optname);
1748 		break;
1749 
1750 	case SO_RCVLOWAT:
1751 		v.val = READ_ONCE(sk->sk_rcvlowat);
1752 		break;
1753 
1754 	case SO_SNDLOWAT:
1755 		v.val = 1;
1756 		break;
1757 
1758 	case SO_PASSCRED:
1759 		v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1760 		break;
1761 
1762 	case SO_PASSPIDFD:
1763 		v.val = !!test_bit(SOCK_PASSPIDFD, &sock->flags);
1764 		break;
1765 
1766 	case SO_PEERCRED:
1767 	{
1768 		struct ucred peercred;
1769 		if (len > sizeof(peercred))
1770 			len = sizeof(peercred);
1771 
1772 		spin_lock(&sk->sk_peer_lock);
1773 		cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1774 		spin_unlock(&sk->sk_peer_lock);
1775 
1776 		if (copy_to_sockptr(optval, &peercred, len))
1777 			return -EFAULT;
1778 		goto lenout;
1779 	}
1780 
1781 	case SO_PEERPIDFD:
1782 	{
1783 		struct pid *peer_pid;
1784 		struct file *pidfd_file = NULL;
1785 		int pidfd;
1786 
1787 		if (len > sizeof(pidfd))
1788 			len = sizeof(pidfd);
1789 
1790 		spin_lock(&sk->sk_peer_lock);
1791 		peer_pid = get_pid(sk->sk_peer_pid);
1792 		spin_unlock(&sk->sk_peer_lock);
1793 
1794 		if (!peer_pid)
1795 			return -ENODATA;
1796 
1797 		pidfd = pidfd_prepare(peer_pid, 0, &pidfd_file);
1798 		put_pid(peer_pid);
1799 		if (pidfd < 0)
1800 			return pidfd;
1801 
1802 		if (copy_to_sockptr(optval, &pidfd, len) ||
1803 		    copy_to_sockptr(optlen, &len, sizeof(int))) {
1804 			put_unused_fd(pidfd);
1805 			fput(pidfd_file);
1806 
1807 			return -EFAULT;
1808 		}
1809 
1810 		fd_install(pidfd, pidfd_file);
1811 		return 0;
1812 	}
1813 
1814 	case SO_PEERGROUPS:
1815 	{
1816 		const struct cred *cred;
1817 		int ret, n;
1818 
1819 		cred = sk_get_peer_cred(sk);
1820 		if (!cred)
1821 			return -ENODATA;
1822 
1823 		n = cred->group_info->ngroups;
1824 		if (len < n * sizeof(gid_t)) {
1825 			len = n * sizeof(gid_t);
1826 			put_cred(cred);
1827 			return copy_to_sockptr(optlen, &len, sizeof(int)) ? -EFAULT : -ERANGE;
1828 		}
1829 		len = n * sizeof(gid_t);
1830 
1831 		ret = groups_to_user(optval, cred->group_info);
1832 		put_cred(cred);
1833 		if (ret)
1834 			return ret;
1835 		goto lenout;
1836 	}
1837 
1838 	case SO_PEERNAME:
1839 	{
1840 		struct sockaddr_storage address;
1841 
1842 		lv = READ_ONCE(sock->ops)->getname(sock, (struct sockaddr *)&address, 2);
1843 		if (lv < 0)
1844 			return -ENOTCONN;
1845 		if (lv < len)
1846 			return -EINVAL;
1847 		if (copy_to_sockptr(optval, &address, len))
1848 			return -EFAULT;
1849 		goto lenout;
1850 	}
1851 
1852 	/* Dubious BSD thing... Probably nobody even uses it, but
1853 	 * the UNIX standard wants it for whatever reason... -DaveM
1854 	 */
1855 	case SO_ACCEPTCONN:
1856 		v.val = sk->sk_state == TCP_LISTEN;
1857 		break;
1858 
1859 	case SO_PASSSEC:
1860 		v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1861 		break;
1862 
1863 	case SO_PEERSEC:
1864 		return security_socket_getpeersec_stream(sock,
1865 							 optval, optlen, len);
1866 
1867 	case SO_MARK:
1868 		v.val = READ_ONCE(sk->sk_mark);
1869 		break;
1870 
1871 	case SO_RCVMARK:
1872 		v.val = sock_flag(sk, SOCK_RCVMARK);
1873 		break;
1874 
1875 	case SO_RXQ_OVFL:
1876 		v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1877 		break;
1878 
1879 	case SO_WIFI_STATUS:
1880 		v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1881 		break;
1882 
1883 	case SO_PEEK_OFF:
1884 		if (!READ_ONCE(sock->ops)->set_peek_off)
1885 			return -EOPNOTSUPP;
1886 
1887 		v.val = READ_ONCE(sk->sk_peek_off);
1888 		break;
1889 	case SO_NOFCS:
1890 		v.val = sock_flag(sk, SOCK_NOFCS);
1891 		break;
1892 
1893 	case SO_BINDTODEVICE:
1894 		return sock_getbindtodevice(sk, optval, optlen, len);
1895 
1896 	case SO_GET_FILTER:
1897 		len = sk_get_filter(sk, optval, len);
1898 		if (len < 0)
1899 			return len;
1900 
1901 		goto lenout;
1902 
1903 	case SO_LOCK_FILTER:
1904 		v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1905 		break;
1906 
1907 	case SO_BPF_EXTENSIONS:
1908 		v.val = bpf_tell_extensions();
1909 		break;
1910 
1911 	case SO_SELECT_ERR_QUEUE:
1912 		v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1913 		break;
1914 
1915 #ifdef CONFIG_NET_RX_BUSY_POLL
1916 	case SO_BUSY_POLL:
1917 		v.val = READ_ONCE(sk->sk_ll_usec);
1918 		break;
1919 	case SO_PREFER_BUSY_POLL:
1920 		v.val = READ_ONCE(sk->sk_prefer_busy_poll);
1921 		break;
1922 #endif
1923 
1924 	case SO_MAX_PACING_RATE:
1925 		/* The READ_ONCE() pairs with the WRITE_ONCE() in sk_setsockopt() */
1926 		if (sizeof(v.ulval) != sizeof(v.val) && len >= sizeof(v.ulval)) {
1927 			lv = sizeof(v.ulval);
1928 			v.ulval = READ_ONCE(sk->sk_max_pacing_rate);
1929 		} else {
1930 			/* 32bit version */
1931 			v.val = min_t(unsigned long, ~0U,
1932 				      READ_ONCE(sk->sk_max_pacing_rate));
1933 		}
1934 		break;
1935 
1936 	case SO_INCOMING_CPU:
1937 		v.val = READ_ONCE(sk->sk_incoming_cpu);
1938 		break;
1939 
1940 	case SO_MEMINFO:
1941 	{
1942 		u32 meminfo[SK_MEMINFO_VARS];
1943 
1944 		sk_get_meminfo(sk, meminfo);
1945 
1946 		len = min_t(unsigned int, len, sizeof(meminfo));
1947 		if (copy_to_sockptr(optval, &meminfo, len))
1948 			return -EFAULT;
1949 
1950 		goto lenout;
1951 	}
1952 
1953 #ifdef CONFIG_NET_RX_BUSY_POLL
1954 	case SO_INCOMING_NAPI_ID:
1955 		v.val = READ_ONCE(sk->sk_napi_id);
1956 
1957 		/* aggregate non-NAPI IDs down to 0 */
1958 		if (v.val < MIN_NAPI_ID)
1959 			v.val = 0;
1960 
1961 		break;
1962 #endif
1963 
1964 	case SO_COOKIE:
1965 		lv = sizeof(u64);
1966 		if (len < lv)
1967 			return -EINVAL;
1968 		v.val64 = sock_gen_cookie(sk);
1969 		break;
1970 
1971 	case SO_ZEROCOPY:
1972 		v.val = sock_flag(sk, SOCK_ZEROCOPY);
1973 		break;
1974 
1975 	case SO_TXTIME:
1976 		lv = sizeof(v.txtime);
1977 		v.txtime.clockid = sk->sk_clockid;
1978 		v.txtime.flags |= sk->sk_txtime_deadline_mode ?
1979 				  SOF_TXTIME_DEADLINE_MODE : 0;
1980 		v.txtime.flags |= sk->sk_txtime_report_errors ?
1981 				  SOF_TXTIME_REPORT_ERRORS : 0;
1982 		break;
1983 
1984 	case SO_BINDTOIFINDEX:
1985 		v.val = READ_ONCE(sk->sk_bound_dev_if);
1986 		break;
1987 
1988 	case SO_NETNS_COOKIE:
1989 		lv = sizeof(u64);
1990 		if (len != lv)
1991 			return -EINVAL;
1992 		v.val64 = sock_net(sk)->net_cookie;
1993 		break;
1994 
1995 	case SO_BUF_LOCK:
1996 		v.val = sk->sk_userlocks & SOCK_BUF_LOCK_MASK;
1997 		break;
1998 
1999 	case SO_RESERVE_MEM:
2000 		v.val = READ_ONCE(sk->sk_reserved_mem);
2001 		break;
2002 
2003 	case SO_TXREHASH:
2004 		/* Paired with WRITE_ONCE() in sk_setsockopt() */
2005 		v.val = READ_ONCE(sk->sk_txrehash);
2006 		break;
2007 
2008 	default:
2009 		/* We implement the SO_SNDLOWAT etc. to not be settable
2010 		 * (1003.1g 7).
2011 		 */
2012 		return -ENOPROTOOPT;
2013 	}
2014 
2015 	if (len > lv)
2016 		len = lv;
2017 	if (copy_to_sockptr(optval, &v, len))
2018 		return -EFAULT;
2019 lenout:
2020 	if (copy_to_sockptr(optlen, &len, sizeof(int)))
2021 		return -EFAULT;
2022 	return 0;
2023 }
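
/* Editorial note (not part of the original source): the SO_PEERCRED and
 * SO_PEERGROUPS branches above back the usual userspace pattern of querying
 * a connected AF_UNIX peer, roughly (userspace sketch, <sys/socket.h> and
 * <stdio.h> assumed):
 *
 *	struct ucred cred;
 *	socklen_t len = sizeof(cred);
 *
 *	if (getsockopt(fd, SOL_SOCKET, SO_PEERCRED, &cred, &len) == 0)
 *		printf("peer pid %d uid %d\n", cred.pid, cred.uid);
 */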
2024 
2025 /*
2026  * Initialize an sk_lock.
2027  *
2028  * (We also register the sk_lock with the lock validator.)
2029  */
2030 static inline void sock_lock_init(struct sock *sk)
2031 {
2032 	if (sk->sk_kern_sock)
2033 		sock_lock_init_class_and_name(
2034 			sk,
2035 			af_family_kern_slock_key_strings[sk->sk_family],
2036 			af_family_kern_slock_keys + sk->sk_family,
2037 			af_family_kern_key_strings[sk->sk_family],
2038 			af_family_kern_keys + sk->sk_family);
2039 	else
2040 		sock_lock_init_class_and_name(
2041 			sk,
2042 			af_family_slock_key_strings[sk->sk_family],
2043 			af_family_slock_keys + sk->sk_family,
2044 			af_family_key_strings[sk->sk_family],
2045 			af_family_keys + sk->sk_family);
2046 }
2047 
2048 /*
2049  * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
2050  * even temporarily, because of RCU lookups. sk_node should also be left as is.
2051  * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
2052  */
2053 static void sock_copy(struct sock *nsk, const struct sock *osk)
2054 {
2055 	const struct proto *prot = READ_ONCE(osk->sk_prot);
2056 #ifdef CONFIG_SECURITY_NETWORK
2057 	void *sptr = nsk->sk_security;
2058 #endif
2059 
2060 	/* If we move sk_tx_queue_mapping out of the private section,
2061 	 * we must check if sk_tx_queue_clear() is called after
2062 	 * sock_copy() in sk_clone_lock().
2063 	 */
2064 	BUILD_BUG_ON(offsetof(struct sock, sk_tx_queue_mapping) <
2065 		     offsetof(struct sock, sk_dontcopy_begin) ||
2066 		     offsetof(struct sock, sk_tx_queue_mapping) >=
2067 		     offsetof(struct sock, sk_dontcopy_end));
2068 
2069 	memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
2070 
2071 	memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
2072 	       prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
2073 
2074 #ifdef CONFIG_SECURITY_NETWORK
2075 	nsk->sk_security = sptr;
2076 	security_sk_clone(osk, nsk);
2077 #endif
2078 }
2079 
2080 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
2081 		int family)
2082 {
2083 	struct sock *sk;
2084 	struct kmem_cache *slab;
2085 
2086 	slab = prot->slab;
2087 	if (slab != NULL) {
2088 		sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
2089 		if (!sk)
2090 			return sk;
2091 		if (want_init_on_alloc(priority))
2092 			sk_prot_clear_nulls(sk, prot->obj_size);
2093 	} else
2094 		sk = kmalloc(prot->obj_size, priority);
2095 
2096 	if (sk != NULL) {
2097 		if (security_sk_alloc(sk, family, priority))
2098 			goto out_free;
2099 
2100 		if (!try_module_get(prot->owner))
2101 			goto out_free_sec;
2102 	}
2103 
2104 	return sk;
2105 
2106 out_free_sec:
2107 	security_sk_free(sk);
2108 out_free:
2109 	if (slab != NULL)
2110 		kmem_cache_free(slab, sk);
2111 	else
2112 		kfree(sk);
2113 	return NULL;
2114 }
2115 
2116 static void sk_prot_free(struct proto *prot, struct sock *sk)
2117 {
2118 	struct kmem_cache *slab;
2119 	struct module *owner;
2120 
2121 	owner = prot->owner;
2122 	slab = prot->slab;
2123 
2124 	cgroup_sk_free(&sk->sk_cgrp_data);
2125 	mem_cgroup_sk_free(sk);
2126 	security_sk_free(sk);
2127 	if (slab != NULL)
2128 		kmem_cache_free(slab, sk);
2129 	else
2130 		kfree(sk);
2131 	module_put(owner);
2132 }
2133 
2134 /**
2135  *	sk_alloc - All socket objects are allocated here
2136  *	@net: the applicable net namespace
2137  *	@family: protocol family
2138  *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
2139  *	@prot: struct proto associated with this new sock instance
2140  *	@kern: is this to be a kernel socket?
2141  */
2142 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
2143 		      struct proto *prot, int kern)
2144 {
2145 	struct sock *sk;
2146 
2147 	sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
2148 	if (sk) {
2149 		sk->sk_family = family;
2150 		/*
2151 		 * See comment in struct sock definition to understand
2152 		 * why we need sk_prot_creator -acme
2153 		 */
2154 		sk->sk_prot = sk->sk_prot_creator = prot;
2155 		sk->sk_kern_sock = kern;
2156 		sock_lock_init(sk);
2157 		sk->sk_net_refcnt = kern ? 0 : 1;
2158 		if (likely(sk->sk_net_refcnt)) {
2159 			get_net_track(net, &sk->ns_tracker, priority);
2160 			sock_inuse_add(net, 1);
2161 		} else {
2162 			__netns_tracker_alloc(net, &sk->ns_tracker,
2163 					      false, priority);
2164 		}
2165 
2166 		sock_net_set(sk, net);
2167 		refcount_set(&sk->sk_wmem_alloc, 1);
2168 
2169 		mem_cgroup_sk_alloc(sk);
2170 		cgroup_sk_alloc(&sk->sk_cgrp_data);
2171 		sock_update_classid(&sk->sk_cgrp_data);
2172 		sock_update_netprioidx(&sk->sk_cgrp_data);
2173 		sk_tx_queue_clear(sk);
2174 	}
2175 
2176 	return sk;
2177 }
2178 EXPORT_SYMBOL(sk_alloc);
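
/* Editorial note (not part of the original source): a minimal usage sketch.
 * A protocol's ->create() handler typically pairs sk_alloc() with
 * sock_init_data(). "my_proto" and "my_create" below are hypothetical names
 * used purely for illustration:
 *
 *	static int my_create(struct net *net, struct socket *sock, int protocol,
 *			     int kern)
 *	{
 *		struct sock *sk;
 *
 *		sk = sk_alloc(net, PF_INET, GFP_KERNEL, &my_proto, kern);
 *		if (!sk)
 *			return -ENOBUFS;
 *		sock_init_data(sock, sk);
 *		sk->sk_protocol = protocol;
 *		return 0;
 *	}
 */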
2179 
2180 /* Sockets having SOCK_RCU_FREE will call this function after one RCU
2181  * grace period. This is the case for UDP sockets and TCP listeners.
2182  */
2183 static void __sk_destruct(struct rcu_head *head)
2184 {
2185 	struct sock *sk = container_of(head, struct sock, sk_rcu);
2186 	struct sk_filter *filter;
2187 
2188 	if (sk->sk_destruct)
2189 		sk->sk_destruct(sk);
2190 
2191 	filter = rcu_dereference_check(sk->sk_filter,
2192 				       refcount_read(&sk->sk_wmem_alloc) == 0);
2193 	if (filter) {
2194 		sk_filter_uncharge(sk, filter);
2195 		RCU_INIT_POINTER(sk->sk_filter, NULL);
2196 	}
2197 
2198 	sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
2199 
2200 #ifdef CONFIG_BPF_SYSCALL
2201 	bpf_sk_storage_free(sk);
2202 #endif
2203 
2204 	if (atomic_read(&sk->sk_omem_alloc))
2205 		pr_debug("%s: optmem leakage (%d bytes) detected\n",
2206 			 __func__, atomic_read(&sk->sk_omem_alloc));
2207 
2208 	if (sk->sk_frag.page) {
2209 		put_page(sk->sk_frag.page);
2210 		sk->sk_frag.page = NULL;
2211 	}
2212 
2213 	/* We do not need to acquire sk->sk_peer_lock, we are the last user. */
2214 	put_cred(sk->sk_peer_cred);
2215 	put_pid(sk->sk_peer_pid);
2216 
2217 	if (likely(sk->sk_net_refcnt))
2218 		put_net_track(sock_net(sk), &sk->ns_tracker);
2219 	else
2220 		__netns_tracker_free(sock_net(sk), &sk->ns_tracker, false);
2221 
2222 	sk_prot_free(sk->sk_prot_creator, sk);
2223 }
2224 
2225 void sk_destruct(struct sock *sk)
2226 {
2227 	bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE);
2228 
2229 	if (rcu_access_pointer(sk->sk_reuseport_cb)) {
2230 		reuseport_detach_sock(sk);
2231 		use_call_rcu = true;
2232 	}
2233 
2234 	if (use_call_rcu)
2235 		call_rcu(&sk->sk_rcu, __sk_destruct);
2236 	else
2237 		__sk_destruct(&sk->sk_rcu);
2238 }
2239 
2240 static void __sk_free(struct sock *sk)
2241 {
2242 	if (likely(sk->sk_net_refcnt))
2243 		sock_inuse_add(sock_net(sk), -1);
2244 
2245 	if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk)))
2246 		sock_diag_broadcast_destroy(sk);
2247 	else
2248 		sk_destruct(sk);
2249 }
2250 
2251 void sk_free(struct sock *sk)
2252 {
2253 	/*
2254 	 * We subtract one from sk_wmem_alloc; if the result is not zero,
2255 	 * some packets are still in some tx queue and sock_wfree()
2256 	 * will call __sk_free(sk) later.
2257 	 */
2258 	if (refcount_dec_and_test(&sk->sk_wmem_alloc))
2259 		__sk_free(sk);
2260 }
2261 EXPORT_SYMBOL(sk_free);
2262 
2263 static void sk_init_common(struct sock *sk)
2264 {
2265 	skb_queue_head_init(&sk->sk_receive_queue);
2266 	skb_queue_head_init(&sk->sk_write_queue);
2267 	skb_queue_head_init(&sk->sk_error_queue);
2268 
2269 	rwlock_init(&sk->sk_callback_lock);
2270 	lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
2271 			af_rlock_keys + sk->sk_family,
2272 			af_family_rlock_key_strings[sk->sk_family]);
2273 	lockdep_set_class_and_name(&sk->sk_write_queue.lock,
2274 			af_wlock_keys + sk->sk_family,
2275 			af_family_wlock_key_strings[sk->sk_family]);
2276 	lockdep_set_class_and_name(&sk->sk_error_queue.lock,
2277 			af_elock_keys + sk->sk_family,
2278 			af_family_elock_key_strings[sk->sk_family]);
2279 	lockdep_set_class_and_name(&sk->sk_callback_lock,
2280 			af_callback_keys + sk->sk_family,
2281 			af_family_clock_key_strings[sk->sk_family]);
2282 }
2283 
2284 /**
2285  *	sk_clone_lock - clone a socket, and lock its clone
2286  *	@sk: the socket to clone
2287  *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
2288  *
2289  *	Caller must unlock socket even in error path (bh_unlock_sock(newsk))
2290  */
2291 struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
2292 {
2293 	struct proto *prot = READ_ONCE(sk->sk_prot);
2294 	struct sk_filter *filter;
2295 	bool is_charged = true;
2296 	struct sock *newsk;
2297 
2298 	newsk = sk_prot_alloc(prot, priority, sk->sk_family);
2299 	if (!newsk)
2300 		goto out;
2301 
2302 	sock_copy(newsk, sk);
2303 
2304 	newsk->sk_prot_creator = prot;
2305 
2306 	/* SANITY */
2307 	if (likely(newsk->sk_net_refcnt)) {
2308 		get_net_track(sock_net(newsk), &newsk->ns_tracker, priority);
2309 		sock_inuse_add(sock_net(newsk), 1);
2310 	} else {
2311 		/* Kernel sockets are not elevating the struct net refcount.
2312 		 * Instead, use a tracker to more easily detect if a layer
2313 		 * is not properly dismantling its kernel sockets at netns
2314 		 * destroy time.
2315 		 */
2316 		__netns_tracker_alloc(sock_net(newsk), &newsk->ns_tracker,
2317 				      false, priority);
2318 	}
2319 	sk_node_init(&newsk->sk_node);
2320 	sock_lock_init(newsk);
2321 	bh_lock_sock(newsk);
2322 	newsk->sk_backlog.head	= newsk->sk_backlog.tail = NULL;
2323 	newsk->sk_backlog.len = 0;
2324 
2325 	atomic_set(&newsk->sk_rmem_alloc, 0);
2326 
2327 	/* sk_wmem_alloc set to one (see sk_free() and sock_wfree()) */
2328 	refcount_set(&newsk->sk_wmem_alloc, 1);
2329 
2330 	atomic_set(&newsk->sk_omem_alloc, 0);
2331 	sk_init_common(newsk);
2332 
2333 	newsk->sk_dst_cache	= NULL;
2334 	newsk->sk_dst_pending_confirm = 0;
2335 	newsk->sk_wmem_queued	= 0;
2336 	newsk->sk_forward_alloc = 0;
2337 	newsk->sk_reserved_mem  = 0;
2338 	atomic_set(&newsk->sk_drops, 0);
2339 	newsk->sk_send_head	= NULL;
2340 	newsk->sk_userlocks	= sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
2341 	atomic_set(&newsk->sk_zckey, 0);
2342 
2343 	sock_reset_flag(newsk, SOCK_DONE);
2344 
2345 	/* sk->sk_memcg will be populated at accept() time */
2346 	newsk->sk_memcg = NULL;
2347 
2348 	cgroup_sk_clone(&newsk->sk_cgrp_data);
2349 
2350 	rcu_read_lock();
2351 	filter = rcu_dereference(sk->sk_filter);
2352 	if (filter != NULL)
2353 		/* though it's an empty new sock, the charging may fail
2354 		 * if sysctl_optmem_max was changed between creation of the
2355 		 * original socket and the cloning
2356 		 */
2357 		is_charged = sk_filter_charge(newsk, filter);
2358 	RCU_INIT_POINTER(newsk->sk_filter, filter);
2359 	rcu_read_unlock();
2360 
2361 	if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
2362 		/* We need to make sure that we don't uncharge the new
2363 		 * socket if we couldn't charge it in the first place
2364 		 * as otherwise we uncharge the parent's filter.
2365 		 */
2366 		if (!is_charged)
2367 			RCU_INIT_POINTER(newsk->sk_filter, NULL);
2368 		sk_free_unlock_clone(newsk);
2369 		newsk = NULL;
2370 		goto out;
2371 	}
2372 	RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
2373 
2374 	if (bpf_sk_storage_clone(sk, newsk)) {
2375 		sk_free_unlock_clone(newsk);
2376 		newsk = NULL;
2377 		goto out;
2378 	}
2379 
2380 	/* Clear sk_user_data if parent had the pointer tagged
2381 	 * as not suitable for copying when cloning.
2382 	 */
2383 	if (sk_user_data_is_nocopy(newsk))
2384 		newsk->sk_user_data = NULL;
2385 
2386 	newsk->sk_err	   = 0;
2387 	newsk->sk_err_soft = 0;
2388 	newsk->sk_priority = 0;
2389 	newsk->sk_incoming_cpu = raw_smp_processor_id();
2390 
2391 	/* Before updating sk_refcnt, we must commit prior changes to memory
2392 	 * (Documentation/RCU/rculist_nulls.rst for details)
2393 	 */
2394 	smp_wmb();
2395 	refcount_set(&newsk->sk_refcnt, 2);
2396 
2397 	sk_set_socket(newsk, NULL);
2398 	sk_tx_queue_clear(newsk);
2399 	RCU_INIT_POINTER(newsk->sk_wq, NULL);
2400 
2401 	if (newsk->sk_prot->sockets_allocated)
2402 		sk_sockets_allocated_inc(newsk);
2403 
2404 	if (sock_needs_netstamp(sk) && newsk->sk_flags & SK_FLAGS_TIMESTAMP)
2405 		net_enable_timestamp();
2406 out:
2407 	return newsk;
2408 }
2409 EXPORT_SYMBOL_GPL(sk_clone_lock);
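
/* Editorial note (not part of the original source): callers get the clone
 * back with bh_lock_sock() held, so they must bh_unlock_sock() it on every
 * path after a successful clone, as the kernel-doc above states. A hedged
 * sketch:
 *
 *	newsk = sk_clone_lock(sk, GFP_ATOMIC);
 *	if (!newsk)
 *		return NULL;
 *	(... protocol-specific initialisation of newsk ...)
 *	bh_unlock_sock(newsk);
 *	return newsk;
 */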
2410 
2411 void sk_free_unlock_clone(struct sock *sk)
2412 {
2413 	/* It is still a raw copy of the parent, so invalidate
2414 	 * the destructor and do a plain sk_free() */
2415 	sk->sk_destruct = NULL;
2416 	bh_unlock_sock(sk);
2417 	sk_free(sk);
2418 }
2419 EXPORT_SYMBOL_GPL(sk_free_unlock_clone);
2420 
2421 static u32 sk_dst_gso_max_size(struct sock *sk, struct dst_entry *dst)
2422 {
2423 	bool is_ipv6 = false;
2424 	u32 max_size;
2425 
2426 #if IS_ENABLED(CONFIG_IPV6)
2427 	is_ipv6 = (sk->sk_family == AF_INET6 &&
2428 		   !ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr));
2429 #endif
2430 	/* pairs with the WRITE_ONCE() in netif_set_gso(_ipv4)_max_size() */
2431 	max_size = is_ipv6 ? READ_ONCE(dst->dev->gso_max_size) :
2432 			READ_ONCE(dst->dev->gso_ipv4_max_size);
2433 	if (max_size > GSO_LEGACY_MAX_SIZE && !sk_is_tcp(sk))
2434 		max_size = GSO_LEGACY_MAX_SIZE;
2435 
2436 	return max_size - (MAX_TCP_HEADER + 1);
2437 }
2438 
2439 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
2440 {
2441 	u32 max_segs = 1;
2442 
2443 	sk->sk_route_caps = dst->dev->features;
2444 	if (sk_is_tcp(sk))
2445 		sk->sk_route_caps |= NETIF_F_GSO;
2446 	if (sk->sk_route_caps & NETIF_F_GSO)
2447 		sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
2448 	if (unlikely(sk->sk_gso_disabled))
2449 		sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
2450 	if (sk_can_gso(sk)) {
2451 		if (dst->header_len && !xfrm_dst_offload_ok(dst)) {
2452 			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
2453 		} else {
2454 			sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
2455 			sk->sk_gso_max_size = sk_dst_gso_max_size(sk, dst);
2456 			/* pairs with the WRITE_ONCE() in netif_set_gso_max_segs() */
2457 			max_segs = max_t(u32, READ_ONCE(dst->dev->gso_max_segs), 1);
2458 		}
2459 	}
2460 	sk->sk_gso_max_segs = max_segs;
2461 	sk_dst_set(sk, dst);
2462 }
2463 EXPORT_SYMBOL_GPL(sk_setup_caps);
2464 
2465 /*
2466  *	Simple resource managers for sockets.
2467  */
2468 
2469 
2470 /*
2471  * Write buffer destructor automatically called from kfree_skb.
2472  */
2473 void sock_wfree(struct sk_buff *skb)
2474 {
2475 	struct sock *sk = skb->sk;
2476 	unsigned int len = skb->truesize;
2477 	bool free;
2478 
2479 	if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
2480 		if (sock_flag(sk, SOCK_RCU_FREE) &&
2481 		    sk->sk_write_space == sock_def_write_space) {
2482 			rcu_read_lock();
2483 			free = refcount_sub_and_test(len, &sk->sk_wmem_alloc);
2484 			sock_def_write_space_wfree(sk);
2485 			rcu_read_unlock();
2486 			if (unlikely(free))
2487 				__sk_free(sk);
2488 			return;
2489 		}
2490 
2491 		/*
2492 		 * Keep a reference on sk_wmem_alloc; it will be released
2493 		 * after the sk_write_space() call
2494 		 */
2495 		WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
2496 		sk->sk_write_space(sk);
2497 		len = 1;
2498 	}
2499 	/*
2500 	 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
2501 	 * could not do because of in-flight packets
2502 	 */
2503 	if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
2504 		__sk_free(sk);
2505 }
2506 EXPORT_SYMBOL(sock_wfree);
2507 
2508 /* This variant of sock_wfree() is used by TCP,
2509  * since it sets SOCK_USE_WRITE_QUEUE.
2510  */
2511 void __sock_wfree(struct sk_buff *skb)
2512 {
2513 	struct sock *sk = skb->sk;
2514 
2515 	if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
2516 		__sk_free(sk);
2517 }
2518 
2519 void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
2520 {
2521 	skb_orphan(skb);
2522 	skb->sk = sk;
2523 #ifdef CONFIG_INET
2524 	if (unlikely(!sk_fullsock(sk))) {
2525 		skb->destructor = sock_edemux;
2526 		sock_hold(sk);
2527 		return;
2528 	}
2529 #endif
2530 	skb->destructor = sock_wfree;
2531 	skb_set_hash_from_sk(skb, sk);
2532 	/*
2533 	 * We used to take a refcount on sk, but the following operation
2534 	 * is enough to guarantee sk_free() won't free this sock until
2535 	 * all in-flight packets have completed
2536 	 */
2537 	refcount_add(skb->truesize, &sk->sk_wmem_alloc);
2538 }
2539 EXPORT_SYMBOL(skb_set_owner_w);
2540 
2541 static bool can_skb_orphan_partial(const struct sk_buff *skb)
2542 {
2543 #ifdef CONFIG_TLS_DEVICE
2544 	/* Drivers depend on in-order delivery for crypto offload,
2545 	 * partial orphan breaks out-of-order-OK logic.
2546 	 */
2547 	if (skb->decrypted)
2548 		return false;
2549 #endif
2550 	return (skb->destructor == sock_wfree ||
2551 		(IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree));
2552 }
2553 
2554 /* This helper is used by netem, as it can hold packets in its
2555  * delay queue. We want to allow the owner socket to send more
2556  * packets, as if they were already TX completed by a typical driver.
2557  * But we also want to keep skb->sk set because some packet schedulers
2558  * rely on it (sch_fq for example).
2559  */
2560 void skb_orphan_partial(struct sk_buff *skb)
2561 {
2562 	if (skb_is_tcp_pure_ack(skb))
2563 		return;
2564 
2565 	if (can_skb_orphan_partial(skb) && skb_set_owner_sk_safe(skb, skb->sk))
2566 		return;
2567 
2568 	skb_orphan(skb);
2569 }
2570 EXPORT_SYMBOL(skb_orphan_partial);
2571 
2572 /*
2573  * Read buffer destructor automatically called from kfree_skb.
2574  */
2575 void sock_rfree(struct sk_buff *skb)
2576 {
2577 	struct sock *sk = skb->sk;
2578 	unsigned int len = skb->truesize;
2579 
2580 	atomic_sub(len, &sk->sk_rmem_alloc);
2581 	sk_mem_uncharge(sk, len);
2582 }
2583 EXPORT_SYMBOL(sock_rfree);
2584 
2585 /*
2586  * Buffer destructor for skbs that are not used directly in read or write
2587  * path, e.g. for error handler skbs. Automatically called from kfree_skb.
2588  */
2589 void sock_efree(struct sk_buff *skb)
2590 {
2591 	sock_put(skb->sk);
2592 }
2593 EXPORT_SYMBOL(sock_efree);
2594 
2595 /* Buffer destructor for prefetch/receive path where reference count may
2596  * not be held, e.g. for listen sockets.
2597  */
2598 #ifdef CONFIG_INET
2599 void sock_pfree(struct sk_buff *skb)
2600 {
2601 	if (sk_is_refcounted(skb->sk))
2602 		sock_gen_put(skb->sk);
2603 }
2604 EXPORT_SYMBOL(sock_pfree);
2605 #endif /* CONFIG_INET */
2606 
2607 kuid_t sock_i_uid(struct sock *sk)
2608 {
2609 	kuid_t uid;
2610 
2611 	read_lock_bh(&sk->sk_callback_lock);
2612 	uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
2613 	read_unlock_bh(&sk->sk_callback_lock);
2614 	return uid;
2615 }
2616 EXPORT_SYMBOL(sock_i_uid);
2617 
2618 unsigned long __sock_i_ino(struct sock *sk)
2619 {
2620 	unsigned long ino;
2621 
2622 	read_lock(&sk->sk_callback_lock);
2623 	ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
2624 	read_unlock(&sk->sk_callback_lock);
2625 	return ino;
2626 }
2627 EXPORT_SYMBOL(__sock_i_ino);
2628 
2629 unsigned long sock_i_ino(struct sock *sk)
2630 {
2631 	unsigned long ino;
2632 
2633 	local_bh_disable();
2634 	ino = __sock_i_ino(sk);
2635 	local_bh_enable();
2636 	return ino;
2637 }
2638 EXPORT_SYMBOL(sock_i_ino);
2639 
2640 /*
2641  * Allocate an skb from the socket's send buffer.
2642  */
2643 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
2644 			     gfp_t priority)
2645 {
2646 	if (force ||
2647 	    refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) {
2648 		struct sk_buff *skb = alloc_skb(size, priority);
2649 
2650 		if (skb) {
2651 			skb_set_owner_w(skb, sk);
2652 			return skb;
2653 		}
2654 	}
2655 	return NULL;
2656 }
2657 EXPORT_SYMBOL(sock_wmalloc);
2658 
2659 static void sock_ofree(struct sk_buff *skb)
2660 {
2661 	struct sock *sk = skb->sk;
2662 
2663 	atomic_sub(skb->truesize, &sk->sk_omem_alloc);
2664 }
2665 
2666 struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
2667 			     gfp_t priority)
2668 {
2669 	struct sk_buff *skb;
2670 
2671 	/* small safe race: SKB_TRUESIZE may differ from final skb->truesize */
2672 	if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) >
2673 	    READ_ONCE(sysctl_optmem_max))
2674 		return NULL;
2675 
2676 	skb = alloc_skb(size, priority);
2677 	if (!skb)
2678 		return NULL;
2679 
2680 	atomic_add(skb->truesize, &sk->sk_omem_alloc);
2681 	skb->sk = sk;
2682 	skb->destructor = sock_ofree;
2683 	return skb;
2684 }
2685 
2686 /*
2687  * Allocate a memory block from the socket's option memory buffer.
2688  */
2689 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
2690 {
2691 	int optmem_max = READ_ONCE(sysctl_optmem_max);
2692 
2693 	if ((unsigned int)size <= optmem_max &&
2694 	    atomic_read(&sk->sk_omem_alloc) + size < optmem_max) {
2695 		void *mem;
2696 		/* First do the add, to avoid the race if kmalloc
2697 		 * might sleep.
2698 		 */
2699 		atomic_add(size, &sk->sk_omem_alloc);
2700 		mem = kmalloc(size, priority);
2701 		if (mem)
2702 			return mem;
2703 		atomic_sub(size, &sk->sk_omem_alloc);
2704 	}
2705 	return NULL;
2706 }
2707 EXPORT_SYMBOL(sock_kmalloc);
2708 
2709 /* Free an option memory block. Note, we actually want the inline
2710  * here as this allows gcc to detect the nullify and fold away the
2711  * condition entirely.
2712  */
2713 static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
2714 				  const bool nullify)
2715 {
2716 	if (WARN_ON_ONCE(!mem))
2717 		return;
2718 	if (nullify)
2719 		kfree_sensitive(mem);
2720 	else
2721 		kfree(mem);
2722 	atomic_sub(size, &sk->sk_omem_alloc);
2723 }
2724 
2725 void sock_kfree_s(struct sock *sk, void *mem, int size)
2726 {
2727 	__sock_kfree_s(sk, mem, size, false);
2728 }
2729 EXPORT_SYMBOL(sock_kfree_s);
2730 
2731 void sock_kzfree_s(struct sock *sk, void *mem, int size)
2732 {
2733 	__sock_kfree_s(sk, mem, size, true);
2734 }
2735 EXPORT_SYMBOL(sock_kzfree_s);
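
/* Editorial note (not part of the original source): sock_kmalloc() charges
 * the allocation against sk->sk_omem_alloc, so it must be released with
 * sock_kfree_s()/sock_kzfree_s() and the same size, not plain kfree().
 * A hedged sketch, with "struct my_opts" as a hypothetical option blob:
 *
 *	struct my_opts *opts;
 *
 *	opts = sock_kmalloc(sk, sizeof(*opts), GFP_KERNEL);
 *	if (!opts)
 *		return -ENOBUFS;
 *	(... fill in *opts ...)
 *	sock_kfree_s(sk, opts, sizeof(*opts));
 */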
2736 
2737 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
2738    I think these locks should be removed for datagram sockets.
2739  */
2740 static long sock_wait_for_wmem(struct sock *sk, long timeo)
2741 {
2742 	DEFINE_WAIT(wait);
2743 
2744 	sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2745 	for (;;) {
2746 		if (!timeo)
2747 			break;
2748 		if (signal_pending(current))
2749 			break;
2750 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2751 		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
2752 		if (refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf))
2753 			break;
2754 		if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
2755 			break;
2756 		if (READ_ONCE(sk->sk_err))
2757 			break;
2758 		timeo = schedule_timeout(timeo);
2759 	}
2760 	finish_wait(sk_sleep(sk), &wait);
2761 	return timeo;
2762 }
2763 
2764 
2765 /*
2766  *	Generic send/receive buffer handlers
2767  */
2768 
2769 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
2770 				     unsigned long data_len, int noblock,
2771 				     int *errcode, int max_page_order)
2772 {
2773 	struct sk_buff *skb;
2774 	long timeo;
2775 	int err;
2776 
2777 	timeo = sock_sndtimeo(sk, noblock);
2778 	for (;;) {
2779 		err = sock_error(sk);
2780 		if (err != 0)
2781 			goto failure;
2782 
2783 		err = -EPIPE;
2784 		if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
2785 			goto failure;
2786 
2787 		if (sk_wmem_alloc_get(sk) < READ_ONCE(sk->sk_sndbuf))
2788 			break;
2789 
2790 		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2791 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2792 		err = -EAGAIN;
2793 		if (!timeo)
2794 			goto failure;
2795 		if (signal_pending(current))
2796 			goto interrupted;
2797 		timeo = sock_wait_for_wmem(sk, timeo);
2798 	}
2799 	skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
2800 				   errcode, sk->sk_allocation);
2801 	if (skb)
2802 		skb_set_owner_w(skb, sk);
2803 	return skb;
2804 
2805 interrupted:
2806 	err = sock_intr_errno(timeo);
2807 failure:
2808 	*errcode = err;
2809 	return NULL;
2810 }
2811 EXPORT_SYMBOL(sock_alloc_send_pskb);
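
/* Editorial note (not part of the original source): datagram protocols
 * usually call this helper to build one skb with a linear header part and
 * paged data. A hedged sketch of a sendmsg() path, where hlen/dlen are
 * whatever header and data lengths the caller computed:
 *
 *	struct sk_buff *skb;
 *	int err;
 *
 *	skb = sock_alloc_send_pskb(sk, hlen, dlen,
 *				   msg->msg_flags & MSG_DONTWAIT, &err, 0);
 *	if (!skb)
 *		return err;
 *	skb_reserve(skb, hlen);
 *	(... copy the payload, hand the skb to the lower layer ...)
 */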
2812 
2813 int __sock_cmsg_send(struct sock *sk, struct cmsghdr *cmsg,
2814 		     struct sockcm_cookie *sockc)
2815 {
2816 	u32 tsflags;
2817 
2818 	switch (cmsg->cmsg_type) {
2819 	case SO_MARK:
2820 		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
2821 		    !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2822 			return -EPERM;
2823 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2824 			return -EINVAL;
2825 		sockc->mark = *(u32 *)CMSG_DATA(cmsg);
2826 		break;
2827 	case SO_TIMESTAMPING_OLD:
2828 	case SO_TIMESTAMPING_NEW:
2829 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2830 			return -EINVAL;
2831 
2832 		tsflags = *(u32 *)CMSG_DATA(cmsg);
2833 		if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
2834 			return -EINVAL;
2835 
2836 		sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
2837 		sockc->tsflags |= tsflags;
2838 		break;
2839 	case SCM_TXTIME:
2840 		if (!sock_flag(sk, SOCK_TXTIME))
2841 			return -EINVAL;
2842 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64)))
2843 			return -EINVAL;
2844 		sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg));
2845 		break;
2846 	/* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
2847 	case SCM_RIGHTS:
2848 	case SCM_CREDENTIALS:
2849 		break;
2850 	default:
2851 		return -EINVAL;
2852 	}
2853 	return 0;
2854 }
2855 EXPORT_SYMBOL(__sock_cmsg_send);
2856 
2857 int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
2858 		   struct sockcm_cookie *sockc)
2859 {
2860 	struct cmsghdr *cmsg;
2861 	int ret;
2862 
2863 	for_each_cmsghdr(cmsg, msg) {
2864 		if (!CMSG_OK(msg, cmsg))
2865 			return -EINVAL;
2866 		if (cmsg->cmsg_level != SOL_SOCKET)
2867 			continue;
2868 		ret = __sock_cmsg_send(sk, cmsg, sockc);
2869 		if (ret)
2870 			return ret;
2871 	}
2872 	return 0;
2873 }
2874 EXPORT_SYMBOL(sock_cmsg_send);
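
/* Editorial note (not part of the original source): sendmsg() implementations
 * typically seed a sockcm_cookie from the socket defaults and then let
 * sock_cmsg_send() override it from SOL_SOCKET control messages. A hedged
 * sketch (sockcm_init() is the helper in include/net/sock.h):
 *
 *	struct sockcm_cookie sockc;
 *	int err;
 *
 *	sockcm_init(&sockc, sk);
 *	if (msg->msg_controllen) {
 *		err = sock_cmsg_send(sk, msg, &sockc);
 *		if (unlikely(err))
 *			return err;
 *	}
 *	(... use sockc.mark, sockc.tsflags, sockc.transmit_time ...)
 */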
2875 
2876 static void sk_enter_memory_pressure(struct sock *sk)
2877 {
2878 	if (!sk->sk_prot->enter_memory_pressure)
2879 		return;
2880 
2881 	sk->sk_prot->enter_memory_pressure(sk);
2882 }
2883 
2884 static void sk_leave_memory_pressure(struct sock *sk)
2885 {
2886 	if (sk->sk_prot->leave_memory_pressure) {
2887 		INDIRECT_CALL_INET_1(sk->sk_prot->leave_memory_pressure,
2888 				     tcp_leave_memory_pressure, sk);
2889 	} else {
2890 		unsigned long *memory_pressure = sk->sk_prot->memory_pressure;
2891 
2892 		if (memory_pressure && READ_ONCE(*memory_pressure))
2893 			WRITE_ONCE(*memory_pressure, 0);
2894 	}
2895 }
2896 
2897 DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key);
2898 
2899 /**
2900  * skb_page_frag_refill - check that a page_frag contains enough room
2901  * @sz: minimum size of the fragment we want to get
2902  * @pfrag: pointer to page_frag
2903  * @gfp: priority for memory allocation
2904  *
2905  * Note: While this allocator tries to use high order pages, there is
2906  * no guarantee that allocations succeed. Therefore, @sz MUST be
2907  * less than or equal to PAGE_SIZE.
2908  */
2909 bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
2910 {
2911 	if (pfrag->page) {
2912 		if (page_ref_count(pfrag->page) == 1) {
2913 			pfrag->offset = 0;
2914 			return true;
2915 		}
2916 		if (pfrag->offset + sz <= pfrag->size)
2917 			return true;
2918 		put_page(pfrag->page);
2919 	}
2920 
2921 	pfrag->offset = 0;
2922 	if (SKB_FRAG_PAGE_ORDER &&
2923 	    !static_branch_unlikely(&net_high_order_alloc_disable_key)) {
2924 		/* Avoid direct reclaim but allow kswapd to wake */
2925 		pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
2926 					  __GFP_COMP | __GFP_NOWARN |
2927 					  __GFP_NORETRY,
2928 					  SKB_FRAG_PAGE_ORDER);
2929 		if (likely(pfrag->page)) {
2930 			pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
2931 			return true;
2932 		}
2933 	}
2934 	pfrag->page = alloc_page(gfp);
2935 	if (likely(pfrag->page)) {
2936 		pfrag->size = PAGE_SIZE;
2937 		return true;
2938 	}
2939 	return false;
2940 }
2941 EXPORT_SYMBOL(skb_page_frag_refill);
2942 
2943 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
2944 {
2945 	if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
2946 		return true;
2947 
2948 	sk_enter_memory_pressure(sk);
2949 	sk_stream_moderate_sndbuf(sk);
2950 	return false;
2951 }
2952 EXPORT_SYMBOL(sk_page_frag_refill);
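
/* Editorial note (not part of the original source): a common pattern is to
 * append user data to the page_frag returned by sk_page_frag(). A hedged
 * sketch:
 *
 *	struct page_frag *pfrag = sk_page_frag(sk);
 *	int copy;
 *
 *	if (!sk_page_frag_refill(sk, pfrag))
 *		goto wait_for_memory;
 *	copy = min_t(int, msg_data_left(msg), pfrag->size - pfrag->offset);
 *	(... copy "copy" bytes into pfrag->page at pfrag->offset,
 *	     then advance pfrag->offset ...)
 */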
2953 
2954 void __lock_sock(struct sock *sk)
2955 	__releases(&sk->sk_lock.slock)
2956 	__acquires(&sk->sk_lock.slock)
2957 {
2958 	DEFINE_WAIT(wait);
2959 
2960 	for (;;) {
2961 		prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
2962 					TASK_UNINTERRUPTIBLE);
2963 		spin_unlock_bh(&sk->sk_lock.slock);
2964 		schedule();
2965 		spin_lock_bh(&sk->sk_lock.slock);
2966 		if (!sock_owned_by_user(sk))
2967 			break;
2968 	}
2969 	finish_wait(&sk->sk_lock.wq, &wait);
2970 }
2971 
2972 void __release_sock(struct sock *sk)
2973 	__releases(&sk->sk_lock.slock)
2974 	__acquires(&sk->sk_lock.slock)
2975 {
2976 	struct sk_buff *skb, *next;
2977 
2978 	while ((skb = sk->sk_backlog.head) != NULL) {
2979 		sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
2980 
2981 		spin_unlock_bh(&sk->sk_lock.slock);
2982 
2983 		do {
2984 			next = skb->next;
2985 			prefetch(next);
2986 			DEBUG_NET_WARN_ON_ONCE(skb_dst_is_noref(skb));
2987 			skb_mark_not_on_list(skb);
2988 			sk_backlog_rcv(sk, skb);
2989 
2990 			cond_resched();
2991 
2992 			skb = next;
2993 		} while (skb != NULL);
2994 
2995 		spin_lock_bh(&sk->sk_lock.slock);
2996 	}
2997 
2998 	/*
2999 	 * Doing the zeroing here guarantees we cannot loop forever
3000 	 * while a wild producer attempts to flood us.
3001 	 */
3002 	sk->sk_backlog.len = 0;
3003 }
3004 
3005 void __sk_flush_backlog(struct sock *sk)
3006 {
3007 	spin_lock_bh(&sk->sk_lock.slock);
3008 	__release_sock(sk);
3009 	spin_unlock_bh(&sk->sk_lock.slock);
3010 }
3011 EXPORT_SYMBOL_GPL(__sk_flush_backlog);
3012 
3013 /**
3014  * sk_wait_data - wait for data to arrive at sk_receive_queue
3015  * @sk:    sock to wait on
3016  * @timeo: for how long
3017  * @skb:   last skb seen on sk_receive_queue
3018  *
3019  * Now the socket state, including sk->sk_err, is changed only under the lock,
3020  * hence we may omit checks after joining the wait queue.
3021  * We check the receive queue before schedule() only as an optimization;
3022  * it is very likely that release_sock() added new data.
3023  */
3024 int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
3025 {
3026 	DEFINE_WAIT_FUNC(wait, woken_wake_function);
3027 	int rc;
3028 
3029 	add_wait_queue(sk_sleep(sk), &wait);
3030 	sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
3031 	rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
3032 	sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
3033 	remove_wait_queue(sk_sleep(sk), &wait);
3034 	return rc;
3035 }
3036 EXPORT_SYMBOL(sk_wait_data);
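
/* Editorial note (not part of the original source): a blocking recvmsg()
 * typically loops on the receive queue and parks in sk_wait_data() while the
 * queue is unchanged. A hedged sketch (error/shutdown checks omitted):
 *
 *	struct sk_buff *skb;
 *	long timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
 *
 *	lock_sock(sk);
 *	while (!(skb = skb_peek(&sk->sk_receive_queue))) {
 *		if (!timeo || signal_pending(current))
 *			break;
 *		sk_wait_data(sk, &timeo, NULL);
 *	}
 *	(... consume skb ...)
 *	release_sock(sk);
 */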
3037 
3038 /**
3039  *	__sk_mem_raise_allocated - increase memory_allocated
3040  *	@sk: socket
3041  *	@size: memory size to allocate
3042  *	@amt: pages to allocate
3043  *	@kind: allocation type
3044  *
3045  *	Similar to __sk_mem_schedule(), but does not update sk_forward_alloc
3046  */
3047 int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
3048 {
3049 	bool memcg_charge = mem_cgroup_sockets_enabled && sk->sk_memcg;
3050 	struct proto *prot = sk->sk_prot;
3051 	bool charged = true;
3052 	long allocated;
3053 
3054 	sk_memory_allocated_add(sk, amt);
3055 	allocated = sk_memory_allocated(sk);
3056 	if (memcg_charge &&
3057 	    !(charged = mem_cgroup_charge_skmem(sk->sk_memcg, amt,
3058 						gfp_memcg_charge())))
3059 		goto suppress_allocation;
3060 
3061 	/* Under limit. */
3062 	if (allocated <= sk_prot_mem_limits(sk, 0)) {
3063 		sk_leave_memory_pressure(sk);
3064 		return 1;
3065 	}
3066 
3067 	/* Under pressure. */
3068 	if (allocated > sk_prot_mem_limits(sk, 1))
3069 		sk_enter_memory_pressure(sk);
3070 
3071 	/* Over hard limit. */
3072 	if (allocated > sk_prot_mem_limits(sk, 2))
3073 		goto suppress_allocation;
3074 
3075 	/* guarantee minimum buffer size under pressure */
3076 	if (kind == SK_MEM_RECV) {
3077 		if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot))
3078 			return 1;
3079 
3080 	} else { /* SK_MEM_SEND */
3081 		int wmem0 = sk_get_wmem0(sk, prot);
3082 
3083 		if (sk->sk_type == SOCK_STREAM) {
3084 			if (sk->sk_wmem_queued < wmem0)
3085 				return 1;
3086 		} else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) {
3087 			return 1;
3088 		}
3089 	}
3090 
3091 	if (sk_has_memory_pressure(sk)) {
3092 		u64 alloc;
3093 
3094 		if (!sk_under_memory_pressure(sk))
3095 			return 1;
3096 		alloc = sk_sockets_allocated_read_positive(sk);
3097 		if (sk_prot_mem_limits(sk, 2) > alloc *
3098 		    sk_mem_pages(sk->sk_wmem_queued +
3099 				 atomic_read(&sk->sk_rmem_alloc) +
3100 				 sk->sk_forward_alloc))
3101 			return 1;
3102 	}
3103 
3104 suppress_allocation:
3105 
3106 	if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
3107 		sk_stream_moderate_sndbuf(sk);
3108 
3109 		/* Fail only if socket is _under_ its sndbuf.
3110 		 * In this case we cannot block, so we have to fail.
3111 		 */
3112 		if (sk->sk_wmem_queued + size >= sk->sk_sndbuf) {
3113 			/* Force charge with __GFP_NOFAIL */
3114 			if (memcg_charge && !charged) {
3115 				mem_cgroup_charge_skmem(sk->sk_memcg, amt,
3116 					gfp_memcg_charge() | __GFP_NOFAIL);
3117 			}
3118 			return 1;
3119 		}
3120 	}
3121 
3122 	if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged))
3123 		trace_sock_exceed_buf_limit(sk, prot, allocated, kind);
3124 
3125 	sk_memory_allocated_sub(sk, amt);
3126 
3127 	if (memcg_charge && charged)
3128 		mem_cgroup_uncharge_skmem(sk->sk_memcg, amt);
3129 
3130 	return 0;
3131 }
3132 
3133 /**
3134  *	__sk_mem_schedule - increase sk_forward_alloc and memory_allocated
3135  *	@sk: socket
3136  *	@size: memory size to allocate
3137  *	@kind: allocation type
3138  *
3139  *	If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
3140  *	rmem allocation. This function assumes that protocols which have
3141  *	memory_pressure use sk_wmem_queued as write buffer accounting.
3142  */
3143 int __sk_mem_schedule(struct sock *sk, int size, int kind)
3144 {
3145 	int ret, amt = sk_mem_pages(size);
3146 
3147 	sk_forward_alloc_add(sk, amt << PAGE_SHIFT);
3148 	ret = __sk_mem_raise_allocated(sk, size, amt, kind);
3149 	if (!ret)
3150 		sk_forward_alloc_add(sk, -(amt << PAGE_SHIFT));
3151 	return ret;
3152 }
3153 EXPORT_SYMBOL(__sk_mem_schedule);
3154 
3155 /**
3156  *	__sk_mem_reduce_allocated - reclaim memory_allocated
3157  *	@sk: socket
3158  *	@amount: number of quanta
3159  *
3160  *	Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
3161  */
3162 void __sk_mem_reduce_allocated(struct sock *sk, int amount)
3163 {
3164 	sk_memory_allocated_sub(sk, amount);
3165 
3166 	if (mem_cgroup_sockets_enabled && sk->sk_memcg)
3167 		mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
3168 
3169 	if (sk_under_global_memory_pressure(sk) &&
3170 	    (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
3171 		sk_leave_memory_pressure(sk);
3172 }
3173 
3174 /**
3175  *	__sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
3176  *	@sk: socket
3177  *	@amount: number of bytes (rounded down to a PAGE_SIZE multiple)
3178  */
3179 void __sk_mem_reclaim(struct sock *sk, int amount)
3180 {
3181 	amount >>= PAGE_SHIFT;
3182 	sk_forward_alloc_add(sk, -(amount << PAGE_SHIFT));
3183 	__sk_mem_reduce_allocated(sk, amount);
3184 }
3185 EXPORT_SYMBOL(__sk_mem_reclaim);
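
/* Editorial note (not part of the original source): protocols normally use
 * the wrappers in include/net/sock.h rather than calling the __sk_mem_*()
 * functions directly. A hedged sketch of receive-side accounting:
 *
 *	if (!sk_rmem_schedule(sk, skb, skb->truesize))
 *		return -ENOBUFS;
 *	skb_set_owner_r(skb, sk);
 *	__skb_queue_tail(&sk->sk_receive_queue, skb);
 *
 * skb_set_owner_r() charges sk_rmem_alloc and sk_forward_alloc; the matching
 * uncharge happens via sock_rfree() when the skb is freed.
 */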
3186 
3187 int sk_set_peek_off(struct sock *sk, int val)
3188 {
3189 	WRITE_ONCE(sk->sk_peek_off, val);
3190 	return 0;
3191 }
3192 EXPORT_SYMBOL_GPL(sk_set_peek_off);
3193 
3194 /*
3195  * Set of default routines for initialising struct proto_ops when
3196  * the protocol does not support a particular function. In certain
3197  * cases where it makes no sense for a protocol to have a "do nothing"
3198  * function, some default processing is provided.
3199  */
3200 
3201 int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
3202 {
3203 	return -EOPNOTSUPP;
3204 }
3205 EXPORT_SYMBOL(sock_no_bind);
3206 
3207 int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
3208 		    int len, int flags)
3209 {
3210 	return -EOPNOTSUPP;
3211 }
3212 EXPORT_SYMBOL(sock_no_connect);
3213 
3214 int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
3215 {
3216 	return -EOPNOTSUPP;
3217 }
3218 EXPORT_SYMBOL(sock_no_socketpair);
3219 
3220 int sock_no_accept(struct socket *sock, struct socket *newsock, int flags,
3221 		   bool kern)
3222 {
3223 	return -EOPNOTSUPP;
3224 }
3225 EXPORT_SYMBOL(sock_no_accept);
3226 
3227 int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
3228 		    int peer)
3229 {
3230 	return -EOPNOTSUPP;
3231 }
3232 EXPORT_SYMBOL(sock_no_getname);
3233 
3234 int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
3235 {
3236 	return -EOPNOTSUPP;
3237 }
3238 EXPORT_SYMBOL(sock_no_ioctl);
3239 
3240 int sock_no_listen(struct socket *sock, int backlog)
3241 {
3242 	return -EOPNOTSUPP;
3243 }
3244 EXPORT_SYMBOL(sock_no_listen);
3245 
3246 int sock_no_shutdown(struct socket *sock, int how)
3247 {
3248 	return -EOPNOTSUPP;
3249 }
3250 EXPORT_SYMBOL(sock_no_shutdown);
3251 
3252 int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
3253 {
3254 	return -EOPNOTSUPP;
3255 }
3256 EXPORT_SYMBOL(sock_no_sendmsg);
3257 
3258 int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len)
3259 {
3260 	return -EOPNOTSUPP;
3261 }
3262 EXPORT_SYMBOL(sock_no_sendmsg_locked);
3263 
3264 int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
3265 		    int flags)
3266 {
3267 	return -EOPNOTSUPP;
3268 }
3269 EXPORT_SYMBOL(sock_no_recvmsg);
3270 
3271 int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
3272 {
3273 	/* Mirror missing mmap method error code */
3274 	return -ENODEV;
3275 }
3276 EXPORT_SYMBOL(sock_no_mmap);
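
/* Editorial note (not part of the original source): address families plug
 * these stubs into their proto_ops for operations they do not support. A
 * hedged, abbreviated sketch ("my_proto_ops" is hypothetical):
 *
 *	static const struct proto_ops my_proto_ops = {
 *		.family		= PF_INET,
 *		.owner		= THIS_MODULE,
 *		.socketpair	= sock_no_socketpair,
 *		.ioctl		= sock_no_ioctl,
 *		.listen		= sock_no_listen,
 *		.mmap		= sock_no_mmap,
 *		(... remaining handlers ...)
 *	};
 */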
3277 
3278 /*
3279  * When a file is received (via SCM_RIGHTS, etc), we must bump the
3280  * various sock-based usage counts.
3281  */
3282 void __receive_sock(struct file *file)
3283 {
3284 	struct socket *sock;
3285 
3286 	sock = sock_from_file(file);
3287 	if (sock) {
3288 		sock_update_netprioidx(&sock->sk->sk_cgrp_data);
3289 		sock_update_classid(&sock->sk->sk_cgrp_data);
3290 	}
3291 }
3292 
3293 /*
3294  *	Default Socket Callbacks
3295  */
3296 
3297 static void sock_def_wakeup(struct sock *sk)
3298 {
3299 	struct socket_wq *wq;
3300 
3301 	rcu_read_lock();
3302 	wq = rcu_dereference(sk->sk_wq);
3303 	if (skwq_has_sleeper(wq))
3304 		wake_up_interruptible_all(&wq->wait);
3305 	rcu_read_unlock();
3306 }
3307 
3308 static void sock_def_error_report(struct sock *sk)
3309 {
3310 	struct socket_wq *wq;
3311 
3312 	rcu_read_lock();
3313 	wq = rcu_dereference(sk->sk_wq);
3314 	if (skwq_has_sleeper(wq))
3315 		wake_up_interruptible_poll(&wq->wait, EPOLLERR);
3316 	sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
3317 	rcu_read_unlock();
3318 }
3319 
3320 void sock_def_readable(struct sock *sk)
3321 {
3322 	struct socket_wq *wq;
3323 
3324 	trace_sk_data_ready(sk);
3325 
3326 	rcu_read_lock();
3327 	wq = rcu_dereference(sk->sk_wq);
3328 	if (skwq_has_sleeper(wq))
3329 		wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
3330 						EPOLLRDNORM | EPOLLRDBAND);
3331 	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
3332 	rcu_read_unlock();
3333 }
3334 
3335 static void sock_def_write_space(struct sock *sk)
3336 {
3337 	struct socket_wq *wq;
3338 
3339 	rcu_read_lock();
3340 
3341 	/* Do not wake up a writer until he can make "significant"
3342 	 * progress.  --DaveM
3343 	 */
3344 	if (sock_writeable(sk)) {
3345 		wq = rcu_dereference(sk->sk_wq);
3346 		if (skwq_has_sleeper(wq))
3347 			wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
3348 						EPOLLWRNORM | EPOLLWRBAND);
3349 
3350 		/* Should agree with poll, otherwise some programs break */
3351 		sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
3352 	}
3353 
3354 	rcu_read_unlock();
3355 }
3356 
3357 /* An optimised version of sock_def_write_space(); it should only be called
3358  * for SOCK_RCU_FREE sockets under an RCU read-side section and after putting
3359  * ->sk_wmem_alloc.
3360  */
3361 static void sock_def_write_space_wfree(struct sock *sk)
3362 {
3363 	/* Do not wake up a writer until he can make "significant"
3364 	 * progress.  --DaveM
3365 	 */
3366 	if (sock_writeable(sk)) {
3367 		struct socket_wq *wq = rcu_dereference(sk->sk_wq);
3368 
3369 		/* rely on refcount_sub from sock_wfree() */
3370 		smp_mb__after_atomic();
3371 		if (wq && waitqueue_active(&wq->wait))
3372 			wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
3373 						EPOLLWRNORM | EPOLLWRBAND);
3374 
3375 		/* Should agree with poll, otherwise some programs break */
3376 		sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
3377 	}
3378 }
3379 
3380 static void sock_def_destruct(struct sock *sk)
3381 {
3382 }
3383 
3384 void sk_send_sigurg(struct sock *sk)
3385 {
3386 	if (sk->sk_socket && sk->sk_socket->file)
3387 		if (send_sigurg(&sk->sk_socket->file->f_owner))
3388 			sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
3389 }
3390 EXPORT_SYMBOL(sk_send_sigurg);
3391 
3392 void sk_reset_timer(struct sock *sk, struct timer_list* timer,
3393 		    unsigned long expires)
3394 {
3395 	if (!mod_timer(timer, expires))
3396 		sock_hold(sk);
3397 }
3398 EXPORT_SYMBOL(sk_reset_timer);
3399 
3400 void sk_stop_timer(struct sock *sk, struct timer_list* timer)
3401 {
3402 	if (del_timer(timer))
3403 		__sock_put(sk);
3404 }
3405 EXPORT_SYMBOL(sk_stop_timer);
3406 
3407 void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer)
3408 {
3409 	if (del_timer_sync(timer))
3410 		__sock_put(sk);
3411 }
3412 EXPORT_SYMBOL(sk_stop_timer_sync);
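
/* Editorial note (not part of the original source): these helpers keep the
 * sock refcount balanced with the timer: sk_reset_timer() takes a reference
 * when it arms a previously idle timer, and sk_stop_timer() drops it if a
 * pending timer was actually deleted. A hedged sketch, where "some_timeout"
 * is a placeholder for a protocol-specific delay in jiffies:
 *
 *	sk_reset_timer(sk, &sk->sk_timer, jiffies + some_timeout);
 *	(... later, e.g. on close ...)
 *	sk_stop_timer(sk, &sk->sk_timer);
 */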
3413 
3414 void sock_init_data_uid(struct socket *sock, struct sock *sk, kuid_t uid)
3415 {
3416 	sk_init_common(sk);
3417 	sk->sk_send_head	=	NULL;
3418 
3419 	timer_setup(&sk->sk_timer, NULL, 0);
3420 
3421 	sk->sk_allocation	=	GFP_KERNEL;
3422 	sk->sk_rcvbuf		=	READ_ONCE(sysctl_rmem_default);
3423 	sk->sk_sndbuf		=	READ_ONCE(sysctl_wmem_default);
3424 	sk->sk_state		=	TCP_CLOSE;
3425 	sk->sk_use_task_frag	=	true;
3426 	sk_set_socket(sk, sock);
3427 
3428 	sock_set_flag(sk, SOCK_ZAPPED);
3429 
3430 	if (sock) {
3431 		sk->sk_type	=	sock->type;
3432 		RCU_INIT_POINTER(sk->sk_wq, &sock->wq);
3433 		sock->sk	=	sk;
3434 	} else {
3435 		RCU_INIT_POINTER(sk->sk_wq, NULL);
3436 	}
3437 	sk->sk_uid	=	uid;
3438 
3439 	rwlock_init(&sk->sk_callback_lock);
3440 	if (sk->sk_kern_sock)
3441 		lockdep_set_class_and_name(
3442 			&sk->sk_callback_lock,
3443 			af_kern_callback_keys + sk->sk_family,
3444 			af_family_kern_clock_key_strings[sk->sk_family]);
3445 	else
3446 		lockdep_set_class_and_name(
3447 			&sk->sk_callback_lock,
3448 			af_callback_keys + sk->sk_family,
3449 			af_family_clock_key_strings[sk->sk_family]);
3450 
3451 	sk->sk_state_change	=	sock_def_wakeup;
3452 	sk->sk_data_ready	=	sock_def_readable;
3453 	sk->sk_write_space	=	sock_def_write_space;
3454 	sk->sk_error_report	=	sock_def_error_report;
3455 	sk->sk_destruct		=	sock_def_destruct;
3456 
3457 	sk->sk_frag.page	=	NULL;
3458 	sk->sk_frag.offset	=	0;
3459 	sk->sk_peek_off		=	-1;
3460 
3461 	sk->sk_peer_pid 	=	NULL;
3462 	sk->sk_peer_cred	=	NULL;
3463 	spin_lock_init(&sk->sk_peer_lock);
3464 
3465 	sk->sk_write_pending	=	0;
3466 	sk->sk_rcvlowat		=	1;
3467 	sk->sk_rcvtimeo		=	MAX_SCHEDULE_TIMEOUT;
3468 	sk->sk_sndtimeo		=	MAX_SCHEDULE_TIMEOUT;
3469 
3470 	sk->sk_stamp = SK_DEFAULT_STAMP;
3471 #if BITS_PER_LONG==32
3472 	seqlock_init(&sk->sk_stamp_seq);
3473 #endif
3474 	atomic_set(&sk->sk_zckey, 0);
3475 
3476 #ifdef CONFIG_NET_RX_BUSY_POLL
3477 	sk->sk_napi_id		=	0;
3478 	sk->sk_ll_usec		=	READ_ONCE(sysctl_net_busy_read);
3479 #endif
3480 
3481 	sk->sk_max_pacing_rate = ~0UL;
3482 	sk->sk_pacing_rate = ~0UL;
3483 	WRITE_ONCE(sk->sk_pacing_shift, 10);
3484 	sk->sk_incoming_cpu = -1;
3485 
3486 	sk_rx_queue_clear(sk);
3487 	/*
3488 	 * Before updating sk_refcnt, we must commit prior changes to memory
3489 	 * (Documentation/RCU/rculist_nulls.rst for details)
3490 	 */
3491 	smp_wmb();
3492 	refcount_set(&sk->sk_refcnt, 1);
3493 	atomic_set(&sk->sk_drops, 0);
3494 }
3495 EXPORT_SYMBOL(sock_init_data_uid);
3496 
3497 void sock_init_data(struct socket *sock, struct sock *sk)
3498 {
3499 	kuid_t uid = sock ?
3500 		SOCK_INODE(sock)->i_uid :
3501 		make_kuid(sock_net(sk)->user_ns, 0);
3502 
3503 	sock_init_data_uid(sock, sk, uid);
3504 }
3505 EXPORT_SYMBOL(sock_init_data);
3506 
3507 void lock_sock_nested(struct sock *sk, int subclass)
3508 {
3509 	/* The sk_lock has mutex_lock() semantics here. */
3510 	mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
3511 
3512 	might_sleep();
3513 	spin_lock_bh(&sk->sk_lock.slock);
3514 	if (sock_owned_by_user_nocheck(sk))
3515 		__lock_sock(sk);
3516 	sk->sk_lock.owned = 1;
3517 	spin_unlock_bh(&sk->sk_lock.slock);
3518 }
3519 EXPORT_SYMBOL(lock_sock_nested);
3520 
3521 void release_sock(struct sock *sk)
3522 {
3523 	spin_lock_bh(&sk->sk_lock.slock);
3524 	if (sk->sk_backlog.tail)
3525 		__release_sock(sk);
3526 
3527 	/* Warning: release_cb() might need to release sk ownership,
3528 	 * i.e. call sock_release_ownership(sk) before us.
3529 	 */
3530 	if (sk->sk_prot->release_cb)
3531 		sk->sk_prot->release_cb(sk);
3532 
3533 	sock_release_ownership(sk);
3534 	if (waitqueue_active(&sk->sk_lock.wq))
3535 		wake_up(&sk->sk_lock.wq);
3536 	spin_unlock_bh(&sk->sk_lock.slock);
3537 }
3538 EXPORT_SYMBOL(release_sock);
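
/* Editorial note (not part of the original source): process context normally
 * takes the socket lock through the lock_sock() wrapper, which calls
 * lock_sock_nested(sk, 0). A hedged sketch of the usual critical section:
 *
 *	lock_sock(sk);
 *	(... modify socket state, walk queues, call protocol helpers ...)
 *	release_sock(sk);
 *
 * Packets arriving in softirq context while the lock is owned are queued on
 * sk->sk_backlog and drained by __release_sock() above.
 */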
3539 
3540 bool __lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock)
3541 {
3542 	might_sleep();
3543 	spin_lock_bh(&sk->sk_lock.slock);
3544 
3545 	if (!sock_owned_by_user_nocheck(sk)) {
3546 		/*
3547 		 * Fast path return with bottom halves disabled and
3548 		 * sock::sk_lock.slock held.
3549 		 *
3550 		 * The 'mutex' is not contended and holding
3551 		 * sock::sk_lock.slock prevents all other lockers from
3552 		 * proceeding, so the corresponding unlock_sock_fast() can
3553 		 * avoid the slow path of release_sock() completely and
3554 		 * just release slock.
3555 		 *
3556 		 * From a semantic POV this is equivalent to 'acquiring'
3557 		 * the 'mutex', hence the corresponding lockdep
3558 		 * mutex_release() has to happen in the fast path of
3559 		 * unlock_sock_fast().
3560 		 */
3561 		return false;
3562 	}
3563 
3564 	__lock_sock(sk);
3565 	sk->sk_lock.owned = 1;
3566 	__acquire(&sk->sk_lock.slock);
3567 	spin_unlock_bh(&sk->sk_lock.slock);
3568 	return true;
3569 }
3570 EXPORT_SYMBOL(__lock_sock_fast);
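
/* Editorial note (not part of the original source): __lock_sock_fast() is
 * reached through the lock_sock_fast()/unlock_sock_fast() pair, which avoids
 * the full release_sock() slow path when the lock was uncontended. A hedged
 * sketch:
 *
 *	bool slow = lock_sock_fast(sk);
 *
 *	(... short critical section ...)
 *	unlock_sock_fast(sk, slow);
 */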
3571 
3572 int sock_gettstamp(struct socket *sock, void __user *userstamp,
3573 		   bool timeval, bool time32)
3574 {
3575 	struct sock *sk = sock->sk;
3576 	struct timespec64 ts;
3577 
3578 	sock_enable_timestamp(sk, SOCK_TIMESTAMP);
3579 	ts = ktime_to_timespec64(sock_read_timestamp(sk));
3580 	if (ts.tv_sec == -1)
3581 		return -ENOENT;
3582 	if (ts.tv_sec == 0) {
3583 		ktime_t kt = ktime_get_real();
3584 		sock_write_timestamp(sk, kt);
3585 		ts = ktime_to_timespec64(kt);
3586 	}
3587 
3588 	if (timeval)
3589 		ts.tv_nsec /= 1000;
3590 
3591 #ifdef CONFIG_COMPAT_32BIT_TIME
3592 	if (time32)
3593 		return put_old_timespec32(&ts, userstamp);
3594 #endif
3595 #ifdef CONFIG_SPARC64
3596 	/* beware of padding in sparc64 timeval */
3597 	if (timeval && !in_compat_syscall()) {
3598 		struct __kernel_old_timeval __user tv = {
3599 			.tv_sec = ts.tv_sec,
3600 			.tv_usec = ts.tv_nsec,
3601 		};
3602 		if (copy_to_user(userstamp, &tv, sizeof(tv)))
3603 			return -EFAULT;
3604 		return 0;
3605 	}
3606 #endif
3607 	return put_timespec64(&ts, userstamp);
3608 }
3609 EXPORT_SYMBOL(sock_gettstamp);
3610 
3611 void sock_enable_timestamp(struct sock *sk, enum sock_flags flag)
3612 {
3613 	if (!sock_flag(sk, flag)) {
3614 		unsigned long previous_flags = sk->sk_flags;
3615 
3616 		sock_set_flag(sk, flag);
3617 		/*
3618 		 * we just set one of the two flags which require net
3619 		 * time stamping, but time stamping might have been on
3620 		 * already because of the other one
3621 		 */
3622 		if (sock_needs_netstamp(sk) &&
3623 		    !(previous_flags & SK_FLAGS_TIMESTAMP))
3624 			net_enable_timestamp();
3625 	}
3626 }
3627 
3628 int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
3629 		       int level, int type)
3630 {
3631 	struct sock_exterr_skb *serr;
3632 	struct sk_buff *skb;
3633 	int copied, err;
3634 
3635 	err = -EAGAIN;
3636 	skb = sock_dequeue_err_skb(sk);
3637 	if (skb == NULL)
3638 		goto out;
3639 
3640 	copied = skb->len;
3641 	if (copied > len) {
3642 		msg->msg_flags |= MSG_TRUNC;
3643 		copied = len;
3644 	}
3645 	err = skb_copy_datagram_msg(skb, 0, msg, copied);
3646 	if (err)
3647 		goto out_free_skb;
3648 
3649 	sock_recv_timestamp(msg, sk, skb);
3650 
3651 	serr = SKB_EXT_ERR(skb);
3652 	put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
3653 
3654 	msg->msg_flags |= MSG_ERRQUEUE;
3655 	err = copied;
3656 
3657 out_free_skb:
3658 	kfree_skb(skb);
3659 out:
3660 	return err;
3661 }
3662 EXPORT_SYMBOL(sock_recv_errqueue);
3663 
3664 /*
3665  *	Get a socket option on a socket.
3666  *
3667  *	FIX: POSIX 1003.1g is very ambiguous here. It states that
3668  *	asynchronous errors should be reported by getsockopt. We assume
3669  *	this means if you specify SO_ERROR (otherwise what's the point of it).
3670  */
3671 int sock_common_getsockopt(struct socket *sock, int level, int optname,
3672 			   char __user *optval, int __user *optlen)
3673 {
3674 	struct sock *sk = sock->sk;
3675 
3676 	/* IPV6_ADDRFORM can change sk->sk_prot under us. */
3677 	return READ_ONCE(sk->sk_prot)->getsockopt(sk, level, optname, optval, optlen);
3678 }
3679 EXPORT_SYMBOL(sock_common_getsockopt);
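/* Editor's sketch (userspace, not part of this kernel file): the block comment
 * above notes that asynchronous errors are reported through SO_ERROR. The
 * classic consumer is a non-blocking connect(): once the socket polls
 * writable, the pending error (0 on success) is fetched, and cleared, with
 * getsockopt(). check_connect_result() is a hypothetical helper.
 */
#include <sys/socket.h>

static int check_connect_result(int fd)
{
	int err = 0;
	socklen_t len = sizeof(err);

	if (getsockopt(fd, SOL_SOCKET, SO_ERROR, &err, &len) < 0)
		return -1;
	return err;	/* 0 means the connect completed successfully */
}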
3680 
3681 int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
3682 			int flags)
3683 {
3684 	struct sock *sk = sock->sk;
3685 	int addr_len = 0;
3686 	int err;
3687 
3688 	err = sk->sk_prot->recvmsg(sk, msg, size, flags, &addr_len);
3689 	if (err >= 0)
3690 		msg->msg_namelen = addr_len;
3691 	return err;
3692 }
3693 EXPORT_SYMBOL(sock_common_recvmsg);
3694 
3695 /*
3696  *	Set socket options on an inet socket.
3697  */
3698 int sock_common_setsockopt(struct socket *sock, int level, int optname,
3699 			   sockptr_t optval, unsigned int optlen)
3700 {
3701 	struct sock *sk = sock->sk;
3702 
3703 	/* IPV6_ADDRFORM can change sk->sk_prot under us. */
3704 	return READ_ONCE(sk->sk_prot)->setsockopt(sk, level, optname, optval, optlen);
3705 }
3706 EXPORT_SYMBOL(sock_common_setsockopt);
3707 
3708 void sk_common_release(struct sock *sk)
3709 {
3710 	if (sk->sk_prot->destroy)
3711 		sk->sk_prot->destroy(sk);
3712 
3713 	/*
3714 	 * Observation: when sk_common_release is called, processes no longer
3715 	 * have access to the socket, but the network stack still does.
3716 	 * Step one, detach it from networking:
3717 	 *
3718 	 * A. Remove from hash tables.
3719 	 */
3720 
3721 	sk->sk_prot->unhash(sk);
3722 
3723 	if (sk->sk_socket)
3724 		sk->sk_socket->sk = NULL;
3725 
3726 	/*
3727 	 * At this point the socket cannot receive new packets, but some may
3728 	 * still be in flight because a CPU running the receiver did its hash
3729 	 * table lookup before we unhashed the socket. Those packets will reach
3730 	 * the receive queue and be purged by the socket destructor.
3731 	 *
3732 	 * Also, we may still have packets pending on the receive queue and,
3733 	 * probably, our own packets waiting in device queues. sock_destroy
3734 	 * will drain the receive queue, but transmitted packets will delay
3735 	 * socket destruction until the last reference is released.
3736 	 */
3737 
3738 	sock_orphan(sk);
3739 
3740 	xfrm_sk_free_policy(sk);
3741 
3742 	sock_put(sk);
3743 }
3744 EXPORT_SYMBOL(sk_common_release);
3745 
3746 void sk_get_meminfo(const struct sock *sk, u32 *mem)
3747 {
3748 	memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);
3749 
3750 	mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
3751 	mem[SK_MEMINFO_RCVBUF] = READ_ONCE(sk->sk_rcvbuf);
3752 	mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
3753 	mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf);
3754 	mem[SK_MEMINFO_FWD_ALLOC] = sk_forward_alloc_get(sk);
3755 	mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued);
3756 	mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
3757 	mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len);
3758 	mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
3759 }
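/* Editor's note (not part of the original file): the SK_MEMINFO_* slots filled
 * above are what sock_diag exports and what iproute2's "ss -m" prints as
 * skmem:(r..,rb..,t..,tb..,f..,w..,o..,bl..,d..), i.e. rmem_alloc, rcvbuf,
 * wmem_alloc, sndbuf, fwd_alloc, wmem_queued, optmem, backlog and drops,
 * in that order.
 */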
3760 
3761 #ifdef CONFIG_PROC_FS
3762 static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
3763 
3764 int sock_prot_inuse_get(struct net *net, struct proto *prot)
3765 {
3766 	int cpu, idx = prot->inuse_idx;
3767 	int res = 0;
3768 
3769 	for_each_possible_cpu(cpu)
3770 		res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx];
3771 
3772 	return res >= 0 ? res : 0;
3773 }
3774 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
3775 
3776 int sock_inuse_get(struct net *net)
3777 {
3778 	int cpu, res = 0;
3779 
3780 	for_each_possible_cpu(cpu)
3781 		res += per_cpu_ptr(net->core.prot_inuse, cpu)->all;
3782 
3783 	return res;
3784 }
3785 
3786 EXPORT_SYMBOL_GPL(sock_inuse_get);
3787 
3788 static int __net_init sock_inuse_init_net(struct net *net)
3789 {
3790 	net->core.prot_inuse = alloc_percpu(struct prot_inuse);
3791 	if (net->core.prot_inuse == NULL)
3792 		return -ENOMEM;
3793 	return 0;
3794 }
3795 
3796 static void __net_exit sock_inuse_exit_net(struct net *net)
3797 {
3798 	free_percpu(net->core.prot_inuse);
3799 }
3800 
3801 static struct pernet_operations net_inuse_ops = {
3802 	.init = sock_inuse_init_net,
3803 	.exit = sock_inuse_exit_net,
3804 };
3805 
3806 static __init int net_inuse_init(void)
3807 {
3808 	if (register_pernet_subsys(&net_inuse_ops))
3809 		panic("Cannot initialize net inuse counters");
3810 
3811 	return 0;
3812 }
3813 
3814 core_initcall(net_inuse_init);
3815 
3816 static int assign_proto_idx(struct proto *prot)
3817 {
3818 	prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
3819 
3820 	if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
3821 		pr_err("PROTO_INUSE_NR exhausted\n");
3822 		return -ENOSPC;
3823 	}
3824 
3825 	set_bit(prot->inuse_idx, proto_inuse_idx);
3826 	return 0;
3827 }
3828 
3829 static void release_proto_idx(struct proto *prot)
3830 {
3831 	if (prot->inuse_idx != PROTO_INUSE_NR - 1)
3832 		clear_bit(prot->inuse_idx, proto_inuse_idx);
3833 }
3834 #else
3835 static inline int assign_proto_idx(struct proto *prot)
3836 {
3837 	return 0;
3838 }
3839 
3840 static inline void release_proto_idx(struct proto *prot)
3841 {
3842 }
3843 
3844 #endif
3845 
3846 static void tw_prot_cleanup(struct timewait_sock_ops *twsk_prot)
3847 {
3848 	if (!twsk_prot)
3849 		return;
3850 	kfree(twsk_prot->twsk_slab_name);
3851 	twsk_prot->twsk_slab_name = NULL;
3852 	kmem_cache_destroy(twsk_prot->twsk_slab);
3853 	twsk_prot->twsk_slab = NULL;
3854 }
3855 
3856 static int tw_prot_init(const struct proto *prot)
3857 {
3858 	struct timewait_sock_ops *twsk_prot = prot->twsk_prot;
3859 
3860 	if (!twsk_prot)
3861 		return 0;
3862 
3863 	twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s",
3864 					      prot->name);
3865 	if (!twsk_prot->twsk_slab_name)
3866 		return -ENOMEM;
3867 
3868 	twsk_prot->twsk_slab =
3869 		kmem_cache_create(twsk_prot->twsk_slab_name,
3870 				  twsk_prot->twsk_obj_size, 0,
3871 				  SLAB_ACCOUNT | prot->slab_flags,
3872 				  NULL);
3873 	if (!twsk_prot->twsk_slab) {
3874 		pr_crit("%s: Can't create timewait sock SLAB cache!\n",
3875 			prot->name);
3876 		return -ENOMEM;
3877 	}
3878 
3879 	return 0;
3880 }
3881 
3882 static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
3883 {
3884 	if (!rsk_prot)
3885 		return;
3886 	kfree(rsk_prot->slab_name);
3887 	rsk_prot->slab_name = NULL;
3888 	kmem_cache_destroy(rsk_prot->slab);
3889 	rsk_prot->slab = NULL;
3890 }
3891 
3892 static int req_prot_init(const struct proto *prot)
3893 {
3894 	struct request_sock_ops *rsk_prot = prot->rsk_prot;
3895 
3896 	if (!rsk_prot)
3897 		return 0;
3898 
3899 	rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
3900 					prot->name);
3901 	if (!rsk_prot->slab_name)
3902 		return -ENOMEM;
3903 
3904 	rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
3905 					   rsk_prot->obj_size, 0,
3906 					   SLAB_ACCOUNT | prot->slab_flags,
3907 					   NULL);
3908 
3909 	if (!rsk_prot->slab) {
3910 		pr_crit("%s: Can't create request sock SLAB cache!\n",
3911 			prot->name);
3912 		return -ENOMEM;
3913 	}
3914 	return 0;
3915 }
3916 
3917 int proto_register(struct proto *prot, int alloc_slab)
3918 {
3919 	int ret = -ENOBUFS;
3920 
3921 	if (prot->memory_allocated && !prot->sysctl_mem) {
3922 		pr_err("%s: missing sysctl_mem\n", prot->name);
3923 		return -EINVAL;
3924 	}
3925 	if (prot->memory_allocated && !prot->per_cpu_fw_alloc) {
3926 		pr_err("%s: missing per_cpu_fw_alloc\n", prot->name);
3927 		return -EINVAL;
3928 	}
3929 	if (alloc_slab) {
3930 		prot->slab = kmem_cache_create_usercopy(prot->name,
3931 					prot->obj_size, 0,
3932 					SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT |
3933 					prot->slab_flags,
3934 					prot->useroffset, prot->usersize,
3935 					NULL);
3936 
3937 		if (prot->slab == NULL) {
3938 			pr_crit("%s: Can't create sock SLAB cache!\n",
3939 				prot->name);
3940 			goto out;
3941 		}
3942 
3943 		if (req_prot_init(prot))
3944 			goto out_free_request_sock_slab;
3945 
3946 		if (tw_prot_init(prot))
3947 			goto out_free_timewait_sock_slab;
3948 	}
3949 
3950 	mutex_lock(&proto_list_mutex);
3951 	ret = assign_proto_idx(prot);
3952 	if (ret) {
3953 		mutex_unlock(&proto_list_mutex);
3954 		goto out_free_timewait_sock_slab;
3955 	}
3956 	list_add(&prot->node, &proto_list);
3957 	mutex_unlock(&proto_list_mutex);
3958 	return ret;
3959 
3960 out_free_timewait_sock_slab:
3961 	if (alloc_slab)
3962 		tw_prot_cleanup(prot->twsk_prot);
3963 out_free_request_sock_slab:
3964 	if (alloc_slab) {
3965 		req_prot_cleanup(prot->rsk_prot);
3966 
3967 		kmem_cache_destroy(prot->slab);
3968 		prot->slab = NULL;
3969 	}
3970 out:
3971 	return ret;
3972 }
3973 EXPORT_SYMBOL(proto_register);
3974 
3975 void proto_unregister(struct proto *prot)
3976 {
3977 	mutex_lock(&proto_list_mutex);
3978 	release_proto_idx(prot);
3979 	list_del(&prot->node);
3980 	mutex_unlock(&proto_list_mutex);
3981 
3982 	kmem_cache_destroy(prot->slab);
3983 	prot->slab = NULL;
3984 
3985 	req_prot_cleanup(prot->rsk_prot);
3986 	tw_prot_cleanup(prot->twsk_prot);
3987 }
3988 EXPORT_SYMBOL(proto_unregister);
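/* Editor's sketch (not part of the original file): the minimal shape of a
 * proto_register()/proto_unregister() pair as used by protocol modules.
 * "foo" and struct foo_sock are hypothetical; a real protocol also wires the
 * proto into its socket-creation path (net_proto_family, inet_protosw, ...).
 */
struct foo_sock {
	struct sock sk;		/* struct sock must be the first member */
	int foo_state;
};

static struct proto foo_proto = {
	.name		= "FOO",
	.owner		= THIS_MODULE,
	.obj_size	= sizeof(struct foo_sock),
};

static int __init foo_proto_init(void)
{
	/* non-zero second argument requests a dedicated slab of obj_size */
	return proto_register(&foo_proto, 1);
}

static void __exit foo_proto_exit(void)
{
	proto_unregister(&foo_proto);
}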
3989 
3990 int sock_load_diag_module(int family, int protocol)
3991 {
3992 	if (!protocol) {
3993 		if (!sock_is_registered(family))
3994 			return -ENOENT;
3995 
3996 		return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
3997 				      NETLINK_SOCK_DIAG, family);
3998 	}
3999 
4000 #ifdef CONFIG_INET
4001 	if (family == AF_INET &&
4002 	    protocol != IPPROTO_RAW &&
4003 	    protocol < MAX_INET_PROTOS &&
4004 	    !rcu_access_pointer(inet_protos[protocol]))
4005 		return -ENOENT;
4006 #endif
4007 
4008 	return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK,
4009 			      NETLINK_SOCK_DIAG, family, protocol);
4010 }
4011 EXPORT_SYMBOL(sock_load_diag_module);
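/* Editor's note (not part of the original file): the request_module() strings
 * above follow the MODULE_ALIAS_NET_PF_PROTO_TYPE() convention used by the
 * *_diag modules. For example, with PF_NETLINK == 16, NETLINK_SOCK_DIAG == 4,
 * AF_INET == 2 and IPPROTO_TCP == 6, asking for TCP diag resolves to the
 * alias "net-pf-16-proto-4-type-2-6", which the tcp_diag module advertises.
 */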
4012 
4013 #ifdef CONFIG_PROC_FS
4014 static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
4015 	__acquires(proto_list_mutex)
4016 {
4017 	mutex_lock(&proto_list_mutex);
4018 	return seq_list_start_head(&proto_list, *pos);
4019 }
4020 
4021 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4022 {
4023 	return seq_list_next(v, &proto_list, pos);
4024 }
4025 
4026 static void proto_seq_stop(struct seq_file *seq, void *v)
4027 	__releases(proto_list_mutex)
4028 {
4029 	mutex_unlock(&proto_list_mutex);
4030 }
4031 
4032 static char proto_method_implemented(const void *method)
4033 {
4034 	return method == NULL ? 'n' : 'y';
4035 }
4036 static long sock_prot_memory_allocated(struct proto *proto)
4037 {
4038 	return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
4039 }
4040 
4041 static const char *sock_prot_memory_pressure(struct proto *proto)
4042 {
4043 	return proto->memory_pressure != NULL ?
4044 	proto_memory_pressure(proto) ? "yes" : "no" : "NI";
4045 }
4046 
4047 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
4048 {
4049 
4050 	seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
4051 			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
4052 		   proto->name,
4053 		   proto->obj_size,
4054 		   sock_prot_inuse_get(seq_file_net(seq), proto),
4055 		   sock_prot_memory_allocated(proto),
4056 		   sock_prot_memory_pressure(proto),
4057 		   proto->max_header,
4058 		   proto->slab == NULL ? "no" : "yes",
4059 		   module_name(proto->owner),
4060 		   proto_method_implemented(proto->close),
4061 		   proto_method_implemented(proto->connect),
4062 		   proto_method_implemented(proto->disconnect),
4063 		   proto_method_implemented(proto->accept),
4064 		   proto_method_implemented(proto->ioctl),
4065 		   proto_method_implemented(proto->init),
4066 		   proto_method_implemented(proto->destroy),
4067 		   proto_method_implemented(proto->shutdown),
4068 		   proto_method_implemented(proto->setsockopt),
4069 		   proto_method_implemented(proto->getsockopt),
4070 		   proto_method_implemented(proto->sendmsg),
4071 		   proto_method_implemented(proto->recvmsg),
4072 		   proto_method_implemented(proto->bind),
4073 		   proto_method_implemented(proto->backlog_rcv),
4074 		   proto_method_implemented(proto->hash),
4075 		   proto_method_implemented(proto->unhash),
4076 		   proto_method_implemented(proto->get_port),
4077 		   proto_method_implemented(proto->enter_memory_pressure));
4078 }
4079 
4080 static int proto_seq_show(struct seq_file *seq, void *v)
4081 {
4082 	if (v == &proto_list)
4083 		seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
4084 			   "protocol",
4085 			   "size",
4086 			   "sockets",
4087 			   "memory",
4088 			   "press",
4089 			   "maxhdr",
4090 			   "slab",
4091 			   "module",
4092 			   "cl co di ac io in de sh ss gs se re bi br ha uh gp em\n");
4093 	else
4094 		proto_seq_printf(seq, list_entry(v, struct proto, node));
4095 	return 0;
4096 }
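/* Editor's note (not part of the original file): each /proc/net/protocols row
 * emitted above reports, per registered protocol: name, object size, sockets
 * in use, pages of protocol memory (-1 if not accounted), memory-pressure
 * state ("NI" if not implemented), max header size, whether a dedicated slab
 * exists, the owning module, and then one y/n flag per method in the cl..em
 * column order of the header line.
 */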
4097 
4098 static const struct seq_operations proto_seq_ops = {
4099 	.start  = proto_seq_start,
4100 	.next   = proto_seq_next,
4101 	.stop   = proto_seq_stop,
4102 	.show   = proto_seq_show,
4103 };
4104 
4105 static __net_init int proto_init_net(struct net *net)
4106 {
4107 	if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops,
4108 			sizeof(struct seq_net_private)))
4109 		return -ENOMEM;
4110 
4111 	return 0;
4112 }
4113 
4114 static __net_exit void proto_exit_net(struct net *net)
4115 {
4116 	remove_proc_entry("protocols", net->proc_net);
4117 }
4118 
4119 
4120 static __net_initdata struct pernet_operations proto_net_ops = {
4121 	.init = proto_init_net,
4122 	.exit = proto_exit_net,
4123 };
4124 
4125 static int __init proto_init(void)
4126 {
4127 	return register_pernet_subsys(&proto_net_ops);
4128 }
4129 
4130 subsys_initcall(proto_init);
4131 
4132 #endif /* PROC_FS */
4133 
4134 #ifdef CONFIG_NET_RX_BUSY_POLL
4135 bool sk_busy_loop_end(void *p, unsigned long start_time)
4136 {
4137 	struct sock *sk = p;
4138 
4139 	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
4140 		return true;
4141 
4142 	if (sk_is_udp(sk) &&
4143 	    !skb_queue_empty_lockless(&udp_sk(sk)->reader_queue))
4144 		return true;
4145 
4146 	return sk_busy_loop_timeout(sk, start_time);
4147 }
4148 EXPORT_SYMBOL(sk_busy_loop_end);
4149 #endif /* CONFIG_NET_RX_BUSY_POLL */
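/* Editor's sketch (userspace, not part of this kernel file): sk_busy_loop_end()
 * above is the termination check for busy polling, which userspace opts into
 * globally (net.core.busy_read / net.core.busy_poll sysctls) or per socket
 * via SO_BUSY_POLL, a budget in microseconds. The value 50 below is only an
 * example; older libcs may not expose SO_BUSY_POLL and need the UAPI headers.
 */
#include <sys/socket.h>

static int enable_busy_poll(int fd)
{
	int busy_poll_usec = 50;

	return setsockopt(fd, SOL_SOCKET, SO_BUSY_POLL,
			  &busy_poll_usec, sizeof(busy_poll_usec));
}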
4150 
4151 int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len)
4152 {
4153 	if (!sk->sk_prot->bind_add)
4154 		return -EOPNOTSUPP;
4155 	return sk->sk_prot->bind_add(sk, addr, addr_len);
4156 }
4157 EXPORT_SYMBOL(sock_bind_add);
4158 
4159 /* Copy 'size' bytes from userspace and return 'size' bytes back to userspace */
4160 int sock_ioctl_inout(struct sock *sk, unsigned int cmd,
4161 		     void __user *arg, void *karg, size_t size)
4162 {
4163 	int ret;
4164 
4165 	if (copy_from_user(karg, arg, size))
4166 		return -EFAULT;
4167 
4168 	ret = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, karg);
4169 	if (ret)
4170 		return ret;
4171 
4172 	if (copy_to_user(arg, karg, size))
4173 		return -EFAULT;
4174 
4175 	return 0;
4176 }
4177 EXPORT_SYMBOL(sock_ioctl_inout);
4178 
4179 /* This is the most common ioctl prep function, where the result (4 bytes) is
4180  * copied back to userspace if the ioctl() returns successfully. No input
4181  * argument is copied from userspace.
4182  */
4183 static int sock_ioctl_out(struct sock *sk, unsigned int cmd, void __user *arg)
4184 {
4185 	int ret, karg = 0;
4186 
4187 	ret = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, &karg);
4188 	if (ret)
4189 		return ret;
4190 
4191 	return put_user(karg, (int __user *)arg);
4192 }
4193 
4194 /* A wrapper around sock ioctls, which copies the data from userspace
4195  * (depending on the protocol/ioctl), and copies the result back to userspace.
4196  * The main motivation for this function is to pass kernel memory to the
4197  * protocol ioctl callbacks, instead of userspace memory.
4198  */
4199 int sk_ioctl(struct sock *sk, unsigned int cmd, void __user *arg)
4200 {
4201 	int rc = 1;
4202 
4203 	if (sk->sk_type == SOCK_RAW && sk->sk_family == AF_INET)
4204 		rc = ipmr_sk_ioctl(sk, cmd, arg);
4205 	else if (sk->sk_type == SOCK_RAW && sk->sk_family == AF_INET6)
4206 		rc = ip6mr_sk_ioctl(sk, cmd, arg);
4207 	else if (sk_is_phonet(sk))
4208 		rc = phonet_sk_ioctl(sk, cmd, arg);
4209 
4210 	/* If the ioctl was processed, return its value */
4211 	if (rc <= 0)
4212 		return rc;
4213 
4214 	/* Otherwise call the default handler */
4215 	return sock_ioctl_out(sk, cmd, arg);
4216 }
4217 EXPORT_SYMBOL(sk_ioctl);
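/* Editor's sketch (userspace, not part of this kernel file): protocol ioctls
 * that only report a 32-bit value, such as SIOCINQ/SIOCOUTQ on TCP and UDP
 * sockets, reach sk_ioctl() and take the sock_ioctl_out() path above, with
 * put_user() delivering the result. bytes_unread() is a hypothetical helper.
 */
#include <sys/ioctl.h>
#include <linux/sockios.h>

static int bytes_unread(int fd)
{
	int pending = 0;

	/* SIOCINQ: bytes currently queued for reading on the socket */
	if (ioctl(fd, SIOCINQ, &pending) < 0)
		return -1;
	return pending;
}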
4218