xref: /openbmc/linux/net/core/sock.c (revision ac3899c6)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * INET		An implementation of the TCP/IP protocol suite for the LINUX
4  *		operating system.  INET is implemented using the  BSD Socket
5  *		interface as the means of communication with the user level.
6  *
7  *		Generic socket support routines. Memory allocators, socket lock/release
8  *		handler for protocols to use and generic option handler.
9  *
10  * Authors:	Ross Biro
11  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *		Florian La Roche, <flla@stud.uni-sb.de>
13  *		Alan Cox, <A.Cox@swansea.ac.uk>
14  *
15  * Fixes:
16  *		Alan Cox	: 	Numerous verify_area() problems
17  *		Alan Cox	:	Connecting on a connecting socket
18  *					now returns an error for tcp.
19  *		Alan Cox	:	sock->protocol is set correctly.
20  *					and is not sometimes left as 0.
21  *		Alan Cox	:	connect handles icmp errors on a
22  *					connect properly. Unfortunately there
23  *					is a restart syscall nasty there. I
24  *					can't match BSD without hacking the C
25  *					library. Ideas urgently sought!
26  *		Alan Cox	:	Disallow bind() to addresses that are
27  *					not ours - especially broadcast ones!!
28  *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
29  *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
30  *					instead they leave that for the DESTROY timer.
31  *		Alan Cox	:	Clean up error flag in accept
32  *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
33  *					was buggy. Put a remove_sock() in the handler
34  *					for memory when we hit 0. Also altered the timer
35  *					code. The ACK stuff can wait and needs major
36  *					TCP layer surgery.
37  *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
38  *					and fixed timer/inet_bh race.
39  *		Alan Cox	:	Added zapped flag for TCP
40  *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
41  *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
42  *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
43  *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
44  *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
45  *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
46  *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
47  *	Pauline Middelink	:	identd support
48  *		Alan Cox	:	Fixed connect() taking signals I think.
49  *		Alan Cox	:	SO_LINGER supported
50  *		Alan Cox	:	Error reporting fixes
51  *		Anonymous	:	inet_create tidied up (sk->reuse setting)
52  *		Alan Cox	:	inet sockets don't set sk->type!
53  *		Alan Cox	:	Split socket option code
54  *		Alan Cox	:	Callbacks
55  *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
56  *		Alex		:	Removed restriction on inet fioctl
57  *		Alan Cox	:	Splitting INET from NET core
58  *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
59  *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
60  *		Alan Cox	:	Split IP from generic code
61  *		Alan Cox	:	New kfree_skbmem()
62  *		Alan Cox	:	Make SO_DEBUG superuser only.
63  *		Alan Cox	:	Allow anyone to clear SO_DEBUG
64  *					(compatibility fix)
65  *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
66  *		Alan Cox	:	Allocator for a socket is settable.
67  *		Alan Cox	:	SO_ERROR includes soft errors.
68  *		Alan Cox	:	Allow NULL arguments on some SO_ opts
69  *		Alan Cox	: 	Generic socket allocation to make hooks
70  *					easier (suggested by Craig Metz).
71  *		Michael Pall	:	SO_ERROR returns positive errno again
72  *              Steve Whitehouse:       Added default destructor to free
73  *                                      protocol private data.
74  *              Steve Whitehouse:       Added various other default routines
75  *                                      common to several socket families.
76  *              Chris Evans     :       Call suser() check last on F_SETOWN
77  *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
78  *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
79  *		Andi Kleen	:	Fix write_space callback
80  *		Chris Evans	:	Security fixes - signedness again
81  *		Arnaldo C. Melo :       cleanups, use skb_queue_purge
82  *
83  * To Fix:
84  */
85 
86 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
87 
88 #include <asm/unaligned.h>
89 #include <linux/capability.h>
90 #include <linux/errno.h>
91 #include <linux/errqueue.h>
92 #include <linux/types.h>
93 #include <linux/socket.h>
94 #include <linux/in.h>
95 #include <linux/kernel.h>
96 #include <linux/module.h>
97 #include <linux/proc_fs.h>
98 #include <linux/seq_file.h>
99 #include <linux/sched.h>
100 #include <linux/sched/mm.h>
101 #include <linux/timer.h>
102 #include <linux/string.h>
103 #include <linux/sockios.h>
104 #include <linux/net.h>
105 #include <linux/mm.h>
106 #include <linux/slab.h>
107 #include <linux/interrupt.h>
108 #include <linux/poll.h>
109 #include <linux/tcp.h>
110 #include <linux/init.h>
111 #include <linux/highmem.h>
112 #include <linux/user_namespace.h>
113 #include <linux/static_key.h>
114 #include <linux/memcontrol.h>
115 #include <linux/prefetch.h>
116 #include <linux/compat.h>
117 #include <linux/mroute.h>
118 #include <linux/mroute6.h>
119 #include <linux/icmpv6.h>
120 
121 #include <linux/uaccess.h>
122 
123 #include <linux/netdevice.h>
124 #include <net/protocol.h>
125 #include <linux/skbuff.h>
126 #include <net/net_namespace.h>
127 #include <net/request_sock.h>
128 #include <net/sock.h>
129 #include <linux/net_tstamp.h>
130 #include <net/xfrm.h>
131 #include <linux/ipsec.h>
132 #include <net/cls_cgroup.h>
133 #include <net/netprio_cgroup.h>
134 #include <linux/sock_diag.h>
135 
136 #include <linux/filter.h>
137 #include <net/sock_reuseport.h>
138 #include <net/bpf_sk_storage.h>
139 
140 #include <trace/events/sock.h>
141 
142 #include <net/tcp.h>
143 #include <net/busy_poll.h>
144 #include <net/phonet/phonet.h>
145 
146 #include <linux/ethtool.h>
147 
148 #include "dev.h"
149 
150 static DEFINE_MUTEX(proto_list_mutex);
151 static LIST_HEAD(proto_list);
152 
153 static void sock_def_write_space_wfree(struct sock *sk);
154 static void sock_def_write_space(struct sock *sk);
155 
156 /**
157  * sk_ns_capable - General socket capability test
158  * @sk: Socket to use a capability on or through
159  * @user_ns: The user namespace of the capability to use
160  * @cap: The capability to use
161  *
162  * Test to see if the opener of the socket had when the socket was
163  * created and the current process has the capability @cap in the user
164  * namespace @user_ns.
165  */
166 bool sk_ns_capable(const struct sock *sk,
167 		   struct user_namespace *user_ns, int cap)
168 {
169 	return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
170 		ns_capable(user_ns, cap);
171 }
172 EXPORT_SYMBOL(sk_ns_capable);
173 
174 /**
175  * sk_capable - Socket global capability test
176  * @sk: Socket to use a capability on or through
177  * @cap: The global capability to use
178  *
179  * Test to see if the opener of the socket had when the socket was
180  * created and the current process has the capability @cap in all user
181  * namespaces.
182  */
183 bool sk_capable(const struct sock *sk, int cap)
184 {
185 	return sk_ns_capable(sk, &init_user_ns, cap);
186 }
187 EXPORT_SYMBOL(sk_capable);
188 
189 /**
190  * sk_net_capable - Network namespace socket capability test
191  * @sk: Socket to use a capability on or through
192  * @cap: The capability to use
193  *
194  * Test to see if the opener of the socket had when the socket was created
195  * and the current process has the capability @cap over the network namespace
196  * the socket is a member of.
197  */
198 bool sk_net_capable(const struct sock *sk, int cap)
199 {
200 	return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
201 }
202 EXPORT_SYMBOL(sk_net_capable);
203 
204 /*
205  * Each address family might have different locking rules, so we have
206  * one slock key per address family and separate keys for internal and
207  * userspace sockets.
208  */
209 static struct lock_class_key af_family_keys[AF_MAX];
210 static struct lock_class_key af_family_kern_keys[AF_MAX];
211 static struct lock_class_key af_family_slock_keys[AF_MAX];
212 static struct lock_class_key af_family_kern_slock_keys[AF_MAX];
213 
214 /*
215  * Make lock validator output more readable. (we pre-construct these
216  * strings build-time, so that runtime initialization of socket
217  * locks is fast):
218  */
219 
220 #define _sock_locks(x)						  \
221   x "AF_UNSPEC",	x "AF_UNIX"     ,	x "AF_INET"     , \
222   x "AF_AX25"  ,	x "AF_IPX"      ,	x "AF_APPLETALK", \
223   x "AF_NETROM",	x "AF_BRIDGE"   ,	x "AF_ATMPVC"   , \
224   x "AF_X25"   ,	x "AF_INET6"    ,	x "AF_ROSE"     , \
225   x "AF_DECnet",	x "AF_NETBEUI"  ,	x "AF_SECURITY" , \
226   x "AF_KEY"   ,	x "AF_NETLINK"  ,	x "AF_PACKET"   , \
227   x "AF_ASH"   ,	x "AF_ECONET"   ,	x "AF_ATMSVC"   , \
228   x "AF_RDS"   ,	x "AF_SNA"      ,	x "AF_IRDA"     , \
229   x "AF_PPPOX" ,	x "AF_WANPIPE"  ,	x "AF_LLC"      , \
230   x "27"       ,	x "28"          ,	x "AF_CAN"      , \
231   x "AF_TIPC"  ,	x "AF_BLUETOOTH",	x "IUCV"        , \
232   x "AF_RXRPC" ,	x "AF_ISDN"     ,	x "AF_PHONET"   , \
233   x "AF_IEEE802154",	x "AF_CAIF"	,	x "AF_ALG"      , \
234   x "AF_NFC"   ,	x "AF_VSOCK"    ,	x "AF_KCM"      , \
235   x "AF_QIPCRTR",	x "AF_SMC"	,	x "AF_XDP"	, \
236   x "AF_MCTP"  , \
237   x "AF_MAX"
238 
239 static const char *const af_family_key_strings[AF_MAX+1] = {
240 	_sock_locks("sk_lock-")
241 };
242 static const char *const af_family_slock_key_strings[AF_MAX+1] = {
243 	_sock_locks("slock-")
244 };
245 static const char *const af_family_clock_key_strings[AF_MAX+1] = {
246 	_sock_locks("clock-")
247 };
248 
249 static const char *const af_family_kern_key_strings[AF_MAX+1] = {
250 	_sock_locks("k-sk_lock-")
251 };
252 static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
253 	_sock_locks("k-slock-")
254 };
255 static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
256 	_sock_locks("k-clock-")
257 };
258 static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
259 	_sock_locks("rlock-")
260 };
261 static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
262 	_sock_locks("wlock-")
263 };
264 static const char *const af_family_elock_key_strings[AF_MAX+1] = {
265 	_sock_locks("elock-")
266 };
267 
268 /*
269  * sk_callback_lock and sk queues locking rules are per-address-family,
270  * so split the lock classes by using a per-AF key:
271  */
272 static struct lock_class_key af_callback_keys[AF_MAX];
273 static struct lock_class_key af_rlock_keys[AF_MAX];
274 static struct lock_class_key af_wlock_keys[AF_MAX];
275 static struct lock_class_key af_elock_keys[AF_MAX];
276 static struct lock_class_key af_kern_callback_keys[AF_MAX];
277 
278 /* Run time adjustable parameters. */
279 __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
280 EXPORT_SYMBOL(sysctl_wmem_max);
281 __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
282 EXPORT_SYMBOL(sysctl_rmem_max);
283 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
284 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
285 
286 /* Maximal space eaten by iovec or ancillary data plus some space */
287 int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
288 EXPORT_SYMBOL(sysctl_optmem_max);
289 
290 int sysctl_tstamp_allow_data __read_mostly = 1;
291 
292 DEFINE_STATIC_KEY_FALSE(memalloc_socks_key);
293 EXPORT_SYMBOL_GPL(memalloc_socks_key);
294 
295 /**
296  * sk_set_memalloc - sets %SOCK_MEMALLOC
297  * @sk: socket to set it on
298  *
299  * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
300  * It's the responsibility of the admin to adjust min_free_kbytes
301  * to meet the requirements
302  */
303 void sk_set_memalloc(struct sock *sk)
304 {
305 	sock_set_flag(sk, SOCK_MEMALLOC);
306 	sk->sk_allocation |= __GFP_MEMALLOC;
307 	static_branch_inc(&memalloc_socks_key);
308 }
309 EXPORT_SYMBOL_GPL(sk_set_memalloc);
310 
311 void sk_clear_memalloc(struct sock *sk)
312 {
313 	sock_reset_flag(sk, SOCK_MEMALLOC);
314 	sk->sk_allocation &= ~__GFP_MEMALLOC;
315 	static_branch_dec(&memalloc_socks_key);
316 
317 	/*
318 	 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
319 	 * progress of swapping. SOCK_MEMALLOC may be cleared while
320 	 * it has rmem allocations due to the last swapfile being deactivated
321 	 * but there is a risk that the socket is unusable due to exceeding
322 	 * the rmem limits. Reclaim the reserves and obey rmem limits again.
323 	 */
324 	sk_mem_reclaim(sk);
325 }
326 EXPORT_SYMBOL_GPL(sk_clear_memalloc);
327 
328 int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
329 {
330 	int ret;
331 	unsigned int noreclaim_flag;
332 
333 	/* these should have been dropped before queueing */
334 	BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
335 
336 	noreclaim_flag = memalloc_noreclaim_save();
337 	ret = INDIRECT_CALL_INET(sk->sk_backlog_rcv,
338 				 tcp_v6_do_rcv,
339 				 tcp_v4_do_rcv,
340 				 sk, skb);
341 	memalloc_noreclaim_restore(noreclaim_flag);
342 
343 	return ret;
344 }
345 EXPORT_SYMBOL(__sk_backlog_rcv);
346 
347 void sk_error_report(struct sock *sk)
348 {
349 	sk->sk_error_report(sk);
350 
351 	switch (sk->sk_family) {
352 	case AF_INET:
353 		fallthrough;
354 	case AF_INET6:
355 		trace_inet_sk_error_report(sk);
356 		break;
357 	default:
358 		break;
359 	}
360 }
361 EXPORT_SYMBOL(sk_error_report);
362 
363 int sock_get_timeout(long timeo, void *optval, bool old_timeval)
364 {
365 	struct __kernel_sock_timeval tv;
366 
367 	if (timeo == MAX_SCHEDULE_TIMEOUT) {
368 		tv.tv_sec = 0;
369 		tv.tv_usec = 0;
370 	} else {
371 		tv.tv_sec = timeo / HZ;
372 		tv.tv_usec = ((timeo % HZ) * USEC_PER_SEC) / HZ;
373 	}
374 
375 	if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
376 		struct old_timeval32 tv32 = { tv.tv_sec, tv.tv_usec };
377 		*(struct old_timeval32 *)optval = tv32;
378 		return sizeof(tv32);
379 	}
380 
381 	if (old_timeval) {
382 		struct __kernel_old_timeval old_tv;
383 		old_tv.tv_sec = tv.tv_sec;
384 		old_tv.tv_usec = tv.tv_usec;
385 		*(struct __kernel_old_timeval *)optval = old_tv;
386 		return sizeof(old_tv);
387 	}
388 
389 	*(struct __kernel_sock_timeval *)optval = tv;
390 	return sizeof(tv);
391 }
392 EXPORT_SYMBOL(sock_get_timeout);
393 
394 int sock_copy_user_timeval(struct __kernel_sock_timeval *tv,
395 			   sockptr_t optval, int optlen, bool old_timeval)
396 {
397 	if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
398 		struct old_timeval32 tv32;
399 
400 		if (optlen < sizeof(tv32))
401 			return -EINVAL;
402 
403 		if (copy_from_sockptr(&tv32, optval, sizeof(tv32)))
404 			return -EFAULT;
405 		tv->tv_sec = tv32.tv_sec;
406 		tv->tv_usec = tv32.tv_usec;
407 	} else if (old_timeval) {
408 		struct __kernel_old_timeval old_tv;
409 
410 		if (optlen < sizeof(old_tv))
411 			return -EINVAL;
412 		if (copy_from_sockptr(&old_tv, optval, sizeof(old_tv)))
413 			return -EFAULT;
414 		tv->tv_sec = old_tv.tv_sec;
415 		tv->tv_usec = old_tv.tv_usec;
416 	} else {
417 		if (optlen < sizeof(*tv))
418 			return -EINVAL;
419 		if (copy_from_sockptr(tv, optval, sizeof(*tv)))
420 			return -EFAULT;
421 	}
422 
423 	return 0;
424 }
425 EXPORT_SYMBOL(sock_copy_user_timeval);
426 
427 static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen,
428 			    bool old_timeval)
429 {
430 	struct __kernel_sock_timeval tv;
431 	int err = sock_copy_user_timeval(&tv, optval, optlen, old_timeval);
432 	long val;
433 
434 	if (err)
435 		return err;
436 
437 	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
438 		return -EDOM;
439 
440 	if (tv.tv_sec < 0) {
441 		static int warned __read_mostly;
442 
443 		WRITE_ONCE(*timeo_p, 0);
444 		if (warned < 10 && net_ratelimit()) {
445 			warned++;
446 			pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
447 				__func__, current->comm, task_pid_nr(current));
448 		}
449 		return 0;
450 	}
451 	val = MAX_SCHEDULE_TIMEOUT;
452 	if ((tv.tv_sec || tv.tv_usec) &&
453 	    (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1)))
454 		val = tv.tv_sec * HZ + DIV_ROUND_UP((unsigned long)tv.tv_usec,
455 						    USEC_PER_SEC / HZ);
456 	WRITE_ONCE(*timeo_p, val);
457 	return 0;
458 }
459 
460 static bool sock_needs_netstamp(const struct sock *sk)
461 {
462 	switch (sk->sk_family) {
463 	case AF_UNSPEC:
464 	case AF_UNIX:
465 		return false;
466 	default:
467 		return true;
468 	}
469 }
470 
471 static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
472 {
473 	if (sk->sk_flags & flags) {
474 		sk->sk_flags &= ~flags;
475 		if (sock_needs_netstamp(sk) &&
476 		    !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
477 			net_disable_timestamp();
478 	}
479 }
480 
481 
482 int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
483 {
484 	unsigned long flags;
485 	struct sk_buff_head *list = &sk->sk_receive_queue;
486 
487 	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
488 		atomic_inc(&sk->sk_drops);
489 		trace_sock_rcvqueue_full(sk, skb);
490 		return -ENOMEM;
491 	}
492 
493 	if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
494 		atomic_inc(&sk->sk_drops);
495 		return -ENOBUFS;
496 	}
497 
498 	skb->dev = NULL;
499 	skb_set_owner_r(skb, sk);
500 
501 	/* we escape from rcu protected region, make sure we dont leak
502 	 * a norefcounted dst
503 	 */
504 	skb_dst_force(skb);
505 
506 	spin_lock_irqsave(&list->lock, flags);
507 	sock_skb_set_dropcount(sk, skb);
508 	__skb_queue_tail(list, skb);
509 	spin_unlock_irqrestore(&list->lock, flags);
510 
511 	if (!sock_flag(sk, SOCK_DEAD))
512 		sk->sk_data_ready(sk);
513 	return 0;
514 }
515 EXPORT_SYMBOL(__sock_queue_rcv_skb);
516 
517 int sock_queue_rcv_skb_reason(struct sock *sk, struct sk_buff *skb,
518 			      enum skb_drop_reason *reason)
519 {
520 	enum skb_drop_reason drop_reason;
521 	int err;
522 
523 	err = sk_filter(sk, skb);
524 	if (err) {
525 		drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
526 		goto out;
527 	}
528 	err = __sock_queue_rcv_skb(sk, skb);
529 	switch (err) {
530 	case -ENOMEM:
531 		drop_reason = SKB_DROP_REASON_SOCKET_RCVBUFF;
532 		break;
533 	case -ENOBUFS:
534 		drop_reason = SKB_DROP_REASON_PROTO_MEM;
535 		break;
536 	default:
537 		drop_reason = SKB_NOT_DROPPED_YET;
538 		break;
539 	}
540 out:
541 	if (reason)
542 		*reason = drop_reason;
543 	return err;
544 }
545 EXPORT_SYMBOL(sock_queue_rcv_skb_reason);
546 
547 int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
548 		     const int nested, unsigned int trim_cap, bool refcounted)
549 {
550 	int rc = NET_RX_SUCCESS;
551 
552 	if (sk_filter_trim_cap(sk, skb, trim_cap))
553 		goto discard_and_relse;
554 
555 	skb->dev = NULL;
556 
557 	if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
558 		atomic_inc(&sk->sk_drops);
559 		goto discard_and_relse;
560 	}
561 	if (nested)
562 		bh_lock_sock_nested(sk);
563 	else
564 		bh_lock_sock(sk);
565 	if (!sock_owned_by_user(sk)) {
566 		/*
567 		 * trylock + unlock semantics:
568 		 */
569 		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
570 
571 		rc = sk_backlog_rcv(sk, skb);
572 
573 		mutex_release(&sk->sk_lock.dep_map, _RET_IP_);
574 	} else if (sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf))) {
575 		bh_unlock_sock(sk);
576 		atomic_inc(&sk->sk_drops);
577 		goto discard_and_relse;
578 	}
579 
580 	bh_unlock_sock(sk);
581 out:
582 	if (refcounted)
583 		sock_put(sk);
584 	return rc;
585 discard_and_relse:
586 	kfree_skb(skb);
587 	goto out;
588 }
589 EXPORT_SYMBOL(__sk_receive_skb);
590 
591 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ip6_dst_check(struct dst_entry *,
592 							  u32));
593 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
594 							   u32));
595 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
596 {
597 	struct dst_entry *dst = __sk_dst_get(sk);
598 
599 	if (dst && dst->obsolete &&
600 	    INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
601 			       dst, cookie) == NULL) {
602 		sk_tx_queue_clear(sk);
603 		sk->sk_dst_pending_confirm = 0;
604 		RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
605 		dst_release(dst);
606 		return NULL;
607 	}
608 
609 	return dst;
610 }
611 EXPORT_SYMBOL(__sk_dst_check);
612 
613 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
614 {
615 	struct dst_entry *dst = sk_dst_get(sk);
616 
617 	if (dst && dst->obsolete &&
618 	    INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
619 			       dst, cookie) == NULL) {
620 		sk_dst_reset(sk);
621 		dst_release(dst);
622 		return NULL;
623 	}
624 
625 	return dst;
626 }
627 EXPORT_SYMBOL(sk_dst_check);
628 
629 static int sock_bindtoindex_locked(struct sock *sk, int ifindex)
630 {
631 	int ret = -ENOPROTOOPT;
632 #ifdef CONFIG_NETDEVICES
633 	struct net *net = sock_net(sk);
634 
635 	/* Sorry... */
636 	ret = -EPERM;
637 	if (sk->sk_bound_dev_if && !ns_capable(net->user_ns, CAP_NET_RAW))
638 		goto out;
639 
640 	ret = -EINVAL;
641 	if (ifindex < 0)
642 		goto out;
643 
644 	/* Paired with all READ_ONCE() done locklessly. */
645 	WRITE_ONCE(sk->sk_bound_dev_if, ifindex);
646 
647 	if (sk->sk_prot->rehash)
648 		sk->sk_prot->rehash(sk);
649 	sk_dst_reset(sk);
650 
651 	ret = 0;
652 
653 out:
654 #endif
655 
656 	return ret;
657 }
658 
659 int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk)
660 {
661 	int ret;
662 
663 	if (lock_sk)
664 		lock_sock(sk);
665 	ret = sock_bindtoindex_locked(sk, ifindex);
666 	if (lock_sk)
667 		release_sock(sk);
668 
669 	return ret;
670 }
671 EXPORT_SYMBOL(sock_bindtoindex);
672 
673 static int sock_setbindtodevice(struct sock *sk, sockptr_t optval, int optlen)
674 {
675 	int ret = -ENOPROTOOPT;
676 #ifdef CONFIG_NETDEVICES
677 	struct net *net = sock_net(sk);
678 	char devname[IFNAMSIZ];
679 	int index;
680 
681 	ret = -EINVAL;
682 	if (optlen < 0)
683 		goto out;
684 
685 	/* Bind this socket to a particular device like "eth0",
686 	 * as specified in the passed interface name. If the
687 	 * name is "" or the option length is zero the socket
688 	 * is not bound.
689 	 */
690 	if (optlen > IFNAMSIZ - 1)
691 		optlen = IFNAMSIZ - 1;
692 	memset(devname, 0, sizeof(devname));
693 
694 	ret = -EFAULT;
695 	if (copy_from_sockptr(devname, optval, optlen))
696 		goto out;
697 
698 	index = 0;
699 	if (devname[0] != '\0') {
700 		struct net_device *dev;
701 
702 		rcu_read_lock();
703 		dev = dev_get_by_name_rcu(net, devname);
704 		if (dev)
705 			index = dev->ifindex;
706 		rcu_read_unlock();
707 		ret = -ENODEV;
708 		if (!dev)
709 			goto out;
710 	}
711 
712 	sockopt_lock_sock(sk);
713 	ret = sock_bindtoindex_locked(sk, index);
714 	sockopt_release_sock(sk);
715 out:
716 #endif
717 
718 	return ret;
719 }
720 
721 static int sock_getbindtodevice(struct sock *sk, sockptr_t optval,
722 				sockptr_t optlen, int len)
723 {
724 	int ret = -ENOPROTOOPT;
725 #ifdef CONFIG_NETDEVICES
726 	int bound_dev_if = READ_ONCE(sk->sk_bound_dev_if);
727 	struct net *net = sock_net(sk);
728 	char devname[IFNAMSIZ];
729 
730 	if (bound_dev_if == 0) {
731 		len = 0;
732 		goto zero;
733 	}
734 
735 	ret = -EINVAL;
736 	if (len < IFNAMSIZ)
737 		goto out;
738 
739 	ret = netdev_get_name(net, devname, bound_dev_if);
740 	if (ret)
741 		goto out;
742 
743 	len = strlen(devname) + 1;
744 
745 	ret = -EFAULT;
746 	if (copy_to_sockptr(optval, devname, len))
747 		goto out;
748 
749 zero:
750 	ret = -EFAULT;
751 	if (copy_to_sockptr(optlen, &len, sizeof(int)))
752 		goto out;
753 
754 	ret = 0;
755 
756 out:
757 #endif
758 
759 	return ret;
760 }
761 
762 bool sk_mc_loop(struct sock *sk)
763 {
764 	if (dev_recursion_level())
765 		return false;
766 	if (!sk)
767 		return true;
768 	switch (sk->sk_family) {
769 	case AF_INET:
770 		return inet_sk(sk)->mc_loop;
771 #if IS_ENABLED(CONFIG_IPV6)
772 	case AF_INET6:
773 		return inet6_sk(sk)->mc_loop;
774 #endif
775 	}
776 	WARN_ON_ONCE(1);
777 	return true;
778 }
779 EXPORT_SYMBOL(sk_mc_loop);
780 
781 void sock_set_reuseaddr(struct sock *sk)
782 {
783 	lock_sock(sk);
784 	sk->sk_reuse = SK_CAN_REUSE;
785 	release_sock(sk);
786 }
787 EXPORT_SYMBOL(sock_set_reuseaddr);
788 
789 void sock_set_reuseport(struct sock *sk)
790 {
791 	lock_sock(sk);
792 	sk->sk_reuseport = true;
793 	release_sock(sk);
794 }
795 EXPORT_SYMBOL(sock_set_reuseport);
796 
797 void sock_no_linger(struct sock *sk)
798 {
799 	lock_sock(sk);
800 	sk->sk_lingertime = 0;
801 	sock_set_flag(sk, SOCK_LINGER);
802 	release_sock(sk);
803 }
804 EXPORT_SYMBOL(sock_no_linger);
805 
806 void sock_set_priority(struct sock *sk, u32 priority)
807 {
808 	lock_sock(sk);
809 	WRITE_ONCE(sk->sk_priority, priority);
810 	release_sock(sk);
811 }
812 EXPORT_SYMBOL(sock_set_priority);
813 
814 void sock_set_sndtimeo(struct sock *sk, s64 secs)
815 {
816 	lock_sock(sk);
817 	if (secs && secs < MAX_SCHEDULE_TIMEOUT / HZ - 1)
818 		WRITE_ONCE(sk->sk_sndtimeo, secs * HZ);
819 	else
820 		WRITE_ONCE(sk->sk_sndtimeo, MAX_SCHEDULE_TIMEOUT);
821 	release_sock(sk);
822 }
823 EXPORT_SYMBOL(sock_set_sndtimeo);
824 
825 static void __sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns)
826 {
827 	if (val)  {
828 		sock_valbool_flag(sk, SOCK_TSTAMP_NEW, new);
829 		sock_valbool_flag(sk, SOCK_RCVTSTAMPNS, ns);
830 		sock_set_flag(sk, SOCK_RCVTSTAMP);
831 		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
832 	} else {
833 		sock_reset_flag(sk, SOCK_RCVTSTAMP);
834 		sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
835 	}
836 }
837 
838 void sock_enable_timestamps(struct sock *sk)
839 {
840 	lock_sock(sk);
841 	__sock_set_timestamps(sk, true, false, true);
842 	release_sock(sk);
843 }
844 EXPORT_SYMBOL(sock_enable_timestamps);
845 
846 void sock_set_timestamp(struct sock *sk, int optname, bool valbool)
847 {
848 	switch (optname) {
849 	case SO_TIMESTAMP_OLD:
850 		__sock_set_timestamps(sk, valbool, false, false);
851 		break;
852 	case SO_TIMESTAMP_NEW:
853 		__sock_set_timestamps(sk, valbool, true, false);
854 		break;
855 	case SO_TIMESTAMPNS_OLD:
856 		__sock_set_timestamps(sk, valbool, false, true);
857 		break;
858 	case SO_TIMESTAMPNS_NEW:
859 		__sock_set_timestamps(sk, valbool, true, true);
860 		break;
861 	}
862 }
863 
864 static int sock_timestamping_bind_phc(struct sock *sk, int phc_index)
865 {
866 	struct net *net = sock_net(sk);
867 	struct net_device *dev = NULL;
868 	bool match = false;
869 	int *vclock_index;
870 	int i, num;
871 
872 	if (sk->sk_bound_dev_if)
873 		dev = dev_get_by_index(net, sk->sk_bound_dev_if);
874 
875 	if (!dev) {
876 		pr_err("%s: sock not bind to device\n", __func__);
877 		return -EOPNOTSUPP;
878 	}
879 
880 	num = ethtool_get_phc_vclocks(dev, &vclock_index);
881 	dev_put(dev);
882 
883 	for (i = 0; i < num; i++) {
884 		if (*(vclock_index + i) == phc_index) {
885 			match = true;
886 			break;
887 		}
888 	}
889 
890 	if (num > 0)
891 		kfree(vclock_index);
892 
893 	if (!match)
894 		return -EINVAL;
895 
896 	sk->sk_bind_phc = phc_index;
897 
898 	return 0;
899 }
900 
901 int sock_set_timestamping(struct sock *sk, int optname,
902 			  struct so_timestamping timestamping)
903 {
904 	int val = timestamping.flags;
905 	int ret;
906 
907 	if (val & ~SOF_TIMESTAMPING_MASK)
908 		return -EINVAL;
909 
910 	if (val & SOF_TIMESTAMPING_OPT_ID_TCP &&
911 	    !(val & SOF_TIMESTAMPING_OPT_ID))
912 		return -EINVAL;
913 
914 	if (val & SOF_TIMESTAMPING_OPT_ID &&
915 	    !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
916 		if (sk_is_tcp(sk)) {
917 			if ((1 << sk->sk_state) &
918 			    (TCPF_CLOSE | TCPF_LISTEN))
919 				return -EINVAL;
920 			if (val & SOF_TIMESTAMPING_OPT_ID_TCP)
921 				atomic_set(&sk->sk_tskey, tcp_sk(sk)->write_seq);
922 			else
923 				atomic_set(&sk->sk_tskey, tcp_sk(sk)->snd_una);
924 		} else {
925 			atomic_set(&sk->sk_tskey, 0);
926 		}
927 	}
928 
929 	if (val & SOF_TIMESTAMPING_OPT_STATS &&
930 	    !(val & SOF_TIMESTAMPING_OPT_TSONLY))
931 		return -EINVAL;
932 
933 	if (val & SOF_TIMESTAMPING_BIND_PHC) {
934 		ret = sock_timestamping_bind_phc(sk, timestamping.bind_phc);
935 		if (ret)
936 			return ret;
937 	}
938 
939 	sk->sk_tsflags = val;
940 	sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW);
941 
942 	if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
943 		sock_enable_timestamp(sk,
944 				      SOCK_TIMESTAMPING_RX_SOFTWARE);
945 	else
946 		sock_disable_timestamp(sk,
947 				       (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
948 	return 0;
949 }
950 
951 void sock_set_keepalive(struct sock *sk)
952 {
953 	lock_sock(sk);
954 	if (sk->sk_prot->keepalive)
955 		sk->sk_prot->keepalive(sk, true);
956 	sock_valbool_flag(sk, SOCK_KEEPOPEN, true);
957 	release_sock(sk);
958 }
959 EXPORT_SYMBOL(sock_set_keepalive);
960 
961 static void __sock_set_rcvbuf(struct sock *sk, int val)
962 {
963 	/* Ensure val * 2 fits into an int, to prevent max_t() from treating it
964 	 * as a negative value.
965 	 */
966 	val = min_t(int, val, INT_MAX / 2);
967 	sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
968 
969 	/* We double it on the way in to account for "struct sk_buff" etc.
970 	 * overhead.   Applications assume that the SO_RCVBUF setting they make
971 	 * will allow that much actual data to be received on that socket.
972 	 *
973 	 * Applications are unaware that "struct sk_buff" and other overheads
974 	 * allocate from the receive buffer during socket buffer allocation.
975 	 *
976 	 * And after considering the possible alternatives, returning the value
977 	 * we actually used in getsockopt is the most desirable behavior.
978 	 */
979 	WRITE_ONCE(sk->sk_rcvbuf, max_t(int, val * 2, SOCK_MIN_RCVBUF));
980 }
981 
/* Locked wrapper around __sock_set_rcvbuf(). */
void sock_set_rcvbuf(struct sock *sk, int val)
{
	lock_sock(sk);

	__sock_set_rcvbuf(sk, val);

	release_sock(sk);
}
EXPORT_SYMBOL(sock_set_rcvbuf);
989 
990 static void __sock_set_mark(struct sock *sk, u32 val)
991 {
992 	if (val != sk->sk_mark) {
993 		WRITE_ONCE(sk->sk_mark, val);
994 		sk_dst_reset(sk);
995 	}
996 }
997 
998 void sock_set_mark(struct sock *sk, u32 val)
999 {
1000 	lock_sock(sk);
1001 	__sock_set_mark(sk, val);
1002 	release_sock(sk);
1003 }
1004 EXPORT_SYMBOL(sock_set_mark);
1005 
1006 static void sock_release_reserved_memory(struct sock *sk, int bytes)
1007 {
1008 	/* Round down bytes to multiple of pages */
1009 	bytes = round_down(bytes, PAGE_SIZE);
1010 
1011 	WARN_ON(bytes > sk->sk_reserved_mem);
1012 	WRITE_ONCE(sk->sk_reserved_mem, sk->sk_reserved_mem - bytes);
1013 	sk_mem_reclaim(sk);
1014 }
1015 
1016 static int sock_reserve_memory(struct sock *sk, int bytes)
1017 {
1018 	long allocated;
1019 	bool charged;
1020 	int pages;
1021 
1022 	if (!mem_cgroup_sockets_enabled || !sk->sk_memcg || !sk_has_account(sk))
1023 		return -EOPNOTSUPP;
1024 
1025 	if (!bytes)
1026 		return 0;
1027 
1028 	pages = sk_mem_pages(bytes);
1029 
1030 	/* pre-charge to memcg */
1031 	charged = mem_cgroup_charge_skmem(sk->sk_memcg, pages,
1032 					  GFP_KERNEL | __GFP_RETRY_MAYFAIL);
1033 	if (!charged)
1034 		return -ENOMEM;
1035 
1036 	/* pre-charge to forward_alloc */
1037 	sk_memory_allocated_add(sk, pages);
1038 	allocated = sk_memory_allocated(sk);
1039 	/* If the system goes into memory pressure with this
1040 	 * precharge, give up and return error.
1041 	 */
1042 	if (allocated > sk_prot_mem_limits(sk, 1)) {
1043 		sk_memory_allocated_sub(sk, pages);
1044 		mem_cgroup_uncharge_skmem(sk->sk_memcg, pages);
1045 		return -ENOMEM;
1046 	}
1047 	sk->sk_forward_alloc += pages << PAGE_SHIFT;
1048 
1049 	WRITE_ONCE(sk->sk_reserved_mem,
1050 		   sk->sk_reserved_mem + (pages << PAGE_SHIFT));
1051 
1052 	return 0;
1053 }
1054 
/*
 * Take the socket lock for a setsockopt() operation, unless the call
 * originates from a bpf prog.
 */
void sockopt_lock_sock(struct sock *sk)
{
	/* When current->bpf_ctx is set, the setsockopt is called from
	 * a bpf prog.  bpf has ensured the sk lock has been
	 * acquired before calling setsockopt().
	 */
	if (!has_current_bpf_ctx())
		lock_sock(sk);
}
EXPORT_SYMBOL(sockopt_lock_sock);
1067 
/* Counterpart of sockopt_lock_sock(): release @sk unless running with a
 * bpf context, in which case the lock was never taken here.
 */
void sockopt_release_sock(struct sock *sk)
{
	if (!has_current_bpf_ctx())
		release_sock(sk);
}
EXPORT_SYMBOL(sockopt_release_sock);
1076 
1077 bool sockopt_ns_capable(struct user_namespace *ns, int cap)
1078 {
1079 	return has_current_bpf_ctx() || ns_capable(ns, cap);
1080 }
1081 EXPORT_SYMBOL(sockopt_ns_capable);
1082 
1083 bool sockopt_capable(int cap)
1084 {
1085 	return has_current_bpf_ctx() || capable(cap);
1086 }
1087 EXPORT_SYMBOL(sockopt_capable);
1088 
1089 /*
1090  *	This is meant for all protocols to use and covers goings on
1091  *	at the socket level. Everything here is generic.
1092  */
1093 
1094 int sk_setsockopt(struct sock *sk, int level, int optname,
1095 		  sockptr_t optval, unsigned int optlen)
1096 {
1097 	struct so_timestamping timestamping;
1098 	struct socket *sock = sk->sk_socket;
1099 	struct sock_txtime sk_txtime;
1100 	int val;
1101 	int valbool;
1102 	struct linger ling;
1103 	int ret = 0;
1104 
1105 	/*
1106 	 *	Options without arguments
1107 	 */
1108 
1109 	if (optname == SO_BINDTODEVICE)
1110 		return sock_setbindtodevice(sk, optval, optlen);
1111 
1112 	if (optlen < sizeof(int))
1113 		return -EINVAL;
1114 
1115 	if (copy_from_sockptr(&val, optval, sizeof(val)))
1116 		return -EFAULT;
1117 
1118 	valbool = val ? 1 : 0;
1119 
1120 	sockopt_lock_sock(sk);
1121 
1122 	switch (optname) {
1123 	case SO_DEBUG:
1124 		if (val && !sockopt_capable(CAP_NET_ADMIN))
1125 			ret = -EACCES;
1126 		else
1127 			sock_valbool_flag(sk, SOCK_DBG, valbool);
1128 		break;
1129 	case SO_REUSEADDR:
1130 		sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
1131 		break;
1132 	case SO_REUSEPORT:
1133 		sk->sk_reuseport = valbool;
1134 		break;
1135 	case SO_TYPE:
1136 	case SO_PROTOCOL:
1137 	case SO_DOMAIN:
1138 	case SO_ERROR:
1139 		ret = -ENOPROTOOPT;
1140 		break;
1141 	case SO_DONTROUTE:
1142 		sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
1143 		sk_dst_reset(sk);
1144 		break;
1145 	case SO_BROADCAST:
1146 		sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
1147 		break;
1148 	case SO_SNDBUF:
1149 		/* Don't error on this BSD doesn't and if you think
1150 		 * about it this is right. Otherwise apps have to
1151 		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
1152 		 * are treated in BSD as hints
1153 		 */
1154 		val = min_t(u32, val, READ_ONCE(sysctl_wmem_max));
1155 set_sndbuf:
1156 		/* Ensure val * 2 fits into an int, to prevent max_t()
1157 		 * from treating it as a negative value.
1158 		 */
1159 		val = min_t(int, val, INT_MAX / 2);
1160 		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
1161 		WRITE_ONCE(sk->sk_sndbuf,
1162 			   max_t(int, val * 2, SOCK_MIN_SNDBUF));
1163 		/* Wake up sending tasks if we upped the value. */
1164 		sk->sk_write_space(sk);
1165 		break;
1166 
1167 	case SO_SNDBUFFORCE:
1168 		if (!sockopt_capable(CAP_NET_ADMIN)) {
1169 			ret = -EPERM;
1170 			break;
1171 		}
1172 
1173 		/* No negative values (to prevent underflow, as val will be
1174 		 * multiplied by 2).
1175 		 */
1176 		if (val < 0)
1177 			val = 0;
1178 		goto set_sndbuf;
1179 
1180 	case SO_RCVBUF:
1181 		/* Don't error on this BSD doesn't and if you think
1182 		 * about it this is right. Otherwise apps have to
1183 		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
1184 		 * are treated in BSD as hints
1185 		 */
1186 		__sock_set_rcvbuf(sk, min_t(u32, val, READ_ONCE(sysctl_rmem_max)));
1187 		break;
1188 
1189 	case SO_RCVBUFFORCE:
1190 		if (!sockopt_capable(CAP_NET_ADMIN)) {
1191 			ret = -EPERM;
1192 			break;
1193 		}
1194 
1195 		/* No negative values (to prevent underflow, as val will be
1196 		 * multiplied by 2).
1197 		 */
1198 		__sock_set_rcvbuf(sk, max(val, 0));
1199 		break;
1200 
1201 	case SO_KEEPALIVE:
1202 		if (sk->sk_prot->keepalive)
1203 			sk->sk_prot->keepalive(sk, valbool);
1204 		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
1205 		break;
1206 
1207 	case SO_OOBINLINE:
1208 		sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
1209 		break;
1210 
1211 	case SO_NO_CHECK:
1212 		sk->sk_no_check_tx = valbool;
1213 		break;
1214 
1215 	case SO_PRIORITY:
1216 		if ((val >= 0 && val <= 6) ||
1217 		    sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) ||
1218 		    sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
1219 			WRITE_ONCE(sk->sk_priority, val);
1220 		else
1221 			ret = -EPERM;
1222 		break;
1223 
1224 	case SO_LINGER:
1225 		if (optlen < sizeof(ling)) {
1226 			ret = -EINVAL;	/* 1003.1g */
1227 			break;
1228 		}
1229 		if (copy_from_sockptr(&ling, optval, sizeof(ling))) {
1230 			ret = -EFAULT;
1231 			break;
1232 		}
1233 		if (!ling.l_onoff)
1234 			sock_reset_flag(sk, SOCK_LINGER);
1235 		else {
1236 #if (BITS_PER_LONG == 32)
1237 			if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
1238 				sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
1239 			else
1240 #endif
1241 				sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
1242 			sock_set_flag(sk, SOCK_LINGER);
1243 		}
1244 		break;
1245 
1246 	case SO_BSDCOMPAT:
1247 		break;
1248 
1249 	case SO_PASSCRED:
1250 		assign_bit(SOCK_PASSCRED, &sock->flags, valbool);
1251 		break;
1252 
1253 	case SO_PASSPIDFD:
1254 		assign_bit(SOCK_PASSPIDFD, &sock->flags, valbool);
1255 		break;
1256 
1257 	case SO_TIMESTAMP_OLD:
1258 	case SO_TIMESTAMP_NEW:
1259 	case SO_TIMESTAMPNS_OLD:
1260 	case SO_TIMESTAMPNS_NEW:
1261 		sock_set_timestamp(sk, optname, valbool);
1262 		break;
1263 
1264 	case SO_TIMESTAMPING_NEW:
1265 	case SO_TIMESTAMPING_OLD:
1266 		if (optlen == sizeof(timestamping)) {
1267 			if (copy_from_sockptr(&timestamping, optval,
1268 					      sizeof(timestamping))) {
1269 				ret = -EFAULT;
1270 				break;
1271 			}
1272 		} else {
1273 			memset(&timestamping, 0, sizeof(timestamping));
1274 			timestamping.flags = val;
1275 		}
1276 		ret = sock_set_timestamping(sk, optname, timestamping);
1277 		break;
1278 
1279 	case SO_RCVLOWAT:
1280 		{
1281 		int (*set_rcvlowat)(struct sock *sk, int val) = NULL;
1282 
1283 		if (val < 0)
1284 			val = INT_MAX;
1285 		if (sock)
1286 			set_rcvlowat = READ_ONCE(sock->ops)->set_rcvlowat;
1287 		if (set_rcvlowat)
1288 			ret = set_rcvlowat(sk, val);
1289 		else
1290 			WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
1291 		break;
1292 		}
1293 	case SO_RCVTIMEO_OLD:
1294 	case SO_RCVTIMEO_NEW:
1295 		ret = sock_set_timeout(&sk->sk_rcvtimeo, optval,
1296 				       optlen, optname == SO_RCVTIMEO_OLD);
1297 		break;
1298 
1299 	case SO_SNDTIMEO_OLD:
1300 	case SO_SNDTIMEO_NEW:
1301 		ret = sock_set_timeout(&sk->sk_sndtimeo, optval,
1302 				       optlen, optname == SO_SNDTIMEO_OLD);
1303 		break;
1304 
1305 	case SO_ATTACH_FILTER: {
1306 		struct sock_fprog fprog;
1307 
1308 		ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1309 		if (!ret)
1310 			ret = sk_attach_filter(&fprog, sk);
1311 		break;
1312 	}
1313 	case SO_ATTACH_BPF:
1314 		ret = -EINVAL;
1315 		if (optlen == sizeof(u32)) {
1316 			u32 ufd;
1317 
1318 			ret = -EFAULT;
1319 			if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1320 				break;
1321 
1322 			ret = sk_attach_bpf(ufd, sk);
1323 		}
1324 		break;
1325 
1326 	case SO_ATTACH_REUSEPORT_CBPF: {
1327 		struct sock_fprog fprog;
1328 
1329 		ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1330 		if (!ret)
1331 			ret = sk_reuseport_attach_filter(&fprog, sk);
1332 		break;
1333 	}
1334 	case SO_ATTACH_REUSEPORT_EBPF:
1335 		ret = -EINVAL;
1336 		if (optlen == sizeof(u32)) {
1337 			u32 ufd;
1338 
1339 			ret = -EFAULT;
1340 			if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1341 				break;
1342 
1343 			ret = sk_reuseport_attach_bpf(ufd, sk);
1344 		}
1345 		break;
1346 
1347 	case SO_DETACH_REUSEPORT_BPF:
1348 		ret = reuseport_detach_prog(sk);
1349 		break;
1350 
1351 	case SO_DETACH_FILTER:
1352 		ret = sk_detach_filter(sk);
1353 		break;
1354 
1355 	case SO_LOCK_FILTER:
1356 		if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
1357 			ret = -EPERM;
1358 		else
1359 			sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
1360 		break;
1361 
1362 	case SO_PASSSEC:
1363 		assign_bit(SOCK_PASSSEC, &sock->flags, valbool);
1364 		break;
1365 	case SO_MARK:
1366 		if (!sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
1367 		    !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1368 			ret = -EPERM;
1369 			break;
1370 		}
1371 
1372 		__sock_set_mark(sk, val);
1373 		break;
1374 	case SO_RCVMARK:
1375 		sock_valbool_flag(sk, SOCK_RCVMARK, valbool);
1376 		break;
1377 
1378 	case SO_RXQ_OVFL:
1379 		sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
1380 		break;
1381 
1382 	case SO_WIFI_STATUS:
1383 		sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
1384 		break;
1385 
1386 	case SO_PEEK_OFF:
1387 		{
1388 		int (*set_peek_off)(struct sock *sk, int val);
1389 
1390 		set_peek_off = READ_ONCE(sock->ops)->set_peek_off;
1391 		if (set_peek_off)
1392 			ret = set_peek_off(sk, val);
1393 		else
1394 			ret = -EOPNOTSUPP;
1395 		break;
1396 		}
1397 
1398 	case SO_NOFCS:
1399 		sock_valbool_flag(sk, SOCK_NOFCS, valbool);
1400 		break;
1401 
1402 	case SO_SELECT_ERR_QUEUE:
1403 		sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
1404 		break;
1405 
1406 #ifdef CONFIG_NET_RX_BUSY_POLL
1407 	case SO_BUSY_POLL:
1408 		if (val < 0)
1409 			ret = -EINVAL;
1410 		else
1411 			WRITE_ONCE(sk->sk_ll_usec, val);
1412 		break;
1413 	case SO_PREFER_BUSY_POLL:
1414 		if (valbool && !sockopt_capable(CAP_NET_ADMIN))
1415 			ret = -EPERM;
1416 		else
1417 			WRITE_ONCE(sk->sk_prefer_busy_poll, valbool);
1418 		break;
1419 	case SO_BUSY_POLL_BUDGET:
1420 		if (val > READ_ONCE(sk->sk_busy_poll_budget) && !sockopt_capable(CAP_NET_ADMIN)) {
1421 			ret = -EPERM;
1422 		} else {
1423 			if (val < 0 || val > U16_MAX)
1424 				ret = -EINVAL;
1425 			else
1426 				WRITE_ONCE(sk->sk_busy_poll_budget, val);
1427 		}
1428 		break;
1429 #endif
1430 
1431 	case SO_MAX_PACING_RATE:
1432 		{
1433 		unsigned long ulval = (val == ~0U) ? ~0UL : (unsigned int)val;
1434 
1435 		if (sizeof(ulval) != sizeof(val) &&
1436 		    optlen >= sizeof(ulval) &&
1437 		    copy_from_sockptr(&ulval, optval, sizeof(ulval))) {
1438 			ret = -EFAULT;
1439 			break;
1440 		}
1441 		if (ulval != ~0UL)
1442 			cmpxchg(&sk->sk_pacing_status,
1443 				SK_PACING_NONE,
1444 				SK_PACING_NEEDED);
1445 		/* Pairs with READ_ONCE() from sk_getsockopt() */
1446 		WRITE_ONCE(sk->sk_max_pacing_rate, ulval);
1447 		sk->sk_pacing_rate = min(sk->sk_pacing_rate, ulval);
1448 		break;
1449 		}
1450 	case SO_INCOMING_CPU:
1451 		reuseport_update_incoming_cpu(sk, val);
1452 		break;
1453 
1454 	case SO_CNX_ADVICE:
1455 		if (val == 1)
1456 			dst_negative_advice(sk);
1457 		break;
1458 
1459 	case SO_ZEROCOPY:
1460 		if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
1461 			if (!(sk_is_tcp(sk) ||
1462 			      (sk->sk_type == SOCK_DGRAM &&
1463 			       sk->sk_protocol == IPPROTO_UDP)))
1464 				ret = -EOPNOTSUPP;
1465 		} else if (sk->sk_family != PF_RDS) {
1466 			ret = -EOPNOTSUPP;
1467 		}
1468 		if (!ret) {
1469 			if (val < 0 || val > 1)
1470 				ret = -EINVAL;
1471 			else
1472 				sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
1473 		}
1474 		break;
1475 
1476 	case SO_TXTIME:
1477 		if (optlen != sizeof(struct sock_txtime)) {
1478 			ret = -EINVAL;
1479 			break;
1480 		} else if (copy_from_sockptr(&sk_txtime, optval,
1481 			   sizeof(struct sock_txtime))) {
1482 			ret = -EFAULT;
1483 			break;
1484 		} else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) {
1485 			ret = -EINVAL;
1486 			break;
1487 		}
1488 		/* CLOCK_MONOTONIC is only used by sch_fq, and this packet
1489 		 * scheduler has enough safe guards.
1490 		 */
1491 		if (sk_txtime.clockid != CLOCK_MONOTONIC &&
1492 		    !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1493 			ret = -EPERM;
1494 			break;
1495 		}
1496 		sock_valbool_flag(sk, SOCK_TXTIME, true);
1497 		sk->sk_clockid = sk_txtime.clockid;
1498 		sk->sk_txtime_deadline_mode =
1499 			!!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE);
1500 		sk->sk_txtime_report_errors =
1501 			!!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS);
1502 		break;
1503 
1504 	case SO_BINDTOIFINDEX:
1505 		ret = sock_bindtoindex_locked(sk, val);
1506 		break;
1507 
1508 	case SO_BUF_LOCK:
1509 		if (val & ~SOCK_BUF_LOCK_MASK) {
1510 			ret = -EINVAL;
1511 			break;
1512 		}
1513 		sk->sk_userlocks = val | (sk->sk_userlocks &
1514 					  ~SOCK_BUF_LOCK_MASK);
1515 		break;
1516 
1517 	case SO_RESERVE_MEM:
1518 	{
1519 		int delta;
1520 
1521 		if (val < 0) {
1522 			ret = -EINVAL;
1523 			break;
1524 		}
1525 
1526 		delta = val - sk->sk_reserved_mem;
1527 		if (delta < 0)
1528 			sock_release_reserved_memory(sk, -delta);
1529 		else
1530 			ret = sock_reserve_memory(sk, delta);
1531 		break;
1532 	}
1533 
1534 	case SO_TXREHASH:
1535 		if (val < -1 || val > 1) {
1536 			ret = -EINVAL;
1537 			break;
1538 		}
1539 		if ((u8)val == SOCK_TXREHASH_DEFAULT)
1540 			val = READ_ONCE(sock_net(sk)->core.sysctl_txrehash);
1541 		/* Paired with READ_ONCE() in tcp_rtx_synack()
1542 		 * and sk_getsockopt().
1543 		 */
1544 		WRITE_ONCE(sk->sk_txrehash, (u8)val);
1545 		break;
1546 
1547 	default:
1548 		ret = -ENOPROTOOPT;
1549 		break;
1550 	}
1551 	sockopt_release_sock(sk);
1552 	return ret;
1553 }
1554 
1555 int sock_setsockopt(struct socket *sock, int level, int optname,
1556 		    sockptr_t optval, unsigned int optlen)
1557 {
1558 	return sk_setsockopt(sock->sk, level, optname,
1559 			     optval, optlen);
1560 }
1561 EXPORT_SYMBOL(sock_setsockopt);
1562 
1563 static const struct cred *sk_get_peer_cred(struct sock *sk)
1564 {
1565 	const struct cred *cred;
1566 
1567 	spin_lock(&sk->sk_peer_lock);
1568 	cred = get_cred(sk->sk_peer_cred);
1569 	spin_unlock(&sk->sk_peer_lock);
1570 
1571 	return cred;
1572 }
1573 
1574 static void cred_to_ucred(struct pid *pid, const struct cred *cred,
1575 			  struct ucred *ucred)
1576 {
1577 	ucred->pid = pid_vnr(pid);
1578 	ucred->uid = ucred->gid = -1;
1579 	if (cred) {
1580 		struct user_namespace *current_ns = current_user_ns();
1581 
1582 		ucred->uid = from_kuid_munged(current_ns, cred->euid);
1583 		ucred->gid = from_kgid_munged(current_ns, cred->egid);
1584 	}
1585 }
1586 
1587 static int groups_to_user(sockptr_t dst, const struct group_info *src)
1588 {
1589 	struct user_namespace *user_ns = current_user_ns();
1590 	int i;
1591 
1592 	for (i = 0; i < src->ngroups; i++) {
1593 		gid_t gid = from_kgid_munged(user_ns, src->gid[i]);
1594 
1595 		if (copy_to_sockptr_offset(dst, i * sizeof(gid), &gid, sizeof(gid)))
1596 			return -EFAULT;
1597 	}
1598 
1599 	return 0;
1600 }
1601 
1602 int sk_getsockopt(struct sock *sk, int level, int optname,
1603 		  sockptr_t optval, sockptr_t optlen)
1604 {
1605 	struct socket *sock = sk->sk_socket;
1606 
1607 	union {
1608 		int val;
1609 		u64 val64;
1610 		unsigned long ulval;
1611 		struct linger ling;
1612 		struct old_timeval32 tm32;
1613 		struct __kernel_old_timeval tm;
1614 		struct  __kernel_sock_timeval stm;
1615 		struct sock_txtime txtime;
1616 		struct so_timestamping timestamping;
1617 	} v;
1618 
1619 	int lv = sizeof(int);
1620 	int len;
1621 
1622 	if (copy_from_sockptr(&len, optlen, sizeof(int)))
1623 		return -EFAULT;
1624 	if (len < 0)
1625 		return -EINVAL;
1626 
1627 	memset(&v, 0, sizeof(v));
1628 
1629 	switch (optname) {
1630 	case SO_DEBUG:
1631 		v.val = sock_flag(sk, SOCK_DBG);
1632 		break;
1633 
1634 	case SO_DONTROUTE:
1635 		v.val = sock_flag(sk, SOCK_LOCALROUTE);
1636 		break;
1637 
1638 	case SO_BROADCAST:
1639 		v.val = sock_flag(sk, SOCK_BROADCAST);
1640 		break;
1641 
1642 	case SO_SNDBUF:
1643 		v.val = READ_ONCE(sk->sk_sndbuf);
1644 		break;
1645 
1646 	case SO_RCVBUF:
1647 		v.val = READ_ONCE(sk->sk_rcvbuf);
1648 		break;
1649 
1650 	case SO_REUSEADDR:
1651 		v.val = sk->sk_reuse;
1652 		break;
1653 
1654 	case SO_REUSEPORT:
1655 		v.val = sk->sk_reuseport;
1656 		break;
1657 
1658 	case SO_KEEPALIVE:
1659 		v.val = sock_flag(sk, SOCK_KEEPOPEN);
1660 		break;
1661 
1662 	case SO_TYPE:
1663 		v.val = sk->sk_type;
1664 		break;
1665 
1666 	case SO_PROTOCOL:
1667 		v.val = sk->sk_protocol;
1668 		break;
1669 
1670 	case SO_DOMAIN:
1671 		v.val = sk->sk_family;
1672 		break;
1673 
1674 	case SO_ERROR:
1675 		v.val = -sock_error(sk);
1676 		if (v.val == 0)
1677 			v.val = xchg(&sk->sk_err_soft, 0);
1678 		break;
1679 
1680 	case SO_OOBINLINE:
1681 		v.val = sock_flag(sk, SOCK_URGINLINE);
1682 		break;
1683 
1684 	case SO_NO_CHECK:
1685 		v.val = sk->sk_no_check_tx;
1686 		break;
1687 
1688 	case SO_PRIORITY:
1689 		v.val = READ_ONCE(sk->sk_priority);
1690 		break;
1691 
1692 	case SO_LINGER:
1693 		lv		= sizeof(v.ling);
1694 		v.ling.l_onoff	= sock_flag(sk, SOCK_LINGER);
1695 		v.ling.l_linger	= sk->sk_lingertime / HZ;
1696 		break;
1697 
1698 	case SO_BSDCOMPAT:
1699 		break;
1700 
1701 	case SO_TIMESTAMP_OLD:
1702 		v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1703 				!sock_flag(sk, SOCK_TSTAMP_NEW) &&
1704 				!sock_flag(sk, SOCK_RCVTSTAMPNS);
1705 		break;
1706 
1707 	case SO_TIMESTAMPNS_OLD:
1708 		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && !sock_flag(sk, SOCK_TSTAMP_NEW);
1709 		break;
1710 
1711 	case SO_TIMESTAMP_NEW:
1712 		v.val = sock_flag(sk, SOCK_RCVTSTAMP) && sock_flag(sk, SOCK_TSTAMP_NEW);
1713 		break;
1714 
1715 	case SO_TIMESTAMPNS_NEW:
1716 		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && sock_flag(sk, SOCK_TSTAMP_NEW);
1717 		break;
1718 
1719 	case SO_TIMESTAMPING_OLD:
1720 		lv = sizeof(v.timestamping);
1721 		v.timestamping.flags = sk->sk_tsflags;
1722 		v.timestamping.bind_phc = sk->sk_bind_phc;
1723 		break;
1724 
1725 	case SO_RCVTIMEO_OLD:
1726 	case SO_RCVTIMEO_NEW:
1727 		lv = sock_get_timeout(READ_ONCE(sk->sk_rcvtimeo), &v,
1728 				      SO_RCVTIMEO_OLD == optname);
1729 		break;
1730 
1731 	case SO_SNDTIMEO_OLD:
1732 	case SO_SNDTIMEO_NEW:
1733 		lv = sock_get_timeout(READ_ONCE(sk->sk_sndtimeo), &v,
1734 				      SO_SNDTIMEO_OLD == optname);
1735 		break;
1736 
1737 	case SO_RCVLOWAT:
1738 		v.val = READ_ONCE(sk->sk_rcvlowat);
1739 		break;
1740 
1741 	case SO_SNDLOWAT:
1742 		v.val = 1;
1743 		break;
1744 
1745 	case SO_PASSCRED:
1746 		v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1747 		break;
1748 
1749 	case SO_PASSPIDFD:
1750 		v.val = !!test_bit(SOCK_PASSPIDFD, &sock->flags);
1751 		break;
1752 
1753 	case SO_PEERCRED:
1754 	{
1755 		struct ucred peercred;
1756 		if (len > sizeof(peercred))
1757 			len = sizeof(peercred);
1758 
1759 		spin_lock(&sk->sk_peer_lock);
1760 		cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1761 		spin_unlock(&sk->sk_peer_lock);
1762 
1763 		if (copy_to_sockptr(optval, &peercred, len))
1764 			return -EFAULT;
1765 		goto lenout;
1766 	}
1767 
1768 	case SO_PEERPIDFD:
1769 	{
1770 		struct pid *peer_pid;
1771 		struct file *pidfd_file = NULL;
1772 		int pidfd;
1773 
1774 		if (len > sizeof(pidfd))
1775 			len = sizeof(pidfd);
1776 
1777 		spin_lock(&sk->sk_peer_lock);
1778 		peer_pid = get_pid(sk->sk_peer_pid);
1779 		spin_unlock(&sk->sk_peer_lock);
1780 
1781 		if (!peer_pid)
1782 			return -ENODATA;
1783 
1784 		pidfd = pidfd_prepare(peer_pid, 0, &pidfd_file);
1785 		put_pid(peer_pid);
1786 		if (pidfd < 0)
1787 			return pidfd;
1788 
1789 		if (copy_to_sockptr(optval, &pidfd, len) ||
1790 		    copy_to_sockptr(optlen, &len, sizeof(int))) {
1791 			put_unused_fd(pidfd);
1792 			fput(pidfd_file);
1793 
1794 			return -EFAULT;
1795 		}
1796 
1797 		fd_install(pidfd, pidfd_file);
1798 		return 0;
1799 	}
1800 
1801 	case SO_PEERGROUPS:
1802 	{
1803 		const struct cred *cred;
1804 		int ret, n;
1805 
1806 		cred = sk_get_peer_cred(sk);
1807 		if (!cred)
1808 			return -ENODATA;
1809 
1810 		n = cred->group_info->ngroups;
1811 		if (len < n * sizeof(gid_t)) {
1812 			len = n * sizeof(gid_t);
1813 			put_cred(cred);
1814 			return copy_to_sockptr(optlen, &len, sizeof(int)) ? -EFAULT : -ERANGE;
1815 		}
1816 		len = n * sizeof(gid_t);
1817 
1818 		ret = groups_to_user(optval, cred->group_info);
1819 		put_cred(cred);
1820 		if (ret)
1821 			return ret;
1822 		goto lenout;
1823 	}
1824 
1825 	case SO_PEERNAME:
1826 	{
1827 		struct sockaddr_storage address;
1828 
1829 		lv = READ_ONCE(sock->ops)->getname(sock, (struct sockaddr *)&address, 2);
1830 		if (lv < 0)
1831 			return -ENOTCONN;
1832 		if (lv < len)
1833 			return -EINVAL;
1834 		if (copy_to_sockptr(optval, &address, len))
1835 			return -EFAULT;
1836 		goto lenout;
1837 	}
1838 
1839 	/* Dubious BSD thing... Probably nobody even uses it, but
1840 	 * the UNIX standard wants it for whatever reason... -DaveM
1841 	 */
1842 	case SO_ACCEPTCONN:
1843 		v.val = sk->sk_state == TCP_LISTEN;
1844 		break;
1845 
1846 	case SO_PASSSEC:
1847 		v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1848 		break;
1849 
1850 	case SO_PEERSEC:
1851 		return security_socket_getpeersec_stream(sock,
1852 							 optval, optlen, len);
1853 
1854 	case SO_MARK:
1855 		v.val = READ_ONCE(sk->sk_mark);
1856 		break;
1857 
1858 	case SO_RCVMARK:
1859 		v.val = sock_flag(sk, SOCK_RCVMARK);
1860 		break;
1861 
1862 	case SO_RXQ_OVFL:
1863 		v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1864 		break;
1865 
1866 	case SO_WIFI_STATUS:
1867 		v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1868 		break;
1869 
1870 	case SO_PEEK_OFF:
1871 		if (!READ_ONCE(sock->ops)->set_peek_off)
1872 			return -EOPNOTSUPP;
1873 
1874 		v.val = READ_ONCE(sk->sk_peek_off);
1875 		break;
1876 	case SO_NOFCS:
1877 		v.val = sock_flag(sk, SOCK_NOFCS);
1878 		break;
1879 
1880 	case SO_BINDTODEVICE:
1881 		return sock_getbindtodevice(sk, optval, optlen, len);
1882 
1883 	case SO_GET_FILTER:
1884 		len = sk_get_filter(sk, optval, len);
1885 		if (len < 0)
1886 			return len;
1887 
1888 		goto lenout;
1889 
1890 	case SO_LOCK_FILTER:
1891 		v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1892 		break;
1893 
1894 	case SO_BPF_EXTENSIONS:
1895 		v.val = bpf_tell_extensions();
1896 		break;
1897 
1898 	case SO_SELECT_ERR_QUEUE:
1899 		v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1900 		break;
1901 
1902 #ifdef CONFIG_NET_RX_BUSY_POLL
1903 	case SO_BUSY_POLL:
1904 		v.val = READ_ONCE(sk->sk_ll_usec);
1905 		break;
1906 	case SO_PREFER_BUSY_POLL:
1907 		v.val = READ_ONCE(sk->sk_prefer_busy_poll);
1908 		break;
1909 #endif
1910 
1911 	case SO_MAX_PACING_RATE:
1912 		/* The READ_ONCE() pair with the WRITE_ONCE() in sk_setsockopt() */
1913 		if (sizeof(v.ulval) != sizeof(v.val) && len >= sizeof(v.ulval)) {
1914 			lv = sizeof(v.ulval);
1915 			v.ulval = READ_ONCE(sk->sk_max_pacing_rate);
1916 		} else {
1917 			/* 32bit version */
1918 			v.val = min_t(unsigned long, ~0U,
1919 				      READ_ONCE(sk->sk_max_pacing_rate));
1920 		}
1921 		break;
1922 
1923 	case SO_INCOMING_CPU:
1924 		v.val = READ_ONCE(sk->sk_incoming_cpu);
1925 		break;
1926 
1927 	case SO_MEMINFO:
1928 	{
1929 		u32 meminfo[SK_MEMINFO_VARS];
1930 
1931 		sk_get_meminfo(sk, meminfo);
1932 
1933 		len = min_t(unsigned int, len, sizeof(meminfo));
1934 		if (copy_to_sockptr(optval, &meminfo, len))
1935 			return -EFAULT;
1936 
1937 		goto lenout;
1938 	}
1939 
1940 #ifdef CONFIG_NET_RX_BUSY_POLL
1941 	case SO_INCOMING_NAPI_ID:
1942 		v.val = READ_ONCE(sk->sk_napi_id);
1943 
1944 		/* aggregate non-NAPI IDs down to 0 */
1945 		if (v.val < MIN_NAPI_ID)
1946 			v.val = 0;
1947 
1948 		break;
1949 #endif
1950 
1951 	case SO_COOKIE:
1952 		lv = sizeof(u64);
1953 		if (len < lv)
1954 			return -EINVAL;
1955 		v.val64 = sock_gen_cookie(sk);
1956 		break;
1957 
1958 	case SO_ZEROCOPY:
1959 		v.val = sock_flag(sk, SOCK_ZEROCOPY);
1960 		break;
1961 
1962 	case SO_TXTIME:
1963 		lv = sizeof(v.txtime);
1964 		v.txtime.clockid = sk->sk_clockid;
1965 		v.txtime.flags |= sk->sk_txtime_deadline_mode ?
1966 				  SOF_TXTIME_DEADLINE_MODE : 0;
1967 		v.txtime.flags |= sk->sk_txtime_report_errors ?
1968 				  SOF_TXTIME_REPORT_ERRORS : 0;
1969 		break;
1970 
1971 	case SO_BINDTOIFINDEX:
1972 		v.val = READ_ONCE(sk->sk_bound_dev_if);
1973 		break;
1974 
1975 	case SO_NETNS_COOKIE:
1976 		lv = sizeof(u64);
1977 		if (len != lv)
1978 			return -EINVAL;
1979 		v.val64 = sock_net(sk)->net_cookie;
1980 		break;
1981 
1982 	case SO_BUF_LOCK:
1983 		v.val = sk->sk_userlocks & SOCK_BUF_LOCK_MASK;
1984 		break;
1985 
1986 	case SO_RESERVE_MEM:
1987 		v.val = READ_ONCE(sk->sk_reserved_mem);
1988 		break;
1989 
1990 	case SO_TXREHASH:
1991 		/* Paired with WRITE_ONCE() in sk_setsockopt() */
1992 		v.val = READ_ONCE(sk->sk_txrehash);
1993 		break;
1994 
1995 	default:
1996 		/* We implement the SO_SNDLOWAT etc to not be settable
1997 		 * (1003.1g 7).
1998 		 */
1999 		return -ENOPROTOOPT;
2000 	}
2001 
2002 	if (len > lv)
2003 		len = lv;
2004 	if (copy_to_sockptr(optval, &v, len))
2005 		return -EFAULT;
2006 lenout:
2007 	if (copy_to_sockptr(optlen, &len, sizeof(int)))
2008 		return -EFAULT;
2009 	return 0;
2010 }
2011 
2012 int sock_getsockopt(struct socket *sock, int level, int optname,
2013 		    char __user *optval, int __user *optlen)
2014 {
2015 	return sk_getsockopt(sock->sk, level, optname,
2016 			     USER_SOCKPTR(optval),
2017 			     USER_SOCKPTR(optlen));
2018 }
2019 
2020 /*
2021  * Initialize an sk_lock.
2022  *
2023  * (We also register the sk_lock with the lock validator.)
2024  */
2025 static inline void sock_lock_init(struct sock *sk)
2026 {
2027 	if (sk->sk_kern_sock)
2028 		sock_lock_init_class_and_name(
2029 			sk,
2030 			af_family_kern_slock_key_strings[sk->sk_family],
2031 			af_family_kern_slock_keys + sk->sk_family,
2032 			af_family_kern_key_strings[sk->sk_family],
2033 			af_family_kern_keys + sk->sk_family);
2034 	else
2035 		sock_lock_init_class_and_name(
2036 			sk,
2037 			af_family_slock_key_strings[sk->sk_family],
2038 			af_family_slock_keys + sk->sk_family,
2039 			af_family_key_strings[sk->sk_family],
2040 			af_family_keys + sk->sk_family);
2041 }
2042 
2043 /*
2044  * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
2045  * even temporarly, because of RCU lookups. sk_node should also be left as is.
2046  * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
2047  */
2048 static void sock_copy(struct sock *nsk, const struct sock *osk)
2049 {
2050 	const struct proto *prot = READ_ONCE(osk->sk_prot);
2051 #ifdef CONFIG_SECURITY_NETWORK
2052 	void *sptr = nsk->sk_security;
2053 #endif
2054 
2055 	/* If we move sk_tx_queue_mapping out of the private section,
2056 	 * we must check if sk_tx_queue_clear() is called after
2057 	 * sock_copy() in sk_clone_lock().
2058 	 */
2059 	BUILD_BUG_ON(offsetof(struct sock, sk_tx_queue_mapping) <
2060 		     offsetof(struct sock, sk_dontcopy_begin) ||
2061 		     offsetof(struct sock, sk_tx_queue_mapping) >=
2062 		     offsetof(struct sock, sk_dontcopy_end));
2063 
2064 	memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
2065 
2066 	memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
2067 	       prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
2068 
2069 #ifdef CONFIG_SECURITY_NETWORK
2070 	nsk->sk_security = sptr;
2071 	security_sk_clone(osk, nsk);
2072 #endif
2073 }
2074 
2075 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
2076 		int family)
2077 {
2078 	struct sock *sk;
2079 	struct kmem_cache *slab;
2080 
2081 	slab = prot->slab;
2082 	if (slab != NULL) {
2083 		sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
2084 		if (!sk)
2085 			return sk;
2086 		if (want_init_on_alloc(priority))
2087 			sk_prot_clear_nulls(sk, prot->obj_size);
2088 	} else
2089 		sk = kmalloc(prot->obj_size, priority);
2090 
2091 	if (sk != NULL) {
2092 		if (security_sk_alloc(sk, family, priority))
2093 			goto out_free;
2094 
2095 		if (!try_module_get(prot->owner))
2096 			goto out_free_sec;
2097 	}
2098 
2099 	return sk;
2100 
2101 out_free_sec:
2102 	security_sk_free(sk);
2103 out_free:
2104 	if (slab != NULL)
2105 		kmem_cache_free(slab, sk);
2106 	else
2107 		kfree(sk);
2108 	return NULL;
2109 }
2110 
2111 static void sk_prot_free(struct proto *prot, struct sock *sk)
2112 {
2113 	struct kmem_cache *slab;
2114 	struct module *owner;
2115 
2116 	owner = prot->owner;
2117 	slab = prot->slab;
2118 
2119 	cgroup_sk_free(&sk->sk_cgrp_data);
2120 	mem_cgroup_sk_free(sk);
2121 	security_sk_free(sk);
2122 	if (slab != NULL)
2123 		kmem_cache_free(slab, sk);
2124 	else
2125 		kfree(sk);
2126 	module_put(owner);
2127 }
2128 
2129 /**
2130  *	sk_alloc - All socket objects are allocated here
2131  *	@net: the applicable net namespace
2132  *	@family: protocol family
2133  *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
2134  *	@prot: struct proto associated with this new sock instance
2135  *	@kern: is this to be a kernel socket?
2136  */
2137 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
2138 		      struct proto *prot, int kern)
2139 {
2140 	struct sock *sk;
2141 
2142 	sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
2143 	if (sk) {
2144 		sk->sk_family = family;
2145 		/*
2146 		 * See comment in struct sock definition to understand
2147 		 * why we need sk_prot_creator -acme
2148 		 */
2149 		sk->sk_prot = sk->sk_prot_creator = prot;
2150 		sk->sk_kern_sock = kern;
2151 		sock_lock_init(sk);
2152 		sk->sk_net_refcnt = kern ? 0 : 1;
2153 		if (likely(sk->sk_net_refcnt)) {
2154 			get_net_track(net, &sk->ns_tracker, priority);
2155 			sock_inuse_add(net, 1);
2156 		} else {
2157 			__netns_tracker_alloc(net, &sk->ns_tracker,
2158 					      false, priority);
2159 		}
2160 
2161 		sock_net_set(sk, net);
2162 		refcount_set(&sk->sk_wmem_alloc, 1);
2163 
2164 		mem_cgroup_sk_alloc(sk);
2165 		cgroup_sk_alloc(&sk->sk_cgrp_data);
2166 		sock_update_classid(&sk->sk_cgrp_data);
2167 		sock_update_netprioidx(&sk->sk_cgrp_data);
2168 		sk_tx_queue_clear(sk);
2169 	}
2170 
2171 	return sk;
2172 }
2173 EXPORT_SYMBOL(sk_alloc);
2174 
2175 /* Sockets having SOCK_RCU_FREE will call this function after one RCU
2176  * grace period. This is the case for UDP sockets and TCP listeners.
2177  */
2178 static void __sk_destruct(struct rcu_head *head)
2179 {
2180 	struct sock *sk = container_of(head, struct sock, sk_rcu);
2181 	struct sk_filter *filter;
2182 
2183 	if (sk->sk_destruct)
2184 		sk->sk_destruct(sk);
2185 
2186 	filter = rcu_dereference_check(sk->sk_filter,
2187 				       refcount_read(&sk->sk_wmem_alloc) == 0);
2188 	if (filter) {
2189 		sk_filter_uncharge(sk, filter);
2190 		RCU_INIT_POINTER(sk->sk_filter, NULL);
2191 	}
2192 
2193 	sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
2194 
2195 #ifdef CONFIG_BPF_SYSCALL
2196 	bpf_sk_storage_free(sk);
2197 #endif
2198 
2199 	if (atomic_read(&sk->sk_omem_alloc))
2200 		pr_debug("%s: optmem leakage (%d bytes) detected\n",
2201 			 __func__, atomic_read(&sk->sk_omem_alloc));
2202 
2203 	if (sk->sk_frag.page) {
2204 		put_page(sk->sk_frag.page);
2205 		sk->sk_frag.page = NULL;
2206 	}
2207 
2208 	/* We do not need to acquire sk->sk_peer_lock, we are the last user. */
2209 	put_cred(sk->sk_peer_cred);
2210 	put_pid(sk->sk_peer_pid);
2211 
2212 	if (likely(sk->sk_net_refcnt))
2213 		put_net_track(sock_net(sk), &sk->ns_tracker);
2214 	else
2215 		__netns_tracker_free(sock_net(sk), &sk->ns_tracker, false);
2216 
2217 	sk_prot_free(sk->sk_prot_creator, sk);
2218 }
2219 
2220 void sk_destruct(struct sock *sk)
2221 {
2222 	bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE);
2223 
2224 	if (rcu_access_pointer(sk->sk_reuseport_cb)) {
2225 		reuseport_detach_sock(sk);
2226 		use_call_rcu = true;
2227 	}
2228 
2229 	if (use_call_rcu)
2230 		call_rcu(&sk->sk_rcu, __sk_destruct);
2231 	else
2232 		__sk_destruct(&sk->sk_rcu);
2233 }
2234 
2235 static void __sk_free(struct sock *sk)
2236 {
2237 	if (likely(sk->sk_net_refcnt))
2238 		sock_inuse_add(sock_net(sk), -1);
2239 
2240 	if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk)))
2241 		sock_diag_broadcast_destroy(sk);
2242 	else
2243 		sk_destruct(sk);
2244 }
2245 
2246 void sk_free(struct sock *sk)
2247 {
2248 	/*
2249 	 * We subtract one from sk_wmem_alloc and can know if
2250 	 * some packets are still in some tx queue.
2251 	 * If not null, sock_wfree() will call __sk_free(sk) later
2252 	 */
2253 	if (refcount_dec_and_test(&sk->sk_wmem_alloc))
2254 		__sk_free(sk);
2255 }
2256 EXPORT_SYMBOL(sk_free);
2257 
2258 static void sk_init_common(struct sock *sk)
2259 {
2260 	skb_queue_head_init(&sk->sk_receive_queue);
2261 	skb_queue_head_init(&sk->sk_write_queue);
2262 	skb_queue_head_init(&sk->sk_error_queue);
2263 
2264 	rwlock_init(&sk->sk_callback_lock);
2265 	lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
2266 			af_rlock_keys + sk->sk_family,
2267 			af_family_rlock_key_strings[sk->sk_family]);
2268 	lockdep_set_class_and_name(&sk->sk_write_queue.lock,
2269 			af_wlock_keys + sk->sk_family,
2270 			af_family_wlock_key_strings[sk->sk_family]);
2271 	lockdep_set_class_and_name(&sk->sk_error_queue.lock,
2272 			af_elock_keys + sk->sk_family,
2273 			af_family_elock_key_strings[sk->sk_family]);
2274 	lockdep_set_class_and_name(&sk->sk_callback_lock,
2275 			af_callback_keys + sk->sk_family,
2276 			af_family_clock_key_strings[sk->sk_family]);
2277 }
2278 
/**
 *	sk_clone_lock - clone a socket, and lock its clone
 *	@sk: the socket to clone
 *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
 *
 *	On success the clone is returned with bh_lock_sock() held and with
 *	sk_refcnt set to 2; the caller must unlock it, including on its own
 *	error paths (bh_unlock_sock(newsk)).
 *
 *	Return: the new socket, or %NULL if allocation, filter charging,
 *	xfrm policy cloning or BPF storage cloning failed. In the failure
 *	cases the partially built clone has already been unlocked and freed.
 */
struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
{
	struct proto *prot = READ_ONCE(sk->sk_prot);
	struct sk_filter *filter;
	bool is_charged = true;
	struct sock *newsk;

	newsk = sk_prot_alloc(prot, priority, sk->sk_family);
	if (!newsk)
		goto out;

	/* Start from a raw copy of the parent, then reset per-socket state. */
	sock_copy(newsk, sk);

	newsk->sk_prot_creator = prot;

	/* SANITY */
	if (likely(newsk->sk_net_refcnt)) {
		get_net_track(sock_net(newsk), &newsk->ns_tracker, priority);
		sock_inuse_add(sock_net(newsk), 1);
	} else {
		/* Kernel sockets are not elevating the struct net refcount.
		 * Instead, use a tracker to more easily detect if a layer
		 * is not properly dismantling its kernel sockets at netns
		 * destroy time.
		 */
		__netns_tracker_alloc(sock_net(newsk), &newsk->ns_tracker,
				      false, priority);
	}
	sk_node_init(&newsk->sk_node);
	sock_lock_init(newsk);
	/* The clone stays locked from here until the caller (or an error
	 * path below) unlocks it.
	 */
	bh_lock_sock(newsk);
	newsk->sk_backlog.head	= newsk->sk_backlog.tail = NULL;
	newsk->sk_backlog.len = 0;

	atomic_set(&newsk->sk_rmem_alloc, 0);

	/* sk_wmem_alloc set to one (see sk_free() and sock_wfree()) */
	refcount_set(&newsk->sk_wmem_alloc, 1);

	atomic_set(&newsk->sk_omem_alloc, 0);
	sk_init_common(newsk);

	newsk->sk_dst_cache	= NULL;
	newsk->sk_dst_pending_confirm = 0;
	newsk->sk_wmem_queued	= 0;
	newsk->sk_forward_alloc = 0;
	newsk->sk_reserved_mem  = 0;
	atomic_set(&newsk->sk_drops, 0);
	newsk->sk_send_head	= NULL;
	/* Inherit the user locks except SOCK_BINDPORT_LOCK. */
	newsk->sk_userlocks	= sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
	atomic_set(&newsk->sk_zckey, 0);

	sock_reset_flag(newsk, SOCK_DONE);

	/* sk->sk_memcg will be populated at accept() time */
	newsk->sk_memcg = NULL;

	cgroup_sk_clone(&newsk->sk_cgrp_data);

	rcu_read_lock();
	filter = rcu_dereference(sk->sk_filter);
	if (filter != NULL)
		/* though it's an empty new sock, the charging may fail
		 * if sysctl_optmem_max was changed between creation of
		 * original socket and cloning
		 */
		is_charged = sk_filter_charge(newsk, filter);
	RCU_INIT_POINTER(newsk->sk_filter, filter);
	rcu_read_unlock();

	if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
		/* We need to make sure that we don't uncharge the new
		 * socket if we couldn't charge it in the first place
		 * as otherwise we uncharge the parent's filter.
		 */
		if (!is_charged)
			RCU_INIT_POINTER(newsk->sk_filter, NULL);
		sk_free_unlock_clone(newsk);
		newsk = NULL;
		goto out;
	}
	/* The clone does not join the parent's reuseport group. */
	RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);

	if (bpf_sk_storage_clone(sk, newsk)) {
		sk_free_unlock_clone(newsk);
		newsk = NULL;
		goto out;
	}

	/* Clear sk_user_data if parent had the pointer tagged
	 * as not suitable for copying when cloning.
	 */
	if (sk_user_data_is_nocopy(newsk))
		newsk->sk_user_data = NULL;

	newsk->sk_err	   = 0;
	newsk->sk_err_soft = 0;
	newsk->sk_priority = 0;
	newsk->sk_incoming_cpu = raw_smp_processor_id();

	/* Before updating sk_refcnt, we must commit prior changes to memory
	 * (Documentation/RCU/rculist_nulls.rst for details)
	 */
	smp_wmb();
	refcount_set(&newsk->sk_refcnt, 2);

	/* The clone has no struct socket and no wait queue attached yet. */
	sk_set_socket(newsk, NULL);
	sk_tx_queue_clear(newsk);
	RCU_INIT_POINTER(newsk->sk_wq, NULL);

	if (newsk->sk_prot->sockets_allocated)
		sk_sockets_allocated_inc(newsk);

	/* The timestamp flags were copied from the parent along with the
	 * rest of the socket; take the matching net_enable_timestamp()
	 * reference for the clone.
	 */
	if (sock_needs_netstamp(sk) && newsk->sk_flags & SK_FLAGS_TIMESTAMP)
		net_enable_timestamp();
out:
	return newsk;
}
EXPORT_SYMBOL_GPL(sk_clone_lock);
2405 
/*
 * Dispose of a locked clone: clear its destructor, release the bh lock
 * and free it with sk_free().
 */
void sk_free_unlock_clone(struct sock *sk)
{
	/* It is still a raw copy of the parent, so invalidate the
	 * destructor and do a plain sk_free().
	 */
	sk->sk_destruct = NULL;
	bh_unlock_sock(sk);
	sk_free(sk);
}
EXPORT_SYMBOL_GPL(sk_free_unlock_clone);
2415 
2416 static u32 sk_dst_gso_max_size(struct sock *sk, struct dst_entry *dst)
2417 {
2418 	bool is_ipv6 = false;
2419 	u32 max_size;
2420 
2421 #if IS_ENABLED(CONFIG_IPV6)
2422 	is_ipv6 = (sk->sk_family == AF_INET6 &&
2423 		   !ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr));
2424 #endif
2425 	/* pairs with the WRITE_ONCE() in netif_set_gso(_ipv4)_max_size() */
2426 	max_size = is_ipv6 ? READ_ONCE(dst->dev->gso_max_size) :
2427 			READ_ONCE(dst->dev->gso_ipv4_max_size);
2428 	if (max_size > GSO_LEGACY_MAX_SIZE && !sk_is_tcp(sk))
2429 		max_size = GSO_LEGACY_MAX_SIZE;
2430 
2431 	return max_size - (MAX_TCP_HEADER + 1);
2432 }
2433 
/*
 * Derive the socket's route capabilities (sk_route_caps) and GSO limits
 * from the device behind @dst, then install @dst on the socket with
 * sk_dst_set().
 */
void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
{
	u32 max_segs = 1;

	/* Start from the device features; TCP always gets NETIF_F_GSO. */
	sk->sk_route_caps = dst->dev->features;
	if (sk_is_tcp(sk))
		sk->sk_route_caps |= NETIF_F_GSO;
	if (sk->sk_route_caps & NETIF_F_GSO)
		sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
	if (unlikely(sk->sk_gso_disabled))
		sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
	if (sk_can_gso(sk)) {
		/* A dst with a header_len that is not offload-capable per
		 * xfrm_dst_offload_ok() loses all GSO bits.
		 */
		if (dst->header_len && !xfrm_dst_offload_ok(dst)) {
			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
		} else {
			sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
			sk->sk_gso_max_size = sk_dst_gso_max_size(sk, dst);
			/* pairs with the WRITE_ONCE() in netif_set_gso_max_segs() */
			max_segs = max_t(u32, READ_ONCE(dst->dev->gso_max_segs), 1);
		}
	}
	/* Stays at 1 unless GSO is usable on this path. */
	sk->sk_gso_max_segs = max_segs;
	sk_dst_set(sk, dst);
}
EXPORT_SYMBOL_GPL(sk_setup_caps);
2459 
2460 /*
2461  *	Simple resource managers for sockets.
2462  */
2463 
2464 
/*
 * Write buffer destructor automatically called from kfree_skb.
 *
 * Gives skb->truesize back to sk->sk_wmem_alloc, notifies writers unless
 * the socket has SOCK_USE_WRITE_QUEUE set, and frees the socket if this
 * was the last outstanding unit of sk_wmem_alloc.
 */
void sock_wfree(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;
	unsigned int len = skb->truesize;
	bool free;

	if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
		/*
		 * Fast path: SOCK_RCU_FREE socket still using the default
		 * write_space callback. The whole length is dropped in one
		 * go and the wakeup is done inside an RCU read section.
		 */
		if (sock_flag(sk, SOCK_RCU_FREE) &&
		    sk->sk_write_space == sock_def_write_space) {
			rcu_read_lock();
			free = refcount_sub_and_test(len, &sk->sk_wmem_alloc);
			sock_def_write_space_wfree(sk);
			rcu_read_unlock();
			if (unlikely(free))
				__sk_free(sk);
			return;
		}

		/*
		 * Keep a reference on sk_wmem_alloc, this will be released
		 * after sk_write_space() call
		 */
		WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
		sk->sk_write_space(sk);
		/* Only the unit kept back above remains to be dropped. */
		len = 1;
	}
	/*
	 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
	 * could not do because of in-flight packets
	 */
	if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
		__sk_free(sk);
}
EXPORT_SYMBOL(sock_wfree);
2502 
/* This variant of sock_wfree() is used by TCP,
 * since it sets SOCK_USE_WRITE_QUEUE.
 *
 * It only gives skb->truesize back to sk_wmem_alloc and frees the socket
 * when the counter reaches zero; no write_space callback is invoked.
 */
void __sock_wfree(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;

	if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
		__sk_free(sk);
}
2513 
/*
 * Make @sk the owner of @skb on the write path: any previous owner is
 * dropped with skb_orphan(), and skb->truesize is added to
 * sk->sk_wmem_alloc with sock_wfree() installed as destructor.
 */
void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
{
	skb_orphan(skb);
	skb->sk = sk;
#ifdef CONFIG_INET
	/* Sockets that are not full sockets are pinned with a plain
	 * reference instead of write-memory accounting.
	 */
	if (unlikely(!sk_fullsock(sk))) {
		skb->destructor = sock_edemux;
		sock_hold(sk);
		return;
	}
#endif
	skb->destructor = sock_wfree;
	skb_set_hash_from_sk(skb, sk);
	/*
	 * We used to take a refcount on sk, but following operation
	 * is enough to guarantee sk_free() won't free this sock until
	 * all in-flight packets are completed
	 */
	refcount_add(skb->truesize, &sk->sk_wmem_alloc);
}
EXPORT_SYMBOL(skb_set_owner_w);
2535 
2536 static bool can_skb_orphan_partial(const struct sk_buff *skb)
2537 {
2538 #ifdef CONFIG_TLS_DEVICE
2539 	/* Drivers depend on in-order delivery for crypto offload,
2540 	 * partial orphan breaks out-of-order-OK logic.
2541 	 */
2542 	if (skb->decrypted)
2543 		return false;
2544 #endif
2545 	return (skb->destructor == sock_wfree ||
2546 		(IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree));
2547 }
2548 
2549 /* This helper is used by netem, as it can hold packets in its
2550  * delay queue. We want to allow the owner socket to send more
2551  * packets, as if they were already TX completed by a typical driver.
2552  * But we also want to keep skb->sk set because some packet schedulers
2553  * rely on it (sch_fq for example).
2554  */
2555 void skb_orphan_partial(struct sk_buff *skb)
2556 {
2557 	if (skb_is_tcp_pure_ack(skb))
2558 		return;
2559 
2560 	if (can_skb_orphan_partial(skb) && skb_set_owner_sk_safe(skb, skb->sk))
2561 		return;
2562 
2563 	skb_orphan(skb);
2564 }
2565 EXPORT_SYMBOL(skb_orphan_partial);
2566 
2567 /*
2568  * Read buffer destructor automatically called from kfree_skb.
2569  */
2570 void sock_rfree(struct sk_buff *skb)
2571 {
2572 	struct sock *sk = skb->sk;
2573 	unsigned int len = skb->truesize;
2574 
2575 	atomic_sub(len, &sk->sk_rmem_alloc);
2576 	sk_mem_uncharge(sk, len);
2577 }
2578 EXPORT_SYMBOL(sock_rfree);
2579 
/*
 * Buffer destructor for skbs that are not used directly in read or write
 * path, e.g. for error handler skbs. Automatically called from kfree_skb.
 *
 * It only drops the socket reference held by the skb (sock_put()).
 */
void sock_efree(struct sk_buff *skb)
{
	sock_put(skb->sk);
}
EXPORT_SYMBOL(sock_efree);
2589 
/* Buffer destructor for prefetch/receive path where reference count may
 * not be held, e.g. for listen sockets.
 *
 * The socket is only put (sock_gen_put()) when sk_is_refcounted() says a
 * reference is actually held for it.
 */
#ifdef CONFIG_INET
void sock_pfree(struct sk_buff *skb)
{
	if (sk_is_refcounted(skb->sk))
		sock_gen_put(skb->sk);
}
EXPORT_SYMBOL(sock_pfree);
#endif /* CONFIG_INET */
2601 
2602 kuid_t sock_i_uid(struct sock *sk)
2603 {
2604 	kuid_t uid;
2605 
2606 	read_lock_bh(&sk->sk_callback_lock);
2607 	uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
2608 	read_unlock_bh(&sk->sk_callback_lock);
2609 	return uid;
2610 }
2611 EXPORT_SYMBOL(sock_i_uid);
2612 
2613 unsigned long __sock_i_ino(struct sock *sk)
2614 {
2615 	unsigned long ino;
2616 
2617 	read_lock(&sk->sk_callback_lock);
2618 	ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
2619 	read_unlock(&sk->sk_callback_lock);
2620 	return ino;
2621 }
2622 EXPORT_SYMBOL(__sock_i_ino);
2623 
/*
 * Wrapper around __sock_i_ino() that disables bottom halves for the
 * duration of the lookup. Returns the socket inode number, or 0 if @sk
 * has no struct socket attached.
 */
unsigned long sock_i_ino(struct sock *sk)
{
	unsigned long ino;

	local_bh_disable();
	ino = __sock_i_ino(sk);
	local_bh_enable();
	return ino;
}
EXPORT_SYMBOL(sock_i_ino);
2634 
2635 /*
2636  * Allocate a skb from the socket's send buffer.
2637  */
2638 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
2639 			     gfp_t priority)
2640 {
2641 	if (force ||
2642 	    refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) {
2643 		struct sk_buff *skb = alloc_skb(size, priority);
2644 
2645 		if (skb) {
2646 			skb_set_owner_w(skb, sk);
2647 			return skb;
2648 		}
2649 	}
2650 	return NULL;
2651 }
2652 EXPORT_SYMBOL(sock_wmalloc);
2653 
2654 static void sock_ofree(struct sk_buff *skb)
2655 {
2656 	struct sock *sk = skb->sk;
2657 
2658 	atomic_sub(skb->truesize, &sk->sk_omem_alloc);
2659 }
2660 
2661 struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
2662 			     gfp_t priority)
2663 {
2664 	struct sk_buff *skb;
2665 
2666 	/* small safe race: SKB_TRUESIZE may differ from final skb->truesize */
2667 	if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) >
2668 	    READ_ONCE(sysctl_optmem_max))
2669 		return NULL;
2670 
2671 	skb = alloc_skb(size, priority);
2672 	if (!skb)
2673 		return NULL;
2674 
2675 	atomic_add(skb->truesize, &sk->sk_omem_alloc);
2676 	skb->sk = sk;
2677 	skb->destructor = sock_ofree;
2678 	return skb;
2679 }
2680 
2681 /*
2682  * Allocate a memory block from the socket's option memory buffer.
2683  */
2684 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
2685 {
2686 	int optmem_max = READ_ONCE(sysctl_optmem_max);
2687 
2688 	if ((unsigned int)size <= optmem_max &&
2689 	    atomic_read(&sk->sk_omem_alloc) + size < optmem_max) {
2690 		void *mem;
2691 		/* First do the add, to avoid the race if kmalloc
2692 		 * might sleep.
2693 		 */
2694 		atomic_add(size, &sk->sk_omem_alloc);
2695 		mem = kmalloc(size, priority);
2696 		if (mem)
2697 			return mem;
2698 		atomic_sub(size, &sk->sk_omem_alloc);
2699 	}
2700 	return NULL;
2701 }
2702 EXPORT_SYMBOL(sock_kmalloc);
2703 
/* Free an option memory block. Note, we actually want the inline
 * here as this allows gcc to detect the nullify and fold away the
 * condition entirely.
 *
 * @mem is released with kfree_sensitive() when @nullify is true and with
 * kfree() otherwise; @size is then subtracted from sk_omem_alloc. A NULL
 * @mem triggers a one-time warning and nothing else happens.
 */
static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
				  const bool nullify)
{
	if (WARN_ON_ONCE(!mem))
		return;
	if (nullify)
		kfree_sensitive(mem);
	else
		kfree(mem);
	atomic_sub(size, &sk->sk_omem_alloc);
}
2719 
/*
 * Free an option memory block of @size bytes with kfree() and subtract
 * @size from the socket's sk_omem_alloc.
 */
void sock_kfree_s(struct sock *sk, void *mem, int size)
{
	__sock_kfree_s(sk, mem, size, false);
}
EXPORT_SYMBOL(sock_kfree_s);
2725 
/*
 * Like sock_kfree_s(), but the block is released with kfree_sensitive().
 */
void sock_kzfree_s(struct sock *sk, void *mem, int size)
{
	__sock_kfree_s(sk, mem, size, true);
}
EXPORT_SYMBOL(sock_kzfree_s);
2731 
2732 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
2733    I think, these locks should be removed for datagram sockets.
2734  */
2735 static long sock_wait_for_wmem(struct sock *sk, long timeo)
2736 {
2737 	DEFINE_WAIT(wait);
2738 
2739 	sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2740 	for (;;) {
2741 		if (!timeo)
2742 			break;
2743 		if (signal_pending(current))
2744 			break;
2745 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2746 		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
2747 		if (refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf))
2748 			break;
2749 		if (sk->sk_shutdown & SEND_SHUTDOWN)
2750 			break;
2751 		if (sk->sk_err)
2752 			break;
2753 		timeo = schedule_timeout(timeo);
2754 	}
2755 	finish_wait(sk_sleep(sk), &wait);
2756 	return timeo;
2757 }
2758 
2759 
2760 /*
2761  *	Generic send/receive buffer handlers
2762  */
2763 
2764 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
2765 				     unsigned long data_len, int noblock,
2766 				     int *errcode, int max_page_order)
2767 {
2768 	struct sk_buff *skb;
2769 	long timeo;
2770 	int err;
2771 
2772 	timeo = sock_sndtimeo(sk, noblock);
2773 	for (;;) {
2774 		err = sock_error(sk);
2775 		if (err != 0)
2776 			goto failure;
2777 
2778 		err = -EPIPE;
2779 		if (sk->sk_shutdown & SEND_SHUTDOWN)
2780 			goto failure;
2781 
2782 		if (sk_wmem_alloc_get(sk) < READ_ONCE(sk->sk_sndbuf))
2783 			break;
2784 
2785 		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2786 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2787 		err = -EAGAIN;
2788 		if (!timeo)
2789 			goto failure;
2790 		if (signal_pending(current))
2791 			goto interrupted;
2792 		timeo = sock_wait_for_wmem(sk, timeo);
2793 	}
2794 	skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
2795 				   errcode, sk->sk_allocation);
2796 	if (skb)
2797 		skb_set_owner_w(skb, sk);
2798 	return skb;
2799 
2800 interrupted:
2801 	err = sock_intr_errno(timeo);
2802 failure:
2803 	*errcode = err;
2804 	return NULL;
2805 }
2806 EXPORT_SYMBOL(sock_alloc_send_pskb);
2807 
2808 int __sock_cmsg_send(struct sock *sk, struct cmsghdr *cmsg,
2809 		     struct sockcm_cookie *sockc)
2810 {
2811 	u32 tsflags;
2812 
2813 	switch (cmsg->cmsg_type) {
2814 	case SO_MARK:
2815 		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
2816 		    !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2817 			return -EPERM;
2818 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2819 			return -EINVAL;
2820 		sockc->mark = *(u32 *)CMSG_DATA(cmsg);
2821 		break;
2822 	case SO_TIMESTAMPING_OLD:
2823 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2824 			return -EINVAL;
2825 
2826 		tsflags = *(u32 *)CMSG_DATA(cmsg);
2827 		if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
2828 			return -EINVAL;
2829 
2830 		sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
2831 		sockc->tsflags |= tsflags;
2832 		break;
2833 	case SCM_TXTIME:
2834 		if (!sock_flag(sk, SOCK_TXTIME))
2835 			return -EINVAL;
2836 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64)))
2837 			return -EINVAL;
2838 		sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg));
2839 		break;
2840 	/* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
2841 	case SCM_RIGHTS:
2842 	case SCM_CREDENTIALS:
2843 		break;
2844 	default:
2845 		return -EINVAL;
2846 	}
2847 	return 0;
2848 }
2849 EXPORT_SYMBOL(__sock_cmsg_send);
2850 
/*
 * Walk the control messages of @msg and feed every SOL_SOCKET one to
 * __sock_cmsg_send(); messages at other levels are skipped.
 *
 * Returns 0 on success, -EINVAL as soon as a control message fails
 * CMSG_OK(), or the first non-zero result of __sock_cmsg_send().
 */
int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
		   struct sockcm_cookie *sockc)
{
	struct cmsghdr *cmsg;
	int ret;

	for_each_cmsghdr(cmsg, msg) {
		if (!CMSG_OK(msg, cmsg))
			return -EINVAL;
		if (cmsg->cmsg_level != SOL_SOCKET)
			continue;
		ret = __sock_cmsg_send(sk, cmsg, sockc);
		if (ret)
			return ret;
	}
	return 0;
}
EXPORT_SYMBOL(sock_cmsg_send);
2869 
2870 static void sk_enter_memory_pressure(struct sock *sk)
2871 {
2872 	if (!sk->sk_prot->enter_memory_pressure)
2873 		return;
2874 
2875 	sk->sk_prot->enter_memory_pressure(sk);
2876 }
2877 
2878 static void sk_leave_memory_pressure(struct sock *sk)
2879 {
2880 	if (sk->sk_prot->leave_memory_pressure) {
2881 		INDIRECT_CALL_INET_1(sk->sk_prot->leave_memory_pressure,
2882 				     tcp_leave_memory_pressure, sk);
2883 	} else {
2884 		unsigned long *memory_pressure = sk->sk_prot->memory_pressure;
2885 
2886 		if (memory_pressure && READ_ONCE(*memory_pressure))
2887 			WRITE_ONCE(*memory_pressure, 0);
2888 	}
2889 }
2890 
/* When enabled, skb_page_frag_refill() skips the high-order attempt. */
DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key);

/**
 * skb_page_frag_refill - check that a page_frag contains enough room
 * @sz: minimum size of the fragment we want to get
 * @pfrag: pointer to page_frag
 * @gfp: priority for memory allocation
 *
 * Note: While this allocator tries to use high order pages, there is
 * no guarantee that allocations succeed. Therefore, @sz MUST be
 * less or equal than PAGE_SIZE.
 *
 * Return: true if @pfrag has at least @sz bytes available at its current
 * offset (possibly after being reset or backed by a fresh page), false if
 * no page could be allocated.
 */
bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
{
	if (pfrag->page) {
		/* We hold the only reference: reuse the page from the start. */
		if (page_ref_count(pfrag->page) == 1) {
			pfrag->offset = 0;
			return true;
		}
		if (pfrag->offset + sz <= pfrag->size)
			return true;
		/* Not enough room left: drop our reference and reallocate. */
		put_page(pfrag->page);
	}

	pfrag->offset = 0;
	if (SKB_FRAG_PAGE_ORDER &&
	    !static_branch_unlikely(&net_high_order_alloc_disable_key)) {
		/* Avoid direct reclaim but allow kswapd to wake */
		pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
					  __GFP_COMP | __GFP_NOWARN |
					  __GFP_NORETRY,
					  SKB_FRAG_PAGE_ORDER);
		if (likely(pfrag->page)) {
			pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
			return true;
		}
	}
	/* Fall back to a single page with the caller's gfp flags. */
	pfrag->page = alloc_page(gfp);
	if (likely(pfrag->page)) {
		pfrag->size = PAGE_SIZE;
		return true;
	}
	return false;
}
EXPORT_SYMBOL(skb_page_frag_refill);
2936 
2937 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
2938 {
2939 	if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
2940 		return true;
2941 
2942 	sk_enter_memory_pressure(sk);
2943 	sk_stream_moderate_sndbuf(sk);
2944 	return false;
2945 }
2946 EXPORT_SYMBOL(sk_page_frag_refill);
2947 
/*
 * Wait until the socket is no longer owned by a user.
 *
 * Per the sparse annotations, sk_lock.slock is held on entry and on
 * return; it is dropped around each schedule() call.
 */
void __lock_sock(struct sock *sk)
	__releases(&sk->sk_lock.slock)
	__acquires(&sk->sk_lock.slock)
{
	DEFINE_WAIT(wait);

	for (;;) {
		prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
					TASK_UNINTERRUPTIBLE);
		spin_unlock_bh(&sk->sk_lock.slock);
		schedule();
		spin_lock_bh(&sk->sk_lock.slock);
		/* Re-check ownership with the spinlock held again. */
		if (!sock_owned_by_user(sk))
			break;
	}
	finish_wait(&sk->sk_lock.wq, &wait);
}
2965 
/*
 * Process every skb queued on the socket backlog through sk_backlog_rcv().
 *
 * Per the sparse annotations, sk_lock.slock is held on entry and on
 * return; it is released while a detached batch is being processed, so
 * new skbs may be queued meanwhile and are picked up by the outer loop.
 */
void __release_sock(struct sock *sk)
	__releases(&sk->sk_lock.slock)
	__acquires(&sk->sk_lock.slock)
{
	struct sk_buff *skb, *next;

	while ((skb = sk->sk_backlog.head) != NULL) {
		/* Detach the whole list before dropping the lock. */
		sk->sk_backlog.head = sk->sk_backlog.tail = NULL;

		spin_unlock_bh(&sk->sk_lock.slock);

		do {
			next = skb->next;
			prefetch(next);
			DEBUG_NET_WARN_ON_ONCE(skb_dst_is_noref(skb));
			skb_mark_not_on_list(skb);
			sk_backlog_rcv(sk, skb);

			cond_resched();

			skb = next;
		} while (skb != NULL);

		spin_lock_bh(&sk->sk_lock.slock);
	}

	/*
	 * Doing the zeroing here guarantees we can not loop forever
	 * while a wild producer attempts to flood us.
	 */
	sk->sk_backlog.len = 0;
}
2998 
/* Take sk_lock.slock and run the backlog through __release_sock(). */
void __sk_flush_backlog(struct sock *sk)
{
	spin_lock_bh(&sk->sk_lock.slock);
	__release_sock(sk);
	spin_unlock_bh(&sk->sk_lock.slock);
}
EXPORT_SYMBOL_GPL(__sk_flush_backlog);
3006 
/**
 * sk_wait_data - wait for data to arrive at sk_receive_queue
 * @sk:    sock to wait on
 * @timeo: for how long
 * @skb:   last skb seen on sk_receive_queue
 *
 * Now socket state including sk->sk_err is changed only under lock,
 * hence we may omit checks after joining wait queue.
 * We check receive queue before schedule() only as optimization;
 * it is very likely that release_sock() added new data.
 *
 * The wait condition is that the tail of sk_receive_queue differs
 * from @skb.
 *
 * Return: the value produced by sk_wait_event().
 */
int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
{
	DEFINE_WAIT_FUNC(wait, woken_wake_function);
	int rc;

	add_wait_queue(sk_sleep(sk), &wait);
	/* SOCKWQ_ASYNC_WAITDATA is set only for the duration of the wait. */
	sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
	rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
	sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
	remove_wait_queue(sk_sleep(sk), &wait);
	return rc;
}
EXPORT_SYMBOL(sk_wait_data);
3031 
/**
 *	__sk_mem_raise_allocated - increase memory_allocated
 *	@sk: socket
 *	@size: memory size to allocate
 *	@amt: pages to allocate
 *	@kind: allocation type
 *
 *	Similar to __sk_mem_schedule(), but does not update sk_forward_alloc
 *
 *	Return: 1 if the allocation is accepted (the @amt pages stay
 *	accounted), 0 if it is suppressed, in which case the protocol
 *	accounting and any successful memcg charge are rolled back here.
 */
int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
{
	bool memcg_charge = mem_cgroup_sockets_enabled && sk->sk_memcg;
	struct proto *prot = sk->sk_prot;
	bool charged = true;
	long allocated;

	/* Account first, then decide whether the result is acceptable. */
	sk_memory_allocated_add(sk, amt);
	allocated = sk_memory_allocated(sk);
	if (memcg_charge &&
	    !(charged = mem_cgroup_charge_skmem(sk->sk_memcg, amt,
						gfp_memcg_charge())))
		goto suppress_allocation;

	/* Under limit. */
	if (allocated <= sk_prot_mem_limits(sk, 0)) {
		sk_leave_memory_pressure(sk);
		return 1;
	}

	/* Under pressure. */
	if (allocated > sk_prot_mem_limits(sk, 1))
		sk_enter_memory_pressure(sk);

	/* Over hard limit. */
	if (allocated > sk_prot_mem_limits(sk, 2))
		goto suppress_allocation;

	/* guarantee minimum buffer size under pressure */
	if (kind == SK_MEM_RECV) {
		if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot))
			return 1;

	} else { /* SK_MEM_SEND */
		int wmem0 = sk_get_wmem0(sk, prot);

		if (sk->sk_type == SOCK_STREAM) {
			if (sk->sk_wmem_queued < wmem0)
				return 1;
		} else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) {
				return 1;
		}
	}

	if (sk_has_memory_pressure(sk)) {
		u64 alloc;

		if (!sk_under_memory_pressure(sk))
			return 1;
		/* Accept if the hard limit exceeds the socket count times
		 * the number of pages this socket is currently using.
		 */
		alloc = sk_sockets_allocated_read_positive(sk);
		if (sk_prot_mem_limits(sk, 2) > alloc *
		    sk_mem_pages(sk->sk_wmem_queued +
				 atomic_read(&sk->sk_rmem_alloc) +
				 sk->sk_forward_alloc))
			return 1;
	}

suppress_allocation:

	if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
		sk_stream_moderate_sndbuf(sk);

		/* Fail only if socket is _under_ its sndbuf.
		 * In this case we cannot block, so that we have to fail.
		 */
		if (sk->sk_wmem_queued + size >= sk->sk_sndbuf) {
			/* Force charge with __GFP_NOFAIL */
			if (memcg_charge && !charged) {
				mem_cgroup_charge_skmem(sk->sk_memcg, amt,
					gfp_memcg_charge() | __GFP_NOFAIL);
			}
			return 1;
		}
	}

	if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged))
		trace_sock_exceed_buf_limit(sk, prot, allocated, kind);

	/* Roll back what was accounted at the top of the function. */
	sk_memory_allocated_sub(sk, amt);

	if (memcg_charge && charged)
		mem_cgroup_uncharge_skmem(sk->sk_memcg, amt);

	return 0;
}
3126 
3127 /**
3128  *	__sk_mem_schedule - increase sk_forward_alloc and memory_allocated
3129  *	@sk: socket
3130  *	@size: memory size to allocate
3131  *	@kind: allocation type
3132  *
3133  *	If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
3134  *	rmem allocation. This function assumes that protocols which have
3135  *	memory_pressure use sk_wmem_queued as write buffer accounting.
3136  */
3137 int __sk_mem_schedule(struct sock *sk, int size, int kind)
3138 {
3139 	int ret, amt = sk_mem_pages(size);
3140 
3141 	sk->sk_forward_alloc += amt << PAGE_SHIFT;
3142 	ret = __sk_mem_raise_allocated(sk, size, amt, kind);
3143 	if (!ret)
3144 		sk->sk_forward_alloc -= amt << PAGE_SHIFT;
3145 	return ret;
3146 }
3147 EXPORT_SYMBOL(__sk_mem_schedule);
3148 
/**
 *	__sk_mem_reduce_allocated - reclaim memory_allocated
 *	@sk: socket
 *	@amount: number of quanta
 *
 *	Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
 *
 *	The memcg, when one is attached, is uncharged by the same amount, and
 *	memory pressure is left once the allocation falls below limit 0.
 */
void __sk_mem_reduce_allocated(struct sock *sk, int amount)
{
	sk_memory_allocated_sub(sk, amount);

	if (mem_cgroup_sockets_enabled && sk->sk_memcg)
		mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);

	if (sk_under_memory_pressure(sk) &&
	    (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
		sk_leave_memory_pressure(sk);
}
3167 
3168 /**
3169  *	__sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
3170  *	@sk: socket
3171  *	@amount: number of bytes (rounded down to a PAGE_SIZE multiple)
3172  */
3173 void __sk_mem_reclaim(struct sock *sk, int amount)
3174 {
3175 	amount >>= PAGE_SHIFT;
3176 	sk->sk_forward_alloc -= amount << PAGE_SHIFT;
3177 	__sk_mem_reduce_allocated(sk, amount);
3178 }
3179 EXPORT_SYMBOL(__sk_mem_reclaim);
3180 
/* Store @val in sk->sk_peek_off with WRITE_ONCE(); always returns 0. */
int sk_set_peek_off(struct sock *sk, int val)
{
	WRITE_ONCE(sk->sk_peek_off, val);
	return 0;
}
EXPORT_SYMBOL_GPL(sk_set_peek_off);
3187 
3188 /*
3189  * Set of default routines for initialising struct proto_ops when
3190  * the protocol does not support a particular function. In certain
3191  * cases where it makes no sense for a protocol to have a "do nothing"
3192  * function, some default processing is provided.
3193  */
3194 
/* Stub bind handler: ignores its arguments and returns -EOPNOTSUPP. */
int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_bind);
3200 
/* Stub connect handler: ignores its arguments and returns -EOPNOTSUPP. */
int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
		    int len, int flags)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_connect);
3207 
/* Stub socketpair handler: ignores its arguments and returns -EOPNOTSUPP. */
int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_socketpair);
3213 
/* Stub accept handler: ignores its arguments and returns -EOPNOTSUPP. */
int sock_no_accept(struct socket *sock, struct socket *newsock, int flags,
		   bool kern)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_accept);
3220 
/* Stub getname handler: ignores its arguments and returns -EOPNOTSUPP. */
int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
		    int peer)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_getname);
3227 
/* Stub ioctl handler: ignores its arguments and returns -EOPNOTSUPP. */
int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_ioctl);
3233 
/* Stub listen handler: ignores its arguments and returns -EOPNOTSUPP. */
int sock_no_listen(struct socket *sock, int backlog)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_listen);
3239 
/* Stub shutdown handler: ignores its arguments and returns -EOPNOTSUPP. */
int sock_no_shutdown(struct socket *sock, int how)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_shutdown);
3245 
/* Stub sendmsg handler: ignores its arguments and returns -EOPNOTSUPP. */
int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_sendmsg);
3251 
/*
 * Stub sendmsg handler taking a struct sock rather than a struct socket:
 * ignores its arguments and returns -EOPNOTSUPP.
 */
int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_sendmsg_locked);
3257 
/* Stub recvmsg handler: ignores its arguments and returns -EOPNOTSUPP. */
int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
		    int flags)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_recvmsg);
3264 
/*
 * Stub mmap handler: ignores its arguments. Unlike the other sock_no_*()
 * stubs it returns -ENODEV rather than -EOPNOTSUPP.
 */
int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
{
	/* Mirror missing mmap method error code */
	return -ENODEV;
}
EXPORT_SYMBOL(sock_no_mmap);
3271 
3272 /*
3273  * When a file is received (via SCM_RIGHTS, etc), we must bump the
3274  * various sock-based usage counts.
3275  */
3276 void __receive_sock(struct file *file)
3277 {
3278 	struct socket *sock;
3279 
3280 	sock = sock_from_file(file);
3281 	if (sock) {
3282 		sock_update_netprioidx(&sock->sk->sk_cgrp_data);
3283 		sock_update_classid(&sock->sk->sk_cgrp_data);
3284 	}
3285 }
3286 
3287 /*
3288  *	Default Socket Callbacks
3289  */
3290 
/*
 * Wake every interruptible sleeper on the socket's wait queue, if there
 * is one. sk_wq is looked up under rcu_read_lock().
 */
static void sock_def_wakeup(struct sock *sk)
{
	struct socket_wq *wq;

	rcu_read_lock();
	wq = rcu_dereference(sk->sk_wq);
	if (skwq_has_sleeper(wq))
		wake_up_interruptible_all(&wq->wait);
	rcu_read_unlock();
}
3301 
/*
 * Wake pollers with EPOLLERR and send the async POLL_ERR notification.
 * sk_wq is looked up under rcu_read_lock().
 */
static void sock_def_error_report(struct sock *sk)
{
	struct socket_wq *wq;

	rcu_read_lock();
	wq = rcu_dereference(sk->sk_wq);
	if (skwq_has_sleeper(wq))
		wake_up_interruptible_poll(&wq->wait, EPOLLERR);
	sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
	rcu_read_unlock();
}
3313 
/*
 * Emit the sk_data_ready tracepoint, wake pollers waiting for readable
 * events and send the async POLL_IN notification. sk_wq is looked up
 * under rcu_read_lock().
 */
void sock_def_readable(struct sock *sk)
{
	struct socket_wq *wq;

	trace_sk_data_ready(sk);

	rcu_read_lock();
	wq = rcu_dereference(sk->sk_wq);
	if (skwq_has_sleeper(wq))
		wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
						EPOLLRDNORM | EPOLLRDBAND);
	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
	rcu_read_unlock();
}
3328 
3329 static void sock_def_write_space(struct sock *sk)
3330 {
3331 	struct socket_wq *wq;
3332 
3333 	rcu_read_lock();
3334 
3335 	/* Do not wake up a writer until he can make "significant"
3336 	 * progress.  --DaveM
3337 	 */
3338 	if (sock_writeable(sk)) {
3339 		wq = rcu_dereference(sk->sk_wq);
3340 		if (skwq_has_sleeper(wq))
3341 			wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
3342 						EPOLLWRNORM | EPOLLWRBAND);
3343 
3344 		/* Should agree with poll, otherwise some programs break */
3345 		sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
3346 	}
3347 
3348 	rcu_read_unlock();
3349 }
3350 
3351 /* An optimised version of sock_def_write_space(), should only be called
3352  * for SOCK_RCU_FREE sockets under RCU read section and after putting
3353  * ->sk_wmem_alloc.
3354  */
3355 static void sock_def_write_space_wfree(struct sock *sk)
3356 {
3357 	/* Do not wake up a writer until he can make "significant"
3358 	 * progress.  --DaveM
3359 	 */
3360 	if (sock_writeable(sk)) {
3361 		struct socket_wq *wq = rcu_dereference(sk->sk_wq);
3362 
3363 		/* rely on refcount_sub from sock_wfree() */
3364 		smp_mb__after_atomic();
3365 		if (wq && waitqueue_active(&wq->wait))
3366 			wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
3367 						EPOLLWRNORM | EPOLLWRBAND);
3368 
3369 		/* Should agree with poll, otherwise some programs break */
3370 		sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
3371 	}
3372 }
3373 
/* Default sk_destruct callback: intentionally does nothing. */
static void sock_def_destruct(struct sock *sk)
{
	/* No default teardown work. */
}
3377 
3378 void sk_send_sigurg(struct sock *sk)
3379 {
3380 	if (sk->sk_socket && sk->sk_socket->file)
3381 		if (send_sigurg(&sk->sk_socket->file->f_owner))
3382 			sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
3383 }
3384 EXPORT_SYMBOL(sk_send_sigurg);
3385 
/*
 * (Re)arm @timer to fire at @expires.  A reference on @sk is taken only
 * when mod_timer() returns 0.
 * NOTE(review): per the timer API a zero return means the timer was not
 * already pending, so each pending timer accounts for one sock reference.
 */
void sk_reset_timer(struct sock *sk, struct timer_list *timer,
		    unsigned long expires)
{
	if (mod_timer(timer, expires))
		return;

	sock_hold(sk);
}
3392 EXPORT_SYMBOL(sk_reset_timer);
3393 
/*
 * Cancel @timer.  When del_timer() returns nonzero, the sock reference
 * associated with the timer is dropped via __sock_put().
 */
void sk_stop_timer(struct sock *sk, struct timer_list *timer)
{
	if (!del_timer(timer))
		return;

	__sock_put(sk);
}
3399 EXPORT_SYMBOL(sk_stop_timer);
3400 
/*
 * Same as sk_stop_timer() but cancels through del_timer_sync().  When
 * that returns nonzero, the sock reference associated with the timer is
 * dropped via __sock_put().
 */
void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer)
{
	if (!del_timer_sync(timer))
		return;

	__sock_put(sk);
}
3406 EXPORT_SYMBOL(sk_stop_timer_sync);
3407 
3408 void sock_init_data_uid(struct socket *sock, struct sock *sk, kuid_t uid)
3409 {
3410 	sk_init_common(sk);
3411 	sk->sk_send_head	=	NULL;
3412 
3413 	timer_setup(&sk->sk_timer, NULL, 0);
3414 
3415 	sk->sk_allocation	=	GFP_KERNEL;
3416 	sk->sk_rcvbuf		=	READ_ONCE(sysctl_rmem_default);
3417 	sk->sk_sndbuf		=	READ_ONCE(sysctl_wmem_default);
3418 	sk->sk_state		=	TCP_CLOSE;
3419 	sk->sk_use_task_frag	=	true;
3420 	sk_set_socket(sk, sock);
3421 
3422 	sock_set_flag(sk, SOCK_ZAPPED);
3423 
3424 	if (sock) {
3425 		sk->sk_type	=	sock->type;
3426 		RCU_INIT_POINTER(sk->sk_wq, &sock->wq);
3427 		sock->sk	=	sk;
3428 	} else {
3429 		RCU_INIT_POINTER(sk->sk_wq, NULL);
3430 	}
3431 	sk->sk_uid	=	uid;
3432 
3433 	rwlock_init(&sk->sk_callback_lock);
3434 	if (sk->sk_kern_sock)
3435 		lockdep_set_class_and_name(
3436 			&sk->sk_callback_lock,
3437 			af_kern_callback_keys + sk->sk_family,
3438 			af_family_kern_clock_key_strings[sk->sk_family]);
3439 	else
3440 		lockdep_set_class_and_name(
3441 			&sk->sk_callback_lock,
3442 			af_callback_keys + sk->sk_family,
3443 			af_family_clock_key_strings[sk->sk_family]);
3444 
3445 	sk->sk_state_change	=	sock_def_wakeup;
3446 	sk->sk_data_ready	=	sock_def_readable;
3447 	sk->sk_write_space	=	sock_def_write_space;
3448 	sk->sk_error_report	=	sock_def_error_report;
3449 	sk->sk_destruct		=	sock_def_destruct;
3450 
3451 	sk->sk_frag.page	=	NULL;
3452 	sk->sk_frag.offset	=	0;
3453 	sk->sk_peek_off		=	-1;
3454 
3455 	sk->sk_peer_pid 	=	NULL;
3456 	sk->sk_peer_cred	=	NULL;
3457 	spin_lock_init(&sk->sk_peer_lock);
3458 
3459 	sk->sk_write_pending	=	0;
3460 	sk->sk_rcvlowat		=	1;
3461 	sk->sk_rcvtimeo		=	MAX_SCHEDULE_TIMEOUT;
3462 	sk->sk_sndtimeo		=	MAX_SCHEDULE_TIMEOUT;
3463 
3464 	sk->sk_stamp = SK_DEFAULT_STAMP;
3465 #if BITS_PER_LONG==32
3466 	seqlock_init(&sk->sk_stamp_seq);
3467 #endif
3468 	atomic_set(&sk->sk_zckey, 0);
3469 
3470 #ifdef CONFIG_NET_RX_BUSY_POLL
3471 	sk->sk_napi_id		=	0;
3472 	sk->sk_ll_usec		=	READ_ONCE(sysctl_net_busy_read);
3473 #endif
3474 
3475 	sk->sk_max_pacing_rate = ~0UL;
3476 	sk->sk_pacing_rate = ~0UL;
3477 	WRITE_ONCE(sk->sk_pacing_shift, 10);
3478 	sk->sk_incoming_cpu = -1;
3479 
3480 	sk_rx_queue_clear(sk);
3481 	/*
3482 	 * Before updating sk_refcnt, we must commit prior changes to memory
3483 	 * (Documentation/RCU/rculist_nulls.rst for details)
3484 	 */
3485 	smp_wmb();
3486 	refcount_set(&sk->sk_refcnt, 1);
3487 	atomic_set(&sk->sk_drops, 0);
3488 }
3489 EXPORT_SYMBOL(sock_init_data_uid);
3490 
3491 void sock_init_data(struct socket *sock, struct sock *sk)
3492 {
3493 	kuid_t uid = sock ?
3494 		SOCK_INODE(sock)->i_uid :
3495 		make_kuid(sock_net(sk)->user_ns, 0);
3496 
3497 	sock_init_data_uid(sock, sk, uid);
3498 }
3499 EXPORT_SYMBOL(sock_init_data);
3500 
3501 void lock_sock_nested(struct sock *sk, int subclass)
3502 {
3503 	/* The sk_lock has mutex_lock() semantics here. */
3504 	mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
3505 
3506 	might_sleep();
3507 	spin_lock_bh(&sk->sk_lock.slock);
3508 	if (sock_owned_by_user_nocheck(sk))
3509 		__lock_sock(sk);
3510 	sk->sk_lock.owned = 1;
3511 	spin_unlock_bh(&sk->sk_lock.slock);
3512 }
3513 EXPORT_SYMBOL(lock_sock_nested);
3514 
3515 void release_sock(struct sock *sk)
3516 {
3517 	spin_lock_bh(&sk->sk_lock.slock);
3518 	if (sk->sk_backlog.tail)
3519 		__release_sock(sk);
3520 
3521 	/* Warning : release_cb() might need to release sk ownership,
3522 	 * ie call sock_release_ownership(sk) before us.
3523 	 */
3524 	if (sk->sk_prot->release_cb)
3525 		sk->sk_prot->release_cb(sk);
3526 
3527 	sock_release_ownership(sk);
3528 	if (waitqueue_active(&sk->sk_lock.wq))
3529 		wake_up(&sk->sk_lock.wq);
3530 	spin_unlock_bh(&sk->sk_lock.slock);
3531 }
3532 EXPORT_SYMBOL(release_sock);
3533 
3534 bool __lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock)
3535 {
3536 	might_sleep();
3537 	spin_lock_bh(&sk->sk_lock.slock);
3538 
3539 	if (!sock_owned_by_user_nocheck(sk)) {
3540 		/*
3541 		 * Fast path return with bottom halves disabled and
3542 		 * sock::sk_lock.slock held.
3543 		 *
3544 		 * The 'mutex' is not contended and holding
3545 		 * sock::sk_lock.slock prevents all other lockers to
3546 		 * proceed so the corresponding unlock_sock_fast() can
3547 		 * avoid the slow path of release_sock() completely and
3548 		 * just release slock.
3549 		 *
3550 		 * From a semantical POV this is equivalent to 'acquiring'
3551 		 * the 'mutex', hence the corresponding lockdep
3552 		 * mutex_release() has to happen in the fast path of
3553 		 * unlock_sock_fast().
3554 		 */
3555 		return false;
3556 	}
3557 
3558 	__lock_sock(sk);
3559 	sk->sk_lock.owned = 1;
3560 	__acquire(&sk->sk_lock.slock);
3561 	spin_unlock_bh(&sk->sk_lock.slock);
3562 	return true;
3563 }
3564 EXPORT_SYMBOL(__lock_sock_fast);
3565 
3566 int sock_gettstamp(struct socket *sock, void __user *userstamp,
3567 		   bool timeval, bool time32)
3568 {
3569 	struct sock *sk = sock->sk;
3570 	struct timespec64 ts;
3571 
3572 	sock_enable_timestamp(sk, SOCK_TIMESTAMP);
3573 	ts = ktime_to_timespec64(sock_read_timestamp(sk));
3574 	if (ts.tv_sec == -1)
3575 		return -ENOENT;
3576 	if (ts.tv_sec == 0) {
3577 		ktime_t kt = ktime_get_real();
3578 		sock_write_timestamp(sk, kt);
3579 		ts = ktime_to_timespec64(kt);
3580 	}
3581 
3582 	if (timeval)
3583 		ts.tv_nsec /= 1000;
3584 
3585 #ifdef CONFIG_COMPAT_32BIT_TIME
3586 	if (time32)
3587 		return put_old_timespec32(&ts, userstamp);
3588 #endif
3589 #ifdef CONFIG_SPARC64
3590 	/* beware of padding in sparc64 timeval */
3591 	if (timeval && !in_compat_syscall()) {
3592 		struct __kernel_old_timeval __user tv = {
3593 			.tv_sec = ts.tv_sec,
3594 			.tv_usec = ts.tv_nsec,
3595 		};
3596 		if (copy_to_user(userstamp, &tv, sizeof(tv)))
3597 			return -EFAULT;
3598 		return 0;
3599 	}
3600 #endif
3601 	return put_timespec64(&ts, userstamp);
3602 }
3603 EXPORT_SYMBOL(sock_gettstamp);
3604 
3605 void sock_enable_timestamp(struct sock *sk, enum sock_flags flag)
3606 {
3607 	if (!sock_flag(sk, flag)) {
3608 		unsigned long previous_flags = sk->sk_flags;
3609 
3610 		sock_set_flag(sk, flag);
3611 		/*
3612 		 * we just set one of the two flags which require net
3613 		 * time stamping, but time stamping might have been on
3614 		 * already because of the other one
3615 		 */
3616 		if (sock_needs_netstamp(sk) &&
3617 		    !(previous_flags & SK_FLAGS_TIMESTAMP))
3618 			net_enable_timestamp();
3619 	}
3620 }
3621 
3622 int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
3623 		       int level, int type)
3624 {
3625 	struct sock_exterr_skb *serr;
3626 	struct sk_buff *skb;
3627 	int copied, err;
3628 
3629 	err = -EAGAIN;
3630 	skb = sock_dequeue_err_skb(sk);
3631 	if (skb == NULL)
3632 		goto out;
3633 
3634 	copied = skb->len;
3635 	if (copied > len) {
3636 		msg->msg_flags |= MSG_TRUNC;
3637 		copied = len;
3638 	}
3639 	err = skb_copy_datagram_msg(skb, 0, msg, copied);
3640 	if (err)
3641 		goto out_free_skb;
3642 
3643 	sock_recv_timestamp(msg, sk, skb);
3644 
3645 	serr = SKB_EXT_ERR(skb);
3646 	put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
3647 
3648 	msg->msg_flags |= MSG_ERRQUEUE;
3649 	err = copied;
3650 
3651 out_free_skb:
3652 	kfree_skb(skb);
3653 out:
3654 	return err;
3655 }
3656 EXPORT_SYMBOL(sock_recv_errqueue);
3657 
3658 /*
3659  *	Get a socket option on an socket.
3660  *
3661  *	FIX: POSIX 1003.1g is very ambiguous here. It states that
3662  *	asynchronous errors should be reported by getsockopt. We assume
3663  *	this means if you specify SO_ERROR (otherwise whats the point of it).
3664  */
3665 int sock_common_getsockopt(struct socket *sock, int level, int optname,
3666 			   char __user *optval, int __user *optlen)
3667 {
3668 	struct sock *sk = sock->sk;
3669 
3670 	/* IPV6_ADDRFORM can change sk->sk_prot under us. */
3671 	return READ_ONCE(sk->sk_prot)->getsockopt(sk, level, optname, optval, optlen);
3672 }
3673 EXPORT_SYMBOL(sock_common_getsockopt);
3674 
3675 int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
3676 			int flags)
3677 {
3678 	struct sock *sk = sock->sk;
3679 	int addr_len = 0;
3680 	int err;
3681 
3682 	err = sk->sk_prot->recvmsg(sk, msg, size, flags, &addr_len);
3683 	if (err >= 0)
3684 		msg->msg_namelen = addr_len;
3685 	return err;
3686 }
3687 EXPORT_SYMBOL(sock_common_recvmsg);
3688 
3689 /*
3690  *	Set socket options on an inet socket.
3691  */
3692 int sock_common_setsockopt(struct socket *sock, int level, int optname,
3693 			   sockptr_t optval, unsigned int optlen)
3694 {
3695 	struct sock *sk = sock->sk;
3696 
3697 	/* IPV6_ADDRFORM can change sk->sk_prot under us. */
3698 	return READ_ONCE(sk->sk_prot)->setsockopt(sk, level, optname, optval, optlen);
3699 }
3700 EXPORT_SYMBOL(sock_common_setsockopt);
3701 
3702 void sk_common_release(struct sock *sk)
3703 {
3704 	if (sk->sk_prot->destroy)
3705 		sk->sk_prot->destroy(sk);
3706 
3707 	/*
3708 	 * Observation: when sk_common_release is called, processes have
3709 	 * no access to socket. But net still has.
3710 	 * Step one, detach it from networking:
3711 	 *
3712 	 * A. Remove from hash tables.
3713 	 */
3714 
3715 	sk->sk_prot->unhash(sk);
3716 
3717 	/*
3718 	 * In this point socket cannot receive new packets, but it is possible
3719 	 * that some packets are in flight because some CPU runs receiver and
3720 	 * did hash table lookup before we unhashed socket. They will achieve
3721 	 * receive queue and will be purged by socket destructor.
3722 	 *
3723 	 * Also we still have packets pending on receive queue and probably,
3724 	 * our own packets waiting in device queues. sock_destroy will drain
3725 	 * receive queue, but transmitted packets will delay socket destruction
3726 	 * until the last reference will be released.
3727 	 */
3728 
3729 	sock_orphan(sk);
3730 
3731 	xfrm_sk_free_policy(sk);
3732 
3733 	sock_put(sk);
3734 }
3735 EXPORT_SYMBOL(sk_common_release);
3736 
3737 void sk_get_meminfo(const struct sock *sk, u32 *mem)
3738 {
3739 	memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);
3740 
3741 	mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
3742 	mem[SK_MEMINFO_RCVBUF] = READ_ONCE(sk->sk_rcvbuf);
3743 	mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
3744 	mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf);
3745 	mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc;
3746 	mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued);
3747 	mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
3748 	mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len);
3749 	mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
3750 }
3751 
3752 #ifdef CONFIG_PROC_FS
3753 static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
3754 
3755 int sock_prot_inuse_get(struct net *net, struct proto *prot)
3756 {
3757 	int cpu, idx = prot->inuse_idx;
3758 	int res = 0;
3759 
3760 	for_each_possible_cpu(cpu)
3761 		res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx];
3762 
3763 	return res >= 0 ? res : 0;
3764 }
3765 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
3766 
3767 int sock_inuse_get(struct net *net)
3768 {
3769 	int cpu, res = 0;
3770 
3771 	for_each_possible_cpu(cpu)
3772 		res += per_cpu_ptr(net->core.prot_inuse, cpu)->all;
3773 
3774 	return res;
3775 }
3776 
3777 EXPORT_SYMBOL_GPL(sock_inuse_get);
3778 
3779 static int __net_init sock_inuse_init_net(struct net *net)
3780 {
3781 	net->core.prot_inuse = alloc_percpu(struct prot_inuse);
3782 	if (net->core.prot_inuse == NULL)
3783 		return -ENOMEM;
3784 	return 0;
3785 }
3786 
3787 static void __net_exit sock_inuse_exit_net(struct net *net)
3788 {
3789 	free_percpu(net->core.prot_inuse);
3790 }
3791 
3792 static struct pernet_operations net_inuse_ops = {
3793 	.init = sock_inuse_init_net,
3794 	.exit = sock_inuse_exit_net,
3795 };
3796 
3797 static __init int net_inuse_init(void)
3798 {
3799 	if (register_pernet_subsys(&net_inuse_ops))
3800 		panic("Cannot initialize net inuse counters");
3801 
3802 	return 0;
3803 }
3804 
3805 core_initcall(net_inuse_init);
3806 
3807 static int assign_proto_idx(struct proto *prot)
3808 {
3809 	prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
3810 
3811 	if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
3812 		pr_err("PROTO_INUSE_NR exhausted\n");
3813 		return -ENOSPC;
3814 	}
3815 
3816 	set_bit(prot->inuse_idx, proto_inuse_idx);
3817 	return 0;
3818 }
3819 
3820 static void release_proto_idx(struct proto *prot)
3821 {
3822 	if (prot->inuse_idx != PROTO_INUSE_NR - 1)
3823 		clear_bit(prot->inuse_idx, proto_inuse_idx);
3824 }
3825 #else
/* !CONFIG_PROC_FS: there is no in-use index to assign, always succeeds. */
static inline int assign_proto_idx(struct proto *prot)
{
	return 0;
}
3830 
/* !CONFIG_PROC_FS: there is no in-use index to release. */
static inline void release_proto_idx(struct proto *prot)
{
	/* Nothing to do. */
}
3834 
3835 #endif
3836 
3837 static void tw_prot_cleanup(struct timewait_sock_ops *twsk_prot)
3838 {
3839 	if (!twsk_prot)
3840 		return;
3841 	kfree(twsk_prot->twsk_slab_name);
3842 	twsk_prot->twsk_slab_name = NULL;
3843 	kmem_cache_destroy(twsk_prot->twsk_slab);
3844 	twsk_prot->twsk_slab = NULL;
3845 }
3846 
3847 static int tw_prot_init(const struct proto *prot)
3848 {
3849 	struct timewait_sock_ops *twsk_prot = prot->twsk_prot;
3850 
3851 	if (!twsk_prot)
3852 		return 0;
3853 
3854 	twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s",
3855 					      prot->name);
3856 	if (!twsk_prot->twsk_slab_name)
3857 		return -ENOMEM;
3858 
3859 	twsk_prot->twsk_slab =
3860 		kmem_cache_create(twsk_prot->twsk_slab_name,
3861 				  twsk_prot->twsk_obj_size, 0,
3862 				  SLAB_ACCOUNT | prot->slab_flags,
3863 				  NULL);
3864 	if (!twsk_prot->twsk_slab) {
3865 		pr_crit("%s: Can't create timewait sock SLAB cache!\n",
3866 			prot->name);
3867 		return -ENOMEM;
3868 	}
3869 
3870 	return 0;
3871 }
3872 
3873 static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
3874 {
3875 	if (!rsk_prot)
3876 		return;
3877 	kfree(rsk_prot->slab_name);
3878 	rsk_prot->slab_name = NULL;
3879 	kmem_cache_destroy(rsk_prot->slab);
3880 	rsk_prot->slab = NULL;
3881 }
3882 
3883 static int req_prot_init(const struct proto *prot)
3884 {
3885 	struct request_sock_ops *rsk_prot = prot->rsk_prot;
3886 
3887 	if (!rsk_prot)
3888 		return 0;
3889 
3890 	rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
3891 					prot->name);
3892 	if (!rsk_prot->slab_name)
3893 		return -ENOMEM;
3894 
3895 	rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
3896 					   rsk_prot->obj_size, 0,
3897 					   SLAB_ACCOUNT | prot->slab_flags,
3898 					   NULL);
3899 
3900 	if (!rsk_prot->slab) {
3901 		pr_crit("%s: Can't create request sock SLAB cache!\n",
3902 			prot->name);
3903 		return -ENOMEM;
3904 	}
3905 	return 0;
3906 }
3907 
3908 int proto_register(struct proto *prot, int alloc_slab)
3909 {
3910 	int ret = -ENOBUFS;
3911 
3912 	if (prot->memory_allocated && !prot->sysctl_mem) {
3913 		pr_err("%s: missing sysctl_mem\n", prot->name);
3914 		return -EINVAL;
3915 	}
3916 	if (prot->memory_allocated && !prot->per_cpu_fw_alloc) {
3917 		pr_err("%s: missing per_cpu_fw_alloc\n", prot->name);
3918 		return -EINVAL;
3919 	}
3920 	if (alloc_slab) {
3921 		prot->slab = kmem_cache_create_usercopy(prot->name,
3922 					prot->obj_size, 0,
3923 					SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT |
3924 					prot->slab_flags,
3925 					prot->useroffset, prot->usersize,
3926 					NULL);
3927 
3928 		if (prot->slab == NULL) {
3929 			pr_crit("%s: Can't create sock SLAB cache!\n",
3930 				prot->name);
3931 			goto out;
3932 		}
3933 
3934 		if (req_prot_init(prot))
3935 			goto out_free_request_sock_slab;
3936 
3937 		if (tw_prot_init(prot))
3938 			goto out_free_timewait_sock_slab;
3939 	}
3940 
3941 	mutex_lock(&proto_list_mutex);
3942 	ret = assign_proto_idx(prot);
3943 	if (ret) {
3944 		mutex_unlock(&proto_list_mutex);
3945 		goto out_free_timewait_sock_slab;
3946 	}
3947 	list_add(&prot->node, &proto_list);
3948 	mutex_unlock(&proto_list_mutex);
3949 	return ret;
3950 
3951 out_free_timewait_sock_slab:
3952 	if (alloc_slab)
3953 		tw_prot_cleanup(prot->twsk_prot);
3954 out_free_request_sock_slab:
3955 	if (alloc_slab) {
3956 		req_prot_cleanup(prot->rsk_prot);
3957 
3958 		kmem_cache_destroy(prot->slab);
3959 		prot->slab = NULL;
3960 	}
3961 out:
3962 	return ret;
3963 }
3964 EXPORT_SYMBOL(proto_register);
3965 
3966 void proto_unregister(struct proto *prot)
3967 {
3968 	mutex_lock(&proto_list_mutex);
3969 	release_proto_idx(prot);
3970 	list_del(&prot->node);
3971 	mutex_unlock(&proto_list_mutex);
3972 
3973 	kmem_cache_destroy(prot->slab);
3974 	prot->slab = NULL;
3975 
3976 	req_prot_cleanup(prot->rsk_prot);
3977 	tw_prot_cleanup(prot->twsk_prot);
3978 }
3979 EXPORT_SYMBOL(proto_unregister);
3980 
3981 int sock_load_diag_module(int family, int protocol)
3982 {
3983 	if (!protocol) {
3984 		if (!sock_is_registered(family))
3985 			return -ENOENT;
3986 
3987 		return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
3988 				      NETLINK_SOCK_DIAG, family);
3989 	}
3990 
3991 #ifdef CONFIG_INET
3992 	if (family == AF_INET &&
3993 	    protocol != IPPROTO_RAW &&
3994 	    protocol < MAX_INET_PROTOS &&
3995 	    !rcu_access_pointer(inet_protos[protocol]))
3996 		return -ENOENT;
3997 #endif
3998 
3999 	return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK,
4000 			      NETLINK_SOCK_DIAG, family, protocol);
4001 }
4002 EXPORT_SYMBOL(sock_load_diag_module);
4003 
4004 #ifdef CONFIG_PROC_FS
4005 static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
4006 	__acquires(proto_list_mutex)
4007 {
4008 	mutex_lock(&proto_list_mutex);
4009 	return seq_list_start_head(&proto_list, *pos);
4010 }
4011 
4012 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4013 {
4014 	return seq_list_next(v, &proto_list, pos);
4015 }
4016 
4017 static void proto_seq_stop(struct seq_file *seq, void *v)
4018 	__releases(proto_list_mutex)
4019 {
4020 	mutex_unlock(&proto_list_mutex);
4021 }
4022 
/* Map a method pointer to 'y' (non-NULL, implemented) or 'n' (NULL). */
static char proto_method_implemented(const void *method)
{
	return method ? 'y' : 'n';
}
4027 static long sock_prot_memory_allocated(struct proto *proto)
4028 {
4029 	return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
4030 }
4031 
4032 static const char *sock_prot_memory_pressure(struct proto *proto)
4033 {
4034 	return proto->memory_pressure != NULL ?
4035 	proto_memory_pressure(proto) ? "yes" : "no" : "NI";
4036 }
4037 
4038 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
4039 {
4040 
4041 	seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
4042 			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
4043 		   proto->name,
4044 		   proto->obj_size,
4045 		   sock_prot_inuse_get(seq_file_net(seq), proto),
4046 		   sock_prot_memory_allocated(proto),
4047 		   sock_prot_memory_pressure(proto),
4048 		   proto->max_header,
4049 		   proto->slab == NULL ? "no" : "yes",
4050 		   module_name(proto->owner),
4051 		   proto_method_implemented(proto->close),
4052 		   proto_method_implemented(proto->connect),
4053 		   proto_method_implemented(proto->disconnect),
4054 		   proto_method_implemented(proto->accept),
4055 		   proto_method_implemented(proto->ioctl),
4056 		   proto_method_implemented(proto->init),
4057 		   proto_method_implemented(proto->destroy),
4058 		   proto_method_implemented(proto->shutdown),
4059 		   proto_method_implemented(proto->setsockopt),
4060 		   proto_method_implemented(proto->getsockopt),
4061 		   proto_method_implemented(proto->sendmsg),
4062 		   proto_method_implemented(proto->recvmsg),
4063 		   proto_method_implemented(proto->bind),
4064 		   proto_method_implemented(proto->backlog_rcv),
4065 		   proto_method_implemented(proto->hash),
4066 		   proto_method_implemented(proto->unhash),
4067 		   proto_method_implemented(proto->get_port),
4068 		   proto_method_implemented(proto->enter_memory_pressure));
4069 }
4070 
4071 static int proto_seq_show(struct seq_file *seq, void *v)
4072 {
4073 	if (v == &proto_list)
4074 		seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
4075 			   "protocol",
4076 			   "size",
4077 			   "sockets",
4078 			   "memory",
4079 			   "press",
4080 			   "maxhdr",
4081 			   "slab",
4082 			   "module",
4083 			   "cl co di ac io in de sh ss gs se re bi br ha uh gp em\n");
4084 	else
4085 		proto_seq_printf(seq, list_entry(v, struct proto, node));
4086 	return 0;
4087 }
4088 
4089 static const struct seq_operations proto_seq_ops = {
4090 	.start  = proto_seq_start,
4091 	.next   = proto_seq_next,
4092 	.stop   = proto_seq_stop,
4093 	.show   = proto_seq_show,
4094 };
4095 
4096 static __net_init int proto_init_net(struct net *net)
4097 {
4098 	if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops,
4099 			sizeof(struct seq_net_private)))
4100 		return -ENOMEM;
4101 
4102 	return 0;
4103 }
4104 
4105 static __net_exit void proto_exit_net(struct net *net)
4106 {
4107 	remove_proc_entry("protocols", net->proc_net);
4108 }
4109 
4110 
4111 static __net_initdata struct pernet_operations proto_net_ops = {
4112 	.init = proto_init_net,
4113 	.exit = proto_exit_net,
4114 };
4115 
4116 static int __init proto_init(void)
4117 {
4118 	return register_pernet_subsys(&proto_net_ops);
4119 }
4120 
4121 subsys_initcall(proto_init);
4122 
4123 #endif /* PROC_FS */
4124 
4125 #ifdef CONFIG_NET_RX_BUSY_POLL
4126 bool sk_busy_loop_end(void *p, unsigned long start_time)
4127 {
4128 	struct sock *sk = p;
4129 
4130 	return !skb_queue_empty_lockless(&sk->sk_receive_queue) ||
4131 	       sk_busy_loop_timeout(sk, start_time);
4132 }
4133 EXPORT_SYMBOL(sk_busy_loop_end);
4134 #endif /* CONFIG_NET_RX_BUSY_POLL */
4135 
4136 int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len)
4137 {
4138 	if (!sk->sk_prot->bind_add)
4139 		return -EOPNOTSUPP;
4140 	return sk->sk_prot->bind_add(sk, addr, addr_len);
4141 }
4142 EXPORT_SYMBOL(sock_bind_add);
4143 
4144 /* Copy 'size' bytes from userspace and return `size` back to userspace */
4145 int sock_ioctl_inout(struct sock *sk, unsigned int cmd,
4146 		     void __user *arg, void *karg, size_t size)
4147 {
4148 	int ret;
4149 
4150 	if (copy_from_user(karg, arg, size))
4151 		return -EFAULT;
4152 
4153 	ret = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, karg);
4154 	if (ret)
4155 		return ret;
4156 
4157 	if (copy_to_user(arg, karg, size))
4158 		return -EFAULT;
4159 
4160 	return 0;
4161 }
4162 EXPORT_SYMBOL(sock_ioctl_inout);
4163 
4164 /* This is the most common ioctl prep function, where the result (4 bytes) is
4165  * copied back to userspace if the ioctl() returns successfully. No input is
4166  * copied from userspace as input argument.
4167  */
4168 static int sock_ioctl_out(struct sock *sk, unsigned int cmd, void __user *arg)
4169 {
4170 	int ret, karg = 0;
4171 
4172 	ret = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, &karg);
4173 	if (ret)
4174 		return ret;
4175 
4176 	return put_user(karg, (int __user *)arg);
4177 }
4178 
4179 /* A wrapper around sock ioctls, which copies the data from userspace
4180  * (depending on the protocol/ioctl), and copies back the result to userspace.
4181  * The main motivation for this function is to pass kernel memory to the
4182  * protocol ioctl callbacks, instead of userspace memory.
4183  */
4184 int sk_ioctl(struct sock *sk, unsigned int cmd, void __user *arg)
4185 {
4186 	int rc = 1;
4187 
4188 	if (sk->sk_type == SOCK_RAW && sk->sk_family == AF_INET)
4189 		rc = ipmr_sk_ioctl(sk, cmd, arg);
4190 	else if (sk->sk_type == SOCK_RAW && sk->sk_family == AF_INET6)
4191 		rc = ip6mr_sk_ioctl(sk, cmd, arg);
4192 	else if (sk_is_phonet(sk))
4193 		rc = phonet_sk_ioctl(sk, cmd, arg);
4194 
4195 	/* If ioctl was processed, returns its value */
4196 	if (rc <= 0)
4197 		return rc;
4198 
4199 	/* Otherwise call the default handler */
4200 	return sock_ioctl_out(sk, cmd, arg);
4201 }
4202 EXPORT_SYMBOL(sk_ioctl);
4203