xref: /openbmc/linux/net/core/sock.c (revision 30ff01ee)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * INET		An implementation of the TCP/IP protocol suite for the LINUX
4  *		operating system.  INET is implemented using the  BSD Socket
5  *		interface as the means of communication with the user level.
6  *
7  *		Generic socket support routines. Memory allocators, socket lock/release
8  *		handler for protocols to use and generic option handler.
9  *
10  * Authors:	Ross Biro
11  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *		Florian La Roche, <flla@stud.uni-sb.de>
13  *		Alan Cox, <A.Cox@swansea.ac.uk>
14  *
15  * Fixes:
16  *		Alan Cox	: 	Numerous verify_area() problems
17  *		Alan Cox	:	Connecting on a connecting socket
18  *					now returns an error for tcp.
19  *		Alan Cox	:	sock->protocol is set correctly.
20  *					and is not sometimes left as 0.
21  *		Alan Cox	:	connect handles icmp errors on a
22  *					connect properly. Unfortunately there
23  *					is a restart syscall nasty there. I
24  *					can't match BSD without hacking the C
25  *					library. Ideas urgently sought!
26  *		Alan Cox	:	Disallow bind() to addresses that are
27  *					not ours - especially broadcast ones!!
28  *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
29  *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
30  *					instead they leave that for the DESTROY timer.
31  *		Alan Cox	:	Clean up error flag in accept
32  *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
33  *					was buggy. Put a remove_sock() in the handler
34  *					for memory when we hit 0. Also altered the timer
35  *					code. The ACK stuff can wait and needs major
36  *					TCP layer surgery.
37  *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
38  *					and fixed timer/inet_bh race.
39  *		Alan Cox	:	Added zapped flag for TCP
40  *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
41  *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
42  *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
43  *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
44  *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
45  *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
46  *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
47  *	Pauline Middelink	:	identd support
48  *		Alan Cox	:	Fixed connect() taking signals I think.
49  *		Alan Cox	:	SO_LINGER supported
50  *		Alan Cox	:	Error reporting fixes
51  *		Anonymous	:	inet_create tidied up (sk->reuse setting)
52  *		Alan Cox	:	inet sockets don't set sk->type!
53  *		Alan Cox	:	Split socket option code
54  *		Alan Cox	:	Callbacks
55  *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
56  *		Alex		:	Removed restriction on inet fioctl
57  *		Alan Cox	:	Splitting INET from NET core
58  *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
59  *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
60  *		Alan Cox	:	Split IP from generic code
61  *		Alan Cox	:	New kfree_skbmem()
62  *		Alan Cox	:	Make SO_DEBUG superuser only.
63  *		Alan Cox	:	Allow anyone to clear SO_DEBUG
64  *					(compatibility fix)
65  *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
66  *		Alan Cox	:	Allocator for a socket is settable.
67  *		Alan Cox	:	SO_ERROR includes soft errors.
68  *		Alan Cox	:	Allow NULL arguments on some SO_ opts
69  *		Alan Cox	: 	Generic socket allocation to make hooks
70  *					easier (suggested by Craig Metz).
71  *		Michael Pall	:	SO_ERROR returns positive errno again
72  *		Steve Whitehouse:	Added default destructor to free
73  *					protocol private data.
74  *		Steve Whitehouse:	Added various other default routines
75  *					common to several socket families.
76  *		Chris Evans	:	Call suser() check last on F_SETOWN
77  *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
78  *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
79  *		Andi Kleen	:	Fix write_space callback
80  *		Chris Evans	:	Security fixes - signedness again
81  *		Arnaldo C. Melo :       cleanups, use skb_queue_purge
82  *
83  * To Fix:
84  */
85 
86 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
87 
88 #include <asm/unaligned.h>
89 #include <linux/capability.h>
90 #include <linux/errno.h>
91 #include <linux/errqueue.h>
92 #include <linux/types.h>
93 #include <linux/socket.h>
94 #include <linux/in.h>
95 #include <linux/kernel.h>
96 #include <linux/module.h>
97 #include <linux/proc_fs.h>
98 #include <linux/seq_file.h>
99 #include <linux/sched.h>
100 #include <linux/sched/mm.h>
101 #include <linux/timer.h>
102 #include <linux/string.h>
103 #include <linux/sockios.h>
104 #include <linux/net.h>
105 #include <linux/mm.h>
106 #include <linux/slab.h>
107 #include <linux/interrupt.h>
108 #include <linux/poll.h>
109 #include <linux/tcp.h>
110 #include <linux/init.h>
111 #include <linux/highmem.h>
112 #include <linux/user_namespace.h>
113 #include <linux/static_key.h>
114 #include <linux/memcontrol.h>
115 #include <linux/prefetch.h>
116 #include <linux/compat.h>
117 #include <linux/mroute.h>
118 #include <linux/mroute6.h>
119 #include <linux/icmpv6.h>
120 
121 #include <linux/uaccess.h>
122 
123 #include <linux/netdevice.h>
124 #include <net/protocol.h>
125 #include <linux/skbuff.h>
126 #include <net/net_namespace.h>
127 #include <net/request_sock.h>
128 #include <net/sock.h>
129 #include <linux/net_tstamp.h>
130 #include <net/xfrm.h>
131 #include <linux/ipsec.h>
132 #include <net/cls_cgroup.h>
133 #include <net/netprio_cgroup.h>
134 #include <linux/sock_diag.h>
135 
136 #include <linux/filter.h>
137 #include <net/sock_reuseport.h>
138 #include <net/bpf_sk_storage.h>
139 
140 #include <trace/events/sock.h>
141 
142 #include <net/tcp.h>
143 #include <net/busy_poll.h>
144 #include <net/phonet/phonet.h>
145 
146 #include <linux/ethtool.h>
147 
148 #include "dev.h"
149 
150 static DEFINE_MUTEX(proto_list_mutex);
151 static LIST_HEAD(proto_list);
152 
153 static void sock_def_write_space_wfree(struct sock *sk);
154 static void sock_def_write_space(struct sock *sk);
155 
156 /**
157  * sk_ns_capable - General socket capability test
158  * @sk: Socket to use a capability on or through
159  * @user_ns: The user namespace of the capability to use
160  * @cap: The capability to use
161  *
162  * Test to see if the opener of the socket had the capability @cap when
163  * the socket was created and if the current process has the capability
164  * @cap in the user namespace @user_ns.
165  */
166 bool sk_ns_capable(const struct sock *sk,
167 		   struct user_namespace *user_ns, int cap)
168 {
169 	return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
170 		ns_capable(user_ns, cap);
171 }
172 EXPORT_SYMBOL(sk_ns_capable);
173 
174 /**
175  * sk_capable - Socket global capability test
176  * @sk: Socket to use a capability on or through
177  * @cap: The global capability to use
178  *
179  * Test to see if the opener of the socket had the capability @cap when
180  * the socket was created and if the current process has the capability
181  * @cap in all user namespaces.
182  */
183 bool sk_capable(const struct sock *sk, int cap)
184 {
185 	return sk_ns_capable(sk, &init_user_ns, cap);
186 }
187 EXPORT_SYMBOL(sk_capable);
188 
189 /**
190  * sk_net_capable - Network namespace socket capability test
191  * @sk: Socket to use a capability on or through
192  * @cap: The capability to use
193  *
194  * Test to see if the opener of the socket had the capability @cap when the
195  * socket was created and if the current process has the capability @cap over
196  * the network namespace the socket is a member of.
197  */
198 bool sk_net_capable(const struct sock *sk, int cap)
199 {
200 	return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
201 }
202 EXPORT_SYMBOL(sk_net_capable);
203 
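/*
 * Usage sketch (editor's illustration, not part of the original file):
 * a privileged per-socket operation can be gated on sk_net_capable() so
 * that both the opener's capabilities at socket creation time and the
 * current task's capabilities in the socket's network namespace are
 * checked. The option handler and the my_proto_sk() accessor below are
 * hypothetical.
 *
 *	static int my_proto_set_privileged_opt(struct sock *sk, int val)
 *	{
 *		if (!sk_net_capable(sk, CAP_NET_ADMIN))
 *			return -EPERM;
 *		my_proto_sk(sk)->privileged_opt = val;	// hypothetical field
 *		return 0;
 *	}
 */
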
204 /*
205  * Each address family might have different locking rules, so we have
206  * one slock key per address family and separate keys for internal and
207  * userspace sockets.
208  */
209 static struct lock_class_key af_family_keys[AF_MAX];
210 static struct lock_class_key af_family_kern_keys[AF_MAX];
211 static struct lock_class_key af_family_slock_keys[AF_MAX];
212 static struct lock_class_key af_family_kern_slock_keys[AF_MAX];
213 
214 /*
215  * Make lock validator output more readable. (We pre-construct these
216  * strings at build time so that runtime initialization of socket
217  * locks is fast.)
218  */
219 
220 #define _sock_locks(x)						  \
221   x "AF_UNSPEC",	x "AF_UNIX"     ,	x "AF_INET"     , \
222   x "AF_AX25"  ,	x "AF_IPX"      ,	x "AF_APPLETALK", \
223   x "AF_NETROM",	x "AF_BRIDGE"   ,	x "AF_ATMPVC"   , \
224   x "AF_X25"   ,	x "AF_INET6"    ,	x "AF_ROSE"     , \
225   x "AF_DECnet",	x "AF_NETBEUI"  ,	x "AF_SECURITY" , \
226   x "AF_KEY"   ,	x "AF_NETLINK"  ,	x "AF_PACKET"   , \
227   x "AF_ASH"   ,	x "AF_ECONET"   ,	x "AF_ATMSVC"   , \
228   x "AF_RDS"   ,	x "AF_SNA"      ,	x "AF_IRDA"     , \
229   x "AF_PPPOX" ,	x "AF_WANPIPE"  ,	x "AF_LLC"      , \
230   x "27"       ,	x "28"          ,	x "AF_CAN"      , \
231   x "AF_TIPC"  ,	x "AF_BLUETOOTH",	x "IUCV"        , \
232   x "AF_RXRPC" ,	x "AF_ISDN"     ,	x "AF_PHONET"   , \
233   x "AF_IEEE802154",	x "AF_CAIF"	,	x "AF_ALG"      , \
234   x "AF_NFC"   ,	x "AF_VSOCK"    ,	x "AF_KCM"      , \
235   x "AF_QIPCRTR",	x "AF_SMC"	,	x "AF_XDP"	, \
236   x "AF_MCTP"  , \
237   x "AF_MAX"
238 
239 static const char *const af_family_key_strings[AF_MAX+1] = {
240 	_sock_locks("sk_lock-")
241 };
242 static const char *const af_family_slock_key_strings[AF_MAX+1] = {
243 	_sock_locks("slock-")
244 };
245 static const char *const af_family_clock_key_strings[AF_MAX+1] = {
246 	_sock_locks("clock-")
247 };
248 
249 static const char *const af_family_kern_key_strings[AF_MAX+1] = {
250 	_sock_locks("k-sk_lock-")
251 };
252 static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
253 	_sock_locks("k-slock-")
254 };
255 static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
256 	_sock_locks("k-clock-")
257 };
258 static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
259 	_sock_locks("rlock-")
260 };
261 static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
262 	_sock_locks("wlock-")
263 };
264 static const char *const af_family_elock_key_strings[AF_MAX+1] = {
265 	_sock_locks("elock-")
266 };
267 
268 /*
269  * sk_callback_lock and sk queues locking rules are per-address-family,
270  * so split the lock classes by using a per-AF key:
271  */
272 static struct lock_class_key af_callback_keys[AF_MAX];
273 static struct lock_class_key af_rlock_keys[AF_MAX];
274 static struct lock_class_key af_wlock_keys[AF_MAX];
275 static struct lock_class_key af_elock_keys[AF_MAX];
276 static struct lock_class_key af_kern_callback_keys[AF_MAX];
277 
278 /* Run time adjustable parameters. */
279 __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
280 EXPORT_SYMBOL(sysctl_wmem_max);
281 __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
282 EXPORT_SYMBOL(sysctl_rmem_max);
283 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
284 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
285 
286 /* Maximal space eaten by iovec or ancillary data plus some space */
287 int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
288 EXPORT_SYMBOL(sysctl_optmem_max);
289 
290 int sysctl_tstamp_allow_data __read_mostly = 1;
291 
292 DEFINE_STATIC_KEY_FALSE(memalloc_socks_key);
293 EXPORT_SYMBOL_GPL(memalloc_socks_key);
294 
295 /**
296  * sk_set_memalloc - sets %SOCK_MEMALLOC
297  * @sk: socket to set it on
298  *
299  * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
300  * It's the responsibility of the admin to adjust min_free_kbytes
301  * to meet the requirements.
302  */
303 void sk_set_memalloc(struct sock *sk)
304 {
305 	sock_set_flag(sk, SOCK_MEMALLOC);
306 	sk->sk_allocation |= __GFP_MEMALLOC;
307 	static_branch_inc(&memalloc_socks_key);
308 }
309 EXPORT_SYMBOL_GPL(sk_set_memalloc);
310 
311 void sk_clear_memalloc(struct sock *sk)
312 {
313 	sock_reset_flag(sk, SOCK_MEMALLOC);
314 	sk->sk_allocation &= ~__GFP_MEMALLOC;
315 	static_branch_dec(&memalloc_socks_key);
316 
317 	/*
318 	 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
319 	 * progress of swapping. SOCK_MEMALLOC may be cleared while
320 	 * it has rmem allocations due to the last swapfile being deactivated
321 	 * but there is a risk that the socket is unusable due to exceeding
322 	 * the rmem limits. Reclaim the reserves and obey rmem limits again.
323 	 */
324 	sk_mem_reclaim(sk);
325 }
326 EXPORT_SYMBOL_GPL(sk_clear_memalloc);
327 
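/*
 * Usage sketch (editor's illustration, not part of the original file):
 * a swap-backed network storage transport marks its kernel socket with
 * sk_set_memalloc() so that traffic needed to complete swap-out can dip
 * into the emergency reserves, and clears the flag when the device is no
 * longer used for swap. The callbacks below are hypothetical; the two
 * helpers are the ones defined above.
 *
 *	static void my_swap_transport_start(struct socket *sock)
 *	{
 *		sk_set_memalloc(sock->sk);
 *	}
 *
 *	static void my_swap_transport_stop(struct socket *sock)
 *	{
 *		sk_clear_memalloc(sock->sk);
 *	}
 */
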
328 int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
329 {
330 	int ret;
331 	unsigned int noreclaim_flag;
332 
333 	/* these should have been dropped before queueing */
334 	BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
335 
336 	noreclaim_flag = memalloc_noreclaim_save();
337 	ret = INDIRECT_CALL_INET(sk->sk_backlog_rcv,
338 				 tcp_v6_do_rcv,
339 				 tcp_v4_do_rcv,
340 				 sk, skb);
341 	memalloc_noreclaim_restore(noreclaim_flag);
342 
343 	return ret;
344 }
345 EXPORT_SYMBOL(__sk_backlog_rcv);
346 
347 void sk_error_report(struct sock *sk)
348 {
349 	sk->sk_error_report(sk);
350 
351 	switch (sk->sk_family) {
352 	case AF_INET:
353 		fallthrough;
354 	case AF_INET6:
355 		trace_inet_sk_error_report(sk);
356 		break;
357 	default:
358 		break;
359 	}
360 }
361 EXPORT_SYMBOL(sk_error_report);
362 
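/*
 * Usage sketch (editor's illustration, not part of the original file):
 * a protocol error handler typically records the error on the socket and
 * then calls sk_error_report() to wake up waiters and fire the inet
 * tracepoint. The handler below is hypothetical.
 *
 *	static void my_proto_icmp_err(struct sock *sk, int err)
 *	{
 *		sk->sk_err = err;
 *		sk_error_report(sk);
 *	}
 */
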
363 int sock_get_timeout(long timeo, void *optval, bool old_timeval)
364 {
365 	struct __kernel_sock_timeval tv;
366 
367 	if (timeo == MAX_SCHEDULE_TIMEOUT) {
368 		tv.tv_sec = 0;
369 		tv.tv_usec = 0;
370 	} else {
371 		tv.tv_sec = timeo / HZ;
372 		tv.tv_usec = ((timeo % HZ) * USEC_PER_SEC) / HZ;
373 	}
374 
375 	if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
376 		struct old_timeval32 tv32 = { tv.tv_sec, tv.tv_usec };
377 		*(struct old_timeval32 *)optval = tv32;
378 		return sizeof(tv32);
379 	}
380 
381 	if (old_timeval) {
382 		struct __kernel_old_timeval old_tv;
383 		old_tv.tv_sec = tv.tv_sec;
384 		old_tv.tv_usec = tv.tv_usec;
385 		*(struct __kernel_old_timeval *)optval = old_tv;
386 		return sizeof(old_tv);
387 	}
388 
389 	*(struct __kernel_sock_timeval *)optval = tv;
390 	return sizeof(tv);
391 }
392 EXPORT_SYMBOL(sock_get_timeout);
393 
394 int sock_copy_user_timeval(struct __kernel_sock_timeval *tv,
395 			   sockptr_t optval, int optlen, bool old_timeval)
396 {
397 	if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
398 		struct old_timeval32 tv32;
399 
400 		if (optlen < sizeof(tv32))
401 			return -EINVAL;
402 
403 		if (copy_from_sockptr(&tv32, optval, sizeof(tv32)))
404 			return -EFAULT;
405 		tv->tv_sec = tv32.tv_sec;
406 		tv->tv_usec = tv32.tv_usec;
407 	} else if (old_timeval) {
408 		struct __kernel_old_timeval old_tv;
409 
410 		if (optlen < sizeof(old_tv))
411 			return -EINVAL;
412 		if (copy_from_sockptr(&old_tv, optval, sizeof(old_tv)))
413 			return -EFAULT;
414 		tv->tv_sec = old_tv.tv_sec;
415 		tv->tv_usec = old_tv.tv_usec;
416 	} else {
417 		if (optlen < sizeof(*tv))
418 			return -EINVAL;
419 		if (copy_from_sockptr(tv, optval, sizeof(*tv)))
420 			return -EFAULT;
421 	}
422 
423 	return 0;
424 }
425 EXPORT_SYMBOL(sock_copy_user_timeval);
426 
427 static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen,
428 			    bool old_timeval)
429 {
430 	struct __kernel_sock_timeval tv;
431 	int err = sock_copy_user_timeval(&tv, optval, optlen, old_timeval);
432 
433 	if (err)
434 		return err;
435 
436 	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
437 		return -EDOM;
438 
439 	if (tv.tv_sec < 0) {
440 		static int warned __read_mostly;
441 
442 		*timeo_p = 0;
443 		if (warned < 10 && net_ratelimit()) {
444 			warned++;
445 			pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
446 				__func__, current->comm, task_pid_nr(current));
447 		}
448 		return 0;
449 	}
450 	*timeo_p = MAX_SCHEDULE_TIMEOUT;
451 	if (tv.tv_sec == 0 && tv.tv_usec == 0)
452 		return 0;
453 	if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1))
454 		*timeo_p = tv.tv_sec * HZ + DIV_ROUND_UP((unsigned long)tv.tv_usec, USEC_PER_SEC / HZ);
455 	return 0;
456 }
457 
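/*
 * Worked example (added by the editor, not part of the original file):
 * from userspace, SO_RCVTIMEO/SO_SNDTIMEO take a struct timeval, which
 * sock_set_timeout() converts to jiffies as
 * tv_sec * HZ + DIV_ROUND_UP(tv_usec, USEC_PER_SEC / HZ). With a
 * (hypothetical) HZ of 1000, 2.5 seconds becomes 2500 jiffies; an
 * all-zero timeval means "wait forever" (MAX_SCHEDULE_TIMEOUT).
 *
 *	struct timeval tv = { .tv_sec = 2, .tv_usec = 500000 };
 *
 *	if (setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv)) < 0)
 *		perror("setsockopt(SO_RCVTIMEO)");
 */
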
458 static bool sock_needs_netstamp(const struct sock *sk)
459 {
460 	switch (sk->sk_family) {
461 	case AF_UNSPEC:
462 	case AF_UNIX:
463 		return false;
464 	default:
465 		return true;
466 	}
467 }
468 
469 static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
470 {
471 	if (sk->sk_flags & flags) {
472 		sk->sk_flags &= ~flags;
473 		if (sock_needs_netstamp(sk) &&
474 		    !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
475 			net_disable_timestamp();
476 	}
477 }
478 
479 
480 int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
481 {
482 	unsigned long flags;
483 	struct sk_buff_head *list = &sk->sk_receive_queue;
484 
485 	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
486 		atomic_inc(&sk->sk_drops);
487 		trace_sock_rcvqueue_full(sk, skb);
488 		return -ENOMEM;
489 	}
490 
491 	if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
492 		atomic_inc(&sk->sk_drops);
493 		return -ENOBUFS;
494 	}
495 
496 	skb->dev = NULL;
497 	skb_set_owner_r(skb, sk);
498 
499 	/* We escape from the RCU-protected region; make sure we don't leak
500 	 * a non-refcounted dst.
501 	 */
502 	skb_dst_force(skb);
503 
504 	spin_lock_irqsave(&list->lock, flags);
505 	sock_skb_set_dropcount(sk, skb);
506 	__skb_queue_tail(list, skb);
507 	spin_unlock_irqrestore(&list->lock, flags);
508 
509 	if (!sock_flag(sk, SOCK_DEAD))
510 		sk->sk_data_ready(sk);
511 	return 0;
512 }
513 EXPORT_SYMBOL(__sock_queue_rcv_skb);
514 
515 int sock_queue_rcv_skb_reason(struct sock *sk, struct sk_buff *skb,
516 			      enum skb_drop_reason *reason)
517 {
518 	enum skb_drop_reason drop_reason;
519 	int err;
520 
521 	err = sk_filter(sk, skb);
522 	if (err) {
523 		drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
524 		goto out;
525 	}
526 	err = __sock_queue_rcv_skb(sk, skb);
527 	switch (err) {
528 	case -ENOMEM:
529 		drop_reason = SKB_DROP_REASON_SOCKET_RCVBUFF;
530 		break;
531 	case -ENOBUFS:
532 		drop_reason = SKB_DROP_REASON_PROTO_MEM;
533 		break;
534 	default:
535 		drop_reason = SKB_NOT_DROPPED_YET;
536 		break;
537 	}
538 out:
539 	if (reason)
540 		*reason = drop_reason;
541 	return err;
542 }
543 EXPORT_SYMBOL(sock_queue_rcv_skb_reason);
544 
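/*
 * Usage sketch (editor's illustration, not part of the original file):
 * a datagram protocol's receive path can queue to the socket with
 * sock_queue_rcv_skb_reason() and, on failure, free the skb with the
 * precise drop reason for tracing. The wrapper below is hypothetical;
 * sock_queue_rcv_skb_reason() and kfree_skb_reason() are existing APIs.
 *
 *	static int my_proto_queue_rcv(struct sock *sk, struct sk_buff *skb)
 *	{
 *		enum skb_drop_reason reason;
 *		int err;
 *
 *		err = sock_queue_rcv_skb_reason(sk, skb, &reason);
 *		if (err < 0)
 *			kfree_skb_reason(skb, reason);
 *		return err;
 *	}
 */
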
545 int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
546 		     const int nested, unsigned int trim_cap, bool refcounted)
547 {
548 	int rc = NET_RX_SUCCESS;
549 
550 	if (sk_filter_trim_cap(sk, skb, trim_cap))
551 		goto discard_and_relse;
552 
553 	skb->dev = NULL;
554 
555 	if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
556 		atomic_inc(&sk->sk_drops);
557 		goto discard_and_relse;
558 	}
559 	if (nested)
560 		bh_lock_sock_nested(sk);
561 	else
562 		bh_lock_sock(sk);
563 	if (!sock_owned_by_user(sk)) {
564 		/*
565 		 * trylock + unlock semantics:
566 		 */
567 		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
568 
569 		rc = sk_backlog_rcv(sk, skb);
570 
571 		mutex_release(&sk->sk_lock.dep_map, _RET_IP_);
572 	} else if (sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf))) {
573 		bh_unlock_sock(sk);
574 		atomic_inc(&sk->sk_drops);
575 		goto discard_and_relse;
576 	}
577 
578 	bh_unlock_sock(sk);
579 out:
580 	if (refcounted)
581 		sock_put(sk);
582 	return rc;
583 discard_and_relse:
584 	kfree_skb(skb);
585 	goto out;
586 }
587 EXPORT_SYMBOL(__sk_receive_skb);
588 
589 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ip6_dst_check(struct dst_entry *,
590 							  u32));
591 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
592 							   u32));
593 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
594 {
595 	struct dst_entry *dst = __sk_dst_get(sk);
596 
597 	if (dst && dst->obsolete &&
598 	    INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
599 			       dst, cookie) == NULL) {
600 		sk_tx_queue_clear(sk);
601 		sk->sk_dst_pending_confirm = 0;
602 		RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
603 		dst_release(dst);
604 		return NULL;
605 	}
606 
607 	return dst;
608 }
609 EXPORT_SYMBOL(__sk_dst_check);
610 
611 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
612 {
613 	struct dst_entry *dst = sk_dst_get(sk);
614 
615 	if (dst && dst->obsolete &&
616 	    INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
617 			       dst, cookie) == NULL) {
618 		sk_dst_reset(sk);
619 		dst_release(dst);
620 		return NULL;
621 	}
622 
623 	return dst;
624 }
625 EXPORT_SYMBOL(sk_dst_check);
626 
627 static int sock_bindtoindex_locked(struct sock *sk, int ifindex)
628 {
629 	int ret = -ENOPROTOOPT;
630 #ifdef CONFIG_NETDEVICES
631 	struct net *net = sock_net(sk);
632 
633 	/* Sorry... */
634 	ret = -EPERM;
635 	if (sk->sk_bound_dev_if && !ns_capable(net->user_ns, CAP_NET_RAW))
636 		goto out;
637 
638 	ret = -EINVAL;
639 	if (ifindex < 0)
640 		goto out;
641 
642 	/* Paired with all READ_ONCE() done locklessly. */
643 	WRITE_ONCE(sk->sk_bound_dev_if, ifindex);
644 
645 	if (sk->sk_prot->rehash)
646 		sk->sk_prot->rehash(sk);
647 	sk_dst_reset(sk);
648 
649 	ret = 0;
650 
651 out:
652 #endif
653 
654 	return ret;
655 }
656 
657 int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk)
658 {
659 	int ret;
660 
661 	if (lock_sk)
662 		lock_sock(sk);
663 	ret = sock_bindtoindex_locked(sk, ifindex);
664 	if (lock_sk)
665 		release_sock(sk);
666 
667 	return ret;
668 }
669 EXPORT_SYMBOL(sock_bindtoindex);
670 
671 static int sock_setbindtodevice(struct sock *sk, sockptr_t optval, int optlen)
672 {
673 	int ret = -ENOPROTOOPT;
674 #ifdef CONFIG_NETDEVICES
675 	struct net *net = sock_net(sk);
676 	char devname[IFNAMSIZ];
677 	int index;
678 
679 	ret = -EINVAL;
680 	if (optlen < 0)
681 		goto out;
682 
683 	/* Bind this socket to a particular device like "eth0",
684 	 * as specified in the passed interface name. If the
685 	 * name is "" or the option length is zero, the socket
686 	 * is not bound.
687 	 */
688 	if (optlen > IFNAMSIZ - 1)
689 		optlen = IFNAMSIZ - 1;
690 	memset(devname, 0, sizeof(devname));
691 
692 	ret = -EFAULT;
693 	if (copy_from_sockptr(devname, optval, optlen))
694 		goto out;
695 
696 	index = 0;
697 	if (devname[0] != '\0') {
698 		struct net_device *dev;
699 
700 		rcu_read_lock();
701 		dev = dev_get_by_name_rcu(net, devname);
702 		if (dev)
703 			index = dev->ifindex;
704 		rcu_read_unlock();
705 		ret = -ENODEV;
706 		if (!dev)
707 			goto out;
708 	}
709 
710 	sockopt_lock_sock(sk);
711 	ret = sock_bindtoindex_locked(sk, index);
712 	sockopt_release_sock(sk);
713 out:
714 #endif
715 
716 	return ret;
717 }
718 
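/*
 * Usage sketch (editor's illustration, not part of the original file):
 * from userspace, SO_BINDTODEVICE takes the interface name as a string;
 * passing an empty name (or a zero option length) removes the binding,
 * as described in the comment above. Changing an existing binding
 * requires CAP_NET_RAW in the socket's network namespace (see
 * sock_bindtoindex_locked() above).
 *
 *	const char ifname[] = "eth0";
 *
 *	if (setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE,
 *		       ifname, strlen(ifname)) < 0)
 *		perror("setsockopt(SO_BINDTODEVICE)");
 */
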
719 static int sock_getbindtodevice(struct sock *sk, sockptr_t optval,
720 				sockptr_t optlen, int len)
721 {
722 	int ret = -ENOPROTOOPT;
723 #ifdef CONFIG_NETDEVICES
724 	int bound_dev_if = READ_ONCE(sk->sk_bound_dev_if);
725 	struct net *net = sock_net(sk);
726 	char devname[IFNAMSIZ];
727 
728 	if (bound_dev_if == 0) {
729 		len = 0;
730 		goto zero;
731 	}
732 
733 	ret = -EINVAL;
734 	if (len < IFNAMSIZ)
735 		goto out;
736 
737 	ret = netdev_get_name(net, devname, bound_dev_if);
738 	if (ret)
739 		goto out;
740 
741 	len = strlen(devname) + 1;
742 
743 	ret = -EFAULT;
744 	if (copy_to_sockptr(optval, devname, len))
745 		goto out;
746 
747 zero:
748 	ret = -EFAULT;
749 	if (copy_to_sockptr(optlen, &len, sizeof(int)))
750 		goto out;
751 
752 	ret = 0;
753 
754 out:
755 #endif
756 
757 	return ret;
758 }
759 
760 bool sk_mc_loop(struct sock *sk)
761 {
762 	if (dev_recursion_level())
763 		return false;
764 	if (!sk)
765 		return true;
766 	switch (sk->sk_family) {
767 	case AF_INET:
768 		return inet_sk(sk)->mc_loop;
769 #if IS_ENABLED(CONFIG_IPV6)
770 	case AF_INET6:
771 		return inet6_sk(sk)->mc_loop;
772 #endif
773 	}
774 	WARN_ON_ONCE(1);
775 	return true;
776 }
777 EXPORT_SYMBOL(sk_mc_loop);
778 
779 void sock_set_reuseaddr(struct sock *sk)
780 {
781 	lock_sock(sk);
782 	sk->sk_reuse = SK_CAN_REUSE;
783 	release_sock(sk);
784 }
785 EXPORT_SYMBOL(sock_set_reuseaddr);
786 
787 void sock_set_reuseport(struct sock *sk)
788 {
789 	lock_sock(sk);
790 	sk->sk_reuseport = true;
791 	release_sock(sk);
792 }
793 EXPORT_SYMBOL(sock_set_reuseport);
794 
795 void sock_no_linger(struct sock *sk)
796 {
797 	lock_sock(sk);
798 	sk->sk_lingertime = 0;
799 	sock_set_flag(sk, SOCK_LINGER);
800 	release_sock(sk);
801 }
802 EXPORT_SYMBOL(sock_no_linger);
803 
804 void sock_set_priority(struct sock *sk, u32 priority)
805 {
806 	lock_sock(sk);
807 	sk->sk_priority = priority;
808 	release_sock(sk);
809 }
810 EXPORT_SYMBOL(sock_set_priority);
811 
812 void sock_set_sndtimeo(struct sock *sk, s64 secs)
813 {
814 	lock_sock(sk);
815 	if (secs && secs < MAX_SCHEDULE_TIMEOUT / HZ - 1)
816 		sk->sk_sndtimeo = secs * HZ;
817 	else
818 		sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
819 	release_sock(sk);
820 }
821 EXPORT_SYMBOL(sock_set_sndtimeo);
822 
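/*
 * Usage sketch (editor's illustration, not part of the original file):
 * in-kernel socket users (network filesystems, RPC transports, etc.) are
 * expected to use the exported sock_set_*()/sock_no_linger() helpers
 * above rather than reimplementing setsockopt(). The setup function
 * below is hypothetical; the helpers it calls are defined in this file.
 *
 *	static void my_transport_tune_sock(struct socket *sock)
 *	{
 *		struct sock *sk = sock->sk;
 *
 *		sock_set_reuseaddr(sk);
 *		sock_no_linger(sk);
 *		sock_set_priority(sk, TC_PRIO_CONTROL);
 *		sock_set_sndtimeo(sk, 5);	// five seconds
 *	}
 */
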
823 static void __sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns)
824 {
825 	if (val)  {
826 		sock_valbool_flag(sk, SOCK_TSTAMP_NEW, new);
827 		sock_valbool_flag(sk, SOCK_RCVTSTAMPNS, ns);
828 		sock_set_flag(sk, SOCK_RCVTSTAMP);
829 		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
830 	} else {
831 		sock_reset_flag(sk, SOCK_RCVTSTAMP);
832 		sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
833 	}
834 }
835 
836 void sock_enable_timestamps(struct sock *sk)
837 {
838 	lock_sock(sk);
839 	__sock_set_timestamps(sk, true, false, true);
840 	release_sock(sk);
841 }
842 EXPORT_SYMBOL(sock_enable_timestamps);
843 
844 void sock_set_timestamp(struct sock *sk, int optname, bool valbool)
845 {
846 	switch (optname) {
847 	case SO_TIMESTAMP_OLD:
848 		__sock_set_timestamps(sk, valbool, false, false);
849 		break;
850 	case SO_TIMESTAMP_NEW:
851 		__sock_set_timestamps(sk, valbool, true, false);
852 		break;
853 	case SO_TIMESTAMPNS_OLD:
854 		__sock_set_timestamps(sk, valbool, false, true);
855 		break;
856 	case SO_TIMESTAMPNS_NEW:
857 		__sock_set_timestamps(sk, valbool, true, true);
858 		break;
859 	}
860 }
861 
862 static int sock_timestamping_bind_phc(struct sock *sk, int phc_index)
863 {
864 	struct net *net = sock_net(sk);
865 	struct net_device *dev = NULL;
866 	bool match = false;
867 	int *vclock_index;
868 	int i, num;
869 
870 	if (sk->sk_bound_dev_if)
871 		dev = dev_get_by_index(net, sk->sk_bound_dev_if);
872 
873 	if (!dev) {
874 		pr_err("%s: socket not bound to a device\n", __func__);
875 		return -EOPNOTSUPP;
876 	}
877 
878 	num = ethtool_get_phc_vclocks(dev, &vclock_index);
879 	dev_put(dev);
880 
881 	for (i = 0; i < num; i++) {
882 		if (*(vclock_index + i) == phc_index) {
883 			match = true;
884 			break;
885 		}
886 	}
887 
888 	if (num > 0)
889 		kfree(vclock_index);
890 
891 	if (!match)
892 		return -EINVAL;
893 
894 	sk->sk_bind_phc = phc_index;
895 
896 	return 0;
897 }
898 
899 int sock_set_timestamping(struct sock *sk, int optname,
900 			  struct so_timestamping timestamping)
901 {
902 	int val = timestamping.flags;
903 	int ret;
904 
905 	if (val & ~SOF_TIMESTAMPING_MASK)
906 		return -EINVAL;
907 
908 	if (val & SOF_TIMESTAMPING_OPT_ID_TCP &&
909 	    !(val & SOF_TIMESTAMPING_OPT_ID))
910 		return -EINVAL;
911 
912 	if (val & SOF_TIMESTAMPING_OPT_ID &&
913 	    !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
914 		if (sk_is_tcp(sk)) {
915 			if ((1 << sk->sk_state) &
916 			    (TCPF_CLOSE | TCPF_LISTEN))
917 				return -EINVAL;
918 			if (val & SOF_TIMESTAMPING_OPT_ID_TCP)
919 				atomic_set(&sk->sk_tskey, tcp_sk(sk)->write_seq);
920 			else
921 				atomic_set(&sk->sk_tskey, tcp_sk(sk)->snd_una);
922 		} else {
923 			atomic_set(&sk->sk_tskey, 0);
924 		}
925 	}
926 
927 	if (val & SOF_TIMESTAMPING_OPT_STATS &&
928 	    !(val & SOF_TIMESTAMPING_OPT_TSONLY))
929 		return -EINVAL;
930 
931 	if (val & SOF_TIMESTAMPING_BIND_PHC) {
932 		ret = sock_timestamping_bind_phc(sk, timestamping.bind_phc);
933 		if (ret)
934 			return ret;
935 	}
936 
937 	sk->sk_tsflags = val;
938 	sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW);
939 
940 	if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
941 		sock_enable_timestamp(sk,
942 				      SOCK_TIMESTAMPING_RX_SOFTWARE);
943 	else
944 		sock_disable_timestamp(sk,
945 				       (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
946 	return 0;
947 }
948 
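/*
 * Usage sketch (editor's illustration, not part of the original file):
 * userspace enables hardware timestamping, and optionally binds the
 * socket to a PHC vclock, by passing a struct so_timestamping. As
 * sock_timestamping_bind_phc() above requires, the socket must already
 * be bound to a device (SO_BINDTODEVICE) for SOF_TIMESTAMPING_BIND_PHC
 * to succeed. The vclock index used here is hypothetical.
 *
 *	struct so_timestamping ts = {
 *		.flags = SOF_TIMESTAMPING_TX_HARDWARE |
 *			 SOF_TIMESTAMPING_RX_HARDWARE |
 *			 SOF_TIMESTAMPING_RAW_HARDWARE |
 *			 SOF_TIMESTAMPING_BIND_PHC,
 *		.bind_phc = 1,			// hypothetical vclock index
 *	};
 *
 *	if (setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING, &ts, sizeof(ts)) < 0)
 *		perror("setsockopt(SO_TIMESTAMPING)");
 */
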
949 void sock_set_keepalive(struct sock *sk)
950 {
951 	lock_sock(sk);
952 	if (sk->sk_prot->keepalive)
953 		sk->sk_prot->keepalive(sk, true);
954 	sock_valbool_flag(sk, SOCK_KEEPOPEN, true);
955 	release_sock(sk);
956 }
957 EXPORT_SYMBOL(sock_set_keepalive);
958 
959 static void __sock_set_rcvbuf(struct sock *sk, int val)
960 {
961 	/* Ensure val * 2 fits into an int, to prevent max_t() from treating it
962 	 * as a negative value.
963 	 */
964 	val = min_t(int, val, INT_MAX / 2);
965 	sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
966 
967 	/* We double it on the way in to account for "struct sk_buff" etc.
968 	 * overhead.   Applications assume that the SO_RCVBUF setting they make
969 	 * will allow that much actual data to be received on that socket.
970 	 *
971 	 * Applications are unaware that "struct sk_buff" and other overheads
972 	 * allocate from the receive buffer during socket buffer allocation.
973 	 *
974 	 * And after considering the possible alternatives, returning the value
975 	 * we actually used in getsockopt is the most desirable behavior.
976 	 */
977 	WRITE_ONCE(sk->sk_rcvbuf, max_t(int, val * 2, SOCK_MIN_RCVBUF));
978 }
979 
980 void sock_set_rcvbuf(struct sock *sk, int val)
981 {
982 	lock_sock(sk);
983 	__sock_set_rcvbuf(sk, val);
984 	release_sock(sk);
985 }
986 EXPORT_SYMBOL(sock_set_rcvbuf);
987 
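/*
 * Worked example (added by the editor, not part of the original file):
 * because of the doubling described above, the value read back with
 * getsockopt(SO_RCVBUF) is twice what was requested, subject to the
 * net.core.rmem_max and SOCK_MIN_RCVBUF clamps.
 *
 *	int val = 65536;
 *	socklen_t len = sizeof(val);
 *
 *	setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &val, sizeof(val));
 *	getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &val, &len);
 *	// val is now 131072, assuming 65536 <= net.core.rmem_max
 */
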
988 static void __sock_set_mark(struct sock *sk, u32 val)
989 {
990 	if (val != sk->sk_mark) {
991 		sk->sk_mark = val;
992 		sk_dst_reset(sk);
993 	}
994 }
995 
996 void sock_set_mark(struct sock *sk, u32 val)
997 {
998 	lock_sock(sk);
999 	__sock_set_mark(sk, val);
1000 	release_sock(sk);
1001 }
1002 EXPORT_SYMBOL(sock_set_mark);
1003 
1004 static void sock_release_reserved_memory(struct sock *sk, int bytes)
1005 {
1006 	/* Round down bytes to multiple of pages */
1007 	bytes = round_down(bytes, PAGE_SIZE);
1008 
1009 	WARN_ON(bytes > sk->sk_reserved_mem);
1010 	sk->sk_reserved_mem -= bytes;
1011 	sk_mem_reclaim(sk);
1012 }
1013 
1014 static int sock_reserve_memory(struct sock *sk, int bytes)
1015 {
1016 	long allocated;
1017 	bool charged;
1018 	int pages;
1019 
1020 	if (!mem_cgroup_sockets_enabled || !sk->sk_memcg || !sk_has_account(sk))
1021 		return -EOPNOTSUPP;
1022 
1023 	if (!bytes)
1024 		return 0;
1025 
1026 	pages = sk_mem_pages(bytes);
1027 
1028 	/* pre-charge to memcg */
1029 	charged = mem_cgroup_charge_skmem(sk->sk_memcg, pages,
1030 					  GFP_KERNEL | __GFP_RETRY_MAYFAIL);
1031 	if (!charged)
1032 		return -ENOMEM;
1033 
1034 	/* pre-charge to forward_alloc */
1035 	sk_memory_allocated_add(sk, pages);
1036 	allocated = sk_memory_allocated(sk);
1037 	/* If the system goes into memory pressure with this
1038 	 * precharge, give up and return an error.
1039 	 */
1040 	if (allocated > sk_prot_mem_limits(sk, 1)) {
1041 		sk_memory_allocated_sub(sk, pages);
1042 		mem_cgroup_uncharge_skmem(sk->sk_memcg, pages);
1043 		return -ENOMEM;
1044 	}
1045 	sk->sk_forward_alloc += pages << PAGE_SHIFT;
1046 
1047 	sk->sk_reserved_mem += pages << PAGE_SHIFT;
1048 
1049 	return 0;
1050 }
1051 
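/*
 * Worked example (added by the editor, not part of the original file):
 * SO_RESERVE_MEM pre-charges socket memory in whole pages:
 * sock_reserve_memory() rounds the request up via sk_mem_pages(), while
 * sock_release_reserved_memory() rounds down. Assuming 4 KiB pages,
 * asking for 100000 bytes reserves 25 pages, i.e. 102400 bytes of
 * sk_reserved_mem. The option is only available when memcg socket
 * accounting applies to the socket.
 *
 *	int bytes = 100000;
 *
 *	if (setsockopt(fd, SOL_SOCKET, SO_RESERVE_MEM,
 *		       &bytes, sizeof(bytes)) < 0)
 *		perror("setsockopt(SO_RESERVE_MEM)");
 */
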
1052 void sockopt_lock_sock(struct sock *sk)
1053 {
1054 	/* When current->bpf_ctx is set, setsockopt() is being called from
1055 	 * a BPF program, and BPF has already ensured that the sk lock is
1056 	 * held before calling setsockopt().
1057 	 */
1058 	if (has_current_bpf_ctx())
1059 		return;
1060 
1061 	lock_sock(sk);
1062 }
1063 EXPORT_SYMBOL(sockopt_lock_sock);
1064 
1065 void sockopt_release_sock(struct sock *sk)
1066 {
1067 	if (has_current_bpf_ctx())
1068 		return;
1069 
1070 	release_sock(sk);
1071 }
1072 EXPORT_SYMBOL(sockopt_release_sock);
1073 
1074 bool sockopt_ns_capable(struct user_namespace *ns, int cap)
1075 {
1076 	return has_current_bpf_ctx() || ns_capable(ns, cap);
1077 }
1078 EXPORT_SYMBOL(sockopt_ns_capable);
1079 
1080 bool sockopt_capable(int cap)
1081 {
1082 	return has_current_bpf_ctx() || capable(cap);
1083 }
1084 EXPORT_SYMBOL(sockopt_capable);
1085 
1086 /*
1087  *	This is meant for all protocols to use and covers goings-on
1088  *	at the socket level. Everything here is generic.
1089  */
1090 
1091 int sk_setsockopt(struct sock *sk, int level, int optname,
1092 		  sockptr_t optval, unsigned int optlen)
1093 {
1094 	struct so_timestamping timestamping;
1095 	struct socket *sock = sk->sk_socket;
1096 	struct sock_txtime sk_txtime;
1097 	int val;
1098 	int valbool;
1099 	struct linger ling;
1100 	int ret = 0;
1101 
1102 	/*
1103 	 *	Options without arguments
1104 	 */
1105 
1106 	if (optname == SO_BINDTODEVICE)
1107 		return sock_setbindtodevice(sk, optval, optlen);
1108 
1109 	if (optlen < sizeof(int))
1110 		return -EINVAL;
1111 
1112 	if (copy_from_sockptr(&val, optval, sizeof(val)))
1113 		return -EFAULT;
1114 
1115 	valbool = val ? 1 : 0;
1116 
1117 	sockopt_lock_sock(sk);
1118 
1119 	switch (optname) {
1120 	case SO_DEBUG:
1121 		if (val && !sockopt_capable(CAP_NET_ADMIN))
1122 			ret = -EACCES;
1123 		else
1124 			sock_valbool_flag(sk, SOCK_DBG, valbool);
1125 		break;
1126 	case SO_REUSEADDR:
1127 		sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
1128 		break;
1129 	case SO_REUSEPORT:
1130 		sk->sk_reuseport = valbool;
1131 		break;
1132 	case SO_TYPE:
1133 	case SO_PROTOCOL:
1134 	case SO_DOMAIN:
1135 	case SO_ERROR:
1136 		ret = -ENOPROTOOPT;
1137 		break;
1138 	case SO_DONTROUTE:
1139 		sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
1140 		sk_dst_reset(sk);
1141 		break;
1142 	case SO_BROADCAST:
1143 		sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
1144 		break;
1145 	case SO_SNDBUF:
1146 		/* Don't error on this; BSD doesn't, and if you think
1147 		 * about it, this is right. Otherwise apps have to
1148 		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
1149 		 * are treated in BSD as hints.
1150 		 */
1151 		val = min_t(u32, val, READ_ONCE(sysctl_wmem_max));
1152 set_sndbuf:
1153 		/* Ensure val * 2 fits into an int, to prevent max_t()
1154 		 * from treating it as a negative value.
1155 		 */
1156 		val = min_t(int, val, INT_MAX / 2);
1157 		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
1158 		WRITE_ONCE(sk->sk_sndbuf,
1159 			   max_t(int, val * 2, SOCK_MIN_SNDBUF));
1160 		/* Wake up sending tasks if we upped the value. */
1161 		sk->sk_write_space(sk);
1162 		break;
1163 
1164 	case SO_SNDBUFFORCE:
1165 		if (!sockopt_capable(CAP_NET_ADMIN)) {
1166 			ret = -EPERM;
1167 			break;
1168 		}
1169 
1170 		/* No negative values (to prevent underflow, as val will be
1171 		 * multiplied by 2).
1172 		 */
1173 		if (val < 0)
1174 			val = 0;
1175 		goto set_sndbuf;
1176 
1177 	case SO_RCVBUF:
1178 		/* Don't error on this; BSD doesn't, and if you think
1179 		 * about it, this is right. Otherwise apps have to
1180 		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
1181 		 * are treated in BSD as hints.
1182 		 */
1183 		__sock_set_rcvbuf(sk, min_t(u32, val, READ_ONCE(sysctl_rmem_max)));
1184 		break;
1185 
1186 	case SO_RCVBUFFORCE:
1187 		if (!sockopt_capable(CAP_NET_ADMIN)) {
1188 			ret = -EPERM;
1189 			break;
1190 		}
1191 
1192 		/* No negative values (to prevent underflow, as val will be
1193 		 * multiplied by 2).
1194 		 */
1195 		__sock_set_rcvbuf(sk, max(val, 0));
1196 		break;
1197 
1198 	case SO_KEEPALIVE:
1199 		if (sk->sk_prot->keepalive)
1200 			sk->sk_prot->keepalive(sk, valbool);
1201 		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
1202 		break;
1203 
1204 	case SO_OOBINLINE:
1205 		sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
1206 		break;
1207 
1208 	case SO_NO_CHECK:
1209 		sk->sk_no_check_tx = valbool;
1210 		break;
1211 
1212 	case SO_PRIORITY:
1213 		if ((val >= 0 && val <= 6) ||
1214 		    sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) ||
1215 		    sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
1216 			sk->sk_priority = val;
1217 		else
1218 			ret = -EPERM;
1219 		break;
1220 
1221 	case SO_LINGER:
1222 		if (optlen < sizeof(ling)) {
1223 			ret = -EINVAL;	/* 1003.1g */
1224 			break;
1225 		}
1226 		if (copy_from_sockptr(&ling, optval, sizeof(ling))) {
1227 			ret = -EFAULT;
1228 			break;
1229 		}
1230 		if (!ling.l_onoff)
1231 			sock_reset_flag(sk, SOCK_LINGER);
1232 		else {
1233 #if (BITS_PER_LONG == 32)
1234 			if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
1235 				sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
1236 			else
1237 #endif
1238 				sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
1239 			sock_set_flag(sk, SOCK_LINGER);
1240 		}
1241 		break;
1242 
1243 	case SO_BSDCOMPAT:
1244 		break;
1245 
1246 	case SO_PASSCRED:
1247 		assign_bit(SOCK_PASSCRED, &sock->flags, valbool);
1248 		break;
1249 
1250 	case SO_PASSPIDFD:
1251 		assign_bit(SOCK_PASSPIDFD, &sock->flags, valbool);
1252 		break;
1253 
1254 	case SO_TIMESTAMP_OLD:
1255 	case SO_TIMESTAMP_NEW:
1256 	case SO_TIMESTAMPNS_OLD:
1257 	case SO_TIMESTAMPNS_NEW:
1258 		sock_set_timestamp(sk, optname, valbool);
1259 		break;
1260 
1261 	case SO_TIMESTAMPING_NEW:
1262 	case SO_TIMESTAMPING_OLD:
1263 		if (optlen == sizeof(timestamping)) {
1264 			if (copy_from_sockptr(&timestamping, optval,
1265 					      sizeof(timestamping))) {
1266 				ret = -EFAULT;
1267 				break;
1268 			}
1269 		} else {
1270 			memset(&timestamping, 0, sizeof(timestamping));
1271 			timestamping.flags = val;
1272 		}
1273 		ret = sock_set_timestamping(sk, optname, timestamping);
1274 		break;
1275 
1276 	case SO_RCVLOWAT:
1277 		if (val < 0)
1278 			val = INT_MAX;
1279 		if (sock && sock->ops->set_rcvlowat)
1280 			ret = sock->ops->set_rcvlowat(sk, val);
1281 		else
1282 			WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
1283 		break;
1284 
1285 	case SO_RCVTIMEO_OLD:
1286 	case SO_RCVTIMEO_NEW:
1287 		ret = sock_set_timeout(&sk->sk_rcvtimeo, optval,
1288 				       optlen, optname == SO_RCVTIMEO_OLD);
1289 		break;
1290 
1291 	case SO_SNDTIMEO_OLD:
1292 	case SO_SNDTIMEO_NEW:
1293 		ret = sock_set_timeout(&sk->sk_sndtimeo, optval,
1294 				       optlen, optname == SO_SNDTIMEO_OLD);
1295 		break;
1296 
1297 	case SO_ATTACH_FILTER: {
1298 		struct sock_fprog fprog;
1299 
1300 		ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1301 		if (!ret)
1302 			ret = sk_attach_filter(&fprog, sk);
1303 		break;
1304 	}
1305 	case SO_ATTACH_BPF:
1306 		ret = -EINVAL;
1307 		if (optlen == sizeof(u32)) {
1308 			u32 ufd;
1309 
1310 			ret = -EFAULT;
1311 			if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1312 				break;
1313 
1314 			ret = sk_attach_bpf(ufd, sk);
1315 		}
1316 		break;
1317 
1318 	case SO_ATTACH_REUSEPORT_CBPF: {
1319 		struct sock_fprog fprog;
1320 
1321 		ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1322 		if (!ret)
1323 			ret = sk_reuseport_attach_filter(&fprog, sk);
1324 		break;
1325 	}
1326 	case SO_ATTACH_REUSEPORT_EBPF:
1327 		ret = -EINVAL;
1328 		if (optlen == sizeof(u32)) {
1329 			u32 ufd;
1330 
1331 			ret = -EFAULT;
1332 			if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1333 				break;
1334 
1335 			ret = sk_reuseport_attach_bpf(ufd, sk);
1336 		}
1337 		break;
1338 
1339 	case SO_DETACH_REUSEPORT_BPF:
1340 		ret = reuseport_detach_prog(sk);
1341 		break;
1342 
1343 	case SO_DETACH_FILTER:
1344 		ret = sk_detach_filter(sk);
1345 		break;
1346 
1347 	case SO_LOCK_FILTER:
1348 		if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
1349 			ret = -EPERM;
1350 		else
1351 			sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
1352 		break;
1353 
1354 	case SO_PASSSEC:
1355 		assign_bit(SOCK_PASSSEC, &sock->flags, valbool);
1356 		break;
1357 	case SO_MARK:
1358 		if (!sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
1359 		    !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1360 			ret = -EPERM;
1361 			break;
1362 		}
1363 
1364 		__sock_set_mark(sk, val);
1365 		break;
1366 	case SO_RCVMARK:
1367 		sock_valbool_flag(sk, SOCK_RCVMARK, valbool);
1368 		break;
1369 
1370 	case SO_RXQ_OVFL:
1371 		sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
1372 		break;
1373 
1374 	case SO_WIFI_STATUS:
1375 		sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
1376 		break;
1377 
1378 	case SO_PEEK_OFF:
1379 		if (sock->ops->set_peek_off)
1380 			ret = sock->ops->set_peek_off(sk, val);
1381 		else
1382 			ret = -EOPNOTSUPP;
1383 		break;
1384 
1385 	case SO_NOFCS:
1386 		sock_valbool_flag(sk, SOCK_NOFCS, valbool);
1387 		break;
1388 
1389 	case SO_SELECT_ERR_QUEUE:
1390 		sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
1391 		break;
1392 
1393 #ifdef CONFIG_NET_RX_BUSY_POLL
1394 	case SO_BUSY_POLL:
1395 		if (val < 0)
1396 			ret = -EINVAL;
1397 		else
1398 			WRITE_ONCE(sk->sk_ll_usec, val);
1399 		break;
1400 	case SO_PREFER_BUSY_POLL:
1401 		if (valbool && !sockopt_capable(CAP_NET_ADMIN))
1402 			ret = -EPERM;
1403 		else
1404 			WRITE_ONCE(sk->sk_prefer_busy_poll, valbool);
1405 		break;
1406 	case SO_BUSY_POLL_BUDGET:
1407 		if (val > READ_ONCE(sk->sk_busy_poll_budget) && !sockopt_capable(CAP_NET_ADMIN)) {
1408 			ret = -EPERM;
1409 		} else {
1410 			if (val < 0 || val > U16_MAX)
1411 				ret = -EINVAL;
1412 			else
1413 				WRITE_ONCE(sk->sk_busy_poll_budget, val);
1414 		}
1415 		break;
1416 #endif
1417 
1418 	case SO_MAX_PACING_RATE:
1419 		{
1420 		unsigned long ulval = (val == ~0U) ? ~0UL : (unsigned int)val;
1421 
1422 		if (sizeof(ulval) != sizeof(val) &&
1423 		    optlen >= sizeof(ulval) &&
1424 		    copy_from_sockptr(&ulval, optval, sizeof(ulval))) {
1425 			ret = -EFAULT;
1426 			break;
1427 		}
1428 		if (ulval != ~0UL)
1429 			cmpxchg(&sk->sk_pacing_status,
1430 				SK_PACING_NONE,
1431 				SK_PACING_NEEDED);
1432 		sk->sk_max_pacing_rate = ulval;
1433 		sk->sk_pacing_rate = min(sk->sk_pacing_rate, ulval);
1434 		break;
1435 		}
1436 	case SO_INCOMING_CPU:
1437 		reuseport_update_incoming_cpu(sk, val);
1438 		break;
1439 
1440 	case SO_CNX_ADVICE:
1441 		if (val == 1)
1442 			dst_negative_advice(sk);
1443 		break;
1444 
1445 	case SO_ZEROCOPY:
1446 		if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
1447 			if (!(sk_is_tcp(sk) ||
1448 			      (sk->sk_type == SOCK_DGRAM &&
1449 			       sk->sk_protocol == IPPROTO_UDP)))
1450 				ret = -EOPNOTSUPP;
1451 		} else if (sk->sk_family != PF_RDS) {
1452 			ret = -EOPNOTSUPP;
1453 		}
1454 		if (!ret) {
1455 			if (val < 0 || val > 1)
1456 				ret = -EINVAL;
1457 			else
1458 				sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
1459 		}
1460 		break;
1461 
1462 	case SO_TXTIME:
1463 		if (optlen != sizeof(struct sock_txtime)) {
1464 			ret = -EINVAL;
1465 			break;
1466 		} else if (copy_from_sockptr(&sk_txtime, optval,
1467 			   sizeof(struct sock_txtime))) {
1468 			ret = -EFAULT;
1469 			break;
1470 		} else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) {
1471 			ret = -EINVAL;
1472 			break;
1473 		}
1474 		/* CLOCK_MONOTONIC is only used by sch_fq, and this packet
1475 		 * scheduler has enough safeguards.
1476 		 */
1477 		if (sk_txtime.clockid != CLOCK_MONOTONIC &&
1478 		    !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1479 			ret = -EPERM;
1480 			break;
1481 		}
1482 		sock_valbool_flag(sk, SOCK_TXTIME, true);
1483 		sk->sk_clockid = sk_txtime.clockid;
1484 		sk->sk_txtime_deadline_mode =
1485 			!!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE);
1486 		sk->sk_txtime_report_errors =
1487 			!!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS);
1488 		break;
1489 
1490 	case SO_BINDTOIFINDEX:
1491 		ret = sock_bindtoindex_locked(sk, val);
1492 		break;
1493 
1494 	case SO_BUF_LOCK:
1495 		if (val & ~SOCK_BUF_LOCK_MASK) {
1496 			ret = -EINVAL;
1497 			break;
1498 		}
1499 		sk->sk_userlocks = val | (sk->sk_userlocks &
1500 					  ~SOCK_BUF_LOCK_MASK);
1501 		break;
1502 
1503 	case SO_RESERVE_MEM:
1504 	{
1505 		int delta;
1506 
1507 		if (val < 0) {
1508 			ret = -EINVAL;
1509 			break;
1510 		}
1511 
1512 		delta = val - sk->sk_reserved_mem;
1513 		if (delta < 0)
1514 			sock_release_reserved_memory(sk, -delta);
1515 		else
1516 			ret = sock_reserve_memory(sk, delta);
1517 		break;
1518 	}
1519 
1520 	case SO_TXREHASH:
1521 		if (val < -1 || val > 1) {
1522 			ret = -EINVAL;
1523 			break;
1524 		}
1525 		if ((u8)val == SOCK_TXREHASH_DEFAULT)
1526 			val = READ_ONCE(sock_net(sk)->core.sysctl_txrehash);
1527 		/* Paired with READ_ONCE() in tcp_rtx_synack() */
1528 		WRITE_ONCE(sk->sk_txrehash, (u8)val);
1529 		break;
1530 
1531 	default:
1532 		ret = -ENOPROTOOPT;
1533 		break;
1534 	}
1535 	sockopt_release_sock(sk);
1536 	return ret;
1537 }
1538 
1539 int sock_setsockopt(struct socket *sock, int level, int optname,
1540 		    sockptr_t optval, unsigned int optlen)
1541 {
1542 	return sk_setsockopt(sock->sk, level, optname,
1543 			     optval, optlen);
1544 }
1545 EXPORT_SYMBOL(sock_setsockopt);
1546 
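/*
 * Usage sketch (editor's illustration, not part of the original file):
 * in-kernel callers that still need the generic option path can use
 * sock_setsockopt() with a kernel sockptr; most callers, however, should
 * prefer the dedicated sock_set_*() helpers earlier in this file.
 *
 *	int one = 1;
 *	int err;
 *
 *	err = sock_setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE,
 *			      KERNEL_SOCKPTR(&one), sizeof(one));
 */
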
1547 static const struct cred *sk_get_peer_cred(struct sock *sk)
1548 {
1549 	const struct cred *cred;
1550 
1551 	spin_lock(&sk->sk_peer_lock);
1552 	cred = get_cred(sk->sk_peer_cred);
1553 	spin_unlock(&sk->sk_peer_lock);
1554 
1555 	return cred;
1556 }
1557 
1558 static void cred_to_ucred(struct pid *pid, const struct cred *cred,
1559 			  struct ucred *ucred)
1560 {
1561 	ucred->pid = pid_vnr(pid);
1562 	ucred->uid = ucred->gid = -1;
1563 	if (cred) {
1564 		struct user_namespace *current_ns = current_user_ns();
1565 
1566 		ucred->uid = from_kuid_munged(current_ns, cred->euid);
1567 		ucred->gid = from_kgid_munged(current_ns, cred->egid);
1568 	}
1569 }
1570 
1571 static int groups_to_user(sockptr_t dst, const struct group_info *src)
1572 {
1573 	struct user_namespace *user_ns = current_user_ns();
1574 	int i;
1575 
1576 	for (i = 0; i < src->ngroups; i++) {
1577 		gid_t gid = from_kgid_munged(user_ns, src->gid[i]);
1578 
1579 		if (copy_to_sockptr_offset(dst, i * sizeof(gid), &gid, sizeof(gid)))
1580 			return -EFAULT;
1581 	}
1582 
1583 	return 0;
1584 }
1585 
1586 int sk_getsockopt(struct sock *sk, int level, int optname,
1587 		  sockptr_t optval, sockptr_t optlen)
1588 {
1589 	struct socket *sock = sk->sk_socket;
1590 
1591 	union {
1592 		int val;
1593 		u64 val64;
1594 		unsigned long ulval;
1595 		struct linger ling;
1596 		struct old_timeval32 tm32;
1597 		struct __kernel_old_timeval tm;
1598 		struct  __kernel_sock_timeval stm;
1599 		struct sock_txtime txtime;
1600 		struct so_timestamping timestamping;
1601 	} v;
1602 
1603 	int lv = sizeof(int);
1604 	int len;
1605 
1606 	if (copy_from_sockptr(&len, optlen, sizeof(int)))
1607 		return -EFAULT;
1608 	if (len < 0)
1609 		return -EINVAL;
1610 
1611 	memset(&v, 0, sizeof(v));
1612 
1613 	switch (optname) {
1614 	case SO_DEBUG:
1615 		v.val = sock_flag(sk, SOCK_DBG);
1616 		break;
1617 
1618 	case SO_DONTROUTE:
1619 		v.val = sock_flag(sk, SOCK_LOCALROUTE);
1620 		break;
1621 
1622 	case SO_BROADCAST:
1623 		v.val = sock_flag(sk, SOCK_BROADCAST);
1624 		break;
1625 
1626 	case SO_SNDBUF:
1627 		v.val = sk->sk_sndbuf;
1628 		break;
1629 
1630 	case SO_RCVBUF:
1631 		v.val = sk->sk_rcvbuf;
1632 		break;
1633 
1634 	case SO_REUSEADDR:
1635 		v.val = sk->sk_reuse;
1636 		break;
1637 
1638 	case SO_REUSEPORT:
1639 		v.val = sk->sk_reuseport;
1640 		break;
1641 
1642 	case SO_KEEPALIVE:
1643 		v.val = sock_flag(sk, SOCK_KEEPOPEN);
1644 		break;
1645 
1646 	case SO_TYPE:
1647 		v.val = sk->sk_type;
1648 		break;
1649 
1650 	case SO_PROTOCOL:
1651 		v.val = sk->sk_protocol;
1652 		break;
1653 
1654 	case SO_DOMAIN:
1655 		v.val = sk->sk_family;
1656 		break;
1657 
1658 	case SO_ERROR:
1659 		v.val = -sock_error(sk);
1660 		if (v.val == 0)
1661 			v.val = xchg(&sk->sk_err_soft, 0);
1662 		break;
1663 
1664 	case SO_OOBINLINE:
1665 		v.val = sock_flag(sk, SOCK_URGINLINE);
1666 		break;
1667 
1668 	case SO_NO_CHECK:
1669 		v.val = sk->sk_no_check_tx;
1670 		break;
1671 
1672 	case SO_PRIORITY:
1673 		v.val = sk->sk_priority;
1674 		break;
1675 
1676 	case SO_LINGER:
1677 		lv		= sizeof(v.ling);
1678 		v.ling.l_onoff	= sock_flag(sk, SOCK_LINGER);
1679 		v.ling.l_linger	= sk->sk_lingertime / HZ;
1680 		break;
1681 
1682 	case SO_BSDCOMPAT:
1683 		break;
1684 
1685 	case SO_TIMESTAMP_OLD:
1686 		v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1687 				!sock_flag(sk, SOCK_TSTAMP_NEW) &&
1688 				!sock_flag(sk, SOCK_RCVTSTAMPNS);
1689 		break;
1690 
1691 	case SO_TIMESTAMPNS_OLD:
1692 		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && !sock_flag(sk, SOCK_TSTAMP_NEW);
1693 		break;
1694 
1695 	case SO_TIMESTAMP_NEW:
1696 		v.val = sock_flag(sk, SOCK_RCVTSTAMP) && sock_flag(sk, SOCK_TSTAMP_NEW);
1697 		break;
1698 
1699 	case SO_TIMESTAMPNS_NEW:
1700 		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && sock_flag(sk, SOCK_TSTAMP_NEW);
1701 		break;
1702 
1703 	case SO_TIMESTAMPING_OLD:
1704 		lv = sizeof(v.timestamping);
1705 		v.timestamping.flags = sk->sk_tsflags;
1706 		v.timestamping.bind_phc = sk->sk_bind_phc;
1707 		break;
1708 
1709 	case SO_RCVTIMEO_OLD:
1710 	case SO_RCVTIMEO_NEW:
1711 		lv = sock_get_timeout(sk->sk_rcvtimeo, &v, SO_RCVTIMEO_OLD == optname);
1712 		break;
1713 
1714 	case SO_SNDTIMEO_OLD:
1715 	case SO_SNDTIMEO_NEW:
1716 		lv = sock_get_timeout(sk->sk_sndtimeo, &v, SO_SNDTIMEO_OLD == optname);
1717 		break;
1718 
1719 	case SO_RCVLOWAT:
1720 		v.val = sk->sk_rcvlowat;
1721 		break;
1722 
1723 	case SO_SNDLOWAT:
1724 		v.val = 1;
1725 		break;
1726 
1727 	case SO_PASSCRED:
1728 		v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1729 		break;
1730 
1731 	case SO_PASSPIDFD:
1732 		v.val = !!test_bit(SOCK_PASSPIDFD, &sock->flags);
1733 		break;
1734 
1735 	case SO_PEERCRED:
1736 	{
1737 		struct ucred peercred;
1738 		if (len > sizeof(peercred))
1739 			len = sizeof(peercred);
1740 
1741 		spin_lock(&sk->sk_peer_lock);
1742 		cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1743 		spin_unlock(&sk->sk_peer_lock);
1744 
1745 		if (copy_to_sockptr(optval, &peercred, len))
1746 			return -EFAULT;
1747 		goto lenout;
1748 	}
1749 
1750 	case SO_PEERPIDFD:
1751 	{
1752 		struct pid *peer_pid;
1753 		struct file *pidfd_file = NULL;
1754 		int pidfd;
1755 
1756 		if (len > sizeof(pidfd))
1757 			len = sizeof(pidfd);
1758 
1759 		spin_lock(&sk->sk_peer_lock);
1760 		peer_pid = get_pid(sk->sk_peer_pid);
1761 		spin_unlock(&sk->sk_peer_lock);
1762 
1763 		if (!peer_pid)
1764 			return -ESRCH;
1765 
1766 		pidfd = pidfd_prepare(peer_pid, 0, &pidfd_file);
1767 		put_pid(peer_pid);
1768 		if (pidfd < 0)
1769 			return pidfd;
1770 
1771 		if (copy_to_sockptr(optval, &pidfd, len) ||
1772 		    copy_to_sockptr(optlen, &len, sizeof(int))) {
1773 			put_unused_fd(pidfd);
1774 			fput(pidfd_file);
1775 
1776 			return -EFAULT;
1777 		}
1778 
1779 		fd_install(pidfd, pidfd_file);
1780 		return 0;
1781 	}
1782 
1783 	case SO_PEERGROUPS:
1784 	{
1785 		const struct cred *cred;
1786 		int ret, n;
1787 
1788 		cred = sk_get_peer_cred(sk);
1789 		if (!cred)
1790 			return -ENODATA;
1791 
1792 		n = cred->group_info->ngroups;
1793 		if (len < n * sizeof(gid_t)) {
1794 			len = n * sizeof(gid_t);
1795 			put_cred(cred);
1796 			return copy_to_sockptr(optlen, &len, sizeof(int)) ? -EFAULT : -ERANGE;
1797 		}
1798 		len = n * sizeof(gid_t);
1799 
1800 		ret = groups_to_user(optval, cred->group_info);
1801 		put_cred(cred);
1802 		if (ret)
1803 			return ret;
1804 		goto lenout;
1805 	}
1806 
1807 	case SO_PEERNAME:
1808 	{
1809 		struct sockaddr_storage address;
1810 
1811 		lv = sock->ops->getname(sock, (struct sockaddr *)&address, 2);
1812 		if (lv < 0)
1813 			return -ENOTCONN;
1814 		if (lv < len)
1815 			return -EINVAL;
1816 		if (copy_to_sockptr(optval, &address, len))
1817 			return -EFAULT;
1818 		goto lenout;
1819 	}
1820 
1821 	/* Dubious BSD thing... Probably nobody even uses it, but
1822 	 * the UNIX standard wants it for whatever reason... -DaveM
1823 	 */
1824 	case SO_ACCEPTCONN:
1825 		v.val = sk->sk_state == TCP_LISTEN;
1826 		break;
1827 
1828 	case SO_PASSSEC:
1829 		v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1830 		break;
1831 
1832 	case SO_PEERSEC:
1833 		return security_socket_getpeersec_stream(sock,
1834 							 optval, optlen, len);
1835 
1836 	case SO_MARK:
1837 		v.val = sk->sk_mark;
1838 		break;
1839 
1840 	case SO_RCVMARK:
1841 		v.val = sock_flag(sk, SOCK_RCVMARK);
1842 		break;
1843 
1844 	case SO_RXQ_OVFL:
1845 		v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1846 		break;
1847 
1848 	case SO_WIFI_STATUS:
1849 		v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1850 		break;
1851 
1852 	case SO_PEEK_OFF:
1853 		if (!sock->ops->set_peek_off)
1854 			return -EOPNOTSUPP;
1855 
1856 		v.val = sk->sk_peek_off;
1857 		break;
1858 	case SO_NOFCS:
1859 		v.val = sock_flag(sk, SOCK_NOFCS);
1860 		break;
1861 
1862 	case SO_BINDTODEVICE:
1863 		return sock_getbindtodevice(sk, optval, optlen, len);
1864 
1865 	case SO_GET_FILTER:
1866 		len = sk_get_filter(sk, optval, len);
1867 		if (len < 0)
1868 			return len;
1869 
1870 		goto lenout;
1871 
1872 	case SO_LOCK_FILTER:
1873 		v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1874 		break;
1875 
1876 	case SO_BPF_EXTENSIONS:
1877 		v.val = bpf_tell_extensions();
1878 		break;
1879 
1880 	case SO_SELECT_ERR_QUEUE:
1881 		v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1882 		break;
1883 
1884 #ifdef CONFIG_NET_RX_BUSY_POLL
1885 	case SO_BUSY_POLL:
1886 		v.val = sk->sk_ll_usec;
1887 		break;
1888 	case SO_PREFER_BUSY_POLL:
1889 		v.val = READ_ONCE(sk->sk_prefer_busy_poll);
1890 		break;
1891 #endif
1892 
1893 	case SO_MAX_PACING_RATE:
1894 		if (sizeof(v.ulval) != sizeof(v.val) && len >= sizeof(v.ulval)) {
1895 			lv = sizeof(v.ulval);
1896 			v.ulval = sk->sk_max_pacing_rate;
1897 		} else {
1898 			/* 32bit version */
1899 			v.val = min_t(unsigned long, sk->sk_max_pacing_rate, ~0U);
1900 		}
1901 		break;
1902 
1903 	case SO_INCOMING_CPU:
1904 		v.val = READ_ONCE(sk->sk_incoming_cpu);
1905 		break;
1906 
1907 	case SO_MEMINFO:
1908 	{
1909 		u32 meminfo[SK_MEMINFO_VARS];
1910 
1911 		sk_get_meminfo(sk, meminfo);
1912 
1913 		len = min_t(unsigned int, len, sizeof(meminfo));
1914 		if (copy_to_sockptr(optval, &meminfo, len))
1915 			return -EFAULT;
1916 
1917 		goto lenout;
1918 	}
1919 
1920 #ifdef CONFIG_NET_RX_BUSY_POLL
1921 	case SO_INCOMING_NAPI_ID:
1922 		v.val = READ_ONCE(sk->sk_napi_id);
1923 
1924 		/* aggregate non-NAPI IDs down to 0 */
1925 		if (v.val < MIN_NAPI_ID)
1926 			v.val = 0;
1927 
1928 		break;
1929 #endif
1930 
1931 	case SO_COOKIE:
1932 		lv = sizeof(u64);
1933 		if (len < lv)
1934 			return -EINVAL;
1935 		v.val64 = sock_gen_cookie(sk);
1936 		break;
1937 
1938 	case SO_ZEROCOPY:
1939 		v.val = sock_flag(sk, SOCK_ZEROCOPY);
1940 		break;
1941 
1942 	case SO_TXTIME:
1943 		lv = sizeof(v.txtime);
1944 		v.txtime.clockid = sk->sk_clockid;
1945 		v.txtime.flags |= sk->sk_txtime_deadline_mode ?
1946 				  SOF_TXTIME_DEADLINE_MODE : 0;
1947 		v.txtime.flags |= sk->sk_txtime_report_errors ?
1948 				  SOF_TXTIME_REPORT_ERRORS : 0;
1949 		break;
1950 
1951 	case SO_BINDTOIFINDEX:
1952 		v.val = READ_ONCE(sk->sk_bound_dev_if);
1953 		break;
1954 
1955 	case SO_NETNS_COOKIE:
1956 		lv = sizeof(u64);
1957 		if (len != lv)
1958 			return -EINVAL;
1959 		v.val64 = sock_net(sk)->net_cookie;
1960 		break;
1961 
1962 	case SO_BUF_LOCK:
1963 		v.val = sk->sk_userlocks & SOCK_BUF_LOCK_MASK;
1964 		break;
1965 
1966 	case SO_RESERVE_MEM:
1967 		v.val = sk->sk_reserved_mem;
1968 		break;
1969 
1970 	case SO_TXREHASH:
1971 		v.val = sk->sk_txrehash;
1972 		break;
1973 
1974 	default:
1975 		/* We implement SO_SNDLOWAT etc. to not be settable
1976 		 * (1003.1g 7).
1977 		 */
1978 		return -ENOPROTOOPT;
1979 	}
1980 
1981 	if (len > lv)
1982 		len = lv;
1983 	if (copy_to_sockptr(optval, &v, len))
1984 		return -EFAULT;
1985 lenout:
1986 	if (copy_to_sockptr(optlen, &len, sizeof(int)))
1987 		return -EFAULT;
1988 	return 0;
1989 }
1990 
1991 int sock_getsockopt(struct socket *sock, int level, int optname,
1992 		    char __user *optval, int __user *optlen)
1993 {
1994 	return sk_getsockopt(sock->sk, level, optname,
1995 			     USER_SOCKPTR(optval),
1996 			     USER_SOCKPTR(optlen));
1997 }
1998 
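/*
 * Usage sketch (editor's illustration, not part of the original file):
 * a classic user of this getsockopt path is reading SO_ERROR after a
 * non-blocking connect(); as in the SO_ERROR case above, fetching the
 * value also clears the pending error on the socket.
 *
 *	int err = 0;
 *	socklen_t len = sizeof(err);
 *
 *	if (getsockopt(fd, SOL_SOCKET, SO_ERROR, &err, &len) == 0 && err)
 *		fprintf(stderr, "connect failed: %s\n", strerror(err));
 */
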
1999 /*
2000  * Initialize an sk_lock.
2001  *
2002  * (We also register the sk_lock with the lock validator.)
2003  */
2004 static inline void sock_lock_init(struct sock *sk)
2005 {
2006 	if (sk->sk_kern_sock)
2007 		sock_lock_init_class_and_name(
2008 			sk,
2009 			af_family_kern_slock_key_strings[sk->sk_family],
2010 			af_family_kern_slock_keys + sk->sk_family,
2011 			af_family_kern_key_strings[sk->sk_family],
2012 			af_family_kern_keys + sk->sk_family);
2013 	else
2014 		sock_lock_init_class_and_name(
2015 			sk,
2016 			af_family_slock_key_strings[sk->sk_family],
2017 			af_family_slock_keys + sk->sk_family,
2018 			af_family_key_strings[sk->sk_family],
2019 			af_family_keys + sk->sk_family);
2020 }
2021 
2022 /*
2023  * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
2024  * even temporarly, because of RCU lookups. sk_node should also be left as is.
2025  * even temporarily, because of RCU lookups. sk_node should also be left as is.
2026  */
2027 static void sock_copy(struct sock *nsk, const struct sock *osk)
2028 {
2029 	const struct proto *prot = READ_ONCE(osk->sk_prot);
2030 #ifdef CONFIG_SECURITY_NETWORK
2031 	void *sptr = nsk->sk_security;
2032 #endif
2033 
2034 	/* If we move sk_tx_queue_mapping out of the private section,
2035 	 * we must check if sk_tx_queue_clear() is called after
2036 	 * sock_copy() in sk_clone_lock().
2037 	 */
2038 	BUILD_BUG_ON(offsetof(struct sock, sk_tx_queue_mapping) <
2039 		     offsetof(struct sock, sk_dontcopy_begin) ||
2040 		     offsetof(struct sock, sk_tx_queue_mapping) >=
2041 		     offsetof(struct sock, sk_dontcopy_end));
2042 
2043 	memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
2044 
2045 	memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
2046 	       prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
2047 
2048 #ifdef CONFIG_SECURITY_NETWORK
2049 	nsk->sk_security = sptr;
2050 	security_sk_clone(osk, nsk);
2051 #endif
2052 }
2053 
2054 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
2055 		int family)
2056 {
2057 	struct sock *sk;
2058 	struct kmem_cache *slab;
2059 
2060 	slab = prot->slab;
2061 	if (slab != NULL) {
2062 		sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
2063 		if (!sk)
2064 			return sk;
2065 		if (want_init_on_alloc(priority))
2066 			sk_prot_clear_nulls(sk, prot->obj_size);
2067 	} else
2068 		sk = kmalloc(prot->obj_size, priority);
2069 
2070 	if (sk != NULL) {
2071 		if (security_sk_alloc(sk, family, priority))
2072 			goto out_free;
2073 
2074 		if (!try_module_get(prot->owner))
2075 			goto out_free_sec;
2076 	}
2077 
2078 	return sk;
2079 
2080 out_free_sec:
2081 	security_sk_free(sk);
2082 out_free:
2083 	if (slab != NULL)
2084 		kmem_cache_free(slab, sk);
2085 	else
2086 		kfree(sk);
2087 	return NULL;
2088 }
2089 
2090 static void sk_prot_free(struct proto *prot, struct sock *sk)
2091 {
2092 	struct kmem_cache *slab;
2093 	struct module *owner;
2094 
2095 	owner = prot->owner;
2096 	slab = prot->slab;
2097 
2098 	cgroup_sk_free(&sk->sk_cgrp_data);
2099 	mem_cgroup_sk_free(sk);
2100 	security_sk_free(sk);
2101 	if (slab != NULL)
2102 		kmem_cache_free(slab, sk);
2103 	else
2104 		kfree(sk);
2105 	module_put(owner);
2106 }
2107 
2108 /**
2109  *	sk_alloc - All socket objects are allocated here
2110  *	@net: the applicable net namespace
2111  *	@family: protocol family
2112  *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
2113  *	@prot: struct proto associated with this new sock instance
2114  *	@kern: is this to be a kernel socket?
2115  */
2116 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
2117 		      struct proto *prot, int kern)
2118 {
2119 	struct sock *sk;
2120 
2121 	sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
2122 	if (sk) {
2123 		sk->sk_family = family;
2124 		/*
2125 		 * See comment in struct sock definition to understand
2126 		 * why we need sk_prot_creator -acme
2127 		 */
2128 		sk->sk_prot = sk->sk_prot_creator = prot;
2129 		sk->sk_kern_sock = kern;
2130 		sock_lock_init(sk);
2131 		sk->sk_net_refcnt = kern ? 0 : 1;
2132 		if (likely(sk->sk_net_refcnt)) {
2133 			get_net_track(net, &sk->ns_tracker, priority);
2134 			sock_inuse_add(net, 1);
2135 		} else {
2136 			__netns_tracker_alloc(net, &sk->ns_tracker,
2137 					      false, priority);
2138 		}
2139 
2140 		sock_net_set(sk, net);
2141 		refcount_set(&sk->sk_wmem_alloc, 1);
2142 
2143 		mem_cgroup_sk_alloc(sk);
2144 		cgroup_sk_alloc(&sk->sk_cgrp_data);
2145 		sock_update_classid(&sk->sk_cgrp_data);
2146 		sock_update_netprioidx(&sk->sk_cgrp_data);
2147 		sk_tx_queue_clear(sk);
2148 	}
2149 
2150 	return sk;
2151 }
2152 EXPORT_SYMBOL(sk_alloc);
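
/* Illustrative sketch (hypothetical caller, not a real protocol): a family's
 * ->create() handler typically pairs sk_alloc() with sock_init_data(); the
 * family constant, proto and function names below are made up.
 *
 *	static int my_create(struct net *net, struct socket *sock,
 *			     int protocol, int kern)
 *	{
 *		struct sock *sk;
 *
 *		sk = sk_alloc(net, PF_MYFAMILY, GFP_KERNEL, &my_proto, kern);
 *		if (!sk)
 *			return -ENOMEM;
 *		sock_init_data(sock, sk);
 *		sk->sk_protocol = protocol;
 *		return 0;
 *	}
 */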
2153 
2154 /* Sockets having SOCK_RCU_FREE will call this function after one RCU
2155  * grace period. This is the case for UDP sockets and TCP listeners.
2156  */
2157 static void __sk_destruct(struct rcu_head *head)
2158 {
2159 	struct sock *sk = container_of(head, struct sock, sk_rcu);
2160 	struct sk_filter *filter;
2161 
2162 	if (sk->sk_destruct)
2163 		sk->sk_destruct(sk);
2164 
2165 	filter = rcu_dereference_check(sk->sk_filter,
2166 				       refcount_read(&sk->sk_wmem_alloc) == 0);
2167 	if (filter) {
2168 		sk_filter_uncharge(sk, filter);
2169 		RCU_INIT_POINTER(sk->sk_filter, NULL);
2170 	}
2171 
2172 	sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
2173 
2174 #ifdef CONFIG_BPF_SYSCALL
2175 	bpf_sk_storage_free(sk);
2176 #endif
2177 
2178 	if (atomic_read(&sk->sk_omem_alloc))
2179 		pr_debug("%s: optmem leakage (%d bytes) detected\n",
2180 			 __func__, atomic_read(&sk->sk_omem_alloc));
2181 
2182 	if (sk->sk_frag.page) {
2183 		put_page(sk->sk_frag.page);
2184 		sk->sk_frag.page = NULL;
2185 	}
2186 
2187 	/* We do not need to acquire sk->sk_peer_lock, we are the last user. */
2188 	put_cred(sk->sk_peer_cred);
2189 	put_pid(sk->sk_peer_pid);
2190 
2191 	if (likely(sk->sk_net_refcnt))
2192 		put_net_track(sock_net(sk), &sk->ns_tracker);
2193 	else
2194 		__netns_tracker_free(sock_net(sk), &sk->ns_tracker, false);
2195 
2196 	sk_prot_free(sk->sk_prot_creator, sk);
2197 }
2198 
2199 void sk_destruct(struct sock *sk)
2200 {
2201 	bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE);
2202 
2203 	if (rcu_access_pointer(sk->sk_reuseport_cb)) {
2204 		reuseport_detach_sock(sk);
2205 		use_call_rcu = true;
2206 	}
2207 
2208 	if (use_call_rcu)
2209 		call_rcu(&sk->sk_rcu, __sk_destruct);
2210 	else
2211 		__sk_destruct(&sk->sk_rcu);
2212 }
2213 
2214 static void __sk_free(struct sock *sk)
2215 {
2216 	if (likely(sk->sk_net_refcnt))
2217 		sock_inuse_add(sock_net(sk), -1);
2218 
2219 	if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk)))
2220 		sock_diag_broadcast_destroy(sk);
2221 	else
2222 		sk_destruct(sk);
2223 }
2224 
2225 void sk_free(struct sock *sk)
2226 {
2227 	/*
2228 	 * We subtract one from sk_wmem_alloc so we can tell whether
2229 	 * some packets are still in some tx queue.
2230 	 * If it is not zero, sock_wfree() will call __sk_free(sk) later
2231 	 */
2232 	if (refcount_dec_and_test(&sk->sk_wmem_alloc))
2233 		__sk_free(sk);
2234 }
2235 EXPORT_SYMBOL(sk_free);
2236 
2237 static void sk_init_common(struct sock *sk)
2238 {
2239 	skb_queue_head_init(&sk->sk_receive_queue);
2240 	skb_queue_head_init(&sk->sk_write_queue);
2241 	skb_queue_head_init(&sk->sk_error_queue);
2242 
2243 	rwlock_init(&sk->sk_callback_lock);
2244 	lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
2245 			af_rlock_keys + sk->sk_family,
2246 			af_family_rlock_key_strings[sk->sk_family]);
2247 	lockdep_set_class_and_name(&sk->sk_write_queue.lock,
2248 			af_wlock_keys + sk->sk_family,
2249 			af_family_wlock_key_strings[sk->sk_family]);
2250 	lockdep_set_class_and_name(&sk->sk_error_queue.lock,
2251 			af_elock_keys + sk->sk_family,
2252 			af_family_elock_key_strings[sk->sk_family]);
2253 	lockdep_set_class_and_name(&sk->sk_callback_lock,
2254 			af_callback_keys + sk->sk_family,
2255 			af_family_clock_key_strings[sk->sk_family]);
2256 }
2257 
2258 /**
2259  *	sk_clone_lock - clone a socket, and lock its clone
2260  *	@sk: the socket to clone
2261  *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
2262  *
2263  *	Caller must unlock socket even in error path (bh_unlock_sock(newsk))
2264  */
2265 struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
2266 {
2267 	struct proto *prot = READ_ONCE(sk->sk_prot);
2268 	struct sk_filter *filter;
2269 	bool is_charged = true;
2270 	struct sock *newsk;
2271 
2272 	newsk = sk_prot_alloc(prot, priority, sk->sk_family);
2273 	if (!newsk)
2274 		goto out;
2275 
2276 	sock_copy(newsk, sk);
2277 
2278 	newsk->sk_prot_creator = prot;
2279 
2280 	/* SANITY */
2281 	if (likely(newsk->sk_net_refcnt)) {
2282 		get_net_track(sock_net(newsk), &newsk->ns_tracker, priority);
2283 		sock_inuse_add(sock_net(newsk), 1);
2284 	} else {
2285 		/* Kernel sockets are not elevating the struct net refcount.
2286 		/* Kernel sockets do not elevate the struct net refcount.
2287 		 * is not properly dismantling its kernel sockets at netns
2288 		 * destroy time.
2289 		 */
2290 		__netns_tracker_alloc(sock_net(newsk), &newsk->ns_tracker,
2291 				      false, priority);
2292 	}
2293 	sk_node_init(&newsk->sk_node);
2294 	sock_lock_init(newsk);
2295 	bh_lock_sock(newsk);
2296 	newsk->sk_backlog.head	= newsk->sk_backlog.tail = NULL;
2297 	newsk->sk_backlog.len = 0;
2298 
2299 	atomic_set(&newsk->sk_rmem_alloc, 0);
2300 
2301 	/* sk_wmem_alloc set to one (see sk_free() and sock_wfree()) */
2302 	refcount_set(&newsk->sk_wmem_alloc, 1);
2303 
2304 	atomic_set(&newsk->sk_omem_alloc, 0);
2305 	sk_init_common(newsk);
2306 
2307 	newsk->sk_dst_cache	= NULL;
2308 	newsk->sk_dst_pending_confirm = 0;
2309 	newsk->sk_wmem_queued	= 0;
2310 	newsk->sk_forward_alloc = 0;
2311 	newsk->sk_reserved_mem  = 0;
2312 	atomic_set(&newsk->sk_drops, 0);
2313 	newsk->sk_send_head	= NULL;
2314 	newsk->sk_userlocks	= sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
2315 	atomic_set(&newsk->sk_zckey, 0);
2316 
2317 	sock_reset_flag(newsk, SOCK_DONE);
2318 
2319 	/* sk->sk_memcg will be populated at accept() time */
2320 	newsk->sk_memcg = NULL;
2321 
2322 	cgroup_sk_clone(&newsk->sk_cgrp_data);
2323 
2324 	rcu_read_lock();
2325 	filter = rcu_dereference(sk->sk_filter);
2326 	if (filter != NULL)
2327 		/* though it's an empty new sock, the charging may fail
2328 		 * if sysctl_optmem_max was changed between creation of
2329 		 * original socket and cloning
2330 		 */
2331 		is_charged = sk_filter_charge(newsk, filter);
2332 	RCU_INIT_POINTER(newsk->sk_filter, filter);
2333 	rcu_read_unlock();
2334 
2335 	if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
2336 		/* We need to make sure that we don't uncharge the new
2337 		 * socket if we couldn't charge it in the first place
2338 		 * as otherwise we uncharge the parent's filter.
2339 		 */
2340 		if (!is_charged)
2341 			RCU_INIT_POINTER(newsk->sk_filter, NULL);
2342 		sk_free_unlock_clone(newsk);
2343 		newsk = NULL;
2344 		goto out;
2345 	}
2346 	RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
2347 
2348 	if (bpf_sk_storage_clone(sk, newsk)) {
2349 		sk_free_unlock_clone(newsk);
2350 		newsk = NULL;
2351 		goto out;
2352 	}
2353 
2354 	/* Clear sk_user_data if parent had the pointer tagged
2355 	 * as not suitable for copying when cloning.
2356 	 */
2357 	if (sk_user_data_is_nocopy(newsk))
2358 		newsk->sk_user_data = NULL;
2359 
2360 	newsk->sk_err	   = 0;
2361 	newsk->sk_err_soft = 0;
2362 	newsk->sk_priority = 0;
2363 	newsk->sk_incoming_cpu = raw_smp_processor_id();
2364 
2365 	/* Before updating sk_refcnt, we must commit prior changes to memory
2366 	 * (Documentation/RCU/rculist_nulls.rst for details)
2367 	 */
2368 	smp_wmb();
2369 	refcount_set(&newsk->sk_refcnt, 2);
2370 
2371 	sk_set_socket(newsk, NULL);
2372 	sk_tx_queue_clear(newsk);
2373 	RCU_INIT_POINTER(newsk->sk_wq, NULL);
2374 
2375 	if (newsk->sk_prot->sockets_allocated)
2376 		sk_sockets_allocated_inc(newsk);
2377 
2378 	if (sock_needs_netstamp(sk) && newsk->sk_flags & SK_FLAGS_TIMESTAMP)
2379 		net_enable_timestamp();
2380 out:
2381 	return newsk;
2382 }
2383 EXPORT_SYMBOL_GPL(sk_clone_lock);
2384 
2385 void sk_free_unlock_clone(struct sock *sk)
2386 {
2387 	/* It is still a raw copy of the parent, so invalidate
2388 	 * the destructor and do a plain sk_free() */
2389 	sk->sk_destruct = NULL;
2390 	bh_unlock_sock(sk);
2391 	sk_free(sk);
2392 }
2393 EXPORT_SYMBOL_GPL(sk_free_unlock_clone);
2394 
2395 static u32 sk_dst_gso_max_size(struct sock *sk, struct dst_entry *dst)
2396 {
2397 	bool is_ipv6 = false;
2398 	u32 max_size;
2399 
2400 #if IS_ENABLED(CONFIG_IPV6)
2401 	is_ipv6 = (sk->sk_family == AF_INET6 &&
2402 		   !ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr));
2403 #endif
2404 	/* pairs with the WRITE_ONCE() in netif_set_gso(_ipv4)_max_size() */
2405 	max_size = is_ipv6 ? READ_ONCE(dst->dev->gso_max_size) :
2406 			READ_ONCE(dst->dev->gso_ipv4_max_size);
2407 	if (max_size > GSO_LEGACY_MAX_SIZE && !sk_is_tcp(sk))
2408 		max_size = GSO_LEGACY_MAX_SIZE;
2409 
2410 	return max_size - (MAX_TCP_HEADER + 1);
2411 }
2412 
2413 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
2414 {
2415 	u32 max_segs = 1;
2416 
2417 	sk->sk_route_caps = dst->dev->features;
2418 	if (sk_is_tcp(sk))
2419 		sk->sk_route_caps |= NETIF_F_GSO;
2420 	if (sk->sk_route_caps & NETIF_F_GSO)
2421 		sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
2422 	if (unlikely(sk->sk_gso_disabled))
2423 		sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
2424 	if (sk_can_gso(sk)) {
2425 		if (dst->header_len && !xfrm_dst_offload_ok(dst)) {
2426 			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
2427 		} else {
2428 			sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
2429 			sk->sk_gso_max_size = sk_dst_gso_max_size(sk, dst);
2430 			/* pairs with the WRITE_ONCE() in netif_set_gso_max_segs() */
2431 			max_segs = max_t(u32, READ_ONCE(dst->dev->gso_max_segs), 1);
2432 		}
2433 	}
2434 	sk->sk_gso_max_segs = max_segs;
2435 	sk_dst_set(sk, dst);
2436 }
2437 EXPORT_SYMBOL_GPL(sk_setup_caps);
2438 
2439 /*
2440  *	Simple resource managers for sockets.
2441  */
2442 
2443 
2444 /*
2445  * Write buffer destructor automatically called from kfree_skb.
2446  */
2447 void sock_wfree(struct sk_buff *skb)
2448 {
2449 	struct sock *sk = skb->sk;
2450 	unsigned int len = skb->truesize;
2451 	bool free;
2452 
2453 	if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
2454 		if (sock_flag(sk, SOCK_RCU_FREE) &&
2455 		    sk->sk_write_space == sock_def_write_space) {
2456 			rcu_read_lock();
2457 			free = refcount_sub_and_test(len, &sk->sk_wmem_alloc);
2458 			sock_def_write_space_wfree(sk);
2459 			rcu_read_unlock();
2460 			if (unlikely(free))
2461 				__sk_free(sk);
2462 			return;
2463 		}
2464 
2465 		/*
2466 		 * Keep a reference on sk_wmem_alloc; it will be released
2467 		 * after the sk_write_space() call
2468 		 */
2469 		WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
2470 		sk->sk_write_space(sk);
2471 		len = 1;
2472 	}
2473 	/*
2474 	 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
2475 	 * could not do because of in-flight packets
2476 	 */
2477 	if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
2478 		__sk_free(sk);
2479 }
2480 EXPORT_SYMBOL(sock_wfree);
2481 
2482 /* This variant of sock_wfree() is used by TCP,
2483  * since it sets SOCK_USE_WRITE_QUEUE.
2484  */
2485 void __sock_wfree(struct sk_buff *skb)
2486 {
2487 	struct sock *sk = skb->sk;
2488 
2489 	if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
2490 		__sk_free(sk);
2491 }
2492 
2493 void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
2494 {
2495 	skb_orphan(skb);
2496 	skb->sk = sk;
2497 #ifdef CONFIG_INET
2498 	if (unlikely(!sk_fullsock(sk))) {
2499 		skb->destructor = sock_edemux;
2500 		sock_hold(sk);
2501 		return;
2502 	}
2503 #endif
2504 	skb->destructor = sock_wfree;
2505 	skb_set_hash_from_sk(skb, sk);
2506 	/*
2507 	 * We used to take a refcount on sk, but the following operation
2508 	 * is enough to guarantee sk_free() won't free this sock until
2509 	 * all in-flight packets are completed
2510 	 */
2511 	refcount_add(skb->truesize, &sk->sk_wmem_alloc);
2512 }
2513 EXPORT_SYMBOL(skb_set_owner_w);
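
/* Illustrative sketch: a transmit path charges an skb to the sending socket
 * so that sk_wmem_alloc reflects the queued bytes; the charge is dropped by
 * sock_wfree() when the skb is eventually freed.
 *
 *	skb = alloc_skb(len, GFP_KERNEL);
 *	if (!skb)
 *		return -ENOBUFS;
 *	skb_set_owner_w(skb, sk);	(adds skb->truesize to sk_wmem_alloc)
 *	...
 *	kfree_skb(skb);			(sock_wfree() uncharges and may free sk)
 */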
2514 
2515 static bool can_skb_orphan_partial(const struct sk_buff *skb)
2516 {
2517 #ifdef CONFIG_TLS_DEVICE
2518 	/* Drivers depend on in-order delivery for crypto offload,
2519 	 * partial orphan breaks out-of-order-OK logic.
2520 	 */
2521 	if (skb->decrypted)
2522 		return false;
2523 #endif
2524 	return (skb->destructor == sock_wfree ||
2525 		(IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree));
2526 }
2527 
2528 /* This helper is used by netem, as it can hold packets in its
2529  * delay queue. We want to allow the owner socket to send more
2530  * packets, as if they were already TX completed by a typical driver.
2531  * But we also want to keep skb->sk set because some packet schedulers
2532  * rely on it (sch_fq for example).
2533  */
2534 void skb_orphan_partial(struct sk_buff *skb)
2535 {
2536 	if (skb_is_tcp_pure_ack(skb))
2537 		return;
2538 
2539 	if (can_skb_orphan_partial(skb) && skb_set_owner_sk_safe(skb, skb->sk))
2540 		return;
2541 
2542 	skb_orphan(skb);
2543 }
2544 EXPORT_SYMBOL(skb_orphan_partial);
2545 
2546 /*
2547  * Read buffer destructor automatically called from kfree_skb.
2548  */
2549 void sock_rfree(struct sk_buff *skb)
2550 {
2551 	struct sock *sk = skb->sk;
2552 	unsigned int len = skb->truesize;
2553 
2554 	atomic_sub(len, &sk->sk_rmem_alloc);
2555 	sk_mem_uncharge(sk, len);
2556 }
2557 EXPORT_SYMBOL(sock_rfree);
2558 
2559 /*
2560  * Buffer destructor for skbs that are not used directly in read or write
2561  * path, e.g. for error handler skbs. Automatically called from kfree_skb.
2562  */
2563 void sock_efree(struct sk_buff *skb)
2564 {
2565 	sock_put(skb->sk);
2566 }
2567 EXPORT_SYMBOL(sock_efree);
2568 
2569 /* Buffer destructor for prefetch/receive path where reference count may
2570  * not be held, e.g. for listen sockets.
2571  */
2572 #ifdef CONFIG_INET
2573 void sock_pfree(struct sk_buff *skb)
2574 {
2575 	if (sk_is_refcounted(skb->sk))
2576 		sock_gen_put(skb->sk);
2577 }
2578 EXPORT_SYMBOL(sock_pfree);
2579 #endif /* CONFIG_INET */
2580 
2581 kuid_t sock_i_uid(struct sock *sk)
2582 {
2583 	kuid_t uid;
2584 
2585 	read_lock_bh(&sk->sk_callback_lock);
2586 	uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
2587 	read_unlock_bh(&sk->sk_callback_lock);
2588 	return uid;
2589 }
2590 EXPORT_SYMBOL(sock_i_uid);
2591 
2592 unsigned long __sock_i_ino(struct sock *sk)
2593 {
2594 	unsigned long ino;
2595 
2596 	read_lock(&sk->sk_callback_lock);
2597 	ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
2598 	read_unlock(&sk->sk_callback_lock);
2599 	return ino;
2600 }
2601 EXPORT_SYMBOL(__sock_i_ino);
2602 
2603 unsigned long sock_i_ino(struct sock *sk)
2604 {
2605 	unsigned long ino;
2606 
2607 	local_bh_disable();
2608 	ino = __sock_i_ino(sk);
2609 	local_bh_enable();
2610 	return ino;
2611 }
2612 EXPORT_SYMBOL(sock_i_ino);
2613 
2614 /*
2615  * Allocate a skb from the socket's send buffer.
2616  */
2617 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
2618 			     gfp_t priority)
2619 {
2620 	if (force ||
2621 	    refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) {
2622 		struct sk_buff *skb = alloc_skb(size, priority);
2623 
2624 		if (skb) {
2625 			skb_set_owner_w(skb, sk);
2626 			return skb;
2627 		}
2628 	}
2629 	return NULL;
2630 }
2631 EXPORT_SYMBOL(sock_wmalloc);
2632 
2633 static void sock_ofree(struct sk_buff *skb)
2634 {
2635 	struct sock *sk = skb->sk;
2636 
2637 	atomic_sub(skb->truesize, &sk->sk_omem_alloc);
2638 }
2639 
2640 struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
2641 			     gfp_t priority)
2642 {
2643 	struct sk_buff *skb;
2644 
2645 	/* small safe race: SKB_TRUESIZE may differ from final skb->truesize */
2646 	if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) >
2647 	    READ_ONCE(sysctl_optmem_max))
2648 		return NULL;
2649 
2650 	skb = alloc_skb(size, priority);
2651 	if (!skb)
2652 		return NULL;
2653 
2654 	atomic_add(skb->truesize, &sk->sk_omem_alloc);
2655 	skb->sk = sk;
2656 	skb->destructor = sock_ofree;
2657 	return skb;
2658 }
2659 
2660 /*
2661  * Allocate a memory block from the socket's option memory buffer.
2662  */
2663 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
2664 {
2665 	int optmem_max = READ_ONCE(sysctl_optmem_max);
2666 
2667 	if ((unsigned int)size <= optmem_max &&
2668 	    atomic_read(&sk->sk_omem_alloc) + size < optmem_max) {
2669 		void *mem;
2670 		/* First do the add, to avoid the race if kmalloc
2671 		 * might sleep.
2672 		 */
2673 		atomic_add(size, &sk->sk_omem_alloc);
2674 		mem = kmalloc(size, priority);
2675 		if (mem)
2676 			return mem;
2677 		atomic_sub(size, &sk->sk_omem_alloc);
2678 	}
2679 	return NULL;
2680 }
2681 EXPORT_SYMBOL(sock_kmalloc);
2682 
2683 /* Free an option memory block. Note, we actually want the inline
2684  * here as this allows gcc to detect the nullify and fold away the
2685  * condition entirely.
2686  */
2687 static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
2688 				  const bool nullify)
2689 {
2690 	if (WARN_ON_ONCE(!mem))
2691 		return;
2692 	if (nullify)
2693 		kfree_sensitive(mem);
2694 	else
2695 		kfree(mem);
2696 	atomic_sub(size, &sk->sk_omem_alloc);
2697 }
2698 
2699 void sock_kfree_s(struct sock *sk, void *mem, int size)
2700 {
2701 	__sock_kfree_s(sk, mem, size, false);
2702 }
2703 EXPORT_SYMBOL(sock_kfree_s);
2704 
2705 void sock_kzfree_s(struct sock *sk, void *mem, int size)
2706 {
2707 	__sock_kfree_s(sk, mem, size, true);
2708 }
2709 EXPORT_SYMBOL(sock_kzfree_s);
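
/* Illustrative sketch: option memory is charged against sk_omem_alloc and
 * must be released with the same size it was charged with; sock_kzfree_s()
 * additionally wipes the buffer, for keys and other sensitive data.
 *
 *	buf = sock_kmalloc(sk, len, GFP_KERNEL);
 *	if (!buf)
 *		return -ENOBUFS;
 *	...
 *	sock_kfree_s(sk, buf, len);
 */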
2710 
2711 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
2712    I think these locks should be removed for datagram sockets.
2713  */
2714 static long sock_wait_for_wmem(struct sock *sk, long timeo)
2715 {
2716 	DEFINE_WAIT(wait);
2717 
2718 	sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2719 	for (;;) {
2720 		if (!timeo)
2721 			break;
2722 		if (signal_pending(current))
2723 			break;
2724 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2725 		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
2726 		if (refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf))
2727 			break;
2728 		if (sk->sk_shutdown & SEND_SHUTDOWN)
2729 			break;
2730 		if (sk->sk_err)
2731 			break;
2732 		timeo = schedule_timeout(timeo);
2733 	}
2734 	finish_wait(sk_sleep(sk), &wait);
2735 	return timeo;
2736 }
2737 
2738 
2739 /*
2740  *	Generic send/receive buffer handlers
2741  */
2742 
2743 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
2744 				     unsigned long data_len, int noblock,
2745 				     int *errcode, int max_page_order)
2746 {
2747 	struct sk_buff *skb;
2748 	long timeo;
2749 	int err;
2750 
2751 	timeo = sock_sndtimeo(sk, noblock);
2752 	for (;;) {
2753 		err = sock_error(sk);
2754 		if (err != 0)
2755 			goto failure;
2756 
2757 		err = -EPIPE;
2758 		if (sk->sk_shutdown & SEND_SHUTDOWN)
2759 			goto failure;
2760 
2761 		if (sk_wmem_alloc_get(sk) < READ_ONCE(sk->sk_sndbuf))
2762 			break;
2763 
2764 		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2765 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2766 		err = -EAGAIN;
2767 		if (!timeo)
2768 			goto failure;
2769 		if (signal_pending(current))
2770 			goto interrupted;
2771 		timeo = sock_wait_for_wmem(sk, timeo);
2772 	}
2773 	skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
2774 				   errcode, sk->sk_allocation);
2775 	if (skb)
2776 		skb_set_owner_w(skb, sk);
2777 	return skb;
2778 
2779 interrupted:
2780 	err = sock_intr_errno(timeo);
2781 failure:
2782 	*errcode = err;
2783 	return NULL;
2784 }
2785 EXPORT_SYMBOL(sock_alloc_send_pskb);
2786 
2787 int __sock_cmsg_send(struct sock *sk, struct cmsghdr *cmsg,
2788 		     struct sockcm_cookie *sockc)
2789 {
2790 	u32 tsflags;
2791 
2792 	switch (cmsg->cmsg_type) {
2793 	case SO_MARK:
2794 		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
2795 		    !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2796 			return -EPERM;
2797 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2798 			return -EINVAL;
2799 		sockc->mark = *(u32 *)CMSG_DATA(cmsg);
2800 		break;
2801 	case SO_TIMESTAMPING_OLD:
2802 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2803 			return -EINVAL;
2804 
2805 		tsflags = *(u32 *)CMSG_DATA(cmsg);
2806 		if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
2807 			return -EINVAL;
2808 
2809 		sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
2810 		sockc->tsflags |= tsflags;
2811 		break;
2812 	case SCM_TXTIME:
2813 		if (!sock_flag(sk, SOCK_TXTIME))
2814 			return -EINVAL;
2815 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64)))
2816 			return -EINVAL;
2817 		sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg));
2818 		break;
2819 	/* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
2820 	case SCM_RIGHTS:
2821 	case SCM_CREDENTIALS:
2822 		break;
2823 	default:
2824 		return -EINVAL;
2825 	}
2826 	return 0;
2827 }
2828 EXPORT_SYMBOL(__sock_cmsg_send);
2829 
2830 int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
2831 		   struct sockcm_cookie *sockc)
2832 {
2833 	struct cmsghdr *cmsg;
2834 	int ret;
2835 
2836 	for_each_cmsghdr(cmsg, msg) {
2837 		if (!CMSG_OK(msg, cmsg))
2838 			return -EINVAL;
2839 		if (cmsg->cmsg_level != SOL_SOCKET)
2840 			continue;
2841 		ret = __sock_cmsg_send(sk, cmsg, sockc);
2842 		if (ret)
2843 			return ret;
2844 	}
2845 	return 0;
2846 }
2847 EXPORT_SYMBOL(sock_cmsg_send);
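
/* Illustrative sketch: a protocol's sendmsg() typically seeds a sockcm_cookie
 * from the socket defaults (e.g. via sockcm_init()) and lets sock_cmsg_send()
 * override it from SOL_SOCKET control messages before building packets.
 *
 *	struct sockcm_cookie sockc;
 *
 *	sockcm_init(&sockc, sk);
 *	if (msg->msg_controllen) {
 *		err = sock_cmsg_send(sk, msg, &sockc);
 *		if (err)
 *			return err;
 *	}
 */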
2848 
2849 static void sk_enter_memory_pressure(struct sock *sk)
2850 {
2851 	if (!sk->sk_prot->enter_memory_pressure)
2852 		return;
2853 
2854 	sk->sk_prot->enter_memory_pressure(sk);
2855 }
2856 
2857 static void sk_leave_memory_pressure(struct sock *sk)
2858 {
2859 	if (sk->sk_prot->leave_memory_pressure) {
2860 		INDIRECT_CALL_INET_1(sk->sk_prot->leave_memory_pressure,
2861 				     tcp_leave_memory_pressure, sk);
2862 	} else {
2863 		unsigned long *memory_pressure = sk->sk_prot->memory_pressure;
2864 
2865 		if (memory_pressure && READ_ONCE(*memory_pressure))
2866 			WRITE_ONCE(*memory_pressure, 0);
2867 	}
2868 }
2869 
2870 DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key);
2871 
2872 /**
2873  * skb_page_frag_refill - check that a page_frag contains enough room
2874  * @sz: minimum size of the fragment we want to get
2875  * @pfrag: pointer to page_frag
2876  * @gfp: priority for memory allocation
2877  *
2878  * Note: While this allocator tries to use high order pages, there is
2879  * no guarantee that allocations succeed. Therefore, @sz MUST be
2880  * less than or equal to PAGE_SIZE.
2881  */
2882 bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
2883 {
2884 	if (pfrag->page) {
2885 		if (page_ref_count(pfrag->page) == 1) {
2886 			pfrag->offset = 0;
2887 			return true;
2888 		}
2889 		if (pfrag->offset + sz <= pfrag->size)
2890 			return true;
2891 		put_page(pfrag->page);
2892 	}
2893 
2894 	pfrag->offset = 0;
2895 	if (SKB_FRAG_PAGE_ORDER &&
2896 	    !static_branch_unlikely(&net_high_order_alloc_disable_key)) {
2897 		/* Avoid direct reclaim but allow kswapd to wake */
2898 		pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
2899 					  __GFP_COMP | __GFP_NOWARN |
2900 					  __GFP_NORETRY,
2901 					  SKB_FRAG_PAGE_ORDER);
2902 		if (likely(pfrag->page)) {
2903 			pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
2904 			return true;
2905 		}
2906 	}
2907 	pfrag->page = alloc_page(gfp);
2908 	if (likely(pfrag->page)) {
2909 		pfrag->size = PAGE_SIZE;
2910 		return true;
2911 	}
2912 	return false;
2913 }
2914 EXPORT_SYMBOL(skb_page_frag_refill);
2915 
2916 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
2917 {
2918 	if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
2919 		return true;
2920 
2921 	sk_enter_memory_pressure(sk);
2922 	sk_stream_moderate_sndbuf(sk);
2923 	return false;
2924 }
2925 EXPORT_SYMBOL(sk_page_frag_refill);
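
/* Illustrative sketch (simplified; real senders use helpers such as
 * skb_copy_to_page_nocache()): a sender filling the per-socket page fragment
 * first makes sure there is room, then copies at the current offset and
 * advances it.
 *
 *	struct page_frag *pfrag = sk_page_frag(sk);
 *
 *	if (!sk_page_frag_refill(sk, pfrag))
 *		return -ENOMEM;		(callers usually wait for memory)
 *	copy = min_t(int, bytes, pfrag->size - pfrag->offset);
 *	memcpy(page_address(pfrag->page) + pfrag->offset, data, copy);
 *	pfrag->offset += copy;
 */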
2926 
2927 void __lock_sock(struct sock *sk)
2928 	__releases(&sk->sk_lock.slock)
2929 	__acquires(&sk->sk_lock.slock)
2930 {
2931 	DEFINE_WAIT(wait);
2932 
2933 	for (;;) {
2934 		prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
2935 					TASK_UNINTERRUPTIBLE);
2936 		spin_unlock_bh(&sk->sk_lock.slock);
2937 		schedule();
2938 		spin_lock_bh(&sk->sk_lock.slock);
2939 		if (!sock_owned_by_user(sk))
2940 			break;
2941 	}
2942 	finish_wait(&sk->sk_lock.wq, &wait);
2943 }
2944 
2945 void __release_sock(struct sock *sk)
2946 	__releases(&sk->sk_lock.slock)
2947 	__acquires(&sk->sk_lock.slock)
2948 {
2949 	struct sk_buff *skb, *next;
2950 
2951 	while ((skb = sk->sk_backlog.head) != NULL) {
2952 		sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
2953 
2954 		spin_unlock_bh(&sk->sk_lock.slock);
2955 
2956 		do {
2957 			next = skb->next;
2958 			prefetch(next);
2959 			DEBUG_NET_WARN_ON_ONCE(skb_dst_is_noref(skb));
2960 			skb_mark_not_on_list(skb);
2961 			sk_backlog_rcv(sk, skb);
2962 
2963 			cond_resched();
2964 
2965 			skb = next;
2966 		} while (skb != NULL);
2967 
2968 		spin_lock_bh(&sk->sk_lock.slock);
2969 	}
2970 
2971 	/*
2972 	 * Doing the zeroing here guarantees we cannot loop forever
2973 	 * while a wild producer attempts to flood us.
2974 	 */
2975 	sk->sk_backlog.len = 0;
2976 }
2977 
2978 void __sk_flush_backlog(struct sock *sk)
2979 {
2980 	spin_lock_bh(&sk->sk_lock.slock);
2981 	__release_sock(sk);
2982 	spin_unlock_bh(&sk->sk_lock.slock);
2983 }
2984 EXPORT_SYMBOL_GPL(__sk_flush_backlog);
2985 
2986 /**
2987  * sk_wait_data - wait for data to arrive at sk_receive_queue
2988  * @sk:    sock to wait on
2989  * @timeo: for how long
2990  * @skb:   last skb seen on sk_receive_queue
2991  *
2992  * Now socket state, including sk->sk_err, is changed only under the lock,
2993  * hence we may omit checks after joining the wait queue.
2994  * We check the receive queue before schedule() only as an optimization;
2995  * it is very likely that release_sock() added new data.
2996  */
2997 int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
2998 {
2999 	DEFINE_WAIT_FUNC(wait, woken_wake_function);
3000 	int rc;
3001 
3002 	add_wait_queue(sk_sleep(sk), &wait);
3003 	sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
3004 	rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
3005 	sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
3006 	remove_wait_queue(sk_sleep(sk), &wait);
3007 	return rc;
3008 }
3009 EXPORT_SYMBOL(sk_wait_data);
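
/* Illustrative sketch: a blocking recvmsg() loop holds the socket lock and
 * waits for new data with sk_wait_data(), which drops the lock while
 * sleeping and retakes it before returning.
 *
 *	long timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
 *
 *	lock_sock(sk);
 *	while (!(skb = skb_peek(&sk->sk_receive_queue))) {
 *		if (!timeo || signal_pending(current))
 *			break;
 *		sk_wait_data(sk, &timeo, NULL);
 *	}
 *	release_sock(sk);
 */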
3010 
3011 /**
3012  *	__sk_mem_raise_allocated - increase memory_allocated
3013  *	@sk: socket
3014  *	@size: memory size to allocate
3015  *	@amt: pages to allocate
3016  *	@kind: allocation type
3017  *
3018  *	Similar to __sk_mem_schedule(), but does not update sk_forward_alloc
3019  */
3020 int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
3021 {
3022 	bool memcg_charge = mem_cgroup_sockets_enabled && sk->sk_memcg;
3023 	struct proto *prot = sk->sk_prot;
3024 	bool charged = true;
3025 	long allocated;
3026 
3027 	sk_memory_allocated_add(sk, amt);
3028 	allocated = sk_memory_allocated(sk);
3029 	if (memcg_charge &&
3030 	    !(charged = mem_cgroup_charge_skmem(sk->sk_memcg, amt,
3031 						gfp_memcg_charge())))
3032 		goto suppress_allocation;
3033 
3034 	/* Under limit. */
3035 	if (allocated <= sk_prot_mem_limits(sk, 0)) {
3036 		sk_leave_memory_pressure(sk);
3037 		return 1;
3038 	}
3039 
3040 	/* Under pressure. */
3041 	if (allocated > sk_prot_mem_limits(sk, 1))
3042 		sk_enter_memory_pressure(sk);
3043 
3044 	/* Over hard limit. */
3045 	if (allocated > sk_prot_mem_limits(sk, 2))
3046 		goto suppress_allocation;
3047 
3048 	/* guarantee minimum buffer size under pressure */
3049 	if (kind == SK_MEM_RECV) {
3050 		if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot))
3051 			return 1;
3052 
3053 	} else { /* SK_MEM_SEND */
3054 		int wmem0 = sk_get_wmem0(sk, prot);
3055 
3056 		if (sk->sk_type == SOCK_STREAM) {
3057 			if (sk->sk_wmem_queued < wmem0)
3058 				return 1;
3059 		} else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) {
3060 			return 1;
3061 		}
3062 	}
3063 
3064 	if (sk_has_memory_pressure(sk)) {
3065 		u64 alloc;
3066 
3067 		if (!sk_under_memory_pressure(sk))
3068 			return 1;
3069 		alloc = sk_sockets_allocated_read_positive(sk);
3070 		if (sk_prot_mem_limits(sk, 2) > alloc *
3071 		    sk_mem_pages(sk->sk_wmem_queued +
3072 				 atomic_read(&sk->sk_rmem_alloc) +
3073 				 sk->sk_forward_alloc))
3074 			return 1;
3075 	}
3076 
3077 suppress_allocation:
3078 
3079 	if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
3080 		sk_stream_moderate_sndbuf(sk);
3081 
3082 		/* Fail only if socket is _under_ its sndbuf.
3083 		 * In this case we cannot block, so we have to fail.
3084 		 */
3085 		if (sk->sk_wmem_queued + size >= sk->sk_sndbuf) {
3086 			/* Force charge with __GFP_NOFAIL */
3087 			if (memcg_charge && !charged) {
3088 				mem_cgroup_charge_skmem(sk->sk_memcg, amt,
3089 					gfp_memcg_charge() | __GFP_NOFAIL);
3090 			}
3091 			return 1;
3092 		}
3093 	}
3094 
3095 	if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged))
3096 		trace_sock_exceed_buf_limit(sk, prot, allocated, kind);
3097 
3098 	sk_memory_allocated_sub(sk, amt);
3099 
3100 	if (memcg_charge && charged)
3101 		mem_cgroup_uncharge_skmem(sk->sk_memcg, amt);
3102 
3103 	return 0;
3104 }
3105 
3106 /**
3107  *	__sk_mem_schedule - increase sk_forward_alloc and memory_allocated
3108  *	@sk: socket
3109  *	@size: memory size to allocate
3110  *	@kind: allocation type
3111  *
3112  *	If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
3113  *	rmem allocation. This function assumes that protocols which have
3114  *	memory_pressure use sk_wmem_queued as write buffer accounting.
3115  */
3116 int __sk_mem_schedule(struct sock *sk, int size, int kind)
3117 {
3118 	int ret, amt = sk_mem_pages(size);
3119 
3120 	sk->sk_forward_alloc += amt << PAGE_SHIFT;
3121 	ret = __sk_mem_raise_allocated(sk, size, amt, kind);
3122 	if (!ret)
3123 		sk->sk_forward_alloc -= amt << PAGE_SHIFT;
3124 	return ret;
3125 }
3126 EXPORT_SYMBOL(__sk_mem_schedule);
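
/* Worked example, assuming PAGE_SIZE == 4096: charging size = 3000 bytes
 * gives sk_mem_pages(3000) == 1, so sk_forward_alloc grows by 4096 bytes.
 * If __sk_mem_raise_allocated() then refuses the page, the same 4096 bytes
 * are taken back and the caller sees a return value of 0.
 */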
3127 
3128 /**
3129  *	__sk_mem_reduce_allocated - reclaim memory_allocated
3130  *	@sk: socket
3131  *	@amount: number of quanta
3132  *
3133  *	Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
3134  */
3135 void __sk_mem_reduce_allocated(struct sock *sk, int amount)
3136 {
3137 	sk_memory_allocated_sub(sk, amount);
3138 
3139 	if (mem_cgroup_sockets_enabled && sk->sk_memcg)
3140 		mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
3141 
3142 	if (sk_under_memory_pressure(sk) &&
3143 	    (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
3144 		sk_leave_memory_pressure(sk);
3145 }
3146 
3147 /**
3148  *	__sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
3149  *	@sk: socket
3150  *	@amount: number of bytes (rounded down to a PAGE_SIZE multiple)
3151  */
3152 void __sk_mem_reclaim(struct sock *sk, int amount)
3153 {
3154 	amount >>= PAGE_SHIFT;
3155 	sk->sk_forward_alloc -= amount << PAGE_SHIFT;
3156 	__sk_mem_reduce_allocated(sk, amount);
3157 }
3158 EXPORT_SYMBOL(__sk_mem_reclaim);
3159 
3160 int sk_set_peek_off(struct sock *sk, int val)
3161 {
3162 	sk->sk_peek_off = val;
3163 	return 0;
3164 }
3165 EXPORT_SYMBOL_GPL(sk_set_peek_off);
3166 
3167 /*
3168  * Set of default routines for initialising struct proto_ops when
3169  * the protocol does not support a particular function. In certain
3170  * cases where it makes no sense for a protocol to have a "do nothing"
3171  * function, some default processing is provided.
3172  */
3173 
3174 int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
3175 {
3176 	return -EOPNOTSUPP;
3177 }
3178 EXPORT_SYMBOL(sock_no_bind);
3179 
3180 int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
3181 		    int len, int flags)
3182 {
3183 	return -EOPNOTSUPP;
3184 }
3185 EXPORT_SYMBOL(sock_no_connect);
3186 
3187 int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
3188 {
3189 	return -EOPNOTSUPP;
3190 }
3191 EXPORT_SYMBOL(sock_no_socketpair);
3192 
3193 int sock_no_accept(struct socket *sock, struct socket *newsock, int flags,
3194 		   bool kern)
3195 {
3196 	return -EOPNOTSUPP;
3197 }
3198 EXPORT_SYMBOL(sock_no_accept);
3199 
3200 int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
3201 		    int peer)
3202 {
3203 	return -EOPNOTSUPP;
3204 }
3205 EXPORT_SYMBOL(sock_no_getname);
3206 
3207 int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
3208 {
3209 	return -EOPNOTSUPP;
3210 }
3211 EXPORT_SYMBOL(sock_no_ioctl);
3212 
3213 int sock_no_listen(struct socket *sock, int backlog)
3214 {
3215 	return -EOPNOTSUPP;
3216 }
3217 EXPORT_SYMBOL(sock_no_listen);
3218 
3219 int sock_no_shutdown(struct socket *sock, int how)
3220 {
3221 	return -EOPNOTSUPP;
3222 }
3223 EXPORT_SYMBOL(sock_no_shutdown);
3224 
3225 int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
3226 {
3227 	return -EOPNOTSUPP;
3228 }
3229 EXPORT_SYMBOL(sock_no_sendmsg);
3230 
3231 int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len)
3232 {
3233 	return -EOPNOTSUPP;
3234 }
3235 EXPORT_SYMBOL(sock_no_sendmsg_locked);
3236 
3237 int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
3238 		    int flags)
3239 {
3240 	return -EOPNOTSUPP;
3241 }
3242 EXPORT_SYMBOL(sock_no_recvmsg);
3243 
3244 int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
3245 {
3246 	/* Mirror missing mmap method error code */
3247 	return -ENODEV;
3248 }
3249 EXPORT_SYMBOL(sock_no_mmap);
3250 
3251 /*
3252  * When a file is received (via SCM_RIGHTS, etc), we must bump the
3253  * various sock-based usage counts.
3254  */
3255 void __receive_sock(struct file *file)
3256 {
3257 	struct socket *sock;
3258 
3259 	sock = sock_from_file(file);
3260 	if (sock) {
3261 		sock_update_netprioidx(&sock->sk->sk_cgrp_data);
3262 		sock_update_classid(&sock->sk->sk_cgrp_data);
3263 	}
3264 }
3265 
3266 /*
3267  *	Default Socket Callbacks
3268  */
3269 
3270 static void sock_def_wakeup(struct sock *sk)
3271 {
3272 	struct socket_wq *wq;
3273 
3274 	rcu_read_lock();
3275 	wq = rcu_dereference(sk->sk_wq);
3276 	if (skwq_has_sleeper(wq))
3277 		wake_up_interruptible_all(&wq->wait);
3278 	rcu_read_unlock();
3279 }
3280 
3281 static void sock_def_error_report(struct sock *sk)
3282 {
3283 	struct socket_wq *wq;
3284 
3285 	rcu_read_lock();
3286 	wq = rcu_dereference(sk->sk_wq);
3287 	if (skwq_has_sleeper(wq))
3288 		wake_up_interruptible_poll(&wq->wait, EPOLLERR);
3289 	sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
3290 	rcu_read_unlock();
3291 }
3292 
3293 void sock_def_readable(struct sock *sk)
3294 {
3295 	struct socket_wq *wq;
3296 
3297 	trace_sk_data_ready(sk);
3298 
3299 	rcu_read_lock();
3300 	wq = rcu_dereference(sk->sk_wq);
3301 	if (skwq_has_sleeper(wq))
3302 		wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
3303 						EPOLLRDNORM | EPOLLRDBAND);
3304 	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
3305 	rcu_read_unlock();
3306 }
3307 
3308 static void sock_def_write_space(struct sock *sk)
3309 {
3310 	struct socket_wq *wq;
3311 
3312 	rcu_read_lock();
3313 
3314 	/* Do not wake up a writer until he can make "significant"
3315 	 * progress.  --DaveM
3316 	 */
3317 	if (sock_writeable(sk)) {
3318 		wq = rcu_dereference(sk->sk_wq);
3319 		if (skwq_has_sleeper(wq))
3320 			wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
3321 						EPOLLWRNORM | EPOLLWRBAND);
3322 
3323 		/* Should agree with poll, otherwise some programs break */
3324 		sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
3325 	}
3326 
3327 	rcu_read_unlock();
3328 }
3329 
3330 /* An optimised version of sock_def_write_space(); it should only be called
3331  * for SOCK_RCU_FREE sockets under an RCU read-side section and after putting
3332  * ->sk_wmem_alloc.
3333  */
3334 static void sock_def_write_space_wfree(struct sock *sk)
3335 {
3336 	/* Do not wake up a writer until he can make "significant"
3337 	 * progress.  --DaveM
3338 	 */
3339 	if (sock_writeable(sk)) {
3340 		struct socket_wq *wq = rcu_dereference(sk->sk_wq);
3341 
3342 		/* rely on refcount_sub from sock_wfree() */
3343 		smp_mb__after_atomic();
3344 		if (wq && waitqueue_active(&wq->wait))
3345 			wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
3346 						EPOLLWRNORM | EPOLLWRBAND);
3347 
3348 		/* Should agree with poll, otherwise some programs break */
3349 		sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
3350 	}
3351 }
3352 
3353 static void sock_def_destruct(struct sock *sk)
3354 {
3355 }
3356 
3357 void sk_send_sigurg(struct sock *sk)
3358 {
3359 	if (sk->sk_socket && sk->sk_socket->file)
3360 		if (send_sigurg(&sk->sk_socket->file->f_owner))
3361 			sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
3362 }
3363 EXPORT_SYMBOL(sk_send_sigurg);
3364 
3365 void sk_reset_timer(struct sock *sk, struct timer_list* timer,
3366 		    unsigned long expires)
3367 {
3368 	if (!mod_timer(timer, expires))
3369 		sock_hold(sk);
3370 }
3371 EXPORT_SYMBOL(sk_reset_timer);
3372 
3373 void sk_stop_timer(struct sock *sk, struct timer_list* timer)
3374 {
3375 	if (del_timer(timer))
3376 		__sock_put(sk);
3377 }
3378 EXPORT_SYMBOL(sk_stop_timer);
3379 
3380 void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer)
3381 {
3382 	if (del_timer_sync(timer))
3383 		__sock_put(sk);
3384 }
3385 EXPORT_SYMBOL(sk_stop_timer_sync);
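
/* Illustrative sketch (handler name is made up): a timer armed with
 * sk_reset_timer() holds a socket reference, so the handler drops it when
 * done, and teardown paths use sk_stop_timer(_sync)() to drop the reference
 * of a still-pending timer.
 *
 *	sk_reset_timer(sk, &sk->sk_timer, jiffies + HZ);
 *	...
 *	static void my_timer_handler(struct timer_list *t)
 *	{
 *		struct sock *sk = from_timer(sk, t, sk_timer);
 *
 *		...do the work...
 *		sock_put(sk);	(pairs with sock_hold() in sk_reset_timer)
 *	}
 */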
3386 
3387 void sock_init_data_uid(struct socket *sock, struct sock *sk, kuid_t uid)
3388 {
3389 	sk_init_common(sk);
3390 	sk->sk_send_head	=	NULL;
3391 
3392 	timer_setup(&sk->sk_timer, NULL, 0);
3393 
3394 	sk->sk_allocation	=	GFP_KERNEL;
3395 	sk->sk_rcvbuf		=	READ_ONCE(sysctl_rmem_default);
3396 	sk->sk_sndbuf		=	READ_ONCE(sysctl_wmem_default);
3397 	sk->sk_state		=	TCP_CLOSE;
3398 	sk->sk_use_task_frag	=	true;
3399 	sk_set_socket(sk, sock);
3400 
3401 	sock_set_flag(sk, SOCK_ZAPPED);
3402 
3403 	if (sock) {
3404 		sk->sk_type	=	sock->type;
3405 		RCU_INIT_POINTER(sk->sk_wq, &sock->wq);
3406 		sock->sk	=	sk;
3407 	} else {
3408 		RCU_INIT_POINTER(sk->sk_wq, NULL);
3409 	}
3410 	sk->sk_uid	=	uid;
3411 
3412 	rwlock_init(&sk->sk_callback_lock);
3413 	if (sk->sk_kern_sock)
3414 		lockdep_set_class_and_name(
3415 			&sk->sk_callback_lock,
3416 			af_kern_callback_keys + sk->sk_family,
3417 			af_family_kern_clock_key_strings[sk->sk_family]);
3418 	else
3419 		lockdep_set_class_and_name(
3420 			&sk->sk_callback_lock,
3421 			af_callback_keys + sk->sk_family,
3422 			af_family_clock_key_strings[sk->sk_family]);
3423 
3424 	sk->sk_state_change	=	sock_def_wakeup;
3425 	sk->sk_data_ready	=	sock_def_readable;
3426 	sk->sk_write_space	=	sock_def_write_space;
3427 	sk->sk_error_report	=	sock_def_error_report;
3428 	sk->sk_destruct		=	sock_def_destruct;
3429 
3430 	sk->sk_frag.page	=	NULL;
3431 	sk->sk_frag.offset	=	0;
3432 	sk->sk_peek_off		=	-1;
3433 
3434 	sk->sk_peer_pid 	=	NULL;
3435 	sk->sk_peer_cred	=	NULL;
3436 	spin_lock_init(&sk->sk_peer_lock);
3437 
3438 	sk->sk_write_pending	=	0;
3439 	sk->sk_rcvlowat		=	1;
3440 	sk->sk_rcvtimeo		=	MAX_SCHEDULE_TIMEOUT;
3441 	sk->sk_sndtimeo		=	MAX_SCHEDULE_TIMEOUT;
3442 
3443 	sk->sk_stamp = SK_DEFAULT_STAMP;
3444 #if BITS_PER_LONG==32
3445 	seqlock_init(&sk->sk_stamp_seq);
3446 #endif
3447 	atomic_set(&sk->sk_zckey, 0);
3448 
3449 #ifdef CONFIG_NET_RX_BUSY_POLL
3450 	sk->sk_napi_id		=	0;
3451 	sk->sk_ll_usec		=	READ_ONCE(sysctl_net_busy_read);
3452 #endif
3453 
3454 	sk->sk_max_pacing_rate = ~0UL;
3455 	sk->sk_pacing_rate = ~0UL;
3456 	WRITE_ONCE(sk->sk_pacing_shift, 10);
3457 	sk->sk_incoming_cpu = -1;
3458 
3459 	sk_rx_queue_clear(sk);
3460 	/*
3461 	 * Before updating sk_refcnt, we must commit prior changes to memory
3462 	 * (Documentation/RCU/rculist_nulls.rst for details)
3463 	 */
3464 	smp_wmb();
3465 	refcount_set(&sk->sk_refcnt, 1);
3466 	atomic_set(&sk->sk_drops, 0);
3467 }
3468 EXPORT_SYMBOL(sock_init_data_uid);
3469 
3470 void sock_init_data(struct socket *sock, struct sock *sk)
3471 {
3472 	kuid_t uid = sock ?
3473 		SOCK_INODE(sock)->i_uid :
3474 		make_kuid(sock_net(sk)->user_ns, 0);
3475 
3476 	sock_init_data_uid(sock, sk, uid);
3477 }
3478 EXPORT_SYMBOL(sock_init_data);
3479 
3480 void lock_sock_nested(struct sock *sk, int subclass)
3481 {
3482 	/* The sk_lock has mutex_lock() semantics here. */
3483 	mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
3484 
3485 	might_sleep();
3486 	spin_lock_bh(&sk->sk_lock.slock);
3487 	if (sock_owned_by_user_nocheck(sk))
3488 		__lock_sock(sk);
3489 	sk->sk_lock.owned = 1;
3490 	spin_unlock_bh(&sk->sk_lock.slock);
3491 }
3492 EXPORT_SYMBOL(lock_sock_nested);
3493 
3494 void release_sock(struct sock *sk)
3495 {
3496 	spin_lock_bh(&sk->sk_lock.slock);
3497 	if (sk->sk_backlog.tail)
3498 		__release_sock(sk);
3499 
3500 	/* Warning: release_cb() might need to release sk ownership,
3501 	 * i.e. call sock_release_ownership(sk) before us.
3502 	 */
3503 	if (sk->sk_prot->release_cb)
3504 		sk->sk_prot->release_cb(sk);
3505 
3506 	sock_release_ownership(sk);
3507 	if (waitqueue_active(&sk->sk_lock.wq))
3508 		wake_up(&sk->sk_lock.wq);
3509 	spin_unlock_bh(&sk->sk_lock.slock);
3510 }
3511 EXPORT_SYMBOL(release_sock);
3512 
3513 bool __lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock)
3514 {
3515 	might_sleep();
3516 	spin_lock_bh(&sk->sk_lock.slock);
3517 
3518 	if (!sock_owned_by_user_nocheck(sk)) {
3519 		/*
3520 		 * Fast path return with bottom halves disabled and
3521 		 * sock::sk_lock.slock held.
3522 		 *
3523 		 * The 'mutex' is not contended and holding
3524 		 * sock::sk_lock.slock prevents all other lockers from
3525 		 * proceeding, so the corresponding unlock_sock_fast() can
3526 		 * avoid the slow path of release_sock() completely and
3527 		 * just release slock.
3528 		 *
3529 		 * From a semantic POV this is equivalent to 'acquiring'
3530 		 * the 'mutex', hence the corresponding lockdep
3531 		 * mutex_release() has to happen in the fast path of
3532 		 * unlock_sock_fast().
3533 		 */
3534 		return false;
3535 	}
3536 
3537 	__lock_sock(sk);
3538 	sk->sk_lock.owned = 1;
3539 	__acquire(&sk->sk_lock.slock);
3540 	spin_unlock_bh(&sk->sk_lock.slock);
3541 	return true;
3542 }
3543 EXPORT_SYMBOL(__lock_sock_fast);
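
/* Illustrative sketch: callers normally go through the lock_sock_fast() /
 * unlock_sock_fast() wrappers, which take the cheap spinlock-only path when
 * the owner 'mutex' is uncontended.
 *
 *	bool slow = lock_sock_fast(sk);
 *
 *	...touch socket state...
 *	unlock_sock_fast(sk, slow);
 */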
3544 
3545 int sock_gettstamp(struct socket *sock, void __user *userstamp,
3546 		   bool timeval, bool time32)
3547 {
3548 	struct sock *sk = sock->sk;
3549 	struct timespec64 ts;
3550 
3551 	sock_enable_timestamp(sk, SOCK_TIMESTAMP);
3552 	ts = ktime_to_timespec64(sock_read_timestamp(sk));
3553 	if (ts.tv_sec == -1)
3554 		return -ENOENT;
3555 	if (ts.tv_sec == 0) {
3556 		ktime_t kt = ktime_get_real();
3557 		sock_write_timestamp(sk, kt);
3558 		ts = ktime_to_timespec64(kt);
3559 	}
3560 
3561 	if (timeval)
3562 		ts.tv_nsec /= 1000;
3563 
3564 #ifdef CONFIG_COMPAT_32BIT_TIME
3565 	if (time32)
3566 		return put_old_timespec32(&ts, userstamp);
3567 #endif
3568 #ifdef CONFIG_SPARC64
3569 	/* beware of padding in sparc64 timeval */
3570 	if (timeval && !in_compat_syscall()) {
3571 		struct __kernel_old_timeval __user tv = {
3572 			.tv_sec = ts.tv_sec,
3573 			.tv_usec = ts.tv_nsec,
3574 		};
3575 		if (copy_to_user(userstamp, &tv, sizeof(tv)))
3576 			return -EFAULT;
3577 		return 0;
3578 	}
3579 #endif
3580 	return put_timespec64(&ts, userstamp);
3581 }
3582 EXPORT_SYMBOL(sock_gettstamp);
3583 
3584 void sock_enable_timestamp(struct sock *sk, enum sock_flags flag)
3585 {
3586 	if (!sock_flag(sk, flag)) {
3587 		unsigned long previous_flags = sk->sk_flags;
3588 
3589 		sock_set_flag(sk, flag);
3590 		/*
3591 		 * we just set one of the two flags which require net
3592 		 * time stamping, but time stamping might have been on
3593 		 * already because of the other one
3594 		 */
3595 		if (sock_needs_netstamp(sk) &&
3596 		    !(previous_flags & SK_FLAGS_TIMESTAMP))
3597 			net_enable_timestamp();
3598 	}
3599 }
3600 
3601 int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
3602 		       int level, int type)
3603 {
3604 	struct sock_exterr_skb *serr;
3605 	struct sk_buff *skb;
3606 	int copied, err;
3607 
3608 	err = -EAGAIN;
3609 	skb = sock_dequeue_err_skb(sk);
3610 	if (skb == NULL)
3611 		goto out;
3612 
3613 	copied = skb->len;
3614 	if (copied > len) {
3615 		msg->msg_flags |= MSG_TRUNC;
3616 		copied = len;
3617 	}
3618 	err = skb_copy_datagram_msg(skb, 0, msg, copied);
3619 	if (err)
3620 		goto out_free_skb;
3621 
3622 	sock_recv_timestamp(msg, sk, skb);
3623 
3624 	serr = SKB_EXT_ERR(skb);
3625 	put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
3626 
3627 	msg->msg_flags |= MSG_ERRQUEUE;
3628 	err = copied;
3629 
3630 out_free_skb:
3631 	kfree_skb(skb);
3632 out:
3633 	return err;
3634 }
3635 EXPORT_SYMBOL(sock_recv_errqueue);
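
/* Illustrative sketch: a protocol's recvmsg() can service MSG_ERRQUEUE by
 * delegating to this helper; the level/type pair placed in the control
 * message is protocol-specific (the SOL_PACKET/PACKET_TX_TIMESTAMP pair
 * below is what af_packet passes).
 *
 *	if (flags & MSG_ERRQUEUE)
 *		return sock_recv_errqueue(sk, msg, len,
 *					  SOL_PACKET, PACKET_TX_TIMESTAMP);
 */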
3636 
3637 /*
3638  *	Get a socket option on a socket.
3639  *
3640  *	FIX: POSIX 1003.1g is very ambiguous here. It states that
3641  *	asynchronous errors should be reported by getsockopt. We assume
3642  *	this means if you specify SO_ERROR (otherwise what's the point of it).
3643  */
3644 int sock_common_getsockopt(struct socket *sock, int level, int optname,
3645 			   char __user *optval, int __user *optlen)
3646 {
3647 	struct sock *sk = sock->sk;
3648 
3649 	/* IPV6_ADDRFORM can change sk->sk_prot under us. */
3650 	return READ_ONCE(sk->sk_prot)->getsockopt(sk, level, optname, optval, optlen);
3651 }
3652 EXPORT_SYMBOL(sock_common_getsockopt);
3653 
3654 int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
3655 			int flags)
3656 {
3657 	struct sock *sk = sock->sk;
3658 	int addr_len = 0;
3659 	int err;
3660 
3661 	err = sk->sk_prot->recvmsg(sk, msg, size, flags, &addr_len);
3662 	if (err >= 0)
3663 		msg->msg_namelen = addr_len;
3664 	return err;
3665 }
3666 EXPORT_SYMBOL(sock_common_recvmsg);
3667 
3668 /*
3669  *	Set socket options on an inet socket.
3670  */
3671 int sock_common_setsockopt(struct socket *sock, int level, int optname,
3672 			   sockptr_t optval, unsigned int optlen)
3673 {
3674 	struct sock *sk = sock->sk;
3675 
3676 	/* IPV6_ADDRFORM can change sk->sk_prot under us. */
3677 	return READ_ONCE(sk->sk_prot)->setsockopt(sk, level, optname, optval, optlen);
3678 }
3679 EXPORT_SYMBOL(sock_common_setsockopt);
3680 
3681 void sk_common_release(struct sock *sk)
3682 {
3683 	if (sk->sk_prot->destroy)
3684 		sk->sk_prot->destroy(sk);
3685 
3686 	/*
3687 	 * Observation: when sk_common_release is called, processes have
3688 	 * no access to the socket, but the network stack still does.
3689 	 * Step one: detach it from networking:
3690 	 *
3691 	 * A. Remove from hash tables.
3692 	 */
3693 
3694 	sk->sk_prot->unhash(sk);
3695 
3696 	/*
3697 	 * At this point the socket cannot receive new packets, but it is possible
3698 	 * that some packets are in flight because some CPU is running the receiver
3699 	 * and did a hash table lookup before we unhashed the socket. They will
3700 	 * reach the receive queue and will be purged by the socket destructor.
3701 	 *
3702 	 * Also, we still have packets pending on the receive queue and, probably,
3703 	 * our own packets waiting in device queues. sock_destroy will drain the
3704 	 * receive queue, but transmitted packets will delay socket destruction
3705 	 * until the last reference is released.
3706 	 */
3707 
3708 	sock_orphan(sk);
3709 
3710 	xfrm_sk_free_policy(sk);
3711 
3712 	sock_put(sk);
3713 }
3714 EXPORT_SYMBOL(sk_common_release);
3715 
3716 void sk_get_meminfo(const struct sock *sk, u32 *mem)
3717 {
3718 	memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);
3719 
3720 	mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
3721 	mem[SK_MEMINFO_RCVBUF] = READ_ONCE(sk->sk_rcvbuf);
3722 	mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
3723 	mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf);
3724 	mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc;
3725 	mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued);
3726 	mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
3727 	mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len);
3728 	mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
3729 }
3730 
3731 #ifdef CONFIG_PROC_FS
3732 static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
3733 
3734 int sock_prot_inuse_get(struct net *net, struct proto *prot)
3735 {
3736 	int cpu, idx = prot->inuse_idx;
3737 	int res = 0;
3738 
3739 	for_each_possible_cpu(cpu)
3740 		res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx];
3741 
3742 	return res >= 0 ? res : 0;
3743 }
3744 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
3745 
3746 int sock_inuse_get(struct net *net)
3747 {
3748 	int cpu, res = 0;
3749 
3750 	for_each_possible_cpu(cpu)
3751 		res += per_cpu_ptr(net->core.prot_inuse, cpu)->all;
3752 
3753 	return res;
3754 }
3755 
3756 EXPORT_SYMBOL_GPL(sock_inuse_get);
3757 
3758 static int __net_init sock_inuse_init_net(struct net *net)
3759 {
3760 	net->core.prot_inuse = alloc_percpu(struct prot_inuse);
3761 	if (net->core.prot_inuse == NULL)
3762 		return -ENOMEM;
3763 	return 0;
3764 }
3765 
3766 static void __net_exit sock_inuse_exit_net(struct net *net)
3767 {
3768 	free_percpu(net->core.prot_inuse);
3769 }
3770 
3771 static struct pernet_operations net_inuse_ops = {
3772 	.init = sock_inuse_init_net,
3773 	.exit = sock_inuse_exit_net,
3774 };
3775 
3776 static __init int net_inuse_init(void)
3777 {
3778 	if (register_pernet_subsys(&net_inuse_ops))
3779 		panic("Cannot initialize net inuse counters");
3780 
3781 	return 0;
3782 }
3783 
3784 core_initcall(net_inuse_init);
3785 
3786 static int assign_proto_idx(struct proto *prot)
3787 {
3788 	prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
3789 
3790 	if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
3791 		pr_err("PROTO_INUSE_NR exhausted\n");
3792 		return -ENOSPC;
3793 	}
3794 
3795 	set_bit(prot->inuse_idx, proto_inuse_idx);
3796 	return 0;
3797 }
3798 
3799 static void release_proto_idx(struct proto *prot)
3800 {
3801 	if (prot->inuse_idx != PROTO_INUSE_NR - 1)
3802 		clear_bit(prot->inuse_idx, proto_inuse_idx);
3803 }
3804 #else
3805 static inline int assign_proto_idx(struct proto *prot)
3806 {
3807 	return 0;
3808 }
3809 
3810 static inline void release_proto_idx(struct proto *prot)
3811 {
3812 }
3813 
3814 #endif
3815 
3816 static void tw_prot_cleanup(struct timewait_sock_ops *twsk_prot)
3817 {
3818 	if (!twsk_prot)
3819 		return;
3820 	kfree(twsk_prot->twsk_slab_name);
3821 	twsk_prot->twsk_slab_name = NULL;
3822 	kmem_cache_destroy(twsk_prot->twsk_slab);
3823 	twsk_prot->twsk_slab = NULL;
3824 }
3825 
3826 static int tw_prot_init(const struct proto *prot)
3827 {
3828 	struct timewait_sock_ops *twsk_prot = prot->twsk_prot;
3829 
3830 	if (!twsk_prot)
3831 		return 0;
3832 
3833 	twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s",
3834 					      prot->name);
3835 	if (!twsk_prot->twsk_slab_name)
3836 		return -ENOMEM;
3837 
3838 	twsk_prot->twsk_slab =
3839 		kmem_cache_create(twsk_prot->twsk_slab_name,
3840 				  twsk_prot->twsk_obj_size, 0,
3841 				  SLAB_ACCOUNT | prot->slab_flags,
3842 				  NULL);
3843 	if (!twsk_prot->twsk_slab) {
3844 		pr_crit("%s: Can't create timewait sock SLAB cache!\n",
3845 			prot->name);
3846 		return -ENOMEM;
3847 	}
3848 
3849 	return 0;
3850 }
3851 
3852 static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
3853 {
3854 	if (!rsk_prot)
3855 		return;
3856 	kfree(rsk_prot->slab_name);
3857 	rsk_prot->slab_name = NULL;
3858 	kmem_cache_destroy(rsk_prot->slab);
3859 	rsk_prot->slab = NULL;
3860 }
3861 
3862 static int req_prot_init(const struct proto *prot)
3863 {
3864 	struct request_sock_ops *rsk_prot = prot->rsk_prot;
3865 
3866 	if (!rsk_prot)
3867 		return 0;
3868 
3869 	rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
3870 					prot->name);
3871 	if (!rsk_prot->slab_name)
3872 		return -ENOMEM;
3873 
3874 	rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
3875 					   rsk_prot->obj_size, 0,
3876 					   SLAB_ACCOUNT | prot->slab_flags,
3877 					   NULL);
3878 
3879 	if (!rsk_prot->slab) {
3880 		pr_crit("%s: Can't create request sock SLAB cache!\n",
3881 			prot->name);
3882 		return -ENOMEM;
3883 	}
3884 	return 0;
3885 }
3886 
3887 int proto_register(struct proto *prot, int alloc_slab)
3888 {
3889 	int ret = -ENOBUFS;
3890 
3891 	if (prot->memory_allocated && !prot->sysctl_mem) {
3892 		pr_err("%s: missing sysctl_mem\n", prot->name);
3893 		return -EINVAL;
3894 	}
3895 	if (prot->memory_allocated && !prot->per_cpu_fw_alloc) {
3896 		pr_err("%s: missing per_cpu_fw_alloc\n", prot->name);
3897 		return -EINVAL;
3898 	}
3899 	if (alloc_slab) {
3900 		prot->slab = kmem_cache_create_usercopy(prot->name,
3901 					prot->obj_size, 0,
3902 					SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT |
3903 					prot->slab_flags,
3904 					prot->useroffset, prot->usersize,
3905 					NULL);
3906 
3907 		if (prot->slab == NULL) {
3908 			pr_crit("%s: Can't create sock SLAB cache!\n",
3909 				prot->name);
3910 			goto out;
3911 		}
3912 
3913 		if (req_prot_init(prot))
3914 			goto out_free_request_sock_slab;
3915 
3916 		if (tw_prot_init(prot))
3917 			goto out_free_timewait_sock_slab;
3918 	}
3919 
3920 	mutex_lock(&proto_list_mutex);
3921 	ret = assign_proto_idx(prot);
3922 	if (ret) {
3923 		mutex_unlock(&proto_list_mutex);
3924 		goto out_free_timewait_sock_slab;
3925 	}
3926 	list_add(&prot->node, &proto_list);
3927 	mutex_unlock(&proto_list_mutex);
3928 	return ret;
3929 
3930 out_free_timewait_sock_slab:
3931 	if (alloc_slab)
3932 		tw_prot_cleanup(prot->twsk_prot);
3933 out_free_request_sock_slab:
3934 	if (alloc_slab) {
3935 		req_prot_cleanup(prot->rsk_prot);
3936 
3937 		kmem_cache_destroy(prot->slab);
3938 		prot->slab = NULL;
3939 	}
3940 out:
3941 	return ret;
3942 }
3943 EXPORT_SYMBOL(proto_register);
3944 
3945 void proto_unregister(struct proto *prot)
3946 {
3947 	mutex_lock(&proto_list_mutex);
3948 	release_proto_idx(prot);
3949 	list_del(&prot->node);
3950 	mutex_unlock(&proto_list_mutex);
3951 
3952 	kmem_cache_destroy(prot->slab);
3953 	prot->slab = NULL;
3954 
3955 	req_prot_cleanup(prot->rsk_prot);
3956 	tw_prot_cleanup(prot->twsk_prot);
3957 }
3958 EXPORT_SYMBOL(proto_unregister);
3959 
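/* Illustrative note, derived from the format strings below: assuming the
 * usual values PF_NETLINK == 16 and NETLINK_SOCK_DIAG == 4, the requested
 * module aliases look like "net-pf-16-proto-4-type-<family>" when no
 * protocol is given and "net-pf-16-proto-4-type-<family>-<protocol>"
 * otherwise.  Diag modules typically advertise matching aliases (e.g. via
 * MODULE_ALIAS_NET_PF_PROTO_TYPE()) so request_module() can load them on
 * demand.
 */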
3960 int sock_load_diag_module(int family, int protocol)
3961 {
3962 	if (!protocol) {
3963 		if (!sock_is_registered(family))
3964 			return -ENOENT;
3965 
3966 		return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
3967 				      NETLINK_SOCK_DIAG, family);
3968 	}
3969 
3970 #ifdef CONFIG_INET
3971 	if (family == AF_INET &&
3972 	    protocol != IPPROTO_RAW &&
3973 	    protocol < MAX_INET_PROTOS &&
3974 	    !rcu_access_pointer(inet_protos[protocol]))
3975 		return -ENOENT;
3976 #endif
3977 
3978 	return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK,
3979 			      NETLINK_SOCK_DIAG, family, protocol);
3980 }
3981 EXPORT_SYMBOL(sock_load_diag_module);
3982 
3983 #ifdef CONFIG_PROC_FS
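/* /proc/net/protocols: the seq_file below walks proto_list; the list is
 * held stable by taking proto_list_mutex in ->start and dropping it in
 * ->stop.
 */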
3984 static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
3985 	__acquires(proto_list_mutex)
3986 {
3987 	mutex_lock(&proto_list_mutex);
3988 	return seq_list_start_head(&proto_list, *pos);
3989 }
3990 
3991 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3992 {
3993 	return seq_list_next(v, &proto_list, pos);
3994 }
3995 
3996 static void proto_seq_stop(struct seq_file *seq, void *v)
3997 	__releases(proto_list_mutex)
3998 {
3999 	mutex_unlock(&proto_list_mutex);
4000 }
4001 
4002 static char proto_method_implemented(const void *method)
4003 {
4004 	return method == NULL ? 'n' : 'y';
4005 }
4006 static long sock_prot_memory_allocated(struct proto *proto)
4007 {
4008 	return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
4009 }
4010 
4011 static const char *sock_prot_memory_pressure(struct proto *proto)
4012 {
4013 	return proto->memory_pressure != NULL ?
4014 		(proto_memory_pressure(proto) ? "yes" : "no") : "NI";
4015 }
4016 
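/* Emit one /proc/net/protocols row for a registered proto: name, object
 * size, sockets currently in use in this netns, the protocol's
 * memory_allocated counter (-1 if the protocol does no memory accounting),
 * memory pressure state ("yes"/"no", or "NI" when not implemented), max
 * header size, whether a slab cache is used, the owning module, and then a
 * y/n flag for each optional callback (close through enter_memory_pressure)
 * as reported by proto_method_implemented().
 */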
4017 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
4018 {
4020 	seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
4021 			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
4022 		   proto->name,
4023 		   proto->obj_size,
4024 		   sock_prot_inuse_get(seq_file_net(seq), proto),
4025 		   sock_prot_memory_allocated(proto),
4026 		   sock_prot_memory_pressure(proto),
4027 		   proto->max_header,
4028 		   proto->slab == NULL ? "no" : "yes",
4029 		   module_name(proto->owner),
4030 		   proto_method_implemented(proto->close),
4031 		   proto_method_implemented(proto->connect),
4032 		   proto_method_implemented(proto->disconnect),
4033 		   proto_method_implemented(proto->accept),
4034 		   proto_method_implemented(proto->ioctl),
4035 		   proto_method_implemented(proto->init),
4036 		   proto_method_implemented(proto->destroy),
4037 		   proto_method_implemented(proto->shutdown),
4038 		   proto_method_implemented(proto->setsockopt),
4039 		   proto_method_implemented(proto->getsockopt),
4040 		   proto_method_implemented(proto->sendmsg),
4041 		   proto_method_implemented(proto->recvmsg),
4042 		   proto_method_implemented(proto->bind),
4043 		   proto_method_implemented(proto->backlog_rcv),
4044 		   proto_method_implemented(proto->hash),
4045 		   proto_method_implemented(proto->unhash),
4046 		   proto_method_implemented(proto->get_port),
4047 		   proto_method_implemented(proto->enter_memory_pressure));
4048 }
4049 
4050 static int proto_seq_show(struct seq_file *seq, void *v)
4051 {
4052 	if (v == &proto_list)
4053 		seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
4054 			   "protocol",
4055 			   "size",
4056 			   "sockets",
4057 			   "memory",
4058 			   "press",
4059 			   "maxhdr",
4060 			   "slab",
4061 			   "module",
4062 			   "cl co di ac io in de sh ss gs se re bi br ha uh gp em\n");
4063 	else
4064 		proto_seq_printf(seq, list_entry(v, struct proto, node));
4065 	return 0;
4066 }
4067 
4068 static const struct seq_operations proto_seq_ops = {
4069 	.start  = proto_seq_start,
4070 	.next   = proto_seq_next,
4071 	.stop   = proto_seq_stop,
4072 	.show   = proto_seq_show,
4073 };
4074 
4075 static __net_init int proto_init_net(struct net *net)
4076 {
4077 	if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops,
4078 			sizeof(struct seq_net_private)))
4079 		return -ENOMEM;
4080 
4081 	return 0;
4082 }
4083 
4084 static __net_exit void proto_exit_net(struct net *net)
4085 {
4086 	remove_proc_entry("protocols", net->proc_net);
4087 }
4088 
4090 static __net_initdata struct pernet_operations proto_net_ops = {
4091 	.init = proto_init_net,
4092 	.exit = proto_exit_net,
4093 };
4094 
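/* Register the pernet operations above so that each network namespace gets
 * its own /proc/net/protocols entry; this runs at subsys_initcall time.
 */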
4095 static int __init proto_init(void)
4096 {
4097 	return register_pernet_subsys(&proto_net_ops);
4098 }
4099 
4100 subsys_initcall(proto_init);
4101 
4102 #endif /* CONFIG_PROC_FS */
4103 
4104 #ifdef CONFIG_NET_RX_BUSY_POLL
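/* Loop-termination predicate for busy polling (passed to napi_busy_loop()):
 * stop once data has arrived on the receive queue, or once the socket's
 * busy-poll budget has expired relative to start_time (see
 * sk_busy_loop_timeout()).
 */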
4105 bool sk_busy_loop_end(void *p, unsigned long start_time)
4106 {
4107 	struct sock *sk = p;
4108 
4109 	return !skb_queue_empty_lockless(&sk->sk_receive_queue) ||
4110 	       sk_busy_loop_timeout(sk, start_time);
4111 }
4112 EXPORT_SYMBOL(sk_busy_loop_end);
4113 #endif /* CONFIG_NET_RX_BUSY_POLL */
4114 
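/* Bind an additional address to an already-bound socket through the
 * optional bind_add protocol hook.  Protocols that support binding extra
 * addresses (SCTP, for example) implement the hook; all others report
 * -EOPNOTSUPP.
 */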
4115 int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len)
4116 {
4117 	if (!sk->sk_prot->bind_add)
4118 		return -EOPNOTSUPP;
4119 	return sk->sk_prot->bind_add(sk, addr, addr_len);
4120 }
4121 EXPORT_SYMBOL(sock_bind_add);
4122 
4123 /* Copy 'size' bytes in from userspace, run the protocol ioctl and copy 'size' bytes back out on success */
4124 int sock_ioctl_inout(struct sock *sk, unsigned int cmd,
4125 		     void __user *arg, void *karg, size_t size)
4126 {
4127 	int ret;
4128 
4129 	if (copy_from_user(karg, arg, size))
4130 		return -EFAULT;
4131 
4132 	ret = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, karg);
4133 	if (ret)
4134 		return ret;
4135 
4136 	if (copy_to_user(arg, karg, size))
4137 		return -EFAULT;
4138 
4139 	return 0;
4140 }
4141 EXPORT_SYMBOL(sock_ioctl_inout);
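
/* Usage sketch for sock_ioctl_inout() (hypothetical protocol helper, not
 * taken from this file): the caller supplies a kernel-side buffer that the
 * protocol's ->ioctl() fills in.
 *
 *	static int foo_sk_ioctl(struct sock *sk, unsigned int cmd,
 *				void __user *arg)
 *	{
 *		struct foo_req req;
 *
 *		return sock_ioctl_inout(sk, cmd, arg, &req, sizeof(req));
 *	}
 */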
4142 
4143 /* This is the most common ioctl prep function: the result (4 bytes) is
4144  * copied back to userspace if the ioctl() returns successfully, and no
4145  * input is copied in from userspace.
4146  */
4147 static int sock_ioctl_out(struct sock *sk, unsigned int cmd, void __user *arg)
4148 {
4149 	int ret, karg = 0;
4150 
4151 	ret = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, &karg);
4152 	if (ret)
4153 		return ret;
4154 
4155 	return put_user(karg, (int __user *)arg);
4156 }
4157 
4158 /* A wrapper around sock ioctls which, depending on the protocol/ioctl,
4159  * copies the input from userspace and copies the result back to userspace.
4160  * The main motivation for this function is to pass kernel memory to the
4161  * protocol ioctl callbacks instead of userspace memory.
4162  */
4163 int sk_ioctl(struct sock *sk, unsigned int cmd, void __user *arg)
4164 {
4165 	int rc = 1;
4166 
4167 	if (sk->sk_type == SOCK_RAW && sk->sk_family == AF_INET)
4168 		rc = ipmr_sk_ioctl(sk, cmd, arg);
4169 	else if (sk->sk_type == SOCK_RAW && sk->sk_family == AF_INET6)
4170 		rc = ip6mr_sk_ioctl(sk, cmd, arg);
4171 	else if (sk_is_phonet(sk))
4172 		rc = phonet_sk_ioctl(sk, cmd, arg);
4173 
4174 	/* If the ioctl was handled by a protocol-specific helper above, return its result */
4175 	if (rc <= 0)
4176 		return rc;
4177 
4178 	/* Otherwise call the default handler */
4179 	return sock_ioctl_out(sk, cmd, arg);
4180 }
4181 EXPORT_SYMBOL(sk_ioctl);
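
/* Flow summary (descriptive of the code above): raw IPv4/IPv6 and phonet
 * sockets first get a chance to handle the ioctl themselves.  For all other
 * sockets, or when those helpers return a value greater than zero to
 * decline, the request falls through to sock_ioctl_out(), i.e. the
 * protocol's ->ioctl() fills a single int that is copied back to userspace.
 */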
4182