xref: /openbmc/linux/net/core/sock.c (revision c8013a1f714f6d9f2d8d673177a824c6b9653218)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * INET		An implementation of the TCP/IP protocol suite for the LINUX
4  *		operating system.  INET is implemented using the  BSD Socket
5  *		interface as the means of communication with the user level.
6  *
7  *		Generic socket support routines. Memory allocators, socket lock/release
8  *		handler for protocols to use and generic option handler.
9  *
10  * Authors:	Ross Biro
11  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *		Florian La Roche, <flla@stud.uni-sb.de>
13  *		Alan Cox, <A.Cox@swansea.ac.uk>
14  *
15  * Fixes:
16  *		Alan Cox	: 	Numerous verify_area() problems
17  *		Alan Cox	:	Connecting on a connecting socket
18  *					now returns an error for tcp.
19  *		Alan Cox	:	sock->protocol is set correctly.
20  *					and is not sometimes left as 0.
21  *		Alan Cox	:	connect handles icmp errors on a
22  *					connect properly. Unfortunately there
23  *					is a restart syscall nasty there. I
24  *					can't match BSD without hacking the C
25  *					library. Ideas urgently sought!
26  *		Alan Cox	:	Disallow bind() to addresses that are
27  *					not ours - especially broadcast ones!!
28  *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
29  *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
30  *					instead they leave that for the DESTROY timer.
31  *		Alan Cox	:	Clean up error flag in accept
32  *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
33  *					was buggy. Put a remove_sock() in the handler
34  *					for memory when we hit 0. Also altered the timer
35  *					code. The ACK stuff can wait and needs major
36  *					TCP layer surgery.
37  *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
38  *					and fixed timer/inet_bh race.
39  *		Alan Cox	:	Added zapped flag for TCP
40  *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
41  *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
42  *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
43  *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
44  *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
45  *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
46  *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
47  *	Pauline Middelink	:	identd support
48  *		Alan Cox	:	Fixed connect() taking signals I think.
49  *		Alan Cox	:	SO_LINGER supported
50  *		Alan Cox	:	Error reporting fixes
51  *		Anonymous	:	inet_create tidied up (sk->reuse setting)
52  *		Alan Cox	:	inet sockets don't set sk->type!
53  *		Alan Cox	:	Split socket option code
54  *		Alan Cox	:	Callbacks
55  *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
56  *		Alex		:	Removed restriction on inet fioctl
57  *		Alan Cox	:	Splitting INET from NET core
58  *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
59  *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
60  *		Alan Cox	:	Split IP from generic code
61  *		Alan Cox	:	New kfree_skbmem()
62  *		Alan Cox	:	Make SO_DEBUG superuser only.
63  *		Alan Cox	:	Allow anyone to clear SO_DEBUG
64  *					(compatibility fix)
65  *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
66  *		Alan Cox	:	Allocator for a socket is settable.
67  *		Alan Cox	:	SO_ERROR includes soft errors.
68  *		Alan Cox	:	Allow NULL arguments on some SO_ opts
69  *		Alan Cox	: 	Generic socket allocation to make hooks
70  *					easier (suggested by Craig Metz).
71  *		Michael Pall	:	SO_ERROR returns positive errno again
72  *              Steve Whitehouse:       Added default destructor to free
73  *                                      protocol private data.
74  *              Steve Whitehouse:       Added various other default routines
75  *                                      common to several socket families.
76  *              Chris Evans     :       Call suser() check last on F_SETOWN
77  *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
78  *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
79  *		Andi Kleen	:	Fix write_space callback
80  *		Chris Evans	:	Security fixes - signedness again
81  *		Arnaldo C. Melo :       cleanups, use skb_queue_purge
82  *
83  * To Fix:
84  */
85 
86 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
87 
88 #include <asm/unaligned.h>
89 #include <linux/capability.h>
90 #include <linux/errno.h>
91 #include <linux/errqueue.h>
92 #include <linux/types.h>
93 #include <linux/socket.h>
94 #include <linux/in.h>
95 #include <linux/kernel.h>
96 #include <linux/module.h>
97 #include <linux/proc_fs.h>
98 #include <linux/seq_file.h>
99 #include <linux/sched.h>
100 #include <linux/sched/mm.h>
101 #include <linux/timer.h>
102 #include <linux/string.h>
103 #include <linux/sockios.h>
104 #include <linux/net.h>
105 #include <linux/mm.h>
106 #include <linux/slab.h>
107 #include <linux/interrupt.h>
108 #include <linux/poll.h>
109 #include <linux/tcp.h>
110 #include <linux/init.h>
111 #include <linux/highmem.h>
112 #include <linux/user_namespace.h>
113 #include <linux/static_key.h>
114 #include <linux/memcontrol.h>
115 #include <linux/prefetch.h>
116 #include <linux/compat.h>
117 #include <linux/mroute.h>
118 #include <linux/mroute6.h>
119 #include <linux/icmpv6.h>
120 
121 #include <linux/uaccess.h>
122 
123 #include <linux/netdevice.h>
124 #include <net/protocol.h>
125 #include <linux/skbuff.h>
126 #include <net/net_namespace.h>
127 #include <net/request_sock.h>
128 #include <net/sock.h>
129 #include <linux/net_tstamp.h>
130 #include <net/xfrm.h>
131 #include <linux/ipsec.h>
132 #include <net/cls_cgroup.h>
133 #include <net/netprio_cgroup.h>
134 #include <linux/sock_diag.h>
135 
136 #include <linux/filter.h>
137 #include <net/sock_reuseport.h>
138 #include <net/bpf_sk_storage.h>
139 
140 #include <trace/events/sock.h>
141 
142 #include <net/tcp.h>
143 #include <net/busy_poll.h>
144 #include <net/phonet/phonet.h>
145 
146 #include <linux/ethtool.h>
147 
148 #include "dev.h"
149 
150 static DEFINE_MUTEX(proto_list_mutex);
151 static LIST_HEAD(proto_list);
152 
153 static void sock_def_write_space_wfree(struct sock *sk);
154 static void sock_def_write_space(struct sock *sk);
155 
156 /**
157  * sk_ns_capable - General socket capability test
158  * @sk: Socket to use a capability on or through
159  * @user_ns: The user namespace of the capability to use
160  * @cap: The capability to use
161  *
162  * Test to see if the opener of the socket had the capability @cap when
163  * the socket was created and if the current process has it in the user
164  * namespace @user_ns.
165  */
166 bool sk_ns_capable(const struct sock *sk,
167 		   struct user_namespace *user_ns, int cap)
168 {
169 	return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
170 		ns_capable(user_ns, cap);
171 }
172 EXPORT_SYMBOL(sk_ns_capable);
173 
174 /**
175  * sk_capable - Socket global capability test
176  * @sk: Socket to use a capability on or through
177  * @cap: The global capability to use
178  *
179  * Test to see if the opener of the socket had the capability @cap when
180  * the socket was created and if the current process has it in all user
181  * namespaces.
182  */
183 bool sk_capable(const struct sock *sk, int cap)
184 {
185 	return sk_ns_capable(sk, &init_user_ns, cap);
186 }
187 EXPORT_SYMBOL(sk_capable);
188 
189 /**
190  * sk_net_capable - Network namespace socket capability test
191  * @sk: Socket to use a capability on or through
192  * @cap: The capability to use
193  *
194  * Test to see if the opener of the socket had the capability @cap when the
195  * socket was created and if the current process has it over the network
196  * namespace the socket is a member of.
197  */
198 bool sk_net_capable(const struct sock *sk, int cap)
199 {
200 	return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
201 }
202 EXPORT_SYMBOL(sk_net_capable);
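
/*
 * Hedged usage sketch (not part of this file): a protocol handler could use
 * the helpers above to gate a privileged per-socket knob on both the opener's
 * and the caller's credentials. The function name and the knob it sets are
 * illustrative only.
 *
 *	static int example_set_priv_opt(struct sock *sk, int val)
 *	{
 *		if (!sk_net_capable(sk, CAP_NET_ADMIN))
 *			return -EPERM;
 *		sk->sk_priority = val;
 *		return 0;
 *	}
 */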
203 
204 /*
205  * Each address family might have different locking rules, so we have
206  * one slock key per address family and separate keys for internal and
207  * userspace sockets.
208  */
209 static struct lock_class_key af_family_keys[AF_MAX];
210 static struct lock_class_key af_family_kern_keys[AF_MAX];
211 static struct lock_class_key af_family_slock_keys[AF_MAX];
212 static struct lock_class_key af_family_kern_slock_keys[AF_MAX];
213 
214 /*
215  * Make lock validator output more readable. (we pre-construct these
216  * strings build-time, so that runtime initialization of socket
217  * locks is fast):
218  */
219 
220 #define _sock_locks(x)						  \
221   x "AF_UNSPEC",	x "AF_UNIX"     ,	x "AF_INET"     , \
222   x "AF_AX25"  ,	x "AF_IPX"      ,	x "AF_APPLETALK", \
223   x "AF_NETROM",	x "AF_BRIDGE"   ,	x "AF_ATMPVC"   , \
224   x "AF_X25"   ,	x "AF_INET6"    ,	x "AF_ROSE"     , \
225   x "AF_DECnet",	x "AF_NETBEUI"  ,	x "AF_SECURITY" , \
226   x "AF_KEY"   ,	x "AF_NETLINK"  ,	x "AF_PACKET"   , \
227   x "AF_ASH"   ,	x "AF_ECONET"   ,	x "AF_ATMSVC"   , \
228   x "AF_RDS"   ,	x "AF_SNA"      ,	x "AF_IRDA"     , \
229   x "AF_PPPOX" ,	x "AF_WANPIPE"  ,	x "AF_LLC"      , \
230   x "27"       ,	x "28"          ,	x "AF_CAN"      , \
231   x "AF_TIPC"  ,	x "AF_BLUETOOTH",	x "IUCV"        , \
232   x "AF_RXRPC" ,	x "AF_ISDN"     ,	x "AF_PHONET"   , \
233   x "AF_IEEE802154",	x "AF_CAIF"	,	x "AF_ALG"      , \
234   x "AF_NFC"   ,	x "AF_VSOCK"    ,	x "AF_KCM"      , \
235   x "AF_QIPCRTR",	x "AF_SMC"	,	x "AF_XDP"	, \
236   x "AF_MCTP"  , \
237   x "AF_MAX"
238 
239 static const char *const af_family_key_strings[AF_MAX+1] = {
240 	_sock_locks("sk_lock-")
241 };
242 static const char *const af_family_slock_key_strings[AF_MAX+1] = {
243 	_sock_locks("slock-")
244 };
245 static const char *const af_family_clock_key_strings[AF_MAX+1] = {
246 	_sock_locks("clock-")
247 };
248 
249 static const char *const af_family_kern_key_strings[AF_MAX+1] = {
250 	_sock_locks("k-sk_lock-")
251 };
252 static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
253 	_sock_locks("k-slock-")
254 };
255 static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
256 	_sock_locks("k-clock-")
257 };
258 static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
259 	_sock_locks("rlock-")
260 };
261 static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
262 	_sock_locks("wlock-")
263 };
264 static const char *const af_family_elock_key_strings[AF_MAX+1] = {
265 	_sock_locks("elock-")
266 };
267 
268 /*
269  * sk_callback_lock and sk queues locking rules are per-address-family,
270  * so split the lock classes by using a per-AF key:
271  */
272 static struct lock_class_key af_callback_keys[AF_MAX];
273 static struct lock_class_key af_rlock_keys[AF_MAX];
274 static struct lock_class_key af_wlock_keys[AF_MAX];
275 static struct lock_class_key af_elock_keys[AF_MAX];
276 static struct lock_class_key af_kern_callback_keys[AF_MAX];
277 
278 /* Run time adjustable parameters. */
279 __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
280 EXPORT_SYMBOL(sysctl_wmem_max);
281 __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
282 EXPORT_SYMBOL(sysctl_rmem_max);
283 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
284 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
285 
286 /* Maximal space eaten by iovec or ancillary data plus some space */
287 int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
288 EXPORT_SYMBOL(sysctl_optmem_max);
289 
290 int sysctl_tstamp_allow_data __read_mostly = 1;
291 
292 DEFINE_STATIC_KEY_FALSE(memalloc_socks_key);
293 EXPORT_SYMBOL_GPL(memalloc_socks_key);
294 
295 /**
296  * sk_set_memalloc - sets %SOCK_MEMALLOC
297  * @sk: socket to set it on
298  *
299  * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
300  * It's the responsibility of the admin to adjust min_free_kbytes
301  * to meet the requirements.
302  */
303 void sk_set_memalloc(struct sock *sk)
304 {
305 	sock_set_flag(sk, SOCK_MEMALLOC);
306 	sk->sk_allocation |= __GFP_MEMALLOC;
307 	static_branch_inc(&memalloc_socks_key);
308 }
309 EXPORT_SYMBOL_GPL(sk_set_memalloc);
310 
311 void sk_clear_memalloc(struct sock *sk)
312 {
313 	sock_reset_flag(sk, SOCK_MEMALLOC);
314 	sk->sk_allocation &= ~__GFP_MEMALLOC;
315 	static_branch_dec(&memalloc_socks_key);
316 
317 	/*
318 	 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
319 	 * progress of swapping. SOCK_MEMALLOC may be cleared while
320 	 * it has rmem allocations due to the last swapfile being deactivated
321 	 * but there is a risk that the socket is unusable due to exceeding
322 	 * the rmem limits. Reclaim the reserves and obey rmem limits again.
323 	 */
324 	sk_mem_reclaim(sk);
325 }
326 EXPORT_SYMBOL_GPL(sk_clear_memalloc);
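
/*
 * Hedged usage sketch (not part of this file): a swap-over-network transport
 * might flag its socket as SOCK_MEMALLOC while a swapfile is active and clear
 * it on deactivation. The example_xprt structure and helpers are hypothetical.
 *
 *	static void example_swap_activate(struct example_xprt *xprt)
 *	{
 *		sk_set_memalloc(xprt->sock->sk);
 *	}
 *
 *	static void example_swap_deactivate(struct example_xprt *xprt)
 *	{
 *		sk_clear_memalloc(xprt->sock->sk);
 *	}
 */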
327 
328 int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
329 {
330 	int ret;
331 	unsigned int noreclaim_flag;
332 
333 	/* these should have been dropped before queueing */
334 	BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
335 
336 	noreclaim_flag = memalloc_noreclaim_save();
337 	ret = INDIRECT_CALL_INET(sk->sk_backlog_rcv,
338 				 tcp_v6_do_rcv,
339 				 tcp_v4_do_rcv,
340 				 sk, skb);
341 	memalloc_noreclaim_restore(noreclaim_flag);
342 
343 	return ret;
344 }
345 EXPORT_SYMBOL(__sk_backlog_rcv);
346 
347 void sk_error_report(struct sock *sk)
348 {
349 	sk->sk_error_report(sk);
350 
351 	switch (sk->sk_family) {
352 	case AF_INET:
353 		fallthrough;
354 	case AF_INET6:
355 		trace_inet_sk_error_report(sk);
356 		break;
357 	default:
358 		break;
359 	}
360 }
361 EXPORT_SYMBOL(sk_error_report);
362 
363 int sock_get_timeout(long timeo, void *optval, bool old_timeval)
364 {
365 	struct __kernel_sock_timeval tv;
366 
367 	if (timeo == MAX_SCHEDULE_TIMEOUT) {
368 		tv.tv_sec = 0;
369 		tv.tv_usec = 0;
370 	} else {
371 		tv.tv_sec = timeo / HZ;
372 		tv.tv_usec = ((timeo % HZ) * USEC_PER_SEC) / HZ;
373 	}
374 
375 	if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
376 		struct old_timeval32 tv32 = { tv.tv_sec, tv.tv_usec };
377 		*(struct old_timeval32 *)optval = tv32;
378 		return sizeof(tv32);
379 	}
380 
381 	if (old_timeval) {
382 		struct __kernel_old_timeval old_tv;
383 		old_tv.tv_sec = tv.tv_sec;
384 		old_tv.tv_usec = tv.tv_usec;
385 		*(struct __kernel_old_timeval *)optval = old_tv;
386 		return sizeof(old_tv);
387 	}
388 
389 	*(struct __kernel_sock_timeval *)optval = tv;
390 	return sizeof(tv);
391 }
392 EXPORT_SYMBOL(sock_get_timeout);
393 
394 int sock_copy_user_timeval(struct __kernel_sock_timeval *tv,
395 			   sockptr_t optval, int optlen, bool old_timeval)
396 {
397 	if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
398 		struct old_timeval32 tv32;
399 
400 		if (optlen < sizeof(tv32))
401 			return -EINVAL;
402 
403 		if (copy_from_sockptr(&tv32, optval, sizeof(tv32)))
404 			return -EFAULT;
405 		tv->tv_sec = tv32.tv_sec;
406 		tv->tv_usec = tv32.tv_usec;
407 	} else if (old_timeval) {
408 		struct __kernel_old_timeval old_tv;
409 
410 		if (optlen < sizeof(old_tv))
411 			return -EINVAL;
412 		if (copy_from_sockptr(&old_tv, optval, sizeof(old_tv)))
413 			return -EFAULT;
414 		tv->tv_sec = old_tv.tv_sec;
415 		tv->tv_usec = old_tv.tv_usec;
416 	} else {
417 		if (optlen < sizeof(*tv))
418 			return -EINVAL;
419 		if (copy_from_sockptr(tv, optval, sizeof(*tv)))
420 			return -EFAULT;
421 	}
422 
423 	return 0;
424 }
425 EXPORT_SYMBOL(sock_copy_user_timeval);
426 
427 static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen,
428 			    bool old_timeval)
429 {
430 	struct __kernel_sock_timeval tv;
431 	int err = sock_copy_user_timeval(&tv, optval, optlen, old_timeval);
432 
433 	if (err)
434 		return err;
435 
436 	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
437 		return -EDOM;
438 
439 	if (tv.tv_sec < 0) {
440 		static int warned __read_mostly;
441 
442 		*timeo_p = 0;
443 		if (warned < 10 && net_ratelimit()) {
444 			warned++;
445 			pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
446 				__func__, current->comm, task_pid_nr(current));
447 		}
448 		return 0;
449 	}
450 	*timeo_p = MAX_SCHEDULE_TIMEOUT;
451 	if (tv.tv_sec == 0 && tv.tv_usec == 0)
452 		return 0;
453 	if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1))
454 		*timeo_p = tv.tv_sec * HZ + DIV_ROUND_UP((unsigned long)tv.tv_usec, USEC_PER_SEC / HZ);
455 	return 0;
456 }
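
/*
 * Worked example of the conversion above (assuming HZ == 1000): a user value
 * of { .tv_sec = 2, .tv_usec = 500000 } becomes
 * 2 * HZ + DIV_ROUND_UP(500000, USEC_PER_SEC / HZ) = 2000 + 500 = 2500 jiffies,
 * { 0, 0 } maps to MAX_SCHEDULE_TIMEOUT (block forever), and a negative tv_sec
 * is clamped to a zero timeout with a rate-limited warning.
 */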
457 
458 static bool sock_needs_netstamp(const struct sock *sk)
459 {
460 	switch (sk->sk_family) {
461 	case AF_UNSPEC:
462 	case AF_UNIX:
463 		return false;
464 	default:
465 		return true;
466 	}
467 }
468 
469 static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
470 {
471 	if (sk->sk_flags & flags) {
472 		sk->sk_flags &= ~flags;
473 		if (sock_needs_netstamp(sk) &&
474 		    !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
475 			net_disable_timestamp();
476 	}
477 }
478 
479 
480 int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
481 {
482 	unsigned long flags;
483 	struct sk_buff_head *list = &sk->sk_receive_queue;
484 
485 	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
486 		atomic_inc(&sk->sk_drops);
487 		trace_sock_rcvqueue_full(sk, skb);
488 		return -ENOMEM;
489 	}
490 
491 	if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
492 		atomic_inc(&sk->sk_drops);
493 		return -ENOBUFS;
494 	}
495 
496 	skb->dev = NULL;
497 	skb_set_owner_r(skb, sk);
498 
499 	/* We escape from the RCU-protected region, so make sure we don't
500 	 * leak a non-refcounted dst.
501 	 */
502 	skb_dst_force(skb);
503 
504 	spin_lock_irqsave(&list->lock, flags);
505 	sock_skb_set_dropcount(sk, skb);
506 	__skb_queue_tail(list, skb);
507 	spin_unlock_irqrestore(&list->lock, flags);
508 
509 	if (!sock_flag(sk, SOCK_DEAD))
510 		sk->sk_data_ready(sk);
511 	return 0;
512 }
513 EXPORT_SYMBOL(__sock_queue_rcv_skb);
514 
515 int sock_queue_rcv_skb_reason(struct sock *sk, struct sk_buff *skb,
516 			      enum skb_drop_reason *reason)
517 {
518 	enum skb_drop_reason drop_reason;
519 	int err;
520 
521 	err = sk_filter(sk, skb);
522 	if (err) {
523 		drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
524 		goto out;
525 	}
526 	err = __sock_queue_rcv_skb(sk, skb);
527 	switch (err) {
528 	case -ENOMEM:
529 		drop_reason = SKB_DROP_REASON_SOCKET_RCVBUFF;
530 		break;
531 	case -ENOBUFS:
532 		drop_reason = SKB_DROP_REASON_PROTO_MEM;
533 		break;
534 	default:
535 		drop_reason = SKB_NOT_DROPPED_YET;
536 		break;
537 	}
538 out:
539 	if (reason)
540 		*reason = drop_reason;
541 	return err;
542 }
543 EXPORT_SYMBOL(sock_queue_rcv_skb_reason);
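
/*
 * Hedged usage sketch (not part of this file): a protocol receive path can
 * pass a drop-reason slot so that kfree_skb_reason() records why the packet
 * was not queued. The surrounding function is hypothetical.
 *
 *	static int example_rcv(struct sock *sk, struct sk_buff *skb)
 *	{
 *		enum skb_drop_reason reason;
 *
 *		if (sock_queue_rcv_skb_reason(sk, skb, &reason) < 0) {
 *			kfree_skb_reason(skb, reason);
 *			return NET_RX_DROP;
 *		}
 *		return NET_RX_SUCCESS;
 *	}
 */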
544 
545 int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
546 		     const int nested, unsigned int trim_cap, bool refcounted)
547 {
548 	int rc = NET_RX_SUCCESS;
549 
550 	if (sk_filter_trim_cap(sk, skb, trim_cap))
551 		goto discard_and_relse;
552 
553 	skb->dev = NULL;
554 
555 	if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
556 		atomic_inc(&sk->sk_drops);
557 		goto discard_and_relse;
558 	}
559 	if (nested)
560 		bh_lock_sock_nested(sk);
561 	else
562 		bh_lock_sock(sk);
563 	if (!sock_owned_by_user(sk)) {
564 		/*
565 		 * trylock + unlock semantics:
566 		 */
567 		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
568 
569 		rc = sk_backlog_rcv(sk, skb);
570 
571 		mutex_release(&sk->sk_lock.dep_map, _RET_IP_);
572 	} else if (sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf))) {
573 		bh_unlock_sock(sk);
574 		atomic_inc(&sk->sk_drops);
575 		goto discard_and_relse;
576 	}
577 
578 	bh_unlock_sock(sk);
579 out:
580 	if (refcounted)
581 		sock_put(sk);
582 	return rc;
583 discard_and_relse:
584 	kfree_skb(skb);
585 	goto out;
586 }
587 EXPORT_SYMBOL(__sk_receive_skb);
588 
589 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ip6_dst_check(struct dst_entry *,
590 							  u32));
591 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
592 							   u32));
593 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
594 {
595 	struct dst_entry *dst = __sk_dst_get(sk);
596 
597 	if (dst && dst->obsolete &&
598 	    INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
599 			       dst, cookie) == NULL) {
600 		sk_tx_queue_clear(sk);
601 		sk->sk_dst_pending_confirm = 0;
602 		RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
603 		dst_release(dst);
604 		return NULL;
605 	}
606 
607 	return dst;
608 }
609 EXPORT_SYMBOL(__sk_dst_check);
610 
611 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
612 {
613 	struct dst_entry *dst = sk_dst_get(sk);
614 
615 	if (dst && dst->obsolete &&
616 	    INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
617 			       dst, cookie) == NULL) {
618 		sk_dst_reset(sk);
619 		dst_release(dst);
620 		return NULL;
621 	}
622 
623 	return dst;
624 }
625 EXPORT_SYMBOL(sk_dst_check);
626 
627 static int sock_bindtoindex_locked(struct sock *sk, int ifindex)
628 {
629 	int ret = -ENOPROTOOPT;
630 #ifdef CONFIG_NETDEVICES
631 	struct net *net = sock_net(sk);
632 
633 	/* Sorry... */
634 	ret = -EPERM;
635 	if (sk->sk_bound_dev_if && !ns_capable(net->user_ns, CAP_NET_RAW))
636 		goto out;
637 
638 	ret = -EINVAL;
639 	if (ifindex < 0)
640 		goto out;
641 
642 	/* Paired with all READ_ONCE() done locklessly. */
643 	WRITE_ONCE(sk->sk_bound_dev_if, ifindex);
644 
645 	if (sk->sk_prot->rehash)
646 		sk->sk_prot->rehash(sk);
647 	sk_dst_reset(sk);
648 
649 	ret = 0;
650 
651 out:
652 #endif
653 
654 	return ret;
655 }
656 
657 int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk)
658 {
659 	int ret;
660 
661 	if (lock_sk)
662 		lock_sock(sk);
663 	ret = sock_bindtoindex_locked(sk, ifindex);
664 	if (lock_sk)
665 		release_sock(sk);
666 
667 	return ret;
668 }
669 EXPORT_SYMBOL(sock_bindtoindex);
670 
671 static int sock_setbindtodevice(struct sock *sk, sockptr_t optval, int optlen)
672 {
673 	int ret = -ENOPROTOOPT;
674 #ifdef CONFIG_NETDEVICES
675 	struct net *net = sock_net(sk);
676 	char devname[IFNAMSIZ];
677 	int index;
678 
679 	ret = -EINVAL;
680 	if (optlen < 0)
681 		goto out;
682 
683 	/* Bind this socket to a particular device like "eth0",
684 	 * as specified in the passed interface name. If the
685 	 * name is "" or the option length is zero, the socket
686 	 * is not bound.
687 	 */
688 	if (optlen > IFNAMSIZ - 1)
689 		optlen = IFNAMSIZ - 1;
690 	memset(devname, 0, sizeof(devname));
691 
692 	ret = -EFAULT;
693 	if (copy_from_sockptr(devname, optval, optlen))
694 		goto out;
695 
696 	index = 0;
697 	if (devname[0] != '\0') {
698 		struct net_device *dev;
699 
700 		rcu_read_lock();
701 		dev = dev_get_by_name_rcu(net, devname);
702 		if (dev)
703 			index = dev->ifindex;
704 		rcu_read_unlock();
705 		ret = -ENODEV;
706 		if (!dev)
707 			goto out;
708 	}
709 
710 	sockopt_lock_sock(sk);
711 	ret = sock_bindtoindex_locked(sk, index);
712 	sockopt_release_sock(sk);
713 out:
714 #endif
715 
716 	return ret;
717 }
718 
719 static int sock_getbindtodevice(struct sock *sk, sockptr_t optval,
720 				sockptr_t optlen, int len)
721 {
722 	int ret = -ENOPROTOOPT;
723 #ifdef CONFIG_NETDEVICES
724 	int bound_dev_if = READ_ONCE(sk->sk_bound_dev_if);
725 	struct net *net = sock_net(sk);
726 	char devname[IFNAMSIZ];
727 
728 	if (bound_dev_if == 0) {
729 		len = 0;
730 		goto zero;
731 	}
732 
733 	ret = -EINVAL;
734 	if (len < IFNAMSIZ)
735 		goto out;
736 
737 	ret = netdev_get_name(net, devname, bound_dev_if);
738 	if (ret)
739 		goto out;
740 
741 	len = strlen(devname) + 1;
742 
743 	ret = -EFAULT;
744 	if (copy_to_sockptr(optval, devname, len))
745 		goto out;
746 
747 zero:
748 	ret = -EFAULT;
749 	if (copy_to_sockptr(optlen, &len, sizeof(int)))
750 		goto out;
751 
752 	ret = 0;
753 
754 out:
755 #endif
756 
757 	return ret;
758 }
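
/*
 * Hedged userspace sketch (not part of this file): SO_BINDTODEVICE takes the
 * interface name as the option value, and an empty name (zero optlen) removes
 * the binding; rebinding an already-bound socket needs CAP_NET_RAW. "eth0" is
 * just an example interface.
 *
 *	#include <string.h>
 *	#include <sys/socket.h>
 *	#include <net/if.h>
 *
 *	static int bind_to_eth0(int fd)
 *	{
 *		char ifname[IFNAMSIZ] = "eth0";
 *
 *		return setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE,
 *				  ifname, strlen(ifname));
 *	}
 */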
759 
760 bool sk_mc_loop(struct sock *sk)
761 {
762 	if (dev_recursion_level())
763 		return false;
764 	if (!sk)
765 		return true;
766 	switch (sk->sk_family) {
767 	case AF_INET:
768 		return inet_sk(sk)->mc_loop;
769 #if IS_ENABLED(CONFIG_IPV6)
770 	case AF_INET6:
771 		return inet6_sk(sk)->mc_loop;
772 #endif
773 	}
774 	WARN_ON_ONCE(1);
775 	return true;
776 }
777 EXPORT_SYMBOL(sk_mc_loop);
778 
779 void sock_set_reuseaddr(struct sock *sk)
780 {
781 	lock_sock(sk);
782 	sk->sk_reuse = SK_CAN_REUSE;
783 	release_sock(sk);
784 }
785 EXPORT_SYMBOL(sock_set_reuseaddr);
786 
787 void sock_set_reuseport(struct sock *sk)
788 {
789 	lock_sock(sk);
790 	sk->sk_reuseport = true;
791 	release_sock(sk);
792 }
793 EXPORT_SYMBOL(sock_set_reuseport);
794 
795 void sock_no_linger(struct sock *sk)
796 {
797 	lock_sock(sk);
798 	sk->sk_lingertime = 0;
799 	sock_set_flag(sk, SOCK_LINGER);
800 	release_sock(sk);
801 }
802 EXPORT_SYMBOL(sock_no_linger);
803 
804 void sock_set_priority(struct sock *sk, u32 priority)
805 {
806 	lock_sock(sk);
807 	sk->sk_priority = priority;
808 	release_sock(sk);
809 }
810 EXPORT_SYMBOL(sock_set_priority);
811 
812 void sock_set_sndtimeo(struct sock *sk, s64 secs)
813 {
814 	lock_sock(sk);
815 	if (secs && secs < MAX_SCHEDULE_TIMEOUT / HZ - 1)
816 		sk->sk_sndtimeo = secs * HZ;
817 	else
818 		sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
819 	release_sock(sk);
820 }
821 EXPORT_SYMBOL(sock_set_sndtimeo);
822 
823 static void __sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns)
824 {
825 	if (val)  {
826 		sock_valbool_flag(sk, SOCK_TSTAMP_NEW, new);
827 		sock_valbool_flag(sk, SOCK_RCVTSTAMPNS, ns);
828 		sock_set_flag(sk, SOCK_RCVTSTAMP);
829 		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
830 	} else {
831 		sock_reset_flag(sk, SOCK_RCVTSTAMP);
832 		sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
833 	}
834 }
835 
836 void sock_enable_timestamps(struct sock *sk)
837 {
838 	lock_sock(sk);
839 	__sock_set_timestamps(sk, true, false, true);
840 	release_sock(sk);
841 }
842 EXPORT_SYMBOL(sock_enable_timestamps);
843 
844 void sock_set_timestamp(struct sock *sk, int optname, bool valbool)
845 {
846 	switch (optname) {
847 	case SO_TIMESTAMP_OLD:
848 		__sock_set_timestamps(sk, valbool, false, false);
849 		break;
850 	case SO_TIMESTAMP_NEW:
851 		__sock_set_timestamps(sk, valbool, true, false);
852 		break;
853 	case SO_TIMESTAMPNS_OLD:
854 		__sock_set_timestamps(sk, valbool, false, true);
855 		break;
856 	case SO_TIMESTAMPNS_NEW:
857 		__sock_set_timestamps(sk, valbool, true, true);
858 		break;
859 	}
860 }
861 
862 static int sock_timestamping_bind_phc(struct sock *sk, int phc_index)
863 {
864 	struct net *net = sock_net(sk);
865 	struct net_device *dev = NULL;
866 	bool match = false;
867 	int *vclock_index;
868 	int i, num;
869 
870 	if (sk->sk_bound_dev_if)
871 		dev = dev_get_by_index(net, sk->sk_bound_dev_if);
872 
873 	if (!dev) {
874 		pr_err("%s: socket not bound to a device\n", __func__);
875 		return -EOPNOTSUPP;
876 	}
877 
878 	num = ethtool_get_phc_vclocks(dev, &vclock_index);
879 	dev_put(dev);
880 
881 	for (i = 0; i < num; i++) {
882 		if (*(vclock_index + i) == phc_index) {
883 			match = true;
884 			break;
885 		}
886 	}
887 
888 	if (num > 0)
889 		kfree(vclock_index);
890 
891 	if (!match)
892 		return -EINVAL;
893 
894 	sk->sk_bind_phc = phc_index;
895 
896 	return 0;
897 }
898 
899 int sock_set_timestamping(struct sock *sk, int optname,
900 			  struct so_timestamping timestamping)
901 {
902 	int val = timestamping.flags;
903 	int ret;
904 
905 	if (val & ~SOF_TIMESTAMPING_MASK)
906 		return -EINVAL;
907 
908 	if (val & SOF_TIMESTAMPING_OPT_ID_TCP &&
909 	    !(val & SOF_TIMESTAMPING_OPT_ID))
910 		return -EINVAL;
911 
912 	if (val & SOF_TIMESTAMPING_OPT_ID &&
913 	    !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
914 		if (sk_is_tcp(sk)) {
915 			if ((1 << sk->sk_state) &
916 			    (TCPF_CLOSE | TCPF_LISTEN))
917 				return -EINVAL;
918 			if (val & SOF_TIMESTAMPING_OPT_ID_TCP)
919 				atomic_set(&sk->sk_tskey, tcp_sk(sk)->write_seq);
920 			else
921 				atomic_set(&sk->sk_tskey, tcp_sk(sk)->snd_una);
922 		} else {
923 			atomic_set(&sk->sk_tskey, 0);
924 		}
925 	}
926 
927 	if (val & SOF_TIMESTAMPING_OPT_STATS &&
928 	    !(val & SOF_TIMESTAMPING_OPT_TSONLY))
929 		return -EINVAL;
930 
931 	if (val & SOF_TIMESTAMPING_BIND_PHC) {
932 		ret = sock_timestamping_bind_phc(sk, timestamping.bind_phc);
933 		if (ret)
934 			return ret;
935 	}
936 
937 	sk->sk_tsflags = val;
938 	sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW);
939 
940 	if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
941 		sock_enable_timestamp(sk,
942 				      SOCK_TIMESTAMPING_RX_SOFTWARE);
943 	else
944 		sock_disable_timestamp(sk,
945 				       (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
946 	return 0;
947 }
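
/*
 * Hedged userspace sketch (not part of this file): requesting software RX
 * timestamps through SO_TIMESTAMPING. The SOF_* flags come from
 * <linux/net_tstamp.h>; SO_TIMESTAMPING itself is assumed to be provided by
 * the system socket headers (pull in <asm/socket.h> if it is not).
 *
 *	#include <sys/socket.h>
 *	#include <linux/net_tstamp.h>
 *
 *	static int enable_rx_sw_tstamps(int fd)
 *	{
 *		int flags = SOF_TIMESTAMPING_RX_SOFTWARE |
 *			    SOF_TIMESTAMPING_SOFTWARE;
 *
 *		return setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING,
 *				  &flags, sizeof(flags));
 *	}
 */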
948 
949 void sock_set_keepalive(struct sock *sk)
950 {
951 	lock_sock(sk);
952 	if (sk->sk_prot->keepalive)
953 		sk->sk_prot->keepalive(sk, true);
954 	sock_valbool_flag(sk, SOCK_KEEPOPEN, true);
955 	release_sock(sk);
956 }
957 EXPORT_SYMBOL(sock_set_keepalive);
958 
959 static void __sock_set_rcvbuf(struct sock *sk, int val)
960 {
961 	/* Ensure val * 2 fits into an int, to prevent max_t() from treating it
962 	 * as a negative value.
963 	 */
964 	val = min_t(int, val, INT_MAX / 2);
965 	sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
966 
967 	/* We double it on the way in to account for "struct sk_buff" etc.
968 	 * overhead. Applications assume that the SO_RCVBUF setting they make
969 	 * will allow that much actual data to be received on that socket.
970 	 *
971 	 * Applications are unaware that "struct sk_buff" and other overheads
972 	 * allocate from the receive buffer during socket buffer allocation.
973 	 *
974 	 * And after considering the possible alternatives, returning the value
975 	 * we actually used in getsockopt is the most desirable behavior.
976 	 */
977 	WRITE_ONCE(sk->sk_rcvbuf, max_t(int, val * 2, SOCK_MIN_RCVBUF));
978 }
979 
980 void sock_set_rcvbuf(struct sock *sk, int val)
981 {
982 	lock_sock(sk);
983 	__sock_set_rcvbuf(sk, val);
984 	release_sock(sk);
985 }
986 EXPORT_SYMBOL(sock_set_rcvbuf);
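
/*
 * Worked example of the doubling above: sock_set_rcvbuf(sk, 65536), or a
 * userspace SO_RCVBUF of 65536 when sysctl_rmem_max allows it, stores
 * max(65536 * 2, SOCK_MIN_RCVBUF) = 131072 in sk->sk_rcvbuf, so that roughly
 * 64 KiB of payload still fits once struct sk_buff overhead is accounted for.
 */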
987 
988 static void __sock_set_mark(struct sock *sk, u32 val)
989 {
990 	if (val != sk->sk_mark) {
991 		sk->sk_mark = val;
992 		sk_dst_reset(sk);
993 	}
994 }
995 
996 void sock_set_mark(struct sock *sk, u32 val)
997 {
998 	lock_sock(sk);
999 	__sock_set_mark(sk, val);
1000 	release_sock(sk);
1001 }
1002 EXPORT_SYMBOL(sock_set_mark);
1003 
1004 static void sock_release_reserved_memory(struct sock *sk, int bytes)
1005 {
1006 	/* Round down bytes to multiple of pages */
1007 	bytes = round_down(bytes, PAGE_SIZE);
1008 
1009 	WARN_ON(bytes > sk->sk_reserved_mem);
1010 	sk->sk_reserved_mem -= bytes;
1011 	sk_mem_reclaim(sk);
1012 }
1013 
1014 static int sock_reserve_memory(struct sock *sk, int bytes)
1015 {
1016 	long allocated;
1017 	bool charged;
1018 	int pages;
1019 
1020 	if (!mem_cgroup_sockets_enabled || !sk->sk_memcg || !sk_has_account(sk))
1021 		return -EOPNOTSUPP;
1022 
1023 	if (!bytes)
1024 		return 0;
1025 
1026 	pages = sk_mem_pages(bytes);
1027 
1028 	/* pre-charge to memcg */
1029 	charged = mem_cgroup_charge_skmem(sk->sk_memcg, pages,
1030 					  GFP_KERNEL | __GFP_RETRY_MAYFAIL);
1031 	if (!charged)
1032 		return -ENOMEM;
1033 
1034 	/* pre-charge to forward_alloc */
1035 	sk_memory_allocated_add(sk, pages);
1036 	allocated = sk_memory_allocated(sk);
1037 	/* If the system goes into memory pressure with this
1038 	 * precharge, give up and return an error.
1039 	 */
1040 	if (allocated > sk_prot_mem_limits(sk, 1)) {
1041 		sk_memory_allocated_sub(sk, pages);
1042 		mem_cgroup_uncharge_skmem(sk->sk_memcg, pages);
1043 		return -ENOMEM;
1044 	}
1045 	sk->sk_forward_alloc += pages << PAGE_SHIFT;
1046 
1047 	sk->sk_reserved_mem += pages << PAGE_SHIFT;
1048 
1049 	return 0;
1050 }
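
/*
 * Worked example (assuming PAGE_SIZE == 4096 and no prior reservation):
 * SO_RESERVE_MEM with val = 10000 precharges sk_mem_pages(10000) = 3 pages to
 * the socket's memcg and the protocol's memory accounting, growing both
 * sk_forward_alloc and sk_reserved_mem by 3 << PAGE_SHIFT = 12288 bytes.
 * Shrinking the reservation later releases an amount rounded down to a
 * multiple of PAGE_SIZE.
 */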
1051 
1052 void sockopt_lock_sock(struct sock *sk)
1053 {
1054 	/* When current->bpf_ctx is set, the setsockopt is called from
1055 	 * a bpf prog.  bpf has ensured the sk lock has been
1056 	 * acquired before calling setsockopt().
1057 	 */
1058 	if (has_current_bpf_ctx())
1059 		return;
1060 
1061 	lock_sock(sk);
1062 }
1063 EXPORT_SYMBOL(sockopt_lock_sock);
1064 
1065 void sockopt_release_sock(struct sock *sk)
1066 {
1067 	if (has_current_bpf_ctx())
1068 		return;
1069 
1070 	release_sock(sk);
1071 }
1072 EXPORT_SYMBOL(sockopt_release_sock);
1073 
1074 bool sockopt_ns_capable(struct user_namespace *ns, int cap)
1075 {
1076 	return has_current_bpf_ctx() || ns_capable(ns, cap);
1077 }
1078 EXPORT_SYMBOL(sockopt_ns_capable);
1079 
1080 bool sockopt_capable(int cap)
1081 {
1082 	return has_current_bpf_ctx() || capable(cap);
1083 }
1084 EXPORT_SYMBOL(sockopt_capable);
1085 
1086 /*
1087  *	This is meant for all protocols to use and covers goings on
1088  *	at the socket level. Everything here is generic.
1089  */
1090 
1091 int sk_setsockopt(struct sock *sk, int level, int optname,
1092 		  sockptr_t optval, unsigned int optlen)
1093 {
1094 	struct so_timestamping timestamping;
1095 	struct socket *sock = sk->sk_socket;
1096 	struct sock_txtime sk_txtime;
1097 	int val;
1098 	int valbool;
1099 	struct linger ling;
1100 	int ret = 0;
1101 
1102 	/*
1103 	 *	Options without arguments
1104 	 */
1105 
1106 	if (optname == SO_BINDTODEVICE)
1107 		return sock_setbindtodevice(sk, optval, optlen);
1108 
1109 	if (optlen < sizeof(int))
1110 		return -EINVAL;
1111 
1112 	if (copy_from_sockptr(&val, optval, sizeof(val)))
1113 		return -EFAULT;
1114 
1115 	valbool = val ? 1 : 0;
1116 
1117 	sockopt_lock_sock(sk);
1118 
1119 	switch (optname) {
1120 	case SO_DEBUG:
1121 		if (val && !sockopt_capable(CAP_NET_ADMIN))
1122 			ret = -EACCES;
1123 		else
1124 			sock_valbool_flag(sk, SOCK_DBG, valbool);
1125 		break;
1126 	case SO_REUSEADDR:
1127 		sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
1128 		break;
1129 	case SO_REUSEPORT:
1130 		sk->sk_reuseport = valbool;
1131 		break;
1132 	case SO_TYPE:
1133 	case SO_PROTOCOL:
1134 	case SO_DOMAIN:
1135 	case SO_ERROR:
1136 		ret = -ENOPROTOOPT;
1137 		break;
1138 	case SO_DONTROUTE:
1139 		sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
1140 		sk_dst_reset(sk);
1141 		break;
1142 	case SO_BROADCAST:
1143 		sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
1144 		break;
1145 	case SO_SNDBUF:
1146 		/* Don't error on this; BSD doesn't, and if you think
1147 		 * about it, this is right. Otherwise apps have to
1148 		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
1149 		 * are treated in BSD as hints.
1150 		 */
1151 		val = min_t(u32, val, READ_ONCE(sysctl_wmem_max));
1152 set_sndbuf:
1153 		/* Ensure val * 2 fits into an int, to prevent max_t()
1154 		 * from treating it as a negative value.
1155 		 */
1156 		val = min_t(int, val, INT_MAX / 2);
1157 		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
1158 		WRITE_ONCE(sk->sk_sndbuf,
1159 			   max_t(int, val * 2, SOCK_MIN_SNDBUF));
1160 		/* Wake up sending tasks if we upped the value. */
1161 		sk->sk_write_space(sk);
1162 		break;
1163 
1164 	case SO_SNDBUFFORCE:
1165 		if (!sockopt_capable(CAP_NET_ADMIN)) {
1166 			ret = -EPERM;
1167 			break;
1168 		}
1169 
1170 		/* No negative values (to prevent underflow, as val will be
1171 		 * multiplied by 2).
1172 		 */
1173 		if (val < 0)
1174 			val = 0;
1175 		goto set_sndbuf;
1176 
1177 	case SO_RCVBUF:
1178 		/* Don't error on this; BSD doesn't, and if you think
1179 		 * about it, this is right. Otherwise apps have to
1180 		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
1181 		 * are treated in BSD as hints.
1182 		 */
1183 		__sock_set_rcvbuf(sk, min_t(u32, val, READ_ONCE(sysctl_rmem_max)));
1184 		break;
1185 
1186 	case SO_RCVBUFFORCE:
1187 		if (!sockopt_capable(CAP_NET_ADMIN)) {
1188 			ret = -EPERM;
1189 			break;
1190 		}
1191 
1192 		/* No negative values (to prevent underflow, as val will be
1193 		 * multiplied by 2).
1194 		 */
1195 		__sock_set_rcvbuf(sk, max(val, 0));
1196 		break;
1197 
1198 	case SO_KEEPALIVE:
1199 		if (sk->sk_prot->keepalive)
1200 			sk->sk_prot->keepalive(sk, valbool);
1201 		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
1202 		break;
1203 
1204 	case SO_OOBINLINE:
1205 		sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
1206 		break;
1207 
1208 	case SO_NO_CHECK:
1209 		sk->sk_no_check_tx = valbool;
1210 		break;
1211 
1212 	case SO_PRIORITY:
1213 		if ((val >= 0 && val <= 6) ||
1214 		    sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) ||
1215 		    sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
1216 			sk->sk_priority = val;
1217 		else
1218 			ret = -EPERM;
1219 		break;
1220 
1221 	case SO_LINGER:
1222 		if (optlen < sizeof(ling)) {
1223 			ret = -EINVAL;	/* 1003.1g */
1224 			break;
1225 		}
1226 		if (copy_from_sockptr(&ling, optval, sizeof(ling))) {
1227 			ret = -EFAULT;
1228 			break;
1229 		}
1230 		if (!ling.l_onoff)
1231 			sock_reset_flag(sk, SOCK_LINGER);
1232 		else {
1233 #if (BITS_PER_LONG == 32)
1234 			if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
1235 				sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
1236 			else
1237 #endif
1238 				sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
1239 			sock_set_flag(sk, SOCK_LINGER);
1240 		}
1241 		break;
1242 
1243 	case SO_BSDCOMPAT:
1244 		break;
1245 
1246 	case SO_PASSCRED:
1247 		if (valbool)
1248 			set_bit(SOCK_PASSCRED, &sock->flags);
1249 		else
1250 			clear_bit(SOCK_PASSCRED, &sock->flags);
1251 		break;
1252 
1253 	case SO_PASSPIDFD:
1254 		if (valbool)
1255 			set_bit(SOCK_PASSPIDFD, &sock->flags);
1256 		else
1257 			clear_bit(SOCK_PASSPIDFD, &sock->flags);
1258 		break;
1259 
1260 	case SO_TIMESTAMP_OLD:
1261 	case SO_TIMESTAMP_NEW:
1262 	case SO_TIMESTAMPNS_OLD:
1263 	case SO_TIMESTAMPNS_NEW:
1264 		sock_set_timestamp(sk, optname, valbool);
1265 		break;
1266 
1267 	case SO_TIMESTAMPING_NEW:
1268 	case SO_TIMESTAMPING_OLD:
1269 		if (optlen == sizeof(timestamping)) {
1270 			if (copy_from_sockptr(&timestamping, optval,
1271 					      sizeof(timestamping))) {
1272 				ret = -EFAULT;
1273 				break;
1274 			}
1275 		} else {
1276 			memset(&timestamping, 0, sizeof(timestamping));
1277 			timestamping.flags = val;
1278 		}
1279 		ret = sock_set_timestamping(sk, optname, timestamping);
1280 		break;
1281 
1282 	case SO_RCVLOWAT:
1283 		if (val < 0)
1284 			val = INT_MAX;
1285 		if (sock && sock->ops->set_rcvlowat)
1286 			ret = sock->ops->set_rcvlowat(sk, val);
1287 		else
1288 			WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
1289 		break;
1290 
1291 	case SO_RCVTIMEO_OLD:
1292 	case SO_RCVTIMEO_NEW:
1293 		ret = sock_set_timeout(&sk->sk_rcvtimeo, optval,
1294 				       optlen, optname == SO_RCVTIMEO_OLD);
1295 		break;
1296 
1297 	case SO_SNDTIMEO_OLD:
1298 	case SO_SNDTIMEO_NEW:
1299 		ret = sock_set_timeout(&sk->sk_sndtimeo, optval,
1300 				       optlen, optname == SO_SNDTIMEO_OLD);
1301 		break;
1302 
1303 	case SO_ATTACH_FILTER: {
1304 		struct sock_fprog fprog;
1305 
1306 		ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1307 		if (!ret)
1308 			ret = sk_attach_filter(&fprog, sk);
1309 		break;
1310 	}
1311 	case SO_ATTACH_BPF:
1312 		ret = -EINVAL;
1313 		if (optlen == sizeof(u32)) {
1314 			u32 ufd;
1315 
1316 			ret = -EFAULT;
1317 			if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1318 				break;
1319 
1320 			ret = sk_attach_bpf(ufd, sk);
1321 		}
1322 		break;
1323 
1324 	case SO_ATTACH_REUSEPORT_CBPF: {
1325 		struct sock_fprog fprog;
1326 
1327 		ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1328 		if (!ret)
1329 			ret = sk_reuseport_attach_filter(&fprog, sk);
1330 		break;
1331 	}
1332 	case SO_ATTACH_REUSEPORT_EBPF:
1333 		ret = -EINVAL;
1334 		if (optlen == sizeof(u32)) {
1335 			u32 ufd;
1336 
1337 			ret = -EFAULT;
1338 			if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1339 				break;
1340 
1341 			ret = sk_reuseport_attach_bpf(ufd, sk);
1342 		}
1343 		break;
1344 
1345 	case SO_DETACH_REUSEPORT_BPF:
1346 		ret = reuseport_detach_prog(sk);
1347 		break;
1348 
1349 	case SO_DETACH_FILTER:
1350 		ret = sk_detach_filter(sk);
1351 		break;
1352 
1353 	case SO_LOCK_FILTER:
1354 		if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
1355 			ret = -EPERM;
1356 		else
1357 			sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
1358 		break;
1359 
1360 	case SO_PASSSEC:
1361 		if (valbool)
1362 			set_bit(SOCK_PASSSEC, &sock->flags);
1363 		else
1364 			clear_bit(SOCK_PASSSEC, &sock->flags);
1365 		break;
1366 	case SO_MARK:
1367 		if (!sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
1368 		    !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1369 			ret = -EPERM;
1370 			break;
1371 		}
1372 
1373 		__sock_set_mark(sk, val);
1374 		break;
1375 	case SO_RCVMARK:
1376 		if (!sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
1377 		    !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1378 			ret = -EPERM;
1379 			break;
1380 		}
1381 
1382 		sock_valbool_flag(sk, SOCK_RCVMARK, valbool);
1383 		break;
1384 
1385 	case SO_RXQ_OVFL:
1386 		sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
1387 		break;
1388 
1389 	case SO_WIFI_STATUS:
1390 		sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
1391 		break;
1392 
1393 	case SO_PEEK_OFF:
1394 		if (sock->ops->set_peek_off)
1395 			ret = sock->ops->set_peek_off(sk, val);
1396 		else
1397 			ret = -EOPNOTSUPP;
1398 		break;
1399 
1400 	case SO_NOFCS:
1401 		sock_valbool_flag(sk, SOCK_NOFCS, valbool);
1402 		break;
1403 
1404 	case SO_SELECT_ERR_QUEUE:
1405 		sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
1406 		break;
1407 
1408 #ifdef CONFIG_NET_RX_BUSY_POLL
1409 	case SO_BUSY_POLL:
1410 		if (val < 0)
1411 			ret = -EINVAL;
1412 		else
1413 			WRITE_ONCE(sk->sk_ll_usec, val);
1414 		break;
1415 	case SO_PREFER_BUSY_POLL:
1416 		if (valbool && !sockopt_capable(CAP_NET_ADMIN))
1417 			ret = -EPERM;
1418 		else
1419 			WRITE_ONCE(sk->sk_prefer_busy_poll, valbool);
1420 		break;
1421 	case SO_BUSY_POLL_BUDGET:
1422 		if (val > READ_ONCE(sk->sk_busy_poll_budget) && !sockopt_capable(CAP_NET_ADMIN)) {
1423 			ret = -EPERM;
1424 		} else {
1425 			if (val < 0 || val > U16_MAX)
1426 				ret = -EINVAL;
1427 			else
1428 				WRITE_ONCE(sk->sk_busy_poll_budget, val);
1429 		}
1430 		break;
1431 #endif
1432 
1433 	case SO_MAX_PACING_RATE:
1434 		{
1435 		unsigned long ulval = (val == ~0U) ? ~0UL : (unsigned int)val;
1436 
1437 		if (sizeof(ulval) != sizeof(val) &&
1438 		    optlen >= sizeof(ulval) &&
1439 		    copy_from_sockptr(&ulval, optval, sizeof(ulval))) {
1440 			ret = -EFAULT;
1441 			break;
1442 		}
1443 		if (ulval != ~0UL)
1444 			cmpxchg(&sk->sk_pacing_status,
1445 				SK_PACING_NONE,
1446 				SK_PACING_NEEDED);
1447 		sk->sk_max_pacing_rate = ulval;
1448 		sk->sk_pacing_rate = min(sk->sk_pacing_rate, ulval);
1449 		break;
1450 		}
1451 	case SO_INCOMING_CPU:
1452 		reuseport_update_incoming_cpu(sk, val);
1453 		break;
1454 
1455 	case SO_CNX_ADVICE:
1456 		if (val == 1)
1457 			dst_negative_advice(sk);
1458 		break;
1459 
1460 	case SO_ZEROCOPY:
1461 		if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
1462 			if (!(sk_is_tcp(sk) ||
1463 			      (sk->sk_type == SOCK_DGRAM &&
1464 			       sk->sk_protocol == IPPROTO_UDP)))
1465 				ret = -EOPNOTSUPP;
1466 		} else if (sk->sk_family != PF_RDS) {
1467 			ret = -EOPNOTSUPP;
1468 		}
1469 		if (!ret) {
1470 			if (val < 0 || val > 1)
1471 				ret = -EINVAL;
1472 			else
1473 				sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
1474 		}
1475 		break;
1476 
1477 	case SO_TXTIME:
1478 		if (optlen != sizeof(struct sock_txtime)) {
1479 			ret = -EINVAL;
1480 			break;
1481 		} else if (copy_from_sockptr(&sk_txtime, optval,
1482 			   sizeof(struct sock_txtime))) {
1483 			ret = -EFAULT;
1484 			break;
1485 		} else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) {
1486 			ret = -EINVAL;
1487 			break;
1488 		}
1489 		/* CLOCK_MONOTONIC is only used by sch_fq, and this packet
1490 		 * scheduler has enough safeguards.
1491 		 */
1492 		if (sk_txtime.clockid != CLOCK_MONOTONIC &&
1493 		    !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1494 			ret = -EPERM;
1495 			break;
1496 		}
1497 		sock_valbool_flag(sk, SOCK_TXTIME, true);
1498 		sk->sk_clockid = sk_txtime.clockid;
1499 		sk->sk_txtime_deadline_mode =
1500 			!!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE);
1501 		sk->sk_txtime_report_errors =
1502 			!!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS);
1503 		break;
1504 
1505 	case SO_BINDTOIFINDEX:
1506 		ret = sock_bindtoindex_locked(sk, val);
1507 		break;
1508 
1509 	case SO_BUF_LOCK:
1510 		if (val & ~SOCK_BUF_LOCK_MASK) {
1511 			ret = -EINVAL;
1512 			break;
1513 		}
1514 		sk->sk_userlocks = val | (sk->sk_userlocks &
1515 					  ~SOCK_BUF_LOCK_MASK);
1516 		break;
1517 
1518 	case SO_RESERVE_MEM:
1519 	{
1520 		int delta;
1521 
1522 		if (val < 0) {
1523 			ret = -EINVAL;
1524 			break;
1525 		}
1526 
1527 		delta = val - sk->sk_reserved_mem;
1528 		if (delta < 0)
1529 			sock_release_reserved_memory(sk, -delta);
1530 		else
1531 			ret = sock_reserve_memory(sk, delta);
1532 		break;
1533 	}
1534 
1535 	case SO_TXREHASH:
1536 		if (val < -1 || val > 1) {
1537 			ret = -EINVAL;
1538 			break;
1539 		}
1540 		if ((u8)val == SOCK_TXREHASH_DEFAULT)
1541 			val = READ_ONCE(sock_net(sk)->core.sysctl_txrehash);
1542 		/* Paired with READ_ONCE() in tcp_rtx_synack() */
1543 		WRITE_ONCE(sk->sk_txrehash, (u8)val);
1544 		break;
1545 
1546 	default:
1547 		ret = -ENOPROTOOPT;
1548 		break;
1549 	}
1550 	sockopt_release_sock(sk);
1551 	return ret;
1552 }
1553 
1554 int sock_setsockopt(struct socket *sock, int level, int optname,
1555 		    sockptr_t optval, unsigned int optlen)
1556 {
1557 	return sk_setsockopt(sock->sk, level, optname,
1558 			     optval, optlen);
1559 }
1560 EXPORT_SYMBOL(sock_setsockopt);
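
/*
 * Hedged userspace sketch (not part of this file): the handlers above back
 * ordinary SOL_SOCKET setsockopt() calls, e.g. enabling address reuse and
 * asking for a larger receive buffer. The 256 KiB figure is arbitrary.
 *
 *	#include <sys/socket.h>
 *
 *	static int tune_listener(int fd)
 *	{
 *		int one = 1;
 *		int rcvbuf = 256 * 1024;
 *
 *		if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one)))
 *			return -1;
 *		return setsockopt(fd, SOL_SOCKET, SO_RCVBUF,
 *				  &rcvbuf, sizeof(rcvbuf));
 *	}
 */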
1561 
1562 static const struct cred *sk_get_peer_cred(struct sock *sk)
1563 {
1564 	const struct cred *cred;
1565 
1566 	spin_lock(&sk->sk_peer_lock);
1567 	cred = get_cred(sk->sk_peer_cred);
1568 	spin_unlock(&sk->sk_peer_lock);
1569 
1570 	return cred;
1571 }
1572 
1573 static void cred_to_ucred(struct pid *pid, const struct cred *cred,
1574 			  struct ucred *ucred)
1575 {
1576 	ucred->pid = pid_vnr(pid);
1577 	ucred->uid = ucred->gid = -1;
1578 	if (cred) {
1579 		struct user_namespace *current_ns = current_user_ns();
1580 
1581 		ucred->uid = from_kuid_munged(current_ns, cred->euid);
1582 		ucred->gid = from_kgid_munged(current_ns, cred->egid);
1583 	}
1584 }
1585 
1586 static int groups_to_user(sockptr_t dst, const struct group_info *src)
1587 {
1588 	struct user_namespace *user_ns = current_user_ns();
1589 	int i;
1590 
1591 	for (i = 0; i < src->ngroups; i++) {
1592 		gid_t gid = from_kgid_munged(user_ns, src->gid[i]);
1593 
1594 		if (copy_to_sockptr_offset(dst, i * sizeof(gid), &gid, sizeof(gid)))
1595 			return -EFAULT;
1596 	}
1597 
1598 	return 0;
1599 }
1600 
1601 int sk_getsockopt(struct sock *sk, int level, int optname,
1602 		  sockptr_t optval, sockptr_t optlen)
1603 {
1604 	struct socket *sock = sk->sk_socket;
1605 
1606 	union {
1607 		int val;
1608 		u64 val64;
1609 		unsigned long ulval;
1610 		struct linger ling;
1611 		struct old_timeval32 tm32;
1612 		struct __kernel_old_timeval tm;
1613 		struct  __kernel_sock_timeval stm;
1614 		struct sock_txtime txtime;
1615 		struct so_timestamping timestamping;
1616 	} v;
1617 
1618 	int lv = sizeof(int);
1619 	int len;
1620 
1621 	if (copy_from_sockptr(&len, optlen, sizeof(int)))
1622 		return -EFAULT;
1623 	if (len < 0)
1624 		return -EINVAL;
1625 
1626 	memset(&v, 0, sizeof(v));
1627 
1628 	switch (optname) {
1629 	case SO_DEBUG:
1630 		v.val = sock_flag(sk, SOCK_DBG);
1631 		break;
1632 
1633 	case SO_DONTROUTE:
1634 		v.val = sock_flag(sk, SOCK_LOCALROUTE);
1635 		break;
1636 
1637 	case SO_BROADCAST:
1638 		v.val = sock_flag(sk, SOCK_BROADCAST);
1639 		break;
1640 
1641 	case SO_SNDBUF:
1642 		v.val = sk->sk_sndbuf;
1643 		break;
1644 
1645 	case SO_RCVBUF:
1646 		v.val = sk->sk_rcvbuf;
1647 		break;
1648 
1649 	case SO_REUSEADDR:
1650 		v.val = sk->sk_reuse;
1651 		break;
1652 
1653 	case SO_REUSEPORT:
1654 		v.val = sk->sk_reuseport;
1655 		break;
1656 
1657 	case SO_KEEPALIVE:
1658 		v.val = sock_flag(sk, SOCK_KEEPOPEN);
1659 		break;
1660 
1661 	case SO_TYPE:
1662 		v.val = sk->sk_type;
1663 		break;
1664 
1665 	case SO_PROTOCOL:
1666 		v.val = sk->sk_protocol;
1667 		break;
1668 
1669 	case SO_DOMAIN:
1670 		v.val = sk->sk_family;
1671 		break;
1672 
1673 	case SO_ERROR:
1674 		v.val = -sock_error(sk);
1675 		if (v.val == 0)
1676 			v.val = xchg(&sk->sk_err_soft, 0);
1677 		break;
1678 
1679 	case SO_OOBINLINE:
1680 		v.val = sock_flag(sk, SOCK_URGINLINE);
1681 		break;
1682 
1683 	case SO_NO_CHECK:
1684 		v.val = sk->sk_no_check_tx;
1685 		break;
1686 
1687 	case SO_PRIORITY:
1688 		v.val = sk->sk_priority;
1689 		break;
1690 
1691 	case SO_LINGER:
1692 		lv		= sizeof(v.ling);
1693 		v.ling.l_onoff	= sock_flag(sk, SOCK_LINGER);
1694 		v.ling.l_linger	= sk->sk_lingertime / HZ;
1695 		break;
1696 
1697 	case SO_BSDCOMPAT:
1698 		break;
1699 
1700 	case SO_TIMESTAMP_OLD:
1701 		v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1702 				!sock_flag(sk, SOCK_TSTAMP_NEW) &&
1703 				!sock_flag(sk, SOCK_RCVTSTAMPNS);
1704 		break;
1705 
1706 	case SO_TIMESTAMPNS_OLD:
1707 		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && !sock_flag(sk, SOCK_TSTAMP_NEW);
1708 		break;
1709 
1710 	case SO_TIMESTAMP_NEW:
1711 		v.val = sock_flag(sk, SOCK_RCVTSTAMP) && sock_flag(sk, SOCK_TSTAMP_NEW);
1712 		break;
1713 
1714 	case SO_TIMESTAMPNS_NEW:
1715 		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && sock_flag(sk, SOCK_TSTAMP_NEW);
1716 		break;
1717 
1718 	case SO_TIMESTAMPING_OLD:
1719 		lv = sizeof(v.timestamping);
1720 		v.timestamping.flags = sk->sk_tsflags;
1721 		v.timestamping.bind_phc = sk->sk_bind_phc;
1722 		break;
1723 
1724 	case SO_RCVTIMEO_OLD:
1725 	case SO_RCVTIMEO_NEW:
1726 		lv = sock_get_timeout(sk->sk_rcvtimeo, &v, SO_RCVTIMEO_OLD == optname);
1727 		break;
1728 
1729 	case SO_SNDTIMEO_OLD:
1730 	case SO_SNDTIMEO_NEW:
1731 		lv = sock_get_timeout(sk->sk_sndtimeo, &v, SO_SNDTIMEO_OLD == optname);
1732 		break;
1733 
1734 	case SO_RCVLOWAT:
1735 		v.val = sk->sk_rcvlowat;
1736 		break;
1737 
1738 	case SO_SNDLOWAT:
1739 		v.val = 1;
1740 		break;
1741 
1742 	case SO_PASSCRED:
1743 		v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1744 		break;
1745 
1746 	case SO_PASSPIDFD:
1747 		v.val = !!test_bit(SOCK_PASSPIDFD, &sock->flags);
1748 		break;
1749 
1750 	case SO_PEERCRED:
1751 	{
1752 		struct ucred peercred;
1753 		if (len > sizeof(peercred))
1754 			len = sizeof(peercred);
1755 
1756 		spin_lock(&sk->sk_peer_lock);
1757 		cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1758 		spin_unlock(&sk->sk_peer_lock);
1759 
1760 		if (copy_to_sockptr(optval, &peercred, len))
1761 			return -EFAULT;
1762 		goto lenout;
1763 	}
1764 
1765 	case SO_PEERPIDFD:
1766 	{
1767 		struct pid *peer_pid;
1768 		struct file *pidfd_file = NULL;
1769 		int pidfd;
1770 
1771 		if (len > sizeof(pidfd))
1772 			len = sizeof(pidfd);
1773 
1774 		spin_lock(&sk->sk_peer_lock);
1775 		peer_pid = get_pid(sk->sk_peer_pid);
1776 		spin_unlock(&sk->sk_peer_lock);
1777 
1778 		if (!peer_pid)
1779 			return -ESRCH;
1780 
1781 		pidfd = pidfd_prepare(peer_pid, 0, &pidfd_file);
1782 		put_pid(peer_pid);
1783 		if (pidfd < 0)
1784 			return pidfd;
1785 
1786 		if (copy_to_sockptr(optval, &pidfd, len) ||
1787 		    copy_to_sockptr(optlen, &len, sizeof(int))) {
1788 			put_unused_fd(pidfd);
1789 			fput(pidfd_file);
1790 
1791 			return -EFAULT;
1792 		}
1793 
1794 		fd_install(pidfd, pidfd_file);
1795 		return 0;
1796 	}
1797 
1798 	case SO_PEERGROUPS:
1799 	{
1800 		const struct cred *cred;
1801 		int ret, n;
1802 
1803 		cred = sk_get_peer_cred(sk);
1804 		if (!cred)
1805 			return -ENODATA;
1806 
1807 		n = cred->group_info->ngroups;
1808 		if (len < n * sizeof(gid_t)) {
1809 			len = n * sizeof(gid_t);
1810 			put_cred(cred);
1811 			return copy_to_sockptr(optlen, &len, sizeof(int)) ? -EFAULT : -ERANGE;
1812 		}
1813 		len = n * sizeof(gid_t);
1814 
1815 		ret = groups_to_user(optval, cred->group_info);
1816 		put_cred(cred);
1817 		if (ret)
1818 			return ret;
1819 		goto lenout;
1820 	}
1821 
1822 	case SO_PEERNAME:
1823 	{
1824 		char address[128];
1825 
1826 		lv = sock->ops->getname(sock, (struct sockaddr *)address, 2);
1827 		if (lv < 0)
1828 			return -ENOTCONN;
1829 		if (lv < len)
1830 			return -EINVAL;
1831 		if (copy_to_sockptr(optval, address, len))
1832 			return -EFAULT;
1833 		goto lenout;
1834 	}
1835 
1836 	/* Dubious BSD thing... Probably nobody even uses it, but
1837 	 * the UNIX standard wants it for whatever reason... -DaveM
1838 	 */
1839 	case SO_ACCEPTCONN:
1840 		v.val = sk->sk_state == TCP_LISTEN;
1841 		break;
1842 
1843 	case SO_PASSSEC:
1844 		v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1845 		break;
1846 
1847 	case SO_PEERSEC:
1848 		return security_socket_getpeersec_stream(sock,
1849 							 optval, optlen, len);
1850 
1851 	case SO_MARK:
1852 		v.val = sk->sk_mark;
1853 		break;
1854 
1855 	case SO_RCVMARK:
1856 		v.val = sock_flag(sk, SOCK_RCVMARK);
1857 		break;
1858 
1859 	case SO_RXQ_OVFL:
1860 		v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1861 		break;
1862 
1863 	case SO_WIFI_STATUS:
1864 		v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1865 		break;
1866 
1867 	case SO_PEEK_OFF:
1868 		if (!sock->ops->set_peek_off)
1869 			return -EOPNOTSUPP;
1870 
1871 		v.val = sk->sk_peek_off;
1872 		break;
1873 	case SO_NOFCS:
1874 		v.val = sock_flag(sk, SOCK_NOFCS);
1875 		break;
1876 
1877 	case SO_BINDTODEVICE:
1878 		return sock_getbindtodevice(sk, optval, optlen, len);
1879 
1880 	case SO_GET_FILTER:
1881 		len = sk_get_filter(sk, optval, len);
1882 		if (len < 0)
1883 			return len;
1884 
1885 		goto lenout;
1886 
1887 	case SO_LOCK_FILTER:
1888 		v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1889 		break;
1890 
1891 	case SO_BPF_EXTENSIONS:
1892 		v.val = bpf_tell_extensions();
1893 		break;
1894 
1895 	case SO_SELECT_ERR_QUEUE:
1896 		v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1897 		break;
1898 
1899 #ifdef CONFIG_NET_RX_BUSY_POLL
1900 	case SO_BUSY_POLL:
1901 		v.val = sk->sk_ll_usec;
1902 		break;
1903 	case SO_PREFER_BUSY_POLL:
1904 		v.val = READ_ONCE(sk->sk_prefer_busy_poll);
1905 		break;
1906 #endif
1907 
1908 	case SO_MAX_PACING_RATE:
1909 		if (sizeof(v.ulval) != sizeof(v.val) && len >= sizeof(v.ulval)) {
1910 			lv = sizeof(v.ulval);
1911 			v.ulval = sk->sk_max_pacing_rate;
1912 		} else {
1913 			/* 32bit version */
1914 			v.val = min_t(unsigned long, sk->sk_max_pacing_rate, ~0U);
1915 		}
1916 		break;
1917 
1918 	case SO_INCOMING_CPU:
1919 		v.val = READ_ONCE(sk->sk_incoming_cpu);
1920 		break;
1921 
1922 	case SO_MEMINFO:
1923 	{
1924 		u32 meminfo[SK_MEMINFO_VARS];
1925 
1926 		sk_get_meminfo(sk, meminfo);
1927 
1928 		len = min_t(unsigned int, len, sizeof(meminfo));
1929 		if (copy_to_sockptr(optval, &meminfo, len))
1930 			return -EFAULT;
1931 
1932 		goto lenout;
1933 	}
1934 
1935 #ifdef CONFIG_NET_RX_BUSY_POLL
1936 	case SO_INCOMING_NAPI_ID:
1937 		v.val = READ_ONCE(sk->sk_napi_id);
1938 
1939 		/* aggregate non-NAPI IDs down to 0 */
1940 		if (v.val < MIN_NAPI_ID)
1941 			v.val = 0;
1942 
1943 		break;
1944 #endif
1945 
1946 	case SO_COOKIE:
1947 		lv = sizeof(u64);
1948 		if (len < lv)
1949 			return -EINVAL;
1950 		v.val64 = sock_gen_cookie(sk);
1951 		break;
1952 
1953 	case SO_ZEROCOPY:
1954 		v.val = sock_flag(sk, SOCK_ZEROCOPY);
1955 		break;
1956 
1957 	case SO_TXTIME:
1958 		lv = sizeof(v.txtime);
1959 		v.txtime.clockid = sk->sk_clockid;
1960 		v.txtime.flags |= sk->sk_txtime_deadline_mode ?
1961 				  SOF_TXTIME_DEADLINE_MODE : 0;
1962 		v.txtime.flags |= sk->sk_txtime_report_errors ?
1963 				  SOF_TXTIME_REPORT_ERRORS : 0;
1964 		break;
1965 
1966 	case SO_BINDTOIFINDEX:
1967 		v.val = READ_ONCE(sk->sk_bound_dev_if);
1968 		break;
1969 
1970 	case SO_NETNS_COOKIE:
1971 		lv = sizeof(u64);
1972 		if (len != lv)
1973 			return -EINVAL;
1974 		v.val64 = sock_net(sk)->net_cookie;
1975 		break;
1976 
1977 	case SO_BUF_LOCK:
1978 		v.val = sk->sk_userlocks & SOCK_BUF_LOCK_MASK;
1979 		break;
1980 
1981 	case SO_RESERVE_MEM:
1982 		v.val = sk->sk_reserved_mem;
1983 		break;
1984 
1985 	case SO_TXREHASH:
1986 		v.val = sk->sk_txrehash;
1987 		break;
1988 
1989 	default:
1990 		/* We implement SO_SNDLOWAT etc. to not be settable
1991 		 * (1003.1g 7).
1992 		 */
1993 		return -ENOPROTOOPT;
1994 	}
1995 
1996 	if (len > lv)
1997 		len = lv;
1998 	if (copy_to_sockptr(optval, &v, len))
1999 		return -EFAULT;
2000 lenout:
2001 	if (copy_to_sockptr(optlen, &len, sizeof(int)))
2002 		return -EFAULT;
2003 	return 0;
2004 }
2005 
2006 int sock_getsockopt(struct socket *sock, int level, int optname,
2007 		    char __user *optval, int __user *optlen)
2008 {
2009 	return sk_getsockopt(sock->sk, level, optname,
2010 			     USER_SOCKPTR(optval),
2011 			     USER_SOCKPTR(optlen));
2012 }
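
/*
 * Hedged userspace sketch (not part of this file): the getsockopt() path above
 * is what services, for example, the conventional SO_ERROR check after a
 * non-blocking connect() completes; a result of 0 means the connect succeeded.
 *
 *	#include <sys/socket.h>
 *
 *	static int pending_error(int fd)
 *	{
 *		int err = 0;
 *		socklen_t len = sizeof(err);
 *
 *		if (getsockopt(fd, SOL_SOCKET, SO_ERROR, &err, &len))
 *			return -1;
 *		return err;
 *	}
 */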
2013 
2014 /*
2015  * Initialize an sk_lock.
2016  *
2017  * (We also register the sk_lock with the lock validator.)
2018  */
2019 static inline void sock_lock_init(struct sock *sk)
2020 {
2021 	if (sk->sk_kern_sock)
2022 		sock_lock_init_class_and_name(
2023 			sk,
2024 			af_family_kern_slock_key_strings[sk->sk_family],
2025 			af_family_kern_slock_keys + sk->sk_family,
2026 			af_family_kern_key_strings[sk->sk_family],
2027 			af_family_kern_keys + sk->sk_family);
2028 	else
2029 		sock_lock_init_class_and_name(
2030 			sk,
2031 			af_family_slock_key_strings[sk->sk_family],
2032 			af_family_slock_keys + sk->sk_family,
2033 			af_family_key_strings[sk->sk_family],
2034 			af_family_keys + sk->sk_family);
2035 }
2036 
2037 /*
2038  * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
2039  * even temporarily, because of RCU lookups. sk_node should also be left as is.
2040  * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
2041  */
2042 static void sock_copy(struct sock *nsk, const struct sock *osk)
2043 {
2044 	const struct proto *prot = READ_ONCE(osk->sk_prot);
2045 #ifdef CONFIG_SECURITY_NETWORK
2046 	void *sptr = nsk->sk_security;
2047 #endif
2048 
2049 	/* If we move sk_tx_queue_mapping out of the private section,
2050 	 * we must check if sk_tx_queue_clear() is called after
2051 	 * sock_copy() in sk_clone_lock().
2052 	 */
2053 	BUILD_BUG_ON(offsetof(struct sock, sk_tx_queue_mapping) <
2054 		     offsetof(struct sock, sk_dontcopy_begin) ||
2055 		     offsetof(struct sock, sk_tx_queue_mapping) >=
2056 		     offsetof(struct sock, sk_dontcopy_end));
2057 
2058 	memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
2059 
2060 	memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
2061 	       prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
2062 
2063 #ifdef CONFIG_SECURITY_NETWORK
2064 	nsk->sk_security = sptr;
2065 	security_sk_clone(osk, nsk);
2066 #endif
2067 }
2068 
2069 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
2070 		int family)
2071 {
2072 	struct sock *sk;
2073 	struct kmem_cache *slab;
2074 
2075 	slab = prot->slab;
2076 	if (slab != NULL) {
2077 		sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
2078 		if (!sk)
2079 			return sk;
2080 		if (want_init_on_alloc(priority))
2081 			sk_prot_clear_nulls(sk, prot->obj_size);
2082 	} else
2083 		sk = kmalloc(prot->obj_size, priority);
2084 
2085 	if (sk != NULL) {
2086 		if (security_sk_alloc(sk, family, priority))
2087 			goto out_free;
2088 
2089 		if (!try_module_get(prot->owner))
2090 			goto out_free_sec;
2091 	}
2092 
2093 	return sk;
2094 
2095 out_free_sec:
2096 	security_sk_free(sk);
2097 out_free:
2098 	if (slab != NULL)
2099 		kmem_cache_free(slab, sk);
2100 	else
2101 		kfree(sk);
2102 	return NULL;
2103 }
2104 
2105 static void sk_prot_free(struct proto *prot, struct sock *sk)
2106 {
2107 	struct kmem_cache *slab;
2108 	struct module *owner;
2109 
2110 	owner = prot->owner;
2111 	slab = prot->slab;
2112 
2113 	cgroup_sk_free(&sk->sk_cgrp_data);
2114 	mem_cgroup_sk_free(sk);
2115 	security_sk_free(sk);
2116 	if (slab != NULL)
2117 		kmem_cache_free(slab, sk);
2118 	else
2119 		kfree(sk);
2120 	module_put(owner);
2121 }
2122 
2123 /**
2124  *	sk_alloc - All socket objects are allocated here
2125  *	@net: the applicable net namespace
2126  *	@family: protocol family
2127  *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
2128  *	@prot: struct proto associated with this new sock instance
2129  *	@kern: is this to be a kernel socket?
2130  */
2131 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
2132 		      struct proto *prot, int kern)
2133 {
2134 	struct sock *sk;
2135 
2136 	sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
2137 	if (sk) {
2138 		sk->sk_family = family;
2139 		/*
2140 		 * See comment in struct sock definition to understand
2141 		 * why we need sk_prot_creator -acme
2142 		 */
2143 		sk->sk_prot = sk->sk_prot_creator = prot;
2144 		sk->sk_kern_sock = kern;
2145 		sock_lock_init(sk);
2146 		sk->sk_net_refcnt = kern ? 0 : 1;
2147 		if (likely(sk->sk_net_refcnt)) {
2148 			get_net_track(net, &sk->ns_tracker, priority);
2149 			sock_inuse_add(net, 1);
2150 		} else {
2151 			__netns_tracker_alloc(net, &sk->ns_tracker,
2152 					      false, priority);
2153 		}
2154 
2155 		sock_net_set(sk, net);
2156 		refcount_set(&sk->sk_wmem_alloc, 1);
2157 
2158 		mem_cgroup_sk_alloc(sk);
2159 		cgroup_sk_alloc(&sk->sk_cgrp_data);
2160 		sock_update_classid(&sk->sk_cgrp_data);
2161 		sock_update_netprioidx(&sk->sk_cgrp_data);
2162 		sk_tx_queue_clear(sk);
2163 	}
2164 
2165 	return sk;
2166 }
2167 EXPORT_SYMBOL(sk_alloc);
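
/*
 * Illustrative example only: a minimal ->create() handler showing how a
 * protocol typically pairs sk_alloc() with sock_init_data().  The names
 * example_proto and example_create() are hypothetical and exist only for
 * this sketch; a real protocol hangs its own state off a larger sock.
 */
static struct proto example_proto;	/* assumed filled in and registered elsewhere */

static int example_create(struct net *net, struct socket *sock,
			  int protocol, int kern)
{
	struct sock *sk;

	sk = sk_alloc(net, PF_INET, GFP_KERNEL, &example_proto, kern);
	if (!sk)
		return -ENOBUFS;

	sock_init_data(sock, sk);	/* queues, callbacks, sk_refcnt = 1 */
	sk->sk_protocol = protocol;

	/* A later failure would drop the sock again via sk_free()/sock_put(). */
	return 0;
}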
2168 
2169 /* Sockets having SOCK_RCU_FREE will call this function after one RCU
2170  * grace period. This is the case for UDP sockets and TCP listeners.
2171  */
2172 static void __sk_destruct(struct rcu_head *head)
2173 {
2174 	struct sock *sk = container_of(head, struct sock, sk_rcu);
2175 	struct sk_filter *filter;
2176 
2177 	if (sk->sk_destruct)
2178 		sk->sk_destruct(sk);
2179 
2180 	filter = rcu_dereference_check(sk->sk_filter,
2181 				       refcount_read(&sk->sk_wmem_alloc) == 0);
2182 	if (filter) {
2183 		sk_filter_uncharge(sk, filter);
2184 		RCU_INIT_POINTER(sk->sk_filter, NULL);
2185 	}
2186 
2187 	sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
2188 
2189 #ifdef CONFIG_BPF_SYSCALL
2190 	bpf_sk_storage_free(sk);
2191 #endif
2192 
2193 	if (atomic_read(&sk->sk_omem_alloc))
2194 		pr_debug("%s: optmem leakage (%d bytes) detected\n",
2195 			 __func__, atomic_read(&sk->sk_omem_alloc));
2196 
2197 	if (sk->sk_frag.page) {
2198 		put_page(sk->sk_frag.page);
2199 		sk->sk_frag.page = NULL;
2200 	}
2201 
2202 	/* We do not need to acquire sk->sk_peer_lock, as we are the last user. */
2203 	put_cred(sk->sk_peer_cred);
2204 	put_pid(sk->sk_peer_pid);
2205 
2206 	if (likely(sk->sk_net_refcnt))
2207 		put_net_track(sock_net(sk), &sk->ns_tracker);
2208 	else
2209 		__netns_tracker_free(sock_net(sk), &sk->ns_tracker, false);
2210 
2211 	sk_prot_free(sk->sk_prot_creator, sk);
2212 }
2213 
2214 void sk_destruct(struct sock *sk)
2215 {
2216 	bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE);
2217 
2218 	if (rcu_access_pointer(sk->sk_reuseport_cb)) {
2219 		reuseport_detach_sock(sk);
2220 		use_call_rcu = true;
2221 	}
2222 
2223 	if (use_call_rcu)
2224 		call_rcu(&sk->sk_rcu, __sk_destruct);
2225 	else
2226 		__sk_destruct(&sk->sk_rcu);
2227 }
2228 
2229 static void __sk_free(struct sock *sk)
2230 {
2231 	if (likely(sk->sk_net_refcnt))
2232 		sock_inuse_add(sock_net(sk), -1);
2233 
2234 	if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk)))
2235 		sock_diag_broadcast_destroy(sk);
2236 	else
2237 		sk_destruct(sk);
2238 }
2239 
2240 void sk_free(struct sock *sk)
2241 {
2242 	/*
2243 	 * We subtract one from sk_wmem_alloc so we can tell whether
2244 	 * some packets are still in some tx queue.
2245 	 * If the result is not zero, sock_wfree() will call __sk_free(sk) later.
2246 	 */
2247 	if (refcount_dec_and_test(&sk->sk_wmem_alloc))
2248 		__sk_free(sk);
2249 }
2250 EXPORT_SYMBOL(sk_free);
2251 
2252 static void sk_init_common(struct sock *sk)
2253 {
2254 	skb_queue_head_init(&sk->sk_receive_queue);
2255 	skb_queue_head_init(&sk->sk_write_queue);
2256 	skb_queue_head_init(&sk->sk_error_queue);
2257 
2258 	rwlock_init(&sk->sk_callback_lock);
2259 	lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
2260 			af_rlock_keys + sk->sk_family,
2261 			af_family_rlock_key_strings[sk->sk_family]);
2262 	lockdep_set_class_and_name(&sk->sk_write_queue.lock,
2263 			af_wlock_keys + sk->sk_family,
2264 			af_family_wlock_key_strings[sk->sk_family]);
2265 	lockdep_set_class_and_name(&sk->sk_error_queue.lock,
2266 			af_elock_keys + sk->sk_family,
2267 			af_family_elock_key_strings[sk->sk_family]);
2268 	lockdep_set_class_and_name(&sk->sk_callback_lock,
2269 			af_callback_keys + sk->sk_family,
2270 			af_family_clock_key_strings[sk->sk_family]);
2271 }
2272 
2273 /**
2274  *	sk_clone_lock - clone a socket, and lock its clone
2275  *	@sk: the socket to clone
2276  *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
2277  *
2278  *	Caller must unlock socket even in error path (bh_unlock_sock(newsk))
2279  */
2280 struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
2281 {
2282 	struct proto *prot = READ_ONCE(sk->sk_prot);
2283 	struct sk_filter *filter;
2284 	bool is_charged = true;
2285 	struct sock *newsk;
2286 
2287 	newsk = sk_prot_alloc(prot, priority, sk->sk_family);
2288 	if (!newsk)
2289 		goto out;
2290 
2291 	sock_copy(newsk, sk);
2292 
2293 	newsk->sk_prot_creator = prot;
2294 
2295 	/* SANITY */
2296 	if (likely(newsk->sk_net_refcnt)) {
2297 		get_net_track(sock_net(newsk), &newsk->ns_tracker, priority);
2298 		sock_inuse_add(sock_net(newsk), 1);
2299 	} else {
2300 		/* Kernel sockets do not elevate the struct net refcount.
2301 		 * Instead, use a tracker to more easily detect whether a layer
2302 		 * fails to properly dismantle its kernel sockets at netns
2303 		 * destroy time.
2304 		 */
2305 		__netns_tracker_alloc(sock_net(newsk), &newsk->ns_tracker,
2306 				      false, priority);
2307 	}
2308 	sk_node_init(&newsk->sk_node);
2309 	sock_lock_init(newsk);
2310 	bh_lock_sock(newsk);
2311 	newsk->sk_backlog.head	= newsk->sk_backlog.tail = NULL;
2312 	newsk->sk_backlog.len = 0;
2313 
2314 	atomic_set(&newsk->sk_rmem_alloc, 0);
2315 
2316 	/* sk_wmem_alloc set to one (see sk_free() and sock_wfree()) */
2317 	refcount_set(&newsk->sk_wmem_alloc, 1);
2318 
2319 	atomic_set(&newsk->sk_omem_alloc, 0);
2320 	sk_init_common(newsk);
2321 
2322 	newsk->sk_dst_cache	= NULL;
2323 	newsk->sk_dst_pending_confirm = 0;
2324 	newsk->sk_wmem_queued	= 0;
2325 	newsk->sk_forward_alloc = 0;
2326 	newsk->sk_reserved_mem  = 0;
2327 	atomic_set(&newsk->sk_drops, 0);
2328 	newsk->sk_send_head	= NULL;
2329 	newsk->sk_userlocks	= sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
2330 	atomic_set(&newsk->sk_zckey, 0);
2331 
2332 	sock_reset_flag(newsk, SOCK_DONE);
2333 
2334 	/* sk->sk_memcg will be populated at accept() time */
2335 	newsk->sk_memcg = NULL;
2336 
2337 	cgroup_sk_clone(&newsk->sk_cgrp_data);
2338 
2339 	rcu_read_lock();
2340 	filter = rcu_dereference(sk->sk_filter);
2341 	if (filter != NULL)
2342 		/* Though it's an empty new sock, the charging may fail
2343 		 * if sysctl_optmem_max was changed between the creation of
2344 		 * the original socket and the cloning.
2345 		 */
2346 		is_charged = sk_filter_charge(newsk, filter);
2347 	RCU_INIT_POINTER(newsk->sk_filter, filter);
2348 	rcu_read_unlock();
2349 
2350 	if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
2351 		/* We need to make sure that we don't uncharge the new
2352 		 * socket if we couldn't charge it in the first place,
2353 		 * as otherwise we would uncharge the parent's filter.
2354 		 */
2355 		if (!is_charged)
2356 			RCU_INIT_POINTER(newsk->sk_filter, NULL);
2357 		sk_free_unlock_clone(newsk);
2358 		newsk = NULL;
2359 		goto out;
2360 	}
2361 	RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
2362 
2363 	if (bpf_sk_storage_clone(sk, newsk)) {
2364 		sk_free_unlock_clone(newsk);
2365 		newsk = NULL;
2366 		goto out;
2367 	}
2368 
2369 	/* Clear sk_user_data if parent had the pointer tagged
2370 	 * as not suitable for copying when cloning.
2371 	 */
2372 	if (sk_user_data_is_nocopy(newsk))
2373 		newsk->sk_user_data = NULL;
2374 
2375 	newsk->sk_err	   = 0;
2376 	newsk->sk_err_soft = 0;
2377 	newsk->sk_priority = 0;
2378 	newsk->sk_incoming_cpu = raw_smp_processor_id();
2379 
2380 	/* Before updating sk_refcnt, we must commit prior changes to memory
2381 	 * (Documentation/RCU/rculist_nulls.rst for details)
2382 	 */
2383 	smp_wmb();
2384 	refcount_set(&newsk->sk_refcnt, 2);
2385 
2386 	sk_set_socket(newsk, NULL);
2387 	sk_tx_queue_clear(newsk);
2388 	RCU_INIT_POINTER(newsk->sk_wq, NULL);
2389 
2390 	if (newsk->sk_prot->sockets_allocated)
2391 		sk_sockets_allocated_inc(newsk);
2392 
2393 	if (sock_needs_netstamp(sk) && newsk->sk_flags & SK_FLAGS_TIMESTAMP)
2394 		net_enable_timestamp();
2395 out:
2396 	return newsk;
2397 }
2398 EXPORT_SYMBOL_GPL(sk_clone_lock);
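
/*
 * Illustrative example only: the expected calling pattern around
 * sk_clone_lock().  The child is returned locked by bh_lock_sock(), so the
 * caller unlocks it once protocol-private state has been set up.
 * example_clone() is a hypothetical helper.
 */
static struct sock *example_clone(const struct sock *parent)
{
	struct sock *child = sk_clone_lock(parent, GFP_ATOMIC);

	if (!child)
		return NULL;

	/* ... initialise protocol-private fields of @child here ... */

	bh_unlock_sock(child);	/* pairs with bh_lock_sock() in sk_clone_lock() */
	return child;
}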
2399 
2400 void sk_free_unlock_clone(struct sock *sk)
2401 {
2402 	/* It is still a raw copy of the parent, so invalidate
2403 	 * the destructor and do a plain sk_free(). */
2404 	sk->sk_destruct = NULL;
2405 	bh_unlock_sock(sk);
2406 	sk_free(sk);
2407 }
2408 EXPORT_SYMBOL_GPL(sk_free_unlock_clone);
2409 
2410 static u32 sk_dst_gso_max_size(struct sock *sk, struct dst_entry *dst)
2411 {
2412 	bool is_ipv6 = false;
2413 	u32 max_size;
2414 
2415 #if IS_ENABLED(CONFIG_IPV6)
2416 	is_ipv6 = (sk->sk_family == AF_INET6 &&
2417 		   !ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr));
2418 #endif
2419 	/* pairs with the WRITE_ONCE() in netif_set_gso(_ipv4)_max_size() */
2420 	max_size = is_ipv6 ? READ_ONCE(dst->dev->gso_max_size) :
2421 			READ_ONCE(dst->dev->gso_ipv4_max_size);
2422 	if (max_size > GSO_LEGACY_MAX_SIZE && !sk_is_tcp(sk))
2423 		max_size = GSO_LEGACY_MAX_SIZE;
2424 
2425 	return max_size - (MAX_TCP_HEADER + 1);
2426 }
2427 
2428 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
2429 {
2430 	u32 max_segs = 1;
2431 
2432 	sk->sk_route_caps = dst->dev->features;
2433 	if (sk_is_tcp(sk))
2434 		sk->sk_route_caps |= NETIF_F_GSO;
2435 	if (sk->sk_route_caps & NETIF_F_GSO)
2436 		sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
2437 	if (unlikely(sk->sk_gso_disabled))
2438 		sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
2439 	if (sk_can_gso(sk)) {
2440 		if (dst->header_len && !xfrm_dst_offload_ok(dst)) {
2441 			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
2442 		} else {
2443 			sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
2444 			sk->sk_gso_max_size = sk_dst_gso_max_size(sk, dst);
2445 			/* pairs with the WRITE_ONCE() in netif_set_gso_max_segs() */
2446 			max_segs = max_t(u32, READ_ONCE(dst->dev->gso_max_segs), 1);
2447 		}
2448 	}
2449 	sk->sk_gso_max_segs = max_segs;
2450 	sk_dst_set(sk, dst);
2451 }
2452 EXPORT_SYMBOL_GPL(sk_setup_caps);
2453 
2454 /*
2455  *	Simple resource managers for sockets.
2456  */
2457 
2458 
2459 /*
2460  * Write buffer destructor automatically called from kfree_skb.
2461  */
2462 void sock_wfree(struct sk_buff *skb)
2463 {
2464 	struct sock *sk = skb->sk;
2465 	unsigned int len = skb->truesize;
2466 	bool free;
2467 
2468 	if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
2469 		if (sock_flag(sk, SOCK_RCU_FREE) &&
2470 		    sk->sk_write_space == sock_def_write_space) {
2471 			rcu_read_lock();
2472 			free = refcount_sub_and_test(len, &sk->sk_wmem_alloc);
2473 			sock_def_write_space_wfree(sk);
2474 			rcu_read_unlock();
2475 			if (unlikely(free))
2476 				__sk_free(sk);
2477 			return;
2478 		}
2479 
2480 		/*
2481 		 * Keep a reference on sk_wmem_alloc; it will be released
2482 		 * after the sk_write_space() call.
2483 		 */
2484 		WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
2485 		sk->sk_write_space(sk);
2486 		len = 1;
2487 	}
2488 	/*
2489 	 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
2490 	 * could not do because of in-flight packets
2491 	 */
2492 	if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
2493 		__sk_free(sk);
2494 }
2495 EXPORT_SYMBOL(sock_wfree);
2496 
2497 /* This variant of sock_wfree() is used by TCP,
2498  * since it sets SOCK_USE_WRITE_QUEUE.
2499  */
2500 void __sock_wfree(struct sk_buff *skb)
2501 {
2502 	struct sock *sk = skb->sk;
2503 
2504 	if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
2505 		__sk_free(sk);
2506 }
2507 
2508 void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
2509 {
2510 	skb_orphan(skb);
2511 	skb->sk = sk;
2512 #ifdef CONFIG_INET
2513 	if (unlikely(!sk_fullsock(sk))) {
2514 		skb->destructor = sock_edemux;
2515 		sock_hold(sk);
2516 		return;
2517 	}
2518 #endif
2519 	skb->destructor = sock_wfree;
2520 	skb_set_hash_from_sk(skb, sk);
2521 	/*
2522 	 * We used to take a refcount on sk, but the following operation
2523 	 * is enough to guarantee that sk_free() won't free this sock until
2524 	 * all in-flight packets are completed.
2525 	 */
2526 	refcount_add(skb->truesize, &sk->sk_wmem_alloc);
2527 }
2528 EXPORT_SYMBOL(skb_set_owner_w);
2529 
2530 static bool can_skb_orphan_partial(const struct sk_buff *skb)
2531 {
2532 #ifdef CONFIG_TLS_DEVICE
2533 	/* Drivers depend on in-order delivery for crypto offload;
2534 	 * a partial orphan breaks the out-of-order-OK logic.
2535 	 */
2536 	if (skb->decrypted)
2537 		return false;
2538 #endif
2539 	return (skb->destructor == sock_wfree ||
2540 		(IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree));
2541 }
2542 
2543 /* This helper is used by netem, as it can hold packets in its
2544  * delay queue. We want to allow the owner socket to send more
2545  * packets, as if they were already TX completed by a typical driver.
2546  * But we also want to keep skb->sk set because some packet schedulers
2547  * rely on it (sch_fq for example).
2548  */
2549 void skb_orphan_partial(struct sk_buff *skb)
2550 {
2551 	if (skb_is_tcp_pure_ack(skb))
2552 		return;
2553 
2554 	if (can_skb_orphan_partial(skb) && skb_set_owner_sk_safe(skb, skb->sk))
2555 		return;
2556 
2557 	skb_orphan(skb);
2558 }
2559 EXPORT_SYMBOL(skb_orphan_partial);
2560 
2561 /*
2562  * Read buffer destructor automatically called from kfree_skb.
2563  */
2564 void sock_rfree(struct sk_buff *skb)
2565 {
2566 	struct sock *sk = skb->sk;
2567 	unsigned int len = skb->truesize;
2568 
2569 	atomic_sub(len, &sk->sk_rmem_alloc);
2570 	sk_mem_uncharge(sk, len);
2571 }
2572 EXPORT_SYMBOL(sock_rfree);
2573 
2574 /*
2575  * Buffer destructor for skbs that are not used directly in read or write
2576  * path, e.g. for error handler skbs. Automatically called from kfree_skb.
2577  */
2578 void sock_efree(struct sk_buff *skb)
2579 {
2580 	sock_put(skb->sk);
2581 }
2582 EXPORT_SYMBOL(sock_efree);
2583 
2584 /* Buffer destructor for prefetch/receive path where reference count may
2585  * not be held, e.g. for listen sockets.
2586  */
2587 #ifdef CONFIG_INET
2588 void sock_pfree(struct sk_buff *skb)
2589 {
2590 	if (sk_is_refcounted(skb->sk))
2591 		sock_gen_put(skb->sk);
2592 }
2593 EXPORT_SYMBOL(sock_pfree);
2594 #endif /* CONFIG_INET */
2595 
2596 kuid_t sock_i_uid(struct sock *sk)
2597 {
2598 	kuid_t uid;
2599 
2600 	read_lock_bh(&sk->sk_callback_lock);
2601 	uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
2602 	read_unlock_bh(&sk->sk_callback_lock);
2603 	return uid;
2604 }
2605 EXPORT_SYMBOL(sock_i_uid);
2606 
2607 unsigned long sock_i_ino(struct sock *sk)
2608 {
2609 	unsigned long ino;
2610 
2611 	read_lock_bh(&sk->sk_callback_lock);
2612 	ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
2613 	read_unlock_bh(&sk->sk_callback_lock);
2614 	return ino;
2615 }
2616 EXPORT_SYMBOL(sock_i_ino);
2617 
2618 /*
2619  * Allocate an skb from the socket's send buffer.
2620  */
2621 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
2622 			     gfp_t priority)
2623 {
2624 	if (force ||
2625 	    refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) {
2626 		struct sk_buff *skb = alloc_skb(size, priority);
2627 
2628 		if (skb) {
2629 			skb_set_owner_w(skb, sk);
2630 			return skb;
2631 		}
2632 	}
2633 	return NULL;
2634 }
2635 EXPORT_SYMBOL(sock_wmalloc);
2636 
2637 static void sock_ofree(struct sk_buff *skb)
2638 {
2639 	struct sock *sk = skb->sk;
2640 
2641 	atomic_sub(skb->truesize, &sk->sk_omem_alloc);
2642 }
2643 
2644 struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
2645 			     gfp_t priority)
2646 {
2647 	struct sk_buff *skb;
2648 
2649 	/* small safe race: SKB_TRUESIZE may differ from final skb->truesize */
2650 	if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) >
2651 	    READ_ONCE(sysctl_optmem_max))
2652 		return NULL;
2653 
2654 	skb = alloc_skb(size, priority);
2655 	if (!skb)
2656 		return NULL;
2657 
2658 	atomic_add(skb->truesize, &sk->sk_omem_alloc);
2659 	skb->sk = sk;
2660 	skb->destructor = sock_ofree;
2661 	return skb;
2662 }
2663 
2664 /*
2665  * Allocate a memory block from the socket's option memory buffer.
2666  */
2667 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
2668 {
2669 	int optmem_max = READ_ONCE(sysctl_optmem_max);
2670 
2671 	if ((unsigned int)size <= optmem_max &&
2672 	    atomic_read(&sk->sk_omem_alloc) + size < optmem_max) {
2673 		void *mem;
2674 		/* First do the add, to avoid the race if kmalloc
2675 		 * might sleep.
2676 		 */
2677 		atomic_add(size, &sk->sk_omem_alloc);
2678 		mem = kmalloc(size, priority);
2679 		if (mem)
2680 			return mem;
2681 		atomic_sub(size, &sk->sk_omem_alloc);
2682 	}
2683 	return NULL;
2684 }
2685 EXPORT_SYMBOL(sock_kmalloc);
2686 
2687 /* Free an option memory block. Note that we actually want the inline
2688  * here, as it allows gcc to detect the nullify and fold away the
2689  * condition entirely.
2690  */
2691 static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
2692 				  const bool nullify)
2693 {
2694 	if (WARN_ON_ONCE(!mem))
2695 		return;
2696 	if (nullify)
2697 		kfree_sensitive(mem);
2698 	else
2699 		kfree(mem);
2700 	atomic_sub(size, &sk->sk_omem_alloc);
2701 }
2702 
2703 void sock_kfree_s(struct sock *sk, void *mem, int size)
2704 {
2705 	__sock_kfree_s(sk, mem, size, false);
2706 }
2707 EXPORT_SYMBOL(sock_kfree_s);
2708 
2709 void sock_kzfree_s(struct sock *sk, void *mem, int size)
2710 {
2711 	__sock_kfree_s(sk, mem, size, true);
2712 }
2713 EXPORT_SYMBOL(sock_kzfree_s);
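
/*
 * Illustrative example only: memory from sock_kmalloc() is charged to
 * sk->sk_omem_alloc and must be released with sock_kfree_s() (or
 * sock_kzfree_s() for sensitive data) using the same size.
 * example_store_key() is a hypothetical setsockopt-style helper.
 */
static int example_store_key(struct sock *sk, sockptr_t optval, int optlen)
{
	u8 *key = sock_kmalloc(sk, optlen, GFP_KERNEL);

	if (!key)
		return -ENOBUFS;

	if (copy_from_sockptr(key, optval, optlen)) {
		sock_kzfree_s(sk, key, optlen);	/* zeroes before freeing */
		return -EFAULT;
	}

	/* ... hand @key to the protocol ... */
	sock_kzfree_s(sk, key, optlen);
	return 0;
}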
2714 
2715 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
2716    I think these locks should be removed for datagram sockets.
2717  */
2718 static long sock_wait_for_wmem(struct sock *sk, long timeo)
2719 {
2720 	DEFINE_WAIT(wait);
2721 
2722 	sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2723 	for (;;) {
2724 		if (!timeo)
2725 			break;
2726 		if (signal_pending(current))
2727 			break;
2728 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2729 		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
2730 		if (refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf))
2731 			break;
2732 		if (sk->sk_shutdown & SEND_SHUTDOWN)
2733 			break;
2734 		if (sk->sk_err)
2735 			break;
2736 		timeo = schedule_timeout(timeo);
2737 	}
2738 	finish_wait(sk_sleep(sk), &wait);
2739 	return timeo;
2740 }
2741 
2742 
2743 /*
2744  *	Generic send/receive buffer handlers
2745  */
2746 
2747 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
2748 				     unsigned long data_len, int noblock,
2749 				     int *errcode, int max_page_order)
2750 {
2751 	struct sk_buff *skb;
2752 	long timeo;
2753 	int err;
2754 
2755 	timeo = sock_sndtimeo(sk, noblock);
2756 	for (;;) {
2757 		err = sock_error(sk);
2758 		if (err != 0)
2759 			goto failure;
2760 
2761 		err = -EPIPE;
2762 		if (sk->sk_shutdown & SEND_SHUTDOWN)
2763 			goto failure;
2764 
2765 		if (sk_wmem_alloc_get(sk) < READ_ONCE(sk->sk_sndbuf))
2766 			break;
2767 
2768 		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2769 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2770 		err = -EAGAIN;
2771 		if (!timeo)
2772 			goto failure;
2773 		if (signal_pending(current))
2774 			goto interrupted;
2775 		timeo = sock_wait_for_wmem(sk, timeo);
2776 	}
2777 	skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
2778 				   errcode, sk->sk_allocation);
2779 	if (skb)
2780 		skb_set_owner_w(skb, sk);
2781 	return skb;
2782 
2783 interrupted:
2784 	err = sock_intr_errno(timeo);
2785 failure:
2786 	*errcode = err;
2787 	return NULL;
2788 }
2789 EXPORT_SYMBOL(sock_alloc_send_pskb);
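
/*
 * Illustrative example only: a datagram-style sendmsg() usually reserves
 * header room, lets sock_alloc_send_pskb() block until sndbuf space is
 * available (or the send timeout expires), then copies the payload in.
 * example_sendmsg() is hypothetical and omits the actual transmit step.
 */
static int example_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
{
	int noblock = msg->msg_flags & MSG_DONTWAIT;
	struct sk_buff *skb;
	int err;

	skb = sock_alloc_send_pskb(sk, MAX_HEADER + len, 0, noblock, &err, 0);
	if (!skb)
		return err;

	skb_reserve(skb, MAX_HEADER);
	err = memcpy_from_msg(skb_put(skb, len), msg, len);
	if (err) {
		kfree_skb(skb);
		return err;
	}

	/* A real protocol would now queue the skb towards the device layer. */
	kfree_skb(skb);
	return len;
}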
2790 
2791 int __sock_cmsg_send(struct sock *sk, struct cmsghdr *cmsg,
2792 		     struct sockcm_cookie *sockc)
2793 {
2794 	u32 tsflags;
2795 
2796 	switch (cmsg->cmsg_type) {
2797 	case SO_MARK:
2798 		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
2799 		    !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2800 			return -EPERM;
2801 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2802 			return -EINVAL;
2803 		sockc->mark = *(u32 *)CMSG_DATA(cmsg);
2804 		break;
2805 	case SO_TIMESTAMPING_OLD:
2806 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2807 			return -EINVAL;
2808 
2809 		tsflags = *(u32 *)CMSG_DATA(cmsg);
2810 		if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
2811 			return -EINVAL;
2812 
2813 		sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
2814 		sockc->tsflags |= tsflags;
2815 		break;
2816 	case SCM_TXTIME:
2817 		if (!sock_flag(sk, SOCK_TXTIME))
2818 			return -EINVAL;
2819 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64)))
2820 			return -EINVAL;
2821 		sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg));
2822 		break;
2823 	/* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
2824 	case SCM_RIGHTS:
2825 	case SCM_CREDENTIALS:
2826 		break;
2827 	default:
2828 		return -EINVAL;
2829 	}
2830 	return 0;
2831 }
2832 EXPORT_SYMBOL(__sock_cmsg_send);
2833 
2834 int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
2835 		   struct sockcm_cookie *sockc)
2836 {
2837 	struct cmsghdr *cmsg;
2838 	int ret;
2839 
2840 	for_each_cmsghdr(cmsg, msg) {
2841 		if (!CMSG_OK(msg, cmsg))
2842 			return -EINVAL;
2843 		if (cmsg->cmsg_level != SOL_SOCKET)
2844 			continue;
2845 		ret = __sock_cmsg_send(sk, cmsg, sockc);
2846 		if (ret)
2847 			return ret;
2848 	}
2849 	return 0;
2850 }
2851 EXPORT_SYMBOL(sock_cmsg_send);
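
/*
 * Illustrative example only: sendmsg() paths seed a sockcm_cookie with the
 * socket's defaults and then let sock_cmsg_send() apply SOL_SOCKET control
 * messages (SO_MARK, SO_TIMESTAMPING_OLD, SCM_TXTIME) on top of them.
 * example_collect_cmsg() is a hypothetical helper.
 */
static int example_collect_cmsg(struct sock *sk, struct msghdr *msg,
				struct sockcm_cookie *sockc)
{
	*sockc = (struct sockcm_cookie) {
		.tsflags = READ_ONCE(sk->sk_tsflags),
		.mark	 = READ_ONCE(sk->sk_mark),
	};

	if (!msg->msg_controllen)
		return 0;

	/* Returns -EINVAL/-EPERM on malformed or unprivileged cmsgs. */
	return sock_cmsg_send(sk, msg, sockc);
}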
2852 
2853 static void sk_enter_memory_pressure(struct sock *sk)
2854 {
2855 	if (!sk->sk_prot->enter_memory_pressure)
2856 		return;
2857 
2858 	sk->sk_prot->enter_memory_pressure(sk);
2859 }
2860 
2861 static void sk_leave_memory_pressure(struct sock *sk)
2862 {
2863 	if (sk->sk_prot->leave_memory_pressure) {
2864 		INDIRECT_CALL_INET_1(sk->sk_prot->leave_memory_pressure,
2865 				     tcp_leave_memory_pressure, sk);
2866 	} else {
2867 		unsigned long *memory_pressure = sk->sk_prot->memory_pressure;
2868 
2869 		if (memory_pressure && READ_ONCE(*memory_pressure))
2870 			WRITE_ONCE(*memory_pressure, 0);
2871 	}
2872 }
2873 
2874 DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key);
2875 
2876 /**
2877  * skb_page_frag_refill - check that a page_frag contains enough room
2878  * @sz: minimum size of the fragment we want to get
2879  * @pfrag: pointer to page_frag
2880  * @gfp: priority for memory allocation
2881  *
2882  * Note: While this allocator tries to use high order pages, there is
2883  * no guarantee that allocations succeed. Therefore, @sz MUST be
2884  * less than or equal to PAGE_SIZE.
2885  */
2886 bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
2887 {
2888 	if (pfrag->page) {
2889 		if (page_ref_count(pfrag->page) == 1) {
2890 			pfrag->offset = 0;
2891 			return true;
2892 		}
2893 		if (pfrag->offset + sz <= pfrag->size)
2894 			return true;
2895 		put_page(pfrag->page);
2896 	}
2897 
2898 	pfrag->offset = 0;
2899 	if (SKB_FRAG_PAGE_ORDER &&
2900 	    !static_branch_unlikely(&net_high_order_alloc_disable_key)) {
2901 		/* Avoid direct reclaim but allow kswapd to wake */
2902 		pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
2903 					  __GFP_COMP | __GFP_NOWARN |
2904 					  __GFP_NORETRY,
2905 					  SKB_FRAG_PAGE_ORDER);
2906 		if (likely(pfrag->page)) {
2907 			pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
2908 			return true;
2909 		}
2910 	}
2911 	pfrag->page = alloc_page(gfp);
2912 	if (likely(pfrag->page)) {
2913 		pfrag->size = PAGE_SIZE;
2914 		return true;
2915 	}
2916 	return false;
2917 }
2918 EXPORT_SYMBOL(skb_page_frag_refill);
2919 
2920 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
2921 {
2922 	if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
2923 		return true;
2924 
2925 	sk_enter_memory_pressure(sk);
2926 	sk_stream_moderate_sndbuf(sk);
2927 	return false;
2928 }
2929 EXPORT_SYMBOL(sk_page_frag_refill);
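
/*
 * Illustrative example only: a stream-style sender refills the per-socket
 * page_frag and copies payload into it before attaching the page to an skb
 * fragment.  Memory accounting and frag coalescing are omitted here;
 * example_append_data() is a hypothetical helper.
 */
static int example_append_data(struct sock *sk, struct msghdr *msg,
			       struct sk_buff *skb, int copy)
{
	struct page_frag *pfrag = sk_page_frag(sk);
	int err;

	if (!sk_page_frag_refill(sk, pfrag))
		return -EAGAIN;		/* caller should wait for memory */

	err = skb_copy_to_page_nocache(sk, &msg->msg_iter, skb,
				       pfrag->page, pfrag->offset, copy);
	if (err)
		return err;

	skb_fill_page_desc(skb, skb_shinfo(skb)->nr_frags,
			   pfrag->page, pfrag->offset, copy);
	get_page(pfrag->page);
	pfrag->offset += copy;
	return copy;
}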
2930 
2931 void __lock_sock(struct sock *sk)
2932 	__releases(&sk->sk_lock.slock)
2933 	__acquires(&sk->sk_lock.slock)
2934 {
2935 	DEFINE_WAIT(wait);
2936 
2937 	for (;;) {
2938 		prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
2939 					TASK_UNINTERRUPTIBLE);
2940 		spin_unlock_bh(&sk->sk_lock.slock);
2941 		schedule();
2942 		spin_lock_bh(&sk->sk_lock.slock);
2943 		if (!sock_owned_by_user(sk))
2944 			break;
2945 	}
2946 	finish_wait(&sk->sk_lock.wq, &wait);
2947 }
2948 
2949 void __release_sock(struct sock *sk)
2950 	__releases(&sk->sk_lock.slock)
2951 	__acquires(&sk->sk_lock.slock)
2952 {
2953 	struct sk_buff *skb, *next;
2954 
2955 	while ((skb = sk->sk_backlog.head) != NULL) {
2956 		sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
2957 
2958 		spin_unlock_bh(&sk->sk_lock.slock);
2959 
2960 		do {
2961 			next = skb->next;
2962 			prefetch(next);
2963 			DEBUG_NET_WARN_ON_ONCE(skb_dst_is_noref(skb));
2964 			skb_mark_not_on_list(skb);
2965 			sk_backlog_rcv(sk, skb);
2966 
2967 			cond_resched();
2968 
2969 			skb = next;
2970 		} while (skb != NULL);
2971 
2972 		spin_lock_bh(&sk->sk_lock.slock);
2973 	}
2974 
2975 	/*
2976 	 * Doing the zeroing here guarantees we cannot loop forever
2977 	 * while a wild producer attempts to flood us.
2978 	 */
2979 	sk->sk_backlog.len = 0;
2980 }
2981 
2982 void __sk_flush_backlog(struct sock *sk)
2983 {
2984 	spin_lock_bh(&sk->sk_lock.slock);
2985 	__release_sock(sk);
2986 	spin_unlock_bh(&sk->sk_lock.slock);
2987 }
2988 EXPORT_SYMBOL_GPL(__sk_flush_backlog);
2989 
2990 /**
2991  * sk_wait_data - wait for data to arrive at sk_receive_queue
2992  * @sk:    sock to wait on
2993  * @timeo: for how long
2994  * @skb:   last skb seen on sk_receive_queue
2995  *
2996  * Socket state, including sk->sk_err, is changed only under the lock,
2997  * hence we may omit checks after joining the wait queue.
2998  * We check the receive queue before schedule() only as an optimization;
2999  * it is very likely that release_sock() added new data.
3000  */
3001 int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
3002 {
3003 	DEFINE_WAIT_FUNC(wait, woken_wake_function);
3004 	int rc;
3005 
3006 	add_wait_queue(sk_sleep(sk), &wait);
3007 	sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
3008 	rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
3009 	sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
3010 	remove_wait_queue(sk_sleep(sk), &wait);
3011 	return rc;
3012 }
3013 EXPORT_SYMBOL(sk_wait_data);
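
/*
 * Illustrative example only: a blocking receive loop built around
 * sk_wait_data().  The caller holds the socket lock; sk_wait_data() drops
 * and re-takes it inside sk_wait_event() while sleeping.
 * example_wait_for_packet() is a hypothetical helper.
 */
static struct sk_buff *example_wait_for_packet(struct sock *sk, int flags,
					       int *err)
{
	long timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
	struct sk_buff *skb;

	while (!(skb = skb_peek(&sk->sk_receive_queue))) {
		if (sk->sk_err) {
			*err = sock_error(sk);
			return NULL;
		}
		if (!timeo) {
			*err = -EAGAIN;
			return NULL;
		}
		if (signal_pending(current)) {
			*err = sock_intr_errno(timeo);
			return NULL;
		}
		sk_wait_data(sk, &timeo, NULL);
	}
	return skb;
}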
3014 
3015 /**
3016  *	__sk_mem_raise_allocated - increase memory_allocated
3017  *	@sk: socket
3018  *	@size: memory size to allocate
3019  *	@amt: pages to allocate
3020  *	@kind: allocation type
3021  *
3022  *	Similar to __sk_mem_schedule(), but does not update sk_forward_alloc
3023  */
3024 int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
3025 {
3026 	bool memcg_charge = mem_cgroup_sockets_enabled && sk->sk_memcg;
3027 	struct proto *prot = sk->sk_prot;
3028 	bool charged = true;
3029 	long allocated;
3030 
3031 	sk_memory_allocated_add(sk, amt);
3032 	allocated = sk_memory_allocated(sk);
3033 	if (memcg_charge &&
3034 	    !(charged = mem_cgroup_charge_skmem(sk->sk_memcg, amt,
3035 						gfp_memcg_charge())))
3036 		goto suppress_allocation;
3037 
3038 	/* Under limit. */
3039 	if (allocated <= sk_prot_mem_limits(sk, 0)) {
3040 		sk_leave_memory_pressure(sk);
3041 		return 1;
3042 	}
3043 
3044 	/* Under pressure. */
3045 	if (allocated > sk_prot_mem_limits(sk, 1))
3046 		sk_enter_memory_pressure(sk);
3047 
3048 	/* Over hard limit. */
3049 	if (allocated > sk_prot_mem_limits(sk, 2))
3050 		goto suppress_allocation;
3051 
3052 	/* guarantee minimum buffer size under pressure */
3053 	if (kind == SK_MEM_RECV) {
3054 		if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot))
3055 			return 1;
3056 
3057 	} else { /* SK_MEM_SEND */
3058 		int wmem0 = sk_get_wmem0(sk, prot);
3059 
3060 		if (sk->sk_type == SOCK_STREAM) {
3061 			if (sk->sk_wmem_queued < wmem0)
3062 				return 1;
3063 		} else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) {
3064 			return 1;
3065 		}
3066 	}
3067 
3068 	if (sk_has_memory_pressure(sk)) {
3069 		u64 alloc;
3070 
3071 		if (!sk_under_memory_pressure(sk))
3072 			return 1;
3073 		alloc = sk_sockets_allocated_read_positive(sk);
3074 		if (sk_prot_mem_limits(sk, 2) > alloc *
3075 		    sk_mem_pages(sk->sk_wmem_queued +
3076 				 atomic_read(&sk->sk_rmem_alloc) +
3077 				 sk->sk_forward_alloc))
3078 			return 1;
3079 	}
3080 
3081 suppress_allocation:
3082 
3083 	if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
3084 		sk_stream_moderate_sndbuf(sk);
3085 
3086 		/* Fail only if socket is _under_ its sndbuf.
3087 		 * In this case we cannot block, so we have to fail.
3088 		 */
3089 		if (sk->sk_wmem_queued + size >= sk->sk_sndbuf) {
3090 			/* Force charge with __GFP_NOFAIL */
3091 			if (memcg_charge && !charged) {
3092 				mem_cgroup_charge_skmem(sk->sk_memcg, amt,
3093 					gfp_memcg_charge() | __GFP_NOFAIL);
3094 			}
3095 			return 1;
3096 		}
3097 	}
3098 
3099 	if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged))
3100 		trace_sock_exceed_buf_limit(sk, prot, allocated, kind);
3101 
3102 	sk_memory_allocated_sub(sk, amt);
3103 
3104 	if (memcg_charge && charged)
3105 		mem_cgroup_uncharge_skmem(sk->sk_memcg, amt);
3106 
3107 	return 0;
3108 }
3109 
3110 /**
3111  *	__sk_mem_schedule - increase sk_forward_alloc and memory_allocated
3112  *	@sk: socket
3113  *	@size: memory size to allocate
3114  *	@kind: allocation type
3115  *
3116  *	If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
3117  *	rmem allocation. This function assumes that protocols which have
3118  *	memory_pressure use sk_wmem_queued as write buffer accounting.
3119  */
3120 int __sk_mem_schedule(struct sock *sk, int size, int kind)
3121 {
3122 	int ret, amt = sk_mem_pages(size);
3123 
3124 	sk->sk_forward_alloc += amt << PAGE_SHIFT;
3125 	ret = __sk_mem_raise_allocated(sk, size, amt, kind);
3126 	if (!ret)
3127 		sk->sk_forward_alloc -= amt << PAGE_SHIFT;
3128 	return ret;
3129 }
3130 EXPORT_SYMBOL(__sk_mem_schedule);
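
/*
 * Illustrative example only: receive paths normally charge an skb through
 * the sk_rmem_schedule() helper from <net/sock.h>, which falls back to
 * __sk_mem_schedule() once sk_forward_alloc runs out.
 * example_charge_rmem() is a hypothetical helper.
 */
static int example_charge_rmem(struct sock *sk, struct sk_buff *skb)
{
	if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >
	    (unsigned int)READ_ONCE(sk->sk_rcvbuf))
		return -ENOBUFS;

	if (!sk_rmem_schedule(sk, skb, skb->truesize))
		return -ENOBUFS;

	/* Charges sk_rmem_alloc and consumes the scheduled forward_alloc. */
	skb_set_owner_r(skb, sk);
	return 0;
}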
3131 
3132 /**
3133  *	__sk_mem_reduce_allocated - reclaim memory_allocated
3134  *	@sk: socket
3135  *	@amount: number of quanta
3136  *
3137  *	Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
3138  */
3139 void __sk_mem_reduce_allocated(struct sock *sk, int amount)
3140 {
3141 	sk_memory_allocated_sub(sk, amount);
3142 
3143 	if (mem_cgroup_sockets_enabled && sk->sk_memcg)
3144 		mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
3145 
3146 	if (sk_under_memory_pressure(sk) &&
3147 	    (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
3148 		sk_leave_memory_pressure(sk);
3149 }
3150 
3151 /**
3152  *	__sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
3153  *	@sk: socket
3154  *	@amount: number of bytes (rounded down to a PAGE_SIZE multiple)
3155  */
3156 void __sk_mem_reclaim(struct sock *sk, int amount)
3157 {
3158 	amount >>= PAGE_SHIFT;
3159 	sk->sk_forward_alloc -= amount << PAGE_SHIFT;
3160 	__sk_mem_reduce_allocated(sk, amount);
3161 }
3162 EXPORT_SYMBOL(__sk_mem_reclaim);
3163 
3164 int sk_set_peek_off(struct sock *sk, int val)
3165 {
3166 	sk->sk_peek_off = val;
3167 	return 0;
3168 }
3169 EXPORT_SYMBOL_GPL(sk_set_peek_off);
3170 
3171 /*
3172  * Set of default routines for initialising struct proto_ops when
3173  * the protocol does not support a particular function. In certain
3174  * cases where it makes no sense for a protocol to have a "do nothing"
3175  * function, some default processing is provided.
3176  */
3177 
3178 int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
3179 {
3180 	return -EOPNOTSUPP;
3181 }
3182 EXPORT_SYMBOL(sock_no_bind);
3183 
3184 int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
3185 		    int len, int flags)
3186 {
3187 	return -EOPNOTSUPP;
3188 }
3189 EXPORT_SYMBOL(sock_no_connect);
3190 
3191 int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
3192 {
3193 	return -EOPNOTSUPP;
3194 }
3195 EXPORT_SYMBOL(sock_no_socketpair);
3196 
3197 int sock_no_accept(struct socket *sock, struct socket *newsock, int flags,
3198 		   bool kern)
3199 {
3200 	return -EOPNOTSUPP;
3201 }
3202 EXPORT_SYMBOL(sock_no_accept);
3203 
3204 int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
3205 		    int peer)
3206 {
3207 	return -EOPNOTSUPP;
3208 }
3209 EXPORT_SYMBOL(sock_no_getname);
3210 
3211 int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
3212 {
3213 	return -EOPNOTSUPP;
3214 }
3215 EXPORT_SYMBOL(sock_no_ioctl);
3216 
3217 int sock_no_listen(struct socket *sock, int backlog)
3218 {
3219 	return -EOPNOTSUPP;
3220 }
3221 EXPORT_SYMBOL(sock_no_listen);
3222 
3223 int sock_no_shutdown(struct socket *sock, int how)
3224 {
3225 	return -EOPNOTSUPP;
3226 }
3227 EXPORT_SYMBOL(sock_no_shutdown);
3228 
3229 int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
3230 {
3231 	return -EOPNOTSUPP;
3232 }
3233 EXPORT_SYMBOL(sock_no_sendmsg);
3234 
3235 int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len)
3236 {
3237 	return -EOPNOTSUPP;
3238 }
3239 EXPORT_SYMBOL(sock_no_sendmsg_locked);
3240 
3241 int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
3242 		    int flags)
3243 {
3244 	return -EOPNOTSUPP;
3245 }
3246 EXPORT_SYMBOL(sock_no_recvmsg);
3247 
3248 int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
3249 {
3250 	/* Mirror missing mmap method error code */
3251 	return -ENODEV;
3252 }
3253 EXPORT_SYMBOL(sock_no_mmap);
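
/*
 * Illustrative example only: a protocol with no connection-oriented
 * operations simply points the corresponding proto_ops members at the
 * sock_no_*() stubs above.  example_dgram_ops is hypothetical and the
 * remaining members (release, bind, sendmsg, recvmsg, ...) are omitted.
 */
static const struct proto_ops example_dgram_ops = {
	.family		= PF_INET,
	.owner		= THIS_MODULE,
	.connect	= sock_no_connect,
	.socketpair	= sock_no_socketpair,
	.accept		= sock_no_accept,
	.listen		= sock_no_listen,
	.mmap		= sock_no_mmap,
};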
3254 
3255 /*
3256  * When a file is received (via SCM_RIGHTS, etc), we must bump the
3257  * various sock-based usage counts.
3258  */
3259 void __receive_sock(struct file *file)
3260 {
3261 	struct socket *sock;
3262 
3263 	sock = sock_from_file(file);
3264 	if (sock) {
3265 		sock_update_netprioidx(&sock->sk->sk_cgrp_data);
3266 		sock_update_classid(&sock->sk->sk_cgrp_data);
3267 	}
3268 }
3269 
3270 ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
3271 {
3272 	ssize_t res;
3273 	struct msghdr msg = {.msg_flags = flags};
3274 	struct kvec iov;
3275 	char *kaddr = kmap(page);
3276 	iov.iov_base = kaddr + offset;
3277 	iov.iov_len = size;
3278 	res = kernel_sendmsg(sock, &msg, &iov, 1, size);
3279 	kunmap(page);
3280 	return res;
3281 }
3282 EXPORT_SYMBOL(sock_no_sendpage);
3283 
3284 ssize_t sock_no_sendpage_locked(struct sock *sk, struct page *page,
3285 				int offset, size_t size, int flags)
3286 {
3287 	ssize_t res;
3288 	struct msghdr msg = {.msg_flags = flags};
3289 	struct kvec iov;
3290 	char *kaddr = kmap(page);
3291 
3292 	iov.iov_base = kaddr + offset;
3293 	iov.iov_len = size;
3294 	res = kernel_sendmsg_locked(sk, &msg, &iov, 1, size);
3295 	kunmap(page);
3296 	return res;
3297 }
3298 EXPORT_SYMBOL(sock_no_sendpage_locked);
3299 
3300 /*
3301  *	Default Socket Callbacks
3302  */
3303 
3304 static void sock_def_wakeup(struct sock *sk)
3305 {
3306 	struct socket_wq *wq;
3307 
3308 	rcu_read_lock();
3309 	wq = rcu_dereference(sk->sk_wq);
3310 	if (skwq_has_sleeper(wq))
3311 		wake_up_interruptible_all(&wq->wait);
3312 	rcu_read_unlock();
3313 }
3314 
3315 static void sock_def_error_report(struct sock *sk)
3316 {
3317 	struct socket_wq *wq;
3318 
3319 	rcu_read_lock();
3320 	wq = rcu_dereference(sk->sk_wq);
3321 	if (skwq_has_sleeper(wq))
3322 		wake_up_interruptible_poll(&wq->wait, EPOLLERR);
3323 	sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
3324 	rcu_read_unlock();
3325 }
3326 
3327 void sock_def_readable(struct sock *sk)
3328 {
3329 	struct socket_wq *wq;
3330 
3331 	trace_sk_data_ready(sk);
3332 
3333 	rcu_read_lock();
3334 	wq = rcu_dereference(sk->sk_wq);
3335 	if (skwq_has_sleeper(wq))
3336 		wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
3337 						EPOLLRDNORM | EPOLLRDBAND);
3338 	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
3339 	rcu_read_unlock();
3340 }
3341 
3342 static void sock_def_write_space(struct sock *sk)
3343 {
3344 	struct socket_wq *wq;
3345 
3346 	rcu_read_lock();
3347 
3348 	/* Do not wake up a writer until he can make "significant"
3349 	 * progress.  --DaveM
3350 	 */
3351 	if (sock_writeable(sk)) {
3352 		wq = rcu_dereference(sk->sk_wq);
3353 		if (skwq_has_sleeper(wq))
3354 			wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
3355 						EPOLLWRNORM | EPOLLWRBAND);
3356 
3357 		/* Should agree with poll, otherwise some programs break */
3358 		sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
3359 	}
3360 
3361 	rcu_read_unlock();
3362 }
3363 
3364 /* An optimised version of sock_def_write_space(), should only be called
3365  * for SOCK_RCU_FREE sockets under RCU read section and after putting
3366  * ->sk_wmem_alloc.
3367  */
3368 static void sock_def_write_space_wfree(struct sock *sk)
3369 {
3370 	/* Do not wake up a writer until he can make "significant"
3371 	 * progress.  --DaveM
3372 	 */
3373 	if (sock_writeable(sk)) {
3374 		struct socket_wq *wq = rcu_dereference(sk->sk_wq);
3375 
3376 		/* rely on refcount_sub from sock_wfree() */
3377 		smp_mb__after_atomic();
3378 		if (wq && waitqueue_active(&wq->wait))
3379 			wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
3380 						EPOLLWRNORM | EPOLLWRBAND);
3381 
3382 		/* Should agree with poll, otherwise some programs break */
3383 		sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
3384 	}
3385 }
3386 
3387 static void sock_def_destruct(struct sock *sk)
3388 {
3389 }
3390 
3391 void sk_send_sigurg(struct sock *sk)
3392 {
3393 	if (sk->sk_socket && sk->sk_socket->file)
3394 		if (send_sigurg(&sk->sk_socket->file->f_owner))
3395 			sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
3396 }
3397 EXPORT_SYMBOL(sk_send_sigurg);
3398 
3399 void sk_reset_timer(struct sock *sk, struct timer_list* timer,
3400 		    unsigned long expires)
3401 {
3402 	if (!mod_timer(timer, expires))
3403 		sock_hold(sk);
3404 }
3405 EXPORT_SYMBOL(sk_reset_timer);
3406 
3407 void sk_stop_timer(struct sock *sk, struct timer_list* timer)
3408 {
3409 	if (del_timer(timer))
3410 		__sock_put(sk);
3411 }
3412 EXPORT_SYMBOL(sk_stop_timer);
3413 
3414 void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer)
3415 {
3416 	if (del_timer_sync(timer))
3417 		__sock_put(sk);
3418 }
3419 EXPORT_SYMBOL(sk_stop_timer_sync);
3420 
3421 void sock_init_data_uid(struct socket *sock, struct sock *sk, kuid_t uid)
3422 {
3423 	sk_init_common(sk);
3424 	sk->sk_send_head	=	NULL;
3425 
3426 	timer_setup(&sk->sk_timer, NULL, 0);
3427 
3428 	sk->sk_allocation	=	GFP_KERNEL;
3429 	sk->sk_rcvbuf		=	READ_ONCE(sysctl_rmem_default);
3430 	sk->sk_sndbuf		=	READ_ONCE(sysctl_wmem_default);
3431 	sk->sk_state		=	TCP_CLOSE;
3432 	sk->sk_use_task_frag	=	true;
3433 	sk_set_socket(sk, sock);
3434 
3435 	sock_set_flag(sk, SOCK_ZAPPED);
3436 
3437 	if (sock) {
3438 		sk->sk_type	=	sock->type;
3439 		RCU_INIT_POINTER(sk->sk_wq, &sock->wq);
3440 		sock->sk	=	sk;
3441 	} else {
3442 		RCU_INIT_POINTER(sk->sk_wq, NULL);
3443 	}
3444 	sk->sk_uid	=	uid;
3445 
3446 	rwlock_init(&sk->sk_callback_lock);
3447 	if (sk->sk_kern_sock)
3448 		lockdep_set_class_and_name(
3449 			&sk->sk_callback_lock,
3450 			af_kern_callback_keys + sk->sk_family,
3451 			af_family_kern_clock_key_strings[sk->sk_family]);
3452 	else
3453 		lockdep_set_class_and_name(
3454 			&sk->sk_callback_lock,
3455 			af_callback_keys + sk->sk_family,
3456 			af_family_clock_key_strings[sk->sk_family]);
3457 
3458 	sk->sk_state_change	=	sock_def_wakeup;
3459 	sk->sk_data_ready	=	sock_def_readable;
3460 	sk->sk_write_space	=	sock_def_write_space;
3461 	sk->sk_error_report	=	sock_def_error_report;
3462 	sk->sk_destruct		=	sock_def_destruct;
3463 
3464 	sk->sk_frag.page	=	NULL;
3465 	sk->sk_frag.offset	=	0;
3466 	sk->sk_peek_off		=	-1;
3467 
3468 	sk->sk_peer_pid 	=	NULL;
3469 	sk->sk_peer_cred	=	NULL;
3470 	spin_lock_init(&sk->sk_peer_lock);
3471 
3472 	sk->sk_write_pending	=	0;
3473 	sk->sk_rcvlowat		=	1;
3474 	sk->sk_rcvtimeo		=	MAX_SCHEDULE_TIMEOUT;
3475 	sk->sk_sndtimeo		=	MAX_SCHEDULE_TIMEOUT;
3476 
3477 	sk->sk_stamp = SK_DEFAULT_STAMP;
3478 #if BITS_PER_LONG==32
3479 	seqlock_init(&sk->sk_stamp_seq);
3480 #endif
3481 	atomic_set(&sk->sk_zckey, 0);
3482 
3483 #ifdef CONFIG_NET_RX_BUSY_POLL
3484 	sk->sk_napi_id		=	0;
3485 	sk->sk_ll_usec		=	READ_ONCE(sysctl_net_busy_read);
3486 #endif
3487 
3488 	sk->sk_max_pacing_rate = ~0UL;
3489 	sk->sk_pacing_rate = ~0UL;
3490 	WRITE_ONCE(sk->sk_pacing_shift, 10);
3491 	sk->sk_incoming_cpu = -1;
3492 
3493 	sk_rx_queue_clear(sk);
3494 	/*
3495 	 * Before updating sk_refcnt, we must commit prior changes to memory
3496 	 * (Documentation/RCU/rculist_nulls.rst for details)
3497 	 */
3498 	smp_wmb();
3499 	refcount_set(&sk->sk_refcnt, 1);
3500 	atomic_set(&sk->sk_drops, 0);
3501 }
3502 EXPORT_SYMBOL(sock_init_data_uid);
3503 
3504 void sock_init_data(struct socket *sock, struct sock *sk)
3505 {
3506 	kuid_t uid = sock ?
3507 		SOCK_INODE(sock)->i_uid :
3508 		make_kuid(sock_net(sk)->user_ns, 0);
3509 
3510 	sock_init_data_uid(sock, sk, uid);
3511 }
3512 EXPORT_SYMBOL(sock_init_data);
3513 
3514 void lock_sock_nested(struct sock *sk, int subclass)
3515 {
3516 	/* The sk_lock has mutex_lock() semantics here. */
3517 	mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
3518 
3519 	might_sleep();
3520 	spin_lock_bh(&sk->sk_lock.slock);
3521 	if (sock_owned_by_user_nocheck(sk))
3522 		__lock_sock(sk);
3523 	sk->sk_lock.owned = 1;
3524 	spin_unlock_bh(&sk->sk_lock.slock);
3525 }
3526 EXPORT_SYMBOL(lock_sock_nested);
3527 
3528 void release_sock(struct sock *sk)
3529 {
3530 	spin_lock_bh(&sk->sk_lock.slock);
3531 	if (sk->sk_backlog.tail)
3532 		__release_sock(sk);
3533 
3534 	/* Warning: release_cb() might need to release sk ownership,
3535 	 * i.e. call sock_release_ownership(sk) before us.
3536 	 */
3537 	if (sk->sk_prot->release_cb)
3538 		sk->sk_prot->release_cb(sk);
3539 
3540 	sock_release_ownership(sk);
3541 	if (waitqueue_active(&sk->sk_lock.wq))
3542 		wake_up(&sk->sk_lock.wq);
3543 	spin_unlock_bh(&sk->sk_lock.slock);
3544 }
3545 EXPORT_SYMBOL(release_sock);
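
/*
 * Illustrative example only: the canonical ownership pattern around the
 * socket lock.  While the lock is owned, softirq input is diverted to the
 * backlog, which release_sock() then processes via __release_sock().
 * example_set_rcvlowat() is a hypothetical helper.
 */
static void example_set_rcvlowat(struct sock *sk, int val)
{
	lock_sock(sk);		/* may sleep; takes process-level ownership */
	WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
	release_sock(sk);	/* runs the backlog and wakes other lockers */
}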
3546 
3547 bool __lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock)
3548 {
3549 	might_sleep();
3550 	spin_lock_bh(&sk->sk_lock.slock);
3551 
3552 	if (!sock_owned_by_user_nocheck(sk)) {
3553 		/*
3554 		 * Fast path return with bottom halves disabled and
3555 		 * sock::sk_lock.slock held.
3556 		 *
3557 		 * The 'mutex' is not contended and holding
3558 		 * sock::sk_lock.slock prevents all other lockers from
3559 		 * proceeding, so the corresponding unlock_sock_fast() can
3560 		 * avoid the slow path of release_sock() completely and
3561 		 * just release slock.
3562 		 *
3563 		 * From a semantic POV this is equivalent to 'acquiring'
3564 		 * the 'mutex', hence the corresponding lockdep
3565 		 * mutex_release() has to happen in the fast path of
3566 		 * unlock_sock_fast().
3567 		 */
3568 		return false;
3569 	}
3570 
3571 	__lock_sock(sk);
3572 	sk->sk_lock.owned = 1;
3573 	__acquire(&sk->sk_lock.slock);
3574 	spin_unlock_bh(&sk->sk_lock.slock);
3575 	return true;
3576 }
3577 EXPORT_SYMBOL(__lock_sock_fast);
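
/*
 * Illustrative example only: callers normally use the lock_sock_fast() /
 * unlock_sock_fast() helpers from <net/sock.h>, which wrap
 * __lock_sock_fast() and remember whether the slow path was taken.
 * example_peek_err() is a hypothetical helper.
 */
static int example_peek_err(struct sock *sk)
{
	bool slow = lock_sock_fast(sk);
	int err = sk->sk_err_soft;	/* any state that needs the socket lock */

	unlock_sock_fast(sk, slow);
	return err;
}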
3578 
3579 int sock_gettstamp(struct socket *sock, void __user *userstamp,
3580 		   bool timeval, bool time32)
3581 {
3582 	struct sock *sk = sock->sk;
3583 	struct timespec64 ts;
3584 
3585 	sock_enable_timestamp(sk, SOCK_TIMESTAMP);
3586 	ts = ktime_to_timespec64(sock_read_timestamp(sk));
3587 	if (ts.tv_sec == -1)
3588 		return -ENOENT;
3589 	if (ts.tv_sec == 0) {
3590 		ktime_t kt = ktime_get_real();
3591 		sock_write_timestamp(sk, kt);
3592 		ts = ktime_to_timespec64(kt);
3593 	}
3594 
3595 	if (timeval)
3596 		ts.tv_nsec /= 1000;
3597 
3598 #ifdef CONFIG_COMPAT_32BIT_TIME
3599 	if (time32)
3600 		return put_old_timespec32(&ts, userstamp);
3601 #endif
3602 #ifdef CONFIG_SPARC64
3603 	/* beware of padding in sparc64 timeval */
3604 	if (timeval && !in_compat_syscall()) {
3605 		struct __kernel_old_timeval __user tv = {
3606 			.tv_sec = ts.tv_sec,
3607 			.tv_usec = ts.tv_nsec,
3608 		};
3609 		if (copy_to_user(userstamp, &tv, sizeof(tv)))
3610 			return -EFAULT;
3611 		return 0;
3612 	}
3613 #endif
3614 	return put_timespec64(&ts, userstamp);
3615 }
3616 EXPORT_SYMBOL(sock_gettstamp);
3617 
3618 void sock_enable_timestamp(struct sock *sk, enum sock_flags flag)
3619 {
3620 	if (!sock_flag(sk, flag)) {
3621 		unsigned long previous_flags = sk->sk_flags;
3622 
3623 		sock_set_flag(sk, flag);
3624 		/*
3625 		 * we just set one of the two flags which require net
3626 		 * time stamping, but time stamping might have been on
3627 		 * already because of the other one
3628 		 */
3629 		if (sock_needs_netstamp(sk) &&
3630 		    !(previous_flags & SK_FLAGS_TIMESTAMP))
3631 			net_enable_timestamp();
3632 	}
3633 }
3634 
3635 int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
3636 		       int level, int type)
3637 {
3638 	struct sock_exterr_skb *serr;
3639 	struct sk_buff *skb;
3640 	int copied, err;
3641 
3642 	err = -EAGAIN;
3643 	skb = sock_dequeue_err_skb(sk);
3644 	if (skb == NULL)
3645 		goto out;
3646 
3647 	copied = skb->len;
3648 	if (copied > len) {
3649 		msg->msg_flags |= MSG_TRUNC;
3650 		copied = len;
3651 	}
3652 	err = skb_copy_datagram_msg(skb, 0, msg, copied);
3653 	if (err)
3654 		goto out_free_skb;
3655 
3656 	sock_recv_timestamp(msg, sk, skb);
3657 
3658 	serr = SKB_EXT_ERR(skb);
3659 	put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
3660 
3661 	msg->msg_flags |= MSG_ERRQUEUE;
3662 	err = copied;
3663 
3664 out_free_skb:
3665 	kfree_skb(skb);
3666 out:
3667 	return err;
3668 }
3669 EXPORT_SYMBOL(sock_recv_errqueue);
3670 
3671 /*
3672  *	Get a socket option on an socket.
3673  *
3674  *	FIX: POSIX 1003.1g is very ambiguous here. It states that
3675  *	asynchronous errors should be reported by getsockopt. We assume
3676  *	this means if you specify SO_ERROR (otherwise whats the point of it).
3677  */
3678 int sock_common_getsockopt(struct socket *sock, int level, int optname,
3679 			   char __user *optval, int __user *optlen)
3680 {
3681 	struct sock *sk = sock->sk;
3682 
3683 	/* IPV6_ADDRFORM can change sk->sk_prot under us. */
3684 	return READ_ONCE(sk->sk_prot)->getsockopt(sk, level, optname, optval, optlen);
3685 }
3686 EXPORT_SYMBOL(sock_common_getsockopt);
3687 
3688 int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
3689 			int flags)
3690 {
3691 	struct sock *sk = sock->sk;
3692 	int addr_len = 0;
3693 	int err;
3694 
3695 	err = sk->sk_prot->recvmsg(sk, msg, size, flags, &addr_len);
3696 	if (err >= 0)
3697 		msg->msg_namelen = addr_len;
3698 	return err;
3699 }
3700 EXPORT_SYMBOL(sock_common_recvmsg);
3701 
3702 /*
3703  *	Set socket options on an inet socket.
3704  */
3705 int sock_common_setsockopt(struct socket *sock, int level, int optname,
3706 			   sockptr_t optval, unsigned int optlen)
3707 {
3708 	struct sock *sk = sock->sk;
3709 
3710 	/* IPV6_ADDRFORM can change sk->sk_prot under us. */
3711 	return READ_ONCE(sk->sk_prot)->setsockopt(sk, level, optname, optval, optlen);
3712 }
3713 EXPORT_SYMBOL(sock_common_setsockopt);
3714 
3715 void sk_common_release(struct sock *sk)
3716 {
3717 	if (sk->sk_prot->destroy)
3718 		sk->sk_prot->destroy(sk);
3719 
3720 	/*
3721 	 * Observation: when sk_common_release is called, processes have
3722 	 * no access to the socket, but the network stack still does.
3723 	 * Step one: detach it from networking:
3724 	 *
3725 	 * A. Remove from hash tables.
3726 	 */
3727 
3728 	sk->sk_prot->unhash(sk);
3729 
3730 	/*
3731 	 * At this point the socket cannot receive new packets, but some may
3732 	 * still be in flight, because some CPU was running the receiver and
3733 	 * did the hash table lookup before we unhashed the socket. They will
3734 	 * reach the receive queue and be purged by the socket destructor.
3735 	 *
3736 	 * Also, we still have packets pending on the receive queue and
3737 	 * probably our own packets waiting in device queues. sock_destroy
3738 	 * will drain the receive queue, but transmitted packets will delay
3739 	 * socket destruction until the last reference is released.
3740 	 */
3741 
3742 	sock_orphan(sk);
3743 
3744 	xfrm_sk_free_policy(sk);
3745 
3746 	sock_put(sk);
3747 }
3748 EXPORT_SYMBOL(sk_common_release);
3749 
3750 void sk_get_meminfo(const struct sock *sk, u32 *mem)
3751 {
3752 	memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);
3753 
3754 	mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
3755 	mem[SK_MEMINFO_RCVBUF] = READ_ONCE(sk->sk_rcvbuf);
3756 	mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
3757 	mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf);
3758 	mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc;
3759 	mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued);
3760 	mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
3761 	mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len);
3762 	mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
3763 }
3764 
3765 #ifdef CONFIG_PROC_FS
3766 static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
3767 
3768 int sock_prot_inuse_get(struct net *net, struct proto *prot)
3769 {
3770 	int cpu, idx = prot->inuse_idx;
3771 	int res = 0;
3772 
3773 	for_each_possible_cpu(cpu)
3774 		res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx];
3775 
3776 	return res >= 0 ? res : 0;
3777 }
3778 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
3779 
3780 int sock_inuse_get(struct net *net)
3781 {
3782 	int cpu, res = 0;
3783 
3784 	for_each_possible_cpu(cpu)
3785 		res += per_cpu_ptr(net->core.prot_inuse, cpu)->all;
3786 
3787 	return res;
3788 }
3789 
3790 EXPORT_SYMBOL_GPL(sock_inuse_get);
3791 
3792 static int __net_init sock_inuse_init_net(struct net *net)
3793 {
3794 	net->core.prot_inuse = alloc_percpu(struct prot_inuse);
3795 	if (net->core.prot_inuse == NULL)
3796 		return -ENOMEM;
3797 	return 0;
3798 }
3799 
3800 static void __net_exit sock_inuse_exit_net(struct net *net)
3801 {
3802 	free_percpu(net->core.prot_inuse);
3803 }
3804 
3805 static struct pernet_operations net_inuse_ops = {
3806 	.init = sock_inuse_init_net,
3807 	.exit = sock_inuse_exit_net,
3808 };
3809 
3810 static __init int net_inuse_init(void)
3811 {
3812 	if (register_pernet_subsys(&net_inuse_ops))
3813 		panic("Cannot initialize net inuse counters");
3814 
3815 	return 0;
3816 }
3817 
3818 core_initcall(net_inuse_init);
3819 
3820 static int assign_proto_idx(struct proto *prot)
3821 {
3822 	prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
3823 
3824 	if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
3825 		pr_err("PROTO_INUSE_NR exhausted\n");
3826 		return -ENOSPC;
3827 	}
3828 
3829 	set_bit(prot->inuse_idx, proto_inuse_idx);
3830 	return 0;
3831 }
3832 
3833 static void release_proto_idx(struct proto *prot)
3834 {
3835 	if (prot->inuse_idx != PROTO_INUSE_NR - 1)
3836 		clear_bit(prot->inuse_idx, proto_inuse_idx);
3837 }
3838 #else
3839 static inline int assign_proto_idx(struct proto *prot)
3840 {
3841 	return 0;
3842 }
3843 
3844 static inline void release_proto_idx(struct proto *prot)
3845 {
3846 }
3847 
3848 #endif
3849 
3850 static void tw_prot_cleanup(struct timewait_sock_ops *twsk_prot)
3851 {
3852 	if (!twsk_prot)
3853 		return;
3854 	kfree(twsk_prot->twsk_slab_name);
3855 	twsk_prot->twsk_slab_name = NULL;
3856 	kmem_cache_destroy(twsk_prot->twsk_slab);
3857 	twsk_prot->twsk_slab = NULL;
3858 }
3859 
3860 static int tw_prot_init(const struct proto *prot)
3861 {
3862 	struct timewait_sock_ops *twsk_prot = prot->twsk_prot;
3863 
3864 	if (!twsk_prot)
3865 		return 0;
3866 
3867 	twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s",
3868 					      prot->name);
3869 	if (!twsk_prot->twsk_slab_name)
3870 		return -ENOMEM;
3871 
3872 	twsk_prot->twsk_slab =
3873 		kmem_cache_create(twsk_prot->twsk_slab_name,
3874 				  twsk_prot->twsk_obj_size, 0,
3875 				  SLAB_ACCOUNT | prot->slab_flags,
3876 				  NULL);
3877 	if (!twsk_prot->twsk_slab) {
3878 		pr_crit("%s: Can't create timewait sock SLAB cache!\n",
3879 			prot->name);
3880 		return -ENOMEM;
3881 	}
3882 
3883 	return 0;
3884 }
3885 
3886 static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
3887 {
3888 	if (!rsk_prot)
3889 		return;
3890 	kfree(rsk_prot->slab_name);
3891 	rsk_prot->slab_name = NULL;
3892 	kmem_cache_destroy(rsk_prot->slab);
3893 	rsk_prot->slab = NULL;
3894 }
3895 
3896 static int req_prot_init(const struct proto *prot)
3897 {
3898 	struct request_sock_ops *rsk_prot = prot->rsk_prot;
3899 
3900 	if (!rsk_prot)
3901 		return 0;
3902 
3903 	rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
3904 					prot->name);
3905 	if (!rsk_prot->slab_name)
3906 		return -ENOMEM;
3907 
3908 	rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
3909 					   rsk_prot->obj_size, 0,
3910 					   SLAB_ACCOUNT | prot->slab_flags,
3911 					   NULL);
3912 
3913 	if (!rsk_prot->slab) {
3914 		pr_crit("%s: Can't create request sock SLAB cache!\n",
3915 			prot->name);
3916 		return -ENOMEM;
3917 	}
3918 	return 0;
3919 }
3920 
3921 int proto_register(struct proto *prot, int alloc_slab)
3922 {
3923 	int ret = -ENOBUFS;
3924 
3925 	if (prot->memory_allocated && !prot->sysctl_mem) {
3926 		pr_err("%s: missing sysctl_mem\n", prot->name);
3927 		return -EINVAL;
3928 	}
3929 	if (prot->memory_allocated && !prot->per_cpu_fw_alloc) {
3930 		pr_err("%s: missing per_cpu_fw_alloc\n", prot->name);
3931 		return -EINVAL;
3932 	}
3933 	if (alloc_slab) {
3934 		prot->slab = kmem_cache_create_usercopy(prot->name,
3935 					prot->obj_size, 0,
3936 					SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT |
3937 					prot->slab_flags,
3938 					prot->useroffset, prot->usersize,
3939 					NULL);
3940 
3941 		if (prot->slab == NULL) {
3942 			pr_crit("%s: Can't create sock SLAB cache!\n",
3943 				prot->name);
3944 			goto out;
3945 		}
3946 
3947 		if (req_prot_init(prot))
3948 			goto out_free_request_sock_slab;
3949 
3950 		if (tw_prot_init(prot))
3951 			goto out_free_timewait_sock_slab;
3952 	}
3953 
3954 	mutex_lock(&proto_list_mutex);
3955 	ret = assign_proto_idx(prot);
3956 	if (ret) {
3957 		mutex_unlock(&proto_list_mutex);
3958 		goto out_free_timewait_sock_slab;
3959 	}
3960 	list_add(&prot->node, &proto_list);
3961 	mutex_unlock(&proto_list_mutex);
3962 	return ret;
3963 
3964 out_free_timewait_sock_slab:
3965 	if (alloc_slab)
3966 		tw_prot_cleanup(prot->twsk_prot);
3967 out_free_request_sock_slab:
3968 	if (alloc_slab) {
3969 		req_prot_cleanup(prot->rsk_prot);
3970 
3971 		kmem_cache_destroy(prot->slab);
3972 		prot->slab = NULL;
3973 	}
3974 out:
3975 	return ret;
3976 }
3977 EXPORT_SYMBOL(proto_register);
3978 
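/* Undo proto_register(): release the inuse-counter slot, unlink the
 * protocol from proto_list and destroy whatever slab caches were created
 * for it.
 */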
3979 void proto_unregister(struct proto *prot)
3980 {
3981 	mutex_lock(&proto_list_mutex);
3982 	release_proto_idx(prot);
3983 	list_del(&prot->node);
3984 	mutex_unlock(&proto_list_mutex);
3985 
3986 	kmem_cache_destroy(prot->slab);
3987 	prot->slab = NULL;
3988 
3989 	req_prot_cleanup(prot->rsk_prot);
3990 	tw_prot_cleanup(prot->twsk_prot);
3991 }
3992 EXPORT_SYMBOL(proto_unregister);
3993 
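/* Request the NETLINK_SOCK_DIAG handler module for @family (and, when
 * non-zero, @protocol) via the standard "net-pf-...-proto-...-type-..."
 * module aliases.  Families or inet protocols that are known not to be
 * registered are rejected with -ENOENT instead.
 */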
3994 int sock_load_diag_module(int family, int protocol)
3995 {
3996 	if (!protocol) {
3997 		if (!sock_is_registered(family))
3998 			return -ENOENT;
3999 
4000 		return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
4001 				      NETLINK_SOCK_DIAG, family);
4002 	}
4003 
4004 #ifdef CONFIG_INET
4005 	if (family == AF_INET &&
4006 	    protocol != IPPROTO_RAW &&
4007 	    protocol < MAX_INET_PROTOS &&
4008 	    !rcu_access_pointer(inet_protos[protocol]))
4009 		return -ENOENT;
4010 #endif
4011 
4012 	return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK,
4013 			      NETLINK_SOCK_DIAG, family, protocol);
4014 }
4015 EXPORT_SYMBOL(sock_load_diag_module);
4016 
4017 #ifdef CONFIG_PROC_FS
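/* /proc/net/protocols: one line of statistics and implemented-method flags
 * per registered protocol, iterated under proto_list_mutex.
 */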
4018 static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
4019 	__acquires(proto_list_mutex)
4020 {
4021 	mutex_lock(&proto_list_mutex);
4022 	return seq_list_start_head(&proto_list, *pos);
4023 }
4024 
4025 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4026 {
4027 	return seq_list_next(v, &proto_list, pos);
4028 }
4029 
4030 static void proto_seq_stop(struct seq_file *seq, void *v)
4031 	__releases(proto_list_mutex)
4032 {
4033 	mutex_unlock(&proto_list_mutex);
4034 }
4035 
4036 static char proto_method_implemented(const void *method)
4037 {
4038 	return method == NULL ? 'n' : 'y';
4039 }

4040 static long sock_prot_memory_allocated(struct proto *proto)
4041 {
4042 	return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
4043 }
4044 
4045 static const char *sock_prot_memory_pressure(struct proto *proto)
4046 {
4047 	return proto->memory_pressure != NULL ?
4048 	       (proto_memory_pressure(proto) ? "yes" : "no") : "NI";
4049 }
4050 
4051 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
4052 {
4053 
4054 	seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
4055 			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
4056 		   proto->name,
4057 		   proto->obj_size,
4058 		   sock_prot_inuse_get(seq_file_net(seq), proto),
4059 		   sock_prot_memory_allocated(proto),
4060 		   sock_prot_memory_pressure(proto),
4061 		   proto->max_header,
4062 		   proto->slab == NULL ? "no" : "yes",
4063 		   module_name(proto->owner),
4064 		   proto_method_implemented(proto->close),
4065 		   proto_method_implemented(proto->connect),
4066 		   proto_method_implemented(proto->disconnect),
4067 		   proto_method_implemented(proto->accept),
4068 		   proto_method_implemented(proto->ioctl),
4069 		   proto_method_implemented(proto->init),
4070 		   proto_method_implemented(proto->destroy),
4071 		   proto_method_implemented(proto->shutdown),
4072 		   proto_method_implemented(proto->setsockopt),
4073 		   proto_method_implemented(proto->getsockopt),
4074 		   proto_method_implemented(proto->sendmsg),
4075 		   proto_method_implemented(proto->recvmsg),
4076 		   proto_method_implemented(proto->sendpage),
4077 		   proto_method_implemented(proto->bind),
4078 		   proto_method_implemented(proto->backlog_rcv),
4079 		   proto_method_implemented(proto->hash),
4080 		   proto_method_implemented(proto->unhash),
4081 		   proto_method_implemented(proto->get_port),
4082 		   proto_method_implemented(proto->enter_memory_pressure));
4083 }
4084 
4085 static int proto_seq_show(struct seq_file *seq, void *v)
4086 {
4087 	if (v == &proto_list)
4088 		seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
4089 			   "protocol",
4090 			   "size",
4091 			   "sockets",
4092 			   "memory",
4093 			   "press",
4094 			   "maxhdr",
4095 			   "slab",
4096 			   "module",
4097 			   "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
4098 	else
4099 		proto_seq_printf(seq, list_entry(v, struct proto, node));
4100 	return 0;
4101 }
4102 
4103 static const struct seq_operations proto_seq_ops = {
4104 	.start  = proto_seq_start,
4105 	.next   = proto_seq_next,
4106 	.stop   = proto_seq_stop,
4107 	.show   = proto_seq_show,
4108 };
4109 
4110 static __net_init int proto_init_net(struct net *net)
4111 {
4112 	if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops,
4113 			sizeof(struct seq_net_private)))
4114 		return -ENOMEM;
4115 
4116 	return 0;
4117 }
4118 
4119 static __net_exit void proto_exit_net(struct net *net)
4120 {
4121 	remove_proc_entry("protocols", net->proc_net);
4122 }
4123 
4124 
4125 static __net_initdata struct pernet_operations proto_net_ops = {
4126 	.init = proto_init_net,
4127 	.exit = proto_exit_net,
4128 };
4129 
4130 static int __init proto_init(void)
4131 {
4132 	return register_pernet_subsys(&proto_net_ops);
4133 }
4134 
4135 subsys_initcall(proto_init);
4136 
4137 #endif /* PROC_FS */
4138 
4139 #ifdef CONFIG_NET_RX_BUSY_POLL
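/* Loop-termination check for napi_busy_loop(): stop busy polling once data
 * is queued on the socket's receive queue or its busy-poll time budget has
 * expired.
 */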
4140 bool sk_busy_loop_end(void *p, unsigned long start_time)
4141 {
4142 	struct sock *sk = p;
4143 
4144 	return !skb_queue_empty_lockless(&sk->sk_receive_queue) ||
4145 	       sk_busy_loop_timeout(sk, start_time);
4146 }
4147 EXPORT_SYMBOL(sk_busy_loop_end);
4148 #endif /* CONFIG_NET_RX_BUSY_POLL */
4149 
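/* Bind an additional address to an already-bound socket through the
 * protocol's ->bind_add() hook (used by multi-homed protocols such as
 * SCTP); returns -EOPNOTSUPP if the protocol does not provide one.
 */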
4150 int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len)
4151 {
4152 	if (!sk->sk_prot->bind_add)
4153 		return -EOPNOTSUPP;
4154 	return sk->sk_prot->bind_add(sk, addr, addr_len);
4155 }
4156 EXPORT_SYMBOL(sock_bind_add);
4157 
4158 /* Copy 'size' bytes in from userspace and 'size' bytes back out on success */
4159 int sock_ioctl_inout(struct sock *sk, unsigned int cmd,
4160 		     void __user *arg, void *karg, size_t size)
4161 {
4162 	int ret;
4163 
4164 	if (copy_from_user(karg, arg, size))
4165 		return -EFAULT;
4166 
4167 	ret = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, karg);
4168 	if (ret)
4169 		return ret;
4170 
4171 	if (copy_to_user(arg, karg, size))
4172 		return -EFAULT;
4173 
4174 	return 0;
4175 }
4176 EXPORT_SYMBOL(sock_ioctl_inout);
4177 
4178 /* This is the most common ioctl prep function: no input is copied from
4179  * userspace; the int result (4 bytes) is copied back to userspace only if
4180  * the ioctl() returns successfully.
4181  */
4182 static int sock_ioctl_out(struct sock *sk, unsigned int cmd, void __user *arg)
4183 {
4184 	int ret, karg = 0;
4185 
4186 	ret = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, &karg);
4187 	if (ret)
4188 		return ret;
4189 
4190 	return put_user(karg, (int __user *)arg);
4191 }
4192 
4193 /* A wrapper around sock ioctls that copies data in from userspace
4194  * (depending on the protocol/ioctl) and copies the result back to userspace.
4195  * The main motivation for this function is to pass kernel memory to the
4196  * protocol ioctl callbacks instead of userspace memory.
4197  */
4198 int sk_ioctl(struct sock *sk, unsigned int cmd, void __user *arg)
4199 {
4200 	int rc = 1;
4201 
4202 	if (sk_is_ipmr(sk))
4203 		rc = ipmr_sk_ioctl(sk, cmd, arg);
4204 	else if (sk_is_icmpv6(sk))
4205 		rc = ip6mr_sk_ioctl(sk, cmd, arg);
4206 	else if (sk_is_phonet(sk))
4207 		rc = phonet_sk_ioctl(sk, cmd, arg);
4208 
4209 	/* If the ioctl was processed above, return its value */
4210 	if (rc <= 0)
4211 		return rc;
4212 
4213 	/* Otherwise call the default handler */
4214 	return sock_ioctl_out(sk, cmd, arg);
4215 }
4216 EXPORT_SYMBOL(sk_ioctl);
4217