xref: /openbmc/linux/net/core/sock.c (revision 24e2d05d)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * INET		An implementation of the TCP/IP protocol suite for the LINUX
4  *		operating system.  INET is implemented using the  BSD Socket
5  *		interface as the means of communication with the user level.
6  *
7  *		Generic socket support routines. Memory allocators, socket lock/release
8  *		handler for protocols to use and generic option handler.
9  *
10  * Authors:	Ross Biro
11  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *		Florian La Roche, <flla@stud.uni-sb.de>
13  *		Alan Cox, <A.Cox@swansea.ac.uk>
14  *
15  * Fixes:
16  *		Alan Cox	: 	Numerous verify_area() problems
17  *		Alan Cox	:	Connecting on a connecting socket
18  *					now returns an error for tcp.
19  *		Alan Cox	:	sock->protocol is set correctly.
20  *					and is not sometimes left as 0.
21  *		Alan Cox	:	connect handles icmp errors on a
22  *					connect properly. Unfortunately there
23  *					is a restart syscall nasty there. I
24  *					can't match BSD without hacking the C
25  *					library. Ideas urgently sought!
26  *		Alan Cox	:	Disallow bind() to addresses that are
27  *					not ours - especially broadcast ones!!
28  *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
29  *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
30  *					instead they leave that for the DESTROY timer.
31  *		Alan Cox	:	Clean up error flag in accept
32  *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
33  *					was buggy. Put a remove_sock() in the handler
34  *					for memory when we hit 0. Also altered the timer
35  *					code. The ACK stuff can wait and needs major
36  *					TCP layer surgery.
37  *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
38  *					and fixed timer/inet_bh race.
39  *		Alan Cox	:	Added zapped flag for TCP
40  *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
41  *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
42  *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
43  *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
44  *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
45  *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
46  *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
47  *	Pauline Middelink	:	identd support
48  *		Alan Cox	:	Fixed connect() taking signals I think.
49  *		Alan Cox	:	SO_LINGER supported
50  *		Alan Cox	:	Error reporting fixes
51  *		Anonymous	:	inet_create tidied up (sk->reuse setting)
52  *		Alan Cox	:	inet sockets don't set sk->type!
53  *		Alan Cox	:	Split socket option code
54  *		Alan Cox	:	Callbacks
55  *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
56  *		Alex		:	Removed restriction on inet fioctl
57  *		Alan Cox	:	Splitting INET from NET core
58  *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
59  *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
60  *		Alan Cox	:	Split IP from generic code
61  *		Alan Cox	:	New kfree_skbmem()
62  *		Alan Cox	:	Make SO_DEBUG superuser only.
63  *		Alan Cox	:	Allow anyone to clear SO_DEBUG
64  *					(compatibility fix)
65  *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
66  *		Alan Cox	:	Allocator for a socket is settable.
67  *		Alan Cox	:	SO_ERROR includes soft errors.
68  *		Alan Cox	:	Allow NULL arguments on some SO_ opts
69  *		Alan Cox	: 	Generic socket allocation to make hooks
70  *					easier (suggested by Craig Metz).
71  *		Michael Pall	:	SO_ERROR returns positive errno again
72  *              Steve Whitehouse:       Added default destructor to free
73  *                                      protocol private data.
74  *              Steve Whitehouse:       Added various other default routines
75  *                                      common to several socket families.
76  *              Chris Evans     :       Call suser() check last on F_SETOWN
77  *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
78  *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
79  *		Andi Kleen	:	Fix write_space callback
80  *		Chris Evans	:	Security fixes - signedness again
81  *		Arnaldo C. Melo :       cleanups, use skb_queue_purge
82  *
83  * To Fix:
84  */
85 
86 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
87 
88 #include <asm/unaligned.h>
89 #include <linux/capability.h>
90 #include <linux/errno.h>
91 #include <linux/errqueue.h>
92 #include <linux/types.h>
93 #include <linux/socket.h>
94 #include <linux/in.h>
95 #include <linux/kernel.h>
96 #include <linux/module.h>
97 #include <linux/proc_fs.h>
98 #include <linux/seq_file.h>
99 #include <linux/sched.h>
100 #include <linux/sched/mm.h>
101 #include <linux/timer.h>
102 #include <linux/string.h>
103 #include <linux/sockios.h>
104 #include <linux/net.h>
105 #include <linux/mm.h>
106 #include <linux/slab.h>
107 #include <linux/interrupt.h>
108 #include <linux/poll.h>
109 #include <linux/tcp.h>
110 #include <linux/init.h>
111 #include <linux/highmem.h>
112 #include <linux/user_namespace.h>
113 #include <linux/static_key.h>
114 #include <linux/memcontrol.h>
115 #include <linux/prefetch.h>
116 #include <linux/compat.h>
117 
118 #include <linux/uaccess.h>
119 
120 #include <linux/netdevice.h>
121 #include <net/protocol.h>
122 #include <linux/skbuff.h>
123 #include <net/net_namespace.h>
124 #include <net/request_sock.h>
125 #include <net/sock.h>
126 #include <linux/net_tstamp.h>
127 #include <net/xfrm.h>
128 #include <linux/ipsec.h>
129 #include <net/cls_cgroup.h>
130 #include <net/netprio_cgroup.h>
131 #include <linux/sock_diag.h>
132 
133 #include <linux/filter.h>
134 #include <net/sock_reuseport.h>
135 #include <net/bpf_sk_storage.h>
136 
137 #include <trace/events/sock.h>
138 
139 #include <net/tcp.h>
140 #include <net/busy_poll.h>
141 
142 #include <linux/ethtool.h>
143 
144 static DEFINE_MUTEX(proto_list_mutex);
145 static LIST_HEAD(proto_list);
146 
147 /**
148  * sk_ns_capable - General socket capability test
149  * @sk: Socket to use a capability on or through
150  * @user_ns: The user namespace of the capability to use
151  * @cap: The capability to use
152  *
153  * Test to see if the opener of the socket had when the socket was
154  * created and the current process has the capability @cap in the user
155  * namespace @user_ns.
156  */
157 bool sk_ns_capable(const struct sock *sk,
158 		   struct user_namespace *user_ns, int cap)
159 {
160 	return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
161 		ns_capable(user_ns, cap);
162 }
163 EXPORT_SYMBOL(sk_ns_capable);
164 
165 /**
166  * sk_capable - Socket global capability test
167  * @sk: Socket to use a capability on or through
168  * @cap: The global capability to use
169  *
170  * Test to see if the opener of the socket had when the socket was
171  * created and the current process has the capability @cap in all user
172  * namespaces.
173  */
174 bool sk_capable(const struct sock *sk, int cap)
175 {
176 	return sk_ns_capable(sk, &init_user_ns, cap);
177 }
178 EXPORT_SYMBOL(sk_capable);
179 
180 /**
181  * sk_net_capable - Network namespace socket capability test
182  * @sk: Socket to use a capability on or through
183  * @cap: The capability to use
184  *
185  * Test to see if the opener of the socket had when the socket was created
186  * and the current process has the capability @cap over the network namespace
187  * the socket is a member of.
188  */
189 bool sk_net_capable(const struct sock *sk, int cap)
190 {
191 	return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
192 }
193 EXPORT_SYMBOL(sk_net_capable);
194 
/*
 * Each address family might have different locking rules, so we have
 * one slock key per address family and separate keys for internal and
 * userspace sockets.
 *
 * Every array below holds AF_MAX lock class keys, one per address family.
 * NOTE(review): the "_kern_" arrays are presumably the ones used for
 * kernel-internal sockets; the code that picks between the variants is
 * not in this part of the file - confirm at the users.
 */
static struct lock_class_key af_family_keys[AF_MAX];
static struct lock_class_key af_family_kern_keys[AF_MAX];
static struct lock_class_key af_family_slock_keys[AF_MAX];
static struct lock_class_key af_family_kern_slock_keys[AF_MAX];
204 
/*
 * Make lock validator output more readable. (we pre-construct these
 * strings build-time, so that runtime initialization of socket
 * locks is fast):
 *
 * _sock_locks(x) expands to a comma-separated list of string literals.
 * Each element is the literal x immediately followed by a family name, so
 * the compiler concatenates the two into a single string such as
 * "sk_lock-AF_INET".  The list ends with an "AF_MAX" element.
 *
 * NOTE(review): the entries appear to be listed in AF_* numeric order and
 * the trailing "AF_MAX" element is presumably why the tables are sized
 * AF_MAX+1 - confirm against <linux/socket.h> when adding a family.
 */

#define _sock_locks(x)						  \
  x "AF_UNSPEC",	x "AF_UNIX"     ,	x "AF_INET"     , \
  x "AF_AX25"  ,	x "AF_IPX"      ,	x "AF_APPLETALK", \
  x "AF_NETROM",	x "AF_BRIDGE"   ,	x "AF_ATMPVC"   , \
  x "AF_X25"   ,	x "AF_INET6"    ,	x "AF_ROSE"     , \
  x "AF_DECnet",	x "AF_NETBEUI"  ,	x "AF_SECURITY" , \
  x "AF_KEY"   ,	x "AF_NETLINK"  ,	x "AF_PACKET"   , \
  x "AF_ASH"   ,	x "AF_ECONET"   ,	x "AF_ATMSVC"   , \
  x "AF_RDS"   ,	x "AF_SNA"      ,	x "AF_IRDA"     , \
  x "AF_PPPOX" ,	x "AF_WANPIPE"  ,	x "AF_LLC"      , \
  x "27"       ,	x "28"          ,	x "AF_CAN"      , \
  x "AF_TIPC"  ,	x "AF_BLUETOOTH",	x "IUCV"        , \
  x "AF_RXRPC" ,	x "AF_ISDN"     ,	x "AF_PHONET"   , \
  x "AF_IEEE802154",	x "AF_CAIF"	,	x "AF_ALG"      , \
  x "AF_NFC"   ,	x "AF_VSOCK"    ,	x "AF_KCM"      , \
  x "AF_QIPCRTR",	x "AF_SMC"	,	x "AF_XDP"	, \
  x "AF_MCTP"  , \
  x "AF_MAX"

/* Name tables without the "k-" prefix: sk_lock, slock and clock classes. */
static const char *const af_family_key_strings[AF_MAX+1] = {
	_sock_locks("sk_lock-")
};
static const char *const af_family_slock_key_strings[AF_MAX+1] = {
	_sock_locks("slock-")
};
static const char *const af_family_clock_key_strings[AF_MAX+1] = {
	_sock_locks("clock-")
};

/* The same three classes with a "k-" prefix (the *_kern_* tables). */
static const char *const af_family_kern_key_strings[AF_MAX+1] = {
	_sock_locks("k-sk_lock-")
};
static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
	_sock_locks("k-slock-")
};
static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
	_sock_locks("k-clock-")
};
/* Name tables for the rlock, wlock and elock classes. */
static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
	_sock_locks("rlock-")
};
static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
	_sock_locks("wlock-")
};
static const char *const af_family_elock_key_strings[AF_MAX+1] = {
	_sock_locks("elock-")
};
258 
/*
 * sk_callback_lock and sk queues locking rules are per-address-family,
 * so split the lock classes by using a per-AF key:
 *
 * Each array holds AF_MAX keys.  NOTE(review): the rlock/wlock/elock keys
 * presumably pair with the "rlock-"/"wlock-"/"elock-" name tables above;
 * the code that registers them is not in this part of the file.
 */
static struct lock_class_key af_callback_keys[AF_MAX];
static struct lock_class_key af_rlock_keys[AF_MAX];
static struct lock_class_key af_wlock_keys[AF_MAX];
static struct lock_class_key af_elock_keys[AF_MAX];
static struct lock_class_key af_kern_callback_keys[AF_MAX];
268 
/* Run time adjustable parameters. */
/* Upper bound applied to SO_SNDBUF requests in sock_setsockopt(). */
__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
EXPORT_SYMBOL(sysctl_wmem_max);
/* Upper bound applied to SO_RCVBUF requests in sock_setsockopt(). */
__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
EXPORT_SYMBOL(sysctl_rmem_max);
/* The defaults start out equal to the corresponding *_MAX constants. */
__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;

/* Maximal space eaten by iovec or ancillary data plus some space */
int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
EXPORT_SYMBOL(sysctl_optmem_max);

/* Set to 1 by default; its readers are outside this part of the file. */
int sysctl_tstamp_allow_data __read_mostly = 1;

/*
 * Static key, initially false.  sk_set_memalloc() increments it and
 * sk_clear_memalloc() decrements it.
 */
DEFINE_STATIC_KEY_FALSE(memalloc_socks_key);
EXPORT_SYMBOL_GPL(memalloc_socks_key);
285 
286 /**
287  * sk_set_memalloc - sets %SOCK_MEMALLOC
288  * @sk: socket to set it on
289  *
290  * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
291  * It's the responsibility of the admin to adjust min_free_kbytes
292  * to meet the requirements
293  */
294 void sk_set_memalloc(struct sock *sk)
295 {
296 	sock_set_flag(sk, SOCK_MEMALLOC);
297 	sk->sk_allocation |= __GFP_MEMALLOC;
298 	static_branch_inc(&memalloc_socks_key);
299 }
300 EXPORT_SYMBOL_GPL(sk_set_memalloc);
301 
302 void sk_clear_memalloc(struct sock *sk)
303 {
304 	sock_reset_flag(sk, SOCK_MEMALLOC);
305 	sk->sk_allocation &= ~__GFP_MEMALLOC;
306 	static_branch_dec(&memalloc_socks_key);
307 
308 	/*
309 	 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
310 	 * progress of swapping. SOCK_MEMALLOC may be cleared while
311 	 * it has rmem allocations due to the last swapfile being deactivated
312 	 * but there is a risk that the socket is unusable due to exceeding
313 	 * the rmem limits. Reclaim the reserves and obey rmem limits again.
314 	 */
315 	sk_mem_reclaim(sk);
316 }
317 EXPORT_SYMBOL_GPL(sk_clear_memalloc);
318 
319 int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
320 {
321 	int ret;
322 	unsigned int noreclaim_flag;
323 
324 	/* these should have been dropped before queueing */
325 	BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
326 
327 	noreclaim_flag = memalloc_noreclaim_save();
328 	ret = INDIRECT_CALL_INET(sk->sk_backlog_rcv,
329 				 tcp_v6_do_rcv,
330 				 tcp_v4_do_rcv,
331 				 sk, skb);
332 	memalloc_noreclaim_restore(noreclaim_flag);
333 
334 	return ret;
335 }
336 EXPORT_SYMBOL(__sk_backlog_rcv);
337 
338 void sk_error_report(struct sock *sk)
339 {
340 	sk->sk_error_report(sk);
341 
342 	switch (sk->sk_family) {
343 	case AF_INET:
344 		fallthrough;
345 	case AF_INET6:
346 		trace_inet_sk_error_report(sk);
347 		break;
348 	default:
349 		break;
350 	}
351 }
352 EXPORT_SYMBOL(sk_error_report);
353 
354 int sock_get_timeout(long timeo, void *optval, bool old_timeval)
355 {
356 	struct __kernel_sock_timeval tv;
357 
358 	if (timeo == MAX_SCHEDULE_TIMEOUT) {
359 		tv.tv_sec = 0;
360 		tv.tv_usec = 0;
361 	} else {
362 		tv.tv_sec = timeo / HZ;
363 		tv.tv_usec = ((timeo % HZ) * USEC_PER_SEC) / HZ;
364 	}
365 
366 	if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
367 		struct old_timeval32 tv32 = { tv.tv_sec, tv.tv_usec };
368 		*(struct old_timeval32 *)optval = tv32;
369 		return sizeof(tv32);
370 	}
371 
372 	if (old_timeval) {
373 		struct __kernel_old_timeval old_tv;
374 		old_tv.tv_sec = tv.tv_sec;
375 		old_tv.tv_usec = tv.tv_usec;
376 		*(struct __kernel_old_timeval *)optval = old_tv;
377 		return sizeof(old_tv);
378 	}
379 
380 	*(struct __kernel_sock_timeval *)optval = tv;
381 	return sizeof(tv);
382 }
383 EXPORT_SYMBOL(sock_get_timeout);
384 
385 int sock_copy_user_timeval(struct __kernel_sock_timeval *tv,
386 			   sockptr_t optval, int optlen, bool old_timeval)
387 {
388 	if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
389 		struct old_timeval32 tv32;
390 
391 		if (optlen < sizeof(tv32))
392 			return -EINVAL;
393 
394 		if (copy_from_sockptr(&tv32, optval, sizeof(tv32)))
395 			return -EFAULT;
396 		tv->tv_sec = tv32.tv_sec;
397 		tv->tv_usec = tv32.tv_usec;
398 	} else if (old_timeval) {
399 		struct __kernel_old_timeval old_tv;
400 
401 		if (optlen < sizeof(old_tv))
402 			return -EINVAL;
403 		if (copy_from_sockptr(&old_tv, optval, sizeof(old_tv)))
404 			return -EFAULT;
405 		tv->tv_sec = old_tv.tv_sec;
406 		tv->tv_usec = old_tv.tv_usec;
407 	} else {
408 		if (optlen < sizeof(*tv))
409 			return -EINVAL;
410 		if (copy_from_sockptr(tv, optval, sizeof(*tv)))
411 			return -EFAULT;
412 	}
413 
414 	return 0;
415 }
416 EXPORT_SYMBOL(sock_copy_user_timeval);
417 
418 static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen,
419 			    bool old_timeval)
420 {
421 	struct __kernel_sock_timeval tv;
422 	int err = sock_copy_user_timeval(&tv, optval, optlen, old_timeval);
423 
424 	if (err)
425 		return err;
426 
427 	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
428 		return -EDOM;
429 
430 	if (tv.tv_sec < 0) {
431 		static int warned __read_mostly;
432 
433 		*timeo_p = 0;
434 		if (warned < 10 && net_ratelimit()) {
435 			warned++;
436 			pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
437 				__func__, current->comm, task_pid_nr(current));
438 		}
439 		return 0;
440 	}
441 	*timeo_p = MAX_SCHEDULE_TIMEOUT;
442 	if (tv.tv_sec == 0 && tv.tv_usec == 0)
443 		return 0;
444 	if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1))
445 		*timeo_p = tv.tv_sec * HZ + DIV_ROUND_UP((unsigned long)tv.tv_usec, USEC_PER_SEC / HZ);
446 	return 0;
447 }
448 
449 static bool sock_needs_netstamp(const struct sock *sk)
450 {
451 	switch (sk->sk_family) {
452 	case AF_UNSPEC:
453 	case AF_UNIX:
454 		return false;
455 	default:
456 		return true;
457 	}
458 }
459 
460 static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
461 {
462 	if (sk->sk_flags & flags) {
463 		sk->sk_flags &= ~flags;
464 		if (sock_needs_netstamp(sk) &&
465 		    !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
466 			net_disable_timestamp();
467 	}
468 }
469 
470 
471 int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
472 {
473 	unsigned long flags;
474 	struct sk_buff_head *list = &sk->sk_receive_queue;
475 
476 	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
477 		atomic_inc(&sk->sk_drops);
478 		trace_sock_rcvqueue_full(sk, skb);
479 		return -ENOMEM;
480 	}
481 
482 	if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
483 		atomic_inc(&sk->sk_drops);
484 		return -ENOBUFS;
485 	}
486 
487 	skb->dev = NULL;
488 	skb_set_owner_r(skb, sk);
489 
490 	/* we escape from rcu protected region, make sure we dont leak
491 	 * a norefcounted dst
492 	 */
493 	skb_dst_force(skb);
494 
495 	spin_lock_irqsave(&list->lock, flags);
496 	sock_skb_set_dropcount(sk, skb);
497 	__skb_queue_tail(list, skb);
498 	spin_unlock_irqrestore(&list->lock, flags);
499 
500 	if (!sock_flag(sk, SOCK_DEAD))
501 		sk->sk_data_ready(sk);
502 	return 0;
503 }
504 EXPORT_SYMBOL(__sock_queue_rcv_skb);
505 
/*
 * Run the socket filter on @skb and, if it accepts the buffer, hand it to
 * __sock_queue_rcv_skb().
 *
 * Returns the non-zero sk_filter() result when the filter rejects the
 * skb, otherwise the result of __sock_queue_rcv_skb().
 */
int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
	int filter_err = sk_filter(sk, skb);

	return filter_err ? filter_err : __sock_queue_rcv_skb(sk, skb);
}
EXPORT_SYMBOL(sock_queue_rcv_skb);
517 
518 int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
519 		     const int nested, unsigned int trim_cap, bool refcounted)
520 {
521 	int rc = NET_RX_SUCCESS;
522 
523 	if (sk_filter_trim_cap(sk, skb, trim_cap))
524 		goto discard_and_relse;
525 
526 	skb->dev = NULL;
527 
528 	if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
529 		atomic_inc(&sk->sk_drops);
530 		goto discard_and_relse;
531 	}
532 	if (nested)
533 		bh_lock_sock_nested(sk);
534 	else
535 		bh_lock_sock(sk);
536 	if (!sock_owned_by_user(sk)) {
537 		/*
538 		 * trylock + unlock semantics:
539 		 */
540 		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
541 
542 		rc = sk_backlog_rcv(sk, skb);
543 
544 		mutex_release(&sk->sk_lock.dep_map, _RET_IP_);
545 	} else if (sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf))) {
546 		bh_unlock_sock(sk);
547 		atomic_inc(&sk->sk_drops);
548 		goto discard_and_relse;
549 	}
550 
551 	bh_unlock_sock(sk);
552 out:
553 	if (refcounted)
554 		sock_put(sk);
555 	return rc;
556 discard_and_relse:
557 	kfree_skb(skb);
558 	goto out;
559 }
560 EXPORT_SYMBOL(__sk_receive_skb);
561 
562 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ip6_dst_check(struct dst_entry *,
563 							  u32));
564 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
565 							   u32));
566 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
567 {
568 	struct dst_entry *dst = __sk_dst_get(sk);
569 
570 	if (dst && dst->obsolete &&
571 	    INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
572 			       dst, cookie) == NULL) {
573 		sk_tx_queue_clear(sk);
574 		sk->sk_dst_pending_confirm = 0;
575 		RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
576 		dst_release(dst);
577 		return NULL;
578 	}
579 
580 	return dst;
581 }
582 EXPORT_SYMBOL(__sk_dst_check);
583 
584 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
585 {
586 	struct dst_entry *dst = sk_dst_get(sk);
587 
588 	if (dst && dst->obsolete &&
589 	    INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
590 			       dst, cookie) == NULL) {
591 		sk_dst_reset(sk);
592 		dst_release(dst);
593 		return NULL;
594 	}
595 
596 	return dst;
597 }
598 EXPORT_SYMBOL(sk_dst_check);
599 
600 static int sock_bindtoindex_locked(struct sock *sk, int ifindex)
601 {
602 	int ret = -ENOPROTOOPT;
603 #ifdef CONFIG_NETDEVICES
604 	struct net *net = sock_net(sk);
605 
606 	/* Sorry... */
607 	ret = -EPERM;
608 	if (sk->sk_bound_dev_if && !ns_capable(net->user_ns, CAP_NET_RAW))
609 		goto out;
610 
611 	ret = -EINVAL;
612 	if (ifindex < 0)
613 		goto out;
614 
615 	sk->sk_bound_dev_if = ifindex;
616 	if (sk->sk_prot->rehash)
617 		sk->sk_prot->rehash(sk);
618 	sk_dst_reset(sk);
619 
620 	ret = 0;
621 
622 out:
623 #endif
624 
625 	return ret;
626 }
627 
628 int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk)
629 {
630 	int ret;
631 
632 	if (lock_sk)
633 		lock_sock(sk);
634 	ret = sock_bindtoindex_locked(sk, ifindex);
635 	if (lock_sk)
636 		release_sock(sk);
637 
638 	return ret;
639 }
640 EXPORT_SYMBOL(sock_bindtoindex);
641 
642 static int sock_setbindtodevice(struct sock *sk, sockptr_t optval, int optlen)
643 {
644 	int ret = -ENOPROTOOPT;
645 #ifdef CONFIG_NETDEVICES
646 	struct net *net = sock_net(sk);
647 	char devname[IFNAMSIZ];
648 	int index;
649 
650 	ret = -EINVAL;
651 	if (optlen < 0)
652 		goto out;
653 
654 	/* Bind this socket to a particular device like "eth0",
655 	 * as specified in the passed interface name. If the
656 	 * name is "" or the option length is zero the socket
657 	 * is not bound.
658 	 */
659 	if (optlen > IFNAMSIZ - 1)
660 		optlen = IFNAMSIZ - 1;
661 	memset(devname, 0, sizeof(devname));
662 
663 	ret = -EFAULT;
664 	if (copy_from_sockptr(devname, optval, optlen))
665 		goto out;
666 
667 	index = 0;
668 	if (devname[0] != '\0') {
669 		struct net_device *dev;
670 
671 		rcu_read_lock();
672 		dev = dev_get_by_name_rcu(net, devname);
673 		if (dev)
674 			index = dev->ifindex;
675 		rcu_read_unlock();
676 		ret = -ENODEV;
677 		if (!dev)
678 			goto out;
679 	}
680 
681 	return sock_bindtoindex(sk, index, true);
682 out:
683 #endif
684 
685 	return ret;
686 }
687 
688 static int sock_getbindtodevice(struct sock *sk, char __user *optval,
689 				int __user *optlen, int len)
690 {
691 	int ret = -ENOPROTOOPT;
692 #ifdef CONFIG_NETDEVICES
693 	struct net *net = sock_net(sk);
694 	char devname[IFNAMSIZ];
695 
696 	if (sk->sk_bound_dev_if == 0) {
697 		len = 0;
698 		goto zero;
699 	}
700 
701 	ret = -EINVAL;
702 	if (len < IFNAMSIZ)
703 		goto out;
704 
705 	ret = netdev_get_name(net, devname, sk->sk_bound_dev_if);
706 	if (ret)
707 		goto out;
708 
709 	len = strlen(devname) + 1;
710 
711 	ret = -EFAULT;
712 	if (copy_to_user(optval, devname, len))
713 		goto out;
714 
715 zero:
716 	ret = -EFAULT;
717 	if (put_user(len, optlen))
718 		goto out;
719 
720 	ret = 0;
721 
722 out:
723 #endif
724 
725 	return ret;
726 }
727 
728 bool sk_mc_loop(struct sock *sk)
729 {
730 	if (dev_recursion_level())
731 		return false;
732 	if (!sk)
733 		return true;
734 	switch (sk->sk_family) {
735 	case AF_INET:
736 		return inet_sk(sk)->mc_loop;
737 #if IS_ENABLED(CONFIG_IPV6)
738 	case AF_INET6:
739 		return inet6_sk(sk)->mc_loop;
740 #endif
741 	}
742 	WARN_ON_ONCE(1);
743 	return true;
744 }
745 EXPORT_SYMBOL(sk_mc_loop);
746 
747 void sock_set_reuseaddr(struct sock *sk)
748 {
749 	lock_sock(sk);
750 	sk->sk_reuse = SK_CAN_REUSE;
751 	release_sock(sk);
752 }
753 EXPORT_SYMBOL(sock_set_reuseaddr);
754 
755 void sock_set_reuseport(struct sock *sk)
756 {
757 	lock_sock(sk);
758 	sk->sk_reuseport = true;
759 	release_sock(sk);
760 }
761 EXPORT_SYMBOL(sock_set_reuseport);
762 
763 void sock_no_linger(struct sock *sk)
764 {
765 	lock_sock(sk);
766 	sk->sk_lingertime = 0;
767 	sock_set_flag(sk, SOCK_LINGER);
768 	release_sock(sk);
769 }
770 EXPORT_SYMBOL(sock_no_linger);
771 
772 void sock_set_priority(struct sock *sk, u32 priority)
773 {
774 	lock_sock(sk);
775 	sk->sk_priority = priority;
776 	release_sock(sk);
777 }
778 EXPORT_SYMBOL(sock_set_priority);
779 
780 void sock_set_sndtimeo(struct sock *sk, s64 secs)
781 {
782 	lock_sock(sk);
783 	if (secs && secs < MAX_SCHEDULE_TIMEOUT / HZ - 1)
784 		sk->sk_sndtimeo = secs * HZ;
785 	else
786 		sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
787 	release_sock(sk);
788 }
789 EXPORT_SYMBOL(sock_set_sndtimeo);
790 
791 static void __sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns)
792 {
793 	if (val)  {
794 		sock_valbool_flag(sk, SOCK_TSTAMP_NEW, new);
795 		sock_valbool_flag(sk, SOCK_RCVTSTAMPNS, ns);
796 		sock_set_flag(sk, SOCK_RCVTSTAMP);
797 		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
798 	} else {
799 		sock_reset_flag(sk, SOCK_RCVTSTAMP);
800 		sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
801 	}
802 }
803 
804 void sock_enable_timestamps(struct sock *sk)
805 {
806 	lock_sock(sk);
807 	__sock_set_timestamps(sk, true, false, true);
808 	release_sock(sk);
809 }
810 EXPORT_SYMBOL(sock_enable_timestamps);
811 
812 void sock_set_timestamp(struct sock *sk, int optname, bool valbool)
813 {
814 	switch (optname) {
815 	case SO_TIMESTAMP_OLD:
816 		__sock_set_timestamps(sk, valbool, false, false);
817 		break;
818 	case SO_TIMESTAMP_NEW:
819 		__sock_set_timestamps(sk, valbool, true, false);
820 		break;
821 	case SO_TIMESTAMPNS_OLD:
822 		__sock_set_timestamps(sk, valbool, false, true);
823 		break;
824 	case SO_TIMESTAMPNS_NEW:
825 		__sock_set_timestamps(sk, valbool, true, true);
826 		break;
827 	}
828 }
829 
830 static int sock_timestamping_bind_phc(struct sock *sk, int phc_index)
831 {
832 	struct net *net = sock_net(sk);
833 	struct net_device *dev = NULL;
834 	bool match = false;
835 	int *vclock_index;
836 	int i, num;
837 
838 	if (sk->sk_bound_dev_if)
839 		dev = dev_get_by_index(net, sk->sk_bound_dev_if);
840 
841 	if (!dev) {
842 		pr_err("%s: sock not bind to device\n", __func__);
843 		return -EOPNOTSUPP;
844 	}
845 
846 	num = ethtool_get_phc_vclocks(dev, &vclock_index);
847 	dev_put(dev);
848 
849 	for (i = 0; i < num; i++) {
850 		if (*(vclock_index + i) == phc_index) {
851 			match = true;
852 			break;
853 		}
854 	}
855 
856 	if (num > 0)
857 		kfree(vclock_index);
858 
859 	if (!match)
860 		return -EINVAL;
861 
862 	sk->sk_bind_phc = phc_index;
863 
864 	return 0;
865 }
866 
867 int sock_set_timestamping(struct sock *sk, int optname,
868 			  struct so_timestamping timestamping)
869 {
870 	int val = timestamping.flags;
871 	int ret;
872 
873 	if (val & ~SOF_TIMESTAMPING_MASK)
874 		return -EINVAL;
875 
876 	if (val & SOF_TIMESTAMPING_OPT_ID &&
877 	    !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
878 		if (sk_is_tcp(sk)) {
879 			if ((1 << sk->sk_state) &
880 			    (TCPF_CLOSE | TCPF_LISTEN))
881 				return -EINVAL;
882 			atomic_set(&sk->sk_tskey, tcp_sk(sk)->snd_una);
883 		} else {
884 			atomic_set(&sk->sk_tskey, 0);
885 		}
886 	}
887 
888 	if (val & SOF_TIMESTAMPING_OPT_STATS &&
889 	    !(val & SOF_TIMESTAMPING_OPT_TSONLY))
890 		return -EINVAL;
891 
892 	if (val & SOF_TIMESTAMPING_BIND_PHC) {
893 		ret = sock_timestamping_bind_phc(sk, timestamping.bind_phc);
894 		if (ret)
895 			return ret;
896 	}
897 
898 	sk->sk_tsflags = val;
899 	sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW);
900 
901 	if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
902 		sock_enable_timestamp(sk,
903 				      SOCK_TIMESTAMPING_RX_SOFTWARE);
904 	else
905 		sock_disable_timestamp(sk,
906 				       (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
907 	return 0;
908 }
909 
910 void sock_set_keepalive(struct sock *sk)
911 {
912 	lock_sock(sk);
913 	if (sk->sk_prot->keepalive)
914 		sk->sk_prot->keepalive(sk, true);
915 	sock_valbool_flag(sk, SOCK_KEEPOPEN, true);
916 	release_sock(sk);
917 }
918 EXPORT_SYMBOL(sock_set_keepalive);
919 
920 static void __sock_set_rcvbuf(struct sock *sk, int val)
921 {
922 	/* Ensure val * 2 fits into an int, to prevent max_t() from treating it
923 	 * as a negative value.
924 	 */
925 	val = min_t(int, val, INT_MAX / 2);
926 	sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
927 
928 	/* We double it on the way in to account for "struct sk_buff" etc.
929 	 * overhead.   Applications assume that the SO_RCVBUF setting they make
930 	 * will allow that much actual data to be received on that socket.
931 	 *
932 	 * Applications are unaware that "struct sk_buff" and other overheads
933 	 * allocate from the receive buffer during socket buffer allocation.
934 	 *
935 	 * And after considering the possible alternatives, returning the value
936 	 * we actually used in getsockopt is the most desirable behavior.
937 	 */
938 	WRITE_ONCE(sk->sk_rcvbuf, max_t(int, val * 2, SOCK_MIN_RCVBUF));
939 }
940 
/* Locked wrapper: run __sock_set_rcvbuf() with the socket lock held. */
void sock_set_rcvbuf(struct sock *sk, int val)
{
	lock_sock(sk);

	__sock_set_rcvbuf(sk, val);

	release_sock(sk);
}
EXPORT_SYMBOL(sock_set_rcvbuf);
948 
949 static void __sock_set_mark(struct sock *sk, u32 val)
950 {
951 	if (val != sk->sk_mark) {
952 		sk->sk_mark = val;
953 		sk_dst_reset(sk);
954 	}
955 }
956 
957 void sock_set_mark(struct sock *sk, u32 val)
958 {
959 	lock_sock(sk);
960 	__sock_set_mark(sk, val);
961 	release_sock(sk);
962 }
963 EXPORT_SYMBOL(sock_set_mark);
964 
965 static void sock_release_reserved_memory(struct sock *sk, int bytes)
966 {
967 	/* Round down bytes to multiple of pages */
968 	bytes &= ~(SK_MEM_QUANTUM - 1);
969 
970 	WARN_ON(bytes > sk->sk_reserved_mem);
971 	sk->sk_reserved_mem -= bytes;
972 	sk_mem_reclaim(sk);
973 }
974 
975 static int sock_reserve_memory(struct sock *sk, int bytes)
976 {
977 	long allocated;
978 	bool charged;
979 	int pages;
980 
981 	if (!mem_cgroup_sockets_enabled || !sk->sk_memcg || !sk_has_account(sk))
982 		return -EOPNOTSUPP;
983 
984 	if (!bytes)
985 		return 0;
986 
987 	pages = sk_mem_pages(bytes);
988 
989 	/* pre-charge to memcg */
990 	charged = mem_cgroup_charge_skmem(sk->sk_memcg, pages,
991 					  GFP_KERNEL | __GFP_RETRY_MAYFAIL);
992 	if (!charged)
993 		return -ENOMEM;
994 
995 	/* pre-charge to forward_alloc */
996 	allocated = sk_memory_allocated_add(sk, pages);
997 	/* If the system goes into memory pressure with this
998 	 * precharge, give up and return error.
999 	 */
1000 	if (allocated > sk_prot_mem_limits(sk, 1)) {
1001 		sk_memory_allocated_sub(sk, pages);
1002 		mem_cgroup_uncharge_skmem(sk->sk_memcg, pages);
1003 		return -ENOMEM;
1004 	}
1005 	sk->sk_forward_alloc += pages << SK_MEM_QUANTUM_SHIFT;
1006 
1007 	sk->sk_reserved_mem += pages << SK_MEM_QUANTUM_SHIFT;
1008 
1009 	return 0;
1010 }
1011 
1012 /*
1013  *	This is meant for all protocols to use and covers goings on
1014  *	at the socket level. Everything here is generic.
1015  */
1016 
1017 int sock_setsockopt(struct socket *sock, int level, int optname,
1018 		    sockptr_t optval, unsigned int optlen)
1019 {
1020 	struct so_timestamping timestamping;
1021 	struct sock_txtime sk_txtime;
1022 	struct sock *sk = sock->sk;
1023 	int val;
1024 	int valbool;
1025 	struct linger ling;
1026 	int ret = 0;
1027 
1028 	/*
1029 	 *	Options without arguments
1030 	 */
1031 
1032 	if (optname == SO_BINDTODEVICE)
1033 		return sock_setbindtodevice(sk, optval, optlen);
1034 
1035 	if (optlen < sizeof(int))
1036 		return -EINVAL;
1037 
1038 	if (copy_from_sockptr(&val, optval, sizeof(val)))
1039 		return -EFAULT;
1040 
1041 	valbool = val ? 1 : 0;
1042 
1043 	lock_sock(sk);
1044 
1045 	switch (optname) {
1046 	case SO_DEBUG:
1047 		if (val && !capable(CAP_NET_ADMIN))
1048 			ret = -EACCES;
1049 		else
1050 			sock_valbool_flag(sk, SOCK_DBG, valbool);
1051 		break;
1052 	case SO_REUSEADDR:
1053 		sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
1054 		break;
1055 	case SO_REUSEPORT:
1056 		sk->sk_reuseport = valbool;
1057 		break;
1058 	case SO_TYPE:
1059 	case SO_PROTOCOL:
1060 	case SO_DOMAIN:
1061 	case SO_ERROR:
1062 		ret = -ENOPROTOOPT;
1063 		break;
1064 	case SO_DONTROUTE:
1065 		sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
1066 		sk_dst_reset(sk);
1067 		break;
1068 	case SO_BROADCAST:
1069 		sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
1070 		break;
1071 	case SO_SNDBUF:
1072 		/* Don't error on this BSD doesn't and if you think
1073 		 * about it this is right. Otherwise apps have to
1074 		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
1075 		 * are treated in BSD as hints
1076 		 */
1077 		val = min_t(u32, val, sysctl_wmem_max);
1078 set_sndbuf:
1079 		/* Ensure val * 2 fits into an int, to prevent max_t()
1080 		 * from treating it as a negative value.
1081 		 */
1082 		val = min_t(int, val, INT_MAX / 2);
1083 		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
1084 		WRITE_ONCE(sk->sk_sndbuf,
1085 			   max_t(int, val * 2, SOCK_MIN_SNDBUF));
1086 		/* Wake up sending tasks if we upped the value. */
1087 		sk->sk_write_space(sk);
1088 		break;
1089 
1090 	case SO_SNDBUFFORCE:
1091 		if (!capable(CAP_NET_ADMIN)) {
1092 			ret = -EPERM;
1093 			break;
1094 		}
1095 
1096 		/* No negative values (to prevent underflow, as val will be
1097 		 * multiplied by 2).
1098 		 */
1099 		if (val < 0)
1100 			val = 0;
1101 		goto set_sndbuf;
1102 
1103 	case SO_RCVBUF:
1104 		/* Don't error on this BSD doesn't and if you think
1105 		 * about it this is right. Otherwise apps have to
1106 		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
1107 		 * are treated in BSD as hints
1108 		 */
1109 		__sock_set_rcvbuf(sk, min_t(u32, val, sysctl_rmem_max));
1110 		break;
1111 
1112 	case SO_RCVBUFFORCE:
1113 		if (!capable(CAP_NET_ADMIN)) {
1114 			ret = -EPERM;
1115 			break;
1116 		}
1117 
1118 		/* No negative values (to prevent underflow, as val will be
1119 		 * multiplied by 2).
1120 		 */
1121 		__sock_set_rcvbuf(sk, max(val, 0));
1122 		break;
1123 
1124 	case SO_KEEPALIVE:
1125 		if (sk->sk_prot->keepalive)
1126 			sk->sk_prot->keepalive(sk, valbool);
1127 		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
1128 		break;
1129 
1130 	case SO_OOBINLINE:
1131 		sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
1132 		break;
1133 
1134 	case SO_NO_CHECK:
1135 		sk->sk_no_check_tx = valbool;
1136 		break;
1137 
1138 	case SO_PRIORITY:
1139 		if ((val >= 0 && val <= 6) ||
1140 		    ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) ||
1141 		    ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
1142 			sk->sk_priority = val;
1143 		else
1144 			ret = -EPERM;
1145 		break;
1146 
1147 	case SO_LINGER:
1148 		if (optlen < sizeof(ling)) {
1149 			ret = -EINVAL;	/* 1003.1g */
1150 			break;
1151 		}
1152 		if (copy_from_sockptr(&ling, optval, sizeof(ling))) {
1153 			ret = -EFAULT;
1154 			break;
1155 		}
1156 		if (!ling.l_onoff)
1157 			sock_reset_flag(sk, SOCK_LINGER);
1158 		else {
1159 #if (BITS_PER_LONG == 32)
1160 			if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
1161 				sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
1162 			else
1163 #endif
1164 				sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
1165 			sock_set_flag(sk, SOCK_LINGER);
1166 		}
1167 		break;
1168 
1169 	case SO_BSDCOMPAT:
1170 		break;
1171 
1172 	case SO_PASSCRED:
1173 		if (valbool)
1174 			set_bit(SOCK_PASSCRED, &sock->flags);
1175 		else
1176 			clear_bit(SOCK_PASSCRED, &sock->flags);
1177 		break;
1178 
1179 	case SO_TIMESTAMP_OLD:
1180 	case SO_TIMESTAMP_NEW:
1181 	case SO_TIMESTAMPNS_OLD:
1182 	case SO_TIMESTAMPNS_NEW:
1183 		sock_set_timestamp(sk, optname, valbool);
1184 		break;
1185 
1186 	case SO_TIMESTAMPING_NEW:
1187 	case SO_TIMESTAMPING_OLD:
1188 		if (optlen == sizeof(timestamping)) {
1189 			if (copy_from_sockptr(&timestamping, optval,
1190 					      sizeof(timestamping))) {
1191 				ret = -EFAULT;
1192 				break;
1193 			}
1194 		} else {
1195 			memset(&timestamping, 0, sizeof(timestamping));
1196 			timestamping.flags = val;
1197 		}
1198 		ret = sock_set_timestamping(sk, optname, timestamping);
1199 		break;
1200 
1201 	case SO_RCVLOWAT:
1202 		if (val < 0)
1203 			val = INT_MAX;
1204 		if (sock->ops->set_rcvlowat)
1205 			ret = sock->ops->set_rcvlowat(sk, val);
1206 		else
1207 			WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
1208 		break;
1209 
1210 	case SO_RCVTIMEO_OLD:
1211 	case SO_RCVTIMEO_NEW:
1212 		ret = sock_set_timeout(&sk->sk_rcvtimeo, optval,
1213 				       optlen, optname == SO_RCVTIMEO_OLD);
1214 		break;
1215 
1216 	case SO_SNDTIMEO_OLD:
1217 	case SO_SNDTIMEO_NEW:
1218 		ret = sock_set_timeout(&sk->sk_sndtimeo, optval,
1219 				       optlen, optname == SO_SNDTIMEO_OLD);
1220 		break;
1221 
1222 	case SO_ATTACH_FILTER: {
1223 		struct sock_fprog fprog;
1224 
1225 		ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1226 		if (!ret)
1227 			ret = sk_attach_filter(&fprog, sk);
1228 		break;
1229 	}
1230 	case SO_ATTACH_BPF:
1231 		ret = -EINVAL;
1232 		if (optlen == sizeof(u32)) {
1233 			u32 ufd;
1234 
1235 			ret = -EFAULT;
1236 			if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1237 				break;
1238 
1239 			ret = sk_attach_bpf(ufd, sk);
1240 		}
1241 		break;
1242 
1243 	case SO_ATTACH_REUSEPORT_CBPF: {
1244 		struct sock_fprog fprog;
1245 
1246 		ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1247 		if (!ret)
1248 			ret = sk_reuseport_attach_filter(&fprog, sk);
1249 		break;
1250 	}
1251 	case SO_ATTACH_REUSEPORT_EBPF:
1252 		ret = -EINVAL;
1253 		if (optlen == sizeof(u32)) {
1254 			u32 ufd;
1255 
1256 			ret = -EFAULT;
1257 			if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1258 				break;
1259 
1260 			ret = sk_reuseport_attach_bpf(ufd, sk);
1261 		}
1262 		break;
1263 
1264 	case SO_DETACH_REUSEPORT_BPF:
1265 		ret = reuseport_detach_prog(sk);
1266 		break;
1267 
1268 	case SO_DETACH_FILTER:
1269 		ret = sk_detach_filter(sk);
1270 		break;
1271 
1272 	case SO_LOCK_FILTER:
1273 		if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
1274 			ret = -EPERM;
1275 		else
1276 			sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
1277 		break;
1278 
1279 	case SO_PASSSEC:
1280 		if (valbool)
1281 			set_bit(SOCK_PASSSEC, &sock->flags);
1282 		else
1283 			clear_bit(SOCK_PASSSEC, &sock->flags);
1284 		break;
1285 	case SO_MARK:
1286 		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
1287 		    !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1288 			ret = -EPERM;
1289 			break;
1290 		}
1291 
1292 		__sock_set_mark(sk, val);
1293 		break;
1294 
1295 	case SO_RXQ_OVFL:
1296 		sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
1297 		break;
1298 
1299 	case SO_WIFI_STATUS:
1300 		sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
1301 		break;
1302 
1303 	case SO_PEEK_OFF:
1304 		if (sock->ops->set_peek_off)
1305 			ret = sock->ops->set_peek_off(sk, val);
1306 		else
1307 			ret = -EOPNOTSUPP;
1308 		break;
1309 
1310 	case SO_NOFCS:
1311 		sock_valbool_flag(sk, SOCK_NOFCS, valbool);
1312 		break;
1313 
1314 	case SO_SELECT_ERR_QUEUE:
1315 		sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
1316 		break;
1317 
1318 #ifdef CONFIG_NET_RX_BUSY_POLL
1319 	case SO_BUSY_POLL:
1320 		/* allow unprivileged users to decrease the value */
1321 		if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
1322 			ret = -EPERM;
1323 		else {
1324 			if (val < 0)
1325 				ret = -EINVAL;
1326 			else
1327 				WRITE_ONCE(sk->sk_ll_usec, val);
1328 		}
1329 		break;
1330 	case SO_PREFER_BUSY_POLL:
1331 		if (valbool && !capable(CAP_NET_ADMIN))
1332 			ret = -EPERM;
1333 		else
1334 			WRITE_ONCE(sk->sk_prefer_busy_poll, valbool);
1335 		break;
1336 	case SO_BUSY_POLL_BUDGET:
1337 		if (val > READ_ONCE(sk->sk_busy_poll_budget) && !capable(CAP_NET_ADMIN)) {
1338 			ret = -EPERM;
1339 		} else {
1340 			if (val < 0 || val > U16_MAX)
1341 				ret = -EINVAL;
1342 			else
1343 				WRITE_ONCE(sk->sk_busy_poll_budget, val);
1344 		}
1345 		break;
1346 #endif
1347 
1348 	case SO_MAX_PACING_RATE:
1349 		{
1350 		unsigned long ulval = (val == ~0U) ? ~0UL : (unsigned int)val;
1351 
1352 		if (sizeof(ulval) != sizeof(val) &&
1353 		    optlen >= sizeof(ulval) &&
1354 		    copy_from_sockptr(&ulval, optval, sizeof(ulval))) {
1355 			ret = -EFAULT;
1356 			break;
1357 		}
1358 		if (ulval != ~0UL)
1359 			cmpxchg(&sk->sk_pacing_status,
1360 				SK_PACING_NONE,
1361 				SK_PACING_NEEDED);
1362 		sk->sk_max_pacing_rate = ulval;
1363 		sk->sk_pacing_rate = min(sk->sk_pacing_rate, ulval);
1364 		break;
1365 		}
1366 	case SO_INCOMING_CPU:
1367 		WRITE_ONCE(sk->sk_incoming_cpu, val);
1368 		break;
1369 
1370 	case SO_CNX_ADVICE:
1371 		if (val == 1)
1372 			dst_negative_advice(sk);
1373 		break;
1374 
1375 	case SO_ZEROCOPY:
1376 		if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
1377 			if (!(sk_is_tcp(sk) ||
1378 			      (sk->sk_type == SOCK_DGRAM &&
1379 			       sk->sk_protocol == IPPROTO_UDP)))
1380 				ret = -EOPNOTSUPP;
1381 		} else if (sk->sk_family != PF_RDS) {
1382 			ret = -EOPNOTSUPP;
1383 		}
1384 		if (!ret) {
1385 			if (val < 0 || val > 1)
1386 				ret = -EINVAL;
1387 			else
1388 				sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
1389 		}
1390 		break;
1391 
1392 	case SO_TXTIME:
1393 		if (optlen != sizeof(struct sock_txtime)) {
1394 			ret = -EINVAL;
1395 			break;
1396 		} else if (copy_from_sockptr(&sk_txtime, optval,
1397 			   sizeof(struct sock_txtime))) {
1398 			ret = -EFAULT;
1399 			break;
1400 		} else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) {
1401 			ret = -EINVAL;
1402 			break;
1403 		}
1404 		/* CLOCK_MONOTONIC is only used by sch_fq, and this packet
1405 		 * scheduler has enough safe guards.
1406 		 */
1407 		if (sk_txtime.clockid != CLOCK_MONOTONIC &&
1408 		    !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1409 			ret = -EPERM;
1410 			break;
1411 		}
1412 		sock_valbool_flag(sk, SOCK_TXTIME, true);
1413 		sk->sk_clockid = sk_txtime.clockid;
1414 		sk->sk_txtime_deadline_mode =
1415 			!!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE);
1416 		sk->sk_txtime_report_errors =
1417 			!!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS);
1418 		break;
1419 
1420 	case SO_BINDTOIFINDEX:
1421 		ret = sock_bindtoindex_locked(sk, val);
1422 		break;
1423 
1424 	case SO_BUF_LOCK:
1425 		if (val & ~SOCK_BUF_LOCK_MASK) {
1426 			ret = -EINVAL;
1427 			break;
1428 		}
1429 		sk->sk_userlocks = val | (sk->sk_userlocks &
1430 					  ~SOCK_BUF_LOCK_MASK);
1431 		break;
1432 
1433 	case SO_RESERVE_MEM:
1434 	{
1435 		int delta;
1436 
1437 		if (val < 0) {
1438 			ret = -EINVAL;
1439 			break;
1440 		}
1441 
1442 		delta = val - sk->sk_reserved_mem;
1443 		if (delta < 0)
1444 			sock_release_reserved_memory(sk, -delta);
1445 		else
1446 			ret = sock_reserve_memory(sk, delta);
1447 		break;
1448 	}
1449 
1450 	case SO_TXREHASH:
1451 		if (val < -1 || val > 1) {
1452 			ret = -EINVAL;
1453 			break;
1454 		}
1455 		/* Paired with READ_ONCE() in tcp_rtx_synack() */
1456 		WRITE_ONCE(sk->sk_txrehash, (u8)val);
1457 		break;
1458 
1459 	default:
1460 		ret = -ENOPROTOOPT;
1461 		break;
1462 	}
1463 	release_sock(sk);
1464 	return ret;
1465 }
1466 EXPORT_SYMBOL(sock_setsockopt);
1467 
1468 static const struct cred *sk_get_peer_cred(struct sock *sk)
1469 {
1470 	const struct cred *cred;
1471 
1472 	spin_lock(&sk->sk_peer_lock);
1473 	cred = get_cred(sk->sk_peer_cred);
1474 	spin_unlock(&sk->sk_peer_lock);
1475 
1476 	return cred;
1477 }
1478 
1479 static void cred_to_ucred(struct pid *pid, const struct cred *cred,
1480 			  struct ucred *ucred)
1481 {
1482 	ucred->pid = pid_vnr(pid);
1483 	ucred->uid = ucred->gid = -1;
1484 	if (cred) {
1485 		struct user_namespace *current_ns = current_user_ns();
1486 
1487 		ucred->uid = from_kuid_munged(current_ns, cred->euid);
1488 		ucred->gid = from_kgid_munged(current_ns, cred->egid);
1489 	}
1490 }
1491 
1492 static int groups_to_user(gid_t __user *dst, const struct group_info *src)
1493 {
1494 	struct user_namespace *user_ns = current_user_ns();
1495 	int i;
1496 
1497 	for (i = 0; i < src->ngroups; i++)
1498 		if (put_user(from_kgid_munged(user_ns, src->gid[i]), dst + i))
1499 			return -EFAULT;
1500 
1501 	return 0;
1502 }
1503 
1504 int sock_getsockopt(struct socket *sock, int level, int optname,
1505 		    char __user *optval, int __user *optlen)
1506 {
1507 	struct sock *sk = sock->sk;
1508 
1509 	union {
1510 		int val;
1511 		u64 val64;
1512 		unsigned long ulval;
1513 		struct linger ling;
1514 		struct old_timeval32 tm32;
1515 		struct __kernel_old_timeval tm;
1516 		struct  __kernel_sock_timeval stm;
1517 		struct sock_txtime txtime;
1518 		struct so_timestamping timestamping;
1519 	} v;
1520 
1521 	int lv = sizeof(int);
1522 	int len;
1523 
1524 	if (get_user(len, optlen))
1525 		return -EFAULT;
1526 	if (len < 0)
1527 		return -EINVAL;
1528 
1529 	memset(&v, 0, sizeof(v));
1530 
1531 	switch (optname) {
1532 	case SO_DEBUG:
1533 		v.val = sock_flag(sk, SOCK_DBG);
1534 		break;
1535 
1536 	case SO_DONTROUTE:
1537 		v.val = sock_flag(sk, SOCK_LOCALROUTE);
1538 		break;
1539 
1540 	case SO_BROADCAST:
1541 		v.val = sock_flag(sk, SOCK_BROADCAST);
1542 		break;
1543 
1544 	case SO_SNDBUF:
1545 		v.val = sk->sk_sndbuf;
1546 		break;
1547 
1548 	case SO_RCVBUF:
1549 		v.val = sk->sk_rcvbuf;
1550 		break;
1551 
1552 	case SO_REUSEADDR:
1553 		v.val = sk->sk_reuse;
1554 		break;
1555 
1556 	case SO_REUSEPORT:
1557 		v.val = sk->sk_reuseport;
1558 		break;
1559 
1560 	case SO_KEEPALIVE:
1561 		v.val = sock_flag(sk, SOCK_KEEPOPEN);
1562 		break;
1563 
1564 	case SO_TYPE:
1565 		v.val = sk->sk_type;
1566 		break;
1567 
1568 	case SO_PROTOCOL:
1569 		v.val = sk->sk_protocol;
1570 		break;
1571 
1572 	case SO_DOMAIN:
1573 		v.val = sk->sk_family;
1574 		break;
1575 
1576 	case SO_ERROR:
1577 		v.val = -sock_error(sk);
1578 		if (v.val == 0)
1579 			v.val = xchg(&sk->sk_err_soft, 0);
1580 		break;
1581 
1582 	case SO_OOBINLINE:
1583 		v.val = sock_flag(sk, SOCK_URGINLINE);
1584 		break;
1585 
1586 	case SO_NO_CHECK:
1587 		v.val = sk->sk_no_check_tx;
1588 		break;
1589 
1590 	case SO_PRIORITY:
1591 		v.val = sk->sk_priority;
1592 		break;
1593 
1594 	case SO_LINGER:
1595 		lv		= sizeof(v.ling);
1596 		v.ling.l_onoff	= sock_flag(sk, SOCK_LINGER);
1597 		v.ling.l_linger	= sk->sk_lingertime / HZ;
1598 		break;
1599 
1600 	case SO_BSDCOMPAT:
1601 		break;
1602 
1603 	case SO_TIMESTAMP_OLD:
1604 		v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1605 				!sock_flag(sk, SOCK_TSTAMP_NEW) &&
1606 				!sock_flag(sk, SOCK_RCVTSTAMPNS);
1607 		break;
1608 
1609 	case SO_TIMESTAMPNS_OLD:
1610 		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && !sock_flag(sk, SOCK_TSTAMP_NEW);
1611 		break;
1612 
1613 	case SO_TIMESTAMP_NEW:
1614 		v.val = sock_flag(sk, SOCK_RCVTSTAMP) && sock_flag(sk, SOCK_TSTAMP_NEW);
1615 		break;
1616 
1617 	case SO_TIMESTAMPNS_NEW:
1618 		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && sock_flag(sk, SOCK_TSTAMP_NEW);
1619 		break;
1620 
1621 	case SO_TIMESTAMPING_OLD:
1622 		lv = sizeof(v.timestamping);
1623 		v.timestamping.flags = sk->sk_tsflags;
1624 		v.timestamping.bind_phc = sk->sk_bind_phc;
1625 		break;
1626 
1627 	case SO_RCVTIMEO_OLD:
1628 	case SO_RCVTIMEO_NEW:
1629 		lv = sock_get_timeout(sk->sk_rcvtimeo, &v, SO_RCVTIMEO_OLD == optname);
1630 		break;
1631 
1632 	case SO_SNDTIMEO_OLD:
1633 	case SO_SNDTIMEO_NEW:
1634 		lv = sock_get_timeout(sk->sk_sndtimeo, &v, SO_SNDTIMEO_OLD == optname);
1635 		break;
1636 
1637 	case SO_RCVLOWAT:
1638 		v.val = sk->sk_rcvlowat;
1639 		break;
1640 
1641 	case SO_SNDLOWAT:
1642 		v.val = 1;
1643 		break;
1644 
1645 	case SO_PASSCRED:
1646 		v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1647 		break;
1648 
1649 	case SO_PEERCRED:
1650 	{
1651 		struct ucred peercred;
1652 		if (len > sizeof(peercred))
1653 			len = sizeof(peercred);
1654 
1655 		spin_lock(&sk->sk_peer_lock);
1656 		cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1657 		spin_unlock(&sk->sk_peer_lock);
1658 
1659 		if (copy_to_user(optval, &peercred, len))
1660 			return -EFAULT;
1661 		goto lenout;
1662 	}
1663 
1664 	case SO_PEERGROUPS:
1665 	{
1666 		const struct cred *cred;
1667 		int ret, n;
1668 
1669 		cred = sk_get_peer_cred(sk);
1670 		if (!cred)
1671 			return -ENODATA;
1672 
1673 		n = cred->group_info->ngroups;
1674 		if (len < n * sizeof(gid_t)) {
1675 			len = n * sizeof(gid_t);
1676 			put_cred(cred);
1677 			return put_user(len, optlen) ? -EFAULT : -ERANGE;
1678 		}
1679 		len = n * sizeof(gid_t);
1680 
1681 		ret = groups_to_user((gid_t __user *)optval, cred->group_info);
1682 		put_cred(cred);
1683 		if (ret)
1684 			return ret;
1685 		goto lenout;
1686 	}
1687 
1688 	case SO_PEERNAME:
1689 	{
1690 		char address[128];
1691 
1692 		lv = sock->ops->getname(sock, (struct sockaddr *)address, 2);
1693 		if (lv < 0)
1694 			return -ENOTCONN;
1695 		if (lv < len)
1696 			return -EINVAL;
1697 		if (copy_to_user(optval, address, len))
1698 			return -EFAULT;
1699 		goto lenout;
1700 	}
1701 
1702 	/* Dubious BSD thing... Probably nobody even uses it, but
1703 	 * the UNIX standard wants it for whatever reason... -DaveM
1704 	 */
1705 	case SO_ACCEPTCONN:
1706 		v.val = sk->sk_state == TCP_LISTEN;
1707 		break;
1708 
1709 	case SO_PASSSEC:
1710 		v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1711 		break;
1712 
1713 	case SO_PEERSEC:
1714 		return security_socket_getpeersec_stream(sock, optval, optlen, len);
1715 
1716 	case SO_MARK:
1717 		v.val = sk->sk_mark;
1718 		break;
1719 
1720 	case SO_RXQ_OVFL:
1721 		v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1722 		break;
1723 
1724 	case SO_WIFI_STATUS:
1725 		v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1726 		break;
1727 
1728 	case SO_PEEK_OFF:
1729 		if (!sock->ops->set_peek_off)
1730 			return -EOPNOTSUPP;
1731 
1732 		v.val = sk->sk_peek_off;
1733 		break;
1734 	case SO_NOFCS:
1735 		v.val = sock_flag(sk, SOCK_NOFCS);
1736 		break;
1737 
1738 	case SO_BINDTODEVICE:
1739 		return sock_getbindtodevice(sk, optval, optlen, len);
1740 
1741 	case SO_GET_FILTER:
1742 		len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
1743 		if (len < 0)
1744 			return len;
1745 
1746 		goto lenout;
1747 
1748 	case SO_LOCK_FILTER:
1749 		v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1750 		break;
1751 
1752 	case SO_BPF_EXTENSIONS:
1753 		v.val = bpf_tell_extensions();
1754 		break;
1755 
1756 	case SO_SELECT_ERR_QUEUE:
1757 		v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1758 		break;
1759 
1760 #ifdef CONFIG_NET_RX_BUSY_POLL
1761 	case SO_BUSY_POLL:
1762 		v.val = sk->sk_ll_usec;
1763 		break;
1764 	case SO_PREFER_BUSY_POLL:
1765 		v.val = READ_ONCE(sk->sk_prefer_busy_poll);
1766 		break;
1767 #endif
1768 
1769 	case SO_MAX_PACING_RATE:
1770 		if (sizeof(v.ulval) != sizeof(v.val) && len >= sizeof(v.ulval)) {
1771 			lv = sizeof(v.ulval);
1772 			v.ulval = sk->sk_max_pacing_rate;
1773 		} else {
1774 			/* 32bit version */
1775 			v.val = min_t(unsigned long, sk->sk_max_pacing_rate, ~0U);
1776 		}
1777 		break;
1778 
1779 	case SO_INCOMING_CPU:
1780 		v.val = READ_ONCE(sk->sk_incoming_cpu);
1781 		break;
1782 
1783 	case SO_MEMINFO:
1784 	{
1785 		u32 meminfo[SK_MEMINFO_VARS];
1786 
1787 		sk_get_meminfo(sk, meminfo);
1788 
1789 		len = min_t(unsigned int, len, sizeof(meminfo));
1790 		if (copy_to_user(optval, &meminfo, len))
1791 			return -EFAULT;
1792 
1793 		goto lenout;
1794 	}
1795 
1796 #ifdef CONFIG_NET_RX_BUSY_POLL
1797 	case SO_INCOMING_NAPI_ID:
1798 		v.val = READ_ONCE(sk->sk_napi_id);
1799 
1800 		/* aggregate non-NAPI IDs down to 0 */
1801 		if (v.val < MIN_NAPI_ID)
1802 			v.val = 0;
1803 
1804 		break;
1805 #endif
1806 
1807 	case SO_COOKIE:
1808 		lv = sizeof(u64);
1809 		if (len < lv)
1810 			return -EINVAL;
1811 		v.val64 = sock_gen_cookie(sk);
1812 		break;
1813 
1814 	case SO_ZEROCOPY:
1815 		v.val = sock_flag(sk, SOCK_ZEROCOPY);
1816 		break;
1817 
1818 	case SO_TXTIME:
1819 		lv = sizeof(v.txtime);
1820 		v.txtime.clockid = sk->sk_clockid;
1821 		v.txtime.flags |= sk->sk_txtime_deadline_mode ?
1822 				  SOF_TXTIME_DEADLINE_MODE : 0;
1823 		v.txtime.flags |= sk->sk_txtime_report_errors ?
1824 				  SOF_TXTIME_REPORT_ERRORS : 0;
1825 		break;
1826 
1827 	case SO_BINDTOIFINDEX:
1828 		v.val = sk->sk_bound_dev_if;
1829 		break;
1830 
1831 	case SO_NETNS_COOKIE:
1832 		lv = sizeof(u64);
1833 		if (len != lv)
1834 			return -EINVAL;
1835 		v.val64 = sock_net(sk)->net_cookie;
1836 		break;
1837 
1838 	case SO_BUF_LOCK:
1839 		v.val = sk->sk_userlocks & SOCK_BUF_LOCK_MASK;
1840 		break;
1841 
1842 	case SO_RESERVE_MEM:
1843 		v.val = sk->sk_reserved_mem;
1844 		break;
1845 
1846 	case SO_TXREHASH:
1847 		v.val = sk->sk_txrehash;
1848 		break;
1849 
1850 	default:
1851 		/* We implement the SO_SNDLOWAT etc to not be settable
1852 		 * (1003.1g 7).
1853 		 */
1854 		return -ENOPROTOOPT;
1855 	}
1856 
1857 	if (len > lv)
1858 		len = lv;
1859 	if (copy_to_user(optval, &v, len))
1860 		return -EFAULT;
1861 lenout:
1862 	if (put_user(len, optlen))
1863 		return -EFAULT;
1864 	return 0;
1865 }
1866 
1867 /*
1868  * Initialize an sk_lock.
1869  *
1870  * (We also register the sk_lock with the lock validator.)
1871  */
1872 static inline void sock_lock_init(struct sock *sk)
1873 {
1874 	if (sk->sk_kern_sock)
1875 		sock_lock_init_class_and_name(
1876 			sk,
1877 			af_family_kern_slock_key_strings[sk->sk_family],
1878 			af_family_kern_slock_keys + sk->sk_family,
1879 			af_family_kern_key_strings[sk->sk_family],
1880 			af_family_kern_keys + sk->sk_family);
1881 	else
1882 		sock_lock_init_class_and_name(
1883 			sk,
1884 			af_family_slock_key_strings[sk->sk_family],
1885 			af_family_slock_keys + sk->sk_family,
1886 			af_family_key_strings[sk->sk_family],
1887 			af_family_keys + sk->sk_family);
1888 }
1889 
1890 /*
1891  * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
1892  * even temporarly, because of RCU lookups. sk_node should also be left as is.
1893  * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
1894  */
1895 static void sock_copy(struct sock *nsk, const struct sock *osk)
1896 {
1897 	const struct proto *prot = READ_ONCE(osk->sk_prot);
1898 #ifdef CONFIG_SECURITY_NETWORK
1899 	void *sptr = nsk->sk_security;
1900 #endif
1901 
1902 	/* If we move sk_tx_queue_mapping out of the private section,
1903 	 * we must check if sk_tx_queue_clear() is called after
1904 	 * sock_copy() in sk_clone_lock().
1905 	 */
1906 	BUILD_BUG_ON(offsetof(struct sock, sk_tx_queue_mapping) <
1907 		     offsetof(struct sock, sk_dontcopy_begin) ||
1908 		     offsetof(struct sock, sk_tx_queue_mapping) >=
1909 		     offsetof(struct sock, sk_dontcopy_end));
1910 
1911 	memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1912 
1913 	memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1914 	       prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1915 
1916 #ifdef CONFIG_SECURITY_NETWORK
1917 	nsk->sk_security = sptr;
1918 	security_sk_clone(osk, nsk);
1919 #endif
1920 }
1921 
1922 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1923 		int family)
1924 {
1925 	struct sock *sk;
1926 	struct kmem_cache *slab;
1927 
1928 	slab = prot->slab;
1929 	if (slab != NULL) {
1930 		sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1931 		if (!sk)
1932 			return sk;
1933 		if (want_init_on_alloc(priority))
1934 			sk_prot_clear_nulls(sk, prot->obj_size);
1935 	} else
1936 		sk = kmalloc(prot->obj_size, priority);
1937 
1938 	if (sk != NULL) {
1939 		if (security_sk_alloc(sk, family, priority))
1940 			goto out_free;
1941 
1942 		if (!try_module_get(prot->owner))
1943 			goto out_free_sec;
1944 	}
1945 
1946 	return sk;
1947 
1948 out_free_sec:
1949 	security_sk_free(sk);
1950 out_free:
1951 	if (slab != NULL)
1952 		kmem_cache_free(slab, sk);
1953 	else
1954 		kfree(sk);
1955 	return NULL;
1956 }
1957 
1958 static void sk_prot_free(struct proto *prot, struct sock *sk)
1959 {
1960 	struct kmem_cache *slab;
1961 	struct module *owner;
1962 
1963 	owner = prot->owner;
1964 	slab = prot->slab;
1965 
1966 	cgroup_sk_free(&sk->sk_cgrp_data);
1967 	mem_cgroup_sk_free(sk);
1968 	security_sk_free(sk);
1969 	if (slab != NULL)
1970 		kmem_cache_free(slab, sk);
1971 	else
1972 		kfree(sk);
1973 	module_put(owner);
1974 }
1975 
1976 /**
1977  *	sk_alloc - All socket objects are allocated here
1978  *	@net: the applicable net namespace
1979  *	@family: protocol family
1980  *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1981  *	@prot: struct proto associated with this new sock instance
1982  *	@kern: is this to be a kernel socket?
1983  */
1984 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1985 		      struct proto *prot, int kern)
1986 {
1987 	struct sock *sk;
1988 
1989 	sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1990 	if (sk) {
1991 		sk->sk_family = family;
1992 		/*
1993 		 * See comment in struct sock definition to understand
1994 		 * why we need sk_prot_creator -acme
1995 		 */
1996 		sk->sk_prot = sk->sk_prot_creator = prot;
1997 		sk->sk_kern_sock = kern;
1998 		sock_lock_init(sk);
1999 		sk->sk_net_refcnt = kern ? 0 : 1;
2000 		if (likely(sk->sk_net_refcnt)) {
2001 			get_net_track(net, &sk->ns_tracker, priority);
2002 			sock_inuse_add(net, 1);
2003 		}
2004 
2005 		sock_net_set(sk, net);
2006 		refcount_set(&sk->sk_wmem_alloc, 1);
2007 
2008 		mem_cgroup_sk_alloc(sk);
2009 		cgroup_sk_alloc(&sk->sk_cgrp_data);
2010 		sock_update_classid(&sk->sk_cgrp_data);
2011 		sock_update_netprioidx(&sk->sk_cgrp_data);
2012 		sk_tx_queue_clear(sk);
2013 	}
2014 
2015 	return sk;
2016 }
2017 EXPORT_SYMBOL(sk_alloc);
2018 
2019 /* Sockets having SOCK_RCU_FREE will call this function after one RCU
2020  * grace period. This is the case for UDP sockets and TCP listeners.
2021  */
2022 static void __sk_destruct(struct rcu_head *head)
2023 {
2024 	struct sock *sk = container_of(head, struct sock, sk_rcu);
2025 	struct sk_filter *filter;
2026 
2027 	if (sk->sk_destruct)
2028 		sk->sk_destruct(sk);
2029 
2030 	filter = rcu_dereference_check(sk->sk_filter,
2031 				       refcount_read(&sk->sk_wmem_alloc) == 0);
2032 	if (filter) {
2033 		sk_filter_uncharge(sk, filter);
2034 		RCU_INIT_POINTER(sk->sk_filter, NULL);
2035 	}
2036 
2037 	sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
2038 
2039 #ifdef CONFIG_BPF_SYSCALL
2040 	bpf_sk_storage_free(sk);
2041 #endif
2042 
2043 	if (atomic_read(&sk->sk_omem_alloc))
2044 		pr_debug("%s: optmem leakage (%d bytes) detected\n",
2045 			 __func__, atomic_read(&sk->sk_omem_alloc));
2046 
2047 	if (sk->sk_frag.page) {
2048 		put_page(sk->sk_frag.page);
2049 		sk->sk_frag.page = NULL;
2050 	}
2051 
2052 	/* We do not need to acquire sk->sk_peer_lock, we are the last user. */
2053 	put_cred(sk->sk_peer_cred);
2054 	put_pid(sk->sk_peer_pid);
2055 
2056 	if (likely(sk->sk_net_refcnt))
2057 		put_net_track(sock_net(sk), &sk->ns_tracker);
2058 	sk_prot_free(sk->sk_prot_creator, sk);
2059 }
2060 
2061 void sk_destruct(struct sock *sk)
2062 {
2063 	bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE);
2064 
2065 	WARN_ON_ONCE(!llist_empty(&sk->defer_list));
2066 	sk_defer_free_flush(sk);
2067 
2068 	if (rcu_access_pointer(sk->sk_reuseport_cb)) {
2069 		reuseport_detach_sock(sk);
2070 		use_call_rcu = true;
2071 	}
2072 
2073 	if (use_call_rcu)
2074 		call_rcu(&sk->sk_rcu, __sk_destruct);
2075 	else
2076 		__sk_destruct(&sk->sk_rcu);
2077 }
2078 
2079 static void __sk_free(struct sock *sk)
2080 {
2081 	if (likely(sk->sk_net_refcnt))
2082 		sock_inuse_add(sock_net(sk), -1);
2083 
2084 	if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk)))
2085 		sock_diag_broadcast_destroy(sk);
2086 	else
2087 		sk_destruct(sk);
2088 }
2089 
2090 void sk_free(struct sock *sk)
2091 {
2092 	/*
2093 	 * We subtract one from sk_wmem_alloc and can know if
2094 	 * some packets are still in some tx queue.
2095 	 * If not null, sock_wfree() will call __sk_free(sk) later
2096 	 */
2097 	if (refcount_dec_and_test(&sk->sk_wmem_alloc))
2098 		__sk_free(sk);
2099 }
2100 EXPORT_SYMBOL(sk_free);
2101 
2102 static void sk_init_common(struct sock *sk)
2103 {
2104 	skb_queue_head_init(&sk->sk_receive_queue);
2105 	skb_queue_head_init(&sk->sk_write_queue);
2106 	skb_queue_head_init(&sk->sk_error_queue);
2107 
2108 	rwlock_init(&sk->sk_callback_lock);
2109 	lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
2110 			af_rlock_keys + sk->sk_family,
2111 			af_family_rlock_key_strings[sk->sk_family]);
2112 	lockdep_set_class_and_name(&sk->sk_write_queue.lock,
2113 			af_wlock_keys + sk->sk_family,
2114 			af_family_wlock_key_strings[sk->sk_family]);
2115 	lockdep_set_class_and_name(&sk->sk_error_queue.lock,
2116 			af_elock_keys + sk->sk_family,
2117 			af_family_elock_key_strings[sk->sk_family]);
2118 	lockdep_set_class_and_name(&sk->sk_callback_lock,
2119 			af_callback_keys + sk->sk_family,
2120 			af_family_clock_key_strings[sk->sk_family]);
2121 }
2122 
2123 /**
2124  *	sk_clone_lock - clone a socket, and lock its clone
2125  *	@sk: the socket to clone
2126  *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
2127  *
2128  *	Caller must unlock socket even in error path (bh_unlock_sock(newsk))
2129  */
2130 struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
2131 {
2132 	struct proto *prot = READ_ONCE(sk->sk_prot);
2133 	struct sk_filter *filter;
2134 	bool is_charged = true;
2135 	struct sock *newsk;
2136 
2137 	newsk = sk_prot_alloc(prot, priority, sk->sk_family);
2138 	if (!newsk)
2139 		goto out;
2140 
2141 	sock_copy(newsk, sk);
2142 
2143 	newsk->sk_prot_creator = prot;
2144 
2145 	/* SANITY */
2146 	if (likely(newsk->sk_net_refcnt)) {
2147 		get_net_track(sock_net(newsk), &newsk->ns_tracker, priority);
2148 		sock_inuse_add(sock_net(newsk), 1);
2149 	}
2150 	sk_node_init(&newsk->sk_node);
2151 	sock_lock_init(newsk);
2152 	bh_lock_sock(newsk);
2153 	newsk->sk_backlog.head	= newsk->sk_backlog.tail = NULL;
2154 	newsk->sk_backlog.len = 0;
2155 
2156 	atomic_set(&newsk->sk_rmem_alloc, 0);
2157 
2158 	/* sk_wmem_alloc set to one (see sk_free() and sock_wfree()) */
2159 	refcount_set(&newsk->sk_wmem_alloc, 1);
2160 
2161 	atomic_set(&newsk->sk_omem_alloc, 0);
2162 	sk_init_common(newsk);
2163 
2164 	newsk->sk_dst_cache	= NULL;
2165 	newsk->sk_dst_pending_confirm = 0;
2166 	newsk->sk_wmem_queued	= 0;
2167 	newsk->sk_forward_alloc = 0;
2168 	newsk->sk_reserved_mem  = 0;
2169 	atomic_set(&newsk->sk_drops, 0);
2170 	newsk->sk_send_head	= NULL;
2171 	newsk->sk_userlocks	= sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
2172 	atomic_set(&newsk->sk_zckey, 0);
2173 
2174 	sock_reset_flag(newsk, SOCK_DONE);
2175 
2176 	/* sk->sk_memcg will be populated at accept() time */
2177 	newsk->sk_memcg = NULL;
2178 
2179 	cgroup_sk_clone(&newsk->sk_cgrp_data);
2180 
2181 	rcu_read_lock();
2182 	filter = rcu_dereference(sk->sk_filter);
2183 	if (filter != NULL)
2184 		/* though it's an empty new sock, the charging may fail
2185 		 * if sysctl_optmem_max was changed between creation of
2186 		 * original socket and cloning
2187 		 */
2188 		is_charged = sk_filter_charge(newsk, filter);
2189 	RCU_INIT_POINTER(newsk->sk_filter, filter);
2190 	rcu_read_unlock();
2191 
2192 	if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
2193 		/* We need to make sure that we don't uncharge the new
2194 		 * socket if we couldn't charge it in the first place
2195 		 * as otherwise we uncharge the parent's filter.
2196 		 */
2197 		if (!is_charged)
2198 			RCU_INIT_POINTER(newsk->sk_filter, NULL);
2199 		sk_free_unlock_clone(newsk);
2200 		newsk = NULL;
2201 		goto out;
2202 	}
2203 	RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
2204 
2205 	if (bpf_sk_storage_clone(sk, newsk)) {
2206 		sk_free_unlock_clone(newsk);
2207 		newsk = NULL;
2208 		goto out;
2209 	}
2210 
2211 	/* Clear sk_user_data if parent had the pointer tagged
2212 	 * as not suitable for copying when cloning.
2213 	 */
2214 	if (sk_user_data_is_nocopy(newsk))
2215 		newsk->sk_user_data = NULL;
2216 
2217 	newsk->sk_err	   = 0;
2218 	newsk->sk_err_soft = 0;
2219 	newsk->sk_priority = 0;
2220 	newsk->sk_incoming_cpu = raw_smp_processor_id();
2221 
2222 	/* Before updating sk_refcnt, we must commit prior changes to memory
2223 	 * (Documentation/RCU/rculist_nulls.rst for details)
2224 	 */
2225 	smp_wmb();
2226 	refcount_set(&newsk->sk_refcnt, 2);
2227 
2228 	/* Increment the counter in the same struct proto as the master
2229 	 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
2230 	 * is the same as sk->sk_prot->socks, as this field was copied
2231 	 * with memcpy).
2232 	 *
2233 	 * This _changes_ the previous behaviour, where
2234 	 * tcp_create_openreq_child always was incrementing the
2235 	 * equivalent to tcp_prot->socks (inet_sock_nr), so this have
2236 	 * to be taken into account in all callers. -acme
2237 	 */
2238 	sk_refcnt_debug_inc(newsk);
2239 	sk_set_socket(newsk, NULL);
2240 	sk_tx_queue_clear(newsk);
2241 	RCU_INIT_POINTER(newsk->sk_wq, NULL);
2242 
2243 	if (newsk->sk_prot->sockets_allocated)
2244 		sk_sockets_allocated_inc(newsk);
2245 
2246 	if (sock_needs_netstamp(sk) && newsk->sk_flags & SK_FLAGS_TIMESTAMP)
2247 		net_enable_timestamp();
2248 out:
2249 	return newsk;
2250 }
2251 EXPORT_SYMBOL_GPL(sk_clone_lock);
2252 
2253 void sk_free_unlock_clone(struct sock *sk)
2254 {
2255 	/* It is still raw copy of parent, so invalidate
2256 	 * destructor and make plain sk_free() */
2257 	sk->sk_destruct = NULL;
2258 	bh_unlock_sock(sk);
2259 	sk_free(sk);
2260 }
2261 EXPORT_SYMBOL_GPL(sk_free_unlock_clone);
2262 
2263 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
2264 {
2265 	u32 max_segs = 1;
2266 
2267 	sk_dst_set(sk, dst);
2268 	sk->sk_route_caps = dst->dev->features;
2269 	if (sk_is_tcp(sk))
2270 		sk->sk_route_caps |= NETIF_F_GSO;
2271 	if (sk->sk_route_caps & NETIF_F_GSO)
2272 		sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
2273 	if (unlikely(sk->sk_gso_disabled))
2274 		sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
2275 	if (sk_can_gso(sk)) {
2276 		if (dst->header_len && !xfrm_dst_offload_ok(dst)) {
2277 			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
2278 		} else {
2279 			sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
2280 			/* pairs with the WRITE_ONCE() in netif_set_gso_max_size() */
2281 			sk->sk_gso_max_size = READ_ONCE(dst->dev->gso_max_size);
2282 			sk->sk_gso_max_size -= (MAX_TCP_HEADER + 1);
2283 			/* pairs with the WRITE_ONCE() in netif_set_gso_max_segs() */
2284 			max_segs = max_t(u32, READ_ONCE(dst->dev->gso_max_segs), 1);
2285 		}
2286 	}
2287 	sk->sk_gso_max_segs = max_segs;
2288 }
2289 EXPORT_SYMBOL_GPL(sk_setup_caps);
2290 
2291 /*
2292  *	Simple resource managers for sockets.
2293  */
2294 
2295 
2296 /*
2297  * Write buffer destructor automatically called from kfree_skb.
2298  */
2299 void sock_wfree(struct sk_buff *skb)
2300 {
2301 	struct sock *sk = skb->sk;
2302 	unsigned int len = skb->truesize;
2303 
2304 	if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
2305 		/*
2306 		 * Keep a reference on sk_wmem_alloc, this will be released
2307 		 * after sk_write_space() call
2308 		 */
2309 		WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
2310 		sk->sk_write_space(sk);
2311 		len = 1;
2312 	}
2313 	/*
2314 	 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
2315 	 * could not do because of in-flight packets
2316 	 */
2317 	if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
2318 		__sk_free(sk);
2319 }
2320 EXPORT_SYMBOL(sock_wfree);
2321 
2322 /* This variant of sock_wfree() is used by TCP,
2323  * since it sets SOCK_USE_WRITE_QUEUE.
2324  */
2325 void __sock_wfree(struct sk_buff *skb)
2326 {
2327 	struct sock *sk = skb->sk;
2328 
2329 	if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
2330 		__sk_free(sk);
2331 }
2332 
2333 void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
2334 {
2335 	skb_orphan(skb);
2336 	skb->sk = sk;
2337 #ifdef CONFIG_INET
2338 	if (unlikely(!sk_fullsock(sk))) {
2339 		skb->destructor = sock_edemux;
2340 		sock_hold(sk);
2341 		return;
2342 	}
2343 #endif
2344 	skb->destructor = sock_wfree;
2345 	skb_set_hash_from_sk(skb, sk);
2346 	/*
2347 	 * We used to take a refcount on sk, but following operation
2348 	 * is enough to guarantee sk_free() wont free this sock until
2349 	 * all in-flight packets are completed
2350 	 */
2351 	refcount_add(skb->truesize, &sk->sk_wmem_alloc);
2352 }
2353 EXPORT_SYMBOL(skb_set_owner_w);
2354 
2355 static bool can_skb_orphan_partial(const struct sk_buff *skb)
2356 {
2357 #ifdef CONFIG_TLS_DEVICE
2358 	/* Drivers depend on in-order delivery for crypto offload,
2359 	 * partial orphan breaks out-of-order-OK logic.
2360 	 */
2361 	if (skb->decrypted)
2362 		return false;
2363 #endif
2364 	return (skb->destructor == sock_wfree ||
2365 		(IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree));
2366 }
2367 
2368 /* This helper is used by netem, as it can hold packets in its
2369  * delay queue. We want to allow the owner socket to send more
2370  * packets, as if they were already TX completed by a typical driver.
2371  * But we also want to keep skb->sk set because some packet schedulers
2372  * rely on it (sch_fq for example).
2373  */
2374 void skb_orphan_partial(struct sk_buff *skb)
2375 {
2376 	if (skb_is_tcp_pure_ack(skb))
2377 		return;
2378 
2379 	if (can_skb_orphan_partial(skb) && skb_set_owner_sk_safe(skb, skb->sk))
2380 		return;
2381 
2382 	skb_orphan(skb);
2383 }
2384 EXPORT_SYMBOL(skb_orphan_partial);
2385 
2386 /*
2387  * Read buffer destructor automatically called from kfree_skb.
2388  */
2389 void sock_rfree(struct sk_buff *skb)
2390 {
2391 	struct sock *sk = skb->sk;
2392 	unsigned int len = skb->truesize;
2393 
2394 	atomic_sub(len, &sk->sk_rmem_alloc);
2395 	sk_mem_uncharge(sk, len);
2396 }
2397 EXPORT_SYMBOL(sock_rfree);
2398 
2399 /*
2400  * Buffer destructor for skbs that are not used directly in read or write
2401  * path, e.g. for error handler skbs. Automatically called from kfree_skb.
2402  */
2403 void sock_efree(struct sk_buff *skb)
2404 {
2405 	sock_put(skb->sk);
2406 }
2407 EXPORT_SYMBOL(sock_efree);
2408 
/* Destructor for the prefetch/receive path, where the skb may point at
 * a socket without holding a reference on it (listen sockets, for
 * example).  A reference is only dropped for refcounted sockets.
 */
#ifdef CONFIG_INET
void sock_pfree(struct sk_buff *skb)
{
	if (!sk_is_refcounted(skb->sk))
		return;

	sock_gen_put(skb->sk);
}
EXPORT_SYMBOL(sock_pfree);
#endif /* CONFIG_INET */
2420 
2421 kuid_t sock_i_uid(struct sock *sk)
2422 {
2423 	kuid_t uid;
2424 
2425 	read_lock_bh(&sk->sk_callback_lock);
2426 	uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
2427 	read_unlock_bh(&sk->sk_callback_lock);
2428 	return uid;
2429 }
2430 EXPORT_SYMBOL(sock_i_uid);
2431 
2432 unsigned long sock_i_ino(struct sock *sk)
2433 {
2434 	unsigned long ino;
2435 
2436 	read_lock_bh(&sk->sk_callback_lock);
2437 	ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
2438 	read_unlock_bh(&sk->sk_callback_lock);
2439 	return ino;
2440 }
2441 EXPORT_SYMBOL(sock_i_ino);
2442 
2443 /*
2444  * Allocate a skb from the socket's send buffer.
2445  */
2446 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
2447 			     gfp_t priority)
2448 {
2449 	if (force ||
2450 	    refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) {
2451 		struct sk_buff *skb = alloc_skb(size, priority);
2452 
2453 		if (skb) {
2454 			skb_set_owner_w(skb, sk);
2455 			return skb;
2456 		}
2457 	}
2458 	return NULL;
2459 }
2460 EXPORT_SYMBOL(sock_wmalloc);
2461 
2462 static void sock_ofree(struct sk_buff *skb)
2463 {
2464 	struct sock *sk = skb->sk;
2465 
2466 	atomic_sub(skb->truesize, &sk->sk_omem_alloc);
2467 }
2468 
2469 struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
2470 			     gfp_t priority)
2471 {
2472 	struct sk_buff *skb;
2473 
2474 	/* small safe race: SKB_TRUESIZE may differ from final skb->truesize */
2475 	if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) >
2476 	    sysctl_optmem_max)
2477 		return NULL;
2478 
2479 	skb = alloc_skb(size, priority);
2480 	if (!skb)
2481 		return NULL;
2482 
2483 	atomic_add(skb->truesize, &sk->sk_omem_alloc);
2484 	skb->sk = sk;
2485 	skb->destructor = sock_ofree;
2486 	return skb;
2487 }
2488 
2489 /*
2490  * Allocate a memory block from the socket's option memory buffer.
2491  */
2492 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
2493 {
2494 	if ((unsigned int)size <= sysctl_optmem_max &&
2495 	    atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
2496 		void *mem;
2497 		/* First do the add, to avoid the race if kmalloc
2498 		 * might sleep.
2499 		 */
2500 		atomic_add(size, &sk->sk_omem_alloc);
2501 		mem = kmalloc(size, priority);
2502 		if (mem)
2503 			return mem;
2504 		atomic_sub(size, &sk->sk_omem_alloc);
2505 	}
2506 	return NULL;
2507 }
2508 EXPORT_SYMBOL(sock_kmalloc);
2509 
2510 /* Free an option memory block. Note, we actually want the inline
2511  * here as this allows gcc to detect the nullify and fold away the
2512  * condition entirely.
2513  */
2514 static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
2515 				  const bool nullify)
2516 {
2517 	if (WARN_ON_ONCE(!mem))
2518 		return;
2519 	if (nullify)
2520 		kfree_sensitive(mem);
2521 	else
2522 		kfree(mem);
2523 	atomic_sub(size, &sk->sk_omem_alloc);
2524 }
2525 
2526 void sock_kfree_s(struct sock *sk, void *mem, int size)
2527 {
2528 	__sock_kfree_s(sk, mem, size, false);
2529 }
2530 EXPORT_SYMBOL(sock_kfree_s);
2531 
2532 void sock_kzfree_s(struct sock *sk, void *mem, int size)
2533 {
2534 	__sock_kfree_s(sk, mem, size, true);
2535 }
2536 EXPORT_SYMBOL(sock_kzfree_s);
2537 
2538 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
2539    I think, these locks should be removed for datagram sockets.
2540  */
2541 static long sock_wait_for_wmem(struct sock *sk, long timeo)
2542 {
2543 	DEFINE_WAIT(wait);
2544 
2545 	sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2546 	for (;;) {
2547 		if (!timeo)
2548 			break;
2549 		if (signal_pending(current))
2550 			break;
2551 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2552 		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
2553 		if (refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf))
2554 			break;
2555 		if (sk->sk_shutdown & SEND_SHUTDOWN)
2556 			break;
2557 		if (sk->sk_err)
2558 			break;
2559 		timeo = schedule_timeout(timeo);
2560 	}
2561 	finish_wait(sk_sleep(sk), &wait);
2562 	return timeo;
2563 }
2564 
2565 
2566 /*
2567  *	Generic send/receive buffer handlers
2568  */
2569 
2570 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
2571 				     unsigned long data_len, int noblock,
2572 				     int *errcode, int max_page_order)
2573 {
2574 	struct sk_buff *skb;
2575 	long timeo;
2576 	int err;
2577 
2578 	timeo = sock_sndtimeo(sk, noblock);
2579 	for (;;) {
2580 		err = sock_error(sk);
2581 		if (err != 0)
2582 			goto failure;
2583 
2584 		err = -EPIPE;
2585 		if (sk->sk_shutdown & SEND_SHUTDOWN)
2586 			goto failure;
2587 
2588 		if (sk_wmem_alloc_get(sk) < READ_ONCE(sk->sk_sndbuf))
2589 			break;
2590 
2591 		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2592 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2593 		err = -EAGAIN;
2594 		if (!timeo)
2595 			goto failure;
2596 		if (signal_pending(current))
2597 			goto interrupted;
2598 		timeo = sock_wait_for_wmem(sk, timeo);
2599 	}
2600 	skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
2601 				   errcode, sk->sk_allocation);
2602 	if (skb)
2603 		skb_set_owner_w(skb, sk);
2604 	return skb;
2605 
2606 interrupted:
2607 	err = sock_intr_errno(timeo);
2608 failure:
2609 	*errcode = err;
2610 	return NULL;
2611 }
2612 EXPORT_SYMBOL(sock_alloc_send_pskb);
2613 
/* Linear-only wrapper around sock_alloc_send_pskb(): @size bytes of
 * header space, no paged data and page order 0.
 */
struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
				    int noblock, int *errcode)
{
	const unsigned long data_len = 0;
	const int max_page_order = 0;

	return sock_alloc_send_pskb(sk, size, data_len, noblock, errcode,
				    max_page_order);
}
2619 EXPORT_SYMBOL(sock_alloc_send_skb);
2620 
2621 int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg,
2622 		     struct sockcm_cookie *sockc)
2623 {
2624 	u32 tsflags;
2625 
2626 	switch (cmsg->cmsg_type) {
2627 	case SO_MARK:
2628 		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
2629 		    !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2630 			return -EPERM;
2631 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2632 			return -EINVAL;
2633 		sockc->mark = *(u32 *)CMSG_DATA(cmsg);
2634 		break;
2635 	case SO_TIMESTAMPING_OLD:
2636 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2637 			return -EINVAL;
2638 
2639 		tsflags = *(u32 *)CMSG_DATA(cmsg);
2640 		if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
2641 			return -EINVAL;
2642 
2643 		sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
2644 		sockc->tsflags |= tsflags;
2645 		break;
2646 	case SCM_TXTIME:
2647 		if (!sock_flag(sk, SOCK_TXTIME))
2648 			return -EINVAL;
2649 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64)))
2650 			return -EINVAL;
2651 		sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg));
2652 		break;
2653 	/* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
2654 	case SCM_RIGHTS:
2655 	case SCM_CREDENTIALS:
2656 		break;
2657 	default:
2658 		return -EINVAL;
2659 	}
2660 	return 0;
2661 }
2662 EXPORT_SYMBOL(__sock_cmsg_send);
2663 
2664 int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
2665 		   struct sockcm_cookie *sockc)
2666 {
2667 	struct cmsghdr *cmsg;
2668 	int ret;
2669 
2670 	for_each_cmsghdr(cmsg, msg) {
2671 		if (!CMSG_OK(msg, cmsg))
2672 			return -EINVAL;
2673 		if (cmsg->cmsg_level != SOL_SOCKET)
2674 			continue;
2675 		ret = __sock_cmsg_send(sk, msg, cmsg, sockc);
2676 		if (ret)
2677 			return ret;
2678 	}
2679 	return 0;
2680 }
2681 EXPORT_SYMBOL(sock_cmsg_send);
2682 
2683 static void sk_enter_memory_pressure(struct sock *sk)
2684 {
2685 	if (!sk->sk_prot->enter_memory_pressure)
2686 		return;
2687 
2688 	sk->sk_prot->enter_memory_pressure(sk);
2689 }
2690 
2691 static void sk_leave_memory_pressure(struct sock *sk)
2692 {
2693 	if (sk->sk_prot->leave_memory_pressure) {
2694 		sk->sk_prot->leave_memory_pressure(sk);
2695 	} else {
2696 		unsigned long *memory_pressure = sk->sk_prot->memory_pressure;
2697 
2698 		if (memory_pressure && READ_ONCE(*memory_pressure))
2699 			WRITE_ONCE(*memory_pressure, 0);
2700 	}
2701 }
2702 
/* Static key tested by skb_page_frag_refill(): when the key is enabled
 * the high-order page allocation attempt is skipped.
 */
DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key);
2704 
2705 /**
2706  * skb_page_frag_refill - check that a page_frag contains enough room
2707  * @sz: minimum size of the fragment we want to get
2708  * @pfrag: pointer to page_frag
2709  * @gfp: priority for memory allocation
2710  *
2711  * Note: While this allocator tries to use high order pages, there is
2712  * no guarantee that allocations succeed. Therefore, @sz MUST be
2713  * less or equal than PAGE_SIZE.
2714  */
2715 bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
2716 {
2717 	if (pfrag->page) {
2718 		if (page_ref_count(pfrag->page) == 1) {
2719 			pfrag->offset = 0;
2720 			return true;
2721 		}
2722 		if (pfrag->offset + sz <= pfrag->size)
2723 			return true;
2724 		put_page(pfrag->page);
2725 	}
2726 
2727 	pfrag->offset = 0;
2728 	if (SKB_FRAG_PAGE_ORDER &&
2729 	    !static_branch_unlikely(&net_high_order_alloc_disable_key)) {
2730 		/* Avoid direct reclaim but allow kswapd to wake */
2731 		pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
2732 					  __GFP_COMP | __GFP_NOWARN |
2733 					  __GFP_NORETRY,
2734 					  SKB_FRAG_PAGE_ORDER);
2735 		if (likely(pfrag->page)) {
2736 			pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
2737 			return true;
2738 		}
2739 	}
2740 	pfrag->page = alloc_page(gfp);
2741 	if (likely(pfrag->page)) {
2742 		pfrag->size = PAGE_SIZE;
2743 		return true;
2744 	}
2745 	return false;
2746 }
2747 EXPORT_SYMBOL(skb_page_frag_refill);
2748 
2749 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
2750 {
2751 	if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
2752 		return true;
2753 
2754 	sk_enter_memory_pressure(sk);
2755 	sk_stream_moderate_sndbuf(sk);
2756 	return false;
2757 }
2758 EXPORT_SYMBOL(sk_page_frag_refill);
2759 
2760 void __lock_sock(struct sock *sk)
2761 	__releases(&sk->sk_lock.slock)
2762 	__acquires(&sk->sk_lock.slock)
2763 {
2764 	DEFINE_WAIT(wait);
2765 
2766 	for (;;) {
2767 		prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
2768 					TASK_UNINTERRUPTIBLE);
2769 		spin_unlock_bh(&sk->sk_lock.slock);
2770 		schedule();
2771 		spin_lock_bh(&sk->sk_lock.slock);
2772 		if (!sock_owned_by_user(sk))
2773 			break;
2774 	}
2775 	finish_wait(&sk->sk_lock.wq, &wait);
2776 }
2777 
2778 void __release_sock(struct sock *sk)
2779 	__releases(&sk->sk_lock.slock)
2780 	__acquires(&sk->sk_lock.slock)
2781 {
2782 	struct sk_buff *skb, *next;
2783 
2784 	while ((skb = sk->sk_backlog.head) != NULL) {
2785 		sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
2786 
2787 		spin_unlock_bh(&sk->sk_lock.slock);
2788 
2789 		do {
2790 			next = skb->next;
2791 			prefetch(next);
2792 			WARN_ON_ONCE(skb_dst_is_noref(skb));
2793 			skb_mark_not_on_list(skb);
2794 			sk_backlog_rcv(sk, skb);
2795 
2796 			cond_resched();
2797 
2798 			skb = next;
2799 		} while (skb != NULL);
2800 
2801 		spin_lock_bh(&sk->sk_lock.slock);
2802 	}
2803 
2804 	/*
2805 	 * Doing the zeroing here guarantee we can not loop forever
2806 	 * while a wild producer attempts to flood us.
2807 	 */
2808 	sk->sk_backlog.len = 0;
2809 }
2810 
2811 void __sk_flush_backlog(struct sock *sk)
2812 {
2813 	spin_lock_bh(&sk->sk_lock.slock);
2814 	__release_sock(sk);
2815 	spin_unlock_bh(&sk->sk_lock.slock);
2816 }
2817 
2818 /**
2819  * sk_wait_data - wait for data to arrive at sk_receive_queue
2820  * @sk:    sock to wait on
2821  * @timeo: for how long
2822  * @skb:   last skb seen on sk_receive_queue
2823  *
2824  * Now socket state including sk->sk_err is changed only under lock,
2825  * hence we may omit checks after joining wait queue.
2826  * We check receive queue before schedule() only as optimization;
2827  * it is very likely that release_sock() added new data.
2828  */
2829 int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
2830 {
2831 	DEFINE_WAIT_FUNC(wait, woken_wake_function);
2832 	int rc;
2833 
2834 	add_wait_queue(sk_sleep(sk), &wait);
2835 	sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2836 	rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
2837 	sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2838 	remove_wait_queue(sk_sleep(sk), &wait);
2839 	return rc;
2840 }
2841 EXPORT_SYMBOL(sk_wait_data);
2842 
2843 /**
2844  *	__sk_mem_raise_allocated - increase memory_allocated
2845  *	@sk: socket
2846  *	@size: memory size to allocate
2847  *	@amt: pages to allocate
2848  *	@kind: allocation type
2849  *
2850  *	Similar to __sk_mem_schedule(), but does not update sk_forward_alloc
2851  */
2852 int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
2853 {
2854 	struct proto *prot = sk->sk_prot;
2855 	long allocated = sk_memory_allocated_add(sk, amt);
2856 	bool memcg_charge = mem_cgroup_sockets_enabled && sk->sk_memcg;
2857 	bool charged = true;
2858 
2859 	if (memcg_charge &&
2860 	    !(charged = mem_cgroup_charge_skmem(sk->sk_memcg, amt,
2861 						gfp_memcg_charge())))
2862 		goto suppress_allocation;
2863 
2864 	/* Under limit. */
2865 	if (allocated <= sk_prot_mem_limits(sk, 0)) {
2866 		sk_leave_memory_pressure(sk);
2867 		return 1;
2868 	}
2869 
2870 	/* Under pressure. */
2871 	if (allocated > sk_prot_mem_limits(sk, 1))
2872 		sk_enter_memory_pressure(sk);
2873 
2874 	/* Over hard limit. */
2875 	if (allocated > sk_prot_mem_limits(sk, 2))
2876 		goto suppress_allocation;
2877 
2878 	/* guarantee minimum buffer size under pressure */
2879 	if (kind == SK_MEM_RECV) {
2880 		if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot))
2881 			return 1;
2882 
2883 	} else { /* SK_MEM_SEND */
2884 		int wmem0 = sk_get_wmem0(sk, prot);
2885 
2886 		if (sk->sk_type == SOCK_STREAM) {
2887 			if (sk->sk_wmem_queued < wmem0)
2888 				return 1;
2889 		} else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) {
2890 				return 1;
2891 		}
2892 	}
2893 
2894 	if (sk_has_memory_pressure(sk)) {
2895 		u64 alloc;
2896 
2897 		if (!sk_under_memory_pressure(sk))
2898 			return 1;
2899 		alloc = sk_sockets_allocated_read_positive(sk);
2900 		if (sk_prot_mem_limits(sk, 2) > alloc *
2901 		    sk_mem_pages(sk->sk_wmem_queued +
2902 				 atomic_read(&sk->sk_rmem_alloc) +
2903 				 sk->sk_forward_alloc))
2904 			return 1;
2905 	}
2906 
2907 suppress_allocation:
2908 
2909 	if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
2910 		sk_stream_moderate_sndbuf(sk);
2911 
2912 		/* Fail only if socket is _under_ its sndbuf.
2913 		 * In this case we cannot block, so that we have to fail.
2914 		 */
2915 		if (sk->sk_wmem_queued + size >= sk->sk_sndbuf) {
2916 			/* Force charge with __GFP_NOFAIL */
2917 			if (memcg_charge && !charged) {
2918 				mem_cgroup_charge_skmem(sk->sk_memcg, amt,
2919 					gfp_memcg_charge() | __GFP_NOFAIL);
2920 			}
2921 			return 1;
2922 		}
2923 	}
2924 
2925 	if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged))
2926 		trace_sock_exceed_buf_limit(sk, prot, allocated, kind);
2927 
2928 	sk_memory_allocated_sub(sk, amt);
2929 
2930 	if (memcg_charge && charged)
2931 		mem_cgroup_uncharge_skmem(sk->sk_memcg, amt);
2932 
2933 	return 0;
2934 }
2935 EXPORT_SYMBOL(__sk_mem_raise_allocated);
2936 
2937 /**
2938  *	__sk_mem_schedule - increase sk_forward_alloc and memory_allocated
2939  *	@sk: socket
2940  *	@size: memory size to allocate
2941  *	@kind: allocation type
2942  *
2943  *	If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
2944  *	rmem allocation. This function assumes that protocols which have
2945  *	memory_pressure use sk_wmem_queued as write buffer accounting.
2946  */
2947 int __sk_mem_schedule(struct sock *sk, int size, int kind)
2948 {
2949 	int ret, amt = sk_mem_pages(size);
2950 
2951 	sk->sk_forward_alloc += amt << SK_MEM_QUANTUM_SHIFT;
2952 	ret = __sk_mem_raise_allocated(sk, size, amt, kind);
2953 	if (!ret)
2954 		sk->sk_forward_alloc -= amt << SK_MEM_QUANTUM_SHIFT;
2955 	return ret;
2956 }
2957 EXPORT_SYMBOL(__sk_mem_schedule);
2958 
2959 /**
2960  *	__sk_mem_reduce_allocated - reclaim memory_allocated
2961  *	@sk: socket
2962  *	@amount: number of quanta
2963  *
2964  *	Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
2965  */
2966 void __sk_mem_reduce_allocated(struct sock *sk, int amount)
2967 {
2968 	sk_memory_allocated_sub(sk, amount);
2969 
2970 	if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2971 		mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
2972 
2973 	if (sk_under_memory_pressure(sk) &&
2974 	    (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
2975 		sk_leave_memory_pressure(sk);
2976 }
2977 EXPORT_SYMBOL(__sk_mem_reduce_allocated);
2978 
2979 /**
2980  *	__sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
2981  *	@sk: socket
2982  *	@amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple)
2983  */
2984 void __sk_mem_reclaim(struct sock *sk, int amount)
2985 {
2986 	amount >>= SK_MEM_QUANTUM_SHIFT;
2987 	sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
2988 	__sk_mem_reduce_allocated(sk, amount);
2989 }
2990 EXPORT_SYMBOL(__sk_mem_reclaim);
2991 
2992 int sk_set_peek_off(struct sock *sk, int val)
2993 {
2994 	sk->sk_peek_off = val;
2995 	return 0;
2996 }
2997 EXPORT_SYMBOL_GPL(sk_set_peek_off);
2998 
2999 /*
3000  * Set of default routines for initialising struct proto_ops when
3001  * the protocol does not support a particular function. In certain
3002  * cases where it makes no sense for a protocol to have a "do nothing"
3003  * function, some default processing is provided.
3004  */
3005 
/* Default bind handler for protocols that do not support the
 * operation: always fails with -EOPNOTSUPP.
 */
int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
{
	return -EOPNOTSUPP;
}
3010 EXPORT_SYMBOL(sock_no_bind);
3011 
/* Default connect handler for protocols that do not support the
 * operation: always fails with -EOPNOTSUPP.
 */
int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
		    int len, int flags)
{
	return -EOPNOTSUPP;
}
3017 EXPORT_SYMBOL(sock_no_connect);
3018 
/* Default socketpair handler for protocols that do not support the
 * operation: always fails with -EOPNOTSUPP.
 */
int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
{
	return -EOPNOTSUPP;
}
3023 EXPORT_SYMBOL(sock_no_socketpair);
3024 
/* Default accept handler for protocols that do not support the
 * operation: always fails with -EOPNOTSUPP.
 */
int sock_no_accept(struct socket *sock, struct socket *newsock, int flags,
		   bool kern)
{
	return -EOPNOTSUPP;
}
3030 EXPORT_SYMBOL(sock_no_accept);
3031 
/* Default getname handler for protocols that do not support the
 * operation: always fails with -EOPNOTSUPP.
 */
int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
		    int peer)
{
	return -EOPNOTSUPP;
}
3037 EXPORT_SYMBOL(sock_no_getname);
3038 
/* Default ioctl handler for protocols that do not support the
 * operation: always fails with -EOPNOTSUPP.
 */
int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	return -EOPNOTSUPP;
}
3043 EXPORT_SYMBOL(sock_no_ioctl);
3044 
/* Default listen handler for protocols that do not support the
 * operation: always fails with -EOPNOTSUPP.
 */
int sock_no_listen(struct socket *sock, int backlog)
{
	return -EOPNOTSUPP;
}
3049 EXPORT_SYMBOL(sock_no_listen);
3050 
/* Default shutdown handler for protocols that do not support the
 * operation: always fails with -EOPNOTSUPP.
 */
int sock_no_shutdown(struct socket *sock, int how)
{
	return -EOPNOTSUPP;
}
3055 EXPORT_SYMBOL(sock_no_shutdown);
3056 
/* Default sendmsg handler for protocols that do not support the
 * operation: always fails with -EOPNOTSUPP.
 */
int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
{
	return -EOPNOTSUPP;
}
3061 EXPORT_SYMBOL(sock_no_sendmsg);
3062 
/* Default sendmsg_locked handler for protocols that do not support the
 * operation: always fails with -EOPNOTSUPP.
 */
int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len)
{
	return -EOPNOTSUPP;
}
3067 EXPORT_SYMBOL(sock_no_sendmsg_locked);
3068 
/* Default recvmsg handler for protocols that do not support the
 * operation: always fails with -EOPNOTSUPP.
 */
int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
		    int flags)
{
	return -EOPNOTSUPP;
}
3074 EXPORT_SYMBOL(sock_no_recvmsg);
3075 
/* Default mmap handler for protocols that do not support the
 * operation: always fails with -ENODEV.
 */
int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
{
	/* Same error code as for a missing mmap method */
	return -ENODEV;
}
3081 EXPORT_SYMBOL(sock_no_mmap);
3082 
3083 /*
3084  * When a file is received (via SCM_RIGHTS, etc), we must bump the
3085  * various sock-based usage counts.
3086  */
3087 void __receive_sock(struct file *file)
3088 {
3089 	struct socket *sock;
3090 
3091 	sock = sock_from_file(file);
3092 	if (sock) {
3093 		sock_update_netprioidx(&sock->sk->sk_cgrp_data);
3094 		sock_update_classid(&sock->sk->sk_cgrp_data);
3095 	}
3096 }
3097 
3098 ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
3099 {
3100 	ssize_t res;
3101 	struct msghdr msg = {.msg_flags = flags};
3102 	struct kvec iov;
3103 	char *kaddr = kmap(page);
3104 	iov.iov_base = kaddr + offset;
3105 	iov.iov_len = size;
3106 	res = kernel_sendmsg(sock, &msg, &iov, 1, size);
3107 	kunmap(page);
3108 	return res;
3109 }
3110 EXPORT_SYMBOL(sock_no_sendpage);
3111 
3112 ssize_t sock_no_sendpage_locked(struct sock *sk, struct page *page,
3113 				int offset, size_t size, int flags)
3114 {
3115 	ssize_t res;
3116 	struct msghdr msg = {.msg_flags = flags};
3117 	struct kvec iov;
3118 	char *kaddr = kmap(page);
3119 
3120 	iov.iov_base = kaddr + offset;
3121 	iov.iov_len = size;
3122 	res = kernel_sendmsg_locked(sk, &msg, &iov, 1, size);
3123 	kunmap(page);
3124 	return res;
3125 }
3126 EXPORT_SYMBOL(sock_no_sendpage_locked);
3127 
3128 /*
3129  *	Default Socket Callbacks
3130  */
3131 
3132 static void sock_def_wakeup(struct sock *sk)
3133 {
3134 	struct socket_wq *wq;
3135 
3136 	rcu_read_lock();
3137 	wq = rcu_dereference(sk->sk_wq);
3138 	if (skwq_has_sleeper(wq))
3139 		wake_up_interruptible_all(&wq->wait);
3140 	rcu_read_unlock();
3141 }
3142 
3143 static void sock_def_error_report(struct sock *sk)
3144 {
3145 	struct socket_wq *wq;
3146 
3147 	rcu_read_lock();
3148 	wq = rcu_dereference(sk->sk_wq);
3149 	if (skwq_has_sleeper(wq))
3150 		wake_up_interruptible_poll(&wq->wait, EPOLLERR);
3151 	sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
3152 	rcu_read_unlock();
3153 }
3154 
3155 void sock_def_readable(struct sock *sk)
3156 {
3157 	struct socket_wq *wq;
3158 
3159 	rcu_read_lock();
3160 	wq = rcu_dereference(sk->sk_wq);
3161 	if (skwq_has_sleeper(wq))
3162 		wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
3163 						EPOLLRDNORM | EPOLLRDBAND);
3164 	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
3165 	rcu_read_unlock();
3166 }
3167 
3168 static void sock_def_write_space(struct sock *sk)
3169 {
3170 	struct socket_wq *wq;
3171 
3172 	rcu_read_lock();
3173 
3174 	/* Do not wake up a writer until he can make "significant"
3175 	 * progress.  --DaveM
3176 	 */
3177 	if ((refcount_read(&sk->sk_wmem_alloc) << 1) <= READ_ONCE(sk->sk_sndbuf)) {
3178 		wq = rcu_dereference(sk->sk_wq);
3179 		if (skwq_has_sleeper(wq))
3180 			wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
3181 						EPOLLWRNORM | EPOLLWRBAND);
3182 
3183 		/* Should agree with poll, otherwise some programs break */
3184 		if (sock_writeable(sk))
3185 			sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
3186 	}
3187 
3188 	rcu_read_unlock();
3189 }
3190 
/* Default sk_destruct callback: intentionally does nothing. */
static void sock_def_destruct(struct sock *sk)
{
}
3194 
3195 void sk_send_sigurg(struct sock *sk)
3196 {
3197 	if (sk->sk_socket && sk->sk_socket->file)
3198 		if (send_sigurg(&sk->sk_socket->file->f_owner))
3199 			sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
3200 }
3201 EXPORT_SYMBOL(sk_send_sigurg);
3202 
/* (Re)arm @timer to fire at @expires.  A socket reference is taken when
 * mod_timer() returns zero.
 */
void sk_reset_timer(struct sock *sk, struct timer_list* timer,
		    unsigned long expires)
{
	if (mod_timer(timer, expires))
		return;

	sock_hold(sk);
}
3209 EXPORT_SYMBOL(sk_reset_timer);
3210 
/* Cancel @timer with del_timer(); when that returns non-zero, release
 * a socket reference with __sock_put().
 */
void sk_stop_timer(struct sock *sk, struct timer_list* timer)
{
	if (!del_timer(timer))
		return;

	__sock_put(sk);
}
3216 EXPORT_SYMBOL(sk_stop_timer);
3217 
/* Same as sk_stop_timer(), but cancels with del_timer_sync(). */
void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer)
{
	if (!del_timer_sync(timer))
		return;

	__sock_put(sk);
}
3223 EXPORT_SYMBOL(sk_stop_timer_sync);
3224 
3225 void sock_init_data(struct socket *sock, struct sock *sk)
3226 {
3227 	sk_init_common(sk);
3228 	sk->sk_send_head	=	NULL;
3229 
3230 	timer_setup(&sk->sk_timer, NULL, 0);
3231 
3232 	sk->sk_allocation	=	GFP_KERNEL;
3233 	sk->sk_rcvbuf		=	sysctl_rmem_default;
3234 	sk->sk_sndbuf		=	sysctl_wmem_default;
3235 	sk->sk_state		=	TCP_CLOSE;
3236 	sk_set_socket(sk, sock);
3237 
3238 	sock_set_flag(sk, SOCK_ZAPPED);
3239 
3240 	if (sock) {
3241 		sk->sk_type	=	sock->type;
3242 		RCU_INIT_POINTER(sk->sk_wq, &sock->wq);
3243 		sock->sk	=	sk;
3244 		sk->sk_uid	=	SOCK_INODE(sock)->i_uid;
3245 	} else {
3246 		RCU_INIT_POINTER(sk->sk_wq, NULL);
3247 		sk->sk_uid	=	make_kuid(sock_net(sk)->user_ns, 0);
3248 	}
3249 
3250 	rwlock_init(&sk->sk_callback_lock);
3251 	if (sk->sk_kern_sock)
3252 		lockdep_set_class_and_name(
3253 			&sk->sk_callback_lock,
3254 			af_kern_callback_keys + sk->sk_family,
3255 			af_family_kern_clock_key_strings[sk->sk_family]);
3256 	else
3257 		lockdep_set_class_and_name(
3258 			&sk->sk_callback_lock,
3259 			af_callback_keys + sk->sk_family,
3260 			af_family_clock_key_strings[sk->sk_family]);
3261 
3262 	sk->sk_state_change	=	sock_def_wakeup;
3263 	sk->sk_data_ready	=	sock_def_readable;
3264 	sk->sk_write_space	=	sock_def_write_space;
3265 	sk->sk_error_report	=	sock_def_error_report;
3266 	sk->sk_destruct		=	sock_def_destruct;
3267 
3268 	sk->sk_frag.page	=	NULL;
3269 	sk->sk_frag.offset	=	0;
3270 	sk->sk_peek_off		=	-1;
3271 
3272 	sk->sk_peer_pid 	=	NULL;
3273 	sk->sk_peer_cred	=	NULL;
3274 	spin_lock_init(&sk->sk_peer_lock);
3275 
3276 	sk->sk_write_pending	=	0;
3277 	sk->sk_rcvlowat		=	1;
3278 	sk->sk_rcvtimeo		=	MAX_SCHEDULE_TIMEOUT;
3279 	sk->sk_sndtimeo		=	MAX_SCHEDULE_TIMEOUT;
3280 
3281 	sk->sk_stamp = SK_DEFAULT_STAMP;
3282 #if BITS_PER_LONG==32
3283 	seqlock_init(&sk->sk_stamp_seq);
3284 #endif
3285 	atomic_set(&sk->sk_zckey, 0);
3286 
3287 #ifdef CONFIG_NET_RX_BUSY_POLL
3288 	sk->sk_napi_id		=	0;
3289 	sk->sk_ll_usec		=	sysctl_net_busy_read;
3290 #endif
3291 
3292 	sk->sk_max_pacing_rate = ~0UL;
3293 	sk->sk_pacing_rate = ~0UL;
3294 	WRITE_ONCE(sk->sk_pacing_shift, 10);
3295 	sk->sk_incoming_cpu = -1;
3296 	sk->sk_txrehash = SOCK_TXREHASH_DEFAULT;
3297 
3298 	sk_rx_queue_clear(sk);
3299 	/*
3300 	 * Before updating sk_refcnt, we must commit prior changes to memory
3301 	 * (Documentation/RCU/rculist_nulls.rst for details)
3302 	 */
3303 	smp_wmb();
3304 	refcount_set(&sk->sk_refcnt, 1);
3305 	atomic_set(&sk->sk_drops, 0);
3306 }
3307 EXPORT_SYMBOL(sock_init_data);
3308 
3309 void lock_sock_nested(struct sock *sk, int subclass)
3310 {
3311 	/* The sk_lock has mutex_lock() semantics here. */
3312 	mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
3313 
3314 	might_sleep();
3315 	spin_lock_bh(&sk->sk_lock.slock);
3316 	if (sock_owned_by_user_nocheck(sk))
3317 		__lock_sock(sk);
3318 	sk->sk_lock.owned = 1;
3319 	spin_unlock_bh(&sk->sk_lock.slock);
3320 }
3321 EXPORT_SYMBOL(lock_sock_nested);
3322 
3323 void release_sock(struct sock *sk)
3324 {
3325 	spin_lock_bh(&sk->sk_lock.slock);
3326 	if (sk->sk_backlog.tail)
3327 		__release_sock(sk);
3328 
3329 	/* Warning : release_cb() might need to release sk ownership,
3330 	 * ie call sock_release_ownership(sk) before us.
3331 	 */
3332 	if (sk->sk_prot->release_cb)
3333 		sk->sk_prot->release_cb(sk);
3334 
3335 	sock_release_ownership(sk);
3336 	if (waitqueue_active(&sk->sk_lock.wq))
3337 		wake_up(&sk->sk_lock.wq);
3338 	spin_unlock_bh(&sk->sk_lock.slock);
3339 }
3340 EXPORT_SYMBOL(release_sock);
3341 
3342 bool __lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock)
3343 {
3344 	might_sleep();
3345 	spin_lock_bh(&sk->sk_lock.slock);
3346 
3347 	if (!sock_owned_by_user_nocheck(sk)) {
3348 		/*
3349 		 * Fast path return with bottom halves disabled and
3350 		 * sock::sk_lock.slock held.
3351 		 *
3352 		 * The 'mutex' is not contended and holding
3353 		 * sock::sk_lock.slock prevents all other lockers to
3354 		 * proceed so the corresponding unlock_sock_fast() can
3355 		 * avoid the slow path of release_sock() completely and
3356 		 * just release slock.
3357 		 *
3358 		 * From a semantical POV this is equivalent to 'acquiring'
3359 		 * the 'mutex', hence the corresponding lockdep
3360 		 * mutex_release() has to happen in the fast path of
3361 		 * unlock_sock_fast().
3362 		 */
3363 		return false;
3364 	}
3365 
3366 	__lock_sock(sk);
3367 	sk->sk_lock.owned = 1;
3368 	__acquire(&sk->sk_lock.slock);
3369 	spin_unlock_bh(&sk->sk_lock.slock);
3370 	return true;
3371 }
3372 EXPORT_SYMBOL(__lock_sock_fast);
3373 
3374 int sock_gettstamp(struct socket *sock, void __user *userstamp,
3375 		   bool timeval, bool time32)
3376 {
3377 	struct sock *sk = sock->sk;
3378 	struct timespec64 ts;
3379 
3380 	sock_enable_timestamp(sk, SOCK_TIMESTAMP);
3381 	ts = ktime_to_timespec64(sock_read_timestamp(sk));
3382 	if (ts.tv_sec == -1)
3383 		return -ENOENT;
3384 	if (ts.tv_sec == 0) {
3385 		ktime_t kt = ktime_get_real();
3386 		sock_write_timestamp(sk, kt);
3387 		ts = ktime_to_timespec64(kt);
3388 	}
3389 
3390 	if (timeval)
3391 		ts.tv_nsec /= 1000;
3392 
3393 #ifdef CONFIG_COMPAT_32BIT_TIME
3394 	if (time32)
3395 		return put_old_timespec32(&ts, userstamp);
3396 #endif
3397 #ifdef CONFIG_SPARC64
3398 	/* beware of padding in sparc64 timeval */
3399 	if (timeval && !in_compat_syscall()) {
3400 		struct __kernel_old_timeval __user tv = {
3401 			.tv_sec = ts.tv_sec,
3402 			.tv_usec = ts.tv_nsec,
3403 		};
3404 		if (copy_to_user(userstamp, &tv, sizeof(tv)))
3405 			return -EFAULT;
3406 		return 0;
3407 	}
3408 #endif
3409 	return put_timespec64(&ts, userstamp);
3410 }
3411 EXPORT_SYMBOL(sock_gettstamp);
3412 
3413 void sock_enable_timestamp(struct sock *sk, enum sock_flags flag)
3414 {
3415 	if (!sock_flag(sk, flag)) {
3416 		unsigned long previous_flags = sk->sk_flags;
3417 
3418 		sock_set_flag(sk, flag);
3419 		/*
3420 		 * we just set one of the two flags which require net
3421 		 * time stamping, but time stamping might have been on
3422 		 * already because of the other one
3423 		 */
3424 		if (sock_needs_netstamp(sk) &&
3425 		    !(previous_flags & SK_FLAGS_TIMESTAMP))
3426 			net_enable_timestamp();
3427 	}
3428 }
3429 
3430 int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
3431 		       int level, int type)
3432 {
3433 	struct sock_exterr_skb *serr;
3434 	struct sk_buff *skb;
3435 	int copied, err;
3436 
3437 	err = -EAGAIN;
3438 	skb = sock_dequeue_err_skb(sk);
3439 	if (skb == NULL)
3440 		goto out;
3441 
3442 	copied = skb->len;
3443 	if (copied > len) {
3444 		msg->msg_flags |= MSG_TRUNC;
3445 		copied = len;
3446 	}
3447 	err = skb_copy_datagram_msg(skb, 0, msg, copied);
3448 	if (err)
3449 		goto out_free_skb;
3450 
3451 	sock_recv_timestamp(msg, sk, skb);
3452 
3453 	serr = SKB_EXT_ERR(skb);
3454 	put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
3455 
3456 	msg->msg_flags |= MSG_ERRQUEUE;
3457 	err = copied;
3458 
3459 out_free_skb:
3460 	kfree_skb(skb);
3461 out:
3462 	return err;
3463 }
3464 EXPORT_SYMBOL(sock_recv_errqueue);
3465 
3466 /*
3467  *	Get a socket option on an socket.
3468  *
3469  *	FIX: POSIX 1003.1g is very ambiguous here. It states that
3470  *	asynchronous errors should be reported by getsockopt. We assume
3471  *	this means if you specify SO_ERROR (otherwise whats the point of it).
3472  */
3473 int sock_common_getsockopt(struct socket *sock, int level, int optname,
3474 			   char __user *optval, int __user *optlen)
3475 {
3476 	struct sock *sk = sock->sk;
3477 
3478 	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
3479 }
3480 EXPORT_SYMBOL(sock_common_getsockopt);
3481 
3482 int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
3483 			int flags)
3484 {
3485 	struct sock *sk = sock->sk;
3486 	int addr_len = 0;
3487 	int err;
3488 
3489 	err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
3490 				   flags & ~MSG_DONTWAIT, &addr_len);
3491 	if (err >= 0)
3492 		msg->msg_namelen = addr_len;
3493 	return err;
3494 }
3495 EXPORT_SYMBOL(sock_common_recvmsg);
3496 
3497 /*
3498  *	Set socket options on an inet socket.
3499  */
3500 int sock_common_setsockopt(struct socket *sock, int level, int optname,
3501 			   sockptr_t optval, unsigned int optlen)
3502 {
3503 	struct sock *sk = sock->sk;
3504 
3505 	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
3506 }
3507 EXPORT_SYMBOL(sock_common_setsockopt);
3508 
3509 void sk_common_release(struct sock *sk)
3510 {
3511 	if (sk->sk_prot->destroy)
3512 		sk->sk_prot->destroy(sk);
3513 
3514 	/*
3515 	 * Observation: when sk_common_release is called, processes have
3516 	 * no access to socket. But net still has.
3517 	 * Step one, detach it from networking:
3518 	 *
3519 	 * A. Remove from hash tables.
3520 	 */
3521 
3522 	sk->sk_prot->unhash(sk);
3523 
3524 	/*
3525 	 * In this point socket cannot receive new packets, but it is possible
3526 	 * that some packets are in flight because some CPU runs receiver and
3527 	 * did hash table lookup before we unhashed socket. They will achieve
3528 	 * receive queue and will be purged by socket destructor.
3529 	 *
3530 	 * Also we still have packets pending on receive queue and probably,
3531 	 * our own packets waiting in device queues. sock_destroy will drain
3532 	 * receive queue, but transmitted packets will delay socket destruction
3533 	 * until the last reference will be released.
3534 	 */
3535 
3536 	sock_orphan(sk);
3537 
3538 	xfrm_sk_free_policy(sk);
3539 
3540 	sk_refcnt_debug_release(sk);
3541 
3542 	sock_put(sk);
3543 }
3544 EXPORT_SYMBOL(sk_common_release);
3545 
3546 void sk_get_meminfo(const struct sock *sk, u32 *mem)
3547 {
3548 	memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);
3549 
3550 	mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
3551 	mem[SK_MEMINFO_RCVBUF] = READ_ONCE(sk->sk_rcvbuf);
3552 	mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
3553 	mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf);
3554 	mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc;
3555 	mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued);
3556 	mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
3557 	mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len);
3558 	mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
3559 }
3560 
3561 #ifdef CONFIG_PROC_FS
3562 static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
3563 
3564 int sock_prot_inuse_get(struct net *net, struct proto *prot)
3565 {
3566 	int cpu, idx = prot->inuse_idx;
3567 	int res = 0;
3568 
3569 	for_each_possible_cpu(cpu)
3570 		res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx];
3571 
3572 	return res >= 0 ? res : 0;
3573 }
3574 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
3575 
3576 int sock_inuse_get(struct net *net)
3577 {
3578 	int cpu, res = 0;
3579 
3580 	for_each_possible_cpu(cpu)
3581 		res += per_cpu_ptr(net->core.prot_inuse, cpu)->all;
3582 
3583 	return res;
3584 }
3585 
3586 EXPORT_SYMBOL_GPL(sock_inuse_get);
3587 
3588 static int __net_init sock_inuse_init_net(struct net *net)
3589 {
3590 	net->core.prot_inuse = alloc_percpu(struct prot_inuse);
3591 	if (net->core.prot_inuse == NULL)
3592 		return -ENOMEM;
3593 	return 0;
3594 }
3595 
3596 static void __net_exit sock_inuse_exit_net(struct net *net)
3597 {
3598 	free_percpu(net->core.prot_inuse);
3599 }
3600 
3601 static struct pernet_operations net_inuse_ops = {
3602 	.init = sock_inuse_init_net,
3603 	.exit = sock_inuse_exit_net,
3604 };
3605 
3606 static __init int net_inuse_init(void)
3607 {
3608 	if (register_pernet_subsys(&net_inuse_ops))
3609 		panic("Cannot initialize net inuse counters");
3610 
3611 	return 0;
3612 }
3613 
3614 core_initcall(net_inuse_init);
3615 
3616 static int assign_proto_idx(struct proto *prot)
3617 {
3618 	prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
3619 
3620 	if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
3621 		pr_err("PROTO_INUSE_NR exhausted\n");
3622 		return -ENOSPC;
3623 	}
3624 
3625 	set_bit(prot->inuse_idx, proto_inuse_idx);
3626 	return 0;
3627 }
3628 
3629 static void release_proto_idx(struct proto *prot)
3630 {
3631 	if (prot->inuse_idx != PROTO_INUSE_NR - 1)
3632 		clear_bit(prot->inuse_idx, proto_inuse_idx);
3633 }
3634 #else
/* Without CONFIG_PROC_FS there is no in-use index to manage. */
static inline int assign_proto_idx(struct proto *prot)
{
	/* Nothing to assign: always succeeds. */
	return 0;
}

static inline void release_proto_idx(struct proto *prot)
{
	/* Nothing was assigned, so nothing to release. */
}
3643 
3644 #endif
3645 
3646 static void tw_prot_cleanup(struct timewait_sock_ops *twsk_prot)
3647 {
3648 	if (!twsk_prot)
3649 		return;
3650 	kfree(twsk_prot->twsk_slab_name);
3651 	twsk_prot->twsk_slab_name = NULL;
3652 	kmem_cache_destroy(twsk_prot->twsk_slab);
3653 	twsk_prot->twsk_slab = NULL;
3654 }
3655 
3656 static int tw_prot_init(const struct proto *prot)
3657 {
3658 	struct timewait_sock_ops *twsk_prot = prot->twsk_prot;
3659 
3660 	if (!twsk_prot)
3661 		return 0;
3662 
3663 	twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s",
3664 					      prot->name);
3665 	if (!twsk_prot->twsk_slab_name)
3666 		return -ENOMEM;
3667 
3668 	twsk_prot->twsk_slab =
3669 		kmem_cache_create(twsk_prot->twsk_slab_name,
3670 				  twsk_prot->twsk_obj_size, 0,
3671 				  SLAB_ACCOUNT | prot->slab_flags,
3672 				  NULL);
3673 	if (!twsk_prot->twsk_slab) {
3674 		pr_crit("%s: Can't create timewait sock SLAB cache!\n",
3675 			prot->name);
3676 		return -ENOMEM;
3677 	}
3678 
3679 	return 0;
3680 }
3681 
3682 static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
3683 {
3684 	if (!rsk_prot)
3685 		return;
3686 	kfree(rsk_prot->slab_name);
3687 	rsk_prot->slab_name = NULL;
3688 	kmem_cache_destroy(rsk_prot->slab);
3689 	rsk_prot->slab = NULL;
3690 }
3691 
3692 static int req_prot_init(const struct proto *prot)
3693 {
3694 	struct request_sock_ops *rsk_prot = prot->rsk_prot;
3695 
3696 	if (!rsk_prot)
3697 		return 0;
3698 
3699 	rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
3700 					prot->name);
3701 	if (!rsk_prot->slab_name)
3702 		return -ENOMEM;
3703 
3704 	rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
3705 					   rsk_prot->obj_size, 0,
3706 					   SLAB_ACCOUNT | prot->slab_flags,
3707 					   NULL);
3708 
3709 	if (!rsk_prot->slab) {
3710 		pr_crit("%s: Can't create request sock SLAB cache!\n",
3711 			prot->name);
3712 		return -ENOMEM;
3713 	}
3714 	return 0;
3715 }
3716 
3717 int proto_register(struct proto *prot, int alloc_slab)
3718 {
3719 	int ret = -ENOBUFS;
3720 
3721 	if (prot->memory_allocated && !prot->sysctl_mem) {
3722 		pr_err("%s: missing sysctl_mem\n", prot->name);
3723 		return -EINVAL;
3724 	}
3725 	if (alloc_slab) {
3726 		prot->slab = kmem_cache_create_usercopy(prot->name,
3727 					prot->obj_size, 0,
3728 					SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT |
3729 					prot->slab_flags,
3730 					prot->useroffset, prot->usersize,
3731 					NULL);
3732 
3733 		if (prot->slab == NULL) {
3734 			pr_crit("%s: Can't create sock SLAB cache!\n",
3735 				prot->name);
3736 			goto out;
3737 		}
3738 
3739 		if (req_prot_init(prot))
3740 			goto out_free_request_sock_slab;
3741 
3742 		if (tw_prot_init(prot))
3743 			goto out_free_timewait_sock_slab;
3744 	}
3745 
3746 	mutex_lock(&proto_list_mutex);
3747 	ret = assign_proto_idx(prot);
3748 	if (ret) {
3749 		mutex_unlock(&proto_list_mutex);
3750 		goto out_free_timewait_sock_slab;
3751 	}
3752 	list_add(&prot->node, &proto_list);
3753 	mutex_unlock(&proto_list_mutex);
3754 	return ret;
3755 
3756 out_free_timewait_sock_slab:
3757 	if (alloc_slab)
3758 		tw_prot_cleanup(prot->twsk_prot);
3759 out_free_request_sock_slab:
3760 	if (alloc_slab) {
3761 		req_prot_cleanup(prot->rsk_prot);
3762 
3763 		kmem_cache_destroy(prot->slab);
3764 		prot->slab = NULL;
3765 	}
3766 out:
3767 	return ret;
3768 }
3769 EXPORT_SYMBOL(proto_register);
3770 
3771 void proto_unregister(struct proto *prot)
3772 {
3773 	mutex_lock(&proto_list_mutex);
3774 	release_proto_idx(prot);
3775 	list_del(&prot->node);
3776 	mutex_unlock(&proto_list_mutex);
3777 
3778 	kmem_cache_destroy(prot->slab);
3779 	prot->slab = NULL;
3780 
3781 	req_prot_cleanup(prot->rsk_prot);
3782 	tw_prot_cleanup(prot->twsk_prot);
3783 }
3784 EXPORT_SYMBOL(proto_unregister);
3785 
3786 int sock_load_diag_module(int family, int protocol)
3787 {
3788 	if (!protocol) {
3789 		if (!sock_is_registered(family))
3790 			return -ENOENT;
3791 
3792 		return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
3793 				      NETLINK_SOCK_DIAG, family);
3794 	}
3795 
3796 #ifdef CONFIG_INET
3797 	if (family == AF_INET &&
3798 	    protocol != IPPROTO_RAW &&
3799 	    protocol < MAX_INET_PROTOS &&
3800 	    !rcu_access_pointer(inet_protos[protocol]))
3801 		return -ENOENT;
3802 #endif
3803 
3804 	return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK,
3805 			      NETLINK_SOCK_DIAG, family, protocol);
3806 }
3807 EXPORT_SYMBOL(sock_load_diag_module);
3808 
3809 #ifdef CONFIG_PROC_FS
3810 static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
3811 	__acquires(proto_list_mutex)
3812 {
3813 	mutex_lock(&proto_list_mutex);
3814 	return seq_list_start_head(&proto_list, *pos);
3815 }
3816 
3817 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3818 {
3819 	return seq_list_next(v, &proto_list, pos);
3820 }
3821 
3822 static void proto_seq_stop(struct seq_file *seq, void *v)
3823 	__releases(proto_list_mutex)
3824 {
3825 	mutex_unlock(&proto_list_mutex);
3826 }
3827 
/* Map a method pointer to 'y' (non-NULL) or 'n' (NULL). */
static char proto_method_implemented(const void *method)
{
	if (method)
		return 'y';
	return 'n';
}
3832 static long sock_prot_memory_allocated(struct proto *proto)
3833 {
3834 	return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
3835 }
3836 
3837 static const char *sock_prot_memory_pressure(struct proto *proto)
3838 {
3839 	return proto->memory_pressure != NULL ?
3840 	proto_memory_pressure(proto) ? "yes" : "no" : "NI";
3841 }
3842 
3843 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
3844 {
3845 
3846 	seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
3847 			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
3848 		   proto->name,
3849 		   proto->obj_size,
3850 		   sock_prot_inuse_get(seq_file_net(seq), proto),
3851 		   sock_prot_memory_allocated(proto),
3852 		   sock_prot_memory_pressure(proto),
3853 		   proto->max_header,
3854 		   proto->slab == NULL ? "no" : "yes",
3855 		   module_name(proto->owner),
3856 		   proto_method_implemented(proto->close),
3857 		   proto_method_implemented(proto->connect),
3858 		   proto_method_implemented(proto->disconnect),
3859 		   proto_method_implemented(proto->accept),
3860 		   proto_method_implemented(proto->ioctl),
3861 		   proto_method_implemented(proto->init),
3862 		   proto_method_implemented(proto->destroy),
3863 		   proto_method_implemented(proto->shutdown),
3864 		   proto_method_implemented(proto->setsockopt),
3865 		   proto_method_implemented(proto->getsockopt),
3866 		   proto_method_implemented(proto->sendmsg),
3867 		   proto_method_implemented(proto->recvmsg),
3868 		   proto_method_implemented(proto->sendpage),
3869 		   proto_method_implemented(proto->bind),
3870 		   proto_method_implemented(proto->backlog_rcv),
3871 		   proto_method_implemented(proto->hash),
3872 		   proto_method_implemented(proto->unhash),
3873 		   proto_method_implemented(proto->get_port),
3874 		   proto_method_implemented(proto->enter_memory_pressure));
3875 }
3876 
3877 static int proto_seq_show(struct seq_file *seq, void *v)
3878 {
3879 	if (v == &proto_list)
3880 		seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
3881 			   "protocol",
3882 			   "size",
3883 			   "sockets",
3884 			   "memory",
3885 			   "press",
3886 			   "maxhdr",
3887 			   "slab",
3888 			   "module",
3889 			   "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
3890 	else
3891 		proto_seq_printf(seq, list_entry(v, struct proto, node));
3892 	return 0;
3893 }
3894 
3895 static const struct seq_operations proto_seq_ops = {
3896 	.start  = proto_seq_start,
3897 	.next   = proto_seq_next,
3898 	.stop   = proto_seq_stop,
3899 	.show   = proto_seq_show,
3900 };
3901 
3902 static __net_init int proto_init_net(struct net *net)
3903 {
3904 	if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops,
3905 			sizeof(struct seq_net_private)))
3906 		return -ENOMEM;
3907 
3908 	return 0;
3909 }
3910 
3911 static __net_exit void proto_exit_net(struct net *net)
3912 {
3913 	remove_proc_entry("protocols", net->proc_net);
3914 }
3915 
3916 
3917 static __net_initdata struct pernet_operations proto_net_ops = {
3918 	.init = proto_init_net,
3919 	.exit = proto_exit_net,
3920 };
3921 
3922 static int __init proto_init(void)
3923 {
3924 	return register_pernet_subsys(&proto_net_ops);
3925 }
3926 
3927 subsys_initcall(proto_init);
3928 
3929 #endif /* PROC_FS */
3930 
3931 #ifdef CONFIG_NET_RX_BUSY_POLL
3932 bool sk_busy_loop_end(void *p, unsigned long start_time)
3933 {
3934 	struct sock *sk = p;
3935 
3936 	return !skb_queue_empty_lockless(&sk->sk_receive_queue) ||
3937 	       sk_busy_loop_timeout(sk, start_time);
3938 }
3939 EXPORT_SYMBOL(sk_busy_loop_end);
3940 #endif /* CONFIG_NET_RX_BUSY_POLL */
3941 
3942 int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len)
3943 {
3944 	if (!sk->sk_prot->bind_add)
3945 		return -EOPNOTSUPP;
3946 	return sk->sk_prot->bind_add(sk, addr, addr_len);
3947 }
3948 EXPORT_SYMBOL(sock_bind_add);
3949