xref: /openbmc/linux/net/core/sock.c (revision 911b8eac)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * INET		An implementation of the TCP/IP protocol suite for the LINUX
4  *		operating system.  INET is implemented using the  BSD Socket
5  *		interface as the means of communication with the user level.
6  *
7  *		Generic socket support routines. Memory allocators, socket lock/release
8  *		handler for protocols to use and generic option handler.
9  *
10  * Authors:	Ross Biro
11  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *		Florian La Roche, <flla@stud.uni-sb.de>
13  *		Alan Cox, <A.Cox@swansea.ac.uk>
14  *
15  * Fixes:
16  *		Alan Cox	: 	Numerous verify_area() problems
17  *		Alan Cox	:	Connecting on a connecting socket
18  *					now returns an error for tcp.
19  *		Alan Cox	:	sock->protocol is set correctly.
20  *					and is not sometimes left as 0.
21  *		Alan Cox	:	connect handles icmp errors on a
22  *					connect properly. Unfortunately there
23  *					is a restart syscall nasty there. I
24  *					can't match BSD without hacking the C
25  *					library. Ideas urgently sought!
26  *		Alan Cox	:	Disallow bind() to addresses that are
27  *					not ours - especially broadcast ones!!
28  *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
29  *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
30  *					instead they leave that for the DESTROY timer.
31  *		Alan Cox	:	Clean up error flag in accept
32  *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
33  *					was buggy. Put a remove_sock() in the handler
34  *					for memory when we hit 0. Also altered the timer
35  *					code. The ACK stuff can wait and needs major
36  *					TCP layer surgery.
37  *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
38  *					and fixed timer/inet_bh race.
39  *		Alan Cox	:	Added zapped flag for TCP
40  *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
41  *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
42  *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
43  *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
44  *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
45  *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
46  *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
47  *	Pauline Middelink	:	identd support
48  *		Alan Cox	:	Fixed connect() taking signals I think.
49  *		Alan Cox	:	SO_LINGER supported
50  *		Alan Cox	:	Error reporting fixes
51  *		Anonymous	:	inet_create tidied up (sk->reuse setting)
52  *		Alan Cox	:	inet sockets don't set sk->type!
53  *		Alan Cox	:	Split socket option code
54  *		Alan Cox	:	Callbacks
55  *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
56  *		Alex		:	Removed restriction on inet fioctl
57  *		Alan Cox	:	Splitting INET from NET core
58  *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
59  *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
60  *		Alan Cox	:	Split IP from generic code
61  *		Alan Cox	:	New kfree_skbmem()
62  *		Alan Cox	:	Make SO_DEBUG superuser only.
63  *		Alan Cox	:	Allow anyone to clear SO_DEBUG
64  *					(compatibility fix)
65  *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
66  *		Alan Cox	:	Allocator for a socket is settable.
67  *		Alan Cox	:	SO_ERROR includes soft errors.
68  *		Alan Cox	:	Allow NULL arguments on some SO_ opts
69  *		Alan Cox	: 	Generic socket allocation to make hooks
70  *					easier (suggested by Craig Metz).
71  *		Michael Pall	:	SO_ERROR returns positive errno again
72  *              Steve Whitehouse:       Added default destructor to free
73  *                                      protocol private data.
74  *              Steve Whitehouse:       Added various other default routines
75  *                                      common to several socket families.
76  *              Chris Evans     :       Call suser() check last on F_SETOWN
77  *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
78  *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
79  *		Andi Kleen	:	Fix write_space callback
80  *		Chris Evans	:	Security fixes - signedness again
81  *		Arnaldo C. Melo :       cleanups, use skb_queue_purge
82  *
83  * To Fix:
84  */
85 
86 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
87 
88 #include <asm/unaligned.h>
89 #include <linux/capability.h>
90 #include <linux/errno.h>
91 #include <linux/errqueue.h>
92 #include <linux/types.h>
93 #include <linux/socket.h>
94 #include <linux/in.h>
95 #include <linux/kernel.h>
96 #include <linux/module.h>
97 #include <linux/proc_fs.h>
98 #include <linux/seq_file.h>
99 #include <linux/sched.h>
100 #include <linux/sched/mm.h>
101 #include <linux/timer.h>
102 #include <linux/string.h>
103 #include <linux/sockios.h>
104 #include <linux/net.h>
105 #include <linux/mm.h>
106 #include <linux/slab.h>
107 #include <linux/interrupt.h>
108 #include <linux/poll.h>
109 #include <linux/tcp.h>
110 #include <linux/init.h>
111 #include <linux/highmem.h>
112 #include <linux/user_namespace.h>
113 #include <linux/static_key.h>
114 #include <linux/memcontrol.h>
115 #include <linux/prefetch.h>
116 #include <linux/compat.h>
117 
118 #include <linux/uaccess.h>
119 
120 #include <linux/netdevice.h>
121 #include <net/protocol.h>
122 #include <linux/skbuff.h>
123 #include <net/net_namespace.h>
124 #include <net/request_sock.h>
125 #include <net/sock.h>
126 #include <linux/net_tstamp.h>
127 #include <net/xfrm.h>
128 #include <linux/ipsec.h>
129 #include <net/cls_cgroup.h>
130 #include <net/netprio_cgroup.h>
131 #include <linux/sock_diag.h>
132 
133 #include <linux/filter.h>
134 #include <net/sock_reuseport.h>
135 #include <net/bpf_sk_storage.h>
136 
137 #include <trace/events/sock.h>
138 
139 #include <net/tcp.h>
140 #include <net/busy_poll.h>
141 
142 static DEFINE_MUTEX(proto_list_mutex);
143 static LIST_HEAD(proto_list);
144 
145 static void sock_inuse_add(struct net *net, int val);
146 
147 /**
148  * sk_ns_capable - General socket capability test
149  * @sk: Socket to use a capability on or through
150  * @user_ns: The user namespace of the capability to use
151  * @cap: The capability to use
152  *
153  * Test to see if the opener of the socket had when the socket was
154  * created and the current process has the capability @cap in the user
155  * namespace @user_ns.
156  */
157 bool sk_ns_capable(const struct sock *sk,
158 		   struct user_namespace *user_ns, int cap)
159 {
160 	return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
161 		ns_capable(user_ns, cap);
162 }
163 EXPORT_SYMBOL(sk_ns_capable);
164 
165 /**
166  * sk_capable - Socket global capability test
167  * @sk: Socket to use a capability on or through
168  * @cap: The global capability to use
169  *
170  * Test to see if the opener of the socket had when the socket was
171  * created and the current process has the capability @cap in all user
172  * namespaces.
173  */
174 bool sk_capable(const struct sock *sk, int cap)
175 {
176 	return sk_ns_capable(sk, &init_user_ns, cap);
177 }
178 EXPORT_SYMBOL(sk_capable);
179 
180 /**
181  * sk_net_capable - Network namespace socket capability test
182  * @sk: Socket to use a capability on or through
183  * @cap: The capability to use
184  *
185  * Test to see if the opener of the socket had when the socket was created
186  * and the current process has the capability @cap over the network namespace
187  * the socket is a member of.
188  */
189 bool sk_net_capable(const struct sock *sk, int cap)
190 {
191 	return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
192 }
193 EXPORT_SYMBOL(sk_net_capable);
194 
195 /*
196  * Each address family might have different locking rules, so we have
197  * one slock key per address family and separate keys for internal and
198  * userspace sockets.
199  */
200 static struct lock_class_key af_family_keys[AF_MAX];
201 static struct lock_class_key af_family_kern_keys[AF_MAX];
202 static struct lock_class_key af_family_slock_keys[AF_MAX];
203 static struct lock_class_key af_family_kern_slock_keys[AF_MAX];
204 
205 /*
206  * Make lock validator output more readable. (we pre-construct these
207  * strings build-time, so that runtime initialization of socket
208  * locks is fast):
209  */
210 
211 #define _sock_locks(x)						  \
212   x "AF_UNSPEC",	x "AF_UNIX"     ,	x "AF_INET"     , \
213   x "AF_AX25"  ,	x "AF_IPX"      ,	x "AF_APPLETALK", \
214   x "AF_NETROM",	x "AF_BRIDGE"   ,	x "AF_ATMPVC"   , \
215   x "AF_X25"   ,	x "AF_INET6"    ,	x "AF_ROSE"     , \
216   x "AF_DECnet",	x "AF_NETBEUI"  ,	x "AF_SECURITY" , \
217   x "AF_KEY"   ,	x "AF_NETLINK"  ,	x "AF_PACKET"   , \
218   x "AF_ASH"   ,	x "AF_ECONET"   ,	x "AF_ATMSVC"   , \
219   x "AF_RDS"   ,	x "AF_SNA"      ,	x "AF_IRDA"     , \
220   x "AF_PPPOX" ,	x "AF_WANPIPE"  ,	x "AF_LLC"      , \
221   x "27"       ,	x "28"          ,	x "AF_CAN"      , \
222   x "AF_TIPC"  ,	x "AF_BLUETOOTH",	x "IUCV"        , \
223   x "AF_RXRPC" ,	x "AF_ISDN"     ,	x "AF_PHONET"   , \
224   x "AF_IEEE802154",	x "AF_CAIF"	,	x "AF_ALG"      , \
225   x "AF_NFC"   ,	x "AF_VSOCK"    ,	x "AF_KCM"      , \
226   x "AF_QIPCRTR",	x "AF_SMC"	,	x "AF_XDP"	, \
227   x "AF_MAX"
228 
229 static const char *const af_family_key_strings[AF_MAX+1] = {
230 	_sock_locks("sk_lock-")
231 };
232 static const char *const af_family_slock_key_strings[AF_MAX+1] = {
233 	_sock_locks("slock-")
234 };
235 static const char *const af_family_clock_key_strings[AF_MAX+1] = {
236 	_sock_locks("clock-")
237 };
238 
239 static const char *const af_family_kern_key_strings[AF_MAX+1] = {
240 	_sock_locks("k-sk_lock-")
241 };
242 static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
243 	_sock_locks("k-slock-")
244 };
245 static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
246 	_sock_locks("k-clock-")
247 };
248 static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
249 	_sock_locks("rlock-")
250 };
251 static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
252 	_sock_locks("wlock-")
253 };
254 static const char *const af_family_elock_key_strings[AF_MAX+1] = {
255 	_sock_locks("elock-")
256 };
257 
258 /*
259  * sk_callback_lock and sk queues locking rules are per-address-family,
260  * so split the lock classes by using a per-AF key:
261  */
262 static struct lock_class_key af_callback_keys[AF_MAX];
263 static struct lock_class_key af_rlock_keys[AF_MAX];
264 static struct lock_class_key af_wlock_keys[AF_MAX];
265 static struct lock_class_key af_elock_keys[AF_MAX];
266 static struct lock_class_key af_kern_callback_keys[AF_MAX];
267 
268 /* Run time adjustable parameters. */
269 __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
270 EXPORT_SYMBOL(sysctl_wmem_max);
271 __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
272 EXPORT_SYMBOL(sysctl_rmem_max);
273 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
274 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
275 
276 /* Maximal space eaten by iovec or ancillary data plus some space */
277 int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
278 EXPORT_SYMBOL(sysctl_optmem_max);
279 
280 int sysctl_tstamp_allow_data __read_mostly = 1;
281 
282 DEFINE_STATIC_KEY_FALSE(memalloc_socks_key);
283 EXPORT_SYMBOL_GPL(memalloc_socks_key);
284 
285 /**
286  * sk_set_memalloc - sets %SOCK_MEMALLOC
287  * @sk: socket to set it on
288  *
289  * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
290  * It's the responsibility of the admin to adjust min_free_kbytes
291  * to meet the requirements
292  */
293 void sk_set_memalloc(struct sock *sk)
294 {
295 	sock_set_flag(sk, SOCK_MEMALLOC);
296 	sk->sk_allocation |= __GFP_MEMALLOC;
297 	static_branch_inc(&memalloc_socks_key);
298 }
299 EXPORT_SYMBOL_GPL(sk_set_memalloc);
300 
301 void sk_clear_memalloc(struct sock *sk)
302 {
303 	sock_reset_flag(sk, SOCK_MEMALLOC);
304 	sk->sk_allocation &= ~__GFP_MEMALLOC;
305 	static_branch_dec(&memalloc_socks_key);
306 
307 	/*
308 	 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
309 	 * progress of swapping. SOCK_MEMALLOC may be cleared while
310 	 * it has rmem allocations due to the last swapfile being deactivated
311 	 * but there is a risk that the socket is unusable due to exceeding
312 	 * the rmem limits. Reclaim the reserves and obey rmem limits again.
313 	 */
314 	sk_mem_reclaim(sk);
315 }
316 EXPORT_SYMBOL_GPL(sk_clear_memalloc);
317 
318 int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
319 {
320 	int ret;
321 	unsigned int noreclaim_flag;
322 
323 	/* these should have been dropped before queueing */
324 	BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
325 
326 	noreclaim_flag = memalloc_noreclaim_save();
327 	ret = sk->sk_backlog_rcv(sk, skb);
328 	memalloc_noreclaim_restore(noreclaim_flag);
329 
330 	return ret;
331 }
332 EXPORT_SYMBOL(__sk_backlog_rcv);
333 
334 static int sock_get_timeout(long timeo, void *optval, bool old_timeval)
335 {
336 	struct __kernel_sock_timeval tv;
337 
338 	if (timeo == MAX_SCHEDULE_TIMEOUT) {
339 		tv.tv_sec = 0;
340 		tv.tv_usec = 0;
341 	} else {
342 		tv.tv_sec = timeo / HZ;
343 		tv.tv_usec = ((timeo % HZ) * USEC_PER_SEC) / HZ;
344 	}
345 
346 	if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
347 		struct old_timeval32 tv32 = { tv.tv_sec, tv.tv_usec };
348 		*(struct old_timeval32 *)optval = tv32;
349 		return sizeof(tv32);
350 	}
351 
352 	if (old_timeval) {
353 		struct __kernel_old_timeval old_tv;
354 		old_tv.tv_sec = tv.tv_sec;
355 		old_tv.tv_usec = tv.tv_usec;
356 		*(struct __kernel_old_timeval *)optval = old_tv;
357 		return sizeof(old_tv);
358 	}
359 
360 	*(struct __kernel_sock_timeval *)optval = tv;
361 	return sizeof(tv);
362 }
363 
364 static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen,
365 			    bool old_timeval)
366 {
367 	struct __kernel_sock_timeval tv;
368 
369 	if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
370 		struct old_timeval32 tv32;
371 
372 		if (optlen < sizeof(tv32))
373 			return -EINVAL;
374 
375 		if (copy_from_sockptr(&tv32, optval, sizeof(tv32)))
376 			return -EFAULT;
377 		tv.tv_sec = tv32.tv_sec;
378 		tv.tv_usec = tv32.tv_usec;
379 	} else if (old_timeval) {
380 		struct __kernel_old_timeval old_tv;
381 
382 		if (optlen < sizeof(old_tv))
383 			return -EINVAL;
384 		if (copy_from_sockptr(&old_tv, optval, sizeof(old_tv)))
385 			return -EFAULT;
386 		tv.tv_sec = old_tv.tv_sec;
387 		tv.tv_usec = old_tv.tv_usec;
388 	} else {
389 		if (optlen < sizeof(tv))
390 			return -EINVAL;
391 		if (copy_from_sockptr(&tv, optval, sizeof(tv)))
392 			return -EFAULT;
393 	}
394 	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
395 		return -EDOM;
396 
397 	if (tv.tv_sec < 0) {
398 		static int warned __read_mostly;
399 
400 		*timeo_p = 0;
401 		if (warned < 10 && net_ratelimit()) {
402 			warned++;
403 			pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
404 				__func__, current->comm, task_pid_nr(current));
405 		}
406 		return 0;
407 	}
408 	*timeo_p = MAX_SCHEDULE_TIMEOUT;
409 	if (tv.tv_sec == 0 && tv.tv_usec == 0)
410 		return 0;
411 	if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1))
412 		*timeo_p = tv.tv_sec * HZ + DIV_ROUND_UP((unsigned long)tv.tv_usec, USEC_PER_SEC / HZ);
413 	return 0;
414 }
415 
416 static bool sock_needs_netstamp(const struct sock *sk)
417 {
418 	switch (sk->sk_family) {
419 	case AF_UNSPEC:
420 	case AF_UNIX:
421 		return false;
422 	default:
423 		return true;
424 	}
425 }
426 
427 static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
428 {
429 	if (sk->sk_flags & flags) {
430 		sk->sk_flags &= ~flags;
431 		if (sock_needs_netstamp(sk) &&
432 		    !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
433 			net_disable_timestamp();
434 	}
435 }
436 
437 
438 int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
439 {
440 	unsigned long flags;
441 	struct sk_buff_head *list = &sk->sk_receive_queue;
442 
443 	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
444 		atomic_inc(&sk->sk_drops);
445 		trace_sock_rcvqueue_full(sk, skb);
446 		return -ENOMEM;
447 	}
448 
449 	if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
450 		atomic_inc(&sk->sk_drops);
451 		return -ENOBUFS;
452 	}
453 
454 	skb->dev = NULL;
455 	skb_set_owner_r(skb, sk);
456 
457 	/* we escape from rcu protected region, make sure we dont leak
458 	 * a norefcounted dst
459 	 */
460 	skb_dst_force(skb);
461 
462 	spin_lock_irqsave(&list->lock, flags);
463 	sock_skb_set_dropcount(sk, skb);
464 	__skb_queue_tail(list, skb);
465 	spin_unlock_irqrestore(&list->lock, flags);
466 
467 	if (!sock_flag(sk, SOCK_DEAD))
468 		sk->sk_data_ready(sk);
469 	return 0;
470 }
471 EXPORT_SYMBOL(__sock_queue_rcv_skb);
472 
473 int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
474 {
475 	int err;
476 
477 	err = sk_filter(sk, skb);
478 	if (err)
479 		return err;
480 
481 	return __sock_queue_rcv_skb(sk, skb);
482 }
483 EXPORT_SYMBOL(sock_queue_rcv_skb);
484 
485 int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
486 		     const int nested, unsigned int trim_cap, bool refcounted)
487 {
488 	int rc = NET_RX_SUCCESS;
489 
490 	if (sk_filter_trim_cap(sk, skb, trim_cap))
491 		goto discard_and_relse;
492 
493 	skb->dev = NULL;
494 
495 	if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
496 		atomic_inc(&sk->sk_drops);
497 		goto discard_and_relse;
498 	}
499 	if (nested)
500 		bh_lock_sock_nested(sk);
501 	else
502 		bh_lock_sock(sk);
503 	if (!sock_owned_by_user(sk)) {
504 		/*
505 		 * trylock + unlock semantics:
506 		 */
507 		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
508 
509 		rc = sk_backlog_rcv(sk, skb);
510 
511 		mutex_release(&sk->sk_lock.dep_map, _RET_IP_);
512 	} else if (sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf))) {
513 		bh_unlock_sock(sk);
514 		atomic_inc(&sk->sk_drops);
515 		goto discard_and_relse;
516 	}
517 
518 	bh_unlock_sock(sk);
519 out:
520 	if (refcounted)
521 		sock_put(sk);
522 	return rc;
523 discard_and_relse:
524 	kfree_skb(skb);
525 	goto out;
526 }
527 EXPORT_SYMBOL(__sk_receive_skb);
528 
529 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
530 {
531 	struct dst_entry *dst = __sk_dst_get(sk);
532 
533 	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
534 		sk_tx_queue_clear(sk);
535 		sk->sk_dst_pending_confirm = 0;
536 		RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
537 		dst_release(dst);
538 		return NULL;
539 	}
540 
541 	return dst;
542 }
543 EXPORT_SYMBOL(__sk_dst_check);
544 
545 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
546 {
547 	struct dst_entry *dst = sk_dst_get(sk);
548 
549 	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
550 		sk_dst_reset(sk);
551 		dst_release(dst);
552 		return NULL;
553 	}
554 
555 	return dst;
556 }
557 EXPORT_SYMBOL(sk_dst_check);
558 
559 static int sock_bindtoindex_locked(struct sock *sk, int ifindex)
560 {
561 	int ret = -ENOPROTOOPT;
562 #ifdef CONFIG_NETDEVICES
563 	struct net *net = sock_net(sk);
564 
565 	/* Sorry... */
566 	ret = -EPERM;
567 	if (sk->sk_bound_dev_if && !ns_capable(net->user_ns, CAP_NET_RAW))
568 		goto out;
569 
570 	ret = -EINVAL;
571 	if (ifindex < 0)
572 		goto out;
573 
574 	sk->sk_bound_dev_if = ifindex;
575 	if (sk->sk_prot->rehash)
576 		sk->sk_prot->rehash(sk);
577 	sk_dst_reset(sk);
578 
579 	ret = 0;
580 
581 out:
582 #endif
583 
584 	return ret;
585 }
586 
587 int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk)
588 {
589 	int ret;
590 
591 	if (lock_sk)
592 		lock_sock(sk);
593 	ret = sock_bindtoindex_locked(sk, ifindex);
594 	if (lock_sk)
595 		release_sock(sk);
596 
597 	return ret;
598 }
599 EXPORT_SYMBOL(sock_bindtoindex);
600 
601 static int sock_setbindtodevice(struct sock *sk, sockptr_t optval, int optlen)
602 {
603 	int ret = -ENOPROTOOPT;
604 #ifdef CONFIG_NETDEVICES
605 	struct net *net = sock_net(sk);
606 	char devname[IFNAMSIZ];
607 	int index;
608 
609 	ret = -EINVAL;
610 	if (optlen < 0)
611 		goto out;
612 
613 	/* Bind this socket to a particular device like "eth0",
614 	 * as specified in the passed interface name. If the
615 	 * name is "" or the option length is zero the socket
616 	 * is not bound.
617 	 */
618 	if (optlen > IFNAMSIZ - 1)
619 		optlen = IFNAMSIZ - 1;
620 	memset(devname, 0, sizeof(devname));
621 
622 	ret = -EFAULT;
623 	if (copy_from_sockptr(devname, optval, optlen))
624 		goto out;
625 
626 	index = 0;
627 	if (devname[0] != '\0') {
628 		struct net_device *dev;
629 
630 		rcu_read_lock();
631 		dev = dev_get_by_name_rcu(net, devname);
632 		if (dev)
633 			index = dev->ifindex;
634 		rcu_read_unlock();
635 		ret = -ENODEV;
636 		if (!dev)
637 			goto out;
638 	}
639 
640 	return sock_bindtoindex(sk, index, true);
641 out:
642 #endif
643 
644 	return ret;
645 }
646 
647 static int sock_getbindtodevice(struct sock *sk, char __user *optval,
648 				int __user *optlen, int len)
649 {
650 	int ret = -ENOPROTOOPT;
651 #ifdef CONFIG_NETDEVICES
652 	struct net *net = sock_net(sk);
653 	char devname[IFNAMSIZ];
654 
655 	if (sk->sk_bound_dev_if == 0) {
656 		len = 0;
657 		goto zero;
658 	}
659 
660 	ret = -EINVAL;
661 	if (len < IFNAMSIZ)
662 		goto out;
663 
664 	ret = netdev_get_name(net, devname, sk->sk_bound_dev_if);
665 	if (ret)
666 		goto out;
667 
668 	len = strlen(devname) + 1;
669 
670 	ret = -EFAULT;
671 	if (copy_to_user(optval, devname, len))
672 		goto out;
673 
674 zero:
675 	ret = -EFAULT;
676 	if (put_user(len, optlen))
677 		goto out;
678 
679 	ret = 0;
680 
681 out:
682 #endif
683 
684 	return ret;
685 }
686 
687 bool sk_mc_loop(struct sock *sk)
688 {
689 	if (dev_recursion_level())
690 		return false;
691 	if (!sk)
692 		return true;
693 	switch (sk->sk_family) {
694 	case AF_INET:
695 		return inet_sk(sk)->mc_loop;
696 #if IS_ENABLED(CONFIG_IPV6)
697 	case AF_INET6:
698 		return inet6_sk(sk)->mc_loop;
699 #endif
700 	}
701 	WARN_ON_ONCE(1);
702 	return true;
703 }
704 EXPORT_SYMBOL(sk_mc_loop);
705 
706 void sock_set_reuseaddr(struct sock *sk)
707 {
708 	lock_sock(sk);
709 	sk->sk_reuse = SK_CAN_REUSE;
710 	release_sock(sk);
711 }
712 EXPORT_SYMBOL(sock_set_reuseaddr);
713 
714 void sock_set_reuseport(struct sock *sk)
715 {
716 	lock_sock(sk);
717 	sk->sk_reuseport = true;
718 	release_sock(sk);
719 }
720 EXPORT_SYMBOL(sock_set_reuseport);
721 
722 void sock_no_linger(struct sock *sk)
723 {
724 	lock_sock(sk);
725 	sk->sk_lingertime = 0;
726 	sock_set_flag(sk, SOCK_LINGER);
727 	release_sock(sk);
728 }
729 EXPORT_SYMBOL(sock_no_linger);
730 
731 void sock_set_priority(struct sock *sk, u32 priority)
732 {
733 	lock_sock(sk);
734 	sk->sk_priority = priority;
735 	release_sock(sk);
736 }
737 EXPORT_SYMBOL(sock_set_priority);
738 
739 void sock_set_sndtimeo(struct sock *sk, s64 secs)
740 {
741 	lock_sock(sk);
742 	if (secs && secs < MAX_SCHEDULE_TIMEOUT / HZ - 1)
743 		sk->sk_sndtimeo = secs * HZ;
744 	else
745 		sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
746 	release_sock(sk);
747 }
748 EXPORT_SYMBOL(sock_set_sndtimeo);
749 
750 static void __sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns)
751 {
752 	if (val)  {
753 		sock_valbool_flag(sk, SOCK_TSTAMP_NEW, new);
754 		sock_valbool_flag(sk, SOCK_RCVTSTAMPNS, ns);
755 		sock_set_flag(sk, SOCK_RCVTSTAMP);
756 		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
757 	} else {
758 		sock_reset_flag(sk, SOCK_RCVTSTAMP);
759 		sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
760 		sock_reset_flag(sk, SOCK_TSTAMP_NEW);
761 	}
762 }
763 
764 void sock_enable_timestamps(struct sock *sk)
765 {
766 	lock_sock(sk);
767 	__sock_set_timestamps(sk, true, false, true);
768 	release_sock(sk);
769 }
770 EXPORT_SYMBOL(sock_enable_timestamps);
771 
772 void sock_set_keepalive(struct sock *sk)
773 {
774 	lock_sock(sk);
775 	if (sk->sk_prot->keepalive)
776 		sk->sk_prot->keepalive(sk, true);
777 	sock_valbool_flag(sk, SOCK_KEEPOPEN, true);
778 	release_sock(sk);
779 }
780 EXPORT_SYMBOL(sock_set_keepalive);
781 
782 static void __sock_set_rcvbuf(struct sock *sk, int val)
783 {
784 	/* Ensure val * 2 fits into an int, to prevent max_t() from treating it
785 	 * as a negative value.
786 	 */
787 	val = min_t(int, val, INT_MAX / 2);
788 	sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
789 
790 	/* We double it on the way in to account for "struct sk_buff" etc.
791 	 * overhead.   Applications assume that the SO_RCVBUF setting they make
792 	 * will allow that much actual data to be received on that socket.
793 	 *
794 	 * Applications are unaware that "struct sk_buff" and other overheads
795 	 * allocate from the receive buffer during socket buffer allocation.
796 	 *
797 	 * And after considering the possible alternatives, returning the value
798 	 * we actually used in getsockopt is the most desirable behavior.
799 	 */
800 	WRITE_ONCE(sk->sk_rcvbuf, max_t(int, val * 2, SOCK_MIN_RCVBUF));
801 }
802 
803 void sock_set_rcvbuf(struct sock *sk, int val)
804 {
805 	lock_sock(sk);
806 	__sock_set_rcvbuf(sk, val);
807 	release_sock(sk);
808 }
809 EXPORT_SYMBOL(sock_set_rcvbuf);
810 
811 void sock_set_mark(struct sock *sk, u32 val)
812 {
813 	lock_sock(sk);
814 	sk->sk_mark = val;
815 	release_sock(sk);
816 }
817 EXPORT_SYMBOL(sock_set_mark);
818 
819 /*
820  *	This is meant for all protocols to use and covers goings on
821  *	at the socket level. Everything here is generic.
822  */
823 
824 int sock_setsockopt(struct socket *sock, int level, int optname,
825 		    sockptr_t optval, unsigned int optlen)
826 {
827 	struct sock_txtime sk_txtime;
828 	struct sock *sk = sock->sk;
829 	int val;
830 	int valbool;
831 	struct linger ling;
832 	int ret = 0;
833 
834 	/*
835 	 *	Options without arguments
836 	 */
837 
838 	if (optname == SO_BINDTODEVICE)
839 		return sock_setbindtodevice(sk, optval, optlen);
840 
841 	if (optlen < sizeof(int))
842 		return -EINVAL;
843 
844 	if (copy_from_sockptr(&val, optval, sizeof(val)))
845 		return -EFAULT;
846 
847 	valbool = val ? 1 : 0;
848 
849 	lock_sock(sk);
850 
851 	switch (optname) {
852 	case SO_DEBUG:
853 		if (val && !capable(CAP_NET_ADMIN))
854 			ret = -EACCES;
855 		else
856 			sock_valbool_flag(sk, SOCK_DBG, valbool);
857 		break;
858 	case SO_REUSEADDR:
859 		sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
860 		break;
861 	case SO_REUSEPORT:
862 		sk->sk_reuseport = valbool;
863 		break;
864 	case SO_TYPE:
865 	case SO_PROTOCOL:
866 	case SO_DOMAIN:
867 	case SO_ERROR:
868 		ret = -ENOPROTOOPT;
869 		break;
870 	case SO_DONTROUTE:
871 		sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
872 		sk_dst_reset(sk);
873 		break;
874 	case SO_BROADCAST:
875 		sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
876 		break;
877 	case SO_SNDBUF:
878 		/* Don't error on this BSD doesn't and if you think
879 		 * about it this is right. Otherwise apps have to
880 		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
881 		 * are treated in BSD as hints
882 		 */
883 		val = min_t(u32, val, sysctl_wmem_max);
884 set_sndbuf:
885 		/* Ensure val * 2 fits into an int, to prevent max_t()
886 		 * from treating it as a negative value.
887 		 */
888 		val = min_t(int, val, INT_MAX / 2);
889 		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
890 		WRITE_ONCE(sk->sk_sndbuf,
891 			   max_t(int, val * 2, SOCK_MIN_SNDBUF));
892 		/* Wake up sending tasks if we upped the value. */
893 		sk->sk_write_space(sk);
894 		break;
895 
896 	case SO_SNDBUFFORCE:
897 		if (!capable(CAP_NET_ADMIN)) {
898 			ret = -EPERM;
899 			break;
900 		}
901 
902 		/* No negative values (to prevent underflow, as val will be
903 		 * multiplied by 2).
904 		 */
905 		if (val < 0)
906 			val = 0;
907 		goto set_sndbuf;
908 
909 	case SO_RCVBUF:
910 		/* Don't error on this BSD doesn't and if you think
911 		 * about it this is right. Otherwise apps have to
912 		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
913 		 * are treated in BSD as hints
914 		 */
915 		__sock_set_rcvbuf(sk, min_t(u32, val, sysctl_rmem_max));
916 		break;
917 
918 	case SO_RCVBUFFORCE:
919 		if (!capable(CAP_NET_ADMIN)) {
920 			ret = -EPERM;
921 			break;
922 		}
923 
924 		/* No negative values (to prevent underflow, as val will be
925 		 * multiplied by 2).
926 		 */
927 		__sock_set_rcvbuf(sk, max(val, 0));
928 		break;
929 
930 	case SO_KEEPALIVE:
931 		if (sk->sk_prot->keepalive)
932 			sk->sk_prot->keepalive(sk, valbool);
933 		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
934 		break;
935 
936 	case SO_OOBINLINE:
937 		sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
938 		break;
939 
940 	case SO_NO_CHECK:
941 		sk->sk_no_check_tx = valbool;
942 		break;
943 
944 	case SO_PRIORITY:
945 		if ((val >= 0 && val <= 6) ||
946 		    ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
947 			sk->sk_priority = val;
948 		else
949 			ret = -EPERM;
950 		break;
951 
952 	case SO_LINGER:
953 		if (optlen < sizeof(ling)) {
954 			ret = -EINVAL;	/* 1003.1g */
955 			break;
956 		}
957 		if (copy_from_sockptr(&ling, optval, sizeof(ling))) {
958 			ret = -EFAULT;
959 			break;
960 		}
961 		if (!ling.l_onoff)
962 			sock_reset_flag(sk, SOCK_LINGER);
963 		else {
964 #if (BITS_PER_LONG == 32)
965 			if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
966 				sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
967 			else
968 #endif
969 				sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
970 			sock_set_flag(sk, SOCK_LINGER);
971 		}
972 		break;
973 
974 	case SO_BSDCOMPAT:
975 		break;
976 
977 	case SO_PASSCRED:
978 		if (valbool)
979 			set_bit(SOCK_PASSCRED, &sock->flags);
980 		else
981 			clear_bit(SOCK_PASSCRED, &sock->flags);
982 		break;
983 
984 	case SO_TIMESTAMP_OLD:
985 		__sock_set_timestamps(sk, valbool, false, false);
986 		break;
987 	case SO_TIMESTAMP_NEW:
988 		__sock_set_timestamps(sk, valbool, true, false);
989 		break;
990 	case SO_TIMESTAMPNS_OLD:
991 		__sock_set_timestamps(sk, valbool, false, true);
992 		break;
993 	case SO_TIMESTAMPNS_NEW:
994 		__sock_set_timestamps(sk, valbool, true, true);
995 		break;
996 	case SO_TIMESTAMPING_NEW:
997 		sock_set_flag(sk, SOCK_TSTAMP_NEW);
998 		fallthrough;
999 	case SO_TIMESTAMPING_OLD:
1000 		if (val & ~SOF_TIMESTAMPING_MASK) {
1001 			ret = -EINVAL;
1002 			break;
1003 		}
1004 
1005 		if (val & SOF_TIMESTAMPING_OPT_ID &&
1006 		    !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
1007 			if (sk->sk_protocol == IPPROTO_TCP &&
1008 			    sk->sk_type == SOCK_STREAM) {
1009 				if ((1 << sk->sk_state) &
1010 				    (TCPF_CLOSE | TCPF_LISTEN)) {
1011 					ret = -EINVAL;
1012 					break;
1013 				}
1014 				sk->sk_tskey = tcp_sk(sk)->snd_una;
1015 			} else {
1016 				sk->sk_tskey = 0;
1017 			}
1018 		}
1019 
1020 		if (val & SOF_TIMESTAMPING_OPT_STATS &&
1021 		    !(val & SOF_TIMESTAMPING_OPT_TSONLY)) {
1022 			ret = -EINVAL;
1023 			break;
1024 		}
1025 
1026 		sk->sk_tsflags = val;
1027 		if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
1028 			sock_enable_timestamp(sk,
1029 					      SOCK_TIMESTAMPING_RX_SOFTWARE);
1030 		else {
1031 			if (optname == SO_TIMESTAMPING_NEW)
1032 				sock_reset_flag(sk, SOCK_TSTAMP_NEW);
1033 
1034 			sock_disable_timestamp(sk,
1035 					       (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
1036 		}
1037 		break;
1038 
1039 	case SO_RCVLOWAT:
1040 		if (val < 0)
1041 			val = INT_MAX;
1042 		if (sock->ops->set_rcvlowat)
1043 			ret = sock->ops->set_rcvlowat(sk, val);
1044 		else
1045 			WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
1046 		break;
1047 
1048 	case SO_RCVTIMEO_OLD:
1049 	case SO_RCVTIMEO_NEW:
1050 		ret = sock_set_timeout(&sk->sk_rcvtimeo, optval,
1051 				       optlen, optname == SO_RCVTIMEO_OLD);
1052 		break;
1053 
1054 	case SO_SNDTIMEO_OLD:
1055 	case SO_SNDTIMEO_NEW:
1056 		ret = sock_set_timeout(&sk->sk_sndtimeo, optval,
1057 				       optlen, optname == SO_SNDTIMEO_OLD);
1058 		break;
1059 
1060 	case SO_ATTACH_FILTER: {
1061 		struct sock_fprog fprog;
1062 
1063 		ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1064 		if (!ret)
1065 			ret = sk_attach_filter(&fprog, sk);
1066 		break;
1067 	}
1068 	case SO_ATTACH_BPF:
1069 		ret = -EINVAL;
1070 		if (optlen == sizeof(u32)) {
1071 			u32 ufd;
1072 
1073 			ret = -EFAULT;
1074 			if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1075 				break;
1076 
1077 			ret = sk_attach_bpf(ufd, sk);
1078 		}
1079 		break;
1080 
1081 	case SO_ATTACH_REUSEPORT_CBPF: {
1082 		struct sock_fprog fprog;
1083 
1084 		ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1085 		if (!ret)
1086 			ret = sk_reuseport_attach_filter(&fprog, sk);
1087 		break;
1088 	}
1089 	case SO_ATTACH_REUSEPORT_EBPF:
1090 		ret = -EINVAL;
1091 		if (optlen == sizeof(u32)) {
1092 			u32 ufd;
1093 
1094 			ret = -EFAULT;
1095 			if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1096 				break;
1097 
1098 			ret = sk_reuseport_attach_bpf(ufd, sk);
1099 		}
1100 		break;
1101 
1102 	case SO_DETACH_REUSEPORT_BPF:
1103 		ret = reuseport_detach_prog(sk);
1104 		break;
1105 
1106 	case SO_DETACH_FILTER:
1107 		ret = sk_detach_filter(sk);
1108 		break;
1109 
1110 	case SO_LOCK_FILTER:
1111 		if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
1112 			ret = -EPERM;
1113 		else
1114 			sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
1115 		break;
1116 
1117 	case SO_PASSSEC:
1118 		if (valbool)
1119 			set_bit(SOCK_PASSSEC, &sock->flags);
1120 		else
1121 			clear_bit(SOCK_PASSSEC, &sock->flags);
1122 		break;
1123 	case SO_MARK:
1124 		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1125 			ret = -EPERM;
1126 		} else if (val != sk->sk_mark) {
1127 			sk->sk_mark = val;
1128 			sk_dst_reset(sk);
1129 		}
1130 		break;
1131 
1132 	case SO_RXQ_OVFL:
1133 		sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
1134 		break;
1135 
1136 	case SO_WIFI_STATUS:
1137 		sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
1138 		break;
1139 
1140 	case SO_PEEK_OFF:
1141 		if (sock->ops->set_peek_off)
1142 			ret = sock->ops->set_peek_off(sk, val);
1143 		else
1144 			ret = -EOPNOTSUPP;
1145 		break;
1146 
1147 	case SO_NOFCS:
1148 		sock_valbool_flag(sk, SOCK_NOFCS, valbool);
1149 		break;
1150 
1151 	case SO_SELECT_ERR_QUEUE:
1152 		sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
1153 		break;
1154 
1155 #ifdef CONFIG_NET_RX_BUSY_POLL
1156 	case SO_BUSY_POLL:
1157 		/* allow unprivileged users to decrease the value */
1158 		if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
1159 			ret = -EPERM;
1160 		else {
1161 			if (val < 0)
1162 				ret = -EINVAL;
1163 			else
1164 				sk->sk_ll_usec = val;
1165 		}
1166 		break;
1167 #endif
1168 
1169 	case SO_MAX_PACING_RATE:
1170 		{
1171 		unsigned long ulval = (val == ~0U) ? ~0UL : val;
1172 
1173 		if (sizeof(ulval) != sizeof(val) &&
1174 		    optlen >= sizeof(ulval) &&
1175 		    copy_from_sockptr(&ulval, optval, sizeof(ulval))) {
1176 			ret = -EFAULT;
1177 			break;
1178 		}
1179 		if (ulval != ~0UL)
1180 			cmpxchg(&sk->sk_pacing_status,
1181 				SK_PACING_NONE,
1182 				SK_PACING_NEEDED);
1183 		sk->sk_max_pacing_rate = ulval;
1184 		sk->sk_pacing_rate = min(sk->sk_pacing_rate, ulval);
1185 		break;
1186 		}
1187 	case SO_INCOMING_CPU:
1188 		WRITE_ONCE(sk->sk_incoming_cpu, val);
1189 		break;
1190 
1191 	case SO_CNX_ADVICE:
1192 		if (val == 1)
1193 			dst_negative_advice(sk);
1194 		break;
1195 
1196 	case SO_ZEROCOPY:
1197 		if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
1198 			if (!((sk->sk_type == SOCK_STREAM &&
1199 			       sk->sk_protocol == IPPROTO_TCP) ||
1200 			      (sk->sk_type == SOCK_DGRAM &&
1201 			       sk->sk_protocol == IPPROTO_UDP)))
1202 				ret = -ENOTSUPP;
1203 		} else if (sk->sk_family != PF_RDS) {
1204 			ret = -ENOTSUPP;
1205 		}
1206 		if (!ret) {
1207 			if (val < 0 || val > 1)
1208 				ret = -EINVAL;
1209 			else
1210 				sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
1211 		}
1212 		break;
1213 
1214 	case SO_TXTIME:
1215 		if (optlen != sizeof(struct sock_txtime)) {
1216 			ret = -EINVAL;
1217 			break;
1218 		} else if (copy_from_sockptr(&sk_txtime, optval,
1219 			   sizeof(struct sock_txtime))) {
1220 			ret = -EFAULT;
1221 			break;
1222 		} else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) {
1223 			ret = -EINVAL;
1224 			break;
1225 		}
1226 		/* CLOCK_MONOTONIC is only used by sch_fq, and this packet
1227 		 * scheduler has enough safe guards.
1228 		 */
1229 		if (sk_txtime.clockid != CLOCK_MONOTONIC &&
1230 		    !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1231 			ret = -EPERM;
1232 			break;
1233 		}
1234 		sock_valbool_flag(sk, SOCK_TXTIME, true);
1235 		sk->sk_clockid = sk_txtime.clockid;
1236 		sk->sk_txtime_deadline_mode =
1237 			!!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE);
1238 		sk->sk_txtime_report_errors =
1239 			!!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS);
1240 		break;
1241 
1242 	case SO_BINDTOIFINDEX:
1243 		ret = sock_bindtoindex_locked(sk, val);
1244 		break;
1245 
1246 	default:
1247 		ret = -ENOPROTOOPT;
1248 		break;
1249 	}
1250 	release_sock(sk);
1251 	return ret;
1252 }
1253 EXPORT_SYMBOL(sock_setsockopt);
1254 
1255 
1256 static void cred_to_ucred(struct pid *pid, const struct cred *cred,
1257 			  struct ucred *ucred)
1258 {
1259 	ucred->pid = pid_vnr(pid);
1260 	ucred->uid = ucred->gid = -1;
1261 	if (cred) {
1262 		struct user_namespace *current_ns = current_user_ns();
1263 
1264 		ucred->uid = from_kuid_munged(current_ns, cred->euid);
1265 		ucred->gid = from_kgid_munged(current_ns, cred->egid);
1266 	}
1267 }
1268 
1269 static int groups_to_user(gid_t __user *dst, const struct group_info *src)
1270 {
1271 	struct user_namespace *user_ns = current_user_ns();
1272 	int i;
1273 
1274 	for (i = 0; i < src->ngroups; i++)
1275 		if (put_user(from_kgid_munged(user_ns, src->gid[i]), dst + i))
1276 			return -EFAULT;
1277 
1278 	return 0;
1279 }
1280 
1281 int sock_getsockopt(struct socket *sock, int level, int optname,
1282 		    char __user *optval, int __user *optlen)
1283 {
1284 	struct sock *sk = sock->sk;
1285 
1286 	union {
1287 		int val;
1288 		u64 val64;
1289 		unsigned long ulval;
1290 		struct linger ling;
1291 		struct old_timeval32 tm32;
1292 		struct __kernel_old_timeval tm;
1293 		struct  __kernel_sock_timeval stm;
1294 		struct sock_txtime txtime;
1295 	} v;
1296 
1297 	int lv = sizeof(int);
1298 	int len;
1299 
1300 	if (get_user(len, optlen))
1301 		return -EFAULT;
1302 	if (len < 0)
1303 		return -EINVAL;
1304 
1305 	memset(&v, 0, sizeof(v));
1306 
1307 	switch (optname) {
1308 	case SO_DEBUG:
1309 		v.val = sock_flag(sk, SOCK_DBG);
1310 		break;
1311 
1312 	case SO_DONTROUTE:
1313 		v.val = sock_flag(sk, SOCK_LOCALROUTE);
1314 		break;
1315 
1316 	case SO_BROADCAST:
1317 		v.val = sock_flag(sk, SOCK_BROADCAST);
1318 		break;
1319 
1320 	case SO_SNDBUF:
1321 		v.val = sk->sk_sndbuf;
1322 		break;
1323 
1324 	case SO_RCVBUF:
1325 		v.val = sk->sk_rcvbuf;
1326 		break;
1327 
1328 	case SO_REUSEADDR:
1329 		v.val = sk->sk_reuse;
1330 		break;
1331 
1332 	case SO_REUSEPORT:
1333 		v.val = sk->sk_reuseport;
1334 		break;
1335 
1336 	case SO_KEEPALIVE:
1337 		v.val = sock_flag(sk, SOCK_KEEPOPEN);
1338 		break;
1339 
1340 	case SO_TYPE:
1341 		v.val = sk->sk_type;
1342 		break;
1343 
1344 	case SO_PROTOCOL:
1345 		v.val = sk->sk_protocol;
1346 		break;
1347 
1348 	case SO_DOMAIN:
1349 		v.val = sk->sk_family;
1350 		break;
1351 
1352 	case SO_ERROR:
1353 		v.val = -sock_error(sk);
1354 		if (v.val == 0)
1355 			v.val = xchg(&sk->sk_err_soft, 0);
1356 		break;
1357 
1358 	case SO_OOBINLINE:
1359 		v.val = sock_flag(sk, SOCK_URGINLINE);
1360 		break;
1361 
1362 	case SO_NO_CHECK:
1363 		v.val = sk->sk_no_check_tx;
1364 		break;
1365 
1366 	case SO_PRIORITY:
1367 		v.val = sk->sk_priority;
1368 		break;
1369 
1370 	case SO_LINGER:
1371 		lv		= sizeof(v.ling);
1372 		v.ling.l_onoff	= sock_flag(sk, SOCK_LINGER);
1373 		v.ling.l_linger	= sk->sk_lingertime / HZ;
1374 		break;
1375 
1376 	case SO_BSDCOMPAT:
1377 		break;
1378 
1379 	case SO_TIMESTAMP_OLD:
1380 		v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1381 				!sock_flag(sk, SOCK_TSTAMP_NEW) &&
1382 				!sock_flag(sk, SOCK_RCVTSTAMPNS);
1383 		break;
1384 
1385 	case SO_TIMESTAMPNS_OLD:
1386 		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && !sock_flag(sk, SOCK_TSTAMP_NEW);
1387 		break;
1388 
1389 	case SO_TIMESTAMP_NEW:
1390 		v.val = sock_flag(sk, SOCK_RCVTSTAMP) && sock_flag(sk, SOCK_TSTAMP_NEW);
1391 		break;
1392 
1393 	case SO_TIMESTAMPNS_NEW:
1394 		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && sock_flag(sk, SOCK_TSTAMP_NEW);
1395 		break;
1396 
1397 	case SO_TIMESTAMPING_OLD:
1398 		v.val = sk->sk_tsflags;
1399 		break;
1400 
1401 	case SO_RCVTIMEO_OLD:
1402 	case SO_RCVTIMEO_NEW:
1403 		lv = sock_get_timeout(sk->sk_rcvtimeo, &v, SO_RCVTIMEO_OLD == optname);
1404 		break;
1405 
1406 	case SO_SNDTIMEO_OLD:
1407 	case SO_SNDTIMEO_NEW:
1408 		lv = sock_get_timeout(sk->sk_sndtimeo, &v, SO_SNDTIMEO_OLD == optname);
1409 		break;
1410 
1411 	case SO_RCVLOWAT:
1412 		v.val = sk->sk_rcvlowat;
1413 		break;
1414 
1415 	case SO_SNDLOWAT:
1416 		v.val = 1;
1417 		break;
1418 
1419 	case SO_PASSCRED:
1420 		v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1421 		break;
1422 
1423 	case SO_PEERCRED:
1424 	{
1425 		struct ucred peercred;
1426 		if (len > sizeof(peercred))
1427 			len = sizeof(peercred);
1428 		cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1429 		if (copy_to_user(optval, &peercred, len))
1430 			return -EFAULT;
1431 		goto lenout;
1432 	}
1433 
1434 	case SO_PEERGROUPS:
1435 	{
1436 		int ret, n;
1437 
1438 		if (!sk->sk_peer_cred)
1439 			return -ENODATA;
1440 
1441 		n = sk->sk_peer_cred->group_info->ngroups;
1442 		if (len < n * sizeof(gid_t)) {
1443 			len = n * sizeof(gid_t);
1444 			return put_user(len, optlen) ? -EFAULT : -ERANGE;
1445 		}
1446 		len = n * sizeof(gid_t);
1447 
1448 		ret = groups_to_user((gid_t __user *)optval,
1449 				     sk->sk_peer_cred->group_info);
1450 		if (ret)
1451 			return ret;
1452 		goto lenout;
1453 	}
1454 
1455 	case SO_PEERNAME:
1456 	{
1457 		char address[128];
1458 
1459 		lv = sock->ops->getname(sock, (struct sockaddr *)address, 2);
1460 		if (lv < 0)
1461 			return -ENOTCONN;
1462 		if (lv < len)
1463 			return -EINVAL;
1464 		if (copy_to_user(optval, address, len))
1465 			return -EFAULT;
1466 		goto lenout;
1467 	}
1468 
1469 	/* Dubious BSD thing... Probably nobody even uses it, but
1470 	 * the UNIX standard wants it for whatever reason... -DaveM
1471 	 */
1472 	case SO_ACCEPTCONN:
1473 		v.val = sk->sk_state == TCP_LISTEN;
1474 		break;
1475 
1476 	case SO_PASSSEC:
1477 		v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1478 		break;
1479 
1480 	case SO_PEERSEC:
1481 		return security_socket_getpeersec_stream(sock, optval, optlen, len);
1482 
1483 	case SO_MARK:
1484 		v.val = sk->sk_mark;
1485 		break;
1486 
1487 	case SO_RXQ_OVFL:
1488 		v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1489 		break;
1490 
1491 	case SO_WIFI_STATUS:
1492 		v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1493 		break;
1494 
1495 	case SO_PEEK_OFF:
1496 		if (!sock->ops->set_peek_off)
1497 			return -EOPNOTSUPP;
1498 
1499 		v.val = sk->sk_peek_off;
1500 		break;
1501 	case SO_NOFCS:
1502 		v.val = sock_flag(sk, SOCK_NOFCS);
1503 		break;
1504 
1505 	case SO_BINDTODEVICE:
1506 		return sock_getbindtodevice(sk, optval, optlen, len);
1507 
1508 	case SO_GET_FILTER:
1509 		len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
1510 		if (len < 0)
1511 			return len;
1512 
1513 		goto lenout;
1514 
1515 	case SO_LOCK_FILTER:
1516 		v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1517 		break;
1518 
1519 	case SO_BPF_EXTENSIONS:
1520 		v.val = bpf_tell_extensions();
1521 		break;
1522 
1523 	case SO_SELECT_ERR_QUEUE:
1524 		v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1525 		break;
1526 
1527 #ifdef CONFIG_NET_RX_BUSY_POLL
1528 	case SO_BUSY_POLL:
1529 		v.val = sk->sk_ll_usec;
1530 		break;
1531 #endif
1532 
1533 	case SO_MAX_PACING_RATE:
1534 		if (sizeof(v.ulval) != sizeof(v.val) && len >= sizeof(v.ulval)) {
1535 			lv = sizeof(v.ulval);
1536 			v.ulval = sk->sk_max_pacing_rate;
1537 		} else {
1538 			/* 32bit version */
1539 			v.val = min_t(unsigned long, sk->sk_max_pacing_rate, ~0U);
1540 		}
1541 		break;
1542 
1543 	case SO_INCOMING_CPU:
1544 		v.val = READ_ONCE(sk->sk_incoming_cpu);
1545 		break;
1546 
1547 	case SO_MEMINFO:
1548 	{
1549 		u32 meminfo[SK_MEMINFO_VARS];
1550 
1551 		sk_get_meminfo(sk, meminfo);
1552 
1553 		len = min_t(unsigned int, len, sizeof(meminfo));
1554 		if (copy_to_user(optval, &meminfo, len))
1555 			return -EFAULT;
1556 
1557 		goto lenout;
1558 	}
1559 
1560 #ifdef CONFIG_NET_RX_BUSY_POLL
1561 	case SO_INCOMING_NAPI_ID:
1562 		v.val = READ_ONCE(sk->sk_napi_id);
1563 
1564 		/* aggregate non-NAPI IDs down to 0 */
1565 		if (v.val < MIN_NAPI_ID)
1566 			v.val = 0;
1567 
1568 		break;
1569 #endif
1570 
1571 	case SO_COOKIE:
1572 		lv = sizeof(u64);
1573 		if (len < lv)
1574 			return -EINVAL;
1575 		v.val64 = sock_gen_cookie(sk);
1576 		break;
1577 
1578 	case SO_ZEROCOPY:
1579 		v.val = sock_flag(sk, SOCK_ZEROCOPY);
1580 		break;
1581 
1582 	case SO_TXTIME:
1583 		lv = sizeof(v.txtime);
1584 		v.txtime.clockid = sk->sk_clockid;
1585 		v.txtime.flags |= sk->sk_txtime_deadline_mode ?
1586 				  SOF_TXTIME_DEADLINE_MODE : 0;
1587 		v.txtime.flags |= sk->sk_txtime_report_errors ?
1588 				  SOF_TXTIME_REPORT_ERRORS : 0;
1589 		break;
1590 
1591 	case SO_BINDTOIFINDEX:
1592 		v.val = sk->sk_bound_dev_if;
1593 		break;
1594 
1595 	default:
1596 		/* We implement the SO_SNDLOWAT etc to not be settable
1597 		 * (1003.1g 7).
1598 		 */
1599 		return -ENOPROTOOPT;
1600 	}
1601 
1602 	if (len > lv)
1603 		len = lv;
1604 	if (copy_to_user(optval, &v, len))
1605 		return -EFAULT;
1606 lenout:
1607 	if (put_user(len, optlen))
1608 		return -EFAULT;
1609 	return 0;
1610 }
1611 
1612 /*
1613  * Initialize an sk_lock.
1614  *
1615  * (We also register the sk_lock with the lock validator.)
1616  */
1617 static inline void sock_lock_init(struct sock *sk)
1618 {
1619 	if (sk->sk_kern_sock)
1620 		sock_lock_init_class_and_name(
1621 			sk,
1622 			af_family_kern_slock_key_strings[sk->sk_family],
1623 			af_family_kern_slock_keys + sk->sk_family,
1624 			af_family_kern_key_strings[sk->sk_family],
1625 			af_family_kern_keys + sk->sk_family);
1626 	else
1627 		sock_lock_init_class_and_name(
1628 			sk,
1629 			af_family_slock_key_strings[sk->sk_family],
1630 			af_family_slock_keys + sk->sk_family,
1631 			af_family_key_strings[sk->sk_family],
1632 			af_family_keys + sk->sk_family);
1633 }
1634 
1635 /*
1636  * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
1637  * even temporarly, because of RCU lookups. sk_node should also be left as is.
1638  * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
1639  */
1640 static void sock_copy(struct sock *nsk, const struct sock *osk)
1641 {
1642 	const struct proto *prot = READ_ONCE(osk->sk_prot);
1643 #ifdef CONFIG_SECURITY_NETWORK
1644 	void *sptr = nsk->sk_security;
1645 #endif
1646 	memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1647 
1648 	memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1649 	       prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1650 
1651 #ifdef CONFIG_SECURITY_NETWORK
1652 	nsk->sk_security = sptr;
1653 	security_sk_clone(osk, nsk);
1654 #endif
1655 }
1656 
1657 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1658 		int family)
1659 {
1660 	struct sock *sk;
1661 	struct kmem_cache *slab;
1662 
1663 	slab = prot->slab;
1664 	if (slab != NULL) {
1665 		sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1666 		if (!sk)
1667 			return sk;
1668 		if (want_init_on_alloc(priority))
1669 			sk_prot_clear_nulls(sk, prot->obj_size);
1670 	} else
1671 		sk = kmalloc(prot->obj_size, priority);
1672 
1673 	if (sk != NULL) {
1674 		if (security_sk_alloc(sk, family, priority))
1675 			goto out_free;
1676 
1677 		if (!try_module_get(prot->owner))
1678 			goto out_free_sec;
1679 		sk_tx_queue_clear(sk);
1680 	}
1681 
1682 	return sk;
1683 
1684 out_free_sec:
1685 	security_sk_free(sk);
1686 out_free:
1687 	if (slab != NULL)
1688 		kmem_cache_free(slab, sk);
1689 	else
1690 		kfree(sk);
1691 	return NULL;
1692 }
1693 
1694 static void sk_prot_free(struct proto *prot, struct sock *sk)
1695 {
1696 	struct kmem_cache *slab;
1697 	struct module *owner;
1698 
1699 	owner = prot->owner;
1700 	slab = prot->slab;
1701 
1702 	cgroup_sk_free(&sk->sk_cgrp_data);
1703 	mem_cgroup_sk_free(sk);
1704 	security_sk_free(sk);
1705 	if (slab != NULL)
1706 		kmem_cache_free(slab, sk);
1707 	else
1708 		kfree(sk);
1709 	module_put(owner);
1710 }
1711 
1712 /**
1713  *	sk_alloc - All socket objects are allocated here
1714  *	@net: the applicable net namespace
1715  *	@family: protocol family
1716  *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1717  *	@prot: struct proto associated with this new sock instance
1718  *	@kern: is this to be a kernel socket?
1719  */
1720 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1721 		      struct proto *prot, int kern)
1722 {
1723 	struct sock *sk;
1724 
1725 	sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1726 	if (sk) {
1727 		sk->sk_family = family;
1728 		/*
1729 		 * See comment in struct sock definition to understand
1730 		 * why we need sk_prot_creator -acme
1731 		 */
1732 		sk->sk_prot = sk->sk_prot_creator = prot;
1733 		sk->sk_kern_sock = kern;
1734 		sock_lock_init(sk);
1735 		sk->sk_net_refcnt = kern ? 0 : 1;
1736 		if (likely(sk->sk_net_refcnt)) {
1737 			get_net(net);
1738 			sock_inuse_add(net, 1);
1739 		}
1740 
1741 		sock_net_set(sk, net);
1742 		refcount_set(&sk->sk_wmem_alloc, 1);
1743 
1744 		mem_cgroup_sk_alloc(sk);
1745 		cgroup_sk_alloc(&sk->sk_cgrp_data);
1746 		sock_update_classid(&sk->sk_cgrp_data);
1747 		sock_update_netprioidx(&sk->sk_cgrp_data);
1748 		sk_tx_queue_clear(sk);
1749 	}
1750 
1751 	return sk;
1752 }
1753 EXPORT_SYMBOL(sk_alloc);
1754 
1755 /* Sockets having SOCK_RCU_FREE will call this function after one RCU
1756  * grace period. This is the case for UDP sockets and TCP listeners.
1757  */
1758 static void __sk_destruct(struct rcu_head *head)
1759 {
1760 	struct sock *sk = container_of(head, struct sock, sk_rcu);
1761 	struct sk_filter *filter;
1762 
1763 	if (sk->sk_destruct)
1764 		sk->sk_destruct(sk);
1765 
1766 	filter = rcu_dereference_check(sk->sk_filter,
1767 				       refcount_read(&sk->sk_wmem_alloc) == 0);
1768 	if (filter) {
1769 		sk_filter_uncharge(sk, filter);
1770 		RCU_INIT_POINTER(sk->sk_filter, NULL);
1771 	}
1772 
1773 	sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
1774 
1775 #ifdef CONFIG_BPF_SYSCALL
1776 	bpf_sk_storage_free(sk);
1777 #endif
1778 
1779 	if (atomic_read(&sk->sk_omem_alloc))
1780 		pr_debug("%s: optmem leakage (%d bytes) detected\n",
1781 			 __func__, atomic_read(&sk->sk_omem_alloc));
1782 
1783 	if (sk->sk_frag.page) {
1784 		put_page(sk->sk_frag.page);
1785 		sk->sk_frag.page = NULL;
1786 	}
1787 
1788 	if (sk->sk_peer_cred)
1789 		put_cred(sk->sk_peer_cred);
1790 	put_pid(sk->sk_peer_pid);
1791 	if (likely(sk->sk_net_refcnt))
1792 		put_net(sock_net(sk));
1793 	sk_prot_free(sk->sk_prot_creator, sk);
1794 }
1795 
1796 void sk_destruct(struct sock *sk)
1797 {
1798 	bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE);
1799 
1800 	if (rcu_access_pointer(sk->sk_reuseport_cb)) {
1801 		reuseport_detach_sock(sk);
1802 		use_call_rcu = true;
1803 	}
1804 
1805 	if (use_call_rcu)
1806 		call_rcu(&sk->sk_rcu, __sk_destruct);
1807 	else
1808 		__sk_destruct(&sk->sk_rcu);
1809 }
1810 
1811 static void __sk_free(struct sock *sk)
1812 {
1813 	if (likely(sk->sk_net_refcnt))
1814 		sock_inuse_add(sock_net(sk), -1);
1815 
1816 	if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk)))
1817 		sock_diag_broadcast_destroy(sk);
1818 	else
1819 		sk_destruct(sk);
1820 }
1821 
1822 void sk_free(struct sock *sk)
1823 {
1824 	/*
1825 	 * We subtract one from sk_wmem_alloc and can know if
1826 	 * some packets are still in some tx queue.
1827 	 * If not null, sock_wfree() will call __sk_free(sk) later
1828 	 */
1829 	if (refcount_dec_and_test(&sk->sk_wmem_alloc))
1830 		__sk_free(sk);
1831 }
1832 EXPORT_SYMBOL(sk_free);
1833 
1834 static void sk_init_common(struct sock *sk)
1835 {
1836 	skb_queue_head_init(&sk->sk_receive_queue);
1837 	skb_queue_head_init(&sk->sk_write_queue);
1838 	skb_queue_head_init(&sk->sk_error_queue);
1839 
1840 	rwlock_init(&sk->sk_callback_lock);
1841 	lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
1842 			af_rlock_keys + sk->sk_family,
1843 			af_family_rlock_key_strings[sk->sk_family]);
1844 	lockdep_set_class_and_name(&sk->sk_write_queue.lock,
1845 			af_wlock_keys + sk->sk_family,
1846 			af_family_wlock_key_strings[sk->sk_family]);
1847 	lockdep_set_class_and_name(&sk->sk_error_queue.lock,
1848 			af_elock_keys + sk->sk_family,
1849 			af_family_elock_key_strings[sk->sk_family]);
1850 	lockdep_set_class_and_name(&sk->sk_callback_lock,
1851 			af_callback_keys + sk->sk_family,
1852 			af_family_clock_key_strings[sk->sk_family]);
1853 }
1854 
1855 /**
1856  *	sk_clone_lock - clone a socket, and lock its clone
1857  *	@sk: the socket to clone
1858  *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1859  *
1860  *	Caller must unlock socket even in error path (bh_unlock_sock(newsk))
1861  */
1862 struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
1863 {
1864 	struct proto *prot = READ_ONCE(sk->sk_prot);
1865 	struct sock *newsk;
1866 	bool is_charged = true;
1867 
1868 	newsk = sk_prot_alloc(prot, priority, sk->sk_family);
1869 	if (newsk != NULL) {
1870 		struct sk_filter *filter;
1871 
1872 		sock_copy(newsk, sk);
1873 
1874 		newsk->sk_prot_creator = prot;
1875 
1876 		/* SANITY */
1877 		if (likely(newsk->sk_net_refcnt))
1878 			get_net(sock_net(newsk));
1879 		sk_node_init(&newsk->sk_node);
1880 		sock_lock_init(newsk);
1881 		bh_lock_sock(newsk);
1882 		newsk->sk_backlog.head	= newsk->sk_backlog.tail = NULL;
1883 		newsk->sk_backlog.len = 0;
1884 
1885 		atomic_set(&newsk->sk_rmem_alloc, 0);
1886 		/*
1887 		 * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
1888 		 */
1889 		refcount_set(&newsk->sk_wmem_alloc, 1);
1890 		atomic_set(&newsk->sk_omem_alloc, 0);
1891 		sk_init_common(newsk);
1892 
1893 		newsk->sk_dst_cache	= NULL;
1894 		newsk->sk_dst_pending_confirm = 0;
1895 		newsk->sk_wmem_queued	= 0;
1896 		newsk->sk_forward_alloc = 0;
1897 		atomic_set(&newsk->sk_drops, 0);
1898 		newsk->sk_send_head	= NULL;
1899 		newsk->sk_userlocks	= sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
1900 		atomic_set(&newsk->sk_zckey, 0);
1901 
1902 		sock_reset_flag(newsk, SOCK_DONE);
1903 
1904 		/* sk->sk_memcg will be populated at accept() time */
1905 		newsk->sk_memcg = NULL;
1906 
1907 		cgroup_sk_clone(&newsk->sk_cgrp_data);
1908 
1909 		rcu_read_lock();
1910 		filter = rcu_dereference(sk->sk_filter);
1911 		if (filter != NULL)
1912 			/* though it's an empty new sock, the charging may fail
1913 			 * if sysctl_optmem_max was changed between creation of
1914 			 * original socket and cloning
1915 			 */
1916 			is_charged = sk_filter_charge(newsk, filter);
1917 		RCU_INIT_POINTER(newsk->sk_filter, filter);
1918 		rcu_read_unlock();
1919 
1920 		if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
1921 			/* We need to make sure that we don't uncharge the new
1922 			 * socket if we couldn't charge it in the first place
1923 			 * as otherwise we uncharge the parent's filter.
1924 			 */
1925 			if (!is_charged)
1926 				RCU_INIT_POINTER(newsk->sk_filter, NULL);
1927 			sk_free_unlock_clone(newsk);
1928 			newsk = NULL;
1929 			goto out;
1930 		}
1931 		RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
1932 
1933 		if (bpf_sk_storage_clone(sk, newsk)) {
1934 			sk_free_unlock_clone(newsk);
1935 			newsk = NULL;
1936 			goto out;
1937 		}
1938 
1939 		/* Clear sk_user_data if parent had the pointer tagged
1940 		 * as not suitable for copying when cloning.
1941 		 */
1942 		if (sk_user_data_is_nocopy(newsk))
1943 			newsk->sk_user_data = NULL;
1944 
1945 		newsk->sk_err	   = 0;
1946 		newsk->sk_err_soft = 0;
1947 		newsk->sk_priority = 0;
1948 		newsk->sk_incoming_cpu = raw_smp_processor_id();
1949 		if (likely(newsk->sk_net_refcnt))
1950 			sock_inuse_add(sock_net(newsk), 1);
1951 
1952 		/*
1953 		 * Before updating sk_refcnt, we must commit prior changes to memory
1954 		 * (Documentation/RCU/rculist_nulls.rst for details)
1955 		 */
1956 		smp_wmb();
1957 		refcount_set(&newsk->sk_refcnt, 2);
1958 
1959 		/*
1960 		 * Increment the counter in the same struct proto as the master
1961 		 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
1962 		 * is the same as sk->sk_prot->socks, as this field was copied
1963 		 * with memcpy).
1964 		 *
1965 		 * This _changes_ the previous behaviour, where
1966 		 * tcp_create_openreq_child always was incrementing the
1967 		 * equivalent to tcp_prot->socks (inet_sock_nr), so this have
1968 		 * to be taken into account in all callers. -acme
1969 		 */
1970 		sk_refcnt_debug_inc(newsk);
1971 		sk_set_socket(newsk, NULL);
1972 		sk_tx_queue_clear(newsk);
1973 		RCU_INIT_POINTER(newsk->sk_wq, NULL);
1974 
1975 		if (newsk->sk_prot->sockets_allocated)
1976 			sk_sockets_allocated_inc(newsk);
1977 
1978 		if (sock_needs_netstamp(sk) &&
1979 		    newsk->sk_flags & SK_FLAGS_TIMESTAMP)
1980 			net_enable_timestamp();
1981 	}
1982 out:
1983 	return newsk;
1984 }
1985 EXPORT_SYMBOL_GPL(sk_clone_lock);
1986 
1987 void sk_free_unlock_clone(struct sock *sk)
1988 {
1989 	/* It is still raw copy of parent, so invalidate
1990 	 * destructor and make plain sk_free() */
1991 	sk->sk_destruct = NULL;
1992 	bh_unlock_sock(sk);
1993 	sk_free(sk);
1994 }
1995 EXPORT_SYMBOL_GPL(sk_free_unlock_clone);
1996 
1997 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1998 {
1999 	u32 max_segs = 1;
2000 
2001 	sk_dst_set(sk, dst);
2002 	sk->sk_route_caps = dst->dev->features | sk->sk_route_forced_caps;
2003 	if (sk->sk_route_caps & NETIF_F_GSO)
2004 		sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
2005 	sk->sk_route_caps &= ~sk->sk_route_nocaps;
2006 	if (sk_can_gso(sk)) {
2007 		if (dst->header_len && !xfrm_dst_offload_ok(dst)) {
2008 			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
2009 		} else {
2010 			sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
2011 			sk->sk_gso_max_size = dst->dev->gso_max_size;
2012 			max_segs = max_t(u32, dst->dev->gso_max_segs, 1);
2013 		}
2014 	}
2015 	sk->sk_gso_max_segs = max_segs;
2016 }
2017 EXPORT_SYMBOL_GPL(sk_setup_caps);
2018 
2019 /*
2020  *	Simple resource managers for sockets.
2021  */
2022 
2023 
2024 /*
2025  * Write buffer destructor automatically called from kfree_skb.
2026  */
2027 void sock_wfree(struct sk_buff *skb)
2028 {
2029 	struct sock *sk = skb->sk;
2030 	unsigned int len = skb->truesize;
2031 
2032 	if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
2033 		/*
2034 		 * Keep a reference on sk_wmem_alloc, this will be released
2035 		 * after sk_write_space() call
2036 		 */
2037 		WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
2038 		sk->sk_write_space(sk);
2039 		len = 1;
2040 	}
2041 	/*
2042 	 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
2043 	 * could not do because of in-flight packets
2044 	 */
2045 	if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
2046 		__sk_free(sk);
2047 }
2048 EXPORT_SYMBOL(sock_wfree);
2049 
2050 /* This variant of sock_wfree() is used by TCP,
2051  * since it sets SOCK_USE_WRITE_QUEUE.
2052  */
2053 void __sock_wfree(struct sk_buff *skb)
2054 {
2055 	struct sock *sk = skb->sk;
2056 
2057 	if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
2058 		__sk_free(sk);
2059 }
2060 
2061 void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
2062 {
2063 	skb_orphan(skb);
2064 	skb->sk = sk;
2065 #ifdef CONFIG_INET
2066 	if (unlikely(!sk_fullsock(sk))) {
2067 		skb->destructor = sock_edemux;
2068 		sock_hold(sk);
2069 		return;
2070 	}
2071 #endif
2072 	skb->destructor = sock_wfree;
2073 	skb_set_hash_from_sk(skb, sk);
2074 	/*
2075 	 * We used to take a refcount on sk, but following operation
2076 	 * is enough to guarantee sk_free() wont free this sock until
2077 	 * all in-flight packets are completed
2078 	 */
2079 	refcount_add(skb->truesize, &sk->sk_wmem_alloc);
2080 }
2081 EXPORT_SYMBOL(skb_set_owner_w);
2082 
2083 static bool can_skb_orphan_partial(const struct sk_buff *skb)
2084 {
2085 #ifdef CONFIG_TLS_DEVICE
2086 	/* Drivers depend on in-order delivery for crypto offload,
2087 	 * partial orphan breaks out-of-order-OK logic.
2088 	 */
2089 	if (skb->decrypted)
2090 		return false;
2091 #endif
2092 	return (skb->destructor == sock_wfree ||
2093 		(IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree));
2094 }
2095 
2096 /* This helper is used by netem, as it can hold packets in its
2097  * delay queue. We want to allow the owner socket to send more
2098  * packets, as if they were already TX completed by a typical driver.
2099  * But we also want to keep skb->sk set because some packet schedulers
2100  * rely on it (sch_fq for example).
2101  */
2102 void skb_orphan_partial(struct sk_buff *skb)
2103 {
2104 	if (skb_is_tcp_pure_ack(skb))
2105 		return;
2106 
2107 	if (can_skb_orphan_partial(skb)) {
2108 		struct sock *sk = skb->sk;
2109 
2110 		if (refcount_inc_not_zero(&sk->sk_refcnt)) {
2111 			WARN_ON(refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc));
2112 			skb->destructor = sock_efree;
2113 		}
2114 	} else {
2115 		skb_orphan(skb);
2116 	}
2117 }
2118 EXPORT_SYMBOL(skb_orphan_partial);
2119 
2120 /*
2121  * Read buffer destructor automatically called from kfree_skb.
2122  */
2123 void sock_rfree(struct sk_buff *skb)
2124 {
2125 	struct sock *sk = skb->sk;
2126 	unsigned int len = skb->truesize;
2127 
2128 	atomic_sub(len, &sk->sk_rmem_alloc);
2129 	sk_mem_uncharge(sk, len);
2130 }
2131 EXPORT_SYMBOL(sock_rfree);
2132 
2133 /*
2134  * Buffer destructor for skbs that are not used directly in read or write
2135  * path, e.g. for error handler skbs. Automatically called from kfree_skb.
2136  */
2137 void sock_efree(struct sk_buff *skb)
2138 {
2139 	sock_put(skb->sk);
2140 }
2141 EXPORT_SYMBOL(sock_efree);
2142 
2143 /* Buffer destructor for prefetch/receive path where reference count may
2144  * not be held, e.g. for listen sockets.
2145  */
2146 #ifdef CONFIG_INET
2147 void sock_pfree(struct sk_buff *skb)
2148 {
2149 	if (sk_is_refcounted(skb->sk))
2150 		sock_gen_put(skb->sk);
2151 }
2152 EXPORT_SYMBOL(sock_pfree);
2153 #endif /* CONFIG_INET */
2154 
2155 kuid_t sock_i_uid(struct sock *sk)
2156 {
2157 	kuid_t uid;
2158 
2159 	read_lock_bh(&sk->sk_callback_lock);
2160 	uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
2161 	read_unlock_bh(&sk->sk_callback_lock);
2162 	return uid;
2163 }
2164 EXPORT_SYMBOL(sock_i_uid);
2165 
2166 unsigned long sock_i_ino(struct sock *sk)
2167 {
2168 	unsigned long ino;
2169 
2170 	read_lock_bh(&sk->sk_callback_lock);
2171 	ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
2172 	read_unlock_bh(&sk->sk_callback_lock);
2173 	return ino;
2174 }
2175 EXPORT_SYMBOL(sock_i_ino);
2176 
2177 /*
2178  * Allocate a skb from the socket's send buffer.
2179  */
2180 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
2181 			     gfp_t priority)
2182 {
2183 	if (force ||
2184 	    refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) {
2185 		struct sk_buff *skb = alloc_skb(size, priority);
2186 
2187 		if (skb) {
2188 			skb_set_owner_w(skb, sk);
2189 			return skb;
2190 		}
2191 	}
2192 	return NULL;
2193 }
2194 EXPORT_SYMBOL(sock_wmalloc);
2195 
2196 static void sock_ofree(struct sk_buff *skb)
2197 {
2198 	struct sock *sk = skb->sk;
2199 
2200 	atomic_sub(skb->truesize, &sk->sk_omem_alloc);
2201 }
2202 
2203 struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
2204 			     gfp_t priority)
2205 {
2206 	struct sk_buff *skb;
2207 
2208 	/* small safe race: SKB_TRUESIZE may differ from final skb->truesize */
2209 	if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) >
2210 	    sysctl_optmem_max)
2211 		return NULL;
2212 
2213 	skb = alloc_skb(size, priority);
2214 	if (!skb)
2215 		return NULL;
2216 
2217 	atomic_add(skb->truesize, &sk->sk_omem_alloc);
2218 	skb->sk = sk;
2219 	skb->destructor = sock_ofree;
2220 	return skb;
2221 }
2222 
2223 /*
2224  * Allocate a memory block from the socket's option memory buffer.
2225  */
2226 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
2227 {
2228 	if ((unsigned int)size <= sysctl_optmem_max &&
2229 	    atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
2230 		void *mem;
2231 		/* First do the add, to avoid the race if kmalloc
2232 		 * might sleep.
2233 		 */
2234 		atomic_add(size, &sk->sk_omem_alloc);
2235 		mem = kmalloc(size, priority);
2236 		if (mem)
2237 			return mem;
2238 		atomic_sub(size, &sk->sk_omem_alloc);
2239 	}
2240 	return NULL;
2241 }
2242 EXPORT_SYMBOL(sock_kmalloc);
2243 
2244 /* Free an option memory block. Note, we actually want the inline
2245  * here as this allows gcc to detect the nullify and fold away the
2246  * condition entirely.
2247  */
2248 static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
2249 				  const bool nullify)
2250 {
2251 	if (WARN_ON_ONCE(!mem))
2252 		return;
2253 	if (nullify)
2254 		kfree_sensitive(mem);
2255 	else
2256 		kfree(mem);
2257 	atomic_sub(size, &sk->sk_omem_alloc);
2258 }
2259 
2260 void sock_kfree_s(struct sock *sk, void *mem, int size)
2261 {
2262 	__sock_kfree_s(sk, mem, size, false);
2263 }
2264 EXPORT_SYMBOL(sock_kfree_s);
2265 
2266 void sock_kzfree_s(struct sock *sk, void *mem, int size)
2267 {
2268 	__sock_kfree_s(sk, mem, size, true);
2269 }
2270 EXPORT_SYMBOL(sock_kzfree_s);
2271 
2272 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
2273    I think, these locks should be removed for datagram sockets.
2274  */
2275 static long sock_wait_for_wmem(struct sock *sk, long timeo)
2276 {
2277 	DEFINE_WAIT(wait);
2278 
2279 	sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2280 	for (;;) {
2281 		if (!timeo)
2282 			break;
2283 		if (signal_pending(current))
2284 			break;
2285 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2286 		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
2287 		if (refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf))
2288 			break;
2289 		if (sk->sk_shutdown & SEND_SHUTDOWN)
2290 			break;
2291 		if (sk->sk_err)
2292 			break;
2293 		timeo = schedule_timeout(timeo);
2294 	}
2295 	finish_wait(sk_sleep(sk), &wait);
2296 	return timeo;
2297 }
2298 
2299 
2300 /*
2301  *	Generic send/receive buffer handlers
2302  */
2303 
2304 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
2305 				     unsigned long data_len, int noblock,
2306 				     int *errcode, int max_page_order)
2307 {
2308 	struct sk_buff *skb;
2309 	long timeo;
2310 	int err;
2311 
2312 	timeo = sock_sndtimeo(sk, noblock);
2313 	for (;;) {
2314 		err = sock_error(sk);
2315 		if (err != 0)
2316 			goto failure;
2317 
2318 		err = -EPIPE;
2319 		if (sk->sk_shutdown & SEND_SHUTDOWN)
2320 			goto failure;
2321 
2322 		if (sk_wmem_alloc_get(sk) < READ_ONCE(sk->sk_sndbuf))
2323 			break;
2324 
2325 		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2326 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2327 		err = -EAGAIN;
2328 		if (!timeo)
2329 			goto failure;
2330 		if (signal_pending(current))
2331 			goto interrupted;
2332 		timeo = sock_wait_for_wmem(sk, timeo);
2333 	}
2334 	skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
2335 				   errcode, sk->sk_allocation);
2336 	if (skb)
2337 		skb_set_owner_w(skb, sk);
2338 	return skb;
2339 
2340 interrupted:
2341 	err = sock_intr_errno(timeo);
2342 failure:
2343 	*errcode = err;
2344 	return NULL;
2345 }
2346 EXPORT_SYMBOL(sock_alloc_send_pskb);
2347 
2348 struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
2349 				    int noblock, int *errcode)
2350 {
2351 	return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0);
2352 }
2353 EXPORT_SYMBOL(sock_alloc_send_skb);
2354 
2355 int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg,
2356 		     struct sockcm_cookie *sockc)
2357 {
2358 	u32 tsflags;
2359 
2360 	switch (cmsg->cmsg_type) {
2361 	case SO_MARK:
2362 		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2363 			return -EPERM;
2364 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2365 			return -EINVAL;
2366 		sockc->mark = *(u32 *)CMSG_DATA(cmsg);
2367 		break;
2368 	case SO_TIMESTAMPING_OLD:
2369 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2370 			return -EINVAL;
2371 
2372 		tsflags = *(u32 *)CMSG_DATA(cmsg);
2373 		if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
2374 			return -EINVAL;
2375 
2376 		sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
2377 		sockc->tsflags |= tsflags;
2378 		break;
2379 	case SCM_TXTIME:
2380 		if (!sock_flag(sk, SOCK_TXTIME))
2381 			return -EINVAL;
2382 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64)))
2383 			return -EINVAL;
2384 		sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg));
2385 		break;
2386 	/* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
2387 	case SCM_RIGHTS:
2388 	case SCM_CREDENTIALS:
2389 		break;
2390 	default:
2391 		return -EINVAL;
2392 	}
2393 	return 0;
2394 }
2395 EXPORT_SYMBOL(__sock_cmsg_send);
2396 
2397 int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
2398 		   struct sockcm_cookie *sockc)
2399 {
2400 	struct cmsghdr *cmsg;
2401 	int ret;
2402 
2403 	for_each_cmsghdr(cmsg, msg) {
2404 		if (!CMSG_OK(msg, cmsg))
2405 			return -EINVAL;
2406 		if (cmsg->cmsg_level != SOL_SOCKET)
2407 			continue;
2408 		ret = __sock_cmsg_send(sk, msg, cmsg, sockc);
2409 		if (ret)
2410 			return ret;
2411 	}
2412 	return 0;
2413 }
2414 EXPORT_SYMBOL(sock_cmsg_send);
2415 
2416 static void sk_enter_memory_pressure(struct sock *sk)
2417 {
2418 	if (!sk->sk_prot->enter_memory_pressure)
2419 		return;
2420 
2421 	sk->sk_prot->enter_memory_pressure(sk);
2422 }
2423 
2424 static void sk_leave_memory_pressure(struct sock *sk)
2425 {
2426 	if (sk->sk_prot->leave_memory_pressure) {
2427 		sk->sk_prot->leave_memory_pressure(sk);
2428 	} else {
2429 		unsigned long *memory_pressure = sk->sk_prot->memory_pressure;
2430 
2431 		if (memory_pressure && READ_ONCE(*memory_pressure))
2432 			WRITE_ONCE(*memory_pressure, 0);
2433 	}
2434 }
2435 
2436 #define SKB_FRAG_PAGE_ORDER	get_order(32768)
2437 DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key);
2438 
2439 /**
2440  * skb_page_frag_refill - check that a page_frag contains enough room
2441  * @sz: minimum size of the fragment we want to get
2442  * @pfrag: pointer to page_frag
2443  * @gfp: priority for memory allocation
2444  *
2445  * Note: While this allocator tries to use high order pages, there is
2446  * no guarantee that allocations succeed. Therefore, @sz MUST be
2447  * less or equal than PAGE_SIZE.
2448  */
2449 bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
2450 {
2451 	if (pfrag->page) {
2452 		if (page_ref_count(pfrag->page) == 1) {
2453 			pfrag->offset = 0;
2454 			return true;
2455 		}
2456 		if (pfrag->offset + sz <= pfrag->size)
2457 			return true;
2458 		put_page(pfrag->page);
2459 	}
2460 
2461 	pfrag->offset = 0;
2462 	if (SKB_FRAG_PAGE_ORDER &&
2463 	    !static_branch_unlikely(&net_high_order_alloc_disable_key)) {
2464 		/* Avoid direct reclaim but allow kswapd to wake */
2465 		pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
2466 					  __GFP_COMP | __GFP_NOWARN |
2467 					  __GFP_NORETRY,
2468 					  SKB_FRAG_PAGE_ORDER);
2469 		if (likely(pfrag->page)) {
2470 			pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
2471 			return true;
2472 		}
2473 	}
2474 	pfrag->page = alloc_page(gfp);
2475 	if (likely(pfrag->page)) {
2476 		pfrag->size = PAGE_SIZE;
2477 		return true;
2478 	}
2479 	return false;
2480 }
2481 EXPORT_SYMBOL(skb_page_frag_refill);
2482 
2483 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
2484 {
2485 	if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
2486 		return true;
2487 
2488 	sk_enter_memory_pressure(sk);
2489 	sk_stream_moderate_sndbuf(sk);
2490 	return false;
2491 }
2492 EXPORT_SYMBOL(sk_page_frag_refill);
2493 
2494 static void __lock_sock(struct sock *sk)
2495 	__releases(&sk->sk_lock.slock)
2496 	__acquires(&sk->sk_lock.slock)
2497 {
2498 	DEFINE_WAIT(wait);
2499 
2500 	for (;;) {
2501 		prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
2502 					TASK_UNINTERRUPTIBLE);
2503 		spin_unlock_bh(&sk->sk_lock.slock);
2504 		schedule();
2505 		spin_lock_bh(&sk->sk_lock.slock);
2506 		if (!sock_owned_by_user(sk))
2507 			break;
2508 	}
2509 	finish_wait(&sk->sk_lock.wq, &wait);
2510 }
2511 
2512 void __release_sock(struct sock *sk)
2513 	__releases(&sk->sk_lock.slock)
2514 	__acquires(&sk->sk_lock.slock)
2515 {
2516 	struct sk_buff *skb, *next;
2517 
2518 	while ((skb = sk->sk_backlog.head) != NULL) {
2519 		sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
2520 
2521 		spin_unlock_bh(&sk->sk_lock.slock);
2522 
2523 		do {
2524 			next = skb->next;
2525 			prefetch(next);
2526 			WARN_ON_ONCE(skb_dst_is_noref(skb));
2527 			skb_mark_not_on_list(skb);
2528 			sk_backlog_rcv(sk, skb);
2529 
2530 			cond_resched();
2531 
2532 			skb = next;
2533 		} while (skb != NULL);
2534 
2535 		spin_lock_bh(&sk->sk_lock.slock);
2536 	}
2537 
2538 	/*
2539 	 * Doing the zeroing here guarantee we can not loop forever
2540 	 * while a wild producer attempts to flood us.
2541 	 */
2542 	sk->sk_backlog.len = 0;
2543 }
2544 
2545 void __sk_flush_backlog(struct sock *sk)
2546 {
2547 	spin_lock_bh(&sk->sk_lock.slock);
2548 	__release_sock(sk);
2549 	spin_unlock_bh(&sk->sk_lock.slock);
2550 }
2551 
2552 /**
2553  * sk_wait_data - wait for data to arrive at sk_receive_queue
2554  * @sk:    sock to wait on
2555  * @timeo: for how long
2556  * @skb:   last skb seen on sk_receive_queue
2557  *
2558  * Now socket state including sk->sk_err is changed only under lock,
2559  * hence we may omit checks after joining wait queue.
2560  * We check receive queue before schedule() only as optimization;
2561  * it is very likely that release_sock() added new data.
2562  */
2563 int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
2564 {
2565 	DEFINE_WAIT_FUNC(wait, woken_wake_function);
2566 	int rc;
2567 
2568 	add_wait_queue(sk_sleep(sk), &wait);
2569 	sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2570 	rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
2571 	sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2572 	remove_wait_queue(sk_sleep(sk), &wait);
2573 	return rc;
2574 }
2575 EXPORT_SYMBOL(sk_wait_data);
2576 
2577 /**
2578  *	__sk_mem_raise_allocated - increase memory_allocated
2579  *	@sk: socket
2580  *	@size: memory size to allocate
2581  *	@amt: pages to allocate
2582  *	@kind: allocation type
2583  *
2584  *	Similar to __sk_mem_schedule(), but does not update sk_forward_alloc
2585  */
2586 int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
2587 {
2588 	struct proto *prot = sk->sk_prot;
2589 	long allocated = sk_memory_allocated_add(sk, amt);
2590 	bool charged = true;
2591 
2592 	if (mem_cgroup_sockets_enabled && sk->sk_memcg &&
2593 	    !(charged = mem_cgroup_charge_skmem(sk->sk_memcg, amt)))
2594 		goto suppress_allocation;
2595 
2596 	/* Under limit. */
2597 	if (allocated <= sk_prot_mem_limits(sk, 0)) {
2598 		sk_leave_memory_pressure(sk);
2599 		return 1;
2600 	}
2601 
2602 	/* Under pressure. */
2603 	if (allocated > sk_prot_mem_limits(sk, 1))
2604 		sk_enter_memory_pressure(sk);
2605 
2606 	/* Over hard limit. */
2607 	if (allocated > sk_prot_mem_limits(sk, 2))
2608 		goto suppress_allocation;
2609 
2610 	/* guarantee minimum buffer size under pressure */
2611 	if (kind == SK_MEM_RECV) {
2612 		if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot))
2613 			return 1;
2614 
2615 	} else { /* SK_MEM_SEND */
2616 		int wmem0 = sk_get_wmem0(sk, prot);
2617 
2618 		if (sk->sk_type == SOCK_STREAM) {
2619 			if (sk->sk_wmem_queued < wmem0)
2620 				return 1;
2621 		} else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) {
2622 				return 1;
2623 		}
2624 	}
2625 
2626 	if (sk_has_memory_pressure(sk)) {
2627 		u64 alloc;
2628 
2629 		if (!sk_under_memory_pressure(sk))
2630 			return 1;
2631 		alloc = sk_sockets_allocated_read_positive(sk);
2632 		if (sk_prot_mem_limits(sk, 2) > alloc *
2633 		    sk_mem_pages(sk->sk_wmem_queued +
2634 				 atomic_read(&sk->sk_rmem_alloc) +
2635 				 sk->sk_forward_alloc))
2636 			return 1;
2637 	}
2638 
2639 suppress_allocation:
2640 
2641 	if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
2642 		sk_stream_moderate_sndbuf(sk);
2643 
2644 		/* Fail only if socket is _under_ its sndbuf.
2645 		 * In this case we cannot block, so that we have to fail.
2646 		 */
2647 		if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
2648 			return 1;
2649 	}
2650 
2651 	if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged))
2652 		trace_sock_exceed_buf_limit(sk, prot, allocated, kind);
2653 
2654 	sk_memory_allocated_sub(sk, amt);
2655 
2656 	if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2657 		mem_cgroup_uncharge_skmem(sk->sk_memcg, amt);
2658 
2659 	return 0;
2660 }
2661 EXPORT_SYMBOL(__sk_mem_raise_allocated);
2662 
2663 /**
2664  *	__sk_mem_schedule - increase sk_forward_alloc and memory_allocated
2665  *	@sk: socket
2666  *	@size: memory size to allocate
2667  *	@kind: allocation type
2668  *
2669  *	If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
2670  *	rmem allocation. This function assumes that protocols which have
2671  *	memory_pressure use sk_wmem_queued as write buffer accounting.
2672  */
2673 int __sk_mem_schedule(struct sock *sk, int size, int kind)
2674 {
2675 	int ret, amt = sk_mem_pages(size);
2676 
2677 	sk->sk_forward_alloc += amt << SK_MEM_QUANTUM_SHIFT;
2678 	ret = __sk_mem_raise_allocated(sk, size, amt, kind);
2679 	if (!ret)
2680 		sk->sk_forward_alloc -= amt << SK_MEM_QUANTUM_SHIFT;
2681 	return ret;
2682 }
2683 EXPORT_SYMBOL(__sk_mem_schedule);
2684 
2685 /**
2686  *	__sk_mem_reduce_allocated - reclaim memory_allocated
2687  *	@sk: socket
2688  *	@amount: number of quanta
2689  *
2690  *	Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
2691  */
2692 void __sk_mem_reduce_allocated(struct sock *sk, int amount)
2693 {
2694 	sk_memory_allocated_sub(sk, amount);
2695 
2696 	if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2697 		mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
2698 
2699 	if (sk_under_memory_pressure(sk) &&
2700 	    (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
2701 		sk_leave_memory_pressure(sk);
2702 }
2703 EXPORT_SYMBOL(__sk_mem_reduce_allocated);
2704 
2705 /**
2706  *	__sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
2707  *	@sk: socket
2708  *	@amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple)
2709  */
2710 void __sk_mem_reclaim(struct sock *sk, int amount)
2711 {
2712 	amount >>= SK_MEM_QUANTUM_SHIFT;
2713 	sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
2714 	__sk_mem_reduce_allocated(sk, amount);
2715 }
2716 EXPORT_SYMBOL(__sk_mem_reclaim);
2717 
2718 int sk_set_peek_off(struct sock *sk, int val)
2719 {
2720 	sk->sk_peek_off = val;
2721 	return 0;
2722 }
2723 EXPORT_SYMBOL_GPL(sk_set_peek_off);
2724 
2725 /*
2726  * Set of default routines for initialising struct proto_ops when
2727  * the protocol does not support a particular function. In certain
2728  * cases where it makes no sense for a protocol to have a "do nothing"
2729  * function, some default processing is provided.
2730  */
2731 
2732 int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
2733 {
2734 	return -EOPNOTSUPP;
2735 }
2736 EXPORT_SYMBOL(sock_no_bind);
2737 
2738 int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
2739 		    int len, int flags)
2740 {
2741 	return -EOPNOTSUPP;
2742 }
2743 EXPORT_SYMBOL(sock_no_connect);
2744 
2745 int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
2746 {
2747 	return -EOPNOTSUPP;
2748 }
2749 EXPORT_SYMBOL(sock_no_socketpair);
2750 
2751 int sock_no_accept(struct socket *sock, struct socket *newsock, int flags,
2752 		   bool kern)
2753 {
2754 	return -EOPNOTSUPP;
2755 }
2756 EXPORT_SYMBOL(sock_no_accept);
2757 
2758 int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
2759 		    int peer)
2760 {
2761 	return -EOPNOTSUPP;
2762 }
2763 EXPORT_SYMBOL(sock_no_getname);
2764 
2765 int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2766 {
2767 	return -EOPNOTSUPP;
2768 }
2769 EXPORT_SYMBOL(sock_no_ioctl);
2770 
2771 int sock_no_listen(struct socket *sock, int backlog)
2772 {
2773 	return -EOPNOTSUPP;
2774 }
2775 EXPORT_SYMBOL(sock_no_listen);
2776 
2777 int sock_no_shutdown(struct socket *sock, int how)
2778 {
2779 	return -EOPNOTSUPP;
2780 }
2781 EXPORT_SYMBOL(sock_no_shutdown);
2782 
2783 int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
2784 {
2785 	return -EOPNOTSUPP;
2786 }
2787 EXPORT_SYMBOL(sock_no_sendmsg);
2788 
2789 int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len)
2790 {
2791 	return -EOPNOTSUPP;
2792 }
2793 EXPORT_SYMBOL(sock_no_sendmsg_locked);
2794 
2795 int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
2796 		    int flags)
2797 {
2798 	return -EOPNOTSUPP;
2799 }
2800 EXPORT_SYMBOL(sock_no_recvmsg);
2801 
2802 int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
2803 {
2804 	/* Mirror missing mmap method error code */
2805 	return -ENODEV;
2806 }
2807 EXPORT_SYMBOL(sock_no_mmap);
2808 
2809 /*
2810  * When a file is received (via SCM_RIGHTS, etc), we must bump the
2811  * various sock-based usage counts.
2812  */
2813 void __receive_sock(struct file *file)
2814 {
2815 	struct socket *sock;
2816 	int error;
2817 
2818 	/*
2819 	 * The resulting value of "error" is ignored here since we only
2820 	 * need to take action when the file is a socket and testing
2821 	 * "sock" for NULL is sufficient.
2822 	 */
2823 	sock = sock_from_file(file, &error);
2824 	if (sock) {
2825 		sock_update_netprioidx(&sock->sk->sk_cgrp_data);
2826 		sock_update_classid(&sock->sk->sk_cgrp_data);
2827 	}
2828 }
2829 
2830 ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
2831 {
2832 	ssize_t res;
2833 	struct msghdr msg = {.msg_flags = flags};
2834 	struct kvec iov;
2835 	char *kaddr = kmap(page);
2836 	iov.iov_base = kaddr + offset;
2837 	iov.iov_len = size;
2838 	res = kernel_sendmsg(sock, &msg, &iov, 1, size);
2839 	kunmap(page);
2840 	return res;
2841 }
2842 EXPORT_SYMBOL(sock_no_sendpage);
2843 
2844 ssize_t sock_no_sendpage_locked(struct sock *sk, struct page *page,
2845 				int offset, size_t size, int flags)
2846 {
2847 	ssize_t res;
2848 	struct msghdr msg = {.msg_flags = flags};
2849 	struct kvec iov;
2850 	char *kaddr = kmap(page);
2851 
2852 	iov.iov_base = kaddr + offset;
2853 	iov.iov_len = size;
2854 	res = kernel_sendmsg_locked(sk, &msg, &iov, 1, size);
2855 	kunmap(page);
2856 	return res;
2857 }
2858 EXPORT_SYMBOL(sock_no_sendpage_locked);
2859 
2860 /*
2861  *	Default Socket Callbacks
2862  */
2863 
2864 static void sock_def_wakeup(struct sock *sk)
2865 {
2866 	struct socket_wq *wq;
2867 
2868 	rcu_read_lock();
2869 	wq = rcu_dereference(sk->sk_wq);
2870 	if (skwq_has_sleeper(wq))
2871 		wake_up_interruptible_all(&wq->wait);
2872 	rcu_read_unlock();
2873 }
2874 
2875 static void sock_def_error_report(struct sock *sk)
2876 {
2877 	struct socket_wq *wq;
2878 
2879 	rcu_read_lock();
2880 	wq = rcu_dereference(sk->sk_wq);
2881 	if (skwq_has_sleeper(wq))
2882 		wake_up_interruptible_poll(&wq->wait, EPOLLERR);
2883 	sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
2884 	rcu_read_unlock();
2885 }
2886 
2887 void sock_def_readable(struct sock *sk)
2888 {
2889 	struct socket_wq *wq;
2890 
2891 	rcu_read_lock();
2892 	wq = rcu_dereference(sk->sk_wq);
2893 	if (skwq_has_sleeper(wq))
2894 		wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
2895 						EPOLLRDNORM | EPOLLRDBAND);
2896 	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
2897 	rcu_read_unlock();
2898 }
2899 
2900 static void sock_def_write_space(struct sock *sk)
2901 {
2902 	struct socket_wq *wq;
2903 
2904 	rcu_read_lock();
2905 
2906 	/* Do not wake up a writer until he can make "significant"
2907 	 * progress.  --DaveM
2908 	 */
2909 	if ((refcount_read(&sk->sk_wmem_alloc) << 1) <= READ_ONCE(sk->sk_sndbuf)) {
2910 		wq = rcu_dereference(sk->sk_wq);
2911 		if (skwq_has_sleeper(wq))
2912 			wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
2913 						EPOLLWRNORM | EPOLLWRBAND);
2914 
2915 		/* Should agree with poll, otherwise some programs break */
2916 		if (sock_writeable(sk))
2917 			sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
2918 	}
2919 
2920 	rcu_read_unlock();
2921 }
2922 
2923 static void sock_def_destruct(struct sock *sk)
2924 {
2925 }
2926 
2927 void sk_send_sigurg(struct sock *sk)
2928 {
2929 	if (sk->sk_socket && sk->sk_socket->file)
2930 		if (send_sigurg(&sk->sk_socket->file->f_owner))
2931 			sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
2932 }
2933 EXPORT_SYMBOL(sk_send_sigurg);
2934 
2935 void sk_reset_timer(struct sock *sk, struct timer_list* timer,
2936 		    unsigned long expires)
2937 {
2938 	if (!mod_timer(timer, expires))
2939 		sock_hold(sk);
2940 }
2941 EXPORT_SYMBOL(sk_reset_timer);
2942 
2943 void sk_stop_timer(struct sock *sk, struct timer_list* timer)
2944 {
2945 	if (del_timer(timer))
2946 		__sock_put(sk);
2947 }
2948 EXPORT_SYMBOL(sk_stop_timer);
2949 
2950 void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer)
2951 {
2952 	if (del_timer_sync(timer))
2953 		__sock_put(sk);
2954 }
2955 EXPORT_SYMBOL(sk_stop_timer_sync);
2956 
2957 void sock_init_data(struct socket *sock, struct sock *sk)
2958 {
2959 	sk_init_common(sk);
2960 	sk->sk_send_head	=	NULL;
2961 
2962 	timer_setup(&sk->sk_timer, NULL, 0);
2963 
2964 	sk->sk_allocation	=	GFP_KERNEL;
2965 	sk->sk_rcvbuf		=	sysctl_rmem_default;
2966 	sk->sk_sndbuf		=	sysctl_wmem_default;
2967 	sk->sk_state		=	TCP_CLOSE;
2968 	sk_set_socket(sk, sock);
2969 
2970 	sock_set_flag(sk, SOCK_ZAPPED);
2971 
2972 	if (sock) {
2973 		sk->sk_type	=	sock->type;
2974 		RCU_INIT_POINTER(sk->sk_wq, &sock->wq);
2975 		sock->sk	=	sk;
2976 		sk->sk_uid	=	SOCK_INODE(sock)->i_uid;
2977 	} else {
2978 		RCU_INIT_POINTER(sk->sk_wq, NULL);
2979 		sk->sk_uid	=	make_kuid(sock_net(sk)->user_ns, 0);
2980 	}
2981 
2982 	rwlock_init(&sk->sk_callback_lock);
2983 	if (sk->sk_kern_sock)
2984 		lockdep_set_class_and_name(
2985 			&sk->sk_callback_lock,
2986 			af_kern_callback_keys + sk->sk_family,
2987 			af_family_kern_clock_key_strings[sk->sk_family]);
2988 	else
2989 		lockdep_set_class_and_name(
2990 			&sk->sk_callback_lock,
2991 			af_callback_keys + sk->sk_family,
2992 			af_family_clock_key_strings[sk->sk_family]);
2993 
2994 	sk->sk_state_change	=	sock_def_wakeup;
2995 	sk->sk_data_ready	=	sock_def_readable;
2996 	sk->sk_write_space	=	sock_def_write_space;
2997 	sk->sk_error_report	=	sock_def_error_report;
2998 	sk->sk_destruct		=	sock_def_destruct;
2999 
3000 	sk->sk_frag.page	=	NULL;
3001 	sk->sk_frag.offset	=	0;
3002 	sk->sk_peek_off		=	-1;
3003 
3004 	sk->sk_peer_pid 	=	NULL;
3005 	sk->sk_peer_cred	=	NULL;
3006 	sk->sk_write_pending	=	0;
3007 	sk->sk_rcvlowat		=	1;
3008 	sk->sk_rcvtimeo		=	MAX_SCHEDULE_TIMEOUT;
3009 	sk->sk_sndtimeo		=	MAX_SCHEDULE_TIMEOUT;
3010 
3011 	sk->sk_stamp = SK_DEFAULT_STAMP;
3012 #if BITS_PER_LONG==32
3013 	seqlock_init(&sk->sk_stamp_seq);
3014 #endif
3015 	atomic_set(&sk->sk_zckey, 0);
3016 
3017 #ifdef CONFIG_NET_RX_BUSY_POLL
3018 	sk->sk_napi_id		=	0;
3019 	sk->sk_ll_usec		=	sysctl_net_busy_read;
3020 #endif
3021 
3022 	sk->sk_max_pacing_rate = ~0UL;
3023 	sk->sk_pacing_rate = ~0UL;
3024 	WRITE_ONCE(sk->sk_pacing_shift, 10);
3025 	sk->sk_incoming_cpu = -1;
3026 
3027 	sk_rx_queue_clear(sk);
3028 	/*
3029 	 * Before updating sk_refcnt, we must commit prior changes to memory
3030 	 * (Documentation/RCU/rculist_nulls.rst for details)
3031 	 */
3032 	smp_wmb();
3033 	refcount_set(&sk->sk_refcnt, 1);
3034 	atomic_set(&sk->sk_drops, 0);
3035 }
3036 EXPORT_SYMBOL(sock_init_data);
3037 
3038 void lock_sock_nested(struct sock *sk, int subclass)
3039 {
3040 	might_sleep();
3041 	spin_lock_bh(&sk->sk_lock.slock);
3042 	if (sk->sk_lock.owned)
3043 		__lock_sock(sk);
3044 	sk->sk_lock.owned = 1;
3045 	spin_unlock(&sk->sk_lock.slock);
3046 	/*
3047 	 * The sk_lock has mutex_lock() semantics here:
3048 	 */
3049 	mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
3050 	local_bh_enable();
3051 }
3052 EXPORT_SYMBOL(lock_sock_nested);
3053 
3054 void release_sock(struct sock *sk)
3055 {
3056 	spin_lock_bh(&sk->sk_lock.slock);
3057 	if (sk->sk_backlog.tail)
3058 		__release_sock(sk);
3059 
3060 	/* Warning : release_cb() might need to release sk ownership,
3061 	 * ie call sock_release_ownership(sk) before us.
3062 	 */
3063 	if (sk->sk_prot->release_cb)
3064 		sk->sk_prot->release_cb(sk);
3065 
3066 	sock_release_ownership(sk);
3067 	if (waitqueue_active(&sk->sk_lock.wq))
3068 		wake_up(&sk->sk_lock.wq);
3069 	spin_unlock_bh(&sk->sk_lock.slock);
3070 }
3071 EXPORT_SYMBOL(release_sock);
3072 
3073 /**
3074  * lock_sock_fast - fast version of lock_sock
3075  * @sk: socket
3076  *
3077  * This version should be used for very small section, where process wont block
3078  * return false if fast path is taken:
3079  *
3080  *   sk_lock.slock locked, owned = 0, BH disabled
3081  *
3082  * return true if slow path is taken:
3083  *
3084  *   sk_lock.slock unlocked, owned = 1, BH enabled
3085  */
3086 bool lock_sock_fast(struct sock *sk)
3087 {
3088 	might_sleep();
3089 	spin_lock_bh(&sk->sk_lock.slock);
3090 
3091 	if (!sk->sk_lock.owned)
3092 		/*
3093 		 * Note : We must disable BH
3094 		 */
3095 		return false;
3096 
3097 	__lock_sock(sk);
3098 	sk->sk_lock.owned = 1;
3099 	spin_unlock(&sk->sk_lock.slock);
3100 	/*
3101 	 * The sk_lock has mutex_lock() semantics here:
3102 	 */
3103 	mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
3104 	local_bh_enable();
3105 	return true;
3106 }
3107 EXPORT_SYMBOL(lock_sock_fast);
3108 
3109 int sock_gettstamp(struct socket *sock, void __user *userstamp,
3110 		   bool timeval, bool time32)
3111 {
3112 	struct sock *sk = sock->sk;
3113 	struct timespec64 ts;
3114 
3115 	sock_enable_timestamp(sk, SOCK_TIMESTAMP);
3116 	ts = ktime_to_timespec64(sock_read_timestamp(sk));
3117 	if (ts.tv_sec == -1)
3118 		return -ENOENT;
3119 	if (ts.tv_sec == 0) {
3120 		ktime_t kt = ktime_get_real();
3121 		sock_write_timestamp(sk, kt);
3122 		ts = ktime_to_timespec64(kt);
3123 	}
3124 
3125 	if (timeval)
3126 		ts.tv_nsec /= 1000;
3127 
3128 #ifdef CONFIG_COMPAT_32BIT_TIME
3129 	if (time32)
3130 		return put_old_timespec32(&ts, userstamp);
3131 #endif
3132 #ifdef CONFIG_SPARC64
3133 	/* beware of padding in sparc64 timeval */
3134 	if (timeval && !in_compat_syscall()) {
3135 		struct __kernel_old_timeval __user tv = {
3136 			.tv_sec = ts.tv_sec,
3137 			.tv_usec = ts.tv_nsec,
3138 		};
3139 		if (copy_to_user(userstamp, &tv, sizeof(tv)))
3140 			return -EFAULT;
3141 		return 0;
3142 	}
3143 #endif
3144 	return put_timespec64(&ts, userstamp);
3145 }
3146 EXPORT_SYMBOL(sock_gettstamp);
3147 
3148 void sock_enable_timestamp(struct sock *sk, enum sock_flags flag)
3149 {
3150 	if (!sock_flag(sk, flag)) {
3151 		unsigned long previous_flags = sk->sk_flags;
3152 
3153 		sock_set_flag(sk, flag);
3154 		/*
3155 		 * we just set one of the two flags which require net
3156 		 * time stamping, but time stamping might have been on
3157 		 * already because of the other one
3158 		 */
3159 		if (sock_needs_netstamp(sk) &&
3160 		    !(previous_flags & SK_FLAGS_TIMESTAMP))
3161 			net_enable_timestamp();
3162 	}
3163 }
3164 
3165 int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
3166 		       int level, int type)
3167 {
3168 	struct sock_exterr_skb *serr;
3169 	struct sk_buff *skb;
3170 	int copied, err;
3171 
3172 	err = -EAGAIN;
3173 	skb = sock_dequeue_err_skb(sk);
3174 	if (skb == NULL)
3175 		goto out;
3176 
3177 	copied = skb->len;
3178 	if (copied > len) {
3179 		msg->msg_flags |= MSG_TRUNC;
3180 		copied = len;
3181 	}
3182 	err = skb_copy_datagram_msg(skb, 0, msg, copied);
3183 	if (err)
3184 		goto out_free_skb;
3185 
3186 	sock_recv_timestamp(msg, sk, skb);
3187 
3188 	serr = SKB_EXT_ERR(skb);
3189 	put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
3190 
3191 	msg->msg_flags |= MSG_ERRQUEUE;
3192 	err = copied;
3193 
3194 out_free_skb:
3195 	kfree_skb(skb);
3196 out:
3197 	return err;
3198 }
3199 EXPORT_SYMBOL(sock_recv_errqueue);
3200 
3201 /*
3202  *	Get a socket option on an socket.
3203  *
3204  *	FIX: POSIX 1003.1g is very ambiguous here. It states that
3205  *	asynchronous errors should be reported by getsockopt. We assume
3206  *	this means if you specify SO_ERROR (otherwise whats the point of it).
3207  */
3208 int sock_common_getsockopt(struct socket *sock, int level, int optname,
3209 			   char __user *optval, int __user *optlen)
3210 {
3211 	struct sock *sk = sock->sk;
3212 
3213 	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
3214 }
3215 EXPORT_SYMBOL(sock_common_getsockopt);
3216 
3217 int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
3218 			int flags)
3219 {
3220 	struct sock *sk = sock->sk;
3221 	int addr_len = 0;
3222 	int err;
3223 
3224 	err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
3225 				   flags & ~MSG_DONTWAIT, &addr_len);
3226 	if (err >= 0)
3227 		msg->msg_namelen = addr_len;
3228 	return err;
3229 }
3230 EXPORT_SYMBOL(sock_common_recvmsg);
3231 
3232 /*
3233  *	Set socket options on an inet socket.
3234  */
3235 int sock_common_setsockopt(struct socket *sock, int level, int optname,
3236 			   sockptr_t optval, unsigned int optlen)
3237 {
3238 	struct sock *sk = sock->sk;
3239 
3240 	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
3241 }
3242 EXPORT_SYMBOL(sock_common_setsockopt);
3243 
3244 void sk_common_release(struct sock *sk)
3245 {
3246 	if (sk->sk_prot->destroy)
3247 		sk->sk_prot->destroy(sk);
3248 
3249 	/*
3250 	 * Observation: when sk_common_release is called, processes have
3251 	 * no access to socket. But net still has.
3252 	 * Step one, detach it from networking:
3253 	 *
3254 	 * A. Remove from hash tables.
3255 	 */
3256 
3257 	sk->sk_prot->unhash(sk);
3258 
3259 	/*
3260 	 * In this point socket cannot receive new packets, but it is possible
3261 	 * that some packets are in flight because some CPU runs receiver and
3262 	 * did hash table lookup before we unhashed socket. They will achieve
3263 	 * receive queue and will be purged by socket destructor.
3264 	 *
3265 	 * Also we still have packets pending on receive queue and probably,
3266 	 * our own packets waiting in device queues. sock_destroy will drain
3267 	 * receive queue, but transmitted packets will delay socket destruction
3268 	 * until the last reference will be released.
3269 	 */
3270 
3271 	sock_orphan(sk);
3272 
3273 	xfrm_sk_free_policy(sk);
3274 
3275 	sk_refcnt_debug_release(sk);
3276 
3277 	sock_put(sk);
3278 }
3279 EXPORT_SYMBOL(sk_common_release);
3280 
3281 void sk_get_meminfo(const struct sock *sk, u32 *mem)
3282 {
3283 	memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);
3284 
3285 	mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
3286 	mem[SK_MEMINFO_RCVBUF] = READ_ONCE(sk->sk_rcvbuf);
3287 	mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
3288 	mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf);
3289 	mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc;
3290 	mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued);
3291 	mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
3292 	mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len);
3293 	mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
3294 }
3295 
3296 #ifdef CONFIG_PROC_FS
3297 #define PROTO_INUSE_NR	64	/* should be enough for the first time */
3298 struct prot_inuse {
3299 	int val[PROTO_INUSE_NR];
3300 };
3301 
3302 static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
3303 
3304 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
3305 {
3306 	__this_cpu_add(net->core.prot_inuse->val[prot->inuse_idx], val);
3307 }
3308 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
3309 
3310 int sock_prot_inuse_get(struct net *net, struct proto *prot)
3311 {
3312 	int cpu, idx = prot->inuse_idx;
3313 	int res = 0;
3314 
3315 	for_each_possible_cpu(cpu)
3316 		res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx];
3317 
3318 	return res >= 0 ? res : 0;
3319 }
3320 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
3321 
3322 static void sock_inuse_add(struct net *net, int val)
3323 {
3324 	this_cpu_add(*net->core.sock_inuse, val);
3325 }
3326 
3327 int sock_inuse_get(struct net *net)
3328 {
3329 	int cpu, res = 0;
3330 
3331 	for_each_possible_cpu(cpu)
3332 		res += *per_cpu_ptr(net->core.sock_inuse, cpu);
3333 
3334 	return res;
3335 }
3336 
3337 EXPORT_SYMBOL_GPL(sock_inuse_get);
3338 
3339 static int __net_init sock_inuse_init_net(struct net *net)
3340 {
3341 	net->core.prot_inuse = alloc_percpu(struct prot_inuse);
3342 	if (net->core.prot_inuse == NULL)
3343 		return -ENOMEM;
3344 
3345 	net->core.sock_inuse = alloc_percpu(int);
3346 	if (net->core.sock_inuse == NULL)
3347 		goto out;
3348 
3349 	return 0;
3350 
3351 out:
3352 	free_percpu(net->core.prot_inuse);
3353 	return -ENOMEM;
3354 }
3355 
3356 static void __net_exit sock_inuse_exit_net(struct net *net)
3357 {
3358 	free_percpu(net->core.prot_inuse);
3359 	free_percpu(net->core.sock_inuse);
3360 }
3361 
3362 static struct pernet_operations net_inuse_ops = {
3363 	.init = sock_inuse_init_net,
3364 	.exit = sock_inuse_exit_net,
3365 };
3366 
3367 static __init int net_inuse_init(void)
3368 {
3369 	if (register_pernet_subsys(&net_inuse_ops))
3370 		panic("Cannot initialize net inuse counters");
3371 
3372 	return 0;
3373 }
3374 
3375 core_initcall(net_inuse_init);
3376 
3377 static int assign_proto_idx(struct proto *prot)
3378 {
3379 	prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
3380 
3381 	if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
3382 		pr_err("PROTO_INUSE_NR exhausted\n");
3383 		return -ENOSPC;
3384 	}
3385 
3386 	set_bit(prot->inuse_idx, proto_inuse_idx);
3387 	return 0;
3388 }
3389 
3390 static void release_proto_idx(struct proto *prot)
3391 {
3392 	if (prot->inuse_idx != PROTO_INUSE_NR - 1)
3393 		clear_bit(prot->inuse_idx, proto_inuse_idx);
3394 }
3395 #else
3396 static inline int assign_proto_idx(struct proto *prot)
3397 {
3398 	return 0;
3399 }
3400 
3401 static inline void release_proto_idx(struct proto *prot)
3402 {
3403 }
3404 
3405 static void sock_inuse_add(struct net *net, int val)
3406 {
3407 }
3408 #endif
3409 
3410 static void tw_prot_cleanup(struct timewait_sock_ops *twsk_prot)
3411 {
3412 	if (!twsk_prot)
3413 		return;
3414 	kfree(twsk_prot->twsk_slab_name);
3415 	twsk_prot->twsk_slab_name = NULL;
3416 	kmem_cache_destroy(twsk_prot->twsk_slab);
3417 	twsk_prot->twsk_slab = NULL;
3418 }
3419 
3420 static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
3421 {
3422 	if (!rsk_prot)
3423 		return;
3424 	kfree(rsk_prot->slab_name);
3425 	rsk_prot->slab_name = NULL;
3426 	kmem_cache_destroy(rsk_prot->slab);
3427 	rsk_prot->slab = NULL;
3428 }
3429 
3430 static int req_prot_init(const struct proto *prot)
3431 {
3432 	struct request_sock_ops *rsk_prot = prot->rsk_prot;
3433 
3434 	if (!rsk_prot)
3435 		return 0;
3436 
3437 	rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
3438 					prot->name);
3439 	if (!rsk_prot->slab_name)
3440 		return -ENOMEM;
3441 
3442 	rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
3443 					   rsk_prot->obj_size, 0,
3444 					   SLAB_ACCOUNT | prot->slab_flags,
3445 					   NULL);
3446 
3447 	if (!rsk_prot->slab) {
3448 		pr_crit("%s: Can't create request sock SLAB cache!\n",
3449 			prot->name);
3450 		return -ENOMEM;
3451 	}
3452 	return 0;
3453 }
3454 
3455 int proto_register(struct proto *prot, int alloc_slab)
3456 {
3457 	int ret = -ENOBUFS;
3458 
3459 	if (alloc_slab) {
3460 		prot->slab = kmem_cache_create_usercopy(prot->name,
3461 					prot->obj_size, 0,
3462 					SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT |
3463 					prot->slab_flags,
3464 					prot->useroffset, prot->usersize,
3465 					NULL);
3466 
3467 		if (prot->slab == NULL) {
3468 			pr_crit("%s: Can't create sock SLAB cache!\n",
3469 				prot->name);
3470 			goto out;
3471 		}
3472 
3473 		if (req_prot_init(prot))
3474 			goto out_free_request_sock_slab;
3475 
3476 		if (prot->twsk_prot != NULL) {
3477 			prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);
3478 
3479 			if (prot->twsk_prot->twsk_slab_name == NULL)
3480 				goto out_free_request_sock_slab;
3481 
3482 			prot->twsk_prot->twsk_slab =
3483 				kmem_cache_create(prot->twsk_prot->twsk_slab_name,
3484 						  prot->twsk_prot->twsk_obj_size,
3485 						  0,
3486 						  SLAB_ACCOUNT |
3487 						  prot->slab_flags,
3488 						  NULL);
3489 			if (prot->twsk_prot->twsk_slab == NULL)
3490 				goto out_free_timewait_sock_slab;
3491 		}
3492 	}
3493 
3494 	mutex_lock(&proto_list_mutex);
3495 	ret = assign_proto_idx(prot);
3496 	if (ret) {
3497 		mutex_unlock(&proto_list_mutex);
3498 		goto out_free_timewait_sock_slab;
3499 	}
3500 	list_add(&prot->node, &proto_list);
3501 	mutex_unlock(&proto_list_mutex);
3502 	return ret;
3503 
3504 out_free_timewait_sock_slab:
3505 	if (alloc_slab && prot->twsk_prot)
3506 		tw_prot_cleanup(prot->twsk_prot);
3507 out_free_request_sock_slab:
3508 	if (alloc_slab) {
3509 		req_prot_cleanup(prot->rsk_prot);
3510 
3511 		kmem_cache_destroy(prot->slab);
3512 		prot->slab = NULL;
3513 	}
3514 out:
3515 	return ret;
3516 }
3517 EXPORT_SYMBOL(proto_register);
3518 
3519 void proto_unregister(struct proto *prot)
3520 {
3521 	mutex_lock(&proto_list_mutex);
3522 	release_proto_idx(prot);
3523 	list_del(&prot->node);
3524 	mutex_unlock(&proto_list_mutex);
3525 
3526 	kmem_cache_destroy(prot->slab);
3527 	prot->slab = NULL;
3528 
3529 	req_prot_cleanup(prot->rsk_prot);
3530 	tw_prot_cleanup(prot->twsk_prot);
3531 }
3532 EXPORT_SYMBOL(proto_unregister);
3533 
3534 int sock_load_diag_module(int family, int protocol)
3535 {
3536 	if (!protocol) {
3537 		if (!sock_is_registered(family))
3538 			return -ENOENT;
3539 
3540 		return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
3541 				      NETLINK_SOCK_DIAG, family);
3542 	}
3543 
3544 #ifdef CONFIG_INET
3545 	if (family == AF_INET &&
3546 	    protocol != IPPROTO_RAW &&
3547 	    protocol < MAX_INET_PROTOS &&
3548 	    !rcu_access_pointer(inet_protos[protocol]))
3549 		return -ENOENT;
3550 #endif
3551 
3552 	return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK,
3553 			      NETLINK_SOCK_DIAG, family, protocol);
3554 }
3555 EXPORT_SYMBOL(sock_load_diag_module);
3556 
3557 #ifdef CONFIG_PROC_FS
3558 static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
3559 	__acquires(proto_list_mutex)
3560 {
3561 	mutex_lock(&proto_list_mutex);
3562 	return seq_list_start_head(&proto_list, *pos);
3563 }
3564 
3565 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3566 {
3567 	return seq_list_next(v, &proto_list, pos);
3568 }
3569 
3570 static void proto_seq_stop(struct seq_file *seq, void *v)
3571 	__releases(proto_list_mutex)
3572 {
3573 	mutex_unlock(&proto_list_mutex);
3574 }
3575 
3576 static char proto_method_implemented(const void *method)
3577 {
3578 	return method == NULL ? 'n' : 'y';
3579 }
3580 static long sock_prot_memory_allocated(struct proto *proto)
3581 {
3582 	return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
3583 }
3584 
3585 static const char *sock_prot_memory_pressure(struct proto *proto)
3586 {
3587 	return proto->memory_pressure != NULL ?
3588 	proto_memory_pressure(proto) ? "yes" : "no" : "NI";
3589 }
3590 
3591 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
3592 {
3593 
3594 	seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
3595 			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
3596 		   proto->name,
3597 		   proto->obj_size,
3598 		   sock_prot_inuse_get(seq_file_net(seq), proto),
3599 		   sock_prot_memory_allocated(proto),
3600 		   sock_prot_memory_pressure(proto),
3601 		   proto->max_header,
3602 		   proto->slab == NULL ? "no" : "yes",
3603 		   module_name(proto->owner),
3604 		   proto_method_implemented(proto->close),
3605 		   proto_method_implemented(proto->connect),
3606 		   proto_method_implemented(proto->disconnect),
3607 		   proto_method_implemented(proto->accept),
3608 		   proto_method_implemented(proto->ioctl),
3609 		   proto_method_implemented(proto->init),
3610 		   proto_method_implemented(proto->destroy),
3611 		   proto_method_implemented(proto->shutdown),
3612 		   proto_method_implemented(proto->setsockopt),
3613 		   proto_method_implemented(proto->getsockopt),
3614 		   proto_method_implemented(proto->sendmsg),
3615 		   proto_method_implemented(proto->recvmsg),
3616 		   proto_method_implemented(proto->sendpage),
3617 		   proto_method_implemented(proto->bind),
3618 		   proto_method_implemented(proto->backlog_rcv),
3619 		   proto_method_implemented(proto->hash),
3620 		   proto_method_implemented(proto->unhash),
3621 		   proto_method_implemented(proto->get_port),
3622 		   proto_method_implemented(proto->enter_memory_pressure));
3623 }
3624 
3625 static int proto_seq_show(struct seq_file *seq, void *v)
3626 {
3627 	if (v == &proto_list)
3628 		seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
3629 			   "protocol",
3630 			   "size",
3631 			   "sockets",
3632 			   "memory",
3633 			   "press",
3634 			   "maxhdr",
3635 			   "slab",
3636 			   "module",
3637 			   "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
3638 	else
3639 		proto_seq_printf(seq, list_entry(v, struct proto, node));
3640 	return 0;
3641 }
3642 
3643 static const struct seq_operations proto_seq_ops = {
3644 	.start  = proto_seq_start,
3645 	.next   = proto_seq_next,
3646 	.stop   = proto_seq_stop,
3647 	.show   = proto_seq_show,
3648 };
3649 
3650 static __net_init int proto_init_net(struct net *net)
3651 {
3652 	if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops,
3653 			sizeof(struct seq_net_private)))
3654 		return -ENOMEM;
3655 
3656 	return 0;
3657 }
3658 
3659 static __net_exit void proto_exit_net(struct net *net)
3660 {
3661 	remove_proc_entry("protocols", net->proc_net);
3662 }
3663 
3664 
3665 static __net_initdata struct pernet_operations proto_net_ops = {
3666 	.init = proto_init_net,
3667 	.exit = proto_exit_net,
3668 };
3669 
3670 static int __init proto_init(void)
3671 {
3672 	return register_pernet_subsys(&proto_net_ops);
3673 }
3674 
3675 subsys_initcall(proto_init);
3676 
3677 #endif /* PROC_FS */
3678 
3679 #ifdef CONFIG_NET_RX_BUSY_POLL
3680 bool sk_busy_loop_end(void *p, unsigned long start_time)
3681 {
3682 	struct sock *sk = p;
3683 
3684 	return !skb_queue_empty_lockless(&sk->sk_receive_queue) ||
3685 	       sk_busy_loop_timeout(sk, start_time);
3686 }
3687 EXPORT_SYMBOL(sk_busy_loop_end);
3688 #endif /* CONFIG_NET_RX_BUSY_POLL */
3689 
3690 int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len)
3691 {
3692 	if (!sk->sk_prot->bind_add)
3693 		return -EOPNOTSUPP;
3694 	return sk->sk_prot->bind_add(sk, addr, addr_len);
3695 }
3696 EXPORT_SYMBOL(sock_bind_add);
3697