xref: /openbmc/linux/net/core/sock.c (revision 3cea11cd)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * INET		An implementation of the TCP/IP protocol suite for the LINUX
4  *		operating system.  INET is implemented using the  BSD Socket
5  *		interface as the means of communication with the user level.
6  *
7  *		Generic socket support routines. Memory allocators, socket lock/release
8  *		handler for protocols to use and generic option handler.
9  *
10  * Authors:	Ross Biro
11  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *		Florian La Roche, <flla@stud.uni-sb.de>
13  *		Alan Cox, <A.Cox@swansea.ac.uk>
14  *
15  * Fixes:
16  *		Alan Cox	: 	Numerous verify_area() problems
17  *		Alan Cox	:	Connecting on a connecting socket
18  *					now returns an error for tcp.
19  *		Alan Cox	:	sock->protocol is set correctly.
20  *					and is not sometimes left as 0.
21  *		Alan Cox	:	connect handles icmp errors on a
22  *					connect properly. Unfortunately there
23  *					is a restart syscall nasty there. I
24  *					can't match BSD without hacking the C
25  *					library. Ideas urgently sought!
26  *		Alan Cox	:	Disallow bind() to addresses that are
27  *					not ours - especially broadcast ones!!
28  *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
29  *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
30  *					instead they leave that for the DESTROY timer.
31  *		Alan Cox	:	Clean up error flag in accept
32  *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
33  *					was buggy. Put a remove_sock() in the handler
34  *					for memory when we hit 0. Also altered the timer
35  *					code. The ACK stuff can wait and needs major
36  *					TCP layer surgery.
37  *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
38  *					and fixed timer/inet_bh race.
39  *		Alan Cox	:	Added zapped flag for TCP
40  *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
41  *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
42  *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
43  *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
44  *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
45  *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
46  *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
47  *	Pauline Middelink	:	identd support
48  *		Alan Cox	:	Fixed connect() taking signals I think.
49  *		Alan Cox	:	SO_LINGER supported
50  *		Alan Cox	:	Error reporting fixes
51  *		Anonymous	:	inet_create tidied up (sk->reuse setting)
52  *		Alan Cox	:	inet sockets don't set sk->type!
53  *		Alan Cox	:	Split socket option code
54  *		Alan Cox	:	Callbacks
55  *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
56  *		Alex		:	Removed restriction on inet fioctl
57  *		Alan Cox	:	Splitting INET from NET core
58  *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
59  *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
60  *		Alan Cox	:	Split IP from generic code
61  *		Alan Cox	:	New kfree_skbmem()
62  *		Alan Cox	:	Make SO_DEBUG superuser only.
63  *		Alan Cox	:	Allow anyone to clear SO_DEBUG
64  *					(compatibility fix)
65  *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
66  *		Alan Cox	:	Allocator for a socket is settable.
67  *		Alan Cox	:	SO_ERROR includes soft errors.
68  *		Alan Cox	:	Allow NULL arguments on some SO_ opts
69  *		Alan Cox	: 	Generic socket allocation to make hooks
70  *					easier (suggested by Craig Metz).
71  *		Michael Pall	:	SO_ERROR returns positive errno again
72  *              Steve Whitehouse:       Added default destructor to free
73  *                                      protocol private data.
74  *              Steve Whitehouse:       Added various other default routines
75  *                                      common to several socket families.
76  *              Chris Evans     :       Call suser() check last on F_SETOWN
77  *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
78  *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
79  *		Andi Kleen	:	Fix write_space callback
80  *		Chris Evans	:	Security fixes - signedness again
81  *		Arnaldo C. Melo :       cleanups, use skb_queue_purge
82  *
83  * To Fix:
84  */
85 
86 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
87 
88 #include <asm/unaligned.h>
89 #include <linux/capability.h>
90 #include <linux/errno.h>
91 #include <linux/errqueue.h>
92 #include <linux/types.h>
93 #include <linux/socket.h>
94 #include <linux/in.h>
95 #include <linux/kernel.h>
96 #include <linux/module.h>
97 #include <linux/proc_fs.h>
98 #include <linux/seq_file.h>
99 #include <linux/sched.h>
100 #include <linux/sched/mm.h>
101 #include <linux/timer.h>
102 #include <linux/string.h>
103 #include <linux/sockios.h>
104 #include <linux/net.h>
105 #include <linux/mm.h>
106 #include <linux/slab.h>
107 #include <linux/interrupt.h>
108 #include <linux/poll.h>
109 #include <linux/tcp.h>
110 #include <linux/init.h>
111 #include <linux/highmem.h>
112 #include <linux/user_namespace.h>
113 #include <linux/static_key.h>
114 #include <linux/memcontrol.h>
115 #include <linux/prefetch.h>
116 #include <linux/compat.h>
117 
118 #include <linux/uaccess.h>
119 
120 #include <linux/netdevice.h>
121 #include <net/protocol.h>
122 #include <linux/skbuff.h>
123 #include <net/net_namespace.h>
124 #include <net/request_sock.h>
125 #include <net/sock.h>
126 #include <linux/net_tstamp.h>
127 #include <net/xfrm.h>
128 #include <linux/ipsec.h>
129 #include <net/cls_cgroup.h>
130 #include <net/netprio_cgroup.h>
131 #include <linux/sock_diag.h>
132 
133 #include <linux/filter.h>
134 #include <net/sock_reuseport.h>
135 #include <net/bpf_sk_storage.h>
136 
137 #include <trace/events/sock.h>
138 
139 #include <net/tcp.h>
140 #include <net/busy_poll.h>
141 
142 static DEFINE_MUTEX(proto_list_mutex);
143 static LIST_HEAD(proto_list);
144 
145 static void sock_inuse_add(struct net *net, int val);
146 
147 /**
148  * sk_ns_capable - General socket capability test
149  * @sk: Socket to use a capability on or through
150  * @user_ns: The user namespace of the capability to use
151  * @cap: The capability to use
152  *
153  * Test to see if the opener of the socket had when the socket was
154  * created and the current process has the capability @cap in the user
155  * namespace @user_ns.
156  */
157 bool sk_ns_capable(const struct sock *sk,
158 		   struct user_namespace *user_ns, int cap)
159 {
160 	return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
161 		ns_capable(user_ns, cap);
162 }
163 EXPORT_SYMBOL(sk_ns_capable);
164 
165 /**
166  * sk_capable - Socket global capability test
167  * @sk: Socket to use a capability on or through
168  * @cap: The global capability to use
169  *
170  * Test to see if the opener of the socket had when the socket was
171  * created and the current process has the capability @cap in all user
172  * namespaces.
173  */
174 bool sk_capable(const struct sock *sk, int cap)
175 {
176 	return sk_ns_capable(sk, &init_user_ns, cap);
177 }
178 EXPORT_SYMBOL(sk_capable);
179 
180 /**
181  * sk_net_capable - Network namespace socket capability test
182  * @sk: Socket to use a capability on or through
183  * @cap: The capability to use
184  *
185  * Test to see if the opener of the socket had when the socket was created
186  * and the current process has the capability @cap over the network namespace
187  * the socket is a member of.
188  */
189 bool sk_net_capable(const struct sock *sk, int cap)
190 {
191 	return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
192 }
193 EXPORT_SYMBOL(sk_net_capable);
194 
/*
 * Each address family might have different locking rules, so we have
 * one slock key per address family and separate keys for internal and
 * userspace sockets.
 *
 * All four arrays are indexed by address family and hold AF_MAX entries.
 * The "_kern_" variants are the keys for internal (kernel) sockets.
 */
static struct lock_class_key af_family_keys[AF_MAX];
static struct lock_class_key af_family_kern_keys[AF_MAX];
static struct lock_class_key af_family_slock_keys[AF_MAX];
static struct lock_class_key af_family_kern_slock_keys[AF_MAX];
204 
/*
 * Make lock validator output more readable. (We pre-construct these
 * strings at build time, so that runtime initialization of socket
 * locks is fast.)
 *
 * _sock_locks(x) expands to a comma-separated list of string literals,
 * one per address family, each formed by placing the literal prefix @x
 * next to the family name so that the compiler concatenates the two.
 * The list ends with an extra "AF_MAX" entry, which is why every table
 * below is sized AF_MAX+1.
 */

#define _sock_locks(x)						  \
  x "AF_UNSPEC",	x "AF_UNIX"     ,	x "AF_INET"     , \
  x "AF_AX25"  ,	x "AF_IPX"      ,	x "AF_APPLETALK", \
  x "AF_NETROM",	x "AF_BRIDGE"   ,	x "AF_ATMPVC"   , \
  x "AF_X25"   ,	x "AF_INET6"    ,	x "AF_ROSE"     , \
  x "AF_DECnet",	x "AF_NETBEUI"  ,	x "AF_SECURITY" , \
  x "AF_KEY"   ,	x "AF_NETLINK"  ,	x "AF_PACKET"   , \
  x "AF_ASH"   ,	x "AF_ECONET"   ,	x "AF_ATMSVC"   , \
  x "AF_RDS"   ,	x "AF_SNA"      ,	x "AF_IRDA"     , \
  x "AF_PPPOX" ,	x "AF_WANPIPE"  ,	x "AF_LLC"      , \
  x "27"       ,	x "28"          ,	x "AF_CAN"      , \
  x "AF_TIPC"  ,	x "AF_BLUETOOTH",	x "IUCV"        , \
  x "AF_RXRPC" ,	x "AF_ISDN"     ,	x "AF_PHONET"   , \
  x "AF_IEEE802154",	x "AF_CAIF"	,	x "AF_ALG"      , \
  x "AF_NFC"   ,	x "AF_VSOCK"    ,	x "AF_KCM"      , \
  x "AF_QIPCRTR",	x "AF_SMC"	,	x "AF_XDP"	, \
  x "AF_MAX"

/* Per-family lock names for userspace sockets. */
static const char *const af_family_key_strings[AF_MAX+1] = {
	_sock_locks("sk_lock-")
};
static const char *const af_family_slock_key_strings[AF_MAX+1] = {
	_sock_locks("slock-")
};
static const char *const af_family_clock_key_strings[AF_MAX+1] = {
	_sock_locks("clock-")
};

/* Same names with a "k-" prefix, for internal (kernel) sockets. */
static const char *const af_family_kern_key_strings[AF_MAX+1] = {
	_sock_locks("k-sk_lock-")
};
static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
	_sock_locks("k-slock-")
};
static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
	_sock_locks("k-clock-")
};
/* Names carrying the "rlock-", "wlock-" and "elock-" prefixes. */
static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
	_sock_locks("rlock-")
};
static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
	_sock_locks("wlock-")
};
static const char *const af_family_elock_key_strings[AF_MAX+1] = {
	_sock_locks("elock-")
};
257 
/*
 * sk_callback_lock and sk queues locking rules are per-address-family,
 * so split the lock classes by using a per-AF key.
 *
 * Every array is indexed by address family and holds AF_MAX entries.
 */
static struct lock_class_key af_callback_keys[AF_MAX];
static struct lock_class_key af_rlock_keys[AF_MAX];
static struct lock_class_key af_wlock_keys[AF_MAX];
static struct lock_class_key af_elock_keys[AF_MAX];
static struct lock_class_key af_kern_callback_keys[AF_MAX];
267 
/*
 * Run time adjustable parameters.
 *
 * sysctl_wmem_max and sysctl_rmem_max are the caps applied to SO_SNDBUF
 * and SO_RCVBUF requests in sock_setsockopt(). The defaults start out
 * equal to the maxima (SK_WMEM_MAX / SK_RMEM_MAX).
 */
__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
EXPORT_SYMBOL(sysctl_wmem_max);
__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
EXPORT_SYMBOL(sysctl_rmem_max);
__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;

/* Maximal space eaten by iovec or ancillary data plus some space */
int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
EXPORT_SYMBOL(sysctl_optmem_max);

/* Enabled (1) by default. */
int sysctl_tstamp_allow_data __read_mostly = 1;

/*
 * Static key that is incremented by sk_set_memalloc() and decremented by
 * sk_clear_memalloc(); it starts out false.
 */
DEFINE_STATIC_KEY_FALSE(memalloc_socks_key);
EXPORT_SYMBOL_GPL(memalloc_socks_key);
284 
285 /**
286  * sk_set_memalloc - sets %SOCK_MEMALLOC
287  * @sk: socket to set it on
288  *
289  * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
290  * It's the responsibility of the admin to adjust min_free_kbytes
291  * to meet the requirements
292  */
293 void sk_set_memalloc(struct sock *sk)
294 {
295 	sock_set_flag(sk, SOCK_MEMALLOC);
296 	sk->sk_allocation |= __GFP_MEMALLOC;
297 	static_branch_inc(&memalloc_socks_key);
298 }
299 EXPORT_SYMBOL_GPL(sk_set_memalloc);
300 
301 void sk_clear_memalloc(struct sock *sk)
302 {
303 	sock_reset_flag(sk, SOCK_MEMALLOC);
304 	sk->sk_allocation &= ~__GFP_MEMALLOC;
305 	static_branch_dec(&memalloc_socks_key);
306 
307 	/*
308 	 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
309 	 * progress of swapping. SOCK_MEMALLOC may be cleared while
310 	 * it has rmem allocations due to the last swapfile being deactivated
311 	 * but there is a risk that the socket is unusable due to exceeding
312 	 * the rmem limits. Reclaim the reserves and obey rmem limits again.
313 	 */
314 	sk_mem_reclaim(sk);
315 }
316 EXPORT_SYMBOL_GPL(sk_clear_memalloc);
317 
318 int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
319 {
320 	int ret;
321 	unsigned int noreclaim_flag;
322 
323 	/* these should have been dropped before queueing */
324 	BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
325 
326 	noreclaim_flag = memalloc_noreclaim_save();
327 	ret = sk->sk_backlog_rcv(sk, skb);
328 	memalloc_noreclaim_restore(noreclaim_flag);
329 
330 	return ret;
331 }
332 EXPORT_SYMBOL(__sk_backlog_rcv);
333 
334 static int sock_get_timeout(long timeo, void *optval, bool old_timeval)
335 {
336 	struct __kernel_sock_timeval tv;
337 
338 	if (timeo == MAX_SCHEDULE_TIMEOUT) {
339 		tv.tv_sec = 0;
340 		tv.tv_usec = 0;
341 	} else {
342 		tv.tv_sec = timeo / HZ;
343 		tv.tv_usec = ((timeo % HZ) * USEC_PER_SEC) / HZ;
344 	}
345 
346 	if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
347 		struct old_timeval32 tv32 = { tv.tv_sec, tv.tv_usec };
348 		*(struct old_timeval32 *)optval = tv32;
349 		return sizeof(tv32);
350 	}
351 
352 	if (old_timeval) {
353 		struct __kernel_old_timeval old_tv;
354 		old_tv.tv_sec = tv.tv_sec;
355 		old_tv.tv_usec = tv.tv_usec;
356 		*(struct __kernel_old_timeval *)optval = old_tv;
357 		return sizeof(old_tv);
358 	}
359 
360 	*(struct __kernel_sock_timeval *)optval = tv;
361 	return sizeof(tv);
362 }
363 
/*
 * Parse a timeval supplied through @optval and store the matching timeout
 * in jiffies in *@timeo_p.
 *
 * @old_timeval selects the input layout: struct old_timeval32 for compat
 * syscalls without 64-bit time, struct __kernel_old_timeval otherwise;
 * when false a struct __kernel_sock_timeval is expected.
 *
 * Returns 0 on success, -EINVAL if @optlen is smaller than the expected
 * structure, -EFAULT if the copy fails, and -EDOM if tv_usec is outside
 * [0, USEC_PER_SEC).
 */
static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen,
			    bool old_timeval)
{
	struct __kernel_sock_timeval tv;

	if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
		struct old_timeval32 tv32;

		if (optlen < sizeof(tv32))
			return -EINVAL;

		if (copy_from_sockptr(&tv32, optval, sizeof(tv32)))
			return -EFAULT;
		tv.tv_sec = tv32.tv_sec;
		tv.tv_usec = tv32.tv_usec;
	} else if (old_timeval) {
		struct __kernel_old_timeval old_tv;

		if (optlen < sizeof(old_tv))
			return -EINVAL;
		if (copy_from_sockptr(&old_tv, optval, sizeof(old_tv)))
			return -EFAULT;
		tv.tv_sec = old_tv.tv_sec;
		tv.tv_usec = old_tv.tv_usec;
	} else {
		if (optlen < sizeof(tv))
			return -EINVAL;
		if (copy_from_sockptr(&tv, optval, sizeof(tv)))
			return -EFAULT;
	}
	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
		return -EDOM;

	/*
	 * A negative number of seconds is accepted but stored as a zero
	 * timeout; the attempt is logged at most 10 times, rate limited.
	 */
	if (tv.tv_sec < 0) {
		static int warned __read_mostly;

		*timeo_p = 0;
		if (warned < 10 && net_ratelimit()) {
			warned++;
			pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
				__func__, current->comm, task_pid_nr(current));
		}
		return 0;
	}
	/*
	 * Start from MAX_SCHEDULE_TIMEOUT: it is the result both for an
	 * all-zero timeval and for a tv_sec too large to convert to jiffies.
	 */
	*timeo_p = MAX_SCHEDULE_TIMEOUT;
	if (tv.tv_sec == 0 && tv.tv_usec == 0)
		return 0;
	/* Microseconds are rounded up to whole jiffies. */
	if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1))
		*timeo_p = tv.tv_sec * HZ + DIV_ROUND_UP((unsigned long)tv.tv_usec, USEC_PER_SEC / HZ);
	return 0;
}
415 
416 static bool sock_needs_netstamp(const struct sock *sk)
417 {
418 	switch (sk->sk_family) {
419 	case AF_UNSPEC:
420 	case AF_UNIX:
421 		return false;
422 	default:
423 		return true;
424 	}
425 }
426 
427 static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
428 {
429 	if (sk->sk_flags & flags) {
430 		sk->sk_flags &= ~flags;
431 		if (sock_needs_netstamp(sk) &&
432 		    !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
433 			net_disable_timestamp();
434 	}
435 }
436 
437 
438 int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
439 {
440 	unsigned long flags;
441 	struct sk_buff_head *list = &sk->sk_receive_queue;
442 
443 	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
444 		atomic_inc(&sk->sk_drops);
445 		trace_sock_rcvqueue_full(sk, skb);
446 		return -ENOMEM;
447 	}
448 
449 	if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
450 		atomic_inc(&sk->sk_drops);
451 		return -ENOBUFS;
452 	}
453 
454 	skb->dev = NULL;
455 	skb_set_owner_r(skb, sk);
456 
457 	/* we escape from rcu protected region, make sure we dont leak
458 	 * a norefcounted dst
459 	 */
460 	skb_dst_force(skb);
461 
462 	spin_lock_irqsave(&list->lock, flags);
463 	sock_skb_set_dropcount(sk, skb);
464 	__skb_queue_tail(list, skb);
465 	spin_unlock_irqrestore(&list->lock, flags);
466 
467 	if (!sock_flag(sk, SOCK_DEAD))
468 		sk->sk_data_ready(sk);
469 	return 0;
470 }
471 EXPORT_SYMBOL(__sock_queue_rcv_skb);
472 
/*
 * Run the socket filter on @skb and, if it passes, queue it on the
 * receive queue.
 *
 * Returns the non-zero sk_filter() result when the filter rejects the
 * skb, otherwise the result of __sock_queue_rcv_skb().
 */
int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
	int filter_err = sk_filter(sk, skb);

	return filter_err ? filter_err : __sock_queue_rcv_skb(sk, skb);
}
EXPORT_SYMBOL(sock_queue_rcv_skb);
484 
485 int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
486 		     const int nested, unsigned int trim_cap, bool refcounted)
487 {
488 	int rc = NET_RX_SUCCESS;
489 
490 	if (sk_filter_trim_cap(sk, skb, trim_cap))
491 		goto discard_and_relse;
492 
493 	skb->dev = NULL;
494 
495 	if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
496 		atomic_inc(&sk->sk_drops);
497 		goto discard_and_relse;
498 	}
499 	if (nested)
500 		bh_lock_sock_nested(sk);
501 	else
502 		bh_lock_sock(sk);
503 	if (!sock_owned_by_user(sk)) {
504 		/*
505 		 * trylock + unlock semantics:
506 		 */
507 		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
508 
509 		rc = sk_backlog_rcv(sk, skb);
510 
511 		mutex_release(&sk->sk_lock.dep_map, _RET_IP_);
512 	} else if (sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf))) {
513 		bh_unlock_sock(sk);
514 		atomic_inc(&sk->sk_drops);
515 		goto discard_and_relse;
516 	}
517 
518 	bh_unlock_sock(sk);
519 out:
520 	if (refcounted)
521 		sock_put(sk);
522 	return rc;
523 discard_and_relse:
524 	kfree_skb(skb);
525 	goto out;
526 }
527 EXPORT_SYMBOL(__sk_receive_skb);
528 
529 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
530 {
531 	struct dst_entry *dst = __sk_dst_get(sk);
532 
533 	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
534 		sk_tx_queue_clear(sk);
535 		sk->sk_dst_pending_confirm = 0;
536 		RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
537 		dst_release(dst);
538 		return NULL;
539 	}
540 
541 	return dst;
542 }
543 EXPORT_SYMBOL(__sk_dst_check);
544 
545 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
546 {
547 	struct dst_entry *dst = sk_dst_get(sk);
548 
549 	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
550 		sk_dst_reset(sk);
551 		dst_release(dst);
552 		return NULL;
553 	}
554 
555 	return dst;
556 }
557 EXPORT_SYMBOL(sk_dst_check);
558 
559 static int sock_bindtoindex_locked(struct sock *sk, int ifindex)
560 {
561 	int ret = -ENOPROTOOPT;
562 #ifdef CONFIG_NETDEVICES
563 	struct net *net = sock_net(sk);
564 
565 	/* Sorry... */
566 	ret = -EPERM;
567 	if (sk->sk_bound_dev_if && !ns_capable(net->user_ns, CAP_NET_RAW))
568 		goto out;
569 
570 	ret = -EINVAL;
571 	if (ifindex < 0)
572 		goto out;
573 
574 	sk->sk_bound_dev_if = ifindex;
575 	if (sk->sk_prot->rehash)
576 		sk->sk_prot->rehash(sk);
577 	sk_dst_reset(sk);
578 
579 	ret = 0;
580 
581 out:
582 #endif
583 
584 	return ret;
585 }
586 
587 int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk)
588 {
589 	int ret;
590 
591 	if (lock_sk)
592 		lock_sock(sk);
593 	ret = sock_bindtoindex_locked(sk, ifindex);
594 	if (lock_sk)
595 		release_sock(sk);
596 
597 	return ret;
598 }
599 EXPORT_SYMBOL(sock_bindtoindex);
600 
601 static int sock_setbindtodevice(struct sock *sk, sockptr_t optval, int optlen)
602 {
603 	int ret = -ENOPROTOOPT;
604 #ifdef CONFIG_NETDEVICES
605 	struct net *net = sock_net(sk);
606 	char devname[IFNAMSIZ];
607 	int index;
608 
609 	ret = -EINVAL;
610 	if (optlen < 0)
611 		goto out;
612 
613 	/* Bind this socket to a particular device like "eth0",
614 	 * as specified in the passed interface name. If the
615 	 * name is "" or the option length is zero the socket
616 	 * is not bound.
617 	 */
618 	if (optlen > IFNAMSIZ - 1)
619 		optlen = IFNAMSIZ - 1;
620 	memset(devname, 0, sizeof(devname));
621 
622 	ret = -EFAULT;
623 	if (copy_from_sockptr(devname, optval, optlen))
624 		goto out;
625 
626 	index = 0;
627 	if (devname[0] != '\0') {
628 		struct net_device *dev;
629 
630 		rcu_read_lock();
631 		dev = dev_get_by_name_rcu(net, devname);
632 		if (dev)
633 			index = dev->ifindex;
634 		rcu_read_unlock();
635 		ret = -ENODEV;
636 		if (!dev)
637 			goto out;
638 	}
639 
640 	return sock_bindtoindex(sk, index, true);
641 out:
642 #endif
643 
644 	return ret;
645 }
646 
647 static int sock_getbindtodevice(struct sock *sk, char __user *optval,
648 				int __user *optlen, int len)
649 {
650 	int ret = -ENOPROTOOPT;
651 #ifdef CONFIG_NETDEVICES
652 	struct net *net = sock_net(sk);
653 	char devname[IFNAMSIZ];
654 
655 	if (sk->sk_bound_dev_if == 0) {
656 		len = 0;
657 		goto zero;
658 	}
659 
660 	ret = -EINVAL;
661 	if (len < IFNAMSIZ)
662 		goto out;
663 
664 	ret = netdev_get_name(net, devname, sk->sk_bound_dev_if);
665 	if (ret)
666 		goto out;
667 
668 	len = strlen(devname) + 1;
669 
670 	ret = -EFAULT;
671 	if (copy_to_user(optval, devname, len))
672 		goto out;
673 
674 zero:
675 	ret = -EFAULT;
676 	if (put_user(len, optlen))
677 		goto out;
678 
679 	ret = 0;
680 
681 out:
682 #endif
683 
684 	return ret;
685 }
686 
687 bool sk_mc_loop(struct sock *sk)
688 {
689 	if (dev_recursion_level())
690 		return false;
691 	if (!sk)
692 		return true;
693 	switch (sk->sk_family) {
694 	case AF_INET:
695 		return inet_sk(sk)->mc_loop;
696 #if IS_ENABLED(CONFIG_IPV6)
697 	case AF_INET6:
698 		return inet6_sk(sk)->mc_loop;
699 #endif
700 	}
701 	WARN_ON_ONCE(1);
702 	return true;
703 }
704 EXPORT_SYMBOL(sk_mc_loop);
705 
/**
 * sock_set_reuseaddr - mark a socket's address as reusable
 * @sk: socket to update
 *
 * Sets sk->sk_reuse to SK_CAN_REUSE (the value the SO_REUSEADDR handler
 * in sock_setsockopt() stores for a non-zero argument) while holding the
 * socket lock.
 */
void sock_set_reuseaddr(struct sock *sk)
{
	lock_sock(sk);
	sk->sk_reuse = SK_CAN_REUSE;
	release_sock(sk);
}
EXPORT_SYMBOL(sock_set_reuseaddr);
713 
/**
 * sock_set_reuseport - enable port reuse on a socket
 * @sk: socket to update
 *
 * Sets sk->sk_reuseport (the field the SO_REUSEPORT handler in
 * sock_setsockopt() writes) to true while holding the socket lock.
 */
void sock_set_reuseport(struct sock *sk)
{
	lock_sock(sk);
	sk->sk_reuseport = true;
	release_sock(sk);
}
EXPORT_SYMBOL(sock_set_reuseport);
721 
/**
 * sock_no_linger - enable lingering with a zero linger time
 * @sk: socket to update
 *
 * Sets sk->sk_lingertime to 0 and turns the SOCK_LINGER flag on, all
 * under the socket lock.
 */
void sock_no_linger(struct sock *sk)
{
	lock_sock(sk);
	sk->sk_lingertime = 0;
	sock_set_flag(sk, SOCK_LINGER);
	release_sock(sk);
}
EXPORT_SYMBOL(sock_no_linger);
730 
/**
 * sock_set_priority - set a socket's priority
 * @sk: socket to update
 * @priority: value stored in sk->sk_priority
 *
 * The value is stored under the socket lock. Unlike the SO_PRIORITY
 * handler in sock_setsockopt(), no range or capability check is made.
 */
void sock_set_priority(struct sock *sk, u32 priority)
{
	lock_sock(sk);
	sk->sk_priority = priority;
	release_sock(sk);
}
EXPORT_SYMBOL(sock_set_priority);
738 
739 void sock_set_sndtimeo(struct sock *sk, s64 secs)
740 {
741 	lock_sock(sk);
742 	if (secs && secs < MAX_SCHEDULE_TIMEOUT / HZ - 1)
743 		sk->sk_sndtimeo = secs * HZ;
744 	else
745 		sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
746 	release_sock(sk);
747 }
748 EXPORT_SYMBOL(sock_set_sndtimeo);
749 
750 static void __sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns)
751 {
752 	if (val)  {
753 		sock_valbool_flag(sk, SOCK_TSTAMP_NEW, new);
754 		sock_valbool_flag(sk, SOCK_RCVTSTAMPNS, ns);
755 		sock_set_flag(sk, SOCK_RCVTSTAMP);
756 		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
757 	} else {
758 		sock_reset_flag(sk, SOCK_RCVTSTAMP);
759 		sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
760 	}
761 }
762 
/**
 * sock_enable_timestamps - enable receive timestamps on a socket
 * @sk: socket to update
 *
 * Calls __sock_set_timestamps() under the socket lock with val = true,
 * new = false and ns = true, the same arguments the SO_TIMESTAMPNS_OLD
 * handler in sock_setsockopt() uses for a non-zero value.
 */
void sock_enable_timestamps(struct sock *sk)
{
	lock_sock(sk);
	__sock_set_timestamps(sk, true, false, true);
	release_sock(sk);
}
EXPORT_SYMBOL(sock_enable_timestamps);
770 
771 void sock_set_keepalive(struct sock *sk)
772 {
773 	lock_sock(sk);
774 	if (sk->sk_prot->keepalive)
775 		sk->sk_prot->keepalive(sk, true);
776 	sock_valbool_flag(sk, SOCK_KEEPOPEN, true);
777 	release_sock(sk);
778 }
779 EXPORT_SYMBOL(sock_set_keepalive);
780 
781 static void __sock_set_rcvbuf(struct sock *sk, int val)
782 {
783 	/* Ensure val * 2 fits into an int, to prevent max_t() from treating it
784 	 * as a negative value.
785 	 */
786 	val = min_t(int, val, INT_MAX / 2);
787 	sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
788 
789 	/* We double it on the way in to account for "struct sk_buff" etc.
790 	 * overhead.   Applications assume that the SO_RCVBUF setting they make
791 	 * will allow that much actual data to be received on that socket.
792 	 *
793 	 * Applications are unaware that "struct sk_buff" and other overheads
794 	 * allocate from the receive buffer during socket buffer allocation.
795 	 *
796 	 * And after considering the possible alternatives, returning the value
797 	 * we actually used in getsockopt is the most desirable behavior.
798 	 */
799 	WRITE_ONCE(sk->sk_rcvbuf, max_t(int, val * 2, SOCK_MIN_RCVBUF));
800 }
801 
/**
 * sock_set_rcvbuf - set a socket's receive buffer size
 * @sk: socket to update
 * @val: requested size; see __sock_set_rcvbuf() for how it is adjusted
 *
 * Calls __sock_set_rcvbuf() under the socket lock. Unlike the SO_RCVBUF
 * handler in sock_setsockopt(), @val is not capped by sysctl_rmem_max.
 */
void sock_set_rcvbuf(struct sock *sk, int val)
{
	lock_sock(sk);
	__sock_set_rcvbuf(sk, val);
	release_sock(sk);
}
EXPORT_SYMBOL(sock_set_rcvbuf);
809 
/**
 * sock_set_mark - set a socket's mark
 * @sk: socket to update
 * @val: value stored in sk->sk_mark
 *
 * The value is stored under the socket lock. Unlike the SO_MARK handler
 * in sock_setsockopt(), no capability check is made and the socket's
 * cached route is not reset.
 */
void sock_set_mark(struct sock *sk, u32 val)
{
	lock_sock(sk);
	sk->sk_mark = val;
	release_sock(sk);
}
EXPORT_SYMBOL(sock_set_mark);
817 
818 /*
819  *	This is meant for all protocols to use and covers goings on
820  *	at the socket level. Everything here is generic.
821  */
822 
823 int sock_setsockopt(struct socket *sock, int level, int optname,
824 		    sockptr_t optval, unsigned int optlen)
825 {
826 	struct sock_txtime sk_txtime;
827 	struct sock *sk = sock->sk;
828 	int val;
829 	int valbool;
830 	struct linger ling;
831 	int ret = 0;
832 
833 	/*
834 	 *	Options without arguments
835 	 */
836 
837 	if (optname == SO_BINDTODEVICE)
838 		return sock_setbindtodevice(sk, optval, optlen);
839 
840 	if (optlen < sizeof(int))
841 		return -EINVAL;
842 
843 	if (copy_from_sockptr(&val, optval, sizeof(val)))
844 		return -EFAULT;
845 
846 	valbool = val ? 1 : 0;
847 
848 	lock_sock(sk);
849 
850 	switch (optname) {
851 	case SO_DEBUG:
852 		if (val && !capable(CAP_NET_ADMIN))
853 			ret = -EACCES;
854 		else
855 			sock_valbool_flag(sk, SOCK_DBG, valbool);
856 		break;
857 	case SO_REUSEADDR:
858 		sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
859 		break;
860 	case SO_REUSEPORT:
861 		sk->sk_reuseport = valbool;
862 		break;
863 	case SO_TYPE:
864 	case SO_PROTOCOL:
865 	case SO_DOMAIN:
866 	case SO_ERROR:
867 		ret = -ENOPROTOOPT;
868 		break;
869 	case SO_DONTROUTE:
870 		sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
871 		sk_dst_reset(sk);
872 		break;
873 	case SO_BROADCAST:
874 		sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
875 		break;
876 	case SO_SNDBUF:
877 		/* Don't error on this BSD doesn't and if you think
878 		 * about it this is right. Otherwise apps have to
879 		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
880 		 * are treated in BSD as hints
881 		 */
882 		val = min_t(u32, val, sysctl_wmem_max);
883 set_sndbuf:
884 		/* Ensure val * 2 fits into an int, to prevent max_t()
885 		 * from treating it as a negative value.
886 		 */
887 		val = min_t(int, val, INT_MAX / 2);
888 		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
889 		WRITE_ONCE(sk->sk_sndbuf,
890 			   max_t(int, val * 2, SOCK_MIN_SNDBUF));
891 		/* Wake up sending tasks if we upped the value. */
892 		sk->sk_write_space(sk);
893 		break;
894 
895 	case SO_SNDBUFFORCE:
896 		if (!capable(CAP_NET_ADMIN)) {
897 			ret = -EPERM;
898 			break;
899 		}
900 
901 		/* No negative values (to prevent underflow, as val will be
902 		 * multiplied by 2).
903 		 */
904 		if (val < 0)
905 			val = 0;
906 		goto set_sndbuf;
907 
908 	case SO_RCVBUF:
909 		/* Don't error on this BSD doesn't and if you think
910 		 * about it this is right. Otherwise apps have to
911 		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
912 		 * are treated in BSD as hints
913 		 */
914 		__sock_set_rcvbuf(sk, min_t(u32, val, sysctl_rmem_max));
915 		break;
916 
917 	case SO_RCVBUFFORCE:
918 		if (!capable(CAP_NET_ADMIN)) {
919 			ret = -EPERM;
920 			break;
921 		}
922 
923 		/* No negative values (to prevent underflow, as val will be
924 		 * multiplied by 2).
925 		 */
926 		__sock_set_rcvbuf(sk, max(val, 0));
927 		break;
928 
929 	case SO_KEEPALIVE:
930 		if (sk->sk_prot->keepalive)
931 			sk->sk_prot->keepalive(sk, valbool);
932 		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
933 		break;
934 
935 	case SO_OOBINLINE:
936 		sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
937 		break;
938 
939 	case SO_NO_CHECK:
940 		sk->sk_no_check_tx = valbool;
941 		break;
942 
943 	case SO_PRIORITY:
944 		if ((val >= 0 && val <= 6) ||
945 		    ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
946 			sk->sk_priority = val;
947 		else
948 			ret = -EPERM;
949 		break;
950 
951 	case SO_LINGER:
952 		if (optlen < sizeof(ling)) {
953 			ret = -EINVAL;	/* 1003.1g */
954 			break;
955 		}
956 		if (copy_from_sockptr(&ling, optval, sizeof(ling))) {
957 			ret = -EFAULT;
958 			break;
959 		}
960 		if (!ling.l_onoff)
961 			sock_reset_flag(sk, SOCK_LINGER);
962 		else {
963 #if (BITS_PER_LONG == 32)
964 			if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
965 				sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
966 			else
967 #endif
968 				sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
969 			sock_set_flag(sk, SOCK_LINGER);
970 		}
971 		break;
972 
973 	case SO_BSDCOMPAT:
974 		break;
975 
976 	case SO_PASSCRED:
977 		if (valbool)
978 			set_bit(SOCK_PASSCRED, &sock->flags);
979 		else
980 			clear_bit(SOCK_PASSCRED, &sock->flags);
981 		break;
982 
983 	case SO_TIMESTAMP_OLD:
984 		__sock_set_timestamps(sk, valbool, false, false);
985 		break;
986 	case SO_TIMESTAMP_NEW:
987 		__sock_set_timestamps(sk, valbool, true, false);
988 		break;
989 	case SO_TIMESTAMPNS_OLD:
990 		__sock_set_timestamps(sk, valbool, false, true);
991 		break;
992 	case SO_TIMESTAMPNS_NEW:
993 		__sock_set_timestamps(sk, valbool, true, true);
994 		break;
995 	case SO_TIMESTAMPING_NEW:
996 	case SO_TIMESTAMPING_OLD:
997 		if (val & ~SOF_TIMESTAMPING_MASK) {
998 			ret = -EINVAL;
999 			break;
1000 		}
1001 
1002 		if (val & SOF_TIMESTAMPING_OPT_ID &&
1003 		    !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
1004 			if (sk->sk_protocol == IPPROTO_TCP &&
1005 			    sk->sk_type == SOCK_STREAM) {
1006 				if ((1 << sk->sk_state) &
1007 				    (TCPF_CLOSE | TCPF_LISTEN)) {
1008 					ret = -EINVAL;
1009 					break;
1010 				}
1011 				sk->sk_tskey = tcp_sk(sk)->snd_una;
1012 			} else {
1013 				sk->sk_tskey = 0;
1014 			}
1015 		}
1016 
1017 		if (val & SOF_TIMESTAMPING_OPT_STATS &&
1018 		    !(val & SOF_TIMESTAMPING_OPT_TSONLY)) {
1019 			ret = -EINVAL;
1020 			break;
1021 		}
1022 
1023 		sk->sk_tsflags = val;
1024 		sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW);
1025 
1026 		if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
1027 			sock_enable_timestamp(sk,
1028 					      SOCK_TIMESTAMPING_RX_SOFTWARE);
1029 		else
1030 			sock_disable_timestamp(sk,
1031 					       (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
1032 		break;
1033 
1034 	case SO_RCVLOWAT:
1035 		if (val < 0)
1036 			val = INT_MAX;
1037 		if (sock->ops->set_rcvlowat)
1038 			ret = sock->ops->set_rcvlowat(sk, val);
1039 		else
1040 			WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
1041 		break;
1042 
1043 	case SO_RCVTIMEO_OLD:
1044 	case SO_RCVTIMEO_NEW:
1045 		ret = sock_set_timeout(&sk->sk_rcvtimeo, optval,
1046 				       optlen, optname == SO_RCVTIMEO_OLD);
1047 		break;
1048 
1049 	case SO_SNDTIMEO_OLD:
1050 	case SO_SNDTIMEO_NEW:
1051 		ret = sock_set_timeout(&sk->sk_sndtimeo, optval,
1052 				       optlen, optname == SO_SNDTIMEO_OLD);
1053 		break;
1054 
1055 	case SO_ATTACH_FILTER: {
1056 		struct sock_fprog fprog;
1057 
1058 		ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1059 		if (!ret)
1060 			ret = sk_attach_filter(&fprog, sk);
1061 		break;
1062 	}
1063 	case SO_ATTACH_BPF:
1064 		ret = -EINVAL;
1065 		if (optlen == sizeof(u32)) {
1066 			u32 ufd;
1067 
1068 			ret = -EFAULT;
1069 			if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1070 				break;
1071 
1072 			ret = sk_attach_bpf(ufd, sk);
1073 		}
1074 		break;
1075 
1076 	case SO_ATTACH_REUSEPORT_CBPF: {
1077 		struct sock_fprog fprog;
1078 
1079 		ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1080 		if (!ret)
1081 			ret = sk_reuseport_attach_filter(&fprog, sk);
1082 		break;
1083 	}
1084 	case SO_ATTACH_REUSEPORT_EBPF:
1085 		ret = -EINVAL;
1086 		if (optlen == sizeof(u32)) {
1087 			u32 ufd;
1088 
1089 			ret = -EFAULT;
1090 			if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1091 				break;
1092 
1093 			ret = sk_reuseport_attach_bpf(ufd, sk);
1094 		}
1095 		break;
1096 
1097 	case SO_DETACH_REUSEPORT_BPF:
1098 		ret = reuseport_detach_prog(sk);
1099 		break;
1100 
1101 	case SO_DETACH_FILTER:
1102 		ret = sk_detach_filter(sk);
1103 		break;
1104 
1105 	case SO_LOCK_FILTER:
1106 		if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
1107 			ret = -EPERM;
1108 		else
1109 			sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
1110 		break;
1111 
1112 	case SO_PASSSEC:
1113 		if (valbool)
1114 			set_bit(SOCK_PASSSEC, &sock->flags);
1115 		else
1116 			clear_bit(SOCK_PASSSEC, &sock->flags);
1117 		break;
1118 	case SO_MARK:
1119 		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1120 			ret = -EPERM;
1121 		} else if (val != sk->sk_mark) {
1122 			sk->sk_mark = val;
1123 			sk_dst_reset(sk);
1124 		}
1125 		break;
1126 
1127 	case SO_RXQ_OVFL:
1128 		sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
1129 		break;
1130 
1131 	case SO_WIFI_STATUS:
1132 		sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
1133 		break;
1134 
1135 	case SO_PEEK_OFF:
1136 		if (sock->ops->set_peek_off)
1137 			ret = sock->ops->set_peek_off(sk, val);
1138 		else
1139 			ret = -EOPNOTSUPP;
1140 		break;
1141 
1142 	case SO_NOFCS:
1143 		sock_valbool_flag(sk, SOCK_NOFCS, valbool);
1144 		break;
1145 
1146 	case SO_SELECT_ERR_QUEUE:
1147 		sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
1148 		break;
1149 
1150 #ifdef CONFIG_NET_RX_BUSY_POLL
1151 	case SO_BUSY_POLL:
1152 		/* allow unprivileged users to decrease the value */
1153 		if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
1154 			ret = -EPERM;
1155 		else {
1156 			if (val < 0)
1157 				ret = -EINVAL;
1158 			else
1159 				sk->sk_ll_usec = val;
1160 		}
1161 		break;
1162 #endif
1163 
1164 	case SO_MAX_PACING_RATE:
1165 		{
1166 		unsigned long ulval = (val == ~0U) ? ~0UL : (unsigned int)val;
1167 
1168 		if (sizeof(ulval) != sizeof(val) &&
1169 		    optlen >= sizeof(ulval) &&
1170 		    copy_from_sockptr(&ulval, optval, sizeof(ulval))) {
1171 			ret = -EFAULT;
1172 			break;
1173 		}
1174 		if (ulval != ~0UL)
1175 			cmpxchg(&sk->sk_pacing_status,
1176 				SK_PACING_NONE,
1177 				SK_PACING_NEEDED);
1178 		sk->sk_max_pacing_rate = ulval;
1179 		sk->sk_pacing_rate = min(sk->sk_pacing_rate, ulval);
1180 		break;
1181 		}
1182 	case SO_INCOMING_CPU:
1183 		WRITE_ONCE(sk->sk_incoming_cpu, val);
1184 		break;
1185 
1186 	case SO_CNX_ADVICE:
1187 		if (val == 1)
1188 			dst_negative_advice(sk);
1189 		break;
1190 
1191 	case SO_ZEROCOPY:
1192 		if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
1193 			if (!((sk->sk_type == SOCK_STREAM &&
1194 			       sk->sk_protocol == IPPROTO_TCP) ||
1195 			      (sk->sk_type == SOCK_DGRAM &&
1196 			       sk->sk_protocol == IPPROTO_UDP)))
1197 				ret = -ENOTSUPP;
1198 		} else if (sk->sk_family != PF_RDS) {
1199 			ret = -ENOTSUPP;
1200 		}
1201 		if (!ret) {
1202 			if (val < 0 || val > 1)
1203 				ret = -EINVAL;
1204 			else
1205 				sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
1206 		}
1207 		break;
1208 
1209 	case SO_TXTIME:
1210 		if (optlen != sizeof(struct sock_txtime)) {
1211 			ret = -EINVAL;
1212 			break;
1213 		} else if (copy_from_sockptr(&sk_txtime, optval,
1214 			   sizeof(struct sock_txtime))) {
1215 			ret = -EFAULT;
1216 			break;
1217 		} else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) {
1218 			ret = -EINVAL;
1219 			break;
1220 		}
1221 		/* CLOCK_MONOTONIC is only used by sch_fq, and this packet
1222 		 * scheduler has enough safe guards.
1223 		 */
1224 		if (sk_txtime.clockid != CLOCK_MONOTONIC &&
1225 		    !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1226 			ret = -EPERM;
1227 			break;
1228 		}
1229 		sock_valbool_flag(sk, SOCK_TXTIME, true);
1230 		sk->sk_clockid = sk_txtime.clockid;
1231 		sk->sk_txtime_deadline_mode =
1232 			!!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE);
1233 		sk->sk_txtime_report_errors =
1234 			!!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS);
1235 		break;
1236 
1237 	case SO_BINDTOIFINDEX:
1238 		ret = sock_bindtoindex_locked(sk, val);
1239 		break;
1240 
1241 	default:
1242 		ret = -ENOPROTOOPT;
1243 		break;
1244 	}
1245 	release_sock(sk);
1246 	return ret;
1247 }
1248 EXPORT_SYMBOL(sock_setsockopt);
1249 
1250 
1251 static void cred_to_ucred(struct pid *pid, const struct cred *cred,
1252 			  struct ucred *ucred)
1253 {
1254 	ucred->pid = pid_vnr(pid);
1255 	ucred->uid = ucred->gid = -1;
1256 	if (cred) {
1257 		struct user_namespace *current_ns = current_user_ns();
1258 
1259 		ucred->uid = from_kuid_munged(current_ns, cred->euid);
1260 		ucred->gid = from_kgid_munged(current_ns, cred->egid);
1261 	}
1262 }
1263 
1264 static int groups_to_user(gid_t __user *dst, const struct group_info *src)
1265 {
1266 	struct user_namespace *user_ns = current_user_ns();
1267 	int i;
1268 
1269 	for (i = 0; i < src->ngroups; i++)
1270 		if (put_user(from_kgid_munged(user_ns, src->gid[i]), dst + i))
1271 			return -EFAULT;
1272 
1273 	return 0;
1274 }
1275 
1276 int sock_getsockopt(struct socket *sock, int level, int optname,
1277 		    char __user *optval, int __user *optlen)
1278 {
1279 	struct sock *sk = sock->sk;
1280 
1281 	union {
1282 		int val;
1283 		u64 val64;
1284 		unsigned long ulval;
1285 		struct linger ling;
1286 		struct old_timeval32 tm32;
1287 		struct __kernel_old_timeval tm;
1288 		struct  __kernel_sock_timeval stm;
1289 		struct sock_txtime txtime;
1290 	} v;
1291 
1292 	int lv = sizeof(int);
1293 	int len;
1294 
1295 	if (get_user(len, optlen))
1296 		return -EFAULT;
1297 	if (len < 0)
1298 		return -EINVAL;
1299 
1300 	memset(&v, 0, sizeof(v));
1301 
1302 	switch (optname) {
1303 	case SO_DEBUG:
1304 		v.val = sock_flag(sk, SOCK_DBG);
1305 		break;
1306 
1307 	case SO_DONTROUTE:
1308 		v.val = sock_flag(sk, SOCK_LOCALROUTE);
1309 		break;
1310 
1311 	case SO_BROADCAST:
1312 		v.val = sock_flag(sk, SOCK_BROADCAST);
1313 		break;
1314 
1315 	case SO_SNDBUF:
1316 		v.val = sk->sk_sndbuf;
1317 		break;
1318 
1319 	case SO_RCVBUF:
1320 		v.val = sk->sk_rcvbuf;
1321 		break;
1322 
1323 	case SO_REUSEADDR:
1324 		v.val = sk->sk_reuse;
1325 		break;
1326 
1327 	case SO_REUSEPORT:
1328 		v.val = sk->sk_reuseport;
1329 		break;
1330 
1331 	case SO_KEEPALIVE:
1332 		v.val = sock_flag(sk, SOCK_KEEPOPEN);
1333 		break;
1334 
1335 	case SO_TYPE:
1336 		v.val = sk->sk_type;
1337 		break;
1338 
1339 	case SO_PROTOCOL:
1340 		v.val = sk->sk_protocol;
1341 		break;
1342 
1343 	case SO_DOMAIN:
1344 		v.val = sk->sk_family;
1345 		break;
1346 
1347 	case SO_ERROR:
1348 		v.val = -sock_error(sk);
1349 		if (v.val == 0)
1350 			v.val = xchg(&sk->sk_err_soft, 0);
1351 		break;
1352 
1353 	case SO_OOBINLINE:
1354 		v.val = sock_flag(sk, SOCK_URGINLINE);
1355 		break;
1356 
1357 	case SO_NO_CHECK:
1358 		v.val = sk->sk_no_check_tx;
1359 		break;
1360 
1361 	case SO_PRIORITY:
1362 		v.val = sk->sk_priority;
1363 		break;
1364 
1365 	case SO_LINGER:
1366 		lv		= sizeof(v.ling);
1367 		v.ling.l_onoff	= sock_flag(sk, SOCK_LINGER);
1368 		v.ling.l_linger	= sk->sk_lingertime / HZ;
1369 		break;
1370 
1371 	case SO_BSDCOMPAT:
1372 		break;
1373 
1374 	case SO_TIMESTAMP_OLD:
1375 		v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1376 				!sock_flag(sk, SOCK_TSTAMP_NEW) &&
1377 				!sock_flag(sk, SOCK_RCVTSTAMPNS);
1378 		break;
1379 
1380 	case SO_TIMESTAMPNS_OLD:
1381 		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && !sock_flag(sk, SOCK_TSTAMP_NEW);
1382 		break;
1383 
1384 	case SO_TIMESTAMP_NEW:
1385 		v.val = sock_flag(sk, SOCK_RCVTSTAMP) && sock_flag(sk, SOCK_TSTAMP_NEW);
1386 		break;
1387 
1388 	case SO_TIMESTAMPNS_NEW:
1389 		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && sock_flag(sk, SOCK_TSTAMP_NEW);
1390 		break;
1391 
1392 	case SO_TIMESTAMPING_OLD:
1393 		v.val = sk->sk_tsflags;
1394 		break;
1395 
1396 	case SO_RCVTIMEO_OLD:
1397 	case SO_RCVTIMEO_NEW:
1398 		lv = sock_get_timeout(sk->sk_rcvtimeo, &v, SO_RCVTIMEO_OLD == optname);
1399 		break;
1400 
1401 	case SO_SNDTIMEO_OLD:
1402 	case SO_SNDTIMEO_NEW:
1403 		lv = sock_get_timeout(sk->sk_sndtimeo, &v, SO_SNDTIMEO_OLD == optname);
1404 		break;
1405 
1406 	case SO_RCVLOWAT:
1407 		v.val = sk->sk_rcvlowat;
1408 		break;
1409 
1410 	case SO_SNDLOWAT:
1411 		v.val = 1;
1412 		break;
1413 
1414 	case SO_PASSCRED:
1415 		v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1416 		break;
1417 
1418 	case SO_PEERCRED:
1419 	{
1420 		struct ucred peercred;
1421 		if (len > sizeof(peercred))
1422 			len = sizeof(peercred);
1423 		cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1424 		if (copy_to_user(optval, &peercred, len))
1425 			return -EFAULT;
1426 		goto lenout;
1427 	}
1428 
1429 	case SO_PEERGROUPS:
1430 	{
1431 		int ret, n;
1432 
1433 		if (!sk->sk_peer_cred)
1434 			return -ENODATA;
1435 
1436 		n = sk->sk_peer_cred->group_info->ngroups;
1437 		if (len < n * sizeof(gid_t)) {
1438 			len = n * sizeof(gid_t);
1439 			return put_user(len, optlen) ? -EFAULT : -ERANGE;
1440 		}
1441 		len = n * sizeof(gid_t);
1442 
1443 		ret = groups_to_user((gid_t __user *)optval,
1444 				     sk->sk_peer_cred->group_info);
1445 		if (ret)
1446 			return ret;
1447 		goto lenout;
1448 	}
1449 
1450 	case SO_PEERNAME:
1451 	{
1452 		char address[128];
1453 
1454 		lv = sock->ops->getname(sock, (struct sockaddr *)address, 2);
1455 		if (lv < 0)
1456 			return -ENOTCONN;
1457 		if (lv < len)
1458 			return -EINVAL;
1459 		if (copy_to_user(optval, address, len))
1460 			return -EFAULT;
1461 		goto lenout;
1462 	}
1463 
1464 	/* Dubious BSD thing... Probably nobody even uses it, but
1465 	 * the UNIX standard wants it for whatever reason... -DaveM
1466 	 */
1467 	case SO_ACCEPTCONN:
1468 		v.val = sk->sk_state == TCP_LISTEN;
1469 		break;
1470 
1471 	case SO_PASSSEC:
1472 		v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1473 		break;
1474 
1475 	case SO_PEERSEC:
1476 		return security_socket_getpeersec_stream(sock, optval, optlen, len);
1477 
1478 	case SO_MARK:
1479 		v.val = sk->sk_mark;
1480 		break;
1481 
1482 	case SO_RXQ_OVFL:
1483 		v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1484 		break;
1485 
1486 	case SO_WIFI_STATUS:
1487 		v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1488 		break;
1489 
1490 	case SO_PEEK_OFF:
1491 		if (!sock->ops->set_peek_off)
1492 			return -EOPNOTSUPP;
1493 
1494 		v.val = sk->sk_peek_off;
1495 		break;
1496 	case SO_NOFCS:
1497 		v.val = sock_flag(sk, SOCK_NOFCS);
1498 		break;
1499 
1500 	case SO_BINDTODEVICE:
1501 		return sock_getbindtodevice(sk, optval, optlen, len);
1502 
1503 	case SO_GET_FILTER:
1504 		len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
1505 		if (len < 0)
1506 			return len;
1507 
1508 		goto lenout;
1509 
1510 	case SO_LOCK_FILTER:
1511 		v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1512 		break;
1513 
1514 	case SO_BPF_EXTENSIONS:
1515 		v.val = bpf_tell_extensions();
1516 		break;
1517 
1518 	case SO_SELECT_ERR_QUEUE:
1519 		v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1520 		break;
1521 
1522 #ifdef CONFIG_NET_RX_BUSY_POLL
1523 	case SO_BUSY_POLL:
1524 		v.val = sk->sk_ll_usec;
1525 		break;
1526 #endif
1527 
1528 	case SO_MAX_PACING_RATE:
1529 		if (sizeof(v.ulval) != sizeof(v.val) && len >= sizeof(v.ulval)) {
1530 			lv = sizeof(v.ulval);
1531 			v.ulval = sk->sk_max_pacing_rate;
1532 		} else {
1533 			/* 32bit version */
1534 			v.val = min_t(unsigned long, sk->sk_max_pacing_rate, ~0U);
1535 		}
1536 		break;
1537 
1538 	case SO_INCOMING_CPU:
1539 		v.val = READ_ONCE(sk->sk_incoming_cpu);
1540 		break;
1541 
1542 	case SO_MEMINFO:
1543 	{
1544 		u32 meminfo[SK_MEMINFO_VARS];
1545 
1546 		sk_get_meminfo(sk, meminfo);
1547 
1548 		len = min_t(unsigned int, len, sizeof(meminfo));
1549 		if (copy_to_user(optval, &meminfo, len))
1550 			return -EFAULT;
1551 
1552 		goto lenout;
1553 	}
1554 
1555 #ifdef CONFIG_NET_RX_BUSY_POLL
1556 	case SO_INCOMING_NAPI_ID:
1557 		v.val = READ_ONCE(sk->sk_napi_id);
1558 
1559 		/* aggregate non-NAPI IDs down to 0 */
1560 		if (v.val < MIN_NAPI_ID)
1561 			v.val = 0;
1562 
1563 		break;
1564 #endif
1565 
1566 	case SO_COOKIE:
1567 		lv = sizeof(u64);
1568 		if (len < lv)
1569 			return -EINVAL;
1570 		v.val64 = sock_gen_cookie(sk);
1571 		break;
1572 
1573 	case SO_ZEROCOPY:
1574 		v.val = sock_flag(sk, SOCK_ZEROCOPY);
1575 		break;
1576 
1577 	case SO_TXTIME:
1578 		lv = sizeof(v.txtime);
1579 		v.txtime.clockid = sk->sk_clockid;
1580 		v.txtime.flags |= sk->sk_txtime_deadline_mode ?
1581 				  SOF_TXTIME_DEADLINE_MODE : 0;
1582 		v.txtime.flags |= sk->sk_txtime_report_errors ?
1583 				  SOF_TXTIME_REPORT_ERRORS : 0;
1584 		break;
1585 
1586 	case SO_BINDTOIFINDEX:
1587 		v.val = sk->sk_bound_dev_if;
1588 		break;
1589 
1590 	default:
1591 		/* We implement the SO_SNDLOWAT etc to not be settable
1592 		 * (1003.1g 7).
1593 		 */
1594 		return -ENOPROTOOPT;
1595 	}
1596 
1597 	if (len > lv)
1598 		len = lv;
1599 	if (copy_to_user(optval, &v, len))
1600 		return -EFAULT;
1601 lenout:
1602 	if (put_user(len, optlen))
1603 		return -EFAULT;
1604 	return 0;
1605 }
1606 
1607 /*
1608  * Initialize an sk_lock.
1609  *
1610  * (We also register the sk_lock with the lock validator.)
1611  */
1612 static inline void sock_lock_init(struct sock *sk)
1613 {
1614 	if (sk->sk_kern_sock)
1615 		sock_lock_init_class_and_name(
1616 			sk,
1617 			af_family_kern_slock_key_strings[sk->sk_family],
1618 			af_family_kern_slock_keys + sk->sk_family,
1619 			af_family_kern_key_strings[sk->sk_family],
1620 			af_family_kern_keys + sk->sk_family);
1621 	else
1622 		sock_lock_init_class_and_name(
1623 			sk,
1624 			af_family_slock_key_strings[sk->sk_family],
1625 			af_family_slock_keys + sk->sk_family,
1626 			af_family_key_strings[sk->sk_family],
1627 			af_family_keys + sk->sk_family);
1628 }
1629 
1630 /*
1631  * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
1632  * even temporarly, because of RCU lookups. sk_node should also be left as is.
1633  * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
1634  */
1635 static void sock_copy(struct sock *nsk, const struct sock *osk)
1636 {
1637 	const struct proto *prot = READ_ONCE(osk->sk_prot);
1638 #ifdef CONFIG_SECURITY_NETWORK
1639 	void *sptr = nsk->sk_security;
1640 #endif
1641 	memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1642 
1643 	memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1644 	       prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1645 
1646 #ifdef CONFIG_SECURITY_NETWORK
1647 	nsk->sk_security = sptr;
1648 	security_sk_clone(osk, nsk);
1649 #endif
1650 }
1651 
1652 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1653 		int family)
1654 {
1655 	struct sock *sk;
1656 	struct kmem_cache *slab;
1657 
1658 	slab = prot->slab;
1659 	if (slab != NULL) {
1660 		sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1661 		if (!sk)
1662 			return sk;
1663 		if (want_init_on_alloc(priority))
1664 			sk_prot_clear_nulls(sk, prot->obj_size);
1665 	} else
1666 		sk = kmalloc(prot->obj_size, priority);
1667 
1668 	if (sk != NULL) {
1669 		if (security_sk_alloc(sk, family, priority))
1670 			goto out_free;
1671 
1672 		if (!try_module_get(prot->owner))
1673 			goto out_free_sec;
1674 		sk_tx_queue_clear(sk);
1675 	}
1676 
1677 	return sk;
1678 
1679 out_free_sec:
1680 	security_sk_free(sk);
1681 out_free:
1682 	if (slab != NULL)
1683 		kmem_cache_free(slab, sk);
1684 	else
1685 		kfree(sk);
1686 	return NULL;
1687 }
1688 
1689 static void sk_prot_free(struct proto *prot, struct sock *sk)
1690 {
1691 	struct kmem_cache *slab;
1692 	struct module *owner;
1693 
1694 	owner = prot->owner;
1695 	slab = prot->slab;
1696 
1697 	cgroup_sk_free(&sk->sk_cgrp_data);
1698 	mem_cgroup_sk_free(sk);
1699 	security_sk_free(sk);
1700 	if (slab != NULL)
1701 		kmem_cache_free(slab, sk);
1702 	else
1703 		kfree(sk);
1704 	module_put(owner);
1705 }
1706 
1707 /**
1708  *	sk_alloc - All socket objects are allocated here
1709  *	@net: the applicable net namespace
1710  *	@family: protocol family
1711  *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1712  *	@prot: struct proto associated with this new sock instance
1713  *	@kern: is this to be a kernel socket?
1714  */
1715 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1716 		      struct proto *prot, int kern)
1717 {
1718 	struct sock *sk;
1719 
1720 	sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1721 	if (sk) {
1722 		sk->sk_family = family;
1723 		/*
1724 		 * See comment in struct sock definition to understand
1725 		 * why we need sk_prot_creator -acme
1726 		 */
1727 		sk->sk_prot = sk->sk_prot_creator = prot;
1728 		sk->sk_kern_sock = kern;
1729 		sock_lock_init(sk);
1730 		sk->sk_net_refcnt = kern ? 0 : 1;
1731 		if (likely(sk->sk_net_refcnt)) {
1732 			get_net(net);
1733 			sock_inuse_add(net, 1);
1734 		}
1735 
1736 		sock_net_set(sk, net);
1737 		refcount_set(&sk->sk_wmem_alloc, 1);
1738 
1739 		mem_cgroup_sk_alloc(sk);
1740 		cgroup_sk_alloc(&sk->sk_cgrp_data);
1741 		sock_update_classid(&sk->sk_cgrp_data);
1742 		sock_update_netprioidx(&sk->sk_cgrp_data);
1743 		sk_tx_queue_clear(sk);
1744 	}
1745 
1746 	return sk;
1747 }
1748 EXPORT_SYMBOL(sk_alloc);
1749 
1750 /* Sockets having SOCK_RCU_FREE will call this function after one RCU
1751  * grace period. This is the case for UDP sockets and TCP listeners.
1752  */
1753 static void __sk_destruct(struct rcu_head *head)
1754 {
1755 	struct sock *sk = container_of(head, struct sock, sk_rcu);
1756 	struct sk_filter *filter;
1757 
1758 	if (sk->sk_destruct)
1759 		sk->sk_destruct(sk);
1760 
1761 	filter = rcu_dereference_check(sk->sk_filter,
1762 				       refcount_read(&sk->sk_wmem_alloc) == 0);
1763 	if (filter) {
1764 		sk_filter_uncharge(sk, filter);
1765 		RCU_INIT_POINTER(sk->sk_filter, NULL);
1766 	}
1767 
1768 	sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
1769 
1770 #ifdef CONFIG_BPF_SYSCALL
1771 	bpf_sk_storage_free(sk);
1772 #endif
1773 
1774 	if (atomic_read(&sk->sk_omem_alloc))
1775 		pr_debug("%s: optmem leakage (%d bytes) detected\n",
1776 			 __func__, atomic_read(&sk->sk_omem_alloc));
1777 
1778 	if (sk->sk_frag.page) {
1779 		put_page(sk->sk_frag.page);
1780 		sk->sk_frag.page = NULL;
1781 	}
1782 
1783 	if (sk->sk_peer_cred)
1784 		put_cred(sk->sk_peer_cred);
1785 	put_pid(sk->sk_peer_pid);
1786 	if (likely(sk->sk_net_refcnt))
1787 		put_net(sock_net(sk));
1788 	sk_prot_free(sk->sk_prot_creator, sk);
1789 }
1790 
1791 void sk_destruct(struct sock *sk)
1792 {
1793 	bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE);
1794 
1795 	if (rcu_access_pointer(sk->sk_reuseport_cb)) {
1796 		reuseport_detach_sock(sk);
1797 		use_call_rcu = true;
1798 	}
1799 
1800 	if (use_call_rcu)
1801 		call_rcu(&sk->sk_rcu, __sk_destruct);
1802 	else
1803 		__sk_destruct(&sk->sk_rcu);
1804 }
1805 
1806 static void __sk_free(struct sock *sk)
1807 {
1808 	if (likely(sk->sk_net_refcnt))
1809 		sock_inuse_add(sock_net(sk), -1);
1810 
1811 	if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk)))
1812 		sock_diag_broadcast_destroy(sk);
1813 	else
1814 		sk_destruct(sk);
1815 }
1816 
1817 void sk_free(struct sock *sk)
1818 {
1819 	/*
1820 	 * We subtract one from sk_wmem_alloc and can know if
1821 	 * some packets are still in some tx queue.
1822 	 * If not null, sock_wfree() will call __sk_free(sk) later
1823 	 */
1824 	if (refcount_dec_and_test(&sk->sk_wmem_alloc))
1825 		__sk_free(sk);
1826 }
1827 EXPORT_SYMBOL(sk_free);
1828 
1829 static void sk_init_common(struct sock *sk)
1830 {
1831 	skb_queue_head_init(&sk->sk_receive_queue);
1832 	skb_queue_head_init(&sk->sk_write_queue);
1833 	skb_queue_head_init(&sk->sk_error_queue);
1834 
1835 	rwlock_init(&sk->sk_callback_lock);
1836 	lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
1837 			af_rlock_keys + sk->sk_family,
1838 			af_family_rlock_key_strings[sk->sk_family]);
1839 	lockdep_set_class_and_name(&sk->sk_write_queue.lock,
1840 			af_wlock_keys + sk->sk_family,
1841 			af_family_wlock_key_strings[sk->sk_family]);
1842 	lockdep_set_class_and_name(&sk->sk_error_queue.lock,
1843 			af_elock_keys + sk->sk_family,
1844 			af_family_elock_key_strings[sk->sk_family]);
1845 	lockdep_set_class_and_name(&sk->sk_callback_lock,
1846 			af_callback_keys + sk->sk_family,
1847 			af_family_clock_key_strings[sk->sk_family]);
1848 }
1849 
1850 /**
1851  *	sk_clone_lock - clone a socket, and lock its clone
1852  *	@sk: the socket to clone
1853  *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1854  *
1855  *	Caller must unlock socket even in error path (bh_unlock_sock(newsk))
1856  */
1857 struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
1858 {
1859 	struct proto *prot = READ_ONCE(sk->sk_prot);
1860 	struct sock *newsk;
1861 	bool is_charged = true;
1862 
1863 	newsk = sk_prot_alloc(prot, priority, sk->sk_family);
1864 	if (newsk != NULL) {
1865 		struct sk_filter *filter;
1866 
1867 		sock_copy(newsk, sk);
1868 
1869 		newsk->sk_prot_creator = prot;
1870 
1871 		/* SANITY */
1872 		if (likely(newsk->sk_net_refcnt))
1873 			get_net(sock_net(newsk));
1874 		sk_node_init(&newsk->sk_node);
1875 		sock_lock_init(newsk);
1876 		bh_lock_sock(newsk);
1877 		newsk->sk_backlog.head	= newsk->sk_backlog.tail = NULL;
1878 		newsk->sk_backlog.len = 0;
1879 
1880 		atomic_set(&newsk->sk_rmem_alloc, 0);
1881 		/*
1882 		 * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
1883 		 */
1884 		refcount_set(&newsk->sk_wmem_alloc, 1);
1885 		atomic_set(&newsk->sk_omem_alloc, 0);
1886 		sk_init_common(newsk);
1887 
1888 		newsk->sk_dst_cache	= NULL;
1889 		newsk->sk_dst_pending_confirm = 0;
1890 		newsk->sk_wmem_queued	= 0;
1891 		newsk->sk_forward_alloc = 0;
1892 		atomic_set(&newsk->sk_drops, 0);
1893 		newsk->sk_send_head	= NULL;
1894 		newsk->sk_userlocks	= sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
1895 		atomic_set(&newsk->sk_zckey, 0);
1896 
1897 		sock_reset_flag(newsk, SOCK_DONE);
1898 
1899 		/* sk->sk_memcg will be populated at accept() time */
1900 		newsk->sk_memcg = NULL;
1901 
1902 		cgroup_sk_clone(&newsk->sk_cgrp_data);
1903 
1904 		rcu_read_lock();
1905 		filter = rcu_dereference(sk->sk_filter);
1906 		if (filter != NULL)
1907 			/* though it's an empty new sock, the charging may fail
1908 			 * if sysctl_optmem_max was changed between creation of
1909 			 * original socket and cloning
1910 			 */
1911 			is_charged = sk_filter_charge(newsk, filter);
1912 		RCU_INIT_POINTER(newsk->sk_filter, filter);
1913 		rcu_read_unlock();
1914 
1915 		if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
1916 			/* We need to make sure that we don't uncharge the new
1917 			 * socket if we couldn't charge it in the first place
1918 			 * as otherwise we uncharge the parent's filter.
1919 			 */
1920 			if (!is_charged)
1921 				RCU_INIT_POINTER(newsk->sk_filter, NULL);
1922 			sk_free_unlock_clone(newsk);
1923 			newsk = NULL;
1924 			goto out;
1925 		}
1926 		RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
1927 
1928 		if (bpf_sk_storage_clone(sk, newsk)) {
1929 			sk_free_unlock_clone(newsk);
1930 			newsk = NULL;
1931 			goto out;
1932 		}
1933 
1934 		/* Clear sk_user_data if parent had the pointer tagged
1935 		 * as not suitable for copying when cloning.
1936 		 */
1937 		if (sk_user_data_is_nocopy(newsk))
1938 			newsk->sk_user_data = NULL;
1939 
1940 		newsk->sk_err	   = 0;
1941 		newsk->sk_err_soft = 0;
1942 		newsk->sk_priority = 0;
1943 		newsk->sk_incoming_cpu = raw_smp_processor_id();
1944 		if (likely(newsk->sk_net_refcnt))
1945 			sock_inuse_add(sock_net(newsk), 1);
1946 
1947 		/*
1948 		 * Before updating sk_refcnt, we must commit prior changes to memory
1949 		 * (Documentation/RCU/rculist_nulls.rst for details)
1950 		 */
1951 		smp_wmb();
1952 		refcount_set(&newsk->sk_refcnt, 2);
1953 
1954 		/*
1955 		 * Increment the counter in the same struct proto as the master
1956 		 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
1957 		 * is the same as sk->sk_prot->socks, as this field was copied
1958 		 * with memcpy).
1959 		 *
1960 		 * This _changes_ the previous behaviour, where
1961 		 * tcp_create_openreq_child always was incrementing the
1962 		 * equivalent to tcp_prot->socks (inet_sock_nr), so this have
1963 		 * to be taken into account in all callers. -acme
1964 		 */
1965 		sk_refcnt_debug_inc(newsk);
1966 		sk_set_socket(newsk, NULL);
1967 		sk_tx_queue_clear(newsk);
1968 		RCU_INIT_POINTER(newsk->sk_wq, NULL);
1969 
1970 		if (newsk->sk_prot->sockets_allocated)
1971 			sk_sockets_allocated_inc(newsk);
1972 
1973 		if (sock_needs_netstamp(sk) &&
1974 		    newsk->sk_flags & SK_FLAGS_TIMESTAMP)
1975 			net_enable_timestamp();
1976 	}
1977 out:
1978 	return newsk;
1979 }
1980 EXPORT_SYMBOL_GPL(sk_clone_lock);
1981 
1982 void sk_free_unlock_clone(struct sock *sk)
1983 {
1984 	/* It is still raw copy of parent, so invalidate
1985 	 * destructor and make plain sk_free() */
1986 	sk->sk_destruct = NULL;
1987 	bh_unlock_sock(sk);
1988 	sk_free(sk);
1989 }
1990 EXPORT_SYMBOL_GPL(sk_free_unlock_clone);
1991 
1992 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1993 {
1994 	u32 max_segs = 1;
1995 
1996 	sk_dst_set(sk, dst);
1997 	sk->sk_route_caps = dst->dev->features | sk->sk_route_forced_caps;
1998 	if (sk->sk_route_caps & NETIF_F_GSO)
1999 		sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
2000 	sk->sk_route_caps &= ~sk->sk_route_nocaps;
2001 	if (sk_can_gso(sk)) {
2002 		if (dst->header_len && !xfrm_dst_offload_ok(dst)) {
2003 			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
2004 		} else {
2005 			sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
2006 			sk->sk_gso_max_size = dst->dev->gso_max_size;
2007 			max_segs = max_t(u32, dst->dev->gso_max_segs, 1);
2008 		}
2009 	}
2010 	sk->sk_gso_max_segs = max_segs;
2011 }
2012 EXPORT_SYMBOL_GPL(sk_setup_caps);
2013 
2014 /*
2015  *	Simple resource managers for sockets.
2016  */
2017 
2018 
2019 /*
2020  * Write buffer destructor automatically called from kfree_skb.
2021  */
2022 void sock_wfree(struct sk_buff *skb)
2023 {
2024 	struct sock *sk = skb->sk;
2025 	unsigned int len = skb->truesize;
2026 
2027 	if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
2028 		/*
2029 		 * Keep a reference on sk_wmem_alloc, this will be released
2030 		 * after sk_write_space() call
2031 		 */
2032 		WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
2033 		sk->sk_write_space(sk);
2034 		len = 1;
2035 	}
2036 	/*
2037 	 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
2038 	 * could not do because of in-flight packets
2039 	 */
2040 	if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
2041 		__sk_free(sk);
2042 }
2043 EXPORT_SYMBOL(sock_wfree);
2044 
2045 /* This variant of sock_wfree() is used by TCP,
2046  * since it sets SOCK_USE_WRITE_QUEUE.
2047  */
2048 void __sock_wfree(struct sk_buff *skb)
2049 {
2050 	struct sock *sk = skb->sk;
2051 
2052 	if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
2053 		__sk_free(sk);
2054 }
2055 
2056 void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
2057 {
2058 	skb_orphan(skb);
2059 	skb->sk = sk;
2060 #ifdef CONFIG_INET
2061 	if (unlikely(!sk_fullsock(sk))) {
2062 		skb->destructor = sock_edemux;
2063 		sock_hold(sk);
2064 		return;
2065 	}
2066 #endif
2067 	skb->destructor = sock_wfree;
2068 	skb_set_hash_from_sk(skb, sk);
2069 	/*
2070 	 * We used to take a refcount on sk, but following operation
2071 	 * is enough to guarantee sk_free() wont free this sock until
2072 	 * all in-flight packets are completed
2073 	 */
2074 	refcount_add(skb->truesize, &sk->sk_wmem_alloc);
2075 }
2076 EXPORT_SYMBOL(skb_set_owner_w);
2077 
2078 static bool can_skb_orphan_partial(const struct sk_buff *skb)
2079 {
2080 #ifdef CONFIG_TLS_DEVICE
2081 	/* Drivers depend on in-order delivery for crypto offload,
2082 	 * partial orphan breaks out-of-order-OK logic.
2083 	 */
2084 	if (skb->decrypted)
2085 		return false;
2086 #endif
2087 	return (skb->destructor == sock_wfree ||
2088 		(IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree));
2089 }
2090 
2091 /* This helper is used by netem, as it can hold packets in its
2092  * delay queue. We want to allow the owner socket to send more
2093  * packets, as if they were already TX completed by a typical driver.
2094  * But we also want to keep skb->sk set because some packet schedulers
2095  * rely on it (sch_fq for example).
2096  */
2097 void skb_orphan_partial(struct sk_buff *skb)
2098 {
2099 	if (skb_is_tcp_pure_ack(skb))
2100 		return;
2101 
2102 	if (can_skb_orphan_partial(skb)) {
2103 		struct sock *sk = skb->sk;
2104 
2105 		if (refcount_inc_not_zero(&sk->sk_refcnt)) {
2106 			WARN_ON(refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc));
2107 			skb->destructor = sock_efree;
2108 		}
2109 	} else {
2110 		skb_orphan(skb);
2111 	}
2112 }
2113 EXPORT_SYMBOL(skb_orphan_partial);
2114 
2115 /*
2116  * Read buffer destructor automatically called from kfree_skb.
2117  */
2118 void sock_rfree(struct sk_buff *skb)
2119 {
2120 	struct sock *sk = skb->sk;
2121 	unsigned int len = skb->truesize;
2122 
2123 	atomic_sub(len, &sk->sk_rmem_alloc);
2124 	sk_mem_uncharge(sk, len);
2125 }
2126 EXPORT_SYMBOL(sock_rfree);
2127 
2128 /*
2129  * Buffer destructor for skbs that are not used directly in read or write
2130  * path, e.g. for error handler skbs. Automatically called from kfree_skb.
2131  */
2132 void sock_efree(struct sk_buff *skb)
2133 {
2134 	sock_put(skb->sk);
2135 }
2136 EXPORT_SYMBOL(sock_efree);
2137 
/* Buffer destructor for prefetch/receive path where reference count may
 * not be held, e.g. for listen sockets.
 *
 * The socket reference is only put when sk_is_refcounted() says one was
 * taken.
 */
#ifdef CONFIG_INET
void sock_pfree(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;

	if (!sk_is_refcounted(sk))
		return;

	sock_gen_put(sk);
}
EXPORT_SYMBOL(sock_pfree);
#endif /* CONFIG_INET */
2149 
2150 kuid_t sock_i_uid(struct sock *sk)
2151 {
2152 	kuid_t uid;
2153 
2154 	read_lock_bh(&sk->sk_callback_lock);
2155 	uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
2156 	read_unlock_bh(&sk->sk_callback_lock);
2157 	return uid;
2158 }
2159 EXPORT_SYMBOL(sock_i_uid);
2160 
2161 unsigned long sock_i_ino(struct sock *sk)
2162 {
2163 	unsigned long ino;
2164 
2165 	read_lock_bh(&sk->sk_callback_lock);
2166 	ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
2167 	read_unlock_bh(&sk->sk_callback_lock);
2168 	return ino;
2169 }
2170 EXPORT_SYMBOL(sock_i_ino);
2171 
2172 /*
2173  * Allocate a skb from the socket's send buffer.
2174  */
2175 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
2176 			     gfp_t priority)
2177 {
2178 	if (force ||
2179 	    refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) {
2180 		struct sk_buff *skb = alloc_skb(size, priority);
2181 
2182 		if (skb) {
2183 			skb_set_owner_w(skb, sk);
2184 			return skb;
2185 		}
2186 	}
2187 	return NULL;
2188 }
2189 EXPORT_SYMBOL(sock_wmalloc);
2190 
2191 static void sock_ofree(struct sk_buff *skb)
2192 {
2193 	struct sock *sk = skb->sk;
2194 
2195 	atomic_sub(skb->truesize, &sk->sk_omem_alloc);
2196 }
2197 
2198 struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
2199 			     gfp_t priority)
2200 {
2201 	struct sk_buff *skb;
2202 
2203 	/* small safe race: SKB_TRUESIZE may differ from final skb->truesize */
2204 	if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) >
2205 	    sysctl_optmem_max)
2206 		return NULL;
2207 
2208 	skb = alloc_skb(size, priority);
2209 	if (!skb)
2210 		return NULL;
2211 
2212 	atomic_add(skb->truesize, &sk->sk_omem_alloc);
2213 	skb->sk = sk;
2214 	skb->destructor = sock_ofree;
2215 	return skb;
2216 }
2217 
2218 /*
2219  * Allocate a memory block from the socket's option memory buffer.
2220  */
2221 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
2222 {
2223 	if ((unsigned int)size <= sysctl_optmem_max &&
2224 	    atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
2225 		void *mem;
2226 		/* First do the add, to avoid the race if kmalloc
2227 		 * might sleep.
2228 		 */
2229 		atomic_add(size, &sk->sk_omem_alloc);
2230 		mem = kmalloc(size, priority);
2231 		if (mem)
2232 			return mem;
2233 		atomic_sub(size, &sk->sk_omem_alloc);
2234 	}
2235 	return NULL;
2236 }
2237 EXPORT_SYMBOL(sock_kmalloc);
2238 
2239 /* Free an option memory block. Note, we actually want the inline
2240  * here as this allows gcc to detect the nullify and fold away the
2241  * condition entirely.
2242  */
2243 static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
2244 				  const bool nullify)
2245 {
2246 	if (WARN_ON_ONCE(!mem))
2247 		return;
2248 	if (nullify)
2249 		kfree_sensitive(mem);
2250 	else
2251 		kfree(mem);
2252 	atomic_sub(size, &sk->sk_omem_alloc);
2253 }
2254 
/*
 * Free an option memory block of @size bytes charged to @sk.
 * Calls __sock_kfree_s() with nullify=false, i.e. the plain kfree() path.
 */
void sock_kfree_s(struct sock *sk, void *mem, int size)
{
	__sock_kfree_s(sk, mem, size, false);
}
2259 EXPORT_SYMBOL(sock_kfree_s);
2260 
/*
 * Free an option memory block of @size bytes charged to @sk.
 * Calls __sock_kfree_s() with nullify=true, i.e. the kfree_sensitive() path.
 */
void sock_kzfree_s(struct sock *sk, void *mem, int size)
{
	__sock_kfree_s(sk, mem, size, true);
}
2265 EXPORT_SYMBOL(sock_kzfree_s);
2266 
2267 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
2268    I think, these locks should be removed for datagram sockets.
2269  */
2270 static long sock_wait_for_wmem(struct sock *sk, long timeo)
2271 {
2272 	DEFINE_WAIT(wait);
2273 
2274 	sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2275 	for (;;) {
2276 		if (!timeo)
2277 			break;
2278 		if (signal_pending(current))
2279 			break;
2280 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2281 		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
2282 		if (refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf))
2283 			break;
2284 		if (sk->sk_shutdown & SEND_SHUTDOWN)
2285 			break;
2286 		if (sk->sk_err)
2287 			break;
2288 		timeo = schedule_timeout(timeo);
2289 	}
2290 	finish_wait(sk_sleep(sk), &wait);
2291 	return timeo;
2292 }
2293 
2294 
2295 /*
2296  *	Generic send/receive buffer handlers
2297  */
2298 
2299 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
2300 				     unsigned long data_len, int noblock,
2301 				     int *errcode, int max_page_order)
2302 {
2303 	struct sk_buff *skb;
2304 	long timeo;
2305 	int err;
2306 
2307 	timeo = sock_sndtimeo(sk, noblock);
2308 	for (;;) {
2309 		err = sock_error(sk);
2310 		if (err != 0)
2311 			goto failure;
2312 
2313 		err = -EPIPE;
2314 		if (sk->sk_shutdown & SEND_SHUTDOWN)
2315 			goto failure;
2316 
2317 		if (sk_wmem_alloc_get(sk) < READ_ONCE(sk->sk_sndbuf))
2318 			break;
2319 
2320 		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2321 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2322 		err = -EAGAIN;
2323 		if (!timeo)
2324 			goto failure;
2325 		if (signal_pending(current))
2326 			goto interrupted;
2327 		timeo = sock_wait_for_wmem(sk, timeo);
2328 	}
2329 	skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
2330 				   errcode, sk->sk_allocation);
2331 	if (skb)
2332 		skb_set_owner_w(skb, sk);
2333 	return skb;
2334 
2335 interrupted:
2336 	err = sock_intr_errno(timeo);
2337 failure:
2338 	*errcode = err;
2339 	return NULL;
2340 }
2341 EXPORT_SYMBOL(sock_alloc_send_pskb);
2342 
/*
 * Convenience wrapper around sock_alloc_send_pskb(): @size is passed as
 * the header length, with a data length of 0 and a max page order of 0.
 */
struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
				    int noblock, int *errcode)
{
	return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0);
}
2348 EXPORT_SYMBOL(sock_alloc_send_skb);
2349 
2350 int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg,
2351 		     struct sockcm_cookie *sockc)
2352 {
2353 	u32 tsflags;
2354 
2355 	switch (cmsg->cmsg_type) {
2356 	case SO_MARK:
2357 		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2358 			return -EPERM;
2359 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2360 			return -EINVAL;
2361 		sockc->mark = *(u32 *)CMSG_DATA(cmsg);
2362 		break;
2363 	case SO_TIMESTAMPING_OLD:
2364 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2365 			return -EINVAL;
2366 
2367 		tsflags = *(u32 *)CMSG_DATA(cmsg);
2368 		if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
2369 			return -EINVAL;
2370 
2371 		sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
2372 		sockc->tsflags |= tsflags;
2373 		break;
2374 	case SCM_TXTIME:
2375 		if (!sock_flag(sk, SOCK_TXTIME))
2376 			return -EINVAL;
2377 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64)))
2378 			return -EINVAL;
2379 		sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg));
2380 		break;
2381 	/* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
2382 	case SCM_RIGHTS:
2383 	case SCM_CREDENTIALS:
2384 		break;
2385 	default:
2386 		return -EINVAL;
2387 	}
2388 	return 0;
2389 }
2390 EXPORT_SYMBOL(__sock_cmsg_send);
2391 
2392 int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
2393 		   struct sockcm_cookie *sockc)
2394 {
2395 	struct cmsghdr *cmsg;
2396 	int ret;
2397 
2398 	for_each_cmsghdr(cmsg, msg) {
2399 		if (!CMSG_OK(msg, cmsg))
2400 			return -EINVAL;
2401 		if (cmsg->cmsg_level != SOL_SOCKET)
2402 			continue;
2403 		ret = __sock_cmsg_send(sk, msg, cmsg, sockc);
2404 		if (ret)
2405 			return ret;
2406 	}
2407 	return 0;
2408 }
2409 EXPORT_SYMBOL(sock_cmsg_send);
2410 
2411 static void sk_enter_memory_pressure(struct sock *sk)
2412 {
2413 	if (!sk->sk_prot->enter_memory_pressure)
2414 		return;
2415 
2416 	sk->sk_prot->enter_memory_pressure(sk);
2417 }
2418 
2419 static void sk_leave_memory_pressure(struct sock *sk)
2420 {
2421 	if (sk->sk_prot->leave_memory_pressure) {
2422 		sk->sk_prot->leave_memory_pressure(sk);
2423 	} else {
2424 		unsigned long *memory_pressure = sk->sk_prot->memory_pressure;
2425 
2426 		if (memory_pressure && READ_ONCE(*memory_pressure))
2427 			WRITE_ONCE(*memory_pressure, 0);
2428 	}
2429 }
2430 
/* Page order covering 32768 bytes; the high-order size tried first by
 * skb_page_frag_refill().
 */
#define SKB_FRAG_PAGE_ORDER	get_order(32768)
/* Static key, false by default. When enabled, skb_page_frag_refill()
 * skips the high-order attempt and allocates a single page.
 */
DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key);
2433 
2434 /**
2435  * skb_page_frag_refill - check that a page_frag contains enough room
2436  * @sz: minimum size of the fragment we want to get
2437  * @pfrag: pointer to page_frag
2438  * @gfp: priority for memory allocation
2439  *
2440  * Note: While this allocator tries to use high order pages, there is
2441  * no guarantee that allocations succeed. Therefore, @sz MUST be
2442  * less or equal than PAGE_SIZE.
2443  */
2444 bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
2445 {
2446 	if (pfrag->page) {
2447 		if (page_ref_count(pfrag->page) == 1) {
2448 			pfrag->offset = 0;
2449 			return true;
2450 		}
2451 		if (pfrag->offset + sz <= pfrag->size)
2452 			return true;
2453 		put_page(pfrag->page);
2454 	}
2455 
2456 	pfrag->offset = 0;
2457 	if (SKB_FRAG_PAGE_ORDER &&
2458 	    !static_branch_unlikely(&net_high_order_alloc_disable_key)) {
2459 		/* Avoid direct reclaim but allow kswapd to wake */
2460 		pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
2461 					  __GFP_COMP | __GFP_NOWARN |
2462 					  __GFP_NORETRY,
2463 					  SKB_FRAG_PAGE_ORDER);
2464 		if (likely(pfrag->page)) {
2465 			pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
2466 			return true;
2467 		}
2468 	}
2469 	pfrag->page = alloc_page(gfp);
2470 	if (likely(pfrag->page)) {
2471 		pfrag->size = PAGE_SIZE;
2472 		return true;
2473 	}
2474 	return false;
2475 }
2476 EXPORT_SYMBOL(skb_page_frag_refill);
2477 
2478 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
2479 {
2480 	if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
2481 		return true;
2482 
2483 	sk_enter_memory_pressure(sk);
2484 	sk_stream_moderate_sndbuf(sk);
2485 	return false;
2486 }
2487 EXPORT_SYMBOL(sk_page_frag_refill);
2488 
2489 static void __lock_sock(struct sock *sk)
2490 	__releases(&sk->sk_lock.slock)
2491 	__acquires(&sk->sk_lock.slock)
2492 {
2493 	DEFINE_WAIT(wait);
2494 
2495 	for (;;) {
2496 		prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
2497 					TASK_UNINTERRUPTIBLE);
2498 		spin_unlock_bh(&sk->sk_lock.slock);
2499 		schedule();
2500 		spin_lock_bh(&sk->sk_lock.slock);
2501 		if (!sock_owned_by_user(sk))
2502 			break;
2503 	}
2504 	finish_wait(&sk->sk_lock.wq, &wait);
2505 }
2506 
/*
 * Process every skb queued on the socket backlog.
 *
 * Called with sk_lock.slock held. The backlog list is detached as a
 * whole, the spinlock is dropped while each skb is passed to
 * sk_backlog_rcv(), and the lock is retaken before checking whether new
 * skbs were queued in the meantime. Returns with sk_lock.slock held.
 */
void __release_sock(struct sock *sk)
	__releases(&sk->sk_lock.slock)
	__acquires(&sk->sk_lock.slock)
{
	struct sk_buff *skb, *next;

	while ((skb = sk->sk_backlog.head) != NULL) {
		/* take ownership of the current list; producers start a new one */
		sk->sk_backlog.head = sk->sk_backlog.tail = NULL;

		spin_unlock_bh(&sk->sk_lock.slock);

		do {
			next = skb->next;
			prefetch(next);
			WARN_ON_ONCE(skb_dst_is_noref(skb));
			skb_mark_not_on_list(skb);
			sk_backlog_rcv(sk, skb);

			cond_resched();

			skb = next;
		} while (skb != NULL);

		spin_lock_bh(&sk->sk_lock.slock);
	}

	/*
	 * Doing the zeroing here guarantees we can not loop forever
	 * while a wild producer attempts to flood us.
	 */
	sk->sk_backlog.len = 0;
}
2539 
/* Take sk_lock.slock (BH disabled) and process the backlog via __release_sock(). */
void __sk_flush_backlog(struct sock *sk)
{
	spin_lock_bh(&sk->sk_lock.slock);
	__release_sock(sk);
	spin_unlock_bh(&sk->sk_lock.slock);
}
2546 
/**
 * sk_wait_data - wait for data to arrive at sk_receive_queue
 * @sk:    sock to wait on
 * @timeo: for how long
 * @skb:   last skb seen on sk_receive_queue
 *
 * Now socket state including sk->sk_err is changed only under lock,
 * hence we may omit checks after joining wait queue.
 * We check receive queue before schedule() only as optimization;
 * it is very likely that release_sock() added new data.
 *
 * Return: the value produced by sk_wait_event() for the condition
 * "the tail of sk_receive_queue is no longer @skb".
 */
int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
{
	DEFINE_WAIT_FUNC(wait, woken_wake_function);
	int rc;

	add_wait_queue(sk_sleep(sk), &wait);
	/* SOCKWQ_ASYNC_WAITDATA is set only for the duration of the wait */
	sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
	rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
	sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
	remove_wait_queue(sk_sleep(sk), &wait);
	return rc;
}
2570 EXPORT_SYMBOL(sk_wait_data);
2571 
/**
 *	__sk_mem_raise_allocated - increase memory_allocated
 *	@sk: socket
 *	@size: memory size to allocate
 *	@amt: pages to allocate
 *	@kind: allocation type
 *
 *	Similar to __sk_mem_schedule(), but does not update sk_forward_alloc
 *
 *	Return: 1 when the allocation is accepted; 0 when it is suppressed,
 *	in which case the @amt added to the protocol's memory_allocated (and
 *	the memcg charge, when a memcg is attached) is taken back before
 *	returning.
 */
int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
{
	struct proto *prot = sk->sk_prot;
	/* account first; every refusal path below reverts this */
	long allocated = sk_memory_allocated_add(sk, amt);
	bool charged = true;

	if (mem_cgroup_sockets_enabled && sk->sk_memcg &&
	    !(charged = mem_cgroup_charge_skmem(sk->sk_memcg, amt)))
		goto suppress_allocation;

	/* Under limit. */
	if (allocated <= sk_prot_mem_limits(sk, 0)) {
		sk_leave_memory_pressure(sk);
		return 1;
	}

	/* Under pressure. */
	if (allocated > sk_prot_mem_limits(sk, 1))
		sk_enter_memory_pressure(sk);

	/* Over hard limit. */
	if (allocated > sk_prot_mem_limits(sk, 2))
		goto suppress_allocation;

	/* guarantee minimum buffer size under pressure */
	if (kind == SK_MEM_RECV) {
		if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot))
			return 1;

	} else { /* SK_MEM_SEND */
		int wmem0 = sk_get_wmem0(sk, prot);

		/* stream sockets are judged by sk_wmem_queued, others by sk_wmem_alloc */
		if (sk->sk_type == SOCK_STREAM) {
			if (sk->sk_wmem_queued < wmem0)
				return 1;
		} else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) {
				return 1;
		}
	}

	if (sk_has_memory_pressure(sk)) {
		u64 alloc;

		if (!sk_under_memory_pressure(sk))
			return 1;
		/*
		 * Accept when the hard limit exceeds (sockets allocated) *
		 * (pages this socket uses for queued writes, receive memory
		 * and forward allocation).
		 */
		alloc = sk_sockets_allocated_read_positive(sk);
		if (sk_prot_mem_limits(sk, 2) > alloc *
		    sk_mem_pages(sk->sk_wmem_queued +
				 atomic_read(&sk->sk_rmem_alloc) +
				 sk->sk_forward_alloc))
			return 1;
	}

suppress_allocation:

	if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
		sk_stream_moderate_sndbuf(sk);

		/* Fail only if socket is _under_ its sndbuf.
		 * In this case we cannot block, so that we have to fail.
		 */
		if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
			return 1;
	}

	if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged))
		trace_sock_exceed_buf_limit(sk, prot, allocated, kind);

	sk_memory_allocated_sub(sk, amt);

	/*
	 * NOTE(review): the uncharge also runs when the charge above returned
	 * false; presumably mem_cgroup_charge_skmem() accounts the pages even
	 * when it reports failure - confirm against its implementation.
	 */
	if (mem_cgroup_sockets_enabled && sk->sk_memcg)
		mem_cgroup_uncharge_skmem(sk->sk_memcg, amt);

	return 0;
}
2656 EXPORT_SYMBOL(__sk_mem_raise_allocated);
2657 
2658 /**
2659  *	__sk_mem_schedule - increase sk_forward_alloc and memory_allocated
2660  *	@sk: socket
2661  *	@size: memory size to allocate
2662  *	@kind: allocation type
2663  *
2664  *	If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
2665  *	rmem allocation. This function assumes that protocols which have
2666  *	memory_pressure use sk_wmem_queued as write buffer accounting.
2667  */
2668 int __sk_mem_schedule(struct sock *sk, int size, int kind)
2669 {
2670 	int ret, amt = sk_mem_pages(size);
2671 
2672 	sk->sk_forward_alloc += amt << SK_MEM_QUANTUM_SHIFT;
2673 	ret = __sk_mem_raise_allocated(sk, size, amt, kind);
2674 	if (!ret)
2675 		sk->sk_forward_alloc -= amt << SK_MEM_QUANTUM_SHIFT;
2676 	return ret;
2677 }
2678 EXPORT_SYMBOL(__sk_mem_schedule);
2679 
2680 /**
2681  *	__sk_mem_reduce_allocated - reclaim memory_allocated
2682  *	@sk: socket
2683  *	@amount: number of quanta
2684  *
2685  *	Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
2686  */
2687 void __sk_mem_reduce_allocated(struct sock *sk, int amount)
2688 {
2689 	sk_memory_allocated_sub(sk, amount);
2690 
2691 	if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2692 		mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
2693 
2694 	if (sk_under_memory_pressure(sk) &&
2695 	    (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
2696 		sk_leave_memory_pressure(sk);
2697 }
2698 EXPORT_SYMBOL(__sk_mem_reduce_allocated);
2699 
2700 /**
2701  *	__sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
2702  *	@sk: socket
2703  *	@amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple)
2704  */
2705 void __sk_mem_reclaim(struct sock *sk, int amount)
2706 {
2707 	amount >>= SK_MEM_QUANTUM_SHIFT;
2708 	sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
2709 	__sk_mem_reduce_allocated(sk, amount);
2710 }
2711 EXPORT_SYMBOL(__sk_mem_reclaim);
2712 
/* Store @val in sk->sk_peek_off. Always returns 0. */
int sk_set_peek_off(struct sock *sk, int val)
{
	sk->sk_peek_off = val;
	return 0;
}
2718 EXPORT_SYMBOL_GPL(sk_set_peek_off);
2719 
2720 /*
2721  * Set of default routines for initialising struct proto_ops when
2722  * the protocol does not support a particular function. In certain
2723  * cases where it makes no sense for a protocol to have a "do nothing"
2724  * function, some default processing is provided.
2725  */
2726 
/* Default bind handler for protocols without bind support: always -EOPNOTSUPP. */
int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
{
	return -EOPNOTSUPP;
}
2731 EXPORT_SYMBOL(sock_no_bind);
2732 
/* Default connect handler for protocols without connect support: always -EOPNOTSUPP. */
int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
		    int len, int flags)
{
	return -EOPNOTSUPP;
}
2738 EXPORT_SYMBOL(sock_no_connect);
2739 
/* Default socketpair handler for protocols without socketpair support: always -EOPNOTSUPP. */
int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
{
	return -EOPNOTSUPP;
}
2744 EXPORT_SYMBOL(sock_no_socketpair);
2745 
/* Default accept handler for protocols without accept support: always -EOPNOTSUPP. */
int sock_no_accept(struct socket *sock, struct socket *newsock, int flags,
		   bool kern)
{
	return -EOPNOTSUPP;
}
2751 EXPORT_SYMBOL(sock_no_accept);
2752 
/* Default getname handler for protocols without getname support: always -EOPNOTSUPP. */
int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
		    int peer)
{
	return -EOPNOTSUPP;
}
2758 EXPORT_SYMBOL(sock_no_getname);
2759 
/* Default ioctl handler for protocols without ioctl support: always -EOPNOTSUPP. */
int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	return -EOPNOTSUPP;
}
2764 EXPORT_SYMBOL(sock_no_ioctl);
2765 
/* Default listen handler for protocols without listen support: always -EOPNOTSUPP. */
int sock_no_listen(struct socket *sock, int backlog)
{
	return -EOPNOTSUPP;
}
2770 EXPORT_SYMBOL(sock_no_listen);
2771 
/* Default shutdown handler for protocols without shutdown support: always -EOPNOTSUPP. */
int sock_no_shutdown(struct socket *sock, int how)
{
	return -EOPNOTSUPP;
}
2776 EXPORT_SYMBOL(sock_no_shutdown);
2777 
/* Default sendmsg handler for protocols without sendmsg support: always -EOPNOTSUPP. */
int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
{
	return -EOPNOTSUPP;
}
2782 EXPORT_SYMBOL(sock_no_sendmsg);
2783 
/* Default sendmsg_locked handler (takes the struct sock directly): always -EOPNOTSUPP. */
int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len)
{
	return -EOPNOTSUPP;
}
2788 EXPORT_SYMBOL(sock_no_sendmsg_locked);
2789 
/* Default recvmsg handler for protocols without recvmsg support: always -EOPNOTSUPP. */
int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
		    int flags)
{
	return -EOPNOTSUPP;
}
2795 EXPORT_SYMBOL(sock_no_recvmsg);
2796 
/* Default mmap handler for protocols without mmap support: always -ENODEV. */
int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
{
	/* Mirror missing mmap method error code */
	return -ENODEV;
}
2802 EXPORT_SYMBOL(sock_no_mmap);
2803 
2804 /*
2805  * When a file is received (via SCM_RIGHTS, etc), we must bump the
2806  * various sock-based usage counts.
2807  */
2808 void __receive_sock(struct file *file)
2809 {
2810 	struct socket *sock;
2811 	int error;
2812 
2813 	/*
2814 	 * The resulting value of "error" is ignored here since we only
2815 	 * need to take action when the file is a socket and testing
2816 	 * "sock" for NULL is sufficient.
2817 	 */
2818 	sock = sock_from_file(file, &error);
2819 	if (sock) {
2820 		sock_update_netprioidx(&sock->sk->sk_cgrp_data);
2821 		sock_update_classid(&sock->sk->sk_cgrp_data);
2822 	}
2823 }
2824 
2825 ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
2826 {
2827 	ssize_t res;
2828 	struct msghdr msg = {.msg_flags = flags};
2829 	struct kvec iov;
2830 	char *kaddr = kmap(page);
2831 	iov.iov_base = kaddr + offset;
2832 	iov.iov_len = size;
2833 	res = kernel_sendmsg(sock, &msg, &iov, 1, size);
2834 	kunmap(page);
2835 	return res;
2836 }
2837 EXPORT_SYMBOL(sock_no_sendpage);
2838 
2839 ssize_t sock_no_sendpage_locked(struct sock *sk, struct page *page,
2840 				int offset, size_t size, int flags)
2841 {
2842 	ssize_t res;
2843 	struct msghdr msg = {.msg_flags = flags};
2844 	struct kvec iov;
2845 	char *kaddr = kmap(page);
2846 
2847 	iov.iov_base = kaddr + offset;
2848 	iov.iov_len = size;
2849 	res = kernel_sendmsg_locked(sk, &msg, &iov, 1, size);
2850 	kunmap(page);
2851 	return res;
2852 }
2853 EXPORT_SYMBOL(sock_no_sendpage_locked);
2854 
2855 /*
2856  *	Default Socket Callbacks
2857  */
2858 
/*
 * Default sk_state_change callback (installed by sock_init_data()):
 * wake every interruptible sleeper on the socket wait queue, if any.
 */
static void sock_def_wakeup(struct sock *sk)
{
	struct socket_wq *wq;

	/* sk_wq is RCU-managed; hold the read lock while using it */
	rcu_read_lock();
	wq = rcu_dereference(sk->sk_wq);
	if (skwq_has_sleeper(wq))
		wake_up_interruptible_all(&wq->wait);
	rcu_read_unlock();
}
2869 
/*
 * Default sk_error_report callback (installed by sock_init_data()):
 * wake pollers with EPOLLERR and send the POLL_ERR async notification.
 */
static void sock_def_error_report(struct sock *sk)
{
	struct socket_wq *wq;

	rcu_read_lock();
	wq = rcu_dereference(sk->sk_wq);
	if (skwq_has_sleeper(wq))
		wake_up_interruptible_poll(&wq->wait, EPOLLERR);
	/* the async notification is sent whether or not anyone was sleeping */
	sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
	rcu_read_unlock();
}
2881 
/*
 * Default sk_data_ready callback (installed by sock_init_data()):
 * wake pollers waiting for readable events and send the POLL_IN async
 * notification.
 */
void sock_def_readable(struct sock *sk)
{
	struct socket_wq *wq;

	rcu_read_lock();
	wq = rcu_dereference(sk->sk_wq);
	if (skwq_has_sleeper(wq))
		wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
						EPOLLRDNORM | EPOLLRDBAND);
	/* the async notification is sent whether or not anyone was sleeping */
	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
	rcu_read_unlock();
}
2894 
2895 static void sock_def_write_space(struct sock *sk)
2896 {
2897 	struct socket_wq *wq;
2898 
2899 	rcu_read_lock();
2900 
2901 	/* Do not wake up a writer until he can make "significant"
2902 	 * progress.  --DaveM
2903 	 */
2904 	if ((refcount_read(&sk->sk_wmem_alloc) << 1) <= READ_ONCE(sk->sk_sndbuf)) {
2905 		wq = rcu_dereference(sk->sk_wq);
2906 		if (skwq_has_sleeper(wq))
2907 			wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
2908 						EPOLLWRNORM | EPOLLWRBAND);
2909 
2910 		/* Should agree with poll, otherwise some programs break */
2911 		if (sock_writeable(sk))
2912 			sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
2913 	}
2914 
2915 	rcu_read_unlock();
2916 }
2917 
/* Default sk_destruct callback (installed by sock_init_data()): intentionally does nothing. */
static void sock_def_destruct(struct sock *sk)
{
}
2921 
2922 void sk_send_sigurg(struct sock *sk)
2923 {
2924 	if (sk->sk_socket && sk->sk_socket->file)
2925 		if (send_sigurg(&sk->sk_socket->file->f_owner))
2926 			sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
2927 }
2928 EXPORT_SYMBOL(sk_send_sigurg);
2929 
/*
 * (Re)arm @timer to fire at @expires. A reference on @sk is taken only
 * when mod_timer() returns zero; sk_stop_timer() and sk_stop_timer_sync()
 * are the helpers that drop it.
 */
void sk_reset_timer(struct sock *sk, struct timer_list* timer,
		    unsigned long expires)
{
	if (!mod_timer(timer, expires))
		sock_hold(sk);
}
2936 EXPORT_SYMBOL(sk_reset_timer);
2937 
/*
 * Cancel @timer with del_timer(); when that returns non-zero, drop one
 * reference on @sk with __sock_put().
 */
void sk_stop_timer(struct sock *sk, struct timer_list* timer)
{
	if (del_timer(timer))
		__sock_put(sk);
}
2943 EXPORT_SYMBOL(sk_stop_timer);
2944 
/*
 * Same as sk_stop_timer() but cancels with del_timer_sync(); when that
 * returns non-zero, drop one reference on @sk with __sock_put().
 */
void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer)
{
	if (del_timer_sync(timer))
		__sock_put(sk);
}
2950 EXPORT_SYMBOL(sk_stop_timer_sync);
2951 
/*
 * Initialise the generic fields of a freshly allocated @sk and, when
 * @sock is non-NULL, link the two together.
 *
 * Installs the default callbacks, default buffer sizes and timeouts, and
 * finally sets sk_refcnt to 1 after a write barrier so that all prior
 * initialisation is committed first.
 */
void sock_init_data(struct socket *sock, struct sock *sk)
{
	sk_init_common(sk);
	sk->sk_send_head	=	NULL;

	timer_setup(&sk->sk_timer, NULL, 0);

	sk->sk_allocation	=	GFP_KERNEL;
	sk->sk_rcvbuf		=	sysctl_rmem_default;
	sk->sk_sndbuf		=	sysctl_wmem_default;
	sk->sk_state		=	TCP_CLOSE;
	sk_set_socket(sk, sock);

	sock_set_flag(sk, SOCK_ZAPPED);

	if (sock) {
		/* attached socket: inherit type, wait queue and owner uid */
		sk->sk_type	=	sock->type;
		RCU_INIT_POINTER(sk->sk_wq, &sock->wq);
		sock->sk	=	sk;
		sk->sk_uid	=	SOCK_INODE(sock)->i_uid;
	} else {
		/* no struct socket: no wait queue, uid 0 of the netns user namespace */
		RCU_INIT_POINTER(sk->sk_wq, NULL);
		sk->sk_uid	=	make_kuid(sock_net(sk)->user_ns, 0);
	}

	rwlock_init(&sk->sk_callback_lock);
	/* kernel and user sockets get separate lockdep classes per address family */
	if (sk->sk_kern_sock)
		lockdep_set_class_and_name(
			&sk->sk_callback_lock,
			af_kern_callback_keys + sk->sk_family,
			af_family_kern_clock_key_strings[sk->sk_family]);
	else
		lockdep_set_class_and_name(
			&sk->sk_callback_lock,
			af_callback_keys + sk->sk_family,
			af_family_clock_key_strings[sk->sk_family]);

	/* default callbacks; protocols may override them afterwards */
	sk->sk_state_change	=	sock_def_wakeup;
	sk->sk_data_ready	=	sock_def_readable;
	sk->sk_write_space	=	sock_def_write_space;
	sk->sk_error_report	=	sock_def_error_report;
	sk->sk_destruct		=	sock_def_destruct;

	sk->sk_frag.page	=	NULL;
	sk->sk_frag.offset	=	0;
	sk->sk_peek_off		=	-1;

	sk->sk_peer_pid 	=	NULL;
	sk->sk_peer_cred	=	NULL;
	sk->sk_write_pending	=	0;
	sk->sk_rcvlowat		=	1;
	sk->sk_rcvtimeo		=	MAX_SCHEDULE_TIMEOUT;
	sk->sk_sndtimeo		=	MAX_SCHEDULE_TIMEOUT;

	sk->sk_stamp = SK_DEFAULT_STAMP;
#if BITS_PER_LONG==32
	seqlock_init(&sk->sk_stamp_seq);
#endif
	atomic_set(&sk->sk_zckey, 0);

#ifdef CONFIG_NET_RX_BUSY_POLL
	sk->sk_napi_id		=	0;
	sk->sk_ll_usec		=	sysctl_net_busy_read;
#endif

	/* ~0UL: pacing rates start at the maximum representable value */
	sk->sk_max_pacing_rate = ~0UL;
	sk->sk_pacing_rate = ~0UL;
	WRITE_ONCE(sk->sk_pacing_shift, 10);
	sk->sk_incoming_cpu = -1;

	sk_rx_queue_clear(sk);
	/*
	 * Before updating sk_refcnt, we must commit prior changes to memory
	 * (Documentation/RCU/rculist_nulls.rst for details)
	 */
	smp_wmb();
	refcount_set(&sk->sk_refcnt, 1);
	atomic_set(&sk->sk_drops, 0);
}
3031 EXPORT_SYMBOL(sock_init_data);
3032 
/*
 * Take ownership of the socket lock, sleeping in __lock_sock() while
 * another context owns it. @subclass is passed to lockdep via
 * mutex_acquire(). May sleep.
 */
void lock_sock_nested(struct sock *sk, int subclass)
{
	might_sleep();
	spin_lock_bh(&sk->sk_lock.slock);
	if (sk->sk_lock.owned)
		__lock_sock(sk);
	sk->sk_lock.owned = 1;
	/* plain spin_unlock(): BH stays disabled until local_bh_enable() below */
	spin_unlock(&sk->sk_lock.slock);
	/*
	 * The sk_lock has mutex_lock() semantics here:
	 */
	mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
	local_bh_enable();
}
3047 EXPORT_SYMBOL(lock_sock_nested);
3048 
/*
 * Give up ownership of the socket lock.
 *
 * Under sk_lock.slock: process the backlog if anything was queued, run
 * the protocol's release_cb hook if present, release ownership and wake
 * anyone waiting on sk_lock.wq.
 */
void release_sock(struct sock *sk)
{
	spin_lock_bh(&sk->sk_lock.slock);
	if (sk->sk_backlog.tail)
		__release_sock(sk);

	/* Warning : release_cb() might need to release sk ownership,
	 * ie call sock_release_ownership(sk) before us.
	 */
	if (sk->sk_prot->release_cb)
		sk->sk_prot->release_cb(sk);

	sock_release_ownership(sk);
	if (waitqueue_active(&sk->sk_lock.wq))
		wake_up(&sk->sk_lock.wq);
	spin_unlock_bh(&sk->sk_lock.slock);
}
3066 EXPORT_SYMBOL(release_sock);
3067 
/**
 * lock_sock_fast - fast version of lock_sock
 * @sk: socket
 *
 * This version should be used for very small sections, where the process
 * won't block.
 *
 * return false if fast path is taken:
 *
 *   sk_lock.slock locked, owned = 0, BH disabled
 *
 * return true if slow path is taken:
 *
 *   sk_lock.slock unlocked, owned = 1, BH enabled
 */
bool lock_sock_fast(struct sock *sk)
{
	might_sleep();
	spin_lock_bh(&sk->sk_lock.slock);

	if (!sk->sk_lock.owned)
		/*
		 * Note : We must disable BH
		 */
		return false;

	/* slow path: wait for the current owner, then take ownership */
	__lock_sock(sk);
	sk->sk_lock.owned = 1;
	spin_unlock(&sk->sk_lock.slock);
	/*
	 * The sk_lock has mutex_lock() semantics here:
	 */
	mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
	local_bh_enable();
	return true;
}
3102 EXPORT_SYMBOL(lock_sock_fast);
3103 
3104 int sock_gettstamp(struct socket *sock, void __user *userstamp,
3105 		   bool timeval, bool time32)
3106 {
3107 	struct sock *sk = sock->sk;
3108 	struct timespec64 ts;
3109 
3110 	sock_enable_timestamp(sk, SOCK_TIMESTAMP);
3111 	ts = ktime_to_timespec64(sock_read_timestamp(sk));
3112 	if (ts.tv_sec == -1)
3113 		return -ENOENT;
3114 	if (ts.tv_sec == 0) {
3115 		ktime_t kt = ktime_get_real();
3116 		sock_write_timestamp(sk, kt);
3117 		ts = ktime_to_timespec64(kt);
3118 	}
3119 
3120 	if (timeval)
3121 		ts.tv_nsec /= 1000;
3122 
3123 #ifdef CONFIG_COMPAT_32BIT_TIME
3124 	if (time32)
3125 		return put_old_timespec32(&ts, userstamp);
3126 #endif
3127 #ifdef CONFIG_SPARC64
3128 	/* beware of padding in sparc64 timeval */
3129 	if (timeval && !in_compat_syscall()) {
3130 		struct __kernel_old_timeval __user tv = {
3131 			.tv_sec = ts.tv_sec,
3132 			.tv_usec = ts.tv_nsec,
3133 		};
3134 		if (copy_to_user(userstamp, &tv, sizeof(tv)))
3135 			return -EFAULT;
3136 		return 0;
3137 	}
3138 #endif
3139 	return put_timespec64(&ts, userstamp);
3140 }
3141 EXPORT_SYMBOL(sock_gettstamp);
3142 
3143 void sock_enable_timestamp(struct sock *sk, enum sock_flags flag)
3144 {
3145 	if (!sock_flag(sk, flag)) {
3146 		unsigned long previous_flags = sk->sk_flags;
3147 
3148 		sock_set_flag(sk, flag);
3149 		/*
3150 		 * we just set one of the two flags which require net
3151 		 * time stamping, but time stamping might have been on
3152 		 * already because of the other one
3153 		 */
3154 		if (sock_needs_netstamp(sk) &&
3155 		    !(previous_flags & SK_FLAGS_TIMESTAMP))
3156 			net_enable_timestamp();
3157 	}
3158 }
3159 
3160 int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
3161 		       int level, int type)
3162 {
3163 	struct sock_exterr_skb *serr;
3164 	struct sk_buff *skb;
3165 	int copied, err;
3166 
3167 	err = -EAGAIN;
3168 	skb = sock_dequeue_err_skb(sk);
3169 	if (skb == NULL)
3170 		goto out;
3171 
3172 	copied = skb->len;
3173 	if (copied > len) {
3174 		msg->msg_flags |= MSG_TRUNC;
3175 		copied = len;
3176 	}
3177 	err = skb_copy_datagram_msg(skb, 0, msg, copied);
3178 	if (err)
3179 		goto out_free_skb;
3180 
3181 	sock_recv_timestamp(msg, sk, skb);
3182 
3183 	serr = SKB_EXT_ERR(skb);
3184 	put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
3185 
3186 	msg->msg_flags |= MSG_ERRQUEUE;
3187 	err = copied;
3188 
3189 out_free_skb:
3190 	kfree_skb(skb);
3191 out:
3192 	return err;
3193 }
3194 EXPORT_SYMBOL(sock_recv_errqueue);
3195 
/*
 *	Get a socket option on a socket.
 *
 *	FIX: POSIX 1003.1g is very ambiguous here. It states that
 *	asynchronous errors should be reported by getsockopt. We assume
 *	this means if you specify SO_ERROR (otherwise what's the point of it).
 *
 *	This simply forwards to the protocol's getsockopt handler and
 *	returns its result.
 */
int sock_common_getsockopt(struct socket *sock, int level, int optname,
			   char __user *optval, int __user *optlen)
{
	struct sock *sk = sock->sk;

	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
}
3210 EXPORT_SYMBOL(sock_common_getsockopt);
3211 
3212 int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
3213 			int flags)
3214 {
3215 	struct sock *sk = sock->sk;
3216 	int addr_len = 0;
3217 	int err;
3218 
3219 	err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
3220 				   flags & ~MSG_DONTWAIT, &addr_len);
3221 	if (err >= 0)
3222 		msg->msg_namelen = addr_len;
3223 	return err;
3224 }
3225 EXPORT_SYMBOL(sock_common_recvmsg);
3226 
/*
 *	Set socket options on an inet socket.
 *
 *	This simply forwards to the protocol's setsockopt handler and
 *	returns its result.
 */
int sock_common_setsockopt(struct socket *sock, int level, int optname,
			   sockptr_t optval, unsigned int optlen)
{
	struct sock *sk = sock->sk;

	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
}
3237 EXPORT_SYMBOL(sock_common_setsockopt);
3238 
/*
 * Common socket release path: run the protocol's destroy hook (if any),
 * unhash the socket, orphan it, free its xfrm policy and drop one
 * reference. The order of these steps is significant; see the comments
 * below.
 */
void sk_common_release(struct sock *sk)
{
	if (sk->sk_prot->destroy)
		sk->sk_prot->destroy(sk);

	/*
	 * Observation: when sk_common_release is called, processes have
	 * no access to socket. But net still has.
	 * Step one, detach it from networking:
	 *
	 * A. Remove from hash tables.
	 */

	sk->sk_prot->unhash(sk);

	/*
	 * In this point socket cannot receive new packets, but it is possible
	 * that some packets are in flight because some CPU runs receiver and
	 * did hash table lookup before we unhashed socket. They will achieve
	 * receive queue and will be purged by socket destructor.
	 *
	 * Also we still have packets pending on receive queue and probably,
	 * our own packets waiting in device queues. sock_destroy will drain
	 * receive queue, but transmitted packets will delay socket destruction
	 * until the last reference will be released.
	 */

	sock_orphan(sk);

	xfrm_sk_free_policy(sk);

	sk_refcnt_debug_release(sk);

	sock_put(sk);
}
3274 EXPORT_SYMBOL(sk_common_release);
3275 
/*
 * Fill @mem with a snapshot of the socket's memory accounting.
 * @mem must have room for SK_MEMINFO_VARS u32 entries; it is zeroed
 * first and then the entries indexed by SK_MEMINFO_* are written.
 */
void sk_get_meminfo(const struct sock *sk, u32 *mem)
{
	memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);

	mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
	mem[SK_MEMINFO_RCVBUF] = READ_ONCE(sk->sk_rcvbuf);
	mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
	mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf);
	mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc;
	mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued);
	mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
	mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len);
	mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
}
3290 
3291 #ifdef CONFIG_PROC_FS
/*
 * Size of the per-protocol in-use counter table and of the index bitmap
 * below.
 */
#define PROTO_INUSE_NR	64	/* should be enough for the first time */

/* Counter table, indexed by a protocol's inuse_idx. */
struct prot_inuse {
	int val[PROTO_INUSE_NR];
};

/* Bitmap of inuse_idx values currently handed out by assign_proto_idx(). */
static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
3298 
3299 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
3300 {
3301 	__this_cpu_add(net->core.prot_inuse->val[prot->inuse_idx], val);
3302 }
3303 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
3304 
3305 int sock_prot_inuse_get(struct net *net, struct proto *prot)
3306 {
3307 	int cpu, idx = prot->inuse_idx;
3308 	int res = 0;
3309 
3310 	for_each_possible_cpu(cpu)
3311 		res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx];
3312 
3313 	return res >= 0 ? res : 0;
3314 }
3315 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
3316 
3317 static void sock_inuse_add(struct net *net, int val)
3318 {
3319 	this_cpu_add(*net->core.sock_inuse, val);
3320 }
3321 
3322 int sock_inuse_get(struct net *net)
3323 {
3324 	int cpu, res = 0;
3325 
3326 	for_each_possible_cpu(cpu)
3327 		res += *per_cpu_ptr(net->core.sock_inuse, cpu);
3328 
3329 	return res;
3330 }
3331 
3332 EXPORT_SYMBOL_GPL(sock_inuse_get);
3333 
3334 static int __net_init sock_inuse_init_net(struct net *net)
3335 {
3336 	net->core.prot_inuse = alloc_percpu(struct prot_inuse);
3337 	if (net->core.prot_inuse == NULL)
3338 		return -ENOMEM;
3339 
3340 	net->core.sock_inuse = alloc_percpu(int);
3341 	if (net->core.sock_inuse == NULL)
3342 		goto out;
3343 
3344 	return 0;
3345 
3346 out:
3347 	free_percpu(net->core.prot_inuse);
3348 	return -ENOMEM;
3349 }
3350 
3351 static void __net_exit sock_inuse_exit_net(struct net *net)
3352 {
3353 	free_percpu(net->core.prot_inuse);
3354 	free_percpu(net->core.sock_inuse);
3355 }
3356 
3357 static struct pernet_operations net_inuse_ops = {
3358 	.init = sock_inuse_init_net,
3359 	.exit = sock_inuse_exit_net,
3360 };
3361 
3362 static __init int net_inuse_init(void)
3363 {
3364 	if (register_pernet_subsys(&net_inuse_ops))
3365 		panic("Cannot initialize net inuse counters");
3366 
3367 	return 0;
3368 }
3369 
3370 core_initcall(net_inuse_init);
3371 
3372 static int assign_proto_idx(struct proto *prot)
3373 {
3374 	prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
3375 
3376 	if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
3377 		pr_err("PROTO_INUSE_NR exhausted\n");
3378 		return -ENOSPC;
3379 	}
3380 
3381 	set_bit(prot->inuse_idx, proto_inuse_idx);
3382 	return 0;
3383 }
3384 
3385 static void release_proto_idx(struct proto *prot)
3386 {
3387 	if (prot->inuse_idx != PROTO_INUSE_NR - 1)
3388 		clear_bit(prot->inuse_idx, proto_inuse_idx);
3389 }
3390 #else
/*
 * Stubs used when CONFIG_PROC_FS is disabled: no in-use accounting is
 * kept, so index assignment always succeeds and the rest are no-ops.
 */
static inline int assign_proto_idx(struct proto *prot)
{
	/* Nothing to assign; report success. */
	return 0;
}

static inline void release_proto_idx(struct proto *prot)
{
	/* Nothing was assigned, nothing to release. */
}

static void sock_inuse_add(struct net *net, int val)
{
	/* No counters to update. */
}
3403 #endif
3404 
3405 static void tw_prot_cleanup(struct timewait_sock_ops *twsk_prot)
3406 {
3407 	if (!twsk_prot)
3408 		return;
3409 	kfree(twsk_prot->twsk_slab_name);
3410 	twsk_prot->twsk_slab_name = NULL;
3411 	kmem_cache_destroy(twsk_prot->twsk_slab);
3412 	twsk_prot->twsk_slab = NULL;
3413 }
3414 
3415 static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
3416 {
3417 	if (!rsk_prot)
3418 		return;
3419 	kfree(rsk_prot->slab_name);
3420 	rsk_prot->slab_name = NULL;
3421 	kmem_cache_destroy(rsk_prot->slab);
3422 	rsk_prot->slab = NULL;
3423 }
3424 
3425 static int req_prot_init(const struct proto *prot)
3426 {
3427 	struct request_sock_ops *rsk_prot = prot->rsk_prot;
3428 
3429 	if (!rsk_prot)
3430 		return 0;
3431 
3432 	rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
3433 					prot->name);
3434 	if (!rsk_prot->slab_name)
3435 		return -ENOMEM;
3436 
3437 	rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
3438 					   rsk_prot->obj_size, 0,
3439 					   SLAB_ACCOUNT | prot->slab_flags,
3440 					   NULL);
3441 
3442 	if (!rsk_prot->slab) {
3443 		pr_crit("%s: Can't create request sock SLAB cache!\n",
3444 			prot->name);
3445 		return -ENOMEM;
3446 	}
3447 	return 0;
3448 }
3449 
3450 int proto_register(struct proto *prot, int alloc_slab)
3451 {
3452 	int ret = -ENOBUFS;
3453 
3454 	if (alloc_slab) {
3455 		prot->slab = kmem_cache_create_usercopy(prot->name,
3456 					prot->obj_size, 0,
3457 					SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT |
3458 					prot->slab_flags,
3459 					prot->useroffset, prot->usersize,
3460 					NULL);
3461 
3462 		if (prot->slab == NULL) {
3463 			pr_crit("%s: Can't create sock SLAB cache!\n",
3464 				prot->name);
3465 			goto out;
3466 		}
3467 
3468 		if (req_prot_init(prot))
3469 			goto out_free_request_sock_slab;
3470 
3471 		if (prot->twsk_prot != NULL) {
3472 			prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);
3473 
3474 			if (prot->twsk_prot->twsk_slab_name == NULL)
3475 				goto out_free_request_sock_slab;
3476 
3477 			prot->twsk_prot->twsk_slab =
3478 				kmem_cache_create(prot->twsk_prot->twsk_slab_name,
3479 						  prot->twsk_prot->twsk_obj_size,
3480 						  0,
3481 						  SLAB_ACCOUNT |
3482 						  prot->slab_flags,
3483 						  NULL);
3484 			if (prot->twsk_prot->twsk_slab == NULL)
3485 				goto out_free_timewait_sock_slab;
3486 		}
3487 	}
3488 
3489 	mutex_lock(&proto_list_mutex);
3490 	ret = assign_proto_idx(prot);
3491 	if (ret) {
3492 		mutex_unlock(&proto_list_mutex);
3493 		goto out_free_timewait_sock_slab;
3494 	}
3495 	list_add(&prot->node, &proto_list);
3496 	mutex_unlock(&proto_list_mutex);
3497 	return ret;
3498 
3499 out_free_timewait_sock_slab:
3500 	if (alloc_slab && prot->twsk_prot)
3501 		tw_prot_cleanup(prot->twsk_prot);
3502 out_free_request_sock_slab:
3503 	if (alloc_slab) {
3504 		req_prot_cleanup(prot->rsk_prot);
3505 
3506 		kmem_cache_destroy(prot->slab);
3507 		prot->slab = NULL;
3508 	}
3509 out:
3510 	return ret;
3511 }
3512 EXPORT_SYMBOL(proto_register);
3513 
3514 void proto_unregister(struct proto *prot)
3515 {
3516 	mutex_lock(&proto_list_mutex);
3517 	release_proto_idx(prot);
3518 	list_del(&prot->node);
3519 	mutex_unlock(&proto_list_mutex);
3520 
3521 	kmem_cache_destroy(prot->slab);
3522 	prot->slab = NULL;
3523 
3524 	req_prot_cleanup(prot->rsk_prot);
3525 	tw_prot_cleanup(prot->twsk_prot);
3526 }
3527 EXPORT_SYMBOL(proto_unregister);
3528 
3529 int sock_load_diag_module(int family, int protocol)
3530 {
3531 	if (!protocol) {
3532 		if (!sock_is_registered(family))
3533 			return -ENOENT;
3534 
3535 		return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
3536 				      NETLINK_SOCK_DIAG, family);
3537 	}
3538 
3539 #ifdef CONFIG_INET
3540 	if (family == AF_INET &&
3541 	    protocol != IPPROTO_RAW &&
3542 	    protocol < MAX_INET_PROTOS &&
3543 	    !rcu_access_pointer(inet_protos[protocol]))
3544 		return -ENOENT;
3545 #endif
3546 
3547 	return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK,
3548 			      NETLINK_SOCK_DIAG, family, protocol);
3549 }
3550 EXPORT_SYMBOL(sock_load_diag_module);
3551 
3552 #ifdef CONFIG_PROC_FS
3553 static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
3554 	__acquires(proto_list_mutex)
3555 {
3556 	mutex_lock(&proto_list_mutex);
3557 	return seq_list_start_head(&proto_list, *pos);
3558 }
3559 
3560 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3561 {
3562 	return seq_list_next(v, &proto_list, pos);
3563 }
3564 
3565 static void proto_seq_stop(struct seq_file *seq, void *v)
3566 	__releases(proto_list_mutex)
3567 {
3568 	mutex_unlock(&proto_list_mutex);
3569 }
3570 
/* Map a method pointer to 'y' (set) or 'n' (NULL) for the proc table. */
static char proto_method_implemented(const void *method)
{
	if (method)
		return 'y';

	return 'n';
}
3575 static long sock_prot_memory_allocated(struct proto *proto)
3576 {
3577 	return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
3578 }
3579 
3580 static const char *sock_prot_memory_pressure(struct proto *proto)
3581 {
3582 	return proto->memory_pressure != NULL ?
3583 	proto_memory_pressure(proto) ? "yes" : "no" : "NI";
3584 }
3585 
3586 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
3587 {
3588 
3589 	seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
3590 			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
3591 		   proto->name,
3592 		   proto->obj_size,
3593 		   sock_prot_inuse_get(seq_file_net(seq), proto),
3594 		   sock_prot_memory_allocated(proto),
3595 		   sock_prot_memory_pressure(proto),
3596 		   proto->max_header,
3597 		   proto->slab == NULL ? "no" : "yes",
3598 		   module_name(proto->owner),
3599 		   proto_method_implemented(proto->close),
3600 		   proto_method_implemented(proto->connect),
3601 		   proto_method_implemented(proto->disconnect),
3602 		   proto_method_implemented(proto->accept),
3603 		   proto_method_implemented(proto->ioctl),
3604 		   proto_method_implemented(proto->init),
3605 		   proto_method_implemented(proto->destroy),
3606 		   proto_method_implemented(proto->shutdown),
3607 		   proto_method_implemented(proto->setsockopt),
3608 		   proto_method_implemented(proto->getsockopt),
3609 		   proto_method_implemented(proto->sendmsg),
3610 		   proto_method_implemented(proto->recvmsg),
3611 		   proto_method_implemented(proto->sendpage),
3612 		   proto_method_implemented(proto->bind),
3613 		   proto_method_implemented(proto->backlog_rcv),
3614 		   proto_method_implemented(proto->hash),
3615 		   proto_method_implemented(proto->unhash),
3616 		   proto_method_implemented(proto->get_port),
3617 		   proto_method_implemented(proto->enter_memory_pressure));
3618 }
3619 
3620 static int proto_seq_show(struct seq_file *seq, void *v)
3621 {
3622 	if (v == &proto_list)
3623 		seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
3624 			   "protocol",
3625 			   "size",
3626 			   "sockets",
3627 			   "memory",
3628 			   "press",
3629 			   "maxhdr",
3630 			   "slab",
3631 			   "module",
3632 			   "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
3633 	else
3634 		proto_seq_printf(seq, list_entry(v, struct proto, node));
3635 	return 0;
3636 }
3637 
3638 static const struct seq_operations proto_seq_ops = {
3639 	.start  = proto_seq_start,
3640 	.next   = proto_seq_next,
3641 	.stop   = proto_seq_stop,
3642 	.show   = proto_seq_show,
3643 };
3644 
3645 static __net_init int proto_init_net(struct net *net)
3646 {
3647 	if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops,
3648 			sizeof(struct seq_net_private)))
3649 		return -ENOMEM;
3650 
3651 	return 0;
3652 }
3653 
3654 static __net_exit void proto_exit_net(struct net *net)
3655 {
3656 	remove_proc_entry("protocols", net->proc_net);
3657 }
3658 
3659 
3660 static __net_initdata struct pernet_operations proto_net_ops = {
3661 	.init = proto_init_net,
3662 	.exit = proto_exit_net,
3663 };
3664 
3665 static int __init proto_init(void)
3666 {
3667 	return register_pernet_subsys(&proto_net_ops);
3668 }
3669 
3670 subsys_initcall(proto_init);
3671 
3672 #endif /* PROC_FS */
3673 
3674 #ifdef CONFIG_NET_RX_BUSY_POLL
3675 bool sk_busy_loop_end(void *p, unsigned long start_time)
3676 {
3677 	struct sock *sk = p;
3678 
3679 	return !skb_queue_empty_lockless(&sk->sk_receive_queue) ||
3680 	       sk_busy_loop_timeout(sk, start_time);
3681 }
3682 EXPORT_SYMBOL(sk_busy_loop_end);
3683 #endif /* CONFIG_NET_RX_BUSY_POLL */
3684 
3685 int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len)
3686 {
3687 	if (!sk->sk_prot->bind_add)
3688 		return -EOPNOTSUPP;
3689 	return sk->sk_prot->bind_add(sk, addr, addr_len);
3690 }
3691 EXPORT_SYMBOL(sock_bind_add);
3692