xref: /openbmc/linux/net/core/sock.c (revision e330fb14)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * INET		An implementation of the TCP/IP protocol suite for the LINUX
4  *		operating system.  INET is implemented using the  BSD Socket
5  *		interface as the means of communication with the user level.
6  *
7  *		Generic socket support routines. Memory allocators, socket lock/release
8  *		handler for protocols to use and generic option handler.
9  *
10  * Authors:	Ross Biro
11  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *		Florian La Roche, <flla@stud.uni-sb.de>
13  *		Alan Cox, <A.Cox@swansea.ac.uk>
14  *
15  * Fixes:
16  *		Alan Cox	: 	Numerous verify_area() problems
17  *		Alan Cox	:	Connecting on a connecting socket
18  *					now returns an error for tcp.
19  *		Alan Cox	:	sock->protocol is set correctly.
20  *					and is not sometimes left as 0.
21  *		Alan Cox	:	connect handles icmp errors on a
22  *					connect properly. Unfortunately there
23  *					is a restart syscall nasty there. I
24  *					can't match BSD without hacking the C
25  *					library. Ideas urgently sought!
26  *		Alan Cox	:	Disallow bind() to addresses that are
27  *					not ours - especially broadcast ones!!
28  *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
29  *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
30  *					instead they leave that for the DESTROY timer.
31  *		Alan Cox	:	Clean up error flag in accept
32  *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
33  *					was buggy. Put a remove_sock() in the handler
34  *					for memory when we hit 0. Also altered the timer
35  *					code. The ACK stuff can wait and needs major
36  *					TCP layer surgery.
37  *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
38  *					and fixed timer/inet_bh race.
39  *		Alan Cox	:	Added zapped flag for TCP
40  *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
41  *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
42  *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
43  *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
44  *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
45  *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
46  *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
47  *	Pauline Middelink	:	identd support
48  *		Alan Cox	:	Fixed connect() taking signals I think.
49  *		Alan Cox	:	SO_LINGER supported
50  *		Alan Cox	:	Error reporting fixes
51  *		Anonymous	:	inet_create tidied up (sk->reuse setting)
52  *		Alan Cox	:	inet sockets don't set sk->type!
53  *		Alan Cox	:	Split socket option code
54  *		Alan Cox	:	Callbacks
55  *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
56  *		Alex		:	Removed restriction on inet fioctl
57  *		Alan Cox	:	Splitting INET from NET core
58  *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
59  *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
60  *		Alan Cox	:	Split IP from generic code
61  *		Alan Cox	:	New kfree_skbmem()
62  *		Alan Cox	:	Make SO_DEBUG superuser only.
63  *		Alan Cox	:	Allow anyone to clear SO_DEBUG
64  *					(compatibility fix)
65  *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
66  *		Alan Cox	:	Allocator for a socket is settable.
67  *		Alan Cox	:	SO_ERROR includes soft errors.
68  *		Alan Cox	:	Allow NULL arguments on some SO_ opts
69  *		Alan Cox	: 	Generic socket allocation to make hooks
70  *					easier (suggested by Craig Metz).
71  *		Michael Pall	:	SO_ERROR returns positive errno again
72  *              Steve Whitehouse:       Added default destructor to free
73  *                                      protocol private data.
74  *              Steve Whitehouse:       Added various other default routines
75  *                                      common to several socket families.
76  *              Chris Evans     :       Call suser() check last on F_SETOWN
77  *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
78  *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
79  *		Andi Kleen	:	Fix write_space callback
80  *		Chris Evans	:	Security fixes - signedness again
81  *		Arnaldo C. Melo :       cleanups, use skb_queue_purge
82  *
83  * To Fix:
84  */
85 
86 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
87 
88 #include <asm/unaligned.h>
89 #include <linux/capability.h>
90 #include <linux/errno.h>
91 #include <linux/errqueue.h>
92 #include <linux/types.h>
93 #include <linux/socket.h>
94 #include <linux/in.h>
95 #include <linux/kernel.h>
96 #include <linux/module.h>
97 #include <linux/proc_fs.h>
98 #include <linux/seq_file.h>
99 #include <linux/sched.h>
100 #include <linux/sched/mm.h>
101 #include <linux/timer.h>
102 #include <linux/string.h>
103 #include <linux/sockios.h>
104 #include <linux/net.h>
105 #include <linux/mm.h>
106 #include <linux/slab.h>
107 #include <linux/interrupt.h>
108 #include <linux/poll.h>
109 #include <linux/tcp.h>
110 #include <linux/init.h>
111 #include <linux/highmem.h>
112 #include <linux/user_namespace.h>
113 #include <linux/static_key.h>
114 #include <linux/memcontrol.h>
115 #include <linux/prefetch.h>
116 #include <linux/compat.h>
117 
118 #include <linux/uaccess.h>
119 
120 #include <linux/netdevice.h>
121 #include <net/protocol.h>
122 #include <linux/skbuff.h>
123 #include <net/net_namespace.h>
124 #include <net/request_sock.h>
125 #include <net/sock.h>
126 #include <linux/net_tstamp.h>
127 #include <net/xfrm.h>
128 #include <linux/ipsec.h>
129 #include <net/cls_cgroup.h>
130 #include <net/netprio_cgroup.h>
131 #include <linux/sock_diag.h>
132 
133 #include <linux/filter.h>
134 #include <net/sock_reuseport.h>
135 #include <net/bpf_sk_storage.h>
136 
137 #include <trace/events/sock.h>
138 
139 #include <net/tcp.h>
140 #include <net/busy_poll.h>
141 
142 #include <linux/ethtool.h>
143 
144 static DEFINE_MUTEX(proto_list_mutex);
145 static LIST_HEAD(proto_list);
146 
147 static void sock_inuse_add(struct net *net, int val);
148 
149 /**
150  * sk_ns_capable - General socket capability test
151  * @sk: Socket to use a capability on or through
152  * @user_ns: The user namespace of the capability to use
153  * @cap: The capability to use
154  *
155  * Test to see if the opener of the socket had the capability @cap when
156  * the socket was created and that the current process has it in the
157  * user namespace @user_ns.
158  */
159 bool sk_ns_capable(const struct sock *sk,
160 		   struct user_namespace *user_ns, int cap)
161 {
162 	return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
163 		ns_capable(user_ns, cap);
164 }
165 EXPORT_SYMBOL(sk_ns_capable);
166 
167 /**
168  * sk_capable - Socket global capability test
169  * @sk: Socket to use a capability on or through
170  * @cap: The global capability to use
171  *
172  * Test to see if the opener of the socket had the capability @cap when
173  * the socket was created and that the current process has it in all
174  * user namespaces.
175  */
176 bool sk_capable(const struct sock *sk, int cap)
177 {
178 	return sk_ns_capable(sk, &init_user_ns, cap);
179 }
180 EXPORT_SYMBOL(sk_capable);
181 
182 /**
183  * sk_net_capable - Network namespace socket capability test
184  * @sk: Socket to use a capability on or through
185  * @cap: The capability to use
186  *
187  * Test to see if the opener of the socket had the capability @cap when the
188  * socket was created and that the current process has it over the network
189  * namespace the socket is a member of.
190  */
191 bool sk_net_capable(const struct sock *sk, int cap)
192 {
193 	return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
194 }
195 EXPORT_SYMBOL(sk_net_capable);
196 
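/*
 * Illustrative sketch, not part of the original file: a protocol that wants
 * to gate a privileged option on the socket opener as well as the current
 * task could combine the capability helpers above.  The function name below
 * is hypothetical.
 */
#if 0	/* example only */
static int example_set_privileged_opt(struct sock *sk, u32 prio)
{
	/* Requires CAP_NET_ADMIN over the socket's network namespace. */
	if (!sk_net_capable(sk, CAP_NET_ADMIN))
		return -EPERM;

	sk->sk_priority = prio;
	return 0;
}
#endif
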
197 /*
198  * Each address family might have different locking rules, so we have
199  * one slock key per address family and separate keys for internal and
200  * userspace sockets.
201  */
202 static struct lock_class_key af_family_keys[AF_MAX];
203 static struct lock_class_key af_family_kern_keys[AF_MAX];
204 static struct lock_class_key af_family_slock_keys[AF_MAX];
205 static struct lock_class_key af_family_kern_slock_keys[AF_MAX];
206 
207 /*
208  * Make lock validator output more readable. (We pre-construct these
209  * strings at build time, so that runtime initialization of socket
210  * locks is fast):
211  */
212 
213 #define _sock_locks(x)						  \
214   x "AF_UNSPEC",	x "AF_UNIX"     ,	x "AF_INET"     , \
215   x "AF_AX25"  ,	x "AF_IPX"      ,	x "AF_APPLETALK", \
216   x "AF_NETROM",	x "AF_BRIDGE"   ,	x "AF_ATMPVC"   , \
217   x "AF_X25"   ,	x "AF_INET6"    ,	x "AF_ROSE"     , \
218   x "AF_DECnet",	x "AF_NETBEUI"  ,	x "AF_SECURITY" , \
219   x "AF_KEY"   ,	x "AF_NETLINK"  ,	x "AF_PACKET"   , \
220   x "AF_ASH"   ,	x "AF_ECONET"   ,	x "AF_ATMSVC"   , \
221   x "AF_RDS"   ,	x "AF_SNA"      ,	x "AF_IRDA"     , \
222   x "AF_PPPOX" ,	x "AF_WANPIPE"  ,	x "AF_LLC"      , \
223   x "27"       ,	x "28"          ,	x "AF_CAN"      , \
224   x "AF_TIPC"  ,	x "AF_BLUETOOTH",	x "IUCV"        , \
225   x "AF_RXRPC" ,	x "AF_ISDN"     ,	x "AF_PHONET"   , \
226   x "AF_IEEE802154",	x "AF_CAIF"	,	x "AF_ALG"      , \
227   x "AF_NFC"   ,	x "AF_VSOCK"    ,	x "AF_KCM"      , \
228   x "AF_QIPCRTR",	x "AF_SMC"	,	x "AF_XDP"	, \
229   x "AF_MCTP"  , \
230   x "AF_MAX"
231 
232 static const char *const af_family_key_strings[AF_MAX+1] = {
233 	_sock_locks("sk_lock-")
234 };
235 static const char *const af_family_slock_key_strings[AF_MAX+1] = {
236 	_sock_locks("slock-")
237 };
238 static const char *const af_family_clock_key_strings[AF_MAX+1] = {
239 	_sock_locks("clock-")
240 };
241 
242 static const char *const af_family_kern_key_strings[AF_MAX+1] = {
243 	_sock_locks("k-sk_lock-")
244 };
245 static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
246 	_sock_locks("k-slock-")
247 };
248 static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
249 	_sock_locks("k-clock-")
250 };
251 static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
252 	_sock_locks("rlock-")
253 };
254 static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
255 	_sock_locks("wlock-")
256 };
257 static const char *const af_family_elock_key_strings[AF_MAX+1] = {
258 	_sock_locks("elock-")
259 };
260 
261 /*
262  * sk_callback_lock and sk queues locking rules are per-address-family,
263  * so split the lock classes by using a per-AF key:
264  */
265 static struct lock_class_key af_callback_keys[AF_MAX];
266 static struct lock_class_key af_rlock_keys[AF_MAX];
267 static struct lock_class_key af_wlock_keys[AF_MAX];
268 static struct lock_class_key af_elock_keys[AF_MAX];
269 static struct lock_class_key af_kern_callback_keys[AF_MAX];
270 
271 /* Run time adjustable parameters. */
272 __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
273 EXPORT_SYMBOL(sysctl_wmem_max);
274 __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
275 EXPORT_SYMBOL(sysctl_rmem_max);
276 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
277 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
278 
279 /* Maximal space eaten by iovec or ancillary data plus some space */
280 int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
281 EXPORT_SYMBOL(sysctl_optmem_max);
282 
283 int sysctl_tstamp_allow_data __read_mostly = 1;
284 
285 DEFINE_STATIC_KEY_FALSE(memalloc_socks_key);
286 EXPORT_SYMBOL_GPL(memalloc_socks_key);
287 
288 /**
289  * sk_set_memalloc - sets %SOCK_MEMALLOC
290  * @sk: socket to set it on
291  *
292  * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
293  * It's the responsibility of the admin to adjust min_free_kbytes
294  * to meet the requirements
295  */
296 void sk_set_memalloc(struct sock *sk)
297 {
298 	sock_set_flag(sk, SOCK_MEMALLOC);
299 	sk->sk_allocation |= __GFP_MEMALLOC;
300 	static_branch_inc(&memalloc_socks_key);
301 }
302 EXPORT_SYMBOL_GPL(sk_set_memalloc);
303 
304 void sk_clear_memalloc(struct sock *sk)
305 {
306 	sock_reset_flag(sk, SOCK_MEMALLOC);
307 	sk->sk_allocation &= ~__GFP_MEMALLOC;
308 	static_branch_dec(&memalloc_socks_key);
309 
310 	/*
311 	 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
312 	 * progress of swapping. SOCK_MEMALLOC may be cleared while
313 	 * it has rmem allocations due to the last swapfile being deactivated
314 	 * but there is a risk that the socket is unusable due to exceeding
315 	 * the rmem limits. Reclaim the reserves and obey rmem limits again.
316 	 */
317 	sk_mem_reclaim(sk);
318 }
319 EXPORT_SYMBOL_GPL(sk_clear_memalloc);
320 
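/*
 * Illustrative sketch, not part of the original file: a subsystem that
 * services memory reclaim over the network (e.g. swap over a network block
 * device) would mark its transport socket while the backing store is in use
 * and clear the flag when it is torn down.  Both function names below are
 * hypothetical.
 */
#if 0	/* example only */
static void example_backing_store_attach(struct sock *transport_sk)
{
	/* Allow this socket to dip into the emergency memory reserves. */
	sk_set_memalloc(transport_sk);
}

static void example_backing_store_detach(struct sock *transport_sk)
{
	/* Give the reserves back and obey the normal rmem limits again. */
	sk_clear_memalloc(transport_sk);
}
#endif
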
321 int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
322 {
323 	int ret;
324 	unsigned int noreclaim_flag;
325 
326 	/* these should have been dropped before queueing */
327 	BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
328 
329 	noreclaim_flag = memalloc_noreclaim_save();
330 	ret = sk->sk_backlog_rcv(sk, skb);
331 	memalloc_noreclaim_restore(noreclaim_flag);
332 
333 	return ret;
334 }
335 EXPORT_SYMBOL(__sk_backlog_rcv);
336 
337 void sk_error_report(struct sock *sk)
338 {
339 	sk->sk_error_report(sk);
340 
341 	switch (sk->sk_family) {
342 	case AF_INET:
343 		fallthrough;
344 	case AF_INET6:
345 		trace_inet_sk_error_report(sk);
346 		break;
347 	default:
348 		break;
349 	}
350 }
351 EXPORT_SYMBOL(sk_error_report);
352 
353 static int sock_get_timeout(long timeo, void *optval, bool old_timeval)
354 {
355 	struct __kernel_sock_timeval tv;
356 
357 	if (timeo == MAX_SCHEDULE_TIMEOUT) {
358 		tv.tv_sec = 0;
359 		tv.tv_usec = 0;
360 	} else {
361 		tv.tv_sec = timeo / HZ;
362 		tv.tv_usec = ((timeo % HZ) * USEC_PER_SEC) / HZ;
363 	}
364 
365 	if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
366 		struct old_timeval32 tv32 = { tv.tv_sec, tv.tv_usec };
367 		*(struct old_timeval32 *)optval = tv32;
368 		return sizeof(tv32);
369 	}
370 
371 	if (old_timeval) {
372 		struct __kernel_old_timeval old_tv;
373 		old_tv.tv_sec = tv.tv_sec;
374 		old_tv.tv_usec = tv.tv_usec;
375 		*(struct __kernel_old_timeval *)optval = old_tv;
376 		return sizeof(old_tv);
377 	}
378 
379 	*(struct __kernel_sock_timeval *)optval = tv;
380 	return sizeof(tv);
381 }
382 
383 static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen,
384 			    bool old_timeval)
385 {
386 	struct __kernel_sock_timeval tv;
387 
388 	if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
389 		struct old_timeval32 tv32;
390 
391 		if (optlen < sizeof(tv32))
392 			return -EINVAL;
393 
394 		if (copy_from_sockptr(&tv32, optval, sizeof(tv32)))
395 			return -EFAULT;
396 		tv.tv_sec = tv32.tv_sec;
397 		tv.tv_usec = tv32.tv_usec;
398 	} else if (old_timeval) {
399 		struct __kernel_old_timeval old_tv;
400 
401 		if (optlen < sizeof(old_tv))
402 			return -EINVAL;
403 		if (copy_from_sockptr(&old_tv, optval, sizeof(old_tv)))
404 			return -EFAULT;
405 		tv.tv_sec = old_tv.tv_sec;
406 		tv.tv_usec = old_tv.tv_usec;
407 	} else {
408 		if (optlen < sizeof(tv))
409 			return -EINVAL;
410 		if (copy_from_sockptr(&tv, optval, sizeof(tv)))
411 			return -EFAULT;
412 	}
413 	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
414 		return -EDOM;
415 
416 	if (tv.tv_sec < 0) {
417 		static int warned __read_mostly;
418 
419 		*timeo_p = 0;
420 		if (warned < 10 && net_ratelimit()) {
421 			warned++;
422 			pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
423 				__func__, current->comm, task_pid_nr(current));
424 		}
425 		return 0;
426 	}
427 	*timeo_p = MAX_SCHEDULE_TIMEOUT;
428 	if (tv.tv_sec == 0 && tv.tv_usec == 0)
429 		return 0;
430 	if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1))
431 		*timeo_p = tv.tv_sec * HZ + DIV_ROUND_UP((unsigned long)tv.tv_usec, USEC_PER_SEC / HZ);
432 	return 0;
433 }
434 
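/*
 * Worked example for the conversion above (illustrative, assuming HZ=250):
 * a userspace timeout of { .tv_sec = 2, .tv_usec = 500000 } becomes
 *
 *	2 * HZ + DIV_ROUND_UP(500000, USEC_PER_SEC / HZ)
 *	= 500 + DIV_ROUND_UP(500000, 4000)
 *	= 625 jiffies,
 *
 * while { 0, 0 } maps to MAX_SCHEDULE_TIMEOUT, i.e. "block forever".
 */
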
435 static bool sock_needs_netstamp(const struct sock *sk)
436 {
437 	switch (sk->sk_family) {
438 	case AF_UNSPEC:
439 	case AF_UNIX:
440 		return false;
441 	default:
442 		return true;
443 	}
444 }
445 
446 static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
447 {
448 	if (sk->sk_flags & flags) {
449 		sk->sk_flags &= ~flags;
450 		if (sock_needs_netstamp(sk) &&
451 		    !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
452 			net_disable_timestamp();
453 	}
454 }
455 
456 
457 int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
458 {
459 	unsigned long flags;
460 	struct sk_buff_head *list = &sk->sk_receive_queue;
461 
462 	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
463 		atomic_inc(&sk->sk_drops);
464 		trace_sock_rcvqueue_full(sk, skb);
465 		return -ENOMEM;
466 	}
467 
468 	if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
469 		atomic_inc(&sk->sk_drops);
470 		return -ENOBUFS;
471 	}
472 
473 	skb->dev = NULL;
474 	skb_set_owner_r(skb, sk);
475 
476 	/* We escape from the RCU-protected region, so make sure we don't
477 	 * leak a non-refcounted dst.
478 	 */
479 	skb_dst_force(skb);
480 
481 	spin_lock_irqsave(&list->lock, flags);
482 	sock_skb_set_dropcount(sk, skb);
483 	__skb_queue_tail(list, skb);
484 	spin_unlock_irqrestore(&list->lock, flags);
485 
486 	if (!sock_flag(sk, SOCK_DEAD))
487 		sk->sk_data_ready(sk);
488 	return 0;
489 }
490 EXPORT_SYMBOL(__sock_queue_rcv_skb);
491 
492 int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
493 {
494 	int err;
495 
496 	err = sk_filter(sk, skb);
497 	if (err)
498 		return err;
499 
500 	return __sock_queue_rcv_skb(sk, skb);
501 }
502 EXPORT_SYMBOL(sock_queue_rcv_skb);
503 
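/*
 * Illustrative sketch, not part of the original file: a typical protocol
 * receive handler hands the skb to sock_queue_rcv_skb() and is responsible
 * for freeing it when queueing fails.  The function name is hypothetical.
 */
#if 0	/* example only */
static int example_proto_rcv(struct sock *sk, struct sk_buff *skb)
{
	int err = sock_queue_rcv_skb(sk, skb);

	if (err < 0) {
		/* -ENOMEM/-ENOBUFS: rcvbuf or memory accounting exhausted */
		kfree_skb(skb);
		return err;
	}
	return 0;
}
#endif
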
504 int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
505 		     const int nested, unsigned int trim_cap, bool refcounted)
506 {
507 	int rc = NET_RX_SUCCESS;
508 
509 	if (sk_filter_trim_cap(sk, skb, trim_cap))
510 		goto discard_and_relse;
511 
512 	skb->dev = NULL;
513 
514 	if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
515 		atomic_inc(&sk->sk_drops);
516 		goto discard_and_relse;
517 	}
518 	if (nested)
519 		bh_lock_sock_nested(sk);
520 	else
521 		bh_lock_sock(sk);
522 	if (!sock_owned_by_user(sk)) {
523 		/*
524 		 * trylock + unlock semantics:
525 		 */
526 		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
527 
528 		rc = sk_backlog_rcv(sk, skb);
529 
530 		mutex_release(&sk->sk_lock.dep_map, _RET_IP_);
531 	} else if (sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf))) {
532 		bh_unlock_sock(sk);
533 		atomic_inc(&sk->sk_drops);
534 		goto discard_and_relse;
535 	}
536 
537 	bh_unlock_sock(sk);
538 out:
539 	if (refcounted)
540 		sock_put(sk);
541 	return rc;
542 discard_and_relse:
543 	kfree_skb(skb);
544 	goto out;
545 }
546 EXPORT_SYMBOL(__sk_receive_skb);
547 
548 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ip6_dst_check(struct dst_entry *,
549 							  u32));
550 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
551 							   u32));
552 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
553 {
554 	struct dst_entry *dst = __sk_dst_get(sk);
555 
556 	if (dst && dst->obsolete &&
557 	    INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
558 			       dst, cookie) == NULL) {
559 		sk_tx_queue_clear(sk);
560 		sk->sk_dst_pending_confirm = 0;
561 		RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
562 		dst_release(dst);
563 		return NULL;
564 	}
565 
566 	return dst;
567 }
568 EXPORT_SYMBOL(__sk_dst_check);
569 
570 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
571 {
572 	struct dst_entry *dst = sk_dst_get(sk);
573 
574 	if (dst && dst->obsolete &&
575 	    INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
576 			       dst, cookie) == NULL) {
577 		sk_dst_reset(sk);
578 		dst_release(dst);
579 		return NULL;
580 	}
581 
582 	return dst;
583 }
584 EXPORT_SYMBOL(sk_dst_check);
585 
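/*
 * Illustrative sketch, not part of the original file: a transmit path
 * revalidates its cached route with sk_dst_check() and falls back to a
 * fresh lookup when the cached dst has become obsolete.
 * example_route_output() stands in for the protocol's real lookup.
 */
#if 0	/* example only */
static struct dst_entry *example_tx_route(struct sock *sk, u32 cookie)
{
	struct dst_entry *dst = sk_dst_check(sk, cookie);

	if (!dst)
		dst = example_route_output(sk);	/* hypothetical re-lookup */

	return dst;
}
#endif
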
586 static int sock_bindtoindex_locked(struct sock *sk, int ifindex)
587 {
588 	int ret = -ENOPROTOOPT;
589 #ifdef CONFIG_NETDEVICES
590 	struct net *net = sock_net(sk);
591 
592 	/* Sorry... */
593 	ret = -EPERM;
594 	if (sk->sk_bound_dev_if && !ns_capable(net->user_ns, CAP_NET_RAW))
595 		goto out;
596 
597 	ret = -EINVAL;
598 	if (ifindex < 0)
599 		goto out;
600 
601 	sk->sk_bound_dev_if = ifindex;
602 	if (sk->sk_prot->rehash)
603 		sk->sk_prot->rehash(sk);
604 	sk_dst_reset(sk);
605 
606 	ret = 0;
607 
608 out:
609 #endif
610 
611 	return ret;
612 }
613 
614 int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk)
615 {
616 	int ret;
617 
618 	if (lock_sk)
619 		lock_sock(sk);
620 	ret = sock_bindtoindex_locked(sk, ifindex);
621 	if (lock_sk)
622 		release_sock(sk);
623 
624 	return ret;
625 }
626 EXPORT_SYMBOL(sock_bindtoindex);
627 
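/*
 * Illustrative sketch, not part of the original file: an in-kernel user that
 * does not already hold the socket lock binds its socket to an interface by
 * index and lets the helper take the lock itself (lock_sk == true).
 */
#if 0	/* example only */
static int example_bind_to_ifindex(struct sock *sk, int ifindex)
{
	return sock_bindtoindex(sk, ifindex, true);
}
#endif
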
628 static int sock_setbindtodevice(struct sock *sk, sockptr_t optval, int optlen)
629 {
630 	int ret = -ENOPROTOOPT;
631 #ifdef CONFIG_NETDEVICES
632 	struct net *net = sock_net(sk);
633 	char devname[IFNAMSIZ];
634 	int index;
635 
636 	ret = -EINVAL;
637 	if (optlen < 0)
638 		goto out;
639 
640 	/* Bind this socket to a particular device like "eth0",
641 	 * as specified in the passed interface name. If the
642 	 * name is "" or the option length is zero the socket
643 	 * is not bound.
644 	 */
645 	if (optlen > IFNAMSIZ - 1)
646 		optlen = IFNAMSIZ - 1;
647 	memset(devname, 0, sizeof(devname));
648 
649 	ret = -EFAULT;
650 	if (copy_from_sockptr(devname, optval, optlen))
651 		goto out;
652 
653 	index = 0;
654 	if (devname[0] != '\0') {
655 		struct net_device *dev;
656 
657 		rcu_read_lock();
658 		dev = dev_get_by_name_rcu(net, devname);
659 		if (dev)
660 			index = dev->ifindex;
661 		rcu_read_unlock();
662 		ret = -ENODEV;
663 		if (!dev)
664 			goto out;
665 	}
666 
667 	return sock_bindtoindex(sk, index, true);
668 out:
669 #endif
670 
671 	return ret;
672 }
673 
674 static int sock_getbindtodevice(struct sock *sk, char __user *optval,
675 				int __user *optlen, int len)
676 {
677 	int ret = -ENOPROTOOPT;
678 #ifdef CONFIG_NETDEVICES
679 	struct net *net = sock_net(sk);
680 	char devname[IFNAMSIZ];
681 
682 	if (sk->sk_bound_dev_if == 0) {
683 		len = 0;
684 		goto zero;
685 	}
686 
687 	ret = -EINVAL;
688 	if (len < IFNAMSIZ)
689 		goto out;
690 
691 	ret = netdev_get_name(net, devname, sk->sk_bound_dev_if);
692 	if (ret)
693 		goto out;
694 
695 	len = strlen(devname) + 1;
696 
697 	ret = -EFAULT;
698 	if (copy_to_user(optval, devname, len))
699 		goto out;
700 
701 zero:
702 	ret = -EFAULT;
703 	if (put_user(len, optlen))
704 		goto out;
705 
706 	ret = 0;
707 
708 out:
709 #endif
710 
711 	return ret;
712 }
713 
714 bool sk_mc_loop(struct sock *sk)
715 {
716 	if (dev_recursion_level())
717 		return false;
718 	if (!sk)
719 		return true;
720 	switch (sk->sk_family) {
721 	case AF_INET:
722 		return inet_sk(sk)->mc_loop;
723 #if IS_ENABLED(CONFIG_IPV6)
724 	case AF_INET6:
725 		return inet6_sk(sk)->mc_loop;
726 #endif
727 	}
728 	WARN_ON_ONCE(1);
729 	return true;
730 }
731 EXPORT_SYMBOL(sk_mc_loop);
732 
733 void sock_set_reuseaddr(struct sock *sk)
734 {
735 	lock_sock(sk);
736 	sk->sk_reuse = SK_CAN_REUSE;
737 	release_sock(sk);
738 }
739 EXPORT_SYMBOL(sock_set_reuseaddr);
740 
741 void sock_set_reuseport(struct sock *sk)
742 {
743 	lock_sock(sk);
744 	sk->sk_reuseport = true;
745 	release_sock(sk);
746 }
747 EXPORT_SYMBOL(sock_set_reuseport);
748 
749 void sock_no_linger(struct sock *sk)
750 {
751 	lock_sock(sk);
752 	sk->sk_lingertime = 0;
753 	sock_set_flag(sk, SOCK_LINGER);
754 	release_sock(sk);
755 }
756 EXPORT_SYMBOL(sock_no_linger);
757 
758 void sock_set_priority(struct sock *sk, u32 priority)
759 {
760 	lock_sock(sk);
761 	sk->sk_priority = priority;
762 	release_sock(sk);
763 }
764 EXPORT_SYMBOL(sock_set_priority);
765 
766 void sock_set_sndtimeo(struct sock *sk, s64 secs)
767 {
768 	lock_sock(sk);
769 	if (secs && secs < MAX_SCHEDULE_TIMEOUT / HZ - 1)
770 		sk->sk_sndtimeo = secs * HZ;
771 	else
772 		sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
773 	release_sock(sk);
774 }
775 EXPORT_SYMBOL(sock_set_sndtimeo);
776 
777 static void __sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns)
778 {
779 	if (val)  {
780 		sock_valbool_flag(sk, SOCK_TSTAMP_NEW, new);
781 		sock_valbool_flag(sk, SOCK_RCVTSTAMPNS, ns);
782 		sock_set_flag(sk, SOCK_RCVTSTAMP);
783 		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
784 	} else {
785 		sock_reset_flag(sk, SOCK_RCVTSTAMP);
786 		sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
787 	}
788 }
789 
790 void sock_enable_timestamps(struct sock *sk)
791 {
792 	lock_sock(sk);
793 	__sock_set_timestamps(sk, true, false, true);
794 	release_sock(sk);
795 }
796 EXPORT_SYMBOL(sock_enable_timestamps);
797 
798 void sock_set_timestamp(struct sock *sk, int optname, bool valbool)
799 {
800 	switch (optname) {
801 	case SO_TIMESTAMP_OLD:
802 		__sock_set_timestamps(sk, valbool, false, false);
803 		break;
804 	case SO_TIMESTAMP_NEW:
805 		__sock_set_timestamps(sk, valbool, true, false);
806 		break;
807 	case SO_TIMESTAMPNS_OLD:
808 		__sock_set_timestamps(sk, valbool, false, true);
809 		break;
810 	case SO_TIMESTAMPNS_NEW:
811 		__sock_set_timestamps(sk, valbool, true, true);
812 		break;
813 	}
814 }
815 
816 static int sock_timestamping_bind_phc(struct sock *sk, int phc_index)
817 {
818 	struct net *net = sock_net(sk);
819 	struct net_device *dev = NULL;
820 	bool match = false;
821 	int *vclock_index;
822 	int i, num;
823 
824 	if (sk->sk_bound_dev_if)
825 		dev = dev_get_by_index(net, sk->sk_bound_dev_if);
826 
827 	if (!dev) {
828 		pr_err("%s: socket is not bound to a device\n", __func__);
829 		return -EOPNOTSUPP;
830 	}
831 
832 	num = ethtool_get_phc_vclocks(dev, &vclock_index);
833 	for (i = 0; i < num; i++) {
834 		if (*(vclock_index + i) == phc_index) {
835 			match = true;
836 			break;
837 		}
838 	}
839 
840 	if (num > 0)
841 		kfree(vclock_index);
842 
843 	if (!match)
844 		return -EINVAL;
845 
846 	sk->sk_bind_phc = phc_index;
847 
848 	return 0;
849 }
850 
851 int sock_set_timestamping(struct sock *sk, int optname,
852 			  struct so_timestamping timestamping)
853 {
854 	int val = timestamping.flags;
855 	int ret;
856 
857 	if (val & ~SOF_TIMESTAMPING_MASK)
858 		return -EINVAL;
859 
860 	if (val & SOF_TIMESTAMPING_OPT_ID &&
861 	    !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
862 		if (sk->sk_protocol == IPPROTO_TCP &&
863 		    sk->sk_type == SOCK_STREAM) {
864 			if ((1 << sk->sk_state) &
865 			    (TCPF_CLOSE | TCPF_LISTEN))
866 				return -EINVAL;
867 			sk->sk_tskey = tcp_sk(sk)->snd_una;
868 		} else {
869 			sk->sk_tskey = 0;
870 		}
871 	}
872 
873 	if (val & SOF_TIMESTAMPING_OPT_STATS &&
874 	    !(val & SOF_TIMESTAMPING_OPT_TSONLY))
875 		return -EINVAL;
876 
877 	if (val & SOF_TIMESTAMPING_BIND_PHC) {
878 		ret = sock_timestamping_bind_phc(sk, timestamping.bind_phc);
879 		if (ret)
880 			return ret;
881 	}
882 
883 	sk->sk_tsflags = val;
884 	sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW);
885 
886 	if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
887 		sock_enable_timestamp(sk,
888 				      SOCK_TIMESTAMPING_RX_SOFTWARE);
889 	else
890 		sock_disable_timestamp(sk,
891 				       (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
892 	return 0;
893 }
894 
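/*
 * Illustrative userspace sketch, not part of the original file: requesting
 * hardware RX timestamps bound to a PHC vclock through the extended
 * struct so_timestamping handled above.  Assumes uapi headers that provide
 * SO_TIMESTAMPING and the SOF_TIMESTAMPING_* flags; the socket must already
 * be bound to the device that owns the vclock (see
 * sock_timestamping_bind_phc()).
 */
#if 0	/* example only */
#include <sys/socket.h>
#include <linux/net_tstamp.h>

static int example_enable_phc_timestamps(int fd, int phc_index)
{
	struct so_timestamping cfg = {
		.flags = SOF_TIMESTAMPING_RX_HARDWARE |
			 SOF_TIMESTAMPING_RAW_HARDWARE |
			 SOF_TIMESTAMPING_BIND_PHC,
		.bind_phc = phc_index,
	};

	return setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING, &cfg, sizeof(cfg));
}
#endif
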
895 void sock_set_keepalive(struct sock *sk)
896 {
897 	lock_sock(sk);
898 	if (sk->sk_prot->keepalive)
899 		sk->sk_prot->keepalive(sk, true);
900 	sock_valbool_flag(sk, SOCK_KEEPOPEN, true);
901 	release_sock(sk);
902 }
903 EXPORT_SYMBOL(sock_set_keepalive);
904 
905 static void __sock_set_rcvbuf(struct sock *sk, int val)
906 {
907 	/* Ensure val * 2 fits into an int, to prevent max_t() from treating it
908 	 * as a negative value.
909 	 */
910 	val = min_t(int, val, INT_MAX / 2);
911 	sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
912 
913 	/* We double it on the way in to account for "struct sk_buff" etc.
914 	 * overhead.   Applications assume that the SO_RCVBUF setting they make
915 	 * will allow that much actual data to be received on that socket.
916 	 *
917 	 * Applications are unaware that "struct sk_buff" and other overheads
918 	 * allocate from the receive buffer during socket buffer allocation.
919 	 *
920 	 * And after considering the possible alternatives, returning the value
921 	 * we actually used in getsockopt is the most desirable behavior.
922 	 */
923 	WRITE_ONCE(sk->sk_rcvbuf, max_t(int, val * 2, SOCK_MIN_RCVBUF));
924 }
925 
926 void sock_set_rcvbuf(struct sock *sk, int val)
927 {
928 	lock_sock(sk);
929 	__sock_set_rcvbuf(sk, val);
930 	release_sock(sk);
931 }
932 EXPORT_SYMBOL(sock_set_rcvbuf);
933 
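/*
 * Worked example (illustrative): sock_set_rcvbuf(sk, 65536) leaves
 * sk->sk_rcvbuf at 131072 because of the doubling above, and a later
 * getsockopt(SO_RCVBUF) reports that doubled value, which is the limit
 * actually enforced against the truesize of queued skbs.
 */
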
934 static void __sock_set_mark(struct sock *sk, u32 val)
935 {
936 	if (val != sk->sk_mark) {
937 		sk->sk_mark = val;
938 		sk_dst_reset(sk);
939 	}
940 }
941 
942 void sock_set_mark(struct sock *sk, u32 val)
943 {
944 	lock_sock(sk);
945 	__sock_set_mark(sk, val);
946 	release_sock(sk);
947 }
948 EXPORT_SYMBOL(sock_set_mark);
949 
950 static void sock_release_reserved_memory(struct sock *sk, int bytes)
951 {
952 	/* Round down bytes to multiple of pages */
953 	bytes &= ~(SK_MEM_QUANTUM - 1);
954 
955 	WARN_ON(bytes > sk->sk_reserved_mem);
956 	sk->sk_reserved_mem -= bytes;
957 	sk_mem_reclaim(sk);
958 }
959 
960 static int sock_reserve_memory(struct sock *sk, int bytes)
961 {
962 	long allocated;
963 	bool charged;
964 	int pages;
965 
966 	if (!mem_cgroup_sockets_enabled || !sk->sk_memcg)
967 		return -EOPNOTSUPP;
968 
969 	if (!bytes)
970 		return 0;
971 
972 	pages = sk_mem_pages(bytes);
973 
974 	/* pre-charge to memcg */
975 	charged = mem_cgroup_charge_skmem(sk->sk_memcg, pages,
976 					  GFP_KERNEL | __GFP_RETRY_MAYFAIL);
977 	if (!charged)
978 		return -ENOMEM;
979 
980 	/* pre-charge to forward_alloc */
981 	allocated = sk_memory_allocated_add(sk, pages);
982 	/* If the system goes into memory pressure with this
983 	 * precharge, give up and return error.
984 	 */
985 	if (allocated > sk_prot_mem_limits(sk, 1)) {
986 		sk_memory_allocated_sub(sk, pages);
987 		mem_cgroup_uncharge_skmem(sk->sk_memcg, pages);
988 		return -ENOMEM;
989 	}
990 	sk->sk_forward_alloc += pages << SK_MEM_QUANTUM_SHIFT;
991 
992 	sk->sk_reserved_mem += pages << SK_MEM_QUANTUM_SHIFT;
993 
994 	return 0;
995 }
996 
997 /*
998  *	This is meant for all protocols to use and covers goings on
999  *	at the socket level. Everything here is generic.
1000  */
1001 
1002 int sock_setsockopt(struct socket *sock, int level, int optname,
1003 		    sockptr_t optval, unsigned int optlen)
1004 {
1005 	struct so_timestamping timestamping;
1006 	struct sock_txtime sk_txtime;
1007 	struct sock *sk = sock->sk;
1008 	int val;
1009 	int valbool;
1010 	struct linger ling;
1011 	int ret = 0;
1012 
1013 	/*
1014 	 *	Options without arguments
1015 	 */
1016 
1017 	if (optname == SO_BINDTODEVICE)
1018 		return sock_setbindtodevice(sk, optval, optlen);
1019 
1020 	if (optlen < sizeof(int))
1021 		return -EINVAL;
1022 
1023 	if (copy_from_sockptr(&val, optval, sizeof(val)))
1024 		return -EFAULT;
1025 
1026 	valbool = val ? 1 : 0;
1027 
1028 	lock_sock(sk);
1029 
1030 	switch (optname) {
1031 	case SO_DEBUG:
1032 		if (val && !capable(CAP_NET_ADMIN))
1033 			ret = -EACCES;
1034 		else
1035 			sock_valbool_flag(sk, SOCK_DBG, valbool);
1036 		break;
1037 	case SO_REUSEADDR:
1038 		sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
1039 		break;
1040 	case SO_REUSEPORT:
1041 		sk->sk_reuseport = valbool;
1042 		break;
1043 	case SO_TYPE:
1044 	case SO_PROTOCOL:
1045 	case SO_DOMAIN:
1046 	case SO_ERROR:
1047 		ret = -ENOPROTOOPT;
1048 		break;
1049 	case SO_DONTROUTE:
1050 		sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
1051 		sk_dst_reset(sk);
1052 		break;
1053 	case SO_BROADCAST:
1054 		sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
1055 		break;
1056 	case SO_SNDBUF:
1057 		/* Don't return an error on this; BSD doesn't, and if you
1058 		 * think about it, this is right. Otherwise apps have to
1059 		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
1060 		 * are treated in BSD as hints.
1061 		 */
1062 		val = min_t(u32, val, sysctl_wmem_max);
1063 set_sndbuf:
1064 		/* Ensure val * 2 fits into an int, to prevent max_t()
1065 		 * from treating it as a negative value.
1066 		 */
1067 		val = min_t(int, val, INT_MAX / 2);
1068 		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
1069 		WRITE_ONCE(sk->sk_sndbuf,
1070 			   max_t(int, val * 2, SOCK_MIN_SNDBUF));
1071 		/* Wake up sending tasks if we upped the value. */
1072 		sk->sk_write_space(sk);
1073 		break;
1074 
1075 	case SO_SNDBUFFORCE:
1076 		if (!capable(CAP_NET_ADMIN)) {
1077 			ret = -EPERM;
1078 			break;
1079 		}
1080 
1081 		/* No negative values (to prevent underflow, as val will be
1082 		 * multiplied by 2).
1083 		 */
1084 		if (val < 0)
1085 			val = 0;
1086 		goto set_sndbuf;
1087 
1088 	case SO_RCVBUF:
1089 		/* Don't return an error on this; BSD doesn't, and if you
1090 		 * think about it, this is right. Otherwise apps have to
1091 		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
1092 		 * are treated in BSD as hints.
1093 		 */
1094 		__sock_set_rcvbuf(sk, min_t(u32, val, sysctl_rmem_max));
1095 		break;
1096 
1097 	case SO_RCVBUFFORCE:
1098 		if (!capable(CAP_NET_ADMIN)) {
1099 			ret = -EPERM;
1100 			break;
1101 		}
1102 
1103 		/* No negative values (to prevent underflow, as val will be
1104 		 * multiplied by 2).
1105 		 */
1106 		__sock_set_rcvbuf(sk, max(val, 0));
1107 		break;
1108 
1109 	case SO_KEEPALIVE:
1110 		if (sk->sk_prot->keepalive)
1111 			sk->sk_prot->keepalive(sk, valbool);
1112 		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
1113 		break;
1114 
1115 	case SO_OOBINLINE:
1116 		sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
1117 		break;
1118 
1119 	case SO_NO_CHECK:
1120 		sk->sk_no_check_tx = valbool;
1121 		break;
1122 
1123 	case SO_PRIORITY:
1124 		if ((val >= 0 && val <= 6) ||
1125 		    ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
1126 			sk->sk_priority = val;
1127 		else
1128 			ret = -EPERM;
1129 		break;
1130 
1131 	case SO_LINGER:
1132 		if (optlen < sizeof(ling)) {
1133 			ret = -EINVAL;	/* 1003.1g */
1134 			break;
1135 		}
1136 		if (copy_from_sockptr(&ling, optval, sizeof(ling))) {
1137 			ret = -EFAULT;
1138 			break;
1139 		}
1140 		if (!ling.l_onoff)
1141 			sock_reset_flag(sk, SOCK_LINGER);
1142 		else {
1143 #if (BITS_PER_LONG == 32)
1144 			if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
1145 				sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
1146 			else
1147 #endif
1148 				sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
1149 			sock_set_flag(sk, SOCK_LINGER);
1150 		}
1151 		break;
1152 
1153 	case SO_BSDCOMPAT:
1154 		break;
1155 
1156 	case SO_PASSCRED:
1157 		if (valbool)
1158 			set_bit(SOCK_PASSCRED, &sock->flags);
1159 		else
1160 			clear_bit(SOCK_PASSCRED, &sock->flags);
1161 		break;
1162 
1163 	case SO_TIMESTAMP_OLD:
1164 	case SO_TIMESTAMP_NEW:
1165 	case SO_TIMESTAMPNS_OLD:
1166 	case SO_TIMESTAMPNS_NEW:
1167 		sock_set_timestamp(sk, optname, valbool);
1168 		break;
1169 
1170 	case SO_TIMESTAMPING_NEW:
1171 	case SO_TIMESTAMPING_OLD:
1172 		if (optlen == sizeof(timestamping)) {
1173 			if (copy_from_sockptr(&timestamping, optval,
1174 					      sizeof(timestamping))) {
1175 				ret = -EFAULT;
1176 				break;
1177 			}
1178 		} else {
1179 			memset(&timestamping, 0, sizeof(timestamping));
1180 			timestamping.flags = val;
1181 		}
1182 		ret = sock_set_timestamping(sk, optname, timestamping);
1183 		break;
1184 
1185 	case SO_RCVLOWAT:
1186 		if (val < 0)
1187 			val = INT_MAX;
1188 		if (sock->ops->set_rcvlowat)
1189 			ret = sock->ops->set_rcvlowat(sk, val);
1190 		else
1191 			WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
1192 		break;
1193 
1194 	case SO_RCVTIMEO_OLD:
1195 	case SO_RCVTIMEO_NEW:
1196 		ret = sock_set_timeout(&sk->sk_rcvtimeo, optval,
1197 				       optlen, optname == SO_RCVTIMEO_OLD);
1198 		break;
1199 
1200 	case SO_SNDTIMEO_OLD:
1201 	case SO_SNDTIMEO_NEW:
1202 		ret = sock_set_timeout(&sk->sk_sndtimeo, optval,
1203 				       optlen, optname == SO_SNDTIMEO_OLD);
1204 		break;
1205 
1206 	case SO_ATTACH_FILTER: {
1207 		struct sock_fprog fprog;
1208 
1209 		ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1210 		if (!ret)
1211 			ret = sk_attach_filter(&fprog, sk);
1212 		break;
1213 	}
1214 	case SO_ATTACH_BPF:
1215 		ret = -EINVAL;
1216 		if (optlen == sizeof(u32)) {
1217 			u32 ufd;
1218 
1219 			ret = -EFAULT;
1220 			if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1221 				break;
1222 
1223 			ret = sk_attach_bpf(ufd, sk);
1224 		}
1225 		break;
1226 
1227 	case SO_ATTACH_REUSEPORT_CBPF: {
1228 		struct sock_fprog fprog;
1229 
1230 		ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1231 		if (!ret)
1232 			ret = sk_reuseport_attach_filter(&fprog, sk);
1233 		break;
1234 	}
1235 	case SO_ATTACH_REUSEPORT_EBPF:
1236 		ret = -EINVAL;
1237 		if (optlen == sizeof(u32)) {
1238 			u32 ufd;
1239 
1240 			ret = -EFAULT;
1241 			if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1242 				break;
1243 
1244 			ret = sk_reuseport_attach_bpf(ufd, sk);
1245 		}
1246 		break;
1247 
1248 	case SO_DETACH_REUSEPORT_BPF:
1249 		ret = reuseport_detach_prog(sk);
1250 		break;
1251 
1252 	case SO_DETACH_FILTER:
1253 		ret = sk_detach_filter(sk);
1254 		break;
1255 
1256 	case SO_LOCK_FILTER:
1257 		if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
1258 			ret = -EPERM;
1259 		else
1260 			sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
1261 		break;
1262 
1263 	case SO_PASSSEC:
1264 		if (valbool)
1265 			set_bit(SOCK_PASSSEC, &sock->flags);
1266 		else
1267 			clear_bit(SOCK_PASSSEC, &sock->flags);
1268 		break;
1269 	case SO_MARK:
1270 		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1271 			ret = -EPERM;
1272 			break;
1273 		}
1274 
1275 		__sock_set_mark(sk, val);
1276 		break;
1277 
1278 	case SO_RXQ_OVFL:
1279 		sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
1280 		break;
1281 
1282 	case SO_WIFI_STATUS:
1283 		sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
1284 		break;
1285 
1286 	case SO_PEEK_OFF:
1287 		if (sock->ops->set_peek_off)
1288 			ret = sock->ops->set_peek_off(sk, val);
1289 		else
1290 			ret = -EOPNOTSUPP;
1291 		break;
1292 
1293 	case SO_NOFCS:
1294 		sock_valbool_flag(sk, SOCK_NOFCS, valbool);
1295 		break;
1296 
1297 	case SO_SELECT_ERR_QUEUE:
1298 		sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
1299 		break;
1300 
1301 #ifdef CONFIG_NET_RX_BUSY_POLL
1302 	case SO_BUSY_POLL:
1303 		/* allow unprivileged users to decrease the value */
1304 		if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
1305 			ret = -EPERM;
1306 		else {
1307 			if (val < 0)
1308 				ret = -EINVAL;
1309 			else
1310 				WRITE_ONCE(sk->sk_ll_usec, val);
1311 		}
1312 		break;
1313 	case SO_PREFER_BUSY_POLL:
1314 		if (valbool && !capable(CAP_NET_ADMIN))
1315 			ret = -EPERM;
1316 		else
1317 			WRITE_ONCE(sk->sk_prefer_busy_poll, valbool);
1318 		break;
1319 	case SO_BUSY_POLL_BUDGET:
1320 		if (val > READ_ONCE(sk->sk_busy_poll_budget) && !capable(CAP_NET_ADMIN)) {
1321 			ret = -EPERM;
1322 		} else {
1323 			if (val < 0 || val > U16_MAX)
1324 				ret = -EINVAL;
1325 			else
1326 				WRITE_ONCE(sk->sk_busy_poll_budget, val);
1327 		}
1328 		break;
1329 #endif
1330 
1331 	case SO_MAX_PACING_RATE:
1332 		{
1333 		unsigned long ulval = (val == ~0U) ? ~0UL : (unsigned int)val;
1334 
1335 		if (sizeof(ulval) != sizeof(val) &&
1336 		    optlen >= sizeof(ulval) &&
1337 		    copy_from_sockptr(&ulval, optval, sizeof(ulval))) {
1338 			ret = -EFAULT;
1339 			break;
1340 		}
1341 		if (ulval != ~0UL)
1342 			cmpxchg(&sk->sk_pacing_status,
1343 				SK_PACING_NONE,
1344 				SK_PACING_NEEDED);
1345 		sk->sk_max_pacing_rate = ulval;
1346 		sk->sk_pacing_rate = min(sk->sk_pacing_rate, ulval);
1347 		break;
1348 		}
1349 	case SO_INCOMING_CPU:
1350 		WRITE_ONCE(sk->sk_incoming_cpu, val);
1351 		break;
1352 
1353 	case SO_CNX_ADVICE:
1354 		if (val == 1)
1355 			dst_negative_advice(sk);
1356 		break;
1357 
1358 	case SO_ZEROCOPY:
1359 		if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
1360 			if (!((sk->sk_type == SOCK_STREAM &&
1361 			       sk->sk_protocol == IPPROTO_TCP) ||
1362 			      (sk->sk_type == SOCK_DGRAM &&
1363 			       sk->sk_protocol == IPPROTO_UDP)))
1364 				ret = -ENOTSUPP;
1365 		} else if (sk->sk_family != PF_RDS) {
1366 			ret = -ENOTSUPP;
1367 		}
1368 		if (!ret) {
1369 			if (val < 0 || val > 1)
1370 				ret = -EINVAL;
1371 			else
1372 				sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
1373 		}
1374 		break;
1375 
1376 	case SO_TXTIME:
1377 		if (optlen != sizeof(struct sock_txtime)) {
1378 			ret = -EINVAL;
1379 			break;
1380 		} else if (copy_from_sockptr(&sk_txtime, optval,
1381 			   sizeof(struct sock_txtime))) {
1382 			ret = -EFAULT;
1383 			break;
1384 		} else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) {
1385 			ret = -EINVAL;
1386 			break;
1387 		}
1388 		/* CLOCK_MONOTONIC is only used by sch_fq, and this packet
1389 		 * scheduler has enough safeguards.
1390 		 */
1391 		if (sk_txtime.clockid != CLOCK_MONOTONIC &&
1392 		    !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1393 			ret = -EPERM;
1394 			break;
1395 		}
1396 		sock_valbool_flag(sk, SOCK_TXTIME, true);
1397 		sk->sk_clockid = sk_txtime.clockid;
1398 		sk->sk_txtime_deadline_mode =
1399 			!!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE);
1400 		sk->sk_txtime_report_errors =
1401 			!!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS);
1402 		break;
1403 
1404 	case SO_BINDTOIFINDEX:
1405 		ret = sock_bindtoindex_locked(sk, val);
1406 		break;
1407 
1408 	case SO_BUF_LOCK:
1409 		if (val & ~SOCK_BUF_LOCK_MASK) {
1410 			ret = -EINVAL;
1411 			break;
1412 		}
1413 		sk->sk_userlocks = val | (sk->sk_userlocks &
1414 					  ~SOCK_BUF_LOCK_MASK);
1415 		break;
1416 
1417 	case SO_RESERVE_MEM:
1418 	{
1419 		int delta;
1420 
1421 		if (val < 0) {
1422 			ret = -EINVAL;
1423 			break;
1424 		}
1425 
1426 		delta = val - sk->sk_reserved_mem;
1427 		if (delta < 0)
1428 			sock_release_reserved_memory(sk, -delta);
1429 		else
1430 			ret = sock_reserve_memory(sk, delta);
1431 		break;
1432 	}
1433 
1434 	default:
1435 		ret = -ENOPROTOOPT;
1436 		break;
1437 	}
1438 	release_sock(sk);
1439 	return ret;
1440 }
1441 EXPORT_SYMBOL(sock_setsockopt);
1442 
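/*
 * Illustrative userspace sketch, not part of the original file: enabling
 * SO_TXTIME with CLOCK_MONOTONIC, which the handler above accepts without
 * CAP_NET_ADMIN because only the fq packet scheduler consumes that clock.
 * Assumes uapi headers that provide SO_TXTIME and struct sock_txtime.
 */
#if 0	/* example only */
#include <sys/socket.h>
#include <linux/net_tstamp.h>
#include <time.h>

static int example_enable_txtime(int fd)
{
	struct sock_txtime cfg = {
		.clockid = CLOCK_MONOTONIC,
		.flags	 = SOF_TXTIME_REPORT_ERRORS,
	};

	return setsockopt(fd, SOL_SOCKET, SO_TXTIME, &cfg, sizeof(cfg));
}
#endif
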
1443 static const struct cred *sk_get_peer_cred(struct sock *sk)
1444 {
1445 	const struct cred *cred;
1446 
1447 	spin_lock(&sk->sk_peer_lock);
1448 	cred = get_cred(sk->sk_peer_cred);
1449 	spin_unlock(&sk->sk_peer_lock);
1450 
1451 	return cred;
1452 }
1453 
1454 static void cred_to_ucred(struct pid *pid, const struct cred *cred,
1455 			  struct ucred *ucred)
1456 {
1457 	ucred->pid = pid_vnr(pid);
1458 	ucred->uid = ucred->gid = -1;
1459 	if (cred) {
1460 		struct user_namespace *current_ns = current_user_ns();
1461 
1462 		ucred->uid = from_kuid_munged(current_ns, cred->euid);
1463 		ucred->gid = from_kgid_munged(current_ns, cred->egid);
1464 	}
1465 }
1466 
1467 static int groups_to_user(gid_t __user *dst, const struct group_info *src)
1468 {
1469 	struct user_namespace *user_ns = current_user_ns();
1470 	int i;
1471 
1472 	for (i = 0; i < src->ngroups; i++)
1473 		if (put_user(from_kgid_munged(user_ns, src->gid[i]), dst + i))
1474 			return -EFAULT;
1475 
1476 	return 0;
1477 }
1478 
1479 int sock_getsockopt(struct socket *sock, int level, int optname,
1480 		    char __user *optval, int __user *optlen)
1481 {
1482 	struct sock *sk = sock->sk;
1483 
1484 	union {
1485 		int val;
1486 		u64 val64;
1487 		unsigned long ulval;
1488 		struct linger ling;
1489 		struct old_timeval32 tm32;
1490 		struct __kernel_old_timeval tm;
1491 		struct  __kernel_sock_timeval stm;
1492 		struct sock_txtime txtime;
1493 		struct so_timestamping timestamping;
1494 	} v;
1495 
1496 	int lv = sizeof(int);
1497 	int len;
1498 
1499 	if (get_user(len, optlen))
1500 		return -EFAULT;
1501 	if (len < 0)
1502 		return -EINVAL;
1503 
1504 	memset(&v, 0, sizeof(v));
1505 
1506 	switch (optname) {
1507 	case SO_DEBUG:
1508 		v.val = sock_flag(sk, SOCK_DBG);
1509 		break;
1510 
1511 	case SO_DONTROUTE:
1512 		v.val = sock_flag(sk, SOCK_LOCALROUTE);
1513 		break;
1514 
1515 	case SO_BROADCAST:
1516 		v.val = sock_flag(sk, SOCK_BROADCAST);
1517 		break;
1518 
1519 	case SO_SNDBUF:
1520 		v.val = sk->sk_sndbuf;
1521 		break;
1522 
1523 	case SO_RCVBUF:
1524 		v.val = sk->sk_rcvbuf;
1525 		break;
1526 
1527 	case SO_REUSEADDR:
1528 		v.val = sk->sk_reuse;
1529 		break;
1530 
1531 	case SO_REUSEPORT:
1532 		v.val = sk->sk_reuseport;
1533 		break;
1534 
1535 	case SO_KEEPALIVE:
1536 		v.val = sock_flag(sk, SOCK_KEEPOPEN);
1537 		break;
1538 
1539 	case SO_TYPE:
1540 		v.val = sk->sk_type;
1541 		break;
1542 
1543 	case SO_PROTOCOL:
1544 		v.val = sk->sk_protocol;
1545 		break;
1546 
1547 	case SO_DOMAIN:
1548 		v.val = sk->sk_family;
1549 		break;
1550 
1551 	case SO_ERROR:
1552 		v.val = -sock_error(sk);
1553 		if (v.val == 0)
1554 			v.val = xchg(&sk->sk_err_soft, 0);
1555 		break;
1556 
1557 	case SO_OOBINLINE:
1558 		v.val = sock_flag(sk, SOCK_URGINLINE);
1559 		break;
1560 
1561 	case SO_NO_CHECK:
1562 		v.val = sk->sk_no_check_tx;
1563 		break;
1564 
1565 	case SO_PRIORITY:
1566 		v.val = sk->sk_priority;
1567 		break;
1568 
1569 	case SO_LINGER:
1570 		lv		= sizeof(v.ling);
1571 		v.ling.l_onoff	= sock_flag(sk, SOCK_LINGER);
1572 		v.ling.l_linger	= sk->sk_lingertime / HZ;
1573 		break;
1574 
1575 	case SO_BSDCOMPAT:
1576 		break;
1577 
1578 	case SO_TIMESTAMP_OLD:
1579 		v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1580 				!sock_flag(sk, SOCK_TSTAMP_NEW) &&
1581 				!sock_flag(sk, SOCK_RCVTSTAMPNS);
1582 		break;
1583 
1584 	case SO_TIMESTAMPNS_OLD:
1585 		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && !sock_flag(sk, SOCK_TSTAMP_NEW);
1586 		break;
1587 
1588 	case SO_TIMESTAMP_NEW:
1589 		v.val = sock_flag(sk, SOCK_RCVTSTAMP) && sock_flag(sk, SOCK_TSTAMP_NEW);
1590 		break;
1591 
1592 	case SO_TIMESTAMPNS_NEW:
1593 		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && sock_flag(sk, SOCK_TSTAMP_NEW);
1594 		break;
1595 
1596 	case SO_TIMESTAMPING_OLD:
1597 		lv = sizeof(v.timestamping);
1598 		v.timestamping.flags = sk->sk_tsflags;
1599 		v.timestamping.bind_phc = sk->sk_bind_phc;
1600 		break;
1601 
1602 	case SO_RCVTIMEO_OLD:
1603 	case SO_RCVTIMEO_NEW:
1604 		lv = sock_get_timeout(sk->sk_rcvtimeo, &v, SO_RCVTIMEO_OLD == optname);
1605 		break;
1606 
1607 	case SO_SNDTIMEO_OLD:
1608 	case SO_SNDTIMEO_NEW:
1609 		lv = sock_get_timeout(sk->sk_sndtimeo, &v, SO_SNDTIMEO_OLD == optname);
1610 		break;
1611 
1612 	case SO_RCVLOWAT:
1613 		v.val = sk->sk_rcvlowat;
1614 		break;
1615 
1616 	case SO_SNDLOWAT:
1617 		v.val = 1;
1618 		break;
1619 
1620 	case SO_PASSCRED:
1621 		v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1622 		break;
1623 
1624 	case SO_PEERCRED:
1625 	{
1626 		struct ucred peercred;
1627 		if (len > sizeof(peercred))
1628 			len = sizeof(peercred);
1629 
1630 		spin_lock(&sk->sk_peer_lock);
1631 		cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1632 		spin_unlock(&sk->sk_peer_lock);
1633 
1634 		if (copy_to_user(optval, &peercred, len))
1635 			return -EFAULT;
1636 		goto lenout;
1637 	}
1638 
1639 	case SO_PEERGROUPS:
1640 	{
1641 		const struct cred *cred;
1642 		int ret, n;
1643 
1644 		cred = sk_get_peer_cred(sk);
1645 		if (!cred)
1646 			return -ENODATA;
1647 
1648 		n = cred->group_info->ngroups;
1649 		if (len < n * sizeof(gid_t)) {
1650 			len = n * sizeof(gid_t);
1651 			put_cred(cred);
1652 			return put_user(len, optlen) ? -EFAULT : -ERANGE;
1653 		}
1654 		len = n * sizeof(gid_t);
1655 
1656 		ret = groups_to_user((gid_t __user *)optval, cred->group_info);
1657 		put_cred(cred);
1658 		if (ret)
1659 			return ret;
1660 		goto lenout;
1661 	}
1662 
1663 	case SO_PEERNAME:
1664 	{
1665 		char address[128];
1666 
1667 		lv = sock->ops->getname(sock, (struct sockaddr *)address, 2);
1668 		if (lv < 0)
1669 			return -ENOTCONN;
1670 		if (lv < len)
1671 			return -EINVAL;
1672 		if (copy_to_user(optval, address, len))
1673 			return -EFAULT;
1674 		goto lenout;
1675 	}
1676 
1677 	/* Dubious BSD thing... Probably nobody even uses it, but
1678 	 * the UNIX standard wants it for whatever reason... -DaveM
1679 	 */
1680 	case SO_ACCEPTCONN:
1681 		v.val = sk->sk_state == TCP_LISTEN;
1682 		break;
1683 
1684 	case SO_PASSSEC:
1685 		v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1686 		break;
1687 
1688 	case SO_PEERSEC:
1689 		return security_socket_getpeersec_stream(sock, optval, optlen, len);
1690 
1691 	case SO_MARK:
1692 		v.val = sk->sk_mark;
1693 		break;
1694 
1695 	case SO_RXQ_OVFL:
1696 		v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1697 		break;
1698 
1699 	case SO_WIFI_STATUS:
1700 		v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1701 		break;
1702 
1703 	case SO_PEEK_OFF:
1704 		if (!sock->ops->set_peek_off)
1705 			return -EOPNOTSUPP;
1706 
1707 		v.val = sk->sk_peek_off;
1708 		break;
1709 	case SO_NOFCS:
1710 		v.val = sock_flag(sk, SOCK_NOFCS);
1711 		break;
1712 
1713 	case SO_BINDTODEVICE:
1714 		return sock_getbindtodevice(sk, optval, optlen, len);
1715 
1716 	case SO_GET_FILTER:
1717 		len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
1718 		if (len < 0)
1719 			return len;
1720 
1721 		goto lenout;
1722 
1723 	case SO_LOCK_FILTER:
1724 		v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1725 		break;
1726 
1727 	case SO_BPF_EXTENSIONS:
1728 		v.val = bpf_tell_extensions();
1729 		break;
1730 
1731 	case SO_SELECT_ERR_QUEUE:
1732 		v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1733 		break;
1734 
1735 #ifdef CONFIG_NET_RX_BUSY_POLL
1736 	case SO_BUSY_POLL:
1737 		v.val = sk->sk_ll_usec;
1738 		break;
1739 	case SO_PREFER_BUSY_POLL:
1740 		v.val = READ_ONCE(sk->sk_prefer_busy_poll);
1741 		break;
1742 #endif
1743 
1744 	case SO_MAX_PACING_RATE:
1745 		if (sizeof(v.ulval) != sizeof(v.val) && len >= sizeof(v.ulval)) {
1746 			lv = sizeof(v.ulval);
1747 			v.ulval = sk->sk_max_pacing_rate;
1748 		} else {
1749 			/* 32bit version */
1750 			v.val = min_t(unsigned long, sk->sk_max_pacing_rate, ~0U);
1751 		}
1752 		break;
1753 
1754 	case SO_INCOMING_CPU:
1755 		v.val = READ_ONCE(sk->sk_incoming_cpu);
1756 		break;
1757 
1758 	case SO_MEMINFO:
1759 	{
1760 		u32 meminfo[SK_MEMINFO_VARS];
1761 
1762 		sk_get_meminfo(sk, meminfo);
1763 
1764 		len = min_t(unsigned int, len, sizeof(meminfo));
1765 		if (copy_to_user(optval, &meminfo, len))
1766 			return -EFAULT;
1767 
1768 		goto lenout;
1769 	}
1770 
1771 #ifdef CONFIG_NET_RX_BUSY_POLL
1772 	case SO_INCOMING_NAPI_ID:
1773 		v.val = READ_ONCE(sk->sk_napi_id);
1774 
1775 		/* aggregate non-NAPI IDs down to 0 */
1776 		if (v.val < MIN_NAPI_ID)
1777 			v.val = 0;
1778 
1779 		break;
1780 #endif
1781 
1782 	case SO_COOKIE:
1783 		lv = sizeof(u64);
1784 		if (len < lv)
1785 			return -EINVAL;
1786 		v.val64 = sock_gen_cookie(sk);
1787 		break;
1788 
1789 	case SO_ZEROCOPY:
1790 		v.val = sock_flag(sk, SOCK_ZEROCOPY);
1791 		break;
1792 
1793 	case SO_TXTIME:
1794 		lv = sizeof(v.txtime);
1795 		v.txtime.clockid = sk->sk_clockid;
1796 		v.txtime.flags |= sk->sk_txtime_deadline_mode ?
1797 				  SOF_TXTIME_DEADLINE_MODE : 0;
1798 		v.txtime.flags |= sk->sk_txtime_report_errors ?
1799 				  SOF_TXTIME_REPORT_ERRORS : 0;
1800 		break;
1801 
1802 	case SO_BINDTOIFINDEX:
1803 		v.val = sk->sk_bound_dev_if;
1804 		break;
1805 
1806 	case SO_NETNS_COOKIE:
1807 		lv = sizeof(u64);
1808 		if (len != lv)
1809 			return -EINVAL;
1810 		v.val64 = sock_net(sk)->net_cookie;
1811 		break;
1812 
1813 	case SO_BUF_LOCK:
1814 		v.val = sk->sk_userlocks & SOCK_BUF_LOCK_MASK;
1815 		break;
1816 
1817 	case SO_RESERVE_MEM:
1818 		v.val = sk->sk_reserved_mem;
1819 		break;
1820 
1821 	default:
1822 		/* We implement SO_SNDLOWAT etc. to not be settable
1823 		 * (1003.1g 7).
1824 		 */
1825 		return -ENOPROTOOPT;
1826 	}
1827 
1828 	if (len > lv)
1829 		len = lv;
1830 	if (copy_to_user(optval, &v, len))
1831 		return -EFAULT;
1832 lenout:
1833 	if (put_user(len, optlen))
1834 		return -EFAULT;
1835 	return 0;
1836 }
1837 
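/*
 * Illustrative userspace sketch, not part of the original file: reading the
 * per-socket memory snapshot exposed by the SO_MEMINFO case above.  Assumes
 * uapi headers that provide SO_MEMINFO and the SK_MEMINFO_* indices.
 */
#if 0	/* example only */
#include <sys/socket.h>
#include <linux/sock_diag.h>

static int example_dump_meminfo(int fd)
{
	__u32 meminfo[SK_MEMINFO_VARS];
	socklen_t len = sizeof(meminfo);

	if (getsockopt(fd, SOL_SOCKET, SO_MEMINFO, meminfo, &len))
		return -1;

	/* meminfo[SK_MEMINFO_RMEM_ALLOC] etc. now hold the snapshot. */
	return 0;
}
#endif
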
1838 /*
1839  * Initialize an sk_lock.
1840  *
1841  * (We also register the sk_lock with the lock validator.)
1842  */
1843 static inline void sock_lock_init(struct sock *sk)
1844 {
1845 	if (sk->sk_kern_sock)
1846 		sock_lock_init_class_and_name(
1847 			sk,
1848 			af_family_kern_slock_key_strings[sk->sk_family],
1849 			af_family_kern_slock_keys + sk->sk_family,
1850 			af_family_kern_key_strings[sk->sk_family],
1851 			af_family_kern_keys + sk->sk_family);
1852 	else
1853 		sock_lock_init_class_and_name(
1854 			sk,
1855 			af_family_slock_key_strings[sk->sk_family],
1856 			af_family_slock_keys + sk->sk_family,
1857 			af_family_key_strings[sk->sk_family],
1858 			af_family_keys + sk->sk_family);
1859 }
1860 
1861 /*
1862  * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
1863  * even temporarily, because of RCU lookups. sk_node should also be left as-is.
1864  * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end.
1865  */
1866 static void sock_copy(struct sock *nsk, const struct sock *osk)
1867 {
1868 	const struct proto *prot = READ_ONCE(osk->sk_prot);
1869 #ifdef CONFIG_SECURITY_NETWORK
1870 	void *sptr = nsk->sk_security;
1871 #endif
1872 
1873 	/* If we move sk_tx_queue_mapping out of the private section,
1874 	 * we must check if sk_tx_queue_clear() is called after
1875 	 * sock_copy() in sk_clone_lock().
1876 	 */
1877 	BUILD_BUG_ON(offsetof(struct sock, sk_tx_queue_mapping) <
1878 		     offsetof(struct sock, sk_dontcopy_begin) ||
1879 		     offsetof(struct sock, sk_tx_queue_mapping) >=
1880 		     offsetof(struct sock, sk_dontcopy_end));
1881 
1882 	memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1883 
1884 	memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1885 	       prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1886 
1887 #ifdef CONFIG_SECURITY_NETWORK
1888 	nsk->sk_security = sptr;
1889 	security_sk_clone(osk, nsk);
1890 #endif
1891 }
1892 
1893 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1894 		int family)
1895 {
1896 	struct sock *sk;
1897 	struct kmem_cache *slab;
1898 
1899 	slab = prot->slab;
1900 	if (slab != NULL) {
1901 		sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1902 		if (!sk)
1903 			return sk;
1904 		if (want_init_on_alloc(priority))
1905 			sk_prot_clear_nulls(sk, prot->obj_size);
1906 	} else
1907 		sk = kmalloc(prot->obj_size, priority);
1908 
1909 	if (sk != NULL) {
1910 		if (security_sk_alloc(sk, family, priority))
1911 			goto out_free;
1912 
1913 		if (!try_module_get(prot->owner))
1914 			goto out_free_sec;
1915 	}
1916 
1917 	return sk;
1918 
1919 out_free_sec:
1920 	security_sk_free(sk);
1921 out_free:
1922 	if (slab != NULL)
1923 		kmem_cache_free(slab, sk);
1924 	else
1925 		kfree(sk);
1926 	return NULL;
1927 }
1928 
1929 static void sk_prot_free(struct proto *prot, struct sock *sk)
1930 {
1931 	struct kmem_cache *slab;
1932 	struct module *owner;
1933 
1934 	owner = prot->owner;
1935 	slab = prot->slab;
1936 
1937 	cgroup_sk_free(&sk->sk_cgrp_data);
1938 	mem_cgroup_sk_free(sk);
1939 	security_sk_free(sk);
1940 	if (slab != NULL)
1941 		kmem_cache_free(slab, sk);
1942 	else
1943 		kfree(sk);
1944 	module_put(owner);
1945 }
1946 
1947 /**
1948  *	sk_alloc - All socket objects are allocated here
1949  *	@net: the applicable net namespace
1950  *	@family: protocol family
1951  *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1952  *	@prot: struct proto associated with this new sock instance
1953  *	@kern: is this to be a kernel socket?
1954  */
1955 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1956 		      struct proto *prot, int kern)
1957 {
1958 	struct sock *sk;
1959 
1960 	sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1961 	if (sk) {
1962 		sk->sk_family = family;
1963 		/*
1964 		 * See comment in struct sock definition to understand
1965 		 * why we need sk_prot_creator -acme
1966 		 */
1967 		sk->sk_prot = sk->sk_prot_creator = prot;
1968 		sk->sk_kern_sock = kern;
1969 		sock_lock_init(sk);
1970 		sk->sk_net_refcnt = kern ? 0 : 1;
1971 		if (likely(sk->sk_net_refcnt)) {
1972 			get_net(net);
1973 			sock_inuse_add(net, 1);
1974 		}
1975 
1976 		sock_net_set(sk, net);
1977 		refcount_set(&sk->sk_wmem_alloc, 1);
1978 
1979 		mem_cgroup_sk_alloc(sk);
1980 		cgroup_sk_alloc(&sk->sk_cgrp_data);
1981 		sock_update_classid(&sk->sk_cgrp_data);
1982 		sock_update_netprioidx(&sk->sk_cgrp_data);
1983 		sk_tx_queue_clear(sk);
1984 	}
1985 
1986 	return sk;
1987 }
1988 EXPORT_SYMBOL(sk_alloc);
1989 
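/*
 * Illustrative sketch, not part of the original file: the create() handler of
 * a hypothetical protocol family allocates its sock with sk_alloc() and then
 * finishes the generic initialisation with sock_init_data().  PF_EXAMPLE and
 * example_proto are placeholders.
 */
#if 0	/* example only */
static int example_create(struct net *net, struct socket *sock,
			  int protocol, int kern)
{
	struct sock *sk;

	sk = sk_alloc(net, PF_EXAMPLE, GFP_KERNEL, &example_proto, kern);
	if (!sk)
		return -ENOMEM;

	sock_init_data(sock, sk);
	sk->sk_protocol = protocol;
	return 0;
}
#endif
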
1990 /* Sockets having SOCK_RCU_FREE will call this function after one RCU
1991  * grace period. This is the case for UDP sockets and TCP listeners.
1992  */
1993 static void __sk_destruct(struct rcu_head *head)
1994 {
1995 	struct sock *sk = container_of(head, struct sock, sk_rcu);
1996 	struct sk_filter *filter;
1997 
1998 	if (sk->sk_destruct)
1999 		sk->sk_destruct(sk);
2000 
2001 	filter = rcu_dereference_check(sk->sk_filter,
2002 				       refcount_read(&sk->sk_wmem_alloc) == 0);
2003 	if (filter) {
2004 		sk_filter_uncharge(sk, filter);
2005 		RCU_INIT_POINTER(sk->sk_filter, NULL);
2006 	}
2007 
2008 	sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
2009 
2010 #ifdef CONFIG_BPF_SYSCALL
2011 	bpf_sk_storage_free(sk);
2012 #endif
2013 
2014 	if (atomic_read(&sk->sk_omem_alloc))
2015 		pr_debug("%s: optmem leakage (%d bytes) detected\n",
2016 			 __func__, atomic_read(&sk->sk_omem_alloc));
2017 
2018 	if (sk->sk_frag.page) {
2019 		put_page(sk->sk_frag.page);
2020 		sk->sk_frag.page = NULL;
2021 	}
2022 
2023 	/* We do not need to acquire sk->sk_peer_lock, we are the last user. */
2024 	put_cred(sk->sk_peer_cred);
2025 	put_pid(sk->sk_peer_pid);
2026 
2027 	if (likely(sk->sk_net_refcnt))
2028 		put_net(sock_net(sk));
2029 	sk_prot_free(sk->sk_prot_creator, sk);
2030 }
2031 
2032 void sk_destruct(struct sock *sk)
2033 {
2034 	bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE);
2035 
2036 	if (rcu_access_pointer(sk->sk_reuseport_cb)) {
2037 		reuseport_detach_sock(sk);
2038 		use_call_rcu = true;
2039 	}
2040 
2041 	if (use_call_rcu)
2042 		call_rcu(&sk->sk_rcu, __sk_destruct);
2043 	else
2044 		__sk_destruct(&sk->sk_rcu);
2045 }
2046 
2047 static void __sk_free(struct sock *sk)
2048 {
2049 	if (likely(sk->sk_net_refcnt))
2050 		sock_inuse_add(sock_net(sk), -1);
2051 
2052 	if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk)))
2053 		sock_diag_broadcast_destroy(sk);
2054 	else
2055 		sk_destruct(sk);
2056 }
2057 
2058 void sk_free(struct sock *sk)
2059 {
2060 	/*
2061 	 * We subtract one from sk_wmem_alloc and can tell whether
2062 	 * some packets are still in some tx queue.
2063 	 * If the result is not zero, sock_wfree() will call __sk_free(sk) later.
2064 	 */
2065 	if (refcount_dec_and_test(&sk->sk_wmem_alloc))
2066 		__sk_free(sk);
2067 }
2068 EXPORT_SYMBOL(sk_free);
2069 
2070 static void sk_init_common(struct sock *sk)
2071 {
2072 	skb_queue_head_init(&sk->sk_receive_queue);
2073 	skb_queue_head_init(&sk->sk_write_queue);
2074 	skb_queue_head_init(&sk->sk_error_queue);
2075 
2076 	rwlock_init(&sk->sk_callback_lock);
2077 	lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
2078 			af_rlock_keys + sk->sk_family,
2079 			af_family_rlock_key_strings[sk->sk_family]);
2080 	lockdep_set_class_and_name(&sk->sk_write_queue.lock,
2081 			af_wlock_keys + sk->sk_family,
2082 			af_family_wlock_key_strings[sk->sk_family]);
2083 	lockdep_set_class_and_name(&sk->sk_error_queue.lock,
2084 			af_elock_keys + sk->sk_family,
2085 			af_family_elock_key_strings[sk->sk_family]);
2086 	lockdep_set_class_and_name(&sk->sk_callback_lock,
2087 			af_callback_keys + sk->sk_family,
2088 			af_family_clock_key_strings[sk->sk_family]);
2089 }
2090 
2091 /**
2092  *	sk_clone_lock - clone a socket, and lock its clone
2093  *	@sk: the socket to clone
2094  *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
2095  *
2096  *	Caller must unlock socket even in error path (bh_unlock_sock(newsk))
2097  */
2098 struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
2099 {
2100 	struct proto *prot = READ_ONCE(sk->sk_prot);
2101 	struct sk_filter *filter;
2102 	bool is_charged = true;
2103 	struct sock *newsk;
2104 
2105 	newsk = sk_prot_alloc(prot, priority, sk->sk_family);
2106 	if (!newsk)
2107 		goto out;
2108 
2109 	sock_copy(newsk, sk);
2110 
2111 	newsk->sk_prot_creator = prot;
2112 
2113 	/* SANITY */
2114 	if (likely(newsk->sk_net_refcnt))
2115 		get_net(sock_net(newsk));
2116 	sk_node_init(&newsk->sk_node);
2117 	sock_lock_init(newsk);
2118 	bh_lock_sock(newsk);
2119 	newsk->sk_backlog.head	= newsk->sk_backlog.tail = NULL;
2120 	newsk->sk_backlog.len = 0;
2121 
2122 	atomic_set(&newsk->sk_rmem_alloc, 0);
2123 
2124 	/* sk_wmem_alloc set to one (see sk_free() and sock_wfree()) */
2125 	refcount_set(&newsk->sk_wmem_alloc, 1);
2126 
2127 	atomic_set(&newsk->sk_omem_alloc, 0);
2128 	sk_init_common(newsk);
2129 
2130 	newsk->sk_dst_cache	= NULL;
2131 	newsk->sk_dst_pending_confirm = 0;
2132 	newsk->sk_wmem_queued	= 0;
2133 	newsk->sk_forward_alloc = 0;
2134 	newsk->sk_reserved_mem  = 0;
2135 	atomic_set(&newsk->sk_drops, 0);
2136 	newsk->sk_send_head	= NULL;
2137 	newsk->sk_userlocks	= sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
2138 	atomic_set(&newsk->sk_zckey, 0);
2139 
2140 	sock_reset_flag(newsk, SOCK_DONE);
2141 
2142 	/* sk->sk_memcg will be populated at accept() time */
2143 	newsk->sk_memcg = NULL;
2144 
2145 	cgroup_sk_clone(&newsk->sk_cgrp_data);
2146 
2147 	rcu_read_lock();
2148 	filter = rcu_dereference(sk->sk_filter);
2149 	if (filter != NULL)
2150 		/* though it's an empty new sock, the charging may fail
2151 		 * if sysctl_optmem_max was changed between the creation of
2152 		 * the original socket and the cloning
2153 		 */
2154 		is_charged = sk_filter_charge(newsk, filter);
2155 	RCU_INIT_POINTER(newsk->sk_filter, filter);
2156 	rcu_read_unlock();
2157 
2158 	if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
2159 		/* We need to make sure that we don't uncharge the new
2160 		 * socket if we couldn't charge it in the first place
2161 		 * as otherwise we uncharge the parent's filter.
2162 		 */
2163 		if (!is_charged)
2164 			RCU_INIT_POINTER(newsk->sk_filter, NULL);
2165 		sk_free_unlock_clone(newsk);
2166 		newsk = NULL;
2167 		goto out;
2168 	}
2169 	RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
2170 
2171 	if (bpf_sk_storage_clone(sk, newsk)) {
2172 		sk_free_unlock_clone(newsk);
2173 		newsk = NULL;
2174 		goto out;
2175 	}
2176 
2177 	/* Clear sk_user_data if parent had the pointer tagged
2178 	 * as not suitable for copying when cloning.
2179 	 */
2180 	if (sk_user_data_is_nocopy(newsk))
2181 		newsk->sk_user_data = NULL;
2182 
2183 	newsk->sk_err	   = 0;
2184 	newsk->sk_err_soft = 0;
2185 	newsk->sk_priority = 0;
2186 	newsk->sk_incoming_cpu = raw_smp_processor_id();
2187 	if (likely(newsk->sk_net_refcnt))
2188 		sock_inuse_add(sock_net(newsk), 1);
2189 
2190 	/* Before updating sk_refcnt, we must commit prior changes to memory
2191 	 * (Documentation/RCU/rculist_nulls.rst for details)
2192 	 */
2193 	smp_wmb();
2194 	refcount_set(&newsk->sk_refcnt, 2);
2195 
2196 	/* Increment the counter in the same struct proto as the master
2197 	 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
2198 	 * is the same as sk->sk_prot->socks, as this field was copied
2199 	 * with memcpy).
2200 	 *
2201 	 * This _changes_ the previous behaviour, where
2202 	 * tcp_create_openreq_child was always incrementing the
2203 	 * equivalent of tcp_prot->socks (inet_sock_nr), so this has
2204 	 * to be taken into account in all callers. -acme
2205 	 */
2206 	sk_refcnt_debug_inc(newsk);
2207 	sk_set_socket(newsk, NULL);
2208 	sk_tx_queue_clear(newsk);
2209 	RCU_INIT_POINTER(newsk->sk_wq, NULL);
2210 
2211 	if (newsk->sk_prot->sockets_allocated)
2212 		sk_sockets_allocated_inc(newsk);
2213 
2214 	if (sock_needs_netstamp(sk) && newsk->sk_flags & SK_FLAGS_TIMESTAMP)
2215 		net_enable_timestamp();
2216 out:
2217 	return newsk;
2218 }
2219 EXPORT_SYMBOL_GPL(sk_clone_lock);
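
/* Illustrative sketch only of the locking contract documented above: a
 * hypothetical caller must drop the bh lock itself, even on its own error
 * paths, once the clone succeeded:
 *
 *	newsk = sk_clone_lock(sk, GFP_ATOMIC);
 *	if (newsk) {
 *		... protocol specific initialisation of newsk ...
 *		bh_unlock_sock(newsk);
 *	}
 */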
2220 
2221 void sk_free_unlock_clone(struct sock *sk)
2222 {
2223 	/* It is still a raw copy of the parent, so invalidate
2224 	 * the destructor and do a plain sk_free() */
2225 	sk->sk_destruct = NULL;
2226 	bh_unlock_sock(sk);
2227 	sk_free(sk);
2228 }
2229 EXPORT_SYMBOL_GPL(sk_free_unlock_clone);
2230 
2231 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
2232 {
2233 	u32 max_segs = 1;
2234 
2235 	sk_dst_set(sk, dst);
2236 	sk->sk_route_caps = dst->dev->features | sk->sk_route_forced_caps;
2237 	if (sk->sk_route_caps & NETIF_F_GSO)
2238 		sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
2239 	sk->sk_route_caps &= ~sk->sk_route_nocaps;
2240 	if (sk_can_gso(sk)) {
2241 		if (dst->header_len && !xfrm_dst_offload_ok(dst)) {
2242 			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
2243 		} else {
2244 			sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
2245 			sk->sk_gso_max_size = dst->dev->gso_max_size;
2246 			max_segs = max_t(u32, dst->dev->gso_max_segs, 1);
2247 		}
2248 	}
2249 	sk->sk_gso_max_segs = max_segs;
2250 }
2251 EXPORT_SYMBOL_GPL(sk_setup_caps);
2252 
2253 /*
2254  *	Simple resource managers for sockets.
2255  */
2256 
2258 /*
2259  * Write buffer destructor automatically called from kfree_skb.
2260  */
2261 void sock_wfree(struct sk_buff *skb)
2262 {
2263 	struct sock *sk = skb->sk;
2264 	unsigned int len = skb->truesize;
2265 
2266 	if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
2267 		/*
2268 		 * Keep a reference on sk_wmem_alloc; it will be released
2269 		 * after the sk_write_space() call.
2270 		 */
2271 		WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
2272 		sk->sk_write_space(sk);
2273 		len = 1;
2274 	}
2275 	/*
2276 	 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
2277 	 * could not do because of in-flight packets
2278 	 */
2279 	if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
2280 		__sk_free(sk);
2281 }
2282 EXPORT_SYMBOL(sock_wfree);
2283 
2284 /* This variant of sock_wfree() is used by TCP,
2285  * since it sets SOCK_USE_WRITE_QUEUE.
2286  */
2287 void __sock_wfree(struct sk_buff *skb)
2288 {
2289 	struct sock *sk = skb->sk;
2290 
2291 	if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
2292 		__sk_free(sk);
2293 }
2294 
2295 void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
2296 {
2297 	skb_orphan(skb);
2298 	skb->sk = sk;
2299 #ifdef CONFIG_INET
2300 	if (unlikely(!sk_fullsock(sk))) {
2301 		skb->destructor = sock_edemux;
2302 		sock_hold(sk);
2303 		return;
2304 	}
2305 #endif
2306 	skb->destructor = sock_wfree;
2307 	skb_set_hash_from_sk(skb, sk);
2308 	/*
2309 	 * We used to take a refcount on sk, but the following operation
2310 	 * is enough to guarantee sk_free() won't free this sock until
2311 	 * all in-flight packets are completed.
2312 	 */
2313 	refcount_add(skb->truesize, &sk->sk_wmem_alloc);
2314 }
2315 EXPORT_SYMBOL(skb_set_owner_w);
2316 
2317 static bool can_skb_orphan_partial(const struct sk_buff *skb)
2318 {
2319 #ifdef CONFIG_TLS_DEVICE
2320 	/* Drivers depend on in-order delivery for crypto offload,
2321 	 * so a partial orphan breaks the out-of-order-OK logic.
2322 	 */
2323 	if (skb->decrypted)
2324 		return false;
2325 #endif
2326 	return (skb->destructor == sock_wfree ||
2327 		(IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree));
2328 }
2329 
2330 /* This helper is used by netem, as it can hold packets in its
2331  * delay queue. We want to allow the owner socket to send more
2332  * packets, as if they were already TX completed by a typical driver.
2333  * But we also want to keep skb->sk set because some packet schedulers
2334  * rely on it (sch_fq for example).
2335  */
2336 void skb_orphan_partial(struct sk_buff *skb)
2337 {
2338 	if (skb_is_tcp_pure_ack(skb))
2339 		return;
2340 
2341 	if (can_skb_orphan_partial(skb) && skb_set_owner_sk_safe(skb, skb->sk))
2342 		return;
2343 
2344 	skb_orphan(skb);
2345 }
2346 EXPORT_SYMBOL(skb_orphan_partial);
2347 
2348 /*
2349  * Read buffer destructor automatically called from kfree_skb.
2350  */
2351 void sock_rfree(struct sk_buff *skb)
2352 {
2353 	struct sock *sk = skb->sk;
2354 	unsigned int len = skb->truesize;
2355 
2356 	atomic_sub(len, &sk->sk_rmem_alloc);
2357 	sk_mem_uncharge(sk, len);
2358 }
2359 EXPORT_SYMBOL(sock_rfree);
2360 
2361 /*
2362  * Buffer destructor for skbs that are not used directly in read or write
2363  * path, e.g. for error handler skbs. Automatically called from kfree_skb.
2364  */
2365 void sock_efree(struct sk_buff *skb)
2366 {
2367 	sock_put(skb->sk);
2368 }
2369 EXPORT_SYMBOL(sock_efree);
2370 
2371 /* Buffer destructor for prefetch/receive path where reference count may
2372  * not be held, e.g. for listen sockets.
2373  */
2374 #ifdef CONFIG_INET
2375 void sock_pfree(struct sk_buff *skb)
2376 {
2377 	if (sk_is_refcounted(skb->sk))
2378 		sock_gen_put(skb->sk);
2379 }
2380 EXPORT_SYMBOL(sock_pfree);
2381 #endif /* CONFIG_INET */
2382 
2383 kuid_t sock_i_uid(struct sock *sk)
2384 {
2385 	kuid_t uid;
2386 
2387 	read_lock_bh(&sk->sk_callback_lock);
2388 	uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
2389 	read_unlock_bh(&sk->sk_callback_lock);
2390 	return uid;
2391 }
2392 EXPORT_SYMBOL(sock_i_uid);
2393 
2394 unsigned long sock_i_ino(struct sock *sk)
2395 {
2396 	unsigned long ino;
2397 
2398 	read_lock_bh(&sk->sk_callback_lock);
2399 	ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
2400 	read_unlock_bh(&sk->sk_callback_lock);
2401 	return ino;
2402 }
2403 EXPORT_SYMBOL(sock_i_ino);
2404 
2405 /*
2406  * Allocate a skb from the socket's send buffer.
2407  */
2408 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
2409 			     gfp_t priority)
2410 {
2411 	if (force ||
2412 	    refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) {
2413 		struct sk_buff *skb = alloc_skb(size, priority);
2414 
2415 		if (skb) {
2416 			skb_set_owner_w(skb, sk);
2417 			return skb;
2418 		}
2419 	}
2420 	return NULL;
2421 }
2422 EXPORT_SYMBOL(sock_wmalloc);
2423 
2424 static void sock_ofree(struct sk_buff *skb)
2425 {
2426 	struct sock *sk = skb->sk;
2427 
2428 	atomic_sub(skb->truesize, &sk->sk_omem_alloc);
2429 }
2430 
2431 struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
2432 			     gfp_t priority)
2433 {
2434 	struct sk_buff *skb;
2435 
2436 	/* small safe race: SKB_TRUESIZE may differ from final skb->truesize */
2437 	if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) >
2438 	    sysctl_optmem_max)
2439 		return NULL;
2440 
2441 	skb = alloc_skb(size, priority);
2442 	if (!skb)
2443 		return NULL;
2444 
2445 	atomic_add(skb->truesize, &sk->sk_omem_alloc);
2446 	skb->sk = sk;
2447 	skb->destructor = sock_ofree;
2448 	return skb;
2449 }
2450 
2451 /*
2452  * Allocate a memory block from the socket's option memory buffer.
2453  */
2454 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
2455 {
2456 	if ((unsigned int)size <= sysctl_optmem_max &&
2457 	    atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
2458 		void *mem;
2459 		/* First do the add, to avoid the race if kmalloc
2460 		 * might sleep.
2461 		 */
2462 		atomic_add(size, &sk->sk_omem_alloc);
2463 		mem = kmalloc(size, priority);
2464 		if (mem)
2465 			return mem;
2466 		atomic_sub(size, &sk->sk_omem_alloc);
2467 	}
2468 	return NULL;
2469 }
2470 EXPORT_SYMBOL(sock_kmalloc);
2471 
2472 /* Free an option memory block. Note, we actually want the inline
2473  * here as this allows gcc to detect the nullify and fold away the
2474  * condition entirely.
2475  */
2476 static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
2477 				  const bool nullify)
2478 {
2479 	if (WARN_ON_ONCE(!mem))
2480 		return;
2481 	if (nullify)
2482 		kfree_sensitive(mem);
2483 	else
2484 		kfree(mem);
2485 	atomic_sub(size, &sk->sk_omem_alloc);
2486 }
2487 
2488 void sock_kfree_s(struct sock *sk, void *mem, int size)
2489 {
2490 	__sock_kfree_s(sk, mem, size, false);
2491 }
2492 EXPORT_SYMBOL(sock_kfree_s);
2493 
2494 void sock_kzfree_s(struct sock *sk, void *mem, int size)
2495 {
2496 	__sock_kfree_s(sk, mem, size, true);
2497 }
2498 EXPORT_SYMBOL(sock_kzfree_s);
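
/* Illustrative usage only ("key"/"keylen" are hypothetical): memory obtained
 * with sock_kmalloc() is charged to sk_omem_alloc and must be released with
 * sock_kfree_s() or sock_kzfree_s() using the same size:
 *
 *	key = sock_kmalloc(sk, keylen, GFP_KERNEL);
 *	if (!key)
 *		return -ENOBUFS;
 *	...
 *	sock_kzfree_s(sk, key, keylen);
 */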
2499 
2500 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
2501  * I think these locks should be removed for datagram sockets.
2502  */
2503 static long sock_wait_for_wmem(struct sock *sk, long timeo)
2504 {
2505 	DEFINE_WAIT(wait);
2506 
2507 	sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2508 	for (;;) {
2509 		if (!timeo)
2510 			break;
2511 		if (signal_pending(current))
2512 			break;
2513 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2514 		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
2515 		if (refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf))
2516 			break;
2517 		if (sk->sk_shutdown & SEND_SHUTDOWN)
2518 			break;
2519 		if (sk->sk_err)
2520 			break;
2521 		timeo = schedule_timeout(timeo);
2522 	}
2523 	finish_wait(sk_sleep(sk), &wait);
2524 	return timeo;
2525 }
2526 
2528 /*
2529  *	Generic send/receive buffer handlers
2530  */
2531 
2532 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
2533 				     unsigned long data_len, int noblock,
2534 				     int *errcode, int max_page_order)
2535 {
2536 	struct sk_buff *skb;
2537 	long timeo;
2538 	int err;
2539 
2540 	timeo = sock_sndtimeo(sk, noblock);
2541 	for (;;) {
2542 		err = sock_error(sk);
2543 		if (err != 0)
2544 			goto failure;
2545 
2546 		err = -EPIPE;
2547 		if (sk->sk_shutdown & SEND_SHUTDOWN)
2548 			goto failure;
2549 
2550 		if (sk_wmem_alloc_get(sk) < READ_ONCE(sk->sk_sndbuf))
2551 			break;
2552 
2553 		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2554 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2555 		err = -EAGAIN;
2556 		if (!timeo)
2557 			goto failure;
2558 		if (signal_pending(current))
2559 			goto interrupted;
2560 		timeo = sock_wait_for_wmem(sk, timeo);
2561 	}
2562 	skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
2563 				   errcode, sk->sk_allocation);
2564 	if (skb)
2565 		skb_set_owner_w(skb, sk);
2566 	return skb;
2567 
2568 interrupted:
2569 	err = sock_intr_errno(timeo);
2570 failure:
2571 	*errcode = err;
2572 	return NULL;
2573 }
2574 EXPORT_SYMBOL(sock_alloc_send_pskb);
2575 
2576 struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
2577 				    int noblock, int *errcode)
2578 {
2579 	return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0);
2580 }
2581 EXPORT_SYMBOL(sock_alloc_send_skb);
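
/* Illustrative sketch only ("hlen", "len", "msg" and "err" are hypothetical
 * locals): a datagram ->sendmsg() implementation usually allocates its skb
 * through sock_alloc_send_skb() so that sndbuf limits and the blocking
 * semantics above apply:
 *
 *	skb = sock_alloc_send_skb(sk, hlen + len,
 *				  msg->msg_flags & MSG_DONTWAIT, &err);
 *	if (!skb)
 *		return err;
 *	skb_reserve(skb, hlen);
 *	err = memcpy_from_msg(skb_put(skb, len), msg, len);
 */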
2582 
2583 int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg,
2584 		     struct sockcm_cookie *sockc)
2585 {
2586 	u32 tsflags;
2587 
2588 	switch (cmsg->cmsg_type) {
2589 	case SO_MARK:
2590 		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2591 			return -EPERM;
2592 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2593 			return -EINVAL;
2594 		sockc->mark = *(u32 *)CMSG_DATA(cmsg);
2595 		break;
2596 	case SO_TIMESTAMPING_OLD:
2597 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2598 			return -EINVAL;
2599 
2600 		tsflags = *(u32 *)CMSG_DATA(cmsg);
2601 		if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
2602 			return -EINVAL;
2603 
2604 		sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
2605 		sockc->tsflags |= tsflags;
2606 		break;
2607 	case SCM_TXTIME:
2608 		if (!sock_flag(sk, SOCK_TXTIME))
2609 			return -EINVAL;
2610 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64)))
2611 			return -EINVAL;
2612 		sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg));
2613 		break;
2614 	/* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
2615 	case SCM_RIGHTS:
2616 	case SCM_CREDENTIALS:
2617 		break;
2618 	default:
2619 		return -EINVAL;
2620 	}
2621 	return 0;
2622 }
2623 EXPORT_SYMBOL(__sock_cmsg_send);
2624 
2625 int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
2626 		   struct sockcm_cookie *sockc)
2627 {
2628 	struct cmsghdr *cmsg;
2629 	int ret;
2630 
2631 	for_each_cmsghdr(cmsg, msg) {
2632 		if (!CMSG_OK(msg, cmsg))
2633 			return -EINVAL;
2634 		if (cmsg->cmsg_level != SOL_SOCKET)
2635 			continue;
2636 		ret = __sock_cmsg_send(sk, msg, cmsg, sockc);
2637 		if (ret)
2638 			return ret;
2639 	}
2640 	return 0;
2641 }
2642 EXPORT_SYMBOL(sock_cmsg_send);
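
/* Illustrative only: a ->sendmsg() handler typically consumes SOL_SOCKET
 * control messages like this, with sockcm_init() seeding the cookie from
 * the socket's defaults ("err" is a hypothetical local):
 *
 *	struct sockcm_cookie sockc;
 *
 *	sockcm_init(&sockc, sk);
 *	if (msg->msg_controllen) {
 *		err = sock_cmsg_send(sk, msg, &sockc);
 *		if (unlikely(err))
 *			return err;
 *	}
 */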
2643 
2644 static void sk_enter_memory_pressure(struct sock *sk)
2645 {
2646 	if (!sk->sk_prot->enter_memory_pressure)
2647 		return;
2648 
2649 	sk->sk_prot->enter_memory_pressure(sk);
2650 }
2651 
2652 static void sk_leave_memory_pressure(struct sock *sk)
2653 {
2654 	if (sk->sk_prot->leave_memory_pressure) {
2655 		sk->sk_prot->leave_memory_pressure(sk);
2656 	} else {
2657 		unsigned long *memory_pressure = sk->sk_prot->memory_pressure;
2658 
2659 		if (memory_pressure && READ_ONCE(*memory_pressure))
2660 			WRITE_ONCE(*memory_pressure, 0);
2661 	}
2662 }
2663 
2664 DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key);
2665 
2666 /**
2667  * skb_page_frag_refill - check that a page_frag contains enough room
2668  * @sz: minimum size of the fragment we want to get
2669  * @pfrag: pointer to page_frag
2670  * @gfp: priority for memory allocation
2671  *
2672  * Note: While this allocator tries to use high order pages, there is
2673  * no guarantee that allocations succeed. Therefore, @sz MUST be
2674  * less than or equal to PAGE_SIZE.
2675  */
2676 bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
2677 {
2678 	if (pfrag->page) {
2679 		if (page_ref_count(pfrag->page) == 1) {
2680 			pfrag->offset = 0;
2681 			return true;
2682 		}
2683 		if (pfrag->offset + sz <= pfrag->size)
2684 			return true;
2685 		put_page(pfrag->page);
2686 	}
2687 
2688 	pfrag->offset = 0;
2689 	if (SKB_FRAG_PAGE_ORDER &&
2690 	    !static_branch_unlikely(&net_high_order_alloc_disable_key)) {
2691 		/* Avoid direct reclaim but allow kswapd to wake */
2692 		pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
2693 					  __GFP_COMP | __GFP_NOWARN |
2694 					  __GFP_NORETRY,
2695 					  SKB_FRAG_PAGE_ORDER);
2696 		if (likely(pfrag->page)) {
2697 			pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
2698 			return true;
2699 		}
2700 	}
2701 	pfrag->page = alloc_page(gfp);
2702 	if (likely(pfrag->page)) {
2703 		pfrag->size = PAGE_SIZE;
2704 		return true;
2705 	}
2706 	return false;
2707 }
2708 EXPORT_SYMBOL(skb_page_frag_refill);
2709 
2710 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
2711 {
2712 	if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
2713 		return true;
2714 
2715 	sk_enter_memory_pressure(sk);
2716 	sk_stream_moderate_sndbuf(sk);
2717 	return false;
2718 }
2719 EXPORT_SYMBOL(sk_page_frag_refill);
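
/* Illustrative only: the common ->sendmsg() pattern around these helpers
 * ("copy" and the wait label are hypothetical):
 *
 *	struct page_frag *pfrag = sk_page_frag(sk);
 *
 *	if (!sk_page_frag_refill(sk, pfrag))
 *		goto wait_for_memory;
 *	copy = min_t(int, copy, pfrag->size - pfrag->offset);
 *	... copy user data into pfrag->page at pfrag->offset ...
 *	pfrag->offset += copy;
 */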
2720 
2721 void __lock_sock(struct sock *sk)
2722 	__releases(&sk->sk_lock.slock)
2723 	__acquires(&sk->sk_lock.slock)
2724 {
2725 	DEFINE_WAIT(wait);
2726 
2727 	for (;;) {
2728 		prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
2729 					TASK_UNINTERRUPTIBLE);
2730 		spin_unlock_bh(&sk->sk_lock.slock);
2731 		schedule();
2732 		spin_lock_bh(&sk->sk_lock.slock);
2733 		if (!sock_owned_by_user(sk))
2734 			break;
2735 	}
2736 	finish_wait(&sk->sk_lock.wq, &wait);
2737 }
2738 
2739 void __release_sock(struct sock *sk)
2740 	__releases(&sk->sk_lock.slock)
2741 	__acquires(&sk->sk_lock.slock)
2742 {
2743 	struct sk_buff *skb, *next;
2744 
2745 	while ((skb = sk->sk_backlog.head) != NULL) {
2746 		sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
2747 
2748 		spin_unlock_bh(&sk->sk_lock.slock);
2749 
2750 		do {
2751 			next = skb->next;
2752 			prefetch(next);
2753 			WARN_ON_ONCE(skb_dst_is_noref(skb));
2754 			skb_mark_not_on_list(skb);
2755 			sk_backlog_rcv(sk, skb);
2756 
2757 			cond_resched();
2758 
2759 			skb = next;
2760 		} while (skb != NULL);
2761 
2762 		spin_lock_bh(&sk->sk_lock.slock);
2763 	}
2764 
2765 	/*
2766 	 * Doing the zeroing here guarantees we cannot loop forever
2767 	 * while a wild producer attempts to flood us.
2768 	 */
2769 	sk->sk_backlog.len = 0;
2770 }
2771 
2772 void __sk_flush_backlog(struct sock *sk)
2773 {
2774 	spin_lock_bh(&sk->sk_lock.slock);
2775 	__release_sock(sk);
2776 	spin_unlock_bh(&sk->sk_lock.slock);
2777 }
2778 
2779 /**
2780  * sk_wait_data - wait for data to arrive at sk_receive_queue
2781  * @sk:    sock to wait on
2782  * @timeo: for how long
2783  * @skb:   last skb seen on sk_receive_queue
2784  *
2785  * Socket state, including sk->sk_err, is changed only under the lock,
2786  * hence we may omit checks after joining the wait queue.
2787  * We check the receive queue before schedule() only as an optimization;
2788  * it is very likely that release_sock() added new data.
2789  */
2790 int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
2791 {
2792 	DEFINE_WAIT_FUNC(wait, woken_wake_function);
2793 	int rc;
2794 
2795 	add_wait_queue(sk_sleep(sk), &wait);
2796 	sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2797 	rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
2798 	sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2799 	remove_wait_queue(sk_sleep(sk), &wait);
2800 	return rc;
2801 }
2802 EXPORT_SYMBOL(sk_wait_data);
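
/* Illustrative sketch only of a blocking receive loop built on sk_wait_data();
 * the caller holds the socket lock and error handling is elided:
 *
 *	long timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
 *
 *	while (!(skb = skb_peek(&sk->sk_receive_queue))) {
 *		if (!timeo || signal_pending(current))
 *			break;
 *		sk_wait_data(sk, &timeo, NULL);
 *	}
 */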
2803 
2804 /**
2805  *	__sk_mem_raise_allocated - increase memory_allocated
2806  *	@sk: socket
2807  *	@size: memory size to allocate
2808  *	@amt: pages to allocate
2809  *	@kind: allocation type
2810  *
2811  *	Similar to __sk_mem_schedule(), but does not update sk_forward_alloc
2812  */
2813 int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
2814 {
2815 	struct proto *prot = sk->sk_prot;
2816 	long allocated = sk_memory_allocated_add(sk, amt);
2817 	bool memcg_charge = mem_cgroup_sockets_enabled && sk->sk_memcg;
2818 	bool charged = true;
2819 
2820 	if (memcg_charge &&
2821 	    !(charged = mem_cgroup_charge_skmem(sk->sk_memcg, amt,
2822 						gfp_memcg_charge())))
2823 		goto suppress_allocation;
2824 
2825 	/* Under limit. */
2826 	if (allocated <= sk_prot_mem_limits(sk, 0)) {
2827 		sk_leave_memory_pressure(sk);
2828 		return 1;
2829 	}
2830 
2831 	/* Under pressure. */
2832 	if (allocated > sk_prot_mem_limits(sk, 1))
2833 		sk_enter_memory_pressure(sk);
2834 
2835 	/* Over hard limit. */
2836 	if (allocated > sk_prot_mem_limits(sk, 2))
2837 		goto suppress_allocation;
2838 
2839 	/* guarantee minimum buffer size under pressure */
2840 	if (kind == SK_MEM_RECV) {
2841 		if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot))
2842 			return 1;
2843 
2844 	} else { /* SK_MEM_SEND */
2845 		int wmem0 = sk_get_wmem0(sk, prot);
2846 
2847 		if (sk->sk_type == SOCK_STREAM) {
2848 			if (sk->sk_wmem_queued < wmem0)
2849 				return 1;
2850 		} else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) {
2851 				return 1;
2852 		}
2853 	}
2854 
2855 	if (sk_has_memory_pressure(sk)) {
2856 		u64 alloc;
2857 
2858 		if (!sk_under_memory_pressure(sk))
2859 			return 1;
2860 		alloc = sk_sockets_allocated_read_positive(sk);
2861 		if (sk_prot_mem_limits(sk, 2) > alloc *
2862 		    sk_mem_pages(sk->sk_wmem_queued +
2863 				 atomic_read(&sk->sk_rmem_alloc) +
2864 				 sk->sk_forward_alloc))
2865 			return 1;
2866 	}
2867 
2868 suppress_allocation:
2869 
2870 	if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
2871 		sk_stream_moderate_sndbuf(sk);
2872 
2873 		/* Fail only if socket is _under_ its sndbuf.
2874 		 * In this case we cannot block, so we have to fail.
2875 		 */
2876 		if (sk->sk_wmem_queued + size >= sk->sk_sndbuf) {
2877 			/* Force charge with __GFP_NOFAIL */
2878 			if (memcg_charge && !charged) {
2879 				mem_cgroup_charge_skmem(sk->sk_memcg, amt,
2880 					gfp_memcg_charge() | __GFP_NOFAIL);
2881 			}
2882 			return 1;
2883 		}
2884 	}
2885 
2886 	if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged))
2887 		trace_sock_exceed_buf_limit(sk, prot, allocated, kind);
2888 
2889 	sk_memory_allocated_sub(sk, amt);
2890 
2891 	if (memcg_charge && charged)
2892 		mem_cgroup_uncharge_skmem(sk->sk_memcg, amt);
2893 
2894 	return 0;
2895 }
2896 EXPORT_SYMBOL(__sk_mem_raise_allocated);
2897 
2898 /**
2899  *	__sk_mem_schedule - increase sk_forward_alloc and memory_allocated
2900  *	@sk: socket
2901  *	@size: memory size to allocate
2902  *	@kind: allocation type
2903  *
2904  *	If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
2905  *	rmem allocation. This function assumes that protocols which have
2906  *	memory_pressure use sk_wmem_queued as write buffer accounting.
2907  */
2908 int __sk_mem_schedule(struct sock *sk, int size, int kind)
2909 {
2910 	int ret, amt = sk_mem_pages(size);
2911 
2912 	sk->sk_forward_alloc += amt << SK_MEM_QUANTUM_SHIFT;
2913 	ret = __sk_mem_raise_allocated(sk, size, amt, kind);
2914 	if (!ret)
2915 		sk->sk_forward_alloc -= amt << SK_MEM_QUANTUM_SHIFT;
2916 	return ret;
2917 }
2918 EXPORT_SYMBOL(__sk_mem_schedule);
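
/* Illustrative only: protocols normally reach __sk_mem_schedule() through
 * the sk_rmem_schedule()/sk_wmem_schedule() wrappers, e.g. when queueing a
 * received skb:
 *
 *	if (!sk_rmem_schedule(sk, skb, skb->truesize))
 *		return -ENOBUFS;
 *	skb_set_owner_r(skb, sk);
 *	__skb_queue_tail(&sk->sk_receive_queue, skb);
 */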
2919 
2920 /**
2921  *	__sk_mem_reduce_allocated - reclaim memory_allocated
2922  *	@sk: socket
2923  *	@amount: number of quanta
2924  *
2925  *	Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
2926  */
2927 void __sk_mem_reduce_allocated(struct sock *sk, int amount)
2928 {
2929 	sk_memory_allocated_sub(sk, amount);
2930 
2931 	if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2932 		mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
2933 
2934 	if (sk_under_memory_pressure(sk) &&
2935 	    (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
2936 		sk_leave_memory_pressure(sk);
2937 }
2938 EXPORT_SYMBOL(__sk_mem_reduce_allocated);
2939 
2940 /**
2941  *	__sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
2942  *	@sk: socket
2943  *	@amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple)
2944  */
2945 void __sk_mem_reclaim(struct sock *sk, int amount)
2946 {
2947 	amount >>= SK_MEM_QUANTUM_SHIFT;
2948 	sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
2949 	__sk_mem_reduce_allocated(sk, amount);
2950 }
2951 EXPORT_SYMBOL(__sk_mem_reclaim);
2952 
2953 int sk_set_peek_off(struct sock *sk, int val)
2954 {
2955 	sk->sk_peek_off = val;
2956 	return 0;
2957 }
2958 EXPORT_SYMBOL_GPL(sk_set_peek_off);
2959 
2960 /*
2961  * Set of default routines for initialising struct proto_ops when
2962  * the protocol does not support a particular function. In certain
2963  * cases where it makes no sense for a protocol to have a "do nothing"
2964  * function, some default processing is provided.
2965  */
2966 
2967 int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
2968 {
2969 	return -EOPNOTSUPP;
2970 }
2971 EXPORT_SYMBOL(sock_no_bind);
2972 
2973 int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
2974 		    int len, int flags)
2975 {
2976 	return -EOPNOTSUPP;
2977 }
2978 EXPORT_SYMBOL(sock_no_connect);
2979 
2980 int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
2981 {
2982 	return -EOPNOTSUPP;
2983 }
2984 EXPORT_SYMBOL(sock_no_socketpair);
2985 
2986 int sock_no_accept(struct socket *sock, struct socket *newsock, int flags,
2987 		   bool kern)
2988 {
2989 	return -EOPNOTSUPP;
2990 }
2991 EXPORT_SYMBOL(sock_no_accept);
2992 
2993 int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
2994 		    int peer)
2995 {
2996 	return -EOPNOTSUPP;
2997 }
2998 EXPORT_SYMBOL(sock_no_getname);
2999 
3000 int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
3001 {
3002 	return -EOPNOTSUPP;
3003 }
3004 EXPORT_SYMBOL(sock_no_ioctl);
3005 
3006 int sock_no_listen(struct socket *sock, int backlog)
3007 {
3008 	return -EOPNOTSUPP;
3009 }
3010 EXPORT_SYMBOL(sock_no_listen);
3011 
3012 int sock_no_shutdown(struct socket *sock, int how)
3013 {
3014 	return -EOPNOTSUPP;
3015 }
3016 EXPORT_SYMBOL(sock_no_shutdown);
3017 
3018 int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
3019 {
3020 	return -EOPNOTSUPP;
3021 }
3022 EXPORT_SYMBOL(sock_no_sendmsg);
3023 
3024 int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len)
3025 {
3026 	return -EOPNOTSUPP;
3027 }
3028 EXPORT_SYMBOL(sock_no_sendmsg_locked);
3029 
3030 int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
3031 		    int flags)
3032 {
3033 	return -EOPNOTSUPP;
3034 }
3035 EXPORT_SYMBOL(sock_no_recvmsg);
3036 
3037 int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
3038 {
3039 	/* Mirror missing mmap method error code */
3040 	return -ENODEV;
3041 }
3042 EXPORT_SYMBOL(sock_no_mmap);
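
/* Illustrative only ("my_ops" is hypothetical, and the initializer is
 * abbreviated): protocols plug these stubs into their proto_ops for the
 * operations they do not implement, e.g.
 *
 *	static const struct proto_ops my_ops = {
 *		.family     = PF_INET,
 *		.owner      = THIS_MODULE,
 *		.bind       = sock_no_bind,
 *		.accept     = sock_no_accept,
 *		.listen     = sock_no_listen,
 *		.mmap       = sock_no_mmap,
 *		...
 *	};
 */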
3043 
3044 /*
3045  * When a file is received (via SCM_RIGHTS, etc), we must bump the
3046  * various sock-based usage counts.
3047  */
3048 void __receive_sock(struct file *file)
3049 {
3050 	struct socket *sock;
3051 
3052 	sock = sock_from_file(file);
3053 	if (sock) {
3054 		sock_update_netprioidx(&sock->sk->sk_cgrp_data);
3055 		sock_update_classid(&sock->sk->sk_cgrp_data);
3056 	}
3057 }
3058 
3059 ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
3060 {
3061 	ssize_t res;
3062 	struct msghdr msg = {.msg_flags = flags};
3063 	struct kvec iov;
3064 	char *kaddr = kmap(page);
3065 	iov.iov_base = kaddr + offset;
3066 	iov.iov_len = size;
3067 	res = kernel_sendmsg(sock, &msg, &iov, 1, size);
3068 	kunmap(page);
3069 	return res;
3070 }
3071 EXPORT_SYMBOL(sock_no_sendpage);
3072 
3073 ssize_t sock_no_sendpage_locked(struct sock *sk, struct page *page,
3074 				int offset, size_t size, int flags)
3075 {
3076 	ssize_t res;
3077 	struct msghdr msg = {.msg_flags = flags};
3078 	struct kvec iov;
3079 	char *kaddr = kmap(page);
3080 
3081 	iov.iov_base = kaddr + offset;
3082 	iov.iov_len = size;
3083 	res = kernel_sendmsg_locked(sk, &msg, &iov, 1, size);
3084 	kunmap(page);
3085 	return res;
3086 }
3087 EXPORT_SYMBOL(sock_no_sendpage_locked);
3088 
3089 /*
3090  *	Default Socket Callbacks
3091  */
3092 
3093 static void sock_def_wakeup(struct sock *sk)
3094 {
3095 	struct socket_wq *wq;
3096 
3097 	rcu_read_lock();
3098 	wq = rcu_dereference(sk->sk_wq);
3099 	if (skwq_has_sleeper(wq))
3100 		wake_up_interruptible_all(&wq->wait);
3101 	rcu_read_unlock();
3102 }
3103 
3104 static void sock_def_error_report(struct sock *sk)
3105 {
3106 	struct socket_wq *wq;
3107 
3108 	rcu_read_lock();
3109 	wq = rcu_dereference(sk->sk_wq);
3110 	if (skwq_has_sleeper(wq))
3111 		wake_up_interruptible_poll(&wq->wait, EPOLLERR);
3112 	sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
3113 	rcu_read_unlock();
3114 }
3115 
3116 void sock_def_readable(struct sock *sk)
3117 {
3118 	struct socket_wq *wq;
3119 
3120 	rcu_read_lock();
3121 	wq = rcu_dereference(sk->sk_wq);
3122 	if (skwq_has_sleeper(wq))
3123 		wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
3124 						EPOLLRDNORM | EPOLLRDBAND);
3125 	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
3126 	rcu_read_unlock();
3127 }
3128 
3129 static void sock_def_write_space(struct sock *sk)
3130 {
3131 	struct socket_wq *wq;
3132 
3133 	rcu_read_lock();
3134 
3135 	/* Do not wake up a writer until he can make "significant"
3136 	 * progress.  --DaveM
3137 	 */
3138 	if ((refcount_read(&sk->sk_wmem_alloc) << 1) <= READ_ONCE(sk->sk_sndbuf)) {
3139 		wq = rcu_dereference(sk->sk_wq);
3140 		if (skwq_has_sleeper(wq))
3141 			wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
3142 						EPOLLWRNORM | EPOLLWRBAND);
3143 
3144 		/* Should agree with poll, otherwise some programs break */
3145 		if (sock_writeable(sk))
3146 			sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
3147 	}
3148 
3149 	rcu_read_unlock();
3150 }
3151 
3152 static void sock_def_destruct(struct sock *sk)
3153 {
3154 }
3155 
3156 void sk_send_sigurg(struct sock *sk)
3157 {
3158 	if (sk->sk_socket && sk->sk_socket->file)
3159 		if (send_sigurg(&sk->sk_socket->file->f_owner))
3160 			sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
3161 }
3162 EXPORT_SYMBOL(sk_send_sigurg);
3163 
3164 void sk_reset_timer(struct sock *sk, struct timer_list* timer,
3165 		    unsigned long expires)
3166 {
3167 	if (!mod_timer(timer, expires))
3168 		sock_hold(sk);
3169 }
3170 EXPORT_SYMBOL(sk_reset_timer);
3171 
3172 void sk_stop_timer(struct sock *sk, struct timer_list* timer)
3173 {
3174 	if (del_timer(timer))
3175 		__sock_put(sk);
3176 }
3177 EXPORT_SYMBOL(sk_stop_timer);
3178 
3179 void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer)
3180 {
3181 	if (del_timer_sync(timer))
3182 		__sock_put(sk);
3183 }
3184 EXPORT_SYMBOL(sk_stop_timer_sync);
3185 
3186 void sock_init_data(struct socket *sock, struct sock *sk)
3187 {
3188 	sk_init_common(sk);
3189 	sk->sk_send_head	=	NULL;
3190 
3191 	timer_setup(&sk->sk_timer, NULL, 0);
3192 
3193 	sk->sk_allocation	=	GFP_KERNEL;
3194 	sk->sk_rcvbuf		=	sysctl_rmem_default;
3195 	sk->sk_sndbuf		=	sysctl_wmem_default;
3196 	sk->sk_state		=	TCP_CLOSE;
3197 	sk_set_socket(sk, sock);
3198 
3199 	sock_set_flag(sk, SOCK_ZAPPED);
3200 
3201 	if (sock) {
3202 		sk->sk_type	=	sock->type;
3203 		RCU_INIT_POINTER(sk->sk_wq, &sock->wq);
3204 		sock->sk	=	sk;
3205 		sk->sk_uid	=	SOCK_INODE(sock)->i_uid;
3206 	} else {
3207 		RCU_INIT_POINTER(sk->sk_wq, NULL);
3208 		sk->sk_uid	=	make_kuid(sock_net(sk)->user_ns, 0);
3209 	}
3210 
3211 	rwlock_init(&sk->sk_callback_lock);
3212 	if (sk->sk_kern_sock)
3213 		lockdep_set_class_and_name(
3214 			&sk->sk_callback_lock,
3215 			af_kern_callback_keys + sk->sk_family,
3216 			af_family_kern_clock_key_strings[sk->sk_family]);
3217 	else
3218 		lockdep_set_class_and_name(
3219 			&sk->sk_callback_lock,
3220 			af_callback_keys + sk->sk_family,
3221 			af_family_clock_key_strings[sk->sk_family]);
3222 
3223 	sk->sk_state_change	=	sock_def_wakeup;
3224 	sk->sk_data_ready	=	sock_def_readable;
3225 	sk->sk_write_space	=	sock_def_write_space;
3226 	sk->sk_error_report	=	sock_def_error_report;
3227 	sk->sk_destruct		=	sock_def_destruct;
3228 
3229 	sk->sk_frag.page	=	NULL;
3230 	sk->sk_frag.offset	=	0;
3231 	sk->sk_peek_off		=	-1;
3232 
3233 	sk->sk_peer_pid 	=	NULL;
3234 	sk->sk_peer_cred	=	NULL;
3235 	spin_lock_init(&sk->sk_peer_lock);
3236 
3237 	sk->sk_write_pending	=	0;
3238 	sk->sk_rcvlowat		=	1;
3239 	sk->sk_rcvtimeo		=	MAX_SCHEDULE_TIMEOUT;
3240 	sk->sk_sndtimeo		=	MAX_SCHEDULE_TIMEOUT;
3241 
3242 	sk->sk_stamp = SK_DEFAULT_STAMP;
3243 #if BITS_PER_LONG==32
3244 	seqlock_init(&sk->sk_stamp_seq);
3245 #endif
3246 	atomic_set(&sk->sk_zckey, 0);
3247 
3248 #ifdef CONFIG_NET_RX_BUSY_POLL
3249 	sk->sk_napi_id		=	0;
3250 	sk->sk_ll_usec		=	sysctl_net_busy_read;
3251 #endif
3252 
3253 	sk->sk_max_pacing_rate = ~0UL;
3254 	sk->sk_pacing_rate = ~0UL;
3255 	WRITE_ONCE(sk->sk_pacing_shift, 10);
3256 	sk->sk_incoming_cpu = -1;
3257 
3258 	sk_rx_queue_clear(sk);
3259 	/*
3260 	 * Before updating sk_refcnt, we must commit prior changes to memory
3261 	 * (Documentation/RCU/rculist_nulls.rst for details)
3262 	 */
3263 	smp_wmb();
3264 	refcount_set(&sk->sk_refcnt, 1);
3265 	atomic_set(&sk->sk_drops, 0);
3266 }
3267 EXPORT_SYMBOL(sock_init_data);
3268 
3269 void lock_sock_nested(struct sock *sk, int subclass)
3270 {
3271 	/* The sk_lock has mutex_lock() semantics here. */
3272 	mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
3273 
3274 	might_sleep();
3275 	spin_lock_bh(&sk->sk_lock.slock);
3276 	if (sk->sk_lock.owned)
3277 		__lock_sock(sk);
3278 	sk->sk_lock.owned = 1;
3279 	spin_unlock_bh(&sk->sk_lock.slock);
3280 }
3281 EXPORT_SYMBOL(lock_sock_nested);
3282 
3283 void release_sock(struct sock *sk)
3284 {
3285 	spin_lock_bh(&sk->sk_lock.slock);
3286 	if (sk->sk_backlog.tail)
3287 		__release_sock(sk);
3288 
3289 	/* Warning: release_cb() might need to release sk ownership,
3290 	 * i.e. call sock_release_ownership(sk) before us.
3291 	 */
3292 	if (sk->sk_prot->release_cb)
3293 		sk->sk_prot->release_cb(sk);
3294 
3295 	sock_release_ownership(sk);
3296 	if (waitqueue_active(&sk->sk_lock.wq))
3297 		wake_up(&sk->sk_lock.wq);
3298 	spin_unlock_bh(&sk->sk_lock.slock);
3299 }
3300 EXPORT_SYMBOL(release_sock);
3301 
3302 bool __lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock)
3303 {
3304 	might_sleep();
3305 	spin_lock_bh(&sk->sk_lock.slock);
3306 
3307 	if (!sk->sk_lock.owned) {
3308 		/*
3309 		 * Fast path return with bottom halves disabled and
3310 		 * sock::sk_lock.slock held.
3311 		 *
3312 		 * The 'mutex' is not contended and holding
3313 		 * sock::sk_lock.slock prevents all other lockers from
3314 		 * proceeding, so the corresponding unlock_sock_fast() can
3315 		 * avoid the slow path of release_sock() completely and
3316 		 * just release slock.
3317 		 *
3318 		 * From a semantic POV this is equivalent to 'acquiring'
3319 		 * the 'mutex', hence the corresponding lockdep
3320 		 * mutex_release() has to happen in the fast path of
3321 		 * unlock_sock_fast().
3322 		 */
3323 		return false;
3324 	}
3325 
3326 	__lock_sock(sk);
3327 	sk->sk_lock.owned = 1;
3328 	__acquire(&sk->sk_lock.slock);
3329 	spin_unlock_bh(&sk->sk_lock.slock);
3330 	return true;
3331 }
3332 EXPORT_SYMBOL(__lock_sock_fast);
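
/* Illustrative only: callers normally use the lock_sock_fast() and
 * unlock_sock_fast() wrappers from include/net/sock.h rather than calling
 * __lock_sock_fast() directly:
 *
 *	bool slow = lock_sock_fast(sk);
 *	... short critical section ...
 *	unlock_sock_fast(sk, slow);
 */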
3333 
3334 int sock_gettstamp(struct socket *sock, void __user *userstamp,
3335 		   bool timeval, bool time32)
3336 {
3337 	struct sock *sk = sock->sk;
3338 	struct timespec64 ts;
3339 
3340 	sock_enable_timestamp(sk, SOCK_TIMESTAMP);
3341 	ts = ktime_to_timespec64(sock_read_timestamp(sk));
3342 	if (ts.tv_sec == -1)
3343 		return -ENOENT;
3344 	if (ts.tv_sec == 0) {
3345 		ktime_t kt = ktime_get_real();
3346 		sock_write_timestamp(sk, kt);
3347 		ts = ktime_to_timespec64(kt);
3348 	}
3349 
3350 	if (timeval)
3351 		ts.tv_nsec /= 1000;
3352 
3353 #ifdef CONFIG_COMPAT_32BIT_TIME
3354 	if (time32)
3355 		return put_old_timespec32(&ts, userstamp);
3356 #endif
3357 #ifdef CONFIG_SPARC64
3358 	/* beware of padding in sparc64 timeval */
3359 	if (timeval && !in_compat_syscall()) {
3360 		struct __kernel_old_timeval __user tv = {
3361 			.tv_sec = ts.tv_sec,
3362 			.tv_usec = ts.tv_nsec,
3363 		};
3364 		if (copy_to_user(userstamp, &tv, sizeof(tv)))
3365 			return -EFAULT;
3366 		return 0;
3367 	}
3368 #endif
3369 	return put_timespec64(&ts, userstamp);
3370 }
3371 EXPORT_SYMBOL(sock_gettstamp);
3372 
3373 void sock_enable_timestamp(struct sock *sk, enum sock_flags flag)
3374 {
3375 	if (!sock_flag(sk, flag)) {
3376 		unsigned long previous_flags = sk->sk_flags;
3377 
3378 		sock_set_flag(sk, flag);
3379 		/*
3380 		 * we just set one of the two flags which require net
3381 		 * time stamping, but time stamping might have been on
3382 		 * already because of the other one
3383 		 */
3384 		if (sock_needs_netstamp(sk) &&
3385 		    !(previous_flags & SK_FLAGS_TIMESTAMP))
3386 			net_enable_timestamp();
3387 	}
3388 }
3389 
3390 int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
3391 		       int level, int type)
3392 {
3393 	struct sock_exterr_skb *serr;
3394 	struct sk_buff *skb;
3395 	int copied, err;
3396 
3397 	err = -EAGAIN;
3398 	skb = sock_dequeue_err_skb(sk);
3399 	if (skb == NULL)
3400 		goto out;
3401 
3402 	copied = skb->len;
3403 	if (copied > len) {
3404 		msg->msg_flags |= MSG_TRUNC;
3405 		copied = len;
3406 	}
3407 	err = skb_copy_datagram_msg(skb, 0, msg, copied);
3408 	if (err)
3409 		goto out_free_skb;
3410 
3411 	sock_recv_timestamp(msg, sk, skb);
3412 
3413 	serr = SKB_EXT_ERR(skb);
3414 	put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
3415 
3416 	msg->msg_flags |= MSG_ERRQUEUE;
3417 	err = copied;
3418 
3419 out_free_skb:
3420 	kfree_skb(skb);
3421 out:
3422 	return err;
3423 }
3424 EXPORT_SYMBOL(sock_recv_errqueue);
3425 
3426 /*
3427  *	Get a socket option on a socket.
3428  *
3429  *	FIX: POSIX 1003.1g is very ambiguous here. It states that
3430  *	asynchronous errors should be reported by getsockopt. We assume
3431  *	this means if you specify SO_ERROR (otherwise what's the point of it).
3432  */
3433 int sock_common_getsockopt(struct socket *sock, int level, int optname,
3434 			   char __user *optval, int __user *optlen)
3435 {
3436 	struct sock *sk = sock->sk;
3437 
3438 	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
3439 }
3440 EXPORT_SYMBOL(sock_common_getsockopt);
3441 
3442 int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
3443 			int flags)
3444 {
3445 	struct sock *sk = sock->sk;
3446 	int addr_len = 0;
3447 	int err;
3448 
3449 	err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
3450 				   flags & ~MSG_DONTWAIT, &addr_len);
3451 	if (err >= 0)
3452 		msg->msg_namelen = addr_len;
3453 	return err;
3454 }
3455 EXPORT_SYMBOL(sock_common_recvmsg);
3456 
3457 /*
3458  *	Set socket options on an inet socket.
3459  */
3460 int sock_common_setsockopt(struct socket *sock, int level, int optname,
3461 			   sockptr_t optval, unsigned int optlen)
3462 {
3463 	struct sock *sk = sock->sk;
3464 
3465 	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
3466 }
3467 EXPORT_SYMBOL(sock_common_setsockopt);
3468 
3469 void sk_common_release(struct sock *sk)
3470 {
3471 	if (sk->sk_prot->destroy)
3472 		sk->sk_prot->destroy(sk);
3473 
3474 	/*
3475 	 * Observation: when sk_common_release is called, processes have
3476 	 * no access to the socket, but the network stack still does.
3477 	 * Step one, detach it from networking:
3478 	 *
3479 	 * A. Remove from hash tables.
3480 	 */
3481 
3482 	sk->sk_prot->unhash(sk);
3483 
3484 	/*
3485 	 * At this point the socket cannot receive new packets, but it is possible
3486 	 * that some packets are in flight because some CPU is running the receiver
3487 	 * and did the hash table lookup before we unhashed the socket. They will
3488 	 * reach the receive queue and will be purged by the socket destructor.
3489 	 *
3490 	 * Also, we still have packets pending on the receive queue and, probably,
3491 	 * our own packets waiting in device queues. sock_destroy will drain the
3492 	 * receive queue, but transmitted packets will delay socket destruction
3493 	 * until the last reference is released.
3494 	 */
3495 
3496 	sock_orphan(sk);
3497 
3498 	xfrm_sk_free_policy(sk);
3499 
3500 	sk_refcnt_debug_release(sk);
3501 
3502 	sock_put(sk);
3503 }
3504 EXPORT_SYMBOL(sk_common_release);
3505 
3506 void sk_get_meminfo(const struct sock *sk, u32 *mem)
3507 {
3508 	memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);
3509 
3510 	mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
3511 	mem[SK_MEMINFO_RCVBUF] = READ_ONCE(sk->sk_rcvbuf);
3512 	mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
3513 	mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf);
3514 	mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc;
3515 	mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued);
3516 	mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
3517 	mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len);
3518 	mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
3519 }
3520 
3521 #ifdef CONFIG_PROC_FS
3522 #define PROTO_INUSE_NR	64	/* should be enough for the first time */
3523 struct prot_inuse {
3524 	int val[PROTO_INUSE_NR];
3525 };
3526 
3527 static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
3528 
3529 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
3530 {
3531 	__this_cpu_add(net->core.prot_inuse->val[prot->inuse_idx], val);
3532 }
3533 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
3534 
3535 int sock_prot_inuse_get(struct net *net, struct proto *prot)
3536 {
3537 	int cpu, idx = prot->inuse_idx;
3538 	int res = 0;
3539 
3540 	for_each_possible_cpu(cpu)
3541 		res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx];
3542 
3543 	return res >= 0 ? res : 0;
3544 }
3545 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
3546 
3547 static void sock_inuse_add(struct net *net, int val)
3548 {
3549 	this_cpu_add(*net->core.sock_inuse, val);
3550 }
3551 
3552 int sock_inuse_get(struct net *net)
3553 {
3554 	int cpu, res = 0;
3555 
3556 	for_each_possible_cpu(cpu)
3557 		res += *per_cpu_ptr(net->core.sock_inuse, cpu);
3558 
3559 	return res;
3560 }
3562 EXPORT_SYMBOL_GPL(sock_inuse_get);
3563 
3564 static int __net_init sock_inuse_init_net(struct net *net)
3565 {
3566 	net->core.prot_inuse = alloc_percpu(struct prot_inuse);
3567 	if (net->core.prot_inuse == NULL)
3568 		return -ENOMEM;
3569 
3570 	net->core.sock_inuse = alloc_percpu(int);
3571 	if (net->core.sock_inuse == NULL)
3572 		goto out;
3573 
3574 	return 0;
3575 
3576 out:
3577 	free_percpu(net->core.prot_inuse);
3578 	return -ENOMEM;
3579 }
3580 
3581 static void __net_exit sock_inuse_exit_net(struct net *net)
3582 {
3583 	free_percpu(net->core.prot_inuse);
3584 	free_percpu(net->core.sock_inuse);
3585 }
3586 
3587 static struct pernet_operations net_inuse_ops = {
3588 	.init = sock_inuse_init_net,
3589 	.exit = sock_inuse_exit_net,
3590 };
3591 
3592 static __init int net_inuse_init(void)
3593 {
3594 	if (register_pernet_subsys(&net_inuse_ops))
3595 		panic("Cannot initialize net inuse counters");
3596 
3597 	return 0;
3598 }
3599 
3600 core_initcall(net_inuse_init);
3601 
3602 static int assign_proto_idx(struct proto *prot)
3603 {
3604 	prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
3605 
3606 	if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
3607 		pr_err("PROTO_INUSE_NR exhausted\n");
3608 		return -ENOSPC;
3609 	}
3610 
3611 	set_bit(prot->inuse_idx, proto_inuse_idx);
3612 	return 0;
3613 }
3614 
3615 static void release_proto_idx(struct proto *prot)
3616 {
3617 	if (prot->inuse_idx != PROTO_INUSE_NR - 1)
3618 		clear_bit(prot->inuse_idx, proto_inuse_idx);
3619 }
3620 #else
3621 static inline int assign_proto_idx(struct proto *prot)
3622 {
3623 	return 0;
3624 }
3625 
3626 static inline void release_proto_idx(struct proto *prot)
3627 {
3628 }
3629 
3630 static void sock_inuse_add(struct net *net, int val)
3631 {
3632 }
3633 #endif
3634 
3635 static void tw_prot_cleanup(struct timewait_sock_ops *twsk_prot)
3636 {
3637 	if (!twsk_prot)
3638 		return;
3639 	kfree(twsk_prot->twsk_slab_name);
3640 	twsk_prot->twsk_slab_name = NULL;
3641 	kmem_cache_destroy(twsk_prot->twsk_slab);
3642 	twsk_prot->twsk_slab = NULL;
3643 }
3644 
3645 static int tw_prot_init(const struct proto *prot)
3646 {
3647 	struct timewait_sock_ops *twsk_prot = prot->twsk_prot;
3648 
3649 	if (!twsk_prot)
3650 		return 0;
3651 
3652 	twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s",
3653 					      prot->name);
3654 	if (!twsk_prot->twsk_slab_name)
3655 		return -ENOMEM;
3656 
3657 	twsk_prot->twsk_slab =
3658 		kmem_cache_create(twsk_prot->twsk_slab_name,
3659 				  twsk_prot->twsk_obj_size, 0,
3660 				  SLAB_ACCOUNT | prot->slab_flags,
3661 				  NULL);
3662 	if (!twsk_prot->twsk_slab) {
3663 		pr_crit("%s: Can't create timewait sock SLAB cache!\n",
3664 			prot->name);
3665 		return -ENOMEM;
3666 	}
3667 
3668 	return 0;
3669 }
3670 
3671 static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
3672 {
3673 	if (!rsk_prot)
3674 		return;
3675 	kfree(rsk_prot->slab_name);
3676 	rsk_prot->slab_name = NULL;
3677 	kmem_cache_destroy(rsk_prot->slab);
3678 	rsk_prot->slab = NULL;
3679 }
3680 
3681 static int req_prot_init(const struct proto *prot)
3682 {
3683 	struct request_sock_ops *rsk_prot = prot->rsk_prot;
3684 
3685 	if (!rsk_prot)
3686 		return 0;
3687 
3688 	rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
3689 					prot->name);
3690 	if (!rsk_prot->slab_name)
3691 		return -ENOMEM;
3692 
3693 	rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
3694 					   rsk_prot->obj_size, 0,
3695 					   SLAB_ACCOUNT | prot->slab_flags,
3696 					   NULL);
3697 
3698 	if (!rsk_prot->slab) {
3699 		pr_crit("%s: Can't create request sock SLAB cache!\n",
3700 			prot->name);
3701 		return -ENOMEM;
3702 	}
3703 	return 0;
3704 }
3705 
3706 int proto_register(struct proto *prot, int alloc_slab)
3707 {
3708 	int ret = -ENOBUFS;
3709 
3710 	if (alloc_slab) {
3711 		prot->slab = kmem_cache_create_usercopy(prot->name,
3712 					prot->obj_size, 0,
3713 					SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT |
3714 					prot->slab_flags,
3715 					prot->useroffset, prot->usersize,
3716 					NULL);
3717 
3718 		if (prot->slab == NULL) {
3719 			pr_crit("%s: Can't create sock SLAB cache!\n",
3720 				prot->name);
3721 			goto out;
3722 		}
3723 
3724 		if (req_prot_init(prot))
3725 			goto out_free_request_sock_slab;
3726 
3727 		if (tw_prot_init(prot))
3728 			goto out_free_timewait_sock_slab;
3729 	}
3730 
3731 	mutex_lock(&proto_list_mutex);
3732 	ret = assign_proto_idx(prot);
3733 	if (ret) {
3734 		mutex_unlock(&proto_list_mutex);
3735 		goto out_free_timewait_sock_slab;
3736 	}
3737 	list_add(&prot->node, &proto_list);
3738 	mutex_unlock(&proto_list_mutex);
3739 	return ret;
3740 
3741 out_free_timewait_sock_slab:
3742 	if (alloc_slab)
3743 		tw_prot_cleanup(prot->twsk_prot);
3744 out_free_request_sock_slab:
3745 	if (alloc_slab) {
3746 		req_prot_cleanup(prot->rsk_prot);
3747 
3748 		kmem_cache_destroy(prot->slab);
3749 		prot->slab = NULL;
3750 	}
3751 out:
3752 	return ret;
3753 }
3754 EXPORT_SYMBOL(proto_register);
3755 
3756 void proto_unregister(struct proto *prot)
3757 {
3758 	mutex_lock(&proto_list_mutex);
3759 	release_proto_idx(prot);
3760 	list_del(&prot->node);
3761 	mutex_unlock(&proto_list_mutex);
3762 
3763 	kmem_cache_destroy(prot->slab);
3764 	prot->slab = NULL;
3765 
3766 	req_prot_cleanup(prot->rsk_prot);
3767 	tw_prot_cleanup(prot->twsk_prot);
3768 }
3769 EXPORT_SYMBOL(proto_unregister);
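
/* Illustrative sketch only ("my_proto" and "struct my_sock" are
 * hypothetical): a protocol module pairs the two calls above in its module
 * init/exit paths:
 *
 *	static struct proto my_proto = {
 *		.name     = "MYPROTO",
 *		.owner    = THIS_MODULE,
 *		.obj_size = sizeof(struct my_sock),
 *	};
 *
 *	err = proto_register(&my_proto, 1);
 *	...
 *	proto_unregister(&my_proto);
 */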
3770 
3771 int sock_load_diag_module(int family, int protocol)
3772 {
3773 	if (!protocol) {
3774 		if (!sock_is_registered(family))
3775 			return -ENOENT;
3776 
3777 		return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
3778 				      NETLINK_SOCK_DIAG, family);
3779 	}
3780 
3781 #ifdef CONFIG_INET
3782 	if (family == AF_INET &&
3783 	    protocol != IPPROTO_RAW &&
3784 	    protocol < MAX_INET_PROTOS &&
3785 	    !rcu_access_pointer(inet_protos[protocol]))
3786 		return -ENOENT;
3787 #endif
3788 
3789 	return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK,
3790 			      NETLINK_SOCK_DIAG, family, protocol);
3791 }
3792 EXPORT_SYMBOL(sock_load_diag_module);
3793 
3794 #ifdef CONFIG_PROC_FS
3795 static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
3796 	__acquires(proto_list_mutex)
3797 {
3798 	mutex_lock(&proto_list_mutex);
3799 	return seq_list_start_head(&proto_list, *pos);
3800 }
3801 
3802 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3803 {
3804 	return seq_list_next(v, &proto_list, pos);
3805 }
3806 
3807 static void proto_seq_stop(struct seq_file *seq, void *v)
3808 	__releases(proto_list_mutex)
3809 {
3810 	mutex_unlock(&proto_list_mutex);
3811 }
3812 
3813 static char proto_method_implemented(const void *method)
3814 {
3815 	return method == NULL ? 'n' : 'y';
3816 }
3817 static long sock_prot_memory_allocated(struct proto *proto)
3818 {
3819 	return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
3820 }
3821 
3822 static const char *sock_prot_memory_pressure(struct proto *proto)
3823 {
3824 	return proto->memory_pressure != NULL ?
3825 	proto_memory_pressure(proto) ? "yes" : "no" : "NI";
3826 }
3827 
3828 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
3829 {
3830 
3831 	seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
3832 			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
3833 		   proto->name,
3834 		   proto->obj_size,
3835 		   sock_prot_inuse_get(seq_file_net(seq), proto),
3836 		   sock_prot_memory_allocated(proto),
3837 		   sock_prot_memory_pressure(proto),
3838 		   proto->max_header,
3839 		   proto->slab == NULL ? "no" : "yes",
3840 		   module_name(proto->owner),
3841 		   proto_method_implemented(proto->close),
3842 		   proto_method_implemented(proto->connect),
3843 		   proto_method_implemented(proto->disconnect),
3844 		   proto_method_implemented(proto->accept),
3845 		   proto_method_implemented(proto->ioctl),
3846 		   proto_method_implemented(proto->init),
3847 		   proto_method_implemented(proto->destroy),
3848 		   proto_method_implemented(proto->shutdown),
3849 		   proto_method_implemented(proto->setsockopt),
3850 		   proto_method_implemented(proto->getsockopt),
3851 		   proto_method_implemented(proto->sendmsg),
3852 		   proto_method_implemented(proto->recvmsg),
3853 		   proto_method_implemented(proto->sendpage),
3854 		   proto_method_implemented(proto->bind),
3855 		   proto_method_implemented(proto->backlog_rcv),
3856 		   proto_method_implemented(proto->hash),
3857 		   proto_method_implemented(proto->unhash),
3858 		   proto_method_implemented(proto->get_port),
3859 		   proto_method_implemented(proto->enter_memory_pressure));
3860 }
3861 
3862 static int proto_seq_show(struct seq_file *seq, void *v)
3863 {
3864 	if (v == &proto_list)
3865 		seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
3866 			   "protocol",
3867 			   "size",
3868 			   "sockets",
3869 			   "memory",
3870 			   "press",
3871 			   "maxhdr",
3872 			   "slab",
3873 			   "module",
3874 			   "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
3875 	else
3876 		proto_seq_printf(seq, list_entry(v, struct proto, node));
3877 	return 0;
3878 }
3879 
3880 static const struct seq_operations proto_seq_ops = {
3881 	.start  = proto_seq_start,
3882 	.next   = proto_seq_next,
3883 	.stop   = proto_seq_stop,
3884 	.show   = proto_seq_show,
3885 };
3886 
3887 static __net_init int proto_init_net(struct net *net)
3888 {
3889 	if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops,
3890 			sizeof(struct seq_net_private)))
3891 		return -ENOMEM;
3892 
3893 	return 0;
3894 }
3895 
3896 static __net_exit void proto_exit_net(struct net *net)
3897 {
3898 	remove_proc_entry("protocols", net->proc_net);
3899 }
3900 
3901 
3903 	.init = proto_init_net,
3904 	.exit = proto_exit_net,
3905 };
3906 
3907 static int __init proto_init(void)
3908 {
3909 	return register_pernet_subsys(&proto_net_ops);
3910 }
3911 
3912 subsys_initcall(proto_init);
3913 
3914 #endif /* PROC_FS */
3915 
3916 #ifdef CONFIG_NET_RX_BUSY_POLL
3917 bool sk_busy_loop_end(void *p, unsigned long start_time)
3918 {
3919 	struct sock *sk = p;
3920 
3921 	return !skb_queue_empty_lockless(&sk->sk_receive_queue) ||
3922 	       sk_busy_loop_timeout(sk, start_time);
3923 }
3924 EXPORT_SYMBOL(sk_busy_loop_end);
3925 #endif /* CONFIG_NET_RX_BUSY_POLL */
3926 
3927 int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len)
3928 {
3929 	if (!sk->sk_prot->bind_add)
3930 		return -EOPNOTSUPP;
3931 	return sk->sk_prot->bind_add(sk, addr, addr_len);
3932 }
3933 EXPORT_SYMBOL(sock_bind_add);
3934