xref: /openbmc/linux/net/core/sock.c (revision dde2daa0a279623a6f769b258339df744cc0fdd6)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * INET		An implementation of the TCP/IP protocol suite for the LINUX
4  *		operating system.  INET is implemented using the  BSD Socket
5  *		interface as the means of communication with the user level.
6  *
7  *		Generic socket support routines. Memory allocators, socket lock/release
8  *		handler for protocols to use and generic option handler.
9  *
10  * Authors:	Ross Biro
11  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *		Florian La Roche, <flla@stud.uni-sb.de>
13  *		Alan Cox, <A.Cox@swansea.ac.uk>
14  *
15  * Fixes:
16  *		Alan Cox	: 	Numerous verify_area() problems
17  *		Alan Cox	:	Connecting on a connecting socket
18  *					now returns an error for tcp.
19  *		Alan Cox	:	sock->protocol is set correctly.
20  *					and is not sometimes left as 0.
21  *		Alan Cox	:	connect handles icmp errors on a
22  *					connect properly. Unfortunately there
23  *					is a restart syscall nasty there. I
24  *					can't match BSD without hacking the C
25  *					library. Ideas urgently sought!
26  *		Alan Cox	:	Disallow bind() to addresses that are
27  *					not ours - especially broadcast ones!!
28  *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
29  *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
30  *					instead they leave that for the DESTROY timer.
31  *		Alan Cox	:	Clean up error flag in accept
32  *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
33  *					was buggy. Put a remove_sock() in the handler
34  *					for memory when we hit 0. Also altered the timer
35  *					code. The ACK stuff can wait and needs major
36  *					TCP layer surgery.
37  *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
38  *					and fixed timer/inet_bh race.
39  *		Alan Cox	:	Added zapped flag for TCP
40  *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
41  *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
42  *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
43  *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
44  *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
45  *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
46  *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
47  *	Pauline Middelink	:	identd support
48  *		Alan Cox	:	Fixed connect() taking signals I think.
49  *		Alan Cox	:	SO_LINGER supported
50  *		Alan Cox	:	Error reporting fixes
51  *		Anonymous	:	inet_create tidied up (sk->reuse setting)
52  *		Alan Cox	:	inet sockets don't set sk->type!
53  *		Alan Cox	:	Split socket option code
54  *		Alan Cox	:	Callbacks
55  *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
56  *		Alex		:	Removed restriction on inet fioctl
57  *		Alan Cox	:	Splitting INET from NET core
58  *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
59  *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
60  *		Alan Cox	:	Split IP from generic code
61  *		Alan Cox	:	New kfree_skbmem()
62  *		Alan Cox	:	Make SO_DEBUG superuser only.
63  *		Alan Cox	:	Allow anyone to clear SO_DEBUG
64  *					(compatibility fix)
65  *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
66  *		Alan Cox	:	Allocator for a socket is settable.
67  *		Alan Cox	:	SO_ERROR includes soft errors.
68  *		Alan Cox	:	Allow NULL arguments on some SO_ opts
69  *		Alan Cox	: 	Generic socket allocation to make hooks
70  *					easier (suggested by Craig Metz).
71  *		Michael Pall	:	SO_ERROR returns positive errno again
72  *              Steve Whitehouse:       Added default destructor to free
73  *                                      protocol private data.
74  *              Steve Whitehouse:       Added various other default routines
75  *                                      common to several socket families.
76  *              Chris Evans     :       Call suser() check last on F_SETOWN
77  *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
78  *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
79  *		Andi Kleen	:	Fix write_space callback
80  *		Chris Evans	:	Security fixes - signedness again
81  *		Arnaldo C. Melo :       cleanups, use skb_queue_purge
82  *
83  * To Fix:
84  */
85 
86 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
87 
88 #include <asm/unaligned.h>
89 #include <linux/capability.h>
90 #include <linux/errno.h>
91 #include <linux/errqueue.h>
92 #include <linux/types.h>
93 #include <linux/socket.h>
94 #include <linux/in.h>
95 #include <linux/kernel.h>
96 #include <linux/module.h>
97 #include <linux/proc_fs.h>
98 #include <linux/seq_file.h>
99 #include <linux/sched.h>
100 #include <linux/sched/mm.h>
101 #include <linux/timer.h>
102 #include <linux/string.h>
103 #include <linux/sockios.h>
104 #include <linux/net.h>
105 #include <linux/mm.h>
106 #include <linux/slab.h>
107 #include <linux/interrupt.h>
108 #include <linux/poll.h>
109 #include <linux/tcp.h>
110 #include <linux/init.h>
111 #include <linux/highmem.h>
112 #include <linux/user_namespace.h>
113 #include <linux/static_key.h>
114 #include <linux/memcontrol.h>
115 #include <linux/prefetch.h>
116 #include <linux/compat.h>
117 
118 #include <linux/uaccess.h>
119 
120 #include <linux/netdevice.h>
121 #include <net/protocol.h>
122 #include <linux/skbuff.h>
123 #include <net/net_namespace.h>
124 #include <net/request_sock.h>
125 #include <net/sock.h>
126 #include <linux/net_tstamp.h>
127 #include <net/xfrm.h>
128 #include <linux/ipsec.h>
129 #include <net/cls_cgroup.h>
130 #include <net/netprio_cgroup.h>
131 #include <linux/sock_diag.h>
132 
133 #include <linux/filter.h>
134 #include <net/sock_reuseport.h>
135 #include <net/bpf_sk_storage.h>
136 
137 #include <trace/events/sock.h>
138 
139 #include <net/tcp.h>
140 #include <net/busy_poll.h>
141 
142 #include <linux/ethtool.h>
143 
144 #include "dev.h"
145 
146 static DEFINE_MUTEX(proto_list_mutex);
147 static LIST_HEAD(proto_list);
148 
149 /**
150  * sk_ns_capable - General socket capability test
151  * @sk: Socket to use a capability on or through
152  * @user_ns: The user namespace of the capability to use
153  * @cap: The capability to use
154  *
155  * Test to see if the opener of the socket had the capability @cap when
156  * the socket was created and if the current process has it in the user
157  * namespace @user_ns.
158  */
159 bool sk_ns_capable(const struct sock *sk,
160 		   struct user_namespace *user_ns, int cap)
161 {
162 	return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
163 		ns_capable(user_ns, cap);
164 }
165 EXPORT_SYMBOL(sk_ns_capable);
166 
167 /**
168  * sk_capable - Socket global capability test
169  * @sk: Socket to use a capability on or through
170  * @cap: The global capability to use
171  *
172  * Test to see if the opener of the socket had the capability @cap when
173  * the socket was created and if the current process has it in all user
174  * namespaces.
175  */
176 bool sk_capable(const struct sock *sk, int cap)
177 {
178 	return sk_ns_capable(sk, &init_user_ns, cap);
179 }
180 EXPORT_SYMBOL(sk_capable);
181 
182 /**
183  * sk_net_capable - Network namespace socket capability test
184  * @sk: Socket to use a capability on or through
185  * @cap: The capability to use
186  *
187  * Test to see if the opener of the socket had the capability @cap when the
188  * socket was created and if the current process has it over the network
189  * namespace the socket is a member of.
190  */
191 bool sk_net_capable(const struct sock *sk, int cap)
192 {
193 	return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
194 }
195 EXPORT_SYMBOL(sk_net_capable);
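
/*
 * Illustrative sketch (not part of the upstream file): how a protocol's
 * setsockopt-style handler might use the helpers above to gate a
 * privileged per-socket option. example_set_priv_option() is hypothetical.
 */
static int example_set_priv_option(struct sock *sk, int val)
{
	/* Require CAP_NET_ADMIN relative to the socket's network namespace. */
	if (!sk_net_capable(sk, CAP_NET_ADMIN))
		return -EPERM;

	sk->sk_priority = val;	/* some privileged per-socket state */
	return 0;
}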
196 
197 /*
198  * Each address family might have different locking rules, so we have
199  * one slock key per address family and separate keys for internal and
200  * userspace sockets.
201  */
202 static struct lock_class_key af_family_keys[AF_MAX];
203 static struct lock_class_key af_family_kern_keys[AF_MAX];
204 static struct lock_class_key af_family_slock_keys[AF_MAX];
205 static struct lock_class_key af_family_kern_slock_keys[AF_MAX];
206 
207 /*
208  * Make lock validator output more readable. (We pre-construct these
209  * strings at build time, so that runtime initialization of socket
210  * locks is fast):
211  */
212 
213 #define _sock_locks(x)						  \
214   x "AF_UNSPEC",	x "AF_UNIX"     ,	x "AF_INET"     , \
215   x "AF_AX25"  ,	x "AF_IPX"      ,	x "AF_APPLETALK", \
216   x "AF_NETROM",	x "AF_BRIDGE"   ,	x "AF_ATMPVC"   , \
217   x "AF_X25"   ,	x "AF_INET6"    ,	x "AF_ROSE"     , \
218   x "AF_DECnet",	x "AF_NETBEUI"  ,	x "AF_SECURITY" , \
219   x "AF_KEY"   ,	x "AF_NETLINK"  ,	x "AF_PACKET"   , \
220   x "AF_ASH"   ,	x "AF_ECONET"   ,	x "AF_ATMSVC"   , \
221   x "AF_RDS"   ,	x "AF_SNA"      ,	x "AF_IRDA"     , \
222   x "AF_PPPOX" ,	x "AF_WANPIPE"  ,	x "AF_LLC"      , \
223   x "27"       ,	x "28"          ,	x "AF_CAN"      , \
224   x "AF_TIPC"  ,	x "AF_BLUETOOTH",	x "IUCV"        , \
225   x "AF_RXRPC" ,	x "AF_ISDN"     ,	x "AF_PHONET"   , \
226   x "AF_IEEE802154",	x "AF_CAIF"	,	x "AF_ALG"      , \
227   x "AF_NFC"   ,	x "AF_VSOCK"    ,	x "AF_KCM"      , \
228   x "AF_QIPCRTR",	x "AF_SMC"	,	x "AF_XDP"	, \
229   x "AF_MCTP"  , \
230   x "AF_MAX"
231 
232 static const char *const af_family_key_strings[AF_MAX+1] = {
233 	_sock_locks("sk_lock-")
234 };
235 static const char *const af_family_slock_key_strings[AF_MAX+1] = {
236 	_sock_locks("slock-")
237 };
238 static const char *const af_family_clock_key_strings[AF_MAX+1] = {
239 	_sock_locks("clock-")
240 };
241 
242 static const char *const af_family_kern_key_strings[AF_MAX+1] = {
243 	_sock_locks("k-sk_lock-")
244 };
245 static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
246 	_sock_locks("k-slock-")
247 };
248 static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
249 	_sock_locks("k-clock-")
250 };
251 static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
252 	_sock_locks("rlock-")
253 };
254 static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
255 	_sock_locks("wlock-")
256 };
257 static const char *const af_family_elock_key_strings[AF_MAX+1] = {
258 	_sock_locks("elock-")
259 };
260 
261 /*
262  * sk_callback_lock and sk queues locking rules are per-address-family,
263  * so split the lock classes by using a per-AF key:
264  */
265 static struct lock_class_key af_callback_keys[AF_MAX];
266 static struct lock_class_key af_rlock_keys[AF_MAX];
267 static struct lock_class_key af_wlock_keys[AF_MAX];
268 static struct lock_class_key af_elock_keys[AF_MAX];
269 static struct lock_class_key af_kern_callback_keys[AF_MAX];
270 
271 /* Run time adjustable parameters. */
272 __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
273 EXPORT_SYMBOL(sysctl_wmem_max);
274 __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
275 EXPORT_SYMBOL(sysctl_rmem_max);
276 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
277 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
278 
279 /* Maximal space eaten by iovec or ancillary data plus some space */
280 int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
281 EXPORT_SYMBOL(sysctl_optmem_max);
282 
283 int sysctl_tstamp_allow_data __read_mostly = 1;
284 
285 DEFINE_STATIC_KEY_FALSE(memalloc_socks_key);
286 EXPORT_SYMBOL_GPL(memalloc_socks_key);
287 
288 /**
289  * sk_set_memalloc - sets %SOCK_MEMALLOC
290  * @sk: socket to set it on
291  *
292  * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
293  * It's the responsibility of the admin to adjust min_free_kbytes
294  * to meet the requirements.
295  */
296 void sk_set_memalloc(struct sock *sk)
297 {
298 	sock_set_flag(sk, SOCK_MEMALLOC);
299 	sk->sk_allocation |= __GFP_MEMALLOC;
300 	static_branch_inc(&memalloc_socks_key);
301 }
302 EXPORT_SYMBOL_GPL(sk_set_memalloc);
303 
304 void sk_clear_memalloc(struct sock *sk)
305 {
306 	sock_reset_flag(sk, SOCK_MEMALLOC);
307 	sk->sk_allocation &= ~__GFP_MEMALLOC;
308 	static_branch_dec(&memalloc_socks_key);
309 
310 	/*
311 	 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
312 	 * progress of swapping. SOCK_MEMALLOC may be cleared while
313 	 * it has rmem allocations due to the last swapfile being deactivated,
314 	 * but there is a risk that the socket is unusable due to exceeding
315 	 * the rmem limits. Reclaim the reserves and obey rmem limits again.
316 	 */
317 	sk_mem_reclaim(sk);
318 }
319 EXPORT_SYMBOL_GPL(sk_clear_memalloc);
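
/*
 * Illustrative sketch (not part of the upstream file): a subsystem doing
 * memory-backed I/O over the network (e.g. swap over a network block
 * device) brackets the socket's use with the helpers above.
 * example_swap_writeout() is hypothetical.
 */
static void example_swap_writeout(struct sock *sk)
{
	sk_set_memalloc(sk);	/* allow dipping into emergency reserves */
	/* ... push dirty pages out through the socket ... */
	sk_clear_memalloc(sk);	/* return to normal rmem accounting */
}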
320 
321 int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
322 {
323 	int ret;
324 	unsigned int noreclaim_flag;
325 
326 	/* these should have been dropped before queueing */
327 	BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
328 
329 	noreclaim_flag = memalloc_noreclaim_save();
330 	ret = INDIRECT_CALL_INET(sk->sk_backlog_rcv,
331 				 tcp_v6_do_rcv,
332 				 tcp_v4_do_rcv,
333 				 sk, skb);
334 	memalloc_noreclaim_restore(noreclaim_flag);
335 
336 	return ret;
337 }
338 EXPORT_SYMBOL(__sk_backlog_rcv);
339 
340 void sk_error_report(struct sock *sk)
341 {
342 	sk->sk_error_report(sk);
343 
344 	switch (sk->sk_family) {
345 	case AF_INET:
346 		fallthrough;
347 	case AF_INET6:
348 		trace_inet_sk_error_report(sk);
349 		break;
350 	default:
351 		break;
352 	}
353 }
354 EXPORT_SYMBOL(sk_error_report);
355 
356 int sock_get_timeout(long timeo, void *optval, bool old_timeval)
357 {
358 	struct __kernel_sock_timeval tv;
359 
360 	if (timeo == MAX_SCHEDULE_TIMEOUT) {
361 		tv.tv_sec = 0;
362 		tv.tv_usec = 0;
363 	} else {
364 		tv.tv_sec = timeo / HZ;
365 		tv.tv_usec = ((timeo % HZ) * USEC_PER_SEC) / HZ;
366 	}
367 
368 	if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
369 		struct old_timeval32 tv32 = { tv.tv_sec, tv.tv_usec };
370 		*(struct old_timeval32 *)optval = tv32;
371 		return sizeof(tv32);
372 	}
373 
374 	if (old_timeval) {
375 		struct __kernel_old_timeval old_tv;
376 		old_tv.tv_sec = tv.tv_sec;
377 		old_tv.tv_usec = tv.tv_usec;
378 		*(struct __kernel_old_timeval *)optval = old_tv;
379 		return sizeof(old_tv);
380 	}
381 
382 	*(struct __kernel_sock_timeval *)optval = tv;
383 	return sizeof(tv);
384 }
385 EXPORT_SYMBOL(sock_get_timeout);
386 
387 int sock_copy_user_timeval(struct __kernel_sock_timeval *tv,
388 			   sockptr_t optval, int optlen, bool old_timeval)
389 {
390 	if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
391 		struct old_timeval32 tv32;
392 
393 		if (optlen < sizeof(tv32))
394 			return -EINVAL;
395 
396 		if (copy_from_sockptr(&tv32, optval, sizeof(tv32)))
397 			return -EFAULT;
398 		tv->tv_sec = tv32.tv_sec;
399 		tv->tv_usec = tv32.tv_usec;
400 	} else if (old_timeval) {
401 		struct __kernel_old_timeval old_tv;
402 
403 		if (optlen < sizeof(old_tv))
404 			return -EINVAL;
405 		if (copy_from_sockptr(&old_tv, optval, sizeof(old_tv)))
406 			return -EFAULT;
407 		tv->tv_sec = old_tv.tv_sec;
408 		tv->tv_usec = old_tv.tv_usec;
409 	} else {
410 		if (optlen < sizeof(*tv))
411 			return -EINVAL;
412 		if (copy_from_sockptr(tv, optval, sizeof(*tv)))
413 			return -EFAULT;
414 	}
415 
416 	return 0;
417 }
418 EXPORT_SYMBOL(sock_copy_user_timeval);
419 
420 static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen,
421 			    bool old_timeval)
422 {
423 	struct __kernel_sock_timeval tv;
424 	int err = sock_copy_user_timeval(&tv, optval, optlen, old_timeval);
425 
426 	if (err)
427 		return err;
428 
429 	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
430 		return -EDOM;
431 
432 	if (tv.tv_sec < 0) {
433 		static int warned __read_mostly;
434 
435 		*timeo_p = 0;
436 		if (warned < 10 && net_ratelimit()) {
437 			warned++;
438 			pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
439 				__func__, current->comm, task_pid_nr(current));
440 		}
441 		return 0;
442 	}
443 	*timeo_p = MAX_SCHEDULE_TIMEOUT;
444 	if (tv.tv_sec == 0 && tv.tv_usec == 0)
445 		return 0;
446 	if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1))
447 		*timeo_p = tv.tv_sec * HZ + DIV_ROUND_UP((unsigned long)tv.tv_usec, USEC_PER_SEC / HZ);
448 	return 0;
449 }
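
/*
 * Worked example for the conversion above (illustrative, assuming HZ == 1000):
 * a timeout of { .tv_sec = 2, .tv_usec = 500 } becomes
 * 2 * HZ + DIV_ROUND_UP(500, USEC_PER_SEC / HZ) = 2000 + 1 = 2001 jiffies,
 * i.e. a sub-tick remainder is rounded up so the wait is never shorter
 * than the caller asked for.
 */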
450 
451 static bool sock_needs_netstamp(const struct sock *sk)
452 {
453 	switch (sk->sk_family) {
454 	case AF_UNSPEC:
455 	case AF_UNIX:
456 		return false;
457 	default:
458 		return true;
459 	}
460 }
461 
462 static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
463 {
464 	if (sk->sk_flags & flags) {
465 		sk->sk_flags &= ~flags;
466 		if (sock_needs_netstamp(sk) &&
467 		    !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
468 			net_disable_timestamp();
469 	}
470 }
471 
472 
473 int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
474 {
475 	unsigned long flags;
476 	struct sk_buff_head *list = &sk->sk_receive_queue;
477 
478 	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
479 		atomic_inc(&sk->sk_drops);
480 		trace_sock_rcvqueue_full(sk, skb);
481 		return -ENOMEM;
482 	}
483 
484 	if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
485 		atomic_inc(&sk->sk_drops);
486 		return -ENOBUFS;
487 	}
488 
489 	skb->dev = NULL;
490 	skb_set_owner_r(skb, sk);
491 
492 	/* We escape from the RCU-protected region, so make sure we don't leak
493 	 * a non-refcounted dst.
494 	 */
495 	skb_dst_force(skb);
496 
497 	spin_lock_irqsave(&list->lock, flags);
498 	sock_skb_set_dropcount(sk, skb);
499 	__skb_queue_tail(list, skb);
500 	spin_unlock_irqrestore(&list->lock, flags);
501 
502 	if (!sock_flag(sk, SOCK_DEAD))
503 		sk->sk_data_ready(sk);
504 	return 0;
505 }
506 EXPORT_SYMBOL(__sock_queue_rcv_skb);
507 
508 int sock_queue_rcv_skb_reason(struct sock *sk, struct sk_buff *skb,
509 			      enum skb_drop_reason *reason)
510 {
511 	enum skb_drop_reason drop_reason;
512 	int err;
513 
514 	err = sk_filter(sk, skb);
515 	if (err) {
516 		drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
517 		goto out;
518 	}
519 	err = __sock_queue_rcv_skb(sk, skb);
520 	switch (err) {
521 	case -ENOMEM:
522 		drop_reason = SKB_DROP_REASON_SOCKET_RCVBUFF;
523 		break;
524 	case -ENOBUFS:
525 		drop_reason = SKB_DROP_REASON_PROTO_MEM;
526 		break;
527 	default:
528 		drop_reason = SKB_NOT_DROPPED_YET;
529 		break;
530 	}
531 out:
532 	if (reason)
533 		*reason = drop_reason;
534 	return err;
535 }
536 EXPORT_SYMBOL(sock_queue_rcv_skb_reason);
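
/*
 * Illustrative sketch (not part of the upstream file): a datagram
 * protocol's receive path could feed packets to the owning socket and
 * account the drop reason on failure. example_proto_rcv() is hypothetical.
 */
static int example_proto_rcv(struct sock *sk, struct sk_buff *skb)
{
	enum skb_drop_reason reason;

	if (sock_queue_rcv_skb_reason(sk, skb, &reason) < 0) {
		/* the queueing helpers do not free the skb on error */
		kfree_skb_reason(skb, reason);
		return NET_RX_DROP;
	}
	return NET_RX_SUCCESS;
}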
537 
538 int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
539 		     const int nested, unsigned int trim_cap, bool refcounted)
540 {
541 	int rc = NET_RX_SUCCESS;
542 
543 	if (sk_filter_trim_cap(sk, skb, trim_cap))
544 		goto discard_and_relse;
545 
546 	skb->dev = NULL;
547 
548 	if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
549 		atomic_inc(&sk->sk_drops);
550 		goto discard_and_relse;
551 	}
552 	if (nested)
553 		bh_lock_sock_nested(sk);
554 	else
555 		bh_lock_sock(sk);
556 	if (!sock_owned_by_user(sk)) {
557 		/*
558 		 * trylock + unlock semantics:
559 		 */
560 		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
561 
562 		rc = sk_backlog_rcv(sk, skb);
563 
564 		mutex_release(&sk->sk_lock.dep_map, _RET_IP_);
565 	} else if (sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf))) {
566 		bh_unlock_sock(sk);
567 		atomic_inc(&sk->sk_drops);
568 		goto discard_and_relse;
569 	}
570 
571 	bh_unlock_sock(sk);
572 out:
573 	if (refcounted)
574 		sock_put(sk);
575 	return rc;
576 discard_and_relse:
577 	kfree_skb(skb);
578 	goto out;
579 }
580 EXPORT_SYMBOL(__sk_receive_skb);
581 
582 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ip6_dst_check(struct dst_entry *,
583 							  u32));
584 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
585 							   u32));
586 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
587 {
588 	struct dst_entry *dst = __sk_dst_get(sk);
589 
590 	if (dst && dst->obsolete &&
591 	    INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
592 			       dst, cookie) == NULL) {
593 		sk_tx_queue_clear(sk);
594 		sk->sk_dst_pending_confirm = 0;
595 		RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
596 		dst_release(dst);
597 		return NULL;
598 	}
599 
600 	return dst;
601 }
602 EXPORT_SYMBOL(__sk_dst_check);
603 
604 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
605 {
606 	struct dst_entry *dst = sk_dst_get(sk);
607 
608 	if (dst && dst->obsolete &&
609 	    INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
610 			       dst, cookie) == NULL) {
611 		sk_dst_reset(sk);
612 		dst_release(dst);
613 		return NULL;
614 	}
615 
616 	return dst;
617 }
618 EXPORT_SYMBOL(sk_dst_check);
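
/*
 * Illustrative sketch (not part of the upstream file): a transmit path
 * normally revalidates its cached route before use; the cookie value 0
 * is only a placeholder here. example_route_for_output() is hypothetical.
 */
static struct dst_entry *example_route_for_output(struct sock *sk)
{
	struct dst_entry *dst = sk_dst_check(sk, 0);

	if (!dst) {
		/* The cached route was obsolete; a real caller would do a
		 * fresh route lookup here and sk_dst_set() the result.
		 */
	}
	return dst;	/* reference held by sk_dst_get() inside sk_dst_check() */
}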
619 
620 static int sock_bindtoindex_locked(struct sock *sk, int ifindex)
621 {
622 	int ret = -ENOPROTOOPT;
623 #ifdef CONFIG_NETDEVICES
624 	struct net *net = sock_net(sk);
625 
626 	/* Sorry... */
627 	ret = -EPERM;
628 	if (sk->sk_bound_dev_if && !ns_capable(net->user_ns, CAP_NET_RAW))
629 		goto out;
630 
631 	ret = -EINVAL;
632 	if (ifindex < 0)
633 		goto out;
634 
635 	sk->sk_bound_dev_if = ifindex;
636 	if (sk->sk_prot->rehash)
637 		sk->sk_prot->rehash(sk);
638 	sk_dst_reset(sk);
639 
640 	ret = 0;
641 
642 out:
643 #endif
644 
645 	return ret;
646 }
647 
648 int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk)
649 {
650 	int ret;
651 
652 	if (lock_sk)
653 		lock_sock(sk);
654 	ret = sock_bindtoindex_locked(sk, ifindex);
655 	if (lock_sk)
656 		release_sock(sk);
657 
658 	return ret;
659 }
660 EXPORT_SYMBOL(sock_bindtoindex);
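
/*
 * Illustrative sketch (not part of the upstream file): in-kernel users
 * that already know the ifindex bind through this helper rather than
 * sock_setbindtodevice(). example_bind_to_ifindex() is hypothetical.
 */
static int example_bind_to_ifindex(struct sock *sk, int ifindex)
{
	/* Pass lock_sk = false only if the socket lock is already held. */
	return sock_bindtoindex(sk, ifindex, true);
}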
661 
662 static int sock_setbindtodevice(struct sock *sk, sockptr_t optval, int optlen)
663 {
664 	int ret = -ENOPROTOOPT;
665 #ifdef CONFIG_NETDEVICES
666 	struct net *net = sock_net(sk);
667 	char devname[IFNAMSIZ];
668 	int index;
669 
670 	ret = -EINVAL;
671 	if (optlen < 0)
672 		goto out;
673 
674 	/* Bind this socket to a particular device like "eth0",
675 	 * as specified in the passed interface name. If the
676 	 * name is "" or the option length is zero the socket
677 	 * is not bound.
678 	 */
679 	if (optlen > IFNAMSIZ - 1)
680 		optlen = IFNAMSIZ - 1;
681 	memset(devname, 0, sizeof(devname));
682 
683 	ret = -EFAULT;
684 	if (copy_from_sockptr(devname, optval, optlen))
685 		goto out;
686 
687 	index = 0;
688 	if (devname[0] != '\0') {
689 		struct net_device *dev;
690 
691 		rcu_read_lock();
692 		dev = dev_get_by_name_rcu(net, devname);
693 		if (dev)
694 			index = dev->ifindex;
695 		rcu_read_unlock();
696 		ret = -ENODEV;
697 		if (!dev)
698 			goto out;
699 	}
700 
701 	return sock_bindtoindex(sk, index, true);
702 out:
703 #endif
704 
705 	return ret;
706 }
707 
708 static int sock_getbindtodevice(struct sock *sk, char __user *optval,
709 				int __user *optlen, int len)
710 {
711 	int ret = -ENOPROTOOPT;
712 #ifdef CONFIG_NETDEVICES
713 	struct net *net = sock_net(sk);
714 	char devname[IFNAMSIZ];
715 
716 	if (sk->sk_bound_dev_if == 0) {
717 		len = 0;
718 		goto zero;
719 	}
720 
721 	ret = -EINVAL;
722 	if (len < IFNAMSIZ)
723 		goto out;
724 
725 	ret = netdev_get_name(net, devname, sk->sk_bound_dev_if);
726 	if (ret)
727 		goto out;
728 
729 	len = strlen(devname) + 1;
730 
731 	ret = -EFAULT;
732 	if (copy_to_user(optval, devname, len))
733 		goto out;
734 
735 zero:
736 	ret = -EFAULT;
737 	if (put_user(len, optlen))
738 		goto out;
739 
740 	ret = 0;
741 
742 out:
743 #endif
744 
745 	return ret;
746 }
747 
748 bool sk_mc_loop(struct sock *sk)
749 {
750 	if (dev_recursion_level())
751 		return false;
752 	if (!sk)
753 		return true;
754 	switch (sk->sk_family) {
755 	case AF_INET:
756 		return inet_sk(sk)->mc_loop;
757 #if IS_ENABLED(CONFIG_IPV6)
758 	case AF_INET6:
759 		return inet6_sk(sk)->mc_loop;
760 #endif
761 	}
762 	WARN_ON_ONCE(1);
763 	return true;
764 }
765 EXPORT_SYMBOL(sk_mc_loop);
766 
767 void sock_set_reuseaddr(struct sock *sk)
768 {
769 	lock_sock(sk);
770 	sk->sk_reuse = SK_CAN_REUSE;
771 	release_sock(sk);
772 }
773 EXPORT_SYMBOL(sock_set_reuseaddr);
774 
775 void sock_set_reuseport(struct sock *sk)
776 {
777 	lock_sock(sk);
778 	sk->sk_reuseport = true;
779 	release_sock(sk);
780 }
781 EXPORT_SYMBOL(sock_set_reuseport);
782 
783 void sock_no_linger(struct sock *sk)
784 {
785 	lock_sock(sk);
786 	sk->sk_lingertime = 0;
787 	sock_set_flag(sk, SOCK_LINGER);
788 	release_sock(sk);
789 }
790 EXPORT_SYMBOL(sock_no_linger);
791 
792 void sock_set_priority(struct sock *sk, u32 priority)
793 {
794 	lock_sock(sk);
795 	sk->sk_priority = priority;
796 	release_sock(sk);
797 }
798 EXPORT_SYMBOL(sock_set_priority);
799 
800 void sock_set_sndtimeo(struct sock *sk, s64 secs)
801 {
802 	lock_sock(sk);
803 	if (secs && secs < MAX_SCHEDULE_TIMEOUT / HZ - 1)
804 		sk->sk_sndtimeo = secs * HZ;
805 	else
806 		sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
807 	release_sock(sk);
808 }
809 EXPORT_SYMBOL(sock_set_sndtimeo);
810 
811 static void __sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns)
812 {
813 	if (val)  {
814 		sock_valbool_flag(sk, SOCK_TSTAMP_NEW, new);
815 		sock_valbool_flag(sk, SOCK_RCVTSTAMPNS, ns);
816 		sock_set_flag(sk, SOCK_RCVTSTAMP);
817 		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
818 	} else {
819 		sock_reset_flag(sk, SOCK_RCVTSTAMP);
820 		sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
821 	}
822 }
823 
824 void sock_enable_timestamps(struct sock *sk)
825 {
826 	lock_sock(sk);
827 	__sock_set_timestamps(sk, true, false, true);
828 	release_sock(sk);
829 }
830 EXPORT_SYMBOL(sock_enable_timestamps);
831 
832 void sock_set_timestamp(struct sock *sk, int optname, bool valbool)
833 {
834 	switch (optname) {
835 	case SO_TIMESTAMP_OLD:
836 		__sock_set_timestamps(sk, valbool, false, false);
837 		break;
838 	case SO_TIMESTAMP_NEW:
839 		__sock_set_timestamps(sk, valbool, true, false);
840 		break;
841 	case SO_TIMESTAMPNS_OLD:
842 		__sock_set_timestamps(sk, valbool, false, true);
843 		break;
844 	case SO_TIMESTAMPNS_NEW:
845 		__sock_set_timestamps(sk, valbool, true, true);
846 		break;
847 	}
848 }
849 
850 static int sock_timestamping_bind_phc(struct sock *sk, int phc_index)
851 {
852 	struct net *net = sock_net(sk);
853 	struct net_device *dev = NULL;
854 	bool match = false;
855 	int *vclock_index;
856 	int i, num;
857 
858 	if (sk->sk_bound_dev_if)
859 		dev = dev_get_by_index(net, sk->sk_bound_dev_if);
860 
861 	if (!dev) {
862 		pr_err("%s: socket is not bound to a device\n", __func__);
863 		return -EOPNOTSUPP;
864 	}
865 
866 	num = ethtool_get_phc_vclocks(dev, &vclock_index);
867 	dev_put(dev);
868 
869 	for (i = 0; i < num; i++) {
870 		if (*(vclock_index + i) == phc_index) {
871 			match = true;
872 			break;
873 		}
874 	}
875 
876 	if (num > 0)
877 		kfree(vclock_index);
878 
879 	if (!match)
880 		return -EINVAL;
881 
882 	sk->sk_bind_phc = phc_index;
883 
884 	return 0;
885 }
886 
887 int sock_set_timestamping(struct sock *sk, int optname,
888 			  struct so_timestamping timestamping)
889 {
890 	int val = timestamping.flags;
891 	int ret;
892 
893 	if (val & ~SOF_TIMESTAMPING_MASK)
894 		return -EINVAL;
895 
896 	if (val & SOF_TIMESTAMPING_OPT_ID &&
897 	    !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
898 		if (sk_is_tcp(sk)) {
899 			if ((1 << sk->sk_state) &
900 			    (TCPF_CLOSE | TCPF_LISTEN))
901 				return -EINVAL;
902 			atomic_set(&sk->sk_tskey, tcp_sk(sk)->snd_una);
903 		} else {
904 			atomic_set(&sk->sk_tskey, 0);
905 		}
906 	}
907 
908 	if (val & SOF_TIMESTAMPING_OPT_STATS &&
909 	    !(val & SOF_TIMESTAMPING_OPT_TSONLY))
910 		return -EINVAL;
911 
912 	if (val & SOF_TIMESTAMPING_BIND_PHC) {
913 		ret = sock_timestamping_bind_phc(sk, timestamping.bind_phc);
914 		if (ret)
915 			return ret;
916 	}
917 
918 	sk->sk_tsflags = val;
919 	sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW);
920 
921 	if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
922 		sock_enable_timestamp(sk,
923 				      SOCK_TIMESTAMPING_RX_SOFTWARE);
924 	else
925 		sock_disable_timestamp(sk,
926 				       (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
927 	return 0;
928 }
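
/*
 * Illustrative sketch (not part of the upstream file): enabling software
 * TX/RX timestamps plus ID tagging from inside the kernel. The flag
 * combination is just one plausible choice, and the caller is expected to
 * hold the socket lock, as sock_setsockopt() does.
 * example_enable_sw_timestamps() is hypothetical.
 */
static int example_enable_sw_timestamps(struct sock *sk)
{
	struct so_timestamping ts = {
		.flags = SOF_TIMESTAMPING_TX_SOFTWARE |
			 SOF_TIMESTAMPING_RX_SOFTWARE |
			 SOF_TIMESTAMPING_SOFTWARE |
			 SOF_TIMESTAMPING_OPT_ID,
	};

	return sock_set_timestamping(sk, SO_TIMESTAMPING_NEW, ts);
}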
929 
930 void sock_set_keepalive(struct sock *sk)
931 {
932 	lock_sock(sk);
933 	if (sk->sk_prot->keepalive)
934 		sk->sk_prot->keepalive(sk, true);
935 	sock_valbool_flag(sk, SOCK_KEEPOPEN, true);
936 	release_sock(sk);
937 }
938 EXPORT_SYMBOL(sock_set_keepalive);
939 
940 static void __sock_set_rcvbuf(struct sock *sk, int val)
941 {
942 	/* Ensure val * 2 fits into an int, to prevent max_t() from treating it
943 	 * as a negative value.
944 	 */
945 	val = min_t(int, val, INT_MAX / 2);
946 	sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
947 
948 	/* We double it on the way in to account for "struct sk_buff" etc.
949 	 * overhead.   Applications assume that the SO_RCVBUF setting they make
950 	 * will allow that much actual data to be received on that socket.
951 	 *
952 	 * Applications are unaware that "struct sk_buff" and other overheads
953 	 * allocate from the receive buffer during socket buffer allocation.
954 	 *
955 	 * And after considering the possible alternatives, returning the value
956 	 * we actually used in getsockopt is the most desirable behavior.
957 	 */
958 	WRITE_ONCE(sk->sk_rcvbuf, max_t(int, val * 2, SOCK_MIN_RCVBUF));
959 }
960 
961 void sock_set_rcvbuf(struct sock *sk, int val)
962 {
963 	lock_sock(sk);
964 	__sock_set_rcvbuf(sk, val);
965 	release_sock(sk);
966 }
967 EXPORT_SYMBOL(sock_set_rcvbuf);
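
/*
 * Worked example for the doubling above (illustrative): asking for a
 * 64 KiB receive buffer leaves sk_rcvbuf at 128 KiB, and a later
 * getsockopt(SO_RCVBUF) reports that doubled value back, which is the
 * long-standing Linux convention.
 */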
968 
969 static void __sock_set_mark(struct sock *sk, u32 val)
970 {
971 	if (val != sk->sk_mark) {
972 		sk->sk_mark = val;
973 		sk_dst_reset(sk);
974 	}
975 }
976 
977 void sock_set_mark(struct sock *sk, u32 val)
978 {
979 	lock_sock(sk);
980 	__sock_set_mark(sk, val);
981 	release_sock(sk);
982 }
983 EXPORT_SYMBOL(sock_set_mark);
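
/*
 * Illustrative sketch (not part of the upstream file): an in-kernel TCP
 * client (e.g. a storage or RPC transport) can use the small helpers
 * above instead of going through sock_setsockopt().
 * example_tune_kernel_socket() and the chosen values are hypothetical.
 */
static void example_tune_kernel_socket(struct sock *sk)
{
	sock_set_reuseaddr(sk);		/* fast re-bind after a restart */
	sock_no_linger(sk);		/* close() must not block */
	sock_set_priority(sk, 6);	/* interactive-class priority */
	sock_set_sndtimeo(sk, 5);	/* give up on sends after 5 seconds */
	sock_set_keepalive(sk);		/* detect dead peers */
	sock_set_rcvbuf(sk, 1 << 20);	/* ask for ~1 MiB of receive space */
}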
984 
985 static void sock_release_reserved_memory(struct sock *sk, int bytes)
986 {
987 	/* Round down bytes to multiple of pages */
988 	bytes &= ~(SK_MEM_QUANTUM - 1);
989 
990 	WARN_ON(bytes > sk->sk_reserved_mem);
991 	sk->sk_reserved_mem -= bytes;
992 	sk_mem_reclaim(sk);
993 }
994 
995 static int sock_reserve_memory(struct sock *sk, int bytes)
996 {
997 	long allocated;
998 	bool charged;
999 	int pages;
1000 
1001 	if (!mem_cgroup_sockets_enabled || !sk->sk_memcg || !sk_has_account(sk))
1002 		return -EOPNOTSUPP;
1003 
1004 	if (!bytes)
1005 		return 0;
1006 
1007 	pages = sk_mem_pages(bytes);
1008 
1009 	/* pre-charge to memcg */
1010 	charged = mem_cgroup_charge_skmem(sk->sk_memcg, pages,
1011 					  GFP_KERNEL | __GFP_RETRY_MAYFAIL);
1012 	if (!charged)
1013 		return -ENOMEM;
1014 
1015 	/* pre-charge to forward_alloc */
1016 	allocated = sk_memory_allocated_add(sk, pages);
1017 	/* If the system goes into memory pressure with this
1018 	 * precharge, give up and return error.
1019 	 */
1020 	if (allocated > sk_prot_mem_limits(sk, 1)) {
1021 		sk_memory_allocated_sub(sk, pages);
1022 		mem_cgroup_uncharge_skmem(sk->sk_memcg, pages);
1023 		return -ENOMEM;
1024 	}
1025 	sk->sk_forward_alloc += pages << SK_MEM_QUANTUM_SHIFT;
1026 
1027 	sk->sk_reserved_mem += pages << SK_MEM_QUANTUM_SHIFT;
1028 
1029 	return 0;
1030 }
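
/*
 * Worked example for the precharge above (illustrative, assuming 4 KiB
 * pages with SK_MEM_QUANTUM == PAGE_SIZE): SO_RESERVE_MEM with val =
 * 10000 charges sk_mem_pages(10000) = 3 pages, so sk_forward_alloc and
 * sk_reserved_mem each grow by 12288 bytes.
 */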
1031 
1032 /*
1033  *	This is meant for all protocols to use and covers goings on
1034  *	at the socket level. Everything here is generic.
1035  */
1036 
1037 int sock_setsockopt(struct socket *sock, int level, int optname,
1038 		    sockptr_t optval, unsigned int optlen)
1039 {
1040 	struct so_timestamping timestamping;
1041 	struct sock_txtime sk_txtime;
1042 	struct sock *sk = sock->sk;
1043 	int val;
1044 	int valbool;
1045 	struct linger ling;
1046 	int ret = 0;
1047 
1048 	/*
1049 	 *	Options without arguments
1050 	 */
1051 
1052 	if (optname == SO_BINDTODEVICE)
1053 		return sock_setbindtodevice(sk, optval, optlen);
1054 
1055 	if (optlen < sizeof(int))
1056 		return -EINVAL;
1057 
1058 	if (copy_from_sockptr(&val, optval, sizeof(val)))
1059 		return -EFAULT;
1060 
1061 	valbool = val ? 1 : 0;
1062 
1063 	lock_sock(sk);
1064 
1065 	switch (optname) {
1066 	case SO_DEBUG:
1067 		if (val && !capable(CAP_NET_ADMIN))
1068 			ret = -EACCES;
1069 		else
1070 			sock_valbool_flag(sk, SOCK_DBG, valbool);
1071 		break;
1072 	case SO_REUSEADDR:
1073 		sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
1074 		break;
1075 	case SO_REUSEPORT:
1076 		sk->sk_reuseport = valbool;
1077 		break;
1078 	case SO_TYPE:
1079 	case SO_PROTOCOL:
1080 	case SO_DOMAIN:
1081 	case SO_ERROR:
1082 		ret = -ENOPROTOOPT;
1083 		break;
1084 	case SO_DONTROUTE:
1085 		sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
1086 		sk_dst_reset(sk);
1087 		break;
1088 	case SO_BROADCAST:
1089 		sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
1090 		break;
1091 	case SO_SNDBUF:
1092 		/* Don't return an error on this; BSD doesn't, and if you think
1093 		 * about it this is right. Otherwise apps have to
1094 		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
1095 		 * are treated in BSD as hints.
1096 		 */
1097 		val = min_t(u32, val, sysctl_wmem_max);
1098 set_sndbuf:
1099 		/* Ensure val * 2 fits into an int, to prevent max_t()
1100 		 * from treating it as a negative value.
1101 		 */
1102 		val = min_t(int, val, INT_MAX / 2);
1103 		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
1104 		WRITE_ONCE(sk->sk_sndbuf,
1105 			   max_t(int, val * 2, SOCK_MIN_SNDBUF));
1106 		/* Wake up sending tasks if we upped the value. */
1107 		sk->sk_write_space(sk);
1108 		break;
1109 
1110 	case SO_SNDBUFFORCE:
1111 		if (!capable(CAP_NET_ADMIN)) {
1112 			ret = -EPERM;
1113 			break;
1114 		}
1115 
1116 		/* No negative values (to prevent underflow, as val will be
1117 		 * multiplied by 2).
1118 		 */
1119 		if (val < 0)
1120 			val = 0;
1121 		goto set_sndbuf;
1122 
1123 	case SO_RCVBUF:
1124 		/* Don't return an error on this; BSD doesn't, and if you think
1125 		 * about it this is right. Otherwise apps have to
1126 		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
1127 		 * are treated in BSD as hints.
1128 		 */
1129 		__sock_set_rcvbuf(sk, min_t(u32, val, sysctl_rmem_max));
1130 		break;
1131 
1132 	case SO_RCVBUFFORCE:
1133 		if (!capable(CAP_NET_ADMIN)) {
1134 			ret = -EPERM;
1135 			break;
1136 		}
1137 
1138 		/* No negative values (to prevent underflow, as val will be
1139 		 * multiplied by 2).
1140 		 */
1141 		__sock_set_rcvbuf(sk, max(val, 0));
1142 		break;
1143 
1144 	case SO_KEEPALIVE:
1145 		if (sk->sk_prot->keepalive)
1146 			sk->sk_prot->keepalive(sk, valbool);
1147 		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
1148 		break;
1149 
1150 	case SO_OOBINLINE:
1151 		sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
1152 		break;
1153 
1154 	case SO_NO_CHECK:
1155 		sk->sk_no_check_tx = valbool;
1156 		break;
1157 
1158 	case SO_PRIORITY:
1159 		if ((val >= 0 && val <= 6) ||
1160 		    ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) ||
1161 		    ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
1162 			sk->sk_priority = val;
1163 		else
1164 			ret = -EPERM;
1165 		break;
1166 
1167 	case SO_LINGER:
1168 		if (optlen < sizeof(ling)) {
1169 			ret = -EINVAL;	/* 1003.1g */
1170 			break;
1171 		}
1172 		if (copy_from_sockptr(&ling, optval, sizeof(ling))) {
1173 			ret = -EFAULT;
1174 			break;
1175 		}
1176 		if (!ling.l_onoff)
1177 			sock_reset_flag(sk, SOCK_LINGER);
1178 		else {
1179 #if (BITS_PER_LONG == 32)
1180 			if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
1181 				sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
1182 			else
1183 #endif
1184 				sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
1185 			sock_set_flag(sk, SOCK_LINGER);
1186 		}
1187 		break;
1188 
1189 	case SO_BSDCOMPAT:
1190 		break;
1191 
1192 	case SO_PASSCRED:
1193 		if (valbool)
1194 			set_bit(SOCK_PASSCRED, &sock->flags);
1195 		else
1196 			clear_bit(SOCK_PASSCRED, &sock->flags);
1197 		break;
1198 
1199 	case SO_TIMESTAMP_OLD:
1200 	case SO_TIMESTAMP_NEW:
1201 	case SO_TIMESTAMPNS_OLD:
1202 	case SO_TIMESTAMPNS_NEW:
1203 		sock_set_timestamp(sk, optname, valbool);
1204 		break;
1205 
1206 	case SO_TIMESTAMPING_NEW:
1207 	case SO_TIMESTAMPING_OLD:
1208 		if (optlen == sizeof(timestamping)) {
1209 			if (copy_from_sockptr(&timestamping, optval,
1210 					      sizeof(timestamping))) {
1211 				ret = -EFAULT;
1212 				break;
1213 			}
1214 		} else {
1215 			memset(&timestamping, 0, sizeof(timestamping));
1216 			timestamping.flags = val;
1217 		}
1218 		ret = sock_set_timestamping(sk, optname, timestamping);
1219 		break;
1220 
1221 	case SO_RCVLOWAT:
1222 		if (val < 0)
1223 			val = INT_MAX;
1224 		if (sock->ops->set_rcvlowat)
1225 			ret = sock->ops->set_rcvlowat(sk, val);
1226 		else
1227 			WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
1228 		break;
1229 
1230 	case SO_RCVTIMEO_OLD:
1231 	case SO_RCVTIMEO_NEW:
1232 		ret = sock_set_timeout(&sk->sk_rcvtimeo, optval,
1233 				       optlen, optname == SO_RCVTIMEO_OLD);
1234 		break;
1235 
1236 	case SO_SNDTIMEO_OLD:
1237 	case SO_SNDTIMEO_NEW:
1238 		ret = sock_set_timeout(&sk->sk_sndtimeo, optval,
1239 				       optlen, optname == SO_SNDTIMEO_OLD);
1240 		break;
1241 
1242 	case SO_ATTACH_FILTER: {
1243 		struct sock_fprog fprog;
1244 
1245 		ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1246 		if (!ret)
1247 			ret = sk_attach_filter(&fprog, sk);
1248 		break;
1249 	}
1250 	case SO_ATTACH_BPF:
1251 		ret = -EINVAL;
1252 		if (optlen == sizeof(u32)) {
1253 			u32 ufd;
1254 
1255 			ret = -EFAULT;
1256 			if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1257 				break;
1258 
1259 			ret = sk_attach_bpf(ufd, sk);
1260 		}
1261 		break;
1262 
1263 	case SO_ATTACH_REUSEPORT_CBPF: {
1264 		struct sock_fprog fprog;
1265 
1266 		ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1267 		if (!ret)
1268 			ret = sk_reuseport_attach_filter(&fprog, sk);
1269 		break;
1270 	}
1271 	case SO_ATTACH_REUSEPORT_EBPF:
1272 		ret = -EINVAL;
1273 		if (optlen == sizeof(u32)) {
1274 			u32 ufd;
1275 
1276 			ret = -EFAULT;
1277 			if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1278 				break;
1279 
1280 			ret = sk_reuseport_attach_bpf(ufd, sk);
1281 		}
1282 		break;
1283 
1284 	case SO_DETACH_REUSEPORT_BPF:
1285 		ret = reuseport_detach_prog(sk);
1286 		break;
1287 
1288 	case SO_DETACH_FILTER:
1289 		ret = sk_detach_filter(sk);
1290 		break;
1291 
1292 	case SO_LOCK_FILTER:
1293 		if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
1294 			ret = -EPERM;
1295 		else
1296 			sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
1297 		break;
1298 
1299 	case SO_PASSSEC:
1300 		if (valbool)
1301 			set_bit(SOCK_PASSSEC, &sock->flags);
1302 		else
1303 			clear_bit(SOCK_PASSSEC, &sock->flags);
1304 		break;
1305 	case SO_MARK:
1306 		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
1307 		    !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1308 			ret = -EPERM;
1309 			break;
1310 		}
1311 
1312 		__sock_set_mark(sk, val);
1313 		break;
1314 	case SO_RCVMARK:
1315 		sock_valbool_flag(sk, SOCK_RCVMARK, valbool);
1316 		break;
1317 
1318 	case SO_RXQ_OVFL:
1319 		sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
1320 		break;
1321 
1322 	case SO_WIFI_STATUS:
1323 		sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
1324 		break;
1325 
1326 	case SO_PEEK_OFF:
1327 		if (sock->ops->set_peek_off)
1328 			ret = sock->ops->set_peek_off(sk, val);
1329 		else
1330 			ret = -EOPNOTSUPP;
1331 		break;
1332 
1333 	case SO_NOFCS:
1334 		sock_valbool_flag(sk, SOCK_NOFCS, valbool);
1335 		break;
1336 
1337 	case SO_SELECT_ERR_QUEUE:
1338 		sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
1339 		break;
1340 
1341 #ifdef CONFIG_NET_RX_BUSY_POLL
1342 	case SO_BUSY_POLL:
1343 		/* allow unprivileged users to decrease the value */
1344 		if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
1345 			ret = -EPERM;
1346 		else {
1347 			if (val < 0)
1348 				ret = -EINVAL;
1349 			else
1350 				WRITE_ONCE(sk->sk_ll_usec, val);
1351 		}
1352 		break;
1353 	case SO_PREFER_BUSY_POLL:
1354 		if (valbool && !capable(CAP_NET_ADMIN))
1355 			ret = -EPERM;
1356 		else
1357 			WRITE_ONCE(sk->sk_prefer_busy_poll, valbool);
1358 		break;
1359 	case SO_BUSY_POLL_BUDGET:
1360 		if (val > READ_ONCE(sk->sk_busy_poll_budget) && !capable(CAP_NET_ADMIN)) {
1361 			ret = -EPERM;
1362 		} else {
1363 			if (val < 0 || val > U16_MAX)
1364 				ret = -EINVAL;
1365 			else
1366 				WRITE_ONCE(sk->sk_busy_poll_budget, val);
1367 		}
1368 		break;
1369 #endif
1370 
1371 	case SO_MAX_PACING_RATE:
1372 		{
1373 		unsigned long ulval = (val == ~0U) ? ~0UL : (unsigned int)val;
1374 
1375 		if (sizeof(ulval) != sizeof(val) &&
1376 		    optlen >= sizeof(ulval) &&
1377 		    copy_from_sockptr(&ulval, optval, sizeof(ulval))) {
1378 			ret = -EFAULT;
1379 			break;
1380 		}
1381 		if (ulval != ~0UL)
1382 			cmpxchg(&sk->sk_pacing_status,
1383 				SK_PACING_NONE,
1384 				SK_PACING_NEEDED);
1385 		sk->sk_max_pacing_rate = ulval;
1386 		sk->sk_pacing_rate = min(sk->sk_pacing_rate, ulval);
1387 		break;
1388 		}
1389 	case SO_INCOMING_CPU:
1390 		WRITE_ONCE(sk->sk_incoming_cpu, val);
1391 		break;
1392 
1393 	case SO_CNX_ADVICE:
1394 		if (val == 1)
1395 			dst_negative_advice(sk);
1396 		break;
1397 
1398 	case SO_ZEROCOPY:
1399 		if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
1400 			if (!(sk_is_tcp(sk) ||
1401 			      (sk->sk_type == SOCK_DGRAM &&
1402 			       sk->sk_protocol == IPPROTO_UDP)))
1403 				ret = -EOPNOTSUPP;
1404 		} else if (sk->sk_family != PF_RDS) {
1405 			ret = -EOPNOTSUPP;
1406 		}
1407 		if (!ret) {
1408 			if (val < 0 || val > 1)
1409 				ret = -EINVAL;
1410 			else
1411 				sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
1412 		}
1413 		break;
1414 
1415 	case SO_TXTIME:
1416 		if (optlen != sizeof(struct sock_txtime)) {
1417 			ret = -EINVAL;
1418 			break;
1419 		} else if (copy_from_sockptr(&sk_txtime, optval,
1420 			   sizeof(struct sock_txtime))) {
1421 			ret = -EFAULT;
1422 			break;
1423 		} else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) {
1424 			ret = -EINVAL;
1425 			break;
1426 		}
1427 		/* CLOCK_MONOTONIC is only used by sch_fq, and this packet
1428 		 * scheduler has enough safeguards.
1429 		 */
1430 		if (sk_txtime.clockid != CLOCK_MONOTONIC &&
1431 		    !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1432 			ret = -EPERM;
1433 			break;
1434 		}
1435 		sock_valbool_flag(sk, SOCK_TXTIME, true);
1436 		sk->sk_clockid = sk_txtime.clockid;
1437 		sk->sk_txtime_deadline_mode =
1438 			!!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE);
1439 		sk->sk_txtime_report_errors =
1440 			!!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS);
1441 		break;
1442 
1443 	case SO_BINDTOIFINDEX:
1444 		ret = sock_bindtoindex_locked(sk, val);
1445 		break;
1446 
1447 	case SO_BUF_LOCK:
1448 		if (val & ~SOCK_BUF_LOCK_MASK) {
1449 			ret = -EINVAL;
1450 			break;
1451 		}
1452 		sk->sk_userlocks = val | (sk->sk_userlocks &
1453 					  ~SOCK_BUF_LOCK_MASK);
1454 		break;
1455 
1456 	case SO_RESERVE_MEM:
1457 	{
1458 		int delta;
1459 
1460 		if (val < 0) {
1461 			ret = -EINVAL;
1462 			break;
1463 		}
1464 
1465 		delta = val - sk->sk_reserved_mem;
1466 		if (delta < 0)
1467 			sock_release_reserved_memory(sk, -delta);
1468 		else
1469 			ret = sock_reserve_memory(sk, delta);
1470 		break;
1471 	}
1472 
1473 	case SO_TXREHASH:
1474 		if (val < -1 || val > 1) {
1475 			ret = -EINVAL;
1476 			break;
1477 		}
1478 		/* Paired with READ_ONCE() in tcp_rtx_synack() */
1479 		WRITE_ONCE(sk->sk_txrehash, (u8)val);
1480 		break;
1481 
1482 	default:
1483 		ret = -ENOPROTOOPT;
1484 		break;
1485 	}
1486 	release_sock(sk);
1487 	return ret;
1488 }
1489 EXPORT_SYMBOL(sock_setsockopt);
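
/*
 * Illustrative sketch (not part of the upstream file): an in-kernel
 * caller without a dedicated sock_set_*() helper can still reach
 * sock_setsockopt() by wrapping a kernel pointer in a sockptr_t.
 * example_set_incoming_cpu() is hypothetical.
 */
static int example_set_incoming_cpu(struct socket *sock, int cpu)
{
	return sock_setsockopt(sock, SOL_SOCKET, SO_INCOMING_CPU,
			       KERNEL_SOCKPTR(&cpu), sizeof(cpu));
}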
1490 
1491 static const struct cred *sk_get_peer_cred(struct sock *sk)
1492 {
1493 	const struct cred *cred;
1494 
1495 	spin_lock(&sk->sk_peer_lock);
1496 	cred = get_cred(sk->sk_peer_cred);
1497 	spin_unlock(&sk->sk_peer_lock);
1498 
1499 	return cred;
1500 }
1501 
1502 static void cred_to_ucred(struct pid *pid, const struct cred *cred,
1503 			  struct ucred *ucred)
1504 {
1505 	ucred->pid = pid_vnr(pid);
1506 	ucred->uid = ucred->gid = -1;
1507 	if (cred) {
1508 		struct user_namespace *current_ns = current_user_ns();
1509 
1510 		ucred->uid = from_kuid_munged(current_ns, cred->euid);
1511 		ucred->gid = from_kgid_munged(current_ns, cred->egid);
1512 	}
1513 }
1514 
1515 static int groups_to_user(gid_t __user *dst, const struct group_info *src)
1516 {
1517 	struct user_namespace *user_ns = current_user_ns();
1518 	int i;
1519 
1520 	for (i = 0; i < src->ngroups; i++)
1521 		if (put_user(from_kgid_munged(user_ns, src->gid[i]), dst + i))
1522 			return -EFAULT;
1523 
1524 	return 0;
1525 }
1526 
1527 int sock_getsockopt(struct socket *sock, int level, int optname,
1528 		    char __user *optval, int __user *optlen)
1529 {
1530 	struct sock *sk = sock->sk;
1531 
1532 	union {
1533 		int val;
1534 		u64 val64;
1535 		unsigned long ulval;
1536 		struct linger ling;
1537 		struct old_timeval32 tm32;
1538 		struct __kernel_old_timeval tm;
1539 		struct  __kernel_sock_timeval stm;
1540 		struct sock_txtime txtime;
1541 		struct so_timestamping timestamping;
1542 	} v;
1543 
1544 	int lv = sizeof(int);
1545 	int len;
1546 
1547 	if (get_user(len, optlen))
1548 		return -EFAULT;
1549 	if (len < 0)
1550 		return -EINVAL;
1551 
1552 	memset(&v, 0, sizeof(v));
1553 
1554 	switch (optname) {
1555 	case SO_DEBUG:
1556 		v.val = sock_flag(sk, SOCK_DBG);
1557 		break;
1558 
1559 	case SO_DONTROUTE:
1560 		v.val = sock_flag(sk, SOCK_LOCALROUTE);
1561 		break;
1562 
1563 	case SO_BROADCAST:
1564 		v.val = sock_flag(sk, SOCK_BROADCAST);
1565 		break;
1566 
1567 	case SO_SNDBUF:
1568 		v.val = sk->sk_sndbuf;
1569 		break;
1570 
1571 	case SO_RCVBUF:
1572 		v.val = sk->sk_rcvbuf;
1573 		break;
1574 
1575 	case SO_REUSEADDR:
1576 		v.val = sk->sk_reuse;
1577 		break;
1578 
1579 	case SO_REUSEPORT:
1580 		v.val = sk->sk_reuseport;
1581 		break;
1582 
1583 	case SO_KEEPALIVE:
1584 		v.val = sock_flag(sk, SOCK_KEEPOPEN);
1585 		break;
1586 
1587 	case SO_TYPE:
1588 		v.val = sk->sk_type;
1589 		break;
1590 
1591 	case SO_PROTOCOL:
1592 		v.val = sk->sk_protocol;
1593 		break;
1594 
1595 	case SO_DOMAIN:
1596 		v.val = sk->sk_family;
1597 		break;
1598 
1599 	case SO_ERROR:
1600 		v.val = -sock_error(sk);
1601 		if (v.val == 0)
1602 			v.val = xchg(&sk->sk_err_soft, 0);
1603 		break;
1604 
1605 	case SO_OOBINLINE:
1606 		v.val = sock_flag(sk, SOCK_URGINLINE);
1607 		break;
1608 
1609 	case SO_NO_CHECK:
1610 		v.val = sk->sk_no_check_tx;
1611 		break;
1612 
1613 	case SO_PRIORITY:
1614 		v.val = sk->sk_priority;
1615 		break;
1616 
1617 	case SO_LINGER:
1618 		lv		= sizeof(v.ling);
1619 		v.ling.l_onoff	= sock_flag(sk, SOCK_LINGER);
1620 		v.ling.l_linger	= sk->sk_lingertime / HZ;
1621 		break;
1622 
1623 	case SO_BSDCOMPAT:
1624 		break;
1625 
1626 	case SO_TIMESTAMP_OLD:
1627 		v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1628 				!sock_flag(sk, SOCK_TSTAMP_NEW) &&
1629 				!sock_flag(sk, SOCK_RCVTSTAMPNS);
1630 		break;
1631 
1632 	case SO_TIMESTAMPNS_OLD:
1633 		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && !sock_flag(sk, SOCK_TSTAMP_NEW);
1634 		break;
1635 
1636 	case SO_TIMESTAMP_NEW:
1637 		v.val = sock_flag(sk, SOCK_RCVTSTAMP) && sock_flag(sk, SOCK_TSTAMP_NEW);
1638 		break;
1639 
1640 	case SO_TIMESTAMPNS_NEW:
1641 		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && sock_flag(sk, SOCK_TSTAMP_NEW);
1642 		break;
1643 
1644 	case SO_TIMESTAMPING_OLD:
1645 		lv = sizeof(v.timestamping);
1646 		v.timestamping.flags = sk->sk_tsflags;
1647 		v.timestamping.bind_phc = sk->sk_bind_phc;
1648 		break;
1649 
1650 	case SO_RCVTIMEO_OLD:
1651 	case SO_RCVTIMEO_NEW:
1652 		lv = sock_get_timeout(sk->sk_rcvtimeo, &v, SO_RCVTIMEO_OLD == optname);
1653 		break;
1654 
1655 	case SO_SNDTIMEO_OLD:
1656 	case SO_SNDTIMEO_NEW:
1657 		lv = sock_get_timeout(sk->sk_sndtimeo, &v, SO_SNDTIMEO_OLD == optname);
1658 		break;
1659 
1660 	case SO_RCVLOWAT:
1661 		v.val = sk->sk_rcvlowat;
1662 		break;
1663 
1664 	case SO_SNDLOWAT:
1665 		v.val = 1;
1666 		break;
1667 
1668 	case SO_PASSCRED:
1669 		v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1670 		break;
1671 
1672 	case SO_PEERCRED:
1673 	{
1674 		struct ucred peercred;
1675 		if (len > sizeof(peercred))
1676 			len = sizeof(peercred);
1677 
1678 		spin_lock(&sk->sk_peer_lock);
1679 		cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1680 		spin_unlock(&sk->sk_peer_lock);
1681 
1682 		if (copy_to_user(optval, &peercred, len))
1683 			return -EFAULT;
1684 		goto lenout;
1685 	}
1686 
1687 	case SO_PEERGROUPS:
1688 	{
1689 		const struct cred *cred;
1690 		int ret, n;
1691 
1692 		cred = sk_get_peer_cred(sk);
1693 		if (!cred)
1694 			return -ENODATA;
1695 
1696 		n = cred->group_info->ngroups;
1697 		if (len < n * sizeof(gid_t)) {
1698 			len = n * sizeof(gid_t);
1699 			put_cred(cred);
1700 			return put_user(len, optlen) ? -EFAULT : -ERANGE;
1701 		}
1702 		len = n * sizeof(gid_t);
1703 
1704 		ret = groups_to_user((gid_t __user *)optval, cred->group_info);
1705 		put_cred(cred);
1706 		if (ret)
1707 			return ret;
1708 		goto lenout;
1709 	}
1710 
1711 	case SO_PEERNAME:
1712 	{
1713 		char address[128];
1714 
1715 		lv = sock->ops->getname(sock, (struct sockaddr *)address, 2);
1716 		if (lv < 0)
1717 			return -ENOTCONN;
1718 		if (lv < len)
1719 			return -EINVAL;
1720 		if (copy_to_user(optval, address, len))
1721 			return -EFAULT;
1722 		goto lenout;
1723 	}
1724 
1725 	/* Dubious BSD thing... Probably nobody even uses it, but
1726 	 * the UNIX standard wants it for whatever reason... -DaveM
1727 	 */
1728 	case SO_ACCEPTCONN:
1729 		v.val = sk->sk_state == TCP_LISTEN;
1730 		break;
1731 
1732 	case SO_PASSSEC:
1733 		v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1734 		break;
1735 
1736 	case SO_PEERSEC:
1737 		return security_socket_getpeersec_stream(sock, optval, optlen, len);
1738 
1739 	case SO_MARK:
1740 		v.val = sk->sk_mark;
1741 		break;
1742 
1743 	case SO_RCVMARK:
1744 		v.val = sock_flag(sk, SOCK_RCVMARK);
1745 		break;
1746 
1747 	case SO_RXQ_OVFL:
1748 		v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1749 		break;
1750 
1751 	case SO_WIFI_STATUS:
1752 		v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1753 		break;
1754 
1755 	case SO_PEEK_OFF:
1756 		if (!sock->ops->set_peek_off)
1757 			return -EOPNOTSUPP;
1758 
1759 		v.val = sk->sk_peek_off;
1760 		break;
1761 	case SO_NOFCS:
1762 		v.val = sock_flag(sk, SOCK_NOFCS);
1763 		break;
1764 
1765 	case SO_BINDTODEVICE:
1766 		return sock_getbindtodevice(sk, optval, optlen, len);
1767 
1768 	case SO_GET_FILTER:
1769 		len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
1770 		if (len < 0)
1771 			return len;
1772 
1773 		goto lenout;
1774 
1775 	case SO_LOCK_FILTER:
1776 		v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1777 		break;
1778 
1779 	case SO_BPF_EXTENSIONS:
1780 		v.val = bpf_tell_extensions();
1781 		break;
1782 
1783 	case SO_SELECT_ERR_QUEUE:
1784 		v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1785 		break;
1786 
1787 #ifdef CONFIG_NET_RX_BUSY_POLL
1788 	case SO_BUSY_POLL:
1789 		v.val = sk->sk_ll_usec;
1790 		break;
1791 	case SO_PREFER_BUSY_POLL:
1792 		v.val = READ_ONCE(sk->sk_prefer_busy_poll);
1793 		break;
1794 #endif
1795 
1796 	case SO_MAX_PACING_RATE:
1797 		if (sizeof(v.ulval) != sizeof(v.val) && len >= sizeof(v.ulval)) {
1798 			lv = sizeof(v.ulval);
1799 			v.ulval = sk->sk_max_pacing_rate;
1800 		} else {
1801 			/* 32bit version */
1802 			v.val = min_t(unsigned long, sk->sk_max_pacing_rate, ~0U);
1803 		}
1804 		break;
1805 
1806 	case SO_INCOMING_CPU:
1807 		v.val = READ_ONCE(sk->sk_incoming_cpu);
1808 		break;
1809 
1810 	case SO_MEMINFO:
1811 	{
1812 		u32 meminfo[SK_MEMINFO_VARS];
1813 
1814 		sk_get_meminfo(sk, meminfo);
1815 
1816 		len = min_t(unsigned int, len, sizeof(meminfo));
1817 		if (copy_to_user(optval, &meminfo, len))
1818 			return -EFAULT;
1819 
1820 		goto lenout;
1821 	}
1822 
1823 #ifdef CONFIG_NET_RX_BUSY_POLL
1824 	case SO_INCOMING_NAPI_ID:
1825 		v.val = READ_ONCE(sk->sk_napi_id);
1826 
1827 		/* aggregate non-NAPI IDs down to 0 */
1828 		if (v.val < MIN_NAPI_ID)
1829 			v.val = 0;
1830 
1831 		break;
1832 #endif
1833 
1834 	case SO_COOKIE:
1835 		lv = sizeof(u64);
1836 		if (len < lv)
1837 			return -EINVAL;
1838 		v.val64 = sock_gen_cookie(sk);
1839 		break;
1840 
1841 	case SO_ZEROCOPY:
1842 		v.val = sock_flag(sk, SOCK_ZEROCOPY);
1843 		break;
1844 
1845 	case SO_TXTIME:
1846 		lv = sizeof(v.txtime);
1847 		v.txtime.clockid = sk->sk_clockid;
1848 		v.txtime.flags |= sk->sk_txtime_deadline_mode ?
1849 				  SOF_TXTIME_DEADLINE_MODE : 0;
1850 		v.txtime.flags |= sk->sk_txtime_report_errors ?
1851 				  SOF_TXTIME_REPORT_ERRORS : 0;
1852 		break;
1853 
1854 	case SO_BINDTOIFINDEX:
1855 		v.val = sk->sk_bound_dev_if;
1856 		break;
1857 
1858 	case SO_NETNS_COOKIE:
1859 		lv = sizeof(u64);
1860 		if (len != lv)
1861 			return -EINVAL;
1862 		v.val64 = sock_net(sk)->net_cookie;
1863 		break;
1864 
1865 	case SO_BUF_LOCK:
1866 		v.val = sk->sk_userlocks & SOCK_BUF_LOCK_MASK;
1867 		break;
1868 
1869 	case SO_RESERVE_MEM:
1870 		v.val = sk->sk_reserved_mem;
1871 		break;
1872 
1873 	case SO_TXREHASH:
1874 		v.val = sk->sk_txrehash;
1875 		break;
1876 
1877 	default:
1878 		/* We implement SO_SNDLOWAT etc. to not be settable
1879 		 * (1003.1g 7).
1880 		 */
1881 		return -ENOPROTOOPT;
1882 	}
1883 
1884 	if (len > lv)
1885 		len = lv;
1886 	if (copy_to_user(optval, &v, len))
1887 		return -EFAULT;
1888 lenout:
1889 	if (put_user(len, optlen))
1890 		return -EFAULT;
1891 	return 0;
1892 }
1893 
1894 /*
1895  * Initialize an sk_lock.
1896  *
1897  * (We also register the sk_lock with the lock validator.)
1898  */
1899 static inline void sock_lock_init(struct sock *sk)
1900 {
1901 	if (sk->sk_kern_sock)
1902 		sock_lock_init_class_and_name(
1903 			sk,
1904 			af_family_kern_slock_key_strings[sk->sk_family],
1905 			af_family_kern_slock_keys + sk->sk_family,
1906 			af_family_kern_key_strings[sk->sk_family],
1907 			af_family_kern_keys + sk->sk_family);
1908 	else
1909 		sock_lock_init_class_and_name(
1910 			sk,
1911 			af_family_slock_key_strings[sk->sk_family],
1912 			af_family_slock_keys + sk->sk_family,
1913 			af_family_key_strings[sk->sk_family],
1914 			af_family_keys + sk->sk_family);
1915 }
1916 
1917 /*
1918  * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
1919  * even temporarly, because of RCU lookups. sk_node should also be left as is.
1920  * even temporarily, because of RCU lookups. sk_node should also be left as is.
1921  * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end.
1922 static void sock_copy(struct sock *nsk, const struct sock *osk)
1923 {
1924 	const struct proto *prot = READ_ONCE(osk->sk_prot);
1925 #ifdef CONFIG_SECURITY_NETWORK
1926 	void *sptr = nsk->sk_security;
1927 #endif
1928 
1929 	/* If we move sk_tx_queue_mapping out of the private section,
1930 	 * we must check if sk_tx_queue_clear() is called after
1931 	 * sock_copy() in sk_clone_lock().
1932 	 */
1933 	BUILD_BUG_ON(offsetof(struct sock, sk_tx_queue_mapping) <
1934 		     offsetof(struct sock, sk_dontcopy_begin) ||
1935 		     offsetof(struct sock, sk_tx_queue_mapping) >=
1936 		     offsetof(struct sock, sk_dontcopy_end));
1937 
1938 	memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1939 
1940 	memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1941 	       prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1942 
1943 #ifdef CONFIG_SECURITY_NETWORK
1944 	nsk->sk_security = sptr;
1945 	security_sk_clone(osk, nsk);
1946 #endif
1947 }
1948 
1949 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1950 		int family)
1951 {
1952 	struct sock *sk;
1953 	struct kmem_cache *slab;
1954 
1955 	slab = prot->slab;
1956 	if (slab != NULL) {
1957 		sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1958 		if (!sk)
1959 			return sk;
1960 		if (want_init_on_alloc(priority))
1961 			sk_prot_clear_nulls(sk, prot->obj_size);
1962 	} else
1963 		sk = kmalloc(prot->obj_size, priority);
1964 
1965 	if (sk != NULL) {
1966 		if (security_sk_alloc(sk, family, priority))
1967 			goto out_free;
1968 
1969 		if (!try_module_get(prot->owner))
1970 			goto out_free_sec;
1971 	}
1972 
1973 	return sk;
1974 
1975 out_free_sec:
1976 	security_sk_free(sk);
1977 out_free:
1978 	if (slab != NULL)
1979 		kmem_cache_free(slab, sk);
1980 	else
1981 		kfree(sk);
1982 	return NULL;
1983 }
1984 
1985 static void sk_prot_free(struct proto *prot, struct sock *sk)
1986 {
1987 	struct kmem_cache *slab;
1988 	struct module *owner;
1989 
1990 	owner = prot->owner;
1991 	slab = prot->slab;
1992 
1993 	cgroup_sk_free(&sk->sk_cgrp_data);
1994 	mem_cgroup_sk_free(sk);
1995 	security_sk_free(sk);
1996 	if (slab != NULL)
1997 		kmem_cache_free(slab, sk);
1998 	else
1999 		kfree(sk);
2000 	module_put(owner);
2001 }
2002 
2003 /**
2004  *	sk_alloc - All socket objects are allocated here
2005  *	@net: the applicable net namespace
2006  *	@family: protocol family
2007  *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
2008  *	@prot: struct proto associated with this new sock instance
2009  *	@kern: is this to be a kernel socket?
2010  */
2011 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
2012 		      struct proto *prot, int kern)
2013 {
2014 	struct sock *sk;
2015 
2016 	sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
2017 	if (sk) {
2018 		sk->sk_family = family;
2019 		/*
2020 		 * See comment in struct sock definition to understand
2021 		 * why we need sk_prot_creator -acme
2022 		 */
2023 		sk->sk_prot = sk->sk_prot_creator = prot;
2024 		sk->sk_kern_sock = kern;
2025 		sock_lock_init(sk);
2026 		sk->sk_net_refcnt = kern ? 0 : 1;
2027 		if (likely(sk->sk_net_refcnt)) {
2028 			get_net_track(net, &sk->ns_tracker, priority);
2029 			sock_inuse_add(net, 1);
2030 		}
2031 
2032 		sock_net_set(sk, net);
2033 		refcount_set(&sk->sk_wmem_alloc, 1);
2034 
2035 		mem_cgroup_sk_alloc(sk);
2036 		cgroup_sk_alloc(&sk->sk_cgrp_data);
2037 		sock_update_classid(&sk->sk_cgrp_data);
2038 		sock_update_netprioidx(&sk->sk_cgrp_data);
2039 		sk_tx_queue_clear(sk);
2040 	}
2041 
2042 	return sk;
2043 }
2044 EXPORT_SYMBOL(sk_alloc);
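
/* Example (illustrative sketch, not part of the original file): how an
 * address family's ->create() handler typically pairs sk_alloc() with
 * sock_init_data(), which is defined later in this file.  The foo_*
 * identifiers are hypothetical placeholders and PF_INET merely stands in
 * for the caller's real family.
 */
static struct proto foo_proto;	/* assumed registered via proto_register() */

static int foo_create(struct net *net, struct socket *sock, int protocol,
		      int kern)
{
	struct sock *sk;

	sk = sk_alloc(net, PF_INET, GFP_KERNEL, &foo_proto, kern);
	if (!sk)
		return -ENOBUFS;

	sock_init_data(sock, sk);
	sk->sk_protocol = protocol;
	return 0;
}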
2045 
2046 /* Sockets having SOCK_RCU_FREE will call this function after one RCU
2047  * grace period. This is the case for UDP sockets and TCP listeners.
2048  */
2049 static void __sk_destruct(struct rcu_head *head)
2050 {
2051 	struct sock *sk = container_of(head, struct sock, sk_rcu);
2052 	struct sk_filter *filter;
2053 
2054 	if (sk->sk_destruct)
2055 		sk->sk_destruct(sk);
2056 
2057 	filter = rcu_dereference_check(sk->sk_filter,
2058 				       refcount_read(&sk->sk_wmem_alloc) == 0);
2059 	if (filter) {
2060 		sk_filter_uncharge(sk, filter);
2061 		RCU_INIT_POINTER(sk->sk_filter, NULL);
2062 	}
2063 
2064 	sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
2065 
2066 #ifdef CONFIG_BPF_SYSCALL
2067 	bpf_sk_storage_free(sk);
2068 #endif
2069 
2070 	if (atomic_read(&sk->sk_omem_alloc))
2071 		pr_debug("%s: optmem leakage (%d bytes) detected\n",
2072 			 __func__, atomic_read(&sk->sk_omem_alloc));
2073 
2074 	if (sk->sk_frag.page) {
2075 		put_page(sk->sk_frag.page);
2076 		sk->sk_frag.page = NULL;
2077 	}
2078 
2079 	/* We do not need to acquire sk->sk_peer_lock, we are the last user. */
2080 	put_cred(sk->sk_peer_cred);
2081 	put_pid(sk->sk_peer_pid);
2082 
2083 	if (likely(sk->sk_net_refcnt))
2084 		put_net_track(sock_net(sk), &sk->ns_tracker);
2085 	sk_prot_free(sk->sk_prot_creator, sk);
2086 }
2087 
2088 void sk_destruct(struct sock *sk)
2089 {
2090 	bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE);
2091 
2092 	if (rcu_access_pointer(sk->sk_reuseport_cb)) {
2093 		reuseport_detach_sock(sk);
2094 		use_call_rcu = true;
2095 	}
2096 
2097 	if (use_call_rcu)
2098 		call_rcu(&sk->sk_rcu, __sk_destruct);
2099 	else
2100 		__sk_destruct(&sk->sk_rcu);
2101 }
2102 
2103 static void __sk_free(struct sock *sk)
2104 {
2105 	if (likely(sk->sk_net_refcnt))
2106 		sock_inuse_add(sock_net(sk), -1);
2107 
2108 	if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk)))
2109 		sock_diag_broadcast_destroy(sk);
2110 	else
2111 		sk_destruct(sk);
2112 }
2113 
2114 void sk_free(struct sock *sk)
2115 {
2116 	/*
2117 	 * We subtract one from sk_wmem_alloc and can tell whether
2118 	 * some packets are still in some tx queue.
2119 	 * If the result is not zero, sock_wfree() will call __sk_free(sk) later.
2120 	 */
2121 	if (refcount_dec_and_test(&sk->sk_wmem_alloc))
2122 		__sk_free(sk);
2123 }
2124 EXPORT_SYMBOL(sk_free);
2125 
2126 static void sk_init_common(struct sock *sk)
2127 {
2128 	skb_queue_head_init(&sk->sk_receive_queue);
2129 	skb_queue_head_init(&sk->sk_write_queue);
2130 	skb_queue_head_init(&sk->sk_error_queue);
2131 
2132 	rwlock_init(&sk->sk_callback_lock);
2133 	lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
2134 			af_rlock_keys + sk->sk_family,
2135 			af_family_rlock_key_strings[sk->sk_family]);
2136 	lockdep_set_class_and_name(&sk->sk_write_queue.lock,
2137 			af_wlock_keys + sk->sk_family,
2138 			af_family_wlock_key_strings[sk->sk_family]);
2139 	lockdep_set_class_and_name(&sk->sk_error_queue.lock,
2140 			af_elock_keys + sk->sk_family,
2141 			af_family_elock_key_strings[sk->sk_family]);
2142 	lockdep_set_class_and_name(&sk->sk_callback_lock,
2143 			af_callback_keys + sk->sk_family,
2144 			af_family_clock_key_strings[sk->sk_family]);
2145 }
2146 
2147 /**
2148  *	sk_clone_lock - clone a socket, and lock its clone
2149  *	@sk: the socket to clone
2150  *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
2151  *
2152  *	Caller must unlock socket even in error path (bh_unlock_sock(newsk))
2153  */
2154 struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
2155 {
2156 	struct proto *prot = READ_ONCE(sk->sk_prot);
2157 	struct sk_filter *filter;
2158 	bool is_charged = true;
2159 	struct sock *newsk;
2160 
2161 	newsk = sk_prot_alloc(prot, priority, sk->sk_family);
2162 	if (!newsk)
2163 		goto out;
2164 
2165 	sock_copy(newsk, sk);
2166 
2167 	newsk->sk_prot_creator = prot;
2168 
2169 	/* SANITY */
2170 	if (likely(newsk->sk_net_refcnt)) {
2171 		get_net_track(sock_net(newsk), &newsk->ns_tracker, priority);
2172 		sock_inuse_add(sock_net(newsk), 1);
2173 	}
2174 	sk_node_init(&newsk->sk_node);
2175 	sock_lock_init(newsk);
2176 	bh_lock_sock(newsk);
2177 	newsk->sk_backlog.head	= newsk->sk_backlog.tail = NULL;
2178 	newsk->sk_backlog.len = 0;
2179 
2180 	atomic_set(&newsk->sk_rmem_alloc, 0);
2181 
2182 	/* sk_wmem_alloc set to one (see sk_free() and sock_wfree()) */
2183 	refcount_set(&newsk->sk_wmem_alloc, 1);
2184 
2185 	atomic_set(&newsk->sk_omem_alloc, 0);
2186 	sk_init_common(newsk);
2187 
2188 	newsk->sk_dst_cache	= NULL;
2189 	newsk->sk_dst_pending_confirm = 0;
2190 	newsk->sk_wmem_queued	= 0;
2191 	newsk->sk_forward_alloc = 0;
2192 	newsk->sk_reserved_mem  = 0;
2193 	atomic_set(&newsk->sk_drops, 0);
2194 	newsk->sk_send_head	= NULL;
2195 	newsk->sk_userlocks	= sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
2196 	atomic_set(&newsk->sk_zckey, 0);
2197 
2198 	sock_reset_flag(newsk, SOCK_DONE);
2199 
2200 	/* sk->sk_memcg will be populated at accept() time */
2201 	newsk->sk_memcg = NULL;
2202 
2203 	cgroup_sk_clone(&newsk->sk_cgrp_data);
2204 
2205 	rcu_read_lock();
2206 	filter = rcu_dereference(sk->sk_filter);
2207 	if (filter != NULL)
2208 		/* Though it's an empty new sock, the charging may fail
2209 		 * if sysctl_optmem_max was changed between creation of
2210 		 * the original socket and the clone.
2211 		 */
2212 		is_charged = sk_filter_charge(newsk, filter);
2213 	RCU_INIT_POINTER(newsk->sk_filter, filter);
2214 	rcu_read_unlock();
2215 
2216 	if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
2217 		/* We need to make sure that we don't uncharge the new
2218 		 * socket if we couldn't charge it in the first place
2219 		 * as otherwise we uncharge the parent's filter.
2220 		 */
2221 		if (!is_charged)
2222 			RCU_INIT_POINTER(newsk->sk_filter, NULL);
2223 		sk_free_unlock_clone(newsk);
2224 		newsk = NULL;
2225 		goto out;
2226 	}
2227 	RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
2228 
2229 	if (bpf_sk_storage_clone(sk, newsk)) {
2230 		sk_free_unlock_clone(newsk);
2231 		newsk = NULL;
2232 		goto out;
2233 	}
2234 
2235 	/* Clear sk_user_data if parent had the pointer tagged
2236 	 * as not suitable for copying when cloning.
2237 	 */
2238 	if (sk_user_data_is_nocopy(newsk))
2239 		newsk->sk_user_data = NULL;
2240 
2241 	newsk->sk_err	   = 0;
2242 	newsk->sk_err_soft = 0;
2243 	newsk->sk_priority = 0;
2244 	newsk->sk_incoming_cpu = raw_smp_processor_id();
2245 
2246 	/* Before updating sk_refcnt, we must commit prior changes to memory
2247 	 * (Documentation/RCU/rculist_nulls.rst for details)
2248 	 */
2249 	smp_wmb();
2250 	refcount_set(&newsk->sk_refcnt, 2);
2251 
2252 	/* Increment the counter in the same struct proto as the master
2253 	 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
2254 	 * is the same as sk->sk_prot->socks, as this field was copied
2255 	 * with memcpy).
2256 	 *
2257 	 * This _changes_ the previous behaviour, where
2258 	 * tcp_create_openreq_child always was incrementing the
2259 	 * equivalent to tcp_prot->socks (inet_sock_nr), so this has
2260 	 * to be taken into account in all callers. -acme
2261 	 */
2262 	sk_refcnt_debug_inc(newsk);
2263 	sk_set_socket(newsk, NULL);
2264 	sk_tx_queue_clear(newsk);
2265 	RCU_INIT_POINTER(newsk->sk_wq, NULL);
2266 
2267 	if (newsk->sk_prot->sockets_allocated)
2268 		sk_sockets_allocated_inc(newsk);
2269 
2270 	if (sock_needs_netstamp(sk) && newsk->sk_flags & SK_FLAGS_TIMESTAMP)
2271 		net_enable_timestamp();
2272 out:
2273 	return newsk;
2274 }
2275 EXPORT_SYMBOL_GPL(sk_clone_lock);
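
/* Example (illustrative sketch): the locking contract of sk_clone_lock().
 * The clone is returned bh-locked with sk_refcnt == 2, so the caller must
 * bh_unlock_sock() it on every path, including errors.  foo_clone_child()
 * is a hypothetical placeholder, not a real kernel symbol.
 */
static struct sock *foo_clone_child(const struct sock *parent)
{
	struct sock *newsk = sk_clone_lock(parent, GFP_ATOMIC);

	if (newsk) {
		/* ... protocol specific initialisation of newsk ... */
		bh_unlock_sock(newsk);	/* mandatory, even on error paths */
	}
	return newsk;
}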
2276 
2277 void sk_free_unlock_clone(struct sock *sk)
2278 {
2279 	/* It is still a raw copy of the parent, so invalidate
2280 	 * the destructor and do a plain sk_free() */
2281 	sk->sk_destruct = NULL;
2282 	bh_unlock_sock(sk);
2283 	sk_free(sk);
2284 }
2285 EXPORT_SYMBOL_GPL(sk_free_unlock_clone);
2286 
2287 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
2288 {
2289 	u32 max_segs = 1;
2290 
2291 	sk_dst_set(sk, dst);
2292 	sk->sk_route_caps = dst->dev->features;
2293 	if (sk_is_tcp(sk))
2294 		sk->sk_route_caps |= NETIF_F_GSO;
2295 	if (sk->sk_route_caps & NETIF_F_GSO)
2296 		sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
2297 	if (unlikely(sk->sk_gso_disabled))
2298 		sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
2299 	if (sk_can_gso(sk)) {
2300 		if (dst->header_len && !xfrm_dst_offload_ok(dst)) {
2301 			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
2302 		} else {
2303 			sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
2304 			/* pairs with the WRITE_ONCE() in netif_set_gso_max_size() */
2305 			sk->sk_gso_max_size = READ_ONCE(dst->dev->gso_max_size);
2306 			sk->sk_gso_max_size -= (MAX_TCP_HEADER + 1);
2307 			/* pairs with the WRITE_ONCE() in netif_set_gso_max_segs() */
2308 			max_segs = max_t(u32, READ_ONCE(dst->dev->gso_max_segs), 1);
2309 		}
2310 	}
2311 	sk->sk_gso_max_segs = max_segs;
2312 }
2313 EXPORT_SYMBOL_GPL(sk_setup_caps);
2314 
2315 /*
2316  *	Simple resource managers for sockets.
2317  */
2318 
2319 
2320 /*
2321  * Write buffer destructor automatically called from kfree_skb.
2322  */
2323 void sock_wfree(struct sk_buff *skb)
2324 {
2325 	struct sock *sk = skb->sk;
2326 	unsigned int len = skb->truesize;
2327 
2328 	if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
2329 		/*
2330 		 * Keep a reference on sk_wmem_alloc; it will be released
2331 		 * after the sk_write_space() call.
2332 		 */
2333 		WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
2334 		sk->sk_write_space(sk);
2335 		len = 1;
2336 	}
2337 	/*
2338 	 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
2339 	 * could not do because of in-flight packets
2340 	 */
2341 	if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
2342 		__sk_free(sk);
2343 }
2344 EXPORT_SYMBOL(sock_wfree);
2345 
2346 /* This variant of sock_wfree() is used by TCP,
2347  * since it sets SOCK_USE_WRITE_QUEUE.
2348  */
2349 void __sock_wfree(struct sk_buff *skb)
2350 {
2351 	struct sock *sk = skb->sk;
2352 
2353 	if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
2354 		__sk_free(sk);
2355 }
2356 
2357 void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
2358 {
2359 	skb_orphan(skb);
2360 	skb->sk = sk;
2361 #ifdef CONFIG_INET
2362 	if (unlikely(!sk_fullsock(sk))) {
2363 		skb->destructor = sock_edemux;
2364 		sock_hold(sk);
2365 		return;
2366 	}
2367 #endif
2368 	skb->destructor = sock_wfree;
2369 	skb_set_hash_from_sk(skb, sk);
2370 	/*
2371 	 * We used to take a refcount on sk, but the following operation
2372 	 * is enough to guarantee sk_free() won't free this sock until
2373 	 * all in-flight packets are completed
2374 	 */
2375 	refcount_add(skb->truesize, &sk->sk_wmem_alloc);
2376 }
2377 EXPORT_SYMBOL(skb_set_owner_w);
2378 
2379 static bool can_skb_orphan_partial(const struct sk_buff *skb)
2380 {
2381 #ifdef CONFIG_TLS_DEVICE
2382 	/* Drivers depend on in-order delivery for crypto offload,
2383 	 * partial orphan breaks out-of-order-OK logic.
2384 	 */
2385 	if (skb->decrypted)
2386 		return false;
2387 #endif
2388 	return (skb->destructor == sock_wfree ||
2389 		(IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree));
2390 }
2391 
2392 /* This helper is used by netem, as it can hold packets in its
2393  * delay queue. We want to allow the owner socket to send more
2394  * packets, as if they were already TX completed by a typical driver.
2395  * But we also want to keep skb->sk set because some packet schedulers
2396  * rely on it (sch_fq for example).
2397  */
2398 void skb_orphan_partial(struct sk_buff *skb)
2399 {
2400 	if (skb_is_tcp_pure_ack(skb))
2401 		return;
2402 
2403 	if (can_skb_orphan_partial(skb) && skb_set_owner_sk_safe(skb, skb->sk))
2404 		return;
2405 
2406 	skb_orphan(skb);
2407 }
2408 EXPORT_SYMBOL(skb_orphan_partial);
2409 
2410 /*
2411  * Read buffer destructor automatically called from kfree_skb.
2412  */
2413 void sock_rfree(struct sk_buff *skb)
2414 {
2415 	struct sock *sk = skb->sk;
2416 	unsigned int len = skb->truesize;
2417 
2418 	atomic_sub(len, &sk->sk_rmem_alloc);
2419 	sk_mem_uncharge(sk, len);
2420 }
2421 EXPORT_SYMBOL(sock_rfree);
2422 
2423 /*
2424  * Buffer destructor for skbs that are not used directly in read or write
2425  * path, e.g. for error handler skbs. Automatically called from kfree_skb.
2426  */
2427 void sock_efree(struct sk_buff *skb)
2428 {
2429 	sock_put(skb->sk);
2430 }
2431 EXPORT_SYMBOL(sock_efree);
2432 
2433 /* Buffer destructor for prefetch/receive path where reference count may
2434  * not be held, e.g. for listen sockets.
2435  */
2436 #ifdef CONFIG_INET
2437 void sock_pfree(struct sk_buff *skb)
2438 {
2439 	if (sk_is_refcounted(skb->sk))
2440 		sock_gen_put(skb->sk);
2441 }
2442 EXPORT_SYMBOL(sock_pfree);
2443 #endif /* CONFIG_INET */
2444 
2445 kuid_t sock_i_uid(struct sock *sk)
2446 {
2447 	kuid_t uid;
2448 
2449 	read_lock_bh(&sk->sk_callback_lock);
2450 	uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
2451 	read_unlock_bh(&sk->sk_callback_lock);
2452 	return uid;
2453 }
2454 EXPORT_SYMBOL(sock_i_uid);
2455 
2456 unsigned long sock_i_ino(struct sock *sk)
2457 {
2458 	unsigned long ino;
2459 
2460 	read_lock_bh(&sk->sk_callback_lock);
2461 	ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
2462 	read_unlock_bh(&sk->sk_callback_lock);
2463 	return ino;
2464 }
2465 EXPORT_SYMBOL(sock_i_ino);
2466 
2467 /*
2468  * Allocate a skb from the socket's send buffer.
2469  */
2470 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
2471 			     gfp_t priority)
2472 {
2473 	if (force ||
2474 	    refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) {
2475 		struct sk_buff *skb = alloc_skb(size, priority);
2476 
2477 		if (skb) {
2478 			skb_set_owner_w(skb, sk);
2479 			return skb;
2480 		}
2481 	}
2482 	return NULL;
2483 }
2484 EXPORT_SYMBOL(sock_wmalloc);
2485 
2486 static void sock_ofree(struct sk_buff *skb)
2487 {
2488 	struct sock *sk = skb->sk;
2489 
2490 	atomic_sub(skb->truesize, &sk->sk_omem_alloc);
2491 }
2492 
2493 struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
2494 			     gfp_t priority)
2495 {
2496 	struct sk_buff *skb;
2497 
2498 	/* small safe race: SKB_TRUESIZE may differ from final skb->truesize */
2499 	if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) >
2500 	    sysctl_optmem_max)
2501 		return NULL;
2502 
2503 	skb = alloc_skb(size, priority);
2504 	if (!skb)
2505 		return NULL;
2506 
2507 	atomic_add(skb->truesize, &sk->sk_omem_alloc);
2508 	skb->sk = sk;
2509 	skb->destructor = sock_ofree;
2510 	return skb;
2511 }
2512 
2513 /*
2514  * Allocate a memory block from the socket's option memory buffer.
2515  */
2516 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
2517 {
2518 	if ((unsigned int)size <= sysctl_optmem_max &&
2519 	    atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
2520 		void *mem;
2521 		/* First do the add, to avoid the race if kmalloc
2522 		 * might sleep.
2523 		 */
2524 		atomic_add(size, &sk->sk_omem_alloc);
2525 		mem = kmalloc(size, priority);
2526 		if (mem)
2527 			return mem;
2528 		atomic_sub(size, &sk->sk_omem_alloc);
2529 	}
2530 	return NULL;
2531 }
2532 EXPORT_SYMBOL(sock_kmalloc);
2533 
2534 /* Free an option memory block. Note: we actually want the inline
2535  * here, as it allows gcc to detect the nullify and fold away the
2536  * condition entirely.
2537  */
2538 static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
2539 				  const bool nullify)
2540 {
2541 	if (WARN_ON_ONCE(!mem))
2542 		return;
2543 	if (nullify)
2544 		kfree_sensitive(mem);
2545 	else
2546 		kfree(mem);
2547 	atomic_sub(size, &sk->sk_omem_alloc);
2548 }
2549 
2550 void sock_kfree_s(struct sock *sk, void *mem, int size)
2551 {
2552 	__sock_kfree_s(sk, mem, size, false);
2553 }
2554 EXPORT_SYMBOL(sock_kfree_s);
2555 
2556 void sock_kzfree_s(struct sock *sk, void *mem, int size)
2557 {
2558 	__sock_kfree_s(sk, mem, size, true);
2559 }
2560 EXPORT_SYMBOL(sock_kzfree_s);
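
/* Example (illustrative sketch): the usual allocate/free pairing for
 * per-socket option memory.  struct foo_opt, foo_install_opt() and the
 * -ENOBUFS policy are hypothetical, not APIs defined in this file.
 */
struct foo_opt {
	u32 flags;
	u32 key[4];
};

static int foo_install_opt(struct sock *sk)
{
	struct foo_opt *opt;

	opt = sock_kmalloc(sk, sizeof(*opt), GFP_KERNEL);
	if (!opt)
		return -ENOBUFS;	/* over sysctl_optmem_max or OOM */

	/* ... fill in and publish opt behind sk ... */

	sock_kfree_s(sk, opt, sizeof(*opt));	/* sock_kzfree_s() if it held secrets */
	return 0;
}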
2561 
2562 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
2563    I think these locks should be removed for datagram sockets.
2564  */
2565 static long sock_wait_for_wmem(struct sock *sk, long timeo)
2566 {
2567 	DEFINE_WAIT(wait);
2568 
2569 	sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2570 	for (;;) {
2571 		if (!timeo)
2572 			break;
2573 		if (signal_pending(current))
2574 			break;
2575 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2576 		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
2577 		if (refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf))
2578 			break;
2579 		if (sk->sk_shutdown & SEND_SHUTDOWN)
2580 			break;
2581 		if (sk->sk_err)
2582 			break;
2583 		timeo = schedule_timeout(timeo);
2584 	}
2585 	finish_wait(sk_sleep(sk), &wait);
2586 	return timeo;
2587 }
2588 
2589 
2590 /*
2591  *	Generic send/receive buffer handlers
2592  */
2593 
2594 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
2595 				     unsigned long data_len, int noblock,
2596 				     int *errcode, int max_page_order)
2597 {
2598 	struct sk_buff *skb;
2599 	long timeo;
2600 	int err;
2601 
2602 	timeo = sock_sndtimeo(sk, noblock);
2603 	for (;;) {
2604 		err = sock_error(sk);
2605 		if (err != 0)
2606 			goto failure;
2607 
2608 		err = -EPIPE;
2609 		if (sk->sk_shutdown & SEND_SHUTDOWN)
2610 			goto failure;
2611 
2612 		if (sk_wmem_alloc_get(sk) < READ_ONCE(sk->sk_sndbuf))
2613 			break;
2614 
2615 		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2616 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2617 		err = -EAGAIN;
2618 		if (!timeo)
2619 			goto failure;
2620 		if (signal_pending(current))
2621 			goto interrupted;
2622 		timeo = sock_wait_for_wmem(sk, timeo);
2623 	}
2624 	skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
2625 				   errcode, sk->sk_allocation);
2626 	if (skb)
2627 		skb_set_owner_w(skb, sk);
2628 	return skb;
2629 
2630 interrupted:
2631 	err = sock_intr_errno(timeo);
2632 failure:
2633 	*errcode = err;
2634 	return NULL;
2635 }
2636 EXPORT_SYMBOL(sock_alloc_send_pskb);
2637 
2638 struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
2639 				    int noblock, int *errcode)
2640 {
2641 	return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0);
2642 }
2643 EXPORT_SYMBOL(sock_alloc_send_skb);
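
/* Example (illustrative sketch): a typical transmit allocation.  The
 * returned skb is already owned by the socket (destructor sock_wfree(),
 * charged against sk_wmem_alloc).  foo_build_packet() and the MAX_HEADER
 * headroom choice are illustrative assumptions.
 */
static struct sk_buff *foo_build_packet(struct sock *sk, size_t len,
					int noblock, int *err)
{
	struct sk_buff *skb;

	skb = sock_alloc_send_skb(sk, len + MAX_HEADER, noblock, err);
	if (!skb)
		return NULL;		/* *err is -EAGAIN, -EPIPE, ... */

	skb_reserve(skb, MAX_HEADER);	/* room for lower layer headers */
	return skb;
}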
2644 
2645 int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg,
2646 		     struct sockcm_cookie *sockc)
2647 {
2648 	u32 tsflags;
2649 
2650 	switch (cmsg->cmsg_type) {
2651 	case SO_MARK:
2652 		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
2653 		    !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2654 			return -EPERM;
2655 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2656 			return -EINVAL;
2657 		sockc->mark = *(u32 *)CMSG_DATA(cmsg);
2658 		break;
2659 	case SO_TIMESTAMPING_OLD:
2660 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2661 			return -EINVAL;
2662 
2663 		tsflags = *(u32 *)CMSG_DATA(cmsg);
2664 		if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
2665 			return -EINVAL;
2666 
2667 		sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
2668 		sockc->tsflags |= tsflags;
2669 		break;
2670 	case SCM_TXTIME:
2671 		if (!sock_flag(sk, SOCK_TXTIME))
2672 			return -EINVAL;
2673 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64)))
2674 			return -EINVAL;
2675 		sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg));
2676 		break;
2677 	/* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
2678 	case SCM_RIGHTS:
2679 	case SCM_CREDENTIALS:
2680 		break;
2681 	default:
2682 		return -EINVAL;
2683 	}
2684 	return 0;
2685 }
2686 EXPORT_SYMBOL(__sock_cmsg_send);
2687 
2688 int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
2689 		   struct sockcm_cookie *sockc)
2690 {
2691 	struct cmsghdr *cmsg;
2692 	int ret;
2693 
2694 	for_each_cmsghdr(cmsg, msg) {
2695 		if (!CMSG_OK(msg, cmsg))
2696 			return -EINVAL;
2697 		if (cmsg->cmsg_level != SOL_SOCKET)
2698 			continue;
2699 		ret = __sock_cmsg_send(sk, msg, cmsg, sockc);
2700 		if (ret)
2701 			return ret;
2702 	}
2703 	return 0;
2704 }
2705 EXPORT_SYMBOL(sock_cmsg_send);
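
/* Example (illustrative sketch): how a protocol's sendmsg() path usually
 * consumes these SOL_SOCKET control messages.  sockcm_init() is the helper
 * from include/net/sock.h that seeds the cookie; foo_sendmsg() is a
 * hypothetical placeholder.
 */
static int foo_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
{
	struct sockcm_cookie sockc;
	int err;

	sockcm_init(&sockc, sk);	/* defaults, e.g. sk->sk_tsflags */
	if (msg->msg_controllen) {
		err = sock_cmsg_send(sk, msg, &sockc);
		if (err)
			return err;
	}

	/* sockc.mark, sockc.tsflags and sockc.transmit_time are now valid */
	return len;
}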
2706 
2707 static void sk_enter_memory_pressure(struct sock *sk)
2708 {
2709 	if (!sk->sk_prot->enter_memory_pressure)
2710 		return;
2711 
2712 	sk->sk_prot->enter_memory_pressure(sk);
2713 }
2714 
2715 static void sk_leave_memory_pressure(struct sock *sk)
2716 {
2717 	if (sk->sk_prot->leave_memory_pressure) {
2718 		sk->sk_prot->leave_memory_pressure(sk);
2719 	} else {
2720 		unsigned long *memory_pressure = sk->sk_prot->memory_pressure;
2721 
2722 		if (memory_pressure && READ_ONCE(*memory_pressure))
2723 			WRITE_ONCE(*memory_pressure, 0);
2724 	}
2725 }
2726 
2727 DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key);
2728 
2729 /**
2730  * skb_page_frag_refill - check that a page_frag contains enough room
2731  * @sz: minimum size of the fragment we want to get
2732  * @pfrag: pointer to page_frag
2733  * @gfp: priority for memory allocation
2734  *
2735  * Note: While this allocator tries to use high order pages, there is
2736  * no guarantee that allocations succeed. Therefore, @sz MUST be
2737  * less than or equal to PAGE_SIZE.
2738  */
2739 bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
2740 {
2741 	if (pfrag->page) {
2742 		if (page_ref_count(pfrag->page) == 1) {
2743 			pfrag->offset = 0;
2744 			return true;
2745 		}
2746 		if (pfrag->offset + sz <= pfrag->size)
2747 			return true;
2748 		put_page(pfrag->page);
2749 	}
2750 
2751 	pfrag->offset = 0;
2752 	if (SKB_FRAG_PAGE_ORDER &&
2753 	    !static_branch_unlikely(&net_high_order_alloc_disable_key)) {
2754 		/* Avoid direct reclaim but allow kswapd to wake */
2755 		pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
2756 					  __GFP_COMP | __GFP_NOWARN |
2757 					  __GFP_NORETRY,
2758 					  SKB_FRAG_PAGE_ORDER);
2759 		if (likely(pfrag->page)) {
2760 			pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
2761 			return true;
2762 		}
2763 	}
2764 	pfrag->page = alloc_page(gfp);
2765 	if (likely(pfrag->page)) {
2766 		pfrag->size = PAGE_SIZE;
2767 		return true;
2768 	}
2769 	return false;
2770 }
2771 EXPORT_SYMBOL(skb_page_frag_refill);
2772 
2773 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
2774 {
2775 	if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
2776 		return true;
2777 
2778 	sk_enter_memory_pressure(sk);
2779 	sk_stream_moderate_sndbuf(sk);
2780 	return false;
2781 }
2782 EXPORT_SYMBOL(sk_page_frag_refill);
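
/* Example (illustrative sketch): the copy loop most stream protocols build
 * on top of sk_page_frag_refill().  Attaching the area to an skb fragment
 * (get_page() + skb_fill_page_desc()) is reduced to a comment, and
 * foo_copy_chunk() is a hypothetical placeholder.
 */
static int foo_copy_chunk(struct sock *sk, struct iov_iter *from, int want)
{
	struct page_frag *pfrag = sk_page_frag(sk);
	int copy;

	if (!sk_page_frag_refill(sk, pfrag))
		return -ENOBUFS;	/* caller normally waits for memory */

	copy = min_t(int, want, pfrag->size - pfrag->offset);
	if (copy_page_from_iter(pfrag->page, pfrag->offset, copy, from) != copy)
		return -EFAULT;

	/* A real caller would get_page(pfrag->page) and hang the copied area
	 * off an skb fragment before advancing the offset.
	 */
	pfrag->offset += copy;
	return copy;
}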
2783 
2784 void __lock_sock(struct sock *sk)
2785 	__releases(&sk->sk_lock.slock)
2786 	__acquires(&sk->sk_lock.slock)
2787 {
2788 	DEFINE_WAIT(wait);
2789 
2790 	for (;;) {
2791 		prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
2792 					TASK_UNINTERRUPTIBLE);
2793 		spin_unlock_bh(&sk->sk_lock.slock);
2794 		schedule();
2795 		spin_lock_bh(&sk->sk_lock.slock);
2796 		if (!sock_owned_by_user(sk))
2797 			break;
2798 	}
2799 	finish_wait(&sk->sk_lock.wq, &wait);
2800 }
2801 
2802 void __release_sock(struct sock *sk)
2803 	__releases(&sk->sk_lock.slock)
2804 	__acquires(&sk->sk_lock.slock)
2805 {
2806 	struct sk_buff *skb, *next;
2807 
2808 	while ((skb = sk->sk_backlog.head) != NULL) {
2809 		sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
2810 
2811 		spin_unlock_bh(&sk->sk_lock.slock);
2812 
2813 		do {
2814 			next = skb->next;
2815 			prefetch(next);
2816 			WARN_ON_ONCE(skb_dst_is_noref(skb));
2817 			skb_mark_not_on_list(skb);
2818 			sk_backlog_rcv(sk, skb);
2819 
2820 			cond_resched();
2821 
2822 			skb = next;
2823 		} while (skb != NULL);
2824 
2825 		spin_lock_bh(&sk->sk_lock.slock);
2826 	}
2827 
2828 	/*
2829 	 * Doing the zeroing here guarantees we cannot loop forever
2830 	 * while a wild producer attempts to flood us.
2831 	 */
2832 	sk->sk_backlog.len = 0;
2833 }
2834 
2835 void __sk_flush_backlog(struct sock *sk)
2836 {
2837 	spin_lock_bh(&sk->sk_lock.slock);
2838 	__release_sock(sk);
2839 	spin_unlock_bh(&sk->sk_lock.slock);
2840 }
2841 
2842 /**
2843  * sk_wait_data - wait for data to arrive at sk_receive_queue
2844  * @sk:    sock to wait on
2845  * @timeo: for how long
2846  * @skb:   last skb seen on sk_receive_queue
2847  *
2848  * Now socket state including sk->sk_err is changed only under lock,
2849  * hence we may omit checks after joining wait queue.
2850  * We check the receive queue before schedule() only as an optimization;
2851  * it is very likely that release_sock() added new data.
2852  */
2853 int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
2854 {
2855 	DEFINE_WAIT_FUNC(wait, woken_wake_function);
2856 	int rc;
2857 
2858 	add_wait_queue(sk_sleep(sk), &wait);
2859 	sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2860 	rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
2861 	sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2862 	remove_wait_queue(sk_sleep(sk), &wait);
2863 	return rc;
2864 }
2865 EXPORT_SYMBOL(sk_wait_data);
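
/* Example (illustrative sketch): a blocking receive path built around
 * sk_wait_data(), relying on the documented behaviour that the socket lock
 * is dropped and re-taken while sleeping.  foo_wait_for_skb() is a
 * hypothetical placeholder with minimal error handling.
 */
static struct sk_buff *foo_wait_for_skb(struct sock *sk, int noblock, int *err)
{
	long timeo = sock_rcvtimeo(sk, noblock);
	struct sk_buff *skb;

	*err = 0;
	lock_sock(sk);
	while (!(skb = skb_peek(&sk->sk_receive_queue))) {
		if (!timeo) {
			*err = -EAGAIN;
			break;
		}
		if (signal_pending(current)) {
			*err = sock_intr_errno(timeo);
			break;
		}
		sk_wait_data(sk, &timeo, NULL);
	}
	release_sock(sk);
	return skb;
}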
2866 
2867 /**
2868  *	__sk_mem_raise_allocated - increase memory_allocated
2869  *	@sk: socket
2870  *	@size: memory size to allocate
2871  *	@amt: pages to allocate
2872  *	@kind: allocation type
2873  *
2874  *	Similar to __sk_mem_schedule(), but does not update sk_forward_alloc
2875  */
2876 int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
2877 {
2878 	struct proto *prot = sk->sk_prot;
2879 	long allocated = sk_memory_allocated_add(sk, amt);
2880 	bool memcg_charge = mem_cgroup_sockets_enabled && sk->sk_memcg;
2881 	bool charged = true;
2882 
2883 	if (memcg_charge &&
2884 	    !(charged = mem_cgroup_charge_skmem(sk->sk_memcg, amt,
2885 						gfp_memcg_charge())))
2886 		goto suppress_allocation;
2887 
2888 	/* Under limit. */
2889 	if (allocated <= sk_prot_mem_limits(sk, 0)) {
2890 		sk_leave_memory_pressure(sk);
2891 		return 1;
2892 	}
2893 
2894 	/* Under pressure. */
2895 	if (allocated > sk_prot_mem_limits(sk, 1))
2896 		sk_enter_memory_pressure(sk);
2897 
2898 	/* Over hard limit. */
2899 	if (allocated > sk_prot_mem_limits(sk, 2))
2900 		goto suppress_allocation;
2901 
2902 	/* guarantee minimum buffer size under pressure */
2903 	if (kind == SK_MEM_RECV) {
2904 		if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot))
2905 			return 1;
2906 
2907 	} else { /* SK_MEM_SEND */
2908 		int wmem0 = sk_get_wmem0(sk, prot);
2909 
2910 		if (sk->sk_type == SOCK_STREAM) {
2911 			if (sk->sk_wmem_queued < wmem0)
2912 				return 1;
2913 		} else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) {
2914 			return 1;
2915 		}
2916 	}
2917 
2918 	if (sk_has_memory_pressure(sk)) {
2919 		u64 alloc;
2920 
2921 		if (!sk_under_memory_pressure(sk))
2922 			return 1;
2923 		alloc = sk_sockets_allocated_read_positive(sk);
2924 		if (sk_prot_mem_limits(sk, 2) > alloc *
2925 		    sk_mem_pages(sk->sk_wmem_queued +
2926 				 atomic_read(&sk->sk_rmem_alloc) +
2927 				 sk->sk_forward_alloc))
2928 			return 1;
2929 	}
2930 
2931 suppress_allocation:
2932 
2933 	if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
2934 		sk_stream_moderate_sndbuf(sk);
2935 
2936 		/* Fail only if socket is _under_ its sndbuf.
2937 		 * In this case we cannot block, so we have to fail.
2938 		 */
2939 		if (sk->sk_wmem_queued + size >= sk->sk_sndbuf) {
2940 			/* Force charge with __GFP_NOFAIL */
2941 			if (memcg_charge && !charged) {
2942 				mem_cgroup_charge_skmem(sk->sk_memcg, amt,
2943 					gfp_memcg_charge() | __GFP_NOFAIL);
2944 			}
2945 			return 1;
2946 		}
2947 	}
2948 
2949 	if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged))
2950 		trace_sock_exceed_buf_limit(sk, prot, allocated, kind);
2951 
2952 	sk_memory_allocated_sub(sk, amt);
2953 
2954 	if (memcg_charge && charged)
2955 		mem_cgroup_uncharge_skmem(sk->sk_memcg, amt);
2956 
2957 	return 0;
2958 }
2959 EXPORT_SYMBOL(__sk_mem_raise_allocated);
2960 
2961 /**
2962  *	__sk_mem_schedule - increase sk_forward_alloc and memory_allocated
2963  *	@sk: socket
2964  *	@size: memory size to allocate
2965  *	@kind: allocation type
2966  *
2967  *	If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
2968  *	rmem allocation. This function assumes that protocols which have
2969  *	memory_pressure use sk_wmem_queued as write buffer accounting.
2970  */
2971 int __sk_mem_schedule(struct sock *sk, int size, int kind)
2972 {
2973 	int ret, amt = sk_mem_pages(size);
2974 
2975 	sk->sk_forward_alloc += amt << SK_MEM_QUANTUM_SHIFT;
2976 	ret = __sk_mem_raise_allocated(sk, size, amt, kind);
2977 	if (!ret)
2978 		sk->sk_forward_alloc -= amt << SK_MEM_QUANTUM_SHIFT;
2979 	return ret;
2980 }
2981 EXPORT_SYMBOL(__sk_mem_schedule);
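
/* Example (illustrative sketch): charging a new transmit skb against these
 * quotas via the sk_wmem_schedule()/sk_mem_charge() wrappers from
 * include/net/sock.h.  foo_charge_skb() is a hypothetical placeholder.
 */
static bool foo_charge_skb(struct sock *sk, struct sk_buff *skb)
{
	if (!sk_wmem_schedule(sk, skb->truesize))
		return false;		/* over limit: drop or wait for memory */

	sk_mem_charge(sk, skb->truesize);	/* consume sk_forward_alloc */
	return true;
}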
2982 
2983 /**
2984  *	__sk_mem_reduce_allocated - reclaim memory_allocated
2985  *	@sk: socket
2986  *	@amount: number of quanta
2987  *
2988  *	Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
2989  */
2990 void __sk_mem_reduce_allocated(struct sock *sk, int amount)
2991 {
2992 	sk_memory_allocated_sub(sk, amount);
2993 
2994 	if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2995 		mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
2996 
2997 	if (sk_under_memory_pressure(sk) &&
2998 	    (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
2999 		sk_leave_memory_pressure(sk);
3000 }
3001 EXPORT_SYMBOL(__sk_mem_reduce_allocated);
3002 
3003 /**
3004  *	__sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
3005  *	@sk: socket
3006  *	@amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple)
3007  */
3008 void __sk_mem_reclaim(struct sock *sk, int amount)
3009 {
3010 	amount >>= SK_MEM_QUANTUM_SHIFT;
3011 	sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
3012 	__sk_mem_reduce_allocated(sk, amount);
3013 }
3014 EXPORT_SYMBOL(__sk_mem_reclaim);
3015 
3016 int sk_set_peek_off(struct sock *sk, int val)
3017 {
3018 	sk->sk_peek_off = val;
3019 	return 0;
3020 }
3021 EXPORT_SYMBOL_GPL(sk_set_peek_off);
3022 
3023 /*
3024  * Set of default routines for initialising struct proto_ops when
3025  * the protocol does not support a particular function. In certain
3026  * cases where it makes no sense for a protocol to have a "do nothing"
3027  * function, some default processing is provided.
3028  */
3029 
3030 int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
3031 {
3032 	return -EOPNOTSUPP;
3033 }
3034 EXPORT_SYMBOL(sock_no_bind);
3035 
3036 int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
3037 		    int len, int flags)
3038 {
3039 	return -EOPNOTSUPP;
3040 }
3041 EXPORT_SYMBOL(sock_no_connect);
3042 
3043 int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
3044 {
3045 	return -EOPNOTSUPP;
3046 }
3047 EXPORT_SYMBOL(sock_no_socketpair);
3048 
3049 int sock_no_accept(struct socket *sock, struct socket *newsock, int flags,
3050 		   bool kern)
3051 {
3052 	return -EOPNOTSUPP;
3053 }
3054 EXPORT_SYMBOL(sock_no_accept);
3055 
3056 int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
3057 		    int peer)
3058 {
3059 	return -EOPNOTSUPP;
3060 }
3061 EXPORT_SYMBOL(sock_no_getname);
3062 
3063 int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
3064 {
3065 	return -EOPNOTSUPP;
3066 }
3067 EXPORT_SYMBOL(sock_no_ioctl);
3068 
3069 int sock_no_listen(struct socket *sock, int backlog)
3070 {
3071 	return -EOPNOTSUPP;
3072 }
3073 EXPORT_SYMBOL(sock_no_listen);
3074 
3075 int sock_no_shutdown(struct socket *sock, int how)
3076 {
3077 	return -EOPNOTSUPP;
3078 }
3079 EXPORT_SYMBOL(sock_no_shutdown);
3080 
3081 int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
3082 {
3083 	return -EOPNOTSUPP;
3084 }
3085 EXPORT_SYMBOL(sock_no_sendmsg);
3086 
3087 int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len)
3088 {
3089 	return -EOPNOTSUPP;
3090 }
3091 EXPORT_SYMBOL(sock_no_sendmsg_locked);
3092 
3093 int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
3094 		    int flags)
3095 {
3096 	return -EOPNOTSUPP;
3097 }
3098 EXPORT_SYMBOL(sock_no_recvmsg);
3099 
3100 int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
3101 {
3102 	/* Mirror missing mmap method error code */
3103 	return -ENODEV;
3104 }
3105 EXPORT_SYMBOL(sock_no_mmap);
3106 
3107 /*
3108  * When a file is received (via SCM_RIGHTS, etc), we must bump the
3109  * various sock-based usage counts.
3110  */
3111 void __receive_sock(struct file *file)
3112 {
3113 	struct socket *sock;
3114 
3115 	sock = sock_from_file(file);
3116 	if (sock) {
3117 		sock_update_netprioidx(&sock->sk->sk_cgrp_data);
3118 		sock_update_classid(&sock->sk->sk_cgrp_data);
3119 	}
3120 }
3121 
3122 ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
3123 {
3124 	ssize_t res;
3125 	struct msghdr msg = {.msg_flags = flags};
3126 	struct kvec iov;
3127 	char *kaddr = kmap(page);
3128 	iov.iov_base = kaddr + offset;
3129 	iov.iov_len = size;
3130 	res = kernel_sendmsg(sock, &msg, &iov, 1, size);
3131 	kunmap(page);
3132 	return res;
3133 }
3134 EXPORT_SYMBOL(sock_no_sendpage);
3135 
3136 ssize_t sock_no_sendpage_locked(struct sock *sk, struct page *page,
3137 				int offset, size_t size, int flags)
3138 {
3139 	ssize_t res;
3140 	struct msghdr msg = {.msg_flags = flags};
3141 	struct kvec iov;
3142 	char *kaddr = kmap(page);
3143 
3144 	iov.iov_base = kaddr + offset;
3145 	iov.iov_len = size;
3146 	res = kernel_sendmsg_locked(sk, &msg, &iov, 1, size);
3147 	kunmap(page);
3148 	return res;
3149 }
3150 EXPORT_SYMBOL(sock_no_sendpage_locked);
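
/* Example (illustrative sketch): a family that only implements a subset of
 * operations fills the remaining proto_ops slots with the sock_no_*()
 * stubs above.  Every foo_* identifier is a hypothetical placeholder and
 * PF_INET merely stands in for the real family.
 */
static int foo_release(struct socket *sock)
{
	return 0;	/* a real handler would detach and release sock->sk */
}

static const struct proto_ops foo_dgram_ops = {
	.family	    = PF_INET,
	.owner	    = THIS_MODULE,
	.release    = foo_release,
	.poll	    = datagram_poll,
	/* everything this family does not support uses the default stubs */
	.bind	    = sock_no_bind,
	.connect    = sock_no_connect,
	.socketpair = sock_no_socketpair,
	.accept	    = sock_no_accept,
	.getname    = sock_no_getname,
	.ioctl	    = sock_no_ioctl,
	.listen	    = sock_no_listen,
	.shutdown   = sock_no_shutdown,
	.sendmsg    = sock_no_sendmsg,
	.recvmsg    = sock_no_recvmsg,
	.mmap	    = sock_no_mmap,
	.sendpage   = sock_no_sendpage,
};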
3151 
3152 /*
3153  *	Default Socket Callbacks
3154  */
3155 
3156 static void sock_def_wakeup(struct sock *sk)
3157 {
3158 	struct socket_wq *wq;
3159 
3160 	rcu_read_lock();
3161 	wq = rcu_dereference(sk->sk_wq);
3162 	if (skwq_has_sleeper(wq))
3163 		wake_up_interruptible_all(&wq->wait);
3164 	rcu_read_unlock();
3165 }
3166 
3167 static void sock_def_error_report(struct sock *sk)
3168 {
3169 	struct socket_wq *wq;
3170 
3171 	rcu_read_lock();
3172 	wq = rcu_dereference(sk->sk_wq);
3173 	if (skwq_has_sleeper(wq))
3174 		wake_up_interruptible_poll(&wq->wait, EPOLLERR);
3175 	sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
3176 	rcu_read_unlock();
3177 }
3178 
3179 void sock_def_readable(struct sock *sk)
3180 {
3181 	struct socket_wq *wq;
3182 
3183 	rcu_read_lock();
3184 	wq = rcu_dereference(sk->sk_wq);
3185 	if (skwq_has_sleeper(wq))
3186 		wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
3187 						EPOLLRDNORM | EPOLLRDBAND);
3188 	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
3189 	rcu_read_unlock();
3190 }
3191 
3192 static void sock_def_write_space(struct sock *sk)
3193 {
3194 	struct socket_wq *wq;
3195 
3196 	rcu_read_lock();
3197 
3198 	/* Do not wake up a writer until he can make "significant"
3199 	 * progress.  --DaveM
3200 	 */
3201 	if ((refcount_read(&sk->sk_wmem_alloc) << 1) <= READ_ONCE(sk->sk_sndbuf)) {
3202 		wq = rcu_dereference(sk->sk_wq);
3203 		if (skwq_has_sleeper(wq))
3204 			wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
3205 						EPOLLWRNORM | EPOLLWRBAND);
3206 
3207 		/* Should agree with poll, otherwise some programs break */
3208 		if (sock_writeable(sk))
3209 			sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
3210 	}
3211 
3212 	rcu_read_unlock();
3213 }
3214 
3215 static void sock_def_destruct(struct sock *sk)
3216 {
3217 }
3218 
3219 void sk_send_sigurg(struct sock *sk)
3220 {
3221 	if (sk->sk_socket && sk->sk_socket->file)
3222 		if (send_sigurg(&sk->sk_socket->file->f_owner))
3223 			sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
3224 }
3225 EXPORT_SYMBOL(sk_send_sigurg);
3226 
3227 void sk_reset_timer(struct sock *sk, struct timer_list *timer,
3228 		    unsigned long expires)
3229 {
3230 	if (!mod_timer(timer, expires))
3231 		sock_hold(sk);
3232 }
3233 EXPORT_SYMBOL(sk_reset_timer);
3234 
3235 void sk_stop_timer(struct sock *sk, struct timer_list *timer)
3236 {
3237 	if (del_timer(timer))
3238 		__sock_put(sk);
3239 }
3240 EXPORT_SYMBOL(sk_stop_timer);
3241 
3242 void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer)
3243 {
3244 	if (del_timer_sync(timer))
3245 		__sock_put(sk);
3246 }
3247 EXPORT_SYMBOL(sk_stop_timer_sync);
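
/* Example (illustrative sketch): the reference-counting contract of these
 * timer helpers.  Arming the timer pins the socket; the expiry handler (or
 * sk_stop_timer()) drops that reference.  The foo_* functions are
 * hypothetical and reuse sk->sk_timer for brevity.
 */
static void foo_timer_expired(struct timer_list *t)
{
	struct sock *sk = from_timer(sk, t, sk_timer);

	/* ... protocol work, usually under bh_lock_sock(sk) ... */
	sock_put(sk);		/* drop the reference sk_reset_timer() took */
}

static void foo_arm_timer(struct sock *sk, unsigned long delay)
{
	/* sock_hold() is taken only if the timer was not already pending */
	sk_reset_timer(sk, &sk->sk_timer, jiffies + delay);
}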
3248 
3249 void sock_init_data(struct socket *sock, struct sock *sk)
3250 {
3251 	sk_init_common(sk);
3252 	sk->sk_send_head	=	NULL;
3253 
3254 	timer_setup(&sk->sk_timer, NULL, 0);
3255 
3256 	sk->sk_allocation	=	GFP_KERNEL;
3257 	sk->sk_rcvbuf		=	sysctl_rmem_default;
3258 	sk->sk_sndbuf		=	sysctl_wmem_default;
3259 	sk->sk_state		=	TCP_CLOSE;
3260 	sk_set_socket(sk, sock);
3261 
3262 	sock_set_flag(sk, SOCK_ZAPPED);
3263 
3264 	if (sock) {
3265 		sk->sk_type	=	sock->type;
3266 		RCU_INIT_POINTER(sk->sk_wq, &sock->wq);
3267 		sock->sk	=	sk;
3268 		sk->sk_uid	=	SOCK_INODE(sock)->i_uid;
3269 	} else {
3270 		RCU_INIT_POINTER(sk->sk_wq, NULL);
3271 		sk->sk_uid	=	make_kuid(sock_net(sk)->user_ns, 0);
3272 	}
3273 
3274 	rwlock_init(&sk->sk_callback_lock);
3275 	if (sk->sk_kern_sock)
3276 		lockdep_set_class_and_name(
3277 			&sk->sk_callback_lock,
3278 			af_kern_callback_keys + sk->sk_family,
3279 			af_family_kern_clock_key_strings[sk->sk_family]);
3280 	else
3281 		lockdep_set_class_and_name(
3282 			&sk->sk_callback_lock,
3283 			af_callback_keys + sk->sk_family,
3284 			af_family_clock_key_strings[sk->sk_family]);
3285 
3286 	sk->sk_state_change	=	sock_def_wakeup;
3287 	sk->sk_data_ready	=	sock_def_readable;
3288 	sk->sk_write_space	=	sock_def_write_space;
3289 	sk->sk_error_report	=	sock_def_error_report;
3290 	sk->sk_destruct		=	sock_def_destruct;
3291 
3292 	sk->sk_frag.page	=	NULL;
3293 	sk->sk_frag.offset	=	0;
3294 	sk->sk_peek_off		=	-1;
3295 
3296 	sk->sk_peer_pid 	=	NULL;
3297 	sk->sk_peer_cred	=	NULL;
3298 	spin_lock_init(&sk->sk_peer_lock);
3299 
3300 	sk->sk_write_pending	=	0;
3301 	sk->sk_rcvlowat		=	1;
3302 	sk->sk_rcvtimeo		=	MAX_SCHEDULE_TIMEOUT;
3303 	sk->sk_sndtimeo		=	MAX_SCHEDULE_TIMEOUT;
3304 
3305 	sk->sk_stamp = SK_DEFAULT_STAMP;
3306 #if BITS_PER_LONG==32
3307 	seqlock_init(&sk->sk_stamp_seq);
3308 #endif
3309 	atomic_set(&sk->sk_zckey, 0);
3310 
3311 #ifdef CONFIG_NET_RX_BUSY_POLL
3312 	sk->sk_napi_id		=	0;
3313 	sk->sk_ll_usec		=	sysctl_net_busy_read;
3314 #endif
3315 
3316 	sk->sk_max_pacing_rate = ~0UL;
3317 	sk->sk_pacing_rate = ~0UL;
3318 	WRITE_ONCE(sk->sk_pacing_shift, 10);
3319 	sk->sk_incoming_cpu = -1;
3320 	sk->sk_txrehash = SOCK_TXREHASH_DEFAULT;
3321 
3322 	sk_rx_queue_clear(sk);
3323 	/*
3324 	 * Before updating sk_refcnt, we must commit prior changes to memory
3325 	 * (Documentation/RCU/rculist_nulls.rst for details)
3326 	 */
3327 	smp_wmb();
3328 	refcount_set(&sk->sk_refcnt, 1);
3329 	atomic_set(&sk->sk_drops, 0);
3330 }
3331 EXPORT_SYMBOL(sock_init_data);
3332 
3333 void lock_sock_nested(struct sock *sk, int subclass)
3334 {
3335 	/* The sk_lock has mutex_lock() semantics here. */
3336 	mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
3337 
3338 	might_sleep();
3339 	spin_lock_bh(&sk->sk_lock.slock);
3340 	if (sock_owned_by_user_nocheck(sk))
3341 		__lock_sock(sk);
3342 	sk->sk_lock.owned = 1;
3343 	spin_unlock_bh(&sk->sk_lock.slock);
3344 }
3345 EXPORT_SYMBOL(lock_sock_nested);
3346 
3347 void release_sock(struct sock *sk)
3348 {
3349 	spin_lock_bh(&sk->sk_lock.slock);
3350 	if (sk->sk_backlog.tail)
3351 		__release_sock(sk);
3352 
3353 	/* Warning: release_cb() might need to release sk ownership,
3354 	 * i.e. call sock_release_ownership(sk) before us.
3355 	 */
3356 	if (sk->sk_prot->release_cb)
3357 		sk->sk_prot->release_cb(sk);
3358 
3359 	sock_release_ownership(sk);
3360 	if (waitqueue_active(&sk->sk_lock.wq))
3361 		wake_up(&sk->sk_lock.wq);
3362 	spin_unlock_bh(&sk->sk_lock.slock);
3363 }
3364 EXPORT_SYMBOL(release_sock);
3365 
3366 bool __lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock)
3367 {
3368 	might_sleep();
3369 	spin_lock_bh(&sk->sk_lock.slock);
3370 
3371 	if (!sock_owned_by_user_nocheck(sk)) {
3372 		/*
3373 		 * Fast path return with bottom halves disabled and
3374 		 * sock::sk_lock.slock held.
3375 		 *
3376 		 * The 'mutex' is not contended and holding
3377 		 * sock::sk_lock.slock prevents all other lockers from
3378 		 * proceeding, so the corresponding unlock_sock_fast() can
3379 		 * avoid the slow path of release_sock() completely and
3380 		 * just release slock.
3381 		 *
3382 		 * From a semantic POV this is equivalent to 'acquiring'
3383 		 * the 'mutex', hence the corresponding lockdep
3384 		 * mutex_release() has to happen in the fast path of
3385 		 * unlock_sock_fast().
3386 		 */
3387 		return false;
3388 	}
3389 
3390 	__lock_sock(sk);
3391 	sk->sk_lock.owned = 1;
3392 	__acquire(&sk->sk_lock.slock);
3393 	spin_unlock_bh(&sk->sk_lock.slock);
3394 	return true;
3395 }
3396 EXPORT_SYMBOL(__lock_sock_fast);
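
/* Example (illustrative sketch): the fast-lock pattern this helper enables
 * for short critical sections, via the lock_sock_fast()/unlock_sock_fast()
 * wrappers in include/net/sock.h.  foo_peek_state() is a hypothetical
 * placeholder.
 */
static int foo_peek_state(struct sock *sk)
{
	bool slow = lock_sock_fast(sk);
	int state = sk->sk_state;

	unlock_sock_fast(sk, slow);
	return state;
}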
3397 
3398 int sock_gettstamp(struct socket *sock, void __user *userstamp,
3399 		   bool timeval, bool time32)
3400 {
3401 	struct sock *sk = sock->sk;
3402 	struct timespec64 ts;
3403 
3404 	sock_enable_timestamp(sk, SOCK_TIMESTAMP);
3405 	ts = ktime_to_timespec64(sock_read_timestamp(sk));
3406 	if (ts.tv_sec == -1)
3407 		return -ENOENT;
3408 	if (ts.tv_sec == 0) {
3409 		ktime_t kt = ktime_get_real();
3410 		sock_write_timestamp(sk, kt);
3411 		ts = ktime_to_timespec64(kt);
3412 	}
3413 
3414 	if (timeval)
3415 		ts.tv_nsec /= 1000;
3416 
3417 #ifdef CONFIG_COMPAT_32BIT_TIME
3418 	if (time32)
3419 		return put_old_timespec32(&ts, userstamp);
3420 #endif
3421 #ifdef CONFIG_SPARC64
3422 	/* beware of padding in sparc64 timeval */
3423 	if (timeval && !in_compat_syscall()) {
3424 		struct __kernel_old_timeval __user tv = {
3425 			.tv_sec = ts.tv_sec,
3426 			.tv_usec = ts.tv_nsec,
3427 		};
3428 		if (copy_to_user(userstamp, &tv, sizeof(tv)))
3429 			return -EFAULT;
3430 		return 0;
3431 	}
3432 #endif
3433 	return put_timespec64(&ts, userstamp);
3434 }
3435 EXPORT_SYMBOL(sock_gettstamp);
3436 
3437 void sock_enable_timestamp(struct sock *sk, enum sock_flags flag)
3438 {
3439 	if (!sock_flag(sk, flag)) {
3440 		unsigned long previous_flags = sk->sk_flags;
3441 
3442 		sock_set_flag(sk, flag);
3443 		/*
3444 		 * we just set one of the two flags which require net
3445 		 * time stamping, but time stamping might have been on
3446 		 * already because of the other one
3447 		 */
3448 		if (sock_needs_netstamp(sk) &&
3449 		    !(previous_flags & SK_FLAGS_TIMESTAMP))
3450 			net_enable_timestamp();
3451 	}
3452 }
3453 
3454 int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
3455 		       int level, int type)
3456 {
3457 	struct sock_exterr_skb *serr;
3458 	struct sk_buff *skb;
3459 	int copied, err;
3460 
3461 	err = -EAGAIN;
3462 	skb = sock_dequeue_err_skb(sk);
3463 	if (skb == NULL)
3464 		goto out;
3465 
3466 	copied = skb->len;
3467 	if (copied > len) {
3468 		msg->msg_flags |= MSG_TRUNC;
3469 		copied = len;
3470 	}
3471 	err = skb_copy_datagram_msg(skb, 0, msg, copied);
3472 	if (err)
3473 		goto out_free_skb;
3474 
3475 	sock_recv_timestamp(msg, sk, skb);
3476 
3477 	serr = SKB_EXT_ERR(skb);
3478 	put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
3479 
3480 	msg->msg_flags |= MSG_ERRQUEUE;
3481 	err = copied;
3482 
3483 out_free_skb:
3484 	kfree_skb(skb);
3485 out:
3486 	return err;
3487 }
3488 EXPORT_SYMBOL(sock_recv_errqueue);
3489 
3490 /*
3491  *	Get a socket option on a socket.
3492  *
3493  *	FIX: POSIX 1003.1g is very ambiguous here. It states that
3494  *	asynchronous errors should be reported by getsockopt. We assume
3495  *	this means if you specify SO_ERROR (otherwise what's the point of it).
3496  */
3497 int sock_common_getsockopt(struct socket *sock, int level, int optname,
3498 			   char __user *optval, int __user *optlen)
3499 {
3500 	struct sock *sk = sock->sk;
3501 
3502 	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
3503 }
3504 EXPORT_SYMBOL(sock_common_getsockopt);
3505 
3506 int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
3507 			int flags)
3508 {
3509 	struct sock *sk = sock->sk;
3510 	int addr_len = 0;
3511 	int err;
3512 
3513 	err = sk->sk_prot->recvmsg(sk, msg, size, flags, &addr_len);
3514 	if (err >= 0)
3515 		msg->msg_namelen = addr_len;
3516 	return err;
3517 }
3518 EXPORT_SYMBOL(sock_common_recvmsg);
3519 
3520 /*
3521  *	Set socket options on an inet socket.
3522  */
3523 int sock_common_setsockopt(struct socket *sock, int level, int optname,
3524 			   sockptr_t optval, unsigned int optlen)
3525 {
3526 	struct sock *sk = sock->sk;
3527 
3528 	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
3529 }
3530 EXPORT_SYMBOL(sock_common_setsockopt);
3531 
3532 void sk_common_release(struct sock *sk)
3533 {
3534 	if (sk->sk_prot->destroy)
3535 		sk->sk_prot->destroy(sk);
3536 
3537 	/*
3538 	 * Observation: when sk_common_release is called, processes have
3539 	 * no access to the socket, but the network stack still does.
3540 	 * Step one, detach it from networking:
3541 	 *
3542 	 * A. Remove from hash tables.
3543 	 */
3544 
3545 	sk->sk_prot->unhash(sk);
3546 
3547 	/*
3548 	 * At this point the socket cannot receive new packets, but it is
3549 	 * possible that some packets are in flight, because some CPU runs the
3550 	 * receiver and did the hash table lookup before we unhashed the socket.
3551 	 * They will reach the receive queue and be purged by the socket destructor.
3552 	 *
3553 	 * Also, we still have packets pending on the receive queue and, probably,
3554 	 * our own packets waiting in device queues. sock_destroy will drain the
3555 	 * receive queue, but transmitted packets will delay socket destruction
3556 	 * until the last reference is released.
3557 	 */
3558 
3559 	sock_orphan(sk);
3560 
3561 	xfrm_sk_free_policy(sk);
3562 
3563 	sk_refcnt_debug_release(sk);
3564 
3565 	sock_put(sk);
3566 }
3567 EXPORT_SYMBOL(sk_common_release);
3568 
3569 void sk_get_meminfo(const struct sock *sk, u32 *mem)
3570 {
3571 	memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);
3572 
3573 	mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
3574 	mem[SK_MEMINFO_RCVBUF] = READ_ONCE(sk->sk_rcvbuf);
3575 	mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
3576 	mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf);
3577 	mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc;
3578 	mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued);
3579 	mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
3580 	mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len);
3581 	mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
3582 }
3583 
3584 #ifdef CONFIG_PROC_FS
3585 static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
3586 
3587 int sock_prot_inuse_get(struct net *net, struct proto *prot)
3588 {
3589 	int cpu, idx = prot->inuse_idx;
3590 	int res = 0;
3591 
3592 	for_each_possible_cpu(cpu)
3593 		res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx];
3594 
3595 	return res >= 0 ? res : 0;
3596 }
3597 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
3598 
3599 int sock_inuse_get(struct net *net)
3600 {
3601 	int cpu, res = 0;
3602 
3603 	for_each_possible_cpu(cpu)
3604 		res += per_cpu_ptr(net->core.prot_inuse, cpu)->all;
3605 
3606 	return res;
3607 }
3608 
3609 EXPORT_SYMBOL_GPL(sock_inuse_get);
3610 
3611 static int __net_init sock_inuse_init_net(struct net *net)
3612 {
3613 	net->core.prot_inuse = alloc_percpu(struct prot_inuse);
3614 	if (net->core.prot_inuse == NULL)
3615 		return -ENOMEM;
3616 	return 0;
3617 }
3618 
3619 static void __net_exit sock_inuse_exit_net(struct net *net)
3620 {
3621 	free_percpu(net->core.prot_inuse);
3622 }
3623 
3624 static struct pernet_operations net_inuse_ops = {
3625 	.init = sock_inuse_init_net,
3626 	.exit = sock_inuse_exit_net,
3627 };
3628 
3629 static __init int net_inuse_init(void)
3630 {
3631 	if (register_pernet_subsys(&net_inuse_ops))
3632 		panic("Cannot initialize net inuse counters");
3633 
3634 	return 0;
3635 }
3636 
3637 core_initcall(net_inuse_init);
3638 
3639 static int assign_proto_idx(struct proto *prot)
3640 {
3641 	prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
3642 
3643 	if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
3644 		pr_err("PROTO_INUSE_NR exhausted\n");
3645 		return -ENOSPC;
3646 	}
3647 
3648 	set_bit(prot->inuse_idx, proto_inuse_idx);
3649 	return 0;
3650 }
3651 
3652 static void release_proto_idx(struct proto *prot)
3653 {
3654 	if (prot->inuse_idx != PROTO_INUSE_NR - 1)
3655 		clear_bit(prot->inuse_idx, proto_inuse_idx);
3656 }
3657 #else
3658 static inline int assign_proto_idx(struct proto *prot)
3659 {
3660 	return 0;
3661 }
3662 
3663 static inline void release_proto_idx(struct proto *prot)
3664 {
3665 }
3666 
3667 #endif
3668 
3669 static void tw_prot_cleanup(struct timewait_sock_ops *twsk_prot)
3670 {
3671 	if (!twsk_prot)
3672 		return;
3673 	kfree(twsk_prot->twsk_slab_name);
3674 	twsk_prot->twsk_slab_name = NULL;
3675 	kmem_cache_destroy(twsk_prot->twsk_slab);
3676 	twsk_prot->twsk_slab = NULL;
3677 }
3678 
3679 static int tw_prot_init(const struct proto *prot)
3680 {
3681 	struct timewait_sock_ops *twsk_prot = prot->twsk_prot;
3682 
3683 	if (!twsk_prot)
3684 		return 0;
3685 
3686 	twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s",
3687 					      prot->name);
3688 	if (!twsk_prot->twsk_slab_name)
3689 		return -ENOMEM;
3690 
3691 	twsk_prot->twsk_slab =
3692 		kmem_cache_create(twsk_prot->twsk_slab_name,
3693 				  twsk_prot->twsk_obj_size, 0,
3694 				  SLAB_ACCOUNT | prot->slab_flags,
3695 				  NULL);
3696 	if (!twsk_prot->twsk_slab) {
3697 		pr_crit("%s: Can't create timewait sock SLAB cache!\n",
3698 			prot->name);
3699 		return -ENOMEM;
3700 	}
3701 
3702 	return 0;
3703 }
3704 
3705 static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
3706 {
3707 	if (!rsk_prot)
3708 		return;
3709 	kfree(rsk_prot->slab_name);
3710 	rsk_prot->slab_name = NULL;
3711 	kmem_cache_destroy(rsk_prot->slab);
3712 	rsk_prot->slab = NULL;
3713 }
3714 
3715 static int req_prot_init(const struct proto *prot)
3716 {
3717 	struct request_sock_ops *rsk_prot = prot->rsk_prot;
3718 
3719 	if (!rsk_prot)
3720 		return 0;
3721 
3722 	rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
3723 					prot->name);
3724 	if (!rsk_prot->slab_name)
3725 		return -ENOMEM;
3726 
3727 	rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
3728 					   rsk_prot->obj_size, 0,
3729 					   SLAB_ACCOUNT | prot->slab_flags,
3730 					   NULL);
3731 
3732 	if (!rsk_prot->slab) {
3733 		pr_crit("%s: Can't create request sock SLAB cache!\n",
3734 			prot->name);
3735 		return -ENOMEM;
3736 	}
3737 	return 0;
3738 }
3739 
3740 int proto_register(struct proto *prot, int alloc_slab)
3741 {
3742 	int ret = -ENOBUFS;
3743 
3744 	if (prot->memory_allocated && !prot->sysctl_mem) {
3745 		pr_err("%s: missing sysctl_mem\n", prot->name);
3746 		return -EINVAL;
3747 	}
3748 	if (alloc_slab) {
3749 		prot->slab = kmem_cache_create_usercopy(prot->name,
3750 					prot->obj_size, 0,
3751 					SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT |
3752 					prot->slab_flags,
3753 					prot->useroffset, prot->usersize,
3754 					NULL);
3755 
3756 		if (prot->slab == NULL) {
3757 			pr_crit("%s: Can't create sock SLAB cache!\n",
3758 				prot->name);
3759 			goto out;
3760 		}
3761 
3762 		if (req_prot_init(prot))
3763 			goto out_free_request_sock_slab;
3764 
3765 		if (tw_prot_init(prot))
3766 			goto out_free_timewait_sock_slab;
3767 	}
3768 
3769 	mutex_lock(&proto_list_mutex);
3770 	ret = assign_proto_idx(prot);
3771 	if (ret) {
3772 		mutex_unlock(&proto_list_mutex);
3773 		goto out_free_timewait_sock_slab;
3774 	}
3775 	list_add(&prot->node, &proto_list);
3776 	mutex_unlock(&proto_list_mutex);
3777 	return ret;
3778 
3779 out_free_timewait_sock_slab:
3780 	if (alloc_slab)
3781 		tw_prot_cleanup(prot->twsk_prot);
3782 out_free_request_sock_slab:
3783 	if (alloc_slab) {
3784 		req_prot_cleanup(prot->rsk_prot);
3785 
3786 		kmem_cache_destroy(prot->slab);
3787 		prot->slab = NULL;
3788 	}
3789 out:
3790 	return ret;
3791 }
3792 EXPORT_SYMBOL(proto_register);
3793 
3794 void proto_unregister(struct proto *prot)
3795 {
3796 	mutex_lock(&proto_list_mutex);
3797 	release_proto_idx(prot);
3798 	list_del(&prot->node);
3799 	mutex_unlock(&proto_list_mutex);
3800 
3801 	kmem_cache_destroy(prot->slab);
3802 	prot->slab = NULL;
3803 
3804 	req_prot_cleanup(prot->rsk_prot);
3805 	tw_prot_cleanup(prot->twsk_prot);
3806 }
3807 EXPORT_SYMBOL(proto_unregister);
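
/* Example (illustrative sketch): a module registering and unregistering its
 * struct proto.  The foo_* names are hypothetical placeholders; obj_size
 * would normally be sizeof() the family's own sock-derived structure.
 */
static struct proto foo_proto = {
	.name	  = "FOO",
	.owner	  = THIS_MODULE,
	.obj_size = sizeof(struct sock),
};

static int __init foo_init(void)
{
	return proto_register(&foo_proto, 1);	/* 1: allocate a kmem_cache */
}

static void __exit foo_exit(void)
{
	proto_unregister(&foo_proto);
}

module_init(foo_init);
module_exit(foo_exit);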
3808 
3809 int sock_load_diag_module(int family, int protocol)
3810 {
3811 	if (!protocol) {
3812 		if (!sock_is_registered(family))
3813 			return -ENOENT;
3814 
3815 		return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
3816 				      NETLINK_SOCK_DIAG, family);
3817 	}
3818 
3819 #ifdef CONFIG_INET
3820 	if (family == AF_INET &&
3821 	    protocol != IPPROTO_RAW &&
3822 	    protocol < MAX_INET_PROTOS &&
3823 	    !rcu_access_pointer(inet_protos[protocol]))
3824 		return -ENOENT;
3825 #endif
3826 
3827 	return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK,
3828 			      NETLINK_SOCK_DIAG, family, protocol);
3829 }
3830 EXPORT_SYMBOL(sock_load_diag_module);
3831 
3832 #ifdef CONFIG_PROC_FS
3833 static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
3834 	__acquires(proto_list_mutex)
3835 {
3836 	mutex_lock(&proto_list_mutex);
3837 	return seq_list_start_head(&proto_list, *pos);
3838 }
3839 
3840 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3841 {
3842 	return seq_list_next(v, &proto_list, pos);
3843 }
3844 
3845 static void proto_seq_stop(struct seq_file *seq, void *v)
3846 	__releases(proto_list_mutex)
3847 {
3848 	mutex_unlock(&proto_list_mutex);
3849 }
3850 
3851 static char proto_method_implemented(const void *method)
3852 {
3853 	return method == NULL ? 'n' : 'y';
3854 }
3855 static long sock_prot_memory_allocated(struct proto *proto)
3856 {
3857 	return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
3858 }
3859 
3860 static const char *sock_prot_memory_pressure(struct proto *proto)
3861 {
3862 	return proto->memory_pressure != NULL ?
3863 	proto_memory_pressure(proto) ? "yes" : "no" : "NI";
3864 }
3865 
3866 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
3867 {
3868 
3869 	seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
3870 			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
3871 		   proto->name,
3872 		   proto->obj_size,
3873 		   sock_prot_inuse_get(seq_file_net(seq), proto),
3874 		   sock_prot_memory_allocated(proto),
3875 		   sock_prot_memory_pressure(proto),
3876 		   proto->max_header,
3877 		   proto->slab == NULL ? "no" : "yes",
3878 		   module_name(proto->owner),
3879 		   proto_method_implemented(proto->close),
3880 		   proto_method_implemented(proto->connect),
3881 		   proto_method_implemented(proto->disconnect),
3882 		   proto_method_implemented(proto->accept),
3883 		   proto_method_implemented(proto->ioctl),
3884 		   proto_method_implemented(proto->init),
3885 		   proto_method_implemented(proto->destroy),
3886 		   proto_method_implemented(proto->shutdown),
3887 		   proto_method_implemented(proto->setsockopt),
3888 		   proto_method_implemented(proto->getsockopt),
3889 		   proto_method_implemented(proto->sendmsg),
3890 		   proto_method_implemented(proto->recvmsg),
3891 		   proto_method_implemented(proto->sendpage),
3892 		   proto_method_implemented(proto->bind),
3893 		   proto_method_implemented(proto->backlog_rcv),
3894 		   proto_method_implemented(proto->hash),
3895 		   proto_method_implemented(proto->unhash),
3896 		   proto_method_implemented(proto->get_port),
3897 		   proto_method_implemented(proto->enter_memory_pressure));
3898 }
3899 
3900 static int proto_seq_show(struct seq_file *seq, void *v)
3901 {
3902 	if (v == &proto_list)
3903 		seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
3904 			   "protocol",
3905 			   "size",
3906 			   "sockets",
3907 			   "memory",
3908 			   "press",
3909 			   "maxhdr",
3910 			   "slab",
3911 			   "module",
3912 			   "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
3913 	else
3914 		proto_seq_printf(seq, list_entry(v, struct proto, node));
3915 	return 0;
3916 }
3917 
3918 static const struct seq_operations proto_seq_ops = {
3919 	.start  = proto_seq_start,
3920 	.next   = proto_seq_next,
3921 	.stop   = proto_seq_stop,
3922 	.show   = proto_seq_show,
3923 };
3924 
3925 static __net_init int proto_init_net(struct net *net)
3926 {
3927 	if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops,
3928 			sizeof(struct seq_net_private)))
3929 		return -ENOMEM;
3930 
3931 	return 0;
3932 }
3933 
3934 static __net_exit void proto_exit_net(struct net *net)
3935 {
3936 	remove_proc_entry("protocols", net->proc_net);
3937 }
3938 
3939 
3940 static __net_initdata struct pernet_operations proto_net_ops = {
3941 	.init = proto_init_net,
3942 	.exit = proto_exit_net,
3943 };
3944 
3945 static int __init proto_init(void)
3946 {
3947 	return register_pernet_subsys(&proto_net_ops);
3948 }
3949 
3950 subsys_initcall(proto_init);
3951 
3952 #endif /* PROC_FS */
3953 
3954 #ifdef CONFIG_NET_RX_BUSY_POLL
3955 bool sk_busy_loop_end(void *p, unsigned long start_time)
3956 {
3957 	struct sock *sk = p;
3958 
3959 	return !skb_queue_empty_lockless(&sk->sk_receive_queue) ||
3960 	       sk_busy_loop_timeout(sk, start_time);
3961 }
3962 EXPORT_SYMBOL(sk_busy_loop_end);
3963 #endif /* CONFIG_NET_RX_BUSY_POLL */
3964 
3965 int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len)
3966 {
3967 	if (!sk->sk_prot->bind_add)
3968 		return -EOPNOTSUPP;
3969 	return sk->sk_prot->bind_add(sk, addr, addr_len);
3970 }
3971 EXPORT_SYMBOL(sock_bind_add);
3972