xref: /openbmc/linux/net/core/sock.c (revision ca5999fd)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * INET		An implementation of the TCP/IP protocol suite for the LINUX
4  *		operating system.  INET is implemented using the  BSD Socket
5  *		interface as the means of communication with the user level.
6  *
7  *		Generic socket support routines. Memory allocators, socket lock/release
8  *		handler for protocols to use and generic option handler.
9  *
10  * Authors:	Ross Biro
11  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *		Florian La Roche, <flla@stud.uni-sb.de>
13  *		Alan Cox, <A.Cox@swansea.ac.uk>
14  *
15  * Fixes:
16  *		Alan Cox	: 	Numerous verify_area() problems
17  *		Alan Cox	:	Connecting on a connecting socket
18  *					now returns an error for tcp.
19  *		Alan Cox	:	sock->protocol is set correctly.
20  *					and is not sometimes left as 0.
21  *		Alan Cox	:	connect handles icmp errors on a
22  *					connect properly. Unfortunately there
23  *					is a restart syscall nasty there. I
24  *					can't match BSD without hacking the C
25  *					library. Ideas urgently sought!
26  *		Alan Cox	:	Disallow bind() to addresses that are
27  *					not ours - especially broadcast ones!!
28  *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
29  *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
30  *					instead they leave that for the DESTROY timer.
31  *		Alan Cox	:	Clean up error flag in accept
32  *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
33  *					was buggy. Put a remove_sock() in the handler
34  *					for memory when we hit 0. Also altered the timer
35  *					code. The ACK stuff can wait and needs major
36  *					TCP layer surgery.
37  *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
38  *					and fixed timer/inet_bh race.
39  *		Alan Cox	:	Added zapped flag for TCP
40  *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
41  *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
42  *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
43  *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
44  *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
45  *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
46  *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
47  *	Pauline Middelink	:	identd support
48  *		Alan Cox	:	Fixed connect() taking signals I think.
49  *		Alan Cox	:	SO_LINGER supported
50  *		Alan Cox	:	Error reporting fixes
51  *		Anonymous	:	inet_create tidied up (sk->reuse setting)
52  *		Alan Cox	:	inet sockets don't set sk->type!
53  *		Alan Cox	:	Split socket option code
54  *		Alan Cox	:	Callbacks
55  *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
56  *		Alex		:	Removed restriction on inet fioctl
57  *		Alan Cox	:	Splitting INET from NET core
58  *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
59  *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
60  *		Alan Cox	:	Split IP from generic code
61  *		Alan Cox	:	New kfree_skbmem()
62  *		Alan Cox	:	Make SO_DEBUG superuser only.
63  *		Alan Cox	:	Allow anyone to clear SO_DEBUG
64  *					(compatibility fix)
65  *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
66  *		Alan Cox	:	Allocator for a socket is settable.
67  *		Alan Cox	:	SO_ERROR includes soft errors.
68  *		Alan Cox	:	Allow NULL arguments on some SO_ opts
69  *		Alan Cox	: 	Generic socket allocation to make hooks
70  *					easier (suggested by Craig Metz).
71  *		Michael Pall	:	SO_ERROR returns positive errno again
72  *              Steve Whitehouse:       Added default destructor to free
73  *                                      protocol private data.
74  *              Steve Whitehouse:       Added various other default routines
75  *                                      common to several socket families.
76  *              Chris Evans     :       Call suser() check last on F_SETOWN
77  *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
78  *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
79  *		Andi Kleen	:	Fix write_space callback
80  *		Chris Evans	:	Security fixes - signedness again
81  *		Arnaldo C. Melo :       cleanups, use skb_queue_purge
82  *
83  * To Fix:
84  */
85 
86 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
87 
88 #include <asm/unaligned.h>
89 #include <linux/capability.h>
90 #include <linux/errno.h>
91 #include <linux/errqueue.h>
92 #include <linux/types.h>
93 #include <linux/socket.h>
94 #include <linux/in.h>
95 #include <linux/kernel.h>
96 #include <linux/module.h>
97 #include <linux/proc_fs.h>
98 #include <linux/seq_file.h>
99 #include <linux/sched.h>
100 #include <linux/sched/mm.h>
101 #include <linux/timer.h>
102 #include <linux/string.h>
103 #include <linux/sockios.h>
104 #include <linux/net.h>
105 #include <linux/mm.h>
106 #include <linux/slab.h>
107 #include <linux/interrupt.h>
108 #include <linux/poll.h>
109 #include <linux/tcp.h>
110 #include <linux/init.h>
111 #include <linux/highmem.h>
112 #include <linux/user_namespace.h>
113 #include <linux/static_key.h>
114 #include <linux/memcontrol.h>
115 #include <linux/prefetch.h>
116 
117 #include <linux/uaccess.h>
118 
119 #include <linux/netdevice.h>
120 #include <net/protocol.h>
121 #include <linux/skbuff.h>
122 #include <net/net_namespace.h>
123 #include <net/request_sock.h>
124 #include <net/sock.h>
125 #include <linux/net_tstamp.h>
126 #include <net/xfrm.h>
127 #include <linux/ipsec.h>
128 #include <net/cls_cgroup.h>
129 #include <net/netprio_cgroup.h>
130 #include <linux/sock_diag.h>
131 
132 #include <linux/filter.h>
133 #include <net/sock_reuseport.h>
134 #include <net/bpf_sk_storage.h>
135 
136 #include <trace/events/sock.h>
137 
138 #include <net/tcp.h>
139 #include <net/busy_poll.h>
140 
141 static DEFINE_MUTEX(proto_list_mutex);
142 static LIST_HEAD(proto_list);
143 
144 static void sock_inuse_add(struct net *net, int val);
145 
146 /**
147  * sk_ns_capable - General socket capability test
148  * @sk: Socket to use a capability on or through
149  * @user_ns: The user namespace of the capability to use
150  * @cap: The capability to use
151  *
152  * Test to see if the opener of the socket had when the socket was
153  * created and the current process has the capability @cap in the user
154  * namespace @user_ns.
155  */
156 bool sk_ns_capable(const struct sock *sk,
157 		   struct user_namespace *user_ns, int cap)
158 {
159 	return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
160 		ns_capable(user_ns, cap);
161 }
162 EXPORT_SYMBOL(sk_ns_capable);
163 
164 /**
165  * sk_capable - Socket global capability test
166  * @sk: Socket to use a capability on or through
167  * @cap: The global capability to use
168  *
169  * Test to see if the opener of the socket had when the socket was
170  * created and the current process has the capability @cap in all user
171  * namespaces.
172  */
173 bool sk_capable(const struct sock *sk, int cap)
174 {
175 	return sk_ns_capable(sk, &init_user_ns, cap);
176 }
177 EXPORT_SYMBOL(sk_capable);
178 
179 /**
180  * sk_net_capable - Network namespace socket capability test
181  * @sk: Socket to use a capability on or through
182  * @cap: The capability to use
183  *
184  * Test to see if the opener of the socket had when the socket was created
185  * and the current process has the capability @cap over the network namespace
186  * the socket is a member of.
187  */
188 bool sk_net_capable(const struct sock *sk, int cap)
189 {
190 	return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
191 }
192 EXPORT_SYMBOL(sk_net_capable);
193 
194 /*
195  * Each address family might have different locking rules, so we have
196  * one slock key per address family and separate keys for internal and
197  * userspace sockets.
198  */
199 static struct lock_class_key af_family_keys[AF_MAX];
200 static struct lock_class_key af_family_kern_keys[AF_MAX];
201 static struct lock_class_key af_family_slock_keys[AF_MAX];
202 static struct lock_class_key af_family_kern_slock_keys[AF_MAX];
203 
204 /*
205  * Make lock validator output more readable. (we pre-construct these
206  * strings build-time, so that runtime initialization of socket
207  * locks is fast):
208  */
209 
210 #define _sock_locks(x)						  \
211   x "AF_UNSPEC",	x "AF_UNIX"     ,	x "AF_INET"     , \
212   x "AF_AX25"  ,	x "AF_IPX"      ,	x "AF_APPLETALK", \
213   x "AF_NETROM",	x "AF_BRIDGE"   ,	x "AF_ATMPVC"   , \
214   x "AF_X25"   ,	x "AF_INET6"    ,	x "AF_ROSE"     , \
215   x "AF_DECnet",	x "AF_NETBEUI"  ,	x "AF_SECURITY" , \
216   x "AF_KEY"   ,	x "AF_NETLINK"  ,	x "AF_PACKET"   , \
217   x "AF_ASH"   ,	x "AF_ECONET"   ,	x "AF_ATMSVC"   , \
218   x "AF_RDS"   ,	x "AF_SNA"      ,	x "AF_IRDA"     , \
219   x "AF_PPPOX" ,	x "AF_WANPIPE"  ,	x "AF_LLC"      , \
220   x "27"       ,	x "28"          ,	x "AF_CAN"      , \
221   x "AF_TIPC"  ,	x "AF_BLUETOOTH",	x "IUCV"        , \
222   x "AF_RXRPC" ,	x "AF_ISDN"     ,	x "AF_PHONET"   , \
223   x "AF_IEEE802154",	x "AF_CAIF"	,	x "AF_ALG"      , \
224   x "AF_NFC"   ,	x "AF_VSOCK"    ,	x "AF_KCM"      , \
225   x "AF_QIPCRTR",	x "AF_SMC"	,	x "AF_XDP"	, \
226   x "AF_MAX"
227 
228 static const char *const af_family_key_strings[AF_MAX+1] = {
229 	_sock_locks("sk_lock-")
230 };
231 static const char *const af_family_slock_key_strings[AF_MAX+1] = {
232 	_sock_locks("slock-")
233 };
234 static const char *const af_family_clock_key_strings[AF_MAX+1] = {
235 	_sock_locks("clock-")
236 };
237 
238 static const char *const af_family_kern_key_strings[AF_MAX+1] = {
239 	_sock_locks("k-sk_lock-")
240 };
241 static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
242 	_sock_locks("k-slock-")
243 };
244 static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
245 	_sock_locks("k-clock-")
246 };
247 static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
248 	_sock_locks("rlock-")
249 };
250 static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
251 	_sock_locks("wlock-")
252 };
253 static const char *const af_family_elock_key_strings[AF_MAX+1] = {
254 	_sock_locks("elock-")
255 };
256 
257 /*
258  * sk_callback_lock and sk queues locking rules are per-address-family,
259  * so split the lock classes by using a per-AF key:
260  */
261 static struct lock_class_key af_callback_keys[AF_MAX];
262 static struct lock_class_key af_rlock_keys[AF_MAX];
263 static struct lock_class_key af_wlock_keys[AF_MAX];
264 static struct lock_class_key af_elock_keys[AF_MAX];
265 static struct lock_class_key af_kern_callback_keys[AF_MAX];
266 
267 /* Run time adjustable parameters. */
268 __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
269 EXPORT_SYMBOL(sysctl_wmem_max);
270 __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
271 EXPORT_SYMBOL(sysctl_rmem_max);
272 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
273 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
274 
275 /* Maximal space eaten by iovec or ancillary data plus some space */
276 int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
277 EXPORT_SYMBOL(sysctl_optmem_max);
278 
279 int sysctl_tstamp_allow_data __read_mostly = 1;
280 
281 DEFINE_STATIC_KEY_FALSE(memalloc_socks_key);
282 EXPORT_SYMBOL_GPL(memalloc_socks_key);
283 
284 /**
285  * sk_set_memalloc - sets %SOCK_MEMALLOC
286  * @sk: socket to set it on
287  *
288  * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
289  * It's the responsibility of the admin to adjust min_free_kbytes
290  * to meet the requirements
291  */
292 void sk_set_memalloc(struct sock *sk)
293 {
294 	sock_set_flag(sk, SOCK_MEMALLOC);
295 	sk->sk_allocation |= __GFP_MEMALLOC;
296 	static_branch_inc(&memalloc_socks_key);
297 }
298 EXPORT_SYMBOL_GPL(sk_set_memalloc);
299 
300 void sk_clear_memalloc(struct sock *sk)
301 {
302 	sock_reset_flag(sk, SOCK_MEMALLOC);
303 	sk->sk_allocation &= ~__GFP_MEMALLOC;
304 	static_branch_dec(&memalloc_socks_key);
305 
306 	/*
307 	 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
308 	 * progress of swapping. SOCK_MEMALLOC may be cleared while
309 	 * it has rmem allocations due to the last swapfile being deactivated
310 	 * but there is a risk that the socket is unusable due to exceeding
311 	 * the rmem limits. Reclaim the reserves and obey rmem limits again.
312 	 */
313 	sk_mem_reclaim(sk);
314 }
315 EXPORT_SYMBOL_GPL(sk_clear_memalloc);
316 
317 int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
318 {
319 	int ret;
320 	unsigned int noreclaim_flag;
321 
322 	/* these should have been dropped before queueing */
323 	BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
324 
325 	noreclaim_flag = memalloc_noreclaim_save();
326 	ret = sk->sk_backlog_rcv(sk, skb);
327 	memalloc_noreclaim_restore(noreclaim_flag);
328 
329 	return ret;
330 }
331 EXPORT_SYMBOL(__sk_backlog_rcv);
332 
333 static int sock_get_timeout(long timeo, void *optval, bool old_timeval)
334 {
335 	struct __kernel_sock_timeval tv;
336 
337 	if (timeo == MAX_SCHEDULE_TIMEOUT) {
338 		tv.tv_sec = 0;
339 		tv.tv_usec = 0;
340 	} else {
341 		tv.tv_sec = timeo / HZ;
342 		tv.tv_usec = ((timeo % HZ) * USEC_PER_SEC) / HZ;
343 	}
344 
345 	if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
346 		struct old_timeval32 tv32 = { tv.tv_sec, tv.tv_usec };
347 		*(struct old_timeval32 *)optval = tv32;
348 		return sizeof(tv32);
349 	}
350 
351 	if (old_timeval) {
352 		struct __kernel_old_timeval old_tv;
353 		old_tv.tv_sec = tv.tv_sec;
354 		old_tv.tv_usec = tv.tv_usec;
355 		*(struct __kernel_old_timeval *)optval = old_tv;
356 		return sizeof(old_tv);
357 	}
358 
359 	*(struct __kernel_sock_timeval *)optval = tv;
360 	return sizeof(tv);
361 }
362 
363 static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen, bool old_timeval)
364 {
365 	struct __kernel_sock_timeval tv;
366 
367 	if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
368 		struct old_timeval32 tv32;
369 
370 		if (optlen < sizeof(tv32))
371 			return -EINVAL;
372 
373 		if (copy_from_user(&tv32, optval, sizeof(tv32)))
374 			return -EFAULT;
375 		tv.tv_sec = tv32.tv_sec;
376 		tv.tv_usec = tv32.tv_usec;
377 	} else if (old_timeval) {
378 		struct __kernel_old_timeval old_tv;
379 
380 		if (optlen < sizeof(old_tv))
381 			return -EINVAL;
382 		if (copy_from_user(&old_tv, optval, sizeof(old_tv)))
383 			return -EFAULT;
384 		tv.tv_sec = old_tv.tv_sec;
385 		tv.tv_usec = old_tv.tv_usec;
386 	} else {
387 		if (optlen < sizeof(tv))
388 			return -EINVAL;
389 		if (copy_from_user(&tv, optval, sizeof(tv)))
390 			return -EFAULT;
391 	}
392 	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
393 		return -EDOM;
394 
395 	if (tv.tv_sec < 0) {
396 		static int warned __read_mostly;
397 
398 		*timeo_p = 0;
399 		if (warned < 10 && net_ratelimit()) {
400 			warned++;
401 			pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
402 				__func__, current->comm, task_pid_nr(current));
403 		}
404 		return 0;
405 	}
406 	*timeo_p = MAX_SCHEDULE_TIMEOUT;
407 	if (tv.tv_sec == 0 && tv.tv_usec == 0)
408 		return 0;
409 	if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1))
410 		*timeo_p = tv.tv_sec * HZ + DIV_ROUND_UP((unsigned long)tv.tv_usec, USEC_PER_SEC / HZ);
411 	return 0;
412 }
413 
414 static void sock_warn_obsolete_bsdism(const char *name)
415 {
416 	static int warned;
417 	static char warncomm[TASK_COMM_LEN];
418 	if (strcmp(warncomm, current->comm) && warned < 5) {
419 		strcpy(warncomm,  current->comm);
420 		pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n",
421 			warncomm, name);
422 		warned++;
423 	}
424 }
425 
426 static bool sock_needs_netstamp(const struct sock *sk)
427 {
428 	switch (sk->sk_family) {
429 	case AF_UNSPEC:
430 	case AF_UNIX:
431 		return false;
432 	default:
433 		return true;
434 	}
435 }
436 
437 static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
438 {
439 	if (sk->sk_flags & flags) {
440 		sk->sk_flags &= ~flags;
441 		if (sock_needs_netstamp(sk) &&
442 		    !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
443 			net_disable_timestamp();
444 	}
445 }
446 
447 
448 int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
449 {
450 	unsigned long flags;
451 	struct sk_buff_head *list = &sk->sk_receive_queue;
452 
453 	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
454 		atomic_inc(&sk->sk_drops);
455 		trace_sock_rcvqueue_full(sk, skb);
456 		return -ENOMEM;
457 	}
458 
459 	if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
460 		atomic_inc(&sk->sk_drops);
461 		return -ENOBUFS;
462 	}
463 
464 	skb->dev = NULL;
465 	skb_set_owner_r(skb, sk);
466 
467 	/* we escape from rcu protected region, make sure we dont leak
468 	 * a norefcounted dst
469 	 */
470 	skb_dst_force(skb);
471 
472 	spin_lock_irqsave(&list->lock, flags);
473 	sock_skb_set_dropcount(sk, skb);
474 	__skb_queue_tail(list, skb);
475 	spin_unlock_irqrestore(&list->lock, flags);
476 
477 	if (!sock_flag(sk, SOCK_DEAD))
478 		sk->sk_data_ready(sk);
479 	return 0;
480 }
481 EXPORT_SYMBOL(__sock_queue_rcv_skb);
482 
483 int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
484 {
485 	int err;
486 
487 	err = sk_filter(sk, skb);
488 	if (err)
489 		return err;
490 
491 	return __sock_queue_rcv_skb(sk, skb);
492 }
493 EXPORT_SYMBOL(sock_queue_rcv_skb);
494 
495 int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
496 		     const int nested, unsigned int trim_cap, bool refcounted)
497 {
498 	int rc = NET_RX_SUCCESS;
499 
500 	if (sk_filter_trim_cap(sk, skb, trim_cap))
501 		goto discard_and_relse;
502 
503 	skb->dev = NULL;
504 
505 	if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
506 		atomic_inc(&sk->sk_drops);
507 		goto discard_and_relse;
508 	}
509 	if (nested)
510 		bh_lock_sock_nested(sk);
511 	else
512 		bh_lock_sock(sk);
513 	if (!sock_owned_by_user(sk)) {
514 		/*
515 		 * trylock + unlock semantics:
516 		 */
517 		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
518 
519 		rc = sk_backlog_rcv(sk, skb);
520 
521 		mutex_release(&sk->sk_lock.dep_map, _RET_IP_);
522 	} else if (sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf))) {
523 		bh_unlock_sock(sk);
524 		atomic_inc(&sk->sk_drops);
525 		goto discard_and_relse;
526 	}
527 
528 	bh_unlock_sock(sk);
529 out:
530 	if (refcounted)
531 		sock_put(sk);
532 	return rc;
533 discard_and_relse:
534 	kfree_skb(skb);
535 	goto out;
536 }
537 EXPORT_SYMBOL(__sk_receive_skb);
538 
539 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
540 {
541 	struct dst_entry *dst = __sk_dst_get(sk);
542 
543 	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
544 		sk_tx_queue_clear(sk);
545 		sk->sk_dst_pending_confirm = 0;
546 		RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
547 		dst_release(dst);
548 		return NULL;
549 	}
550 
551 	return dst;
552 }
553 EXPORT_SYMBOL(__sk_dst_check);
554 
555 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
556 {
557 	struct dst_entry *dst = sk_dst_get(sk);
558 
559 	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
560 		sk_dst_reset(sk);
561 		dst_release(dst);
562 		return NULL;
563 	}
564 
565 	return dst;
566 }
567 EXPORT_SYMBOL(sk_dst_check);
568 
569 static int sock_bindtoindex_locked(struct sock *sk, int ifindex)
570 {
571 	int ret = -ENOPROTOOPT;
572 #ifdef CONFIG_NETDEVICES
573 	struct net *net = sock_net(sk);
574 
575 	/* Sorry... */
576 	ret = -EPERM;
577 	if (sk->sk_bound_dev_if && !ns_capable(net->user_ns, CAP_NET_RAW))
578 		goto out;
579 
580 	ret = -EINVAL;
581 	if (ifindex < 0)
582 		goto out;
583 
584 	sk->sk_bound_dev_if = ifindex;
585 	if (sk->sk_prot->rehash)
586 		sk->sk_prot->rehash(sk);
587 	sk_dst_reset(sk);
588 
589 	ret = 0;
590 
591 out:
592 #endif
593 
594 	return ret;
595 }
596 
597 int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk)
598 {
599 	int ret;
600 
601 	if (lock_sk)
602 		lock_sock(sk);
603 	ret = sock_bindtoindex_locked(sk, ifindex);
604 	if (lock_sk)
605 		release_sock(sk);
606 
607 	return ret;
608 }
609 EXPORT_SYMBOL(sock_bindtoindex);
610 
611 static int sock_setbindtodevice(struct sock *sk, char __user *optval,
612 				int optlen)
613 {
614 	int ret = -ENOPROTOOPT;
615 #ifdef CONFIG_NETDEVICES
616 	struct net *net = sock_net(sk);
617 	char devname[IFNAMSIZ];
618 	int index;
619 
620 	ret = -EINVAL;
621 	if (optlen < 0)
622 		goto out;
623 
624 	/* Bind this socket to a particular device like "eth0",
625 	 * as specified in the passed interface name. If the
626 	 * name is "" or the option length is zero the socket
627 	 * is not bound.
628 	 */
629 	if (optlen > IFNAMSIZ - 1)
630 		optlen = IFNAMSIZ - 1;
631 	memset(devname, 0, sizeof(devname));
632 
633 	ret = -EFAULT;
634 	if (copy_from_user(devname, optval, optlen))
635 		goto out;
636 
637 	index = 0;
638 	if (devname[0] != '\0') {
639 		struct net_device *dev;
640 
641 		rcu_read_lock();
642 		dev = dev_get_by_name_rcu(net, devname);
643 		if (dev)
644 			index = dev->ifindex;
645 		rcu_read_unlock();
646 		ret = -ENODEV;
647 		if (!dev)
648 			goto out;
649 	}
650 
651 	return sock_bindtoindex(sk, index, true);
652 out:
653 #endif
654 
655 	return ret;
656 }
657 
658 static int sock_getbindtodevice(struct sock *sk, char __user *optval,
659 				int __user *optlen, int len)
660 {
661 	int ret = -ENOPROTOOPT;
662 #ifdef CONFIG_NETDEVICES
663 	struct net *net = sock_net(sk);
664 	char devname[IFNAMSIZ];
665 
666 	if (sk->sk_bound_dev_if == 0) {
667 		len = 0;
668 		goto zero;
669 	}
670 
671 	ret = -EINVAL;
672 	if (len < IFNAMSIZ)
673 		goto out;
674 
675 	ret = netdev_get_name(net, devname, sk->sk_bound_dev_if);
676 	if (ret)
677 		goto out;
678 
679 	len = strlen(devname) + 1;
680 
681 	ret = -EFAULT;
682 	if (copy_to_user(optval, devname, len))
683 		goto out;
684 
685 zero:
686 	ret = -EFAULT;
687 	if (put_user(len, optlen))
688 		goto out;
689 
690 	ret = 0;
691 
692 out:
693 #endif
694 
695 	return ret;
696 }
697 
698 static inline void sock_valbool_flag(struct sock *sk, enum sock_flags bit,
699 				     int valbool)
700 {
701 	if (valbool)
702 		sock_set_flag(sk, bit);
703 	else
704 		sock_reset_flag(sk, bit);
705 }
706 
707 bool sk_mc_loop(struct sock *sk)
708 {
709 	if (dev_recursion_level())
710 		return false;
711 	if (!sk)
712 		return true;
713 	switch (sk->sk_family) {
714 	case AF_INET:
715 		return inet_sk(sk)->mc_loop;
716 #if IS_ENABLED(CONFIG_IPV6)
717 	case AF_INET6:
718 		return inet6_sk(sk)->mc_loop;
719 #endif
720 	}
721 	WARN_ON(1);
722 	return true;
723 }
724 EXPORT_SYMBOL(sk_mc_loop);
725 
726 void sock_set_reuseaddr(struct sock *sk)
727 {
728 	lock_sock(sk);
729 	sk->sk_reuse = SK_CAN_REUSE;
730 	release_sock(sk);
731 }
732 EXPORT_SYMBOL(sock_set_reuseaddr);
733 
734 void sock_set_reuseport(struct sock *sk)
735 {
736 	lock_sock(sk);
737 	sk->sk_reuseport = true;
738 	release_sock(sk);
739 }
740 EXPORT_SYMBOL(sock_set_reuseport);
741 
742 void sock_no_linger(struct sock *sk)
743 {
744 	lock_sock(sk);
745 	sk->sk_lingertime = 0;
746 	sock_set_flag(sk, SOCK_LINGER);
747 	release_sock(sk);
748 }
749 EXPORT_SYMBOL(sock_no_linger);
750 
751 void sock_set_priority(struct sock *sk, u32 priority)
752 {
753 	lock_sock(sk);
754 	sk->sk_priority = priority;
755 	release_sock(sk);
756 }
757 EXPORT_SYMBOL(sock_set_priority);
758 
759 void sock_set_sndtimeo(struct sock *sk, s64 secs)
760 {
761 	lock_sock(sk);
762 	if (secs && secs < MAX_SCHEDULE_TIMEOUT / HZ - 1)
763 		sk->sk_sndtimeo = secs * HZ;
764 	else
765 		sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
766 	release_sock(sk);
767 }
768 EXPORT_SYMBOL(sock_set_sndtimeo);
769 
770 static void __sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns)
771 {
772 	if (val)  {
773 		sock_valbool_flag(sk, SOCK_TSTAMP_NEW, new);
774 		sock_valbool_flag(sk, SOCK_RCVTSTAMPNS, ns);
775 		sock_set_flag(sk, SOCK_RCVTSTAMP);
776 		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
777 	} else {
778 		sock_reset_flag(sk, SOCK_RCVTSTAMP);
779 		sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
780 		sock_reset_flag(sk, SOCK_TSTAMP_NEW);
781 	}
782 }
783 
784 void sock_enable_timestamps(struct sock *sk)
785 {
786 	lock_sock(sk);
787 	__sock_set_timestamps(sk, true, false, true);
788 	release_sock(sk);
789 }
790 EXPORT_SYMBOL(sock_enable_timestamps);
791 
792 void sock_set_keepalive(struct sock *sk)
793 {
794 	lock_sock(sk);
795 	if (sk->sk_prot->keepalive)
796 		sk->sk_prot->keepalive(sk, true);
797 	sock_valbool_flag(sk, SOCK_KEEPOPEN, true);
798 	release_sock(sk);
799 }
800 EXPORT_SYMBOL(sock_set_keepalive);
801 
802 static void __sock_set_rcvbuf(struct sock *sk, int val)
803 {
804 	/* Ensure val * 2 fits into an int, to prevent max_t() from treating it
805 	 * as a negative value.
806 	 */
807 	val = min_t(int, val, INT_MAX / 2);
808 	sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
809 
810 	/* We double it on the way in to account for "struct sk_buff" etc.
811 	 * overhead.   Applications assume that the SO_RCVBUF setting they make
812 	 * will allow that much actual data to be received on that socket.
813 	 *
814 	 * Applications are unaware that "struct sk_buff" and other overheads
815 	 * allocate from the receive buffer during socket buffer allocation.
816 	 *
817 	 * And after considering the possible alternatives, returning the value
818 	 * we actually used in getsockopt is the most desirable behavior.
819 	 */
820 	WRITE_ONCE(sk->sk_rcvbuf, max_t(int, val * 2, SOCK_MIN_RCVBUF));
821 }
822 
823 void sock_set_rcvbuf(struct sock *sk, int val)
824 {
825 	lock_sock(sk);
826 	__sock_set_rcvbuf(sk, val);
827 	release_sock(sk);
828 }
829 EXPORT_SYMBOL(sock_set_rcvbuf);
830 
831 /*
832  *	This is meant for all protocols to use and covers goings on
833  *	at the socket level. Everything here is generic.
834  */
835 
836 int sock_setsockopt(struct socket *sock, int level, int optname,
837 		    char __user *optval, unsigned int optlen)
838 {
839 	struct sock_txtime sk_txtime;
840 	struct sock *sk = sock->sk;
841 	int val;
842 	int valbool;
843 	struct linger ling;
844 	int ret = 0;
845 
846 	/*
847 	 *	Options without arguments
848 	 */
849 
850 	if (optname == SO_BINDTODEVICE)
851 		return sock_setbindtodevice(sk, optval, optlen);
852 
853 	if (optlen < sizeof(int))
854 		return -EINVAL;
855 
856 	if (get_user(val, (int __user *)optval))
857 		return -EFAULT;
858 
859 	valbool = val ? 1 : 0;
860 
861 	lock_sock(sk);
862 
863 	switch (optname) {
864 	case SO_DEBUG:
865 		if (val && !capable(CAP_NET_ADMIN))
866 			ret = -EACCES;
867 		else
868 			sock_valbool_flag(sk, SOCK_DBG, valbool);
869 		break;
870 	case SO_REUSEADDR:
871 		sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
872 		break;
873 	case SO_REUSEPORT:
874 		sk->sk_reuseport = valbool;
875 		break;
876 	case SO_TYPE:
877 	case SO_PROTOCOL:
878 	case SO_DOMAIN:
879 	case SO_ERROR:
880 		ret = -ENOPROTOOPT;
881 		break;
882 	case SO_DONTROUTE:
883 		sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
884 		sk_dst_reset(sk);
885 		break;
886 	case SO_BROADCAST:
887 		sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
888 		break;
889 	case SO_SNDBUF:
890 		/* Don't error on this BSD doesn't and if you think
891 		 * about it this is right. Otherwise apps have to
892 		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
893 		 * are treated in BSD as hints
894 		 */
895 		val = min_t(u32, val, sysctl_wmem_max);
896 set_sndbuf:
897 		/* Ensure val * 2 fits into an int, to prevent max_t()
898 		 * from treating it as a negative value.
899 		 */
900 		val = min_t(int, val, INT_MAX / 2);
901 		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
902 		WRITE_ONCE(sk->sk_sndbuf,
903 			   max_t(int, val * 2, SOCK_MIN_SNDBUF));
904 		/* Wake up sending tasks if we upped the value. */
905 		sk->sk_write_space(sk);
906 		break;
907 
908 	case SO_SNDBUFFORCE:
909 		if (!capable(CAP_NET_ADMIN)) {
910 			ret = -EPERM;
911 			break;
912 		}
913 
914 		/* No negative values (to prevent underflow, as val will be
915 		 * multiplied by 2).
916 		 */
917 		if (val < 0)
918 			val = 0;
919 		goto set_sndbuf;
920 
921 	case SO_RCVBUF:
922 		/* Don't error on this BSD doesn't and if you think
923 		 * about it this is right. Otherwise apps have to
924 		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
925 		 * are treated in BSD as hints
926 		 */
927 		__sock_set_rcvbuf(sk, min_t(u32, val, sysctl_rmem_max));
928 		break;
929 
930 	case SO_RCVBUFFORCE:
931 		if (!capable(CAP_NET_ADMIN)) {
932 			ret = -EPERM;
933 			break;
934 		}
935 
936 		/* No negative values (to prevent underflow, as val will be
937 		 * multiplied by 2).
938 		 */
939 		__sock_set_rcvbuf(sk, max(val, 0));
940 		break;
941 
942 	case SO_KEEPALIVE:
943 		if (sk->sk_prot->keepalive)
944 			sk->sk_prot->keepalive(sk, valbool);
945 		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
946 		break;
947 
948 	case SO_OOBINLINE:
949 		sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
950 		break;
951 
952 	case SO_NO_CHECK:
953 		sk->sk_no_check_tx = valbool;
954 		break;
955 
956 	case SO_PRIORITY:
957 		if ((val >= 0 && val <= 6) ||
958 		    ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
959 			sk->sk_priority = val;
960 		else
961 			ret = -EPERM;
962 		break;
963 
964 	case SO_LINGER:
965 		if (optlen < sizeof(ling)) {
966 			ret = -EINVAL;	/* 1003.1g */
967 			break;
968 		}
969 		if (copy_from_user(&ling, optval, sizeof(ling))) {
970 			ret = -EFAULT;
971 			break;
972 		}
973 		if (!ling.l_onoff)
974 			sock_reset_flag(sk, SOCK_LINGER);
975 		else {
976 #if (BITS_PER_LONG == 32)
977 			if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
978 				sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
979 			else
980 #endif
981 				sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
982 			sock_set_flag(sk, SOCK_LINGER);
983 		}
984 		break;
985 
986 	case SO_BSDCOMPAT:
987 		sock_warn_obsolete_bsdism("setsockopt");
988 		break;
989 
990 	case SO_PASSCRED:
991 		if (valbool)
992 			set_bit(SOCK_PASSCRED, &sock->flags);
993 		else
994 			clear_bit(SOCK_PASSCRED, &sock->flags);
995 		break;
996 
997 	case SO_TIMESTAMP_OLD:
998 		__sock_set_timestamps(sk, valbool, false, false);
999 		break;
1000 	case SO_TIMESTAMP_NEW:
1001 		__sock_set_timestamps(sk, valbool, true, false);
1002 		break;
1003 	case SO_TIMESTAMPNS_OLD:
1004 		__sock_set_timestamps(sk, valbool, false, true);
1005 		break;
1006 	case SO_TIMESTAMPNS_NEW:
1007 		__sock_set_timestamps(sk, valbool, true, true);
1008 		break;
1009 	case SO_TIMESTAMPING_NEW:
1010 		sock_set_flag(sk, SOCK_TSTAMP_NEW);
1011 		/* fall through */
1012 	case SO_TIMESTAMPING_OLD:
1013 		if (val & ~SOF_TIMESTAMPING_MASK) {
1014 			ret = -EINVAL;
1015 			break;
1016 		}
1017 
1018 		if (val & SOF_TIMESTAMPING_OPT_ID &&
1019 		    !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
1020 			if (sk->sk_protocol == IPPROTO_TCP &&
1021 			    sk->sk_type == SOCK_STREAM) {
1022 				if ((1 << sk->sk_state) &
1023 				    (TCPF_CLOSE | TCPF_LISTEN)) {
1024 					ret = -EINVAL;
1025 					break;
1026 				}
1027 				sk->sk_tskey = tcp_sk(sk)->snd_una;
1028 			} else {
1029 				sk->sk_tskey = 0;
1030 			}
1031 		}
1032 
1033 		if (val & SOF_TIMESTAMPING_OPT_STATS &&
1034 		    !(val & SOF_TIMESTAMPING_OPT_TSONLY)) {
1035 			ret = -EINVAL;
1036 			break;
1037 		}
1038 
1039 		sk->sk_tsflags = val;
1040 		if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
1041 			sock_enable_timestamp(sk,
1042 					      SOCK_TIMESTAMPING_RX_SOFTWARE);
1043 		else {
1044 			if (optname == SO_TIMESTAMPING_NEW)
1045 				sock_reset_flag(sk, SOCK_TSTAMP_NEW);
1046 
1047 			sock_disable_timestamp(sk,
1048 					       (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
1049 		}
1050 		break;
1051 
1052 	case SO_RCVLOWAT:
1053 		if (val < 0)
1054 			val = INT_MAX;
1055 		if (sock->ops->set_rcvlowat)
1056 			ret = sock->ops->set_rcvlowat(sk, val);
1057 		else
1058 			WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
1059 		break;
1060 
1061 	case SO_RCVTIMEO_OLD:
1062 	case SO_RCVTIMEO_NEW:
1063 		ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen, optname == SO_RCVTIMEO_OLD);
1064 		break;
1065 
1066 	case SO_SNDTIMEO_OLD:
1067 	case SO_SNDTIMEO_NEW:
1068 		ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen, optname == SO_SNDTIMEO_OLD);
1069 		break;
1070 
1071 	case SO_ATTACH_FILTER:
1072 		ret = -EINVAL;
1073 		if (optlen == sizeof(struct sock_fprog)) {
1074 			struct sock_fprog fprog;
1075 
1076 			ret = -EFAULT;
1077 			if (copy_from_user(&fprog, optval, sizeof(fprog)))
1078 				break;
1079 
1080 			ret = sk_attach_filter(&fprog, sk);
1081 		}
1082 		break;
1083 
1084 	case SO_ATTACH_BPF:
1085 		ret = -EINVAL;
1086 		if (optlen == sizeof(u32)) {
1087 			u32 ufd;
1088 
1089 			ret = -EFAULT;
1090 			if (copy_from_user(&ufd, optval, sizeof(ufd)))
1091 				break;
1092 
1093 			ret = sk_attach_bpf(ufd, sk);
1094 		}
1095 		break;
1096 
1097 	case SO_ATTACH_REUSEPORT_CBPF:
1098 		ret = -EINVAL;
1099 		if (optlen == sizeof(struct sock_fprog)) {
1100 			struct sock_fprog fprog;
1101 
1102 			ret = -EFAULT;
1103 			if (copy_from_user(&fprog, optval, sizeof(fprog)))
1104 				break;
1105 
1106 			ret = sk_reuseport_attach_filter(&fprog, sk);
1107 		}
1108 		break;
1109 
1110 	case SO_ATTACH_REUSEPORT_EBPF:
1111 		ret = -EINVAL;
1112 		if (optlen == sizeof(u32)) {
1113 			u32 ufd;
1114 
1115 			ret = -EFAULT;
1116 			if (copy_from_user(&ufd, optval, sizeof(ufd)))
1117 				break;
1118 
1119 			ret = sk_reuseport_attach_bpf(ufd, sk);
1120 		}
1121 		break;
1122 
1123 	case SO_DETACH_REUSEPORT_BPF:
1124 		ret = reuseport_detach_prog(sk);
1125 		break;
1126 
1127 	case SO_DETACH_FILTER:
1128 		ret = sk_detach_filter(sk);
1129 		break;
1130 
1131 	case SO_LOCK_FILTER:
1132 		if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
1133 			ret = -EPERM;
1134 		else
1135 			sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
1136 		break;
1137 
1138 	case SO_PASSSEC:
1139 		if (valbool)
1140 			set_bit(SOCK_PASSSEC, &sock->flags);
1141 		else
1142 			clear_bit(SOCK_PASSSEC, &sock->flags);
1143 		break;
1144 	case SO_MARK:
1145 		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1146 			ret = -EPERM;
1147 		} else if (val != sk->sk_mark) {
1148 			sk->sk_mark = val;
1149 			sk_dst_reset(sk);
1150 		}
1151 		break;
1152 
1153 	case SO_RXQ_OVFL:
1154 		sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
1155 		break;
1156 
1157 	case SO_WIFI_STATUS:
1158 		sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
1159 		break;
1160 
1161 	case SO_PEEK_OFF:
1162 		if (sock->ops->set_peek_off)
1163 			ret = sock->ops->set_peek_off(sk, val);
1164 		else
1165 			ret = -EOPNOTSUPP;
1166 		break;
1167 
1168 	case SO_NOFCS:
1169 		sock_valbool_flag(sk, SOCK_NOFCS, valbool);
1170 		break;
1171 
1172 	case SO_SELECT_ERR_QUEUE:
1173 		sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
1174 		break;
1175 
1176 #ifdef CONFIG_NET_RX_BUSY_POLL
1177 	case SO_BUSY_POLL:
1178 		/* allow unprivileged users to decrease the value */
1179 		if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
1180 			ret = -EPERM;
1181 		else {
1182 			if (val < 0)
1183 				ret = -EINVAL;
1184 			else
1185 				sk->sk_ll_usec = val;
1186 		}
1187 		break;
1188 #endif
1189 
1190 	case SO_MAX_PACING_RATE:
1191 		{
1192 		unsigned long ulval = (val == ~0U) ? ~0UL : val;
1193 
1194 		if (sizeof(ulval) != sizeof(val) &&
1195 		    optlen >= sizeof(ulval) &&
1196 		    get_user(ulval, (unsigned long __user *)optval)) {
1197 			ret = -EFAULT;
1198 			break;
1199 		}
1200 		if (ulval != ~0UL)
1201 			cmpxchg(&sk->sk_pacing_status,
1202 				SK_PACING_NONE,
1203 				SK_PACING_NEEDED);
1204 		sk->sk_max_pacing_rate = ulval;
1205 		sk->sk_pacing_rate = min(sk->sk_pacing_rate, ulval);
1206 		break;
1207 		}
1208 	case SO_INCOMING_CPU:
1209 		WRITE_ONCE(sk->sk_incoming_cpu, val);
1210 		break;
1211 
1212 	case SO_CNX_ADVICE:
1213 		if (val == 1)
1214 			dst_negative_advice(sk);
1215 		break;
1216 
1217 	case SO_ZEROCOPY:
1218 		if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
1219 			if (!((sk->sk_type == SOCK_STREAM &&
1220 			       sk->sk_protocol == IPPROTO_TCP) ||
1221 			      (sk->sk_type == SOCK_DGRAM &&
1222 			       sk->sk_protocol == IPPROTO_UDP)))
1223 				ret = -ENOTSUPP;
1224 		} else if (sk->sk_family != PF_RDS) {
1225 			ret = -ENOTSUPP;
1226 		}
1227 		if (!ret) {
1228 			if (val < 0 || val > 1)
1229 				ret = -EINVAL;
1230 			else
1231 				sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
1232 		}
1233 		break;
1234 
1235 	case SO_TXTIME:
1236 		if (optlen != sizeof(struct sock_txtime)) {
1237 			ret = -EINVAL;
1238 			break;
1239 		} else if (copy_from_user(&sk_txtime, optval,
1240 			   sizeof(struct sock_txtime))) {
1241 			ret = -EFAULT;
1242 			break;
1243 		} else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) {
1244 			ret = -EINVAL;
1245 			break;
1246 		}
1247 		/* CLOCK_MONOTONIC is only used by sch_fq, and this packet
1248 		 * scheduler has enough safe guards.
1249 		 */
1250 		if (sk_txtime.clockid != CLOCK_MONOTONIC &&
1251 		    !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1252 			ret = -EPERM;
1253 			break;
1254 		}
1255 		sock_valbool_flag(sk, SOCK_TXTIME, true);
1256 		sk->sk_clockid = sk_txtime.clockid;
1257 		sk->sk_txtime_deadline_mode =
1258 			!!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE);
1259 		sk->sk_txtime_report_errors =
1260 			!!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS);
1261 		break;
1262 
1263 	case SO_BINDTOIFINDEX:
1264 		ret = sock_bindtoindex_locked(sk, val);
1265 		break;
1266 
1267 	default:
1268 		ret = -ENOPROTOOPT;
1269 		break;
1270 	}
1271 	release_sock(sk);
1272 	return ret;
1273 }
1274 EXPORT_SYMBOL(sock_setsockopt);
1275 
1276 
1277 static void cred_to_ucred(struct pid *pid, const struct cred *cred,
1278 			  struct ucred *ucred)
1279 {
1280 	ucred->pid = pid_vnr(pid);
1281 	ucred->uid = ucred->gid = -1;
1282 	if (cred) {
1283 		struct user_namespace *current_ns = current_user_ns();
1284 
1285 		ucred->uid = from_kuid_munged(current_ns, cred->euid);
1286 		ucred->gid = from_kgid_munged(current_ns, cred->egid);
1287 	}
1288 }
1289 
1290 static int groups_to_user(gid_t __user *dst, const struct group_info *src)
1291 {
1292 	struct user_namespace *user_ns = current_user_ns();
1293 	int i;
1294 
1295 	for (i = 0; i < src->ngroups; i++)
1296 		if (put_user(from_kgid_munged(user_ns, src->gid[i]), dst + i))
1297 			return -EFAULT;
1298 
1299 	return 0;
1300 }
1301 
1302 int sock_getsockopt(struct socket *sock, int level, int optname,
1303 		    char __user *optval, int __user *optlen)
1304 {
1305 	struct sock *sk = sock->sk;
1306 
1307 	union {
1308 		int val;
1309 		u64 val64;
1310 		unsigned long ulval;
1311 		struct linger ling;
1312 		struct old_timeval32 tm32;
1313 		struct __kernel_old_timeval tm;
1314 		struct  __kernel_sock_timeval stm;
1315 		struct sock_txtime txtime;
1316 	} v;
1317 
1318 	int lv = sizeof(int);
1319 	int len;
1320 
1321 	if (get_user(len, optlen))
1322 		return -EFAULT;
1323 	if (len < 0)
1324 		return -EINVAL;
1325 
1326 	memset(&v, 0, sizeof(v));
1327 
1328 	switch (optname) {
1329 	case SO_DEBUG:
1330 		v.val = sock_flag(sk, SOCK_DBG);
1331 		break;
1332 
1333 	case SO_DONTROUTE:
1334 		v.val = sock_flag(sk, SOCK_LOCALROUTE);
1335 		break;
1336 
1337 	case SO_BROADCAST:
1338 		v.val = sock_flag(sk, SOCK_BROADCAST);
1339 		break;
1340 
1341 	case SO_SNDBUF:
1342 		v.val = sk->sk_sndbuf;
1343 		break;
1344 
1345 	case SO_RCVBUF:
1346 		v.val = sk->sk_rcvbuf;
1347 		break;
1348 
1349 	case SO_REUSEADDR:
1350 		v.val = sk->sk_reuse;
1351 		break;
1352 
1353 	case SO_REUSEPORT:
1354 		v.val = sk->sk_reuseport;
1355 		break;
1356 
1357 	case SO_KEEPALIVE:
1358 		v.val = sock_flag(sk, SOCK_KEEPOPEN);
1359 		break;
1360 
1361 	case SO_TYPE:
1362 		v.val = sk->sk_type;
1363 		break;
1364 
1365 	case SO_PROTOCOL:
1366 		v.val = sk->sk_protocol;
1367 		break;
1368 
1369 	case SO_DOMAIN:
1370 		v.val = sk->sk_family;
1371 		break;
1372 
1373 	case SO_ERROR:
1374 		v.val = -sock_error(sk);
1375 		if (v.val == 0)
1376 			v.val = xchg(&sk->sk_err_soft, 0);
1377 		break;
1378 
1379 	case SO_OOBINLINE:
1380 		v.val = sock_flag(sk, SOCK_URGINLINE);
1381 		break;
1382 
1383 	case SO_NO_CHECK:
1384 		v.val = sk->sk_no_check_tx;
1385 		break;
1386 
1387 	case SO_PRIORITY:
1388 		v.val = sk->sk_priority;
1389 		break;
1390 
1391 	case SO_LINGER:
1392 		lv		= sizeof(v.ling);
1393 		v.ling.l_onoff	= sock_flag(sk, SOCK_LINGER);
1394 		v.ling.l_linger	= sk->sk_lingertime / HZ;
1395 		break;
1396 
1397 	case SO_BSDCOMPAT:
1398 		sock_warn_obsolete_bsdism("getsockopt");
1399 		break;
1400 
1401 	case SO_TIMESTAMP_OLD:
1402 		v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1403 				!sock_flag(sk, SOCK_TSTAMP_NEW) &&
1404 				!sock_flag(sk, SOCK_RCVTSTAMPNS);
1405 		break;
1406 
1407 	case SO_TIMESTAMPNS_OLD:
1408 		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && !sock_flag(sk, SOCK_TSTAMP_NEW);
1409 		break;
1410 
1411 	case SO_TIMESTAMP_NEW:
1412 		v.val = sock_flag(sk, SOCK_RCVTSTAMP) && sock_flag(sk, SOCK_TSTAMP_NEW);
1413 		break;
1414 
1415 	case SO_TIMESTAMPNS_NEW:
1416 		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && sock_flag(sk, SOCK_TSTAMP_NEW);
1417 		break;
1418 
1419 	case SO_TIMESTAMPING_OLD:
1420 		v.val = sk->sk_tsflags;
1421 		break;
1422 
1423 	case SO_RCVTIMEO_OLD:
1424 	case SO_RCVTIMEO_NEW:
1425 		lv = sock_get_timeout(sk->sk_rcvtimeo, &v, SO_RCVTIMEO_OLD == optname);
1426 		break;
1427 
1428 	case SO_SNDTIMEO_OLD:
1429 	case SO_SNDTIMEO_NEW:
1430 		lv = sock_get_timeout(sk->sk_sndtimeo, &v, SO_SNDTIMEO_OLD == optname);
1431 		break;
1432 
1433 	case SO_RCVLOWAT:
1434 		v.val = sk->sk_rcvlowat;
1435 		break;
1436 
1437 	case SO_SNDLOWAT:
1438 		v.val = 1;
1439 		break;
1440 
1441 	case SO_PASSCRED:
1442 		v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1443 		break;
1444 
1445 	case SO_PEERCRED:
1446 	{
1447 		struct ucred peercred;
1448 		if (len > sizeof(peercred))
1449 			len = sizeof(peercred);
1450 		cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1451 		if (copy_to_user(optval, &peercred, len))
1452 			return -EFAULT;
1453 		goto lenout;
1454 	}
1455 
1456 	case SO_PEERGROUPS:
1457 	{
1458 		int ret, n;
1459 
1460 		if (!sk->sk_peer_cred)
1461 			return -ENODATA;
1462 
1463 		n = sk->sk_peer_cred->group_info->ngroups;
1464 		if (len < n * sizeof(gid_t)) {
1465 			len = n * sizeof(gid_t);
1466 			return put_user(len, optlen) ? -EFAULT : -ERANGE;
1467 		}
1468 		len = n * sizeof(gid_t);
1469 
1470 		ret = groups_to_user((gid_t __user *)optval,
1471 				     sk->sk_peer_cred->group_info);
1472 		if (ret)
1473 			return ret;
1474 		goto lenout;
1475 	}
1476 
1477 	case SO_PEERNAME:
1478 	{
1479 		char address[128];
1480 
1481 		lv = sock->ops->getname(sock, (struct sockaddr *)address, 2);
1482 		if (lv < 0)
1483 			return -ENOTCONN;
1484 		if (lv < len)
1485 			return -EINVAL;
1486 		if (copy_to_user(optval, address, len))
1487 			return -EFAULT;
1488 		goto lenout;
1489 	}
1490 
1491 	/* Dubious BSD thing... Probably nobody even uses it, but
1492 	 * the UNIX standard wants it for whatever reason... -DaveM
1493 	 */
1494 	case SO_ACCEPTCONN:
1495 		v.val = sk->sk_state == TCP_LISTEN;
1496 		break;
1497 
1498 	case SO_PASSSEC:
1499 		v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1500 		break;
1501 
1502 	case SO_PEERSEC:
1503 		return security_socket_getpeersec_stream(sock, optval, optlen, len);
1504 
1505 	case SO_MARK:
1506 		v.val = sk->sk_mark;
1507 		break;
1508 
1509 	case SO_RXQ_OVFL:
1510 		v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1511 		break;
1512 
1513 	case SO_WIFI_STATUS:
1514 		v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1515 		break;
1516 
1517 	case SO_PEEK_OFF:
1518 		if (!sock->ops->set_peek_off)
1519 			return -EOPNOTSUPP;
1520 
1521 		v.val = sk->sk_peek_off;
1522 		break;
1523 	case SO_NOFCS:
1524 		v.val = sock_flag(sk, SOCK_NOFCS);
1525 		break;
1526 
1527 	case SO_BINDTODEVICE:
1528 		return sock_getbindtodevice(sk, optval, optlen, len);
1529 
1530 	case SO_GET_FILTER:
1531 		len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
1532 		if (len < 0)
1533 			return len;
1534 
1535 		goto lenout;
1536 
1537 	case SO_LOCK_FILTER:
1538 		v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1539 		break;
1540 
1541 	case SO_BPF_EXTENSIONS:
1542 		v.val = bpf_tell_extensions();
1543 		break;
1544 
1545 	case SO_SELECT_ERR_QUEUE:
1546 		v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1547 		break;
1548 
1549 #ifdef CONFIG_NET_RX_BUSY_POLL
1550 	case SO_BUSY_POLL:
1551 		v.val = sk->sk_ll_usec;
1552 		break;
1553 #endif
1554 
1555 	case SO_MAX_PACING_RATE:
1556 		if (sizeof(v.ulval) != sizeof(v.val) && len >= sizeof(v.ulval)) {
1557 			lv = sizeof(v.ulval);
1558 			v.ulval = sk->sk_max_pacing_rate;
1559 		} else {
1560 			/* 32bit version */
1561 			v.val = min_t(unsigned long, sk->sk_max_pacing_rate, ~0U);
1562 		}
1563 		break;
1564 
1565 	case SO_INCOMING_CPU:
1566 		v.val = READ_ONCE(sk->sk_incoming_cpu);
1567 		break;
1568 
1569 	case SO_MEMINFO:
1570 	{
1571 		u32 meminfo[SK_MEMINFO_VARS];
1572 
1573 		sk_get_meminfo(sk, meminfo);
1574 
1575 		len = min_t(unsigned int, len, sizeof(meminfo));
1576 		if (copy_to_user(optval, &meminfo, len))
1577 			return -EFAULT;
1578 
1579 		goto lenout;
1580 	}
1581 
1582 #ifdef CONFIG_NET_RX_BUSY_POLL
1583 	case SO_INCOMING_NAPI_ID:
1584 		v.val = READ_ONCE(sk->sk_napi_id);
1585 
1586 		/* aggregate non-NAPI IDs down to 0 */
1587 		if (v.val < MIN_NAPI_ID)
1588 			v.val = 0;
1589 
1590 		break;
1591 #endif
1592 
1593 	case SO_COOKIE:
1594 		lv = sizeof(u64);
1595 		if (len < lv)
1596 			return -EINVAL;
1597 		v.val64 = sock_gen_cookie(sk);
1598 		break;
1599 
1600 	case SO_ZEROCOPY:
1601 		v.val = sock_flag(sk, SOCK_ZEROCOPY);
1602 		break;
1603 
1604 	case SO_TXTIME:
1605 		lv = sizeof(v.txtime);
1606 		v.txtime.clockid = sk->sk_clockid;
1607 		v.txtime.flags |= sk->sk_txtime_deadline_mode ?
1608 				  SOF_TXTIME_DEADLINE_MODE : 0;
1609 		v.txtime.flags |= sk->sk_txtime_report_errors ?
1610 				  SOF_TXTIME_REPORT_ERRORS : 0;
1611 		break;
1612 
1613 	case SO_BINDTOIFINDEX:
1614 		v.val = sk->sk_bound_dev_if;
1615 		break;
1616 
1617 	default:
1618 		/* We implement the SO_SNDLOWAT etc to not be settable
1619 		 * (1003.1g 7).
1620 		 */
1621 		return -ENOPROTOOPT;
1622 	}
1623 
1624 	if (len > lv)
1625 		len = lv;
1626 	if (copy_to_user(optval, &v, len))
1627 		return -EFAULT;
1628 lenout:
1629 	if (put_user(len, optlen))
1630 		return -EFAULT;
1631 	return 0;
1632 }
1633 
1634 /*
1635  * Initialize an sk_lock.
1636  *
1637  * (We also register the sk_lock with the lock validator.)
1638  */
1639 static inline void sock_lock_init(struct sock *sk)
1640 {
1641 	if (sk->sk_kern_sock)
1642 		sock_lock_init_class_and_name(
1643 			sk,
1644 			af_family_kern_slock_key_strings[sk->sk_family],
1645 			af_family_kern_slock_keys + sk->sk_family,
1646 			af_family_kern_key_strings[sk->sk_family],
1647 			af_family_kern_keys + sk->sk_family);
1648 	else
1649 		sock_lock_init_class_and_name(
1650 			sk,
1651 			af_family_slock_key_strings[sk->sk_family],
1652 			af_family_slock_keys + sk->sk_family,
1653 			af_family_key_strings[sk->sk_family],
1654 			af_family_keys + sk->sk_family);
1655 }
1656 
1657 /*
1658  * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
1659  * even temporarly, because of RCU lookups. sk_node should also be left as is.
1660  * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
1661  */
1662 static void sock_copy(struct sock *nsk, const struct sock *osk)
1663 {
1664 	const struct proto *prot = READ_ONCE(osk->sk_prot);
1665 #ifdef CONFIG_SECURITY_NETWORK
1666 	void *sptr = nsk->sk_security;
1667 #endif
1668 	memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1669 
1670 	memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1671 	       prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1672 
1673 #ifdef CONFIG_SECURITY_NETWORK
1674 	nsk->sk_security = sptr;
1675 	security_sk_clone(osk, nsk);
1676 #endif
1677 }
1678 
1679 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1680 		int family)
1681 {
1682 	struct sock *sk;
1683 	struct kmem_cache *slab;
1684 
1685 	slab = prot->slab;
1686 	if (slab != NULL) {
1687 		sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1688 		if (!sk)
1689 			return sk;
1690 		if (want_init_on_alloc(priority))
1691 			sk_prot_clear_nulls(sk, prot->obj_size);
1692 	} else
1693 		sk = kmalloc(prot->obj_size, priority);
1694 
1695 	if (sk != NULL) {
1696 		if (security_sk_alloc(sk, family, priority))
1697 			goto out_free;
1698 
1699 		if (!try_module_get(prot->owner))
1700 			goto out_free_sec;
1701 		sk_tx_queue_clear(sk);
1702 	}
1703 
1704 	return sk;
1705 
1706 out_free_sec:
1707 	security_sk_free(sk);
1708 out_free:
1709 	if (slab != NULL)
1710 		kmem_cache_free(slab, sk);
1711 	else
1712 		kfree(sk);
1713 	return NULL;
1714 }
1715 
1716 static void sk_prot_free(struct proto *prot, struct sock *sk)
1717 {
1718 	struct kmem_cache *slab;
1719 	struct module *owner;
1720 
1721 	owner = prot->owner;
1722 	slab = prot->slab;
1723 
1724 	cgroup_sk_free(&sk->sk_cgrp_data);
1725 	mem_cgroup_sk_free(sk);
1726 	security_sk_free(sk);
1727 	if (slab != NULL)
1728 		kmem_cache_free(slab, sk);
1729 	else
1730 		kfree(sk);
1731 	module_put(owner);
1732 }
1733 
1734 /**
1735  *	sk_alloc - All socket objects are allocated here
1736  *	@net: the applicable net namespace
1737  *	@family: protocol family
1738  *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1739  *	@prot: struct proto associated with this new sock instance
1740  *	@kern: is this to be a kernel socket?
1741  */
1742 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1743 		      struct proto *prot, int kern)
1744 {
1745 	struct sock *sk;
1746 
1747 	sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1748 	if (sk) {
1749 		sk->sk_family = family;
1750 		/*
1751 		 * See comment in struct sock definition to understand
1752 		 * why we need sk_prot_creator -acme
1753 		 */
1754 		sk->sk_prot = sk->sk_prot_creator = prot;
1755 		sk->sk_kern_sock = kern;
1756 		sock_lock_init(sk);
1757 		sk->sk_net_refcnt = kern ? 0 : 1;
1758 		if (likely(sk->sk_net_refcnt)) {
1759 			get_net(net);
1760 			sock_inuse_add(net, 1);
1761 		}
1762 
1763 		sock_net_set(sk, net);
1764 		refcount_set(&sk->sk_wmem_alloc, 1);
1765 
1766 		mem_cgroup_sk_alloc(sk);
1767 		cgroup_sk_alloc(&sk->sk_cgrp_data);
1768 		sock_update_classid(&sk->sk_cgrp_data);
1769 		sock_update_netprioidx(&sk->sk_cgrp_data);
1770 	}
1771 
1772 	return sk;
1773 }
1774 EXPORT_SYMBOL(sk_alloc);
1775 
1776 /* Sockets having SOCK_RCU_FREE will call this function after one RCU
1777  * grace period. This is the case for UDP sockets and TCP listeners.
1778  */
1779 static void __sk_destruct(struct rcu_head *head)
1780 {
1781 	struct sock *sk = container_of(head, struct sock, sk_rcu);
1782 	struct sk_filter *filter;
1783 
1784 	if (sk->sk_destruct)
1785 		sk->sk_destruct(sk);
1786 
1787 	filter = rcu_dereference_check(sk->sk_filter,
1788 				       refcount_read(&sk->sk_wmem_alloc) == 0);
1789 	if (filter) {
1790 		sk_filter_uncharge(sk, filter);
1791 		RCU_INIT_POINTER(sk->sk_filter, NULL);
1792 	}
1793 
1794 	sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
1795 
1796 #ifdef CONFIG_BPF_SYSCALL
1797 	bpf_sk_storage_free(sk);
1798 #endif
1799 
1800 	if (atomic_read(&sk->sk_omem_alloc))
1801 		pr_debug("%s: optmem leakage (%d bytes) detected\n",
1802 			 __func__, atomic_read(&sk->sk_omem_alloc));
1803 
1804 	if (sk->sk_frag.page) {
1805 		put_page(sk->sk_frag.page);
1806 		sk->sk_frag.page = NULL;
1807 	}
1808 
1809 	if (sk->sk_peer_cred)
1810 		put_cred(sk->sk_peer_cred);
1811 	put_pid(sk->sk_peer_pid);
1812 	if (likely(sk->sk_net_refcnt))
1813 		put_net(sock_net(sk));
1814 	sk_prot_free(sk->sk_prot_creator, sk);
1815 }
1816 
1817 void sk_destruct(struct sock *sk)
1818 {
1819 	bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE);
1820 
1821 	if (rcu_access_pointer(sk->sk_reuseport_cb)) {
1822 		reuseport_detach_sock(sk);
1823 		use_call_rcu = true;
1824 	}
1825 
1826 	if (use_call_rcu)
1827 		call_rcu(&sk->sk_rcu, __sk_destruct);
1828 	else
1829 		__sk_destruct(&sk->sk_rcu);
1830 }
1831 
1832 static void __sk_free(struct sock *sk)
1833 {
1834 	if (likely(sk->sk_net_refcnt))
1835 		sock_inuse_add(sock_net(sk), -1);
1836 
1837 	if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk)))
1838 		sock_diag_broadcast_destroy(sk);
1839 	else
1840 		sk_destruct(sk);
1841 }
1842 
1843 void sk_free(struct sock *sk)
1844 {
1845 	/*
1846 	 * We subtract one from sk_wmem_alloc and can know if
1847 	 * some packets are still in some tx queue.
1848 	 * If not null, sock_wfree() will call __sk_free(sk) later
1849 	 */
1850 	if (refcount_dec_and_test(&sk->sk_wmem_alloc))
1851 		__sk_free(sk);
1852 }
1853 EXPORT_SYMBOL(sk_free);
1854 
1855 static void sk_init_common(struct sock *sk)
1856 {
1857 	skb_queue_head_init(&sk->sk_receive_queue);
1858 	skb_queue_head_init(&sk->sk_write_queue);
1859 	skb_queue_head_init(&sk->sk_error_queue);
1860 
1861 	rwlock_init(&sk->sk_callback_lock);
1862 	lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
1863 			af_rlock_keys + sk->sk_family,
1864 			af_family_rlock_key_strings[sk->sk_family]);
1865 	lockdep_set_class_and_name(&sk->sk_write_queue.lock,
1866 			af_wlock_keys + sk->sk_family,
1867 			af_family_wlock_key_strings[sk->sk_family]);
1868 	lockdep_set_class_and_name(&sk->sk_error_queue.lock,
1869 			af_elock_keys + sk->sk_family,
1870 			af_family_elock_key_strings[sk->sk_family]);
1871 	lockdep_set_class_and_name(&sk->sk_callback_lock,
1872 			af_callback_keys + sk->sk_family,
1873 			af_family_clock_key_strings[sk->sk_family]);
1874 }
1875 
1876 /**
1877  *	sk_clone_lock - clone a socket, and lock its clone
1878  *	@sk: the socket to clone
1879  *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1880  *
1881  *	Caller must unlock socket even in error path (bh_unlock_sock(newsk))
1882  */
1883 struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
1884 {
1885 	struct proto *prot = READ_ONCE(sk->sk_prot);
1886 	struct sock *newsk;
1887 	bool is_charged = true;
1888 
1889 	newsk = sk_prot_alloc(prot, priority, sk->sk_family);
1890 	if (newsk != NULL) {
1891 		struct sk_filter *filter;
1892 
1893 		sock_copy(newsk, sk);
1894 
1895 		newsk->sk_prot_creator = prot;
1896 
1897 		/* SANITY */
1898 		if (likely(newsk->sk_net_refcnt))
1899 			get_net(sock_net(newsk));
1900 		sk_node_init(&newsk->sk_node);
1901 		sock_lock_init(newsk);
1902 		bh_lock_sock(newsk);
1903 		newsk->sk_backlog.head	= newsk->sk_backlog.tail = NULL;
1904 		newsk->sk_backlog.len = 0;
1905 
1906 		atomic_set(&newsk->sk_rmem_alloc, 0);
1907 		/*
1908 		 * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
1909 		 */
1910 		refcount_set(&newsk->sk_wmem_alloc, 1);
1911 		atomic_set(&newsk->sk_omem_alloc, 0);
1912 		sk_init_common(newsk);
1913 
1914 		newsk->sk_dst_cache	= NULL;
1915 		newsk->sk_dst_pending_confirm = 0;
1916 		newsk->sk_wmem_queued	= 0;
1917 		newsk->sk_forward_alloc = 0;
1918 		atomic_set(&newsk->sk_drops, 0);
1919 		newsk->sk_send_head	= NULL;
1920 		newsk->sk_userlocks	= sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
1921 		atomic_set(&newsk->sk_zckey, 0);
1922 
1923 		sock_reset_flag(newsk, SOCK_DONE);
1924 
1925 		/* sk->sk_memcg will be populated at accept() time */
1926 		newsk->sk_memcg = NULL;
1927 
1928 		cgroup_sk_alloc(&newsk->sk_cgrp_data);
1929 
1930 		rcu_read_lock();
1931 		filter = rcu_dereference(sk->sk_filter);
1932 		if (filter != NULL)
1933 			/* though it's an empty new sock, the charging may fail
1934 			 * if sysctl_optmem_max was changed between creation of
1935 			 * original socket and cloning
1936 			 */
1937 			is_charged = sk_filter_charge(newsk, filter);
1938 		RCU_INIT_POINTER(newsk->sk_filter, filter);
1939 		rcu_read_unlock();
1940 
1941 		if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
1942 			/* We need to make sure that we don't uncharge the new
1943 			 * socket if we couldn't charge it in the first place
1944 			 * as otherwise we uncharge the parent's filter.
1945 			 */
1946 			if (!is_charged)
1947 				RCU_INIT_POINTER(newsk->sk_filter, NULL);
1948 			sk_free_unlock_clone(newsk);
1949 			newsk = NULL;
1950 			goto out;
1951 		}
1952 		RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
1953 
1954 		if (bpf_sk_storage_clone(sk, newsk)) {
1955 			sk_free_unlock_clone(newsk);
1956 			newsk = NULL;
1957 			goto out;
1958 		}
1959 
1960 		/* Clear sk_user_data if parent had the pointer tagged
1961 		 * as not suitable for copying when cloning.
1962 		 */
1963 		if (sk_user_data_is_nocopy(newsk))
1964 			newsk->sk_user_data = NULL;
1965 
1966 		newsk->sk_err	   = 0;
1967 		newsk->sk_err_soft = 0;
1968 		newsk->sk_priority = 0;
1969 		newsk->sk_incoming_cpu = raw_smp_processor_id();
1970 		if (likely(newsk->sk_net_refcnt))
1971 			sock_inuse_add(sock_net(newsk), 1);
1972 
1973 		/*
1974 		 * Before updating sk_refcnt, we must commit prior changes to memory
1975 		 * (Documentation/RCU/rculist_nulls.txt for details)
1976 		 */
1977 		smp_wmb();
1978 		refcount_set(&newsk->sk_refcnt, 2);
1979 
1980 		/*
1981 		 * Increment the counter in the same struct proto as the master
1982 		 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
1983 		 * is the same as sk->sk_prot->socks, as this field was copied
1984 		 * with memcpy).
1985 		 *
1986 		 * This _changes_ the previous behaviour, where
1987 		 * tcp_create_openreq_child always was incrementing the
1988 		 * equivalent to tcp_prot->socks (inet_sock_nr), so this have
1989 		 * to be taken into account in all callers. -acme
1990 		 */
1991 		sk_refcnt_debug_inc(newsk);
1992 		sk_set_socket(newsk, NULL);
1993 		RCU_INIT_POINTER(newsk->sk_wq, NULL);
1994 
1995 		if (newsk->sk_prot->sockets_allocated)
1996 			sk_sockets_allocated_inc(newsk);
1997 
1998 		if (sock_needs_netstamp(sk) &&
1999 		    newsk->sk_flags & SK_FLAGS_TIMESTAMP)
2000 			net_enable_timestamp();
2001 	}
2002 out:
2003 	return newsk;
2004 }
2005 EXPORT_SYMBOL_GPL(sk_clone_lock);
2006 
2007 void sk_free_unlock_clone(struct sock *sk)
2008 {
2009 	/* It is still raw copy of parent, so invalidate
2010 	 * destructor and make plain sk_free() */
2011 	sk->sk_destruct = NULL;
2012 	bh_unlock_sock(sk);
2013 	sk_free(sk);
2014 }
2015 EXPORT_SYMBOL_GPL(sk_free_unlock_clone);
2016 
2017 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
2018 {
2019 	u32 max_segs = 1;
2020 
2021 	sk_dst_set(sk, dst);
2022 	sk->sk_route_caps = dst->dev->features | sk->sk_route_forced_caps;
2023 	if (sk->sk_route_caps & NETIF_F_GSO)
2024 		sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
2025 	sk->sk_route_caps &= ~sk->sk_route_nocaps;
2026 	if (sk_can_gso(sk)) {
2027 		if (dst->header_len && !xfrm_dst_offload_ok(dst)) {
2028 			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
2029 		} else {
2030 			sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
2031 			sk->sk_gso_max_size = dst->dev->gso_max_size;
2032 			max_segs = max_t(u32, dst->dev->gso_max_segs, 1);
2033 		}
2034 	}
2035 	sk->sk_gso_max_segs = max_segs;
2036 }
2037 EXPORT_SYMBOL_GPL(sk_setup_caps);
2038 
2039 /*
2040  *	Simple resource managers for sockets.
2041  */
2042 
2043 
2044 /*
2045  * Write buffer destructor automatically called from kfree_skb.
2046  */
2047 void sock_wfree(struct sk_buff *skb)
2048 {
2049 	struct sock *sk = skb->sk;
2050 	unsigned int len = skb->truesize;
2051 
2052 	if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
2053 		/*
2054 		 * Keep a reference on sk_wmem_alloc, this will be released
2055 		 * after sk_write_space() call
2056 		 */
2057 		WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
2058 		sk->sk_write_space(sk);
2059 		len = 1;
2060 	}
2061 	/*
2062 	 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
2063 	 * could not do because of in-flight packets
2064 	 */
2065 	if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
2066 		__sk_free(sk);
2067 }
2068 EXPORT_SYMBOL(sock_wfree);
2069 
2070 /* This variant of sock_wfree() is used by TCP,
2071  * since it sets SOCK_USE_WRITE_QUEUE.
2072  */
2073 void __sock_wfree(struct sk_buff *skb)
2074 {
2075 	struct sock *sk = skb->sk;
2076 
2077 	if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
2078 		__sk_free(sk);
2079 }
2080 
2081 void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
2082 {
2083 	skb_orphan(skb);
2084 	skb->sk = sk;
2085 #ifdef CONFIG_INET
2086 	if (unlikely(!sk_fullsock(sk))) {
2087 		skb->destructor = sock_edemux;
2088 		sock_hold(sk);
2089 		return;
2090 	}
2091 #endif
2092 	skb->destructor = sock_wfree;
2093 	skb_set_hash_from_sk(skb, sk);
2094 	/*
2095 	 * We used to take a refcount on sk, but following operation
2096 	 * is enough to guarantee sk_free() wont free this sock until
2097 	 * all in-flight packets are completed
2098 	 */
2099 	refcount_add(skb->truesize, &sk->sk_wmem_alloc);
2100 }
2101 EXPORT_SYMBOL(skb_set_owner_w);
2102 
2103 static bool can_skb_orphan_partial(const struct sk_buff *skb)
2104 {
2105 #ifdef CONFIG_TLS_DEVICE
2106 	/* Drivers depend on in-order delivery for crypto offload,
2107 	 * partial orphan breaks out-of-order-OK logic.
2108 	 */
2109 	if (skb->decrypted)
2110 		return false;
2111 #endif
2112 	return (skb->destructor == sock_wfree ||
2113 		(IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree));
2114 }
2115 
2116 /* This helper is used by netem, as it can hold packets in its
2117  * delay queue. We want to allow the owner socket to send more
2118  * packets, as if they were already TX completed by a typical driver.
2119  * But we also want to keep skb->sk set because some packet schedulers
2120  * rely on it (sch_fq for example).
2121  */
2122 void skb_orphan_partial(struct sk_buff *skb)
2123 {
2124 	if (skb_is_tcp_pure_ack(skb))
2125 		return;
2126 
2127 	if (can_skb_orphan_partial(skb)) {
2128 		struct sock *sk = skb->sk;
2129 
2130 		if (refcount_inc_not_zero(&sk->sk_refcnt)) {
2131 			WARN_ON(refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc));
2132 			skb->destructor = sock_efree;
2133 		}
2134 	} else {
2135 		skb_orphan(skb);
2136 	}
2137 }
2138 EXPORT_SYMBOL(skb_orphan_partial);
2139 
2140 /*
2141  * Read buffer destructor automatically called from kfree_skb.
2142  */
2143 void sock_rfree(struct sk_buff *skb)
2144 {
2145 	struct sock *sk = skb->sk;
2146 	unsigned int len = skb->truesize;
2147 
2148 	atomic_sub(len, &sk->sk_rmem_alloc);
2149 	sk_mem_uncharge(sk, len);
2150 }
2151 EXPORT_SYMBOL(sock_rfree);
2152 
2153 /*
2154  * Buffer destructor for skbs that are not used directly in read or write
2155  * path, e.g. for error handler skbs. Automatically called from kfree_skb.
2156  */
2157 void sock_efree(struct sk_buff *skb)
2158 {
2159 	sock_put(skb->sk);
2160 }
2161 EXPORT_SYMBOL(sock_efree);
2162 
2163 /* Buffer destructor for prefetch/receive path where reference count may
2164  * not be held, e.g. for listen sockets.
2165  */
2166 #ifdef CONFIG_INET
2167 void sock_pfree(struct sk_buff *skb)
2168 {
2169 	if (sk_is_refcounted(skb->sk))
2170 		sock_gen_put(skb->sk);
2171 }
2172 EXPORT_SYMBOL(sock_pfree);
2173 #endif /* CONFIG_INET */
2174 
2175 kuid_t sock_i_uid(struct sock *sk)
2176 {
2177 	kuid_t uid;
2178 
2179 	read_lock_bh(&sk->sk_callback_lock);
2180 	uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
2181 	read_unlock_bh(&sk->sk_callback_lock);
2182 	return uid;
2183 }
2184 EXPORT_SYMBOL(sock_i_uid);
2185 
2186 unsigned long sock_i_ino(struct sock *sk)
2187 {
2188 	unsigned long ino;
2189 
2190 	read_lock_bh(&sk->sk_callback_lock);
2191 	ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
2192 	read_unlock_bh(&sk->sk_callback_lock);
2193 	return ino;
2194 }
2195 EXPORT_SYMBOL(sock_i_ino);
2196 
2197 /*
2198  * Allocate a skb from the socket's send buffer.
2199  */
2200 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
2201 			     gfp_t priority)
2202 {
2203 	if (force ||
2204 	    refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) {
2205 		struct sk_buff *skb = alloc_skb(size, priority);
2206 
2207 		if (skb) {
2208 			skb_set_owner_w(skb, sk);
2209 			return skb;
2210 		}
2211 	}
2212 	return NULL;
2213 }
2214 EXPORT_SYMBOL(sock_wmalloc);
2215 
2216 static void sock_ofree(struct sk_buff *skb)
2217 {
2218 	struct sock *sk = skb->sk;
2219 
2220 	atomic_sub(skb->truesize, &sk->sk_omem_alloc);
2221 }
2222 
2223 struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
2224 			     gfp_t priority)
2225 {
2226 	struct sk_buff *skb;
2227 
2228 	/* small safe race: SKB_TRUESIZE may differ from final skb->truesize */
2229 	if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) >
2230 	    sysctl_optmem_max)
2231 		return NULL;
2232 
2233 	skb = alloc_skb(size, priority);
2234 	if (!skb)
2235 		return NULL;
2236 
2237 	atomic_add(skb->truesize, &sk->sk_omem_alloc);
2238 	skb->sk = sk;
2239 	skb->destructor = sock_ofree;
2240 	return skb;
2241 }
2242 
2243 /*
2244  * Allocate a memory block from the socket's option memory buffer.
2245  */
2246 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
2247 {
2248 	if ((unsigned int)size <= sysctl_optmem_max &&
2249 	    atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
2250 		void *mem;
2251 		/* First do the add, to avoid the race if kmalloc
2252 		 * might sleep.
2253 		 */
2254 		atomic_add(size, &sk->sk_omem_alloc);
2255 		mem = kmalloc(size, priority);
2256 		if (mem)
2257 			return mem;
2258 		atomic_sub(size, &sk->sk_omem_alloc);
2259 	}
2260 	return NULL;
2261 }
2262 EXPORT_SYMBOL(sock_kmalloc);
2263 
2264 /* Free an option memory block. Note, we actually want the inline
2265  * here as this allows gcc to detect the nullify and fold away the
2266  * condition entirely.
2267  */
2268 static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
2269 				  const bool nullify)
2270 {
2271 	if (WARN_ON_ONCE(!mem))
2272 		return;
2273 	if (nullify)
2274 		kzfree(mem);
2275 	else
2276 		kfree(mem);
2277 	atomic_sub(size, &sk->sk_omem_alloc);
2278 }
2279 
2280 void sock_kfree_s(struct sock *sk, void *mem, int size)
2281 {
2282 	__sock_kfree_s(sk, mem, size, false);
2283 }
2284 EXPORT_SYMBOL(sock_kfree_s);
2285 
2286 void sock_kzfree_s(struct sock *sk, void *mem, int size)
2287 {
2288 	__sock_kfree_s(sk, mem, size, true);
2289 }
2290 EXPORT_SYMBOL(sock_kzfree_s);
2291 
2292 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
2293    I think, these locks should be removed for datagram sockets.
2294  */
2295 static long sock_wait_for_wmem(struct sock *sk, long timeo)
2296 {
2297 	DEFINE_WAIT(wait);
2298 
2299 	sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2300 	for (;;) {
2301 		if (!timeo)
2302 			break;
2303 		if (signal_pending(current))
2304 			break;
2305 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2306 		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
2307 		if (refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf))
2308 			break;
2309 		if (sk->sk_shutdown & SEND_SHUTDOWN)
2310 			break;
2311 		if (sk->sk_err)
2312 			break;
2313 		timeo = schedule_timeout(timeo);
2314 	}
2315 	finish_wait(sk_sleep(sk), &wait);
2316 	return timeo;
2317 }
2318 
2319 
2320 /*
2321  *	Generic send/receive buffer handlers
2322  */
2323 
2324 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
2325 				     unsigned long data_len, int noblock,
2326 				     int *errcode, int max_page_order)
2327 {
2328 	struct sk_buff *skb;
2329 	long timeo;
2330 	int err;
2331 
2332 	timeo = sock_sndtimeo(sk, noblock);
2333 	for (;;) {
2334 		err = sock_error(sk);
2335 		if (err != 0)
2336 			goto failure;
2337 
2338 		err = -EPIPE;
2339 		if (sk->sk_shutdown & SEND_SHUTDOWN)
2340 			goto failure;
2341 
2342 		if (sk_wmem_alloc_get(sk) < READ_ONCE(sk->sk_sndbuf))
2343 			break;
2344 
2345 		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2346 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2347 		err = -EAGAIN;
2348 		if (!timeo)
2349 			goto failure;
2350 		if (signal_pending(current))
2351 			goto interrupted;
2352 		timeo = sock_wait_for_wmem(sk, timeo);
2353 	}
2354 	skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
2355 				   errcode, sk->sk_allocation);
2356 	if (skb)
2357 		skb_set_owner_w(skb, sk);
2358 	return skb;
2359 
2360 interrupted:
2361 	err = sock_intr_errno(timeo);
2362 failure:
2363 	*errcode = err;
2364 	return NULL;
2365 }
2366 EXPORT_SYMBOL(sock_alloc_send_pskb);
2367 
2368 struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
2369 				    int noblock, int *errcode)
2370 {
2371 	return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0);
2372 }
2373 EXPORT_SYMBOL(sock_alloc_send_skb);
2374 
2375 int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg,
2376 		     struct sockcm_cookie *sockc)
2377 {
2378 	u32 tsflags;
2379 
2380 	switch (cmsg->cmsg_type) {
2381 	case SO_MARK:
2382 		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2383 			return -EPERM;
2384 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2385 			return -EINVAL;
2386 		sockc->mark = *(u32 *)CMSG_DATA(cmsg);
2387 		break;
2388 	case SO_TIMESTAMPING_OLD:
2389 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2390 			return -EINVAL;
2391 
2392 		tsflags = *(u32 *)CMSG_DATA(cmsg);
2393 		if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
2394 			return -EINVAL;
2395 
2396 		sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
2397 		sockc->tsflags |= tsflags;
2398 		break;
2399 	case SCM_TXTIME:
2400 		if (!sock_flag(sk, SOCK_TXTIME))
2401 			return -EINVAL;
2402 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64)))
2403 			return -EINVAL;
2404 		sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg));
2405 		break;
2406 	/* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
2407 	case SCM_RIGHTS:
2408 	case SCM_CREDENTIALS:
2409 		break;
2410 	default:
2411 		return -EINVAL;
2412 	}
2413 	return 0;
2414 }
2415 EXPORT_SYMBOL(__sock_cmsg_send);
2416 
2417 int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
2418 		   struct sockcm_cookie *sockc)
2419 {
2420 	struct cmsghdr *cmsg;
2421 	int ret;
2422 
2423 	for_each_cmsghdr(cmsg, msg) {
2424 		if (!CMSG_OK(msg, cmsg))
2425 			return -EINVAL;
2426 		if (cmsg->cmsg_level != SOL_SOCKET)
2427 			continue;
2428 		ret = __sock_cmsg_send(sk, msg, cmsg, sockc);
2429 		if (ret)
2430 			return ret;
2431 	}
2432 	return 0;
2433 }
2434 EXPORT_SYMBOL(sock_cmsg_send);
2435 
2436 static void sk_enter_memory_pressure(struct sock *sk)
2437 {
2438 	if (!sk->sk_prot->enter_memory_pressure)
2439 		return;
2440 
2441 	sk->sk_prot->enter_memory_pressure(sk);
2442 }
2443 
2444 static void sk_leave_memory_pressure(struct sock *sk)
2445 {
2446 	if (sk->sk_prot->leave_memory_pressure) {
2447 		sk->sk_prot->leave_memory_pressure(sk);
2448 	} else {
2449 		unsigned long *memory_pressure = sk->sk_prot->memory_pressure;
2450 
2451 		if (memory_pressure && READ_ONCE(*memory_pressure))
2452 			WRITE_ONCE(*memory_pressure, 0);
2453 	}
2454 }
2455 
2456 #define SKB_FRAG_PAGE_ORDER	get_order(32768)
2457 DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key);
2458 
2459 /**
2460  * skb_page_frag_refill - check that a page_frag contains enough room
2461  * @sz: minimum size of the fragment we want to get
2462  * @pfrag: pointer to page_frag
2463  * @gfp: priority for memory allocation
2464  *
2465  * Note: While this allocator tries to use high order pages, there is
2466  * no guarantee that allocations succeed. Therefore, @sz MUST be
2467  * less or equal than PAGE_SIZE.
2468  */
2469 bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
2470 {
2471 	if (pfrag->page) {
2472 		if (page_ref_count(pfrag->page) == 1) {
2473 			pfrag->offset = 0;
2474 			return true;
2475 		}
2476 		if (pfrag->offset + sz <= pfrag->size)
2477 			return true;
2478 		put_page(pfrag->page);
2479 	}
2480 
2481 	pfrag->offset = 0;
2482 	if (SKB_FRAG_PAGE_ORDER &&
2483 	    !static_branch_unlikely(&net_high_order_alloc_disable_key)) {
2484 		/* Avoid direct reclaim but allow kswapd to wake */
2485 		pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
2486 					  __GFP_COMP | __GFP_NOWARN |
2487 					  __GFP_NORETRY,
2488 					  SKB_FRAG_PAGE_ORDER);
2489 		if (likely(pfrag->page)) {
2490 			pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
2491 			return true;
2492 		}
2493 	}
2494 	pfrag->page = alloc_page(gfp);
2495 	if (likely(pfrag->page)) {
2496 		pfrag->size = PAGE_SIZE;
2497 		return true;
2498 	}
2499 	return false;
2500 }
2501 EXPORT_SYMBOL(skb_page_frag_refill);
2502 
2503 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
2504 {
2505 	if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
2506 		return true;
2507 
2508 	sk_enter_memory_pressure(sk);
2509 	sk_stream_moderate_sndbuf(sk);
2510 	return false;
2511 }
2512 EXPORT_SYMBOL(sk_page_frag_refill);
2513 
2514 static void __lock_sock(struct sock *sk)
2515 	__releases(&sk->sk_lock.slock)
2516 	__acquires(&sk->sk_lock.slock)
2517 {
2518 	DEFINE_WAIT(wait);
2519 
2520 	for (;;) {
2521 		prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
2522 					TASK_UNINTERRUPTIBLE);
2523 		spin_unlock_bh(&sk->sk_lock.slock);
2524 		schedule();
2525 		spin_lock_bh(&sk->sk_lock.slock);
2526 		if (!sock_owned_by_user(sk))
2527 			break;
2528 	}
2529 	finish_wait(&sk->sk_lock.wq, &wait);
2530 }
2531 
2532 void __release_sock(struct sock *sk)
2533 	__releases(&sk->sk_lock.slock)
2534 	__acquires(&sk->sk_lock.slock)
2535 {
2536 	struct sk_buff *skb, *next;
2537 
2538 	while ((skb = sk->sk_backlog.head) != NULL) {
2539 		sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
2540 
2541 		spin_unlock_bh(&sk->sk_lock.slock);
2542 
2543 		do {
2544 			next = skb->next;
2545 			prefetch(next);
2546 			WARN_ON_ONCE(skb_dst_is_noref(skb));
2547 			skb_mark_not_on_list(skb);
2548 			sk_backlog_rcv(sk, skb);
2549 
2550 			cond_resched();
2551 
2552 			skb = next;
2553 		} while (skb != NULL);
2554 
2555 		spin_lock_bh(&sk->sk_lock.slock);
2556 	}
2557 
2558 	/*
2559 	 * Doing the zeroing here guarantee we can not loop forever
2560 	 * while a wild producer attempts to flood us.
2561 	 */
2562 	sk->sk_backlog.len = 0;
2563 }
2564 
2565 void __sk_flush_backlog(struct sock *sk)
2566 {
2567 	spin_lock_bh(&sk->sk_lock.slock);
2568 	__release_sock(sk);
2569 	spin_unlock_bh(&sk->sk_lock.slock);
2570 }
2571 
2572 /**
2573  * sk_wait_data - wait for data to arrive at sk_receive_queue
2574  * @sk:    sock to wait on
2575  * @timeo: for how long
2576  * @skb:   last skb seen on sk_receive_queue
2577  *
2578  * Now socket state including sk->sk_err is changed only under lock,
2579  * hence we may omit checks after joining wait queue.
2580  * We check receive queue before schedule() only as optimization;
2581  * it is very likely that release_sock() added new data.
2582  */
2583 int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
2584 {
2585 	DEFINE_WAIT_FUNC(wait, woken_wake_function);
2586 	int rc;
2587 
2588 	add_wait_queue(sk_sleep(sk), &wait);
2589 	sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2590 	rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
2591 	sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2592 	remove_wait_queue(sk_sleep(sk), &wait);
2593 	return rc;
2594 }
2595 EXPORT_SYMBOL(sk_wait_data);
2596 
2597 /**
2598  *	__sk_mem_raise_allocated - increase memory_allocated
2599  *	@sk: socket
2600  *	@size: memory size to allocate
2601  *	@amt: pages to allocate
2602  *	@kind: allocation type
2603  *
2604  *	Similar to __sk_mem_schedule(), but does not update sk_forward_alloc
2605  */
2606 int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
2607 {
2608 	struct proto *prot = sk->sk_prot;
2609 	long allocated = sk_memory_allocated_add(sk, amt);
2610 	bool charged = true;
2611 
2612 	if (mem_cgroup_sockets_enabled && sk->sk_memcg &&
2613 	    !(charged = mem_cgroup_charge_skmem(sk->sk_memcg, amt)))
2614 		goto suppress_allocation;
2615 
2616 	/* Under limit. */
2617 	if (allocated <= sk_prot_mem_limits(sk, 0)) {
2618 		sk_leave_memory_pressure(sk);
2619 		return 1;
2620 	}
2621 
2622 	/* Under pressure. */
2623 	if (allocated > sk_prot_mem_limits(sk, 1))
2624 		sk_enter_memory_pressure(sk);
2625 
2626 	/* Over hard limit. */
2627 	if (allocated > sk_prot_mem_limits(sk, 2))
2628 		goto suppress_allocation;
2629 
2630 	/* guarantee minimum buffer size under pressure */
2631 	if (kind == SK_MEM_RECV) {
2632 		if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot))
2633 			return 1;
2634 
2635 	} else { /* SK_MEM_SEND */
2636 		int wmem0 = sk_get_wmem0(sk, prot);
2637 
2638 		if (sk->sk_type == SOCK_STREAM) {
2639 			if (sk->sk_wmem_queued < wmem0)
2640 				return 1;
2641 		} else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) {
2642 				return 1;
2643 		}
2644 	}
2645 
2646 	if (sk_has_memory_pressure(sk)) {
2647 		u64 alloc;
2648 
2649 		if (!sk_under_memory_pressure(sk))
2650 			return 1;
2651 		alloc = sk_sockets_allocated_read_positive(sk);
2652 		if (sk_prot_mem_limits(sk, 2) > alloc *
2653 		    sk_mem_pages(sk->sk_wmem_queued +
2654 				 atomic_read(&sk->sk_rmem_alloc) +
2655 				 sk->sk_forward_alloc))
2656 			return 1;
2657 	}
2658 
2659 suppress_allocation:
2660 
2661 	if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
2662 		sk_stream_moderate_sndbuf(sk);
2663 
2664 		/* Fail only if socket is _under_ its sndbuf.
2665 		 * In this case we cannot block, so that we have to fail.
2666 		 */
2667 		if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
2668 			return 1;
2669 	}
2670 
2671 	if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged))
2672 		trace_sock_exceed_buf_limit(sk, prot, allocated, kind);
2673 
2674 	sk_memory_allocated_sub(sk, amt);
2675 
2676 	if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2677 		mem_cgroup_uncharge_skmem(sk->sk_memcg, amt);
2678 
2679 	return 0;
2680 }
2681 EXPORT_SYMBOL(__sk_mem_raise_allocated);
2682 
2683 /**
2684  *	__sk_mem_schedule - increase sk_forward_alloc and memory_allocated
2685  *	@sk: socket
2686  *	@size: memory size to allocate
2687  *	@kind: allocation type
2688  *
2689  *	If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
2690  *	rmem allocation. This function assumes that protocols which have
2691  *	memory_pressure use sk_wmem_queued as write buffer accounting.
2692  */
2693 int __sk_mem_schedule(struct sock *sk, int size, int kind)
2694 {
2695 	int ret, amt = sk_mem_pages(size);
2696 
2697 	sk->sk_forward_alloc += amt << SK_MEM_QUANTUM_SHIFT;
2698 	ret = __sk_mem_raise_allocated(sk, size, amt, kind);
2699 	if (!ret)
2700 		sk->sk_forward_alloc -= amt << SK_MEM_QUANTUM_SHIFT;
2701 	return ret;
2702 }
2703 EXPORT_SYMBOL(__sk_mem_schedule);
2704 
2705 /**
2706  *	__sk_mem_reduce_allocated - reclaim memory_allocated
2707  *	@sk: socket
2708  *	@amount: number of quanta
2709  *
2710  *	Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
2711  */
2712 void __sk_mem_reduce_allocated(struct sock *sk, int amount)
2713 {
2714 	sk_memory_allocated_sub(sk, amount);
2715 
2716 	if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2717 		mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
2718 
2719 	if (sk_under_memory_pressure(sk) &&
2720 	    (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
2721 		sk_leave_memory_pressure(sk);
2722 }
2723 EXPORT_SYMBOL(__sk_mem_reduce_allocated);
2724 
2725 /**
2726  *	__sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
2727  *	@sk: socket
2728  *	@amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple)
2729  */
2730 void __sk_mem_reclaim(struct sock *sk, int amount)
2731 {
2732 	amount >>= SK_MEM_QUANTUM_SHIFT;
2733 	sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
2734 	__sk_mem_reduce_allocated(sk, amount);
2735 }
2736 EXPORT_SYMBOL(__sk_mem_reclaim);
2737 
2738 int sk_set_peek_off(struct sock *sk, int val)
2739 {
2740 	sk->sk_peek_off = val;
2741 	return 0;
2742 }
2743 EXPORT_SYMBOL_GPL(sk_set_peek_off);
2744 
2745 /*
2746  * Set of default routines for initialising struct proto_ops when
2747  * the protocol does not support a particular function. In certain
2748  * cases where it makes no sense for a protocol to have a "do nothing"
2749  * function, some default processing is provided.
2750  */
2751 
2752 int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
2753 {
2754 	return -EOPNOTSUPP;
2755 }
2756 EXPORT_SYMBOL(sock_no_bind);
2757 
2758 int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
2759 		    int len, int flags)
2760 {
2761 	return -EOPNOTSUPP;
2762 }
2763 EXPORT_SYMBOL(sock_no_connect);
2764 
2765 int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
2766 {
2767 	return -EOPNOTSUPP;
2768 }
2769 EXPORT_SYMBOL(sock_no_socketpair);
2770 
2771 int sock_no_accept(struct socket *sock, struct socket *newsock, int flags,
2772 		   bool kern)
2773 {
2774 	return -EOPNOTSUPP;
2775 }
2776 EXPORT_SYMBOL(sock_no_accept);
2777 
2778 int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
2779 		    int peer)
2780 {
2781 	return -EOPNOTSUPP;
2782 }
2783 EXPORT_SYMBOL(sock_no_getname);
2784 
2785 int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2786 {
2787 	return -EOPNOTSUPP;
2788 }
2789 EXPORT_SYMBOL(sock_no_ioctl);
2790 
2791 int sock_no_listen(struct socket *sock, int backlog)
2792 {
2793 	return -EOPNOTSUPP;
2794 }
2795 EXPORT_SYMBOL(sock_no_listen);
2796 
2797 int sock_no_shutdown(struct socket *sock, int how)
2798 {
2799 	return -EOPNOTSUPP;
2800 }
2801 EXPORT_SYMBOL(sock_no_shutdown);
2802 
2803 int sock_no_setsockopt(struct socket *sock, int level, int optname,
2804 		    char __user *optval, unsigned int optlen)
2805 {
2806 	return -EOPNOTSUPP;
2807 }
2808 EXPORT_SYMBOL(sock_no_setsockopt);
2809 
2810 int sock_no_getsockopt(struct socket *sock, int level, int optname,
2811 		    char __user *optval, int __user *optlen)
2812 {
2813 	return -EOPNOTSUPP;
2814 }
2815 EXPORT_SYMBOL(sock_no_getsockopt);
2816 
2817 int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
2818 {
2819 	return -EOPNOTSUPP;
2820 }
2821 EXPORT_SYMBOL(sock_no_sendmsg);
2822 
2823 int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len)
2824 {
2825 	return -EOPNOTSUPP;
2826 }
2827 EXPORT_SYMBOL(sock_no_sendmsg_locked);
2828 
2829 int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
2830 		    int flags)
2831 {
2832 	return -EOPNOTSUPP;
2833 }
2834 EXPORT_SYMBOL(sock_no_recvmsg);
2835 
2836 int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
2837 {
2838 	/* Mirror missing mmap method error code */
2839 	return -ENODEV;
2840 }
2841 EXPORT_SYMBOL(sock_no_mmap);
2842 
2843 ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
2844 {
2845 	ssize_t res;
2846 	struct msghdr msg = {.msg_flags = flags};
2847 	struct kvec iov;
2848 	char *kaddr = kmap(page);
2849 	iov.iov_base = kaddr + offset;
2850 	iov.iov_len = size;
2851 	res = kernel_sendmsg(sock, &msg, &iov, 1, size);
2852 	kunmap(page);
2853 	return res;
2854 }
2855 EXPORT_SYMBOL(sock_no_sendpage);
2856 
2857 ssize_t sock_no_sendpage_locked(struct sock *sk, struct page *page,
2858 				int offset, size_t size, int flags)
2859 {
2860 	ssize_t res;
2861 	struct msghdr msg = {.msg_flags = flags};
2862 	struct kvec iov;
2863 	char *kaddr = kmap(page);
2864 
2865 	iov.iov_base = kaddr + offset;
2866 	iov.iov_len = size;
2867 	res = kernel_sendmsg_locked(sk, &msg, &iov, 1, size);
2868 	kunmap(page);
2869 	return res;
2870 }
2871 EXPORT_SYMBOL(sock_no_sendpage_locked);
2872 
2873 /*
2874  *	Default Socket Callbacks
2875  */
2876 
2877 static void sock_def_wakeup(struct sock *sk)
2878 {
2879 	struct socket_wq *wq;
2880 
2881 	rcu_read_lock();
2882 	wq = rcu_dereference(sk->sk_wq);
2883 	if (skwq_has_sleeper(wq))
2884 		wake_up_interruptible_all(&wq->wait);
2885 	rcu_read_unlock();
2886 }
2887 
2888 static void sock_def_error_report(struct sock *sk)
2889 {
2890 	struct socket_wq *wq;
2891 
2892 	rcu_read_lock();
2893 	wq = rcu_dereference(sk->sk_wq);
2894 	if (skwq_has_sleeper(wq))
2895 		wake_up_interruptible_poll(&wq->wait, EPOLLERR);
2896 	sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
2897 	rcu_read_unlock();
2898 }
2899 
2900 void sock_def_readable(struct sock *sk)
2901 {
2902 	struct socket_wq *wq;
2903 
2904 	rcu_read_lock();
2905 	wq = rcu_dereference(sk->sk_wq);
2906 	if (skwq_has_sleeper(wq))
2907 		wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
2908 						EPOLLRDNORM | EPOLLRDBAND);
2909 	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
2910 	rcu_read_unlock();
2911 }
2912 
2913 static void sock_def_write_space(struct sock *sk)
2914 {
2915 	struct socket_wq *wq;
2916 
2917 	rcu_read_lock();
2918 
2919 	/* Do not wake up a writer until he can make "significant"
2920 	 * progress.  --DaveM
2921 	 */
2922 	if ((refcount_read(&sk->sk_wmem_alloc) << 1) <= READ_ONCE(sk->sk_sndbuf)) {
2923 		wq = rcu_dereference(sk->sk_wq);
2924 		if (skwq_has_sleeper(wq))
2925 			wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
2926 						EPOLLWRNORM | EPOLLWRBAND);
2927 
2928 		/* Should agree with poll, otherwise some programs break */
2929 		if (sock_writeable(sk))
2930 			sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
2931 	}
2932 
2933 	rcu_read_unlock();
2934 }
2935 
2936 static void sock_def_destruct(struct sock *sk)
2937 {
2938 }
2939 
2940 void sk_send_sigurg(struct sock *sk)
2941 {
2942 	if (sk->sk_socket && sk->sk_socket->file)
2943 		if (send_sigurg(&sk->sk_socket->file->f_owner))
2944 			sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
2945 }
2946 EXPORT_SYMBOL(sk_send_sigurg);
2947 
2948 void sk_reset_timer(struct sock *sk, struct timer_list* timer,
2949 		    unsigned long expires)
2950 {
2951 	if (!mod_timer(timer, expires))
2952 		sock_hold(sk);
2953 }
2954 EXPORT_SYMBOL(sk_reset_timer);
2955 
2956 void sk_stop_timer(struct sock *sk, struct timer_list* timer)
2957 {
2958 	if (del_timer(timer))
2959 		__sock_put(sk);
2960 }
2961 EXPORT_SYMBOL(sk_stop_timer);
2962 
2963 void sock_init_data(struct socket *sock, struct sock *sk)
2964 {
2965 	sk_init_common(sk);
2966 	sk->sk_send_head	=	NULL;
2967 
2968 	timer_setup(&sk->sk_timer, NULL, 0);
2969 
2970 	sk->sk_allocation	=	GFP_KERNEL;
2971 	sk->sk_rcvbuf		=	sysctl_rmem_default;
2972 	sk->sk_sndbuf		=	sysctl_wmem_default;
2973 	sk->sk_state		=	TCP_CLOSE;
2974 	sk_set_socket(sk, sock);
2975 
2976 	sock_set_flag(sk, SOCK_ZAPPED);
2977 
2978 	if (sock) {
2979 		sk->sk_type	=	sock->type;
2980 		RCU_INIT_POINTER(sk->sk_wq, &sock->wq);
2981 		sock->sk	=	sk;
2982 		sk->sk_uid	=	SOCK_INODE(sock)->i_uid;
2983 	} else {
2984 		RCU_INIT_POINTER(sk->sk_wq, NULL);
2985 		sk->sk_uid	=	make_kuid(sock_net(sk)->user_ns, 0);
2986 	}
2987 
2988 	rwlock_init(&sk->sk_callback_lock);
2989 	if (sk->sk_kern_sock)
2990 		lockdep_set_class_and_name(
2991 			&sk->sk_callback_lock,
2992 			af_kern_callback_keys + sk->sk_family,
2993 			af_family_kern_clock_key_strings[sk->sk_family]);
2994 	else
2995 		lockdep_set_class_and_name(
2996 			&sk->sk_callback_lock,
2997 			af_callback_keys + sk->sk_family,
2998 			af_family_clock_key_strings[sk->sk_family]);
2999 
3000 	sk->sk_state_change	=	sock_def_wakeup;
3001 	sk->sk_data_ready	=	sock_def_readable;
3002 	sk->sk_write_space	=	sock_def_write_space;
3003 	sk->sk_error_report	=	sock_def_error_report;
3004 	sk->sk_destruct		=	sock_def_destruct;
3005 
3006 	sk->sk_frag.page	=	NULL;
3007 	sk->sk_frag.offset	=	0;
3008 	sk->sk_peek_off		=	-1;
3009 
3010 	sk->sk_peer_pid 	=	NULL;
3011 	sk->sk_peer_cred	=	NULL;
3012 	sk->sk_write_pending	=	0;
3013 	sk->sk_rcvlowat		=	1;
3014 	sk->sk_rcvtimeo		=	MAX_SCHEDULE_TIMEOUT;
3015 	sk->sk_sndtimeo		=	MAX_SCHEDULE_TIMEOUT;
3016 
3017 	sk->sk_stamp = SK_DEFAULT_STAMP;
3018 #if BITS_PER_LONG==32
3019 	seqlock_init(&sk->sk_stamp_seq);
3020 #endif
3021 	atomic_set(&sk->sk_zckey, 0);
3022 
3023 #ifdef CONFIG_NET_RX_BUSY_POLL
3024 	sk->sk_napi_id		=	0;
3025 	sk->sk_ll_usec		=	sysctl_net_busy_read;
3026 #endif
3027 
3028 	sk->sk_max_pacing_rate = ~0UL;
3029 	sk->sk_pacing_rate = ~0UL;
3030 	WRITE_ONCE(sk->sk_pacing_shift, 10);
3031 	sk->sk_incoming_cpu = -1;
3032 
3033 	sk_rx_queue_clear(sk);
3034 	/*
3035 	 * Before updating sk_refcnt, we must commit prior changes to memory
3036 	 * (Documentation/RCU/rculist_nulls.txt for details)
3037 	 */
3038 	smp_wmb();
3039 	refcount_set(&sk->sk_refcnt, 1);
3040 	atomic_set(&sk->sk_drops, 0);
3041 }
3042 EXPORT_SYMBOL(sock_init_data);
3043 
3044 void lock_sock_nested(struct sock *sk, int subclass)
3045 {
3046 	might_sleep();
3047 	spin_lock_bh(&sk->sk_lock.slock);
3048 	if (sk->sk_lock.owned)
3049 		__lock_sock(sk);
3050 	sk->sk_lock.owned = 1;
3051 	spin_unlock(&sk->sk_lock.slock);
3052 	/*
3053 	 * The sk_lock has mutex_lock() semantics here:
3054 	 */
3055 	mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
3056 	local_bh_enable();
3057 }
3058 EXPORT_SYMBOL(lock_sock_nested);
3059 
3060 void release_sock(struct sock *sk)
3061 {
3062 	spin_lock_bh(&sk->sk_lock.slock);
3063 	if (sk->sk_backlog.tail)
3064 		__release_sock(sk);
3065 
3066 	/* Warning : release_cb() might need to release sk ownership,
3067 	 * ie call sock_release_ownership(sk) before us.
3068 	 */
3069 	if (sk->sk_prot->release_cb)
3070 		sk->sk_prot->release_cb(sk);
3071 
3072 	sock_release_ownership(sk);
3073 	if (waitqueue_active(&sk->sk_lock.wq))
3074 		wake_up(&sk->sk_lock.wq);
3075 	spin_unlock_bh(&sk->sk_lock.slock);
3076 }
3077 EXPORT_SYMBOL(release_sock);
3078 
3079 /**
3080  * lock_sock_fast - fast version of lock_sock
3081  * @sk: socket
3082  *
3083  * This version should be used for very small section, where process wont block
3084  * return false if fast path is taken:
3085  *
3086  *   sk_lock.slock locked, owned = 0, BH disabled
3087  *
3088  * return true if slow path is taken:
3089  *
3090  *   sk_lock.slock unlocked, owned = 1, BH enabled
3091  */
3092 bool lock_sock_fast(struct sock *sk)
3093 {
3094 	might_sleep();
3095 	spin_lock_bh(&sk->sk_lock.slock);
3096 
3097 	if (!sk->sk_lock.owned)
3098 		/*
3099 		 * Note : We must disable BH
3100 		 */
3101 		return false;
3102 
3103 	__lock_sock(sk);
3104 	sk->sk_lock.owned = 1;
3105 	spin_unlock(&sk->sk_lock.slock);
3106 	/*
3107 	 * The sk_lock has mutex_lock() semantics here:
3108 	 */
3109 	mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
3110 	local_bh_enable();
3111 	return true;
3112 }
3113 EXPORT_SYMBOL(lock_sock_fast);
3114 
3115 int sock_gettstamp(struct socket *sock, void __user *userstamp,
3116 		   bool timeval, bool time32)
3117 {
3118 	struct sock *sk = sock->sk;
3119 	struct timespec64 ts;
3120 
3121 	sock_enable_timestamp(sk, SOCK_TIMESTAMP);
3122 	ts = ktime_to_timespec64(sock_read_timestamp(sk));
3123 	if (ts.tv_sec == -1)
3124 		return -ENOENT;
3125 	if (ts.tv_sec == 0) {
3126 		ktime_t kt = ktime_get_real();
3127 		sock_write_timestamp(sk, kt);
3128 		ts = ktime_to_timespec64(kt);
3129 	}
3130 
3131 	if (timeval)
3132 		ts.tv_nsec /= 1000;
3133 
3134 #ifdef CONFIG_COMPAT_32BIT_TIME
3135 	if (time32)
3136 		return put_old_timespec32(&ts, userstamp);
3137 #endif
3138 #ifdef CONFIG_SPARC64
3139 	/* beware of padding in sparc64 timeval */
3140 	if (timeval && !in_compat_syscall()) {
3141 		struct __kernel_old_timeval __user tv = {
3142 			.tv_sec = ts.tv_sec,
3143 			.tv_usec = ts.tv_nsec,
3144 		};
3145 		if (copy_to_user(userstamp, &tv, sizeof(tv)))
3146 			return -EFAULT;
3147 		return 0;
3148 	}
3149 #endif
3150 	return put_timespec64(&ts, userstamp);
3151 }
3152 EXPORT_SYMBOL(sock_gettstamp);
3153 
3154 void sock_enable_timestamp(struct sock *sk, enum sock_flags flag)
3155 {
3156 	if (!sock_flag(sk, flag)) {
3157 		unsigned long previous_flags = sk->sk_flags;
3158 
3159 		sock_set_flag(sk, flag);
3160 		/*
3161 		 * we just set one of the two flags which require net
3162 		 * time stamping, but time stamping might have been on
3163 		 * already because of the other one
3164 		 */
3165 		if (sock_needs_netstamp(sk) &&
3166 		    !(previous_flags & SK_FLAGS_TIMESTAMP))
3167 			net_enable_timestamp();
3168 	}
3169 }
3170 
3171 int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
3172 		       int level, int type)
3173 {
3174 	struct sock_exterr_skb *serr;
3175 	struct sk_buff *skb;
3176 	int copied, err;
3177 
3178 	err = -EAGAIN;
3179 	skb = sock_dequeue_err_skb(sk);
3180 	if (skb == NULL)
3181 		goto out;
3182 
3183 	copied = skb->len;
3184 	if (copied > len) {
3185 		msg->msg_flags |= MSG_TRUNC;
3186 		copied = len;
3187 	}
3188 	err = skb_copy_datagram_msg(skb, 0, msg, copied);
3189 	if (err)
3190 		goto out_free_skb;
3191 
3192 	sock_recv_timestamp(msg, sk, skb);
3193 
3194 	serr = SKB_EXT_ERR(skb);
3195 	put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
3196 
3197 	msg->msg_flags |= MSG_ERRQUEUE;
3198 	err = copied;
3199 
3200 out_free_skb:
3201 	kfree_skb(skb);
3202 out:
3203 	return err;
3204 }
3205 EXPORT_SYMBOL(sock_recv_errqueue);
3206 
3207 /*
3208  *	Get a socket option on an socket.
3209  *
3210  *	FIX: POSIX 1003.1g is very ambiguous here. It states that
3211  *	asynchronous errors should be reported by getsockopt. We assume
3212  *	this means if you specify SO_ERROR (otherwise whats the point of it).
3213  */
3214 int sock_common_getsockopt(struct socket *sock, int level, int optname,
3215 			   char __user *optval, int __user *optlen)
3216 {
3217 	struct sock *sk = sock->sk;
3218 
3219 	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
3220 }
3221 EXPORT_SYMBOL(sock_common_getsockopt);
3222 
3223 #ifdef CONFIG_COMPAT
3224 int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
3225 				  char __user *optval, int __user *optlen)
3226 {
3227 	struct sock *sk = sock->sk;
3228 
3229 	if (sk->sk_prot->compat_getsockopt != NULL)
3230 		return sk->sk_prot->compat_getsockopt(sk, level, optname,
3231 						      optval, optlen);
3232 	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
3233 }
3234 EXPORT_SYMBOL(compat_sock_common_getsockopt);
3235 #endif
3236 
3237 int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
3238 			int flags)
3239 {
3240 	struct sock *sk = sock->sk;
3241 	int addr_len = 0;
3242 	int err;
3243 
3244 	err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
3245 				   flags & ~MSG_DONTWAIT, &addr_len);
3246 	if (err >= 0)
3247 		msg->msg_namelen = addr_len;
3248 	return err;
3249 }
3250 EXPORT_SYMBOL(sock_common_recvmsg);
3251 
3252 /*
3253  *	Set socket options on an inet socket.
3254  */
3255 int sock_common_setsockopt(struct socket *sock, int level, int optname,
3256 			   char __user *optval, unsigned int optlen)
3257 {
3258 	struct sock *sk = sock->sk;
3259 
3260 	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
3261 }
3262 EXPORT_SYMBOL(sock_common_setsockopt);
3263 
3264 #ifdef CONFIG_COMPAT
3265 int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
3266 				  char __user *optval, unsigned int optlen)
3267 {
3268 	struct sock *sk = sock->sk;
3269 
3270 	if (sk->sk_prot->compat_setsockopt != NULL)
3271 		return sk->sk_prot->compat_setsockopt(sk, level, optname,
3272 						      optval, optlen);
3273 	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
3274 }
3275 EXPORT_SYMBOL(compat_sock_common_setsockopt);
3276 #endif
3277 
3278 void sk_common_release(struct sock *sk)
3279 {
3280 	if (sk->sk_prot->destroy)
3281 		sk->sk_prot->destroy(sk);
3282 
3283 	/*
3284 	 * Observation: when sock_common_release is called, processes have
3285 	 * no access to socket. But net still has.
3286 	 * Step one, detach it from networking:
3287 	 *
3288 	 * A. Remove from hash tables.
3289 	 */
3290 
3291 	sk->sk_prot->unhash(sk);
3292 
3293 	/*
3294 	 * In this point socket cannot receive new packets, but it is possible
3295 	 * that some packets are in flight because some CPU runs receiver and
3296 	 * did hash table lookup before we unhashed socket. They will achieve
3297 	 * receive queue and will be purged by socket destructor.
3298 	 *
3299 	 * Also we still have packets pending on receive queue and probably,
3300 	 * our own packets waiting in device queues. sock_destroy will drain
3301 	 * receive queue, but transmitted packets will delay socket destruction
3302 	 * until the last reference will be released.
3303 	 */
3304 
3305 	sock_orphan(sk);
3306 
3307 	xfrm_sk_free_policy(sk);
3308 
3309 	sk_refcnt_debug_release(sk);
3310 
3311 	sock_put(sk);
3312 }
3313 EXPORT_SYMBOL(sk_common_release);
3314 
3315 void sk_get_meminfo(const struct sock *sk, u32 *mem)
3316 {
3317 	memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);
3318 
3319 	mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
3320 	mem[SK_MEMINFO_RCVBUF] = READ_ONCE(sk->sk_rcvbuf);
3321 	mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
3322 	mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf);
3323 	mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc;
3324 	mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued);
3325 	mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
3326 	mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len);
3327 	mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
3328 }
3329 
3330 #ifdef CONFIG_PROC_FS
3331 #define PROTO_INUSE_NR	64	/* should be enough for the first time */
3332 struct prot_inuse {
3333 	int val[PROTO_INUSE_NR];
3334 };
3335 
3336 static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
3337 
3338 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
3339 {
3340 	__this_cpu_add(net->core.prot_inuse->val[prot->inuse_idx], val);
3341 }
3342 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
3343 
3344 int sock_prot_inuse_get(struct net *net, struct proto *prot)
3345 {
3346 	int cpu, idx = prot->inuse_idx;
3347 	int res = 0;
3348 
3349 	for_each_possible_cpu(cpu)
3350 		res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx];
3351 
3352 	return res >= 0 ? res : 0;
3353 }
3354 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
3355 
3356 static void sock_inuse_add(struct net *net, int val)
3357 {
3358 	this_cpu_add(*net->core.sock_inuse, val);
3359 }
3360 
3361 int sock_inuse_get(struct net *net)
3362 {
3363 	int cpu, res = 0;
3364 
3365 	for_each_possible_cpu(cpu)
3366 		res += *per_cpu_ptr(net->core.sock_inuse, cpu);
3367 
3368 	return res;
3369 }
3370 
3371 EXPORT_SYMBOL_GPL(sock_inuse_get);
3372 
3373 static int __net_init sock_inuse_init_net(struct net *net)
3374 {
3375 	net->core.prot_inuse = alloc_percpu(struct prot_inuse);
3376 	if (net->core.prot_inuse == NULL)
3377 		return -ENOMEM;
3378 
3379 	net->core.sock_inuse = alloc_percpu(int);
3380 	if (net->core.sock_inuse == NULL)
3381 		goto out;
3382 
3383 	return 0;
3384 
3385 out:
3386 	free_percpu(net->core.prot_inuse);
3387 	return -ENOMEM;
3388 }
3389 
3390 static void __net_exit sock_inuse_exit_net(struct net *net)
3391 {
3392 	free_percpu(net->core.prot_inuse);
3393 	free_percpu(net->core.sock_inuse);
3394 }
3395 
3396 static struct pernet_operations net_inuse_ops = {
3397 	.init = sock_inuse_init_net,
3398 	.exit = sock_inuse_exit_net,
3399 };
3400 
3401 static __init int net_inuse_init(void)
3402 {
3403 	if (register_pernet_subsys(&net_inuse_ops))
3404 		panic("Cannot initialize net inuse counters");
3405 
3406 	return 0;
3407 }
3408 
3409 core_initcall(net_inuse_init);
3410 
3411 static int assign_proto_idx(struct proto *prot)
3412 {
3413 	prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
3414 
3415 	if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
3416 		pr_err("PROTO_INUSE_NR exhausted\n");
3417 		return -ENOSPC;
3418 	}
3419 
3420 	set_bit(prot->inuse_idx, proto_inuse_idx);
3421 	return 0;
3422 }
3423 
3424 static void release_proto_idx(struct proto *prot)
3425 {
3426 	if (prot->inuse_idx != PROTO_INUSE_NR - 1)
3427 		clear_bit(prot->inuse_idx, proto_inuse_idx);
3428 }
3429 #else
3430 static inline int assign_proto_idx(struct proto *prot)
3431 {
3432 	return 0;
3433 }
3434 
3435 static inline void release_proto_idx(struct proto *prot)
3436 {
3437 }
3438 
3439 static void sock_inuse_add(struct net *net, int val)
3440 {
3441 }
3442 #endif
3443 
3444 static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
3445 {
3446 	if (!rsk_prot)
3447 		return;
3448 	kfree(rsk_prot->slab_name);
3449 	rsk_prot->slab_name = NULL;
3450 	kmem_cache_destroy(rsk_prot->slab);
3451 	rsk_prot->slab = NULL;
3452 }
3453 
3454 static int req_prot_init(const struct proto *prot)
3455 {
3456 	struct request_sock_ops *rsk_prot = prot->rsk_prot;
3457 
3458 	if (!rsk_prot)
3459 		return 0;
3460 
3461 	rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
3462 					prot->name);
3463 	if (!rsk_prot->slab_name)
3464 		return -ENOMEM;
3465 
3466 	rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
3467 					   rsk_prot->obj_size, 0,
3468 					   SLAB_ACCOUNT | prot->slab_flags,
3469 					   NULL);
3470 
3471 	if (!rsk_prot->slab) {
3472 		pr_crit("%s: Can't create request sock SLAB cache!\n",
3473 			prot->name);
3474 		return -ENOMEM;
3475 	}
3476 	return 0;
3477 }
3478 
3479 int proto_register(struct proto *prot, int alloc_slab)
3480 {
3481 	int ret = -ENOBUFS;
3482 
3483 	if (alloc_slab) {
3484 		prot->slab = kmem_cache_create_usercopy(prot->name,
3485 					prot->obj_size, 0,
3486 					SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT |
3487 					prot->slab_flags,
3488 					prot->useroffset, prot->usersize,
3489 					NULL);
3490 
3491 		if (prot->slab == NULL) {
3492 			pr_crit("%s: Can't create sock SLAB cache!\n",
3493 				prot->name);
3494 			goto out;
3495 		}
3496 
3497 		if (req_prot_init(prot))
3498 			goto out_free_request_sock_slab;
3499 
3500 		if (prot->twsk_prot != NULL) {
3501 			prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);
3502 
3503 			if (prot->twsk_prot->twsk_slab_name == NULL)
3504 				goto out_free_request_sock_slab;
3505 
3506 			prot->twsk_prot->twsk_slab =
3507 				kmem_cache_create(prot->twsk_prot->twsk_slab_name,
3508 						  prot->twsk_prot->twsk_obj_size,
3509 						  0,
3510 						  SLAB_ACCOUNT |
3511 						  prot->slab_flags,
3512 						  NULL);
3513 			if (prot->twsk_prot->twsk_slab == NULL)
3514 				goto out_free_timewait_sock_slab_name;
3515 		}
3516 	}
3517 
3518 	mutex_lock(&proto_list_mutex);
3519 	ret = assign_proto_idx(prot);
3520 	if (ret) {
3521 		mutex_unlock(&proto_list_mutex);
3522 		goto out_free_timewait_sock_slab_name;
3523 	}
3524 	list_add(&prot->node, &proto_list);
3525 	mutex_unlock(&proto_list_mutex);
3526 	return ret;
3527 
3528 out_free_timewait_sock_slab_name:
3529 	if (alloc_slab && prot->twsk_prot)
3530 		kfree(prot->twsk_prot->twsk_slab_name);
3531 out_free_request_sock_slab:
3532 	if (alloc_slab) {
3533 		req_prot_cleanup(prot->rsk_prot);
3534 
3535 		kmem_cache_destroy(prot->slab);
3536 		prot->slab = NULL;
3537 	}
3538 out:
3539 	return ret;
3540 }
3541 EXPORT_SYMBOL(proto_register);
3542 
3543 void proto_unregister(struct proto *prot)
3544 {
3545 	mutex_lock(&proto_list_mutex);
3546 	release_proto_idx(prot);
3547 	list_del(&prot->node);
3548 	mutex_unlock(&proto_list_mutex);
3549 
3550 	kmem_cache_destroy(prot->slab);
3551 	prot->slab = NULL;
3552 
3553 	req_prot_cleanup(prot->rsk_prot);
3554 
3555 	if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
3556 		kmem_cache_destroy(prot->twsk_prot->twsk_slab);
3557 		kfree(prot->twsk_prot->twsk_slab_name);
3558 		prot->twsk_prot->twsk_slab = NULL;
3559 	}
3560 }
3561 EXPORT_SYMBOL(proto_unregister);
3562 
3563 int sock_load_diag_module(int family, int protocol)
3564 {
3565 	if (!protocol) {
3566 		if (!sock_is_registered(family))
3567 			return -ENOENT;
3568 
3569 		return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
3570 				      NETLINK_SOCK_DIAG, family);
3571 	}
3572 
3573 #ifdef CONFIG_INET
3574 	if (family == AF_INET &&
3575 	    protocol != IPPROTO_RAW &&
3576 	    !rcu_access_pointer(inet_protos[protocol]))
3577 		return -ENOENT;
3578 #endif
3579 
3580 	return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK,
3581 			      NETLINK_SOCK_DIAG, family, protocol);
3582 }
3583 EXPORT_SYMBOL(sock_load_diag_module);
3584 
3585 #ifdef CONFIG_PROC_FS
3586 static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
3587 	__acquires(proto_list_mutex)
3588 {
3589 	mutex_lock(&proto_list_mutex);
3590 	return seq_list_start_head(&proto_list, *pos);
3591 }
3592 
3593 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3594 {
3595 	return seq_list_next(v, &proto_list, pos);
3596 }
3597 
3598 static void proto_seq_stop(struct seq_file *seq, void *v)
3599 	__releases(proto_list_mutex)
3600 {
3601 	mutex_unlock(&proto_list_mutex);
3602 }
3603 
3604 static char proto_method_implemented(const void *method)
3605 {
3606 	return method == NULL ? 'n' : 'y';
3607 }
3608 static long sock_prot_memory_allocated(struct proto *proto)
3609 {
3610 	return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
3611 }
3612 
3613 static const char *sock_prot_memory_pressure(struct proto *proto)
3614 {
3615 	return proto->memory_pressure != NULL ?
3616 	proto_memory_pressure(proto) ? "yes" : "no" : "NI";
3617 }
3618 
3619 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
3620 {
3621 
3622 	seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
3623 			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
3624 		   proto->name,
3625 		   proto->obj_size,
3626 		   sock_prot_inuse_get(seq_file_net(seq), proto),
3627 		   sock_prot_memory_allocated(proto),
3628 		   sock_prot_memory_pressure(proto),
3629 		   proto->max_header,
3630 		   proto->slab == NULL ? "no" : "yes",
3631 		   module_name(proto->owner),
3632 		   proto_method_implemented(proto->close),
3633 		   proto_method_implemented(proto->connect),
3634 		   proto_method_implemented(proto->disconnect),
3635 		   proto_method_implemented(proto->accept),
3636 		   proto_method_implemented(proto->ioctl),
3637 		   proto_method_implemented(proto->init),
3638 		   proto_method_implemented(proto->destroy),
3639 		   proto_method_implemented(proto->shutdown),
3640 		   proto_method_implemented(proto->setsockopt),
3641 		   proto_method_implemented(proto->getsockopt),
3642 		   proto_method_implemented(proto->sendmsg),
3643 		   proto_method_implemented(proto->recvmsg),
3644 		   proto_method_implemented(proto->sendpage),
3645 		   proto_method_implemented(proto->bind),
3646 		   proto_method_implemented(proto->backlog_rcv),
3647 		   proto_method_implemented(proto->hash),
3648 		   proto_method_implemented(proto->unhash),
3649 		   proto_method_implemented(proto->get_port),
3650 		   proto_method_implemented(proto->enter_memory_pressure));
3651 }
3652 
3653 static int proto_seq_show(struct seq_file *seq, void *v)
3654 {
3655 	if (v == &proto_list)
3656 		seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
3657 			   "protocol",
3658 			   "size",
3659 			   "sockets",
3660 			   "memory",
3661 			   "press",
3662 			   "maxhdr",
3663 			   "slab",
3664 			   "module",
3665 			   "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
3666 	else
3667 		proto_seq_printf(seq, list_entry(v, struct proto, node));
3668 	return 0;
3669 }
3670 
3671 static const struct seq_operations proto_seq_ops = {
3672 	.start  = proto_seq_start,
3673 	.next   = proto_seq_next,
3674 	.stop   = proto_seq_stop,
3675 	.show   = proto_seq_show,
3676 };
3677 
3678 static __net_init int proto_init_net(struct net *net)
3679 {
3680 	if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops,
3681 			sizeof(struct seq_net_private)))
3682 		return -ENOMEM;
3683 
3684 	return 0;
3685 }
3686 
3687 static __net_exit void proto_exit_net(struct net *net)
3688 {
3689 	remove_proc_entry("protocols", net->proc_net);
3690 }
3691 
3692 
3693 static __net_initdata struct pernet_operations proto_net_ops = {
3694 	.init = proto_init_net,
3695 	.exit = proto_exit_net,
3696 };
3697 
3698 static int __init proto_init(void)
3699 {
3700 	return register_pernet_subsys(&proto_net_ops);
3701 }
3702 
3703 subsys_initcall(proto_init);
3704 
3705 #endif /* PROC_FS */
3706 
3707 #ifdef CONFIG_NET_RX_BUSY_POLL
3708 bool sk_busy_loop_end(void *p, unsigned long start_time)
3709 {
3710 	struct sock *sk = p;
3711 
3712 	return !skb_queue_empty_lockless(&sk->sk_receive_queue) ||
3713 	       sk_busy_loop_timeout(sk, start_time);
3714 }
3715 EXPORT_SYMBOL(sk_busy_loop_end);
3716 #endif /* CONFIG_NET_RX_BUSY_POLL */
3717 
3718 int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len)
3719 {
3720 	if (!sk->sk_prot->bind_add)
3721 		return -EOPNOTSUPP;
3722 	return sk->sk_prot->bind_add(sk, addr, addr_len);
3723 }
3724 EXPORT_SYMBOL(sock_bind_add);
3725