xref: /openbmc/linux/net/core/sock.c (revision 176f011b)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		Generic socket support routines. Memory allocators, socket lock/release
7  *		handler for protocols to use and generic option handler.
8  *
9  *
10  * Authors:	Ross Biro
11  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *		Florian La Roche, <flla@stud.uni-sb.de>
13  *		Alan Cox, <A.Cox@swansea.ac.uk>
14  *
15  * Fixes:
16  *		Alan Cox	: 	Numerous verify_area() problems
17  *		Alan Cox	:	Connecting on a connecting socket
18  *					now returns an error for tcp.
19  *		Alan Cox	:	sock->protocol is set correctly.
20  *					and is not sometimes left as 0.
21  *		Alan Cox	:	connect handles icmp errors on a
22  *					connect properly. Unfortunately there
23  *					is a restart syscall nasty there. I
24  *					can't match BSD without hacking the C
25  *					library. Ideas urgently sought!
26  *		Alan Cox	:	Disallow bind() to addresses that are
27  *					not ours - especially broadcast ones!!
28  *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
29  *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
30  *					instead they leave that for the DESTROY timer.
31  *		Alan Cox	:	Clean up error flag in accept
32  *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
33  *					was buggy. Put a remove_sock() in the handler
34  *					for memory when we hit 0. Also altered the timer
35  *					code. The ACK stuff can wait and needs major
36  *					TCP layer surgery.
37  *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
38  *					and fixed timer/inet_bh race.
39  *		Alan Cox	:	Added zapped flag for TCP
40  *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
41  *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
42  *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
43  *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
44  *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
45  *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
46  *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
47  *	Pauline Middelink	:	identd support
48  *		Alan Cox	:	Fixed connect() taking signals I think.
49  *		Alan Cox	:	SO_LINGER supported
50  *		Alan Cox	:	Error reporting fixes
51  *		Anonymous	:	inet_create tidied up (sk->reuse setting)
52  *		Alan Cox	:	inet sockets don't set sk->type!
53  *		Alan Cox	:	Split socket option code
54  *		Alan Cox	:	Callbacks
55  *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
56  *		Alex		:	Removed restriction on inet fioctl
57  *		Alan Cox	:	Splitting INET from NET core
58  *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
59  *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
60  *		Alan Cox	:	Split IP from generic code
61  *		Alan Cox	:	New kfree_skbmem()
62  *		Alan Cox	:	Make SO_DEBUG superuser only.
63  *		Alan Cox	:	Allow anyone to clear SO_DEBUG
64  *					(compatibility fix)
65  *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
66  *		Alan Cox	:	Allocator for a socket is settable.
67  *		Alan Cox	:	SO_ERROR includes soft errors.
68  *		Alan Cox	:	Allow NULL arguments on some SO_ opts
69  *		Alan Cox	: 	Generic socket allocation to make hooks
70  *					easier (suggested by Craig Metz).
71  *		Michael Pall	:	SO_ERROR returns positive errno again
72  *              Steve Whitehouse:       Added default destructor to free
73  *                                      protocol private data.
74  *              Steve Whitehouse:       Added various other default routines
75  *                                      common to several socket families.
76  *              Chris Evans     :       Call suser() check last on F_SETOWN
77  *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
78  *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
79  *		Andi Kleen	:	Fix write_space callback
80  *		Chris Evans	:	Security fixes - signedness again
81  *		Arnaldo C. Melo :       cleanups, use skb_queue_purge
82  *
83  * To Fix:
84  *
85  *
86  *		This program is free software; you can redistribute it and/or
87  *		modify it under the terms of the GNU General Public License
88  *		as published by the Free Software Foundation; either version
89  *		2 of the License, or (at your option) any later version.
90  */
91 
92 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
93 
94 #include <asm/unaligned.h>
95 #include <linux/capability.h>
96 #include <linux/errno.h>
97 #include <linux/errqueue.h>
98 #include <linux/types.h>
99 #include <linux/socket.h>
100 #include <linux/in.h>
101 #include <linux/kernel.h>
102 #include <linux/module.h>
103 #include <linux/proc_fs.h>
104 #include <linux/seq_file.h>
105 #include <linux/sched.h>
106 #include <linux/sched/mm.h>
107 #include <linux/timer.h>
108 #include <linux/string.h>
109 #include <linux/sockios.h>
110 #include <linux/net.h>
111 #include <linux/mm.h>
112 #include <linux/slab.h>
113 #include <linux/interrupt.h>
114 #include <linux/poll.h>
115 #include <linux/tcp.h>
116 #include <linux/init.h>
117 #include <linux/highmem.h>
118 #include <linux/user_namespace.h>
119 #include <linux/static_key.h>
120 #include <linux/memcontrol.h>
121 #include <linux/prefetch.h>
122 
123 #include <linux/uaccess.h>
124 
125 #include <linux/netdevice.h>
126 #include <net/protocol.h>
127 #include <linux/skbuff.h>
128 #include <net/net_namespace.h>
129 #include <net/request_sock.h>
130 #include <net/sock.h>
131 #include <linux/net_tstamp.h>
132 #include <net/xfrm.h>
133 #include <linux/ipsec.h>
134 #include <net/cls_cgroup.h>
135 #include <net/netprio_cgroup.h>
136 #include <linux/sock_diag.h>
137 
138 #include <linux/filter.h>
139 #include <net/sock_reuseport.h>
140 
141 #include <trace/events/sock.h>
142 
143 #include <net/tcp.h>
144 #include <net/busy_poll.h>
145 
/*
 * File-local mutex and list head for the protocol list. Their users are
 * outside this part of the file.
 * NOTE(review): presumably proto_list_mutex serialises access to
 * proto_list - confirm against the code that walks/modifies the list.
 */
static DEFINE_MUTEX(proto_list_mutex);
static LIST_HEAD(proto_list);

/*
 * Forward declaration. The function is static, so its definition has to
 * live elsewhere in this translation unit.
 */
static void sock_inuse_add(struct net *net, int val);
150 
151 /**
152  * sk_ns_capable - General socket capability test
153  * @sk: Socket to use a capability on or through
154  * @user_ns: The user namespace of the capability to use
155  * @cap: The capability to use
156  *
157  * Test to see if the opener of the socket had when the socket was
158  * created and the current process has the capability @cap in the user
159  * namespace @user_ns.
160  */
161 bool sk_ns_capable(const struct sock *sk,
162 		   struct user_namespace *user_ns, int cap)
163 {
164 	return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
165 		ns_capable(user_ns, cap);
166 }
167 EXPORT_SYMBOL(sk_ns_capable);
168 
169 /**
170  * sk_capable - Socket global capability test
171  * @sk: Socket to use a capability on or through
172  * @cap: The global capability to use
173  *
174  * Test to see if the opener of the socket had when the socket was
175  * created and the current process has the capability @cap in all user
176  * namespaces.
177  */
178 bool sk_capable(const struct sock *sk, int cap)
179 {
180 	return sk_ns_capable(sk, &init_user_ns, cap);
181 }
182 EXPORT_SYMBOL(sk_capable);
183 
184 /**
185  * sk_net_capable - Network namespace socket capability test
186  * @sk: Socket to use a capability on or through
187  * @cap: The capability to use
188  *
189  * Test to see if the opener of the socket had when the socket was created
190  * and the current process has the capability @cap over the network namespace
191  * the socket is a member of.
192  */
193 bool sk_net_capable(const struct sock *sk, int cap)
194 {
195 	return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
196 }
197 EXPORT_SYMBOL(sk_net_capable);
198 
/*
 * Each address family might have different locking rules, so we have
 * one slock key per address family and separate keys for internal and
 * userspace sockets.
 *
 * Every array below has AF_MAX entries, i.e. one slot per address
 * family number. The "kern" variants are the keys for internal (kernel)
 * sockets mentioned above.
 * NOTE(review): the non-"slock" arrays presumably key the sleeping
 * sk_lock - confirm at the lockdep initialisation site.
 */
static struct lock_class_key af_family_keys[AF_MAX];
static struct lock_class_key af_family_kern_keys[AF_MAX];
static struct lock_class_key af_family_slock_keys[AF_MAX];
static struct lock_class_key af_family_kern_slock_keys[AF_MAX];
208 
/*
 * Make lock validator output more readable. (we pre-construct these
 * strings build-time, so that runtime initialization of socket
 * locks is fast):
 */

/*
 * _sock_locks(x) expands to a comma-separated list of string literals,
 * each one the prefix literal @x concatenated (by adjacent-literal
 * pasting) with a per-family name, terminated by an extra "AF_MAX"
 * entry. The list is used to initialise arrays of AF_MAX+1 elements.
 * "27" and "28" are placeholder names for those two slots.
 * NOTE(review): the entry order is expected to match the numeric AF_*
 * values - confirm against the AF_* definitions when adding a family.
 */
#define _sock_locks(x)						  \
  x "AF_UNSPEC",	x "AF_UNIX"     ,	x "AF_INET"     , \
  x "AF_AX25"  ,	x "AF_IPX"      ,	x "AF_APPLETALK", \
  x "AF_NETROM",	x "AF_BRIDGE"   ,	x "AF_ATMPVC"   , \
  x "AF_X25"   ,	x "AF_INET6"    ,	x "AF_ROSE"     , \
  x "AF_DECnet",	x "AF_NETBEUI"  ,	x "AF_SECURITY" , \
  x "AF_KEY"   ,	x "AF_NETLINK"  ,	x "AF_PACKET"   , \
  x "AF_ASH"   ,	x "AF_ECONET"   ,	x "AF_ATMSVC"   , \
  x "AF_RDS"   ,	x "AF_SNA"      ,	x "AF_IRDA"     , \
  x "AF_PPPOX" ,	x "AF_WANPIPE"  ,	x "AF_LLC"      , \
  x "27"       ,	x "28"          ,	x "AF_CAN"      , \
  x "AF_TIPC"  ,	x "AF_BLUETOOTH",	x "IUCV"        , \
  x "AF_RXRPC" ,	x "AF_ISDN"     ,	x "AF_PHONET"   , \
  x "AF_IEEE802154",	x "AF_CAIF"	,	x "AF_ALG"      , \
  x "AF_NFC"   ,	x "AF_VSOCK"    ,	x "AF_KCM"      , \
  x "AF_QIPCRTR",	x "AF_SMC"	,	x "AF_XDP"	, \
  x "AF_MAX"
232 
/*
 * Per-family lock name tables, one per lock kind. Each table holds
 * AF_MAX+1 constant strings produced by _sock_locks() with the prefix
 * shown; the "k-" prefixed tables are the variants for kernel sockets.
 */
static const char *const af_family_key_strings[AF_MAX+1] = {
	_sock_locks("sk_lock-")
};
static const char *const af_family_slock_key_strings[AF_MAX+1] = {
	_sock_locks("slock-")
};
static const char *const af_family_clock_key_strings[AF_MAX+1] = {
	_sock_locks("clock-")
};

static const char *const af_family_kern_key_strings[AF_MAX+1] = {
	_sock_locks("k-sk_lock-")
};
static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
	_sock_locks("k-slock-")
};
static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
	_sock_locks("k-clock-")
};
/* NOTE(review): rlock/wlock/elock presumably name the receive, write and
 * error queue locks - confirm where these tables are consumed.
 */
static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
	_sock_locks("rlock-")
};
static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
	_sock_locks("wlock-")
};
static const char *const af_family_elock_key_strings[AF_MAX+1] = {
	_sock_locks("elock-")
};
261 
/*
 * sk_callback_lock and sk queues locking rules are per-address-family,
 * so split the lock classes by using a per-AF key:
 *
 * Each array has AF_MAX entries (one per address family number).
 * af_kern_callback_keys is a separate set of callback-lock keys; by its
 * name it is the kernel-socket counterpart of af_callback_keys.
 */
static struct lock_class_key af_callback_keys[AF_MAX];
static struct lock_class_key af_rlock_keys[AF_MAX];
static struct lock_class_key af_wlock_keys[AF_MAX];
static struct lock_class_key af_elock_keys[AF_MAX];
static struct lock_class_key af_kern_callback_keys[AF_MAX];
271 
/* Run time adjustable parameters. */

/* Upper bounds applied to SO_SNDBUF / SO_RCVBUF requests in
 * sock_setsockopt(); both start at the compile-time SK_*MEM_MAX values.
 */
__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
EXPORT_SYMBOL(sysctl_wmem_max);
__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
EXPORT_SYMBOL(sysctl_rmem_max);
/* Default buffer sizes; note they are initialised to the same
 * SK_*MEM_MAX constants as the maxima above.
 */
__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;

/* Maximal space eaten by iovec or ancillary data plus some space */
int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
EXPORT_SYMBOL(sysctl_optmem_max);

/* Enabled (1) by default; not consumed in this part of the file. */
int sysctl_tstamp_allow_data __read_mostly = 1;

/* Static branch key, initially false; incremented by sk_set_memalloc()
 * and decremented by sk_clear_memalloc().
 */
DEFINE_STATIC_KEY_FALSE(memalloc_socks_key);
EXPORT_SYMBOL_GPL(memalloc_socks_key);
288 
289 /**
290  * sk_set_memalloc - sets %SOCK_MEMALLOC
291  * @sk: socket to set it on
292  *
293  * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
294  * It's the responsibility of the admin to adjust min_free_kbytes
295  * to meet the requirements
296  */
297 void sk_set_memalloc(struct sock *sk)
298 {
299 	sock_set_flag(sk, SOCK_MEMALLOC);
300 	sk->sk_allocation |= __GFP_MEMALLOC;
301 	static_branch_inc(&memalloc_socks_key);
302 }
303 EXPORT_SYMBOL_GPL(sk_set_memalloc);
304 
305 void sk_clear_memalloc(struct sock *sk)
306 {
307 	sock_reset_flag(sk, SOCK_MEMALLOC);
308 	sk->sk_allocation &= ~__GFP_MEMALLOC;
309 	static_branch_dec(&memalloc_socks_key);
310 
311 	/*
312 	 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
313 	 * progress of swapping. SOCK_MEMALLOC may be cleared while
314 	 * it has rmem allocations due to the last swapfile being deactivated
315 	 * but there is a risk that the socket is unusable due to exceeding
316 	 * the rmem limits. Reclaim the reserves and obey rmem limits again.
317 	 */
318 	sk_mem_reclaim(sk);
319 }
320 EXPORT_SYMBOL_GPL(sk_clear_memalloc);
321 
322 int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
323 {
324 	int ret;
325 	unsigned int noreclaim_flag;
326 
327 	/* these should have been dropped before queueing */
328 	BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
329 
330 	noreclaim_flag = memalloc_noreclaim_save();
331 	ret = sk->sk_backlog_rcv(sk, skb);
332 	memalloc_noreclaim_restore(noreclaim_flag);
333 
334 	return ret;
335 }
336 EXPORT_SYMBOL(__sk_backlog_rcv);
337 
338 static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
339 {
340 	struct timeval tv;
341 
342 	if (optlen < sizeof(tv))
343 		return -EINVAL;
344 	if (copy_from_user(&tv, optval, sizeof(tv)))
345 		return -EFAULT;
346 	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
347 		return -EDOM;
348 
349 	if (tv.tv_sec < 0) {
350 		static int warned __read_mostly;
351 
352 		*timeo_p = 0;
353 		if (warned < 10 && net_ratelimit()) {
354 			warned++;
355 			pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
356 				__func__, current->comm, task_pid_nr(current));
357 		}
358 		return 0;
359 	}
360 	*timeo_p = MAX_SCHEDULE_TIMEOUT;
361 	if (tv.tv_sec == 0 && tv.tv_usec == 0)
362 		return 0;
363 	if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
364 		*timeo_p = tv.tv_sec * HZ + DIV_ROUND_UP(tv.tv_usec, USEC_PER_SEC / HZ);
365 	return 0;
366 }
367 
368 static void sock_warn_obsolete_bsdism(const char *name)
369 {
370 	static int warned;
371 	static char warncomm[TASK_COMM_LEN];
372 	if (strcmp(warncomm, current->comm) && warned < 5) {
373 		strcpy(warncomm,  current->comm);
374 		pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n",
375 			warncomm, name);
376 		warned++;
377 	}
378 }
379 
380 static bool sock_needs_netstamp(const struct sock *sk)
381 {
382 	switch (sk->sk_family) {
383 	case AF_UNSPEC:
384 	case AF_UNIX:
385 		return false;
386 	default:
387 		return true;
388 	}
389 }
390 
391 static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
392 {
393 	if (sk->sk_flags & flags) {
394 		sk->sk_flags &= ~flags;
395 		if (sock_needs_netstamp(sk) &&
396 		    !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
397 			net_disable_timestamp();
398 	}
399 }
400 
401 
402 int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
403 {
404 	unsigned long flags;
405 	struct sk_buff_head *list = &sk->sk_receive_queue;
406 
407 	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
408 		atomic_inc(&sk->sk_drops);
409 		trace_sock_rcvqueue_full(sk, skb);
410 		return -ENOMEM;
411 	}
412 
413 	if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
414 		atomic_inc(&sk->sk_drops);
415 		return -ENOBUFS;
416 	}
417 
418 	skb->dev = NULL;
419 	skb_set_owner_r(skb, sk);
420 
421 	/* we escape from rcu protected region, make sure we dont leak
422 	 * a norefcounted dst
423 	 */
424 	skb_dst_force(skb);
425 
426 	spin_lock_irqsave(&list->lock, flags);
427 	sock_skb_set_dropcount(sk, skb);
428 	__skb_queue_tail(list, skb);
429 	spin_unlock_irqrestore(&list->lock, flags);
430 
431 	if (!sock_flag(sk, SOCK_DEAD))
432 		sk->sk_data_ready(sk);
433 	return 0;
434 }
435 EXPORT_SYMBOL(__sock_queue_rcv_skb);
436 
/*
 * Run the socket filter on @skb and, if it passes, queue it with
 * __sock_queue_rcv_skb(). Returns the filter's non-zero result when the
 * filter rejects the skb, otherwise the result of the queueing step.
 */
int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
	int filter_rc = sk_filter(sk, skb);

	return filter_rc ? filter_rc : __sock_queue_rcv_skb(sk, skb);
}
EXPORT_SYMBOL(sock_queue_rcv_skb);
448 
449 int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
450 		     const int nested, unsigned int trim_cap, bool refcounted)
451 {
452 	int rc = NET_RX_SUCCESS;
453 
454 	if (sk_filter_trim_cap(sk, skb, trim_cap))
455 		goto discard_and_relse;
456 
457 	skb->dev = NULL;
458 
459 	if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
460 		atomic_inc(&sk->sk_drops);
461 		goto discard_and_relse;
462 	}
463 	if (nested)
464 		bh_lock_sock_nested(sk);
465 	else
466 		bh_lock_sock(sk);
467 	if (!sock_owned_by_user(sk)) {
468 		/*
469 		 * trylock + unlock semantics:
470 		 */
471 		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
472 
473 		rc = sk_backlog_rcv(sk, skb);
474 
475 		mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
476 	} else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
477 		bh_unlock_sock(sk);
478 		atomic_inc(&sk->sk_drops);
479 		goto discard_and_relse;
480 	}
481 
482 	bh_unlock_sock(sk);
483 out:
484 	if (refcounted)
485 		sock_put(sk);
486 	return rc;
487 discard_and_relse:
488 	kfree_skb(skb);
489 	goto out;
490 }
491 EXPORT_SYMBOL(__sk_receive_skb);
492 
493 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
494 {
495 	struct dst_entry *dst = __sk_dst_get(sk);
496 
497 	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
498 		sk_tx_queue_clear(sk);
499 		sk->sk_dst_pending_confirm = 0;
500 		RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
501 		dst_release(dst);
502 		return NULL;
503 	}
504 
505 	return dst;
506 }
507 EXPORT_SYMBOL(__sk_dst_check);
508 
509 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
510 {
511 	struct dst_entry *dst = sk_dst_get(sk);
512 
513 	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
514 		sk_dst_reset(sk);
515 		dst_release(dst);
516 		return NULL;
517 	}
518 
519 	return dst;
520 }
521 EXPORT_SYMBOL(sk_dst_check);
522 
523 static int sock_setbindtodevice(struct sock *sk, char __user *optval,
524 				int optlen)
525 {
526 	int ret = -ENOPROTOOPT;
527 #ifdef CONFIG_NETDEVICES
528 	struct net *net = sock_net(sk);
529 	char devname[IFNAMSIZ];
530 	int index;
531 
532 	/* Sorry... */
533 	ret = -EPERM;
534 	if (!ns_capable(net->user_ns, CAP_NET_RAW))
535 		goto out;
536 
537 	ret = -EINVAL;
538 	if (optlen < 0)
539 		goto out;
540 
541 	/* Bind this socket to a particular device like "eth0",
542 	 * as specified in the passed interface name. If the
543 	 * name is "" or the option length is zero the socket
544 	 * is not bound.
545 	 */
546 	if (optlen > IFNAMSIZ - 1)
547 		optlen = IFNAMSIZ - 1;
548 	memset(devname, 0, sizeof(devname));
549 
550 	ret = -EFAULT;
551 	if (copy_from_user(devname, optval, optlen))
552 		goto out;
553 
554 	index = 0;
555 	if (devname[0] != '\0') {
556 		struct net_device *dev;
557 
558 		rcu_read_lock();
559 		dev = dev_get_by_name_rcu(net, devname);
560 		if (dev)
561 			index = dev->ifindex;
562 		rcu_read_unlock();
563 		ret = -ENODEV;
564 		if (!dev)
565 			goto out;
566 	}
567 
568 	lock_sock(sk);
569 	sk->sk_bound_dev_if = index;
570 	if (sk->sk_prot->rehash)
571 		sk->sk_prot->rehash(sk);
572 	sk_dst_reset(sk);
573 	release_sock(sk);
574 
575 	ret = 0;
576 
577 out:
578 #endif
579 
580 	return ret;
581 }
582 
583 static int sock_getbindtodevice(struct sock *sk, char __user *optval,
584 				int __user *optlen, int len)
585 {
586 	int ret = -ENOPROTOOPT;
587 #ifdef CONFIG_NETDEVICES
588 	struct net *net = sock_net(sk);
589 	char devname[IFNAMSIZ];
590 
591 	if (sk->sk_bound_dev_if == 0) {
592 		len = 0;
593 		goto zero;
594 	}
595 
596 	ret = -EINVAL;
597 	if (len < IFNAMSIZ)
598 		goto out;
599 
600 	ret = netdev_get_name(net, devname, sk->sk_bound_dev_if);
601 	if (ret)
602 		goto out;
603 
604 	len = strlen(devname) + 1;
605 
606 	ret = -EFAULT;
607 	if (copy_to_user(optval, devname, len))
608 		goto out;
609 
610 zero:
611 	ret = -EFAULT;
612 	if (put_user(len, optlen))
613 		goto out;
614 
615 	ret = 0;
616 
617 out:
618 #endif
619 
620 	return ret;
621 }
622 
/* Set socket flag @bit when @valbool is non-zero, clear it otherwise. */
static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
{
	if (!valbool) {
		sock_reset_flag(sk, bit);
		return;
	}

	sock_set_flag(sk, bit);
}
630 
631 bool sk_mc_loop(struct sock *sk)
632 {
633 	if (dev_recursion_level())
634 		return false;
635 	if (!sk)
636 		return true;
637 	switch (sk->sk_family) {
638 	case AF_INET:
639 		return inet_sk(sk)->mc_loop;
640 #if IS_ENABLED(CONFIG_IPV6)
641 	case AF_INET6:
642 		return inet6_sk(sk)->mc_loop;
643 #endif
644 	}
645 	WARN_ON(1);
646 	return true;
647 }
648 EXPORT_SYMBOL(sk_mc_loop);
649 
650 /*
651  *	This is meant for all protocols to use and covers goings on
652  *	at the socket level. Everything here is generic.
653  */
654 
655 int sock_setsockopt(struct socket *sock, int level, int optname,
656 		    char __user *optval, unsigned int optlen)
657 {
658 	struct sock_txtime sk_txtime;
659 	struct sock *sk = sock->sk;
660 	int val;
661 	int valbool;
662 	struct linger ling;
663 	int ret = 0;
664 
665 	/*
666 	 *	Options without arguments
667 	 */
668 
669 	if (optname == SO_BINDTODEVICE)
670 		return sock_setbindtodevice(sk, optval, optlen);
671 
672 	if (optlen < sizeof(int))
673 		return -EINVAL;
674 
675 	if (get_user(val, (int __user *)optval))
676 		return -EFAULT;
677 
678 	valbool = val ? 1 : 0;
679 
680 	lock_sock(sk);
681 
682 	switch (optname) {
683 	case SO_DEBUG:
684 		if (val && !capable(CAP_NET_ADMIN))
685 			ret = -EACCES;
686 		else
687 			sock_valbool_flag(sk, SOCK_DBG, valbool);
688 		break;
689 	case SO_REUSEADDR:
690 		sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
691 		break;
692 	case SO_REUSEPORT:
693 		sk->sk_reuseport = valbool;
694 		break;
695 	case SO_TYPE:
696 	case SO_PROTOCOL:
697 	case SO_DOMAIN:
698 	case SO_ERROR:
699 		ret = -ENOPROTOOPT;
700 		break;
701 	case SO_DONTROUTE:
702 		sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
703 		sk_dst_reset(sk);
704 		break;
705 	case SO_BROADCAST:
706 		sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
707 		break;
708 	case SO_SNDBUF:
709 		/* Don't error on this BSD doesn't and if you think
710 		 * about it this is right. Otherwise apps have to
711 		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
712 		 * are treated in BSD as hints
713 		 */
714 		val = min_t(u32, val, sysctl_wmem_max);
715 set_sndbuf:
716 		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
717 		sk->sk_sndbuf = max_t(int, val * 2, SOCK_MIN_SNDBUF);
718 		/* Wake up sending tasks if we upped the value. */
719 		sk->sk_write_space(sk);
720 		break;
721 
722 	case SO_SNDBUFFORCE:
723 		if (!capable(CAP_NET_ADMIN)) {
724 			ret = -EPERM;
725 			break;
726 		}
727 		goto set_sndbuf;
728 
729 	case SO_RCVBUF:
730 		/* Don't error on this BSD doesn't and if you think
731 		 * about it this is right. Otherwise apps have to
732 		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
733 		 * are treated in BSD as hints
734 		 */
735 		val = min_t(u32, val, sysctl_rmem_max);
736 set_rcvbuf:
737 		sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
738 		/*
739 		 * We double it on the way in to account for
740 		 * "struct sk_buff" etc. overhead.   Applications
741 		 * assume that the SO_RCVBUF setting they make will
742 		 * allow that much actual data to be received on that
743 		 * socket.
744 		 *
745 		 * Applications are unaware that "struct sk_buff" and
746 		 * other overheads allocate from the receive buffer
747 		 * during socket buffer allocation.
748 		 *
749 		 * And after considering the possible alternatives,
750 		 * returning the value we actually used in getsockopt
751 		 * is the most desirable behavior.
752 		 */
753 		sk->sk_rcvbuf = max_t(int, val * 2, SOCK_MIN_RCVBUF);
754 		break;
755 
756 	case SO_RCVBUFFORCE:
757 		if (!capable(CAP_NET_ADMIN)) {
758 			ret = -EPERM;
759 			break;
760 		}
761 		goto set_rcvbuf;
762 
763 	case SO_KEEPALIVE:
764 		if (sk->sk_prot->keepalive)
765 			sk->sk_prot->keepalive(sk, valbool);
766 		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
767 		break;
768 
769 	case SO_OOBINLINE:
770 		sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
771 		break;
772 
773 	case SO_NO_CHECK:
774 		sk->sk_no_check_tx = valbool;
775 		break;
776 
777 	case SO_PRIORITY:
778 		if ((val >= 0 && val <= 6) ||
779 		    ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
780 			sk->sk_priority = val;
781 		else
782 			ret = -EPERM;
783 		break;
784 
785 	case SO_LINGER:
786 		if (optlen < sizeof(ling)) {
787 			ret = -EINVAL;	/* 1003.1g */
788 			break;
789 		}
790 		if (copy_from_user(&ling, optval, sizeof(ling))) {
791 			ret = -EFAULT;
792 			break;
793 		}
794 		if (!ling.l_onoff)
795 			sock_reset_flag(sk, SOCK_LINGER);
796 		else {
797 #if (BITS_PER_LONG == 32)
798 			if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
799 				sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
800 			else
801 #endif
802 				sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
803 			sock_set_flag(sk, SOCK_LINGER);
804 		}
805 		break;
806 
807 	case SO_BSDCOMPAT:
808 		sock_warn_obsolete_bsdism("setsockopt");
809 		break;
810 
811 	case SO_PASSCRED:
812 		if (valbool)
813 			set_bit(SOCK_PASSCRED, &sock->flags);
814 		else
815 			clear_bit(SOCK_PASSCRED, &sock->flags);
816 		break;
817 
818 	case SO_TIMESTAMP:
819 	case SO_TIMESTAMPNS:
820 		if (valbool)  {
821 			if (optname == SO_TIMESTAMP)
822 				sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
823 			else
824 				sock_set_flag(sk, SOCK_RCVTSTAMPNS);
825 			sock_set_flag(sk, SOCK_RCVTSTAMP);
826 			sock_enable_timestamp(sk, SOCK_TIMESTAMP);
827 		} else {
828 			sock_reset_flag(sk, SOCK_RCVTSTAMP);
829 			sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
830 		}
831 		break;
832 
833 	case SO_TIMESTAMPING:
834 		if (val & ~SOF_TIMESTAMPING_MASK) {
835 			ret = -EINVAL;
836 			break;
837 		}
838 
839 		if (val & SOF_TIMESTAMPING_OPT_ID &&
840 		    !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
841 			if (sk->sk_protocol == IPPROTO_TCP &&
842 			    sk->sk_type == SOCK_STREAM) {
843 				if ((1 << sk->sk_state) &
844 				    (TCPF_CLOSE | TCPF_LISTEN)) {
845 					ret = -EINVAL;
846 					break;
847 				}
848 				sk->sk_tskey = tcp_sk(sk)->snd_una;
849 			} else {
850 				sk->sk_tskey = 0;
851 			}
852 		}
853 
854 		if (val & SOF_TIMESTAMPING_OPT_STATS &&
855 		    !(val & SOF_TIMESTAMPING_OPT_TSONLY)) {
856 			ret = -EINVAL;
857 			break;
858 		}
859 
860 		sk->sk_tsflags = val;
861 		if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
862 			sock_enable_timestamp(sk,
863 					      SOCK_TIMESTAMPING_RX_SOFTWARE);
864 		else
865 			sock_disable_timestamp(sk,
866 					       (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
867 		break;
868 
869 	case SO_RCVLOWAT:
870 		if (val < 0)
871 			val = INT_MAX;
872 		if (sock->ops->set_rcvlowat)
873 			ret = sock->ops->set_rcvlowat(sk, val);
874 		else
875 			sk->sk_rcvlowat = val ? : 1;
876 		break;
877 
878 	case SO_RCVTIMEO:
879 		ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
880 		break;
881 
882 	case SO_SNDTIMEO:
883 		ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
884 		break;
885 
886 	case SO_ATTACH_FILTER:
887 		ret = -EINVAL;
888 		if (optlen == sizeof(struct sock_fprog)) {
889 			struct sock_fprog fprog;
890 
891 			ret = -EFAULT;
892 			if (copy_from_user(&fprog, optval, sizeof(fprog)))
893 				break;
894 
895 			ret = sk_attach_filter(&fprog, sk);
896 		}
897 		break;
898 
899 	case SO_ATTACH_BPF:
900 		ret = -EINVAL;
901 		if (optlen == sizeof(u32)) {
902 			u32 ufd;
903 
904 			ret = -EFAULT;
905 			if (copy_from_user(&ufd, optval, sizeof(ufd)))
906 				break;
907 
908 			ret = sk_attach_bpf(ufd, sk);
909 		}
910 		break;
911 
912 	case SO_ATTACH_REUSEPORT_CBPF:
913 		ret = -EINVAL;
914 		if (optlen == sizeof(struct sock_fprog)) {
915 			struct sock_fprog fprog;
916 
917 			ret = -EFAULT;
918 			if (copy_from_user(&fprog, optval, sizeof(fprog)))
919 				break;
920 
921 			ret = sk_reuseport_attach_filter(&fprog, sk);
922 		}
923 		break;
924 
925 	case SO_ATTACH_REUSEPORT_EBPF:
926 		ret = -EINVAL;
927 		if (optlen == sizeof(u32)) {
928 			u32 ufd;
929 
930 			ret = -EFAULT;
931 			if (copy_from_user(&ufd, optval, sizeof(ufd)))
932 				break;
933 
934 			ret = sk_reuseport_attach_bpf(ufd, sk);
935 		}
936 		break;
937 
938 	case SO_DETACH_FILTER:
939 		ret = sk_detach_filter(sk);
940 		break;
941 
942 	case SO_LOCK_FILTER:
943 		if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
944 			ret = -EPERM;
945 		else
946 			sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
947 		break;
948 
949 	case SO_PASSSEC:
950 		if (valbool)
951 			set_bit(SOCK_PASSSEC, &sock->flags);
952 		else
953 			clear_bit(SOCK_PASSSEC, &sock->flags);
954 		break;
955 	case SO_MARK:
956 		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
957 			ret = -EPERM;
958 		} else if (val != sk->sk_mark) {
959 			sk->sk_mark = val;
960 			sk_dst_reset(sk);
961 		}
962 		break;
963 
964 	case SO_RXQ_OVFL:
965 		sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
966 		break;
967 
968 	case SO_WIFI_STATUS:
969 		sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
970 		break;
971 
972 	case SO_PEEK_OFF:
973 		if (sock->ops->set_peek_off)
974 			ret = sock->ops->set_peek_off(sk, val);
975 		else
976 			ret = -EOPNOTSUPP;
977 		break;
978 
979 	case SO_NOFCS:
980 		sock_valbool_flag(sk, SOCK_NOFCS, valbool);
981 		break;
982 
983 	case SO_SELECT_ERR_QUEUE:
984 		sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
985 		break;
986 
987 #ifdef CONFIG_NET_RX_BUSY_POLL
988 	case SO_BUSY_POLL:
989 		/* allow unprivileged users to decrease the value */
990 		if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
991 			ret = -EPERM;
992 		else {
993 			if (val < 0)
994 				ret = -EINVAL;
995 			else
996 				sk->sk_ll_usec = val;
997 		}
998 		break;
999 #endif
1000 
1001 	case SO_MAX_PACING_RATE:
1002 		if (val != ~0U)
1003 			cmpxchg(&sk->sk_pacing_status,
1004 				SK_PACING_NONE,
1005 				SK_PACING_NEEDED);
1006 		sk->sk_max_pacing_rate = (val == ~0U) ? ~0UL : val;
1007 		sk->sk_pacing_rate = min(sk->sk_pacing_rate,
1008 					 sk->sk_max_pacing_rate);
1009 		break;
1010 
1011 	case SO_INCOMING_CPU:
1012 		sk->sk_incoming_cpu = val;
1013 		break;
1014 
1015 	case SO_CNX_ADVICE:
1016 		if (val == 1)
1017 			dst_negative_advice(sk);
1018 		break;
1019 
1020 	case SO_ZEROCOPY:
1021 		if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
1022 			if (!((sk->sk_type == SOCK_STREAM &&
1023 			       sk->sk_protocol == IPPROTO_TCP) ||
1024 			      (sk->sk_type == SOCK_DGRAM &&
1025 			       sk->sk_protocol == IPPROTO_UDP)))
1026 				ret = -ENOTSUPP;
1027 		} else if (sk->sk_family != PF_RDS) {
1028 			ret = -ENOTSUPP;
1029 		}
1030 		if (!ret) {
1031 			if (val < 0 || val > 1)
1032 				ret = -EINVAL;
1033 			else
1034 				sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
1035 		}
1036 		break;
1037 
1038 	case SO_TXTIME:
1039 		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1040 			ret = -EPERM;
1041 		} else if (optlen != sizeof(struct sock_txtime)) {
1042 			ret = -EINVAL;
1043 		} else if (copy_from_user(&sk_txtime, optval,
1044 			   sizeof(struct sock_txtime))) {
1045 			ret = -EFAULT;
1046 		} else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) {
1047 			ret = -EINVAL;
1048 		} else {
1049 			sock_valbool_flag(sk, SOCK_TXTIME, true);
1050 			sk->sk_clockid = sk_txtime.clockid;
1051 			sk->sk_txtime_deadline_mode =
1052 				!!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE);
1053 			sk->sk_txtime_report_errors =
1054 				!!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS);
1055 		}
1056 		break;
1057 
1058 	default:
1059 		ret = -ENOPROTOOPT;
1060 		break;
1061 	}
1062 	release_sock(sk);
1063 	return ret;
1064 }
1065 EXPORT_SYMBOL(sock_setsockopt);
1066 
1067 
1068 static void cred_to_ucred(struct pid *pid, const struct cred *cred,
1069 			  struct ucred *ucred)
1070 {
1071 	ucred->pid = pid_vnr(pid);
1072 	ucred->uid = ucred->gid = -1;
1073 	if (cred) {
1074 		struct user_namespace *current_ns = current_user_ns();
1075 
1076 		ucred->uid = from_kuid_munged(current_ns, cred->euid);
1077 		ucred->gid = from_kgid_munged(current_ns, cred->egid);
1078 	}
1079 }
1080 
1081 static int groups_to_user(gid_t __user *dst, const struct group_info *src)
1082 {
1083 	struct user_namespace *user_ns = current_user_ns();
1084 	int i;
1085 
1086 	for (i = 0; i < src->ngroups; i++)
1087 		if (put_user(from_kgid_munged(user_ns, src->gid[i]), dst + i))
1088 			return -EFAULT;
1089 
1090 	return 0;
1091 }
1092 
/*
 * sock_getsockopt - read one socket-level option (the SO_* cases below)
 * @sock:    socket being queried; most values come from sock->sk
 * @level:   not referenced anywhere in this function
 * @optname: SO_* option selector
 * @optval:  user buffer that receives the value
 * @optlen:  user int holding the buffer size on entry; overwritten with
 *           the number of bytes reported when the call succeeds
 *
 * Most options store their result in the zeroed union 'v' and, when the
 * result is not an int, set 'lv' to its size; the common tail then copies
 * min(len, lv) bytes to @optval.  Options with their own output format
 * copy to user space themselves and jump to 'lenout', or return directly.
 *
 * Returns 0 on success or a negative errno; SO_PEERSEC and
 * SO_BINDTODEVICE return whatever their helper returns.
 */
1093 int sock_getsockopt(struct socket *sock, int level, int optname,
1094 		    char __user *optval, int __user *optlen)
1095 {
1096 	struct sock *sk = sock->sk;
1097 
	/* Scratch area for every option that goes through the common tail. */
1098 	union {
1099 		int val;
1100 		u64 val64;
1101 		struct linger ling;
1102 		struct timeval tm;
1103 		struct sock_txtime txtime;
1104 	} v;
1105 
	/* Size of the result held in 'v'; int unless a case overrides it. */
1106 	int lv = sizeof(int);
1107 	int len;
1108 
1109 	if (get_user(len, optlen))
1110 		return -EFAULT;
1111 	if (len < 0)
1112 		return -EINVAL;
1113 
	/* Zero the whole union so bytes a case does not set are copied out
	 * as zero rather than as stale stack contents.
	 */
1114 	memset(&v, 0, sizeof(v));
1115 
1116 	switch (optname) {
1117 	case SO_DEBUG:
1118 		v.val = sock_flag(sk, SOCK_DBG);
1119 		break;
1120 
1121 	case SO_DONTROUTE:
1122 		v.val = sock_flag(sk, SOCK_LOCALROUTE);
1123 		break;
1124 
1125 	case SO_BROADCAST:
1126 		v.val = sock_flag(sk, SOCK_BROADCAST);
1127 		break;
1128 
1129 	case SO_SNDBUF:
1130 		v.val = sk->sk_sndbuf;
1131 		break;
1132 
1133 	case SO_RCVBUF:
1134 		v.val = sk->sk_rcvbuf;
1135 		break;
1136 
1137 	case SO_REUSEADDR:
1138 		v.val = sk->sk_reuse;
1139 		break;
1140 
1141 	case SO_REUSEPORT:
1142 		v.val = sk->sk_reuseport;
1143 		break;
1144 
1145 	case SO_KEEPALIVE:
1146 		v.val = sock_flag(sk, SOCK_KEEPOPEN);
1147 		break;
1148 
1149 	case SO_TYPE:
1150 		v.val = sk->sk_type;
1151 		break;
1152 
1153 	case SO_PROTOCOL:
1154 		v.val = sk->sk_protocol;
1155 		break;
1156 
1157 	case SO_DOMAIN:
1158 		v.val = sk->sk_family;
1159 		break;
1160 
1161 	case SO_ERROR:
		/* Report sock_error() negated; if that yields 0, fall back to
		 * sk_err_soft, which the xchg() resets to 0 as it is read.
		 */
1162 		v.val = -sock_error(sk);
1163 		if (v.val == 0)
1164 			v.val = xchg(&sk->sk_err_soft, 0);
1165 		break;
1166 
1167 	case SO_OOBINLINE:
1168 		v.val = sock_flag(sk, SOCK_URGINLINE);
1169 		break;
1170 
1171 	case SO_NO_CHECK:
1172 		v.val = sk->sk_no_check_tx;
1173 		break;
1174 
1175 	case SO_PRIORITY:
1176 		v.val = sk->sk_priority;
1177 		break;
1178 
1179 	case SO_LINGER:
1180 		lv		= sizeof(v.ling);
1181 		v.ling.l_onoff	= sock_flag(sk, SOCK_LINGER);
		/* sk_lingertime is divided by HZ before being reported. */
1182 		v.ling.l_linger	= sk->sk_lingertime / HZ;
1183 		break;
1184 
1185 	case SO_BSDCOMPAT:
		/* Only warns; v.val keeps the 0 left by the memset above. */
1186 		sock_warn_obsolete_bsdism("getsockopt");
1187 		break;
1188 
1189 	case SO_TIMESTAMP:
1190 		v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1191 				!sock_flag(sk, SOCK_RCVTSTAMPNS);
1192 		break;
1193 
1194 	case SO_TIMESTAMPNS:
1195 		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
1196 		break;
1197 
1198 	case SO_TIMESTAMPING:
1199 		v.val = sk->sk_tsflags;
1200 		break;
1201 
1202 	case SO_RCVTIMEO:
		/* MAX_SCHEDULE_TIMEOUT is reported as an all-zero timeval. */
1203 		lv = sizeof(struct timeval);
1204 		if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
1205 			v.tm.tv_sec = 0;
1206 			v.tm.tv_usec = 0;
1207 		} else {
1208 			v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
1209 			v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * USEC_PER_SEC) / HZ;
1210 		}
1211 		break;
1212 
1213 	case SO_SNDTIMEO:
		/* Same conversion as SO_RCVTIMEO, applied to sk_sndtimeo. */
1214 		lv = sizeof(struct timeval);
1215 		if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
1216 			v.tm.tv_sec = 0;
1217 			v.tm.tv_usec = 0;
1218 		} else {
1219 			v.tm.tv_sec = sk->sk_sndtimeo / HZ;
1220 			v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * USEC_PER_SEC) / HZ;
1221 		}
1222 		break;
1223 
1224 	case SO_RCVLOWAT:
1225 		v.val = sk->sk_rcvlowat;
1226 		break;
1227 
1228 	case SO_SNDLOWAT:
		/* Always reported as the constant 1. */
1229 		v.val = 1;
1230 		break;
1231 
1232 	case SO_PASSCRED:
1233 		v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1234 		break;
1235 
1236 	case SO_PEERCRED:
1237 	{
1238 		struct ucred peercred;
		/* Clamp to the struct size; a shorter len gets a truncated copy. */
1239 		if (len > sizeof(peercred))
1240 			len = sizeof(peercred);
1241 		cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1242 		if (copy_to_user(optval, &peercred, len))
1243 			return -EFAULT;
1244 		goto lenout;
1245 	}
1246 
1247 	case SO_PEERGROUPS:
1248 	{
1249 		int ret, n;
1250 
1251 		if (!sk->sk_peer_cred)
1252 			return -ENODATA;
1253 
1254 		n = sk->sk_peer_cred->group_info->ngroups;
		/* Buffer too small: write the required size to *optlen and
		 * fail with -ERANGE (or -EFAULT if that write fails).
		 */
1255 		if (len < n * sizeof(gid_t)) {
1256 			len = n * sizeof(gid_t);
1257 			return put_user(len, optlen) ? -EFAULT : -ERANGE;
1258 		}
1259 		len = n * sizeof(gid_t);
1260 
1261 		ret = groups_to_user((gid_t __user *)optval,
1262 				     sk->sk_peer_cred->group_info);
1263 		if (ret)
1264 			return ret;
1265 		goto lenout;
1266 	}
1267 
1268 	case SO_PEERNAME:
1269 	{
1270 		char address[128];
1271 
		/* Any negative result from ->getname() is reported as
		 * -ENOTCONN.  A caller length larger than the returned
		 * address length is rejected; otherwise exactly 'len' bytes
		 * are copied out.
		 */
1272 		lv = sock->ops->getname(sock, (struct sockaddr *)address, 2);
1273 		if (lv < 0)
1274 			return -ENOTCONN;
1275 		if (lv < len)
1276 			return -EINVAL;
1277 		if (copy_to_user(optval, address, len))
1278 			return -EFAULT;
1279 		goto lenout;
1280 	}
1281 
1282 	/* Dubious BSD thing... Probably nobody even uses it, but
1283 	 * the UNIX standard wants it for whatever reason... -DaveM
1284 	 */
1285 	case SO_ACCEPTCONN:
1286 		v.val = sk->sk_state == TCP_LISTEN;
1287 		break;
1288 
1289 	case SO_PASSSEC:
1290 		v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1291 		break;
1292 
1293 	case SO_PEERSEC:
		/* Fully delegated; the helper is handed optval/optlen itself. */
1294 		return security_socket_getpeersec_stream(sock, optval, optlen, len);
1295 
1296 	case SO_MARK:
1297 		v.val = sk->sk_mark;
1298 		break;
1299 
1300 	case SO_RXQ_OVFL:
1301 		v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1302 		break;
1303 
1304 	case SO_WIFI_STATUS:
1305 		v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1306 		break;
1307 
1308 	case SO_PEEK_OFF:
		/* Refused when the socket ops provide no set_peek_off hook. */
1309 		if (!sock->ops->set_peek_off)
1310 			return -EOPNOTSUPP;
1311 
1312 		v.val = sk->sk_peek_off;
1313 		break;
1314 	case SO_NOFCS:
1315 		v.val = sock_flag(sk, SOCK_NOFCS);
1316 		break;
1317 
1318 	case SO_BINDTODEVICE:
		/* Fully delegated, including the *optlen update. */
1319 		return sock_getbindtodevice(sk, optval, optlen, len);
1320 
1321 	case SO_GET_FILTER:
		/* The helper's non-negative result replaces len and is what
		 * gets written back to *optlen; negative results are errors.
		 */
1322 		len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
1323 		if (len < 0)
1324 			return len;
1325 
1326 		goto lenout;
1327 
1328 	case SO_LOCK_FILTER:
1329 		v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1330 		break;
1331 
1332 	case SO_BPF_EXTENSIONS:
1333 		v.val = bpf_tell_extensions();
1334 		break;
1335 
1336 	case SO_SELECT_ERR_QUEUE:
1337 		v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1338 		break;
1339 
1340 #ifdef CONFIG_NET_RX_BUSY_POLL
1341 	case SO_BUSY_POLL:
1342 		v.val = sk->sk_ll_usec;
1343 		break;
1344 #endif
1345 
1346 	case SO_MAX_PACING_RATE:
1347 		/* 32bit version */
1348 		v.val = min_t(unsigned long, sk->sk_max_pacing_rate, ~0U);
1349 		break;
1350 
1351 	case SO_INCOMING_CPU:
1352 		v.val = sk->sk_incoming_cpu;
1353 		break;
1354 
1355 	case SO_MEMINFO:
1356 	{
1357 		u32 meminfo[SK_MEMINFO_VARS];
1358 
		/* NOTE(review): len is fetched from user space a second time
		 * here, although it was already read and range-checked at
		 * function entry.  The unsigned min_t() below clamps whatever
		 * value is read to sizeof(meminfo).
		 */
1359 		if (get_user(len, optlen))
1360 			return -EFAULT;
1361 
1362 		sk_get_meminfo(sk, meminfo);
1363 
1364 		len = min_t(unsigned int, len, sizeof(meminfo));
1365 		if (copy_to_user(optval, &meminfo, len))
1366 			return -EFAULT;
1367 
1368 		goto lenout;
1369 	}
1370 
1371 #ifdef CONFIG_NET_RX_BUSY_POLL
1372 	case SO_INCOMING_NAPI_ID:
1373 		v.val = READ_ONCE(sk->sk_napi_id);
1374 
1375 		/* aggregate non-NAPI IDs down to 0 */
1376 		if (v.val < MIN_NAPI_ID)
1377 			v.val = 0;
1378 
1379 		break;
1380 #endif
1381 
1382 	case SO_COOKIE:
		/* Unlike the other cases, a short buffer is an error here
		 * instead of producing a truncated copy.
		 */
1383 		lv = sizeof(u64);
1384 		if (len < lv)
1385 			return -EINVAL;
1386 		v.val64 = sock_gen_cookie(sk);
1387 		break;
1388 
1389 	case SO_ZEROCOPY:
1390 		v.val = sock_flag(sk, SOCK_ZEROCOPY);
1391 		break;
1392 
1393 	case SO_TXTIME:
		/* flags starts at 0 (memset above), so OR-ing builds it up. */
1394 		lv = sizeof(v.txtime);
1395 		v.txtime.clockid = sk->sk_clockid;
1396 		v.txtime.flags |= sk->sk_txtime_deadline_mode ?
1397 				  SOF_TXTIME_DEADLINE_MODE : 0;
1398 		v.txtime.flags |= sk->sk_txtime_report_errors ?
1399 				  SOF_TXTIME_REPORT_ERRORS : 0;
1400 		break;
1401 
1402 	default:
1403 		/* We implement the SO_SNDLOWAT etc to not be settable
1404 		 * (1003.1g 7).
1405 		 */
1406 		return -ENOPROTOOPT;
1407 	}
1408 
	/* Common tail: never copy more than the result size, and report the
	 * number of bytes actually copied back through *optlen.
	 */
1409 	if (len > lv)
1410 		len = lv;
1411 	if (copy_to_user(optval, &v, len))
1412 		return -EFAULT;
1413 lenout:
1414 	if (put_user(len, optlen))
1415 		return -EFAULT;
1416 	return 0;
1417 }
1418 
1419 /*
1420  * Initialize an sk_lock.
1421  *
1422  * (We also register the sk_lock with the lock validator.)
1423  */
1424 static inline void sock_lock_init(struct sock *sk)
1425 {
1426 	if (sk->sk_kern_sock)
1427 		sock_lock_init_class_and_name(
1428 			sk,
1429 			af_family_kern_slock_key_strings[sk->sk_family],
1430 			af_family_kern_slock_keys + sk->sk_family,
1431 			af_family_kern_key_strings[sk->sk_family],
1432 			af_family_kern_keys + sk->sk_family);
1433 	else
1434 		sock_lock_init_class_and_name(
1435 			sk,
1436 			af_family_slock_key_strings[sk->sk_family],
1437 			af_family_slock_keys + sk->sk_family,
1438 			af_family_key_strings[sk->sk_family],
1439 			af_family_keys + sk->sk_family);
1440 }
1441 
1442 /*
1443  * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
1444  * even temporarly, because of RCU lookups. sk_node should also be left as is.
1445  * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
1446  */
1447 static void sock_copy(struct sock *nsk, const struct sock *osk)
1448 {
1449 #ifdef CONFIG_SECURITY_NETWORK
1450 	void *sptr = nsk->sk_security;
1451 #endif
1452 	memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1453 
1454 	memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1455 	       osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1456 
1457 #ifdef CONFIG_SECURITY_NETWORK
1458 	nsk->sk_security = sptr;
1459 	security_sk_clone(osk, nsk);
1460 #endif
1461 }
1462 
1463 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1464 		int family)
1465 {
1466 	struct sock *sk;
1467 	struct kmem_cache *slab;
1468 
1469 	slab = prot->slab;
1470 	if (slab != NULL) {
1471 		sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1472 		if (!sk)
1473 			return sk;
1474 		if (priority & __GFP_ZERO)
1475 			sk_prot_clear_nulls(sk, prot->obj_size);
1476 	} else
1477 		sk = kmalloc(prot->obj_size, priority);
1478 
1479 	if (sk != NULL) {
1480 		if (security_sk_alloc(sk, family, priority))
1481 			goto out_free;
1482 
1483 		if (!try_module_get(prot->owner))
1484 			goto out_free_sec;
1485 		sk_tx_queue_clear(sk);
1486 	}
1487 
1488 	return sk;
1489 
1490 out_free_sec:
1491 	security_sk_free(sk);
1492 out_free:
1493 	if (slab != NULL)
1494 		kmem_cache_free(slab, sk);
1495 	else
1496 		kfree(sk);
1497 	return NULL;
1498 }
1499 
1500 static void sk_prot_free(struct proto *prot, struct sock *sk)
1501 {
1502 	struct kmem_cache *slab;
1503 	struct module *owner;
1504 
1505 	owner = prot->owner;
1506 	slab = prot->slab;
1507 
1508 	cgroup_sk_free(&sk->sk_cgrp_data);
1509 	mem_cgroup_sk_free(sk);
1510 	security_sk_free(sk);
1511 	if (slab != NULL)
1512 		kmem_cache_free(slab, sk);
1513 	else
1514 		kfree(sk);
1515 	module_put(owner);
1516 }
1517 
1518 /**
1519  *	sk_alloc - All socket objects are allocated here
1520  *	@net: the applicable net namespace
1521  *	@family: protocol family
1522  *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1523  *	@prot: struct proto associated with this new sock instance
1524  *	@kern: is this to be a kernel socket?
1525  */
1526 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1527 		      struct proto *prot, int kern)
1528 {
1529 	struct sock *sk;
1530 
1531 	sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1532 	if (sk) {
1533 		sk->sk_family = family;
1534 		/*
1535 		 * See comment in struct sock definition to understand
1536 		 * why we need sk_prot_creator -acme
1537 		 */
1538 		sk->sk_prot = sk->sk_prot_creator = prot;
1539 		sk->sk_kern_sock = kern;
1540 		sock_lock_init(sk);
1541 		sk->sk_net_refcnt = kern ? 0 : 1;
1542 		if (likely(sk->sk_net_refcnt)) {
1543 			get_net(net);
1544 			sock_inuse_add(net, 1);
1545 		}
1546 
1547 		sock_net_set(sk, net);
1548 		refcount_set(&sk->sk_wmem_alloc, 1);
1549 
1550 		mem_cgroup_sk_alloc(sk);
1551 		cgroup_sk_alloc(&sk->sk_cgrp_data);
1552 		sock_update_classid(&sk->sk_cgrp_data);
1553 		sock_update_netprioidx(&sk->sk_cgrp_data);
1554 	}
1555 
1556 	return sk;
1557 }
1558 EXPORT_SYMBOL(sk_alloc);
1559 
1560 /* Sockets having SOCK_RCU_FREE will call this function after one RCU
1561  * grace period. This is the case for UDP sockets and TCP listeners.
1562  */
1563 static void __sk_destruct(struct rcu_head *head)
1564 {
1565 	struct sock *sk = container_of(head, struct sock, sk_rcu);
1566 	struct sk_filter *filter;
1567 
1568 	if (sk->sk_destruct)
1569 		sk->sk_destruct(sk);
1570 
1571 	filter = rcu_dereference_check(sk->sk_filter,
1572 				       refcount_read(&sk->sk_wmem_alloc) == 0);
1573 	if (filter) {
1574 		sk_filter_uncharge(sk, filter);
1575 		RCU_INIT_POINTER(sk->sk_filter, NULL);
1576 	}
1577 	if (rcu_access_pointer(sk->sk_reuseport_cb))
1578 		reuseport_detach_sock(sk);
1579 
1580 	sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
1581 
1582 	if (atomic_read(&sk->sk_omem_alloc))
1583 		pr_debug("%s: optmem leakage (%d bytes) detected\n",
1584 			 __func__, atomic_read(&sk->sk_omem_alloc));
1585 
1586 	if (sk->sk_frag.page) {
1587 		put_page(sk->sk_frag.page);
1588 		sk->sk_frag.page = NULL;
1589 	}
1590 
1591 	if (sk->sk_peer_cred)
1592 		put_cred(sk->sk_peer_cred);
1593 	put_pid(sk->sk_peer_pid);
1594 	if (likely(sk->sk_net_refcnt))
1595 		put_net(sock_net(sk));
1596 	sk_prot_free(sk->sk_prot_creator, sk);
1597 }
1598 
1599 void sk_destruct(struct sock *sk)
1600 {
1601 	if (sock_flag(sk, SOCK_RCU_FREE))
1602 		call_rcu(&sk->sk_rcu, __sk_destruct);
1603 	else
1604 		__sk_destruct(&sk->sk_rcu);
1605 }
1606 
1607 static void __sk_free(struct sock *sk)
1608 {
1609 	if (likely(sk->sk_net_refcnt))
1610 		sock_inuse_add(sock_net(sk), -1);
1611 
1612 	if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk)))
1613 		sock_diag_broadcast_destroy(sk);
1614 	else
1615 		sk_destruct(sk);
1616 }
1617 
1618 void sk_free(struct sock *sk)
1619 {
1620 	/*
1621 	 * We subtract one from sk_wmem_alloc and can know if
1622 	 * some packets are still in some tx queue.
1623 	 * If not null, sock_wfree() will call __sk_free(sk) later
1624 	 */
1625 	if (refcount_dec_and_test(&sk->sk_wmem_alloc))
1626 		__sk_free(sk);
1627 }
1628 EXPORT_SYMBOL(sk_free);
1629 
1630 static void sk_init_common(struct sock *sk)
1631 {
1632 	skb_queue_head_init(&sk->sk_receive_queue);
1633 	skb_queue_head_init(&sk->sk_write_queue);
1634 	skb_queue_head_init(&sk->sk_error_queue);
1635 
1636 	rwlock_init(&sk->sk_callback_lock);
1637 	lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
1638 			af_rlock_keys + sk->sk_family,
1639 			af_family_rlock_key_strings[sk->sk_family]);
1640 	lockdep_set_class_and_name(&sk->sk_write_queue.lock,
1641 			af_wlock_keys + sk->sk_family,
1642 			af_family_wlock_key_strings[sk->sk_family]);
1643 	lockdep_set_class_and_name(&sk->sk_error_queue.lock,
1644 			af_elock_keys + sk->sk_family,
1645 			af_family_elock_key_strings[sk->sk_family]);
1646 	lockdep_set_class_and_name(&sk->sk_callback_lock,
1647 			af_callback_keys + sk->sk_family,
1648 			af_family_clock_key_strings[sk->sk_family]);
1649 }
1650 
1651 /**
1652  *	sk_clone_lock - clone a socket, and lock its clone
1653  *	@sk: the socket to clone
1654  *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1655  *
1656  *	Caller must unlock socket even in error path (bh_unlock_sock(newsk))
1657  */
1658 struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
1659 {
1660 	struct sock *newsk;
	/* Stays true when the parent has no filter, i.e. nothing to charge. */
1661 	bool is_charged = true;
1662 
	/* Same protocol and family as the parent; NULL is returned as-is
	 * if the allocation fails.
	 */
1663 	newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
1664 	if (newsk != NULL) {
1665 		struct sk_filter *filter;
1666 
		/* Start from a copy of the parent, then reset below every
		 * piece of state that must not be shared with it.
		 */
1667 		sock_copy(newsk, sk);
1668 
1669 		newsk->sk_prot_creator = sk->sk_prot;
1670 
1671 		/* SANITY */
1672 		if (likely(newsk->sk_net_refcnt))
1673 			get_net(sock_net(newsk));
1674 		sk_node_init(&newsk->sk_node);
1675 		sock_lock_init(newsk);
		/* Locked from here on; every exit path leaves unlocking to
		 * the caller or to sk_free_unlock_clone().
		 */
1676 		bh_lock_sock(newsk);
1677 		newsk->sk_backlog.head	= newsk->sk_backlog.tail = NULL;
1678 		newsk->sk_backlog.len = 0;
1679 
1680 		atomic_set(&newsk->sk_rmem_alloc, 0);
1681 		/*
1682 		 * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
1683 		 */
1684 		refcount_set(&newsk->sk_wmem_alloc, 1);
1685 		atomic_set(&newsk->sk_omem_alloc, 0);
1686 		sk_init_common(newsk);
1687 
1688 		newsk->sk_dst_cache	= NULL;
1689 		newsk->sk_dst_pending_confirm = 0;
1690 		newsk->sk_wmem_queued	= 0;
1691 		newsk->sk_forward_alloc = 0;
1692 		atomic_set(&newsk->sk_drops, 0);
1693 		newsk->sk_send_head	= NULL;
		/* All user locks are inherited except SOCK_BINDPORT_LOCK. */
1694 		newsk->sk_userlocks	= sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
1695 		atomic_set(&newsk->sk_zckey, 0);
1696 
1697 		sock_reset_flag(newsk, SOCK_DONE);
1698 		mem_cgroup_sk_alloc(newsk);
1699 		cgroup_sk_alloc(&newsk->sk_cgrp_data);
1700 
		/* The clone points at the parent's filter; the filter is
		 * charged to the clone while under rcu_read_lock().
		 */
1701 		rcu_read_lock();
1702 		filter = rcu_dereference(sk->sk_filter);
1703 		if (filter != NULL)
1704 			/* though it's an empty new sock, the charging may fail
1705 			 * if sysctl_optmem_max was changed between creation of
1706 			 * original socket and cloning
1707 			 */
1708 			is_charged = sk_filter_charge(newsk, filter);
1709 		RCU_INIT_POINTER(newsk->sk_filter, filter);
1710 		rcu_read_unlock();
1711 
		/* Failure path: the clone is unlocked and freed here and
		 * NULL is returned.
		 */
1712 		if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
1713 			/* We need to make sure that we don't uncharge the new
1714 			 * socket if we couldn't charge it in the first place
1715 			 * as otherwise we uncharge the parent's filter.
1716 			 */
1717 			if (!is_charged)
1718 				RCU_INIT_POINTER(newsk->sk_filter, NULL);
1719 			sk_free_unlock_clone(newsk);
1720 			newsk = NULL;
1721 			goto out;
1722 		}
		/* The clone does not inherit the parent's reuseport group. */
1723 		RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
1724 
1725 		newsk->sk_err	   = 0;
1726 		newsk->sk_err_soft = 0;
1727 		newsk->sk_priority = 0;
1728 		newsk->sk_incoming_cpu = raw_smp_processor_id();
1729 		atomic64_set(&newsk->sk_cookie, 0);
1730 		if (likely(newsk->sk_net_refcnt))
1731 			sock_inuse_add(sock_net(newsk), 1);
1732 
1733 		/*
1734 		 * Before updating sk_refcnt, we must commit prior changes to memory
1735 		 * (Documentation/RCU/rculist_nulls.txt for details)
1736 		 */
1737 		smp_wmb();
		/* NOTE(review): the initial count is 2, not 1; the owner of
		 * each of the two references is not visible in this function
		 * - confirm against the callers.
		 */
1738 		refcount_set(&newsk->sk_refcnt, 2);
1739 
1740 		/*
1741 		 * Increment the counter in the same struct proto as the master
1742 		 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
1743 		 * is the same as sk->sk_prot->socks, as this field was copied
1744 		 * with memcpy).
1745 		 *
1746 		 * This _changes_ the previous behaviour, where
1747 		 * tcp_create_openreq_child always was incrementing the
1748 		 * equivalent to tcp_prot->socks (inet_sock_nr), so this have
1749 		 * to be taken into account in all callers. -acme
1750 		 */
1751 		sk_refcnt_debug_inc(newsk);
		/* The clone is not attached to any struct socket or wait
		 * queue yet.
		 */
1752 		sk_set_socket(newsk, NULL);
1753 		newsk->sk_wq = NULL;
1754 
1755 		if (newsk->sk_prot->sockets_allocated)
1756 			sk_sockets_allocated_inc(newsk);
1757 
		/* The timestamp flags were copied from the parent, so the
		 * clone takes its own net_enable_timestamp() reference.
		 */
1758 		if (sock_needs_netstamp(sk) &&
1759 		    newsk->sk_flags & SK_FLAGS_TIMESTAMP)
1760 			net_enable_timestamp();
1761 	}
1762 out:
1763 	return newsk;
1764 }
1765 EXPORT_SYMBOL_GPL(sk_clone_lock);
1766 
1767 void sk_free_unlock_clone(struct sock *sk)
1768 {
1769 	/* It is still raw copy of parent, so invalidate
1770 	 * destructor and make plain sk_free() */
1771 	sk->sk_destruct = NULL;
1772 	bh_unlock_sock(sk);
1773 	sk_free(sk);
1774 }
1775 EXPORT_SYMBOL_GPL(sk_free_unlock_clone);
1776 
1777 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1778 {
1779 	u32 max_segs = 1;
1780 
1781 	sk_dst_set(sk, dst);
1782 	sk->sk_route_caps = dst->dev->features | sk->sk_route_forced_caps;
1783 	if (sk->sk_route_caps & NETIF_F_GSO)
1784 		sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
1785 	sk->sk_route_caps &= ~sk->sk_route_nocaps;
1786 	if (sk_can_gso(sk)) {
1787 		if (dst->header_len && !xfrm_dst_offload_ok(dst)) {
1788 			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1789 		} else {
1790 			sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
1791 			sk->sk_gso_max_size = dst->dev->gso_max_size;
1792 			max_segs = max_t(u32, dst->dev->gso_max_segs, 1);
1793 		}
1794 	}
1795 	sk->sk_gso_max_segs = max_segs;
1796 }
1797 EXPORT_SYMBOL_GPL(sk_setup_caps);
1798 
1799 /*
1800  *	Simple resource managers for sockets.
1801  */
1802 
1803 
1804 /*
1805  * Write buffer destructor automatically called from kfree_skb.
1806  */
1807 void sock_wfree(struct sk_buff *skb)
1808 {
1809 	struct sock *sk = skb->sk;
1810 	unsigned int len = skb->truesize;
1811 
1812 	if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
1813 		/*
1814 		 * Keep a reference on sk_wmem_alloc, this will be released
1815 		 * after sk_write_space() call
1816 		 */
1817 		WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
1818 		sk->sk_write_space(sk);
1819 		len = 1;
1820 	}
1821 	/*
1822 	 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
1823 	 * could not do because of in-flight packets
1824 	 */
1825 	if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
1826 		__sk_free(sk);
1827 }
1828 EXPORT_SYMBOL(sock_wfree);
1829 
1830 /* This variant of sock_wfree() is used by TCP,
1831  * since it sets SOCK_USE_WRITE_QUEUE.
1832  */
1833 void __sock_wfree(struct sk_buff *skb)
1834 {
1835 	struct sock *sk = skb->sk;
1836 
1837 	if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
1838 		__sk_free(sk);
1839 }
1840 
1841 void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
1842 {
1843 	skb_orphan(skb);
1844 	skb->sk = sk;
1845 #ifdef CONFIG_INET
1846 	if (unlikely(!sk_fullsock(sk))) {
1847 		skb->destructor = sock_edemux;
1848 		sock_hold(sk);
1849 		return;
1850 	}
1851 #endif
1852 	skb->destructor = sock_wfree;
1853 	skb_set_hash_from_sk(skb, sk);
1854 	/*
1855 	 * We used to take a refcount on sk, but following operation
1856 	 * is enough to guarantee sk_free() wont free this sock until
1857 	 * all in-flight packets are completed
1858 	 */
1859 	refcount_add(skb->truesize, &sk->sk_wmem_alloc);
1860 }
1861 EXPORT_SYMBOL(skb_set_owner_w);
1862 
1863 /* This helper is used by netem, as it can hold packets in its
1864  * delay queue. We want to allow the owner socket to send more
1865  * packets, as if they were already TX completed by a typical driver.
1866  * But we also want to keep skb->sk set because some packet schedulers
1867  * rely on it (sch_fq for example).
1868  */
1869 void skb_orphan_partial(struct sk_buff *skb)
1870 {
1871 	if (skb_is_tcp_pure_ack(skb))
1872 		return;
1873 
1874 	if (skb->destructor == sock_wfree
1875 #ifdef CONFIG_INET
1876 	    || skb->destructor == tcp_wfree
1877 #endif
1878 		) {
1879 		struct sock *sk = skb->sk;
1880 
1881 		if (refcount_inc_not_zero(&sk->sk_refcnt)) {
1882 			WARN_ON(refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc));
1883 			skb->destructor = sock_efree;
1884 		}
1885 	} else {
1886 		skb_orphan(skb);
1887 	}
1888 }
1889 EXPORT_SYMBOL(skb_orphan_partial);
1890 
1891 /*
1892  * Read buffer destructor automatically called from kfree_skb.
1893  */
1894 void sock_rfree(struct sk_buff *skb)
1895 {
1896 	struct sock *sk = skb->sk;
1897 	unsigned int len = skb->truesize;
1898 
1899 	atomic_sub(len, &sk->sk_rmem_alloc);
1900 	sk_mem_uncharge(sk, len);
1901 }
1902 EXPORT_SYMBOL(sock_rfree);
1903 
1904 /*
1905  * Buffer destructor for skbs that are not used directly in read or write
1906  * path, e.g. for error handler skbs. Automatically called from kfree_skb.
1907  */
1908 void sock_efree(struct sk_buff *skb)
1909 {
1910 	sock_put(skb->sk);
1911 }
1912 EXPORT_SYMBOL(sock_efree);
1913 
1914 kuid_t sock_i_uid(struct sock *sk)
1915 {
1916 	kuid_t uid;
1917 
1918 	read_lock_bh(&sk->sk_callback_lock);
1919 	uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
1920 	read_unlock_bh(&sk->sk_callback_lock);
1921 	return uid;
1922 }
1923 EXPORT_SYMBOL(sock_i_uid);
1924 
1925 unsigned long sock_i_ino(struct sock *sk)
1926 {
1927 	unsigned long ino;
1928 
1929 	read_lock_bh(&sk->sk_callback_lock);
1930 	ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
1931 	read_unlock_bh(&sk->sk_callback_lock);
1932 	return ino;
1933 }
1934 EXPORT_SYMBOL(sock_i_ino);
1935 
1936 /*
1937  * Allocate a skb from the socket's send buffer.
1938  */
1939 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
1940 			     gfp_t priority)
1941 {
1942 	if (force || refcount_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1943 		struct sk_buff *skb = alloc_skb(size, priority);
1944 		if (skb) {
1945 			skb_set_owner_w(skb, sk);
1946 			return skb;
1947 		}
1948 	}
1949 	return NULL;
1950 }
1951 EXPORT_SYMBOL(sock_wmalloc);
1952 
1953 static void sock_ofree(struct sk_buff *skb)
1954 {
1955 	struct sock *sk = skb->sk;
1956 
1957 	atomic_sub(skb->truesize, &sk->sk_omem_alloc);
1958 }
1959 
1960 struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
1961 			     gfp_t priority)
1962 {
1963 	struct sk_buff *skb;
1964 
1965 	/* small safe race: SKB_TRUESIZE may differ from final skb->truesize */
1966 	if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) >
1967 	    sysctl_optmem_max)
1968 		return NULL;
1969 
1970 	skb = alloc_skb(size, priority);
1971 	if (!skb)
1972 		return NULL;
1973 
1974 	atomic_add(skb->truesize, &sk->sk_omem_alloc);
1975 	skb->sk = sk;
1976 	skb->destructor = sock_ofree;
1977 	return skb;
1978 }
1979 
1980 /*
1981  * Allocate a memory block from the socket's option memory buffer.
1982  */
1983 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
1984 {
1985 	if ((unsigned int)size <= sysctl_optmem_max &&
1986 	    atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
1987 		void *mem;
1988 		/* First do the add, to avoid the race if kmalloc
1989 		 * might sleep.
1990 		 */
1991 		atomic_add(size, &sk->sk_omem_alloc);
1992 		mem = kmalloc(size, priority);
1993 		if (mem)
1994 			return mem;
1995 		atomic_sub(size, &sk->sk_omem_alloc);
1996 	}
1997 	return NULL;
1998 }
1999 EXPORT_SYMBOL(sock_kmalloc);
2000 
2001 /* Free an option memory block. Note, we actually want the inline
2002  * here as this allows gcc to detect the nullify and fold away the
2003  * condition entirely.
2004  */
2005 static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
2006 				  const bool nullify)
2007 {
2008 	if (WARN_ON_ONCE(!mem))
2009 		return;
2010 	if (nullify)
2011 		kzfree(mem);
2012 	else
2013 		kfree(mem);
2014 	atomic_sub(size, &sk->sk_omem_alloc);
2015 }
2016 
2017 void sock_kfree_s(struct sock *sk, void *mem, int size)
2018 {
2019 	__sock_kfree_s(sk, mem, size, false);
2020 }
2021 EXPORT_SYMBOL(sock_kfree_s);
2022 
2023 void sock_kzfree_s(struct sock *sk, void *mem, int size)
2024 {
2025 	__sock_kfree_s(sk, mem, size, true);
2026 }
2027 EXPORT_SYMBOL(sock_kzfree_s);
2028 
2029 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
2030    I think, these locks should be removed for datagram sockets.
2031  */
2032 static long sock_wait_for_wmem(struct sock *sk, long timeo)
2033 {
2034 	DEFINE_WAIT(wait);
2035 
2036 	sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2037 	for (;;) {
2038 		if (!timeo)
2039 			break;
2040 		if (signal_pending(current))
2041 			break;
2042 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2043 		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
2044 		if (refcount_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
2045 			break;
2046 		if (sk->sk_shutdown & SEND_SHUTDOWN)
2047 			break;
2048 		if (sk->sk_err)
2049 			break;
2050 		timeo = schedule_timeout(timeo);
2051 	}
2052 	finish_wait(sk_sleep(sk), &wait);
2053 	return timeo;
2054 }
2055 
2056 
2057 /*
2058  *	Generic send/receive buffer handlers
2059  */
2060 
2061 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
2062 				     unsigned long data_len, int noblock,
2063 				     int *errcode, int max_page_order)
2064 {
2065 	struct sk_buff *skb;
2066 	long timeo;
2067 	int err;
2068 
2069 	timeo = sock_sndtimeo(sk, noblock);
2070 	for (;;) {
2071 		err = sock_error(sk);
2072 		if (err != 0)
2073 			goto failure;
2074 
2075 		err = -EPIPE;
2076 		if (sk->sk_shutdown & SEND_SHUTDOWN)
2077 			goto failure;
2078 
2079 		if (sk_wmem_alloc_get(sk) < sk->sk_sndbuf)
2080 			break;
2081 
2082 		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2083 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2084 		err = -EAGAIN;
2085 		if (!timeo)
2086 			goto failure;
2087 		if (signal_pending(current))
2088 			goto interrupted;
2089 		timeo = sock_wait_for_wmem(sk, timeo);
2090 	}
2091 	skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
2092 				   errcode, sk->sk_allocation);
2093 	if (skb)
2094 		skb_set_owner_w(skb, sk);
2095 	return skb;
2096 
2097 interrupted:
2098 	err = sock_intr_errno(timeo);
2099 failure:
2100 	*errcode = err;
2101 	return NULL;
2102 }
2103 EXPORT_SYMBOL(sock_alloc_send_pskb);
2104 
/* Shorthand for sock_alloc_send_pskb() with @size as the header length,
 * a data length of 0 and a maximum page order of 0.
 */
struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
				    int noblock, int *errcode)
{
	const unsigned long data_len = 0;
	const int max_page_order = 0;

	return sock_alloc_send_pskb(sk, size, data_len, noblock, errcode,
				    max_page_order);
}
EXPORT_SYMBOL(sock_alloc_send_skb);
2111 
2112 int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg,
2113 		     struct sockcm_cookie *sockc)
2114 {
2115 	u32 tsflags;
2116 
2117 	switch (cmsg->cmsg_type) {
2118 	case SO_MARK:
2119 		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2120 			return -EPERM;
2121 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2122 			return -EINVAL;
2123 		sockc->mark = *(u32 *)CMSG_DATA(cmsg);
2124 		break;
2125 	case SO_TIMESTAMPING:
2126 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2127 			return -EINVAL;
2128 
2129 		tsflags = *(u32 *)CMSG_DATA(cmsg);
2130 		if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
2131 			return -EINVAL;
2132 
2133 		sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
2134 		sockc->tsflags |= tsflags;
2135 		break;
2136 	case SCM_TXTIME:
2137 		if (!sock_flag(sk, SOCK_TXTIME))
2138 			return -EINVAL;
2139 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64)))
2140 			return -EINVAL;
2141 		sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg));
2142 		break;
2143 	/* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
2144 	case SCM_RIGHTS:
2145 	case SCM_CREDENTIALS:
2146 		break;
2147 	default:
2148 		return -EINVAL;
2149 	}
2150 	return 0;
2151 }
2152 EXPORT_SYMBOL(__sock_cmsg_send);
2153 
2154 int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
2155 		   struct sockcm_cookie *sockc)
2156 {
2157 	struct cmsghdr *cmsg;
2158 	int ret;
2159 
2160 	for_each_cmsghdr(cmsg, msg) {
2161 		if (!CMSG_OK(msg, cmsg))
2162 			return -EINVAL;
2163 		if (cmsg->cmsg_level != SOL_SOCKET)
2164 			continue;
2165 		ret = __sock_cmsg_send(sk, msg, cmsg, sockc);
2166 		if (ret)
2167 			return ret;
2168 	}
2169 	return 0;
2170 }
2171 EXPORT_SYMBOL(sock_cmsg_send);
2172 
2173 static void sk_enter_memory_pressure(struct sock *sk)
2174 {
2175 	if (!sk->sk_prot->enter_memory_pressure)
2176 		return;
2177 
2178 	sk->sk_prot->enter_memory_pressure(sk);
2179 }
2180 
2181 static void sk_leave_memory_pressure(struct sock *sk)
2182 {
2183 	if (sk->sk_prot->leave_memory_pressure) {
2184 		sk->sk_prot->leave_memory_pressure(sk);
2185 	} else {
2186 		unsigned long *memory_pressure = sk->sk_prot->memory_pressure;
2187 
2188 		if (memory_pressure && *memory_pressure)
2189 			*memory_pressure = 0;
2190 	}
2191 }
2192 
2193 /* On 32bit arches, an skb frag is limited to 2^15 */
2194 #define SKB_FRAG_PAGE_ORDER	get_order(32768)
2195 
2196 /**
2197  * skb_page_frag_refill - check that a page_frag contains enough room
2198  * @sz: minimum size of the fragment we want to get
2199  * @pfrag: pointer to page_frag
2200  * @gfp: priority for memory allocation
2201  *
2202  * Note: While this allocator tries to use high order pages, there is
2203  * no guarantee that allocations succeed. Therefore, @sz MUST be
2204  * less or equal than PAGE_SIZE.
2205  */
2206 bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
2207 {
2208 	if (pfrag->page) {
2209 		if (page_ref_count(pfrag->page) == 1) {
2210 			pfrag->offset = 0;
2211 			return true;
2212 		}
2213 		if (pfrag->offset + sz <= pfrag->size)
2214 			return true;
2215 		put_page(pfrag->page);
2216 	}
2217 
2218 	pfrag->offset = 0;
2219 	if (SKB_FRAG_PAGE_ORDER) {
2220 		/* Avoid direct reclaim but allow kswapd to wake */
2221 		pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
2222 					  __GFP_COMP | __GFP_NOWARN |
2223 					  __GFP_NORETRY,
2224 					  SKB_FRAG_PAGE_ORDER);
2225 		if (likely(pfrag->page)) {
2226 			pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
2227 			return true;
2228 		}
2229 	}
2230 	pfrag->page = alloc_page(gfp);
2231 	if (likely(pfrag->page)) {
2232 		pfrag->size = PAGE_SIZE;
2233 		return true;
2234 	}
2235 	return false;
2236 }
2237 EXPORT_SYMBOL(skb_page_frag_refill);
2238 
2239 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
2240 {
2241 	if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
2242 		return true;
2243 
2244 	sk_enter_memory_pressure(sk);
2245 	sk_stream_moderate_sndbuf(sk);
2246 	return false;
2247 }
2248 EXPORT_SYMBOL(sk_page_frag_refill);
2249 
2250 static void __lock_sock(struct sock *sk)
2251 	__releases(&sk->sk_lock.slock)
2252 	__acquires(&sk->sk_lock.slock)
2253 {
2254 	DEFINE_WAIT(wait);
2255 
2256 	for (;;) {
2257 		prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
2258 					TASK_UNINTERRUPTIBLE);
2259 		spin_unlock_bh(&sk->sk_lock.slock);
2260 		schedule();
2261 		spin_lock_bh(&sk->sk_lock.slock);
2262 		if (!sock_owned_by_user(sk))
2263 			break;
2264 	}
2265 	finish_wait(&sk->sk_lock.wq, &wait);
2266 }
2267 
2268 void __release_sock(struct sock *sk)
2269 	__releases(&sk->sk_lock.slock)
2270 	__acquires(&sk->sk_lock.slock)
2271 {
2272 	struct sk_buff *skb, *next;
2273 
2274 	while ((skb = sk->sk_backlog.head) != NULL) {
2275 		sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
2276 
2277 		spin_unlock_bh(&sk->sk_lock.slock);
2278 
2279 		do {
2280 			next = skb->next;
2281 			prefetch(next);
2282 			WARN_ON_ONCE(skb_dst_is_noref(skb));
2283 			skb_mark_not_on_list(skb);
2284 			sk_backlog_rcv(sk, skb);
2285 
2286 			cond_resched();
2287 
2288 			skb = next;
2289 		} while (skb != NULL);
2290 
2291 		spin_lock_bh(&sk->sk_lock.slock);
2292 	}
2293 
2294 	/*
2295 	 * Doing the zeroing here guarantee we can not loop forever
2296 	 * while a wild producer attempts to flood us.
2297 	 */
2298 	sk->sk_backlog.len = 0;
2299 }
2300 
/*
 * Process the socket backlog now: take sk_lock.slock (BH disabled),
 * run __release_sock() and drop the lock again.
 */
void __sk_flush_backlog(struct sock *sk)
{
	spin_lock_bh(&sk->sk_lock.slock);
	__release_sock(sk);
	spin_unlock_bh(&sk->sk_lock.slock);
}
2307 
2308 /**
2309  * sk_wait_data - wait for data to arrive at sk_receive_queue
2310  * @sk:    sock to wait on
2311  * @timeo: for how long
2312  * @skb:   last skb seen on sk_receive_queue
2313  *
2314  * Now socket state including sk->sk_err is changed only under lock,
2315  * hence we may omit checks after joining wait queue.
2316  * We check receive queue before schedule() only as optimization;
2317  * it is very likely that release_sock() added new data.
2318  */
2319 int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
2320 {
2321 	DEFINE_WAIT_FUNC(wait, woken_wake_function);
2322 	int rc;
2323 
2324 	add_wait_queue(sk_sleep(sk), &wait);
2325 	sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2326 	rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
2327 	sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2328 	remove_wait_queue(sk_sleep(sk), &wait);
2329 	return rc;
2330 }
2331 EXPORT_SYMBOL(sk_wait_data);
2332 
/**
 *	__sk_mem_raise_allocated - increase memory_allocated
 *	@sk: socket
 *	@size: memory size to allocate
 *	@amt: pages to allocate
 *	@kind: allocation type
 *
 *	Similar to __sk_mem_schedule(), but does not update sk_forward_alloc
 *
 *	Returns 1 when the allocation is accepted. Returns 0 when it is
 *	suppressed; in that case the @amt added to the protocol counter
 *	(and the memcg charge, if any) has been taken back.
 */
int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
{
	struct proto *prot = sk->sk_prot;
	/* Account first; every suppressed path below undoes this. */
	long allocated = sk_memory_allocated_add(sk, amt);
	bool charged = true;

	/* A failed memcg charge suppresses the allocation outright. */
	if (mem_cgroup_sockets_enabled && sk->sk_memcg &&
	    !(charged = mem_cgroup_charge_skmem(sk->sk_memcg, amt)))
		goto suppress_allocation;

	/* Under limit. */
	if (allocated <= sk_prot_mem_limits(sk, 0)) {
		sk_leave_memory_pressure(sk);
		return 1;
	}

	/* Under pressure. */
	if (allocated > sk_prot_mem_limits(sk, 1))
		sk_enter_memory_pressure(sk);

	/* Over hard limit. */
	if (allocated > sk_prot_mem_limits(sk, 2))
		goto suppress_allocation;

	/* guarantee minimum buffer size under pressure */
	if (kind == SK_MEM_RECV) {
		if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot))
			return 1;

	} else { /* SK_MEM_SEND */
		int wmem0 = sk_get_wmem0(sk, prot);

		/* Stream sockets are judged by sk_wmem_queued, others by sk_wmem_alloc. */
		if (sk->sk_type == SOCK_STREAM) {
			if (sk->sk_wmem_queued < wmem0)
				return 1;
		} else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) {
				return 1;
		}
	}

	if (sk_has_memory_pressure(sk)) {
		int alloc;

		if (!sk_under_memory_pressure(sk))
			return 1;
		alloc = sk_sockets_allocated_read_positive(sk);
		/*
		 * Accept if the hard limit exceeds the socket count times
		 * this socket's own usage in pages.
		 */
		if (sk_prot_mem_limits(sk, 2) > alloc *
		    sk_mem_pages(sk->sk_wmem_queued +
				 atomic_read(&sk->sk_rmem_alloc) +
				 sk->sk_forward_alloc))
			return 1;
	}

suppress_allocation:

	if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
		sk_stream_moderate_sndbuf(sk);

		/* Fail only if socket is _under_ its sndbuf.
		 * In this case we cannot block, so that we have to fail.
		 */
		if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
			return 1;
	}

	if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged))
		trace_sock_exceed_buf_limit(sk, prot, allocated, kind);

	/* Undo the accounting done at function entry. */
	sk_memory_allocated_sub(sk, amt);

	if (mem_cgroup_sockets_enabled && sk->sk_memcg)
		mem_cgroup_uncharge_skmem(sk->sk_memcg, amt);

	return 0;
}
2417 EXPORT_SYMBOL(__sk_mem_raise_allocated);
2418 
2419 /**
2420  *	__sk_mem_schedule - increase sk_forward_alloc and memory_allocated
2421  *	@sk: socket
2422  *	@size: memory size to allocate
2423  *	@kind: allocation type
2424  *
2425  *	If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
2426  *	rmem allocation. This function assumes that protocols which have
2427  *	memory_pressure use sk_wmem_queued as write buffer accounting.
2428  */
2429 int __sk_mem_schedule(struct sock *sk, int size, int kind)
2430 {
2431 	int ret, amt = sk_mem_pages(size);
2432 
2433 	sk->sk_forward_alloc += amt << SK_MEM_QUANTUM_SHIFT;
2434 	ret = __sk_mem_raise_allocated(sk, size, amt, kind);
2435 	if (!ret)
2436 		sk->sk_forward_alloc -= amt << SK_MEM_QUANTUM_SHIFT;
2437 	return ret;
2438 }
2439 EXPORT_SYMBOL(__sk_mem_schedule);
2440 
2441 /**
2442  *	__sk_mem_reduce_allocated - reclaim memory_allocated
2443  *	@sk: socket
2444  *	@amount: number of quanta
2445  *
2446  *	Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
2447  */
2448 void __sk_mem_reduce_allocated(struct sock *sk, int amount)
2449 {
2450 	sk_memory_allocated_sub(sk, amount);
2451 
2452 	if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2453 		mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
2454 
2455 	if (sk_under_memory_pressure(sk) &&
2456 	    (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
2457 		sk_leave_memory_pressure(sk);
2458 }
2459 EXPORT_SYMBOL(__sk_mem_reduce_allocated);
2460 
2461 /**
2462  *	__sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
2463  *	@sk: socket
2464  *	@amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple)
2465  */
2466 void __sk_mem_reclaim(struct sock *sk, int amount)
2467 {
2468 	amount >>= SK_MEM_QUANTUM_SHIFT;
2469 	sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
2470 	__sk_mem_reduce_allocated(sk, amount);
2471 }
2472 EXPORT_SYMBOL(__sk_mem_reclaim);
2473 
2474 int sk_set_peek_off(struct sock *sk, int val)
2475 {
2476 	sk->sk_peek_off = val;
2477 	return 0;
2478 }
2479 EXPORT_SYMBOL_GPL(sk_set_peek_off);
2480 
2481 /*
2482  * Set of default routines for initialising struct proto_ops when
2483  * the protocol does not support a particular function. In certain
2484  * cases where it makes no sense for a protocol to have a "do nothing"
2485  * function, some default processing is provided.
2486  */
2487 
/**
 * sock_no_bind - default bind handler for protocols without bind support
 * @sock: unused
 * @saddr: unused
 * @len: unused
 *
 * Return: always -EOPNOTSUPP.
 */
int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
{
	return -EOPNOTSUPP;
}
2492 EXPORT_SYMBOL(sock_no_bind);
2493 
/**
 * sock_no_connect - default connect handler for protocols without connect
 * @sock: unused
 * @saddr: unused
 * @len: unused
 * @flags: unused
 *
 * Return: always -EOPNOTSUPP.
 */
int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
		    int len, int flags)
{
	return -EOPNOTSUPP;
}
2499 EXPORT_SYMBOL(sock_no_connect);
2500 
/**
 * sock_no_socketpair - default socketpair handler for unsupporting protocols
 * @sock1: unused
 * @sock2: unused
 *
 * Return: always -EOPNOTSUPP.
 */
int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
{
	return -EOPNOTSUPP;
}
2505 EXPORT_SYMBOL(sock_no_socketpair);
2506 
/**
 * sock_no_accept - default accept handler for protocols without accept
 * @sock: unused
 * @newsock: unused
 * @flags: unused
 * @kern: unused
 *
 * Return: always -EOPNOTSUPP.
 */
int sock_no_accept(struct socket *sock, struct socket *newsock, int flags,
		   bool kern)
{
	return -EOPNOTSUPP;
}
2512 EXPORT_SYMBOL(sock_no_accept);
2513 
/**
 * sock_no_getname - default getname handler for protocols without getname
 * @sock: unused
 * @saddr: unused
 * @peer: unused
 *
 * Return: always -EOPNOTSUPP.
 */
int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
		    int peer)
{
	return -EOPNOTSUPP;
}
2519 EXPORT_SYMBOL(sock_no_getname);
2520 
/**
 * sock_no_ioctl - default ioctl handler for protocols without ioctl support
 * @sock: unused
 * @cmd: unused
 * @arg: unused
 *
 * Return: always -EOPNOTSUPP.
 */
int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	return -EOPNOTSUPP;
}
2525 EXPORT_SYMBOL(sock_no_ioctl);
2526 
/**
 * sock_no_listen - default listen handler for protocols without listen
 * @sock: unused
 * @backlog: unused
 *
 * Return: always -EOPNOTSUPP.
 */
int sock_no_listen(struct socket *sock, int backlog)
{
	return -EOPNOTSUPP;
}
2531 EXPORT_SYMBOL(sock_no_listen);
2532 
/**
 * sock_no_shutdown - default shutdown handler for unsupporting protocols
 * @sock: unused
 * @how: unused
 *
 * Return: always -EOPNOTSUPP.
 */
int sock_no_shutdown(struct socket *sock, int how)
{
	return -EOPNOTSUPP;
}
2537 EXPORT_SYMBOL(sock_no_shutdown);
2538 
2539 int sock_no_setsockopt(struct socket *sock, int level, int optname,
2540 		    char __user *optval, unsigned int optlen)
2541 {
2542 	return -EOPNOTSUPP;
2543 }
2544 EXPORT_SYMBOL(sock_no_setsockopt);
2545 
2546 int sock_no_getsockopt(struct socket *sock, int level, int optname,
2547 		    char __user *optval, int __user *optlen)
2548 {
2549 	return -EOPNOTSUPP;
2550 }
2551 EXPORT_SYMBOL(sock_no_getsockopt);
2552 
/**
 * sock_no_sendmsg - default sendmsg handler for unsupporting protocols
 * @sock: unused
 * @m: unused
 * @len: unused
 *
 * Return: always -EOPNOTSUPP.
 */
int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
{
	return -EOPNOTSUPP;
}
2557 EXPORT_SYMBOL(sock_no_sendmsg);
2558 
/**
 * sock_no_sendmsg_locked - default locked-sendmsg handler for unsupporting
 *	protocols
 * @sk: unused
 * @m: unused
 * @len: unused
 *
 * Return: always -EOPNOTSUPP.
 */
int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len)
{
	return -EOPNOTSUPP;
}
2563 EXPORT_SYMBOL(sock_no_sendmsg_locked);
2564 
/**
 * sock_no_recvmsg - default recvmsg handler for unsupporting protocols
 * @sock: unused
 * @m: unused
 * @len: unused
 * @flags: unused
 *
 * Return: always -EOPNOTSUPP.
 */
int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
		    int flags)
{
	return -EOPNOTSUPP;
}
2570 EXPORT_SYMBOL(sock_no_recvmsg);
2571 
/**
 * sock_no_mmap - default mmap handler for protocols without mmap support
 * @file: unused
 * @sock: unused
 * @vma: unused
 *
 * Return: always -ENODEV, mirroring the error code of a missing mmap
 * method.
 */
int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
{
	return -ENODEV;
}
2577 EXPORT_SYMBOL(sock_no_mmap);
2578 
2579 ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
2580 {
2581 	ssize_t res;
2582 	struct msghdr msg = {.msg_flags = flags};
2583 	struct kvec iov;
2584 	char *kaddr = kmap(page);
2585 	iov.iov_base = kaddr + offset;
2586 	iov.iov_len = size;
2587 	res = kernel_sendmsg(sock, &msg, &iov, 1, size);
2588 	kunmap(page);
2589 	return res;
2590 }
2591 EXPORT_SYMBOL(sock_no_sendpage);
2592 
2593 ssize_t sock_no_sendpage_locked(struct sock *sk, struct page *page,
2594 				int offset, size_t size, int flags)
2595 {
2596 	ssize_t res;
2597 	struct msghdr msg = {.msg_flags = flags};
2598 	struct kvec iov;
2599 	char *kaddr = kmap(page);
2600 
2601 	iov.iov_base = kaddr + offset;
2602 	iov.iov_len = size;
2603 	res = kernel_sendmsg_locked(sk, &msg, &iov, 1, size);
2604 	kunmap(page);
2605 	return res;
2606 }
2607 EXPORT_SYMBOL(sock_no_sendpage_locked);
2608 
2609 /*
2610  *	Default Socket Callbacks
2611  */
2612 
2613 static void sock_def_wakeup(struct sock *sk)
2614 {
2615 	struct socket_wq *wq;
2616 
2617 	rcu_read_lock();
2618 	wq = rcu_dereference(sk->sk_wq);
2619 	if (skwq_has_sleeper(wq))
2620 		wake_up_interruptible_all(&wq->wait);
2621 	rcu_read_unlock();
2622 }
2623 
2624 static void sock_def_error_report(struct sock *sk)
2625 {
2626 	struct socket_wq *wq;
2627 
2628 	rcu_read_lock();
2629 	wq = rcu_dereference(sk->sk_wq);
2630 	if (skwq_has_sleeper(wq))
2631 		wake_up_interruptible_poll(&wq->wait, EPOLLERR);
2632 	sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
2633 	rcu_read_unlock();
2634 }
2635 
2636 static void sock_def_readable(struct sock *sk)
2637 {
2638 	struct socket_wq *wq;
2639 
2640 	rcu_read_lock();
2641 	wq = rcu_dereference(sk->sk_wq);
2642 	if (skwq_has_sleeper(wq))
2643 		wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
2644 						EPOLLRDNORM | EPOLLRDBAND);
2645 	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
2646 	rcu_read_unlock();
2647 }
2648 
2649 static void sock_def_write_space(struct sock *sk)
2650 {
2651 	struct socket_wq *wq;
2652 
2653 	rcu_read_lock();
2654 
2655 	/* Do not wake up a writer until he can make "significant"
2656 	 * progress.  --DaveM
2657 	 */
2658 	if ((refcount_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
2659 		wq = rcu_dereference(sk->sk_wq);
2660 		if (skwq_has_sleeper(wq))
2661 			wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
2662 						EPOLLWRNORM | EPOLLWRBAND);
2663 
2664 		/* Should agree with poll, otherwise some programs break */
2665 		if (sock_writeable(sk))
2666 			sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
2667 	}
2668 
2669 	rcu_read_unlock();
2670 }
2671 
/* Default sk_destruct callback: intentionally does nothing. */
static void sock_def_destruct(struct sock *sk)
{
}
2675 
2676 void sk_send_sigurg(struct sock *sk)
2677 {
2678 	if (sk->sk_socket && sk->sk_socket->file)
2679 		if (send_sigurg(&sk->sk_socket->file->f_owner))
2680 			sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
2681 }
2682 EXPORT_SYMBOL(sk_send_sigurg);
2683 
/**
 * sk_reset_timer - (re)arm a socket timer
 * @sk: socket that owns the timer
 * @timer: timer to modify
 * @expires: new expiry time
 *
 * A reference on @sk is taken when mod_timer() returns 0.
 */
void sk_reset_timer(struct sock *sk, struct timer_list *timer,
		    unsigned long expires)
{
	if (mod_timer(timer, expires) == 0)
		sock_hold(sk);
}
2690 EXPORT_SYMBOL(sk_reset_timer);
2691 
/**
 * sk_stop_timer - cancel a socket timer
 * @sk: socket that owns the timer
 * @timer: timer to delete
 *
 * The reference on @sk is dropped when del_timer() returns non-zero.
 */
void sk_stop_timer(struct sock *sk, struct timer_list *timer)
{
	if (del_timer(timer) != 0)
		__sock_put(sk);
}
2697 EXPORT_SYMBOL(sk_stop_timer);
2698 
/**
 * sock_init_data - set a struct sock to its generic default state
 * @sock: socket to attach @sk to, or NULL
 * @sk: sock to initialise
 *
 * Fills in buffer sizes, timeouts, default callbacks and lockdep
 * classes. With a non-NULL @sock the two objects are linked and the
 * type, wait queue and uid are taken from it; otherwise sk_wq is NULL
 * and sk_uid is uid 0 of the sock's network namespace. The reference
 * count is set to 1 only after all other fields have been written.
 */
void sock_init_data(struct socket *sock, struct sock *sk)
{
	sk_init_common(sk);
	sk->sk_send_head	=	NULL;

	timer_setup(&sk->sk_timer, NULL, 0);

	sk->sk_allocation	=	GFP_KERNEL;
	sk->sk_rcvbuf		=	sysctl_rmem_default;
	sk->sk_sndbuf		=	sysctl_wmem_default;
	sk->sk_state		=	TCP_CLOSE;
	sk_set_socket(sk, sock);

	sock_set_flag(sk, SOCK_ZAPPED);

	if (sock) {
		sk->sk_type	=	sock->type;
		sk->sk_wq	=	sock->wq;
		sock->sk	=	sk;
		sk->sk_uid	=	SOCK_INODE(sock)->i_uid;
	} else {
		sk->sk_wq	=	NULL;
		sk->sk_uid	=	make_kuid(sock_net(sk)->user_ns, 0);
	}

	rwlock_init(&sk->sk_callback_lock);
	/* Kernel sockets get their own lockdep class per address family. */
	if (sk->sk_kern_sock)
		lockdep_set_class_and_name(
			&sk->sk_callback_lock,
			af_kern_callback_keys + sk->sk_family,
			af_family_kern_clock_key_strings[sk->sk_family]);
	else
		lockdep_set_class_and_name(
			&sk->sk_callback_lock,
			af_callback_keys + sk->sk_family,
			af_family_clock_key_strings[sk->sk_family]);

	/* Default callbacks. */
	sk->sk_state_change	=	sock_def_wakeup;
	sk->sk_data_ready	=	sock_def_readable;
	sk->sk_write_space	=	sock_def_write_space;
	sk->sk_error_report	=	sock_def_error_report;
	sk->sk_destruct		=	sock_def_destruct;

	sk->sk_frag.page	=	NULL;
	sk->sk_frag.offset	=	0;
	sk->sk_peek_off		=	-1;

	sk->sk_peer_pid 	=	NULL;
	sk->sk_peer_cred	=	NULL;
	sk->sk_write_pending	=	0;
	sk->sk_rcvlowat		=	1;
	sk->sk_rcvtimeo		=	MAX_SCHEDULE_TIMEOUT;
	sk->sk_sndtimeo		=	MAX_SCHEDULE_TIMEOUT;

	sk->sk_stamp = SK_DEFAULT_STAMP;
#if BITS_PER_LONG==32
	seqlock_init(&sk->sk_stamp_seq);
#endif
	atomic_set(&sk->sk_zckey, 0);

#ifdef CONFIG_NET_RX_BUSY_POLL
	sk->sk_napi_id		=	0;
	sk->sk_ll_usec		=	sysctl_net_busy_read;
#endif

	sk->sk_max_pacing_rate = ~0UL;
	sk->sk_pacing_rate = ~0UL;
	sk->sk_pacing_shift = 10;
	sk->sk_incoming_cpu = -1;

	sk_rx_queue_clear(sk);
	/*
	 * Before updating sk_refcnt, we must commit prior changes to memory
	 * (Documentation/RCU/rculist_nulls.txt for details)
	 */
	smp_wmb();
	refcount_set(&sk->sk_refcnt, 1);
	atomic_set(&sk->sk_drops, 0);
}
2778 EXPORT_SYMBOL(sock_init_data);
2779 
/**
 * lock_sock_nested - take ownership of a socket, sleeping if needed
 * @sk: socket to lock
 * @subclass: lockdep subclass passed to mutex_acquire()
 *
 * If the socket is already owned, waits in __lock_sock(). On return
 * sk_lock.owned is 1, sk_lock.slock is released and BH is re-enabled.
 */
void lock_sock_nested(struct sock *sk, int subclass)
{
	might_sleep();
	spin_lock_bh(&sk->sk_lock.slock);
	if (sk->sk_lock.owned)
		__lock_sock(sk);
	sk->sk_lock.owned = 1;
	/* Plain unlock: BH stays disabled until local_bh_enable() below. */
	spin_unlock(&sk->sk_lock.slock);
	/*
	 * The sk_lock has mutex_lock() semantics here:
	 */
	mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
	local_bh_enable();
}
2794 EXPORT_SYMBOL(lock_sock_nested);
2795 
/**
 * release_sock - give up ownership of a socket
 * @sk: socket to release
 *
 * Under sk_lock.slock: processes any queued backlog, runs the
 * protocol's release_cb hook if present, releases ownership and wakes
 * tasks waiting on sk_lock.wq.
 */
void release_sock(struct sock *sk)
{
	spin_lock_bh(&sk->sk_lock.slock);
	/* A non-NULL tail means packets were queued to the backlog. */
	if (sk->sk_backlog.tail)
		__release_sock(sk);

	/* Warning : release_cb() might need to release sk ownership,
	 * ie call sock_release_ownership(sk) before us.
	 */
	if (sk->sk_prot->release_cb)
		sk->sk_prot->release_cb(sk);

	sock_release_ownership(sk);
	if (waitqueue_active(&sk->sk_lock.wq))
		wake_up(&sk->sk_lock.wq);
	spin_unlock_bh(&sk->sk_lock.slock);
}
2813 EXPORT_SYMBOL(release_sock);
2814 
2815 /**
2816  * lock_sock_fast - fast version of lock_sock
2817  * @sk: socket
2818  *
2819  * This version should be used for very small section, where process wont block
2820  * return false if fast path is taken:
2821  *
2822  *   sk_lock.slock locked, owned = 0, BH disabled
2823  *
2824  * return true if slow path is taken:
2825  *
2826  *   sk_lock.slock unlocked, owned = 1, BH enabled
2827  */
2828 bool lock_sock_fast(struct sock *sk)
2829 {
2830 	might_sleep();
2831 	spin_lock_bh(&sk->sk_lock.slock);
2832 
2833 	if (!sk->sk_lock.owned)
2834 		/*
2835 		 * Note : We must disable BH
2836 		 */
2837 		return false;
2838 
2839 	__lock_sock(sk);
2840 	sk->sk_lock.owned = 1;
2841 	spin_unlock(&sk->sk_lock.slock);
2842 	/*
2843 	 * The sk_lock has mutex_lock() semantics here:
2844 	 */
2845 	mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
2846 	local_bh_enable();
2847 	return true;
2848 }
2849 EXPORT_SYMBOL(lock_sock_fast);
2850 
2851 int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
2852 {
2853 	struct timeval tv;
2854 
2855 	sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2856 	tv = ktime_to_timeval(sock_read_timestamp(sk));
2857 	if (tv.tv_sec == -1)
2858 		return -ENOENT;
2859 	if (tv.tv_sec == 0) {
2860 		ktime_t kt = ktime_get_real();
2861 		sock_write_timestamp(sk, kt);
2862 		tv = ktime_to_timeval(kt);
2863 	}
2864 	return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
2865 }
2866 EXPORT_SYMBOL(sock_get_timestamp);
2867 
2868 int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
2869 {
2870 	struct timespec ts;
2871 
2872 	sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2873 	ts = ktime_to_timespec(sock_read_timestamp(sk));
2874 	if (ts.tv_sec == -1)
2875 		return -ENOENT;
2876 	if (ts.tv_sec == 0) {
2877 		ktime_t kt = ktime_get_real();
2878 		sock_write_timestamp(sk, kt);
2879 		ts = ktime_to_timespec(sk->sk_stamp);
2880 	}
2881 	return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
2882 }
2883 EXPORT_SYMBOL(sock_get_timestampns);
2884 
2885 void sock_enable_timestamp(struct sock *sk, int flag)
2886 {
2887 	if (!sock_flag(sk, flag)) {
2888 		unsigned long previous_flags = sk->sk_flags;
2889 
2890 		sock_set_flag(sk, flag);
2891 		/*
2892 		 * we just set one of the two flags which require net
2893 		 * time stamping, but time stamping might have been on
2894 		 * already because of the other one
2895 		 */
2896 		if (sock_needs_netstamp(sk) &&
2897 		    !(previous_flags & SK_FLAGS_TIMESTAMP))
2898 			net_enable_timestamp();
2899 	}
2900 }
2901 
2902 int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
2903 		       int level, int type)
2904 {
2905 	struct sock_exterr_skb *serr;
2906 	struct sk_buff *skb;
2907 	int copied, err;
2908 
2909 	err = -EAGAIN;
2910 	skb = sock_dequeue_err_skb(sk);
2911 	if (skb == NULL)
2912 		goto out;
2913 
2914 	copied = skb->len;
2915 	if (copied > len) {
2916 		msg->msg_flags |= MSG_TRUNC;
2917 		copied = len;
2918 	}
2919 	err = skb_copy_datagram_msg(skb, 0, msg, copied);
2920 	if (err)
2921 		goto out_free_skb;
2922 
2923 	sock_recv_timestamp(msg, sk, skb);
2924 
2925 	serr = SKB_EXT_ERR(skb);
2926 	put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
2927 
2928 	msg->msg_flags |= MSG_ERRQUEUE;
2929 	err = copied;
2930 
2931 out_free_skb:
2932 	kfree_skb(skb);
2933 out:
2934 	return err;
2935 }
2936 EXPORT_SYMBOL(sock_recv_errqueue);
2937 
2938 /*
2939  *	Get a socket option on an socket.
2940  *
2941  *	FIX: POSIX 1003.1g is very ambiguous here. It states that
2942  *	asynchronous errors should be reported by getsockopt. We assume
2943  *	this means if you specify SO_ERROR (otherwise whats the point of it).
2944  */
2945 int sock_common_getsockopt(struct socket *sock, int level, int optname,
2946 			   char __user *optval, int __user *optlen)
2947 {
2948 	struct sock *sk = sock->sk;
2949 
2950 	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2951 }
2952 EXPORT_SYMBOL(sock_common_getsockopt);
2953 
2954 #ifdef CONFIG_COMPAT
2955 int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
2956 				  char __user *optval, int __user *optlen)
2957 {
2958 	struct sock *sk = sock->sk;
2959 
2960 	if (sk->sk_prot->compat_getsockopt != NULL)
2961 		return sk->sk_prot->compat_getsockopt(sk, level, optname,
2962 						      optval, optlen);
2963 	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2964 }
2965 EXPORT_SYMBOL(compat_sock_common_getsockopt);
2966 #endif
2967 
2968 int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
2969 			int flags)
2970 {
2971 	struct sock *sk = sock->sk;
2972 	int addr_len = 0;
2973 	int err;
2974 
2975 	err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
2976 				   flags & ~MSG_DONTWAIT, &addr_len);
2977 	if (err >= 0)
2978 		msg->msg_namelen = addr_len;
2979 	return err;
2980 }
2981 EXPORT_SYMBOL(sock_common_recvmsg);
2982 
2983 /*
2984  *	Set socket options on an inet socket.
2985  */
2986 int sock_common_setsockopt(struct socket *sock, int level, int optname,
2987 			   char __user *optval, unsigned int optlen)
2988 {
2989 	struct sock *sk = sock->sk;
2990 
2991 	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2992 }
2993 EXPORT_SYMBOL(sock_common_setsockopt);
2994 
2995 #ifdef CONFIG_COMPAT
2996 int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
2997 				  char __user *optval, unsigned int optlen)
2998 {
2999 	struct sock *sk = sock->sk;
3000 
3001 	if (sk->sk_prot->compat_setsockopt != NULL)
3002 		return sk->sk_prot->compat_setsockopt(sk, level, optname,
3003 						      optval, optlen);
3004 	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
3005 }
3006 EXPORT_SYMBOL(compat_sock_common_setsockopt);
3007 #endif
3008 
/**
 * sk_common_release - generic teardown of a sock
 * @sk: sock being released
 *
 * Runs the protocol's destroy hook (if any), unhashes the sock, orphans
 * it, frees its xfrm policy and drops a reference with sock_put().
 */
void sk_common_release(struct sock *sk)
{
	if (sk->sk_prot->destroy)
		sk->sk_prot->destroy(sk);

	/*
	 * Observation: when sk_common_release is called, processes have
	 * no access to socket. But net still has.
	 * Step one, detach it from networking:
	 *
	 * A. Remove from hash tables.
	 */

	sk->sk_prot->unhash(sk);

	/*
	 * At this point the socket cannot receive new packets, but it is
	 * possible that some packets are in flight because some CPU runs the
	 * receiver and did the hash table lookup before we unhashed the
	 * socket. They will reach the receive queue and will be purged by
	 * the socket destructor.
	 *
	 * Also we still have packets pending on receive queue and probably,
	 * our own packets waiting in device queues. sock_destroy will drain
	 * receive queue, but transmitted packets will delay socket destruction
	 * until the last reference will be released.
	 */

	sock_orphan(sk);

	xfrm_sk_free_policy(sk);

	sk_refcnt_debug_release(sk);

	sock_put(sk);
}
3044 EXPORT_SYMBOL(sk_common_release);
3045 
3046 void sk_get_meminfo(const struct sock *sk, u32 *mem)
3047 {
3048 	memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);
3049 
3050 	mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
3051 	mem[SK_MEMINFO_RCVBUF] = sk->sk_rcvbuf;
3052 	mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
3053 	mem[SK_MEMINFO_SNDBUF] = sk->sk_sndbuf;
3054 	mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc;
3055 	mem[SK_MEMINFO_WMEM_QUEUED] = sk->sk_wmem_queued;
3056 	mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
3057 	mem[SK_MEMINFO_BACKLOG] = sk->sk_backlog.len;
3058 	mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
3059 }
3060 
3061 #ifdef CONFIG_PROC_FS
#define PROTO_INUSE_NR	64	/* should be enough for the first time */
/* Per-cpu in-use counters, one slot per protocol (indexed by inuse_idx). */
struct prot_inuse {
	int val[PROTO_INUSE_NR];
};

/* Bitmap of the inuse_idx values currently handed out to protocols. */
static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
3068 
3069 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
3070 {
3071 	__this_cpu_add(net->core.prot_inuse->val[prot->inuse_idx], val);
3072 }
3073 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
3074 
3075 int sock_prot_inuse_get(struct net *net, struct proto *prot)
3076 {
3077 	int cpu, idx = prot->inuse_idx;
3078 	int res = 0;
3079 
3080 	for_each_possible_cpu(cpu)
3081 		res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx];
3082 
3083 	return res >= 0 ? res : 0;
3084 }
3085 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
3086 
3087 static void sock_inuse_add(struct net *net, int val)
3088 {
3089 	this_cpu_add(*net->core.sock_inuse, val);
3090 }
3091 
3092 int sock_inuse_get(struct net *net)
3093 {
3094 	int cpu, res = 0;
3095 
3096 	for_each_possible_cpu(cpu)
3097 		res += *per_cpu_ptr(net->core.sock_inuse, cpu);
3098 
3099 	return res;
3100 }
3101 
3102 EXPORT_SYMBOL_GPL(sock_inuse_get);
3103 
3104 static int __net_init sock_inuse_init_net(struct net *net)
3105 {
3106 	net->core.prot_inuse = alloc_percpu(struct prot_inuse);
3107 	if (net->core.prot_inuse == NULL)
3108 		return -ENOMEM;
3109 
3110 	net->core.sock_inuse = alloc_percpu(int);
3111 	if (net->core.sock_inuse == NULL)
3112 		goto out;
3113 
3114 	return 0;
3115 
3116 out:
3117 	free_percpu(net->core.prot_inuse);
3118 	return -ENOMEM;
3119 }
3120 
3121 static void __net_exit sock_inuse_exit_net(struct net *net)
3122 {
3123 	free_percpu(net->core.prot_inuse);
3124 	free_percpu(net->core.sock_inuse);
3125 }
3126 
/* Per-namespace hooks that set up and tear down the in-use counters. */
static struct pernet_operations net_inuse_ops = {
	.init = sock_inuse_init_net,
	.exit = sock_inuse_exit_net,
};
3131 
3132 static __init int net_inuse_init(void)
3133 {
3134 	if (register_pernet_subsys(&net_inuse_ops))
3135 		panic("Cannot initialize net inuse counters");
3136 
3137 	return 0;
3138 }
3139 
3140 core_initcall(net_inuse_init);
3141 
3142 static void assign_proto_idx(struct proto *prot)
3143 {
3144 	prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
3145 
3146 	if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
3147 		pr_err("PROTO_INUSE_NR exhausted\n");
3148 		return;
3149 	}
3150 
3151 	set_bit(prot->inuse_idx, proto_inuse_idx);
3152 }
3153 
3154 static void release_proto_idx(struct proto *prot)
3155 {
3156 	if (prot->inuse_idx != PROTO_INUSE_NR - 1)
3157 		clear_bit(prot->inuse_idx, proto_inuse_idx);
3158 }
3159 #else
/* Without CONFIG_PROC_FS no in-use accounting is kept: empty stubs. */
static inline void assign_proto_idx(struct proto *prot)
{
}

static inline void release_proto_idx(struct proto *prot)
{
}

static void sock_inuse_add(struct net *net, int val)
{
}
3171 #endif
3172 
3173 static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
3174 {
3175 	if (!rsk_prot)
3176 		return;
3177 	kfree(rsk_prot->slab_name);
3178 	rsk_prot->slab_name = NULL;
3179 	kmem_cache_destroy(rsk_prot->slab);
3180 	rsk_prot->slab = NULL;
3181 }
3182 
3183 static int req_prot_init(const struct proto *prot)
3184 {
3185 	struct request_sock_ops *rsk_prot = prot->rsk_prot;
3186 
3187 	if (!rsk_prot)
3188 		return 0;
3189 
3190 	rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
3191 					prot->name);
3192 	if (!rsk_prot->slab_name)
3193 		return -ENOMEM;
3194 
3195 	rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
3196 					   rsk_prot->obj_size, 0,
3197 					   SLAB_ACCOUNT | prot->slab_flags,
3198 					   NULL);
3199 
3200 	if (!rsk_prot->slab) {
3201 		pr_crit("%s: Can't create request sock SLAB cache!\n",
3202 			prot->name);
3203 		return -ENOMEM;
3204 	}
3205 	return 0;
3206 }
3207 
3208 int proto_register(struct proto *prot, int alloc_slab)
3209 {
3210 	if (alloc_slab) {
3211 		prot->slab = kmem_cache_create_usercopy(prot->name,
3212 					prot->obj_size, 0,
3213 					SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT |
3214 					prot->slab_flags,
3215 					prot->useroffset, prot->usersize,
3216 					NULL);
3217 
3218 		if (prot->slab == NULL) {
3219 			pr_crit("%s: Can't create sock SLAB cache!\n",
3220 				prot->name);
3221 			goto out;
3222 		}
3223 
3224 		if (req_prot_init(prot))
3225 			goto out_free_request_sock_slab;
3226 
3227 		if (prot->twsk_prot != NULL) {
3228 			prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);
3229 
3230 			if (prot->twsk_prot->twsk_slab_name == NULL)
3231 				goto out_free_request_sock_slab;
3232 
3233 			prot->twsk_prot->twsk_slab =
3234 				kmem_cache_create(prot->twsk_prot->twsk_slab_name,
3235 						  prot->twsk_prot->twsk_obj_size,
3236 						  0,
3237 						  SLAB_ACCOUNT |
3238 						  prot->slab_flags,
3239 						  NULL);
3240 			if (prot->twsk_prot->twsk_slab == NULL)
3241 				goto out_free_timewait_sock_slab_name;
3242 		}
3243 	}
3244 
3245 	mutex_lock(&proto_list_mutex);
3246 	list_add(&prot->node, &proto_list);
3247 	assign_proto_idx(prot);
3248 	mutex_unlock(&proto_list_mutex);
3249 	return 0;
3250 
3251 out_free_timewait_sock_slab_name:
3252 	kfree(prot->twsk_prot->twsk_slab_name);
3253 out_free_request_sock_slab:
3254 	req_prot_cleanup(prot->rsk_prot);
3255 
3256 	kmem_cache_destroy(prot->slab);
3257 	prot->slab = NULL;
3258 out:
3259 	return -ENOBUFS;
3260 }
3261 EXPORT_SYMBOL(proto_register);
3262 
3263 void proto_unregister(struct proto *prot)
3264 {
3265 	mutex_lock(&proto_list_mutex);
3266 	release_proto_idx(prot);
3267 	list_del(&prot->node);
3268 	mutex_unlock(&proto_list_mutex);
3269 
3270 	kmem_cache_destroy(prot->slab);
3271 	prot->slab = NULL;
3272 
3273 	req_prot_cleanup(prot->rsk_prot);
3274 
3275 	if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
3276 		kmem_cache_destroy(prot->twsk_prot->twsk_slab);
3277 		kfree(prot->twsk_prot->twsk_slab_name);
3278 		prot->twsk_prot->twsk_slab = NULL;
3279 	}
3280 }
3281 EXPORT_SYMBOL(proto_unregister);
3282 
3283 int sock_load_diag_module(int family, int protocol)
3284 {
3285 	if (!protocol) {
3286 		if (!sock_is_registered(family))
3287 			return -ENOENT;
3288 
3289 		return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
3290 				      NETLINK_SOCK_DIAG, family);
3291 	}
3292 
3293 #ifdef CONFIG_INET
3294 	if (family == AF_INET &&
3295 	    protocol != IPPROTO_RAW &&
3296 	    !rcu_access_pointer(inet_protos[protocol]))
3297 		return -ENOENT;
3298 #endif
3299 
3300 	return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK,
3301 			      NETLINK_SOCK_DIAG, family, protocol);
3302 }
3303 EXPORT_SYMBOL(sock_load_diag_module);
3304 
3305 #ifdef CONFIG_PROC_FS
3306 static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
3307 	__acquires(proto_list_mutex)
3308 {
3309 	mutex_lock(&proto_list_mutex);
3310 	return seq_list_start_head(&proto_list, *pos);
3311 }
3312 
3313 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3314 {
3315 	return seq_list_next(v, &proto_list, pos);
3316 }
3317 
3318 static void proto_seq_stop(struct seq_file *seq, void *v)
3319 	__releases(proto_list_mutex)
3320 {
3321 	mutex_unlock(&proto_list_mutex);
3322 }
3323 
/* Map a method pointer to a flag character: 'y' if set, 'n' if NULL. */
static char proto_method_implemented(const void *method)
{
	if (method)
		return 'y';

	return 'n';
}
3328 static long sock_prot_memory_allocated(struct proto *proto)
3329 {
3330 	return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
3331 }
3332 
3333 static char *sock_prot_memory_pressure(struct proto *proto)
3334 {
3335 	return proto->memory_pressure != NULL ?
3336 	proto_memory_pressure(proto) ? "yes" : "no" : "NI";
3337 }
3338 
3339 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
3340 {
3341 
3342 	seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
3343 			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
3344 		   proto->name,
3345 		   proto->obj_size,
3346 		   sock_prot_inuse_get(seq_file_net(seq), proto),
3347 		   sock_prot_memory_allocated(proto),
3348 		   sock_prot_memory_pressure(proto),
3349 		   proto->max_header,
3350 		   proto->slab == NULL ? "no" : "yes",
3351 		   module_name(proto->owner),
3352 		   proto_method_implemented(proto->close),
3353 		   proto_method_implemented(proto->connect),
3354 		   proto_method_implemented(proto->disconnect),
3355 		   proto_method_implemented(proto->accept),
3356 		   proto_method_implemented(proto->ioctl),
3357 		   proto_method_implemented(proto->init),
3358 		   proto_method_implemented(proto->destroy),
3359 		   proto_method_implemented(proto->shutdown),
3360 		   proto_method_implemented(proto->setsockopt),
3361 		   proto_method_implemented(proto->getsockopt),
3362 		   proto_method_implemented(proto->sendmsg),
3363 		   proto_method_implemented(proto->recvmsg),
3364 		   proto_method_implemented(proto->sendpage),
3365 		   proto_method_implemented(proto->bind),
3366 		   proto_method_implemented(proto->backlog_rcv),
3367 		   proto_method_implemented(proto->hash),
3368 		   proto_method_implemented(proto->unhash),
3369 		   proto_method_implemented(proto->get_port),
3370 		   proto_method_implemented(proto->enter_memory_pressure));
3371 }
3372 
3373 static int proto_seq_show(struct seq_file *seq, void *v)
3374 {
3375 	if (v == &proto_list)
3376 		seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
3377 			   "protocol",
3378 			   "size",
3379 			   "sockets",
3380 			   "memory",
3381 			   "press",
3382 			   "maxhdr",
3383 			   "slab",
3384 			   "module",
3385 			   "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
3386 	else
3387 		proto_seq_printf(seq, list_entry(v, struct proto, node));
3388 	return 0;
3389 }
3390 
3391 static const struct seq_operations proto_seq_ops = {
3392 	.start  = proto_seq_start,
3393 	.next   = proto_seq_next,
3394 	.stop   = proto_seq_stop,
3395 	.show   = proto_seq_show,
3396 };
3397 
3398 static __net_init int proto_init_net(struct net *net)
3399 {
3400 	if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops,
3401 			sizeof(struct seq_net_private)))
3402 		return -ENOMEM;
3403 
3404 	return 0;
3405 }
3406 
3407 static __net_exit void proto_exit_net(struct net *net)
3408 {
3409 	remove_proc_entry("protocols", net->proc_net);
3410 }
3411 
3412 
3413 static __net_initdata struct pernet_operations proto_net_ops = {
3414 	.init = proto_init_net,
3415 	.exit = proto_exit_net,
3416 };
3417 
3418 static int __init proto_init(void)
3419 {
3420 	return register_pernet_subsys(&proto_net_ops);
3421 }
3422 
3423 subsys_initcall(proto_init);
3424 
3425 #endif /* PROC_FS */
3426 
3427 #ifdef CONFIG_NET_RX_BUSY_POLL
3428 bool sk_busy_loop_end(void *p, unsigned long start_time)
3429 {
3430 	struct sock *sk = p;
3431 
3432 	return !skb_queue_empty(&sk->sk_receive_queue) ||
3433 	       sk_busy_loop_timeout(sk, start_time);
3434 }
3435 EXPORT_SYMBOL(sk_busy_loop_end);
3436 #endif /* CONFIG_NET_RX_BUSY_POLL */
3437