xref: /openbmc/linux/net/core/sock.c (revision 4c79e98b)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		Generic socket support routines. Memory allocators, socket lock/release
7  *		handler for protocols to use and generic option handler.
8  *
9  *
10  * Authors:	Ross Biro
11  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *		Florian La Roche, <flla@stud.uni-sb.de>
13  *		Alan Cox, <A.Cox@swansea.ac.uk>
14  *
15  * Fixes:
16  *		Alan Cox	: 	Numerous verify_area() problems
17  *		Alan Cox	:	Connecting on a connecting socket
18  *					now returns an error for tcp.
19  *		Alan Cox	:	sock->protocol is set correctly.
20  *					and is not sometimes left as 0.
21  *		Alan Cox	:	connect handles icmp errors on a
22  *					connect properly. Unfortunately there
23  *					is a restart syscall nasty there. I
24  *					can't match BSD without hacking the C
25  *					library. Ideas urgently sought!
26  *		Alan Cox	:	Disallow bind() to addresses that are
27  *					not ours - especially broadcast ones!!
28  *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
29  *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
30  *					instead they leave that for the DESTROY timer.
31  *		Alan Cox	:	Clean up error flag in accept
32  *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
33  *					was buggy. Put a remove_sock() in the handler
34  *					for memory when we hit 0. Also altered the timer
35  *					code. The ACK stuff can wait and needs major
36  *					TCP layer surgery.
37  *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
38  *					and fixed timer/inet_bh race.
39  *		Alan Cox	:	Added zapped flag for TCP
40  *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
41  *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
42  *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
43  *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
44  *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
45  *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
46  *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
47  *	Pauline Middelink	:	identd support
48  *		Alan Cox	:	Fixed connect() taking signals I think.
49  *		Alan Cox	:	SO_LINGER supported
50  *		Alan Cox	:	Error reporting fixes
51  *		Anonymous	:	inet_create tidied up (sk->reuse setting)
52  *		Alan Cox	:	inet sockets don't set sk->type!
53  *		Alan Cox	:	Split socket option code
54  *		Alan Cox	:	Callbacks
55  *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
56  *		Alex		:	Removed restriction on inet fioctl
57  *		Alan Cox	:	Splitting INET from NET core
58  *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
59  *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
60  *		Alan Cox	:	Split IP from generic code
61  *		Alan Cox	:	New kfree_skbmem()
62  *		Alan Cox	:	Make SO_DEBUG superuser only.
63  *		Alan Cox	:	Allow anyone to clear SO_DEBUG
64  *					(compatibility fix)
65  *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
66  *		Alan Cox	:	Allocator for a socket is settable.
67  *		Alan Cox	:	SO_ERROR includes soft errors.
68  *		Alan Cox	:	Allow NULL arguments on some SO_ opts
69  *		Alan Cox	: 	Generic socket allocation to make hooks
70  *					easier (suggested by Craig Metz).
71  *		Michael Pall	:	SO_ERROR returns positive errno again
72  *              Steve Whitehouse:       Added default destructor to free
73  *                                      protocol private data.
74  *              Steve Whitehouse:       Added various other default routines
75  *                                      common to several socket families.
76  *              Chris Evans     :       Call suser() check last on F_SETOWN
77  *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
78  *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
79  *		Andi Kleen	:	Fix write_space callback
80  *		Chris Evans	:	Security fixes - signedness again
81  *		Arnaldo C. Melo :       cleanups, use skb_queue_purge
82  *
83  * To Fix:
84  *
85  *
86  *		This program is free software; you can redistribute it and/or
87  *		modify it under the terms of the GNU General Public License
88  *		as published by the Free Software Foundation; either version
89  *		2 of the License, or (at your option) any later version.
90  */
91 
92 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
93 
94 #include <linux/capability.h>
95 #include <linux/errno.h>
96 #include <linux/errqueue.h>
97 #include <linux/types.h>
98 #include <linux/socket.h>
99 #include <linux/in.h>
100 #include <linux/kernel.h>
101 #include <linux/module.h>
102 #include <linux/proc_fs.h>
103 #include <linux/seq_file.h>
104 #include <linux/sched.h>
105 #include <linux/timer.h>
106 #include <linux/string.h>
107 #include <linux/sockios.h>
108 #include <linux/net.h>
109 #include <linux/mm.h>
110 #include <linux/slab.h>
111 #include <linux/interrupt.h>
112 #include <linux/poll.h>
113 #include <linux/tcp.h>
114 #include <linux/init.h>
115 #include <linux/highmem.h>
116 #include <linux/user_namespace.h>
117 #include <linux/static_key.h>
118 #include <linux/memcontrol.h>
119 #include <linux/prefetch.h>
120 
121 #include <linux/uaccess.h>
122 
123 #include <linux/netdevice.h>
124 #include <net/protocol.h>
125 #include <linux/skbuff.h>
126 #include <net/net_namespace.h>
127 #include <net/request_sock.h>
128 #include <net/sock.h>
129 #include <linux/net_tstamp.h>
130 #include <net/xfrm.h>
131 #include <linux/ipsec.h>
132 #include <net/cls_cgroup.h>
133 #include <net/netprio_cgroup.h>
134 #include <linux/sock_diag.h>
135 
136 #include <linux/filter.h>
137 #include <net/sock_reuseport.h>
138 
139 #include <trace/events/sock.h>
140 
141 #ifdef CONFIG_INET
142 #include <net/tcp.h>
143 #endif
144 
145 #include <net/busy_poll.h>
146 
147 static DEFINE_MUTEX(proto_list_mutex);
148 static LIST_HEAD(proto_list);
149 
150 /**
151  * sk_ns_capable - General socket capability test
152  * @sk: Socket to use a capability on or through
153  * @user_ns: The user namespace of the capability to use
154  * @cap: The capability to use
155  *
156  * Test to see if the opener of the socket had the capability @cap when
157  * the socket was created and if the current process has the capability
158  * @cap in the user namespace @user_ns.
159  */
160 bool sk_ns_capable(const struct sock *sk,
161 		   struct user_namespace *user_ns, int cap)
162 {
163 	return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
164 		ns_capable(user_ns, cap);
165 }
166 EXPORT_SYMBOL(sk_ns_capable);
167 
168 /**
169  * sk_capable - Socket global capability test
170  * @sk: Socket to use a capability on or through
171  * @cap: The global capability to use
172  *
173  * Test to see if the opener of the socket had the capability @cap when
174  * the socket was created and if the current process has the capability
175  * @cap in all user namespaces.
176  */
177 bool sk_capable(const struct sock *sk, int cap)
178 {
179 	return sk_ns_capable(sk, &init_user_ns, cap);
180 }
181 EXPORT_SYMBOL(sk_capable);
182 
183 /**
184  * sk_net_capable - Network namespace socket capability test
185  * @sk: Socket to use a capability on or through
186  * @cap: The capability to use
187  *
188  * Test to see if the opener of the socket had the capability @cap when the
189  * socket was created and if the current process has the capability @cap over
190  * the network namespace the socket is a member of.
191  */
192 bool sk_net_capable(const struct sock *sk, int cap)
193 {
194 	return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
195 }
196 EXPORT_SYMBOL(sk_net_capable);
197 
198 /*
199  * Each address family might have different locking rules, so we have
200  * one slock key per address family and separate keys for internal and
201  * userspace sockets.
202  */
203 static struct lock_class_key af_family_keys[AF_MAX];
204 static struct lock_class_key af_family_kern_keys[AF_MAX];
205 static struct lock_class_key af_family_slock_keys[AF_MAX];
206 static struct lock_class_key af_family_kern_slock_keys[AF_MAX];
207 
208 /*
209  * Make lock validator output more readable. (we pre-construct these
210  * strings build-time, so that runtime initialization of socket
211  * locks is fast):
212  */
213 
214 #define _sock_locks(x)						  \
215   x "AF_UNSPEC",	x "AF_UNIX"     ,	x "AF_INET"     , \
216   x "AF_AX25"  ,	x "AF_IPX"      ,	x "AF_APPLETALK", \
217   x "AF_NETROM",	x "AF_BRIDGE"   ,	x "AF_ATMPVC"   , \
218   x "AF_X25"   ,	x "AF_INET6"    ,	x "AF_ROSE"     , \
219   x "AF_DECnet",	x "AF_NETBEUI"  ,	x "AF_SECURITY" , \
220   x "AF_KEY"   ,	x "AF_NETLINK"  ,	x "AF_PACKET"   , \
221   x "AF_ASH"   ,	x "AF_ECONET"   ,	x "AF_ATMSVC"   , \
222   x "AF_RDS"   ,	x "AF_SNA"      ,	x "AF_IRDA"     , \
223   x "AF_PPPOX" ,	x "AF_WANPIPE"  ,	x "AF_LLC"      , \
224   x "27"       ,	x "28"          ,	x "AF_CAN"      , \
225   x "AF_TIPC"  ,	x "AF_BLUETOOTH",	x "IUCV"        , \
226   x "AF_RXRPC" ,	x "AF_ISDN"     ,	x "AF_PHONET"   , \
227   x "AF_IEEE802154",	x "AF_CAIF"	,	x "AF_ALG"      , \
228   x "AF_NFC"   ,	x "AF_VSOCK"    ,	x "AF_KCM"      , \
229   x "AF_QIPCRTR",	x "AF_SMC"	,	x "AF_MAX"
230 
231 static const char *const af_family_key_strings[AF_MAX+1] = {
232 	_sock_locks("sk_lock-")
233 };
234 static const char *const af_family_slock_key_strings[AF_MAX+1] = {
235 	_sock_locks("slock-")
236 };
237 static const char *const af_family_clock_key_strings[AF_MAX+1] = {
238 	_sock_locks("clock-")
239 };
240 
241 static const char *const af_family_kern_key_strings[AF_MAX+1] = {
242 	_sock_locks("k-sk_lock-")
243 };
244 static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
245 	_sock_locks("k-slock-")
246 };
247 static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
248 	_sock_locks("k-clock-")
249 };
250 
251 /*
252  * sk_callback_lock locking rules are per-address-family,
253  * so split the lock classes by using a per-AF key:
254  */
255 static struct lock_class_key af_callback_keys[AF_MAX];
256 static struct lock_class_key af_kern_callback_keys[AF_MAX];
257 
258 /* Take into consideration the size of the struct sk_buff overhead in the
259  * determination of these values, since that is non-constant across
260  * platforms.  This makes socket queueing behavior and performance
261  * not depend upon such differences.
262  */
263 #define _SK_MEM_PACKETS		256
264 #define _SK_MEM_OVERHEAD	SKB_TRUESIZE(256)
265 #define SK_WMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
266 #define SK_RMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
267 
268 /* Run time adjustable parameters. */
269 __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
270 EXPORT_SYMBOL(sysctl_wmem_max);
271 __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
272 EXPORT_SYMBOL(sysctl_rmem_max);
273 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
274 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
275 
276 /* Maximal space eaten by iovec or ancillary data plus some space */
277 int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
278 EXPORT_SYMBOL(sysctl_optmem_max);
279 
280 int sysctl_tstamp_allow_data __read_mostly = 1;
281 
282 struct static_key memalloc_socks = STATIC_KEY_INIT_FALSE;
283 EXPORT_SYMBOL_GPL(memalloc_socks);
284 
285 /**
286  * sk_set_memalloc - sets %SOCK_MEMALLOC
287  * @sk: socket to set it on
288  *
289  * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
290  * It's the responsibility of the admin to adjust min_free_kbytes
291  * to meet the requirements.
292  */
293 void sk_set_memalloc(struct sock *sk)
294 {
295 	sock_set_flag(sk, SOCK_MEMALLOC);
296 	sk->sk_allocation |= __GFP_MEMALLOC;
297 	static_key_slow_inc(&memalloc_socks);
298 }
299 EXPORT_SYMBOL_GPL(sk_set_memalloc);
300 
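/**
 * sk_clear_memalloc - clears %SOCK_MEMALLOC
 * @sk: socket to clear it on
 *
 * Clear %SOCK_MEMALLOC on a socket and drop its access to the emergency
 * reserves. Memory charged while the flag was set is reclaimed so that
 * the socket honours the normal rmem limits again.
 */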
301 void sk_clear_memalloc(struct sock *sk)
302 {
303 	sock_reset_flag(sk, SOCK_MEMALLOC);
304 	sk->sk_allocation &= ~__GFP_MEMALLOC;
305 	static_key_slow_dec(&memalloc_socks);
306 
307 	/*
308 	 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
309 	 * progress of swapping. SOCK_MEMALLOC may be cleared while
310 	 * it has rmem allocations due to the last swapfile being deactivated
311 	 * but there is a risk that the socket is unusable due to exceeding
312 	 * the rmem limits. Reclaim the reserves and obey rmem limits again.
313 	 */
314 	sk_mem_reclaim(sk);
315 }
316 EXPORT_SYMBOL_GPL(sk_clear_memalloc);
317 
318 int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
319 {
320 	int ret;
321 	unsigned long pflags = current->flags;
322 
323 	/* these should have been dropped before queueing */
324 	BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
325 
326 	current->flags |= PF_MEMALLOC;
327 	ret = sk->sk_backlog_rcv(sk, skb);
328 	tsk_restore_flags(current, pflags, PF_MEMALLOC);
329 
330 	return ret;
331 }
332 EXPORT_SYMBOL(__sk_backlog_rcv);
333 
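/* Convert a SO_RCVTIMEO/SO_SNDTIMEO struct timeval supplied by userspace
 * into a timeout in jiffies. A zero timeval means "wait forever"
 * (MAX_SCHEDULE_TIMEOUT); negative seconds are treated as an immediate
 * timeout and logged a few times. For example (illustrative values,
 * assuming HZ=1000), a 2.5s timeval becomes
 * 2 * 1000 + DIV_ROUND_UP(500000, 1000) = 2500 jiffies.
 */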
334 static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
335 {
336 	struct timeval tv;
337 
338 	if (optlen < sizeof(tv))
339 		return -EINVAL;
340 	if (copy_from_user(&tv, optval, sizeof(tv)))
341 		return -EFAULT;
342 	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
343 		return -EDOM;
344 
345 	if (tv.tv_sec < 0) {
346 		static int warned __read_mostly;
347 
348 		*timeo_p = 0;
349 		if (warned < 10 && net_ratelimit()) {
350 			warned++;
351 			pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
352 				__func__, current->comm, task_pid_nr(current));
353 		}
354 		return 0;
355 	}
356 	*timeo_p = MAX_SCHEDULE_TIMEOUT;
357 	if (tv.tv_sec == 0 && tv.tv_usec == 0)
358 		return 0;
359 	if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
360 		*timeo_p = tv.tv_sec * HZ + DIV_ROUND_UP(tv.tv_usec, USEC_PER_SEC / HZ);
361 	return 0;
362 }
363 
364 static void sock_warn_obsolete_bsdism(const char *name)
365 {
366 	static int warned;
367 	static char warncomm[TASK_COMM_LEN];
368 	if (strcmp(warncomm, current->comm) && warned < 5) {
369 		strcpy(warncomm,  current->comm);
370 		pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n",
371 			warncomm, name);
372 		warned++;
373 	}
374 }
375 
376 static bool sock_needs_netstamp(const struct sock *sk)
377 {
378 	switch (sk->sk_family) {
379 	case AF_UNSPEC:
380 	case AF_UNIX:
381 		return false;
382 	default:
383 		return true;
384 	}
385 }
386 
387 static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
388 {
389 	if (sk->sk_flags & flags) {
390 		sk->sk_flags &= ~flags;
391 		if (sock_needs_netstamp(sk) &&
392 		    !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
393 			net_disable_timestamp();
394 	}
395 }
396 
397 
398 int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
399 {
400 	unsigned long flags;
401 	struct sk_buff_head *list = &sk->sk_receive_queue;
402 
403 	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
404 		atomic_inc(&sk->sk_drops);
405 		trace_sock_rcvqueue_full(sk, skb);
406 		return -ENOMEM;
407 	}
408 
409 	if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
410 		atomic_inc(&sk->sk_drops);
411 		return -ENOBUFS;
412 	}
413 
414 	skb->dev = NULL;
415 	skb_set_owner_r(skb, sk);
416 
417 	/* We escape from the RCU-protected region, so make sure we don't
418 	 * leak a non-refcounted dst
419 	 */
420 	skb_dst_force(skb);
421 
422 	spin_lock_irqsave(&list->lock, flags);
423 	sock_skb_set_dropcount(sk, skb);
424 	__skb_queue_tail(list, skb);
425 	spin_unlock_irqrestore(&list->lock, flags);
426 
427 	if (!sock_flag(sk, SOCK_DEAD))
428 		sk->sk_data_ready(sk);
429 	return 0;
430 }
431 EXPORT_SYMBOL(__sock_queue_rcv_skb);
432 
433 int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
434 {
435 	int err;
436 
437 	err = sk_filter(sk, skb);
438 	if (err)
439 		return err;
440 
441 	return __sock_queue_rcv_skb(sk, skb);
442 }
443 EXPORT_SYMBOL(sock_queue_rcv_skb);
444 
445 int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
446 		     const int nested, unsigned int trim_cap, bool refcounted)
447 {
448 	int rc = NET_RX_SUCCESS;
449 
450 	if (sk_filter_trim_cap(sk, skb, trim_cap))
451 		goto discard_and_relse;
452 
453 	skb->dev = NULL;
454 
455 	if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
456 		atomic_inc(&sk->sk_drops);
457 		goto discard_and_relse;
458 	}
459 	if (nested)
460 		bh_lock_sock_nested(sk);
461 	else
462 		bh_lock_sock(sk);
463 	if (!sock_owned_by_user(sk)) {
464 		/*
465 		 * trylock + unlock semantics:
466 		 */
467 		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
468 
469 		rc = sk_backlog_rcv(sk, skb);
470 
471 		mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
472 	} else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
473 		bh_unlock_sock(sk);
474 		atomic_inc(&sk->sk_drops);
475 		goto discard_and_relse;
476 	}
477 
478 	bh_unlock_sock(sk);
479 out:
480 	if (refcounted)
481 		sock_put(sk);
482 	return rc;
483 discard_and_relse:
484 	kfree_skb(skb);
485 	goto out;
486 }
487 EXPORT_SYMBOL(__sk_receive_skb);
488 
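/* Validate the socket's cached route against @cookie: if the dst has been
 * marked obsolete and its ->check() method no longer accepts it, clear the
 * cache, release the dst and return NULL so the caller re-routes;
 * otherwise return the dst. sk_dst_check() below is the variant that
 * takes (and, on failure, drops) its own reference via sk_dst_get().
 */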
489 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
490 {
491 	struct dst_entry *dst = __sk_dst_get(sk);
492 
493 	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
494 		sk_tx_queue_clear(sk);
495 		sk->sk_dst_pending_confirm = 0;
496 		RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
497 		dst_release(dst);
498 		return NULL;
499 	}
500 
501 	return dst;
502 }
503 EXPORT_SYMBOL(__sk_dst_check);
504 
505 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
506 {
507 	struct dst_entry *dst = sk_dst_get(sk);
508 
509 	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
510 		sk_dst_reset(sk);
511 		dst_release(dst);
512 		return NULL;
513 	}
514 
515 	return dst;
516 }
517 EXPORT_SYMBOL(sk_dst_check);
518 
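/* SO_BINDTODEVICE: resolve the interface name passed by userspace to an
 * ifindex (0, i.e. unbound, for an empty name) and store it in
 * sk->sk_bound_dev_if. Requires CAP_NET_RAW in the socket's network
 * namespace; only built when CONFIG_NETDEVICES is enabled.
 */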
519 static int sock_setbindtodevice(struct sock *sk, char __user *optval,
520 				int optlen)
521 {
522 	int ret = -ENOPROTOOPT;
523 #ifdef CONFIG_NETDEVICES
524 	struct net *net = sock_net(sk);
525 	char devname[IFNAMSIZ];
526 	int index;
527 
528 	/* Sorry... */
529 	ret = -EPERM;
530 	if (!ns_capable(net->user_ns, CAP_NET_RAW))
531 		goto out;
532 
533 	ret = -EINVAL;
534 	if (optlen < 0)
535 		goto out;
536 
537 	/* Bind this socket to a particular device like "eth0",
538 	 * as specified in the passed interface name. If the
539 	 * name is "" or the option length is zero the socket
540 	 * is not bound.
541 	 */
542 	if (optlen > IFNAMSIZ - 1)
543 		optlen = IFNAMSIZ - 1;
544 	memset(devname, 0, sizeof(devname));
545 
546 	ret = -EFAULT;
547 	if (copy_from_user(devname, optval, optlen))
548 		goto out;
549 
550 	index = 0;
551 	if (devname[0] != '\0') {
552 		struct net_device *dev;
553 
554 		rcu_read_lock();
555 		dev = dev_get_by_name_rcu(net, devname);
556 		if (dev)
557 			index = dev->ifindex;
558 		rcu_read_unlock();
559 		ret = -ENODEV;
560 		if (!dev)
561 			goto out;
562 	}
563 
564 	lock_sock(sk);
565 	sk->sk_bound_dev_if = index;
566 	sk_dst_reset(sk);
567 	release_sock(sk);
568 
569 	ret = 0;
570 
571 out:
572 #endif
573 
574 	return ret;
575 }
576 
577 static int sock_getbindtodevice(struct sock *sk, char __user *optval,
578 				int __user *optlen, int len)
579 {
580 	int ret = -ENOPROTOOPT;
581 #ifdef CONFIG_NETDEVICES
582 	struct net *net = sock_net(sk);
583 	char devname[IFNAMSIZ];
584 
585 	if (sk->sk_bound_dev_if == 0) {
586 		len = 0;
587 		goto zero;
588 	}
589 
590 	ret = -EINVAL;
591 	if (len < IFNAMSIZ)
592 		goto out;
593 
594 	ret = netdev_get_name(net, devname, sk->sk_bound_dev_if);
595 	if (ret)
596 		goto out;
597 
598 	len = strlen(devname) + 1;
599 
600 	ret = -EFAULT;
601 	if (copy_to_user(optval, devname, len))
602 		goto out;
603 
604 zero:
605 	ret = -EFAULT;
606 	if (put_user(len, optlen))
607 		goto out;
608 
609 	ret = 0;
610 
611 out:
612 #endif
613 
614 	return ret;
615 }
616 
617 static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
618 {
619 	if (valbool)
620 		sock_set_flag(sk, bit);
621 	else
622 		sock_reset_flag(sk, bit);
623 }
624 
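/* Should multicast sent on this socket be looped back to local listeners?
 * Returns false while recursing through the transmit path
 * (dev_recursion_level()), true when there is no socket context, and the
 * per-family mc_loop setting for IPv4/IPv6 sockets.
 */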
625 bool sk_mc_loop(struct sock *sk)
626 {
627 	if (dev_recursion_level())
628 		return false;
629 	if (!sk)
630 		return true;
631 	switch (sk->sk_family) {
632 	case AF_INET:
633 		return inet_sk(sk)->mc_loop;
634 #if IS_ENABLED(CONFIG_IPV6)
635 	case AF_INET6:
636 		return inet6_sk(sk)->mc_loop;
637 #endif
638 	}
639 	WARN_ON(1);
640 	return true;
641 }
642 EXPORT_SYMBOL(sk_mc_loop);
643 
644 /*
645  *	This is meant for all protocols to use and covers goings on
646  *	at the socket level. Everything here is generic.
647  */
648 
649 int sock_setsockopt(struct socket *sock, int level, int optname,
650 		    char __user *optval, unsigned int optlen)
651 {
652 	struct sock *sk = sock->sk;
653 	int val;
654 	int valbool;
655 	struct linger ling;
656 	int ret = 0;
657 
658 	/*
659 	 *	Options without arguments
660 	 */
661 
662 	if (optname == SO_BINDTODEVICE)
663 		return sock_setbindtodevice(sk, optval, optlen);
664 
665 	if (optlen < sizeof(int))
666 		return -EINVAL;
667 
668 	if (get_user(val, (int __user *)optval))
669 		return -EFAULT;
670 
671 	valbool = val ? 1 : 0;
672 
673 	lock_sock(sk);
674 
675 	switch (optname) {
676 	case SO_DEBUG:
677 		if (val && !capable(CAP_NET_ADMIN))
678 			ret = -EACCES;
679 		else
680 			sock_valbool_flag(sk, SOCK_DBG, valbool);
681 		break;
682 	case SO_REUSEADDR:
683 		sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
684 		break;
685 	case SO_REUSEPORT:
686 		sk->sk_reuseport = valbool;
687 		break;
688 	case SO_TYPE:
689 	case SO_PROTOCOL:
690 	case SO_DOMAIN:
691 	case SO_ERROR:
692 		ret = -ENOPROTOOPT;
693 		break;
694 	case SO_DONTROUTE:
695 		sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
696 		break;
697 	case SO_BROADCAST:
698 		sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
699 		break;
700 	case SO_SNDBUF:
701 		/* Don't error on this; BSD doesn't, and if you think
702 		 * about it, this is right. Otherwise apps have to
703 		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
704 		 * are treated in BSD as hints.
705 		 */
706 		val = min_t(u32, val, sysctl_wmem_max);
707 set_sndbuf:
708 		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
709 		sk->sk_sndbuf = max_t(int, val * 2, SOCK_MIN_SNDBUF);
710 		/* Wake up sending tasks if we upped the value. */
711 		sk->sk_write_space(sk);
712 		break;
713 
714 	case SO_SNDBUFFORCE:
715 		if (!capable(CAP_NET_ADMIN)) {
716 			ret = -EPERM;
717 			break;
718 		}
719 		goto set_sndbuf;
720 
721 	case SO_RCVBUF:
722 		/* Don't error on this; BSD doesn't, and if you think
723 		 * about it, this is right. Otherwise apps have to
724 		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
725 		 * are treated in BSD as hints.
726 		 */
727 		val = min_t(u32, val, sysctl_rmem_max);
728 set_rcvbuf:
729 		sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
730 		/*
731 		 * We double it on the way in to account for
732 		 * "struct sk_buff" etc. overhead.   Applications
733 		 * assume that the SO_RCVBUF setting they make will
734 		 * allow that much actual data to be received on that
735 		 * socket.
736 		 *
737 		 * Applications are unaware that "struct sk_buff" and
738 		 * other overheads allocate from the receive buffer
739 		 * during socket buffer allocation.
740 		 *
741 		 * And after considering the possible alternatives,
742 		 * returning the value we actually used in getsockopt
743 		 * is the most desirable behavior.
744 		 */
745 		sk->sk_rcvbuf = max_t(int, val * 2, SOCK_MIN_RCVBUF);
746 		break;
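		/* Illustrative userspace sketch (hypothetical fd/val): after
		 *	int val = 65536;
		 *	setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &val, sizeof(val));
		 * a later getsockopt(SO_RCVBUF) reports the doubled value the
		 * kernel actually uses (here 131072, subject to sysctl_rmem_max
		 * clamping and SOCK_MIN_RCVBUF).
		 */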
747 
748 	case SO_RCVBUFFORCE:
749 		if (!capable(CAP_NET_ADMIN)) {
750 			ret = -EPERM;
751 			break;
752 		}
753 		goto set_rcvbuf;
754 
755 	case SO_KEEPALIVE:
756 		if (sk->sk_prot->keepalive)
757 			sk->sk_prot->keepalive(sk, valbool);
758 		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
759 		break;
760 
761 	case SO_OOBINLINE:
762 		sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
763 		break;
764 
765 	case SO_NO_CHECK:
766 		sk->sk_no_check_tx = valbool;
767 		break;
768 
769 	case SO_PRIORITY:
770 		if ((val >= 0 && val <= 6) ||
771 		    ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
772 			sk->sk_priority = val;
773 		else
774 			ret = -EPERM;
775 		break;
776 
777 	case SO_LINGER:
778 		if (optlen < sizeof(ling)) {
779 			ret = -EINVAL;	/* 1003.1g */
780 			break;
781 		}
782 		if (copy_from_user(&ling, optval, sizeof(ling))) {
783 			ret = -EFAULT;
784 			break;
785 		}
786 		if (!ling.l_onoff)
787 			sock_reset_flag(sk, SOCK_LINGER);
788 		else {
789 #if (BITS_PER_LONG == 32)
790 			if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
791 				sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
792 			else
793 #endif
794 				sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
795 			sock_set_flag(sk, SOCK_LINGER);
796 		}
797 		break;
798 
799 	case SO_BSDCOMPAT:
800 		sock_warn_obsolete_bsdism("setsockopt");
801 		break;
802 
803 	case SO_PASSCRED:
804 		if (valbool)
805 			set_bit(SOCK_PASSCRED, &sock->flags);
806 		else
807 			clear_bit(SOCK_PASSCRED, &sock->flags);
808 		break;
809 
810 	case SO_TIMESTAMP:
811 	case SO_TIMESTAMPNS:
812 		if (valbool)  {
813 			if (optname == SO_TIMESTAMP)
814 				sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
815 			else
816 				sock_set_flag(sk, SOCK_RCVTSTAMPNS);
817 			sock_set_flag(sk, SOCK_RCVTSTAMP);
818 			sock_enable_timestamp(sk, SOCK_TIMESTAMP);
819 		} else {
820 			sock_reset_flag(sk, SOCK_RCVTSTAMP);
821 			sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
822 		}
823 		break;
824 
825 	case SO_TIMESTAMPING:
826 		if (val & ~SOF_TIMESTAMPING_MASK) {
827 			ret = -EINVAL;
828 			break;
829 		}
830 
831 		if (val & SOF_TIMESTAMPING_OPT_ID &&
832 		    !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
833 			if (sk->sk_protocol == IPPROTO_TCP &&
834 			    sk->sk_type == SOCK_STREAM) {
835 				if ((1 << sk->sk_state) &
836 				    (TCPF_CLOSE | TCPF_LISTEN)) {
837 					ret = -EINVAL;
838 					break;
839 				}
840 				sk->sk_tskey = tcp_sk(sk)->snd_una;
841 			} else {
842 				sk->sk_tskey = 0;
843 			}
844 		}
845 
846 		if (val & SOF_TIMESTAMPING_OPT_STATS &&
847 		    !(val & SOF_TIMESTAMPING_OPT_TSONLY)) {
848 			ret = -EINVAL;
849 			break;
850 		}
851 
852 		sk->sk_tsflags = val;
853 		if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
854 			sock_enable_timestamp(sk,
855 					      SOCK_TIMESTAMPING_RX_SOFTWARE);
856 		else
857 			sock_disable_timestamp(sk,
858 					       (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
859 		break;
860 
861 	case SO_RCVLOWAT:
862 		if (val < 0)
863 			val = INT_MAX;
864 		sk->sk_rcvlowat = val ? : 1;
865 		break;
866 
867 	case SO_RCVTIMEO:
868 		ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
869 		break;
870 
871 	case SO_SNDTIMEO:
872 		ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
873 		break;
874 
875 	case SO_ATTACH_FILTER:
876 		ret = -EINVAL;
877 		if (optlen == sizeof(struct sock_fprog)) {
878 			struct sock_fprog fprog;
879 
880 			ret = -EFAULT;
881 			if (copy_from_user(&fprog, optval, sizeof(fprog)))
882 				break;
883 
884 			ret = sk_attach_filter(&fprog, sk);
885 		}
886 		break;
887 
888 	case SO_ATTACH_BPF:
889 		ret = -EINVAL;
890 		if (optlen == sizeof(u32)) {
891 			u32 ufd;
892 
893 			ret = -EFAULT;
894 			if (copy_from_user(&ufd, optval, sizeof(ufd)))
895 				break;
896 
897 			ret = sk_attach_bpf(ufd, sk);
898 		}
899 		break;
900 
901 	case SO_ATTACH_REUSEPORT_CBPF:
902 		ret = -EINVAL;
903 		if (optlen == sizeof(struct sock_fprog)) {
904 			struct sock_fprog fprog;
905 
906 			ret = -EFAULT;
907 			if (copy_from_user(&fprog, optval, sizeof(fprog)))
908 				break;
909 
910 			ret = sk_reuseport_attach_filter(&fprog, sk);
911 		}
912 		break;
913 
914 	case SO_ATTACH_REUSEPORT_EBPF:
915 		ret = -EINVAL;
916 		if (optlen == sizeof(u32)) {
917 			u32 ufd;
918 
919 			ret = -EFAULT;
920 			if (copy_from_user(&ufd, optval, sizeof(ufd)))
921 				break;
922 
923 			ret = sk_reuseport_attach_bpf(ufd, sk);
924 		}
925 		break;
926 
927 	case SO_DETACH_FILTER:
928 		ret = sk_detach_filter(sk);
929 		break;
930 
931 	case SO_LOCK_FILTER:
932 		if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
933 			ret = -EPERM;
934 		else
935 			sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
936 		break;
937 
938 	case SO_PASSSEC:
939 		if (valbool)
940 			set_bit(SOCK_PASSSEC, &sock->flags);
941 		else
942 			clear_bit(SOCK_PASSSEC, &sock->flags);
943 		break;
944 	case SO_MARK:
945 		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
946 			ret = -EPERM;
947 		else
948 			sk->sk_mark = val;
949 		break;
950 
951 	case SO_RXQ_OVFL:
952 		sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
953 		break;
954 
955 	case SO_WIFI_STATUS:
956 		sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
957 		break;
958 
959 	case SO_PEEK_OFF:
960 		if (sock->ops->set_peek_off)
961 			ret = sock->ops->set_peek_off(sk, val);
962 		else
963 			ret = -EOPNOTSUPP;
964 		break;
965 
966 	case SO_NOFCS:
967 		sock_valbool_flag(sk, SOCK_NOFCS, valbool);
968 		break;
969 
970 	case SO_SELECT_ERR_QUEUE:
971 		sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
972 		break;
973 
974 #ifdef CONFIG_NET_RX_BUSY_POLL
975 	case SO_BUSY_POLL:
976 		/* allow unprivileged users to decrease the value */
977 		if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
978 			ret = -EPERM;
979 		else {
980 			if (val < 0)
981 				ret = -EINVAL;
982 			else
983 				sk->sk_ll_usec = val;
984 		}
985 		break;
986 #endif
987 
988 	case SO_MAX_PACING_RATE:
989 		sk->sk_max_pacing_rate = val;
990 		sk->sk_pacing_rate = min(sk->sk_pacing_rate,
991 					 sk->sk_max_pacing_rate);
992 		break;
993 
994 	case SO_INCOMING_CPU:
995 		sk->sk_incoming_cpu = val;
996 		break;
997 
998 	case SO_CNX_ADVICE:
999 		if (val == 1)
1000 			dst_negative_advice(sk);
1001 		break;
1002 	default:
1003 		ret = -ENOPROTOOPT;
1004 		break;
1005 	}
1006 	release_sock(sk);
1007 	return ret;
1008 }
1009 EXPORT_SYMBOL(sock_setsockopt);
1010 
1011 
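/* Fill in the struct ucred reported by SO_PEERCRED: translate the peer's
 * pid into the caller's pid namespace and its effective uid/gid into the
 * caller's user namespace, or -1 when no credentials are attached.
 */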
1012 static void cred_to_ucred(struct pid *pid, const struct cred *cred,
1013 			  struct ucred *ucred)
1014 {
1015 	ucred->pid = pid_vnr(pid);
1016 	ucred->uid = ucred->gid = -1;
1017 	if (cred) {
1018 		struct user_namespace *current_ns = current_user_ns();
1019 
1020 		ucred->uid = from_kuid_munged(current_ns, cred->euid);
1021 		ucred->gid = from_kgid_munged(current_ns, cred->egid);
1022 	}
1023 }
1024 
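/*
 *	Getsockopt counterpart of sock_setsockopt(): report the generic
 *	SOL_SOCKET option values. Most options fit into the int/linger/
 *	timeval union below; the copied length is clamped to the size the
 *	option actually provides and written back through @optlen.
 */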
1025 int sock_getsockopt(struct socket *sock, int level, int optname,
1026 		    char __user *optval, int __user *optlen)
1027 {
1028 	struct sock *sk = sock->sk;
1029 
1030 	union {
1031 		int val;
1032 		struct linger ling;
1033 		struct timeval tm;
1034 	} v;
1035 
1036 	int lv = sizeof(int);
1037 	int len;
1038 
1039 	if (get_user(len, optlen))
1040 		return -EFAULT;
1041 	if (len < 0)
1042 		return -EINVAL;
1043 
1044 	memset(&v, 0, sizeof(v));
1045 
1046 	switch (optname) {
1047 	case SO_DEBUG:
1048 		v.val = sock_flag(sk, SOCK_DBG);
1049 		break;
1050 
1051 	case SO_DONTROUTE:
1052 		v.val = sock_flag(sk, SOCK_LOCALROUTE);
1053 		break;
1054 
1055 	case SO_BROADCAST:
1056 		v.val = sock_flag(sk, SOCK_BROADCAST);
1057 		break;
1058 
1059 	case SO_SNDBUF:
1060 		v.val = sk->sk_sndbuf;
1061 		break;
1062 
1063 	case SO_RCVBUF:
1064 		v.val = sk->sk_rcvbuf;
1065 		break;
1066 
1067 	case SO_REUSEADDR:
1068 		v.val = sk->sk_reuse;
1069 		break;
1070 
1071 	case SO_REUSEPORT:
1072 		v.val = sk->sk_reuseport;
1073 		break;
1074 
1075 	case SO_KEEPALIVE:
1076 		v.val = sock_flag(sk, SOCK_KEEPOPEN);
1077 		break;
1078 
1079 	case SO_TYPE:
1080 		v.val = sk->sk_type;
1081 		break;
1082 
1083 	case SO_PROTOCOL:
1084 		v.val = sk->sk_protocol;
1085 		break;
1086 
1087 	case SO_DOMAIN:
1088 		v.val = sk->sk_family;
1089 		break;
1090 
1091 	case SO_ERROR:
1092 		v.val = -sock_error(sk);
1093 		if (v.val == 0)
1094 			v.val = xchg(&sk->sk_err_soft, 0);
1095 		break;
1096 
1097 	case SO_OOBINLINE:
1098 		v.val = sock_flag(sk, SOCK_URGINLINE);
1099 		break;
1100 
1101 	case SO_NO_CHECK:
1102 		v.val = sk->sk_no_check_tx;
1103 		break;
1104 
1105 	case SO_PRIORITY:
1106 		v.val = sk->sk_priority;
1107 		break;
1108 
1109 	case SO_LINGER:
1110 		lv		= sizeof(v.ling);
1111 		v.ling.l_onoff	= sock_flag(sk, SOCK_LINGER);
1112 		v.ling.l_linger	= sk->sk_lingertime / HZ;
1113 		break;
1114 
1115 	case SO_BSDCOMPAT:
1116 		sock_warn_obsolete_bsdism("getsockopt");
1117 		break;
1118 
1119 	case SO_TIMESTAMP:
1120 		v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1121 				!sock_flag(sk, SOCK_RCVTSTAMPNS);
1122 		break;
1123 
1124 	case SO_TIMESTAMPNS:
1125 		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
1126 		break;
1127 
1128 	case SO_TIMESTAMPING:
1129 		v.val = sk->sk_tsflags;
1130 		break;
1131 
1132 	case SO_RCVTIMEO:
1133 		lv = sizeof(struct timeval);
1134 		if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
1135 			v.tm.tv_sec = 0;
1136 			v.tm.tv_usec = 0;
1137 		} else {
1138 			v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
1139 			v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * USEC_PER_SEC) / HZ;
1140 		}
1141 		break;
1142 
1143 	case SO_SNDTIMEO:
1144 		lv = sizeof(struct timeval);
1145 		if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
1146 			v.tm.tv_sec = 0;
1147 			v.tm.tv_usec = 0;
1148 		} else {
1149 			v.tm.tv_sec = sk->sk_sndtimeo / HZ;
1150 			v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * USEC_PER_SEC) / HZ;
1151 		}
1152 		break;
1153 
1154 	case SO_RCVLOWAT:
1155 		v.val = sk->sk_rcvlowat;
1156 		break;
1157 
1158 	case SO_SNDLOWAT:
1159 		v.val = 1;
1160 		break;
1161 
1162 	case SO_PASSCRED:
1163 		v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1164 		break;
1165 
1166 	case SO_PEERCRED:
1167 	{
1168 		struct ucred peercred;
1169 		if (len > sizeof(peercred))
1170 			len = sizeof(peercred);
1171 		cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1172 		if (copy_to_user(optval, &peercred, len))
1173 			return -EFAULT;
1174 		goto lenout;
1175 	}
1176 
1177 	case SO_PEERNAME:
1178 	{
1179 		char address[128];
1180 
1181 		if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
1182 			return -ENOTCONN;
1183 		if (lv < len)
1184 			return -EINVAL;
1185 		if (copy_to_user(optval, address, len))
1186 			return -EFAULT;
1187 		goto lenout;
1188 	}
1189 
1190 	/* Dubious BSD thing... Probably nobody even uses it, but
1191 	 * the UNIX standard wants it for whatever reason... -DaveM
1192 	 */
1193 	case SO_ACCEPTCONN:
1194 		v.val = sk->sk_state == TCP_LISTEN;
1195 		break;
1196 
1197 	case SO_PASSSEC:
1198 		v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1199 		break;
1200 
1201 	case SO_PEERSEC:
1202 		return security_socket_getpeersec_stream(sock, optval, optlen, len);
1203 
1204 	case SO_MARK:
1205 		v.val = sk->sk_mark;
1206 		break;
1207 
1208 	case SO_RXQ_OVFL:
1209 		v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1210 		break;
1211 
1212 	case SO_WIFI_STATUS:
1213 		v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1214 		break;
1215 
1216 	case SO_PEEK_OFF:
1217 		if (!sock->ops->set_peek_off)
1218 			return -EOPNOTSUPP;
1219 
1220 		v.val = sk->sk_peek_off;
1221 		break;
1222 	case SO_NOFCS:
1223 		v.val = sock_flag(sk, SOCK_NOFCS);
1224 		break;
1225 
1226 	case SO_BINDTODEVICE:
1227 		return sock_getbindtodevice(sk, optval, optlen, len);
1228 
1229 	case SO_GET_FILTER:
1230 		len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
1231 		if (len < 0)
1232 			return len;
1233 
1234 		goto lenout;
1235 
1236 	case SO_LOCK_FILTER:
1237 		v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1238 		break;
1239 
1240 	case SO_BPF_EXTENSIONS:
1241 		v.val = bpf_tell_extensions();
1242 		break;
1243 
1244 	case SO_SELECT_ERR_QUEUE:
1245 		v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1246 		break;
1247 
1248 #ifdef CONFIG_NET_RX_BUSY_POLL
1249 	case SO_BUSY_POLL:
1250 		v.val = sk->sk_ll_usec;
1251 		break;
1252 #endif
1253 
1254 	case SO_MAX_PACING_RATE:
1255 		v.val = sk->sk_max_pacing_rate;
1256 		break;
1257 
1258 	case SO_INCOMING_CPU:
1259 		v.val = sk->sk_incoming_cpu;
1260 		break;
1261 
1262 	default:
1263 		/* We implement SO_SNDLOWAT etc. to not be settable
1264 		 * (1003.1g 7).
1265 		 */
1266 		return -ENOPROTOOPT;
1267 	}
1268 
1269 	if (len > lv)
1270 		len = lv;
1271 	if (copy_to_user(optval, &v, len))
1272 		return -EFAULT;
1273 lenout:
1274 	if (put_user(len, optlen))
1275 		return -EFAULT;
1276 	return 0;
1277 }
1278 
1279 /*
1280  * Initialize an sk_lock.
1281  *
1282  * (We also register the sk_lock with the lock validator.)
1283  */
1284 static inline void sock_lock_init(struct sock *sk)
1285 {
1286 	if (sk->sk_kern_sock)
1287 		sock_lock_init_class_and_name(
1288 			sk,
1289 			af_family_kern_slock_key_strings[sk->sk_family],
1290 			af_family_kern_slock_keys + sk->sk_family,
1291 			af_family_kern_key_strings[sk->sk_family],
1292 			af_family_kern_keys + sk->sk_family);
1293 	else
1294 		sock_lock_init_class_and_name(
1295 			sk,
1296 			af_family_slock_key_strings[sk->sk_family],
1297 			af_family_slock_keys + sk->sk_family,
1298 			af_family_key_strings[sk->sk_family],
1299 			af_family_keys + sk->sk_family);
1300 }
1301 
1302 /*
1303  * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
1304  * even temporarily, because of RCU lookups. sk_node should also be left as is.
1305  * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end.
1306  */
1307 static void sock_copy(struct sock *nsk, const struct sock *osk)
1308 {
1309 #ifdef CONFIG_SECURITY_NETWORK
1310 	void *sptr = nsk->sk_security;
1311 #endif
1312 	memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1313 
1314 	memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1315 	       osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1316 
1317 #ifdef CONFIG_SECURITY_NETWORK
1318 	nsk->sk_security = sptr;
1319 	security_sk_clone(osk, nsk);
1320 #endif
1321 }
1322 
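/* Allocate a struct sock for @prot, from the protocol's dedicated slab
 * cache when one is registered or with kmalloc() otherwise, run the LSM
 * sk_alloc hook and pin the protocol module for the lifetime of the sock.
 */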
1323 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1324 		int family)
1325 {
1326 	struct sock *sk;
1327 	struct kmem_cache *slab;
1328 
1329 	slab = prot->slab;
1330 	if (slab != NULL) {
1331 		sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1332 		if (!sk)
1333 			return sk;
1334 		if (priority & __GFP_ZERO)
1335 			sk_prot_clear_nulls(sk, prot->obj_size);
1336 	} else
1337 		sk = kmalloc(prot->obj_size, priority);
1338 
1339 	if (sk != NULL) {
1340 		kmemcheck_annotate_bitfield(sk, flags);
1341 
1342 		if (security_sk_alloc(sk, family, priority))
1343 			goto out_free;
1344 
1345 		if (!try_module_get(prot->owner))
1346 			goto out_free_sec;
1347 		sk_tx_queue_clear(sk);
1348 	}
1349 
1350 	return sk;
1351 
1352 out_free_sec:
1353 	security_sk_free(sk);
1354 out_free:
1355 	if (slab != NULL)
1356 		kmem_cache_free(slab, sk);
1357 	else
1358 		kfree(sk);
1359 	return NULL;
1360 }
1361 
1362 static void sk_prot_free(struct proto *prot, struct sock *sk)
1363 {
1364 	struct kmem_cache *slab;
1365 	struct module *owner;
1366 
1367 	owner = prot->owner;
1368 	slab = prot->slab;
1369 
1370 	cgroup_sk_free(&sk->sk_cgrp_data);
1371 	mem_cgroup_sk_free(sk);
1372 	security_sk_free(sk);
1373 	if (slab != NULL)
1374 		kmem_cache_free(slab, sk);
1375 	else
1376 		kfree(sk);
1377 	module_put(owner);
1378 }
1379 
1380 /**
1381  *	sk_alloc - All socket objects are allocated here
1382  *	@net: the applicable net namespace
1383  *	@family: protocol family
1384  *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1385  *	@prot: struct proto associated with this new sock instance
1386  *	@kern: is this to be a kernel socket?
1387  */
1388 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1389 		      struct proto *prot, int kern)
1390 {
1391 	struct sock *sk;
1392 
1393 	sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1394 	if (sk) {
1395 		sk->sk_family = family;
1396 		/*
1397 		 * See comment in struct sock definition to understand
1398 		 * why we need sk_prot_creator -acme
1399 		 */
1400 		sk->sk_prot = sk->sk_prot_creator = prot;
1401 		sk->sk_kern_sock = kern;
1402 		sock_lock_init(sk);
1403 		sk->sk_net_refcnt = kern ? 0 : 1;
1404 		if (likely(sk->sk_net_refcnt))
1405 			get_net(net);
1406 		sock_net_set(sk, net);
1407 		atomic_set(&sk->sk_wmem_alloc, 1);
1408 
1409 		mem_cgroup_sk_alloc(sk);
1410 		cgroup_sk_alloc(&sk->sk_cgrp_data);
1411 		sock_update_classid(&sk->sk_cgrp_data);
1412 		sock_update_netprioidx(&sk->sk_cgrp_data);
1413 	}
1414 
1415 	return sk;
1416 }
1417 EXPORT_SYMBOL(sk_alloc);
1418 
1419 /* Sockets having SOCK_RCU_FREE will call this function after one RCU
1420  * grace period. This is the case for UDP sockets and TCP listeners.
1421  */
1422 static void __sk_destruct(struct rcu_head *head)
1423 {
1424 	struct sock *sk = container_of(head, struct sock, sk_rcu);
1425 	struct sk_filter *filter;
1426 
1427 	if (sk->sk_destruct)
1428 		sk->sk_destruct(sk);
1429 
1430 	filter = rcu_dereference_check(sk->sk_filter,
1431 				       atomic_read(&sk->sk_wmem_alloc) == 0);
1432 	if (filter) {
1433 		sk_filter_uncharge(sk, filter);
1434 		RCU_INIT_POINTER(sk->sk_filter, NULL);
1435 	}
1436 	if (rcu_access_pointer(sk->sk_reuseport_cb))
1437 		reuseport_detach_sock(sk);
1438 
1439 	sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
1440 
1441 	if (atomic_read(&sk->sk_omem_alloc))
1442 		pr_debug("%s: optmem leakage (%d bytes) detected\n",
1443 			 __func__, atomic_read(&sk->sk_omem_alloc));
1444 
1445 	if (sk->sk_frag.page) {
1446 		put_page(sk->sk_frag.page);
1447 		sk->sk_frag.page = NULL;
1448 	}
1449 
1450 	if (sk->sk_peer_cred)
1451 		put_cred(sk->sk_peer_cred);
1452 	put_pid(sk->sk_peer_pid);
1453 	if (likely(sk->sk_net_refcnt))
1454 		put_net(sock_net(sk));
1455 	sk_prot_free(sk->sk_prot_creator, sk);
1456 }
1457 
1458 void sk_destruct(struct sock *sk)
1459 {
1460 	if (sock_flag(sk, SOCK_RCU_FREE))
1461 		call_rcu(&sk->sk_rcu, __sk_destruct);
1462 	else
1463 		__sk_destruct(&sk->sk_rcu);
1464 }
1465 
1466 static void __sk_free(struct sock *sk)
1467 {
1468 	if (unlikely(sock_diag_has_destroy_listeners(sk) && sk->sk_net_refcnt))
1469 		sock_diag_broadcast_destroy(sk);
1470 	else
1471 		sk_destruct(sk);
1472 }
1473 
1474 void sk_free(struct sock *sk)
1475 {
1476 	/*
1477 	 * We subtract one from sk_wmem_alloc so we can tell whether
1478 	 * some packets are still in some tx queue.
1479 	 * If the count is not yet zero, sock_wfree() will call __sk_free(sk) later.
1480 	 */
1481 	if (atomic_dec_and_test(&sk->sk_wmem_alloc))
1482 		__sk_free(sk);
1483 }
1484 EXPORT_SYMBOL(sk_free);
1485 
1486 /**
1487  *	sk_clone_lock - clone a socket, and lock its clone
1488  *	@sk: the socket to clone
1489  *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1490  *
1491  *	Caller must unlock socket even in error path (bh_unlock_sock(newsk))
1492  */
1493 struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
1494 {
1495 	struct sock *newsk;
1496 	bool is_charged = true;
1497 
1498 	newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
1499 	if (newsk != NULL) {
1500 		struct sk_filter *filter;
1501 
1502 		sock_copy(newsk, sk);
1503 
1504 		/* SANITY */
1505 		if (likely(newsk->sk_net_refcnt))
1506 			get_net(sock_net(newsk));
1507 		sk_node_init(&newsk->sk_node);
1508 		sock_lock_init(newsk);
1509 		bh_lock_sock(newsk);
1510 		newsk->sk_backlog.head	= newsk->sk_backlog.tail = NULL;
1511 		newsk->sk_backlog.len = 0;
1512 
1513 		atomic_set(&newsk->sk_rmem_alloc, 0);
1514 		/*
1515 		 * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
1516 		 */
1517 		atomic_set(&newsk->sk_wmem_alloc, 1);
1518 		atomic_set(&newsk->sk_omem_alloc, 0);
1519 		skb_queue_head_init(&newsk->sk_receive_queue);
1520 		skb_queue_head_init(&newsk->sk_write_queue);
1521 
1522 		rwlock_init(&newsk->sk_callback_lock);
1523 		lockdep_set_class_and_name(&newsk->sk_callback_lock,
1524 				af_callback_keys + newsk->sk_family,
1525 				af_family_clock_key_strings[newsk->sk_family]);
1526 
1527 		newsk->sk_dst_cache	= NULL;
1528 		newsk->sk_dst_pending_confirm = 0;
1529 		newsk->sk_wmem_queued	= 0;
1530 		newsk->sk_forward_alloc = 0;
1531 		atomic_set(&newsk->sk_drops, 0);
1532 		newsk->sk_send_head	= NULL;
1533 		newsk->sk_userlocks	= sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
1534 
1535 		sock_reset_flag(newsk, SOCK_DONE);
1536 		skb_queue_head_init(&newsk->sk_error_queue);
1537 
1538 		filter = rcu_dereference_protected(newsk->sk_filter, 1);
1539 		if (filter != NULL)
1540 			/* though it's an empty new sock, the charging may fail
1541 			 * if sysctl_optmem_max was changed between creation of
1542 			 * the original socket and this clone
1543 			 */
1544 			is_charged = sk_filter_charge(newsk, filter);
1545 
1546 		if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
1547 			/* We need to make sure that we don't uncharge the new
1548 			 * socket if we couldn't charge it in the first place
1549 			 * as otherwise we uncharge the parent's filter.
1550 			 */
1551 			if (!is_charged)
1552 				RCU_INIT_POINTER(newsk->sk_filter, NULL);
1553 			sk_free_unlock_clone(newsk);
1554 			newsk = NULL;
1555 			goto out;
1556 		}
1557 		RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
1558 
1559 		newsk->sk_err	   = 0;
1560 		newsk->sk_err_soft = 0;
1561 		newsk->sk_priority = 0;
1562 		newsk->sk_incoming_cpu = raw_smp_processor_id();
1563 		atomic64_set(&newsk->sk_cookie, 0);
1564 
1565 		mem_cgroup_sk_alloc(newsk);
1566 		cgroup_sk_alloc(&newsk->sk_cgrp_data);
1567 
1568 		/*
1569 		 * Before updating sk_refcnt, we must commit prior changes to memory
1570 		 * (Documentation/RCU/rculist_nulls.txt for details)
1571 		 */
1572 		smp_wmb();
1573 		atomic_set(&newsk->sk_refcnt, 2);
1574 
1575 		/*
1576 		 * Increment the counter in the same struct proto as the master
1577 		 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
1578 		 * is the same as sk->sk_prot->socks, as this field was copied
1579 		 * with memcpy).
1580 		 *
1581 		 * This _changes_ the previous behaviour, where
1582 		 * tcp_create_openreq_child always was incrementing the
1583 		 * equivalent to tcp_prot->socks (inet_sock_nr), so this has
1584 		 * to be taken into account in all callers. -acme
1585 		 */
1586 		sk_refcnt_debug_inc(newsk);
1587 		sk_set_socket(newsk, NULL);
1588 		newsk->sk_wq = NULL;
1589 
1590 		if (newsk->sk_prot->sockets_allocated)
1591 			sk_sockets_allocated_inc(newsk);
1592 
1593 		if (sock_needs_netstamp(sk) &&
1594 		    newsk->sk_flags & SK_FLAGS_TIMESTAMP)
1595 			net_enable_timestamp();
1596 	}
1597 out:
1598 	return newsk;
1599 }
1600 EXPORT_SYMBOL_GPL(sk_clone_lock);
1601 
1602 void sk_free_unlock_clone(struct sock *sk)
1603 {
1604 	/* It is still a raw copy of the parent, so invalidate
1605 	 * its destructor and do a plain sk_free() */
1606 	sk->sk_destruct = NULL;
1607 	bh_unlock_sock(sk);
1608 	sk_free(sk);
1609 }
1610 EXPORT_SYMBOL_GPL(sk_free_unlock_clone);
1611 
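/* Cache @dst in the socket and derive the socket's route capabilities
 * (scatter-gather, checksum offload, GSO size/segment limits) from the
 * output device's features, masking out sk_route_nocaps and disabling
 * GSO when the route requires extra header space (dst->header_len).
 */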
1612 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1613 {
1614 	u32 max_segs = 1;
1615 
1616 	sk_dst_set(sk, dst);
1617 	sk->sk_route_caps = dst->dev->features;
1618 	if (sk->sk_route_caps & NETIF_F_GSO)
1619 		sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
1620 	sk->sk_route_caps &= ~sk->sk_route_nocaps;
1621 	if (sk_can_gso(sk)) {
1622 		if (dst->header_len) {
1623 			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1624 		} else {
1625 			sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
1626 			sk->sk_gso_max_size = dst->dev->gso_max_size;
1627 			max_segs = max_t(u32, dst->dev->gso_max_segs, 1);
1628 		}
1629 	}
1630 	sk->sk_gso_max_segs = max_segs;
1631 }
1632 EXPORT_SYMBOL_GPL(sk_setup_caps);
1633 
1634 /*
1635  *	Simple resource managers for sockets.
1636  */
1637 
1638 
1639 /*
1640  * Write buffer destructor automatically called from kfree_skb.
1641  */
1642 void sock_wfree(struct sk_buff *skb)
1643 {
1644 	struct sock *sk = skb->sk;
1645 	unsigned int len = skb->truesize;
1646 
1647 	if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
1648 		/*
1649 		 * Keep a reference on sk_wmem_alloc; it will be released
1650 		 * after the sk_write_space() call
1651 		 */
1652 		atomic_sub(len - 1, &sk->sk_wmem_alloc);
1653 		sk->sk_write_space(sk);
1654 		len = 1;
1655 	}
1656 	/*
1657 	 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
1658 	 * could not do because of in-flight packets
1659 	 */
1660 	if (atomic_sub_and_test(len, &sk->sk_wmem_alloc))
1661 		__sk_free(sk);
1662 }
1663 EXPORT_SYMBOL(sock_wfree);
1664 
1665 /* This variant of sock_wfree() is used by TCP,
1666  * since it sets SOCK_USE_WRITE_QUEUE.
1667  */
1668 void __sock_wfree(struct sk_buff *skb)
1669 {
1670 	struct sock *sk = skb->sk;
1671 
1672 	if (atomic_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
1673 		__sk_free(sk);
1674 }
1675 
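/* Assign @skb to @sk for write accounting: full sockets get sock_wfree()
 * as destructor and have skb->truesize added to sk_wmem_alloc;
 * request/timewait minisocks just take a sock_hold() and use sock_edemux().
 */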
1676 void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
1677 {
1678 	skb_orphan(skb);
1679 	skb->sk = sk;
1680 #ifdef CONFIG_INET
1681 	if (unlikely(!sk_fullsock(sk))) {
1682 		skb->destructor = sock_edemux;
1683 		sock_hold(sk);
1684 		return;
1685 	}
1686 #endif
1687 	skb->destructor = sock_wfree;
1688 	skb_set_hash_from_sk(skb, sk);
1689 	/*
1690 	 * We used to take a refcount on sk, but the following operation
1691 	 * is enough to guarantee sk_free() won't free this sock until
1692 	 * all in-flight packets are completed
1693 	 */
1694 	atomic_add(skb->truesize, &sk->sk_wmem_alloc);
1695 }
1696 EXPORT_SYMBOL(skb_set_owner_w);
1697 
1698 /* This helper is used by netem, as it can hold packets in its
1699  * delay queue. We want to allow the owner socket to send more
1700  * packets, as if they were already TX completed by a typical driver.
1701  * But we also want to keep skb->sk set because some packet schedulers
1702  * rely on it (sch_fq for example). So we set skb->truesize to a small
1703  * amount (1) and decrease sk_wmem_alloc accordingly.
1704  */
1705 void skb_orphan_partial(struct sk_buff *skb)
1706 {
1707 	/* If this skb is a TCP pure ACK or has already been through here,
1708 	 * we have nothing to do. 2 is already a very small truesize.
1709 	 */
1710 	if (skb->truesize <= 2)
1711 		return;
1712 
1713 	/* TCP stack sets skb->ooo_okay based on sk_wmem_alloc,
1714 	 * so we do not completely orphan the skb, but transfer all
1715 	 * accounted bytes but one, to avoid unexpected reorders.
1716 	 */
1717 	if (skb->destructor == sock_wfree
1718 #ifdef CONFIG_INET
1719 	    || skb->destructor == tcp_wfree
1720 #endif
1721 		) {
1722 		atomic_sub(skb->truesize - 1, &skb->sk->sk_wmem_alloc);
1723 		skb->truesize = 1;
1724 	} else {
1725 		skb_orphan(skb);
1726 	}
1727 }
1728 EXPORT_SYMBOL(skb_orphan_partial);
1729 
1730 /*
1731  * Read buffer destructor automatically called from kfree_skb.
1732  */
1733 void sock_rfree(struct sk_buff *skb)
1734 {
1735 	struct sock *sk = skb->sk;
1736 	unsigned int len = skb->truesize;
1737 
1738 	atomic_sub(len, &sk->sk_rmem_alloc);
1739 	sk_mem_uncharge(sk, len);
1740 }
1741 EXPORT_SYMBOL(sock_rfree);
1742 
1743 /*
1744  * Buffer destructor for skbs that are not used directly in read or write
1745  * path, e.g. for error handler skbs. Automatically called from kfree_skb.
1746  */
1747 void sock_efree(struct sk_buff *skb)
1748 {
1749 	sock_put(skb->sk);
1750 }
1751 EXPORT_SYMBOL(sock_efree);
1752 
1753 kuid_t sock_i_uid(struct sock *sk)
1754 {
1755 	kuid_t uid;
1756 
1757 	read_lock_bh(&sk->sk_callback_lock);
1758 	uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
1759 	read_unlock_bh(&sk->sk_callback_lock);
1760 	return uid;
1761 }
1762 EXPORT_SYMBOL(sock_i_uid);
1763 
1764 unsigned long sock_i_ino(struct sock *sk)
1765 {
1766 	unsigned long ino;
1767 
1768 	read_lock_bh(&sk->sk_callback_lock);
1769 	ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
1770 	read_unlock_bh(&sk->sk_callback_lock);
1771 	return ino;
1772 }
1773 EXPORT_SYMBOL(sock_i_ino);
1774 
1775 /*
1776  * Allocate a skb from the socket's send buffer.
1777  */
1778 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
1779 			     gfp_t priority)
1780 {
1781 	if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1782 		struct sk_buff *skb = alloc_skb(size, priority);
1783 		if (skb) {
1784 			skb_set_owner_w(skb, sk);
1785 			return skb;
1786 		}
1787 	}
1788 	return NULL;
1789 }
1790 EXPORT_SYMBOL(sock_wmalloc);
1791 
1792 /*
1793  * Allocate a memory block from the socket's option memory buffer.
1794  */
1795 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
1796 {
1797 	if ((unsigned int)size <= sysctl_optmem_max &&
1798 	    atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
1799 		void *mem;
1800 		/* First do the add, to avoid the race if kmalloc
1801 		 * might sleep.
1802 		 */
1803 		atomic_add(size, &sk->sk_omem_alloc);
1804 		mem = kmalloc(size, priority);
1805 		if (mem)
1806 			return mem;
1807 		atomic_sub(size, &sk->sk_omem_alloc);
1808 	}
1809 	return NULL;
1810 }
1811 EXPORT_SYMBOL(sock_kmalloc);
1812 
1813 /* Free an option memory block. Note, we actually want the inline
1814  * here as this allows gcc to detect the nullify and fold away the
1815  * condition entirely.
1816  */
1817 static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
1818 				  const bool nullify)
1819 {
1820 	if (WARN_ON_ONCE(!mem))
1821 		return;
1822 	if (nullify)
1823 		kzfree(mem);
1824 	else
1825 		kfree(mem);
1826 	atomic_sub(size, &sk->sk_omem_alloc);
1827 }
1828 
1829 void sock_kfree_s(struct sock *sk, void *mem, int size)
1830 {
1831 	__sock_kfree_s(sk, mem, size, false);
1832 }
1833 EXPORT_SYMBOL(sock_kfree_s);
1834 
1835 void sock_kzfree_s(struct sock *sk, void *mem, int size)
1836 {
1837 	__sock_kfree_s(sk, mem, size, true);
1838 }
1839 EXPORT_SYMBOL(sock_kzfree_s);
1840 
1841 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
1842    I think these locks should be removed for datagram sockets.
1843  */
1844 static long sock_wait_for_wmem(struct sock *sk, long timeo)
1845 {
1846 	DEFINE_WAIT(wait);
1847 
1848 	sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
1849 	for (;;) {
1850 		if (!timeo)
1851 			break;
1852 		if (signal_pending(current))
1853 			break;
1854 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1855 		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1856 		if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
1857 			break;
1858 		if (sk->sk_shutdown & SEND_SHUTDOWN)
1859 			break;
1860 		if (sk->sk_err)
1861 			break;
1862 		timeo = schedule_timeout(timeo);
1863 	}
1864 	finish_wait(sk_sleep(sk), &wait);
1865 	return timeo;
1866 }
1867 
1868 
1869 /*
1870  *	Generic send/receive buffer handlers
1871  */
1872 
1873 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
1874 				     unsigned long data_len, int noblock,
1875 				     int *errcode, int max_page_order)
1876 {
1877 	struct sk_buff *skb;
1878 	long timeo;
1879 	int err;
1880 
1881 	timeo = sock_sndtimeo(sk, noblock);
1882 	for (;;) {
1883 		err = sock_error(sk);
1884 		if (err != 0)
1885 			goto failure;
1886 
1887 		err = -EPIPE;
1888 		if (sk->sk_shutdown & SEND_SHUTDOWN)
1889 			goto failure;
1890 
1891 		if (sk_wmem_alloc_get(sk) < sk->sk_sndbuf)
1892 			break;
1893 
1894 		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
1895 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1896 		err = -EAGAIN;
1897 		if (!timeo)
1898 			goto failure;
1899 		if (signal_pending(current))
1900 			goto interrupted;
1901 		timeo = sock_wait_for_wmem(sk, timeo);
1902 	}
1903 	skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
1904 				   errcode, sk->sk_allocation);
1905 	if (skb)
1906 		skb_set_owner_w(skb, sk);
1907 	return skb;
1908 
1909 interrupted:
1910 	err = sock_intr_errno(timeo);
1911 failure:
1912 	*errcode = err;
1913 	return NULL;
1914 }
1915 EXPORT_SYMBOL(sock_alloc_send_pskb);
1916 
1917 struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
1918 				    int noblock, int *errcode)
1919 {
1920 	return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0);
1921 }
1922 EXPORT_SYMBOL(sock_alloc_send_skb);
1923 
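/* Handle one SOL_SOCKET control message attached to a sendmsg() call and
 * record its effect in @sockc: SO_MARK (CAP_NET_ADMIN required) sets the
 * per-call mark, SO_TIMESTAMPING sets the per-call tx timestamp flags.
 * SCM_RIGHTS/SCM_CREDENTIALS are accepted here but processed elsewhere.
 */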
1924 int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg,
1925 		     struct sockcm_cookie *sockc)
1926 {
1927 	u32 tsflags;
1928 
1929 	switch (cmsg->cmsg_type) {
1930 	case SO_MARK:
1931 		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
1932 			return -EPERM;
1933 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
1934 			return -EINVAL;
1935 		sockc->mark = *(u32 *)CMSG_DATA(cmsg);
1936 		break;
1937 	case SO_TIMESTAMPING:
1938 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
1939 			return -EINVAL;
1940 
1941 		tsflags = *(u32 *)CMSG_DATA(cmsg);
1942 		if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
1943 			return -EINVAL;
1944 
1945 		sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
1946 		sockc->tsflags |= tsflags;
1947 		break;
1948 	/* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
1949 	case SCM_RIGHTS:
1950 	case SCM_CREDENTIALS:
1951 		break;
1952 	default:
1953 		return -EINVAL;
1954 	}
1955 	return 0;
1956 }
1957 EXPORT_SYMBOL(__sock_cmsg_send);
1958 
1959 int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
1960 		   struct sockcm_cookie *sockc)
1961 {
1962 	struct cmsghdr *cmsg;
1963 	int ret;
1964 
1965 	for_each_cmsghdr(cmsg, msg) {
1966 		if (!CMSG_OK(msg, cmsg))
1967 			return -EINVAL;
1968 		if (cmsg->cmsg_level != SOL_SOCKET)
1969 			continue;
1970 		ret = __sock_cmsg_send(sk, msg, cmsg, sockc);
1971 		if (ret)
1972 			return ret;
1973 	}
1974 	return 0;
1975 }
1976 EXPORT_SYMBOL(sock_cmsg_send);
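
/*
 * Illustrative userspace sketch (assumption, not from this file): the
 * SO_MARK handling above corresponds to a sender attaching the mark as
 * ancillary data on a single sendmsg() call; this requires CAP_NET_ADMIN
 * and msg_name/msg_iov are omitted here for brevity:
 *
 *	char cbuf[CMSG_SPACE(sizeof(uint32_t))] = { 0 };
 *	uint32_t mark = 42;
 *	struct msghdr mh = { 0 };
 *	struct cmsghdr *cm;
 *
 *	mh.msg_control = cbuf;
 *	mh.msg_controllen = sizeof(cbuf);
 *	cm = CMSG_FIRSTHDR(&mh);
 *	cm->cmsg_level = SOL_SOCKET;
 *	cm->cmsg_type = SO_MARK;
 *	cm->cmsg_len = CMSG_LEN(sizeof(uint32_t));
 *	memcpy(CMSG_DATA(cm), &mark, sizeof(mark));
 *	sendmsg(fd, &mh, 0);
 */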
1977 
1978 /* On 32bit arches, an skb frag is limited to 2^15 */
1979 #define SKB_FRAG_PAGE_ORDER	get_order(32768)
1980 
1981 /**
1982  * skb_page_frag_refill - check that a page_frag contains enough room
1983  * @sz: minimum size of the fragment we want to get
1984  * @pfrag: pointer to page_frag
1985  * @gfp: priority for memory allocation
1986  *
1987  * Note: While this allocator tries to use high order pages, there is
1988  * no guarantee that allocations succeed. Therefore, @sz MUST be
1989  * less than or equal to PAGE_SIZE.
1990  */
1991 bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
1992 {
1993 	if (pfrag->page) {
1994 		if (page_ref_count(pfrag->page) == 1) {
1995 			pfrag->offset = 0;
1996 			return true;
1997 		}
1998 		if (pfrag->offset + sz <= pfrag->size)
1999 			return true;
2000 		put_page(pfrag->page);
2001 	}
2002 
2003 	pfrag->offset = 0;
2004 	if (SKB_FRAG_PAGE_ORDER) {
2005 		/* Avoid direct reclaim but allow kswapd to wake */
2006 		pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
2007 					  __GFP_COMP | __GFP_NOWARN |
2008 					  __GFP_NORETRY,
2009 					  SKB_FRAG_PAGE_ORDER);
2010 		if (likely(pfrag->page)) {
2011 			pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
2012 			return true;
2013 		}
2014 	}
2015 	pfrag->page = alloc_page(gfp);
2016 	if (likely(pfrag->page)) {
2017 		pfrag->size = PAGE_SIZE;
2018 		return true;
2019 	}
2020 	return false;
2021 }
2022 EXPORT_SYMBOL(skb_page_frag_refill);
2023 
2024 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
2025 {
2026 	if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
2027 		return true;
2028 
2029 	sk_enter_memory_pressure(sk);
2030 	sk_stream_moderate_sndbuf(sk);
2031 	return false;
2032 }
2033 EXPORT_SYMBOL(sk_page_frag_refill);
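
/*
 * Illustrative sketch (assumption, not from this file): a stream sendmsg()
 * path typically pairs the helper above with the per-socket page_frag,
 * appending the copied data as a new skb fragment; "skb", "i" and "copy"
 * are hypothetical locals:
 *
 *	struct page_frag *pfrag = sk_page_frag(sk);
 *
 *	if (!sk_page_frag_refill(sk, pfrag))
 *		goto wait_for_memory;
 *
 *	copy = min_t(int, copy, pfrag->size - pfrag->offset);
 *	(copy user data to page_address(pfrag->page) + pfrag->offset)
 *
 *	get_page(pfrag->page);
 *	skb_fill_page_desc(skb, i, pfrag->page, pfrag->offset, copy);
 *	pfrag->offset += copy;
 */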
2034 
2035 static void __lock_sock(struct sock *sk)
2036 	__releases(&sk->sk_lock.slock)
2037 	__acquires(&sk->sk_lock.slock)
2038 {
2039 	DEFINE_WAIT(wait);
2040 
2041 	for (;;) {
2042 		prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
2043 					TASK_UNINTERRUPTIBLE);
2044 		spin_unlock_bh(&sk->sk_lock.slock);
2045 		schedule();
2046 		spin_lock_bh(&sk->sk_lock.slock);
2047 		if (!sock_owned_by_user(sk))
2048 			break;
2049 	}
2050 	finish_wait(&sk->sk_lock.wq, &wait);
2051 }
2052 
2053 static void __release_sock(struct sock *sk)
2054 	__releases(&sk->sk_lock.slock)
2055 	__acquires(&sk->sk_lock.slock)
2056 {
2057 	struct sk_buff *skb, *next;
2058 
2059 	while ((skb = sk->sk_backlog.head) != NULL) {
2060 		sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
2061 
2062 		spin_unlock_bh(&sk->sk_lock.slock);
2063 
2064 		do {
2065 			next = skb->next;
2066 			prefetch(next);
2067 			WARN_ON_ONCE(skb_dst_is_noref(skb));
2068 			skb->next = NULL;
2069 			sk_backlog_rcv(sk, skb);
2070 
2071 			cond_resched();
2072 
2073 			skb = next;
2074 		} while (skb != NULL);
2075 
2076 		spin_lock_bh(&sk->sk_lock.slock);
2077 	}
2078 
2079 	/*
2080 	 * Doing the zeroing here guarantee we can not loop forever
2081 	 * Doing the zeroing here guarantees that we cannot loop forever
2082 	 */
2083 	sk->sk_backlog.len = 0;
2084 }
2085 
2086 void __sk_flush_backlog(struct sock *sk)
2087 {
2088 	spin_lock_bh(&sk->sk_lock.slock);
2089 	__release_sock(sk);
2090 	spin_unlock_bh(&sk->sk_lock.slock);
2091 }
2092 
2093 /**
2094  * sk_wait_data - wait for data to arrive at sk_receive_queue
2095  * @sk:    sock to wait on
2096  * @timeo: pointer to the maximum time to wait, in jiffies; updated with the time remaining
2097  * @skb:   last skb seen on sk_receive_queue
2098  *
2099  * Socket state, including sk->sk_err, is changed only under the socket lock,
2100  * hence we may omit checks after joining the wait queue.
2101  * We check the receive queue before schedule() only as an optimization;
2102  * it is very likely that release_sock() added new data.
2103  */
2104 int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
2105 {
2106 	DEFINE_WAIT_FUNC(wait, woken_wake_function);
2107 	int rc;
2108 
2109 	add_wait_queue(sk_sleep(sk), &wait);
2110 	sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2111 	rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
2112 	sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2113 	remove_wait_queue(sk_sleep(sk), &wait);
2114 	return rc;
2115 }
2116 EXPORT_SYMBOL(sk_wait_data);
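
/*
 * Illustrative sketch (assumption, not from this file): a blocking recvmsg()
 * implementation typically calls sk_wait_data() with the socket lock held,
 * rechecking the receive queue each time it returns:
 *
 *	long timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
 *	struct sk_buff *skb;
 *	int err = 0;
 *
 *	lock_sock(sk);
 *	while (!(skb = skb_peek(&sk->sk_receive_queue))) {
 *		if (!timeo) {
 *			err = -EAGAIN;
 *			break;
 *		}
 *		if (signal_pending(current)) {
 *			err = sock_intr_errno(timeo);
 *			break;
 *		}
 *		sk_wait_data(sk, &timeo, NULL);
 *	}
 *	release_sock(sk);
 */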
2117 
2118 /**
2119  *	__sk_mem_raise_allocated - increase memory_allocated
2120  *	@sk: socket
2121  *	@size: memory size to allocate
2122  *	@amt: pages to allocate
2123  *	@kind: allocation type
2124  *
2125  *	Similar to __sk_mem_schedule(), but does not update sk_forward_alloc
2126  */
2127 int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
2128 {
2129 	struct proto *prot = sk->sk_prot;
2130 	long allocated = sk_memory_allocated_add(sk, amt);
2131 
2132 	if (mem_cgroup_sockets_enabled && sk->sk_memcg &&
2133 	    !mem_cgroup_charge_skmem(sk->sk_memcg, amt))
2134 		goto suppress_allocation;
2135 
2136 	/* Under limit. */
2137 	if (allocated <= sk_prot_mem_limits(sk, 0)) {
2138 		sk_leave_memory_pressure(sk);
2139 		return 1;
2140 	}
2141 
2142 	/* Under pressure. */
2143 	if (allocated > sk_prot_mem_limits(sk, 1))
2144 		sk_enter_memory_pressure(sk);
2145 
2146 	/* Over hard limit. */
2147 	if (allocated > sk_prot_mem_limits(sk, 2))
2148 		goto suppress_allocation;
2149 
2150 	/* guarantee minimum buffer size under pressure */
2151 	if (kind == SK_MEM_RECV) {
2152 		if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
2153 			return 1;
2154 
2155 	} else { /* SK_MEM_SEND */
2156 		if (sk->sk_type == SOCK_STREAM) {
2157 			if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
2158 				return 1;
2159 		} else if (atomic_read(&sk->sk_wmem_alloc) <
2160 			   prot->sysctl_wmem[0])
2161 				return 1;
2162 	}
2163 
2164 	if (sk_has_memory_pressure(sk)) {
2165 		int alloc;
2166 
2167 		if (!sk_under_memory_pressure(sk))
2168 			return 1;
2169 		alloc = sk_sockets_allocated_read_positive(sk);
2170 		if (sk_prot_mem_limits(sk, 2) > alloc *
2171 		    sk_mem_pages(sk->sk_wmem_queued +
2172 				 atomic_read(&sk->sk_rmem_alloc) +
2173 				 sk->sk_forward_alloc))
2174 			return 1;
2175 	}
2176 
2177 suppress_allocation:
2178 
2179 	if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
2180 		sk_stream_moderate_sndbuf(sk);
2181 
2182 		/* Fail only if socket is _under_ its sndbuf.
2183 		 * In this case we cannot block, so we have to fail.
2184 		 */
2185 		if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
2186 			return 1;
2187 	}
2188 
2189 	trace_sock_exceed_buf_limit(sk, prot, allocated);
2190 
2191 	sk_memory_allocated_sub(sk, amt);
2192 
2193 	if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2194 		mem_cgroup_uncharge_skmem(sk->sk_memcg, amt);
2195 
2196 	return 0;
2197 }
2198 EXPORT_SYMBOL(__sk_mem_raise_allocated);
2199 
2200 /**
2201  *	__sk_mem_schedule - increase sk_forward_alloc and memory_allocated
2202  *	@sk: socket
2203  *	@size: memory size to allocate
2204  *	@kind: allocation type
2205  *
2206  *	If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
2207  *	rmem allocation. This function assumes that protocols which have
2208  *	memory_pressure use sk_wmem_queued as write buffer accounting.
2209  */
2210 int __sk_mem_schedule(struct sock *sk, int size, int kind)
2211 {
2212 	int ret, amt = sk_mem_pages(size);
2213 
2214 	sk->sk_forward_alloc += amt << SK_MEM_QUANTUM_SHIFT;
2215 	ret = __sk_mem_raise_allocated(sk, size, amt, kind);
2216 	if (!ret)
2217 		sk->sk_forward_alloc -= amt << SK_MEM_QUANTUM_SHIFT;
2218 	return ret;
2219 }
2220 EXPORT_SYMBOL(__sk_mem_schedule);
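
/*
 * Illustrative sketch (assumption, not from this file): protocols normally
 * reach __sk_mem_schedule() through the sk_wmem_schedule()/sk_rmem_schedule()
 * wrappers when charging an skb to the socket, e.g. on the send side:
 *
 *	if (!sk_wmem_schedule(sk, skb->truesize)) {
 *		kfree_skb(skb);
 *		return -ENOBUFS;
 *	}
 *	sk->sk_wmem_queued += skb->truesize;
 *	sk_mem_charge(sk, skb->truesize);
 */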
2221 
2222 /**
2223  *	__sk_mem_reduce_allocated - reclaim memory_allocated
2224  *	@sk: socket
2225  *	@amount: number of quanta
2226  *
2227  *	Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
2228  */
2229 void __sk_mem_reduce_allocated(struct sock *sk, int amount)
2230 {
2231 	sk_memory_allocated_sub(sk, amount);
2232 
2233 	if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2234 		mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
2235 
2236 	if (sk_under_memory_pressure(sk) &&
2237 	    (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
2238 		sk_leave_memory_pressure(sk);
2239 }
2240 EXPORT_SYMBOL(__sk_mem_reduce_allocated);
2241 
2242 /**
2243  *	__sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
2244  *	@sk: socket
2245  *	@amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple)
2246  */
2247 void __sk_mem_reclaim(struct sock *sk, int amount)
2248 {
2249 	amount >>= SK_MEM_QUANTUM_SHIFT;
2250 	sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
2251 	__sk_mem_reduce_allocated(sk, amount);
2252 }
2253 EXPORT_SYMBOL(__sk_mem_reclaim);
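
/*
 * Illustrative sketch (assumption, not from this file): the usual inverse of
 * the charge shown above, when an skb leaves the write queue; sk_mem_reclaim()
 * only calls __sk_mem_reclaim() once at least one full quantum can be returned:
 *
 *	sk->sk_wmem_queued -= skb->truesize;
 *	sk_mem_uncharge(sk, skb->truesize);
 *	__kfree_skb(skb);
 *	sk_mem_reclaim(sk);
 */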
2254 
2255 int sk_set_peek_off(struct sock *sk, int val)
2256 {
2257 	if (val < 0)
2258 		return -EINVAL;
2259 
2260 	sk->sk_peek_off = val;
2261 	return 0;
2262 }
2263 EXPORT_SYMBOL_GPL(sk_set_peek_off);
2264 
2265 /*
2266  * Set of default routines for initialising struct proto_ops when
2267  * the protocol does not support a particular function. In certain
2268  * cases where it makes no sense for a protocol to have a "do nothing"
2269  * function, some default processing is provided.
2270  */
2271 
2272 int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
2273 {
2274 	return -EOPNOTSUPP;
2275 }
2276 EXPORT_SYMBOL(sock_no_bind);
2277 
2278 int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
2279 		    int len, int flags)
2280 {
2281 	return -EOPNOTSUPP;
2282 }
2283 EXPORT_SYMBOL(sock_no_connect);
2284 
2285 int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
2286 {
2287 	return -EOPNOTSUPP;
2288 }
2289 EXPORT_SYMBOL(sock_no_socketpair);
2290 
2291 int sock_no_accept(struct socket *sock, struct socket *newsock, int flags,
2292 		   bool kern)
2293 {
2294 	return -EOPNOTSUPP;
2295 }
2296 EXPORT_SYMBOL(sock_no_accept);
2297 
2298 int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
2299 		    int *len, int peer)
2300 {
2301 	return -EOPNOTSUPP;
2302 }
2303 EXPORT_SYMBOL(sock_no_getname);
2304 
2305 unsigned int sock_no_poll(struct file *file, struct socket *sock, poll_table *pt)
2306 {
2307 	return 0;
2308 }
2309 EXPORT_SYMBOL(sock_no_poll);
2310 
2311 int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2312 {
2313 	return -EOPNOTSUPP;
2314 }
2315 EXPORT_SYMBOL(sock_no_ioctl);
2316 
2317 int sock_no_listen(struct socket *sock, int backlog)
2318 {
2319 	return -EOPNOTSUPP;
2320 }
2321 EXPORT_SYMBOL(sock_no_listen);
2322 
2323 int sock_no_shutdown(struct socket *sock, int how)
2324 {
2325 	return -EOPNOTSUPP;
2326 }
2327 EXPORT_SYMBOL(sock_no_shutdown);
2328 
2329 int sock_no_setsockopt(struct socket *sock, int level, int optname,
2330 		    char __user *optval, unsigned int optlen)
2331 {
2332 	return -EOPNOTSUPP;
2333 }
2334 EXPORT_SYMBOL(sock_no_setsockopt);
2335 
2336 int sock_no_getsockopt(struct socket *sock, int level, int optname,
2337 		    char __user *optval, int __user *optlen)
2338 {
2339 	return -EOPNOTSUPP;
2340 }
2341 EXPORT_SYMBOL(sock_no_getsockopt);
2342 
2343 int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
2344 {
2345 	return -EOPNOTSUPP;
2346 }
2347 EXPORT_SYMBOL(sock_no_sendmsg);
2348 
2349 int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
2350 		    int flags)
2351 {
2352 	return -EOPNOTSUPP;
2353 }
2354 EXPORT_SYMBOL(sock_no_recvmsg);
2355 
2356 int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
2357 {
2358 	/* Mirror missing mmap method error code */
2359 	return -ENODEV;
2360 }
2361 EXPORT_SYMBOL(sock_no_mmap);
2362 
2363 ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
2364 {
2365 	ssize_t res;
2366 	struct msghdr msg = {.msg_flags = flags};
2367 	struct kvec iov;
2368 	char *kaddr = kmap(page);
2369 	iov.iov_base = kaddr + offset;
2370 	iov.iov_len = size;
2371 	res = kernel_sendmsg(sock, &msg, &iov, 1, size);
2372 	kunmap(page);
2373 	return res;
2374 }
2375 EXPORT_SYMBOL(sock_no_sendpage);
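
/*
 * Illustrative sketch (assumption, not from this file): a protocol that only
 * implements a subset of the socket API wires the sock_no_*() stubs above
 * into its proto_ops; all "example_*" names and PF_EXAMPLE are hypothetical:
 *
 *	static const struct proto_ops example_dgram_ops = {
 *		.family		= PF_EXAMPLE,
 *		.owner		= THIS_MODULE,
 *		.release	= example_release,
 *		.bind		= example_bind,
 *		.connect	= sock_no_connect,
 *		.socketpair	= sock_no_socketpair,
 *		.accept		= sock_no_accept,
 *		.getname	= example_getname,
 *		.poll		= datagram_poll,
 *		.ioctl		= sock_no_ioctl,
 *		.listen		= sock_no_listen,
 *		.shutdown	= sock_no_shutdown,
 *		.setsockopt	= sock_no_setsockopt,
 *		.getsockopt	= sock_no_getsockopt,
 *		.sendmsg	= example_sendmsg,
 *		.recvmsg	= example_recvmsg,
 *		.mmap		= sock_no_mmap,
 *		.sendpage	= sock_no_sendpage,
 *	};
 */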
2376 
2377 /*
2378  *	Default Socket Callbacks
2379  */
2380 
2381 static void sock_def_wakeup(struct sock *sk)
2382 {
2383 	struct socket_wq *wq;
2384 
2385 	rcu_read_lock();
2386 	wq = rcu_dereference(sk->sk_wq);
2387 	if (skwq_has_sleeper(wq))
2388 		wake_up_interruptible_all(&wq->wait);
2389 	rcu_read_unlock();
2390 }
2391 
2392 static void sock_def_error_report(struct sock *sk)
2393 {
2394 	struct socket_wq *wq;
2395 
2396 	rcu_read_lock();
2397 	wq = rcu_dereference(sk->sk_wq);
2398 	if (skwq_has_sleeper(wq))
2399 		wake_up_interruptible_poll(&wq->wait, POLLERR);
2400 	sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
2401 	rcu_read_unlock();
2402 }
2403 
2404 static void sock_def_readable(struct sock *sk)
2405 {
2406 	struct socket_wq *wq;
2407 
2408 	rcu_read_lock();
2409 	wq = rcu_dereference(sk->sk_wq);
2410 	if (skwq_has_sleeper(wq))
2411 		wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI |
2412 						POLLRDNORM | POLLRDBAND);
2413 	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
2414 	rcu_read_unlock();
2415 }
2416 
2417 static void sock_def_write_space(struct sock *sk)
2418 {
2419 	struct socket_wq *wq;
2420 
2421 	rcu_read_lock();
2422 
2423 	/* Do not wake up a writer until he can make "significant"
2424 	 * progress.  --DaveM
2425 	 */
2426 	if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
2427 		wq = rcu_dereference(sk->sk_wq);
2428 		if (skwq_has_sleeper(wq))
2429 			wake_up_interruptible_sync_poll(&wq->wait, POLLOUT |
2430 						POLLWRNORM | POLLWRBAND);
2431 
2432 		/* Should agree with poll, otherwise some programs break */
2433 		if (sock_writeable(sk))
2434 			sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
2435 	}
2436 
2437 	rcu_read_unlock();
2438 }
2439 
2440 static void sock_def_destruct(struct sock *sk)
2441 {
2442 }
2443 
2444 void sk_send_sigurg(struct sock *sk)
2445 {
2446 	if (sk->sk_socket && sk->sk_socket->file)
2447 		if (send_sigurg(&sk->sk_socket->file->f_owner))
2448 			sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
2449 }
2450 EXPORT_SYMBOL(sk_send_sigurg);
2451 
2452 void sk_reset_timer(struct sock *sk, struct timer_list* timer,
2453 		    unsigned long expires)
2454 {
2455 	if (!mod_timer(timer, expires))
2456 		sock_hold(sk);
2457 }
2458 EXPORT_SYMBOL(sk_reset_timer);
2459 
2460 void sk_stop_timer(struct sock *sk, struct timer_list* timer)
2461 {
2462 	if (del_timer(timer))
2463 		__sock_put(sk);
2464 }
2465 EXPORT_SYMBOL(sk_stop_timer);
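
/*
 * Illustrative sketch (assumption, not from this file): these helpers keep
 * the sock reference count in step with a pending protocol timer, e.g.:
 *
 *	sk_reset_timer(sk, &sk->sk_timer, jiffies + delay);	-- takes a reference
 *	...
 *	sk_stop_timer(sk, &sk->sk_timer);			-- drops it if the timer was pending
 */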
2466 
2467 void sock_init_data(struct socket *sock, struct sock *sk)
2468 {
2469 	skb_queue_head_init(&sk->sk_receive_queue);
2470 	skb_queue_head_init(&sk->sk_write_queue);
2471 	skb_queue_head_init(&sk->sk_error_queue);
2472 
2473 	sk->sk_send_head	=	NULL;
2474 
2475 	init_timer(&sk->sk_timer);
2476 
2477 	sk->sk_allocation	=	GFP_KERNEL;
2478 	sk->sk_rcvbuf		=	sysctl_rmem_default;
2479 	sk->sk_sndbuf		=	sysctl_wmem_default;
2480 	sk->sk_state		=	TCP_CLOSE;
2481 	sk_set_socket(sk, sock);
2482 
2483 	sock_set_flag(sk, SOCK_ZAPPED);
2484 
2485 	if (sock) {
2486 		sk->sk_type	=	sock->type;
2487 		sk->sk_wq	=	sock->wq;
2488 		sock->sk	=	sk;
2489 		sk->sk_uid	=	SOCK_INODE(sock)->i_uid;
2490 	} else {
2491 		sk->sk_wq	=	NULL;
2492 		sk->sk_uid	=	make_kuid(sock_net(sk)->user_ns, 0);
2493 	}
2494 
2495 	rwlock_init(&sk->sk_callback_lock);
2496 	if (sk->sk_kern_sock)
2497 		lockdep_set_class_and_name(
2498 			&sk->sk_callback_lock,
2499 			af_kern_callback_keys + sk->sk_family,
2500 			af_family_kern_clock_key_strings[sk->sk_family]);
2501 	else
2502 		lockdep_set_class_and_name(
2503 			&sk->sk_callback_lock,
2504 			af_callback_keys + sk->sk_family,
2505 			af_family_clock_key_strings[sk->sk_family]);
2506 
2507 	sk->sk_state_change	=	sock_def_wakeup;
2508 	sk->sk_data_ready	=	sock_def_readable;
2509 	sk->sk_write_space	=	sock_def_write_space;
2510 	sk->sk_error_report	=	sock_def_error_report;
2511 	sk->sk_destruct		=	sock_def_destruct;
2512 
2513 	sk->sk_frag.page	=	NULL;
2514 	sk->sk_frag.offset	=	0;
2515 	sk->sk_peek_off		=	-1;
2516 
2517 	sk->sk_peer_pid 	=	NULL;
2518 	sk->sk_peer_cred	=	NULL;
2519 	sk->sk_write_pending	=	0;
2520 	sk->sk_rcvlowat		=	1;
2521 	sk->sk_rcvtimeo		=	MAX_SCHEDULE_TIMEOUT;
2522 	sk->sk_sndtimeo		=	MAX_SCHEDULE_TIMEOUT;
2523 
2524 	sk->sk_stamp = ktime_set(-1L, 0);
2525 
2526 #ifdef CONFIG_NET_RX_BUSY_POLL
2527 	sk->sk_napi_id		=	0;
2528 	sk->sk_ll_usec		=	sysctl_net_busy_read;
2529 #endif
2530 
2531 	sk->sk_max_pacing_rate = ~0U;
2532 	sk->sk_pacing_rate = ~0U;
2533 	sk->sk_incoming_cpu = -1;
2534 	/*
2535 	 * Before updating sk_refcnt, we must commit prior changes to memory
2536 	 * (Documentation/RCU/rculist_nulls.txt for details)
2537 	 */
2538 	smp_wmb();
2539 	atomic_set(&sk->sk_refcnt, 1);
2540 	atomic_set(&sk->sk_drops, 0);
2541 }
2542 EXPORT_SYMBOL(sock_init_data);
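
/*
 * Illustrative sketch (assumption, not from this file): an address family's
 * create() hook typically allocates the sock and then calls the initializer
 * above; "PF_EXAMPLE", "example_proto" and "example_destruct" are hypothetical:
 *
 *	sk = sk_alloc(net, PF_EXAMPLE, GFP_KERNEL, &example_proto, kern);
 *	if (!sk)
 *		return -ENOBUFS;
 *
 *	sock_init_data(sock, sk);
 *	sk->sk_protocol = protocol;
 *	sk->sk_destruct = example_destruct;
 */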
2543 
2544 void lock_sock_nested(struct sock *sk, int subclass)
2545 {
2546 	might_sleep();
2547 	spin_lock_bh(&sk->sk_lock.slock);
2548 	if (sk->sk_lock.owned)
2549 		__lock_sock(sk);
2550 	sk->sk_lock.owned = 1;
2551 	spin_unlock(&sk->sk_lock.slock);
2552 	/*
2553 	 * The sk_lock has mutex_lock() semantics here:
2554 	 */
2555 	mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
2556 	local_bh_enable();
2557 }
2558 EXPORT_SYMBOL(lock_sock_nested);
2559 
2560 void release_sock(struct sock *sk)
2561 {
2562 	spin_lock_bh(&sk->sk_lock.slock);
2563 	if (sk->sk_backlog.tail)
2564 		__release_sock(sk);
2565 
2566 	/* Warning: release_cb() might need to release sk ownership,
2567 	 * i.e., call sock_release_ownership(sk) before us.
2568 	 */
2569 	if (sk->sk_prot->release_cb)
2570 		sk->sk_prot->release_cb(sk);
2571 
2572 	sock_release_ownership(sk);
2573 	if (waitqueue_active(&sk->sk_lock.wq))
2574 		wake_up(&sk->sk_lock.wq);
2575 	spin_unlock_bh(&sk->sk_lock.slock);
2576 }
2577 EXPORT_SYMBOL(release_sock);
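
/*
 * Illustrative sketch (assumption, not from this file): the standard pattern
 * for process-context code that modifies socket state; packets arriving while
 * the lock is owned are queued to the backlog and run by __release_sock():
 *
 *	lock_sock(sk);
 *	... modify socket state ...
 *	release_sock(sk);
 */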
2578 
2579 /**
2580  * lock_sock_fast - fast version of lock_sock
2581  * @sk: socket
2582  *
2583  * This version should be used for very small sections, where the process won't block.
2584  * Returns false if the fast path is taken:
2585  *   sk_lock.slock locked, owned = 0, BH disabled
2586  * Returns true if the slow path is taken:
2587  *   sk_lock.slock unlocked, owned = 1, BH enabled
2588  */
2589 bool lock_sock_fast(struct sock *sk)
2590 {
2591 	might_sleep();
2592 	spin_lock_bh(&sk->sk_lock.slock);
2593 
2594 	if (!sk->sk_lock.owned)
2595 		/*
2596 		 * Note: we return with BH disabled and sk_lock.slock held; release both via unlock_sock_fast()
2597 		 */
2598 		return false;
2599 
2600 	__lock_sock(sk);
2601 	sk->sk_lock.owned = 1;
2602 	spin_unlock(&sk->sk_lock.slock);
2603 	/*
2604 	 * The sk_lock has mutex_lock() semantics here:
2605 	 */
2606 	mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
2607 	local_bh_enable();
2608 	return true;
2609 }
2610 EXPORT_SYMBOL(lock_sock_fast);
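
/*
 * Illustrative sketch (assumption, not from this file): callers must pair
 * lock_sock_fast() with unlock_sock_fast(), passing back its return value so
 * that the matching unlock path is taken:
 *
 *	bool slow = lock_sock_fast(sk);
 *
 *	... short, non-blocking work on the socket ...
 *
 *	unlock_sock_fast(sk, slow);
 */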
2611 
2612 int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
2613 {
2614 	struct timeval tv;
2615 	if (!sock_flag(sk, SOCK_TIMESTAMP))
2616 		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2617 	tv = ktime_to_timeval(sk->sk_stamp);
2618 	if (tv.tv_sec == -1)
2619 		return -ENOENT;
2620 	if (tv.tv_sec == 0) {
2621 		sk->sk_stamp = ktime_get_real();
2622 		tv = ktime_to_timeval(sk->sk_stamp);
2623 	}
2624 	return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
2625 }
2626 EXPORT_SYMBOL(sock_get_timestamp);
2627 
2628 int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
2629 {
2630 	struct timespec ts;
2631 	if (!sock_flag(sk, SOCK_TIMESTAMP))
2632 		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2633 	ts = ktime_to_timespec(sk->sk_stamp);
2634 	if (ts.tv_sec == -1)
2635 		return -ENOENT;
2636 	if (ts.tv_sec == 0) {
2637 		sk->sk_stamp = ktime_get_real();
2638 		ts = ktime_to_timespec(sk->sk_stamp);
2639 	}
2640 	return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
2641 }
2642 EXPORT_SYMBOL(sock_get_timestampns);
2643 
2644 void sock_enable_timestamp(struct sock *sk, int flag)
2645 {
2646 	if (!sock_flag(sk, flag)) {
2647 		unsigned long previous_flags = sk->sk_flags;
2648 
2649 		sock_set_flag(sk, flag);
2650 		/*
2651 		 * We just set one of the two flags which require net
2652 		 * time stamping, but time stamping might already have
2653 		 * been enabled because of the other one.
2654 		 */
2655 		if (sock_needs_netstamp(sk) &&
2656 		    !(previous_flags & SK_FLAGS_TIMESTAMP))
2657 			net_enable_timestamp();
2658 	}
2659 }
2660 
2661 int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
2662 		       int level, int type)
2663 {
2664 	struct sock_exterr_skb *serr;
2665 	struct sk_buff *skb;
2666 	int copied, err;
2667 
2668 	err = -EAGAIN;
2669 	skb = sock_dequeue_err_skb(sk);
2670 	if (skb == NULL)
2671 		goto out;
2672 
2673 	copied = skb->len;
2674 	if (copied > len) {
2675 		msg->msg_flags |= MSG_TRUNC;
2676 		copied = len;
2677 	}
2678 	err = skb_copy_datagram_msg(skb, 0, msg, copied);
2679 	if (err)
2680 		goto out_free_skb;
2681 
2682 	sock_recv_timestamp(msg, sk, skb);
2683 
2684 	serr = SKB_EXT_ERR(skb);
2685 	put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
2686 
2687 	msg->msg_flags |= MSG_ERRQUEUE;
2688 	err = copied;
2689 
2690 out_free_skb:
2691 	kfree_skb(skb);
2692 out:
2693 	return err;
2694 }
2695 EXPORT_SYMBOL(sock_recv_errqueue);
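
/*
 * Illustrative userspace sketch (assumption, not from this file): the queued
 * errors drained above are read with MSG_ERRQUEUE and show up as a cmsg of
 * the level/type the calling protocol passed in, e.g. IP_RECVERR for IPv4:
 *
 *	char cbuf[512];
 *	struct msghdr mh = { .msg_control = cbuf, .msg_controllen = sizeof(cbuf) };
 *	struct cmsghdr *cm;
 *
 *	if (recvmsg(fd, &mh, MSG_ERRQUEUE) < 0)
 *		return;
 *	for (cm = CMSG_FIRSTHDR(&mh); cm; cm = CMSG_NXTHDR(&mh, cm)) {
 *		if (cm->cmsg_level == SOL_IP && cm->cmsg_type == IP_RECVERR) {
 *			struct sock_extended_err *ee = (void *)CMSG_DATA(cm);
 *			(inspect ee->ee_errno, ee->ee_origin, ...)
 *		}
 *	}
 */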
2696 
2697 /*
2698  *	Get a socket option on a socket.
2699  *
2700  *	FIX: POSIX 1003.1g is very ambiguous here. It states that
2701  *	asynchronous errors should be reported by getsockopt. We assume
2702  *	this means if you specify SO_ERROR (otherwise, what's the point of it?).
2703  */
2704 int sock_common_getsockopt(struct socket *sock, int level, int optname,
2705 			   char __user *optval, int __user *optlen)
2706 {
2707 	struct sock *sk = sock->sk;
2708 
2709 	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2710 }
2711 EXPORT_SYMBOL(sock_common_getsockopt);
2712 
2713 #ifdef CONFIG_COMPAT
2714 int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
2715 				  char __user *optval, int __user *optlen)
2716 {
2717 	struct sock *sk = sock->sk;
2718 
2719 	if (sk->sk_prot->compat_getsockopt != NULL)
2720 		return sk->sk_prot->compat_getsockopt(sk, level, optname,
2721 						      optval, optlen);
2722 	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2723 }
2724 EXPORT_SYMBOL(compat_sock_common_getsockopt);
2725 #endif
2726 
2727 int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
2728 			int flags)
2729 {
2730 	struct sock *sk = sock->sk;
2731 	int addr_len = 0;
2732 	int err;
2733 
2734 	err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
2735 				   flags & ~MSG_DONTWAIT, &addr_len);
2736 	if (err >= 0)
2737 		msg->msg_namelen = addr_len;
2738 	return err;
2739 }
2740 EXPORT_SYMBOL(sock_common_recvmsg);
2741 
2742 /*
2743  *	Set socket options on a socket.
2744  */
2745 int sock_common_setsockopt(struct socket *sock, int level, int optname,
2746 			   char __user *optval, unsigned int optlen)
2747 {
2748 	struct sock *sk = sock->sk;
2749 
2750 	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2751 }
2752 EXPORT_SYMBOL(sock_common_setsockopt);
2753 
2754 #ifdef CONFIG_COMPAT
2755 int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
2756 				  char __user *optval, unsigned int optlen)
2757 {
2758 	struct sock *sk = sock->sk;
2759 
2760 	if (sk->sk_prot->compat_setsockopt != NULL)
2761 		return sk->sk_prot->compat_setsockopt(sk, level, optname,
2762 						      optval, optlen);
2763 	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2764 }
2765 EXPORT_SYMBOL(compat_sock_common_setsockopt);
2766 #endif
2767 
2768 void sk_common_release(struct sock *sk)
2769 {
2770 	if (sk->sk_prot->destroy)
2771 		sk->sk_prot->destroy(sk);
2772 
2773 	/*
2774 	 * Observation: when sk_common_release() is called, processes have
2775 	 * no access to the socket, but the network stack still does.
2776 	 * Step one, detach it from networking:
2777 	 *
2778 	 * A. Remove from hash tables.
2779 	 */
2780 
2781 	sk->sk_prot->unhash(sk);
2782 
2783 	/*
2784 	 * At this point the socket cannot receive new packets, but it is
2785 	 * possible that some packets are still in flight because another CPU
2786 	 * did its hash table lookup before we unhashed the socket. They will
2787 	 * reach the receive queue and be purged by the socket destructor.
2788 	 *
2789 	 * Also, we still have packets pending on the receive queue and,
2790 	 * probably, our own packets waiting in device queues. sock_destroy
2791 	 * will drain the receive queue, but transmitted packets will delay
2792 	 * socket destruction until the last reference is released.
2793 	 */
2794 
2795 	sock_orphan(sk);
2796 
2797 	xfrm_sk_free_policy(sk);
2798 
2799 	sk_refcnt_debug_release(sk);
2800 
2801 	sock_put(sk);
2802 }
2803 EXPORT_SYMBOL(sk_common_release);
2804 
2805 #ifdef CONFIG_PROC_FS
2806 #define PROTO_INUSE_NR	64	/* should be enough for the first time */
2807 struct prot_inuse {
2808 	int val[PROTO_INUSE_NR];
2809 };
2810 
2811 static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
2812 
2813 #ifdef CONFIG_NET_NS
2814 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2815 {
2816 	__this_cpu_add(net->core.inuse->val[prot->inuse_idx], val);
2817 }
2818 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2819 
2820 int sock_prot_inuse_get(struct net *net, struct proto *prot)
2821 {
2822 	int cpu, idx = prot->inuse_idx;
2823 	int res = 0;
2824 
2825 	for_each_possible_cpu(cpu)
2826 		res += per_cpu_ptr(net->core.inuse, cpu)->val[idx];
2827 
2828 	return res >= 0 ? res : 0;
2829 }
2830 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
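
/*
 * Illustrative sketch (assumption, not from this file): protocols bump this
 * per-cpu counter from their hash/unhash callbacks, e.g.:
 *
 *	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);	-- on hash
 *	...
 *	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);	-- on unhash
 */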
2831 
2832 static int __net_init sock_inuse_init_net(struct net *net)
2833 {
2834 	net->core.inuse = alloc_percpu(struct prot_inuse);
2835 	return net->core.inuse ? 0 : -ENOMEM;
2836 }
2837 
2838 static void __net_exit sock_inuse_exit_net(struct net *net)
2839 {
2840 	free_percpu(net->core.inuse);
2841 }
2842 
2843 static struct pernet_operations net_inuse_ops = {
2844 	.init = sock_inuse_init_net,
2845 	.exit = sock_inuse_exit_net,
2846 };
2847 
2848 static __init int net_inuse_init(void)
2849 {
2850 	if (register_pernet_subsys(&net_inuse_ops))
2851 		panic("Cannot initialize net inuse counters");
2852 
2853 	return 0;
2854 }
2855 
2856 core_initcall(net_inuse_init);
2857 #else
2858 static DEFINE_PER_CPU(struct prot_inuse, prot_inuse);
2859 
2860 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2861 {
2862 	__this_cpu_add(prot_inuse.val[prot->inuse_idx], val);
2863 }
2864 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2865 
2866 int sock_prot_inuse_get(struct net *net, struct proto *prot)
2867 {
2868 	int cpu, idx = prot->inuse_idx;
2869 	int res = 0;
2870 
2871 	for_each_possible_cpu(cpu)
2872 		res += per_cpu(prot_inuse, cpu).val[idx];
2873 
2874 	return res >= 0 ? res : 0;
2875 }
2876 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2877 #endif
2878 
2879 static void assign_proto_idx(struct proto *prot)
2880 {
2881 	prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
2882 
2883 	if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
2884 		pr_err("PROTO_INUSE_NR exhausted\n");
2885 		return;
2886 	}
2887 
2888 	set_bit(prot->inuse_idx, proto_inuse_idx);
2889 }
2890 
2891 static void release_proto_idx(struct proto *prot)
2892 {
2893 	if (prot->inuse_idx != PROTO_INUSE_NR - 1)
2894 		clear_bit(prot->inuse_idx, proto_inuse_idx);
2895 }
2896 #else
2897 static inline void assign_proto_idx(struct proto *prot)
2898 {
2899 }
2900 
2901 static inline void release_proto_idx(struct proto *prot)
2902 {
2903 }
2904 #endif
2905 
2906 static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
2907 {
2908 	if (!rsk_prot)
2909 		return;
2910 	kfree(rsk_prot->slab_name);
2911 	rsk_prot->slab_name = NULL;
2912 	kmem_cache_destroy(rsk_prot->slab);
2913 	rsk_prot->slab = NULL;
2914 }
2915 
2916 static int req_prot_init(const struct proto *prot)
2917 {
2918 	struct request_sock_ops *rsk_prot = prot->rsk_prot;
2919 
2920 	if (!rsk_prot)
2921 		return 0;
2922 
2923 	rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
2924 					prot->name);
2925 	if (!rsk_prot->slab_name)
2926 		return -ENOMEM;
2927 
2928 	rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
2929 					   rsk_prot->obj_size, 0,
2930 					   prot->slab_flags, NULL);
2931 
2932 	if (!rsk_prot->slab) {
2933 		pr_crit("%s: Can't create request sock SLAB cache!\n",
2934 			prot->name);
2935 		return -ENOMEM;
2936 	}
2937 	return 0;
2938 }
2939 
2940 int proto_register(struct proto *prot, int alloc_slab)
2941 {
2942 	if (alloc_slab) {
2943 		prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
2944 					SLAB_HWCACHE_ALIGN | prot->slab_flags,
2945 					NULL);
2946 
2947 		if (prot->slab == NULL) {
2948 			pr_crit("%s: Can't create sock SLAB cache!\n",
2949 				prot->name);
2950 			goto out;
2951 		}
2952 
2953 		if (req_prot_init(prot))
2954 			goto out_free_request_sock_slab;
2955 
2956 		if (prot->twsk_prot != NULL) {
2957 			prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);
2958 
2959 			if (prot->twsk_prot->twsk_slab_name == NULL)
2960 				goto out_free_request_sock_slab;
2961 
2962 			prot->twsk_prot->twsk_slab =
2963 				kmem_cache_create(prot->twsk_prot->twsk_slab_name,
2964 						  prot->twsk_prot->twsk_obj_size,
2965 						  0,
2966 						  prot->slab_flags,
2967 						  NULL);
2968 			if (prot->twsk_prot->twsk_slab == NULL)
2969 				goto out_free_timewait_sock_slab_name;
2970 		}
2971 	}
2972 
2973 	mutex_lock(&proto_list_mutex);
2974 	list_add(&prot->node, &proto_list);
2975 	assign_proto_idx(prot);
2976 	mutex_unlock(&proto_list_mutex);
2977 	return 0;
2978 
2979 out_free_timewait_sock_slab_name:
2980 	kfree(prot->twsk_prot->twsk_slab_name);
2981 out_free_request_sock_slab:
2982 	req_prot_cleanup(prot->rsk_prot);
2983 
2984 	kmem_cache_destroy(prot->slab);
2985 	prot->slab = NULL;
2986 out:
2987 	return -ENOBUFS;
2988 }
2989 EXPORT_SYMBOL(proto_register);
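
/*
 * Illustrative sketch (assumption, not from this file): a minimal protocol
 * definition and registration; "example_proto" and "struct example_sock" are
 * hypothetical, and passing alloc_slab=1 requests a dedicated kmem cache:
 *
 *	static struct proto example_proto = {
 *		.name		= "EXAMPLE",
 *		.owner		= THIS_MODULE,
 *		.obj_size	= sizeof(struct example_sock),
 *	};
 *
 *	err = proto_register(&example_proto, 1);
 *	if (err)
 *		goto out;
 *	...
 *	proto_unregister(&example_proto);
 */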
2990 
2991 void proto_unregister(struct proto *prot)
2992 {
2993 	mutex_lock(&proto_list_mutex);
2994 	release_proto_idx(prot);
2995 	list_del(&prot->node);
2996 	mutex_unlock(&proto_list_mutex);
2997 
2998 	kmem_cache_destroy(prot->slab);
2999 	prot->slab = NULL;
3000 
3001 	req_prot_cleanup(prot->rsk_prot);
3002 
3003 	if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
3004 		kmem_cache_destroy(prot->twsk_prot->twsk_slab);
3005 		kfree(prot->twsk_prot->twsk_slab_name);
3006 		prot->twsk_prot->twsk_slab = NULL;
3007 	}
3008 }
3009 EXPORT_SYMBOL(proto_unregister);
3010 
3011 #ifdef CONFIG_PROC_FS
3012 static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
3013 	__acquires(proto_list_mutex)
3014 {
3015 	mutex_lock(&proto_list_mutex);
3016 	return seq_list_start_head(&proto_list, *pos);
3017 }
3018 
3019 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3020 {
3021 	return seq_list_next(v, &proto_list, pos);
3022 }
3023 
3024 static void proto_seq_stop(struct seq_file *seq, void *v)
3025 	__releases(proto_list_mutex)
3026 {
3027 	mutex_unlock(&proto_list_mutex);
3028 }
3029 
3030 static char proto_method_implemented(const void *method)
3031 {
3032 	return method == NULL ? 'n' : 'y';
3033 }
3034 static long sock_prot_memory_allocated(struct proto *proto)
3035 {
3036 	return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
3037 }
3038 
3039 static char *sock_prot_memory_pressure(struct proto *proto)
3040 {
3041 	return proto->memory_pressure != NULL ?
3042 	proto_memory_pressure(proto) ? "yes" : "no" : "NI";
3043 }
3044 
3045 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
3046 {
3047 
3048 	seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
3049 			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
3050 		   proto->name,
3051 		   proto->obj_size,
3052 		   sock_prot_inuse_get(seq_file_net(seq), proto),
3053 		   sock_prot_memory_allocated(proto),
3054 		   sock_prot_memory_pressure(proto),
3055 		   proto->max_header,
3056 		   proto->slab == NULL ? "no" : "yes",
3057 		   module_name(proto->owner),
3058 		   proto_method_implemented(proto->close),
3059 		   proto_method_implemented(proto->connect),
3060 		   proto_method_implemented(proto->disconnect),
3061 		   proto_method_implemented(proto->accept),
3062 		   proto_method_implemented(proto->ioctl),
3063 		   proto_method_implemented(proto->init),
3064 		   proto_method_implemented(proto->destroy),
3065 		   proto_method_implemented(proto->shutdown),
3066 		   proto_method_implemented(proto->setsockopt),
3067 		   proto_method_implemented(proto->getsockopt),
3068 		   proto_method_implemented(proto->sendmsg),
3069 		   proto_method_implemented(proto->recvmsg),
3070 		   proto_method_implemented(proto->sendpage),
3071 		   proto_method_implemented(proto->bind),
3072 		   proto_method_implemented(proto->backlog_rcv),
3073 		   proto_method_implemented(proto->hash),
3074 		   proto_method_implemented(proto->unhash),
3075 		   proto_method_implemented(proto->get_port),
3076 		   proto_method_implemented(proto->enter_memory_pressure));
3077 }
3078 
3079 static int proto_seq_show(struct seq_file *seq, void *v)
3080 {
3081 	if (v == &proto_list)
3082 		seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
3083 			   "protocol",
3084 			   "size",
3085 			   "sockets",
3086 			   "memory",
3087 			   "press",
3088 			   "maxhdr",
3089 			   "slab",
3090 			   "module",
3091 			   "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
3092 	else
3093 		proto_seq_printf(seq, list_entry(v, struct proto, node));
3094 	return 0;
3095 }
3096 
3097 static const struct seq_operations proto_seq_ops = {
3098 	.start  = proto_seq_start,
3099 	.next   = proto_seq_next,
3100 	.stop   = proto_seq_stop,
3101 	.show   = proto_seq_show,
3102 };
3103 
3104 static int proto_seq_open(struct inode *inode, struct file *file)
3105 {
3106 	return seq_open_net(inode, file, &proto_seq_ops,
3107 			    sizeof(struct seq_net_private));
3108 }
3109 
3110 static const struct file_operations proto_seq_fops = {
3111 	.owner		= THIS_MODULE,
3112 	.open		= proto_seq_open,
3113 	.read		= seq_read,
3114 	.llseek		= seq_lseek,
3115 	.release	= seq_release_net,
3116 };
3117 
3118 static __net_init int proto_init_net(struct net *net)
3119 {
3120 	if (!proc_create("protocols", S_IRUGO, net->proc_net, &proto_seq_fops))
3121 		return -ENOMEM;
3122 
3123 	return 0;
3124 }
3125 
3126 static __net_exit void proto_exit_net(struct net *net)
3127 {
3128 	remove_proc_entry("protocols", net->proc_net);
3129 }
3130 
3131 
3132 static __net_initdata struct pernet_operations proto_net_ops = {
3133 	.init = proto_init_net,
3134 	.exit = proto_exit_net,
3135 };
3136 
3137 static int __init proto_init(void)
3138 {
3139 	return register_pernet_subsys(&proto_net_ops);
3140 }
3141 
3142 subsys_initcall(proto_init);
3143 
3144 #endif /* PROC_FS */
3145