xref: /openbmc/linux/net/core/sock.c (revision f3a8b664)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		Generic socket support routines. Memory allocators, socket lock/release
7  *		handler for protocols to use and generic option handler.
8  *
9  *
10  * Authors:	Ross Biro
11  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *		Florian La Roche, <flla@stud.uni-sb.de>
13  *		Alan Cox, <A.Cox@swansea.ac.uk>
14  *
15  * Fixes:
16  *		Alan Cox	: 	Numerous verify_area() problems
17  *		Alan Cox	:	Connecting on a connecting socket
18  *					now returns an error for tcp.
19  *		Alan Cox	:	sock->protocol is set correctly.
20  *					and is not sometimes left as 0.
21  *		Alan Cox	:	connect handles icmp errors on a
22  *					connect properly. Unfortunately there
23  *					is a restart syscall nasty there. I
24  *					can't match BSD without hacking the C
25  *					library. Ideas urgently sought!
26  *		Alan Cox	:	Disallow bind() to addresses that are
27  *					not ours - especially broadcast ones!!
28  *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
29  *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
30  *					instead they leave that for the DESTROY timer.
31  *		Alan Cox	:	Clean up error flag in accept
32  *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
33  *					was buggy. Put a remove_sock() in the handler
34  *					for memory when we hit 0. Also altered the timer
35  *					code. The ACK stuff can wait and needs major
36  *					TCP layer surgery.
37  *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
38  *					and fixed timer/inet_bh race.
39  *		Alan Cox	:	Added zapped flag for TCP
40  *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
41  *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
42  *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
43  *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
44  *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
45  *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
46  *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
47  *	Pauline Middelink	:	identd support
48  *		Alan Cox	:	Fixed connect() taking signals I think.
49  *		Alan Cox	:	SO_LINGER supported
50  *		Alan Cox	:	Error reporting fixes
51  *		Anonymous	:	inet_create tidied up (sk->reuse setting)
52  *		Alan Cox	:	inet sockets don't set sk->type!
53  *		Alan Cox	:	Split socket option code
54  *		Alan Cox	:	Callbacks
55  *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
56  *		Alex		:	Removed restriction on inet fioctl
57  *		Alan Cox	:	Splitting INET from NET core
58  *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
59  *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
60  *		Alan Cox	:	Split IP from generic code
61  *		Alan Cox	:	New kfree_skbmem()
62  *		Alan Cox	:	Make SO_DEBUG superuser only.
63  *		Alan Cox	:	Allow anyone to clear SO_DEBUG
64  *					(compatibility fix)
65  *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
66  *		Alan Cox	:	Allocator for a socket is settable.
67  *		Alan Cox	:	SO_ERROR includes soft errors.
68  *		Alan Cox	:	Allow NULL arguments on some SO_ opts
69  *		Alan Cox	: 	Generic socket allocation to make hooks
70  *					easier (suggested by Craig Metz).
71  *		Michael Pall	:	SO_ERROR returns positive errno again
72  *              Steve Whitehouse:       Added default destructor to free
73  *                                      protocol private data.
74  *              Steve Whitehouse:       Added various other default routines
75  *                                      common to several socket families.
76  *              Chris Evans     :       Call suser() check last on F_SETOWN
77  *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
78  *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
79  *		Andi Kleen	:	Fix write_space callback
80  *		Chris Evans	:	Security fixes - signedness again
81  *		Arnaldo C. Melo :       cleanups, use skb_queue_purge
82  *
83  * To Fix:
84  *
85  *
86  *		This program is free software; you can redistribute it and/or
87  *		modify it under the terms of the GNU General Public License
88  *		as published by the Free Software Foundation; either version
89  *		2 of the License, or (at your option) any later version.
90  */
91 
92 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
93 
94 #include <linux/capability.h>
95 #include <linux/errno.h>
96 #include <linux/errqueue.h>
97 #include <linux/types.h>
98 #include <linux/socket.h>
99 #include <linux/in.h>
100 #include <linux/kernel.h>
101 #include <linux/module.h>
102 #include <linux/proc_fs.h>
103 #include <linux/seq_file.h>
104 #include <linux/sched.h>
105 #include <linux/timer.h>
106 #include <linux/string.h>
107 #include <linux/sockios.h>
108 #include <linux/net.h>
109 #include <linux/mm.h>
110 #include <linux/slab.h>
111 #include <linux/interrupt.h>
112 #include <linux/poll.h>
113 #include <linux/tcp.h>
114 #include <linux/init.h>
115 #include <linux/highmem.h>
116 #include <linux/user_namespace.h>
117 #include <linux/static_key.h>
118 #include <linux/memcontrol.h>
119 #include <linux/prefetch.h>
120 
121 #include <asm/uaccess.h>
122 
123 #include <linux/netdevice.h>
124 #include <net/protocol.h>
125 #include <linux/skbuff.h>
126 #include <net/net_namespace.h>
127 #include <net/request_sock.h>
128 #include <net/sock.h>
129 #include <linux/net_tstamp.h>
130 #include <net/xfrm.h>
131 #include <linux/ipsec.h>
132 #include <net/cls_cgroup.h>
133 #include <net/netprio_cgroup.h>
134 #include <linux/sock_diag.h>
135 
136 #include <linux/filter.h>
137 #include <net/sock_reuseport.h>
138 
139 #include <trace/events/sock.h>
140 
141 #ifdef CONFIG_INET
142 #include <net/tcp.h>
143 #endif
144 
145 #include <net/busy_poll.h>
146 
147 static DEFINE_MUTEX(proto_list_mutex);
148 static LIST_HEAD(proto_list);
149 
150 /**
151  * sk_ns_capable - General socket capability test
152  * @sk: Socket to use a capability on or through
153  * @user_ns: The user namespace of the capability to use
154  * @cap: The capability to use
155  *
156  * Test to see if the opener of the socket had the capability @cap when
157  * the socket was created and if the current process has the capability
158  * @cap in the user namespace @user_ns.
159  */
160 bool sk_ns_capable(const struct sock *sk,
161 		   struct user_namespace *user_ns, int cap)
162 {
163 	return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
164 		ns_capable(user_ns, cap);
165 }
166 EXPORT_SYMBOL(sk_ns_capable);
167 
168 /**
169  * sk_capable - Socket global capability test
170  * @sk: Socket to use a capability on or through
171  * @cap: The global capability to use
172  *
173  * Test to see if the opener of the socket had the capability @cap when
174  * the socket was created and if the current process has the capability
175  * @cap in all user namespaces.
176  */
177 bool sk_capable(const struct sock *sk, int cap)
178 {
179 	return sk_ns_capable(sk, &init_user_ns, cap);
180 }
181 EXPORT_SYMBOL(sk_capable);
182 
183 /**
184  * sk_net_capable - Network namespace socket capability test
185  * @sk: Socket to use a capability on or through
186  * @cap: The capability to use
187  *
188  * Test to see if the opener of the socket had the capability @cap when the
189  * socket was created and if the current process has the capability @cap over
190  * the network namespace the socket is a member of.
191  */
192 bool sk_net_capable(const struct sock *sk, int cap)
193 {
194 	return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
195 }
196 EXPORT_SYMBOL(sk_net_capable);
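
/* Illustrative use of the three helpers above (a sketch; the setsockopt-style
 * handler implied here is hypothetical).  Privileged per-socket operations
 * usually gate on the socket's own network namespace rather than on
 * init_user_ns:
 *
 *	if (!sk_net_capable(sk, CAP_NET_ADMIN))
 *		return -EPERM;
 *
 * sk_capable() is the stricter, global variant, and sk_ns_capable() lets the
 * caller name the user namespace explicitly.
 */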
197 
198 /*
199  * Each address family might have different locking rules, so we have
200  * one slock key per address family:
201  */
202 static struct lock_class_key af_family_keys[AF_MAX];
203 static struct lock_class_key af_family_slock_keys[AF_MAX];
204 
205 /*
206  * Make lock validator output more readable. (We pre-construct these
207  * strings at build time so that runtime initialization of socket
208  * locks is fast):
209  */
210 static const char *const af_family_key_strings[AF_MAX+1] = {
211   "sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX"     , "sk_lock-AF_INET"     ,
212   "sk_lock-AF_AX25"  , "sk_lock-AF_IPX"      , "sk_lock-AF_APPLETALK",
213   "sk_lock-AF_NETROM", "sk_lock-AF_BRIDGE"   , "sk_lock-AF_ATMPVC"   ,
214   "sk_lock-AF_X25"   , "sk_lock-AF_INET6"    , "sk_lock-AF_ROSE"     ,
215   "sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI"  , "sk_lock-AF_SECURITY" ,
216   "sk_lock-AF_KEY"   , "sk_lock-AF_NETLINK"  , "sk_lock-AF_PACKET"   ,
217   "sk_lock-AF_ASH"   , "sk_lock-AF_ECONET"   , "sk_lock-AF_ATMSVC"   ,
218   "sk_lock-AF_RDS"   , "sk_lock-AF_SNA"      , "sk_lock-AF_IRDA"     ,
219   "sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE"  , "sk_lock-AF_LLC"      ,
220   "sk_lock-27"       , "sk_lock-28"          , "sk_lock-AF_CAN"      ,
221   "sk_lock-AF_TIPC"  , "sk_lock-AF_BLUETOOTH", "sk_lock-AF_IUCV"     ,
222   "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN"     , "sk_lock-AF_PHONET"   ,
223   "sk_lock-AF_IEEE802154", "sk_lock-AF_CAIF" , "sk_lock-AF_ALG"      ,
224   "sk_lock-AF_NFC"   , "sk_lock-AF_VSOCK"    , "sk_lock-AF_KCM"      ,
225   "sk_lock-AF_MAX"
226 };
227 static const char *const af_family_slock_key_strings[AF_MAX+1] = {
228   "slock-AF_UNSPEC", "slock-AF_UNIX"     , "slock-AF_INET"     ,
229   "slock-AF_AX25"  , "slock-AF_IPX"      , "slock-AF_APPLETALK",
230   "slock-AF_NETROM", "slock-AF_BRIDGE"   , "slock-AF_ATMPVC"   ,
231   "slock-AF_X25"   , "slock-AF_INET6"    , "slock-AF_ROSE"     ,
232   "slock-AF_DECnet", "slock-AF_NETBEUI"  , "slock-AF_SECURITY" ,
233   "slock-AF_KEY"   , "slock-AF_NETLINK"  , "slock-AF_PACKET"   ,
234   "slock-AF_ASH"   , "slock-AF_ECONET"   , "slock-AF_ATMSVC"   ,
235   "slock-AF_RDS"   , "slock-AF_SNA"      , "slock-AF_IRDA"     ,
236   "slock-AF_PPPOX" , "slock-AF_WANPIPE"  , "slock-AF_LLC"      ,
237   "slock-27"       , "slock-28"          , "slock-AF_CAN"      ,
238   "slock-AF_TIPC"  , "slock-AF_BLUETOOTH", "slock-AF_IUCV"     ,
239   "slock-AF_RXRPC" , "slock-AF_ISDN"     , "slock-AF_PHONET"   ,
240   "slock-AF_IEEE802154", "slock-AF_CAIF" , "slock-AF_ALG"      ,
241   "slock-AF_NFC"   , "slock-AF_VSOCK"    , "slock-AF_KCM"      ,
242   "slock-AF_MAX"
243 };
244 static const char *const af_family_clock_key_strings[AF_MAX+1] = {
245   "clock-AF_UNSPEC", "clock-AF_UNIX"     , "clock-AF_INET"     ,
246   "clock-AF_AX25"  , "clock-AF_IPX"      , "clock-AF_APPLETALK",
247   "clock-AF_NETROM", "clock-AF_BRIDGE"   , "clock-AF_ATMPVC"   ,
248   "clock-AF_X25"   , "clock-AF_INET6"    , "clock-AF_ROSE"     ,
249   "clock-AF_DECnet", "clock-AF_NETBEUI"  , "clock-AF_SECURITY" ,
250   "clock-AF_KEY"   , "clock-AF_NETLINK"  , "clock-AF_PACKET"   ,
251   "clock-AF_ASH"   , "clock-AF_ECONET"   , "clock-AF_ATMSVC"   ,
252   "clock-AF_RDS"   , "clock-AF_SNA"      , "clock-AF_IRDA"     ,
253   "clock-AF_PPPOX" , "clock-AF_WANPIPE"  , "clock-AF_LLC"      ,
254   "clock-27"       , "clock-28"          , "clock-AF_CAN"      ,
255   "clock-AF_TIPC"  , "clock-AF_BLUETOOTH", "clock-AF_IUCV"     ,
256   "clock-AF_RXRPC" , "clock-AF_ISDN"     , "clock-AF_PHONET"   ,
257   "clock-AF_IEEE802154", "clock-AF_CAIF" , "clock-AF_ALG"      ,
258   "clock-AF_NFC"   , "clock-AF_VSOCK"    , "clock-AF_KCM"      ,
259   "clock-AF_MAX"
260 };
261 
262 /*
263  * sk_callback_lock locking rules are per-address-family,
264  * so split the lock classes by using a per-AF key:
265  */
266 static struct lock_class_key af_callback_keys[AF_MAX];
267 
268 /* Take into account the size of the struct sk_buff overhead when
269  * determining these values, since that overhead is not constant across
270  * platforms.  This keeps socket queueing behavior and performance
271  * independent of such differences.
272  */
273 #define _SK_MEM_PACKETS		256
274 #define _SK_MEM_OVERHEAD	SKB_TRUESIZE(256)
275 #define SK_WMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
276 #define SK_RMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
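
/* For orientation (illustrative, not part of the original source): the
 * defaults below expand to
 *
 *	SK_WMEM_MAX = SK_RMEM_MAX = SKB_TRUESIZE(256) * 256
 *
 * i.e. room for _SK_MEM_PACKETS packets of 256 bytes of payload each, where
 * SKB_TRUESIZE() adds the platform-dependent struct sk_buff and
 * struct skb_shared_info overhead.  On a typical 64-bit build that overhead
 * is a few hundred bytes per packet, putting the defaults in the low
 * hundreds of kilobytes.
 */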
277 
278 /* Run time adjustable parameters. */
279 __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
280 EXPORT_SYMBOL(sysctl_wmem_max);
281 __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
282 EXPORT_SYMBOL(sysctl_rmem_max);
283 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
284 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
285 
286 /* Maximal space eaten by iovec or ancillary data plus some space */
287 int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
288 EXPORT_SYMBOL(sysctl_optmem_max);
289 
290 int sysctl_tstamp_allow_data __read_mostly = 1;
291 
292 struct static_key memalloc_socks = STATIC_KEY_INIT_FALSE;
293 EXPORT_SYMBOL_GPL(memalloc_socks);
294 
295 /**
296  * sk_set_memalloc - sets %SOCK_MEMALLOC
297  * @sk: socket to set it on
298  *
299  * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
300  * It's the responsibility of the admin to adjust min_free_kbytes
301  * to meet the requirements.
302  */
303 void sk_set_memalloc(struct sock *sk)
304 {
305 	sock_set_flag(sk, SOCK_MEMALLOC);
306 	sk->sk_allocation |= __GFP_MEMALLOC;
307 	static_key_slow_inc(&memalloc_socks);
308 }
309 EXPORT_SYMBOL_GPL(sk_set_memalloc);
310 
311 void sk_clear_memalloc(struct sock *sk)
312 {
313 	sock_reset_flag(sk, SOCK_MEMALLOC);
314 	sk->sk_allocation &= ~__GFP_MEMALLOC;
315 	static_key_slow_dec(&memalloc_socks);
316 
317 	/*
318 	 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
319 	 * progress of swapping. SOCK_MEMALLOC may be cleared while
320 	 * it has rmem allocations due to the last swapfile being deactivated
321 	 * but there is a risk that the socket is unusable due to exceeding
322 	 * the rmem limits. Reclaim the reserves and obey rmem limits again.
323 	 */
324 	sk_mem_reclaim(sk);
325 }
326 EXPORT_SYMBOL_GPL(sk_clear_memalloc);
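
/* Rough usage pattern for the pair above (a sketch; the xprt/swapon names are
 * hypothetical).  A network filesystem backing a swapfile marks its transport
 * socket so it may dip into the emergency reserves, and clears the flag once
 * the swapfile is deactivated:
 *
 *	sk_set_memalloc(xprt->sock->sk);	(at swapon time)
 *	...
 *	sk_clear_memalloc(xprt->sock->sk);	(at swapoff time)
 */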
327 
328 int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
329 {
330 	int ret;
331 	unsigned long pflags = current->flags;
332 
333 	/* these should have been dropped before queueing */
334 	BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
335 
336 	current->flags |= PF_MEMALLOC;
337 	ret = sk->sk_backlog_rcv(sk, skb);
338 	tsk_restore_flags(current, pflags, PF_MEMALLOC);
339 
340 	return ret;
341 }
342 EXPORT_SYMBOL(__sk_backlog_rcv);
343 
344 static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
345 {
346 	struct timeval tv;
347 
348 	if (optlen < sizeof(tv))
349 		return -EINVAL;
350 	if (copy_from_user(&tv, optval, sizeof(tv)))
351 		return -EFAULT;
352 	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
353 		return -EDOM;
354 
355 	if (tv.tv_sec < 0) {
356 		static int warned __read_mostly;
357 
358 		*timeo_p = 0;
359 		if (warned < 10 && net_ratelimit()) {
360 			warned++;
361 			pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
362 				__func__, current->comm, task_pid_nr(current));
363 		}
364 		return 0;
365 	}
366 	*timeo_p = MAX_SCHEDULE_TIMEOUT;
367 	if (tv.tv_sec == 0 && tv.tv_usec == 0)
368 		return 0;
369 	if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
370 		*timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
371 	return 0;
372 }
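
/* Illustrative userspace counterpart of the timeout format parsed above
 * (assuming a hypothetical, already-created socket fd and the usual
 * <sys/socket.h> definitions):
 *
 *	struct timeval tv = { .tv_sec = 2, .tv_usec = 500000 };
 *
 *	if (setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv)) < 0)
 *		perror("setsockopt(SO_RCVTIMEO)");
 *
 * A zero timeval means "no timeout" (MAX_SCHEDULE_TIMEOUT), a negative tv_sec
 * is clamped to a zero timeout with the rate-limited warning above, and a
 * tv_usec outside [0, USEC_PER_SEC) yields -EDOM.
 */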
373 
374 static void sock_warn_obsolete_bsdism(const char *name)
375 {
376 	static int warned;
377 	static char warncomm[TASK_COMM_LEN];
378 	if (strcmp(warncomm, current->comm) && warned < 5) {
379 		strcpy(warncomm,  current->comm);
380 		pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n",
381 			warncomm, name);
382 		warned++;
383 	}
384 }
385 
386 static bool sock_needs_netstamp(const struct sock *sk)
387 {
388 	switch (sk->sk_family) {
389 	case AF_UNSPEC:
390 	case AF_UNIX:
391 		return false;
392 	default:
393 		return true;
394 	}
395 }
396 
397 static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
398 {
399 	if (sk->sk_flags & flags) {
400 		sk->sk_flags &= ~flags;
401 		if (sock_needs_netstamp(sk) &&
402 		    !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
403 			net_disable_timestamp();
404 	}
405 }
406 
407 
408 int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
409 {
410 	unsigned long flags;
411 	struct sk_buff_head *list = &sk->sk_receive_queue;
412 
413 	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
414 		atomic_inc(&sk->sk_drops);
415 		trace_sock_rcvqueue_full(sk, skb);
416 		return -ENOMEM;
417 	}
418 
419 	if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
420 		atomic_inc(&sk->sk_drops);
421 		return -ENOBUFS;
422 	}
423 
424 	skb->dev = NULL;
425 	skb_set_owner_r(skb, sk);
426 
427 	/* We escape from the RCU-protected region, so make sure we don't leak
428 	 * a non-refcounted dst.
429 	 */
430 	skb_dst_force(skb);
431 
432 	spin_lock_irqsave(&list->lock, flags);
433 	sock_skb_set_dropcount(sk, skb);
434 	__skb_queue_tail(list, skb);
435 	spin_unlock_irqrestore(&list->lock, flags);
436 
437 	if (!sock_flag(sk, SOCK_DEAD))
438 		sk->sk_data_ready(sk);
439 	return 0;
440 }
441 EXPORT_SYMBOL(__sock_queue_rcv_skb);
442 
443 int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
444 {
445 	int err;
446 
447 	err = sk_filter(sk, skb);
448 	if (err)
449 		return err;
450 
451 	return __sock_queue_rcv_skb(sk, skb);
452 }
453 EXPORT_SYMBOL(sock_queue_rcv_skb);
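
/* Typical protocol receive-path use of the helper above (a sketch; the caller
 * shown is hypothetical).  sock_queue_rcv_skb() runs the attached socket
 * filter first, then charges the skb to the socket and wakes any readers via
 * sk_data_ready():
 *
 *	if (sock_queue_rcv_skb(sk, skb) < 0) {
 *		kfree_skb(skb);
 *		return NET_RX_DROP;
 *	}
 *	return NET_RX_SUCCESS;
 */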
454 
455 int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
456 		     const int nested, unsigned int trim_cap)
457 {
458 	int rc = NET_RX_SUCCESS;
459 
460 	if (sk_filter_trim_cap(sk, skb, trim_cap))
461 		goto discard_and_relse;
462 
463 	skb->dev = NULL;
464 
465 	if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
466 		atomic_inc(&sk->sk_drops);
467 		goto discard_and_relse;
468 	}
469 	if (nested)
470 		bh_lock_sock_nested(sk);
471 	else
472 		bh_lock_sock(sk);
473 	if (!sock_owned_by_user(sk)) {
474 		/*
475 		 * trylock + unlock semantics:
476 		 */
477 		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
478 
479 		rc = sk_backlog_rcv(sk, skb);
480 
481 		mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
482 	} else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
483 		bh_unlock_sock(sk);
484 		atomic_inc(&sk->sk_drops);
485 		goto discard_and_relse;
486 	}
487 
488 	bh_unlock_sock(sk);
489 out:
490 	sock_put(sk);
491 	return rc;
492 discard_and_relse:
493 	kfree_skb(skb);
494 	goto out;
495 }
496 EXPORT_SYMBOL(__sk_receive_skb);
497 
498 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
499 {
500 	struct dst_entry *dst = __sk_dst_get(sk);
501 
502 	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
503 		sk_tx_queue_clear(sk);
504 		RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
505 		dst_release(dst);
506 		return NULL;
507 	}
508 
509 	return dst;
510 }
511 EXPORT_SYMBOL(__sk_dst_check);
512 
513 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
514 {
515 	struct dst_entry *dst = sk_dst_get(sk);
516 
517 	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
518 		sk_dst_reset(sk);
519 		dst_release(dst);
520 		return NULL;
521 	}
522 
523 	return dst;
524 }
525 EXPORT_SYMBOL(sk_dst_check);
526 
527 static int sock_setbindtodevice(struct sock *sk, char __user *optval,
528 				int optlen)
529 {
530 	int ret = -ENOPROTOOPT;
531 #ifdef CONFIG_NETDEVICES
532 	struct net *net = sock_net(sk);
533 	char devname[IFNAMSIZ];
534 	int index;
535 
536 	/* Sorry... */
537 	ret = -EPERM;
538 	if (!ns_capable(net->user_ns, CAP_NET_RAW))
539 		goto out;
540 
541 	ret = -EINVAL;
542 	if (optlen < 0)
543 		goto out;
544 
545 	/* Bind this socket to a particular device like "eth0",
546 	 * as specified in the passed interface name. If the
547 	 * name is "" or the option length is zero the socket
548 	 * is not bound.
549 	 */
550 	if (optlen > IFNAMSIZ - 1)
551 		optlen = IFNAMSIZ - 1;
552 	memset(devname, 0, sizeof(devname));
553 
554 	ret = -EFAULT;
555 	if (copy_from_user(devname, optval, optlen))
556 		goto out;
557 
558 	index = 0;
559 	if (devname[0] != '\0') {
560 		struct net_device *dev;
561 
562 		rcu_read_lock();
563 		dev = dev_get_by_name_rcu(net, devname);
564 		if (dev)
565 			index = dev->ifindex;
566 		rcu_read_unlock();
567 		ret = -ENODEV;
568 		if (!dev)
569 			goto out;
570 	}
571 
572 	lock_sock(sk);
573 	sk->sk_bound_dev_if = index;
574 	sk_dst_reset(sk);
575 	release_sock(sk);
576 
577 	ret = 0;
578 
579 out:
580 #endif
581 
582 	return ret;
583 }
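
/* Illustrative userspace counterpart (assuming a hypothetical socket fd and
 * CAP_NET_RAW in the socket's network namespace):
 *
 *	const char ifname[] = "eth0";
 *
 *	if (setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE,
 *		       ifname, sizeof(ifname)) < 0)
 *		perror("setsockopt(SO_BINDTODEVICE)");
 *
 * Passing an empty name (or a zero option length) removes the binding again.
 */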
584 
585 static int sock_getbindtodevice(struct sock *sk, char __user *optval,
586 				int __user *optlen, int len)
587 {
588 	int ret = -ENOPROTOOPT;
589 #ifdef CONFIG_NETDEVICES
590 	struct net *net = sock_net(sk);
591 	char devname[IFNAMSIZ];
592 
593 	if (sk->sk_bound_dev_if == 0) {
594 		len = 0;
595 		goto zero;
596 	}
597 
598 	ret = -EINVAL;
599 	if (len < IFNAMSIZ)
600 		goto out;
601 
602 	ret = netdev_get_name(net, devname, sk->sk_bound_dev_if);
603 	if (ret)
604 		goto out;
605 
606 	len = strlen(devname) + 1;
607 
608 	ret = -EFAULT;
609 	if (copy_to_user(optval, devname, len))
610 		goto out;
611 
612 zero:
613 	ret = -EFAULT;
614 	if (put_user(len, optlen))
615 		goto out;
616 
617 	ret = 0;
618 
619 out:
620 #endif
621 
622 	return ret;
623 }
624 
625 static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
626 {
627 	if (valbool)
628 		sock_set_flag(sk, bit);
629 	else
630 		sock_reset_flag(sk, bit);
631 }
632 
633 bool sk_mc_loop(struct sock *sk)
634 {
635 	if (dev_recursion_level())
636 		return false;
637 	if (!sk)
638 		return true;
639 	switch (sk->sk_family) {
640 	case AF_INET:
641 		return inet_sk(sk)->mc_loop;
642 #if IS_ENABLED(CONFIG_IPV6)
643 	case AF_INET6:
644 		return inet6_sk(sk)->mc_loop;
645 #endif
646 	}
647 	WARN_ON(1);
648 	return true;
649 }
650 EXPORT_SYMBOL(sk_mc_loop);
651 
652 /*
653  *	This is meant for all protocols to use and covers goings on
654  *	at the socket level. Everything here is generic.
655  */
656 
657 int sock_setsockopt(struct socket *sock, int level, int optname,
658 		    char __user *optval, unsigned int optlen)
659 {
660 	struct sock *sk = sock->sk;
661 	int val;
662 	int valbool;
663 	struct linger ling;
664 	int ret = 0;
665 
666 	/*
667 	 *	Options without arguments
668 	 */
669 
670 	if (optname == SO_BINDTODEVICE)
671 		return sock_setbindtodevice(sk, optval, optlen);
672 
673 	if (optlen < sizeof(int))
674 		return -EINVAL;
675 
676 	if (get_user(val, (int __user *)optval))
677 		return -EFAULT;
678 
679 	valbool = val ? 1 : 0;
680 
681 	lock_sock(sk);
682 
683 	switch (optname) {
684 	case SO_DEBUG:
685 		if (val && !capable(CAP_NET_ADMIN))
686 			ret = -EACCES;
687 		else
688 			sock_valbool_flag(sk, SOCK_DBG, valbool);
689 		break;
690 	case SO_REUSEADDR:
691 		sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
692 		break;
693 	case SO_REUSEPORT:
694 		sk->sk_reuseport = valbool;
695 		break;
696 	case SO_TYPE:
697 	case SO_PROTOCOL:
698 	case SO_DOMAIN:
699 	case SO_ERROR:
700 		ret = -ENOPROTOOPT;
701 		break;
702 	case SO_DONTROUTE:
703 		sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
704 		break;
705 	case SO_BROADCAST:
706 		sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
707 		break;
708 	case SO_SNDBUF:
709 		/* Don't error on this; BSD doesn't, and if you think
710 		 * about it, this is right. Otherwise apps have to
711 		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
712 		 * are treated in BSD as hints.
713 		 */
714 		val = min_t(u32, val, sysctl_wmem_max);
715 set_sndbuf:
716 		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
717 		sk->sk_sndbuf = max_t(u32, val * 2, SOCK_MIN_SNDBUF);
718 		/* Wake up sending tasks if we upped the value. */
719 		sk->sk_write_space(sk);
720 		break;
721 
722 	case SO_SNDBUFFORCE:
723 		if (!capable(CAP_NET_ADMIN)) {
724 			ret = -EPERM;
725 			break;
726 		}
727 		goto set_sndbuf;
728 
729 	case SO_RCVBUF:
730 		/* Don't error on this; BSD doesn't, and if you think
731 		 * about it, this is right. Otherwise apps have to
732 		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
733 		 * are treated in BSD as hints.
734 		 */
735 		val = min_t(u32, val, sysctl_rmem_max);
736 set_rcvbuf:
737 		sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
738 		/*
739 		 * We double it on the way in to account for
740 		 * "struct sk_buff" etc. overhead.   Applications
741 		 * assume that the SO_RCVBUF setting they make will
742 		 * allow that much actual data to be received on that
743 		 * socket.
744 		 *
745 		 * Applications are unaware that "struct sk_buff" and
746 		 * other overheads allocate from the receive buffer
747 		 * during socket buffer allocation.
748 		 *
749 		 * And after considering the possible alternatives,
750 		 * returning the value we actually used in getsockopt
751 		 * is the most desirable behavior.
752 		 */
753 		sk->sk_rcvbuf = max_t(u32, val * 2, SOCK_MIN_RCVBUF);
754 		break;
755 
756 	case SO_RCVBUFFORCE:
757 		if (!capable(CAP_NET_ADMIN)) {
758 			ret = -EPERM;
759 			break;
760 		}
761 		goto set_rcvbuf;
762 
763 	case SO_KEEPALIVE:
764 #ifdef CONFIG_INET
765 		if (sk->sk_protocol == IPPROTO_TCP &&
766 		    sk->sk_type == SOCK_STREAM)
767 			tcp_set_keepalive(sk, valbool);
768 #endif
769 		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
770 		break;
771 
772 	case SO_OOBINLINE:
773 		sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
774 		break;
775 
776 	case SO_NO_CHECK:
777 		sk->sk_no_check_tx = valbool;
778 		break;
779 
780 	case SO_PRIORITY:
781 		if ((val >= 0 && val <= 6) ||
782 		    ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
783 			sk->sk_priority = val;
784 		else
785 			ret = -EPERM;
786 		break;
787 
788 	case SO_LINGER:
789 		if (optlen < sizeof(ling)) {
790 			ret = -EINVAL;	/* 1003.1g */
791 			break;
792 		}
793 		if (copy_from_user(&ling, optval, sizeof(ling))) {
794 			ret = -EFAULT;
795 			break;
796 		}
797 		if (!ling.l_onoff)
798 			sock_reset_flag(sk, SOCK_LINGER);
799 		else {
800 #if (BITS_PER_LONG == 32)
801 			if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
802 				sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
803 			else
804 #endif
805 				sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
806 			sock_set_flag(sk, SOCK_LINGER);
807 		}
808 		break;
809 
810 	case SO_BSDCOMPAT:
811 		sock_warn_obsolete_bsdism("setsockopt");
812 		break;
813 
814 	case SO_PASSCRED:
815 		if (valbool)
816 			set_bit(SOCK_PASSCRED, &sock->flags);
817 		else
818 			clear_bit(SOCK_PASSCRED, &sock->flags);
819 		break;
820 
821 	case SO_TIMESTAMP:
822 	case SO_TIMESTAMPNS:
823 		if (valbool)  {
824 			if (optname == SO_TIMESTAMP)
825 				sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
826 			else
827 				sock_set_flag(sk, SOCK_RCVTSTAMPNS);
828 			sock_set_flag(sk, SOCK_RCVTSTAMP);
829 			sock_enable_timestamp(sk, SOCK_TIMESTAMP);
830 		} else {
831 			sock_reset_flag(sk, SOCK_RCVTSTAMP);
832 			sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
833 		}
834 		break;
835 
836 	case SO_TIMESTAMPING:
837 		if (val & ~SOF_TIMESTAMPING_MASK) {
838 			ret = -EINVAL;
839 			break;
840 		}
841 
842 		if (val & SOF_TIMESTAMPING_OPT_ID &&
843 		    !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
844 			if (sk->sk_protocol == IPPROTO_TCP &&
845 			    sk->sk_type == SOCK_STREAM) {
846 				if ((1 << sk->sk_state) &
847 				    (TCPF_CLOSE | TCPF_LISTEN)) {
848 					ret = -EINVAL;
849 					break;
850 				}
851 				sk->sk_tskey = tcp_sk(sk)->snd_una;
852 			} else {
853 				sk->sk_tskey = 0;
854 			}
855 		}
856 		sk->sk_tsflags = val;
857 		if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
858 			sock_enable_timestamp(sk,
859 					      SOCK_TIMESTAMPING_RX_SOFTWARE);
860 		else
861 			sock_disable_timestamp(sk,
862 					       (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
863 		break;
864 
865 	case SO_RCVLOWAT:
866 		if (val < 0)
867 			val = INT_MAX;
868 		sk->sk_rcvlowat = val ? : 1;
869 		break;
870 
871 	case SO_RCVTIMEO:
872 		ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
873 		break;
874 
875 	case SO_SNDTIMEO:
876 		ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
877 		break;
878 
879 	case SO_ATTACH_FILTER:
880 		ret = -EINVAL;
881 		if (optlen == sizeof(struct sock_fprog)) {
882 			struct sock_fprog fprog;
883 
884 			ret = -EFAULT;
885 			if (copy_from_user(&fprog, optval, sizeof(fprog)))
886 				break;
887 
888 			ret = sk_attach_filter(&fprog, sk);
889 		}
890 		break;
891 
892 	case SO_ATTACH_BPF:
893 		ret = -EINVAL;
894 		if (optlen == sizeof(u32)) {
895 			u32 ufd;
896 
897 			ret = -EFAULT;
898 			if (copy_from_user(&ufd, optval, sizeof(ufd)))
899 				break;
900 
901 			ret = sk_attach_bpf(ufd, sk);
902 		}
903 		break;
904 
905 	case SO_ATTACH_REUSEPORT_CBPF:
906 		ret = -EINVAL;
907 		if (optlen == sizeof(struct sock_fprog)) {
908 			struct sock_fprog fprog;
909 
910 			ret = -EFAULT;
911 			if (copy_from_user(&fprog, optval, sizeof(fprog)))
912 				break;
913 
914 			ret = sk_reuseport_attach_filter(&fprog, sk);
915 		}
916 		break;
917 
918 	case SO_ATTACH_REUSEPORT_EBPF:
919 		ret = -EINVAL;
920 		if (optlen == sizeof(u32)) {
921 			u32 ufd;
922 
923 			ret = -EFAULT;
924 			if (copy_from_user(&ufd, optval, sizeof(ufd)))
925 				break;
926 
927 			ret = sk_reuseport_attach_bpf(ufd, sk);
928 		}
929 		break;
930 
931 	case SO_DETACH_FILTER:
932 		ret = sk_detach_filter(sk);
933 		break;
934 
935 	case SO_LOCK_FILTER:
936 		if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
937 			ret = -EPERM;
938 		else
939 			sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
940 		break;
941 
942 	case SO_PASSSEC:
943 		if (valbool)
944 			set_bit(SOCK_PASSSEC, &sock->flags);
945 		else
946 			clear_bit(SOCK_PASSSEC, &sock->flags);
947 		break;
948 	case SO_MARK:
949 		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
950 			ret = -EPERM;
951 		else
952 			sk->sk_mark = val;
953 		break;
954 
955 	case SO_RXQ_OVFL:
956 		sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
957 		break;
958 
959 	case SO_WIFI_STATUS:
960 		sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
961 		break;
962 
963 	case SO_PEEK_OFF:
964 		if (sock->ops->set_peek_off)
965 			ret = sock->ops->set_peek_off(sk, val);
966 		else
967 			ret = -EOPNOTSUPP;
968 		break;
969 
970 	case SO_NOFCS:
971 		sock_valbool_flag(sk, SOCK_NOFCS, valbool);
972 		break;
973 
974 	case SO_SELECT_ERR_QUEUE:
975 		sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
976 		break;
977 
978 #ifdef CONFIG_NET_RX_BUSY_POLL
979 	case SO_BUSY_POLL:
980 		/* allow unprivileged users to decrease the value */
981 		if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
982 			ret = -EPERM;
983 		else {
984 			if (val < 0)
985 				ret = -EINVAL;
986 			else
987 				sk->sk_ll_usec = val;
988 		}
989 		break;
990 #endif
991 
992 	case SO_MAX_PACING_RATE:
993 		sk->sk_max_pacing_rate = val;
994 		sk->sk_pacing_rate = min(sk->sk_pacing_rate,
995 					 sk->sk_max_pacing_rate);
996 		break;
997 
998 	case SO_INCOMING_CPU:
999 		sk->sk_incoming_cpu = val;
1000 		break;
1001 
1002 	case SO_CNX_ADVICE:
1003 		if (val == 1)
1004 			dst_negative_advice(sk);
1005 		break;
1006 	default:
1007 		ret = -ENOPROTOOPT;
1008 		break;
1009 	}
1010 	release_sock(sk);
1011 	return ret;
1012 }
1013 EXPORT_SYMBOL(sock_setsockopt);
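
/* Illustrative userspace sketch of the SO_SNDBUF/SO_RCVBUF doubling described
 * above (assuming a hypothetical socket fd):
 *
 *	int val = 65536, out = 0;
 *	socklen_t len = sizeof(out);
 *
 *	setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &val, sizeof(val));
 *	getsockopt(fd, SOL_SOCKET, SO_SNDBUF, &out, &len);
 *
 * out now reports roughly 2 * val: the request is first clamped to
 * sysctl_wmem_max and then doubled to cover struct sk_buff overhead.
 * SO_SNDBUFFORCE/SO_RCVBUFFORCE behave the same but skip the clamp and
 * require CAP_NET_ADMIN.
 */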
1014 
1015 
1016 static void cred_to_ucred(struct pid *pid, const struct cred *cred,
1017 			  struct ucred *ucred)
1018 {
1019 	ucred->pid = pid_vnr(pid);
1020 	ucred->uid = ucred->gid = -1;
1021 	if (cred) {
1022 		struct user_namespace *current_ns = current_user_ns();
1023 
1024 		ucred->uid = from_kuid_munged(current_ns, cred->euid);
1025 		ucred->gid = from_kgid_munged(current_ns, cred->egid);
1026 	}
1027 }
1028 
1029 int sock_getsockopt(struct socket *sock, int level, int optname,
1030 		    char __user *optval, int __user *optlen)
1031 {
1032 	struct sock *sk = sock->sk;
1033 
1034 	union {
1035 		int val;
1036 		struct linger ling;
1037 		struct timeval tm;
1038 	} v;
1039 
1040 	int lv = sizeof(int);
1041 	int len;
1042 
1043 	if (get_user(len, optlen))
1044 		return -EFAULT;
1045 	if (len < 0)
1046 		return -EINVAL;
1047 
1048 	memset(&v, 0, sizeof(v));
1049 
1050 	switch (optname) {
1051 	case SO_DEBUG:
1052 		v.val = sock_flag(sk, SOCK_DBG);
1053 		break;
1054 
1055 	case SO_DONTROUTE:
1056 		v.val = sock_flag(sk, SOCK_LOCALROUTE);
1057 		break;
1058 
1059 	case SO_BROADCAST:
1060 		v.val = sock_flag(sk, SOCK_BROADCAST);
1061 		break;
1062 
1063 	case SO_SNDBUF:
1064 		v.val = sk->sk_sndbuf;
1065 		break;
1066 
1067 	case SO_RCVBUF:
1068 		v.val = sk->sk_rcvbuf;
1069 		break;
1070 
1071 	case SO_REUSEADDR:
1072 		v.val = sk->sk_reuse;
1073 		break;
1074 
1075 	case SO_REUSEPORT:
1076 		v.val = sk->sk_reuseport;
1077 		break;
1078 
1079 	case SO_KEEPALIVE:
1080 		v.val = sock_flag(sk, SOCK_KEEPOPEN);
1081 		break;
1082 
1083 	case SO_TYPE:
1084 		v.val = sk->sk_type;
1085 		break;
1086 
1087 	case SO_PROTOCOL:
1088 		v.val = sk->sk_protocol;
1089 		break;
1090 
1091 	case SO_DOMAIN:
1092 		v.val = sk->sk_family;
1093 		break;
1094 
1095 	case SO_ERROR:
1096 		v.val = -sock_error(sk);
1097 		if (v.val == 0)
1098 			v.val = xchg(&sk->sk_err_soft, 0);
1099 		break;
1100 
1101 	case SO_OOBINLINE:
1102 		v.val = sock_flag(sk, SOCK_URGINLINE);
1103 		break;
1104 
1105 	case SO_NO_CHECK:
1106 		v.val = sk->sk_no_check_tx;
1107 		break;
1108 
1109 	case SO_PRIORITY:
1110 		v.val = sk->sk_priority;
1111 		break;
1112 
1113 	case SO_LINGER:
1114 		lv		= sizeof(v.ling);
1115 		v.ling.l_onoff	= sock_flag(sk, SOCK_LINGER);
1116 		v.ling.l_linger	= sk->sk_lingertime / HZ;
1117 		break;
1118 
1119 	case SO_BSDCOMPAT:
1120 		sock_warn_obsolete_bsdism("getsockopt");
1121 		break;
1122 
1123 	case SO_TIMESTAMP:
1124 		v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1125 				!sock_flag(sk, SOCK_RCVTSTAMPNS);
1126 		break;
1127 
1128 	case SO_TIMESTAMPNS:
1129 		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
1130 		break;
1131 
1132 	case SO_TIMESTAMPING:
1133 		v.val = sk->sk_tsflags;
1134 		break;
1135 
1136 	case SO_RCVTIMEO:
1137 		lv = sizeof(struct timeval);
1138 		if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
1139 			v.tm.tv_sec = 0;
1140 			v.tm.tv_usec = 0;
1141 		} else {
1142 			v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
1143 			v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
1144 		}
1145 		break;
1146 
1147 	case SO_SNDTIMEO:
1148 		lv = sizeof(struct timeval);
1149 		if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
1150 			v.tm.tv_sec = 0;
1151 			v.tm.tv_usec = 0;
1152 		} else {
1153 			v.tm.tv_sec = sk->sk_sndtimeo / HZ;
1154 			v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
1155 		}
1156 		break;
1157 
1158 	case SO_RCVLOWAT:
1159 		v.val = sk->sk_rcvlowat;
1160 		break;
1161 
1162 	case SO_SNDLOWAT:
1163 		v.val = 1;
1164 		break;
1165 
1166 	case SO_PASSCRED:
1167 		v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1168 		break;
1169 
1170 	case SO_PEERCRED:
1171 	{
1172 		struct ucred peercred;
1173 		if (len > sizeof(peercred))
1174 			len = sizeof(peercred);
1175 		cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1176 		if (copy_to_user(optval, &peercred, len))
1177 			return -EFAULT;
1178 		goto lenout;
1179 	}
1180 
1181 	case SO_PEERNAME:
1182 	{
1183 		char address[128];
1184 
1185 		if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
1186 			return -ENOTCONN;
1187 		if (lv < len)
1188 			return -EINVAL;
1189 		if (copy_to_user(optval, address, len))
1190 			return -EFAULT;
1191 		goto lenout;
1192 	}
1193 
1194 	/* Dubious BSD thing... Probably nobody even uses it, but
1195 	 * the UNIX standard wants it for whatever reason... -DaveM
1196 	 */
1197 	case SO_ACCEPTCONN:
1198 		v.val = sk->sk_state == TCP_LISTEN;
1199 		break;
1200 
1201 	case SO_PASSSEC:
1202 		v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1203 		break;
1204 
1205 	case SO_PEERSEC:
1206 		return security_socket_getpeersec_stream(sock, optval, optlen, len);
1207 
1208 	case SO_MARK:
1209 		v.val = sk->sk_mark;
1210 		break;
1211 
1212 	case SO_RXQ_OVFL:
1213 		v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1214 		break;
1215 
1216 	case SO_WIFI_STATUS:
1217 		v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1218 		break;
1219 
1220 	case SO_PEEK_OFF:
1221 		if (!sock->ops->set_peek_off)
1222 			return -EOPNOTSUPP;
1223 
1224 		v.val = sk->sk_peek_off;
1225 		break;
1226 	case SO_NOFCS:
1227 		v.val = sock_flag(sk, SOCK_NOFCS);
1228 		break;
1229 
1230 	case SO_BINDTODEVICE:
1231 		return sock_getbindtodevice(sk, optval, optlen, len);
1232 
1233 	case SO_GET_FILTER:
1234 		len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
1235 		if (len < 0)
1236 			return len;
1237 
1238 		goto lenout;
1239 
1240 	case SO_LOCK_FILTER:
1241 		v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1242 		break;
1243 
1244 	case SO_BPF_EXTENSIONS:
1245 		v.val = bpf_tell_extensions();
1246 		break;
1247 
1248 	case SO_SELECT_ERR_QUEUE:
1249 		v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1250 		break;
1251 
1252 #ifdef CONFIG_NET_RX_BUSY_POLL
1253 	case SO_BUSY_POLL:
1254 		v.val = sk->sk_ll_usec;
1255 		break;
1256 #endif
1257 
1258 	case SO_MAX_PACING_RATE:
1259 		v.val = sk->sk_max_pacing_rate;
1260 		break;
1261 
1262 	case SO_INCOMING_CPU:
1263 		v.val = sk->sk_incoming_cpu;
1264 		break;
1265 
1266 	default:
1267 		/* We implement the SO_SNDLOWAT etc to not be settable
1268 		 * (1003.1g 7).
1269 		 */
1270 		return -ENOPROTOOPT;
1271 	}
1272 
1273 	if (len > lv)
1274 		len = lv;
1275 	if (copy_to_user(optval, &v, len))
1276 		return -EFAULT;
1277 lenout:
1278 	if (put_user(len, optlen))
1279 		return -EFAULT;
1280 	return 0;
1281 }
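
/* Illustrative userspace retrieval of one of the options above, e.g. the peer
 * credentials of a hypothetical connected AF_UNIX socket fd (struct ucred
 * needs _GNU_SOURCE in userspace):
 *
 *	struct ucred peer;
 *	socklen_t len = sizeof(peer);
 *
 *	if (getsockopt(fd, SOL_SOCKET, SO_PEERCRED, &peer, &len) == 0)
 *		printf("pid=%d uid=%d gid=%d\n", peer.pid, peer.uid, peer.gid);
 */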
1282 
1283 /*
1284  * Initialize an sk_lock.
1285  *
1286  * (We also register the sk_lock with the lock validator.)
1287  */
1288 static inline void sock_lock_init(struct sock *sk)
1289 {
1290 	sock_lock_init_class_and_name(sk,
1291 			af_family_slock_key_strings[sk->sk_family],
1292 			af_family_slock_keys + sk->sk_family,
1293 			af_family_key_strings[sk->sk_family],
1294 			af_family_keys + sk->sk_family);
1295 }
1296 
1297 /*
1298  * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
1299  * even temporarily, because of RCU lookups. sk_node should also be left as-is.
1300  * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end.
1301  */
1302 static void sock_copy(struct sock *nsk, const struct sock *osk)
1303 {
1304 #ifdef CONFIG_SECURITY_NETWORK
1305 	void *sptr = nsk->sk_security;
1306 #endif
1307 	memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1308 
1309 	memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1310 	       osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1311 
1312 #ifdef CONFIG_SECURITY_NETWORK
1313 	nsk->sk_security = sptr;
1314 	security_sk_clone(osk, nsk);
1315 #endif
1316 }
1317 
1318 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1319 		int family)
1320 {
1321 	struct sock *sk;
1322 	struct kmem_cache *slab;
1323 
1324 	slab = prot->slab;
1325 	if (slab != NULL) {
1326 		sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1327 		if (!sk)
1328 			return sk;
1329 		if (priority & __GFP_ZERO)
1330 			sk_prot_clear_nulls(sk, prot->obj_size);
1331 	} else
1332 		sk = kmalloc(prot->obj_size, priority);
1333 
1334 	if (sk != NULL) {
1335 		kmemcheck_annotate_bitfield(sk, flags);
1336 
1337 		if (security_sk_alloc(sk, family, priority))
1338 			goto out_free;
1339 
1340 		if (!try_module_get(prot->owner))
1341 			goto out_free_sec;
1342 		sk_tx_queue_clear(sk);
1343 	}
1344 
1345 	return sk;
1346 
1347 out_free_sec:
1348 	security_sk_free(sk);
1349 out_free:
1350 	if (slab != NULL)
1351 		kmem_cache_free(slab, sk);
1352 	else
1353 		kfree(sk);
1354 	return NULL;
1355 }
1356 
1357 static void sk_prot_free(struct proto *prot, struct sock *sk)
1358 {
1359 	struct kmem_cache *slab;
1360 	struct module *owner;
1361 
1362 	owner = prot->owner;
1363 	slab = prot->slab;
1364 
1365 	cgroup_sk_free(&sk->sk_cgrp_data);
1366 	mem_cgroup_sk_free(sk);
1367 	security_sk_free(sk);
1368 	if (slab != NULL)
1369 		kmem_cache_free(slab, sk);
1370 	else
1371 		kfree(sk);
1372 	module_put(owner);
1373 }
1374 
1375 /**
1376  *	sk_alloc - All socket objects are allocated here
1377  *	@net: the applicable net namespace
1378  *	@family: protocol family
1379  *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1380  *	@prot: struct proto associated with this new sock instance
1381  *	@kern: is this to be a kernel socket?
1382  */
1383 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1384 		      struct proto *prot, int kern)
1385 {
1386 	struct sock *sk;
1387 
1388 	sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1389 	if (sk) {
1390 		sk->sk_family = family;
1391 		/*
1392 		 * See comment in struct sock definition to understand
1393 		 * why we need sk_prot_creator -acme
1394 		 */
1395 		sk->sk_prot = sk->sk_prot_creator = prot;
1396 		sock_lock_init(sk);
1397 		sk->sk_net_refcnt = kern ? 0 : 1;
1398 		if (likely(sk->sk_net_refcnt))
1399 			get_net(net);
1400 		sock_net_set(sk, net);
1401 		atomic_set(&sk->sk_wmem_alloc, 1);
1402 
1403 		mem_cgroup_sk_alloc(sk);
1404 		cgroup_sk_alloc(&sk->sk_cgrp_data);
1405 		sock_update_classid(&sk->sk_cgrp_data);
1406 		sock_update_netprioidx(&sk->sk_cgrp_data);
1407 	}
1408 
1409 	return sk;
1410 }
1411 EXPORT_SYMBOL(sk_alloc);
1412 
1413 /* Sockets having SOCK_RCU_FREE will call this function after one RCU
1414  * grace period. This is the case for UDP sockets and TCP listeners.
1415  */
1416 static void __sk_destruct(struct rcu_head *head)
1417 {
1418 	struct sock *sk = container_of(head, struct sock, sk_rcu);
1419 	struct sk_filter *filter;
1420 
1421 	if (sk->sk_destruct)
1422 		sk->sk_destruct(sk);
1423 
1424 	filter = rcu_dereference_check(sk->sk_filter,
1425 				       atomic_read(&sk->sk_wmem_alloc) == 0);
1426 	if (filter) {
1427 		sk_filter_uncharge(sk, filter);
1428 		RCU_INIT_POINTER(sk->sk_filter, NULL);
1429 	}
1430 	if (rcu_access_pointer(sk->sk_reuseport_cb))
1431 		reuseport_detach_sock(sk);
1432 
1433 	sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
1434 
1435 	if (atomic_read(&sk->sk_omem_alloc))
1436 		pr_debug("%s: optmem leakage (%d bytes) detected\n",
1437 			 __func__, atomic_read(&sk->sk_omem_alloc));
1438 
1439 	if (sk->sk_peer_cred)
1440 		put_cred(sk->sk_peer_cred);
1441 	put_pid(sk->sk_peer_pid);
1442 	if (likely(sk->sk_net_refcnt))
1443 		put_net(sock_net(sk));
1444 	sk_prot_free(sk->sk_prot_creator, sk);
1445 }
1446 
1447 void sk_destruct(struct sock *sk)
1448 {
1449 	if (sock_flag(sk, SOCK_RCU_FREE))
1450 		call_rcu(&sk->sk_rcu, __sk_destruct);
1451 	else
1452 		__sk_destruct(&sk->sk_rcu);
1453 }
1454 
1455 static void __sk_free(struct sock *sk)
1456 {
1457 	if (unlikely(sock_diag_has_destroy_listeners(sk) && sk->sk_net_refcnt))
1458 		sock_diag_broadcast_destroy(sk);
1459 	else
1460 		sk_destruct(sk);
1461 }
1462 
1463 void sk_free(struct sock *sk)
1464 {
1465 	/*
1466 	 * We subtract one from sk_wmem_alloc; if the result is not zero,
1467 	 * some packets are still in a tx queue and sock_wfree() will
1468 	 * call __sk_free(sk) later, once they are freed.
1469 	 */
1470 	if (atomic_dec_and_test(&sk->sk_wmem_alloc))
1471 		__sk_free(sk);
1472 }
1473 EXPORT_SYMBOL(sk_free);
1474 
1475 /**
1476  *	sk_clone_lock - clone a socket, and lock its clone
1477  *	@sk: the socket to clone
1478  *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1479  *
1480  *	Caller must unlock socket even in error path (bh_unlock_sock(newsk))
1481  */
1482 struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
1483 {
1484 	struct sock *newsk;
1485 	bool is_charged = true;
1486 
1487 	newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
1488 	if (newsk != NULL) {
1489 		struct sk_filter *filter;
1490 
1491 		sock_copy(newsk, sk);
1492 
1493 		/* SANITY */
1494 		if (likely(newsk->sk_net_refcnt))
1495 			get_net(sock_net(newsk));
1496 		sk_node_init(&newsk->sk_node);
1497 		sock_lock_init(newsk);
1498 		bh_lock_sock(newsk);
1499 		newsk->sk_backlog.head	= newsk->sk_backlog.tail = NULL;
1500 		newsk->sk_backlog.len = 0;
1501 
1502 		atomic_set(&newsk->sk_rmem_alloc, 0);
1503 		/*
1504 		 * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
1505 		 */
1506 		atomic_set(&newsk->sk_wmem_alloc, 1);
1507 		atomic_set(&newsk->sk_omem_alloc, 0);
1508 		skb_queue_head_init(&newsk->sk_receive_queue);
1509 		skb_queue_head_init(&newsk->sk_write_queue);
1510 
1511 		rwlock_init(&newsk->sk_callback_lock);
1512 		lockdep_set_class_and_name(&newsk->sk_callback_lock,
1513 				af_callback_keys + newsk->sk_family,
1514 				af_family_clock_key_strings[newsk->sk_family]);
1515 
1516 		newsk->sk_dst_cache	= NULL;
1517 		newsk->sk_wmem_queued	= 0;
1518 		newsk->sk_forward_alloc = 0;
1519 		atomic_set(&newsk->sk_drops, 0);
1520 		newsk->sk_send_head	= NULL;
1521 		newsk->sk_userlocks	= sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
1522 
1523 		sock_reset_flag(newsk, SOCK_DONE);
1524 		skb_queue_head_init(&newsk->sk_error_queue);
1525 
1526 		filter = rcu_dereference_protected(newsk->sk_filter, 1);
1527 		if (filter != NULL)
1528 			/* Though it's an empty new sock, the charging may fail
1529 			 * if sysctl_optmem_max was changed between the creation of
1530 			 * the original socket and the cloning.
1531 			 */
1532 			is_charged = sk_filter_charge(newsk, filter);
1533 
1534 		if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
1535 			/* It is still a raw copy of the parent, so invalidate
1536 			 * the destructor and do a plain sk_free(). */
1537 			newsk->sk_destruct = NULL;
1538 			bh_unlock_sock(newsk);
1539 			sk_free(newsk);
1540 			newsk = NULL;
1541 			goto out;
1542 		}
1543 		RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
1544 
1545 		newsk->sk_err	   = 0;
1546 		newsk->sk_priority = 0;
1547 		newsk->sk_incoming_cpu = raw_smp_processor_id();
1548 		atomic64_set(&newsk->sk_cookie, 0);
1549 
1550 		mem_cgroup_sk_alloc(newsk);
1551 		cgroup_sk_alloc(&newsk->sk_cgrp_data);
1552 
1553 		/*
1554 		 * Before updating sk_refcnt, we must commit prior changes to memory
1555 		 * (Documentation/RCU/rculist_nulls.txt for details)
1556 		 */
1557 		smp_wmb();
1558 		atomic_set(&newsk->sk_refcnt, 2);
1559 
1560 		/*
1561 		 * Increment the counter in the same struct proto as the master
1562 		 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, which
1563 		 * is the same as sk->sk_prot->socks, as this field was copied
1564 		 * with memcpy).
1565 		 *
1566 		 * This _changes_ the previous behaviour, where
1567 		 * tcp_create_openreq_child was always incrementing the
1568 		 * equivalent of tcp_prot->socks (inet_sock_nr), so this has
1569 		 * to be taken into account in all callers. -acme
1570 		 */
1571 		sk_refcnt_debug_inc(newsk);
1572 		sk_set_socket(newsk, NULL);
1573 		newsk->sk_wq = NULL;
1574 
1575 		if (newsk->sk_prot->sockets_allocated)
1576 			sk_sockets_allocated_inc(newsk);
1577 
1578 		if (sock_needs_netstamp(sk) &&
1579 		    newsk->sk_flags & SK_FLAGS_TIMESTAMP)
1580 			net_enable_timestamp();
1581 	}
1582 out:
1583 	return newsk;
1584 }
1585 EXPORT_SYMBOL_GPL(sk_clone_lock);
1586 
1587 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1588 {
1589 	u32 max_segs = 1;
1590 
1591 	sk_dst_set(sk, dst);
1592 	sk->sk_route_caps = dst->dev->features;
1593 	if (sk->sk_route_caps & NETIF_F_GSO)
1594 		sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
1595 	sk->sk_route_caps &= ~sk->sk_route_nocaps;
1596 	if (sk_can_gso(sk)) {
1597 		if (dst->header_len) {
1598 			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1599 		} else {
1600 			sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
1601 			sk->sk_gso_max_size = dst->dev->gso_max_size;
1602 			max_segs = max_t(u32, dst->dev->gso_max_segs, 1);
1603 		}
1604 	}
1605 	sk->sk_gso_max_segs = max_segs;
1606 }
1607 EXPORT_SYMBOL_GPL(sk_setup_caps);
1608 
1609 /*
1610  *	Simple resource managers for sockets.
1611  */
1612 
1613 
1614 /*
1615  * Write buffer destructor automatically called from kfree_skb.
1616  */
1617 void sock_wfree(struct sk_buff *skb)
1618 {
1619 	struct sock *sk = skb->sk;
1620 	unsigned int len = skb->truesize;
1621 
1622 	if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
1623 		/*
1624 		 * Keep a reference on sk_wmem_alloc; it will be released
1625 		 * after the sk_write_space() call.
1626 		 */
1627 		atomic_sub(len - 1, &sk->sk_wmem_alloc);
1628 		sk->sk_write_space(sk);
1629 		len = 1;
1630 	}
1631 	/*
1632 	 * If sk_wmem_alloc reaches 0, we must finish what sk_free()
1633 	 * could not do because of in-flight packets.
1634 	 */
1635 	if (atomic_sub_and_test(len, &sk->sk_wmem_alloc))
1636 		__sk_free(sk);
1637 }
1638 EXPORT_SYMBOL(sock_wfree);
1639 
1640 /* This variant of sock_wfree() is used by TCP,
1641  * since it sets SOCK_USE_WRITE_QUEUE.
1642  */
1643 void __sock_wfree(struct sk_buff *skb)
1644 {
1645 	struct sock *sk = skb->sk;
1646 
1647 	if (atomic_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
1648 		__sk_free(sk);
1649 }
1650 
1651 void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
1652 {
1653 	skb_orphan(skb);
1654 	skb->sk = sk;
1655 #ifdef CONFIG_INET
1656 	if (unlikely(!sk_fullsock(sk))) {
1657 		skb->destructor = sock_edemux;
1658 		sock_hold(sk);
1659 		return;
1660 	}
1661 #endif
1662 	skb->destructor = sock_wfree;
1663 	skb_set_hash_from_sk(skb, sk);
1664 	/*
1665 	 * We used to take a refcount on sk, but the following operation
1666 	 * is enough to guarantee sk_free() won't free this sock until
1667 	 * all in-flight packets are completed.
1668 	 */
1669 	atomic_add(skb->truesize, &sk->sk_wmem_alloc);
1670 }
1671 EXPORT_SYMBOL(skb_set_owner_w);
1672 
1673 /* This helper is used by netem, as it can hold packets in its
1674  * delay queue. We want to allow the owner socket to send more
1675  * packets, as if they were already TX completed by a typical driver.
1676  * But we also want to keep skb->sk set because some packet schedulers
1677  * rely on it (sch_fq for example). So we set skb->truesize to a small
1678  * amount (1) and decrease sk_wmem_alloc accordingly.
1679  */
1680 void skb_orphan_partial(struct sk_buff *skb)
1681 {
1682 	/* If this skb is a TCP pure ACK or already went here,
1683 	 * we have nothing to do. 2 is already a very small truesize.
1684 	 */
1685 	if (skb->truesize <= 2)
1686 		return;
1687 
1688 	/* The TCP stack sets skb->ooo_okay based on sk_wmem_alloc,
1689 	 * so we do not completely orphan the skb, but transfer all
1690 	 * accounted bytes but one, to avoid unexpected reorders.
1691 	 */
1692 	if (skb->destructor == sock_wfree
1693 #ifdef CONFIG_INET
1694 	    || skb->destructor == tcp_wfree
1695 #endif
1696 		) {
1697 		atomic_sub(skb->truesize - 1, &skb->sk->sk_wmem_alloc);
1698 		skb->truesize = 1;
1699 	} else {
1700 		skb_orphan(skb);
1701 	}
1702 }
1703 EXPORT_SYMBOL(skb_orphan_partial);
1704 
1705 /*
1706  * Read buffer destructor automatically called from kfree_skb.
1707  */
1708 void sock_rfree(struct sk_buff *skb)
1709 {
1710 	struct sock *sk = skb->sk;
1711 	unsigned int len = skb->truesize;
1712 
1713 	atomic_sub(len, &sk->sk_rmem_alloc);
1714 	sk_mem_uncharge(sk, len);
1715 }
1716 EXPORT_SYMBOL(sock_rfree);
1717 
1718 /*
1719  * Buffer destructor for skbs that are not used directly in read or write
1720  * path, e.g. for error handler skbs. Automatically called from kfree_skb.
1721  */
1722 void sock_efree(struct sk_buff *skb)
1723 {
1724 	sock_put(skb->sk);
1725 }
1726 EXPORT_SYMBOL(sock_efree);
1727 
1728 kuid_t sock_i_uid(struct sock *sk)
1729 {
1730 	kuid_t uid;
1731 
1732 	read_lock_bh(&sk->sk_callback_lock);
1733 	uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
1734 	read_unlock_bh(&sk->sk_callback_lock);
1735 	return uid;
1736 }
1737 EXPORT_SYMBOL(sock_i_uid);
1738 
1739 unsigned long sock_i_ino(struct sock *sk)
1740 {
1741 	unsigned long ino;
1742 
1743 	read_lock_bh(&sk->sk_callback_lock);
1744 	ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
1745 	read_unlock_bh(&sk->sk_callback_lock);
1746 	return ino;
1747 }
1748 EXPORT_SYMBOL(sock_i_ino);
1749 
1750 /*
1751  * Allocate a skb from the socket's send buffer.
1752  */
1753 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
1754 			     gfp_t priority)
1755 {
1756 	if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1757 		struct sk_buff *skb = alloc_skb(size, priority);
1758 		if (skb) {
1759 			skb_set_owner_w(skb, sk);
1760 			return skb;
1761 		}
1762 	}
1763 	return NULL;
1764 }
1765 EXPORT_SYMBOL(sock_wmalloc);
1766 
1767 /*
1768  * Allocate a memory block from the socket's option memory buffer.
1769  */
1770 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
1771 {
1772 	if ((unsigned int)size <= sysctl_optmem_max &&
1773 	    atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
1774 		void *mem;
1775 		/* First do the add, to avoid the race if kmalloc
1776 		 * might sleep.
1777 		 */
1778 		atomic_add(size, &sk->sk_omem_alloc);
1779 		mem = kmalloc(size, priority);
1780 		if (mem)
1781 			return mem;
1782 		atomic_sub(size, &sk->sk_omem_alloc);
1783 	}
1784 	return NULL;
1785 }
1786 EXPORT_SYMBOL(sock_kmalloc);
1787 
1788 /* Free an option memory block. Note, we actually want the inline
1789  * here as this allows gcc to detect the nullify and fold away the
1790  * condition entirely.
1791  */
1792 static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
1793 				  const bool nullify)
1794 {
1795 	if (WARN_ON_ONCE(!mem))
1796 		return;
1797 	if (nullify)
1798 		kzfree(mem);
1799 	else
1800 		kfree(mem);
1801 	atomic_sub(size, &sk->sk_omem_alloc);
1802 }
1803 
1804 void sock_kfree_s(struct sock *sk, void *mem, int size)
1805 {
1806 	__sock_kfree_s(sk, mem, size, false);
1807 }
1808 EXPORT_SYMBOL(sock_kfree_s);
1809 
1810 void sock_kzfree_s(struct sock *sk, void *mem, int size)
1811 {
1812 	__sock_kfree_s(sk, mem, size, true);
1813 }
1814 EXPORT_SYMBOL(sock_kzfree_s);
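
/* Typical in-kernel use of the option-memory helpers above (a sketch; the
 * surrounding protocol code and struct foo_opt are hypothetical):
 *
 *	struct foo_opt *opt = sock_kmalloc(sk, sizeof(*opt), GFP_KERNEL);
 *
 *	if (!opt)
 *		return -ENOBUFS;
 *	...
 *	sock_kfree_s(sk, opt, sizeof(*opt));
 *
 * The size passed to sock_kfree_s() must match what was charged at allocation
 * time so that sk_omem_alloc stays balanced.
 */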
1815 
1816 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
1817    I think these locks should be removed for datagram sockets.
1818  */
1819 static long sock_wait_for_wmem(struct sock *sk, long timeo)
1820 {
1821 	DEFINE_WAIT(wait);
1822 
1823 	sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
1824 	for (;;) {
1825 		if (!timeo)
1826 			break;
1827 		if (signal_pending(current))
1828 			break;
1829 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1830 		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1831 		if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
1832 			break;
1833 		if (sk->sk_shutdown & SEND_SHUTDOWN)
1834 			break;
1835 		if (sk->sk_err)
1836 			break;
1837 		timeo = schedule_timeout(timeo);
1838 	}
1839 	finish_wait(sk_sleep(sk), &wait);
1840 	return timeo;
1841 }
1842 
1843 
1844 /*
1845  *	Generic send/receive buffer handlers
1846  */
1847 
1848 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
1849 				     unsigned long data_len, int noblock,
1850 				     int *errcode, int max_page_order)
1851 {
1852 	struct sk_buff *skb;
1853 	long timeo;
1854 	int err;
1855 
1856 	timeo = sock_sndtimeo(sk, noblock);
1857 	for (;;) {
1858 		err = sock_error(sk);
1859 		if (err != 0)
1860 			goto failure;
1861 
1862 		err = -EPIPE;
1863 		if (sk->sk_shutdown & SEND_SHUTDOWN)
1864 			goto failure;
1865 
1866 		if (sk_wmem_alloc_get(sk) < sk->sk_sndbuf)
1867 			break;
1868 
1869 		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
1870 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1871 		err = -EAGAIN;
1872 		if (!timeo)
1873 			goto failure;
1874 		if (signal_pending(current))
1875 			goto interrupted;
1876 		timeo = sock_wait_for_wmem(sk, timeo);
1877 	}
1878 	skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
1879 				   errcode, sk->sk_allocation);
1880 	if (skb)
1881 		skb_set_owner_w(skb, sk);
1882 	return skb;
1883 
1884 interrupted:
1885 	err = sock_intr_errno(timeo);
1886 failure:
1887 	*errcode = err;
1888 	return NULL;
1889 }
1890 EXPORT_SYMBOL(sock_alloc_send_pskb);
1891 
1892 struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
1893 				    int noblock, int *errcode)
1894 {
1895 	return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0);
1896 }
1897 EXPORT_SYMBOL(sock_alloc_send_skb);
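
/* Typical datagram-protocol use of the helper above (a sketch; everything but
 * sock_alloc_send_skb() itself is illustrative):
 *
 *	skb = sock_alloc_send_skb(sk, payload_len + hdr_len,
 *				  msg->msg_flags & MSG_DONTWAIT, &err);
 *	if (!skb)
 *		return err;
 *
 * The skb is charged to sk_wmem_alloc via skb_set_owner_w(), and the call
 * blocks (subject to the send timeout) while the socket is over its sndbuf.
 */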
1898 
1899 int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg,
1900 		     struct sockcm_cookie *sockc)
1901 {
1902 	u32 tsflags;
1903 
1904 	switch (cmsg->cmsg_type) {
1905 	case SO_MARK:
1906 		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
1907 			return -EPERM;
1908 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
1909 			return -EINVAL;
1910 		sockc->mark = *(u32 *)CMSG_DATA(cmsg);
1911 		break;
1912 	case SO_TIMESTAMPING:
1913 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
1914 			return -EINVAL;
1915 
1916 		tsflags = *(u32 *)CMSG_DATA(cmsg);
1917 		if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
1918 			return -EINVAL;
1919 
1920 		sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
1921 		sockc->tsflags |= tsflags;
1922 		break;
1923 	/* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
1924 	case SCM_RIGHTS:
1925 	case SCM_CREDENTIALS:
1926 		break;
1927 	default:
1928 		return -EINVAL;
1929 	}
1930 	return 0;
1931 }
1932 EXPORT_SYMBOL(__sock_cmsg_send);
1933 
1934 int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
1935 		   struct sockcm_cookie *sockc)
1936 {
1937 	struct cmsghdr *cmsg;
1938 	int ret;
1939 
1940 	for_each_cmsghdr(cmsg, msg) {
1941 		if (!CMSG_OK(msg, cmsg))
1942 			return -EINVAL;
1943 		if (cmsg->cmsg_level != SOL_SOCKET)
1944 			continue;
1945 		ret = __sock_cmsg_send(sk, msg, cmsg, sockc);
1946 		if (ret)
1947 			return ret;
1948 	}
1949 	return 0;
1950 }
1951 EXPORT_SYMBOL(sock_cmsg_send);
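/*
 * A short sketch of how a sendmsg implementation might seed a
 * sockcm_cookie from the socket's defaults and then fold in any
 * SOL_SOCKET control messages via sock_cmsg_send(). The helper name
 * toy_cmsg_to_cookie is hypothetical.
 */
static int toy_cmsg_to_cookie(struct sock *sk, struct msghdr *msg,
			      struct sockcm_cookie *sockc)
{
	sockc->tsflags = sk->sk_tsflags;	/* per-socket SO_TIMESTAMPING flags */
	sockc->mark = sk->sk_mark;		/* per-socket SO_MARK */

	if (!msg->msg_controllen)
		return 0;

	/* may overwrite mark/tsflags from cmsgs carried with this call */
	return sock_cmsg_send(sk, msg, sockc);
}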
1952 
1953 /* On 32-bit arches, an skb frag is limited to 2^15 bytes */
1954 #define SKB_FRAG_PAGE_ORDER	get_order(32768)
1955 
1956 /**
1957  * skb_page_frag_refill - check that a page_frag contains enough room
1958  * @sz: minimum size of the fragment we want to get
1959  * @pfrag: pointer to page_frag
1960  * @gfp: priority for memory allocation
1961  *
1962  * Note: While this allocator tries to use high order pages, there is
1963  * no guarantee that allocations succeed. Therefore, @sz MUST be
1964  * less than or equal to PAGE_SIZE.
1965  */
1966 bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
1967 {
1968 	if (pfrag->page) {
1969 		if (page_ref_count(pfrag->page) == 1) {
1970 			pfrag->offset = 0;
1971 			return true;
1972 		}
1973 		if (pfrag->offset + sz <= pfrag->size)
1974 			return true;
1975 		put_page(pfrag->page);
1976 	}
1977 
1978 	pfrag->offset = 0;
1979 	if (SKB_FRAG_PAGE_ORDER) {
1980 		/* Avoid direct reclaim but allow kswapd to wake */
1981 		pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
1982 					  __GFP_COMP | __GFP_NOWARN |
1983 					  __GFP_NORETRY,
1984 					  SKB_FRAG_PAGE_ORDER);
1985 		if (likely(pfrag->page)) {
1986 			pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
1987 			return true;
1988 		}
1989 	}
1990 	pfrag->page = alloc_page(gfp);
1991 	if (likely(pfrag->page)) {
1992 		pfrag->size = PAGE_SIZE;
1993 		return true;
1994 	}
1995 	return false;
1996 }
1997 EXPORT_SYMBOL(skb_page_frag_refill);
1998 
1999 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
2000 {
2001 	if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
2002 		return true;
2003 
2004 	sk_enter_memory_pressure(sk);
2005 	sk_stream_moderate_sndbuf(sk);
2006 	return false;
2007 }
2008 EXPORT_SYMBOL(sk_page_frag_refill);
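/*
 * A rough sketch of the usual consumer pattern: refill the per-socket
 * page fragment and copy part of a message into it. toy_append_to_frag
 * is hypothetical; real users attach the bytes to an skb frag and
 * handle partial copies and checksumming.
 */
static int toy_append_to_frag(struct sock *sk, struct msghdr *msg, int copy)
{
	struct page_frag *pfrag = sk_page_frag(sk);

	if (!sk_page_frag_refill(sk, pfrag))
		return -ENOMEM;			/* caller should wait for memory */

	copy = min_t(int, copy, pfrag->size - pfrag->offset);
	if (copy_from_iter(page_address(pfrag->page) + pfrag->offset,
			   copy, &msg->msg_iter) != copy)
		return -EFAULT;

	pfrag->offset += copy;
	return copy;
}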
2009 
2010 static void __lock_sock(struct sock *sk)
2011 	__releases(&sk->sk_lock.slock)
2012 	__acquires(&sk->sk_lock.slock)
2013 {
2014 	DEFINE_WAIT(wait);
2015 
2016 	for (;;) {
2017 		prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
2018 					TASK_UNINTERRUPTIBLE);
2019 		spin_unlock_bh(&sk->sk_lock.slock);
2020 		schedule();
2021 		spin_lock_bh(&sk->sk_lock.slock);
2022 		if (!sock_owned_by_user(sk))
2023 			break;
2024 	}
2025 	finish_wait(&sk->sk_lock.wq, &wait);
2026 }
2027 
2028 static void __release_sock(struct sock *sk)
2029 	__releases(&sk->sk_lock.slock)
2030 	__acquires(&sk->sk_lock.slock)
2031 {
2032 	struct sk_buff *skb, *next;
2033 
2034 	while ((skb = sk->sk_backlog.head) != NULL) {
2035 		sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
2036 
2037 		spin_unlock_bh(&sk->sk_lock.slock);
2038 
2039 		do {
2040 			next = skb->next;
2041 			prefetch(next);
2042 			WARN_ON_ONCE(skb_dst_is_noref(skb));
2043 			skb->next = NULL;
2044 			sk_backlog_rcv(sk, skb);
2045 
2046 			cond_resched();
2047 
2048 			skb = next;
2049 		} while (skb != NULL);
2050 
2051 		spin_lock_bh(&sk->sk_lock.slock);
2052 	}
2053 
2054 	/*
2055 	 * Doing the zeroing here guarantees we cannot loop forever
2056 	 * while a wild producer attempts to flood us.
2057 	 */
2058 	sk->sk_backlog.len = 0;
2059 }
2060 
2061 void __sk_flush_backlog(struct sock *sk)
2062 {
2063 	spin_lock_bh(&sk->sk_lock.slock);
2064 	__release_sock(sk);
2065 	spin_unlock_bh(&sk->sk_lock.slock);
2066 }
2067 
2068 /**
2069  * sk_wait_data - wait for data to arrive at sk_receive_queue
2070  * @sk:    sock to wait on
2071  * @timeo: for how long
2072  * @skb:   last skb seen on sk_receive_queue
2073  *
2074  * Now socket state including sk->sk_err is changed only under lock,
2075  * hence we may omit checks after joining wait queue.
2076  * We check the receive queue before schedule() only as an optimization;
2077  * it is very likely that release_sock() added new data.
2078  */
2079 int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
2080 {
2081 	int rc;
2082 	DEFINE_WAIT(wait);
2083 
2084 	prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
2085 	sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2086 	rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb);
2087 	sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2088 	finish_wait(sk_sleep(sk), &wait);
2089 	return rc;
2090 }
2091 EXPORT_SYMBOL(sk_wait_data);
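/*
 * A sketch of the typical caller: a blocking recvmsg path, run under
 * lock_sock(), that waits with sk_wait_data() until the receive queue
 * is non-empty. toy_wait_for_skb is hypothetical and omits the
 * "last skb seen" optimisation (it simply passes NULL).
 */
static struct sk_buff *toy_wait_for_skb(struct sock *sk, int flags, int *err)
{
	long timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
	struct sk_buff *skb;

	while (!(skb = skb_peek(&sk->sk_receive_queue))) {
		if (sk->sk_err) {
			*err = sock_error(sk);
			return NULL;
		}
		if (!timeo) {
			*err = -EAGAIN;
			return NULL;
		}
		if (signal_pending(current)) {
			*err = sock_intr_errno(timeo);
			return NULL;
		}
		sk_wait_data(sk, &timeo, NULL);	/* drops and retakes the socket lock */
	}
	return skb;
}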
2092 
2093 /**
2094  *	__sk_mem_schedule - increase sk_forward_alloc and memory_allocated
2095  *	@sk: socket
2096  *	@size: memory size to allocate
2097  *	@kind: allocation type
2098  *
2099  *	If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
2100  *	rmem allocation. This function assumes that protocols which have
2101  *	memory_pressure use sk_wmem_queued for write buffer accounting.
2102  */
2103 int __sk_mem_schedule(struct sock *sk, int size, int kind)
2104 {
2105 	struct proto *prot = sk->sk_prot;
2106 	int amt = sk_mem_pages(size);
2107 	long allocated;
2108 
2109 	sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
2110 
2111 	allocated = sk_memory_allocated_add(sk, amt);
2112 
2113 	if (mem_cgroup_sockets_enabled && sk->sk_memcg &&
2114 	    !mem_cgroup_charge_skmem(sk->sk_memcg, amt))
2115 		goto suppress_allocation;
2116 
2117 	/* Under limit. */
2118 	if (allocated <= sk_prot_mem_limits(sk, 0)) {
2119 		sk_leave_memory_pressure(sk);
2120 		return 1;
2121 	}
2122 
2123 	/* Under pressure. */
2124 	if (allocated > sk_prot_mem_limits(sk, 1))
2125 		sk_enter_memory_pressure(sk);
2126 
2127 	/* Over hard limit. */
2128 	if (allocated > sk_prot_mem_limits(sk, 2))
2129 		goto suppress_allocation;
2130 
2131 	/* guarantee minimum buffer size under pressure */
2132 	if (kind == SK_MEM_RECV) {
2133 		if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
2134 			return 1;
2135 
2136 	} else { /* SK_MEM_SEND */
2137 		if (sk->sk_type == SOCK_STREAM) {
2138 			if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
2139 				return 1;
2140 		} else if (atomic_read(&sk->sk_wmem_alloc) <
2141 			   prot->sysctl_wmem[0])
2142 				return 1;
2143 	}
2144 
2145 	if (sk_has_memory_pressure(sk)) {
2146 		int alloc;
2147 
2148 		if (!sk_under_memory_pressure(sk))
2149 			return 1;
2150 		alloc = sk_sockets_allocated_read_positive(sk);
2151 		if (sk_prot_mem_limits(sk, 2) > alloc *
2152 		    sk_mem_pages(sk->sk_wmem_queued +
2153 				 atomic_read(&sk->sk_rmem_alloc) +
2154 				 sk->sk_forward_alloc))
2155 			return 1;
2156 	}
2157 
2158 suppress_allocation:
2159 
2160 	if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
2161 		sk_stream_moderate_sndbuf(sk);
2162 
2163 		/* Fail only if the socket is _under_ its sndbuf.
2164 		 * In this case we cannot block, so we have to fail.
2165 		 */
2166 		if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
2167 			return 1;
2168 	}
2169 
2170 	trace_sock_exceed_buf_limit(sk, prot, allocated);
2171 
2172 	/* Alas. Undo changes. */
2173 	sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;
2174 
2175 	sk_memory_allocated_sub(sk, amt);
2176 
2177 	if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2178 		mem_cgroup_uncharge_skmem(sk->sk_memcg, amt);
2179 
2180 	return 0;
2181 }
2182 EXPORT_SYMBOL(__sk_mem_schedule);
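/*
 * A simplified sketch of how a protocol charges receive memory before
 * queueing an skb; sk_rmem_schedule() falls back to __sk_mem_schedule()
 * when sk_forward_alloc is too small. toy_queue_rcv_skb is hypothetical
 * and skips the skb_orphan()/socket-filter steps a real queueing path
 * performs.
 */
static int toy_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
	if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
	    (unsigned int)sk->sk_rcvbuf)
		return -ENOMEM;

	if (!sk_rmem_schedule(sk, skb, skb->truesize))
		return -ENOBUFS;		/* __sk_mem_schedule() refused */

	skb_set_owner_r(skb, sk);		/* charges sk_rmem_alloc/forward_alloc */
	__skb_queue_tail(&sk->sk_receive_queue, skb);
	sk->sk_data_ready(sk);
	return 0;
}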
2183 
2184 /**
2185  *	__sk_mem_reclaim - reclaim memory_allocated
2186  *	@sk: socket
2187  *	@amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple)
2188  */
2189 void __sk_mem_reclaim(struct sock *sk, int amount)
2190 {
2191 	amount >>= SK_MEM_QUANTUM_SHIFT;
2192 	sk_memory_allocated_sub(sk, amount);
2193 	sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
2194 
2195 	if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2196 		mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
2197 
2198 	if (sk_under_memory_pressure(sk) &&
2199 	    (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
2200 		sk_leave_memory_pressure(sk);
2201 }
2202 EXPORT_SYMBOL(__sk_mem_reclaim);
2203 
2204 int sk_set_peek_off(struct sock *sk, int val)
2205 {
2206 	if (val < 0)
2207 		return -EINVAL;
2208 
2209 	sk->sk_peek_off = val;
2210 	return 0;
2211 }
2212 EXPORT_SYMBOL_GPL(sk_set_peek_off);
2213 
2214 /*
2215  * Set of default routines for initialising struct proto_ops when
2216  * the protocol does not support a particular function. In certain
2217  * cases where it makes no sense for a protocol to have a "do nothing"
2218  * function, some default processing is provided.
2219  */
2220 
2221 int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
2222 {
2223 	return -EOPNOTSUPP;
2224 }
2225 EXPORT_SYMBOL(sock_no_bind);
2226 
2227 int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
2228 		    int len, int flags)
2229 {
2230 	return -EOPNOTSUPP;
2231 }
2232 EXPORT_SYMBOL(sock_no_connect);
2233 
2234 int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
2235 {
2236 	return -EOPNOTSUPP;
2237 }
2238 EXPORT_SYMBOL(sock_no_socketpair);
2239 
2240 int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
2241 {
2242 	return -EOPNOTSUPP;
2243 }
2244 EXPORT_SYMBOL(sock_no_accept);
2245 
2246 int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
2247 		    int *len, int peer)
2248 {
2249 	return -EOPNOTSUPP;
2250 }
2251 EXPORT_SYMBOL(sock_no_getname);
2252 
2253 unsigned int sock_no_poll(struct file *file, struct socket *sock, poll_table *pt)
2254 {
2255 	return 0;
2256 }
2257 EXPORT_SYMBOL(sock_no_poll);
2258 
2259 int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2260 {
2261 	return -EOPNOTSUPP;
2262 }
2263 EXPORT_SYMBOL(sock_no_ioctl);
2264 
2265 int sock_no_listen(struct socket *sock, int backlog)
2266 {
2267 	return -EOPNOTSUPP;
2268 }
2269 EXPORT_SYMBOL(sock_no_listen);
2270 
2271 int sock_no_shutdown(struct socket *sock, int how)
2272 {
2273 	return -EOPNOTSUPP;
2274 }
2275 EXPORT_SYMBOL(sock_no_shutdown);
2276 
2277 int sock_no_setsockopt(struct socket *sock, int level, int optname,
2278 		    char __user *optval, unsigned int optlen)
2279 {
2280 	return -EOPNOTSUPP;
2281 }
2282 EXPORT_SYMBOL(sock_no_setsockopt);
2283 
2284 int sock_no_getsockopt(struct socket *sock, int level, int optname,
2285 		    char __user *optval, int __user *optlen)
2286 {
2287 	return -EOPNOTSUPP;
2288 }
2289 EXPORT_SYMBOL(sock_no_getsockopt);
2290 
2291 int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
2292 {
2293 	return -EOPNOTSUPP;
2294 }
2295 EXPORT_SYMBOL(sock_no_sendmsg);
2296 
2297 int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
2298 		    int flags)
2299 {
2300 	return -EOPNOTSUPP;
2301 }
2302 EXPORT_SYMBOL(sock_no_recvmsg);
2303 
2304 int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
2305 {
2306 	/* Mirror missing mmap method error code */
2307 	return -ENODEV;
2308 }
2309 EXPORT_SYMBOL(sock_no_mmap);
2310 
2311 ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
2312 {
2313 	ssize_t res;
2314 	struct msghdr msg = {.msg_flags = flags};
2315 	struct kvec iov;
2316 	char *kaddr = kmap(page);
2317 	iov.iov_base = kaddr + offset;
2318 	iov.iov_len = size;
2319 	res = kernel_sendmsg(sock, &msg, &iov, 1, size);
2320 	kunmap(page);
2321 	return res;
2322 }
2323 EXPORT_SYMBOL(sock_no_sendpage);
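/*
 * A hypothetical example of how a protocol family wires the sock_no_*
 * stubs into its proto_ops for operations it does not implement. The
 * toy_* names and the PF_UNSPEC family are placeholders; datagram_poll()
 * is the generic poll helper a datagram protocol would typically use.
 */
static int toy_release(struct socket *sock)
{
	struct sock *sk = sock->sk;

	if (sk) {
		sock_orphan(sk);
		sock->sk = NULL;
		sock_put(sk);
	}
	return 0;
}

static const struct proto_ops toy_ops = {
	.family		= PF_UNSPEC,		/* placeholder */
	.owner		= THIS_MODULE,
	.release	= toy_release,
	.bind		= sock_no_bind,
	.connect	= sock_no_connect,
	.socketpair	= sock_no_socketpair,
	.accept		= sock_no_accept,
	.getname	= sock_no_getname,
	.poll		= datagram_poll,
	.ioctl		= sock_no_ioctl,
	.listen		= sock_no_listen,
	.shutdown	= sock_no_shutdown,
	.setsockopt	= sock_no_setsockopt,
	.getsockopt	= sock_no_getsockopt,
	.sendmsg	= sock_no_sendmsg,
	.recvmsg	= sock_no_recvmsg,
	.mmap		= sock_no_mmap,
	.sendpage	= sock_no_sendpage,
};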
2324 
2325 /*
2326  *	Default Socket Callbacks
2327  */
2328 
2329 static void sock_def_wakeup(struct sock *sk)
2330 {
2331 	struct socket_wq *wq;
2332 
2333 	rcu_read_lock();
2334 	wq = rcu_dereference(sk->sk_wq);
2335 	if (skwq_has_sleeper(wq))
2336 		wake_up_interruptible_all(&wq->wait);
2337 	rcu_read_unlock();
2338 }
2339 
2340 static void sock_def_error_report(struct sock *sk)
2341 {
2342 	struct socket_wq *wq;
2343 
2344 	rcu_read_lock();
2345 	wq = rcu_dereference(sk->sk_wq);
2346 	if (skwq_has_sleeper(wq))
2347 		wake_up_interruptible_poll(&wq->wait, POLLERR);
2348 	sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
2349 	rcu_read_unlock();
2350 }
2351 
2352 static void sock_def_readable(struct sock *sk)
2353 {
2354 	struct socket_wq *wq;
2355 
2356 	rcu_read_lock();
2357 	wq = rcu_dereference(sk->sk_wq);
2358 	if (skwq_has_sleeper(wq))
2359 		wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI |
2360 						POLLRDNORM | POLLRDBAND);
2361 	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
2362 	rcu_read_unlock();
2363 }
2364 
2365 static void sock_def_write_space(struct sock *sk)
2366 {
2367 	struct socket_wq *wq;
2368 
2369 	rcu_read_lock();
2370 
2371 	/* Do not wake up a writer until he can make "significant"
2372 	 * progress.  --DaveM
2373 	 */
2374 	if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
2375 		wq = rcu_dereference(sk->sk_wq);
2376 		if (skwq_has_sleeper(wq))
2377 			wake_up_interruptible_sync_poll(&wq->wait, POLLOUT |
2378 						POLLWRNORM | POLLWRBAND);
2379 
2380 		/* Should agree with poll, otherwise some programs break */
2381 		if (sock_writeable(sk))
2382 			sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
2383 	}
2384 
2385 	rcu_read_unlock();
2386 }
2387 
2388 static void sock_def_destruct(struct sock *sk)
2389 {
2390 }
2391 
2392 void sk_send_sigurg(struct sock *sk)
2393 {
2394 	if (sk->sk_socket && sk->sk_socket->file)
2395 		if (send_sigurg(&sk->sk_socket->file->f_owner))
2396 			sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
2397 }
2398 EXPORT_SYMBOL(sk_send_sigurg);
2399 
2400 void sk_reset_timer(struct sock *sk, struct timer_list* timer,
2401 		    unsigned long expires)
2402 {
2403 	if (!mod_timer(timer, expires))
2404 		sock_hold(sk);
2405 }
2406 EXPORT_SYMBOL(sk_reset_timer);
2407 
2408 void sk_stop_timer(struct sock *sk, struct timer_list* timer)
2409 {
2410 	if (del_timer(timer))
2411 		__sock_put(sk);
2412 }
2413 EXPORT_SYMBOL(sk_stop_timer);
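/*
 * A sketch of the reference-counting contract around sk_reset_timer():
 * arming the timer takes a hold on the socket, and either the handler
 * or sk_stop_timer() must drop it. toy_timer/toy_arm_timer are
 * hypothetical and use the pre-timer_setup() handler signature that
 * matches the init_timer() call in sock_init_data() below.
 */
static void toy_timer(unsigned long data)
{
	struct sock *sk = (struct sock *)data;

	/* ...retransmit/keepalive work goes here... */

	sock_put(sk);	/* balance the hold taken when the timer was armed */
}

static void toy_arm_timer(struct sock *sk)
{
	setup_timer(&sk->sk_timer, toy_timer, (unsigned long)sk);
	sk_reset_timer(sk, &sk->sk_timer, jiffies + HZ);
}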
2414 
2415 void sock_init_data(struct socket *sock, struct sock *sk)
2416 {
2417 	skb_queue_head_init(&sk->sk_receive_queue);
2418 	skb_queue_head_init(&sk->sk_write_queue);
2419 	skb_queue_head_init(&sk->sk_error_queue);
2420 
2421 	sk->sk_send_head	=	NULL;
2422 
2423 	init_timer(&sk->sk_timer);
2424 
2425 	sk->sk_allocation	=	GFP_KERNEL;
2426 	sk->sk_rcvbuf		=	sysctl_rmem_default;
2427 	sk->sk_sndbuf		=	sysctl_wmem_default;
2428 	sk->sk_state		=	TCP_CLOSE;
2429 	sk_set_socket(sk, sock);
2430 
2431 	sock_set_flag(sk, SOCK_ZAPPED);
2432 
2433 	if (sock) {
2434 		sk->sk_type	=	sock->type;
2435 		sk->sk_wq	=	sock->wq;
2436 		sock->sk	=	sk;
2437 	} else
2438 		sk->sk_wq	=	NULL;
2439 
2440 	rwlock_init(&sk->sk_callback_lock);
2441 	lockdep_set_class_and_name(&sk->sk_callback_lock,
2442 			af_callback_keys + sk->sk_family,
2443 			af_family_clock_key_strings[sk->sk_family]);
2444 
2445 	sk->sk_state_change	=	sock_def_wakeup;
2446 	sk->sk_data_ready	=	sock_def_readable;
2447 	sk->sk_write_space	=	sock_def_write_space;
2448 	sk->sk_error_report	=	sock_def_error_report;
2449 	sk->sk_destruct		=	sock_def_destruct;
2450 
2451 	sk->sk_frag.page	=	NULL;
2452 	sk->sk_frag.offset	=	0;
2453 	sk->sk_peek_off		=	-1;
2454 
2455 	sk->sk_peer_pid 	=	NULL;
2456 	sk->sk_peer_cred	=	NULL;
2457 	sk->sk_write_pending	=	0;
2458 	sk->sk_rcvlowat		=	1;
2459 	sk->sk_rcvtimeo		=	MAX_SCHEDULE_TIMEOUT;
2460 	sk->sk_sndtimeo		=	MAX_SCHEDULE_TIMEOUT;
2461 
2462 	sk->sk_stamp = ktime_set(-1L, 0);
2463 
2464 #ifdef CONFIG_NET_RX_BUSY_POLL
2465 	sk->sk_napi_id		=	0;
2466 	sk->sk_ll_usec		=	sysctl_net_busy_read;
2467 #endif
2468 
2469 	sk->sk_max_pacing_rate = ~0U;
2470 	sk->sk_pacing_rate = ~0U;
2471 	sk->sk_incoming_cpu = -1;
2472 	/*
2473 	 * Before updating sk_refcnt, we must commit prior changes to memory
2474 	 * (Documentation/RCU/rculist_nulls.txt for details)
2475 	 */
2476 	smp_wmb();
2477 	atomic_set(&sk->sk_refcnt, 1);
2478 	atomic_set(&sk->sk_drops, 0);
2479 }
2480 EXPORT_SYMBOL(sock_init_data);
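/*
 * A hypothetical sketch of the usual pattern for customising the
 * default callbacks installed by sock_init_data(): save the generic
 * sk_data_ready and chain a protocol-private hook in front of it. The
 * toy_* names and the single file-scope saved pointer are
 * simplifications; real users stash the old callback per socket
 * (e.g. via sk_user_data).
 */
static void (*toy_saved_data_ready)(struct sock *sk);

static void toy_data_ready(struct sock *sk)
{
	/* e.g. kick a kernel-internal consumer here... */

	/* ...then let the generic wakeup run so poll()/reads still work */
	if (toy_saved_data_ready)
		toy_saved_data_ready(sk);
}

static void toy_init_sock(struct socket *sock, struct sock *sk)
{
	sock_init_data(sock, sk);		/* installs the sock_def_* callbacks */

	toy_saved_data_ready = sk->sk_data_ready;
	sk->sk_data_ready = toy_data_ready;
}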
2481 
2482 void lock_sock_nested(struct sock *sk, int subclass)
2483 {
2484 	might_sleep();
2485 	spin_lock_bh(&sk->sk_lock.slock);
2486 	if (sk->sk_lock.owned)
2487 		__lock_sock(sk);
2488 	sk->sk_lock.owned = 1;
2489 	spin_unlock(&sk->sk_lock.slock);
2490 	/*
2491 	 * The sk_lock has mutex_lock() semantics here:
2492 	 */
2493 	mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
2494 	local_bh_enable();
2495 }
2496 EXPORT_SYMBOL(lock_sock_nested);
2497 
2498 void release_sock(struct sock *sk)
2499 {
2500 	spin_lock_bh(&sk->sk_lock.slock);
2501 	if (sk->sk_backlog.tail)
2502 		__release_sock(sk);
2503 
2504 	/* Warning : release_cb() might need to release sk ownership,
2505 	 * ie call sock_release_ownership(sk) before us.
2506 	 */
2507 	if (sk->sk_prot->release_cb)
2508 		sk->sk_prot->release_cb(sk);
2509 
2510 	sock_release_ownership(sk);
2511 	if (waitqueue_active(&sk->sk_lock.wq))
2512 		wake_up(&sk->sk_lock.wq);
2513 	spin_unlock_bh(&sk->sk_lock.slock);
2514 }
2515 EXPORT_SYMBOL(release_sock);
2516 
2517 /**
2518  * lock_sock_fast - fast version of lock_sock
2519  * @sk: socket
2520  *
2521  * This version should be used for very small sections, where the process won't block.
2522  * Returns false if the fast path is taken:
2523  *   sk_lock.slock locked, owned = 0, BH disabled
2524  * Returns true if the slow path is taken:
2525  *   sk_lock.slock unlocked, owned = 1, BH enabled
2526  */
2527 bool lock_sock_fast(struct sock *sk)
2528 {
2529 	might_sleep();
2530 	spin_lock_bh(&sk->sk_lock.slock);
2531 
2532 	if (!sk->sk_lock.owned)
2533 		/*
2534 		 * Note: BH must stay disabled here; unlock_sock_fast() re-enables it
2535 		 */
2536 		return false;
2537 
2538 	__lock_sock(sk);
2539 	sk->sk_lock.owned = 1;
2540 	spin_unlock(&sk->sk_lock.slock);
2541 	/*
2542 	 * The sk_lock has mutex_lock() semantics here:
2543 	 */
2544 	mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
2545 	local_bh_enable();
2546 	return true;
2547 }
2548 EXPORT_SYMBOL(lock_sock_fast);
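/*
 * A short usage sketch: lock_sock_fast() pairs with unlock_sock_fast(),
 * which needs the boolean returned here to know whether the slow path
 * (full socket lock) was taken. toy_purge_queue is hypothetical.
 */
static void toy_purge_queue(struct sock *sk)
{
	bool slow = lock_sock_fast(sk);

	/* short, non-blocking critical section */
	skb_queue_purge(&sk->sk_receive_queue);

	unlock_sock_fast(sk, slow);
}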
2549 
2550 int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
2551 {
2552 	struct timeval tv;
2553 	if (!sock_flag(sk, SOCK_TIMESTAMP))
2554 		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2555 	tv = ktime_to_timeval(sk->sk_stamp);
2556 	if (tv.tv_sec == -1)
2557 		return -ENOENT;
2558 	if (tv.tv_sec == 0) {
2559 		sk->sk_stamp = ktime_get_real();
2560 		tv = ktime_to_timeval(sk->sk_stamp);
2561 	}
2562 	return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
2563 }
2564 EXPORT_SYMBOL(sock_get_timestamp);
2565 
2566 int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
2567 {
2568 	struct timespec ts;
2569 	if (!sock_flag(sk, SOCK_TIMESTAMP))
2570 		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2571 	ts = ktime_to_timespec(sk->sk_stamp);
2572 	if (ts.tv_sec == -1)
2573 		return -ENOENT;
2574 	if (ts.tv_sec == 0) {
2575 		sk->sk_stamp = ktime_get_real();
2576 		ts = ktime_to_timespec(sk->sk_stamp);
2577 	}
2578 	return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
2579 }
2580 EXPORT_SYMBOL(sock_get_timestampns);
2581 
2582 void sock_enable_timestamp(struct sock *sk, int flag)
2583 {
2584 	if (!sock_flag(sk, flag)) {
2585 		unsigned long previous_flags = sk->sk_flags;
2586 
2587 		sock_set_flag(sk, flag);
2588 		/*
2589 		 * we just set one of the two flags which require net
2590 		 * time stamping, but time stamping might have been on
2591 		 * already because of the other one
2592 		 */
2593 		if (sock_needs_netstamp(sk) &&
2594 		    !(previous_flags & SK_FLAGS_TIMESTAMP))
2595 			net_enable_timestamp();
2596 	}
2597 }
2598 
2599 int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
2600 		       int level, int type)
2601 {
2602 	struct sock_exterr_skb *serr;
2603 	struct sk_buff *skb;
2604 	int copied, err;
2605 
2606 	err = -EAGAIN;
2607 	skb = sock_dequeue_err_skb(sk);
2608 	if (skb == NULL)
2609 		goto out;
2610 
2611 	copied = skb->len;
2612 	if (copied > len) {
2613 		msg->msg_flags |= MSG_TRUNC;
2614 		copied = len;
2615 	}
2616 	err = skb_copy_datagram_msg(skb, 0, msg, copied);
2617 	if (err)
2618 		goto out_free_skb;
2619 
2620 	sock_recv_timestamp(msg, sk, skb);
2621 
2622 	serr = SKB_EXT_ERR(skb);
2623 	put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
2624 
2625 	msg->msg_flags |= MSG_ERRQUEUE;
2626 	err = copied;
2627 
2628 out_free_skb:
2629 	kfree_skb(skb);
2630 out:
2631 	return err;
2632 }
2633 EXPORT_SYMBOL(sock_recv_errqueue);
2634 
2635 /*
2636  *	Get a socket option on a socket.
2637  *
2638  *	FIX: POSIX 1003.1g is very ambiguous here. It states that
2639  *	asynchronous errors should be reported by getsockopt. We assume
2640  *	this means if you specify SO_ERROR (otherwise what's the point of it?).
2641  */
2642 int sock_common_getsockopt(struct socket *sock, int level, int optname,
2643 			   char __user *optval, int __user *optlen)
2644 {
2645 	struct sock *sk = sock->sk;
2646 
2647 	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2648 }
2649 EXPORT_SYMBOL(sock_common_getsockopt);
2650 
2651 #ifdef CONFIG_COMPAT
2652 int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
2653 				  char __user *optval, int __user *optlen)
2654 {
2655 	struct sock *sk = sock->sk;
2656 
2657 	if (sk->sk_prot->compat_getsockopt != NULL)
2658 		return sk->sk_prot->compat_getsockopt(sk, level, optname,
2659 						      optval, optlen);
2660 	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2661 }
2662 EXPORT_SYMBOL(compat_sock_common_getsockopt);
2663 #endif
2664 
2665 int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
2666 			int flags)
2667 {
2668 	struct sock *sk = sock->sk;
2669 	int addr_len = 0;
2670 	int err;
2671 
2672 	err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
2673 				   flags & ~MSG_DONTWAIT, &addr_len);
2674 	if (err >= 0)
2675 		msg->msg_namelen = addr_len;
2676 	return err;
2677 }
2678 EXPORT_SYMBOL(sock_common_recvmsg);
2679 
2680 /*
2681  *	Set socket options on an inet socket.
2682  */
2683 int sock_common_setsockopt(struct socket *sock, int level, int optname,
2684 			   char __user *optval, unsigned int optlen)
2685 {
2686 	struct sock *sk = sock->sk;
2687 
2688 	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2689 }
2690 EXPORT_SYMBOL(sock_common_setsockopt);
2691 
2692 #ifdef CONFIG_COMPAT
2693 int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
2694 				  char __user *optval, unsigned int optlen)
2695 {
2696 	struct sock *sk = sock->sk;
2697 
2698 	if (sk->sk_prot->compat_setsockopt != NULL)
2699 		return sk->sk_prot->compat_setsockopt(sk, level, optname,
2700 						      optval, optlen);
2701 	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2702 }
2703 EXPORT_SYMBOL(compat_sock_common_setsockopt);
2704 #endif
2705 
2706 void sk_common_release(struct sock *sk)
2707 {
2708 	if (sk->sk_prot->destroy)
2709 		sk->sk_prot->destroy(sk);
2710 
2711 	/*
2712 	 * Observation: when sk_common_release() is called, processes have
2713 	 * no access to the socket, but the network stack still does.
2714 	 * Step one, detach it from networking:
2715 	 *
2716 	 * A. Remove from hash tables.
2717 	 */
2718 
2719 	sk->sk_prot->unhash(sk);
2720 
2721 	/*
2722 	 * At this point the socket cannot receive new packets, but some may
2723 	 * still be in flight because a CPU running the receiver did its hash
2724 	 * table lookup before we unhashed the socket. Those packets will reach
2725 	 * the receive queue and be purged by the socket destructor.
2726 	 *
2727 	 * We may also still have packets pending on the receive queue and,
2728 	 * probably, our own packets waiting in device queues. sock_destroy
2729 	 * will drain the receive queue, but transmitted packets will delay
2730 	 * socket destruction until the last reference is released.
2731 	 */
2732 
2733 	sock_orphan(sk);
2734 
2735 	xfrm_sk_free_policy(sk);
2736 
2737 	sk_refcnt_debug_release(sk);
2738 
2739 	if (sk->sk_frag.page) {
2740 		put_page(sk->sk_frag.page);
2741 		sk->sk_frag.page = NULL;
2742 	}
2743 
2744 	sock_put(sk);
2745 }
2746 EXPORT_SYMBOL(sk_common_release);
2747 
2748 #ifdef CONFIG_PROC_FS
2749 #define PROTO_INUSE_NR	64	/* should be enough for the first time */
2750 struct prot_inuse {
2751 	int val[PROTO_INUSE_NR];
2752 };
2753 
2754 static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
2755 
2756 #ifdef CONFIG_NET_NS
2757 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2758 {
2759 	__this_cpu_add(net->core.inuse->val[prot->inuse_idx], val);
2760 }
2761 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2762 
2763 int sock_prot_inuse_get(struct net *net, struct proto *prot)
2764 {
2765 	int cpu, idx = prot->inuse_idx;
2766 	int res = 0;
2767 
2768 	for_each_possible_cpu(cpu)
2769 		res += per_cpu_ptr(net->core.inuse, cpu)->val[idx];
2770 
2771 	return res >= 0 ? res : 0;
2772 }
2773 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2774 
2775 static int __net_init sock_inuse_init_net(struct net *net)
2776 {
2777 	net->core.inuse = alloc_percpu(struct prot_inuse);
2778 	return net->core.inuse ? 0 : -ENOMEM;
2779 }
2780 
2781 static void __net_exit sock_inuse_exit_net(struct net *net)
2782 {
2783 	free_percpu(net->core.inuse);
2784 }
2785 
2786 static struct pernet_operations net_inuse_ops = {
2787 	.init = sock_inuse_init_net,
2788 	.exit = sock_inuse_exit_net,
2789 };
2790 
2791 static __init int net_inuse_init(void)
2792 {
2793 	if (register_pernet_subsys(&net_inuse_ops))
2794 		panic("Cannot initialize net inuse counters");
2795 
2796 	return 0;
2797 }
2798 
2799 core_initcall(net_inuse_init);
2800 #else
2801 static DEFINE_PER_CPU(struct prot_inuse, prot_inuse);
2802 
2803 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2804 {
2805 	__this_cpu_add(prot_inuse.val[prot->inuse_idx], val);
2806 }
2807 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2808 
2809 int sock_prot_inuse_get(struct net *net, struct proto *prot)
2810 {
2811 	int cpu, idx = prot->inuse_idx;
2812 	int res = 0;
2813 
2814 	for_each_possible_cpu(cpu)
2815 		res += per_cpu(prot_inuse, cpu).val[idx];
2816 
2817 	return res >= 0 ? res : 0;
2818 }
2819 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2820 #endif
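/*
 * A hypothetical sketch of how a protocol keeps these per-cpu counters
 * up to date from its hash/unhash callbacks; the toy_* names are
 * placeholders. Callers normally do this with BH disabled (e.g. while
 * holding the protocol's hash-table lock), since the update is a
 * this_cpu operation.
 */
static void toy_hash_sk(struct sock *sk)
{
	/* ...insert sk into the protocol's lookup table... */
	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
}

static void toy_unhash_sk(struct sock *sk)
{
	/* ...remove sk from the lookup table... */
	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
}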
2821 
2822 static void assign_proto_idx(struct proto *prot)
2823 {
2824 	prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
2825 
2826 	if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
2827 		pr_err("PROTO_INUSE_NR exhausted\n");
2828 		return;
2829 	}
2830 
2831 	set_bit(prot->inuse_idx, proto_inuse_idx);
2832 }
2833 
2834 static void release_proto_idx(struct proto *prot)
2835 {
2836 	if (prot->inuse_idx != PROTO_INUSE_NR - 1)
2837 		clear_bit(prot->inuse_idx, proto_inuse_idx);
2838 }
2839 #else
2840 static inline void assign_proto_idx(struct proto *prot)
2841 {
2842 }
2843 
2844 static inline void release_proto_idx(struct proto *prot)
2845 {
2846 }
2847 #endif
2848 
2849 static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
2850 {
2851 	if (!rsk_prot)
2852 		return;
2853 	kfree(rsk_prot->slab_name);
2854 	rsk_prot->slab_name = NULL;
2855 	kmem_cache_destroy(rsk_prot->slab);
2856 	rsk_prot->slab = NULL;
2857 }
2858 
2859 static int req_prot_init(const struct proto *prot)
2860 {
2861 	struct request_sock_ops *rsk_prot = prot->rsk_prot;
2862 
2863 	if (!rsk_prot)
2864 		return 0;
2865 
2866 	rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
2867 					prot->name);
2868 	if (!rsk_prot->slab_name)
2869 		return -ENOMEM;
2870 
2871 	rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
2872 					   rsk_prot->obj_size, 0,
2873 					   prot->slab_flags, NULL);
2874 
2875 	if (!rsk_prot->slab) {
2876 		pr_crit("%s: Can't create request sock SLAB cache!\n",
2877 			prot->name);
2878 		return -ENOMEM;
2879 	}
2880 	return 0;
2881 }
2882 
2883 int proto_register(struct proto *prot, int alloc_slab)
2884 {
2885 	if (alloc_slab) {
2886 		prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
2887 					SLAB_HWCACHE_ALIGN | prot->slab_flags,
2888 					NULL);
2889 
2890 		if (prot->slab == NULL) {
2891 			pr_crit("%s: Can't create sock SLAB cache!\n",
2892 				prot->name);
2893 			goto out;
2894 		}
2895 
2896 		if (req_prot_init(prot))
2897 			goto out_free_request_sock_slab;
2898 
2899 		if (prot->twsk_prot != NULL) {
2900 			prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);
2901 
2902 			if (prot->twsk_prot->twsk_slab_name == NULL)
2903 				goto out_free_request_sock_slab;
2904 
2905 			prot->twsk_prot->twsk_slab =
2906 				kmem_cache_create(prot->twsk_prot->twsk_slab_name,
2907 						  prot->twsk_prot->twsk_obj_size,
2908 						  0,
2909 						  prot->slab_flags,
2910 						  NULL);
2911 			if (prot->twsk_prot->twsk_slab == NULL)
2912 				goto out_free_timewait_sock_slab_name;
2913 		}
2914 	}
2915 
2916 	mutex_lock(&proto_list_mutex);
2917 	list_add(&prot->node, &proto_list);
2918 	assign_proto_idx(prot);
2919 	mutex_unlock(&proto_list_mutex);
2920 	return 0;
2921 
2922 out_free_timewait_sock_slab_name:
2923 	kfree(prot->twsk_prot->twsk_slab_name);
2924 out_free_request_sock_slab:
2925 	req_prot_cleanup(prot->rsk_prot);
2926 
2927 	kmem_cache_destroy(prot->slab);
2928 	prot->slab = NULL;
2929 out:
2930 	return -ENOBUFS;
2931 }
2932 EXPORT_SYMBOL(proto_register);
2933 
2934 void proto_unregister(struct proto *prot)
2935 {
2936 	mutex_lock(&proto_list_mutex);
2937 	release_proto_idx(prot);
2938 	list_del(&prot->node);
2939 	mutex_unlock(&proto_list_mutex);
2940 
2941 	kmem_cache_destroy(prot->slab);
2942 	prot->slab = NULL;
2943 
2944 	req_prot_cleanup(prot->rsk_prot);
2945 
2946 	if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
2947 		kmem_cache_destroy(prot->twsk_prot->twsk_slab);
2948 		kfree(prot->twsk_prot->twsk_slab_name);
2949 		prot->twsk_prot->twsk_slab = NULL;
2950 	}
2951 }
2952 EXPORT_SYMBOL(proto_unregister);
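/*
 * A minimal, hypothetical registration sketch: a protocol describes
 * itself with a struct proto and registers it at module init, asking
 * proto_register() to create a dedicated slab cache (second argument
 * non-zero). The "TOY" proto and toy_* names are placeholders.
 */
static struct proto toy_proto = {
	.name		= "TOY",
	.owner		= THIS_MODULE,
	.obj_size	= sizeof(struct sock),	/* or a struct embedding struct sock */
};

static int __init toy_proto_init(void)
{
	return proto_register(&toy_proto, 1);
}

static void __exit toy_proto_exit(void)
{
	proto_unregister(&toy_proto);
}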
2953 
2954 #ifdef CONFIG_PROC_FS
2955 static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
2956 	__acquires(proto_list_mutex)
2957 {
2958 	mutex_lock(&proto_list_mutex);
2959 	return seq_list_start_head(&proto_list, *pos);
2960 }
2961 
2962 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2963 {
2964 	return seq_list_next(v, &proto_list, pos);
2965 }
2966 
2967 static void proto_seq_stop(struct seq_file *seq, void *v)
2968 	__releases(proto_list_mutex)
2969 {
2970 	mutex_unlock(&proto_list_mutex);
2971 }
2972 
2973 static char proto_method_implemented(const void *method)
2974 {
2975 	return method == NULL ? 'n' : 'y';
2976 }
2977 static long sock_prot_memory_allocated(struct proto *proto)
2978 {
2979 	return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
2980 }
2981 
2982 static char *sock_prot_memory_pressure(struct proto *proto)
2983 {
2984 	return proto->memory_pressure != NULL ?
2985 	proto_memory_pressure(proto) ? "yes" : "no" : "NI";
2986 }
2987 
2988 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
2989 {
2990 
2991 	seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
2992 			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
2993 		   proto->name,
2994 		   proto->obj_size,
2995 		   sock_prot_inuse_get(seq_file_net(seq), proto),
2996 		   sock_prot_memory_allocated(proto),
2997 		   sock_prot_memory_pressure(proto),
2998 		   proto->max_header,
2999 		   proto->slab == NULL ? "no" : "yes",
3000 		   module_name(proto->owner),
3001 		   proto_method_implemented(proto->close),
3002 		   proto_method_implemented(proto->connect),
3003 		   proto_method_implemented(proto->disconnect),
3004 		   proto_method_implemented(proto->accept),
3005 		   proto_method_implemented(proto->ioctl),
3006 		   proto_method_implemented(proto->init),
3007 		   proto_method_implemented(proto->destroy),
3008 		   proto_method_implemented(proto->shutdown),
3009 		   proto_method_implemented(proto->setsockopt),
3010 		   proto_method_implemented(proto->getsockopt),
3011 		   proto_method_implemented(proto->sendmsg),
3012 		   proto_method_implemented(proto->recvmsg),
3013 		   proto_method_implemented(proto->sendpage),
3014 		   proto_method_implemented(proto->bind),
3015 		   proto_method_implemented(proto->backlog_rcv),
3016 		   proto_method_implemented(proto->hash),
3017 		   proto_method_implemented(proto->unhash),
3018 		   proto_method_implemented(proto->get_port),
3019 		   proto_method_implemented(proto->enter_memory_pressure));
3020 }
3021 
3022 static int proto_seq_show(struct seq_file *seq, void *v)
3023 {
3024 	if (v == &proto_list)
3025 		seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
3026 			   "protocol",
3027 			   "size",
3028 			   "sockets",
3029 			   "memory",
3030 			   "press",
3031 			   "maxhdr",
3032 			   "slab",
3033 			   "module",
3034 			   "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
3035 	else
3036 		proto_seq_printf(seq, list_entry(v, struct proto, node));
3037 	return 0;
3038 }
3039 
3040 static const struct seq_operations proto_seq_ops = {
3041 	.start  = proto_seq_start,
3042 	.next   = proto_seq_next,
3043 	.stop   = proto_seq_stop,
3044 	.show   = proto_seq_show,
3045 };
3046 
3047 static int proto_seq_open(struct inode *inode, struct file *file)
3048 {
3049 	return seq_open_net(inode, file, &proto_seq_ops,
3050 			    sizeof(struct seq_net_private));
3051 }
3052 
3053 static const struct file_operations proto_seq_fops = {
3054 	.owner		= THIS_MODULE,
3055 	.open		= proto_seq_open,
3056 	.read		= seq_read,
3057 	.llseek		= seq_lseek,
3058 	.release	= seq_release_net,
3059 };
3060 
3061 static __net_init int proto_init_net(struct net *net)
3062 {
3063 	if (!proc_create("protocols", S_IRUGO, net->proc_net, &proto_seq_fops))
3064 		return -ENOMEM;
3065 
3066 	return 0;
3067 }
3068 
3069 static __net_exit void proto_exit_net(struct net *net)
3070 {
3071 	remove_proc_entry("protocols", net->proc_net);
3072 }
3073 
3074 
3075 static __net_initdata struct pernet_operations proto_net_ops = {
3076 	.init = proto_init_net,
3077 	.exit = proto_exit_net,
3078 };
3079 
3080 static int __init proto_init(void)
3081 {
3082 	return register_pernet_subsys(&proto_net_ops);
3083 }
3084 
3085 subsys_initcall(proto_init);
3086 
3087 #endif /* PROC_FS */
3088